數據庫連接池與 Tomcat 線程監控警示系統設計

1. 監控架構設計

┌─────────────────┐       ┌────────────────┐       ┌───────────────┐
│   應用服務器    │──────▶│  監控數據收集  │──────▶│  監控數據存儲 │
└─────────────────┘       └────────────────┘       └───────────────┘
                                  │                        │
                                  ▼                        ▼
                          ┌────────────────┐      ┌────────────────┐
                          │   閾值檢測    │◀─────│   監控面板    │
                          └────────────────┘      └────────────────┘
                                  │
                                  ▼
                          ┌────────────────┐
                          │   告警通知    │
                          └────────────────┘

2. 監控指標設計

數據庫連接池監控

/**
 * Spring configuration that publishes the connection-pool metrics collector as a bean.
 *
 * <p>NOTE(review): {@code DataSourcePoolMetricsCollector} is also annotated with
 * {@code @Component} in this document; if component scanning picks it up as well, two
 * beans of the same type would be registered. Confirm which registration is intended.
 */
@Configuration
public class DatabasePoolConfig {

    /**
     * Creates the collector for the application's data source.
     *
     * @param dataSource the data source whose pool is to be observed
     * @return a collector constructed around {@code dataSource}
     */
    @Bean
    public DataSourcePoolMetricsCollector dataSourceMetricsCollector(DataSource dataSource) {
        final DataSourcePoolMetricsCollector collector =
            new DataSourcePoolMetricsCollector(dataSource);
        return collector;
    }
}

/**
 * Samples HikariCP connection-pool statistics on a fixed schedule, records them as
 * metrics and raises alerts when the configured thresholds are exceeded.
 *
 * <p>NOTE(review): {@code alertService}, {@code registerMetrics(...)} and a constructor
 * assigning the final fields are used by this class but are not shown in this excerpt;
 * they must be supplied for the class to compile.
 */
@Component
@Slf4j
public class DataSourcePoolMetricsCollector {
    private final DataSource dataSource;
    private final MeterRegistry meterRegistry;

    // Alert thresholds, overridable through configuration.
    @Value("${monitor.db.active-ratio-threshold:0.8}")
    private double activeRatioThreshold;

    @Value("${monitor.db.waiting-threshold:0}")
    private int waitingThreadThreshold;

    /**
     * Reads the current pool counters, records them and evaluates the alert rules.
     * Scheduled at a fixed rate of 5000 ms. Does nothing when the data source is not a
     * {@code HikariDataSource} or when its pool MXBean is not available yet.
     */
    @Scheduled(fixedRate = 5000)
    public void collectMetrics() {
        if (!(dataSource instanceof HikariDataSource)) {
            return;
        }

        HikariDataSource hikari = (HikariDataSource) dataSource;
        HikariPoolMXBean poolBean = hikari.getHikariPoolMXBean();
        if (poolBean == null) {
            // The MXBean is absent until the pool has been started; skip this sample
            // instead of failing the scheduled task with a NullPointerException.
            log.debug("Hikari pool MXBean not available yet; skipping metrics collection");
            return;
        }

        int active = poolBean.getActiveConnections();
        int total = poolBean.getTotalConnections();
        int waiting = poolBean.getThreadsAwaitingConnection();
        // An empty pool (total == 0) would otherwise produce NaN; report 0 utilisation,
        // matching the guard used by ConnectionPoolHealthIndicator.
        double ratio = total > 0 ? (double) active / total : 0;

        // Record the metrics.
        registerMetrics(active, total, waiting, ratio);

        // Evaluate the alert conditions.
        checkAlertConditions(active, total, waiting, ratio);
    }

    /**
     * Sends a WARNING alert when pool utilisation exceeds {@code activeRatioThreshold}
     * and a CRITICAL alert when more than {@code waitingThreadThreshold} threads are
     * waiting for a connection.
     *
     * @param active  number of connections currently in use
     * @param total   total number of connections in the pool
     * @param waiting number of threads waiting for a connection
     * @param ratio   {@code active / total}, or 0 when the pool is empty
     */
    private void checkAlertConditions(int active, int total, int waiting, double ratio) {
        if (ratio > activeRatioThreshold) {
            AlertEvent event = new AlertEvent(
                AlertLevel.WARNING,
                "DatabasePool",
                String.format("Connection pool utilization high: %.2f%% (%d/%d)", 
                    ratio * 100, active, total)
            );
            alertService.sendAlert(event);
        }

        if (waiting > waitingThreadThreshold) {
            AlertEvent event = new AlertEvent(
                AlertLevel.CRITICAL,
                "DatabasePool",
                String.format("Threads waiting for connection: %d", waiting)
            );
            alertService.sendAlert(event);
        }
    }
}

Tomcat 線程監控

/**
 * Spring configuration that publishes the Tomcat thread-pool metrics collector as a bean.
 *
 * <p>NOTE(review): {@code TomcatMetricsCollector} is also annotated with
 * {@code @Component} in this document; if component scanning picks it up as well, two
 * beans of the same type would be registered. Confirm which registration is intended.
 */
@Configuration
public class TomcatMetricsConfig {

    /**
     * Creates the collector for the running servlet web server context.
     *
     * @param context the application context that owns the embedded web server
     * @return a collector constructed around {@code context}
     */
    @Bean
    public TomcatMetricsCollector tomcatMetricsCollector(
            ServletWebServerApplicationContext context) {
        final TomcatMetricsCollector collector = new TomcatMetricsCollector(context);
        return collector;
    }
}

/**
 * Samples the thread pool of every Tomcat connector on a fixed schedule, records the
 * values as metrics and raises alerts when utilisation or queue depth is too high.
 *
 * <p>NOTE(review): {@code alertService}, {@code registerMetrics(...)} and a constructor
 * assigning the final fields are used by this class but are not shown in this excerpt;
 * they must be supplied for the class to compile.
 */
@Component
@Slf4j
public class TomcatMetricsCollector {
    private final ServletWebServerApplicationContext context;
    private final MeterRegistry meterRegistry;

    @Value("${monitor.tomcat.thread-utilization-threshold:0.8}")
    private double threadUtilizationThreshold;

    // Not read by any code shown in this excerpt.
    @Value("${monitor.tomcat.busy-thread-threshold:0.6}")
    private double busyThreadThreshold;

    // Previously hard-coded as 10; now bound to the key already present in the
    // configuration file, with the same default.
    @Value("${monitor.tomcat.queue-size-threshold:10}")
    private int queueSizeThreshold = 10;

    /**
     * Collects thread-pool figures for each connector. Scheduled at a fixed rate of
     * 5000 ms. Skips the run when the web server is not a {@code TomcatWebServer}, and
     * skips individual connectors whose executor is not a {@code ThreadPoolExecutor}.
     */
    @Scheduled(fixedRate = 5000)
    public void collectMetrics() {
        Object webServer = context.getWebServer();
        if (!(webServer instanceof TomcatWebServer)) {
            // Another container (or no server yet): nothing to sample, and an
            // unchecked cast would fail the scheduled task.
            log.debug("Web server is not Tomcat ({}); skipping thread metrics", webServer);
            return;
        }
        Tomcat tomcat = ((TomcatWebServer) webServer).getTomcat();

        for (Connector connector : tomcat.getService().findConnectors()) {
            int port = connector.getPort();
            Object candidate = connector.getProtocolHandler().getExecutor();
            if (!(candidate instanceof ThreadPoolExecutor)) {
                // Covers a null executor as well as executor types that cannot be
                // inspected through ThreadPoolExecutor.
                log.debug("Connector on port {} exposes no ThreadPoolExecutor ({}); skipping",
                    port, candidate);
                continue;
            }
            ThreadPoolExecutor executor = (ThreadPoolExecutor) candidate;

            int maxThreads = executor.getMaximumPoolSize();
            int activeThreads = executor.getActiveCount();
            int queueSize = executor.getQueue().size();
            double utilization = (double) activeThreads / maxThreads;

            // Record the metrics.
            registerMetrics(port, maxThreads, activeThreads, 
                queueSize, utilization);

            // Evaluate the alert conditions.
            checkAlertConditions(port, maxThreads, activeThreads, 
                queueSize, utilization);
        }
    }

    /**
     * Sends a WARNING alert when thread utilisation exceeds
     * {@code threadUtilizationThreshold} and another WARNING alert when the executor
     * queue holds more than {@code queueSizeThreshold} entries.
     *
     * @param port          connector port the figures belong to
     * @param maxThreads    maximum pool size of the connector's executor
     * @param activeThreads number of threads currently executing tasks
     * @param queueSize     number of tasks waiting in the executor queue
     * @param utilization   {@code activeThreads / maxThreads}
     */
    private void checkAlertConditions(int port, int maxThreads, int activeThreads, 
            int queueSize, double utilization) {
        if (utilization > threadUtilizationThreshold) {
            AlertEvent event = new AlertEvent(
                AlertLevel.WARNING,
                "TomcatThreads",
                String.format("Thread pool utilization high on port %d: %.2f%% (%d/%d)", 
                    port, utilization * 100, activeThreads, maxThreads)
            );
            alertService.sendAlert(event);
        }

        if (queueSize > queueSizeThreshold) {
            AlertEvent event = new AlertEvent(
                AlertLevel.WARNING,
                "TomcatThreads",
                String.format("Request queue building up on port %d: %d requests waiting", 
                    port, queueSize)
            );
            alertService.sendAlert(event);
        }
    }
}

3. 告警服務設計

@Service
@Slf4j
public class AlertService {

    private final List<AlertNotifier> notifiers;

    @Value("${monitor.alert.rate-limit-seconds:60}")
    private int rateLimitSeconds;

    private final Map<String, Instant> lastAlertTimes = new ConcurrentHashMap<>();

    public void sendAlert(AlertEvent event) {
        String alertKey = event.getSource() + ":" + event.getMessage();

        // 檢查告警頻率限制
        Instant lastAlert = lastAlertTimes.get(alertKey);
        Instant now = Instant.now();

        if (lastAlert != null && 
            Duration.between(lastAlert, now).getSeconds() < rateLimitSeconds) {
            log.debug("Skipping alert due to rate limiting: {}", event);
            return;
        }

        // 更新最後告警時間
        lastAlertTimes.put(alertKey, now);

        // 發送到所有配置的通知器
        for (AlertNotifier notifier : notifiers) {
            if (notifier.shouldNotify(event.getLevel())) {
                notifier.sendNotification(event);
            }
        }
    }
}

4. 通知實現

/**
 * Delivers alerts by e-mail to the addresses configured under
 * {@code monitor.alert.email.recipients}.
 */
@Component
@Slf4j
public class EmailAlertNotifier implements AlertNotifier {

    private final JavaMailSender mailSender;

    @Value("${monitor.alert.email.recipients}")
    private String[] recipients;

    /**
     * Accepts every level that is declared at or after {@code AlertLevel.WARNING}.
     *
     * @param level the level of the alert being considered
     * @return {@code true} when an e-mail should be sent for {@code level}
     */
    @Override
    public boolean shouldNotify(AlertLevel level) {
        // Enum.compareTo orders by declaration position, i.e. by ordinal.
        return level.compareTo(AlertLevel.WARNING) >= 0;
    }

    /**
     * Sends a plain-text mail whose subject carries the level and source and whose body
     * is the alert message.
     *
     * @param event the alert to send
     */
    @Override
    public void sendNotification(AlertEvent event) {
        String subject = String.format("[%s] %s Alert", event.getLevel(), event.getSource());

        SimpleMailMessage mail = new SimpleMailMessage();
        mail.setTo(recipients);
        mail.setSubject(subject);
        mail.setText(event.getMessage());

        mailSender.send(mail);
    }
}

/**
 * Notifier intended to post alerts to the Slack webhook configured under
 * {@code monitor.alert.slack.webhook-url}. The delivery itself is left as a placeholder.
 */
@Component
@Slf4j
public class SlackAlertNotifier implements AlertNotifier {

    @Value("${monitor.alert.slack.webhook-url}")
    private String webhookUrl;

    /**
     * Accepts every level that is declared at or after {@code AlertLevel.CRITICAL}.
     *
     * @param level the level of the alert being considered
     * @return {@code true} when Slack should be notified for {@code level}
     */
    @Override
    public boolean shouldNotify(AlertLevel level) {
        // Enum.compareTo orders by declaration position, i.e. by ordinal.
        return level.compareTo(AlertLevel.CRITICAL) >= 0;
    }

    /**
     * Placeholder: no notification is sent by the code shown here.
     *
     * @param event the alert to send
     */
    @Override
    public void sendNotification(AlertEvent event) {
        // Slack webhook notification to be implemented here.
        // ...
    }
}

5. 健康檢查端點

/**
 * Health indicator reporting the state of the HikariCP connection pool.
 *
 * <p>Reports {@code UP} with pool details under normal load, a custom {@code WARNING}
 * status when utilisation or the number of waiting threads exceeds its threshold, and
 * {@code UNKNOWN} when the data source is not a {@code HikariDataSource} or its pool
 * MXBean is not available.
 *
 * <p>NOTE(review): {@code WARNING} is not one of the built-in status codes; its
 * aggregation order and HTTP mapping presumably need to be configured — verify.
 * A constructor assigning {@code dataSource} is not shown in this excerpt.
 */
@Component
public class ConnectionPoolHealthIndicator implements HealthIndicator {

    private final DataSource dataSource;

    // Same properties and defaults as the metrics collector, replacing the previously
    // hard-coded 0.8 / 0 so alerts and health status agree.
    @Value("${monitor.db.active-ratio-threshold:0.8}")
    private double activeRatioThreshold = 0.8;

    @Value("${monitor.db.waiting-threshold:0}")
    private int waitingThreadThreshold = 0;

    /**
     * Builds the health report from the current pool counters.
     *
     * @return the pool health, including active/total/waiting counts and utilisation
     */
    @Override
    public Health health() {
        if (!(dataSource instanceof HikariDataSource)) {
            return Health.unknown().build();
        }

        HikariPoolMXBean poolBean = ((HikariDataSource) dataSource).getHikariPoolMXBean();
        if (poolBean == null) {
            // Pool not started yet: no counters to report.
            return Health.unknown().build();
        }

        int active = poolBean.getActiveConnections();
        int total = poolBean.getTotalConnections();
        int waiting = poolBean.getThreadsAwaitingConnection();
        // Avoid NaN for an empty pool.
        double ratio = total > 0 ? (double) active / total : 0;

        Health.Builder builder = Health.up()
            .withDetail("activeConnections", active)
            .withDetail("totalConnections", total)
            .withDetail("waitingThreads", waiting)
            .withDetail("utilizationRatio", ratio);

        if (ratio > activeRatioThreshold || waiting > waitingThreadThreshold) {
            // Status has no WARNING constant; use a custom status code instead.
            return builder.status("WARNING").build();
        }

        return builder.build();
    }
}

6. 配置文件設計

# Monitoring configuration
monitor:
  # Database connection pool monitoring
  db:
    active-ratio-threshold: 0.8  # threshold for the share of active connections
    waiting-threshold: 0         # threshold for threads waiting for a connection
    acquisition-timeout-threshold: 1000  # connection acquisition timeout threshold (ms)

  # Tomcat thread monitoring
  tomcat:
    thread-utilization-threshold: 0.8  # thread pool utilisation threshold
    busy-thread-threshold: 0.6        # busy thread ratio threshold
    queue-size-threshold: 10          # request queue size threshold

  # Alerting configuration
  alert:
    rate-limit-seconds: 60  # minimum interval between identical alerts (seconds)
    email:
      recipients: ops@example.com,dba@example.com
    slack:
      webhook-url: https://hooks.slack.com/services/xxx/yyy/zzz
      # Must be quoted: an unquoted " #" starts a YAML comment and leaves the value empty.
      channel: "#system-alerts"

# Data source configuration
spring:
  datasource:
    hikari:
      pool-name: App-DB-Pool
      maximum-pool-size: 20
      minimum-idle: 5
      connection-timeout: 30000
      register-mbeans: true  # enable JMX monitoring

  # Spring Boot Admin client configuration
  boot:
    admin:
      client:
        url: http://admin-server:8080

# Actuator configuration
# The Prometheus scrape job reads /actuator/prometheus, which is not exposed over HTTP
# unless it is listed here.
management:
  endpoints:
    web:
      exposure:
        include: health,prometheus

# Server configuration
# NOTE(review): newer Spring Boot releases appear to name the two thread settings
# server.tomcat.threads.max / server.tomcat.threads.min-spare — confirm against the
# Spring Boot version in use.
server:
  tomcat:
    max-threads: 200
    min-spare-threads: 20
    max-connections: 10000
    connection-timeout: 20000
    mbeanregistry:
      enabled: true  # enable Tomcat MBean registration

7. 集成到監控面板

使用 Prometheus + Grafana 進行可視化監控:

# prometheus.yml
# Scrape job for the Spring application: metrics are read from the
# /actuator/prometheus path of the listed target.
scrape_configs:
  - job_name: 'spring-app'
    metrics_path: '/actuator/prometheus'
    static_configs:
      - targets: ['app-server:8080']

Grafana 儀表板設計:

  - 數據庫連接池儀表板
  - Tomcat 線程池儀表板
  - 系統警示一覽表

總結

這套監控告警系統的優點:

  1. 全面監控:覆蓋數據庫連接池和Tomcat線程池的關鍵指標
  2. 可配置性:通過配置文件設置告警閾值和通知方式
  3. 多級告警:支援不同嚴重級別的告警
  4. 告警限流:防止告警風暴
  5. 多渠道通知:支援電子郵件、Slack等多種通知方式
  6. 可視化:與Prometheus和Grafana集成,提供直觀的監控面板

通過這套系統,運維團隊可以及時發現並解決數據庫連接池和Tomcat線程池問題,提高系統穩定性。