服务健康检查

场景

多服务器运行后，偶尔有服务挂掉。

问题：
- 某个应用服务器进程崩溃
- 某个数据库连接池耗尽
- 某个 Redis 实例内存溢出
- 人工发现和恢复太慢

解决方案：健康检查

1. 健康检查端点

@app.route('/health')
def health_check():
    """Report the health of this service and its dependencies.

    Probes three dependencies and records one entry per dependency under
    ``checks``:

    * database  -- runs ``SELECT 1`` and records the elapsed time in seconds
    * redis     -- issues a ``PING``
    * external_api -- GETs the weather API health URL with a 3 second timeout

    A database or Redis failure sets the overall ``status`` to
    ``'unhealthy'``. The external API result is recorded but never changes
    the overall status.

    Returns:
        A ``(response, status_code)`` tuple: the JSON report, with HTTP 200
        when the overall status is ``'healthy'`` and 503 otherwise.
    """
    # Imported locally so the snippet does not depend on a module-level import.
    import time

    health_status = {
        'status': 'healthy',
        'timestamp': datetime.now().isoformat(),
        'checks': {}
    }

    # Database: time the whole probe (acquiring the connection included) so a
    # slow or exhausted pool shows up in response_time. The original ran the
    # query twice and stored the return value of cursor.execute(), which is
    # not a duration.
    try:
        started = time.perf_counter()
        with get_db_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('SELECT 1')
            elapsed = time.perf_counter() - started
            health_status['checks']['database'] = {
                'status': 'healthy',
                'response_time': round(elapsed, 4)  # seconds
            }
    except Exception as e:
        health_status['checks']['database'] = {
            'status': 'unhealthy',
            'error': str(e)
        }
        health_status['status'] = 'unhealthy'

    # Redis: a successful PING is enough to call it healthy.
    try:
        redis_client.ping()
        health_status['checks']['redis'] = {
            'status': 'healthy'
        }
    except Exception as e:
        health_status['checks']['redis'] = {
            'status': 'unhealthy',
            'error': str(e)
        }
        health_status['status'] = 'unhealthy'

    # External API: any non-200 answer is reported as 'degraded'.
    # NOTE(review): failures here are recorded but do not flip the overall
    # status -- presumably so a third-party outage does not turn this
    # endpoint into a 503; confirm that this is intended.
    try:
        response = requests.get(
            'https://weather-api.kuaiyizhi.cn/health',
            timeout=3
        )
        health_status['checks']['external_api'] = {
            'status': 'healthy' if response.status_code == 200 else 'degraded'
        }
    except Exception as e:
        health_status['checks']['external_api'] = {
            'status': 'unhealthy',
            'error': str(e)
        }

    # 200 only when every critical dependency passed, otherwise 503.
    status_code = 200 if health_status['status'] == 'healthy' else 503

    return jsonify(health_status), status_code

2. 自动重启脚本

import subprocess
import signal
import time

class ServiceMonitor:
    """Poll a service's health URL and restart it after repeated failures.

    The monitor counts consecutive failed health checks. Once the count
    reaches ``max_failures`` it runs ``systemctl restart <service_name>``
    and resets the counter.

    Attributes set by the constructor:
        service_name: name passed to ``systemctl restart``.
        health_url: URL requested by ``check_service``.
        failure_count: number of consecutive failed checks so far.
        max_failures: failures required before a restart is attempted.
        check_interval: seconds to sleep between checks in ``monitor_loop``.
    """

    def __init__(self, service_name, health_url, max_failures=3,
                 check_interval=10):
        """Store the monitoring target and thresholds.

        Args:
            service_name: systemd service name to restart.
            health_url: health endpoint to poll.
            max_failures: consecutive failures before restarting (default 3).
            check_interval: seconds between checks (default 10).
        """
        self.service_name = service_name
        self.health_url = health_url
        self.failure_count = 0
        self.max_failures = max_failures
        self.check_interval = check_interval

    def check_service(self):
        """Run one health check and update ``failure_count``.

        Returns:
            True when the endpoint answered with HTTP 200 (the counter is
            reset to 0); False on any other status code or on any exception
            (the counter is incremented).
        """
        try:
            response = requests.get(self.health_url, timeout=5)
        except Exception as e:
            # Broad on purpose: no request error may terminate the monitor.
            self.failure_count += 1
            logging.error(f'Health check failed: {e}')
            return False

        if response.status_code == 200:
            self.failure_count = 0
            return True

        self.failure_count += 1
        return False

    def restart_service(self):
        """Restart the service with ``systemctl restart``.

        Returns:
            True when systemctl exited successfully, False when it exited
            with a non-zero status or could not be started at all.
        """
        logging.warning(f'Restarting service {self.service_name}')

        try:
            subprocess.run(
                ['systemctl', 'restart', self.service_name],
                check=True
            )
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing or non-executable systemctl binary;
            # without it that error would escape and stop monitor_loop.
            logging.error(f'Failed to restart service: {e}')
            return False

        logging.info(f'Service {self.service_name} restarted successfully')
        return True

    def monitor_loop(self):
        """Check the service forever, restarting it when the threshold is hit.

        This method never returns. The failure counter is reset after every
        restart attempt, successful or not, so a failed restart is retried
        only after another ``max_failures`` failed checks.
        """
        while True:
            if not self.check_service():
                logging.warning(
                    f'Service {self.service_name} unhealthy '
                    f'({self.failure_count}/{self.max_failures})'
                )

                if self.failure_count >= self.max_failures:
                    self.restart_service()
                    self.failure_count = 0

            time.sleep(self.check_interval)

# Start monitoring: watch the local api-server health endpoint. The call
# below blocks for as long as the monitor loop runs.
if __name__ == '__main__':
    api_monitor = ServiceMonitor('api-server', 'http://localhost:8080/health')
    api_monitor.monitor_loop()

3. Systemd 服务配置

# /etc/systemd/system/api-server.service
# systemd unit that runs the API server as a supervised service.

[Unit]
Description=API Server
# Order this unit after network.target.
After=network.target

[Service]
# The process started by ExecStart is the main service process.
Type=simple
# Run the process as this user.
User=apiuser
WorkingDirectory=/opt/api-server
ExecStart=/usr/bin/python3 /opt/api-server/app.py
# Let systemd restart the service whenever it stops, waiting 10 seconds
# before each restart.
Restart=always
RestartSec=10
# Send stdout and stderr to the systemd journal.
StandardOutput=journal
StandardError=journal

[Install]
# Pulled in by multi-user.target when the unit is enabled.
WantedBy=multi-user.target

4. 进程管理

import psutil
import os

def check_process_health():
    """Sample the current process's resource usage and warn on high values.

    Logs a warning when CPU usage exceeds 90%, memory usage exceeds 80%,
    the thread count exceeds 100, or the number of open file descriptors
    exceeds 1000.

    Returns:
        A dict with the sampled values under the keys ``'cpu'``,
        ``'memory'``, ``'threads'`` and ``'fds'``.
    """
    current_process = psutil.Process()

    # CPU usage. interval=1 makes this call block for one second while it
    # measures, so do not call this function on a latency-sensitive path.
    cpu_percent = current_process.cpu_percent(interval=1)
    if cpu_percent > 90:
        logging.warning(f'High CPU usage: {cpu_percent}%')

    # Memory usage as a percentage. The original also fetched memory_info()
    # into a local that was never read; that call has been removed.
    memory_percent = current_process.memory_percent()
    if memory_percent > 80:
        logging.warning(f'High memory usage: {memory_percent}%')

    # Thread count.
    num_threads = current_process.num_threads()
    if num_threads > 100:
        logging.warning(f'Many threads: {num_threads}')

    # Open file descriptors.
    # NOTE(review): psutil documents num_fds() as available on Unix only.
    num_fds = current_process.num_fds()
    if num_fds > 1000:
        logging.warning(f'Many open files: {num_fds}')

    return {
        'cpu': cpu_percent,
        'memory': memory_percent,
        'threads': num_threads,
        'fds': num_fds
    }

监控告警

def send_service_alert(service_name, issue):
    """Build a plain-text alert for a service problem and pass it to send_alert.

    Args:
        service_name: name of the affected service, shown in the alert.
        issue: short description of the problem, shown in the alert.
    """
    # The template's blank lines and indentation are part of the message
    # text, so the literal is kept exactly as it is.
    alert_text = f"""
Service Alert: {service_name}

Issue: {issue}
Time: {datetime.now().isoformat()}

Please check immediately.
    """

    send_alert(alert_text)

# 在监控脚本中使用
def monitor_with_alerts():
    """Run one health check and send an alert for 'api-server' if it fails."""
    # Nothing to report while the service is healthy.
    if check_service_health():
        return

    send_service_alert('api-server', 'Health check failed')

效果验证

优化前

服务崩溃：
- 需要人工发现
- 人工重启
- 恢复时间：10-30 分钟

优化后

自动监控：
- 自动发现故障
- 自动重启
- 恢复时间：30 秒

本节小结

✅ 完成的工作：

- 实现了健康检查端点
- 实现了自动重启脚本
- 配置了 systemd 服务
- 添加了监控告警

✅ 效果：

- 自动恢复故障
- 恢复时间从 10-30 分钟降到约 30 秒
- 减少人工干预

⚠️ 遗留问题：数据库主节点宕机

🎯 下一步：数据库主节点宕机，如何自动故障转移？

下一章节：数据库高可用 —— 数据库主节点宕机，如何自动故障转移？