服务健康检查
场景
多服务器运行后,偶尔有服务挂掉。
问题:
- 某个应用服务器进程崩溃
- 某个数据库连接池耗尽
- 某个 Redis 实例内存溢出
- 人工发现和恢复太慢

解决方案:健康检查
1. 健康检查端点
@app.route('/health')
def health_check():
    """Report the health of this service and its dependencies.

    Probes the database (``SELECT 1``), Redis (``PING``) and the external
    weather API. A database or Redis failure marks the overall status as
    ``'unhealthy'``; the external API result is recorded under ``checks``
    but never changes the overall status.

    Returns:
        A ``(response, status_code)`` pair: the JSON-encoded report, and
        200 when the overall status is ``'healthy'``, otherwise 503.
    """
    import time  # local import so the snippet does not rely on file-level imports

    health_status = {
        'status': 'healthy',
        'timestamp': datetime.now().isoformat(),
        'checks': {}
    }

    # Check the database connection.
    try:
        started = time.perf_counter()
        with get_db_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('SELECT 1')
        elapsed_ms = (time.perf_counter() - started) * 1000
        health_status['checks']['database'] = {
            'status': 'healthy',
            # Wall-clock time of the probe in milliseconds. The original
            # stored the return value of a second execute() call here,
            # which is not a duration.
            'response_time': round(elapsed_ms, 2)
        }
    except Exception as e:
        health_status['checks']['database'] = {
            'status': 'unhealthy',
            'error': str(e)
        }
        health_status['status'] = 'unhealthy'

    # Check the Redis connection.
    try:
        redis_client.ping()
        health_status['checks']['redis'] = {
            'status': 'healthy'
        }
    except Exception as e:
        health_status['checks']['redis'] = {
            'status': 'unhealthy',
            'error': str(e)
        }
        health_status['status'] = 'unhealthy'

    # Check the external API. Its result is informational only: it is
    # reported but does not flip the overall status.
    try:
        response = requests.get(
            'https://weather-api.kuaiyizhi.cn/health',
            timeout=3
        )
        health_status['checks']['external_api'] = {
            'status': 'healthy' if response.status_code == 200 else 'degraded'
        }
    except Exception as e:
        health_status['checks']['external_api'] = {
            'status': 'unhealthy',
            'error': str(e)
        }

    # Map the overall status to an HTTP status code.
    status_code = 200 if health_status['status'] == 'healthy' else 503
    return jsonify(health_status), status_code

# Document section heading: "2. 自动重启脚本" (auto-restart script)
import subprocess
import signal
import time
class ServiceMonitor:
    """Poll a service's health URL and restart it after repeated failures.

    Args:
        service_name: Name of the service; used in log messages and in the
            default restart command.
        health_url: URL requested by ``check_service``; an HTTP 200
            response counts as healthy.
        max_failures: Number of consecutive failed checks that triggers a
            restart. Defaults to 3.
        check_interval: Seconds to sleep between checks in
            ``monitor_loop``. Defaults to 10.
        restart_command: Optional argument list to run instead of
            ``['systemctl', 'restart', service_name]``.
    """

    def __init__(self, service_name, health_url, max_failures=3,
                 check_interval=10, restart_command=None):
        self.service_name = service_name
        self.health_url = health_url
        self.failure_count = 0
        self.max_failures = max_failures
        self.check_interval = check_interval
        self.restart_command = restart_command

    def check_service(self):
        """Run one health check and update the consecutive-failure counter.

        Returns:
            True if the health URL answered with HTTP 200 (the counter is
            reset to 0); False on any other status code or on any
            exception (the counter is incremented).
        """
        try:
            response = requests.get(self.health_url, timeout=5)
            if response.status_code == 200:
                self.failure_count = 0
                return True
            else:
                self.failure_count += 1
                return False
        except Exception as e:
            self.failure_count += 1
            logging.error(f'Health check failed: {e}')
            return False

    def restart_service(self):
        """Run the restart command for the service.

        Returns:
            True if the command exited with status 0, False if it exited
            non-zero or could not be started at all.
        """
        logging.warning(f'Restarting service {self.service_name}')
        command = self.restart_command
        if command is None:
            # Default: restart through systemctl.
            command = ['systemctl', 'restart', self.service_name]
        try:
            subprocess.run(command, check=True)
            logging.info(f'Service {self.service_name} restarted successfully')
            return True
        except (subprocess.CalledProcessError, OSError) as e:
            # OSError covers a missing or non-executable binary; without it
            # the exception would escape and terminate monitor_loop.
            logging.error(f'Failed to restart service: {e}')
            return False

    def monitor_loop(self):
        """Check the service forever, restarting it when the threshold is hit."""
        while True:
            if not self.check_service():
                logging.warning(
                    f'Service {self.service_name} unhealthy '
                    f'({self.failure_count}/{self.max_failures})'
                )
                if self.failure_count >= self.max_failures:
                    self.restart_service()
                    # Start counting afresh whether or not the restart worked.
                    self.failure_count = 0
            time.sleep(self.check_interval)
# Start monitoring when this file is run as a script.
if __name__ == '__main__':
    monitor = ServiceMonitor(
        service_name='api-server',
        health_url='http://localhost:8080/health'
    )
    monitor.monitor_loop()

# Document section heading: "3. Systemd 服务配置" (systemd service configuration)
# /etc/systemd/system/api-server.service
# Unit file that runs the API server as a systemd-managed service.

[Unit]
Description=API Server
After=network.target

[Service]
Type=simple
# Run as the unprivileged apiuser account from the application directory.
User=apiuser
WorkingDirectory=/opt/api-server
ExecStart=/usr/bin/python3 /opt/api-server/app.py
# Let systemd restart the process itself, waiting 10 seconds between attempts.
Restart=always
RestartSec=10
# Send stdout and stderr to the journal.
StandardOutput=journal
StandardError=journal

[Install]
WantedBy=multi-user.target

# Document section heading: "4. 进程管理" (process management)
import psutil
import os
def check_process_health():
    """Sample the current process's resource usage and warn on high values.

    Logs a warning when CPU usage exceeds 90%, memory usage exceeds 80%,
    the thread count exceeds 100, or more than 1000 file descriptors are
    open.

    Returns:
        A dict with keys ``'cpu'``, ``'memory'``, ``'threads'`` and
        ``'fds'``. ``'fds'`` is ``None`` on platforms where psutil does not
        provide ``num_fds()``.
    """
    current_process = psutil.Process()

    # CPU usage, sampled over a one-second interval.
    cpu_percent = current_process.cpu_percent(interval=1)
    if cpu_percent > 90:
        logging.warning(f'High CPU usage: {cpu_percent}%')

    # Memory usage as a percentage.
    memory_percent = current_process.memory_percent()
    if memory_percent > 80:
        logging.warning(f'High memory usage: {memory_percent}%')

    # Thread count.
    num_threads = current_process.num_threads()
    if num_threads > 100:
        logging.warning(f'Many threads: {num_threads}')

    # Open file descriptors. psutil only defines num_fds() on UNIX, so
    # report None instead of raising AttributeError elsewhere.
    num_fds = None
    if hasattr(current_process, 'num_fds'):
        num_fds = current_process.num_fds()
        if num_fds > 1000:
            logging.warning(f'Many open files: {num_fds}')

    return {
        'cpu': cpu_percent,
        'memory': memory_percent,
        'threads': num_threads,
        'fds': num_fds
    }

# Document section heading: "监控告警" (monitoring alerts)
def send_service_alert(service_name, issue):
    """Build a plain-text alert for a service and pass it to ``send_alert``.

    Args:
        service_name: Name of the affected service, shown on the first line.
        issue: Short description of the problem.
    """
    timestamp = datetime.now().isoformat()
    # The empty first and last entries reproduce the leading and trailing
    # newline of the alert text.
    lines = [
        '',
        f'Service Alert: {service_name}',
        f'Issue: {issue}',
        f'Time: {timestamp}',
        'Please check immediately.',
        '',
    ]
    send_alert('\n'.join(lines))
# Used from the monitoring script.
def monitor_with_alerts():
    """Send an alert for 'api-server' when the service health check fails."""
    if not check_service_health():
        send_service_alert(
            'api-server',
            'Health check failed'
        )

# Document section heading: "效果验证" (verifying the results)
优化前
服务崩溃:
- 需要人工发现
- 人工重启
- 恢复时间:10-30 分钟

优化后
自动监控:
- 自动发现故障
- 自动重启
- 恢复时间:30 秒

本节小结
✅ 完成的工作:
- 实现了健康检查端点
- 实现了自动重启脚本
- 配置了 systemd 服务
- 添加了监控告警
✅ 效果:
- 自动恢复故障
- 恢复时间从 30 分钟降到 30 秒
- 减少人工干预
⚠️ 下一步:数据库主节点宕机
🎯 下一步:数据库主节点宕机,如何自动故障转移?
