数据库高可用
场景
某天凌晨,数据库主节点宕机。
事件:
- 主数据库服务器硬件故障
- 所有写操作失败
- 从库无法同步
- 系统部分不可用
影响:
- 用户注册失败
- 订阅更新失败
- 日志记录失败
- 持续时间:2 小时(人工处理)解决方案:自动故障转移
1. MySQL 主从切换
class MySQLFailoverManager:
"""MySQL 故障转移管理器"""
def __init__(self):
self.master_host = 'mysql-master.internal'
self.slave_hosts = [
'mysql-slave-1.internal',
'mysql-slave-2.internal',
'mysql-slave-3.internal'
]
self.current_master = self.master_host
def check_master_health(self):
"""检查主库健康状态"""
try:
conn = pymysql.connect(
host=self.current_master,
user='monitor',
password='password',
connect_timeout=5
)
conn.close()
return True
except Exception as e:
logging.error(f'Master health check failed: {e}')
return False
def promote_slave_to_master(self, slave_host):
"""将从库提升为主库"""
logging.info(f'Promoting {slave_host} to master')
try:
# 1. 停止从库复制
conn = pymysql.connect(
host=slave_host,
user='admin',
password='password'
)
cursor = conn.cursor()
cursor.execute('STOP SLAVE')
cursor.execute('RESET SLAVE ALL')
# 2. 设置为读写模式
cursor.execute('SET GLOBAL read_only = 0')
# 3. 更新应用配置
self.update_application_config(slave_host)
self.current_master = slave_host
conn.commit()
conn.close()
logging.info(f'Successfully promoted {slave_host} to master')
return True
except Exception as e:
logging.error(f'Failed to promote slave: {e}')
return False
def update_application_config(self, new_master):
"""更新应用配置"""
# 更新数据库路由配置
config_file = '/etc/api-server/db_config.json'
with open(config_file, 'r') as f:
config = json.load(f)
config['master'] = new_master
with open(config_file, 'w') as f:
json.dump(config, f, indent=2)
# 通知应用重载配置
requests.post('http://localhost:8080/admin/reload-config')
def perform_failover(self):
"""执行故障转移"""
logging.warning('Starting failover process')
# 1. 选择最新的从库
best_slave = self.select_best_slave()
if not best_slave:
logging.error('No suitable slave found for failover')
return False
# 2. 提升从库为主库
if self.promote_slave_to_master(best_slave):
# 3. 重新配置其他从库
self.reconfigure_slaves(best_slave)
# 4. 发送告警
send_alert(f'Database failover completed. New master: {best_slave}')
return True
return False
def select_best_slave(self):
"""选择最适合提升的从库"""
candidates = []
for slave_host in self.slave_hosts:
try:
conn = pymysql.connect(
host=slave_host,
user='monitor',
password='password'
)
cursor = conn.cursor()
# 检查复制状态
cursor.execute('SHOW SLAVE STATUS')
status = cursor.fetchone()
# 检查延迟
lag = status['Seconds_Behind_Master']
# 检查 IO 和 SQL 线程状态
io_running = status['Slave_IO_Running'] == 'Yes'
sql_running = status['Slave_SQL_Running'] == 'Yes'
if io_running and sql_running and lag is not None and lag < 10:
candidates.append({
'host': slave_host,
'lag': lag
})
conn.close()
except Exception as e:
logging.error(f'Failed to check slave {slave_host}: {e}')
if not candidates:
return None
# 选择延迟最低的从库
candidates.sort(key=lambda x: x['lag'])
return candidates[0]['host']
def reconfigure_slaves(self, new_master):
"""重新配置其他从库"""
for slave_host in self.slave_hosts:
if slave_host == new_master:
continue
try:
conn = pymysql.connect(
host=slave_host,
user='admin',
password='password'
)
cursor = conn.cursor()
# 停止旧复制
cursor.execute('STOP SLAVE')
# 配置新的主库
cursor.execute(f'''
CHANGE MASTER TO
MASTER_HOST='{new_master}',
MASTER_USER='repl',
MASTER_PASSWORD='password',
MASTER_AUTO_POSITION = 1
''')
# 启动复制
cursor.execute('START SLAVE')
conn.commit()
conn.close()
logging.info(f'Reconfigured slave {slave_host}')
except Exception as e:
logging.error(f'Failed to reconfigure slave {slave_host}: {e}')
failover_manager = MySQLFailoverManager()2. 自动故障转移脚本
def monitor_and_failover():
"""监控并自动故障转移"""
failure_count = 0
max_failures = 3
while True:
if not failover_manager.check_master_health():
failure_count += 1
logging.warning(
f'Master unhealthy ({failure_count}/{max_failures})'
)
if failure_count >= max_failures:
# 执行故障转移
if failover_manager.perform_failover():
failure_count = 0
else:
send_alert('Failover failed!')
time.sleep(60) # 等待 1 分钟后重试
else:
failure_count = 0
time.sleep(10) # 每 10 秒检查一次
# 启动监控
if __name__ == '__main__':
monitor_and_failover()3. 使用 MHA(MasterHA)
# 安装 MHA
# MasterHA 是专门用于 MySQL 主从切换的工具
# 安装 MHA Manager
sudo apt install mha-manager
# 配置 MHA
cat > /etc/mha/app1.cnf <<EOF
[server default]
manager_workdir=/var/log/masterha/app1
manager_log=/var/log/masterha/app1/manager.log
user=mha
password=mhapassword
ssh_user=root
[server1]
hostname=mysql-master.internal
candidate_master=0
[server2]
hostname=mysql-slave-1.internal
candidate_master=1
[server3]
hostname=mysql-slave-2.internal
candidate_master=1
EOF
# 启动 MHA Manager
masterha_manager --conf=/etc/mha/app1.cnf监控和告警
def monitor_mysql_replication():
"""监控 MySQL 复制状态"""
for slave_host in failover_manager.slave_hosts:
try:
conn = pymysql.connect(
host=slave_host,
user='monitor',
password='password'
)
cursor = conn.cursor()
cursor.execute('SHOW SLAVE STATUS')
status = cursor.fetchone()
# 检查复制状态
if status['Slave_IO_Running'] != 'Yes':
send_alert(
f'Slave IO stopped on {slave_host}'
)
if status['Slave_SQL_Running'] != 'Yes':
send_alert(
f'Slave SQL stopped on {slave_host}'
)
# 检查延迟
lag = status['Seconds_Behind_Master']
if lag and lag > 30:
send_alert(
f'High replication lag on {slave_host}: {lag}s'
)
conn.close()
except Exception as e:
send_alert(
f'Failed to check slave {slave_host}: {e}'
)
# 定时监控
scheduler.add_job(
monitor_mysql_replication,
'interval',
seconds=60,
id='monitor_replication'
)效果验证
优化前
主库宕机:
- 人工发现:30 分钟
- 人工切换:1 小时
- 总恢复时间:1.5 小时
- 用户体验:差优化后
主库宕机:
- 自动发现:30 秒
- 自动切换:30 秒
- 总恢复时间:1 分钟
- 用户体验:基本无影响本节小结
✅ 完成的工作:
- 实现了主库健康检查
- 实现了自动故障转移
- 配置了从库重新同步
- 添加了复制监控
✅ 效果:
- 自动切换主库
- 恢复时间从 1.5 小时降到 1 分钟
- 系统可用性提升
⚠️ 下一步:一个服务挂了引发雪崩
🎯 下一步:一个服务挂了引发整个系统雪崩,如何防止?
