docs: 统一文档日期和状态规范
This commit is contained in:
+715
-39
@@ -4,7 +4,7 @@
|
||||
> 版本: v1.0
|
||||
> 日期: 2026-03-04
|
||||
> 作者: 张翔
|
||||
> 状态: 初稿
|
||||
> 状态: 正式发布
|
||||
|
||||
---
|
||||
|
||||
@@ -29,42 +29,18 @@
|
||||
|
||||
### 1.1 部署拓扑
|
||||
|
||||
```
|
||||
┌─────────────────────────────────────────────────────────────────┐
|
||||
│ 部署架构拓扑 │
|
||||
├─────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ ┌─────────────────────────────────────────────────────────┐ │
|
||||
│ │ 用户层 │ │
|
||||
│ ├─────────────────────────────────────────────────────────┤ │
|
||||
│ │ • 会员小程序 • 教练端App • 管理后台PC │ │
|
||||
│ └─────────────────────────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────────────────────────────────────────────────┐ │
|
||||
│ │ 负载均衡层 (Nginx) │ │
|
||||
│ ├─────────────────────────────────────────────────────────┤ │
|
||||
│ │ • 负载均衡 • SSL 终止 • 静态资源 • 限流 │ │
|
||||
│ └─────────────────────────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────────────────────────────────────────────────┐ │
|
||||
│ │ 应用层 (Docker Compose) │ │
|
||||
│ ├─────────────────────────────────────────────────────────┤ │
|
||||
│ │ • gym-manage (应用) • postgres (数据库) │ │
|
||||
│ │ • redis (缓存) • rabbitmq (消息队列) │ │
|
||||
│ │ • elasticsearch (搜索引擎) • prometheus (监控) │ │
|
||||
│ │ • grafana (可视化) • kibana (日志可视化) │ │
|
||||
│ └─────────────────────────────────────────────────────────┘ │
|
||||
│ │ │
|
||||
│ ▼ │
|
||||
│ ┌─────────────────────────────────────────────────────────┐ │
|
||||
│ │ 监控层 (Prometheus + Grafana) │ │
|
||||
│ ├─────────────────────────────────────────────────────────┤ │
|
||||
│ │ • 指标采集 • 告警规则 • 可视化仪表板 │ │
|
||||
│ └─────────────────────────────────────────────────────────┘ │
|
||||
│ │
|
||||
└─────────────────────────────────────────────────────────────────┘
|
||||
```mermaid
|
||||
flowchart TB
|
||||
subgraph 部署架构拓扑
|
||||
A[用户层<br/>• 会员小程序<br/>• 教练端App<br/>• 管理后台PC]
|
||||
B[负载均衡层 Nginx<br/>• 负载均衡<br/>• SSL 终止<br/>• 静态资源<br/>• 限流]
|
||||
C[应用层 Docker Compose<br/>• gym-manage 应用<br/>• postgres 数据库<br/>• redis 缓存<br/>• rabbitmq 消息队列<br/>• elasticsearch 搜索引擎<br/>• prometheus 监控<br/>• grafana 可视化<br/>• kibana 日志可视化]
|
||||
D[监控层 Prometheus + Grafana<br/>• 指标采集<br/>• 告警规则<br/>• 可视化仪表板]
|
||||
end
|
||||
|
||||
A --> B
|
||||
B --> C
|
||||
C --> D
|
||||
```
|
||||
|
||||
### 1.2 服务器配置
|
||||
@@ -436,7 +412,7 @@ docker-compose logs -f gym-manage
|
||||
docker-compose logs --tail=100 gym-manage
|
||||
|
||||
# 查看特定时间的日志
|
||||
docker-compose logs --since 2024-01-01T00:00:00 gym-manage
|
||||
docker-compose logs --since 2026-01-01T00:00:00 gym-manage
|
||||
```
|
||||
|
||||
#### 5.2.2 日志文件
|
||||
@@ -776,7 +752,7 @@ crontab -e
|
||||
docker-compose stop gym-manage
|
||||
|
||||
# 恢复数据库
|
||||
docker-compose exec -T postgres psql -U postgres gym_manage < backup/gym_manage_20240101_020000.sql
|
||||
docker-compose exec -T postgres psql -U postgres gym_manage < backup/gym_manage_20260101_020000.sql
|
||||
|
||||
# 启动应用
|
||||
docker-compose start gym-manage
|
||||
@@ -834,6 +810,706 @@ spring:
|
||||
|
||||
---
|
||||
|
||||
|
||||
## 六、监控告警详细配置
|
||||
|
||||
### 6.1 Prometheus 监控配置
|
||||
|
||||
#### 6.1.1 prometheus.yml 配置
|
||||
|
||||
**文件位置**: `monitoring/prometheus.yml`
|
||||
|
||||
```yaml
|
||||
global:
|
||||
scrape_interval: 15s # 采集间隔
|
||||
evaluation_interval: 15s # 规则评估间隔
|
||||
external_labels:
|
||||
monitor: 'gym-manage'
|
||||
environment: 'production'
|
||||
|
||||
# 告警规则配置
|
||||
rule_files:
|
||||
- "alerts.yml"
|
||||
|
||||
# 告警管理器配置
|
||||
alerting:
|
||||
alertmanagers:
|
||||
- static_configs:
|
||||
- targets:
|
||||
- alertmanager:9093
|
||||
|
||||
# 采集配置
|
||||
scrape_configs:
|
||||
# Prometheus 自监控
|
||||
- job_name: 'prometheus'
|
||||
static_configs:
|
||||
- targets: ['localhost:9090']
|
||||
labels:
|
||||
instance: 'prometheus-server'
|
||||
|
||||
# 应用监控
|
||||
- job_name: 'gym-manage'
|
||||
metrics_path: '/actuator/prometheus'
|
||||
static_configs:
|
||||
- targets: ['gym-manage:8080']
|
||||
labels:
|
||||
application: 'gym-manage'
|
||||
environment: 'production'
|
||||
scrape_interval: 10s
|
||||
|
||||
# Node 导出器
|
||||
- job_name: 'node-exporter'
|
||||
static_configs:
|
||||
- targets: ['node-exporter:9100']
|
||||
labels:
|
||||
instance: 'server-node'
|
||||
|
||||
# Redis 导出器
|
||||
- job_name: 'redis-exporter'
|
||||
static_configs:
|
||||
- targets: ['redis-exporter:9121']
|
||||
labels:
|
||||
instance: 'redis-server'
|
||||
|
||||
# PostgreSQL 导出器
|
||||
- job_name: 'postgres-exporter'
|
||||
static_configs:
|
||||
- targets: ['postgres-exporter:9187']
|
||||
labels:
|
||||
instance: 'postgres-server'
|
||||
|
||||
# RabbitMQ 导出器
|
||||
- job_name: 'rabbitmq-exporter'
|
||||
static_configs:
|
||||
- targets: ['rabbitmq-exporter:9419']
|
||||
labels:
|
||||
instance: 'rabbitmq-server'
|
||||
```
|
||||
|
||||
#### 6.1.2 alerts.yml 告警规则
|
||||
|
||||
**文件位置**: `monitoring/alerts.yml`
|
||||
|
||||
```yaml
|
||||
groups:
|
||||
- name: gym-manage-alerts
|
||||
interval: 30s
|
||||
rules:
|
||||
# 应用可用性告警
|
||||
- alert: ApplicationDown
|
||||
expr: up{job="gym-manage"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "应用不可用"
|
||||
description: "应用 {{ $labels.instance }} 已宕机超过 1 分钟"
|
||||
|
||||
# 高错误率告警
|
||||
- alert: HighErrorRate
|
||||
expr: sum(rate(http_server_requests_seconds_count{status=~"5..", job="gym-manage"}[5m])) / sum(rate(http_server_requests_seconds_count{job="gym-manage"}[5m])) > 0.05
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "高错误率"
|
||||
description: "应用错误率超过 5% (当前值:{{ $value | humanizePercentage }})"
|
||||
|
||||
# 高响应时间告警
|
||||
- alert: HighResponseTime
|
||||
expr: histogram_quantile(0.95, sum(rate(http_server_requests_seconds_bucket{job="gym-manage"}[5m])) by (le)) > 1
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "高响应时间"
|
||||
description: "应用 P95 响应时间超过 1 秒 (当前值:{{ $value | humanizeDuration }})"
|
||||
|
||||
# 高内存使用率告警
|
||||
- alert: HighMemoryUsage
|
||||
expr: (jvm_memory_used_bytes{area="heap", job="gym-manage"} / jvm_memory_max_bytes{area="heap", job="gym-manage"}) > 0.85
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "高内存使用率"
|
||||
description: "JVM 堆内存使用率超过 85% (当前值:{{ $value | humanizePercentage }})"
|
||||
|
||||
# OOM 告警
|
||||
- alert: OutOfMemory
|
||||
expr: (jvm_memory_used_bytes{area="heap", job="gym-manage"} / jvm_memory_max_bytes{area="heap", job="gym-manage"}) > 0.95
|
||||
for: 2m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "内存即将耗尽"
|
||||
description: "JVM 堆内存使用率超过 95% (当前值:{{ $value | humanizePercentage }})"
|
||||
|
||||
# 数据库连接池耗尽告警
|
||||
- alert: DatabaseConnectionPoolExhausted
|
||||
expr: hikaricp_active_connections{job="gym-manage"} / hikaricp_max_connections{job="gym-manage"} > 0.9
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "数据库连接池耗尽"
|
||||
description: "数据库连接池使用率超过 90% (当前值:{{ $value | humanizePercentage }})"
|
||||
|
||||
# Redis 连接失败告警
|
||||
- alert: RedisConnectionFailed
|
||||
expr: redis_up{job="redis-exporter"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "Redis 连接失败"
|
||||
description: "Redis {{ $labels.instance }} 连接失败"
|
||||
|
||||
# PostgreSQL 连接失败告警
|
||||
- alert: PostgresConnectionFailed
|
||||
expr: pg_up{job="postgres-exporter"} == 0
|
||||
for: 1m
|
||||
labels:
|
||||
severity: critical
|
||||
annotations:
|
||||
summary: "PostgreSQL 连接失败"
|
||||
description: "PostgreSQL {{ $labels.instance }} 连接失败"
|
||||
|
||||
# RabbitMQ 队列堆积告警
|
||||
- alert: RabbitMQQueueBacklog
|
||||
expr: rabbitmq_queue_messages{job="rabbitmq-exporter"} > 1000
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "消息队列堆积"
|
||||
description: "队列 {{ $labels.queue }} 消息数量超过 1000 (当前值:{{ $value }})"
|
||||
|
||||
# 磁盘空间不足告警
|
||||
- alert: DiskSpaceLow
|
||||
expr: (node_filesystem_avail_bytes{mountpoint="/"} / node_filesystem_size_bytes{mountpoint="/"}) < 0.15
|
||||
for: 5m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "磁盘空间不足"
|
||||
description: "服务器 {{ $labels.instance }} 根分区磁盘空间不足 15% (当前值:{{ $value | humanizePercentage }})"
|
||||
|
||||
# CPU 使用率过高告警
|
||||
- alert: HighCPUUsage
|
||||
expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 85
|
||||
for: 10m
|
||||
labels:
|
||||
severity: warning
|
||||
annotations:
|
||||
summary: "CPU 使用率过高"
|
||||
description: "服务器 {{ $labels.instance }} CPU 使用率超过 85% (当前值:{{ $value | humanize }}%)"
|
||||
```
|
||||
|
||||
### 6.2 Grafana 仪表板配置
|
||||
|
||||
#### 6.2.1 应用监控仪表板
|
||||
|
||||
**仪表板 ID**: `gym-manage-overview`
|
||||
|
||||
**主要面板**:
|
||||
1. **应用健康状态**
|
||||
- 应用在线状态
|
||||
- 健康检查状态
|
||||
- 运行时长
|
||||
|
||||
2. **流量指标**
|
||||
- QPS (每秒请求数)
|
||||
- 并发连接数
|
||||
- 网络吞吐量
|
||||
|
||||
3. **响应时间**
|
||||
- 平均响应时间
|
||||
- P95 响应时间
|
||||
- P99 响应时间
|
||||
|
||||
4. **错误率**
|
||||
- HTTP 5xx 错误率
|
||||
- HTTP 4xx 错误率
|
||||
- 业务错误率
|
||||
|
||||
5. **JVM 指标**
|
||||
- 堆内存使用率
|
||||
- 非堆内存使用率
|
||||
- GC 次数和时间
|
||||
- 线程数
|
||||
|
||||
6. **数据库连接池**
|
||||
- 活跃连接数
|
||||
- 空闲连接数
|
||||
- 连接池使用率
|
||||
- 平均获取连接时间
|
||||
|
||||
7. **Redis 缓存**
|
||||
- 缓存命中率
|
||||
- 缓存键数量
|
||||
- 内存使用量
|
||||
- 命令执行时间
|
||||
|
||||
8. **消息队列**
|
||||
- 队列消息数量
|
||||
- 消息生产速率
|
||||
- 消息消费速率
|
||||
- 消息堆积情况
|
||||
|
||||
#### 6.2.2 系统监控仪表板
|
||||
|
||||
**仪表板 ID**: `system-overview`
|
||||
|
||||
**主要面板**:
|
||||
1. **CPU 指标**
|
||||
- CPU 使用率
|
||||
- CPU 负载 (1/5/15 分钟)
|
||||
- CPU 核心数
|
||||
|
||||
2. **内存指标**
|
||||
- 内存使用率
|
||||
- 可用内存
|
||||
- Swap 使用率
|
||||
|
||||
3. **磁盘指标**
|
||||
- 磁盘使用率
|
||||
- 磁盘 I/O
|
||||
- 磁盘读写速率
|
||||
|
||||
4. **网络指标**
|
||||
- 网络流量
|
||||
- 网络连接数
|
||||
- 网络错误率
|
||||
|
||||
### 6.3 告警通知配置
|
||||
|
||||
#### 6.3.1 Alertmanager 配置
|
||||
|
||||
**文件位置**: `monitoring/alertmanager.yml`
|
||||
|
||||
```yaml
|
||||
global:
|
||||
# 邮件配置
|
||||
smtp_smarthost: 'smtp.example.com:587'
|
||||
smtp_from: 'alertmanager@example.com'
|
||||
smtp_auth_username: 'alertmanager@example.com'
|
||||
smtp_auth_password: 'your-password'
|
||||
|
||||
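# 钉钉配置 (注意: Alertmanager 原生不支持 dingtalk_configs, 需部署 prometheus-webhook-dingtalk 并在 receivers 中通过 webhook_configs 转发)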
|
||||
dingtalk_configs:
|
||||
- url: 'https://oapi.dingtalk.com/robot/send?access_token=YOUR_TOKEN'
|
||||
secret: 'YOUR_SECRET'
|
||||
send_resolved: true
|
||||
|
||||
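# 企业微信配置 (注意: global 下仅支持 wechat_api_url / wechat_api_secret / wechat_api_corp_id; wechat_configs 应写在 receivers 中, 密钥字段为 api_secret)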
|
||||
wechat_configs:
|
||||
- corp_id: 'YOUR_CORP_ID'
|
||||
agent_id: 'YOUR_AGENT_ID'
|
||||
secret: 'YOUR_SECRET'
|
||||
to_user: '@all'
|
||||
send_resolved: true
|
||||
|
||||
# 模板配置
|
||||
templates:
|
||||
- '/etc/alertmanager/templates/*.tmpl'
|
||||
|
||||
# 路由配置
|
||||
route:
|
||||
receiver: 'default-receiver'
|
||||
group_by: ['alertname', 'severity']
|
||||
group_wait: 30s
|
||||
group_interval: 5m
|
||||
repeat_interval: 4h
|
||||
routes:
|
||||
# 严重告警立即通知
|
||||
- match:
|
||||
severity: critical
|
||||
receiver: 'critical-receiver'
|
||||
group_wait: 10s
|
||||
repeat_interval: 1h
|
||||
# 警告告警延迟通知
|
||||
- match:
|
||||
severity: warning
|
||||
receiver: 'warning-receiver'
|
||||
group_wait: 5m
|
||||
repeat_interval: 4h
|
||||
|
||||
# 接收器配置
|
||||
receivers:
|
||||
- name: 'default-receiver'
|
||||
email_configs:
|
||||
- to: 'devops-team@example.com'
|
||||
send_resolved: true
|
||||
|
||||
- name: 'critical-receiver'
|
||||
email_configs:
|
||||
- to: 'oncall@example.com'
|
||||
send_resolved: true
|
||||
dingtalk_configs:
|
||||
- send_resolved: true
|
||||
wechat_configs:
|
||||
- send_resolved: true
|
||||
|
||||
- name: 'warning-receiver'
|
||||
email_configs:
|
||||
- to: 'dev-team@example.com'
|
||||
send_resolved: true
|
||||
|
||||
# 抑制规则
|
||||
inhibit_rules:
|
||||
# 如果应用宕机,抑制其他告警
|
||||
- source_match:
|
||||
alertname: 'ApplicationDown'
|
||||
target_match:
|
||||
severity: 'warning'
|
||||
equal: ['instance']
|
||||
```
|
||||
|
||||
#### 6.3.2 告警升级策略
|
||||
|
||||
**升级规则**:
|
||||
1. **P0 级别 (Critical)**
|
||||
- 立即通知:钉钉 + 企业微信 + 短信 + 电话
|
||||
- 15 分钟未响应:升级至技术总监
|
||||
- 30 分钟未响应:升级至 CTO
|
||||
|
||||
2. **P1 级别 (Warning)**
|
||||
- 立即通知:钉钉 + 企业微信
|
||||
- 1 小时未响应:升级至部门经理
|
||||
- 2 小时未响应:升级至技术总监
|
||||
|
||||
3. **P2 级别 (Info)**
|
||||
- 工作时间通知:邮件
|
||||
- 24 小时未处理:升级为 Warning
|
||||
|
||||
#### 6.3.3 告警值班安排
|
||||
|
||||
**值班表配置**:
|
||||
```yaml
|
||||
# 工作日值班
|
||||
work_hours:
|
||||
- Monday to Friday: 09:00-18:00
|
||||
|
||||
# 值班人员
|
||||
on_call_schedule:
|
||||
- name: "张三"
|
||||
email: "zhangsan@example.com"
|
||||
phone: "13800138000"
|
||||
schedule: "周一,周三"
|
||||
- name: "李四"
|
||||
email: "lisi@example.com"
|
||||
phone: "13900139000"
|
||||
schedule: "周二,周四"
|
||||
- name: "王五"
|
||||
email: "wangwu@example.com"
|
||||
phone: "13700137000"
|
||||
schedule: "周五"
|
||||
|
||||
# 周末值班
|
||||
weekend_on_call:
|
||||
- name: "值班团队"
|
||||
email: "weekend-team@example.com"
|
||||
phone: "400-xxx-xxxx"
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## 七、备份恢复详细策略
|
||||
|
||||
### 7.1 备份策略
|
||||
|
||||
#### 7.1.1 备份类型
|
||||
|
||||
**全量备份**:
|
||||
- 频率:每日凌晨 2 点
|
||||
- 保留期限:30 天
|
||||
- 备份内容:完整数据库、配置文件
|
||||
|
||||
**增量备份**:
|
||||
- 频率:每小时
|
||||
- 保留期限:7 天
|
||||
- 备份内容:WAL 日志、变更数据
|
||||
|
||||
**差异备份**:
|
||||
- 频率:每 6 小时
|
||||
- 保留期限:7 天
|
||||
- 备份内容:自上次全量备份后的变更
|
||||
|
||||
#### 7.1.2 备份内容
|
||||
|
||||
**数据库备份**:
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# PostgreSQL 全量备份脚本
|
||||
BACKUP_DIR="/backup/postgres"
|
||||
DATE=$(date +%Y%m%d_%H%M%S)
|
||||
DB_NAME="gym_manage"
|
||||
DB_USER="postgres"
|
||||
|
||||
# 创建备份目录
|
||||
mkdir -p ${BACKUP_DIR}
|
||||
|
||||
# 全量备份
|
||||
pg_dump -U ${DB_USER} -h localhost ${DB_NAME} | gzip > ${BACKUP_DIR}/${DB_NAME}_${DATE}.sql.gz
|
||||
|
||||
# 备份 WAL 日志
|
||||
# 配置 postgresql.conf:
|
||||
# wal_level = replica
|
||||
# archive_mode = on
|
||||
# archive_command = 'cp %p /backup/wal/%f'
|
||||
|
||||
# 清理旧备份 (保留 30 天)
|
||||
find ${BACKUP_DIR} -name "*.sql.gz" -mtime +30 -delete
|
||||
```
|
||||
|
||||
**配置文件备份**:
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# 备份应用配置
|
||||
BACKUP_DIR="/backup/config"
|
||||
DATE=$(date +%Y%m%d_%H%M%S)
|
||||
|
||||
# 备份配置文件
|
||||
tar -czf ${BACKUP_DIR}/config_${DATE}.tar.gz application-prod.yml docker-compose.yml nginx/nginx.conf monitoring/prometheus.yml monitoring/alerts.yml
|
||||
|
||||
# 备份环境变量
|
||||
docker-compose exec -T gym-manage env > ${BACKUP_DIR}/env_${DATE}.txt
|
||||
```
|
||||
|
||||
**数据文件备份**:
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# 备份 Redis 数据
|
||||
BACKUP_DIR="/backup/redis"
|
||||
DATE=$(date +%Y%m%d_%H%M%S)
|
||||
|
||||
# 触发 RDB 保存
|
||||
docker-compose exec redis redis-cli BGSAVE
|
||||
|
||||
# 等待保存完成
|
||||
sleep 5
|
||||
|
||||
# 复制 RDB 文件
|
||||
docker cp gym-manage-redis:/data/dump.rdb ${BACKUP_DIR}/dump_${DATE}.rdb
|
||||
|
||||
# 备份 Elasticsearch 数据
|
||||
docker-compose exec -T elasticsearch curl -s -X PUT "http://localhost:9200/_snapshot/backup/gym_manage_${DATE}?wait_for_completion=true"
|
||||
```
|
||||
|
||||
#### 7.1.3 备份验证
|
||||
|
||||
**定期验证**:
|
||||
- 频率:每周日凌晨 3 点
|
||||
- 内容:验证备份文件完整性
|
||||
- 方法:恢复测试
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# 备份验证脚本
|
||||
BACKUP_DIR="/backup/postgres"
|
||||
LATEST_BACKUP=$(ls -t ${BACKUP_DIR}/*.sql.gz | head -1)
|
||||
|
||||
# 验证备份文件完整性
|
||||
if gzip -t ${LATEST_BACKUP}; then
|
||||
echo "备份文件完整: ${LATEST_BACKUP}"
|
||||
else
|
||||
echo "备份文件损坏: ${LATEST_BACKUP}"
|
||||
# 发送告警
|
||||
curl -X POST "https://alert.example.com/backup-failed"
|
||||
fi
|
||||
|
||||
# 恢复测试 (在测试环境)
|
||||
# gunzip -c ${LATEST_BACKUP} | psql -U postgres -h test-db gym_manage_test
|
||||
```
|
||||
|
||||
### 7.2 恢复策略
|
||||
|
||||
#### 7.2.1 恢复优先级
|
||||
|
||||
**P0 - 核心业务恢复** (RTO ≤ 30 分钟):
|
||||
1. 数据库恢复
|
||||
2. 应用服务恢复
|
||||
3. 缓存恢复
|
||||
|
||||
**P1 - 重要业务恢复** (RTO ≤ 2 小时):
|
||||
4. 消息队列恢复
|
||||
5. 搜索引擎恢复
|
||||
6. 日志系统恢复
|
||||
|
||||
**P2 - 辅助业务恢复** (RTO ≤ 4 小时):
|
||||
7. 监控系统恢复
|
||||
8. 报表系统恢复
|
||||
9. 备份系统恢复
|
||||
|
||||
#### 7.2.2 数据库恢复流程
|
||||
|
||||
**完整恢复流程**:
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# 数据库恢复脚本
|
||||
|
||||
BACKUP_FILE=$1
|
||||
DB_NAME="gym_manage"
|
||||
DB_USER="postgres"
|
||||
|
||||
echo "开始恢复数据库..."
|
||||
|
||||
# 1. 停止应用
|
||||
echo "停止应用..."
|
||||
docker-compose stop gym-manage
|
||||
|
||||
# 2. 创建临时数据库
|
||||
echo "创建临时数据库..."
|
||||
docker-compose exec postgres psql -U postgres -c "CREATE DATABASE ${DB_NAME}_restore;"
|
||||
|
||||
# 3. 恢复数据
|
||||
echo "恢复数据..."
|
||||
gunzip -c ${BACKUP_FILE} | docker-compose exec -T postgres psql -U postgres ${DB_NAME}_restore
|
||||
|
||||
# 4. 验证数据
|
||||
echo "验证数据..."
|
||||
docker-compose exec postgres psql -U postgres -d ${DB_NAME}_restore -c "SELECT COUNT(*) FROM members;"
|
||||
|
||||
# 5. 备份当前数据库 (如果有)
|
||||
if docker-compose exec -T postgres psql -U postgres -lqt | cut -d \| -f 1 | grep -w ${DB_NAME}; then
|
||||
echo "备份当前数据库..."
|
||||
docker-compose exec -T postgres pg_dump -U postgres ${DB_NAME} | gzip > /backup/emergency_${DB_NAME}_$(date +%Y%m%d_%H%M%S).sql.gz
|
||||
fi
|
||||
|
||||
# 6. 删除原数据库
|
||||
echo "删除原数据库..."
|
||||
docker-compose exec postgres psql -U postgres -c "DROP DATABASE ${DB_NAME};"
|
||||
|
||||
# 7. 重命名恢复的数据库
|
||||
echo "重命名数据库..."
|
||||
docker-compose exec postgres psql -U postgres -c "ALTER DATABASE ${DB_NAME}_restore RENAME TO ${DB_NAME};"
|
||||
|
||||
# 8. 启动应用
|
||||
echo "启动应用..."
|
||||
docker-compose start gym-manage
|
||||
|
||||
# 9. 验证应用
|
||||
echo "验证应用..."
|
||||
sleep 10
|
||||
curl -f http://localhost:8080/actuator/health
|
||||
|
||||
echo "数据库恢复完成!"
|
||||
```
|
||||
|
||||
#### 7.2.3 应用恢复流程
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# 应用恢复脚本
|
||||
|
||||
echo "开始恢复应用..."
|
||||
|
||||
# 1. 停止应用
|
||||
docker-compose stop gym-manage
|
||||
|
||||
# 2. 清理旧容器
|
||||
docker-compose rm -f gym-manage
|
||||
|
||||
# 3. 拉取最新镜像
|
||||
docker-compose pull gym-manage
|
||||
|
||||
# 4. 恢复配置
|
||||
cp backup/application/application-prod.yml.bak ./config/application-prod.yml
|
||||
|
||||
# 5. 启动应用
|
||||
docker-compose up -d gym-manage
|
||||
|
||||
# 6. 等待启动
|
||||
sleep 30
|
||||
|
||||
# 7. 健康检查
|
||||
curl -f http://localhost:8080/actuator/health || exit 1
|
||||
|
||||
echo "应用恢复完成!"
|
||||
```
|
||||
|
||||
#### 7.2.4 缓存恢复流程
|
||||
|
||||
```bash
|
||||
#!/bin/bash
|
||||
# Redis 恢复脚本
|
||||
|
||||
echo "开始恢复 Redis..."
|
||||
|
||||
# 1. 停止 Redis
|
||||
docker-compose stop redis
|
||||
|
||||
# 2. 清理旧数据
|
||||
docker-compose run --rm redis sh -c 'rm -rf /data/*'
|
||||
|
||||
# 3. 恢复 RDB 文件
|
||||
LATEST_RDB=$(ls -t /backup/redis/dump_*.rdb | head -1)
|
||||
cp ${LATEST_RDB} docker/redis/data/dump.rdb
|
||||
|
||||
# 4. 启动 Redis
|
||||
docker-compose up -d redis
|
||||
|
||||
# 5. 验证
|
||||
docker-compose exec redis redis-cli PING
|
||||
|
||||
echo "Redis 恢复完成!"
|
||||
```
|
||||
|
||||
### 7.3 灾难恢复
|
||||
|
||||
#### 7.3.1 灾难恢复场景
|
||||
|
||||
**场景 1: 单服务器故障**
|
||||
- 恢复时间:RTO ≤ 1 小时
|
||||
- 恢复点:RPO ≤ 15 分钟
|
||||
- 恢复步骤:
|
||||
1. 切换到备用服务器
|
||||
2. 从备份恢复数据
|
||||
3. 更新 DNS 解析
|
||||
4. 验证服务可用性
|
||||
|
||||
**场景 2: 数据中心故障**
|
||||
- 恢复时间:RTO ≤ 4 小时
|
||||
- 恢复点:RPO ≤ 1 小时
|
||||
- 恢复步骤:
|
||||
1. 启用异地灾备中心
|
||||
2. 从异地备份恢复数据
|
||||
3. 切换流量到灾备中心
|
||||
4. 验证服务可用性
|
||||
|
||||
**场景 3: 数据损坏/丢失**
|
||||
- 恢复时间:RTO ≤ 2 小时
|
||||
- 恢复点:RPO ≤ 15 分钟
|
||||
- 恢复步骤:
|
||||
1. 确定数据损坏时间点
|
||||
2. 从损坏前的备份恢复
|
||||
3. 应用增量备份
|
||||
4. 验证数据完整性
|
||||
|
||||
#### 7.3.2 灾难恢复演练
|
||||
|
||||
**演练频率**:
|
||||
- 桌面推演:每月一次
|
||||
- 实战演练:每季度一次
|
||||
- 全链路演练:每半年一次
|
||||
|
||||
**演练内容**:
|
||||
1. 备份恢复验证
|
||||
2. 故障切换验证
|
||||
3. 监控告警验证
|
||||
4. 通讯流程验证
|
||||
5. 文档更新验证
|
||||
|
||||
**演练报告**:
|
||||
- 演练目标
|
||||
- 演练过程
|
||||
- 问题记录
|
||||
- 改进措施
|
||||
- 责任人和时间节点
|
||||
|
||||
---
|
||||
|
||||
## 十、总结
|
||||
|
||||
### 10.1 部署要点
|
||||
|
||||
Reference in New Issue
Block a user