备份与恢复

备份是灾难恢复的基础，本节介绍 Kubernetes 集群和应用的备份策略。

备份策略设计

备份范围

┌─────────────────────────────────────┐
│         集群级别                     │
│  - etcd 数据                        │
│  - 证书和密钥                        │
│  - kubeconfig                       │
└─────────────────────────────────────┘

┌─────────────────────────────────────┐
│         应用级别                     │
│  - Kubernetes 资源定义               │
│  - PersistentVolume 数据            │
│  - ConfigMap/Secret                │
└─────────────────────────────────────┘

┌─────────────────────────────────────┐
│         应用数据                     │
│  - 数据库数据                        │
│  - 文件存储                         │
│  - 对象存储                         │
└─────────────────────────────────────┘

备份频率

# 关键数据
etcd: 每 6 小时 + 重要变更前
应用配置: 每天
数据库: 每天全量 + 每小时增量

# 次要数据
日志: 每周
监控数据: 不备份（可重建）

# 保留策略
每日备份: 保留 7 天
每周备份: 保留 4 周
每月备份: 保留 12 个月

etcd 备份

手动备份

# Take a snapshot with etcdctl. The target path is stored in a variable so
# that the verification step inspects the file that was actually written
# (the file name contains a timestamp).
SNAPSHOT="/backup/etcd-snapshot-$(date +%Y%m%d-%H%M%S).db"
ETCDCTL_API=3 etcdctl snapshot save "$SNAPSHOT" \
  --endpoints=https://127.0.0.1:2379 \
  --cacert=/etc/kubernetes/pki/etcd/ca.crt \
  --cert=/etc/kubernetes/pki/etcd/server.crt \
  --key=/etc/kubernetes/pki/etcd/server.key

# Verify the snapshot that was just created
ETCDCTL_API=3 etcdctl snapshot status "$SNAPSHOT" --write-out=table

# Example output:
# +----------+----------+------------+------------+
# |   HASH   | REVISION | TOTAL KEYS | TOTAL SIZE |
# +----------+----------+------------+------------+
# | 1a2b3c4d |   123456 |      10000 |     100 MB |
# +----------+----------+------------+------------+

自动备份 CronJob

# CronJob that snapshots etcd on a control-plane node every 6 hours,
# optionally uploads the snapshot to S3 and prunes local snapshots
# older than 7 days.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: etcd-backup
  namespace: kube-system
spec:
  schedule: "0 */6 * * *"  # every 6 hours
  successfulJobsHistoryLimit: 3
  failedJobsHistoryLimit: 3
  jobTemplate:
    spec:
      template:
        spec:
          # Host networking so that 127.0.0.1:2379 is the node's own etcd
          # (presumably etcd listens on the control-plane node's loopback).
          hostNetwork: true
          nodeSelector:
            node-role.kubernetes.io/control-plane: ""
          tolerations:
          - key: node-role.kubernetes.io/control-plane
            operator: Exists
            effect: NoSchedule

          containers:
          - name: backup
            # k8s.gcr.io is frozen; registry.k8s.io is the current registry.
            image: registry.k8s.io/etcd:3.5.9-0
            command:
            - /bin/sh
            - -c
            - |
              # Stop at the first failure so that a broken snapshot is never
              # uploaded and old backups are not pruned after a failed run.
              set -eu

              BACKUP_FILE="/backup/etcd-snapshot-$(date +%Y%m%d-%H%M%S).db"

              # Create the snapshot
              ETCDCTL_API=3 etcdctl snapshot save "$BACKUP_FILE" \
                --endpoints=https://127.0.0.1:2379 \
                --cacert=/etc/kubernetes/pki/etcd/ca.crt \
                --cert=/etc/kubernetes/pki/etcd/server.crt \
                --key=/etc/kubernetes/pki/etcd/server.key

              # Verify the snapshot
              ETCDCTL_API=3 etcdctl snapshot status "$BACKUP_FILE"

              # Upload to S3 (optional).
              # NOTE(review): the etcd image presumably does not ship the AWS
              # CLI - confirm, or use an image that bundles etcdctl and aws.
              # The upload is skipped, with a warning, when `aws` is missing.
              if command -v aws >/dev/null 2>&1; then
                aws s3 cp "$BACKUP_FILE" s3://my-k8s-backup/etcd/
              else
                echo "aws CLI not found; skipping S3 upload" >&2
              fi

              # Delete local snapshots older than 7 days
              find /backup -name "etcd-snapshot-*.db" -mtime +7 -delete

            volumeMounts:
            - name: etcd-certs
              mountPath: /etc/kubernetes/pki/etcd
              readOnly: true
            - name: backup
              mountPath: /backup

            # AWS credentials for the optional upload, read from a Secret
            env:
            - name: AWS_ACCESS_KEY_ID
              valueFrom:
                secretKeyRef:
                  name: aws-credentials
                  key: access-key-id
            - name: AWS_SECRET_ACCESS_KEY
              valueFrom:
                secretKeyRef:
                  name: aws-credentials
                  key: secret-access-key

          volumes:
          # etcd client certificates from the node
          - name: etcd-certs
            hostPath:
              path: /etc/kubernetes/pki/etcd
              type: Directory
          # Snapshots are kept on the node under /var/backup/etcd
          - name: backup
            hostPath:
              path: /var/backup/etcd
              type: DirectoryOrCreate

          restartPolicy: OnFailure

etcd 恢复

# Restore etcd from a snapshot on a node where kube-apiserver and etcd are
# managed as systemd services.

# 1. Stop kube-apiserver
systemctl stop kube-apiserver

# 2. Stop etcd
systemctl stop etcd

# 3. Move the current data directory aside (just in case).
#    A timestamped name is used because a plain /var/lib/etcd.backup left
#    over from an earlier restore would make `mv` nest the directory inside it.
mv /var/lib/etcd "/var/lib/etcd.backup-$(date +%Y%m%d-%H%M%S)"

# 4. Restore from the snapshot
ETCDCTL_API=3 etcdctl snapshot restore /backup/etcd-snapshot.db \
  --data-dir=/var/lib/etcd \
  --name=etcd-1 \
  --initial-cluster=etcd-1=https://192.168.1.10:2380 \
  --initial-advertise-peer-urls=https://192.168.1.10:2380

# 5. Fix ownership of the restored data
chown -R etcd:etcd /var/lib/etcd

# 6. Start etcd
systemctl start etcd

# 7. Start kube-apiserver
systemctl start kube-apiserver

# 8. Verify
kubectl get nodes
kubectl get pods --all-namespaces

Velero 备份方案

Velero 是 Kubernetes 集群备份和恢复工具。

安装 Velero

# Fetch the Velero CLI release archive and put the binary on PATH
VELERO_VERSION=v1.12.0
VELERO_PKG="velero-${VELERO_VERSION}-linux-amd64"
wget "https://github.com/vmware-tanzu/velero/releases/download/${VELERO_VERSION}/${VELERO_PKG}.tar.gz"
tar -xvf "${VELERO_PKG}.tar.gz"
sudo mv "${VELERO_PKG}/velero" /usr/local/bin/

# Deploy Velero into the cluster, backed by AWS S3
velero install \
  --provider aws \
  --plugins velero/velero-plugin-for-aws:v1.8.0 \
  --bucket my-k8s-backup \
  --backup-location-config region=us-west-2 \
  --snapshot-location-config region=us-west-2 \
  --secret-file ./credentials-velero

# Expected contents of ./credentials-velero:
# [default]
# aws_access_key_id=YOUR_ACCESS_KEY
# aws_secret_access_key=YOUR_SECRET_KEY

创建备份

# Whole-cluster backup
velero backup create full-cluster-backup

# Single namespace
velero backup create prod-backup --include-namespaces production

# Selected resource types within a namespace
velero backup create app-backup --include-namespaces production \
  --include-resources deployments,services,configmaps,secrets

# Everything except the listed resource types
velero backup create backup-without-logs --exclude-resources pods,replicasets

# Only objects matching a label selector
velero backup create labeled-backup --selector app=myapp

# Recurring backup driven by a cron expression
velero schedule create daily-backup --schedule="0 2 * * *" \
  --include-namespaces production

恢复备份

# List existing backups
velero backup get

# Restore from a backup by name (here: full-cluster-backup)
velero restore create --from-backup full-cluster-backup

# Restore into a different namespace
velero restore create --from-backup prod-backup \
  --namespace-mappings production:production-restore

# Restore only selected resource types
velero restore create --from-backup full-cluster-backup \
  --include-resources deployments,services

# Inspect restores. <restore-name> is a placeholder: substitute a real
# restore name before running the last two commands.
velero restore get
velero restore describe <restore-name>
velero restore logs <restore-name>

Velero 备份 Hook

# Velero backup hooks: commands executed before and after the backup
# (e.g. for a database consistency check / dump).
apiVersion: v1
kind: Pod
metadata:
  name: database
  annotations:
    # Run before the backup: dump `mydb` to /backup/dump.sql in the
    # `postgres` container
    pre.hook.backup.velero.io/command: '["/bin/bash", "-c", "pg_dump mydb > /backup/dump.sql"]'
    pre.hook.backup.velero.io/container: postgres

    # Run after the backup: remove the dump file again
    post.hook.backup.velero.io/command: '["/bin/bash", "-c", "rm /backup/dump.sql"]'
    post.hook.backup.velero.io/container: postgres
spec:
  # NOTE(review): this spec mounts no volume at /backup, so the dump would
  # land on the container filesystem - presumably a volume that Velero backs
  # up is meant to be mounted there; confirm before using this example.
  containers:
  - name: postgres
    image: postgres:15

应用数据备份

MySQL/PostgreSQL 备份（以下以 MySQL 为例；PostgreSQL 可按相同模式改用 pg_dump）

# CronJob that dumps all MySQL databases every day at 02:00, stores the
# compressed dump on a PVC, uploads it to S3 and prunes dumps older than
# 7 days.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: mysql-backup
  namespace: production
spec:
  schedule: "0 2 * * *"
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: backup
            image: mysql:8.0
            command:
            # bash rather than sh: `pipefail` is required so that a mysqldump
            # failure is not hidden by gzip succeeding.
            # NOTE(review): confirm /bin/bash exists in the image you use.
            - /bin/bash
            - -c
            - |
              set -euo pipefail

              BACKUP_FILE="/backup/mysql-$(date +%Y%m%d-%H%M%S).sql.gz"
              # Dump into a temporary name first so that a truncated dump
              # never looks like a valid backup; it is removed on any exit.
              TMP_FILE="${BACKUP_FILE}.partial"
              trap 'rm -f -- "$TMP_FILE"' EXIT

              # Dump (password quoted so special characters are not
              # word-split or glob-expanded)
              mysqldump -h mysql -u root -p"$MYSQL_ROOT_PASSWORD" \
                --all-databases \
                --single-transaction \
                --quick \
                --lock-tables=false \
                | gzip > "$TMP_FILE"
              mv -- "$TMP_FILE" "$BACKUP_FILE"

              # Upload to S3.
              # NOTE(review): the mysql image presumably does not ship the AWS
              # CLI - confirm, or use an image that bundles both tools.
              # The upload is skipped, with a warning, when `aws` is missing.
              if command -v aws >/dev/null 2>&1; then
                aws s3 cp "$BACKUP_FILE" s3://my-db-backup/mysql/
              else
                echo "aws CLI not found; skipping S3 upload" >&2
              fi

              # Remove local dumps older than 7 days
              find /backup -name "mysql-*.sql.gz" -mtime +7 -delete

            env:
            # Root password comes from a Secret, never from the manifest
            - name: MYSQL_ROOT_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: mysql-secret
                  key: root-password

            volumeMounts:
            - name: backup
              mountPath: /backup

          volumes:
          - name: backup
            persistentVolumeClaim:
              claimName: backup-pvc

          restartPolicy: OnFailure

MongoDB 备份

# CronJob that runs mongodump every day at 02:00, packs the dump into a
# tarball on a PVC, uploads it to S3 and prunes tarballs older than 7 days.
apiVersion: batch/v1
kind: CronJob
metadata:
  name: mongodb-backup
  namespace: production
spec:
  schedule: "0 2 * * *"
  jobTemplate:
    spec:
      template:
        spec:
          containers:
          - name: backup
            image: mongo:6.0
            command:
            - /bin/sh
            - -c
            - |
              # Stop at the first failure so that an incomplete dump is never
              # archived or uploaded.
              set -eu

              BACKUP_DIR="/backup/mongodb-$(date +%Y%m%d-%H%M%S)"
              ARCHIVE="${BACKUP_DIR}.tar.gz"
              # The unpacked dump directory is only an intermediate artifact;
              # remove it on every exit path, including failures.
              trap 'rm -rf -- "$BACKUP_DIR"' EXIT

              # Dump
              mongodump \
                --host=mongodb-0.mongodb:27017 \
                --username=admin \
                --password="$MONGO_PASSWORD" \
                --authenticationDatabase=admin \
                --gzip \
                --out="$BACKUP_DIR"

              # Pack
              tar -czf "$ARCHIVE" "$BACKUP_DIR"

              # Upload to S3.
              # NOTE(review): the mongo image presumably does not ship the AWS
              # CLI - confirm, or use an image that bundles both tools.
              # The upload is skipped, with a warning, when `aws` is missing.
              if command -v aws >/dev/null 2>&1; then
                aws s3 cp "$ARCHIVE" s3://my-db-backup/mongodb/
              else
                echo "aws CLI not found; skipping S3 upload" >&2
              fi

              # Cleanup: the tarball is kept on the PVC (so a failed upload
              # does not leave you without any backup) and is pruned after
              # 7 days.
              find /backup -name "mongodb-*.tar.gz" -mtime +7 -delete

            env:
            # Admin password comes from a Secret, never from the manifest
            - name: MONGO_PASSWORD
              valueFrom:
                secretKeyRef:
                  name: mongodb-secret
                  key: password

            volumeMounts:
            - name: backup
              mountPath: /backup

          volumes:
          - name: backup
            persistentVolumeClaim:
              claimName: backup-pvc

          restartPolicy: OnFailure

PV 数据备份

使用 Velero 备份 PV

# Velero backs up PVs automatically (when cloud storage snapshots are used).
# NOTE(review): I could not match the `velero.io/backup-volume` label below to
# a documented Velero label - verify that it has any effect in your Velero
# version before relying on it to select volumes.
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: myapp-data
  labels:
    velero.io/backup-volume: "true"  # marks the claim as needing backup
spec:
  accessModes:
  - ReadWriteOnce
  storageClassName: fast-ssd
  resources:
    requests:
      storage: 10Gi

手动备份 PV 数据

# One-off Job that archives the contents of the `myapp-data` PVC into a
# dated tarball on the node and optionally copies it off-node with rclone.
apiVersion: batch/v1
kind: Job
metadata:
  name: pv-backup
spec:
  template:
    spec:
      containers:
      - name: backup
        image: alpine:3.19
        command:
        - /bin/sh
        - -c
        - |
          # Stop at the first failure so that a broken archive is not uploaded
          set -eu

          # Install tar
          apk add --no-cache tar

          # Compute the archive name once: calling `date` separately for the
          # archive and the upload could yield two different names around
          # midnight.
          ARCHIVE="/backup/pv-data-$(date +%Y%m%d).tar.gz"

          # Archive the data; -C fails loudly if /data is missing instead of
          # archiving whatever the current directory happens to be.
          tar -czf "$ARCHIVE" -C /data .

          # Upload (only if rclone is available and configured).
          # NOTE(review): alpine presumably does not ship rclone - confirm,
          # or install and configure it in this script or image.
          if command -v rclone >/dev/null 2>&1; then
            rclone copy "$ARCHIVE" s3:my-backup/
          else
            echo "rclone not found; skipping upload" >&2
          fi

        volumeMounts:
        - name: data
          mountPath: /data
          # The backup only reads the application data
          readOnly: true
        - name: backup
          mountPath: /backup

      volumes:
      - name: data
        persistentVolumeClaim:
          claimName: myapp-data
      - name: backup
        hostPath:
          path: /var/backup

      restartPolicy: OnFailure

备份验证

定期恢复测试

#!/bin/bash
# backup-test.sh
#
# Restore drill: restores the most recent backup of the `daily-backup`
# schedule into a scratch namespace, probes the restored application and
# reports the outcome.
#
# Exit status: 0 when the restore and the health probe succeeded, 1 otherwise.
# The scratch namespace is always removed, and removing it no longer decides
# the reported result.

set -uo pipefail

readonly TEST_NS="backup-test"
# Unique per run so that a previous drill's restore object does not collide.
RESTORE_NAME="test-restore-$(date +%Y%m%d%H%M%S)"
readonly RESTORE_NAME
# Assumes Velero runs in its default `velero` namespace - adjust if needed.
readonly VELERO_NS="velero"

# 6. Remove the scratch namespace (runs on every exit path).
cleanup() {
  kubectl delete namespace "$TEST_NS" --ignore-not-found
}
trap cleanup EXIT

#######################################
# Runs the restore drill.
# Globals:   TEST_NS, RESTORE_NAME, VELERO_NS (read)
# Returns:   0 if every step succeeded, 1 on the first failure
#######################################
run_checks() {
  local phase

  # 1. Create the scratch namespace
  kubectl create namespace "$TEST_NS" || return 1

  # 2./3. Restore into the scratch namespace and wait for completion.
  #       `daily-backup` is a schedule name, so --from-schedule is used to
  #       pick its most recent backup; --wait blocks until the restore ends.
  velero restore create "$RESTORE_NAME" \
    --from-schedule daily-backup \
    --namespace-mappings "production:${TEST_NS}" \
    --wait || return 1

  # Check the final phase explicitly instead of relying on the exit status
  # of `velero restore create --wait`.
  phase=$(kubectl get restores.velero.io "$RESTORE_NAME" -n "$VELERO_NS" \
    -o jsonpath='{.status.phase}') || return 1
  if [[ "$phase" != "Completed" ]]; then
    printf 'restore %s ended in phase %s\n' "$RESTORE_NAME" "${phase:-unknown}" >&2
    return 1
  fi

  # 4. Show the restored resources
  kubectl get all -n "$TEST_NS" || return 1

  # 5. Probe the restored application.
  #    -i without -t because the script may run without a TTY;
  #    --restart=Never so the probe runs once and its exit status is reported.
  kubectl run test -n "$TEST_NS" --image=busybox --rm -i --restart=Never -- \
    wget -O- http://myapp-service:8080/health
}

main() {
  local rc=0
  run_checks || rc=1

  # 7. Report the result
  if (( rc == 0 )); then
    echo "✅ 备份验证成功"
  else
    echo "❌ 备份验证失败"
  fi
  return "$rc"
}

main "$@"

灾难恢复演练

完整恢复流程

# Scenario: the cluster is completely lost and must be rebuilt from backups.

# 1. Bootstrap a new cluster
kubeadm init --config kubeadm-config.yaml

# 2. Install the CNI
kubectl apply -f calico.yaml

# 3. Restore etcd (if a snapshot is available).
#    NOTE(review): without --data-dir the restored data is presumably written
#    to ./default.etcd and is not picked up by the running etcd; follow the
#    full procedure from the "etcd 恢复" section (stop etcd, restore into its
#    data directory, restart) - confirm for your deployment.
ETCDCTL_API=3 etcdctl snapshot restore /backup/etcd-snapshot.db

# 4. Install Velero with the same provider plugin, bucket, region and
#    credentials as the original installation, so that the existing backups
#    in the bucket can be read.
velero install \
  --provider aws \
  --plugins velero/velero-plugin-for-aws:v1.8.0 \
  --bucket my-k8s-backup \
  --backup-location-config region=us-west-2 \
  --snapshot-location-config region=us-west-2 \
  --secret-file ./credentials-velero

# 5. Restore all resources. `latest-backup` is a placeholder: pick a real
#    name from `velero backup get`. --wait blocks until the restore has
#    finished, so the verification below looks at the final state.
velero restore create full-restore --from-backup latest-backup --wait

# 6. Verify the restore
kubectl get nodes
kubectl get pods --all-namespaces

# 7. Restore databases
# Depends on the database type in use

# 8. Verify the application
curl http://myapp.example.com/health

# 9. Switch DNS (if needed)

小结

本节介绍了备份与恢复：

✅ 备份策略：范围、频率、保留策略
✅ etcd 备份：手动备份、自动 CronJob、恢复流程
✅ Velero：安装、备份、恢复、定时备份
✅ 应用数据：MySQL、MongoDB、PV 数据备份
✅ 备份验证：定期恢复测试
✅ 灾难恢复：完整恢复流程

下一节：集群升级。