混合云架构实战(阿里云 + IDC)

使用 Terraform 管理混合云环境,统一编排阿里云和 IDC 机房的 Kubernetes 集群与虚拟机资源。

项目背景

业务场景

混合云架构需求:

  • 核心业务部署在 IDC 机房(合规性、数据安全)
  • 弹性业务部署在阿里云(快速扩展、成本优化)
  • 两地 Kubernetes 集群互联互通
  • 统一的虚拟机管理和配置
  • 跨云的网络连通和安全隔离

架构设计

┌─────────────────────────────────────────────────────────┐
│                    Terraform 统一管理                     │
└───────────┬─────────────────────────────┬───────────────┘
            │                             │
    ┌───────▼────────┐           ┌────────▼────────┐
    │   阿里云区域    │           │   IDC 机房       │
    │                │◄─────────►│                 │
    │  VPC 网络      │  VPN/专线   │  内网           │
    └────────────────┘           └─────────────────┘
            │                             │
    ┌───────┴────────┐           ┌────────┴────────┐
    │                │           │                 │
    │  ACK 集群      │           │  自建 K8s       │
    │  (3 节点)      │           │  (5 节点)       │
    │                │           │                 │
    │  ECS VM        │           │  物理机/VM      │
    │  (业务应用)     │           │  (核心服务)      │
    │                │           │                 │
    │  RDS/Redis     │           │  数据库集群      │
    │  OSS 存储      │           │  NAS 存储       │
    └────────────────┘           └─────────────────┘

项目结构

terraform-hybrid-cloud/
├── main.tf                    # 主配置入口
├── variables.tf               # 变量定义
├── outputs.tf                 # 输出定义
├── terraform.tfvars          # 变量值(含敏感信息,勿提交到版本库)
├── versions.tf               # Provider 版本
│
├── modules/
│   ├── alicloud/             # 阿里云模块
│   │   ├── network/          # VPC、交换机、安全组
│   │   ├── ack/              # ACK Kubernetes 集群
│   │   ├── ack-config/       # ACK 集群内组件配置(Ingress、Istio、监控)
│   │   ├── ecs/              # ECS 虚拟机
│   │   ├── rds/              # RDS 数据库
│   │   └── vpn/              # VPN 网关
│   │
│   ├── idc/                  # IDC 模块
│   │   ├── vsphere/          # vSphere 虚拟机
│   │   ├── kubernetes/       # K8s 集群配置
│   │   ├── network/          # 网络配置
│   │   └── storage/          # 存储配置
│   │
│   └── shared/               # 共享模块
│       ├── monitoring/       # 监控配置
│       ├── logging/          # 日志配置
│       └── istio/            # 服务网格
│
├── environments/
│   ├── dev/                  # 开发环境
│   ├── staging/              # 预发环境
│   └── production/           # 生产环境
│
└── scripts/
    ├── init.sh               # 初始化脚本
    ├── deploy.sh             # 部署脚本
    └── destroy.sh            # 清理脚本

核心配置

1. Provider 配置

versions.tf

# Terraform core settings: minimum CLI version, remote state backend and
# the pinned provider plugins used across both sites.
terraform {
  required_version = ">= 1.5"

  # Remote state lives in an OSS bucket; the Tablestore endpoint/table pair
  # is what the OSS backend uses for state locking.
  backend "oss" {
    region  = "cn-hangzhou"
    bucket  = "terraform-state-prod"
    prefix  = "hybrid-cloud"
    key     = "terraform.tfstate"
    acl     = "private"
    encrypt = true

    tablestore_endpoint = "https://terraform-lock.cn-hangzhou.ots.aliyuncs.com"
    tablestore_table    = "terraform_state_lock"
  }

  required_providers {
    # Alibaba Cloud resources (VPC, ACK, ECS, RDS, VPN)
    alicloud = { source = "aliyun/alicloud", version = "~> 1.219" }

    # vSphere virtual machines in the IDC
    vsphere = { source = "hashicorp/vsphere", version = "~> 2.6" }

    # In-cluster objects and chart releases for both Kubernetes clusters
    kubernetes = { source = "hashicorp/kubernetes", version = "~> 2.24" }
    helm       = { source = "hashicorp/helm", version = "~> 2.12" }

    # Generated secrets (e.g. the Grafana admin password)
    random = { source = "hashicorp/random", version = "~> 3.6" }
  }
}

# Alibaba Cloud provider; credentials come from sensitive input variables.
provider "alicloud" {
  access_key = var.alicloud_access_key
  secret_key = var.alicloud_secret_key
  region     = var.alicloud_region
}

# vSphere provider for the IDC vCenter.
provider "vsphere" {
  user           = var.vsphere_user
  password       = var.vsphere_password
  vsphere_server = var.vsphere_server

  # NOTE(review): TLS verification of vCenter is disabled here. Acceptable for
  # a lab with a self-signed certificate; install the vCenter CA and set this
  # to false before using it in production.
  allow_unverified_ssl = true
}

# Client credentials for the ACK API server.
# The ACK module's "cluster_token" output carries the client private key, which
# is not a bearer token, so passing it as `token` cannot authenticate. The
# cluster issues client certificates instead; fetch them here and use mutual
# TLS for both the kubernetes and helm providers.
data "alicloud_cs_cluster_credential" "ack" {
  cluster_id = module.alicloud_ack.cluster_id
}

locals {
  # Map with base64-encoded cluster_cert / client_cert / client_key entries.
  ack_credentials = data.alicloud_cs_cluster_credential.ack.certificate_authority
}

# Kubernetes provider - Alibaba Cloud ACK
provider "kubernetes" {
  alias                  = "ack"
  host                   = module.alicloud_ack.cluster_endpoint
  cluster_ca_certificate = base64decode(local.ack_credentials.cluster_cert)
  client_certificate     = base64decode(local.ack_credentials.client_cert)
  client_key             = base64decode(local.ack_credentials.client_key)
}

# Kubernetes provider - IDC cluster, authenticated through a local kubeconfig
provider "kubernetes" {
  alias          = "idc"
  config_path    = var.idc_kubeconfig_path
  config_context = var.idc_kube_context
}

# Helm provider - Alibaba Cloud ACK (same credentials as kubernetes.ack)
provider "helm" {
  alias = "ack"
  kubernetes {
    host                   = module.alicloud_ack.cluster_endpoint
    cluster_ca_certificate = base64decode(local.ack_credentials.cluster_cert)
    client_certificate     = base64decode(local.ack_credentials.client_cert)
    client_key             = base64decode(local.ack_credentials.client_key)
  }
}

# Helm provider - IDC cluster
provider "helm" {
  alias = "idc"
  kubernetes {
    config_path    = var.idc_kubeconfig_path
    config_context = var.idc_kube_context
  }
}

2. 变量定义

variables.tf

# ---------- Common variables ----------

# Prefix used when composing resource names in main.tf.
variable "project_name" {
  type        = string
  default     = "hybrid-cloud"
  description = "项目名称"
}

# Deployment stage; no default, so it must always be supplied explicitly.
variable "environment" {
  type        = string
  description = "环境名称"

  validation {
    error_message = "环境必须是 dev, staging 或 production"
    condition     = contains(["dev", "staging", "production"], var.environment)
  }
}

# Base tag set; main.tf merges per-module tags on top of it.
variable "tags" {
  type        = map(string)
  description = "通用标签"

  default = {
    ManagedBy = "Terraform"
    Project   = "HybridCloud"
    Team      = "DevOps"
  }
}

# ---------- Alibaba Cloud variables ----------

# API credentials for the alicloud provider.
variable "alicloud_access_key" {
  description = "阿里云 Access Key"
  type        = string
  sensitive   = true
}

variable "alicloud_secret_key" {
  description = "阿里云 Secret Key"
  type        = string
  sensitive   = true
}

variable "alicloud_region" {
  description = "阿里云区域"
  type        = string
  default     = "cn-hangzhou"
}

# Zones handed to the network module, one vSwitch per zone.
variable "alicloud_zones" {
  description = "阿里云可用区"
  type        = list(string)
  default     = ["cn-hangzhou-h", "cn-hangzhou-i", "cn-hangzhou-j"]
}

# VPC address space.
# The previous default (172.16.0.0/12) spans 172.16.0.0-172.31.255.255 and so
# contained the default pod CIDR (172.20.0.0/16) and service CIDR
# (172.21.0.0/20) below. The pod and service ranges of the cluster must not
# overlap the VPC, so the default is narrowed to a /16, which also matches the
# 172.16.N.0/24 vSwitch layout used in main.tf.
variable "alicloud_vpc_cidr" {
  description = "阿里云 VPC CIDR"
  type        = string
  default     = "172.16.0.0/16"
}

# Sizing and addressing of the ACK cluster and its worker pool.
variable "ack_cluster_config" {
  description = "ACK 集群配置"
  type = object({
    name                 = string
    k8s_version          = string
    pod_cidr             = string
    service_cidr         = string
    worker_instance_type = string
    worker_number        = number
    worker_disk_size     = number
  })
  default = {
    name                 = "ack-prod"
    k8s_version          = "1.28.3-aliyun.1"
    pod_cidr             = "172.20.0.0/16"
    service_cidr         = "172.21.0.0/20"
    worker_instance_type = "ecs.c6.2xlarge"
    worker_number        = 3
    worker_disk_size     = 100
  }
}

# ---------- IDC variables ----------

# vCenter connection settings consumed by the vsphere provider.
variable "vsphere_server" {
  type        = string
  description = "vSphere 服务器地址"
}

variable "vsphere_user" {
  type        = string
  sensitive   = true
  description = "vSphere 用户名"
}

variable "vsphere_password" {
  type        = string
  sensitive   = true
  description = "vSphere 密码"
}

# Inventory objects the IDC virtual machines are placed in.
variable "vsphere_datacenter" {
  type        = string
  default     = "Datacenter"
  description = "vSphere 数据中心"
}

variable "vsphere_cluster" {
  type        = string
  default     = "Cluster01"
  description = "vSphere 计算集群"
}

# IDC private address space; also used as the remote subnet of the VPN
# and whitelisted on RDS in main.tf.
variable "idc_network_cidr" {
  type        = string
  default     = "10.0.0.0/16"
  description = "IDC 内网 CIDR"
}

# kubeconfig file and context used by the "idc" kubernetes/helm providers.
variable "idc_kubeconfig_path" {
  type        = string
  default     = "~/.kube/config-idc"
  description = "IDC K8s kubeconfig 路径"
}

variable "idc_kube_context" {
  type        = string
  default     = "kubernetes-idc"
  description = "IDC K8s context"
}

# Control-plane nodes of the self-managed IDC cluster (three by default).
variable "idc_k8s_masters" {
  description = "IDC K8s Master 节点配置"

  type = list(object({
    hostname = string
    ip       = string
    cpu      = number
    memory   = number
    disk     = number
  }))

  default = [
    { hostname = "k8s-master-01", ip = "10.0.1.11", cpu = 8, memory = 16384, disk = 200 },
    { hostname = "k8s-master-02", ip = "10.0.1.12", cpu = 8, memory = 16384, disk = 200 },
    { hostname = "k8s-master-03", ip = "10.0.1.13", cpu = 8, memory = 16384, disk = 200 },
  ]
}

# Worker nodes of the self-managed IDC cluster (two by default).
variable "idc_k8s_workers" {
  description = "IDC K8s Worker 节点配置"

  type = list(object({
    hostname = string
    ip       = string
    cpu      = number
    memory   = number
    disk     = number
  }))

  default = [
    { hostname = "k8s-worker-01", ip = "10.0.2.11", cpu = 16, memory = 32768, disk = 500 },
    { hostname = "k8s-worker-02", ip = "10.0.2.12", cpu = 16, memory = 32768, disk = 500 },
  ]
}

# ---------- VPN variables ----------

# Site-to-site VPN settings. The whole object is sensitive because it carries
# the IPsec pre-shared key.
variable "vpn_config" {
  description = "VPN 配置"
  type = object({
    enabled             = bool
    ipsec_psk           = string
    customer_gateway_ip = string
  })
  default = {
    enabled             = true
    ipsec_psk           = "" # set in terraform.tfvars
    customer_gateway_ip = "" # public IP of the IDC gateway
  }
  sensitive = true

  # The defaults leave the PSK and the peer address empty. Fail at plan time
  # rather than creating a tunnel with an empty key or an invalid peer.
  validation {
    condition = !var.vpn_config.enabled || (
      length(var.vpn_config.ipsec_psk) > 0 &&
      length(var.vpn_config.customer_gateway_ip) > 0
    )
    error_message = "启用 VPN 时必须设置 ipsec_psk 和 customer_gateway_ip。"
  }
}

3. 主配置文件

main.tf

# ---------- Data sources ----------
# Zones in the current region where vSwitches can be created.
data "alicloud_zones" "default" {
  available_resource_creation = "VSwitch"
}

# ---------- Alibaba Cloud network ----------
# VPC, vSwitches and security group for everything deployed on Alibaba Cloud.
module "alicloud_network" {
  source = "./modules/alicloud/network"

  project_name = var.project_name
  environment  = var.environment

  vpc_cidr = var.alicloud_vpc_cidr

  # One /24 per zone: <vpc>.1.0/24, <vpc>.2.0/24, <vpc>.3.0/24.
  # cidrsubnet() takes the number of bits to ADD to the prefix, so a fixed
  # value of 8 only yields /24 when the VPC is a /16 (on a /12 it produced
  # /20 blocks such as 172.16.16.0/20). Derive it from the VPC prefix length
  # so the result is a /24 for any VPC size up to /24.
  vswitch_cidrs = [
    for index in [1, 2, 3] : cidrsubnet(
      var.alicloud_vpc_cidr,
      24 - tonumber(split("/", var.alicloud_vpc_cidr)[1]),
      index
    )
  ]

  availability_zones = var.alicloud_zones

  tags = merge(var.tags, {
    Location = "AliCloud"
  })
}

# ---------- Alibaba Cloud ACK cluster ----------
# Managed Kubernetes cluster placed in the VPC created by alicloud_network.
module "alicloud_ack" {
  source = "./modules/alicloud/ack"

  # Identity and version
  cluster_name = "${var.project_name}-${var.environment}-ack"
  k8s_version  = var.ack_cluster_config.k8s_version

  # Networking
  vpc_id            = module.alicloud_network.vpc_id
  vswitch_ids       = module.alicloud_network.vswitch_ids
  security_group_id = module.alicloud_network.security_group_id
  pod_cidr          = var.ack_cluster_config.pod_cidr
  service_cidr      = var.ack_cluster_config.service_cidr

  # Worker pool sizing
  worker_number        = var.ack_cluster_config.worker_number
  worker_instance_type = var.ack_cluster_config.worker_instance_type
  worker_disk_size     = var.ack_cluster_config.worker_disk_size

  tags = merge(var.tags, { Cluster = "ACK" })
}

# ---------- Alibaba Cloud ECS instances ----------
# Two identical application servers plus one larger middleware server, all in
# the first vSwitch.
module "alicloud_ecs" {
  source = "./modules/alicloud/ecs"

  project_name = var.project_name
  environment  = var.environment

  vpc_id            = module.alicloud_network.vpc_id
  vswitch_id        = module.alicloud_network.vswitch_ids[0]
  security_group_id = module.alicloud_network.security_group_id

  instances = concat(
    # Application tier: same shape, only the name differs.
    [
      for server_name in ["app-server-01", "app-server-02"] : {
        name             = server_name
        role             = "application"
        instance_type    = "ecs.c6.xlarge"
        system_disk_size = 100
        data_disk_size   = 200
      }
    ],
    # Middleware tier
    [
      {
        name             = "middleware-server"
        role             = "middleware"
        instance_type    = "ecs.g6.2xlarge"
        system_disk_size = 100
        data_disk_size   = 500
      }
    ]
  )

  tags = merge(var.tags, { Location = "AliCloud" })
}

# ---------- Alibaba Cloud RDS ----------
# MySQL 8.0 instance in the first vSwitch.
module "alicloud_rds" {
  source = "./modules/alicloud/rds"

  project_name = var.project_name
  environment  = var.environment

  vpc_id     = module.alicloud_network.vpc_id
  vswitch_id = module.alicloud_network.vswitch_ids[0]

  engine_version = "8.0"
  instance_type  = "mysql.n2.medium.1"
  storage        = 100

  database_name = "${var.project_name}_${var.environment}"

  # Whitelist both the cloud VPC and the IDC network so on-premises services
  # can reach the database.
  security_ips = [
    var.alicloud_vpc_cidr,
    var.idc_network_cidr,
  ]

  tags = merge(var.tags, { Service = "Database" })
}

# ---------- VPN gateway (Alibaba Cloud <-> IDC) ----------
# Instantiated only when vpn_config.enabled is true.
# NOTE(review): modules/alicloud/vpn reads var.vswitch_id and
# var.route_table_id, neither of which is passed here - confirm the module
# declares defaults for them.
module "alicloud_vpn" {
  count  = var.vpn_config.enabled ? 1 : 0
  source = "./modules/alicloud/vpn"

  project_name = var.project_name
  environment  = var.environment
  tags         = var.tags

  vpc_id = module.alicloud_network.vpc_id

  # Gateway bandwidth specification
  vpn_gateway_spec = "10M"

  # Public address of the IDC-side (customer) gateway
  customer_gateway_ip = var.vpn_config.customer_gateway_ip

  # IPsec tunnel: cloud VPC on the local side, IDC network on the remote side
  ipsec_config = {
    psk           = var.vpn_config.ipsec_psk
    local_subnet  = var.alicloud_vpc_cidr
    remote_subnet = var.idc_network_cidr
  }
}

# ---------- IDC vSphere virtual machines ----------
# Kubernetes nodes are derived from the idc_k8s_* variables; the core
# application VMs are declared inline.
module "idc_vsphere" {
  source = "./modules/idc/vsphere"

  vsphere_datacenter = var.vsphere_datacenter
  vsphere_cluster    = var.vsphere_cluster

  # Kubernetes control-plane nodes
  master_vms = [
    for node in var.idc_k8s_masters : {
      name         = node.hostname
      ip_address   = node.ip
      network_cidr = var.idc_network_cidr
      num_cpus     = node.cpu
      memory       = node.memory
      disk_size    = node.disk
    }
  ]

  # Kubernetes worker nodes
  worker_vms = [
    for node in var.idc_k8s_workers : {
      name         = node.hostname
      ip_address   = node.ip
      network_cidr = var.idc_network_cidr
      num_cpus     = node.cpu
      memory       = node.memory
      disk_size    = node.disk
    }
  ]

  # Core business application VMs: core-app-01/02 on 10.0.3.11/12
  app_vms = [
    for offset, vm_name in ["core-app-01", "core-app-02"] : {
      name       = vm_name
      ip_address = "10.0.3.${11 + offset}"
      num_cpus   = 16
      memory     = 32768
      disk_size  = 500
    }
  ]

  tags = merge(var.tags, { Location = "IDC" })
}

# ---------- IDC Kubernetes configuration ----------
# Installs platform components into the IDC cluster through the "idc"
# provider aliases.
module "idc_kubernetes" {
  source = "./modules/idc/kubernetes"

  providers = {
    helm       = helm.idc
    kubernetes = kubernetes.idc
  }

  cluster_name = "${var.project_name}-${var.environment}-idc"

  tags = merge(var.tags, { Location = "IDC" })

  # Base components to install
  install_components = {
    cert_manager   = true
    ingress_nginx  = true
    istio          = true
    metrics_server = true
    prometheus     = true
  }

  # Istio multi-cluster settings for the IDC side
  istio_config = {
    enabled      = true
    cluster_name = "idc-cluster"
    network      = "idc-network"

    # Peer cluster: Alibaba Cloud ACK
    remote_clusters = [
      {
        name     = "ack-cluster"
        endpoint = module.alicloud_ack.cluster_endpoint
        ca_cert  = module.alicloud_ack.cluster_ca_cert
      }
    ]
  }
}

# ---------- Alibaba Cloud Kubernetes configuration ----------
# Installs platform components into the ACK cluster through the "ack"
# provider aliases.
module "ack_kubernetes" {
  source = "./modules/alicloud/ack-config"

  providers = {
    kubernetes = kubernetes.ack
    helm       = helm.ack
  }

  cluster_name = module.alicloud_ack.cluster_name

  # Base components to install
  install_components = {
    ingress_nginx  = true
    cert_manager   = true
    metrics_server = true
    prometheus     = true
    istio          = true
  }

  # Istio multi-cluster settings for the ACK side
  istio_config = {
    enabled      = true
    cluster_name = "ack-cluster"
    network      = "alicloud-network"

    # Peer cluster: IDC Kubernetes
    remote_clusters = [{
      name = "idc-cluster"

      # The IDC module receives a full API-server URL for its peer, so pass
      # the same shape here instead of a bare IP address.
      # NOTE(review): assumes the IDC API server listens on the default port
      # 6443 of the first master - confirm, or point this at the control-plane
      # VIP/load balancer if one exists.
      endpoint = "https://${var.idc_k8s_masters[0].ip}:6443"

      # TODO: supply the IDC cluster CA certificate; an empty value means the
      # peer's identity cannot be verified.
      ca_cert = ""
    }]
  }

  tags = merge(var.tags, {
    Location = "AliCloud"
  })
}

# ---------- Generated secrets ----------
# Random 16-character admin password for the shared Grafana.
resource "random_password" "grafana_password" {
  special = true
  length  = 16
}

# ---------- Shared monitoring ----------
# One Grafana with a Prometheus data source per cluster.
module "shared_monitoring" {
  source = "./modules/shared/monitoring"

  # Prometheus running in the IDC cluster
  idc_prometheus = {
    enabled  = true
    endpoint = module.idc_kubernetes.prometheus_endpoint
  }

  # Prometheus running in the ACK cluster
  alicloud_prometheus = {
    enabled  = true
    endpoint = module.ack_kubernetes.prometheus_endpoint
  }

  # Unified Grafana
  grafana_config = {
    enabled        = true
    admin_password = random_password.grafana_password.result

    datasources = [
      { name = "AliCloud Prometheus", type = "prometheus", url = module.ack_kubernetes.prometheus_endpoint },
      { name = "IDC Prometheus", type = "prometheus", url = module.idc_kubernetes.prometheus_endpoint },
    ]
  }
}

阿里云模块详解

ACK 集群模块

modules/alicloud/ack/main.tf

# Managed ACK cluster. Worker nodes are defined separately in the node pool.
resource "alicloud_cs_managed_kubernetes" "main" {
  name               = var.cluster_name
  version            = var.k8s_version
  worker_vswitch_ids = var.vswitch_ids
  new_nat_gateway    = true

  # pod_cidr is only honoured by the Flannel network plugin; Terway allocates
  # pod addresses from dedicated pod vSwitches (pod_vswitch_ids) instead.
  # This module exposes pod_cidr, so the network addon below is Flannel.
  pod_cidr     = var.pod_cidr
  service_cidr = var.service_cidr

  # Exposes the API server through a public SLB; the cluster_endpoint output
  # relies on this address.
  slb_internet_enabled = true

  # Control-plane logs: retention (days) and collected components
  control_plane_log_ttl        = 30
  control_plane_log_components = ["apiserver", "kcm", "scheduler"]

  # Cluster addons
  addons {
    name = "flannel"
  }

  addons {
    name = "csi-plugin"
  }

  addons {
    name = "csi-provisioner"
  }

  addons {
    name = "logtail-ds"
    config = jsonencode({
      IngressDashboardEnabled = "true"
      sls_project_name        = "${var.cluster_name}-logs"
    })
  }

  addons {
    name = "nginx-ingress-controller"
    config = jsonencode({
      IngressSlbNetworkType = "internet"
      IngressSlbSpec        = "slb.s2.small"
    })
  }

  # `tags` is a map argument on this resource, not a nested block, so a
  # dynamic "tags" block is rejected by the schema.
  tags = var.tags
}

# Worker node pool attached to the managed cluster.
resource "alicloud_cs_kubernetes_node_pool" "workers" {
  cluster_id     = alicloud_cs_managed_kubernetes.main.id
  name           = "${var.cluster_name}-workers"
  vswitch_ids    = var.vswitch_ids
  instance_types = [var.worker_instance_type]

  # NOTE(review): desired_size is set together with scaling_config below.
  # Confirm the provider/API accepts both; if auto scaling owns the node
  # count, drop desired_size.
  desired_size = var.worker_number

  # System disk
  system_disk_category = "cloud_essd"
  system_disk_size     = var.worker_disk_size

  # Data disk
  data_disks {
    category  = "cloud_essd"
    size      = 200
    encrypted = true
  }

  # Auto scaling between the configured size and twice that size
  scaling_config {
    min_size = var.worker_number
    max_size = var.worker_number * 2
    type     = "cpu"
  }

  # Kubelet settings. The node pool schema has no node_config block and uses
  # snake_case arguments, so the limit is kubelet_configuration.max_pods.
  kubelet_configuration {
    max_pods = 64
  }

  # Managed node pool behaviour
  management {
    auto_repair     = true
    auto_upgrade    = false
    max_unavailable = 1
  }

  # Node labels are repeated key/value blocks on this resource, not a map.
  labels {
    key   = "node-role"
    value = "worker"
  }

  labels {
    key   = "env"
    value = var.environment
  }

  # NOTE(review): this is the only node pool shown, and a NoSchedule taint
  # keeps every pod without a matching toleration off these nodes. Confirm
  # that the components installed into this cluster tolerate it.
  taints {
    key    = "workload"
    value  = "general"
    effect = "NoSchedule"
  }
}

# Outputs
# `connections` and `certificate_authority` are map attributes on
# alicloud_cs_managed_kubernetes, so they are read by key; indexing them with
# [0] is an invalid index.

output "cluster_id" {
  value = alicloud_cs_managed_kubernetes.main.id
}

# Public API server URL of the cluster.
output "cluster_endpoint" {
  value = alicloud_cs_managed_kubernetes.main.connections.api_server_internet
}

# Base64-encoded cluster CA certificate.
output "cluster_ca_cert" {
  value     = alicloud_cs_managed_kubernetes.main.certificate_authority.cluster_cert
  sensitive = true
}

# Kept for backward compatibility with existing callers.
# NOTE(review): despite its name this is the base64-encoded client private
# key, not a bearer token; it cannot be used as a provider `token`. Prefer the
# client_cert / client_key pair below for client-certificate authentication.
output "cluster_token" {
  value     = alicloud_cs_managed_kubernetes.main.certificate_authority.client_key
  sensitive = true
}

# Base64-encoded client certificate.
output "client_cert" {
  value     = alicloud_cs_managed_kubernetes.main.certificate_authority.client_cert
  sensitive = true
}

# Base64-encoded client private key.
output "client_key" {
  value     = alicloud_cs_managed_kubernetes.main.certificate_authority.client_key
  sensitive = true
}

VPN 网关模块

modules/alicloud/vpn/main.tf

# VPN gateway on the Alibaba Cloud side of the tunnel.
resource "alicloud_vpn_gateway" "main" {
  name   = "${var.project_name}-${var.environment}-vpn"
  vpc_id = var.vpc_id

  # `bandwidth` is a number of Mbit/s, while callers pass a specification such
  # as "10M". Strip the optional unit suffix so "10M", "10" and 10 all work.
  bandwidth = tonumber(trimsuffix(tostring(var.vpn_gateway_spec), "M"))

  enable_ssl           = true
  instance_charge_type = "PostPaid"
  description          = "VPN Gateway for hybrid cloud connectivity"

  vswitch_id = var.vswitch_id

  tags = var.tags
}

# Customer gateway: represents the IDC-side VPN device by its public IP.
resource "alicloud_vpn_customer_gateway" "idc" {
  name        = "${var.project_name}-${var.environment}-cgw-idc"
  description = "IDC Customer Gateway"
  ip_address  = var.customer_gateway_ip
  tags        = var.tags
}

# IPsec connection between the VPN gateway and the IDC customer gateway.
resource "alicloud_vpn_connection" "idc" {
  name = "${var.project_name}-${var.environment}-ipsec-idc"
  tags = var.tags

  vpn_gateway_id      = alicloud_vpn_gateway.main.id
  customer_gateway_id = alicloud_vpn_customer_gateway.idc.id

  # Traffic selectors: one subnet on each side
  local_subnet  = [var.ipsec_config.local_subnet]
  remote_subnet = [var.ipsec_config.remote_subnet]

  effect_immediately = true

  # Phase 1 (IKE)
  ike_config {
    psk          = var.ipsec_config.psk
    ike_version  = "ikev2"
    ike_mode     = "main"
    ike_enc_alg  = "aes256"
    ike_auth_alg = "sha256"
    ike_pfs      = "group14"
    ike_lifetime = 86400
  }

  # Phase 2 (IPsec)
  ipsec_config {
    ipsec_enc_alg  = "aes256"
    ipsec_auth_alg = "sha256"
    ipsec_pfs      = "group14"
    ipsec_lifetime = 3600
  }
}

# Route entry: send traffic for the IDC subnet to the VPN gateway.
resource "alicloud_route_entry" "to_idc" {
  route_table_id        = var.route_table_id
  destination_cidrblock = var.ipsec_config.remote_subnet

  nexthop_id   = alicloud_vpn_gateway.main.id
  nexthop_type = "VpnGateway"
}

# Identifiers and public address exported to the caller.
output "vpn_gateway_id" {
  value = alicloud_vpn_gateway.main.id
}

output "vpn_gateway_ip" {
  value = alicloud_vpn_gateway.main.internet_ip
}

output "ipsec_connection_id" {
  value = alicloud_vpn_connection.idc.id
}

IDC 模块详解

vSphere 虚拟机模块

modules/idc/vsphere/main.tf

# Inventory lookups: every object below is resolved by name inside the
# datacenter named by var.vsphere_datacenter.
data "vsphere_datacenter" "dc" {
  name = var.vsphere_datacenter
}

# Compute cluster whose resource pool hosts the virtual machines
data "vsphere_compute_cluster" "cluster" {
  datacenter_id = data.vsphere_datacenter.dc.id
  name          = var.vsphere_cluster
}

# Datastore holding the virtual disks
data "vsphere_datastore" "datastore" {
  datacenter_id = data.vsphere_datacenter.dc.id
  name          = var.datastore_name
}

# Port group the virtual NICs attach to
data "vsphere_network" "network" {
  datacenter_id = data.vsphere_datacenter.dc.id
  name          = var.network_name
}

# Template the virtual machines are cloned from
data "vsphere_virtual_machine" "template" {
  datacenter_id = data.vsphere_datacenter.dc.id
  name          = var.template_name
}

# Kubernetes control-plane nodes, one VM per entry in var.master_vms (keyed
# by VM name), cloned from the template.
resource "vsphere_virtual_machine" "k8s_masters" {
  for_each = { for vm in var.master_vms : vm.name => vm }

  name             = each.value.name
  resource_pool_id = data.vsphere_compute_cluster.cluster.resource_pool_id
  datastore_id     = data.vsphere_datastore.datastore.id
  num_cpus         = each.value.num_cpus
  memory           = each.value.memory
  guest_id         = data.vsphere_virtual_machine.template.guest_id

  network_interface {
    network_id   = data.vsphere_network.network.id
    adapter_type = data.vsphere_virtual_machine.template.network_interface_types[0]
  }

  disk {
    label            = "disk0"
    size             = each.value.disk_size
    thin_provisioned = true
  }

  clone {
    template_uuid = data.vsphere_virtual_machine.template.id

    customize {
      linux_options {
        host_name = each.value.name
        domain    = "idc.local"
      }

      network_interface {
        ipv4_address = each.value.ip_address

        # Use the prefix length of the network itself. A fixed /24 mask puts
        # the gateway computed below (first host of var.network_cidr, e.g.
        # 10.0.0.1 for 10.0.0.0/16) outside a node's subnet such as
        # 10.0.1.0/24, leaving the VM without a reachable default route.
        ipv4_netmask = tonumber(split("/", var.network_cidr)[1])
      }

      ipv4_gateway    = cidrhost(var.network_cidr, 1)
      dns_server_list = var.dns_servers
    }
  }

  # The `tags` argument of this resource takes IDs of existing vSphere tag
  # objects, not "key:value" strings, so the label map is recorded in the
  # VM notes instead. Attach real tags by passing vsphere_tag IDs.
  annotation = jsonencode(var.tags)

  # cloud-init metadata passed through guestinfo
  extra_config = {
    "guestinfo.metadata" = base64encode(templatefile("${path.module}/cloud-init.yaml", {
      hostname = each.value.name
      role     = "master"
    }))
    # Declares that the payload above is base64; without it the guest reads
    # the value as plain text.
    "guestinfo.metadata.encoding" = "base64"
  }
}

# Kubernetes worker nodes, one VM per entry in var.worker_vms (keyed by VM
# name), cloned from the template with an extra data disk.
resource "vsphere_virtual_machine" "k8s_workers" {
  for_each = { for vm in var.worker_vms : vm.name => vm }

  name             = each.value.name
  resource_pool_id = data.vsphere_compute_cluster.cluster.resource_pool_id
  datastore_id     = data.vsphere_datastore.datastore.id
  num_cpus         = each.value.num_cpus
  memory           = each.value.memory
  guest_id         = data.vsphere_virtual_machine.template.guest_id

  network_interface {
    network_id   = data.vsphere_network.network.id
    adapter_type = data.vsphere_virtual_machine.template.network_interface_types[0]
  }

  disk {
    label            = "disk0"
    size             = each.value.disk_size
    thin_provisioned = true
  }

  # Additional data disk on the second SCSI unit
  disk {
    label            = "disk1"
    size             = 500
    unit_number      = 1
    thin_provisioned = true
  }

  clone {
    template_uuid = data.vsphere_virtual_machine.template.id

    customize {
      linux_options {
        host_name = each.value.name
        domain    = "idc.local"
      }

      network_interface {
        ipv4_address = each.value.ip_address

        # Use the prefix length of the network itself. A fixed /24 mask puts
        # the gateway computed below (first host of var.network_cidr, e.g.
        # 10.0.0.1 for 10.0.0.0/16) outside a node's subnet such as
        # 10.0.2.0/24, leaving the VM without a reachable default route.
        ipv4_netmask = tonumber(split("/", var.network_cidr)[1])
      }

      ipv4_gateway    = cidrhost(var.network_cidr, 1)
      dns_server_list = var.dns_servers
    }
  }

  # The `tags` argument of this resource takes IDs of existing vSphere tag
  # objects, not "key:value" strings, so the label map is recorded in the
  # VM notes instead. Attach real tags by passing vsphere_tag IDs.
  annotation = jsonencode(var.tags)

  # cloud-init metadata passed through guestinfo
  extra_config = {
    "guestinfo.metadata" = base64encode(templatefile("${path.module}/cloud-init.yaml", {
      hostname = each.value.name
      role     = "worker"
    }))
    # Declares that the payload above is base64; without it the guest reads
    # the value as plain text.
    "guestinfo.metadata.encoding" = "base64"
  }
}

# Outputs: map of VM name -> default IP address reported for that VM.
output "master_ips" {
  value = {
    for vm_name, machine in vsphere_virtual_machine.k8s_masters :
    vm_name => machine.default_ip_address
  }
}

output "worker_ips" {
  value = {
    for vm_name, machine in vsphere_virtual_machine.k8s_workers :
    vm_name => machine.default_ip_address
  }
}

IDC Kubernetes 配置模块

modules/idc/kubernetes/main.tf

# NGINX ingress controller, installed only when
# install_components.ingress_nginx is true. Exposed through fixed NodePorts.
resource "helm_release" "ingress_nginx" {
  count = var.install_components.ingress_nginx ? 1 : 0

  name             = "ingress-nginx"
  namespace        = "ingress-nginx"
  create_namespace = true

  repository = "https://kubernetes.github.io/ingress-nginx"
  chart      = "ingress-nginx"
  version    = "4.8.3"

  values = [
    yamlencode({
      controller = {
        replicaCount = 2

        resources = {
          limits   = { cpu = "1000m", memory = "512Mi" }
          requests = { cpu = "200m", memory = "256Mi" }
        }

        service = {
          type      = "NodePort"
          nodePorts = { http = 30080, https = 30443 }
        }
      }
    })
  ]
}

# cert-manager, installed together with its CRDs when
# install_components.cert_manager is true.
resource "helm_release" "cert_manager" {
  count = var.install_components.cert_manager ? 1 : 0

  name             = "cert-manager"
  namespace        = "cert-manager"
  create_namespace = true

  repository = "https://charts.jetstack.io"
  chart      = "cert-manager"
  version    = "v1.13.2"

  set {
    name  = "installCRDs"
    value = "true"
  }
}

# Istio base chart; creates the istio-system namespace that the istiod and
# gateway releases install into.
resource "helm_release" "istio_base" {
  count = var.istio_config.enabled ? 1 : 0

  name             = "istio-base"
  namespace        = "istio-system"
  create_namespace = true

  repository = "https://istio-release.storage.googleapis.com/charts"
  chart      = "base"
  version    = "1.20.0"
}

# Istio control plane (istiod), installed after the base chart.
resource "helm_release" "istiod" {
  count = var.istio_config.enabled ? 1 : 0

  name       = "istiod"
  repository = "https://istio-release.storage.googleapis.com/charts"
  chart      = "istiod"
  version    = "1.20.0"
  namespace  = "istio-system"

  depends_on = [helm_release.istio_base]

  values = [
    yamlencode({
      # Multi-cluster identity. Istio reads the cluster name and network from
      # the chart's global values; proxy metadata alone does not configure
      # istiod itself for a multi-network mesh.
      # NOTE(review): all clusters of one mesh must share the same
      # global.meshID - set it here once the mesh ID is agreed with the peer
      # cluster's configuration.
      global = {
        multiCluster = {
          clusterName = var.istio_config.cluster_name
        }
        network = var.istio_config.network
      }
      meshConfig = {
        defaultConfig = {
          proxyMetadata = {
            ISTIO_META_CLUSTER_ID = var.istio_config.cluster_name
            ISTIO_META_NETWORK    = var.istio_config.network
          }
        }
      }
      pilot = {
        env = {
          EXTERNAL_ISTIOD = "true"
        }
      }
    })
  ]
}

# Istio ingress gateway, installed after istiod and exposed through NodePorts.
resource "helm_release" "istio_ingress" {
  count = var.istio_config.enabled ? 1 : 0

  name       = "istio-ingressgateway"
  repository = "https://istio-release.storage.googleapis.com/charts"
  chart      = "gateway"
  version    = "1.20.0"
  namespace  = "istio-system"

  depends_on = [helm_release.istiod]

  values = [
    yamlencode({
      service = {
        type = "NodePort"
        # A NodePort can be allocated to only one Service per cluster.
        # 30080/30443 are already taken by the ingress-nginx release in this
        # module, so the gateway uses 31080/31443.
        ports = [
          {
            name       = "http2"
            port       = 80
            targetPort = 8080
            nodePort   = 31080
          },
          {
            name       = "https"
            port       = 443
            targetPort = 8443
            nodePort   = 31443
          }
        ]
      }
    })
  ]
}

# kube-prometheus-stack (Prometheus + Grafana), installed only when
# install_components.prometheus is true.
resource "helm_release" "prometheus" {
  count = var.install_components.prometheus ? 1 : 0

  name       = "prometheus"
  repository = "https://prometheus-community.github.io/helm-charts"
  chart      = "kube-prometheus-stack"
  version    = "54.0.0"
  namespace  = "monitoring"

  create_namespace = true

  values = [
    yamlencode({
      prometheus = {
        prometheusSpec = {
          retention = "30d"
          storageSpec = {
            volumeClaimTemplate = {
              spec = {
                accessModes = ["ReadWriteOnce"]
                resources = {
                  requests = {
                    storage = "100Gi"
                  }
                }
              }
            }
          }
          externalLabels = {
            cluster  = var.cluster_name
            location = "idc"
          }
        }
      }
      grafana = {
        enabled = true
      }
    })
  ]

  # The admin password is passed separately so that it is not rendered in the
  # plan diff of `values`.
  set_sensitive {
    name  = "grafana.adminPassword"
    value = var.grafana_admin_password
  }
}

# In-cluster URL of the Prometheus service created by the release above;
# empty when Prometheus is not installed.
output "prometheus_endpoint" {
  value = var.install_components.prometheus ? "http://prometheus-kube-prometheus-prometheus.monitoring:9090" : ""
}

部署流程

1. 初始化配置

terraform.tfvars

# Environment
environment  = "production"
project_name = "hybrid-cloud"

# Alibaba Cloud credentials and region (placeholders - replace before use)
alicloud_region     = "cn-hangzhou"
alicloud_access_key = "YOUR_ACCESS_KEY"
alicloud_secret_key = "YOUR_SECRET_KEY"

# IDC vCenter connection (placeholder password - replace before use)
vsphere_server   = "vcenter.idc.local"
vsphere_user     = "administrator@vsphere.local"
vsphere_password = "YOUR_VSPHERE_PASSWORD"

# Site-to-site VPN
vpn_config = {
  enabled             = true
  customer_gateway_ip = "203.0.113.10" # public IP of the IDC gateway
  ipsec_psk           = "YOUR_STRONG_PSK"
}

# Tags applied to every resource
tags = {
  CostCenter  = "IT"
  Environment = "Production"
  ManagedBy   = "Terraform"
  Project     = "HybridCloud"
  Team        = "Platform"
}

2. 执行部署

#!/bin/bash
# deploy.sh
#
# Interactive deployment: init -> validate -> fmt -> plan -> confirm -> apply,
# then export the kubeconfig of both clusters.

# Stop on errors, on unset variables and on failures inside pipelines.
set -euo pipefail

# The saved plan and the kubeconfig files contain credentials; create every
# file owner-readable only.
umask 077

PLAN_FILE="tfplan"

# Always remove the saved plan, including when the deployment is cancelled
# or fails half-way.
trap 'rm -f "$PLAN_FILE"' EXIT

# Write a Terraform output to a kubeconfig file.
# The value is written to a temporary file first so that a missing output
# never truncates an existing, working kubeconfig.
#   $1 - Terraform output name
#   $2 - destination path
save_kubeconfig() {
    local output_name="$1" target="$2" tmp
    tmp="$(mktemp "${target}.XXXXXX")"
    if terraform output -raw "$output_name" > "$tmp"; then
        mv "$tmp" "$target"
    else
        rm -f "$tmp"
        return 1
    fi
}

echo "========== 混合云基础设施部署 =========="

# 1. Initialise Terraform
echo "📦 初始化 Terraform..."
terraform init -upgrade

# 2. Validate the configuration
echo "🔍 验证配置..."
terraform validate

# 3. Format the code
echo "✨ 格式化代码..."
terraform fmt -recursive

# 4. Create the execution plan
echo "📋 生成执行计划..."
terraform plan -out="$PLAN_FILE"

# 5. Ask for confirmation (-r keeps backslashes in the answer literal)
read -r -p "确认部署?(yes/no): " confirm
if [ "$confirm" != "yes" ]; then
    echo "❌ 部署已取消"
    exit 1
fi

# 6. Apply exactly the plan that was reviewed
echo "🚀 开始部署..."
terraform apply "$PLAN_FILE"

# 7. Show the outputs
echo "✅ 部署完成!"
terraform output

# 8. Save the kubeconfig files
echo "💾 保存 Kubernetes 配置..."
mkdir -p ~/.kube
save_kubeconfig ack_kubeconfig ~/.kube/config-ack
save_kubeconfig idc_kubeconfig ~/.kube/config-idc

echo "========== 部署成功 =========="

3. 验证部署

#!/bin/bash
# verify.sh
#
# Post-deployment checks for both clusters, the VPN tunnel and the Istio
# remote secrets. Every check runs even if an earlier one fails; the script
# exits non-zero when at least one check failed.

# No `set -e` on purpose: failures are counted by check() instead of
# aborting the run.
set -uo pipefail

failures=0

# Run one check command and record, but do not abort on, a failure.
check() {
    if ! "$@"; then
        failures=$((failures + 1))
        echo "❌ 检查失败: $*"
    fi
}

echo "========== 验证混合云环境 =========="

# 1. Alibaba Cloud ACK
echo "🔍 验证阿里云 ACK 集群..."
export KUBECONFIG=~/.kube/config-ack
check kubectl cluster-info
check kubectl get nodes
check kubectl get pods -A

# 2. IDC Kubernetes
echo "🔍 验证 IDC K8s 集群..."
export KUBECONFIG=~/.kube/config-idc
check kubectl cluster-info
check kubectl get nodes
check kubectl get pods -A

# 3. VPN connectivity: ping an IDC private address from an ECS instance
echo "🔍 验证 VPN 连接..."
if ALICLOUD_ECS_IP=$(terraform output -raw alicloud_ecs_public_ip) &&
   IDC_VM_IP=$(terraform output -raw idc_vm_private_ip); then
    check ssh "root@${ALICLOUD_ECS_IP}" "ping -c 4 ${IDC_VM_IP}"
else
    failures=$((failures + 1))
    echo "❌ 检查失败: 无法读取 Terraform 输出 alicloud_ecs_public_ip / idc_vm_private_ip"
fi

# 4. Cross-cluster service discovery: each cluster holds the other's secret
echo "🔍 验证 Istio 多集群..."
export KUBECONFIG=~/.kube/config-ack
check kubectl get secret istio-remote-secret-idc-cluster -n istio-system

export KUBECONFIG=~/.kube/config-idc
check kubectl get secret istio-remote-secret-ack-cluster -n istio-system

if [ "$failures" -gt 0 ]; then
    echo "❌ 验证未通过: ${failures} 项检查失败"
    exit 1
fi

echo "✅ 验证完成!"

运维管理

1. 统一监控 Dashboard

Grafana 配置:

# grafana-dashboard.json
{
  "dashboard": {
    "title": "混合云监控总览",
    "panels": [
      {
        "title": "集群状态",
        "targets": [
          {
            "expr": "up{job=\"kubernetes-nodes\",cluster=\"ack-cluster\"}",
            "legendFormat": "阿里云 ACK - {{instance}}"
          },
          {
            "expr": "up{job=\"kubernetes-nodes\",cluster=\"idc-cluster\"}",
            "legendFormat": "IDC K8s - {{instance}}"
          }
        ]
      },
      {
        "title": "Pod 数量",
        "targets": [
          {
            "expr": "sum(kube_pod_info{cluster=\"ack-cluster\"}) by (namespace)",
            "legendFormat": "ACK - {{namespace}}"
          },
          {
            "expr": "sum(kube_pod_info{cluster=\"idc-cluster\"}) by (namespace)",
            "legendFormat": "IDC - {{namespace}}"
          }
        ]
      },
      {
        "title": "跨云流量",
        "targets": [
          {
            "expr": "sum(rate(istio_tcp_sent_bytes_total{source_cluster=\"ack-cluster\",destination_cluster=\"idc-cluster\"}[5m]))",
            "legendFormat": "ACK → IDC"
          },
          {
            "expr": "sum(rate(istio_tcp_sent_bytes_total{source_cluster=\"idc-cluster\",destination_cluster=\"ack-cluster\"}[5m]))",
            "legendFormat": "IDC → ACK"
          }
        ]
      }
    ]
  }
}

2. 成本优化

#!/bin/bash
# cost-optimization.sh
#
# Usage: cost-optimization.sh [BILLING_CYCLE]
#   BILLING_CYCLE - month to report, as YYYY-MM (default: current month)

set -euo pipefail

# The billing month is a parameter rather than a hard-coded past month.
BILLING_CYCLE="${1:-$(date +%Y-%m)}"

# 1. Analyse resource usage
echo "📊 分析资源使用..."

# Alibaba Cloud bill for the selected month
aliyun bssopenapi DescribeInstanceBill \
  --BillingCycle "$BILLING_CYCLE" \
  --Granularity MONTHLY

# 2. Identify idle resources
echo "🔍 识别闲置资源..."

# Walk every ECS instance tracked in the Terraform state.
# `|| true` keeps an empty match from aborting the script under pipefail.
terraform state list | { grep alicloud_instance || true; } | while read -r instance; do
  # Take the attribute whose name is exactly "id". A plain `grep "id "`
  # also matches image_id, vswitch_id and similar lines and then yields
  # several values.
  instance_id=$(terraform state show -no-color "$instance" \
    | awk '$1 == "id" && $2 == "=" { print $3; exit }' \
    | tr -d '"')
  # CPU utilisation check goes here
  echo "检查实例: $instance_id"
done

# 3. Scale-in suggestions
echo "💡 缩容建议..."
# Adjust the number of worker nodes according to load

3. 灾难恢复

#!/bin/bash
# disaster-recovery.sh
#
# Backs up the Terraform state, the resources of both Kubernetes clusters
# (through velero) and the RDS instance.

# Abort on the first failure so the final success message is only printed
# when every backup step succeeded.
set -euo pipefail

# The state file contains secrets; keep the backup owner-readable only.
umask 077

# Compute the date once so all artefacts of one run share the same suffix,
# even when the run crosses midnight.
STAMP="$(date +%Y%m%d)"
BACKUP_DIR="backup"

# 1. Back up the Terraform state
echo "💾 备份 Terraform 状态..."
mkdir -p "$BACKUP_DIR"
terraform state pull > "${BACKUP_DIR}/terraform.tfstate.${STAMP}"

# 2. Back up Kubernetes resources of both clusters
echo "💾 备份 Kubernetes 资源..."
export KUBECONFIG=~/.kube/config-ack
velero backup create "ack-backup-${STAMP}" --include-namespaces '*'

export KUBECONFIG=~/.kube/config-idc
velero backup create "idc-backup-${STAMP}" --include-namespaces '*'

# 3. Back up the database
echo "💾 备份数据库..."
# Resolve the instance ID first so a missing output stops the script instead
# of calling the API with an empty ID.
rds_instance_id="$(terraform output -raw rds_instance_id)"
aliyun rds CreateBackup \
  --DBInstanceId "$rds_instance_id"

echo "✅ 备份完成!"

最佳实践

1. 安全加固

  • 使用 Terraform Cloud 管理敏感变量
  • 启用 VPN IPsec 加密
  • 配置网络隔离和安全组
  • 定期轮换访问密钥
  • 启用审计日志

2. 性能优化

  • 使用专线替代 VPN(生产环境)
  • 启用 CDN 加速
  • 合理规划 Pod 和 Service CIDR
  • 使用亲和性规则优化调度
  • 启用集群自动扩缩容

3. 成本控制

  • 使用预留实例降低成本
  • 开发/测试环境定时开关机
  • 合理配置资源 requests/limits
  • 使用 Spot 实例(非核心业务)
  • 定期清理未使用资源

小结

混合云架构关键点:

网络互通:

  • VPN Gateway 建立连接
  • 统一的 CIDR 规划
  • 安全的 IPsec 加密

统一管理:

  • Terraform 统一编排
  • Kubernetes 多集群管理
  • Istio 服务网格

可观测性:

  • 统一监控 (Prometheus + Grafana)
  • 集中日志 (ELK/Loki)
  • 分布式追踪 (Jaeger)

高可用性:

  • 多可用区部署
  • 跨云容灾备份
  • 自动故障转移

通过 Terraform 实现混合云的基础设施即代码,大大简化了复杂环境的管理和运维!