混合云架构实战(阿里云 + IDC)
使用 Terraform 管理混合云环境,统一编排阿里云和 IDC 机房的 Kubernetes 集群与虚拟机资源。
项目背景
业务场景
混合云架构需求:
- 核心业务部署在 IDC 机房(合规性、数据安全)
- 弹性业务部署在阿里云(快速扩展、成本优化)
- 两地 Kubernetes 集群互联互通
- 统一的虚拟机管理和配置
- 跨云的网络连通和安全隔离
架构设计
┌─────────────────────────────────────────────────────────┐
│ Terraform 统一管理 │
└───────────┬─────────────────────────────┬───────────────┘
│ │
┌───────▼────────┐ ┌────────▼────────┐
│ 阿里云区域 │ │ IDC 机房 │
│ │◄─────────►│ │
│ VPC 网络 │ VPN/专线 │ 内网 │
└────────────────┘ └─────────────────┘
│ │
┌───────┴────────┐ ┌────────┴────────┐
│ │ │ │
│ ACK 集群 │ │ 自建 K8s │
│ (3 节点) │ │ (5 节点) │
│ │ │ │
│ ECS VM │ │ 物理机/VM │
│ (业务应用) │ │ (核心服务) │
│ │ │ │
│ RDS/Redis │ │ 数据库集群 │
│ OSS 存储 │ │ NAS 存储 │
└────────────────┘ └─────────────────┘
项目结构
terraform-hybrid-cloud/
├── main.tf # 主配置入口
├── variables.tf # 变量定义
├── outputs.tf # 输出定义
├── terraform.tfvars # 变量值(含敏感信息,切勿提交到版本库)
├── versions.tf # Provider 版本
│
├── modules/
│ ├── alicloud/ # 阿里云模块
│ │ ├── network/ # VPC、交换机、安全组
│ │ ├── ack/ # ACK Kubernetes 集群
│ │ ├── ecs/ # ECS 虚拟机
│ │ ├── rds/ # RDS 数据库
│ │ └── vpn/ # VPN 网关
│ │
│ ├── idc/ # IDC 模块
│ │ ├── vsphere/ # vSphere 虚拟机
│ │ ├── kubernetes/ # K8s 集群配置
│ │ ├── network/ # 网络配置
│ │ └── storage/ # 存储配置
│ │
│ └── shared/ # 共享模块
│ ├── monitoring/ # 监控配置
│ ├── logging/ # 日志配置
│ └── istio/ # 服务网格
│
├── environments/
│ ├── dev/ # 开发环境
│ ├── staging/ # 预发环境
│ └── production/ # 生产环境
│
└── scripts/
├── init.sh # 初始化脚本
├── deploy.sh # 部署脚本
└── destroy.sh # 清理脚本
核心配置
1. Provider 配置
versions.tf
# Terraform core / provider version pins and the remote state backend.
terraform {
  required_version = ">= 1.5"

  required_providers {
    # Alibaba Cloud
    alicloud = {
      version = "~> 1.219"
      source  = "aliyun/alicloud"
    }

    # vSphere (IDC)
    vsphere = {
      version = "~> 2.6"
      source  = "hashicorp/vsphere"
    }

    # Kubernetes
    kubernetes = {
      version = "~> 2.24"
      source  = "hashicorp/kubernetes"
    }

    # Helm
    helm = {
      version = "~> 2.12"
      source  = "hashicorp/helm"
    }

    # Random
    random = {
      version = "~> 3.6"
      source  = "hashicorp/random"
    }
  }

  # Remote state stored in an OSS bucket; the tablestore_* settings point at
  # the Tablestore instance/table named for state locking.
  backend "oss" {
    region  = "cn-hangzhou"
    bucket  = "terraform-state-prod"
    prefix  = "hybrid-cloud"
    key     = "terraform.tfstate"
    acl     = "private"
    encrypt = true

    tablestore_endpoint = "https://terraform-lock.cn-hangzhou.ots.aliyuncs.com"
    tablestore_table    = "terraform_state_lock"
  }
}
# Alibaba Cloud provider; region and credentials are supplied through variables.
provider "alicloud" {
  region     = var.alicloud_region
  access_key = var.alicloud_access_key
  secret_key = var.alicloud_secret_key
}
# vSphere provider for the IDC site.
provider "vsphere" {
  vsphere_server = var.vsphere_server
  user           = var.vsphere_user
  password       = var.vsphere_password

  # NOTE(review): certificate verification is switched off here. Acceptable
  # for a lab vCenter with a self-signed certificate; for production consider
  # trusting the vCenter CA and setting this to false.
  allow_unverified_ssl = true
}
# Kubernetes provider aliased "ack": targets the Alibaba Cloud ACK cluster.
# NOTE(review): all three settings come from outputs of module.alicloud_ack,
# so presumably the cluster must already exist before resources using this
# alias can be planned — confirm the intended apply order.
# NOTE(review): the ACK module in this document feeds cluster_token from the
# cluster's client key, which is not a bearer token — verify the auth method.
provider "kubernetes" {
  alias = "ack"

  host                   = module.alicloud_ack.cluster_endpoint
  token                  = module.alicloud_ack.cluster_token
  cluster_ca_certificate = base64decode(module.alicloud_ack.cluster_ca_cert)
}
# Kubernetes provider aliased "idc": reads an existing kubeconfig file and
# selects the context given by the variables below.
provider "kubernetes" {
  alias = "idc"

  config_context = var.idc_kube_context
  config_path    = var.idc_kubeconfig_path
}
# Helm provider aliased "ack": same connection settings as the "ack"
# Kubernetes provider alias.
provider "helm" {
  alias = "ack"

  kubernetes {
    host                   = module.alicloud_ack.cluster_endpoint
    token                  = module.alicloud_ack.cluster_token
    cluster_ca_certificate = base64decode(module.alicloud_ack.cluster_ca_cert)
  }
}
# Helm provider aliased "idc": uses the IDC kubeconfig file and context.
provider "helm" {
  alias = "idc"

  kubernetes {
    config_context = var.idc_kube_context
    config_path    = var.idc_kubeconfig_path
  }
}
2. 变量定义
variables.tf
# ==================== General variables ====================

# Project name; combined with the environment when composing resource names.
variable "project_name" {
  type        = string
  description = "项目名称"
  default     = "hybrid-cloud"
}

# Deployment environment; no default, and restricted to three known values.
variable "environment" {
  type        = string
  description = "环境名称"

  validation {
    error_message = "环境必须是 dev, staging 或 production"
    condition     = contains(["dev", "staging", "production"], var.environment)
  }
}

# Tags merged into the tags of every module call in main.tf.
variable "tags" {
  type        = map(string)
  description = "通用标签"

  default = {
    Project   = "HybridCloud"
    ManagedBy = "Terraform"
    Team      = "DevOps"
  }
}
# ==================== Alibaba Cloud variables ====================

# Access key for the alicloud provider (sensitive, no default).
variable "alicloud_access_key" {
  type        = string
  sensitive   = true
  description = "阿里云 Access Key"
}

# Secret key for the alicloud provider (sensitive, no default).
variable "alicloud_secret_key" {
  type        = string
  sensitive   = true
  description = "阿里云 Secret Key"
}

# Region used by the alicloud provider.
variable "alicloud_region" {
  type        = string
  description = "阿里云区域"
  default     = "cn-hangzhou"
}

# Availability zones handed to the network module.
variable "alicloud_zones" {
  type        = list(string)
  description = "阿里云可用区"
  default     = ["cn-hangzhou-h", "cn-hangzhou-i", "cn-hangzhou-j"]
}
# CIDR block of the Alibaba Cloud VPC.
# A /16 is used rather than 172.16.0.0/12: the /12 spans 172.16.0.0–172.31.255.255
# and therefore contains the default ACK pod CIDR (172.20.0.0/16) and service
# CIDR (172.21.0.0/20), which must not overlap the VPC range. With a /16,
# cidrsubnet(<vpc>, 8, n) also produces the 172.16.n.0/24 vSwitch ranges that
# main.tf documents.
variable "alicloud_vpc_cidr" {
  description = "阿里云 VPC CIDR"
  type        = string
  default     = "172.16.0.0/16"
}
# Settings for the ACK cluster and its worker pool.
# The root module shown here composes the cluster name from project_name and
# environment and does not read the `name` field.
variable "ack_cluster_config" {
  description = "ACK 集群配置"

  type = object({
    name                 = string
    k8s_version          = string
    pod_cidr             = string
    service_cidr         = string
    worker_instance_type = string
    worker_number        = number
    worker_disk_size     = number
  })

  default = {
    name        = "ack-prod"
    k8s_version = "1.28.3-aliyun.1"

    pod_cidr     = "172.20.0.0/16"
    service_cidr = "172.21.0.0/20"

    worker_instance_type = "ecs.c6.2xlarge"
    worker_number        = 3
    worker_disk_size     = 100
  }
}
# ==================== IDC variables ====================

# vCenter address used by the vsphere provider (no default).
variable "vsphere_server" {
  type        = string
  description = "vSphere 服务器地址"
}

# vCenter user name (sensitive, no default).
variable "vsphere_user" {
  type        = string
  sensitive   = true
  description = "vSphere 用户名"
}

# vCenter password (sensitive, no default).
variable "vsphere_password" {
  type        = string
  sensitive   = true
  description = "vSphere 密码"
}

# Datacenter name passed to the vSphere module.
variable "vsphere_datacenter" {
  type        = string
  description = "vSphere 数据中心"
  default     = "Datacenter"
}

# Compute cluster name passed to the vSphere module.
variable "vsphere_cluster" {
  type        = string
  description = "vSphere 计算集群"
  default     = "Cluster01"
}

# IDC private range; used as the VPN remote subnet and in the RDS whitelist.
variable "idc_network_cidr" {
  type        = string
  description = "IDC 内网 CIDR"
  default     = "10.0.0.0/16"
}

# kubeconfig file read by the "idc" Kubernetes/Helm provider aliases.
variable "idc_kubeconfig_path" {
  type        = string
  description = "IDC K8s kubeconfig 路径"
  default     = "~/.kube/config-idc"
}

# kubeconfig context selected by the "idc" provider aliases.
variable "idc_kube_context" {
  type        = string
  description = "IDC K8s context"
  default     = "kubernetes-idc"
}
# Master node definitions for the IDC cluster; main.tf maps each entry onto a
# vSphere VM (hostname -> name, cpu -> num_cpus, disk -> disk_size, ip -> ip_address).
variable "idc_k8s_masters" {
  description = "IDC K8s Master 节点配置"

  type = list(object({
    hostname = string
    ip       = string
    cpu      = number
    memory   = number
    disk     = number
  }))

  default = [
    { hostname = "k8s-master-01", ip = "10.0.1.11", cpu = 8, memory = 16384, disk = 200 },
    { hostname = "k8s-master-02", ip = "10.0.1.12", cpu = 8, memory = 16384, disk = 200 },
    { hostname = "k8s-master-03", ip = "10.0.1.13", cpu = 8, memory = 16384, disk = 200 },
  ]
}
# Worker node definitions for the IDC cluster; same object shape as the
# master list.
variable "idc_k8s_workers" {
  description = "IDC K8s Worker 节点配置"

  type = list(object({
    hostname = string
    ip       = string
    cpu      = number
    memory   = number
    disk     = number
  }))

  default = [
    { hostname = "k8s-worker-01", ip = "10.0.2.11", cpu = 16, memory = 32768, disk = 500 },
    { hostname = "k8s-worker-02", ip = "10.0.2.12", cpu = 16, memory = 32768, disk = 500 },
  ]
}
# ==================== VPN variables ====================

# VPN switch, pre-shared key and IDC-side gateway address. The whole object is
# marked sensitive because it carries the PSK.
variable "vpn_config" {
  description = "VPN 配置"
  sensitive   = true

  type = object({
    enabled             = bool
    ipsec_psk           = string
    customer_gateway_ip = string
  })

  default = {
    enabled             = true
    ipsec_psk           = "" # set in tfvars
    customer_gateway_ip = "" # public IP of the IDC
  }
}
3. 主配置文件
main.tf
# ==================== Data sources ====================

# Zones in which a VSwitch can be created.
# NOTE(review): nothing in the configuration shown reads this data source;
# the network module receives var.alicloud_zones instead.
data "alicloud_zones" "default" {
  available_resource_creation = "VSwitch"
}
# ==================== Alibaba Cloud network ====================
# Creates the VPC, vSwitches and security group through the network module.
module "alicloud_network" {
  source = "./modules/alicloud/network"

  project_name = var.project_name
  environment  = var.environment
  vpc_cidr     = var.alicloud_vpc_cidr

  # Three /24 vSwitch ranges (<vpc base>.1.0/24, .2.0/24, .3.0/24).
  # newbits is computed from the VPC prefix length so the result is a /24
  # whatever the VPC size is: a fixed newbits of 8 only gives /24 for a /16
  # VPC (with a /12 VPC it gives /20 blocks such as 172.16.16.0/20).
  # Requires a VPC prefix length of 24 or shorter.
  vswitch_cidrs = [
    for idx in [1, 2, 3] :
    cidrsubnet(
      var.alicloud_vpc_cidr,
      24 - tonumber(split("/", var.alicloud_vpc_cidr)[1]),
      idx
    )
  ]

  availability_zones = var.alicloud_zones

  tags = merge(var.tags, {
    Location = "AliCloud"
  })
}
# ==================== Alibaba Cloud ACK cluster ====================
# Managed Kubernetes cluster plus worker node pool.
module "alicloud_ack" {
  source = "./modules/alicloud/ack"

  cluster_name = "${var.project_name}-${var.environment}-ack"
  k8s_version  = var.ack_cluster_config.k8s_version

  # The module's node pool labels nodes with var.environment, so the value
  # has to be handed in; it was missing from the original call.
  environment = var.environment

  vpc_id            = module.alicloud_network.vpc_id
  vswitch_ids       = module.alicloud_network.vswitch_ids
  security_group_id = module.alicloud_network.security_group_id

  pod_cidr     = var.ack_cluster_config.pod_cidr
  service_cidr = var.ack_cluster_config.service_cidr

  worker_instance_type = var.ack_cluster_config.worker_instance_type
  worker_number        = var.ack_cluster_config.worker_number
  worker_disk_size     = var.ack_cluster_config.worker_disk_size

  tags = merge(var.tags, {
    Cluster = "ACK"
  })
}
# ==================== Alibaba Cloud ECS instances ====================
# Two application servers and one middleware server, all placed in the first
# vSwitch.
module "alicloud_ecs" {
  source = "./modules/alicloud/ecs"

  project_name = var.project_name
  environment  = var.environment

  vpc_id            = module.alicloud_network.vpc_id
  vswitch_id        = module.alicloud_network.vswitch_ids[0]
  security_group_id = module.alicloud_network.security_group_id

  instances = [
    { name = "app-server-01", instance_type = "ecs.c6.xlarge", system_disk_size = 100, data_disk_size = 200, role = "application" },
    { name = "app-server-02", instance_type = "ecs.c6.xlarge", system_disk_size = 100, data_disk_size = 200, role = "application" },
    { name = "middleware-server", instance_type = "ecs.g6.2xlarge", system_disk_size = 100, data_disk_size = 500, role = "middleware" },
  ]

  tags = merge(var.tags, {
    Location = "AliCloud"
  })
}
# ==================== Alibaba Cloud RDS ====================
# MySQL 8.0 instance in the first vSwitch.
module "alicloud_rds" {
  source = "./modules/alicloud/rds"

  project_name = var.project_name
  environment  = var.environment

  vpc_id     = module.alicloud_network.vpc_id
  vswitch_id = module.alicloud_network.vswitch_ids[0]

  instance_type  = "mysql.n2.medium.1"
  engine_version = "8.0"
  storage        = 100
  database_name  = "${var.project_name}_${var.environment}"

  # Whitelist: the VPC range plus the IDC range, so the IDC side can connect.
  security_ips = concat(
    [var.alicloud_vpc_cidr],
    [var.idc_network_cidr]
  )

  tags = merge(var.tags, {
    Service = "Database"
  })
}
# ==================== VPN gateway (Alibaba Cloud <-> IDC) ====================
# Instantiated only when var.vpn_config.enabled is true.
module "alicloud_vpn" {
  source = "./modules/alicloud/vpn"
  count  = var.vpn_config.enabled ? 1 : 0

  project_name = var.project_name
  environment  = var.environment
  vpc_id       = module.alicloud_network.vpc_id

  # The module attaches the gateway to var.vswitch_id; the original call did
  # not pass it.
  vswitch_id = module.alicloud_network.vswitch_ids[0]

  # NOTE(review): the module's route entry also reads var.route_table_id. No
  # route-table output of the network module is visible in this document, so
  # it is not wired here — add e.g. `route_table_id = <network module output>`
  # unless the module variable has a default.

  # Gateway bandwidth in Mbit/s. The module passes this straight to the
  # numeric `bandwidth` argument of alicloud_vpn_gateway, so it must be a
  # plain number ("10M" cannot be converted).
  vpn_gateway_spec = 10

  # Customer gateway (IDC side)
  customer_gateway_ip = var.vpn_config.customer_gateway_ip

  # IPsec connection
  ipsec_config = {
    local_subnet  = var.alicloud_vpc_cidr
    remote_subnet = var.idc_network_cidr
    psk           = var.vpn_config.ipsec_psk
  }

  tags = var.tags
}
# ==================== IDC vSphere virtual machines ====================
# K8s master/worker VMs and the core application VMs in the IDC.
module "idc_vsphere" {
  source = "./modules/idc/vsphere"

  vsphere_datacenter = var.vsphere_datacenter
  vsphere_cluster    = var.vsphere_cluster

  # The module derives the guest gateway from var.network_cidr (module-level
  # variable); the original call only set network_cidr inside each VM object,
  # which the module code shown does not read.
  network_cidr = var.idc_network_cidr

  # NOTE(review): the module also reads var.dns_servers, var.datastore_name,
  # var.network_name and var.template_name; none is passed here, so they must
  # have defaults in the module — confirm.

  # K8s master nodes
  master_vms = [
    for vm in var.idc_k8s_masters : {
      name         = vm.hostname
      num_cpus     = vm.cpu
      memory       = vm.memory
      disk_size    = vm.disk
      network_cidr = var.idc_network_cidr
      ip_address   = vm.ip
    }
  ]

  # K8s worker nodes
  worker_vms = [
    for vm in var.idc_k8s_workers : {
      name         = vm.hostname
      num_cpus     = vm.cpu
      memory       = vm.memory
      disk_size    = vm.disk
      network_cidr = var.idc_network_cidr
      ip_address   = vm.ip
    }
  ]

  # Business application VMs
  app_vms = [
    {
      name       = "core-app-01"
      num_cpus   = 16
      memory     = 32768
      disk_size  = 500
      ip_address = "10.0.3.11"
    },
    {
      name       = "core-app-02"
      num_cpus   = 16
      memory     = 32768
      disk_size  = 500
      ip_address = "10.0.3.12"
    }
  ]

  tags = merge(var.tags, {
    Location = "IDC"
  })
}
# ==================== IDC Kubernetes configuration ====================
# Installs the base components on the IDC cluster through the "idc" provider
# aliases.
module "idc_kubernetes" {
  source = "./modules/idc/kubernetes"

  providers = {
    helm       = helm.idc
    kubernetes = kubernetes.idc
  }

  cluster_name = "${var.project_name}-${var.environment}-idc"

  # Base components to install
  install_components = {
    ingress_nginx  = true
    cert_manager   = true
    metrics_server = true
    prometheus     = true
    istio          = true
  }

  # Istio multi-cluster settings
  istio_config = {
    enabled      = true
    cluster_name = "idc-cluster"
    network      = "idc-network"

    # Peer: the Alibaba Cloud ACK cluster
    remote_clusters = [{
      name     = "ack-cluster"
      endpoint = module.alicloud_ack.cluster_endpoint
      ca_cert  = module.alicloud_ack.cluster_ca_cert
    }]
  }

  tags = merge(var.tags, {
    Location = "IDC"
  })
}
# ==================== Alibaba Cloud Kubernetes configuration ====================
# Installs the base components on the ACK cluster through the "ack" provider
# aliases.
module "ack_kubernetes" {
  source = "./modules/alicloud/ack-config"

  providers = {
    kubernetes = kubernetes.ack
    helm       = helm.ack
  }

  cluster_name = module.alicloud_ack.cluster_name

  # Base components to install
  install_components = {
    ingress_nginx  = true
    cert_manager   = true
    metrics_server = true
    prometheus     = true
    istio          = true
  }

  # Istio multi-cluster settings
  istio_config = {
    enabled      = true
    cluster_name = "ack-cluster"
    network      = "alicloud-network"

    # Peer: the IDC cluster
    remote_clusters = [{
      name = "idc-cluster"
      # Pass a full API-server URL, matching the form the IDC side receives
      # for the ACK cluster (an endpoint URL rather than a bare IP).
      # NOTE(review): 6443 is assumed to be the IDC API-server port — confirm.
      endpoint = "https://${var.idc_k8s_masters[0].ip}:6443"
      # NOTE(review): placeholder; must be filled with the IDC cluster CA.
      ca_cert = ""
    }]
  }

  tags = merge(var.tags, {
    Location = "AliCloud"
  })
}
# ==================== Shared monitoring ====================
# Wires the Prometheus endpoints of both clusters into one Grafana.
module "shared_monitoring" {
  source = "./modules/shared/monitoring"

  # Prometheus on the Alibaba Cloud side
  alicloud_prometheus = {
    enabled  = true
    endpoint = module.ack_kubernetes.prometheus_endpoint
  }

  # Prometheus on the IDC side
  idc_prometheus = {
    enabled  = true
    endpoint = module.idc_kubernetes.prometheus_endpoint
  }

  # Single Grafana with one data source per cluster
  grafana_config = {
    enabled        = true
    admin_password = random_password.grafana_password.result

    datasources = [
      { name = "AliCloud Prometheus", type = "prometheus", url = module.ack_kubernetes.prometheus_endpoint },
      { name = "IDC Prometheus", type = "prometheus", url = module.idc_kubernetes.prometheus_endpoint },
    ]
  }
}
# ==================== Random password ====================
# 16-character Grafana admin password, special characters allowed.
resource "random_password" "grafana_password" {
  special = true
  length  = 16
}
阿里云模块详解
ACK 集群模块
modules/alicloud/ack/main.tf
# Managed ACK cluster (control plane, add-ons, NAT gateway, public API SLB).
resource "alicloud_cs_managed_kubernetes" "main" {
  name               = var.cluster_name
  version            = var.k8s_version
  worker_vswitch_ids = var.vswitch_ids

  new_nat_gateway      = true
  slb_internet_enabled = true

  # NOTE(review): the terway-eniip add-on below allocates pod IPs from
  # vSwitches; confirm whether pod_cidr is the right setting for Terway or
  # whether pod vSwitch IDs should be supplied instead.
  pod_cidr     = var.pod_cidr
  service_cidr = var.service_cidr

  # Control-plane logs
  control_plane_log_ttl        = 30
  control_plane_log_components = ["apiserver", "kcm", "scheduler"]

  # Cluster add-ons
  addons {
    name = "terway-eniip"
  }

  addons {
    name = "csi-plugin"
  }

  addons {
    name = "csi-provisioner"
  }

  addons {
    name = "logtail-ds"
    config = jsonencode({
      IngressDashboardEnabled = "true"
      sls_project_name        = "${var.cluster_name}-logs"
    })
  }

  addons {
    name = "nginx-ingress-controller"
    config = jsonencode({
      IngressSlbNetworkType = "internet"
      IngressSlbSpec        = "slb.s2.small"
    })
  }

  # `tags` is a map argument on this resource, not a repeatable key/value
  # block, so the tag map is assigned directly instead of through a dynamic
  # block.
  tags = var.tags
}
# Worker node pool attached to the managed cluster.
resource "alicloud_cs_kubernetes_node_pool" "workers" {
  cluster_id     = alicloud_cs_managed_kubernetes.main.id
  name           = "${var.cluster_name}-workers"
  vswitch_ids    = var.vswitch_ids
  instance_types = [var.worker_instance_type]

  # NOTE(review): desired_size is set together with scaling_config below;
  # confirm the provider accepts both on an auto-scaling pool.
  desired_size = var.worker_number

  # System disk
  system_disk_category = "cloud_essd"
  system_disk_size     = var.worker_disk_size

  # Data disk
  data_disks {
    category  = "cloud_essd"
    size      = 200
    encrypted = true
  }

  # Auto scaling between worker_number and twice that
  scaling_config {
    min_size = var.worker_number
    max_size = var.worker_number * 2
    type     = "cpu"
  }

  # Kubelet settings. Provider schema names are snake_case, so the original
  # `maxPods` is not a valid argument; kubelet_configuration is also declared
  # directly on the node pool rather than inside a `node_config` wrapper.
  kubelet_configuration {
    max_pods = 64
  }

  # Managed node pool: the auto_* switches only take effect when management
  # itself is enabled.
  management {
    enable          = true
    auto_repair     = true
    auto_upgrade    = false
    max_unavailable = 1
  }

  # `labels` is a repeatable key/value block on this resource, not a map.
  labels {
    key   = "node-role"
    value = "worker"
  }

  labels {
    key   = "env"
    value = var.environment
  }

  # NOTE(review): this taint applies to every node of the pool; workloads
  # without a matching toleration will not be scheduled here.
  taints {
    key    = "workload"
    value  = "general"
    effect = "NoSchedule"
  }
}
# Outputs
# `connections` and `certificate_authority` are map attributes on
# alicloud_cs_managed_kubernetes, so they are read by key; indexing them with
# [0] fails.

output "cluster_id" {
  value = alicloud_cs_managed_kubernetes.main.id
}

# Public API-server endpoint of the cluster.
output "cluster_endpoint" {
  value = alicloud_cs_managed_kubernetes.main.connections["api_server_internet"]
}

# Cluster CA certificate; callers in this project run base64decode() on it.
output "cluster_ca_cert" {
  value     = alicloud_cs_managed_kubernetes.main.certificate_authority["cluster_cert"]
  sensitive = true
}

# NOTE(review): kept for existing callers, but this value is the client
# private key, not a bearer token; a Kubernetes provider `token` argument will
# not authenticate with it. Prefer the client certificate/key outputs below
# (provider arguments client_certificate / client_key).
output "cluster_token" {
  value     = alicloud_cs_managed_kubernetes.main.certificate_authority["client_key"]
  sensitive = true
}

# Client certificate for certificate-based authentication.
output "cluster_client_cert" {
  value     = alicloud_cs_managed_kubernetes.main.certificate_authority["client_cert"]
  sensitive = true
}

# Client private key for certificate-based authentication.
output "cluster_client_key" {
  value     = alicloud_cs_managed_kubernetes.main.certificate_authority["client_key"]
  sensitive = true
}
VPN 网关模块
modules/alicloud/vpn/main.tf
# VPN gateway attached to the VPC (pay-as-you-go, SSL-VPN enabled).
resource "alicloud_vpn_gateway" "main" {
  name        = "${var.project_name}-${var.environment}-vpn"
  description = "VPN Gateway for hybrid cloud connectivity"

  vpc_id     = var.vpc_id
  vswitch_id = var.vswitch_id

  # NOTE(review): value is taken verbatim from var.vpn_gateway_spec; the
  # caller has to supply something convertible to a number.
  bandwidth            = var.vpn_gateway_spec
  enable_ssl           = true
  instance_charge_type = "PostPaid"

  tags = var.tags
}
# Customer gateway representing the IDC side of the tunnel.
resource "alicloud_vpn_customer_gateway" "idc" {
  name        = "${var.project_name}-${var.environment}-cgw-idc"
  description = "IDC Customer Gateway"
  ip_address  = var.customer_gateway_ip

  tags = var.tags
}
# IPsec connection between the VPN gateway and the IDC customer gateway.
resource "alicloud_vpn_connection" "idc" {
  name = "${var.project_name}-${var.environment}-ipsec-idc"

  vpn_gateway_id      = alicloud_vpn_gateway.main.id
  customer_gateway_id = alicloud_vpn_customer_gateway.idc.id

  local_subnet       = [var.ipsec_config.local_subnet]
  remote_subnet      = [var.ipsec_config.remote_subnet]
  effect_immediately = true

  # IKE (phase 1)
  ike_config {
    ike_version  = "ikev2"
    ike_mode     = "main"
    ike_enc_alg  = "aes256"
    ike_auth_alg = "sha256"
    ike_pfs      = "group14"
    ike_lifetime = 86400
    psk          = var.ipsec_config.psk
  }

  # IPsec (phase 2)
  ipsec_config {
    ipsec_enc_alg  = "aes256"
    ipsec_auth_alg = "sha256"
    ipsec_pfs      = "group14"
    ipsec_lifetime = 3600
  }

  tags = var.tags
}
# Route entry sending traffic for the IDC subnet to the VPN gateway.
resource "alicloud_route_entry" "to_idc" {
  route_table_id        = var.route_table_id
  destination_cidrblock = var.ipsec_config.remote_subnet

  nexthop_id   = alicloud_vpn_gateway.main.id
  nexthop_type = "VpnGateway"
}

# ID of the VPN gateway.
output "vpn_gateway_id" {
  value = alicloud_vpn_gateway.main.id
}

# internet_ip attribute of the VPN gateway.
output "vpn_gateway_ip" {
  value = alicloud_vpn_gateway.main.internet_ip
}

# ID of the IPsec connection to the IDC.
output "ipsec_connection_id" {
  value = alicloud_vpn_connection.idc.id
}
IDC 模块详解
vSphere 虚拟机模块
modules/idc/vsphere/main.tf
# Lookups of existing vSphere objects, all scoped to the datacenter below.

data "vsphere_datacenter" "dc" {
  name = var.vsphere_datacenter
}

# Compute cluster whose resource pool hosts the VMs.
data "vsphere_compute_cluster" "cluster" {
  datacenter_id = data.vsphere_datacenter.dc.id
  name          = var.vsphere_cluster
}

# Datastore for the VM disks.
data "vsphere_datastore" "datastore" {
  datacenter_id = data.vsphere_datacenter.dc.id
  name          = var.datastore_name
}

# Network the VM interfaces attach to.
data "vsphere_network" "network" {
  datacenter_id = data.vsphere_datacenter.dc.id
  name          = var.network_name
}

# Template VM that the nodes are cloned from.
data "vsphere_virtual_machine" "template" {
  datacenter_id = data.vsphere_datacenter.dc.id
  name          = var.template_name
}
# K8s master nodes: one VM per entry of var.master_vms (keyed by VM name),
# cloned from the template and customised with a static IP.
resource "vsphere_virtual_machine" "k8s_masters" {
  for_each = { for vm in var.master_vms : vm.name => vm }

  name             = each.value.name
  resource_pool_id = data.vsphere_compute_cluster.cluster.resource_pool_id
  datastore_id     = data.vsphere_datastore.datastore.id

  num_cpus = each.value.num_cpus
  memory   = each.value.memory
  guest_id = data.vsphere_virtual_machine.template.guest_id

  network_interface {
    network_id   = data.vsphere_network.network.id
    adapter_type = data.vsphere_virtual_machine.template.network_interface_types[0]
  }

  disk {
    label            = "disk0"
    size             = each.value.disk_size
    thin_provisioned = true
  }

  clone {
    template_uuid = data.vsphere_virtual_machine.template.id

    customize {
      linux_options {
        host_name = each.value.name
        domain    = "idc.local"
      }

      network_interface {
        ipv4_address = each.value.ip_address
        # Prefix length is taken from the same CIDR the gateway is computed
        # from. A hard-coded /24 put e.g. 10.0.1.11 in 10.0.1.0/24 while the
        # gateway (first host of a 10.0.0.0/16 network) is 10.0.0.1, leaving
        # the default gateway outside the interface's subnet.
        ipv4_netmask = tonumber(split("/", var.network_cidr)[1])
      }

      ipv4_gateway    = cidrhost(var.network_cidr, 1)
      dns_server_list = var.dns_servers
    }
  }

  # NOTE(review): the vSphere provider expects tag IDs here (e.g. from
  # vsphere_tag resources); these "key:value" strings are presumably not
  # valid IDs — confirm and replace with real tag references.
  tags = [for k, v in var.tags : "${k}:${v}"]

  # cloud-init metadata passed through guestinfo. The payload is base64
  # encoded, so the matching encoding key has to be set for the guest to
  # decode it.
  extra_config = {
    "guestinfo.metadata" = base64encode(templatefile("${path.module}/cloud-init.yaml", {
      hostname = each.value.name
      role     = "master"
    }))
    "guestinfo.metadata.encoding" = "base64"
  }
}
# K8s worker nodes: one VM per entry of var.worker_vms (keyed by VM name),
# cloned from the template, with an extra 500 GB data disk.
resource "vsphere_virtual_machine" "k8s_workers" {
  for_each = { for vm in var.worker_vms : vm.name => vm }

  name             = each.value.name
  resource_pool_id = data.vsphere_compute_cluster.cluster.resource_pool_id
  datastore_id     = data.vsphere_datastore.datastore.id

  num_cpus = each.value.num_cpus
  memory   = each.value.memory
  guest_id = data.vsphere_virtual_machine.template.guest_id

  network_interface {
    network_id   = data.vsphere_network.network.id
    adapter_type = data.vsphere_virtual_machine.template.network_interface_types[0]
  }

  disk {
    label            = "disk0"
    size             = each.value.disk_size
    thin_provisioned = true
  }

  # Additional data disk
  disk {
    label            = "disk1"
    size             = 500
    unit_number      = 1
    thin_provisioned = true
  }

  clone {
    template_uuid = data.vsphere_virtual_machine.template.id

    customize {
      linux_options {
        host_name = each.value.name
        domain    = "idc.local"
      }

      network_interface {
        ipv4_address = each.value.ip_address
        # Prefix length is taken from the same CIDR the gateway is computed
        # from. A hard-coded /24 put e.g. 10.0.2.11 in 10.0.2.0/24 while the
        # gateway (first host of a 10.0.0.0/16 network) is 10.0.0.1, leaving
        # the default gateway outside the interface's subnet.
        ipv4_netmask = tonumber(split("/", var.network_cidr)[1])
      }

      ipv4_gateway    = cidrhost(var.network_cidr, 1)
      dns_server_list = var.dns_servers
    }
  }

  # NOTE(review): the vSphere provider expects tag IDs here (e.g. from
  # vsphere_tag resources); these "key:value" strings are presumably not
  # valid IDs — confirm and replace with real tag references.
  tags = [for k, v in var.tags : "${k}:${v}"]

  # cloud-init metadata passed through guestinfo. The payload is base64
  # encoded, so the matching encoding key has to be set for the guest to
  # decode it.
  extra_config = {
    "guestinfo.metadata" = base64encode(templatefile("${path.module}/cloud-init.yaml", {
      hostname = each.value.name
      role     = "worker"
    }))
    "guestinfo.metadata.encoding" = "base64"
  }
}
# Outputs: VM name -> default_ip_address, for masters and workers.

output "master_ips" {
  value = {
    for vm_name, vm in vsphere_virtual_machine.k8s_masters :
    vm_name => vm.default_ip_address
  }
}

output "worker_ips" {
  value = {
    for vm_name, vm in vsphere_virtual_machine.k8s_workers :
    vm_name => vm.default_ip_address
  }
}
IDC Kubernetes 配置模块
modules/idc/kubernetes/main.tf
# Nginx ingress controller, installed only when enabled in
# var.install_components. Exposed through NodePort 30080 (http) and
# 30443 (https).
resource "helm_release" "ingress_nginx" {
  count = var.install_components.ingress_nginx ? 1 : 0

  name       = "ingress-nginx"
  chart      = "ingress-nginx"
  repository = "https://kubernetes.github.io/ingress-nginx"
  version    = "4.8.3"

  namespace        = "ingress-nginx"
  create_namespace = true

  values = [
    yamlencode({
      controller = {
        replicaCount = 2

        service = {
          type = "NodePort"
          nodePorts = {
            http  = 30080
            https = 30443
          }
        }

        resources = {
          requests = { cpu = "200m", memory = "256Mi" }
          limits   = { cpu = "1000m", memory = "512Mi" }
        }
      }
    })
  ]
}
# cert-manager, installed (with its CRDs) only when enabled in
# var.install_components.
resource "helm_release" "cert_manager" {
  count = var.install_components.cert_manager ? 1 : 0

  name       = "cert-manager"
  chart      = "cert-manager"
  repository = "https://charts.jetstack.io"
  version    = "v1.13.2"

  namespace        = "cert-manager"
  create_namespace = true

  set {
    name  = "installCRDs"
    value = "true"
  }
}
# Istio "base" chart; creates the istio-system namespace. Installed when
# var.istio_config.enabled is true.
resource "helm_release" "istio_base" {
  count = var.istio_config.enabled ? 1 : 0

  name       = "istio-base"
  chart      = "base"
  repository = "https://istio-release.storage.googleapis.com/charts"
  version    = "1.20.0"

  namespace        = "istio-system"
  create_namespace = true
}
# Istiod control plane, installed after the base chart.
resource "helm_release" "istiod" {
  count = var.istio_config.enabled ? 1 : 0

  name       = "istiod"
  repository = "https://istio-release.storage.googleapis.com/charts"
  chart      = "istiod"
  version    = "1.20.0"
  namespace  = "istio-system"

  depends_on = [helm_release.istio_base]

  values = [
    yamlencode({
      # Cluster and network identity for multi-cluster. These are the chart
      # values Istio's multi-cluster installation uses for the control plane
      # itself; the proxyMetadata entries below only affect the sidecars.
      # NOTE(review): a shared global.meshID is normally set as well; no
      # input for it is visible in this module, so it is left unset.
      global = {
        multiCluster = {
          clusterName = var.istio_config.cluster_name
        }
        network = var.istio_config.network
      }

      meshConfig = {
        defaultConfig = {
          proxyMetadata = {
            ISTIO_META_CLUSTER_ID = var.istio_config.cluster_name
            ISTIO_META_NETWORK    = var.istio_config.network
          }
        }
      }

      pilot = {
        env = {
          EXTERNAL_ISTIOD = "true"
        }
      }
    })
  ]
}
# Istio ingress gateway, installed after istiod and exposed through NodePort.
resource "helm_release" "istio_ingress" {
  count = var.istio_config.enabled ? 1 : 0

  name       = "istio-ingressgateway"
  repository = "https://istio-release.storage.googleapis.com/charts"
  chart      = "gateway"
  version    = "1.20.0"
  namespace  = "istio-system"

  depends_on = [helm_release.istiod]

  values = [
    yamlencode({
      service = {
        type = "NodePort"
        # 31080/31443 are used because the ingress-nginx release in this
        # module already claims node ports 30080/30443; a node port can only
        # be allocated to one Service per cluster, so reusing them makes this
        # release fail when both components are enabled.
        ports = [
          {
            name       = "http2"
            port       = 80
            targetPort = 8080
            nodePort   = 31080
          },
          {
            name       = "https"
            port       = 443
            targetPort = 8443
            nodePort   = 31443
          }
        ]
      }
    })
  ]
}
# kube-prometheus-stack (Prometheus + Grafana), installed only when enabled in
# var.install_components.
resource "helm_release" "prometheus" {
  count = var.install_components.prometheus ? 1 : 0

  name       = "prometheus"
  chart      = "kube-prometheus-stack"
  repository = "https://prometheus-community.github.io/helm-charts"
  version    = "54.0.0"

  namespace        = "monitoring"
  create_namespace = true

  values = [
    yamlencode({
      prometheus = {
        prometheusSpec = {
          retention = "30d"

          # 100Gi ReadWriteOnce volume for the TSDB
          storageSpec = {
            volumeClaimTemplate = {
              spec = {
                accessModes = ["ReadWriteOnce"]
                resources = {
                  requests = { storage = "100Gi" }
                }
              }
            }
          }

          # Labels that identify this cluster
          externalLabels = {
            cluster  = var.cluster_name
            location = "idc"
          }
        }
      }

      grafana = {
        enabled       = true
        adminPassword = var.grafana_admin_password
      }
    })
  ]
}

# In-cluster Prometheus URL, or an empty string when Prometheus is disabled.
output "prometheus_endpoint" {
  value = var.install_components.prometheus ? "http://prometheus-kube-prometheus-prometheus.monitoring:9090" : ""
}
部署流程
1. 初始化配置
terraform.tfvars
# Environment
project_name = "hybrid-cloud"
environment  = "production"

# Alibaba Cloud (placeholders — replace with real credentials)
alicloud_region     = "cn-hangzhou"
alicloud_access_key = "YOUR_ACCESS_KEY"
alicloud_secret_key = "YOUR_SECRET_KEY"

# IDC vSphere (placeholder password)
vsphere_server   = "vcenter.idc.local"
vsphere_user     = "administrator@vsphere.local"
vsphere_password = "YOUR_VSPHERE_PASSWORD"

# VPN
vpn_config = {
  enabled             = true
  ipsec_psk           = "YOUR_STRONG_PSK"
  customer_gateway_ip = "203.0.113.10" # public IP of the IDC
}

# Tags
tags = {
  Project     = "HybridCloud"
  Environment = "Production"
  ManagedBy   = "Terraform"
  Team        = "Platform"
  CostCenter  = "IT"
}
2. 执行部署
#!/bin/bash
# deploy.sh
# Initialise, validate, format, plan and (after interactive confirmation)
# apply the Terraform configuration, then store the cluster kubeconfigs.
set -euo pipefail

# Write a Terraform output to a file without clobbering the existing file on
# failure. A plain `terraform output ... > file` truncates the target before
# Terraform runs, so a missing output would wipe e.g. ~/.kube/config-idc,
# which the "idc" providers read as their input kubeconfig.
#   $1 - Terraform output name
#   $2 - destination path
save_kubeconfig() {
  local output_name="$1" target="$2" tmp
  tmp="$(mktemp "${target}.XXXXXX")"
  if terraform output -raw "$output_name" > "$tmp"; then
    chmod 600 "$tmp" # kubeconfigs carry credentials
    mv "$tmp" "$target"
  else
    rm -f "$tmp"
    return 1
  fi
}

echo "========== 混合云基础设施部署 =========="

# 1. Initialise Terraform
echo "📦 初始化 Terraform..."
terraform init -upgrade

# 2. Validate the configuration
echo "🔍 验证配置..."
terraform validate

# 3. Format the code
echo "✨ 格式化代码..."
terraform fmt -recursive

# 4. Create the execution plan
echo "📋 生成执行计划..."
terraform plan -out=tfplan

# 5. Ask for confirmation
read -r -p "确认部署?(yes/no): " confirm
if [ "$confirm" != "yes" ]; then
  echo "❌ 部署已取消"
  exit 1
fi

# 6. Apply the saved plan
echo "🚀 开始部署..."
terraform apply tfplan

# 7. Show outputs
echo "✅ 部署完成!"
terraform output

# 8. Save the kubeconfigs
echo "💾 保存 Kubernetes 配置..."
mkdir -p ~/.kube
save_kubeconfig ack_kubeconfig ~/.kube/config-ack
save_kubeconfig idc_kubeconfig ~/.kube/config-idc

echo "========== 部署成功 =========="
3. 验证部署
#!/bin/bash
# verify.sh
# Runs connectivity checks against both clusters and the VPN link. Every check
# is executed even if an earlier one fails; the script exits non-zero when any
# check failed instead of always reporting success.
set -u

failures=0

# Run a command and count it as a failure if it exits non-zero.
check() {
  "$@" || failures=$((failures + 1))
}

echo "========== 验证混合云环境 =========="

# 1. Alibaba Cloud ACK
echo "🔍 验证阿里云 ACK 集群..."
export KUBECONFIG=~/.kube/config-ack
check kubectl cluster-info
check kubectl get nodes
check kubectl get pods -A

# 2. IDC K8s
echo "🔍 验证 IDC K8s 集群..."
export KUBECONFIG=~/.kube/config-idc
check kubectl cluster-info
check kubectl get nodes
check kubectl get pods -A

# 3. VPN connectivity: ping an IDC private address from an Alibaba Cloud ECS
echo "🔍 验证 VPN 连接..."
if ALICLOUD_ECS_IP=$(terraform output -raw alicloud_ecs_public_ip) &&
  IDC_VM_IP=$(terraform output -raw idc_vm_private_ip); then
  check ssh "root@${ALICLOUD_ECS_IP}" "ping -c 4 ${IDC_VM_IP}"
else
  # One of the Terraform outputs could not be read; skip ssh rather than
  # calling it with an empty host.
  failures=$((failures + 1))
fi

# 4. Cross-cluster service discovery (Istio remote secrets)
echo "🔍 验证 Istio 多集群..."
export KUBECONFIG=~/.kube/config-ack
check kubectl get secret istio-remote-secret-idc-cluster -n istio-system
export KUBECONFIG=~/.kube/config-idc
check kubectl get secret istio-remote-secret-ack-cluster -n istio-system

if [ "$failures" -gt 0 ]; then
  echo "❌ 验证失败: ${failures} 项检查未通过"
  exit 1
fi
echo "✅ 验证完成!"
运维管理
1. 统一监控 Dashboard
Grafana 配置:
# grafana-dashboard.json
{
"dashboard": {
"title": "混合云监控总览",
"panels": [
{
"title": "集群状态",
"targets": [
{
"expr": "up{job=\"kubernetes-nodes\",cluster=\"ack-cluster\"}",
"legendFormat": "阿里云 ACK - {{instance}}"
},
{
"expr": "up{job=\"kubernetes-nodes\",cluster=\"idc-cluster\"}",
"legendFormat": "IDC K8s - {{instance}}"
}
]
},
{
"title": "Pod 数量",
"targets": [
{
"expr": "sum(kube_pod_info{cluster=\"ack-cluster\"}) by (namespace)",
"legendFormat": "ACK - {{namespace}}"
},
{
"expr": "sum(kube_pod_info{cluster=\"idc-cluster\"}) by (namespace)",
"legendFormat": "IDC - {{namespace}}"
}
]
},
{
"title": "跨云流量",
"targets": [
{
"expr": "sum(istio_tcp_sent_bytes_total{source_cluster=\"ack-cluster\",destination_cluster=\"idc-cluster\"})",
"legendFormat": "ACK → IDC"
},
{
"expr": "sum(istio_tcp_sent_bytes_total{source_cluster=\"idc-cluster\",destination_cluster=\"ack-cluster\"})",
"legendFormat": "IDC → ACK"
}
]
}
]
}
}
2. 成本优化
#!/bin/bash
# cost-optimization.sh
# Prints the instance bill for one billing cycle and lists the ECS instances
# tracked in the Terraform state as candidates for an idle check.
#
# Environment:
#   BILLING_CYCLE  billing month as YYYY-MM (default: 2024-01)
BILLING_CYCLE="${BILLING_CYCLE:-2024-01}"

# 1. Analyse resource usage
echo "📊 分析资源使用..."
# Alibaba Cloud bill
aliyun bssopenapi DescribeInstanceBill \
  --BillingCycle "$BILLING_CYCLE" \
  --Granularity MONTHLY

# 2. Identify idle resources
echo "🔍 识别闲置资源..."
# Walk every alicloud_instance address in the state
terraform state list | grep alicloud_instance | while IFS= read -r instance; do
  # Match only the attribute whose name is exactly "id"; `grep "id "` also
  # matched image_id, vswitch_id, etc. and returned several values.
  # -no-color keeps escape sequences out of the parsed text.
  instance_id=$(terraform state show -no-color "$instance" |
    awk '$1 == "id" { gsub(/"/, "", $3); print $3; exit }')
  # Check CPU utilisation (not implemented)
  echo "检查实例: $instance_id"
done

# 3. Scale-in suggestions
echo "💡 缩容建议..."
# Adjust the number of worker nodes according to load
3. 灾难恢复
#!/bin/bash
# disaster-recovery.sh
# Backs up the Terraform state, the Kubernetes resources of both clusters
# (via Velero) and the RDS instance. Stops at the first failing step so a
# partial backup is never reported as complete.
set -euo pipefail

# State files can contain secrets; keep the dump readable by the owner only.
umask 077

# One timestamp for all artefacts of this run
stamp="$(date +%Y%m%d)"

# 1. Back up the Terraform state
echo "💾 备份 Terraform 状态..."
mkdir -p backup # the redirect below fails if the directory is missing
terraform state pull > "backup/terraform.tfstate.${stamp}"

# 2. Back up Kubernetes resources
echo "💾 备份 Kubernetes 资源..."
export KUBECONFIG=~/.kube/config-ack
velero backup create "ack-backup-${stamp}" --include-namespaces '*'
export KUBECONFIG=~/.kube/config-idc
velero backup create "idc-backup-${stamp}" --include-namespaces '*'

# 3. Back up the database
echo "💾 备份数据库..."
rds_instance_id="$(terraform output -raw rds_instance_id)"
aliyun rds CreateBackup \
  --DBInstanceId "$rds_instance_id"

echo "✅ 备份完成!"
最佳实践
1. 安全加固
- 使用 Terraform Cloud 管理敏感变量
- 启用 VPN IPsec 加密
- 配置网络隔离和安全组
- 定期轮换访问密钥
- 启用审计日志
2. 性能优化
- 使用专线替代 VPN(生产环境)
- 启用 CDN 加速
- 合理规划 Pod 和 Service CIDR
- 使用亲和性规则优化调度
- 启用集群自动扩缩容
3. 成本控制
- 使用预留实例降低成本
- 开发/测试环境定时开关机
- 合理配置资源 requests/limits
- 使用 Spot 实例(非核心业务)
- 定期清理未使用资源
小结
混合云架构关键点:
网络互通:
- VPN Gateway 建立连接
- 统一的 CIDR 规划(VPC、Pod、Service 与 IDC 网段互不重叠)
- 安全的 IPsec 加密
统一管理:
- Terraform 统一编排
- Kubernetes 多集群管理
- Istio 服务网格
可观测性:
- 统一监控 (Prometheus + Grafana)
- 集中日志 (ELK/Loki)
- 分布式追踪 (Jaeger)
高可用性:
- 多可用区部署
- 跨云容灾备份
- 自动故障转移
通过 Terraform 实现混合云的基础设施即代码,大大简化了复杂环境的管理和运维!