Kubernetes Cluster Deployment in Practice
Deploying a production-grade EKS cluster on AWS with Terraform.
1. Project Overview
1.1 Architecture Design
┌─────────────────────────────────────────────┐
│                 AWS Account                 │
│  ┌───────────────────────────────────────┐  │
│  │           VPC (10.0.0.0/16)           │  │
│  │  ┌─────────────┐     ┌─────────────┐  │  │
│  │  │ Public AZ1  │     │ Public AZ2  │  │  │
│  │  │             │     │             │  │  │
│  │  │  ┌───────┐  │     │  ┌───────┐  │  │  │
│  │  │  │  NAT  │  │     │  │  NAT  │  │  │  │
│  │  │  └───────┘  │     │  └───────┘  │  │  │
│  │  └─────────────┘     └─────────────┘  │  │
│  │  ┌─────────────┐     ┌─────────────┐  │  │
│  │  │ Private AZ1 │     │ Private AZ2 │  │  │
│  │  │             │     │             │  │  │
│  │  │  ┌───────┐  │     │  ┌───────┐  │  │  │
│  │  │  │  EKS  │  │     │  │  EKS  │  │  │  │
│  │  │  │ Nodes │  │     │  │ Nodes │  │  │  │
│  │  │  └───────┘  │     │  └───────┘  │  │  │
│  │  └─────────────┘     └─────────────┘  │  │
│  └───────────────────────────────────────┘  │
│                                             │
│  ┌───────────────────────────────────────┐  │
│  │           EKS Control Plane           │  │
│  │            (Managed by AWS)           │  │
│  └───────────────────────────────────────┘  │
└─────────────────────────────────────────────┘
1.2 Project Layout
eks-cluster/
├── main.tf
├── variables.tf
├── outputs.tf
├── versions.tf
├── terraform.tfvars
├── modules/
│   ├── vpc/
│   ├── eks/
│   ├── node-groups/
│   └── addons/
└── examples/
    ├── basic/
    └── production/
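The root variables.tf is not reproduced in full in this chapter. A minimal sketch of the declarations the later sections assume, with types inferred from the terraform.tfvars shown in section 7:

# variables.tf (sketch -- types inferred from terraform.tfvars in section 7)
variable "project_name"        { type = string }
variable "environment"         { type = string }
variable "region"              { type = string }
variable "cluster_name"        { type = string }
variable "cluster_version"     { type = string }
variable "vpc_cidr"            { type = string }
variable "availability_zones"  { type = list(string) }
variable "public_access_cidrs" { type = list(string) }
variable "allowed_cidr_blocks" { type = list(string) }
variable "node_groups"         { type = map(any) } # full object type sketched in section 4
variable "addon_versions"      { type = map(string) }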
2. Network Configuration
2.1 VPC Module
modules/vpc/main.tf:
locals {
  az_count = length(var.availability_zones)
}

# VPC
resource "aws_vpc" "main" {
  cidr_block           = var.vpc_cidr
  enable_dns_hostnames = true
  enable_dns_support   = true

  tags = merge(
    var.tags,
    {
      Name                                        = "${var.cluster_name}-vpc"
      "kubernetes.io/cluster/${var.cluster_name}" = "shared"
    }
  )
}
# Public subnets -- one per AZ, /20s carved out of the VPC CIDR
resource "aws_subnet" "public" {
  count                   = local.az_count
  vpc_id                  = aws_vpc.main.id
  cidr_block              = cidrsubnet(var.vpc_cidr, 4, count.index)
  availability_zone       = var.availability_zones[count.index]
  map_public_ip_on_launch = true

  tags = merge(
    var.tags,
    {
      Name                                        = "${var.cluster_name}-public-${count.index + 1}"
      "kubernetes.io/cluster/${var.cluster_name}" = "shared"
      # Lets Kubernetes place internet-facing load balancers here
      "kubernetes.io/role/elb"                    = "1"
    }
  )
}

# Private subnets -- offset by 4 so they never overlap the public ranges
resource "aws_subnet" "private" {
  count             = local.az_count
  vpc_id            = aws_vpc.main.id
  cidr_block        = cidrsubnet(var.vpc_cidr, 4, count.index + 4)
  availability_zone = var.availability_zones[count.index]

  tags = merge(
    var.tags,
    {
      Name                                        = "${var.cluster_name}-private-${count.index + 1}"
      "kubernetes.io/cluster/${var.cluster_name}" = "shared"
      # Lets Kubernetes place internal load balancers here
      "kubernetes.io/role/internal-elb"           = "1"
    }
  )
}
# Internet Gateway
resource "aws_internet_gateway" "main" {
  vpc_id = aws_vpc.main.id

  tags = merge(
    var.tags,
    {
      Name = "${var.cluster_name}-igw"
    }
  )
}

# NAT gateways -- one per AZ, so a single-AZ outage cannot take down all egress
resource "aws_eip" "nat" {
  count  = local.az_count
  domain = "vpc"

  tags = merge(
    var.tags,
    {
      Name = "${var.cluster_name}-nat-eip-${count.index + 1}"
    }
  )

  depends_on = [aws_internet_gateway.main]
}

resource "aws_nat_gateway" "main" {
  count         = local.az_count
  allocation_id = aws_eip.nat[count.index].id
  subnet_id     = aws_subnet.public[count.index].id

  tags = merge(
    var.tags,
    {
      Name = "${var.cluster_name}-nat-${count.index + 1}"
    }
  )

  depends_on = [aws_internet_gateway.main]
}
# Public route table -- default route to the Internet Gateway
resource "aws_route_table" "public" {
  vpc_id = aws_vpc.main.id

  route {
    cidr_block = "0.0.0.0/0"
    gateway_id = aws_internet_gateway.main.id
  }

  tags = merge(
    var.tags,
    {
      Name = "${var.cluster_name}-public-rt"
    }
  )
}

resource "aws_route_table_association" "public" {
  count          = local.az_count
  subnet_id      = aws_subnet.public[count.index].id
  route_table_id = aws_route_table.public.id
}

# Private route tables -- one per AZ, each pointing at that AZ's NAT gateway
resource "aws_route_table" "private" {
  count  = local.az_count
  vpc_id = aws_vpc.main.id

  route {
    cidr_block     = "0.0.0.0/0"
    nat_gateway_id = aws_nat_gateway.main[count.index].id
  }

  tags = merge(
    var.tags,
    {
      Name = "${var.cluster_name}-private-rt-${count.index + 1}"
    }
  )
}

resource "aws_route_table_association" "private" {
  count          = local.az_count
  subnet_id      = aws_subnet.private[count.index].id
  route_table_id = aws_route_table.private[count.index].id
}
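The root module later references module.vpc.vpc_id and the two subnet ID lists. A minimal modules/vpc/outputs.tf sketch matching those references:

# modules/vpc/outputs.tf (sketch -- matches the references in the root main.tf)
output "vpc_id" {
  value = aws_vpc.main.id
}

output "public_subnet_ids" {
  value = aws_subnet.public[*].id
}

output "private_subnet_ids" {
  value = aws_subnet.private[*].id
}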
3. EKS Cluster
3.1 IAM Roles
modules/eks/iam.tf:
# EKS cluster role -- assumed by the EKS control plane
data "aws_iam_policy_document" "cluster_assume_role" {
  statement {
    effect = "Allow"

    principals {
      type        = "Service"
      identifiers = ["eks.amazonaws.com"]
    }

    actions = ["sts:AssumeRole"]
  }
}

resource "aws_iam_role" "cluster" {
  name               = "${var.cluster_name}-cluster-role"
  assume_role_policy = data.aws_iam_policy_document.cluster_assume_role.json
  tags               = var.tags
}

resource "aws_iam_role_policy_attachment" "cluster_policy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSClusterPolicy"
  role       = aws_iam_role.cluster.name
}

resource "aws_iam_role_policy_attachment" "cluster_vpc_policy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSVPCResourceController"
  role       = aws_iam_role.cluster.name
}
# Node group role -- assumed by the worker node EC2 instances
data "aws_iam_policy_document" "node_assume_role" {
  statement {
    effect = "Allow"

    principals {
      type        = "Service"
      identifiers = ["ec2.amazonaws.com"]
    }

    actions = ["sts:AssumeRole"]
  }
}

resource "aws_iam_role" "node" {
  name               = "${var.cluster_name}-node-role"
  assume_role_policy = data.aws_iam_policy_document.node_assume_role.json
  tags               = var.tags
}

resource "aws_iam_role_policy_attachment" "node_policy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKSWorkerNodePolicy"
  role       = aws_iam_role.node.name
}

resource "aws_iam_role_policy_attachment" "node_cni_policy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEKS_CNI_Policy"
  role       = aws_iam_role.node.name
}

resource "aws_iam_role_policy_attachment" "node_registry_policy" {
  policy_arn = "arn:aws:iam::aws:policy/AmazonEC2ContainerRegistryReadOnly"
  role       = aws_iam_role.node.name
}
# EBS CSI driver role -- IRSA: trusts the cluster's OIDC provider instead of EC2
resource "aws_iam_role" "ebs_csi" {
  name = "${var.cluster_name}-ebs-csi-driver"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect = "Allow"
      Principal = {
        Federated = var.oidc_provider_arn
      }
      Action = "sts:AssumeRoleWithWebIdentity"
      Condition = {
        StringEquals = {
          # Only the EBS CSI controller's service account may assume this role
          "${var.oidc_provider}:sub" = "system:serviceaccount:kube-system:ebs-csi-controller-sa"
          "${var.oidc_provider}:aud" = "sts.amazonaws.com"
        }
      }
    }]
  })

  tags = var.tags
}

resource "aws_iam_role_policy_attachment" "ebs_csi" {
  policy_arn = "arn:aws:iam::aws:policy/service-role/AmazonEBSCSIDriverPolicy"
  role       = aws_iam_role.ebs_csi.name
}
3.2 EKS Cluster
modules/eks/main.tf:
# Cluster security group
resource "aws_security_group" "cluster" {
  name_prefix = "${var.cluster_name}-cluster-"
  description = "EKS cluster security group"
  vpc_id      = var.vpc_id

  egress {
    from_port   = 0
    to_port     = 0
    protocol    = "-1"
    cidr_blocks = ["0.0.0.0/0"]
  }

  tags = merge(
    var.tags,
    {
      Name = "${var.cluster_name}-cluster-sg"
    }
  )

  lifecycle {
    create_before_destroy = true
  }
}

resource "aws_security_group_rule" "cluster_ingress_workstation_https" {
  description       = "Allow workstation to communicate with the cluster API"
  type              = "ingress"
  from_port         = 443
  to_port           = 443
  protocol          = "tcp"
  cidr_blocks       = var.allowed_cidr_blocks
  security_group_id = aws_security_group.cluster.id
}
# EKS cluster
resource "aws_eks_cluster" "main" {
  name     = var.cluster_name
  role_arn = aws_iam_role.cluster.arn
  version  = var.cluster_version

  vpc_config {
    subnet_ids              = var.subnet_ids
    endpoint_private_access = var.endpoint_private_access
    endpoint_public_access  = var.endpoint_public_access
    public_access_cidrs     = var.public_access_cidrs
    security_group_ids      = [aws_security_group.cluster.id]
  }

  enabled_cluster_log_types = var.cluster_log_types

  # Envelope encryption of Kubernetes secrets with a customer-managed KMS key
  encryption_config {
    provider {
      key_arn = var.kms_key_arn
    }
    resources = ["secrets"]
  }

  tags = var.tags

  depends_on = [
    aws_iam_role_policy_attachment.cluster_policy,
    aws_iam_role_policy_attachment.cluster_vpc_policy,
    # The log group must exist first; otherwise EKS creates it implicitly and
    # Terraform's own aws_cloudwatch_log_group then fails with a name conflict.
    aws_cloudwatch_log_group.cluster,
  ]
}
# OIDC provider -- enables IAM Roles for Service Accounts (IRSA)
data "tls_certificate" "cluster" {
  url = aws_eks_cluster.main.identity[0].oidc[0].issuer
}

resource "aws_iam_openid_connect_provider" "cluster" {
  client_id_list  = ["sts.amazonaws.com"]
  thumbprint_list = [data.tls_certificate.cluster.certificates[0].sha1_fingerprint]
  url             = aws_eks_cluster.main.identity[0].oidc[0].issuer
  tags            = var.tags
}
# CloudWatch log group for control plane logs
resource "aws_cloudwatch_log_group" "cluster" {
  name              = "/aws/eks/${var.cluster_name}/cluster"
  retention_in_days = var.log_retention_days
  tags              = var.tags
}
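iam.tf above consumes var.oidc_provider_arn and var.oidc_provider, and the root module reads cluster_name, node_role_arn, and ebs_csi_role_arn from this module. A sketch of modules/eks/outputs.tf covering those references (since iam.tf and main.tf share a module, the two OIDC variables could equally be locals derived directly from the resources below):

# modules/eks/outputs.tf (sketch)
output "cluster_name" {
  value = aws_eks_cluster.main.name
}

output "node_role_arn" {
  value = aws_iam_role.node.arn
}

output "ebs_csi_role_arn" {
  value = aws_iam_role.ebs_csi.arn
}

output "oidc_provider_arn" {
  value = aws_iam_openid_connect_provider.cluster.arn
}

output "oidc_provider" {
  # Issuer URL without the scheme, e.g. oidc.eks.us-west-2.amazonaws.com/id/XXXX
  value = replace(aws_eks_cluster.main.identity[0].oidc[0].issuer, "https://", "")
}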
4. Node Groups
4.1 Managed Node Groups
modules/node-groups/main.tf:
resource "aws_eks_node_group" "main" {
for_each = var.node_groups
cluster_name = var.cluster_name
node_group_name = each.key
node_role_arn = var.node_role_arn
subnet_ids = var.subnet_ids
version = var.cluster_version
scaling_config {
desired_size = each.value.desired_size
max_size = each.value.max_size
min_size = each.value.min_size
}
update_config {
max_unavailable_percentage = 33
}
instance_types = each.value.instance_types
capacity_type = each.value.capacity_type
disk_size = each.value.disk_size
labels = merge(
each.value.labels,
{
"node-group" = each.key
}
)
dynamic "taint" {
for_each = each.value.taints
content {
key = taint.value.key
value = taint.value.value
effect = taint.value.effect
}
}
tags = merge(
var.tags,
each.value.tags,
{
Name = "${var.cluster_name}-${each.key}"
}
)
lifecycle {
create_before_destroy = true
ignore_changes = [scaling_config[0].desired_size]
}
}
Usage example:
module "node_groups" {
source = "./modules/node-groups"
cluster_name = module.eks.cluster_name
cluster_version = var.cluster_version
node_role_arn = module.eks.node_role_arn
subnet_ids = module.vpc.private_subnet_ids
node_groups = {
general = {
desired_size = 2
min_size = 2
max_size = 4
instance_types = ["t3.medium"]
capacity_type = "ON_DEMAND"
disk_size = 50
labels = {
role = "general"
}
taints = []
tags = {}
}
spot = {
desired_size = 1
min_size = 0
max_size = 10
instance_types = ["t3.medium", "t3a.medium"]
capacity_type = "SPOT"
disk_size = 50
labels = {
role = "spot"
}
taints = [{
key = "spot"
value = "true"
effect = "NoSchedule"
}]
tags = {}
}
}
tags = local.tags
}
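The node_groups input is a map of objects. A sketch of its declaration in modules/node-groups/variables.tf, with the field list inferred from the usage above:

# modules/node-groups/variables.tf (sketch -- fields inferred from the usage example)
variable "node_groups" {
  type = map(object({
    desired_size   = number
    min_size       = number
    max_size       = number
    instance_types = list(string)
    capacity_type  = string # "ON_DEMAND" or "SPOT"
    disk_size      = number
    labels         = map(string)
    taints = list(object({
      key    = string
      value  = string
      effect = string # "NO_SCHEDULE", "NO_EXECUTE", or "PREFER_NO_SCHEDULE"
    }))
    tags = map(string)
  }))
}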
5. Cluster Add-ons
5.1 Core Add-ons
modules/addons/main.tf:
# VPC CNI
resource "aws_eks_addon" "vpc_cni" {
  cluster_name  = var.cluster_name
  addon_name    = "vpc-cni"
  addon_version = var.addon_versions.vpc_cni

  # OVERWRITE replaces the self-managed add-on on first install;
  # PRESERVE keeps any custom settings on later upgrades.
  resolve_conflicts_on_create = "OVERWRITE"
  resolve_conflicts_on_update = "PRESERVE"

  tags = var.tags
}

# CoreDNS
resource "aws_eks_addon" "coredns" {
  cluster_name  = var.cluster_name
  addon_name    = "coredns"
  addon_version = var.addon_versions.coredns

  resolve_conflicts_on_create = "OVERWRITE"
  resolve_conflicts_on_update = "PRESERVE"

  tags = var.tags

  depends_on = [aws_eks_addon.vpc_cni]
}

# kube-proxy
resource "aws_eks_addon" "kube_proxy" {
  cluster_name  = var.cluster_name
  addon_name    = "kube-proxy"
  addon_version = var.addon_versions.kube_proxy

  resolve_conflicts_on_create = "OVERWRITE"
  resolve_conflicts_on_update = "PRESERVE"

  tags = var.tags
}

# EBS CSI driver -- runs under the IRSA role from section 3.1
resource "aws_eks_addon" "ebs_csi" {
  cluster_name             = var.cluster_name
  addon_name               = "aws-ebs-csi-driver"
  addon_version            = var.addon_versions.ebs_csi
  service_account_role_arn = var.ebs_csi_role_arn

  resolve_conflicts_on_create = "OVERWRITE"
  resolve_conflicts_on_update = "PRESERVE"

  tags = var.tags
}
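Valid addon_version strings change with the cluster version. You can list the versions a given Kubernetes release supports with the AWS CLI, for example:

aws eks describe-addon-versions \
  --addon-name coredns \
  --kubernetes-version 1.28 \
  --query "addons[].addonVersions[].addonVersion"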
5.2 Kubernetes Configuration
Using the Helm provider:
# Install the AWS Load Balancer Controller
resource "helm_release" "aws_load_balancer_controller" {
  name       = "aws-load-balancer-controller"
  repository = "https://aws.github.io/eks-charts"
  chart      = "aws-load-balancer-controller"
  namespace  = "kube-system"
  version    = "1.6.2"

  set {
    name  = "clusterName"
    value = var.cluster_name
  }

  set {
    name  = "serviceAccount.create"
    value = "true"
  }

  set {
    name  = "serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
    value = var.alb_controller_role_arn
  }

  depends_on = [module.node_groups]
}

# Install the Metrics Server
resource "helm_release" "metrics_server" {
  name       = "metrics-server"
  repository = "https://kubernetes-sigs.github.io/metrics-server/"
  chart      = "metrics-server"
  namespace  = "kube-system"
  version    = "3.11.0"

  depends_on = [module.node_groups]
}

# Install the Cluster Autoscaler
resource "helm_release" "cluster_autoscaler" {
  name       = "cluster-autoscaler"
  repository = "https://kubernetes.github.io/autoscaler"
  chart      = "cluster-autoscaler"
  namespace  = "kube-system"
  version    = "9.29.3"

  set {
    name  = "autoDiscovery.clusterName"
    value = var.cluster_name
  }

  set {
    name  = "awsRegion"
    value = var.region
  }

  set {
    name  = "rbac.serviceAccount.annotations.eks\\.amazonaws\\.com/role-arn"
    value = var.autoscaler_role_arn
  }

  depends_on = [module.node_groups]
}
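The chart values above reference var.alb_controller_role_arn and var.autoscaler_role_arn, whose definitions are not shown. A sketch of the Load Balancer Controller role, following the same IRSA pattern as the EBS CSI role in section 3.1; the service-account name below is the chart default, and the controller additionally needs the IAM policy published in the kubernetes-sigs/aws-load-balancer-controller repository, which is not reproduced here:

# IRSA role for the AWS Load Balancer Controller
# (sketch -- same pattern as aws_iam_role.ebs_csi in section 3.1)
resource "aws_iam_role" "alb_controller" {
  name = "${var.cluster_name}-alb-controller"

  assume_role_policy = jsonencode({
    Version = "2012-10-17"
    Statement = [{
      Effect    = "Allow"
      Principal = { Federated = var.oidc_provider_arn }
      Action    = "sts:AssumeRoleWithWebIdentity"
      Condition = {
        StringEquals = {
          # Chart default service account name
          "${var.oidc_provider}:sub" = "system:serviceaccount:kube-system:aws-load-balancer-controller"
          "${var.oidc_provider}:aud" = "sts.amazonaws.com"
        }
      }
    }]
  })

  tags = var.tags
}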
6. Root Configuration
main.tf:
# These blocks could equally live in versions.tf, per the project layout in 1.2
terraform {
  required_version = ">= 1.6.0"

  required_providers {
    aws = {
      source  = "hashicorp/aws"
      version = "~> 5.0"
    }
    helm = {
      source  = "hashicorp/helm"
      version = "~> 2.11"
    }
    kubernetes = {
      source  = "hashicorp/kubernetes"
      version = "~> 2.23"
    }
  }

  backend "s3" {
    bucket         = "my-terraform-state"
    key            = "eks-cluster/terraform.tfstate"
    region         = "us-west-2"
    encrypt        = true
    dynamodb_table = "terraform-locks"
  }
}
provider "aws" {
region = var.region
}
data "aws_eks_cluster" "cluster" {
name = module.eks.cluster_name
depends_on = [module.eks]
}
data "aws_eks_cluster_auth" "cluster" {
name = module.eks.cluster_name
depends_on = [module.eks]
}
provider "kubernetes" {
host = data.aws_eks_cluster.cluster.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.cluster.token
}
provider "helm" {
kubernetes {
host = data.aws_eks_cluster.cluster.endpoint
cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority[0].data)
token = data.aws_eks_cluster_auth.cluster.token
}
}
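One caveat: the token from aws_eks_cluster_auth is fetched once per run and expires after roughly 15 minutes, which can bite on long applies. An alternative, sketched here assuming the aws CLI is on PATH, is exec-based authentication in place of the token argument above:

# Alternative: exec-based auth instead of a static token (sketch;
# would replace the token-based kubernetes provider block above)
provider "kubernetes" {
  host                   = data.aws_eks_cluster.cluster.endpoint
  cluster_ca_certificate = base64decode(data.aws_eks_cluster.cluster.certificate_authority[0].data)

  exec {
    api_version = "client.authentication.k8s.io/v1beta1"
    command     = "aws"
    args        = ["eks", "get-token", "--cluster-name", module.eks.cluster_name]
  }
}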
locals {
  tags = {
    Project     = var.project_name
    Environment = var.environment
    ManagedBy   = "Terraform"
    # Deliberately no timestamp()-based tag here: timestamp() changes on every
    # plan and would produce a perpetual diff on every tagged resource.
  }
}
# VPC
module "vpc" {
  source = "./modules/vpc"

  cluster_name       = var.cluster_name
  vpc_cidr           = var.vpc_cidr
  availability_zones = var.availability_zones
  tags               = local.tags
}

# EKS cluster
module "eks" {
  source = "./modules/eks"

  cluster_name            = var.cluster_name
  cluster_version         = var.cluster_version
  vpc_id                  = module.vpc.vpc_id
  subnet_ids              = concat(module.vpc.public_subnet_ids, module.vpc.private_subnet_ids)
  endpoint_private_access = true
  endpoint_public_access  = true
  public_access_cidrs     = var.public_access_cidrs
  allowed_cidr_blocks     = var.allowed_cidr_blocks
  cluster_log_types       = ["api", "audit", "authenticator", "controllerManager", "scheduler"]
  log_retention_days      = 7
  kms_key_arn             = aws_kms_key.eks.arn
  tags                    = local.tags
}

# Node groups
module "node_groups" {
  source = "./modules/node-groups"

  cluster_name    = module.eks.cluster_name
  cluster_version = var.cluster_version
  node_role_arn   = module.eks.node_role_arn
  subnet_ids      = module.vpc.private_subnet_ids
  node_groups     = var.node_groups
  tags            = local.tags

  depends_on = [module.eks]
}

# Add-ons
module "addons" {
  source = "./modules/addons"

  cluster_name     = module.eks.cluster_name
  addon_versions   = var.addon_versions
  ebs_csi_role_arn = module.eks.ebs_csi_role_arn
  tags             = local.tags

  depends_on = [module.node_groups]
}
# KMS key for EKS secrets encryption
resource "aws_kms_key" "eks" {
  description             = "EKS cluster encryption key"
  deletion_window_in_days = 7
  enable_key_rotation     = true

  tags = merge(
    local.tags,
    {
      Name = "${var.cluster_name}-eks-key"
    }
  )
}

resource "aws_kms_alias" "eks" {
  name          = "alias/${var.cluster_name}-eks"
  target_key_id = aws_kms_key.eks.key_id
}
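The project layout in 1.2 also lists a root outputs.tf. A minimal sketch exposing what you need to connect to the cluster after apply:

# outputs.tf (sketch)
output "cluster_name" {
  value = module.eks.cluster_name
}

output "configure_kubectl" {
  description = "Command to add this cluster to your kubeconfig"
  value       = "aws eks update-kubeconfig --region ${var.region} --name ${module.eks.cluster_name}"
}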
7. Configuration File
terraform.tfvars:
project_name = "myapp"
environment  = "production"
region       = "us-west-2"

cluster_name    = "myapp-prod-eks"
cluster_version = "1.28"

vpc_cidr = "10.0.0.0/16"
availability_zones = [
  "us-west-2a",
  "us-west-2b",
  "us-west-2c"
]

# For a real production cluster, restrict this to your office/VPN CIDRs
# rather than exposing the API endpoint to the whole internet.
public_access_cidrs = ["0.0.0.0/0"]
allowed_cidr_blocks = ["10.0.0.0/16"]

node_groups = {
  general = {
    desired_size   = 3
    min_size       = 3
    max_size       = 6
    instance_types = ["t3.large"]
    capacity_type  = "ON_DEMAND"
    disk_size      = 100
    labels = {
      role = "general"
    }
    taints = []
    tags   = {}
  }
}

addon_versions = {
  vpc_cni    = "v1.15.1-eksbuild.1"
  coredns    = "v1.10.1-eksbuild.6"
  kube_proxy = "v1.28.2-eksbuild.2"
  ebs_csi    = "v1.25.0-eksbuild.1"
}
8. Deployment and Verification
8.1 Deploy the Cluster
# Initialize providers and the S3 backend
terraform init

# Plan
terraform plan -out=tfplan

# Apply
terraform apply tfplan
8.2 Configure kubectl
# Update kubeconfig
aws eks update-kubeconfig \
  --region us-west-2 \
  --name myapp-prod-eks

# Verify connectivity
kubectl get nodes
kubectl get pods -A
8.3 Test Deployment
# Deploy a test application
kubectl create deployment nginx --image=nginx
kubectl expose deployment nginx --port=80 --type=LoadBalancer

# Check the service (EXTERNAL-IP may take a minute or two to appear)
kubectl get svc nginx
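Clean up the test resources when you are done. The LoadBalancer service provisions an actual ELB, which costs money and will block a later terraform destroy of the VPC if left behind:

# Remove the test application and its load balancer
kubectl delete service nginx
kubectl delete deployment nginx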
9. Monitoring and Logging
9.1 Install Prometheus
resource "helm_release" "prometheus" {
name = "prometheus"
repository = "https://prometheus-community.github.io/helm-charts"
chart = "kube-prometheus-stack"
namespace = "monitoring"
create_namespace = true
version = "54.0.0"
values = [
file("${path.module}/values/prometheus.yaml")
]
}
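After the release installs, verify the stack and reach Grafana through a port-forward; the service name below assumes the release name prometheus used above, since the chart names the Grafana service <release>-grafana:

kubectl get pods -n monitoring

# Grafana is exposed as <release>-grafana on port 80
kubectl port-forward svc/prometheus-grafana 3000:80 -n monitoring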
9.2 CloudWatch Container Insights
# Install the CloudWatch agent and Fluentd. The quickstart manifest contains
# {{cluster_name}} and {{region_name}} placeholders, so substitute them first:
curl -s https://raw.githubusercontent.com/aws-samples/amazon-cloudwatch-container-insights/latest/k8s-deployment-manifest-templates/deployment-mode/daemonset/container-insights-monitoring/quickstart/cwagent-fluentd-quickstart.yaml \
  | sed "s/{{cluster_name}}/myapp-prod-eks/;s/{{region_name}}/us-west-2/" \
  | kubectl apply -f -
Summary
This chapter built a production-grade EKS cluster:
- Network layer: VPC, public/private subnets, per-AZ NAT gateways
- Control plane: EKS cluster, OIDC provider for IRSA
- Data plane: managed node groups, including Spot instances
- Add-ons: VPC CNI, CoreDNS, kube-proxy, EBS CSI driver
- Observability: CloudWatch logs, Container Insights, Prometheus
The result is a complete Kubernetes platform ready for production workloads.