AWS EKS 集群
Amazon Elastic Kubernetes Service(EKS)是托管的 Kubernetes 服务,提供高可用、安全的 Kubernetes 控制面,与 AWS 生态深度集成(IAM、VPC、ALB、EFS、S3 等)。
EKS 集群架构
┌──────────────────────────────────────────────────────────────┐
│  AWS Region                                                  │
│  ┌────────────────────────────────────────────────────────┐  │
│  │  Managed Control Plane                                 │  │
│  │  ├── API Server (Multi-AZ)                             │  │
│  │  ├── etcd (Multi-AZ)                                   │  │
│  │  └── Scheduler / Controller Manager                    │  │
│  └────────────────────────────────────────────────────────┘  │
│                                                              │
│  ┌────────────────────────────────────────────────────────┐  │
│  │  VPC (10.0.0.0/16)                                     │  │
│  │  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐  │  │
│  │  │Public Subnet │  │Public Subnet │  │Public Subnet │  │  │
│  │  │     az1      │  │     az2      │  │     az3      │  │  │
│  │  └──────────────┘  └──────────────┘  └──────────────┘  │  │
│  │  ┌──────────────┐  ┌──────────────┐  ┌──────────────┐  │  │
│  │  │Private Subnet│  │Private Subnet│  │Private Subnet│  │  │
│  │  │     az1      │  │     az2      │  │     az3      │  │  │
│  │  │  NodeGroup   │  │  NodeGroup   │  │  NodeGroup   │  │  │
│  │  └──────────────┘  └──────────────┘  └──────────────┘  │  │
│  └────────────────────────────────────────────────────────┘  │
└──────────────────────────────────────────────────────────────┘
创建 EKS 集群
使用 eksctl 创建
# Install eksctl: fetch the latest release archive for this OS and unpack it into /tmp.
# The project lives under the eksctl-io GitHub organization.
# --fail makes curl exit non-zero on an HTTP error instead of piping an error page into tar.
# The archive name hard-codes the amd64 build; pick the arm64 asset on ARM hosts.
curl --fail --silent --show-error --location \
  "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" \
  | tar xz -C /tmp
# Move the extracted binary onto the PATH.
sudo mv /tmp/eksctl /usr/local/bin
# Create the cluster together with one managed node group
# (3 nodes desired, allowed to scale between 1 and 10).
eksctl create cluster \
  --name=my-cluster \
  --region=ap-northeast-1 \
  --version=1.29 \
  --nodegroup-name=standard-workers \
  --node-type=t3.medium \
  --nodes=3 \
  --nodes-min=1 \
  --nodes-max=10 \
  --managed \
  --with-oidc \
  --ssh-public-key="${HOME}/.ssh/id_rsa.pub" \
  --vpc-nat-mode=HighlyAvailable \
  --alb-ingress-access
使用 Terraform 创建
# EKS cluster provisioned through the community terraform-aws-modules/eks module.
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 19.0"

  cluster_name    = "my-cluster"
  cluster_version = "1.29"

  # Networking is taken from a "vpc" module that is defined outside this snippet.
  vpc_id                   = module.vpc.vpc_id
  subnet_ids               = module.vpc.private_subnets
  control_plane_subnet_ids = module.vpc.public_subnets

  # One managed node group: 3 nodes desired, scaling between 1 and 10.
  eks_managed_node_groups = {
    standard = {
      min_size       = 1
      max_size       = 10
      desired_size   = 3
      instance_types = ["t3.medium"]

      # Kubernetes node labels applied to every node in the group.
      labels = {
        environment = "production"
        team        = "platform"
      }
    }
  }

  # EKS managed add-ons; an empty object accepts the module defaults.
  # NOTE(review): aws-ebs-csi-driver normally needs an IAM role passed via
  # service_account_role_arn to manage volumes - confirm one is supplied.
  cluster_addons = {
    coredns            = {}
    kube-proxy         = {}
    vpc-cni            = {}
    aws-ebs-csi-driver = {}
  }

  # Create the IAM OIDC provider so service accounts can assume IAM roles (IRSA).
  enable_irsa = true
}
Node Group 配置
托管节点组(Managed Node Group)
# eksctl ClusterConfig declaring two managed node groups.
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: my-cluster
  region: ap-northeast-1
managedNodeGroups:
  # Compute-optimized group running on Spot capacity.
  - name: compute-optimized
    # Managed node groups request Spot with `spot: true` plus a list of
    # instance types; they have no maximum-price setting.
    instanceTypes:
      - c6i.xlarge
    spot: true
    desiredCapacity: 2
    minSize: 1
    maxSize: 5
    # Root volume: 100 GiB gp3.
    volumeSize: 100
    volumeType: gp3
    labels:
      workload-type: compute
    tags:
      Environment: production
  # Memory-optimized group.
  - name: memory-optimized
    instanceType: r6i.xlarge
    desiredCapacity: 2
    minSize: 1
    maxSize: 3
    labels:
      workload-type: memory
Fargate Profile
# eksctl ClusterConfig declaring two Fargate profiles.
kind: ClusterConfig
apiVersion: eksctl.io/v1alpha5
metadata:
  region: ap-northeast-1
  name: my-cluster
fargateProfiles:
  # Profile "default": two selectors.
  - name: default
    selectors:
      # Selector on the "default" namespace.
      - namespace: default
      # Selector on the "production" namespace combined with the env=production label.
      - namespace: production
        labels: {env: production}
  # Profile "monitoring": selects the "monitoring" namespace.
  - name: monitoring
    selectors:
      - namespace: monitoring
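IRSA(IAM Roles for Service Accounts,服务账户 IAM 角色)
通过集群的 OIDC 提供商将 IAM 角色绑定到 Kubernetes ServiceAccount,使 Pod 获得各自独立的细粒度 AWS 权限(前提:集群已启用 OIDC,例如创建集群时指定 --with-oidc):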
# Create an IAM role and the my-app service account bound to it (IRSA).
# --role-name pins the name of the generated role so that it matches the
# role ARN used in the ServiceAccount's eks.amazonaws.com/role-arn annotation;
# without it eksctl generates a role name of its own.
eksctl create iamserviceaccount \
  --name my-app \
  --namespace default \
  --cluster my-cluster \
  --role-name my-app-role \
  --attach-policy-arn arn:aws:iam::123456789:policy/my-app-policy \
  --approve
# ServiceAccount "my-app" in the default namespace, annotated with an IAM role ARN.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: default
  name: my-app
  annotations:
    eks.amazonaws.com/role-arn: arn:aws:iam::123456789:role/my-app-role
AWS Load Balancer Controller
安装 AWS Load Balancer Controller(原 ALB Ingress Controller)
# Download the IAM policy document published in the controller's repository;
# create-policy below reads it from the current directory.
curl --fail --silent --show-error --location --output iam_policy.json \
  https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/v2.7.2/docs/install/iam_policy.json
# Create the IAM policy from that document.
aws iam create-policy \
  --policy-name AWSLoadBalancerController \
  --policy-document file://iam_policy.json
# Create the controller's service account in kube-system with the
# AWSLoadBalancerController policy attached.
eksctl create iamserviceaccount \
  --cluster my-cluster \
  --namespace kube-system \
  --name aws-load-balancer-controller \
  --attach-policy-arn arn:aws:iam::123456789:policy/AWSLoadBalancerController \
  --approve
# Register the eks-charts repository and refresh the local index so the
# latest chart version is resolved.
helm repo add eks https://aws.github.io/eks-charts
helm repo update
# Install the controller, reusing the service account created beforehand
# instead of letting the chart create one.
helm install aws-load-balancer-controller eks/aws-load-balancer-controller \
  -n kube-system \
  --set clusterName=my-cluster \
  --set serviceAccount.create=false \
  --set serviceAccount.name=aws-load-balancer-controller
ALB Ingress 配置
# Internet-facing ALB Ingress that routes myapp.example.com to myapp-service:80.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: myapp-ingress
  annotations:
    alb.ingress.kubernetes.io/scheme: internet-facing
    # Register pod IPs directly as targets.
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/healthcheck-path: /health
    alb.ingress.kubernetes.io/success-codes: "200-299"
    alb.ingress.kubernetes.io/certificate-arn: arn:aws:acm:region:123456789:certificate/cert-id
    # Both listeners are needed: when a certificate is set the controller
    # creates only HTTPS:443 by default, which leaves ssl-redirect with no
    # HTTP listener to redirect from.
    alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}, {"HTTPS": 443}]'
    alb.ingress.kubernetes.io/ssl-redirect: "443"
spec:
  # Selects the controller; replaces the deprecated
  # kubernetes.io/ingress.class annotation.
  ingressClassName: alb
  rules:
    - host: myapp.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: myapp-service
                port:
                  number: 80
EBS CSI Driver(持久化存储)
# Install the EBS CSI driver as an add-on on my-cluster.
eksctl create addon \
  --cluster=my-cluster \
  --region=ap-northeast-1 \
  --name=aws-ebs-csi-driver
# StorageClass "gp3": encrypted gp3 volumes provisioned by the EBS CSI driver.
kind: StorageClass
apiVersion: storage.k8s.io/v1
metadata:
  name: gp3
provisioner: ebs.csi.aws.com
allowVolumeExpansion: true
volumeBindingMode: WaitForFirstConsumer
parameters:
  type: gp3
  # Quoted on purpose: the value is the string "true", not a YAML boolean.
  encrypted: "true"
  kmsKeyId: arn:aws:kms:region:123456789:key/key-id
EFS CSI Driver(共享存储)
# Create an encrypted EFS file system with bursting throughput in ap-northeast-1.
aws efs create-file-system \
  --region ap-northeast-1 \
  --encrypted \
  --throughput-mode bursting
# Register the chart repository first; the install below refers to it by
# the "aws-efs-csi-driver" repo alias.
helm repo add aws-efs-csi-driver https://kubernetes-sigs.github.io/aws-efs-csi-driver/
helm repo update
# Install the EFS CSI driver into kube-system.
helm install aws-efs-csi-driver aws-efs-csi-driver/aws-efs-csi-driver \
  --namespace kube-system
# PersistentVolume "efs-pv": statically provisioned volume served by the EFS CSI driver.
kind: PersistentVolume
apiVersion: v1
metadata:
  name: efs-pv
spec:
  storageClassName: efs-sc
  persistentVolumeReclaimPolicy: Retain
  volumeMode: Filesystem
  accessModes: [ReadWriteMany]
  capacity:
    storage: 10Gi
  csi:
    driver: efs.csi.aws.com
    # Identifier of the backing volume handed to the driver.
    volumeHandle: fs-12345678
Cluster Autoscaler
# ServiceAccount "cluster-autoscaler" in kube-system.
kind: ServiceAccount
apiVersion: v1
metadata:
  namespace: kube-system
  name: cluster-autoscaler
---
# Binds the "cluster-autoscaler" ClusterRole to the kube-system/cluster-autoscaler
# service account. The ClusterRole itself is not defined in this snippet.
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: cluster-autoscaler
subjects:
  - kind: ServiceAccount
    namespace: kube-system
    name: cluster-autoscaler
roleRef:
  kind: ClusterRole
  name: cluster-autoscaler
  apiGroup: rbac.authorization.k8s.io
---
# Cluster Autoscaler Deployment: a single replica running under the
# cluster-autoscaler service account in kube-system.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cluster-autoscaler
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: cluster-autoscaler
  template:
    metadata:
      # Must satisfy spec.selector.matchLabels; a Deployment whose pod
      # template labels do not match its selector is rejected.
      labels:
        app: cluster-autoscaler
    spec:
      serviceAccountName: cluster-autoscaler
      containers:
        - name: cluster-autoscaler
          image: registry.k8s.io/autoscaling/cluster-autoscaler:v1.29.0
          command:
            - /cluster-autoscaler
          args:
            - --cloud-provider=aws
            # Format is min:max:<node group name>.
            - --nodes=1:10:my-cluster-nodegroup
            - --scale-down-unneeded-time=10m
            - --scale-down-delay-after-add=10m
            - --expander=random
          env:
            - name: AWS_REGION
              value: ap-northeast-1
监控与日志
CloudWatch Container Insights
# Enable control-plane logging for all five log types
# (api, audit, authenticator, controllerManager, scheduler).
aws eks update-cluster-config \
  --region ap-northeast-1 \
  --name my-cluster \
  --logging '{"clusterLogging":[{"types":["api","audit","authenticator","controllerManager","scheduler"],"enabled":true}]}'
# Register the aws-observability chart repository, which hosts the
# amazon-cloudwatch-observability chart, and refresh the local index.
helm repo add aws-observability https://aws-observability.github.io/helm-charts
helm repo update
# Install the CloudWatch agent chart into its own namespace; the chart needs
# the cluster name and region to know where to publish.
helm install cloudwatch-agent aws-observability/amazon-cloudwatch-observability \
  --namespace amazon-cloudwatch \
  --create-namespace \
  --set clusterName=my-cluster \
  --set region=ap-northeast-1
故障排除
# Show the cluster's status
aws eks describe-cluster --region ap-northeast-1 --name my-cluster
# Refresh the local kubeconfig entry for the cluster
aws eks update-kubeconfig --region ap-northeast-1 --name my-cluster
# Inspect node problems
kubectl describe nodes
# List events ordered by their last timestamp
kubectl get events --sort-by=.lastTimestamp
# AWS Load Balancer Controller logs
kubectl logs --namespace kube-system deployment/aws-load-balancer-controller
# CoreDNS logs (streaming)
kubectl logs --namespace kube-system deployment/coredns --follow
成本优化
- 使用 Spot 实例:非关键工作负载使用 Spot,最高节省 90%
- Right-sizing:定期分析资源使用,调整 Node 类型和数量
- Cluster Autoscaler:自动伸缩节点数量
- Fargate:无服务器容器,按 Pod 申请的 vCPU 和内存以及运行时长计费,无需为空闲节点付费
- Savings Plans:承诺使用量换取折扣
下一步
- ArgoCD GitOps 部署 — 在 EKS 上部署 ArgoCD
- Pipeline 设计与规范 — EKS 上的 CI/CD 流水线
- DevOps 标准化 — 云原生 DevOps 实践