AWS EKS 集群

Amazon Elastic Kubernetes Service(EKS)是托管的 Kubernetes 服务,提供高可用、安全的 Kubernetes 控制面,与 AWS 生态深度集成(IAM、VPC、ALB、EFS、S3 等)。

EKS 集群架构


┌─────────────────────────────────────────────────────────┐
│                    AWS Region                            │
│  ┌───────────────────────────────────────────────────┐  │
│  │              Managed Control Plane                 │  │
│  │  ├── API Server (Multi-AZ)                        │  │
│  │  ├── etcd (Multi-AZ)                               │  │
│  │  └── Scheduler / Controller Manager               │  │
│  └───────────────────────────────────────────────────┘  │
│                                                          │
│  ┌───────────────────────────────────────────────────┐  │
│  │              VPC (10.0.0.0/16)                     │  │
│  │  ┌──────────┐  ┌──────────┐  ┌──────────┐        │  │
│  │  │Public    │  │Public    │  │Public    │        │  │
│  │  │Subnet az1│  │Subnet az2│  │Subnet az3│        │  │
│  │  └──────────┘  └──────────┘  └──────────┘        │  │
│  │  ┌──────────┐  ┌──────────┐  ┌──────────┐        │  │
│  │  │Private   │  │Private   │  │Private   │        │  │
│  │  │Subnet az1│  │Subnet az2│  │Subnet az3│        │  │
│  │  │NodeGroup │  │NodeGroup │  │NodeGroup │        │  │
│  │  └──────────┘  └──────────┘  └──────────┘        │  │
│  └───────────────────────────────────────────────────┘  │
└─────────────────────────────────────────────────────────┘

创建 EKS 集群

使用 eksctl 创建


# Install eksctl. Releases are published under the eksctl-io GitHub organization.
# --fail makes curl exit non-zero on an HTTP error instead of piping an
# error page into tar.
# NOTE: the asset name hard-codes amd64; substitute arm64 on ARM hosts.
curl --fail --silent --show-error --location "https://github.com/eksctl-io/eksctl/releases/latest/download/eksctl_$(uname -s)_amd64.tar.gz" | tar xz -C /tmp
sudo mv /tmp/eksctl /usr/local/bin

# Create the cluster together with one managed node group.
# Flag order: cluster identity, cluster-wide options, node group shape,
# then node access settings.
eksctl create cluster \
  --name my-cluster \
  --region ap-northeast-1 \
  --version 1.29 \
  --with-oidc \
  --vpc-nat-mode HighlyAvailable \
  --managed \
  --nodegroup-name standard-workers \
  --node-type t3.medium \
  --nodes-min 1 \
  --nodes 3 \
  --nodes-max 10 \
  --alb-ingress-access \
  --ssh-public-key ~/.ssh/id_rsa.pub

使用 Terraform 创建


# EKS cluster provisioned through the terraform-aws-modules/eks module.
module "eks" {
  source  = "terraform-aws-modules/eks/aws"
  version = "~> 19.0"

  cluster_name    = "my-cluster"
  cluster_version = "1.29"

  # Networking comes from a separate `vpc` module.
  vpc_id     = module.vpc.vpc_id
  subnet_ids = module.vpc.private_subnets
  # NOTE(review): control plane network interfaces are placed in the public
  # subnets here; confirm this is intended rather than private/intra subnets.
  control_plane_subnet_ids = module.vpc.public_subnets

  # One EKS managed node group scaling between 1 and 10 t3.medium nodes.
  eks_managed_node_groups = {
    standard = {
      min_size       = 1
      max_size       = 10
      desired_size   = 3
      instance_types = ["t3.medium"]

      # Kubernetes node labels, usable in nodeSelector / affinity rules.
      labels = {
        environment = "production"
        team        = "platform"
      }
    }
  }

  # AWS tags applied to the resources the module creates. The module has no
  # `enable_ClusterTags` input (Terraform rejects it as an unsupported
  # argument); tagging is configured through `tags`.
  tags = {
    Environment = "production"
  }

  # EKS add-ons installed with their default configuration.
  cluster_addons = {
    coredns            = {}
    kube-proxy         = {}
    vpc-cni            = {}
    aws-ebs-csi-driver = {}
  }

  # Create the IAM OIDC provider so pods can assume IAM roles (IRSA).
  enable_irsa = true
}

Node Group 配置

托管节点组(Managed Node Group)


# eksctl ClusterConfig declaring two managed node groups.
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: my-cluster
  region: ap-northeast-1
managedNodeGroups:
  # Spot-backed group for compute-heavy workloads.
  - name: compute-optimized
    # Managed node groups request Spot capacity with `spot: true` and a list
    # of instance types; add more types to widen the Spot pools. There is no
    # max-price setting for managed node groups, so none is configured here.
    instanceTypes: ["c6i.xlarge"]
    spot: true
    desiredCapacity: 2
    minSize: 1
    maxSize: 5
    # Root volume size in GiB.
    volumeSize: 100
    volumeType: gp3
    labels:
      workload-type: compute
    tags:
      Environment: production
  # On-demand group for memory-heavy workloads.
  - name: memory-optimized
    instanceType: r6i.xlarge
    desiredCapacity: 2
    minSize: 1
    maxSize: 3
    labels:
      workload-type: memory

Fargate Profile


# eksctl ClusterConfig declaring two Fargate profiles.
apiVersion: eksctl.io/v1alpha5
kind: ClusterConfig
metadata:
  name: my-cluster
  region: ap-northeast-1
fargateProfiles:
  # Profile with two selectors; a pod is matched if it satisfies either one.
  - name: default
    selectors:
      # Any pod in the `default` namespace.
      - namespace: default
      # Pods in `production` that also carry the label env=production.
      - namespace: production
        labels:
          env: production
  # Separate profile covering the `monitoring` namespace.
  - name: monitoring
    selectors:
      - namespace: monitoring

IAM IRSA(服务账户角色)

为 Pod 分配 IAM 角色,实现细粒度权限控制:


# Create an IAM role plus an annotated Kubernetes ServiceAccount (IRSA).
# --role-name pins the IAM role name to my-app-role so the resulting role ARN
# is predictable; without it eksctl generates a name.
eksctl create iamserviceaccount \
  --name my-app \
  --namespace default \
  --cluster my-cluster \
  --role-name my-app-role \
  --attach-policy-arn arn:aws:iam::123456789:policy/my-app-policy \
  --approve

# ServiceAccount bound to an IAM role through the IRSA annotation.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: my-app
  namespace: default
  annotations:
    # ARN of the IAM role that pods using this ServiceAccount assume.
    eks.amazonaws.com/role-arn: arn:aws:iam::123456789:role/my-app-role

AWS Load Balancer Controller

安装 AWS Load Balancer Controller(原 ALB Ingress Controller)


# Download the controller's IAM policy document; iam_policy.json must exist
# locally before `aws iam create-policy` reads it.
curl --fail --silent --show-error --location \
  --output iam_policy.json \
  https://raw.githubusercontent.com/kubernetes-sigs/aws-load-balancer-controller/main/docs/install/iam_policy.json

# Create the IAM policy
aws iam create-policy \
  --policy-name AWSLoadBalancerController \
  --policy-document file://iam_policy.json

# Create the service account and its IAM role (IRSA)
eksctl create iamserviceaccount \
  --cluster=my-cluster \
  --namespace=kube-system \
  --name=aws-load-balancer-controller \
  --attach-policy-arn=arn:aws:iam::123456789:policy/AWSLoadBalancerController \
  --approve

# Install the Helm chart. serviceAccount.create=false reuses the service
# account created above instead of letting the chart create a new one.
helm repo add eks https://aws.github.io/eks-charts
helm repo update
helm install aws-load-balancer-controller eks/aws-load-balancer-controller \
  -n kube-system \
  --set clusterName=my-cluster \
  --set serviceAccount.create=false \
  --set serviceAccount.name=aws-load-balancer-controller

ALB Ingress 配置


# Internet-facing ALB Ingress that terminates TLS and redirects HTTP to HTTPS.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: myapp-ingress
  annotations:
    alb.ingress.kubernetes.io/scheme: internet-facing
    # Route directly to pod IPs rather than through node ports.
    alb.ingress.kubernetes.io/target-type: ip
    alb.ingress.kubernetes.io/healthcheck-path: /health
    alb.ingress.kubernetes.io/success-codes: "200-299"
    alb.ingress.kubernetes.io/certificate-arn: arn:aws:acm:region:123456789:certificate/cert-id
    # Both listeners are required: with a certificate set, only HTTPS:443 is
    # created by default, leaving ssl-redirect with no HTTP listener to act on.
    alb.ingress.kubernetes.io/listen-ports: '[{"HTTP": 80}, {"HTTPS": 443}]'
    alb.ingress.kubernetes.io/ssl-redirect: "443"
spec:
  # Replaces the deprecated kubernetes.io/ingress.class annotation.
  # NOTE(review): assumes an IngressClass named `alb` exists in the cluster
  # (the controller's Helm chart creates one by default); confirm.
  ingressClassName: alb
  rules:
    - host: myapp.example.com
      http:
        paths:
          - path: /
            pathType: Prefix
            backend:
              service:
                name: myapp-service
                port:
                  number: 80

EBS CSI Driver(持久化存储)


# Install the EBS CSI driver as an EKS add-on on the target cluster.
eksctl create addon \
  --cluster my-cluster \
  --region ap-northeast-1 \
  --name aws-ebs-csi-driver

# StorageClass provisioning encrypted gp3 volumes through the EBS CSI driver.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: gp3
provisioner: ebs.csi.aws.com
parameters:
  type: gp3
  # Quoted so the value stays a string rather than a YAML boolean.
  encrypted: "true"
  # Placeholder ARN: replace region, account ID and key ID with real values.
  kmsKeyId: arn:aws:kms:region:123456789:key/key-id
# Delay volume creation until a pod using the claim is scheduled.
volumeBindingMode: WaitForFirstConsumer
allowVolumeExpansion: true

EFS CSI Driver(共享存储)


# Create an encrypted EFS file system
aws efs create-file-system \
  --throughput-mode bursting \
  --encrypted \
  --region ap-northeast-1

# Install the EFS CSI driver. The chart repository must be registered first,
# otherwise `helm install` cannot resolve aws-efs-csi-driver/aws-efs-csi-driver.
helm repo add aws-efs-csi-driver https://kubernetes-sigs.github.io/aws-efs-csi-driver/
helm repo update
helm install aws-efs-csi-driver aws-efs-csi-driver/aws-efs-csi-driver \
  --namespace kube-system

# Statically provisioned PersistentVolume backed by an EFS file system.
apiVersion: v1
kind: PersistentVolume
metadata:
  name: efs-pv
spec:
  capacity:
    storage: 10Gi
  volumeMode: Filesystem
  accessModes:
    - ReadWriteMany
  persistentVolumeReclaimPolicy: Retain
  # NOTE(review): no StorageClass named efs-sc is defined in this snippet;
  # a claim must use the same storageClassName to bind to this volume.
  storageClassName: efs-sc
  csi:
    driver: efs.csi.aws.com
    # Placeholder: replace with the ID of the EFS file system to mount.
    volumeHandle: fs-12345678

Cluster Autoscaler


# ServiceAccount the Cluster Autoscaler pod runs as.
# NOTE(review): no eks.amazonaws.com/role-arn annotation is set here, so AWS
# permissions would have to come from elsewhere (e.g. the node role); confirm.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: cluster-autoscaler
  namespace: kube-system
---
# Binds the cluster-autoscaler ClusterRole to its ServiceAccount.
# NOTE(review): the referenced ClusterRole is not defined in this snippet and
# must be created separately.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: cluster-autoscaler
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: cluster-autoscaler
subjects:
  - kind: ServiceAccount
    name: cluster-autoscaler
    namespace: kube-system
---
# Single-replica Cluster Autoscaler Deployment.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: cluster-autoscaler
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: cluster-autoscaler
  template:
    # The pod template must carry the labels named in spec.selector;
    # otherwise the API server rejects the Deployment.
    metadata:
      labels:
        app: cluster-autoscaler
    spec:
      serviceAccountName: cluster-autoscaler
      containers:
        - image: registry.k8s.io/autoscaling/cluster-autoscaler:v1.29.0
          name: cluster-autoscaler
          command:
            - /cluster-autoscaler
          args:
            - --cloud-provider=aws
            # Format is min:max:name. NOTE(review): the name must be the real
            # Auto Scaling group name; "my-cluster-nodegroup" looks like a
            # placeholder - confirm before applying.
            - --nodes=1:10:my-cluster-nodegroup
            - --scale-down-unneeded-time=10m
            - --scale-down-delay-after-add=10m
            - --expander=random
          env:
            - name: AWS_REGION
              value: ap-northeast-1

监控与日志

CloudWatch Container Insights


# Enable EKS control plane logging (api, audit, authenticator,
# controllerManager, scheduler) to CloudWatch Logs. This is separate from
# Container Insights, which is provided by the chart installed below.
aws eks update-cluster-config \
  --name my-cluster \
  --region ap-northeast-1 \
  --logging '{"clusterLogging":[{"types":["api","audit","authenticator","controllerManager","scheduler"],"enabled":true}]}'

# Install the CloudWatch Observability chart (CloudWatch agent with
# Container Insights). The chart is published as
# aws-observability/amazon-cloudwatch-observability and needs the cluster
# name and region passed as values.
helm repo add aws-observability https://aws-observability.github.io/helm-charts
helm repo update
helm install cloudwatch-agent aws-observability/amazon-cloudwatch-observability \
  --namespace amazon-cloudwatch \
  --create-namespace \
  --set clusterName=my-cluster \
  --set region=ap-northeast-1

故障排除


# Show the cluster's status and configuration
aws eks describe-cluster --region ap-northeast-1 --name my-cluster

# Refresh the local kubeconfig entry for the cluster
aws eks update-kubeconfig --region ap-northeast-1 --name my-cluster

# Inspect node details to diagnose node problems
kubectl describe nodes

# List events sorted by their last-seen timestamp
kubectl get events --sort-by=.lastTimestamp

# Show AWS Load Balancer Controller logs
kubectl --namespace kube-system logs deployment/aws-load-balancer-controller

# Stream CoreDNS logs
kubectl --namespace kube-system logs --follow deployment/coredns

成本优化

  • 使用 Spot 实例:非关键工作负载使用 Spot,最高节省 90%
  • Right-sizing:定期分析资源使用,调整 Node 类型和数量
  • Cluster Autoscaler:自动伸缩节点数量
  • Fargate:无服务器容器,无需管理节点,按 Pod 申请的 vCPU 和内存资源计费
  • Savings Plans:承诺使用量换取折扣

下一步