Batch of upgrades and cleanups (#173)

irvingpop · web-flow · commit 3b7be33a0d6b · 2021-08-08T08:15:26.000-07:00
* Upgrade our EKS cluster to 1.20 and switch to a Managed Node Group
  * This lets us take advantage of higher pod:node ratios due to networking advances
* Add S3-based logging to our ALBs so we can better catch errors when they alarm
* Upgrade ArgoCD to the latest, 2.0.5
* Clean up Vertical Pod Autoscalers (VPAs) since we don't use them anymore

Signed-off-by: Irving Popovetsky <irving@honeycomb.io>
diff --git a/kubernetes/argocd/application-crd.yaml b/kubernetes/argocd/application-crd.yaml
diff --git a/kubernetes/argocd/appproject-crd.yaml b/kubernetes/argocd/appproject-crd.yaml
diff --git a/kubernetes/argocd/install.yaml b/kubernetes/argocd/install.yaml
diff --git a/kubernetes/argocd/redis-persistent.yaml b/kubernetes/argocd/redis-persistent.yaml
@@ -0,0 +1,52 @@
+---
+apiVersion: apps/v1
+kind: StatefulSet # StatefulSet so the pod gets its own volume claim (see volumeClaimTemplates)
+metadata:
+  labels:
+    app.kubernetes.io/component: redis
+    app.kubernetes.io/name: argocd-redis
+    app.kubernetes.io/part-of: argocd
+  name: argocd-redis
+spec:
+  selector:
+    matchLabels:
+      app.kubernetes.io/name: argocd-redis # must match the pod template label below
+  serviceName: argocd-redis # NOTE(review): the Service itself is not defined in this manifest - confirm it exists
+  template:
+    metadata:
+      labels:
+        app.kubernetes.io/name: argocd-redis
+    spec:
+      terminationGracePeriodSeconds: 10
+      containers:
+      - name: redis
+        resources:
+          requests: # requests only; no limits are declared
+            memory: "100Mi"
+            cpu: "100m" # equivalent to 0.1 of a CPU core
+        args:
+        - --save
+        - "60 1000" # RDB snapshot rule "<seconds> <changes>": snapshot after 60 s if at least 1000 keys changed
+        - --appendonly
+        - "yes" # also keep an append-only file
+        image: redis:6.2.4-alpine
+        imagePullPolicy: Always
+        ports:
+        - containerPort: 6379
+        volumeMounts:
+        - name: redis-data # the claim declared under volumeClaimTemplates
+          mountPath: /data
+      securityContext: # pod-level settings: run as non-root UID/GID 1000
+        fsGroup: 1000
+        runAsGroup: 1000
+        runAsNonRoot: true
+        runAsUser: 1000
+      serviceAccountName: argocd-redis # NOTE(review): ServiceAccount is not created here - presumably by install.yaml; confirm
+  volumeClaimTemplates:
+  - metadata:
+      name: redis-data
+    spec:
+      accessModes: [ "ReadWriteOnce" ]
+      resources:
+        requests:
+          storage: 10Gi # size requested for the redis-data volume
diff --git a/kubernetes/eksctl/max-pods-calculator.sh b/kubernetes/eksctl/max-pods-calculator.sh
@@ -0,0 +1,151 @@
+#!/bin/bash
+
+set -o pipefail
+set -o nounset
+set -o errexit
+
+# Report the line number of a failing command.
+# Arguments: $1 - line number passed in by the ERR trap below.
+# Outputs:   one diagnostic line on stderr, keeping it out of stdout, which
+#            is where this script prints the computed max-pods value.
+err_report() {
+    echo "Exited with error on line $1" >&2
+}
+# Single quotes delay the expansion of $LINENO until the trap actually fires.
+trap 'err_report $LINENO' ERR
+
+function print_help {
+    echo "usage: $0 <instance(s)> [options]"
+    echo "Calculates maxPods value to be used when starting up the kubelet."
+    echo "-h,--help print this help."
+    echo "--instance-type Specify the instance type to calculate max pods value."
+    echo "--instance-type-from-imds Use this flag if the instance type should be fetched from IMDS."
+    echo "--cni-version Specify the version of the CNI (example - 1.7.5)."
+    echo "--cni-custom-networking-enabled Use this flag to indicate if CNI custom networking mode has been enabled."
+    echo "--cni-prefix-delegation-enabled Use this flag to indicate if CNI prefix delegation has been enabled."
+    echo "--cni-max-eni specify how many ENIs should be used for prefix delegation. Defaults to using all ENIs per instance."
+}
+
+# Arguments that match no known flag are collected here; nothing else in this
+# script reads the array back.
+POSITIONAL=()
+
+# Consume the command line one flag at a time (the flag plus its value where
+# the flag takes one).
+while (( $# > 0 )); do
+    case "$1" in
+        -h|--help)
+            # Help is printed and the script stops with status 1.
+            print_help
+            exit 1
+            ;;
+        # Flags that carry a value in the following argument.
+        --instance-type)                 INSTANCE_TYPE=$2; shift 2 ;;
+        --cni-version)                   CNI_VERSION=$2; shift 2 ;;
+        --cni-max-eni)                   CNI_MAX_ENI=$2; shift 2 ;;
+        # Boolean switches.
+        --instance-type-from-imds)       INSTANCE_TYPE_FROM_IMDS=true; shift ;;
+        --cni-custom-networking-enabled) CNI_CUSTOM_NETWORKING_ENABLED=true; shift ;;
+        --cni-prefix-delegation-enabled) CNI_PREFIX_DELEGATION_ENABLED=true; shift ;;
+        *)
+            # Anything unrecognised is kept as a positional argument.
+            POSITIONAL+=("$1")
+            shift
+            ;;
+    esac
+done
+
+# Give every option a value, falling back to a default when it is unset or
+# empty, so later reads are safe under `set -o nounset`.
+: "${CNI_VERSION:=}"
+: "${CNI_CUSTOM_NETWORKING_ENABLED:=false}"
+: "${CNI_PREFIX_DELEGATION_ENABLED:=false}"
+: "${CNI_MAX_ENI:=}"
+: "${INSTANCE_TYPE:=}"
+: "${INSTANCE_TYPE_FROM_IMDS:=false}"
+
+# Flipped to true further down once the CNI version has been inspected.
+PREFIX_DELEGATION_SUPPORTED=false
+# Multiplier used by calculate_max_ip_addresses_prefix_delegation.
+IPS_PER_PREFIX=16
+
+if [ "$INSTANCE_TYPE_FROM_IMDS" = true ]; then
+    TOKEN=$(curl -m 10 -X PUT -H "X-aws-ec2-metadata-token-ttl-seconds: 600" -s "http://169.254.169.254/latest/api/token")
+    export AWS_DEFAULT_REGION=$(curl -s --retry 5 -H "X-aws-ec2-metadata-token: $TOKEN" http://169.254.169.254/latest/dynamic/instance-identity/document | jq .region -r)
+    INSTANCE_TYPE=$(curl -m 10 -H "X-aws-ec2-metadata-token: $TOKEN" -s http://169.254.169.254/latest/meta-data/instance-type)
+elif [ -z "$INSTANCE_TYPE" ];
+    # There's no reasonable default for an instanceType so force one to be provided to the script.
+    then echo "You must specify an instance type to calculate max pods value."
+    exit 1
+fi
+
+if [ -z "$CNI_VERSION" ];
+    then echo "You must specify a CNI Version to use. Example - 1.7.5"
+    exit 1
+fi
+
+calculate_max_ip_addresses_prefix_delegation() {
+    enis=$1
+    instance_max_eni_ips=$2
+    echo $(($enis * (($instance_max_eni_ips - 1) * $IPS_PER_PREFIX ) + 2))
+}
+
+calculate_max_ip_addresses_secondary_ips() {
+    enis=$1
+    instance_max_eni_ips=$2
+    echo $(($enis * ($instance_max_eni_ips - 1) + 2))
+}
+
+# Print the numerically smallest of the arguments.
+# Arguments: any number of numeric strings
+# Outputs:   the first line of the arguments sorted with `sort -g`
+min_number() {
+    printf '%s\n' "$@" \
+        | sort -g \
+        | head -n 1
+}
+
+
+VERSION_SPLIT=(${CNI_VERSION//./ })
+CNI_MAJOR_VERSION="${VERSION_SPLIT[0]}"
+CNI_MINOR_VERSION="${VERSION_SPLIT[1]}"
+if [[ "$CNI_MAJOR_VERSION" -gt 1 ]] || ([[ "$CNI_MAJOR_VERSION" = 1 ]] && [[ "$CNI_MINOR_VERSION" -gt 8 ]]); then
+    PREFIX_DELEGATION_SUPPORTED=true
+fi
+
+DESCRIBE_INSTANCES_RESULT=$(aws ec2 describe-instance-types --instance-type $INSTANCE_TYPE --query 'InstanceTypes[0].{Hypervisor: Hypervisor, EniCount: NetworkInfo.MaximumNetworkInterfaces, PodsPerEniCount: NetworkInfo.Ipv4AddressesPerInterface, CpuCount: VCpuInfo.DefaultVCpus'})
+
+HYPERVISOR_TYPE=$(echo $DESCRIBE_INSTANCES_RESULT | jq -r '.Hypervisor' )
+IS_NITRO=false
+if [[ "$HYPERVISOR_TYPE" == "nitro" ]]; then
+    IS_NITRO=true
+fi
+INSTANCE_MAX_ENIS=$(echo $DESCRIBE_INSTANCES_RESULT | jq -r '.EniCount' )
+INSTANCE_MAX_ENIS_IPS=$(echo $DESCRIBE_INSTANCES_RESULT | jq -r '.PodsPerEniCount' )
+
+# ENIs counted for pods: every ENI of the instance type, unless --cni-max-eni
+# was given, in which case the smaller of the two numbers wins.
+enis_for_pods=$INSTANCE_MAX_ENIS
+if [ -n "$CNI_MAX_ENI" ]; then
+    enis_for_pods="$(min_number $CNI_MAX_ENI $INSTANCE_MAX_ENIS)"
+fi
+
+# With CNI custom networking enabled, one ENI fewer is counted for pods.
+if [ "$CNI_CUSTOM_NETWORKING_ENABLED" = true ]; then
+    enis_for_pods=$(( enis_for_pods - 1 ))
+fi
+
+
+# Choose the formula: the prefix-delegation variant needs a Nitro instance,
+# the feature flag, and a CNI version that supports it; otherwise fall back
+# to the secondary-IP variant.
+max_pods_formula=calculate_max_ip_addresses_secondary_ips
+if [[ "$IS_NITRO" = true && "$CNI_PREFIX_DELEGATION_ENABLED" = true && "$PREFIX_DELEGATION_SUPPORTED" = true ]]; then
+    max_pods_formula=calculate_max_ip_addresses_prefix_delegation
+fi
+max_pods=$("$max_pods_formula" $enis_for_pods $INSTANCE_MAX_ENIS_IPS)
+
+# Limit the total number of pods that can be launched on any instance type based on the vCPUs on that instance type.
+MAX_POD_CEILING_FOR_LOW_CPU=110
+MAX_POD_CEILING_FOR_HIGH_CPU=250
+CPU_COUNT=$(echo $DESCRIBE_INSTANCES_RESULT | jq -r '.CpuCount' )
+if [ "$CPU_COUNT" -gt 30 ] ; then
+    echo $(min_number $MAX_POD_CEILING_FOR_HIGH_CPU $max_pods)
+else
+    echo $(min_number $MAX_POD_CEILING_FOR_LOW_CPU $max_pods)
+fi
diff --git a/kubernetes/eksctl/operationcode-backend.yaml b/kubernetes/eksctl/operationcode-backend.yaml
@@ -6,25 +6,27 @@ metadata:
   name: operationcode-backend
   region: us-east-2
 
-nodeGroups:
-  - name: eks-infra-spot
+managedNodeGroups:
+  - name: eks-infra-spot-v2
+    instanceTypes:
+    - t3.small
+    spot: true
     minSize: 3
     desiredCapacity: 3
     maxSize: 5
-    # use Spot instance pricing
-    instancesDistribution:
-      instanceTypes:
-      - t3.small
-      onDemandBaseCapacity: 0
-      onDemandPercentageAboveBaseCapacity: 0
     volumeSize: 20
+    volumeType: gp3
+    # For this to be valid, run:
+    #   kubectl set env daemonset aws-node -n kube-system ENABLE_PREFIX_DELEGATION=true
+    #   kubectl set env daemonset aws-node -n kube-system WARM_PREFIX_TARGET=1
+    maxPodsPerNode: 30
     ssh:
       allow: true
       publicKeyName: oc-ops
     labels:
       nodegroup-type: infra
     tags:
-      Name: eks-infra-spot
+      Name: eks-infra-spot-v2
     iam:
       withAddonPolicies:
         imageBuilder: true
diff --git a/kubernetes/operationcode_python_backend/base/deployment.yaml b/kubernetes/operationcode_python_backend/base/deployment.yaml
@@ -1,14 +1,4 @@
 ---
-apiVersion: "autoscaling.k8s.io/v1beta2"
-kind: VerticalPodAutoscaler
-metadata:
-  name: back-end-vpa
-spec:
-  targetRef:
-    apiVersion: "apps/v1"
-    kind: Deployment
-    name: back-end
----
 apiVersion: apps/v1
 kind: Deployment
 metadata:
diff --git a/kubernetes/operationcode_python_backend/overlays/prod/ingress.yaml b/kubernetes/operationcode_python_backend/overlays/prod/ingress.yaml
@@ -11,6 +11,8 @@ metadata:
     alb.ingress.kubernetes.io/ssl-policy: ELBSecurityPolicy-TLS-1-2-2017-01
     alb.ingress.kubernetes.io/actions.ssl-redirect: '{"Type": "redirect", "RedirectConfig": { "Protocol": "HTTPS", "Port": "443", "StatusCode": "HTTP_301"}}'
     alb.ingress.kubernetes.io/actions.response-401: '{"Type":"fixed-response","FixedResponseConfig":{"ContentType":"text/plain","StatusCode":"401","MessageBody":"401 Not Authorized"}}'
+    # A YAML mapping keeps one value per key, so all ALB attributes must share this single comma-separated annotation.
+    alb.ingress.kubernetes.io/load-balancer-attributes: access_logs.s3.enabled=true,access_logs.s3.bucket=oc-alb-logs,access_logs.s3.prefix=oc-prod,routing.http2.enabled=true,idle_timeout.timeout_seconds=600
   labels:
     app: back-end
 spec:
diff --git a/kubernetes/operationcode_python_backend/overlays/staging/ingress.yaml b/kubernetes/operationcode_python_backend/overlays/staging/ingress.yaml
@@ -11,6 +11,8 @@ metadata:
     alb.ingress.kubernetes.io/ssl-policy: ELBSecurityPolicy-TLS-1-2-2017-01
     alb.ingress.kubernetes.io/actions.ssl-redirect: '{"Type": "redirect", "RedirectConfig": { "Protocol": "HTTPS", "Port": "443", "StatusCode": "HTTP_301"}}'
     alb.ingress.kubernetes.io/actions.response-401: '{"Type":"fixed-response","FixedResponseConfig":{"ContentType":"text/plain","StatusCode":"401","MessageBody":"401 Not Authorized"}}'
+    # A YAML mapping keeps one value per key, so all ALB attributes must share this single comma-separated annotation.
+    alb.ingress.kubernetes.io/load-balancer-attributes: access_logs.s3.enabled=true,access_logs.s3.bucket=oc-alb-logs,access_logs.s3.prefix=oc-staging,routing.http2.enabled=true,idle_timeout.timeout_seconds=600
   labels:
     app: back-end
 spec:
diff --git a/kubernetes/resources_api/base/deployment.yaml b/kubernetes/resources_api/base/deployment.yaml
@@ -1,14 +1,4 @@
 ---
-apiVersion: "autoscaling.k8s.io/v1beta2"
-kind: VerticalPodAutoscaler
-metadata:
-  name: resources-api-vpa
-spec:
-  targetRef:
-    apiVersion: "apps/v1"
-    kind: Deployment
-    name: resources-api
----
 apiVersion: apps/v1
 kind: Deployment
 metadata:
diff --git a/kubernetes/town_crier/base/deployment.yaml b/kubernetes/town_crier/base/deployment.yaml
@@ -1,14 +1,4 @@
 ---
-apiVersion: "autoscaling.k8s.io/v1beta2"
-kind: VerticalPodAutoscaler
-metadata:
-  name: town-crier-vpa
-spec:
-  targetRef:
-    apiVersion: "apps/v1"
-    kind: Deployment
-    name: town-crier
----
 apiVersion: extensions/v1beta1
 kind: Deployment
 metadata: