#!/usr/bin/env bash
# k8s-scripts/rebalance-drained-node-pods.sh
#
# Run only from the adminhost container, i.e. `source bin/offline-env.sh` and
# then run this script inside `d bash`.
#
# Helps rebalance workloads after a node has been drained, restarted and
# uncordoned: pods that were created after the drain and are running on OTHER
# nodes are deleted one at a time so that Kubernetes can reschedule them.
#
# Before draining, export:
#   DRAIN_NODE=<node>
#   DRAIN_TIME="$(date -u +%Y-%m-%dT%H:%M:%SZ)"
# If the exact time is unavailable, set DRAIN_AGO_SECONDS=<seconds> instead.
#
# Run the script after the node has been uncordoned. It shows the current pod
# distribution and the candidate pods, and runs in DRY_RUN mode by default.
# Set DRY_RUN=false to actually delete candidate pods one at a time, waiting
# for their controller to become healthy before proceeding.
# Set INCLUDE_SYSTEM=true to include system namespaces (excluded by default).
#
# Note: deleting pods does not guarantee they return to DRAIN_NODE; the
# Kubernetes scheduler decides placement based on resources and constraints.
set -euo pipefail

# NOTE(review): the original text of usage() and of the environment parsing
# that followed it could not be recovered from the reviewed copy of this
# script. The usage text, the defaults and resolve_drain_time() below are
# reconstructed from the header comments and from the variables the rest of
# the script reads (DRAIN_NODE, DRAIN_TIME, AGE_SECONDS, DRY_RUN,
# INCLUDE_SYSTEM). Confirm against the original before merging.
usage() {
  cat <<'EOF'
Usage:
  DRAIN_NODE=<node> DRAIN_TIME=<YYYY-MM-DDTHH:MM:SSZ> rebalance-drained-node-pods.sh
  DRAIN_NODE=<node> DRAIN_AGO_SECONDS=<seconds>       rebalance-drained-node-pods.sh

Environment:
  DRAIN_NODE         Node that was drained and uncordoned (required).
  DRAIN_TIME         UTC time the drain started, e.g. 2024-01-31T12:00:00Z.
  DRAIN_AGO_SECONDS  Alternative to DRAIN_TIME: seconds since the drain.
  DRY_RUN            true (default) only lists candidates; false deletes them.
  INCLUDE_SYSTEM     true also considers system namespaces (default false).
EOF
}

# Print an error to stderr and stop the script.
die() {
  printf 'ERROR: %s\n' "$*" >&2
  exit 1
}

# Fill in DRAIN_TIME (from DRAIN_AGO_SECONDS when needed) and AGE_SECONDS.
# Globals: DRAIN_TIME (read/written), DRAIN_AGO_SECONDS (read),
#          AGE_SECONDS (written)
resolve_drain_time() {
  local now drain_epoch
  local -r ts_re='^[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}Z$'
  now="$(date -u +%s)"

  if [[ -z "$DRAIN_TIME" ]]; then
    if [[ ! "$DRAIN_AGO_SECONDS" =~ ^[0-9]+$ ]]; then
      usage >&2
      die "set DRAIN_TIME or a numeric DRAIN_AGO_SECONDS."
    fi
    # 10# keeps values with leading zeros from being parsed as octal.
    # NOTE(review): `date -d` is GNU date syntax; assumed available in the
    # adminhost container.
    DRAIN_TIME="$(date -u -d "@$((now - 10#$DRAIN_AGO_SECONDS))" +%Y-%m-%dT%H:%M:%SZ)"
  fi

  # The candidate filter compares this value to creationTimestamp as a plain
  # string, so it must have exactly the layout Kubernetes uses.
  [[ "$DRAIN_TIME" =~ $ts_re ]] \
    || die "DRAIN_TIME must look like 2024-01-31T12:00:00Z (UTC), got: $DRAIN_TIME"

  drain_epoch="$(date -u -d "$DRAIN_TIME" +%s)" \
    || die "cannot parse DRAIN_TIME: $DRAIN_TIME"
  AGE_SECONDS=$((now - drain_epoch))
  if ((AGE_SECONDS < 0)); then
    die "DRAIN_TIME $DRAIN_TIME is in the future."
  fi
}

# Print "<node>: <pod count>" for every node that has scheduled pods.
print_pod_count_per_node() {
  kubectl get pods -A -o json \
    | jq -r '.items[] | select(.spec.nodeName != null) | .spec.nodeName' \
    | sort | uniq -c | awk '{print $2 ": " $1}'
}

# Print the number of pods currently bound to node $1.
count_pods_on_node() {
  kubectl get pods -A --field-selector "spec.nodeName=$1" --no-headers 2>/dev/null | wc -l
}

# Print candidate pods as TSV:
#   namespace, pod, node, creationTimestamp, owner kind, owner name
# Globals: DRAIN_NODE, DRAIN_TIME, INCLUDE_SYSTEM (read)
list_candidates() {
  local filter
  # Pods without a node (not scheduled yet) are skipped: there is nothing to
  # move, and their empty node column would be swallowed by the tab-separated
  # `read` in main(), shifting every later field. Pods that are already
  # terminating are skipped as well.
  filter='
    .items[]
    | select(.spec.nodeName != null)
    | select(.spec.nodeName != $NODE)
    | select(.metadata.deletionTimestamp == null)
    | select(.metadata.creationTimestamp >= $DRAIN_TIME)
    | select(.metadata.ownerReferences != null)
    | select(.metadata.ownerReferences[0].kind != "DaemonSet")
    | select(.metadata.ownerReferences[0].kind != "Job")
    | select(.metadata.ownerReferences[0].kind != "CronJob")
  '

  if [[ "$INCLUDE_SYSTEM" != "true" ]]; then
    filter+='
    | select(.metadata.namespace != "kube-system")
    | select(.metadata.namespace != "kube-public")
    | select(.metadata.namespace != "kube-node-lease")
    | select(.metadata.namespace != "ingress-nginx")
    | select(.metadata.namespace != "monitoring")
    '
  fi

  filter+='
    | [
        .metadata.namespace,
        .metadata.name,
        .spec.nodeName,
        .metadata.creationTimestamp,
        .metadata.ownerReferences[0].kind,
        .metadata.ownerReferences[0].name
      ]
    | @tsv
  '

  kubectl get pods -A -o json | jq -r \
    --arg NODE "$DRAIN_NODE" \
    --arg DRAIN_TIME "$DRAIN_TIME" \
    "$filter"
}

# Wait until the controller that owns a just-deleted pod is healthy again.
# Arguments: $1 namespace, $2 owner kind, $3 owner name
# Exits the script when a rollout does not complete within the timeout, so no
# further pods are deleted while a workload is degraded.
wait_for_owner() {
  local ns=$1 owner_kind=$2 owner_name=$3
  local deploy

  case "$owner_kind" in
    ReplicaSet)
      deploy="$(kubectl get rs -n "$ns" "$owner_name" -o jsonpath='{.metadata.ownerReferences[?(@.kind=="Deployment")].name}' 2>/dev/null || true)"
      if [[ -n "$deploy" ]]; then
        echo "Waiting for deployment rollout: $ns/$deploy"
        kubectl rollout status deployment/"$deploy" -n "$ns" --timeout=300s \
          || die "deployment $ns/$deploy did not become healthy; stopping before deleting more pods."
      else
        echo "Waiting for ReplicaSet-owned replacement pods to become Ready..."
        sleep 20
      fi
      ;;
    StatefulSet)
      echo "Waiting for StatefulSet rollout: $ns/$owner_name"
      kubectl rollout status statefulset/"$owner_name" -n "$ns" --timeout=300s \
        || die "statefulset $ns/$owner_name did not become healthy; stopping before deleting more pods."
      ;;
    *)
      echo "Waiting briefly for replacement..."
      sleep 20
      ;;
  esac
}

# Report one candidate and, unless DRY_RUN is true, delete it and wait for its
# controller.
# Arguments: namespace, pod, current node, creation time, owner kind, owner name
rebalance_candidate() {
  local ns=$1 pod=$2 current_node=$3 created=$4 owner_kind=$5 owner_name=$6
  local before_count after_count

  echo
  echo "Candidate: $ns/$pod"
  echo "Current node: $current_node"
  echo "Created: $created"
  echo "Owner: $owner_kind/$owner_name"

  if [[ "$DRY_RUN" == "true" ]]; then
    echo "DRY_RUN=true, not deleting."
    return 0
  fi

  before_count="$(kubectl get pods -n "$ns" --no-headers 2>/dev/null | wc -l)"

  # The candidate list is a snapshot taken before a potentially long run, so
  # a pod may already be gone; that must not abort the remaining work.
  kubectl delete pod -n "$ns" "$pod" --ignore-not-found

  wait_for_owner "$ns" "$owner_kind" "$owner_name"

  after_count="$(kubectl get pods -n "$ns" --no-headers 2>/dev/null | wc -l)"
  echo "Namespace pod count before/after: $before_count/$after_count"

  echo "Pods now running on $DRAIN_NODE:"
  count_pods_on_node "$DRAIN_NODE"
}

main() {
  case "${1:-}" in
    -h | --help)
      usage
      exit 0
      ;;
  esac

  command -v kubectl >/dev/null || die "kubectl not found in PATH."
  command -v jq >/dev/null || die "jq not found in PATH."

  DRAIN_NODE="${DRAIN_NODE:-}"
  DRAIN_TIME="${DRAIN_TIME:-}"
  DRAIN_AGO_SECONDS="${DRAIN_AGO_SECONDS:-}"
  DRY_RUN="${DRY_RUN:-true}"
  INCLUDE_SYSTEM="${INCLUDE_SYSTEM:-false}"

  # Everything below treats any DRY_RUN value other than the exact string
  # "true" as permission to delete, so a typo such as DRY_RUN=True must be
  # rejected instead of silently enabling deletion.
  DRY_RUN="${DRY_RUN,,}"
  INCLUDE_SYSTEM="${INCLUDE_SYSTEM,,}"
  [[ "$DRY_RUN" == "true" || "$DRY_RUN" == "false" ]] \
    || die "DRY_RUN must be true or false, got: $DRY_RUN"
  [[ "$INCLUDE_SYSTEM" == "true" || "$INCLUDE_SYSTEM" == "false" ]] \
    || die "INCLUDE_SYSTEM must be true or false, got: $INCLUDE_SYSTEM"

  if [[ -z "$DRAIN_NODE" ]]; then
    usage >&2
    die "DRAIN_NODE is required."
  fi

  resolve_drain_time

  if ! kubectl get node "$DRAIN_NODE" >/dev/null 2>&1; then
    die "node $DRAIN_NODE does not exist."
  fi

  local unschedulable
  unschedulable="$(kubectl get node "$DRAIN_NODE" -o jsonpath='{.spec.unschedulable}' 2>/dev/null || true)"

  echo "DRAIN_NODE: $DRAIN_NODE"
  echo "DRAIN_TIME: $DRAIN_TIME"
  echo "Time passed since drain: ${AGE_SECONDS}s"
  echo "DRY_RUN: $DRY_RUN"
  echo

  if [[ "$unschedulable" == "true" ]]; then
    echo "Run: kubectl uncordon $DRAIN_NODE" >&2
    die "$DRAIN_NODE is still cordoned/unschedulable."
  fi

  echo "Current pod count per node:"
  print_pod_count_per_node

  echo
  echo "Current pods on $DRAIN_NODE:"
  count_pods_on_node "$DRAIN_NODE"
  echo

  echo "Recent events mentioning $DRAIN_NODE:"
  # Best effort only: -F matches the node name literally (dots in a hostname
  # are not wildcards); `|| true` because finding no events is not an error.
  kubectl get events -A --sort-by=.lastTimestamp 2>/dev/null \
    | grep -iF -- "$DRAIN_NODE" | tail -20 || true
  echo

  local candidates
  candidates="$(list_candidates)"

  if [[ -z "$candidates" ]]; then
    echo "No candidate pods found."
    exit 0
  fi

  echo "Candidate pods:"
  echo "$candidates"
  echo

  # A dry run deletes nothing, so it needs no confirmation.
  if [[ "$DRY_RUN" != "true" ]]; then
    local confirm=""
    echo "This can disrupt services. Kubernetes may or may not reschedule replacements onto $DRAIN_NODE."
    read -r -p "Type yes to continue: " confirm || confirm=""
    if [[ "$confirm" != "yes" ]]; then
      echo "Aborted."
      exit 0
    fi
  fi

  # The list is read on fd 3 so commands in the loop body keep the script's
  # own stdin and cannot consume candidate lines.
  local ns pod current_node created owner_kind owner_name
  while IFS=$'\t' read -r -u 3 ns pod current_node created owner_kind owner_name; do
    [[ -n "$ns" ]] || continue
    rebalance_candidate "$ns" "$pod" "$current_node" "$created" "$owner_kind" "$owner_name"
  done 3<<<"$candidates"

  echo "Updated pod count per node post rebalancing:"
  print_pod_count_per_node
}

main "$@"