From 90729ee8c16f99c8bb2996806ffb07f997765de0 Mon Sep 17 00:00:00 2001
From: Junyi Hou
Date: Mon, 25 May 2026 07:52:58 +0800
Subject: [PATCH] fix: cloudflared probe loop causing constant restarts
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The livenessProbe had failureThreshold:1, so a single non-200 from
/ready (which depends on an active tunnel connection) was enough for
kubelet to kill the pod. Cloudflare rotates edge connections, so this
fired on every rotation — observed ~21 restarts/hour over 41 days.

- Drop livenessProbe; cloudflared self-heals reconnects.
- Add startupProbe (failureThreshold:30) for first-connect tolerance.
- Add readinessProbe for rolling-deploy gating.
- Pin image to 2026.5.0 (was :latest, no rollback path).
- Switch --loglevel from debug to info.
- Add resource requests/limits.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 helm-chart/templates/cloudflare.yaml | 31 ++++++++++++++++++++++++-------
 1 file changed, 24 insertions(+), 7 deletions(-)

diff --git a/helm-chart/templates/cloudflare.yaml b/helm-chart/templates/cloudflare.yaml
index 25371b3c..cea0ab7e 100644
--- a/helm-chart/templates/cloudflare.yaml
+++ b/helm-chart/templates/cloudflare.yaml
@@ -33,7 +33,7 @@ spec:
           - name: net.ipv4.ping_group_range
             value: "65532 65532"
       containers:
-        - image: cloudflare/cloudflared:latest
+        - image: cloudflare/cloudflared:2026.5.0
           name: cloudflared
           env:
             # Defines an environment variable for the tunnel token.
@@ -50,16 +50,33 @@ spec:
             - --protocol
             - http2
             - --loglevel
-            - debug
+            - info
             - --metrics
             - 0.0.0.0:2000
             - run
-          livenessProbe:
+          resources:
+            requests:
+              cpu: 50m
+              memory: 64Mi
+            limits:
+              cpu: 500m
+              memory: 256Mi
+          # cloudflared's /ready returns 200 only when at least one tunnel
+          # connection is active. Cloudflare rotates edge connections
+          # periodically, so transient /ready failures are expected and
+          # cloudflared reconnects on its own. We therefore drop livenessProbe
+          # (which was killing healthy pods on every reconnect) and rely on
+          # startupProbe for first-connect tolerance plus readinessProbe for
+          # rolling-deploy gating.
+          startupProbe:
             httpGet:
-              # Cloudflared has a /ready endpoint which returns 200 if and only if
-              # it has an active connection to Cloudflare's network.
               path: /ready
               port: 2000
-            failureThreshold: 1
-            initialDelaySeconds: 10
             periodSeconds: 10
+            failureThreshold: 30
+          readinessProbe:
+            httpGet:
+              path: /ready
+              port: 2000
+            periodSeconds: 10
+            failureThreshold: 3