Skip to content

Commit 33929da

Browse files
committed
fix(supervisor): scrape timeout + schema-level pod-count hysteresis guard
1 parent 0160156 commit 33929da

4 files changed

Lines changed: 40 additions & 10 deletions

File tree

apps/supervisor/src/clients/kubernetes.ts

Lines changed: 6 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -60,7 +60,7 @@ export { k8s };
6060
* One lightweight aggregate read - not a pod listing. Requires the service
6161
* account to be granted GET on the /metrics non-resource URL.
6262
*/
63-
export function createApiserverMetricsFetcher(): () => Promise<string> {
63+
export function createApiserverMetricsFetcher(timeoutMs: number): () => Promise<string> {
6464
const kubeConfig = getKubeConfig();
6565

6666
return async () => {
@@ -98,6 +98,11 @@ export function createApiserverMetricsFetcher(): () => Promise<string> {
9898
}
9999
});
100100
});
101+
// Without this a hung connect/TLS/read never settles, and the monitor's
102+
// refreshInFlight guard would freeze the source (silent fail-open).
103+
req.setTimeout(timeoutMs, () => {
104+
req.destroy(new Error(`apiserver /metrics scrape timed out after ${timeoutMs}ms`));
105+
});
101106
req.on("error", reject);
102107
req.end();
103108
});

apps/supervisor/src/env.test.ts

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -50,4 +50,15 @@ describe("Env superRefine - backpressure source awareness", () => {
5050
})
5151
).not.toThrow();
5252
});
53+
54+
it("rejects pod-count release >= engage when the source is enabled", () => {
55+
expect(() =>
56+
Env.parse({
57+
...base,
58+
TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENABLED: "true",
59+
TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE: "100",
60+
TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE: "100",
61+
})
62+
).toThrow();
63+
});
5364
});

apps/supervisor/src/env.ts

Lines changed: 19 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -78,6 +78,13 @@ export const Env = z
7878
TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE: z.coerce.number().int().positive().default(10_000),
7979
TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE: z.coerce.number().int().positive().default(5_000),
8080
TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_REFRESH_MS: z.coerce.number().int().positive().default(5_000),
81+
// Hard timeout on the apiserver /metrics scrape. A hung request would otherwise
82+
// never settle and freeze the monitor's refresh loop (fail-open silently).
83+
TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_SCRAPE_TIMEOUT_MS: z.coerce
84+
.number()
85+
.int()
86+
.positive()
87+
.default(10_000),
8188

8289
// Optional services
8390
TRIGGER_WARM_START_URL: z.string().optional(),
@@ -317,6 +324,18 @@ export const Env = z
317324
TRIGGER_WIDE_EVENTS_NOISY_ROUTES: BoolEnv.default(false),
318325
})
319326
.superRefine((data, ctx) => {
327+
if (
328+
data.TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENABLED &&
329+
data.TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE >=
330+
data.TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE
331+
) {
332+
ctx.addIssue({
333+
code: z.ZodIssueCode.custom,
334+
message:
335+
"TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE must be less than TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE",
336+
path: ["TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE"],
337+
});
338+
}
320339
if (data.COMPUTE_SNAPSHOTS_ENABLED && !data.TRIGGER_METADATA_URL) {
321340
ctx.addIssue({
322341
code: z.ZodIssueCode.custom,

apps/supervisor/src/index.ts

Lines changed: 4 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -252,14 +252,7 @@ class ManagedSupervisor {
252252
// Pod-count source (in-process apiserver scrape). Namespaced metrics so the
253253
// redis source's metric names are preserved.
254254
if (env.TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENABLED) {
255-
if (
256-
env.TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE >=
257-
env.TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE
258-
) {
259-
throw new Error(
260-
"TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE must be less than TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE"
261-
);
262-
}
255+
// RELEASE < ENGAGE is enforced in env.ts (superRefine), so it's valid here.
263256
const podCountGauge = new Gauge({
264257
name: "supervisor_cluster_pod_count",
265258
help: "Total pod objects stored in the cluster, scraped for backpressure",
@@ -269,7 +262,9 @@ class ManagedSupervisor {
269262
new BackpressureMonitor({
270263
enabled: true,
271264
source: new K8sPodCountSignalSource({
272-
fetchMetrics: createApiserverMetricsFetcher(),
265+
fetchMetrics: createApiserverMetricsFetcher(
266+
env.TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_SCRAPE_TIMEOUT_MS
267+
),
273268
engageThreshold: env.TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_ENGAGE,
274269
releaseThreshold: env.TRIGGER_DEQUEUE_BACKPRESSURE_POD_COUNT_RELEASE,
275270
reportPodCount: (count) => podCountGauge.set(count),

0 commit comments

Comments
 (0)