diff --git a/.github/workflows/render.yml b/.github/workflows/render.yml
new file mode 100644
index 0000000..501c6cd
--- /dev/null
+++ b/.github/workflows/render.yml
@@ -0,0 +1,27 @@
+# CI smoke test: lints the chart and renders it with default values on every push.
+name: Render Helm chart
+
+on:
+  push:
+
+jobs:
+  render:
+    runs-on: blacksmith-4vcpu-ubuntu-2404
+    permissions:
+      contents: read
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Install Helm
+        uses: azure/setup-helm@v4
+        with:
+          # Pinned so lint/render results are reproducible; bump deliberately.
+          version: v3.17.0
+
+      - name: Lint chart
+        run: helm lint .
+
+      - name: Render chart
+        run: helm template control .
diff --git a/Chart.yaml b/Chart.yaml
index 2bd9345..775b536 100644
--- a/Chart.yaml
+++ b/Chart.yaml
@@ -2,5 +2,5 @@ apiVersion: v2
 name: pgdog-control
 description: PgDog Control
 type: application
-version: 0.2.6
-appVersion: "v2026-05-28"
+version: 0.2.7
+appVersion: "438ab3d1"
diff --git a/README.md b/README.md
index 3dd726e..5ef8ead 100644
--- a/README.md
+++ b/README.md
@@ -496,7 +496,7 @@ control:
 
 ### Helm
 
-When the dashboard provisions a new PgDog cluster, it shells out to `helm upgrade --install` against a chart fetched from our Helm repository. `control.config.helm` controls which chart and which reposiory. The defaults point at the public `pgdogdev` chart on `helm.pgdog.dev`, which is what you want unless you mirror the chart internally.
+When the dashboard provisions a new PgDog cluster, it shells out to `helm upgrade --install` against a chart fetched from our Helm repository. `control.config.helm` controls which chart and which repository. The defaults point at the public `pgdogdev` chart on `helm.pgdog.dev`, which is what you want unless you mirror the chart internally.
 
 ```yaml
 control:
@@ -541,6 +541,33 @@ control:
 | `cloudwatch.lookback_secs` | How far back each fetch reaches. A fresh deploy pulls the full window on its first tick (int, default `3600`). |
 | `cloudwatch.period_secs` | CloudWatch aggregation period. The smallest bucket the metric API returns (int, default `60`). |
 
+### Alerting
+
+`control.config.alerts` enables outbound alert integrations. Leave `incident_io` unset to disable incident.io. Thresholds are optional; only metrics with a configured threshold create alerts.
+
+```yaml
+control:
+  config:
+    alerts:
+      evaluation_window_secs: 300
+      thresholds:
+        clients_waiting: 10
+        cpu: 90.0
+        memory: 2048
+        server_connections: 100
+      incident_io:
+        api_key: inc_live_xxx
+```
+
+| Option | Description |
+|-|-|
+| `evaluation_window_secs` | How long metrics must remain at or above threshold before creating an alert (int, default `300`). |
+| `thresholds.clients_waiting` | Number of clients waiting on a server connection (int, optional). |
+| `thresholds.cpu` | CPU usage percentage. Must be between `0.0` and `100.0`, inclusive (float, optional). |
+| `thresholds.memory` | Memory used, in megabytes (int, optional). |
+| `thresholds.server_connections` | Number of open server connections (int, optional). |
+| `incident_io.api_key` | incident.io API key with permission to create incidents. Missing `incident_io` disables the integration (string, optional). |
+
 ### State store
 
 `control.config.store` governs the in-memory metric store: how often it sweeps for stale data, when an instance is marked stale or evicted, and how long per-instance metric history is retained. The defaults are tight enough for an interactive dashboard; widen them if you keep the UI open against a cluster that's intentionally idle, or if you want a longer historical window in memory.
@@ -553,6 +580,7 @@ control:
       stale_after_secs: 5
       evict_after_secs: 60
       metrics_retention_secs: 300
+      query_history_limit: 1000
 ```
 
 | Option | Description |
@@ -561,6 +589,7 @@ control:
 | `stale_after_secs` | Instance is marked stale if its newest metric is older than this. The UI dims it but keeps it visible (int, default `5`). |
 | `evict_after_secs` | Instance is dropped from the store entirely if its newest metric is older than this (int, default `60`). |
 | `metrics_retention_secs` | How much per-instance metric history is kept in memory. Older points are dropped as new ones arrive (int, default `300`). |
+| `query_history_limit` | Per-token historical query store capacity. Oldest deduped query entries are evicted first once the limit is reached (int, default `1000`). |
 
 ### Redis persistence
 
diff --git a/templates/configmap.yaml b/templates/configmap.yaml
index fac8115..2566b4f 100644
--- a/templates/configmap.yaml
+++ b/templates/configmap.yaml
@@ -70,6 +70,10 @@ data:
     {{- with .metrics_retention_secs }}
     metrics_retention_secs = {{ . }}
     {{- end }}
+    {{- /* hasKey (not with) so an explicit 0 is still rendered; int64 keeps values of 1,000,000 or more out of scientific notation. */}}
+    {{- if hasKey . "query_history_limit" }}
+    query_history_limit = {{ .query_history_limit | int64 }}
+    {{- end }}
     {{- end }}
 
     {{- with $config.helm }}
@@ -81,6 +85,9 @@ data:
     {{- with .repo }}
     repo = {{ . | quote }}
     {{- end }}
+    {{- with .repo_url }}
+    repo_url = {{ . | quote }}
+    {{- end }}
     {{- end }}
 
     {{- if or $cookieSecret (gt (len $auth) 0) }}
@@ -129,6 +136,38 @@ data:
     {{- end }}
     {{- end }}
 
+    {{- /* Integer settings are piped through int64 so values of 1,000,000 or more are not rendered as floats such as 1e+06. */}}
+    {{- with $config.alerts }}
+
+    [alerts]
+    {{- if hasKey . "evaluation_window_secs" }}
+    evaluation_window_secs = {{ .evaluation_window_secs | int64 }}
+    {{- end }}
+    {{- with .thresholds }}
+
+    [alerts.thresholds]
+    {{- if hasKey . "clients_waiting" }}
+    clients_waiting = {{ .clients_waiting | int64 }}
+    {{- end }}
+    {{- if hasKey . "cpu" }}
+    cpu = {{ .cpu }}
+    {{- end }}
+    {{- if hasKey . "memory" }}
+    memory = {{ .memory | int64 }}
+    {{- end }}
+    {{- if hasKey . "server_connections" }}
+    server_connections = {{ .server_connections | int64 }}
+    {{- end }}
+    {{- end }}
+    {{- with .incident_io }}
+
+    [alerts.incident_io]
+    {{- if hasKey . "api_key" }}
+    api_key = {{ .api_key | quote }}
+    {{- end }}
+    {{- end }}
+    {{- end }}
+
     {{- $redis := $config.redis | default dict }}
     {{- $redisUrl := $redis.url | default (printf "redis://%s.%s.svc.cluster.local:6379" (include "pgdog-control.redis.fullname" .) .Release.Namespace) }}
 
diff --git a/values.yaml b/values.yaml
index 17b3c5d..c58cb93 100644
--- a/values.yaml
+++ b/values.yaml
@@ -86,9 +86,11 @@ control:
       # stale_after_secs: 5
       # evict_after_secs: 60
       # metrics_retention_secs: 300
+      # query_history_limit: 1000
     helm: {}
       # chart: pgdog
       # repo: pgdogdev
+      # repo_url: https://helm.pgdog.dev
     auth: {}
       # cookie_secret: "" # optional; random key generated at boot when absent
       # redirect_base_url: "" # e.g. https://control.example.com
@@ -103,6 +105,15 @@ control:
       #   client_id: ""
       #   client_secret: ""
       #   allowed_domains: []
+    alerts: {}
+      # evaluation_window_secs: 300
+      # thresholds:
+      #   clients_waiting: 10
+      #   cpu: 90.0
+      #   memory: 2048
+      #   server_connections: 100
+      # incident_io:
+      #   api_key: ""
     redis: {}
       # url: "" # optional; defaults to in-cluster redis
       # save_interval_secs: 60