From 52a3798afc8389f90dc3653eb80eac4aad191e49 Mon Sep 17 00:00:00 2001 From: Shirly Radco Date: Thu, 2 Jul 2026 12:50:24 +0300 Subject: [PATCH] fix: prefer KUBECONFIG over in-cluster config and fix e2e test framework for CI The server previously used rest.InClusterConfig() exclusively, which connects to the build cluster in CI rather than the provisioned test cluster where PrometheusRule CRDs exist. This caused the management API routes to fail to register, resulting in 404 errors during e2e tests. Introduce loadKubeConfig() that prefers KUBECONFIG when set (CI/local dev) and falls back to in-cluster config (production). Additionally fix the e2e test framework and server for CI: - Pass the long-lived server context to the k8s client so informers keep running beyond init. The previous timeout context killed informers when createHTTPServer returned, freezing the relabeled-rules cache. - Create a ServiceAccount token via the TokenRequest API when the kubeconfig uses client-certificate auth (no bearer token available). This is the case in CI where kubeadmin certs are used. - Use unique PromQL expressions per test rule so the spec-equivalence check does not reject rules across tests as duplicates. Signed-off-by: Shirly Radco Co-authored-by: AI Assistant Co-authored-by: Cursor --- pkg/server.go | 44 +++++++++++------ test/e2e/create_alert_rule_test.go | 7 +-- test/e2e/delete_alert_rule_test.go | 5 ++ test/e2e/framework/framework.go | 78 +++++++++++++++++++++++++++++- test/e2e/helpers_test.go | 9 +++- 5 files changed, 124 insertions(+), 19 deletions(-) diff --git a/pkg/server.go b/pkg/server.go index 323c83656..cf2c5fead 100644 --- a/pkg/server.go +++ b/pkg/server.go @@ -19,6 +19,7 @@ import ( "k8s.io/client-go/dynamic" "k8s.io/client-go/kubernetes/scheme" "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" "k8s.io/client-go/tools/record" "github.com/openshift/monitoring-plugin/internal/managementrouter" @@ -127,18 +128,11 @@ func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) { var k8sconfig *rest.Config var err error - // Uncomment the following line for local development: - // k8sconfig, err = clientcmd.BuildConfigFromFlags("", os.Getenv("KUBECONFIG")) - // if err != nil { - // return nil, fmt.Errorf("cannot get kubeconfig from file: %w", err) - // } - - // Comment the following line for local development: var k8sclient *dynamic.DynamicClient if acmMode || alertManagementAPIMode { - k8sconfig, err = rest.InClusterConfig() + k8sconfig, err = loadKubeConfig() if err != nil { - return nil, fmt.Errorf("cannot get in cluster config: %w", err) + return nil, fmt.Errorf("cannot load kubernetes config: %w", err) } k8sclient, err = dynamic.NewForConfig(k8sconfig) @@ -154,15 +148,20 @@ func createHTTPServer(ctx context.Context, cfg *Config) (*http.Server, error) { // hang the entire server startup indefinitely. var managementClient management.Client if alertManagementAPIMode { - const initTimeout = 30 * time.Second - initCtx, initCancel := context.WithTimeout(ctx, initTimeout) - defer initCancel() - - k8sClient, err := k8s.NewClient(initCtx, k8sconfig) + // The k8s client must receive the long-lived ctx so that its + // informers keep running for the lifetime of the server. + // Do NOT pass a timeout-scoped context here: informers that + // are started with a cancelled context stop watching and the + // relabeled-rules cache freezes. + k8sClient, err := k8s.NewClient(ctx, k8sconfig) if err != nil { return nil, fmt.Errorf("failed to create k8s client for alert management API: %w", err) } + const initTimeout = 30 * time.Second + initCtx, initCancel := context.WithTimeout(ctx, initTimeout) + defer initCancel() + if err := k8sClient.TestConnection(initCtx); err != nil { return nil, fmt.Errorf("failed to connect to kubernetes cluster for alert management API: %w", err) } @@ -402,6 +401,23 @@ func configHandler(cfg *Config) (http.HandlerFunc, *PluginConfig) { }), &pluginConfig } +// loadKubeConfig returns a *rest.Config by preferring KUBECONFIG (useful for +// local development and CI) and falling back to in-cluster service-account config. +func loadKubeConfig() (*rest.Config, error) { + if kubeconfig := os.Getenv("KUBECONFIG"); kubeconfig != "" { + cfg, err := clientcmd.BuildConfigFromFlags("", kubeconfig) + if err != nil { + return nil, fmt.Errorf("cannot build config from KUBECONFIG: %w", err) + } + return cfg, nil + } + cfg, err := rest.InClusterConfig() + if err != nil { + return nil, fmt.Errorf("cannot get in-cluster config: %w", err) + } + return cfg, nil +} + func startProxy(cfg *Config, k8sclient *dynamic.DynamicClient, tlsConfig *tls.Config, timeout time.Duration, kind proxy.KindType, port proxy.ProxyPort) { proxyRouter := setupProxyRoutes(cfg, k8sclient, kind) proxyRouter.Use(corsHeaderMiddleware()) diff --git a/test/e2e/create_alert_rule_test.go b/test/e2e/create_alert_rule_test.go index cc4a3656b..f0ea56ca5 100644 --- a/test/e2e/create_alert_rule_test.go +++ b/test/e2e/create_alert_rule_test.go @@ -32,10 +32,11 @@ func TestCreateUserDefinedAlertRule(t *testing.T) { } defer cleanup() + createExpr := "vector(1) or vector(0)" payload := managementrouter.CreateAlertRuleRequest{ AlertingRule: &managementrouter.AlertRuleSpec{ Alert: strPtr("E2ECreateAlert"), - Expr: strPtr("vector(1)"), + Expr: &createExpr, For: strPtr("1m"), Labels: &map[string]string{ "severity": "info", @@ -102,8 +103,8 @@ func TestCreateUserDefinedAlertRule(t *testing.T) { for _, rule := range group.Rules { if rule.Alert == "E2ECreateAlert" { foundAlert = true - if rule.Expr.String() != "vector(1)" { - t.Errorf("Expected expr 'vector(1)', got %q", rule.Expr.String()) + if rule.Expr.String() != createExpr { + t.Errorf("Expected expr %q, got %q", createExpr, rule.Expr.String()) } if rule.For == nil || string(*rule.For) != "1m" { t.Errorf("Expected for '1m', got %v", rule.For) diff --git a/test/e2e/delete_alert_rule_test.go b/test/e2e/delete_alert_rule_test.go index 4a01af43d..771be223f 100644 --- a/test/e2e/delete_alert_rule_test.go +++ b/test/e2e/delete_alert_rule_test.go @@ -8,6 +8,7 @@ import ( "io" "net/http" "testing" + "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" @@ -42,6 +43,10 @@ func TestDeleteAlertRule(t *testing.T) { t.Logf("Created 3 rules with IDs: %v", ruleIDs) + // Allow time for the informer watch event to propagate and + // the relabeled-rules cache to sync the new PrometheusRule. + time.Sleep(2 * time.Second) + deleteReq := managementrouter.BulkDeleteAlertRulesRequest{ RuleIds: []string{ruleIDs[0], ruleIDs[1]}, } diff --git a/test/e2e/framework/framework.go b/test/e2e/framework/framework.go index ce768f4bb..ac2dcd7b2 100644 --- a/test/e2e/framework/framework.go +++ b/test/e2e/framework/framework.go @@ -12,9 +12,13 @@ import ( osmv1client "github.com/openshift/client-go/monitoring/clientset/versioned" monitoringv1client "github.com/prometheus-operator/prometheus-operator/pkg/client/versioned" + authv1 "k8s.io/api/authentication/v1" corev1 "k8s.io/api/core/v1" + rbacv1 "k8s.io/api/rbac/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" "k8s.io/client-go/tools/clientcmd" "github.com/openshift/monitoring-plugin/pkg/k8s" @@ -73,12 +77,17 @@ func New() (*Framework, error) { return nil, fmt.Errorf("failed to create osmv1 clientset: %w", err) } + bearerToken, err := resolveToken(clientset, config) + if err != nil { + return nil, fmt.Errorf("failed to resolve bearer token: %w", err) + } + f = &Framework{ Clientset: clientset, Monitoringv1clientset: monitoringv1clientset, Osmv1clientset: osmv1clientset, PluginURL: pluginURL, - BearerToken: config.BearerToken, + BearerToken: bearerToken, } return f, nil @@ -135,3 +144,70 @@ func (f *Framework) AuthorizedRequest(ctx context.Context, method, url string, b } return req, nil } + +// resolveToken returns a bearer token for authenticating HTTP requests to the +// plugin server. It tries, in order: inline BearerToken, BearerTokenFile, and +// finally creates a short-lived ServiceAccount token (for CI kubeconfigs that +// use client-certificate auth). +func resolveToken(clientset *kubernetes.Clientset, config *rest.Config) (string, error) { + if config.BearerToken != "" { + return config.BearerToken, nil + } + if config.BearerTokenFile != "" { + data, err := os.ReadFile(config.BearerTokenFile) + if err != nil { + return "", fmt.Errorf("reading bearer token file: %w", err) + } + token := strings.TrimSpace(string(data)) + if token != "" { + return token, nil + } + } + return createServiceAccountToken(clientset) +} + +// createServiceAccountToken creates a ServiceAccount with cluster-admin +// privileges and returns a short-lived bearer token for it. This covers CI +// environments where the kubeconfig authenticates via client certificates +// and no bearer token is available. +func createServiceAccountToken(clientset *kubernetes.Clientset) (string, error) { + ctx := context.Background() + const saName = "e2e-management-api" + const ns = "default" + + sa := &corev1.ServiceAccount{ + ObjectMeta: metav1.ObjectMeta{Name: saName}, + } + if _, err := clientset.CoreV1().ServiceAccounts(ns).Create(ctx, sa, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + return "", fmt.Errorf("creating service account: %w", err) + } + + crb := &rbacv1.ClusterRoleBinding{ + ObjectMeta: metav1.ObjectMeta{Name: saName}, + Subjects: []rbacv1.Subject{{ + Kind: rbacv1.ServiceAccountKind, + Name: saName, + Namespace: ns, + }}, + RoleRef: rbacv1.RoleRef{ + APIGroup: rbacv1.GroupName, + Kind: "ClusterRole", + Name: "cluster-admin", + }, + } + if _, err := clientset.RbacV1().ClusterRoleBindings().Create(ctx, crb, metav1.CreateOptions{}); err != nil && !apierrors.IsAlreadyExists(err) { + return "", fmt.Errorf("creating cluster role binding: %w", err) + } + + expSeconds := int64(3600) + treq := &authv1.TokenRequest{ + Spec: authv1.TokenRequestSpec{ + ExpirationSeconds: &expSeconds, + }, + } + resp, err := clientset.CoreV1().ServiceAccounts(ns).CreateToken(ctx, saName, treq, metav1.CreateOptions{}) + if err != nil { + return "", fmt.Errorf("requesting service account token: %w", err) + } + return resp.Status.Token, nil +} diff --git a/test/e2e/helpers_test.go b/test/e2e/helpers_test.go index 8071266fe..46f0b6063 100644 --- a/test/e2e/helpers_test.go +++ b/test/e2e/helpers_test.go @@ -4,6 +4,7 @@ import ( "bytes" "context" "encoding/json" + "fmt" "io" "net/http" "testing" @@ -17,10 +18,16 @@ func strPtr(s string) *string { return &s } func createRuleViaAPI(t *testing.T, f *framework.Framework, ctx context.Context, namespace, alertName, prName string) string { t.Helper() + // Each rule needs a unique expression so the spec-equivalence check + // does not reject it as a duplicate of a rule from another test. + // absent() returns 1 when the selector matches nothing, which is + // always the case for a fabricated metric name. + expr := fmt.Sprintf("absent(nonexistent{e2e_rule=%q})", alertName) + payload := managementrouter.CreateAlertRuleRequest{ AlertingRule: &managementrouter.AlertRuleSpec{ Alert: &alertName, - Expr: strPtr("vector(1)"), + Expr: &expr, For: strPtr("1m"), Labels: &map[string]string{ "severity": "info",