diff --git a/.mk/development.mk b/.mk/development.mk
index 86c470443..b08c40a30 100644
--- a/.mk/development.mk
+++ b/.mk/development.mk
@@ -66,6 +66,17 @@ deploy-netflow-simulator: ## Deploy netflow simulator
 undeploy-netflow-simulator: ## Undeploy netflow simulator
 	kubectl --ignore-not-found=true delete -f contrib/kubernetes/deployment-netflow-simulator.yaml || true
 
+.PHONY: deploy-flp-informers
+deploy-flp-informers: ## Deploy flp-informers (centralized K8s cache pusher)
+	sed 's|%IMAGE_TAG_BASE%|$(IMAGE_TAG_BASE)|g;s|%VERSION%|$(VERSION)|g;s|%NAMESPACE%|$(NAMESPACE)|g' contrib/kubernetes/deployment-flp-informers.yaml > /tmp/deployment-flp-informers.yaml
+	kubectl apply -f /tmp/deployment-flp-informers.yaml -n $(NAMESPACE)
+	kubectl rollout status "deploy/flp-informers" --timeout=600s -n $(NAMESPACE)
+
+.PHONY: undeploy-flp-informers
+undeploy-flp-informers: ## Undeploy flp-informers
+	sed 's|%IMAGE_TAG_BASE%|$(IMAGE_TAG_BASE)|g;s|%VERSION%|$(VERSION)|g;s|%NAMESPACE%|$(NAMESPACE)|g' contrib/kubernetes/deployment-flp-informers.yaml > /tmp/deployment-flp-informers.yaml
+	kubectl --ignore-not-found=true delete -f /tmp/deployment-flp-informers.yaml -n $(NAMESPACE) || true
+
 ##@ kind
 
 .PHONY: create-kind-cluster
@@ -121,6 +132,32 @@ local-cleanup: prereqs-kind local-deployments-cleanup delete-kind-cluster ## Und
 .PHONY: local-redeploy
 local-redeploy: local-deployments-cleanup local-deployments-deploy ## Redeploy locally (on current kind)
 
+.PHONY: local-deployments-deploy-k8scache
+local-deployments-deploy-k8scache: prereqs-kind deploy-prometheus deploy-loki deploy-grafana build-image kind-load-image deploy-k8scache deploy-flp-informers deploy-netflow-simulator
+	kubectl get pods -n $(NAMESPACE)
+	kubectl rollout status -w deployment/flowlogs-pipeline -n $(NAMESPACE)
+	kubectl rollout status -w deployment/flp-informers -n $(NAMESPACE)
+	kubectl logs -l app=flowlogs-pipeline -n $(NAMESPACE)
+
+.PHONY: deploy-k8scache
+deploy-k8scache: ## Deploy FLP with k8scache server enabled
+	sed 's|%IMAGE_TAG_BASE%|$(IMAGE_TAG_BASE)|g;s|%VERSION%|$(VERSION)|g;s|%NAMESPACE%|$(NAMESPACE)|g' contrib/kubernetes/deployment-k8scache.yaml > /tmp/deployment-k8scache.yaml
+	kubectl create configmap flowlogs-pipeline-configuration --from-file=flowlogs-pipeline.conf.yaml=$(FLP_CONF_FILE) -n $(NAMESPACE)
+	kubectl apply -f /tmp/deployment-k8scache.yaml -n $(NAMESPACE)
+	kubectl rollout status "deploy/flowlogs-pipeline" --timeout=600s -n $(NAMESPACE)
+
+.PHONY: local-deploy-k8scache
+local-deploy-k8scache: prereqs-kind local-cleanup-k8scache create-kind-cluster local-deployments-deploy-k8scache ## Deploy locally on kind with k8scache and flp-informers
+
+.PHONY: local-deployments-cleanup-k8scache
+local-deployments-cleanup-k8scache: prereqs-kind undeploy-netflow-simulator undeploy undeploy-flp-informers undeploy-grafana undeploy-loki undeploy-prometheus
+
+.PHONY: local-cleanup-k8scache
+local-cleanup-k8scache: prereqs-kind local-deployments-cleanup-k8scache delete-kind-cluster ## Undeploy k8scache setup from local kind
+
+.PHONY: local-redeploy-k8scache
+local-redeploy-k8scache: local-deployments-cleanup-k8scache local-deployments-deploy-k8scache ## Redeploy locally with k8scache (on current kind)
+
 .PHONY: ocp-deploy
 ocp-deploy: ocp-cleanup deploy-prometheus deploy-loki deploy-grafana deploy ## Deploy to OCP
 	flowlogs_pipeline_svc_ip=$$(kubectl get svc flowlogs-pipeline -o jsonpath='{.spec.clusterIP}'); \
diff --git a/Makefile b/Makefile
index d1383da6f..8858f015d 100644
--- a/Makefile
+++ b/Makefile
@@ -30,6 +30,9 @@
 IMAGE_TAG_BASE ?= $(IMAGE_REGISTRY)/$(IMAGE_ORG)/flowlogs-pipeline
 # Image URL to use all building/pushing image targets
 IMAGE ?= $(IMAGE_TAG_BASE):$(VERSION)
+# Kubernetes namespace for deployments - defaults to current context namespace or "default"
+NAMESPACE ?= $(shell ns=$$(kubectl config view --minify --output 'jsonpath={..namespace}' 2>/dev/null); echo "$${ns:-default}")
+
 # Image building tool (docker / podman) - docker is preferred in CI
 OCI_BIN_PATH = $(shell which docker 2>/dev/null || which podman)
 OCI_BIN ?= $(shell basename ${OCI_BIN_PATH})
diff --git a/README.md b/README.md
index 69b712fc8..ec7ee64e4 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,8 @@ Flags:
       --health.address string      Health server address (default "0.0.0.0")
       --health.port int            Health server port (default: disable health server)
   -h, --help                       help for flowlogs-pipeline
+      --k8scache.address string    K8s cache sync server address (default "0.0.0.0")
+      --k8scache.port int          K8s cache sync server port (default: disabled)
       --log-level string           Log level: debug, info, warning, error (default "error")
       --metricsSettings string     json for global metrics settings
       --parameters string          json of config file parameters field
diff --git a/cmd/flowlogs-pipeline/main.go b/cmd/flowlogs-pipeline/main.go
index a6c571e78..59ba69e32 100644
--- a/cmd/flowlogs-pipeline/main.go
+++ b/cmd/flowlogs-pipeline/main.go
@@ -19,8 +19,11 @@ package main
 
 import (
 	"context"
+	"crypto/tls"
+	"crypto/x509"
 	"encoding/json"
 	"fmt"
+	"net"
 	"net/http"
 	"os"
 	"path/filepath"
@@ -33,12 +36,17 @@ import (
 	"github.com/netobserv/flowlogs-pipeline/pkg/config"
 	"github.com/netobserv/flowlogs-pipeline/pkg/operational"
 	"github.com/netobserv/flowlogs-pipeline/pkg/pipeline"
+	"github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes"
+	"github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/datasource"
+	"github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/k8scache"
 	"github.com/netobserv/flowlogs-pipeline/pkg/pipeline/utils"
 	"github.com/netobserv/flowlogs-pipeline/pkg/prometheus"
 	log "github.com/sirupsen/logrus"
 	"github.com/spf13/cobra"
 	"github.com/spf13/pflag"
 	"github.com/spf13/viper"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials"
 )
 
 var (
@@ -145,6 +153,12 @@
 	rootCmd.PersistentFlags().StringVar(&opts.Health.Address, "health.address", "0.0.0.0", "Health server address")
 	rootCmd.PersistentFlags().IntVar(&opts.Health.Port, "health.port", 0, "Health server port (default: disable health server) ")
 	rootCmd.PersistentFlags().IntVar(&opts.Profile.Port, "profile.port", 0, "Go pprof tool port (default: disabled)")
+	rootCmd.PersistentFlags().StringVar(&opts.K8sCacheServer.Address, "k8scache.address", "0.0.0.0", "K8s cache sync server address")
+	rootCmd.PersistentFlags().IntVar(&opts.K8sCacheServer.Port, "k8scache.port", 0, "K8s cache sync server port (default: disabled)")
+	rootCmd.PersistentFlags().BoolVar(&opts.K8sCacheServer.TLSEnabled, "k8scache.tls-enabled", false, "Enable TLS for K8s cache sync server")
+	rootCmd.PersistentFlags().StringVar(&opts.K8sCacheServer.TLSCertPath, "k8scache.tls-cert-path", "", "Path to TLS server certificate")
+	rootCmd.PersistentFlags().StringVar(&opts.K8sCacheServer.TLSKeyPath, "k8scache.tls-key-path", "", "Path to TLS server private key")
+	rootCmd.PersistentFlags().StringVar(&opts.K8sCacheServer.TLSCAPath, "k8scache.tls-ca-path", "", "Path to TLS CA certificate for client verification")
 	rootCmd.PersistentFlags().StringVar(&opts.PipeLine, "pipeline", "",
"json of config file pipeline field") rootCmd.PersistentFlags().StringVar(&opts.Parameters, "parameters", "", "json of config file parameters field") rootCmd.PersistentFlags().StringVar(&opts.DynamicParameters, "dynamicParameters", "", "json of configmap location for dynamic parameters") @@ -183,6 +197,11 @@ func run() { utils.SetupElegantExit() promServer := prometheus.InitializePrometheus(&cfg.MetricsSettings) + // Enable k8scache mode if configured (disables local informers to save resources) + if opts.K8sCacheServer.Port > 0 { + kubernetes.SetK8sCacheEnabled(true) + } + // Create new flows pipeline mainPipeline, err = pipeline.NewPipeline(&cfg) if err != nil { @@ -204,6 +223,12 @@ func run() { healthServer = operational.NewHealthServer(&opts, mainPipeline.IsAlive, mainPipeline.IsReady) } + // Start K8s cache server + var grpcServer *grpc.Server + if opts.K8sCacheServer.Port > 0 { + grpcServer = startK8sCacheServer(&opts.K8sCacheServer) + } + // Starts the flows pipeline mainPipeline.Run() @@ -213,9 +238,102 @@ func run() { if healthServer != nil { _ = healthServer.Shutdown(context.Background()) } + if grpcServer != nil { + log.Info("stopping K8s cache sync server") + grpcServer.GracefulStop() + } // Give all threads a chance to exit and then exit the process time.Sleep(time.Second) log.Debugf("exiting main run") os.Exit(0) } + +// startK8sCacheServer initializes and starts the gRPC server for K8s cache synchronization +// Returns nil if the datasource is not available (e.g., no kubernetes enrichment configured) +func startK8sCacheServer(cfg *config.K8sCacheServer) *grpc.Server { + // Check if kubernetes datasource is available + ds := kubernetes.GetDatasource() + if ds == nil { + log.Warn("K8s cache server requested but kubernetes datasource not initialized. 
" + + "Make sure kubernetes enrichment is configured in the pipeline.") + return nil + } + + // Attach a Kubernetes store so the cache server can apply received updates; enrichment will use it for lookups + ds.SetKubernetesStore(datasource.NewKubernetesStore()) + + // Create cache server + cacheServer := k8scache.NewKubernetesCacheServer(ds) + + // Create gRPC server with optional TLS + var grpcServer *grpc.Server + if cfg.TLSEnabled { + tlsConfig, err := createServerTLSConfig(cfg) + if err != nil { + log.WithError(err).Fatal("failed to configure TLS for K8s cache server") + return nil + } + grpcServer = grpc.NewServer(grpc.Creds(tlsConfig)) + log.Info("K8s cache server TLS enabled") + } else { + grpcServer = grpc.NewServer() + log.Warn("K8s cache server TLS disabled - connections are insecure (not recommended for production)") + } + k8scache.RegisterKubernetesCacheServiceServer(grpcServer, cacheServer) + + // Start listening + address := fmt.Sprintf("%s:%d", cfg.Address, cfg.Port) + listener, err := net.Listen("tcp", address) + if err != nil { + log.WithError(err).WithField("address", address).Fatal("failed to start K8s cache server") + return nil + } + + // Start server in background + go func() { + log.WithField("address", address).Info("starting K8s cache sync server") + if err := grpcServer.Serve(listener); err != nil { + log.WithError(err).Error("K8s cache sync server stopped with error") + } + }() + + return grpcServer +} + +// createServerTLSConfig creates TLS credentials for the gRPC server +func createServerTLSConfig(cfg *config.K8sCacheServer) (credentials.TransportCredentials, error) { + // Load server certificate and private key + if cfg.TLSCertPath == "" || cfg.TLSKeyPath == "" { + return nil, fmt.Errorf("TLS enabled but cert/key paths not provided") + } + + cert, err := tls.LoadX509KeyPair(cfg.TLSCertPath, cfg.TLSKeyPath) + if err != nil { + return nil, fmt.Errorf("failed to load server cert/key: %w", err) + } + + tlsConfig := &tls.Config{ + Certificates: []tls.Certificate{cert}, + ClientAuth: tls.NoClientCert, // Default: no client cert required + } + + // If CA is provided, require and verify client certificates + if cfg.TLSCAPath != "" { + caCert, err := os.ReadFile(cfg.TLSCAPath) + if err != nil { + return nil, fmt.Errorf("failed to read CA cert: %w", err) + } + + caCertPool := x509.NewCertPool() + if !caCertPool.AppendCertsFromPEM(caCert) { + return nil, fmt.Errorf("failed to append CA cert") + } + + tlsConfig.ClientCAs = caCertPool + tlsConfig.ClientAuth = tls.RequireAndVerifyClientCert + log.Info("K8s cache server: mutual TLS enabled (client certificates required)") + } + + return credentials.NewTLS(tlsConfig), nil +} diff --git a/cmd/flp-informers/main.go b/cmd/flp-informers/main.go new file mode 100644 index 000000000..92a744890 --- /dev/null +++ b/cmd/flp-informers/main.go @@ -0,0 +1,276 @@ +package main + +import ( + "context" + "fmt" + "os" + "os/signal" + "strings" + "syscall" + "time" + + "github.com/netobserv/flowlogs-pipeline/internal/informers" + "github.com/netobserv/flowlogs-pipeline/pkg/api" + "github.com/netobserv/flowlogs-pipeline/pkg/config" + "github.com/netobserv/flowlogs-pipeline/pkg/metrics" + "github.com/netobserv/flowlogs-pipeline/pkg/operational" + k8sinformers "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/informers" + "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/k8scache" + log "github.com/sirupsen/logrus" + "github.com/spf13/cobra" + "github.com/spf13/pflag" + "github.com/spf13/viper" +) 
+ +var ( + version = "dev" + commit = "unknown" + envPrefix = "FLP_INFORMERS" +) + +type options struct { + Kubeconfig string + LogLevel string + ProcessorSelector string // Label selector for FLP processors (e.g., "app=flowlogs-pipeline") + ProcessorPort int // Port where FLP processors listen for gRPC (k8scache.port) + ProcessorServiceName string // Headless service name for DNS-based discovery (optional, required for TLS) + ResyncInterval int // Interval in seconds to rediscover processors + // TLS configuration for gRPC client + TLSEnabled bool + TLSCertPath string + TLSKeyPath string + TLSCAPath string + TLSServerName string + InsecureSkipVerify bool + // Cache configuration + UpdateBufferSize int // Size of the update channel buffer + SendTimeoutSec int // Timeout in seconds for sending updates to processors + BatchSize int // Maximum number of entries to send in a single update + // High availability configuration + EnableLeaderElection bool // Enable leader election for HA + HealthPort int // Port for health check HTTP server + MetricsPort int // Port for Prometheus metrics HTTP server +} + +var opts = options{} + +var rootCmd = &cobra.Command{ + Use: "flp-informers", + Short: "Centralized Kubernetes informers that push cache updates to FLP processors", + Long: `flp-informers watches Kubernetes resources (Pods, Nodes, Services) and pushes +updates to distributed FlowLogs Pipeline (FLP) processor pods via gRPC. + +This reduces the load on the Kubernetes API server by having a single component +(or 1-2 replicas) query the API instead of N FLP processors.`, + Run: run, +} + +// initConfig reads environment variables that match the prefix +func initConfig() { + v := viper.New() + + // Read environment variables that match prefix + // Format: FLP_INFORMERS_ + // Example: FLP_INFORMERS_LOG_LEVEL, FLP_INFORMERS_PROCESSOR_PORT + v.SetEnvPrefix(envPrefix) + v.AutomaticEnv() + + bindFlags(rootCmd, v) + + // Initialize logger + initLogger() +} + +func initLogger() { + lvl, err := log.ParseLevel(opts.LogLevel) + if err != nil { + lvl = log.ErrorLevel + } + log.SetLevel(lvl) + log.SetFormatter(&log.TextFormatter{DisableColors: false, FullTimestamp: true, PadLevelText: true, DisableQuote: true}) +} + +// bindFlags applies environment variable overrides to flags +// This follows the same pattern as flowlogs-pipeline/main.go +func bindFlags(cmd *cobra.Command, v *viper.Viper) { + cmd.Flags().VisitAll(func(f *pflag.Flag) { + // Convert flag name to env var format (e.g., "log-level" -> "LOG_LEVEL") + if strings.Contains(f.Name, "-") { + envVarSuffix := strings.ToUpper(strings.ReplaceAll(f.Name, "-", "_")) + _ = v.BindEnv(f.Name, fmt.Sprintf("%s_%s", envPrefix, envVarSuffix)) + } + + // Apply the viper config value to the flag when the flag is not set and viper has a value + if !f.Changed && v.IsSet(f.Name) { + val := v.Get(f.Name) + _ = cmd.Flags().Set(f.Name, fmt.Sprintf("%v", val)) + } + }) +} + +func initFlags() { + cobra.OnInitialize(initConfig) + rootCmd.PersistentFlags().StringVar(&opts.Kubeconfig, "kubeconfig", "", "Path to kubeconfig file (empty = in-cluster)") + rootCmd.PersistentFlags().StringVar(&opts.LogLevel, "log-level", "info", "Log level: debug, info, warning, error") + rootCmd.PersistentFlags().StringVar(&opts.ProcessorSelector, "processor-selector", "app=flowlogs-pipeline", "Label selector for FLP processor pods") + rootCmd.PersistentFlags().IntVar(&opts.ProcessorPort, "processor-port", 9090, "Port where FLP processors listen for gRPC") + 
rootCmd.PersistentFlags().StringVar(&opts.ProcessorServiceName, "processor-service-name", "", "Headless service name for DNS-based discovery (required for TLS)") + rootCmd.PersistentFlags().IntVar(&opts.ResyncInterval, "resync-interval", 60, "Interval in seconds to rediscover processors") + // TLS configuration + rootCmd.PersistentFlags().BoolVar(&opts.TLSEnabled, "tls-enabled", false, "Enable TLS for gRPC connections to processors") + rootCmd.PersistentFlags().StringVar(&opts.TLSCertPath, "tls-cert-path", "", "Path to TLS client certificate") + rootCmd.PersistentFlags().StringVar(&opts.TLSKeyPath, "tls-key-path", "", "Path to TLS client private key") + rootCmd.PersistentFlags().StringVar(&opts.TLSCAPath, "tls-ca-path", "", "Path to TLS CA certificate for server verification") + rootCmd.PersistentFlags().StringVar(&opts.TLSServerName, "tls-server-name", "", "Expected server name for TLS verification (e.g., flowlogs-pipeline.namespace.svc)") + rootCmd.PersistentFlags().BoolVar(&opts.InsecureSkipVerify, "tls-insecure-skip-verify", false, "Skip TLS certificate verification (not recommended for production)") + // Cache configuration + rootCmd.PersistentFlags().IntVar(&opts.UpdateBufferSize, "update-buffer-size", 100, "Size of the update channel buffer") + rootCmd.PersistentFlags().IntVar(&opts.SendTimeoutSec, "send-timeout", 10, "Timeout in seconds for sending updates to processors") + rootCmd.PersistentFlags().IntVar(&opts.BatchSize, "batch-size", 100, "Maximum number of entries to send in a single update") + // High availability configuration + rootCmd.PersistentFlags().BoolVar(&opts.EnableLeaderElection, "enable-leader-election", true, "Enable leader election for high availability") + rootCmd.PersistentFlags().IntVar(&opts.HealthPort, "health-port", 8080, "Port for health check HTTP server") + rootCmd.PersistentFlags().IntVar(&opts.MetricsPort, "metrics-port", 9091, "Port for Prometheus metrics HTTP server") +} + +func main() { + initFlags() + if err := rootCmd.Execute(); err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(1) + } +} + +func run(_ *cobra.Command, _ []string) { + log.Infof("Starting flp-informers version=%s commit=%s", version, commit) + + // Initialize Prometheus metrics + metrics.InitMetrics() + + // Start health server + healthServer := informers.NewHealthServer(opts.HealthPort) + if err := healthServer.Start(); err != nil { + log.WithError(err).Fatal("failed to start health server") + } + defer func() { + if err := healthServer.Stop(); err != nil { + log.WithError(err).Error("failed to stop health server") + } + }() + + // Start metrics server + metricsServer := metrics.NewServer(opts.MetricsPort) + if err := metricsServer.Start(); err != nil { + log.WithError(err).Fatal("failed to start metrics server") + } + defer func() { + if err := metricsServer.Stop(); err != nil { + log.WithError(err).Error("failed to stop metrics server") + } + }() + + // Wait for shutdown signal + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sigChan := make(chan os.Signal, 1) + signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM) + + // Run with leader election (or as single instance if disabled) + leConfig := informers.LeaderElectionConfig{ + Enabled: opts.EnableLeaderElection, + Namespace: informers.GetNamespace(), + Identity: informers.GetPodName(), + } + + go func() { + if err := informers.RunWithLeaderElection(ctx, leConfig, healthServer, func(ctx context.Context) { + runInformers(ctx, healthServer) + }); err != nil { + log.WithError(err).Fatal("leader 
election failed") + } + }() + + <-sigChan + log.Info("Shutdown signal received, stopping...") + cancel() +} + +func runInformers(ctx context.Context, healthServer *informers.HealthServer) { + log.Info("Starting informers and gRPC client") + + // Create gRPC client + processorID := fmt.Sprintf("flp-informers-%d", time.Now().Unix()) + clientConfig := k8scache.ClientConfig{ + ProcessorID: processorID, + TLSEnabled: opts.TLSEnabled, + TLSCertPath: opts.TLSCertPath, + TLSKeyPath: opts.TLSKeyPath, + TLSCAPath: opts.TLSCAPath, + TLSServerName: opts.TLSServerName, + InsecureSkipVerify: opts.InsecureSkipVerify, + UpdateBufferSize: opts.UpdateBufferSize, + SendTimeout: time.Duration(opts.SendTimeoutSec) * time.Second, + BatchSize: opts.BatchSize, + } + grpcClient := k8scache.NewClient(&clientConfig) + + if opts.TLSEnabled { + log.Info("TLS enabled for gRPC connections to processors") + // Warn if neither TLSServerName nor ProcessorServiceName is set (may cause TLS verification issues) + if opts.TLSServerName == "" && opts.ProcessorServiceName == "" { + log.Warn("TLS enabled but neither --tls-server-name nor --processor-service-name is set. " + + "TLS verification may fail when connecting by IP. Consider setting one of these options.") + } + } else { + log.Warn("TLS disabled - connections to processors are insecure (not recommended for production)") + } + grpcClient.Start() + defer grpcClient.Stop() + + // Initialize Kubernetes informers + apiConfig := &api.NetworkTransformKubeConfig{} // Empty config - will use defaults + infConfig := k8sinformers.NewConfig(apiConfig) + inf := &k8sinformers.Informers{} + opMetrics := operational.NewMetrics(&config.MetricsSettings{}) + + if err := inf.InitFromConfig(opts.Kubeconfig, &infConfig, opMetrics); err != nil { + log.WithError(err).Fatal("failed to initialize informers") + } + + log.Info("Kubernetes informers initialized and synced") + + // Set informer data source in gRPC client for snapshot generation + grpcClient.SetInformer(inf) + + // Setup informer event handlers to push updates via gRPC + handler := k8scache.NewEventHandler(grpcClient) + if err := inf.AddEventHandler(handler); err != nil { + log.WithError(err).Fatal("failed to add event handlers") + } + log.Info("Informer event handlers registered for cache sync") + + // Mark as ready + healthServer.SetReady(true) + + // Start processor discovery in background + discoveryConfig := k8scache.DiscoveryConfig{ + Kubeconfig: opts.Kubeconfig, + ProcessorSelector: opts.ProcessorSelector, + ProcessorPort: opts.ProcessorPort, + ProcessorServiceName: opts.ProcessorServiceName, + ResyncInterval: opts.ResyncInterval, + } + + go func() { + if err := k8scache.StartProcessorDiscovery(ctx, grpcClient, discoveryConfig); err != nil { + log.WithError(err).Error("processor discovery stopped") + } + }() + + log.Info("flp-informers started - sending snapshots to new processors (lastVersion=0) and incremental updates to all") + + // Wait for context cancellation + <-ctx.Done() + log.Info("Context cancelled, stopping informers...") +} diff --git a/contrib/docker/Dockerfile b/contrib/docker/Dockerfile index 7aa6f9a5b..07be12871 100644 --- a/contrib/docker/Dockerfile +++ b/contrib/docker/Dockerfile @@ -11,12 +11,15 @@ COPY go.sum . 
 COPY vendor/ vendor/
 COPY cmd/ cmd/
 COPY pkg/ pkg/
+COPY internal/ internal/
 
-RUN CGO_ENABLED=0 GOARCH=$TARGETARCH go build -ldflags "$LDFLAGS" -mod vendor -o flowlogs-pipeline cmd/flowlogs-pipeline/main.go
+RUN CGO_ENABLED=0 GOARCH=$TARGETARCH go build -ldflags "$LDFLAGS" -mod vendor -o flowlogs-pipeline ./cmd/flowlogs-pipeline
+RUN CGO_ENABLED=0 GOARCH=$TARGETARCH go build -ldflags "$LDFLAGS" -mod vendor -o flp-informers ./cmd/flp-informers
 
 # final stage
 FROM --platform=linux/$TARGETARCH registry.access.redhat.com/ubi9/ubi-minimal:9.7-1773939694
 
 COPY --from=builder /app/flowlogs-pipeline /app/
+COPY --from=builder /app/flp-informers /app/
 ENTRYPOINT ["/app/flowlogs-pipeline"]
diff --git a/contrib/kubernetes/deployment-flp-informers.yaml b/contrib/kubernetes/deployment-flp-informers.yaml
new file mode 100644
index 000000000..f9d2f15b2
--- /dev/null
+++ b/contrib/kubernetes/deployment-flp-informers.yaml
@@ -0,0 +1,70 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: flp-informers
+  labels:
+    app: flp-informers
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: flp-informers
+  template:
+    metadata:
+      labels:
+        app: flp-informers
+    spec:
+      serviceAccountName: flp-informers
+      containers:
+        - name: flp-informers
+          image: %IMAGE_TAG_BASE%:%VERSION%
+          command: ["/app/flp-informers"]
+          args:
+            - --processor-selector=app=flowlogs-pipeline
+            - --processor-port=9090
+            - --resync-interval=60
+            - --log-level=info
+          env:
+            - name: POD_NAMESPACE
+              valueFrom:
+                fieldRef:
+                  fieldPath: metadata.namespace
+          imagePullPolicy: IfNotPresent
+          resources:
+            requests:
+              memory: "128Mi"
+              cpu: "50m"
+            limits:
+              memory: "256Mi"
+              cpu: "200m"
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: flp-informers
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: flp-informers
+rules:
+- apiGroups: [""]
+  resources: ["pods", "nodes", "services"]
+  verbs: ["list", "watch", "get"]
+- apiGroups: ["apps"]
+  resources: ["replicasets", "deployments"]
+  verbs: ["list", "watch", "get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: flp-informers
+subjects:
+- kind: ServiceAccount
+  name: flp-informers
+  namespace: %NAMESPACE%
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: flp-informers
+---
diff --git a/contrib/kubernetes/deployment-k8scache.yaml b/contrib/kubernetes/deployment-k8scache.yaml
new file mode 100644
index 000000000..ce94ea272
--- /dev/null
+++ b/contrib/kubernetes/deployment-k8scache.yaml
@@ -0,0 +1,113 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: flowlogs-pipeline
+  labels:
+    app: flowlogs-pipeline
+spec:
+  replicas: 1
+  selector:
+    matchLabels:
+      app: flowlogs-pipeline
+  template:
+    metadata:
+      labels:
+        app: flowlogs-pipeline
+    spec:
+      containers:
+        - name: flowlogs-pipeline
+          image: %IMAGE_TAG_BASE%:%VERSION%
+          args:
+            - "--config=/etc/flowlogs-pipeline/flowlogs-pipeline.conf.yaml"
+            - "--k8scache.port=9090"
+            - "--k8scache.address=0.0.0.0"
+          ports:
+            - containerPort: 6343
+            - containerPort: 2055
+            - containerPort: 2056
+            - containerPort: 9090
+              name: k8scache
+              protocol: TCP
+          # When deployed on KinD, the image is pre-pushed to KinD from the local registry.
+          # When deployed on OCP, OCP will pull the image from quay.io.
+          imagePullPolicy: IfNotPresent
+          volumeMounts:
+            - name: configuration
+              mountPath: "/etc/flowlogs-pipeline/"
+      volumes:
+        - name: configuration
+          configMap:
+            name: flowlogs-pipeline-configuration
+      serviceAccountName: flowlogs-pipeline
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: flowlogs-pipeline
+  labels:
+    app: flowlogs-pipeline
+spec:
+  ports:
+    - port: 6343
+      targetPort: 6343
+      protocol: UDP
+      name: sflow
+    - port: 2055
+      targetPort: 2055
+      protocol: UDP
+      name: netflow
+    - port: 2056
+      targetPort: 2056
+      protocol: UDP
+      name: netflow-legacy
+    - port: 9090
+      targetPort: 9090
+      protocol: TCP
+      name: k8scache
+  selector:
+    app: flowlogs-pipeline
+---
+apiVersion: v1
+kind: Service
+metadata:
+  name: flowlogs-pipeline-metrics
+  labels:
+    app: flowlogs-pipeline
+spec:
+  ports:
+    - port: 9102
+      targetPort: 9102
+      name: prometheus
+  selector:
+    app: flowlogs-pipeline
+---
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: flowlogs-pipeline
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRole
+metadata:
+  name: flowlogs-pipeline
+rules:
+  - apiGroups: [""]
+    resources: ["pods", "nodes", "services"]
+    verbs: ["list", "watch", "get"]
+  - apiGroups: ["apps"]
+    resources: ["replicasets", "deployments"]
+    verbs: ["list", "watch", "get"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: ClusterRoleBinding
+metadata:
+  name: flowlogs-pipeline
+subjects:
+  - kind: ServiceAccount
+    name: flowlogs-pipeline
+    namespace: %NAMESPACE%
+roleRef:
+  apiGroup: rbac.authorization.k8s.io
+  kind: ClusterRole
+  name: flowlogs-pipeline
+---
diff --git a/internal/informers/health.go b/internal/informers/health.go
new file mode 100644
index 000000000..df1a08cb3
--- /dev/null
+++ b/internal/informers/health.go
@@ -0,0 +1,104 @@
+package informers
+
+import (
+	"encoding/json"
+	"net/http"
+	"sync/atomic"
+
+	log "github.com/sirupsen/logrus"
+)
+
+// HealthServer provides HTTP health and readiness endpoints
+type HealthServer struct {
+	server   *http.Server
+	ready    atomic.Bool
+	isLeader atomic.Bool
+}
+
+// HealthStatus represents the health check response
+type HealthStatus struct {
+	Status   string `json:"status"`
+	IsLeader bool   `json:"isLeader"`
+	Ready    bool   `json:"ready"`
+}
+
+// NewHealthServer creates a new health server listening on the specified port
+func NewHealthServer(port int) *HealthServer {
+	hs := &HealthServer{}
+
+	mux := http.NewServeMux()
+	mux.HandleFunc("/healthz", hs.healthHandler)
+	mux.HandleFunc("/ready", hs.readyHandler)
+	mux.HandleFunc("/status", hs.statusHandler)
+
+	hs.server = &http.Server{
+		Addr:    formatAddress(port),
+		Handler: mux,
+	}
+
+	return hs
+}
+
+// Start starts the health server in a goroutine
+func (hs *HealthServer) Start() error {
+	log.WithField("address", hs.server.Addr).Info("Starting health server")
+
+	go func() {
+		if err := hs.server.ListenAndServe(); err != nil && err != http.ErrServerClosed {
+			log.WithError(err).Error("Health server error")
+		}
+	}()
+
+	return nil
+}
+
+// Stop stops the health server gracefully
+func (hs *HealthServer) Stop() error {
+	log.Info("Stopping health server")
+	return hs.server.Close()
+}
+
+// SetReady marks the server as ready
+func (hs *HealthServer) SetReady(ready bool) {
+	hs.ready.Store(ready)
+}
+
+// SetLeader marks this instance as leader or follower
+func (hs *HealthServer) SetLeader(isLeader bool) {
+	hs.isLeader.Store(isLeader)
+	if isLeader {
+		log.Info("Became leader")
+	} else {
+		log.Info("Lost leadership")
+	}
+}
+
+// healthHandler always returns 200 OK if the process is
running +func (hs *HealthServer) healthHandler(w http.ResponseWriter, _ *http.Request) { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("OK")) +} + +// readyHandler returns 200 OK only if the service is ready +func (hs *HealthServer) readyHandler(w http.ResponseWriter, _ *http.Request) { + if hs.ready.Load() { + w.WriteHeader(http.StatusOK) + _, _ = w.Write([]byte("Ready")) + } else { + w.WriteHeader(http.StatusServiceUnavailable) + _, _ = w.Write([]byte("Not Ready")) + } +} + +// statusHandler returns detailed status information as JSON +func (hs *HealthServer) statusHandler(w http.ResponseWriter, _ *http.Request) { + status := HealthStatus{ + Status: "OK", + IsLeader: hs.isLeader.Load(), + Ready: hs.ready.Load(), + } + + w.Header().Set("Content-Type", "application/json") + w.WriteHeader(http.StatusOK) + _ = json.NewEncoder(w).Encode(status) +} diff --git a/internal/informers/leaderelection.go b/internal/informers/leaderelection.go new file mode 100644 index 000000000..07e190660 --- /dev/null +++ b/internal/informers/leaderelection.go @@ -0,0 +1,100 @@ +package informers + +import ( + "context" + "time" + + "github.com/netobserv/flowlogs-pipeline/pkg/metrics" + log "github.com/sirupsen/logrus" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/leaderelection" + "k8s.io/client-go/tools/leaderelection/resourcelock" +) + +const ( + leaseName = "flp-informers-lease" + leaseDuration = 15 * time.Second + renewDeadline = 10 * time.Second + retryPeriod = 2 * time.Second +) + +// LeaderElectionConfig holds configuration for leader election +type LeaderElectionConfig struct { + Enabled bool + Namespace string + Identity string +} + +// RunWithLeaderElection runs the main logic with leader election +// Only the elected leader will execute runFunc, others will standby +func RunWithLeaderElection(ctx context.Context, config LeaderElectionConfig, healthServer *HealthServer, runFunc func(context.Context)) error { + if !config.Enabled { + log.Info("Leader election disabled - running as single instance") + healthServer.SetLeader(true) + healthServer.SetReady(true) + metrics.InformersMetrics.IsLeader.Set(1) + runFunc(ctx) + return nil + } + + // Get in-cluster config + k8sConfig, err := rest.InClusterConfig() + if err != nil { + return err + } + + clientset, err := kubernetes.NewForConfig(k8sConfig) + if err != nil { + return err + } + + // Create resource lock for leader election + // Using Lease as it's the recommended lock type + lock := &resourcelock.LeaseLock{ + LeaseMeta: metav1.ObjectMeta{ + Name: leaseName, + Namespace: config.Namespace, + }, + Client: clientset.CoordinationV1(), + LockConfig: resourcelock.ResourceLockConfig{ + Identity: config.Identity, + }, + } + + // Mark as ready once we can participate in leader election + // Both leader and followers are considered "ready" for K8s purposes + healthServer.SetReady(true) + + // Start leader election + leaderelection.RunOrDie(ctx, leaderelection.LeaderElectionConfig{ + Lock: lock, + ReleaseOnCancel: true, + LeaseDuration: leaseDuration, + RenewDeadline: renewDeadline, + RetryPeriod: retryPeriod, + Callbacks: leaderelection.LeaderCallbacks{ + OnStartedLeading: func(ctx context.Context) { + log.Info("Started leading - running informers") + healthServer.SetLeader(true) + metrics.InformersMetrics.IsLeader.Set(1) + runFunc(ctx) + }, + OnStoppedLeading: func() { + log.Info("Stopped leading") + healthServer.SetLeader(false) + 
metrics.InformersMetrics.IsLeader.Set(0) + }, + OnNewLeader: func(identity string) { + if identity == config.Identity { + log.Info("I am the new leader") + } else { + log.WithField("leader", identity).Info("New leader elected") + } + }, + }, + }) + + return nil +} diff --git a/internal/informers/utils.go b/internal/informers/utils.go new file mode 100644 index 000000000..45585dc9d --- /dev/null +++ b/internal/informers/utils.go @@ -0,0 +1,33 @@ +package informers + +import ( + "fmt" + "os" +) + +// formatAddress formats address for HTTP server +func formatAddress(port int) string { + return fmt.Sprintf("0.0.0.0:%d", port) +} + +// GetPodName returns the pod name from environment variable +func GetPodName() string { + podName := os.Getenv("POD_NAME") + if podName == "" { + hostname, err := os.Hostname() + if err != nil { + return "unknown" + } + return hostname + } + return podName +} + +// GetNamespace returns the namespace from environment variable or default +func GetNamespace() string { + namespace := os.Getenv("POD_NAMESPACE") + if namespace == "" { + return "netobserv" + } + return namespace +} diff --git a/pkg/config/config.go b/pkg/config/config.go index 444f278f5..c3e80693b 100644 --- a/pkg/config/config.go +++ b/pkg/config/config.go @@ -33,6 +33,7 @@ type Options struct { MetricsSettings string Health Health Profile Profile + K8sCacheServer K8sCacheServer } type Root struct { @@ -64,6 +65,16 @@ type Profile struct { Port int } +type K8sCacheServer struct { + Address string + Port int + // TLS configuration + TLSEnabled bool + TLSCertPath string + TLSKeyPath string + TLSCAPath string +} + // MetricsSettings is similar to api.PromEncode, but is global to the application, ie. it also works with operational metrics. // Also, currently FLP doesn't support defining more than one PromEncode stage. If this feature is added later, these global settings // will help configuring common setting for all PromEncode stages - PromEncode settings would then act as overrides. 
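Note: the K8sCacheServer options added above are driven by flags on the flowlogs-pipeline binary and must agree with the client flags defined in cmd/flp-informers/main.go. Below is a minimal sketch of matching container args for a TLS setup; only the flag names come from this change, while the mount path, Secret, namespace and service name are illustrative assumptions.

# flowlogs-pipeline container (gRPC server side) - illustrative args only, certs assumed mounted at /var/run/tls
args:
  - --k8scache.port=9090
  - --k8scache.tls-enabled=true
  - --k8scache.tls-cert-path=/var/run/tls/tls.crt
  - --k8scache.tls-key-path=/var/run/tls/tls.key
  - --k8scache.tls-ca-path=/var/run/tls/ca.crt  # providing a CA switches the server to mutual TLS (client certs required)
# flp-informers container (gRPC client side) - illustrative args only
args:
  - --tls-enabled=true
  - --tls-cert-path=/var/run/tls/tls.crt
  - --tls-key-path=/var/run/tls/tls.key
  - --tls-ca-path=/var/run/tls/ca.crt
  - --tls-server-name=flowlogs-pipeline.netobserv.svc  # must match a SAN in the server certificate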
diff --git a/pkg/metrics/informers.go b/pkg/metrics/informers.go new file mode 100644 index 000000000..64ca23b4f --- /dev/null +++ b/pkg/metrics/informers.go @@ -0,0 +1,161 @@ +package metrics + +import ( + "fmt" + "net/http" + + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/promhttp" + log "github.com/sirupsen/logrus" +) + +var ( + // InformersMetrics holds all Prometheus metrics for flp-informers + InformersMetrics *Metrics +) + +// Metrics holds all Prometheus metrics +type Metrics struct { + IsLeader prometheus.Gauge + ConnectedProcessors prometheus.Gauge + CacheUpdatesTotal *prometheus.CounterVec + CacheSnapshotsSentTotal prometheus.Counter + ErrorsTotal *prometheus.CounterVec + // gRPC communication metrics + GrpcBytesSentTotal prometheus.Counter + GrpcBytesRecvTotal prometheus.Counter + GrpcMessagesSentTotal prometheus.Counter + GrpcMessagesRecvTotal prometheus.Counter + // Processor lifecycle metrics + ProcessorConnectionsTotal *prometheus.CounterVec + ProcessorLifetimeDuration prometheus.Histogram + // UDN disambiguation metrics + UdnDisambiguateTotal prometheus.Counter + UdnDisambiguateDuration prometheus.Histogram +} + +// InitMetrics initializes all Prometheus metrics +func InitMetrics() { + InformersMetrics = &Metrics{ + IsLeader: prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "flp_informers_is_leader", + Help: "1 if this instance is the current leader, 0 otherwise", + }), + ConnectedProcessors: prometheus.NewGauge(prometheus.GaugeOpts{ + Name: "flp_informers_connected_processors", + Help: "Number of FLP processors currently connected", + }), + CacheUpdatesTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "flp_informers_cache_updates_total", + Help: "Total number of cache updates sent to processors", + }, + []string{"operation"}, // ADD, UPDATE, DELETE, SNAPSHOT + ), + CacheSnapshotsSentTotal: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "flp_informers_snapshots_sent_total", + Help: "Total number of full snapshots sent to processors", + }), + ErrorsTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "flp_informers_errors_total", + Help: "Total number of errors by type", + }, + []string{"error_type"}, // discovery, udn_disambiguation + ), + // gRPC communication metrics + GrpcBytesSentTotal: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "flp_informers_grpc_bytes_sent_total", + Help: "Total number of bytes sent via gRPC to processors", + }), + GrpcBytesRecvTotal: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "flp_informers_grpc_bytes_received_total", + Help: "Total number of bytes received via gRPC from processors", + }), + GrpcMessagesSentTotal: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "flp_informers_grpc_messages_sent_total", + Help: "Total number of gRPC messages sent to processors", + }), + GrpcMessagesRecvTotal: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "flp_informers_grpc_messages_received_total", + Help: "Total number of gRPC messages received from processors", + }), + // Processor lifecycle metrics + ProcessorConnectionsTotal: prometheus.NewCounterVec( + prometheus.CounterOpts{ + Name: "flp_informers_processor_connections_total", + Help: "Total number of processor connection events", + }, + []string{"event"}, // connected, disconnected, reconnected + ), + ProcessorLifetimeDuration: prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "flp_informers_processor_lifetime_duration_seconds", + Help: "Duration of processor connections in 
seconds", + Buckets: []float64{1, 10, 30, 60, 300, 600, 1800, 3600, 7200}, // 1s to 2h + }), + // UDN disambiguation metrics + UdnDisambiguateTotal: prometheus.NewCounter(prometheus.CounterOpts{ + Name: "flp_informers_udn_disambiguate_total", + Help: "Total number of UDN disambiguation attempts", + }), + UdnDisambiguateDuration: prometheus.NewHistogram(prometheus.HistogramOpts{ + Name: "flp_informers_udn_disambiguate_duration_seconds", + Help: "Duration of UDN disambiguation operations in seconds", + Buckets: prometheus.DefBuckets, // 0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10 + }), + } + + // Register all metrics + prometheus.MustRegister( + InformersMetrics.IsLeader, + InformersMetrics.ConnectedProcessors, + InformersMetrics.CacheUpdatesTotal, + InformersMetrics.CacheSnapshotsSentTotal, + InformersMetrics.ErrorsTotal, + InformersMetrics.GrpcBytesSentTotal, + InformersMetrics.GrpcBytesRecvTotal, + InformersMetrics.GrpcMessagesSentTotal, + InformersMetrics.GrpcMessagesRecvTotal, + InformersMetrics.ProcessorConnectionsTotal, + InformersMetrics.ProcessorLifetimeDuration, + InformersMetrics.UdnDisambiguateTotal, + InformersMetrics.UdnDisambiguateDuration, + ) +} + +// Server provides HTTP endpoint for Prometheus metrics +type Server struct { + server *http.Server +} + +// NewServer creates a new metrics server listening on the specified port +func NewServer(port int) *Server { + mux := http.NewServeMux() + mux.Handle("/metrics", promhttp.Handler()) + + return &Server{ + server: &http.Server{ + Addr: fmt.Sprintf(":%d", port), + Handler: mux, + }, + } +} + +// Start starts the metrics server in a goroutine +func (ms *Server) Start() error { + log.WithField("address", ms.server.Addr).Info("Starting metrics server") + + go func() { + if err := ms.server.ListenAndServe(); err != nil && err != http.ErrServerClosed { + log.WithError(err).Error("Metrics server error") + } + }() + + return nil +} + +// Stop stops the metrics server gracefully +func (ms *Server) Stop() error { + log.Info("Stopping metrics server") + return ms.server.Close() +} diff --git a/pkg/pipeline/transform/kubernetes/cni/udn.go b/pkg/pipeline/transform/kubernetes/cni/udn.go index a03396b2a..bf06fe965 100644 --- a/pkg/pipeline/transform/kubernetes/cni/udn.go +++ b/pkg/pipeline/transform/kubernetes/cni/udn.go @@ -5,8 +5,10 @@ import ( "encoding/json" "fmt" "strings" + "time" "github.com/netobserv/flowlogs-pipeline/pkg/api" + "github.com/netobserv/flowlogs-pipeline/pkg/metrics" "github.com/netobserv/flowlogs-pipeline/pkg/config" log "github.com/sirupsen/logrus" v1 "k8s.io/api/core/v1" @@ -99,6 +101,17 @@ func (m *UDNHandler) GetPodUniqueKeys(ctx context.Context, dynClient *dynamic.Dy } func disambiguateClusterUDN(ctx context.Context, dynClient *dynamic.DynamicClient, name string) string { + // Update metrics + if metrics.InformersMetrics != nil { + metrics.InformersMetrics.UdnDisambiguateTotal.Inc() + } + start := time.Now() + defer func() { + if metrics.InformersMetrics != nil { + metrics.InformersMetrics.UdnDisambiguateDuration.Observe(time.Since(start).Seconds()) + } + }() + // "name" can look like this: "my-namespace/my-udn"; namespace included even for Cluster UDN parts := strings.SplitN(name, "/", 2) if len(parts) < 2 { @@ -121,6 +134,9 @@ func disambiguateClusterUDN(ctx context.Context, dynClient *dynamic.DynamicClien return name } else if !errors.IsNotFound(err) { log.Errorf("could not fetch UDN %s: %v", name, err) + if metrics.InformersMetrics != nil { + 
metrics.InformersMetrics.ErrorsTotal.WithLabelValues("udn_disambiguation").Inc() + } } // Does it exist as a cluster-udn? _, err = dynClient. @@ -135,6 +151,9 @@ func disambiguateClusterUDN(ctx context.Context, dynClient *dynamic.DynamicClien return udnName } else if !errors.IsNotFound(err) { log.Errorf("could not fetch CUDN %s: %v", udnName, err) + if metrics.InformersMetrics != nil { + metrics.InformersMetrics.ErrorsTotal.WithLabelValues("udn_disambiguation").Inc() + } } return name } diff --git a/pkg/pipeline/transform/kubernetes/datasource/datasource.go b/pkg/pipeline/transform/kubernetes/datasource/datasource.go index 7268bbecb..db50b150a 100644 --- a/pkg/pipeline/transform/kubernetes/datasource/datasource.go +++ b/pkg/pipeline/transform/kubernetes/datasource/datasource.go @@ -7,7 +7,19 @@ import ( ) type Datasource struct { + // Informers provides local Kubernetes informers (may be nil when k8scache is enabled). Informers informers.Interface + // kubernetesStore, when set, is used for IndexLookup and GetNodeByName instead of Informers. + // It is populated by the k8s cache sync gRPC server when receiving updates from flp-informers. + // When k8scache is enabled, Informers is nil and only kubernetesStore is used. + // This is set once during initialization before any concurrent access begins. + kubernetesStore *KubernetesStore +} + +// SetKubernetesStore sets the Kubernetes store (used when k8s cache server is enabled). +// This must be called during initialization, before the pipeline starts processing flows. +func (d *Datasource) SetKubernetesStore(store *KubernetesStore) { + d.kubernetesStore = store } func NewInformerDatasource(kubeconfig string, infConfig *informers.Config, opMetrics *operational.Metrics) (*Datasource, error) { @@ -18,10 +30,49 @@ func NewInformerDatasource(kubeconfig string, infConfig *informers.Config, opMet return &Datasource{Informers: inf}, nil } +// NewDatasourceK8sCache creates a datasource for k8scache mode without local informers. +// In this mode, the KubernetesStore will be set later by the k8scache gRPC server, +// and all lookups will use the centralized cache (Informers is nil to save resources). +func NewDatasourceK8sCache() *Datasource { + return &Datasource{ + Informers: nil, // No local informers when using k8scache + } +} + func (d *Datasource) IndexLookup(potentialKeys []string, ip string) *model.ResourceMetaData { - return d.Informers.IndexLookup(potentialKeys, ip) + if d.kubernetesStore != nil { + return d.kubernetesStore.IndexLookup(potentialKeys, ip) + } + // Fallback to local informers if available (nil when k8scache is enabled) + if d.Informers != nil { + return d.Informers.IndexLookup(potentialKeys, ip) + } + return nil } func (d *Datasource) GetNodeByName(name string) (*model.ResourceMetaData, error) { - return d.Informers.GetNodeByName(name) + if d.kubernetesStore != nil { + return d.kubernetesStore.GetNodeByName(name) + } + // Fallback to local informers if available (nil when k8scache is enabled) + if d.Informers != nil { + return d.Informers.GetNodeByName(name) + } + return nil, nil +} + +// ApplyCacheAddOrUpdate adds or updates the given entries in the Kubernetes store. +// This method is thread-safe via the store's internal mutex. +func (d *Datasource) ApplyCacheAddOrUpdate(entries []*model.ResourceMetaData) { + if d.kubernetesStore != nil { + d.kubernetesStore.AddOrUpdate(entries) + } +} + +// ApplyCacheDelete removes the given entries from the Kubernetes store. +// This method is thread-safe via the store's internal mutex. 
+func (d *Datasource) ApplyCacheDelete(entries []*model.ResourceMetaData) { + if d.kubernetesStore != nil { + d.kubernetesStore.Delete(entries) + } } diff --git a/pkg/pipeline/transform/kubernetes/datasource/kubernetesstore.go b/pkg/pipeline/transform/kubernetes/datasource/kubernetesstore.go new file mode 100644 index 000000000..bf1918ad7 --- /dev/null +++ b/pkg/pipeline/transform/kubernetes/datasource/kubernetesstore.go @@ -0,0 +1,151 @@ +package datasource + +import ( + "errors" + "fmt" + "sync" + + "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/model" +) + +func storeKey(meta *model.ResourceMetaData) string { + return fmt.Sprintf("%s/%s/%s", meta.Kind, meta.Namespace, meta.Name) +} + +// KubernetesStore holds Kubernetes resource metadata (pods, nodes, services) used for enrichment. +// It can be populated via the k8s cache sync gRPC stream or left empty when using informers. +// It supports snapshot replace and incremental add/update/delete, and provides the same +// lookup interface (IndexLookup, GetNodeByName) used by enrichment. +type KubernetesStore struct { + mu sync.RWMutex + // primary key -> meta (kind/namespace/name) + byKey map[string]*model.ResourceMetaData + // index: IP -> meta (first match wins; pods may have multiple IPs) + byIP map[string]*model.ResourceMetaData + // index: node name -> meta (for Node kind) + byNodeName map[string]*model.ResourceMetaData + // index: secondary network key -> meta (for custom key lookup) + bySecondaryKey map[string]*model.ResourceMetaData +} + +// NewKubernetesStore creates an empty KubernetesStore. +func NewKubernetesStore() *KubernetesStore { + return &KubernetesStore{ + byKey: make(map[string]*model.ResourceMetaData), + byIP: make(map[string]*model.ResourceMetaData), + byNodeName: make(map[string]*model.ResourceMetaData), + bySecondaryKey: make(map[string]*model.ResourceMetaData), + } +} + +// removeFromIndexes removes all index entries for the given meta (by key). +// Caller must hold mu (write). +func (s *KubernetesStore) removeFromIndexes(meta *model.ResourceMetaData) { + if meta == nil { + return + } + for _, ip := range meta.IPs { + delete(s.byIP, ip) + } + if meta.Kind == model.KindNode && meta.Name != "" { + delete(s.byNodeName, meta.Name) + } + for _, k := range meta.SecondaryNetKeys { + delete(s.bySecondaryKey, k) + } +} + +// addToIndexes adds the meta to all index maps. +// Caller must hold mu (write). +func (s *KubernetesStore) addToIndexes(meta *model.ResourceMetaData) { + if meta == nil { + return + } + for _, ip := range meta.IPs { + s.byIP[ip] = meta + } + if meta.Kind == model.KindNode && meta.Name != "" { + s.byNodeName[meta.Name] = meta + } + for _, k := range meta.SecondaryNetKeys { + s.bySecondaryKey[k] = meta + } +} + +// Replace replaces the entire store with the given entries (full snapshot). +// Note: Currently not used. We only support incremental updates (AddOrUpdate/Delete). +func (s *KubernetesStore) Replace(entries []*model.ResourceMetaData) { + s.mu.Lock() + defer s.mu.Unlock() + + s.byKey = make(map[string]*model.ResourceMetaData) + s.byIP = make(map[string]*model.ResourceMetaData) + s.byNodeName = make(map[string]*model.ResourceMetaData) + s.bySecondaryKey = make(map[string]*model.ResourceMetaData) + + for _, meta := range entries { + key := storeKey(meta) + s.byKey[key] = meta + s.addToIndexes(meta) + } +} + +// AddOrUpdate adds or updates the given entries in the store. 
+func (s *KubernetesStore) AddOrUpdate(entries []*model.ResourceMetaData) { + s.mu.Lock() + defer s.mu.Unlock() + + for _, meta := range entries { + key := storeKey(meta) + if existing, ok := s.byKey[key]; ok { + s.removeFromIndexes(existing) + } + s.byKey[key] = meta + s.addToIndexes(meta) + } +} + +// Delete removes the given entries from the store. +// Entries must have at least Kind, Namespace, and Name set for identification. +func (s *KubernetesStore) Delete(entries []*model.ResourceMetaData) { + s.mu.Lock() + defer s.mu.Unlock() + + for _, meta := range entries { + key := storeKey(meta) + if existing, ok := s.byKey[key]; ok { + s.removeFromIndexes(existing) + delete(s.byKey, key) + } + } +} + +// IndexLookup finds metadata by secondary network keys first, then by IP. +// Implements the same semantics as informers.Interface for use when KubernetesStore is the source. +func (s *KubernetesStore) IndexLookup(potentialKeys []string, ip string) *model.ResourceMetaData { + s.mu.RLock() + defer s.mu.RUnlock() + + for _, key := range potentialKeys { + if meta, ok := s.bySecondaryKey[key]; ok { + return meta + } + } + if ip != "" { + if meta, ok := s.byIP[ip]; ok { + return meta + } + } + return nil +} + +// GetNodeByName returns node metadata by name. +func (s *KubernetesStore) GetNodeByName(name string) (*model.ResourceMetaData, error) { + s.mu.RLock() + defer s.mu.RUnlock() + + if meta, ok := s.byNodeName[name]; ok { + return meta, nil + } + return nil, errors.New("notFound") +} diff --git a/pkg/pipeline/transform/kubernetes/enrich.go b/pkg/pipeline/transform/kubernetes/enrich.go index 45855ef2c..d205be10d 100644 --- a/pkg/pipeline/transform/kubernetes/enrich.go +++ b/pkg/pipeline/transform/kubernetes/enrich.go @@ -14,6 +14,7 @@ import ( var ds *datasource.Datasource var infConfig informers.Config +var k8scacheEnabled bool const ( truncateSuffix = "..." @@ -25,15 +26,36 @@ func MockInformers() { ds = &datasource.Datasource{Informers: informers.NewInformersMock()} } +// SetK8sCacheEnabled sets whether k8scache mode is enabled. +// When enabled, local informers are disabled to save resources. +// This must be called before InitInformerDatasource. 
+func SetK8sCacheEnabled(enabled bool) { + k8scacheEnabled = enabled +} + func InitInformerDatasource(config *api.NetworkTransformKubeConfig, opMetrics *operational.Metrics) error { var err error infConfig = informers.NewConfig(config) if ds == nil { - ds, err = datasource.NewInformerDatasource(config.ConfigPath, &infConfig, opMetrics) + if k8scacheEnabled { + // K8scache mode: create datasource without local informers to save resources + // The KubernetesStore will be set later by the k8scache server + logrus.Info("k8scache mode enabled: local informers disabled, using centralized cache") + ds = datasource.NewDatasourceK8sCache() + } else { + // Standard mode: create datasource with local informers + ds, err = datasource.NewInformerDatasource(config.ConfigPath, &infConfig, opMetrics) + } } return err } +// GetDatasource returns the initialized datasource +// Returns nil if datasource has not been initialized via InitInformerDatasource +func GetDatasource() *datasource.Datasource { + return ds +} + func Enrich(outputEntry config.GenericMap, rule *api.K8sRule) { ip, ok := outputEntry.LookupString(rule.IPField) if !ok { diff --git a/pkg/pipeline/transform/kubernetes/informers/cache_export.go b/pkg/pipeline/transform/kubernetes/informers/cache_export.go new file mode 100644 index 000000000..02a792230 --- /dev/null +++ b/pkg/pipeline/transform/kubernetes/informers/cache_export.go @@ -0,0 +1,35 @@ +package informers + +// AddEventHandler adds event handlers to informers for pushing incremental updates. +// Only Pods, Nodes, and Services receive handlers because they contain the full resource +// metadata (IPs, labels, etc.) needed by FLP processors. +// +// ReplicaSets and Deployments are intentionally excluded - they are metadata-only informers +// used solely for ownership resolution (checkParent) via passive lookups (GetByKey). +// They don't need event handlers since we never push their updates to processors. 
+func (k *Informers) AddEventHandler(handler EventHandler) error { + if k.pods != nil { + if _, err := k.pods.AddEventHandler(handler); err != nil { + return err + } + } + if k.nodes != nil { + if _, err := k.nodes.AddEventHandler(handler); err != nil { + return err + } + } + if k.services != nil { + if _, err := k.services.AddEventHandler(handler); err != nil { + return err + } + } + return nil +} + +// EventHandler defines callbacks for resource changes +// Compatible with cache.ResourceEventHandler interface +type EventHandler interface { + OnAdd(obj interface{}, isInInitialList bool) + OnUpdate(oldObj, newObj interface{}) + OnDelete(obj interface{}) +} diff --git a/pkg/pipeline/transform/kubernetes/informers/informers-mock.go b/pkg/pipeline/transform/kubernetes/informers/informers-mock.go index 15e315968..04b42ac98 100644 --- a/pkg/pipeline/transform/kubernetes/informers/informers-mock.go +++ b/pkg/pipeline/transform/kubernetes/informers/informers-mock.go @@ -27,6 +27,7 @@ type Mock struct { func NewInformersMock() *Mock { inf := new(Mock) inf.On("InitFromConfig", mock.Anything, mock.Anything, mock.Anything).Return(nil) + inf.On("GetAllResources").Return([]*model.ResourceMetaData{}) return inf } @@ -35,6 +36,11 @@ func (o *Mock) InitFromConfig(kubeconfig string, infConfig *Config, opMetrics *o return args.Error(0) } +func (o *Mock) GetAllResources() []*model.ResourceMetaData { + args := o.Called() + return args.Get(0).([]*model.ResourceMetaData) +} + type IndexerMock struct { mock.Mock cache.Indexer @@ -237,3 +243,17 @@ func (f *FakeInformers) GetNodeByName(n string) (*model.ResourceMetaData, error) } return nil, errors.New("notFound") } + +func (f *FakeInformers) GetAllResources() []*model.ResourceMetaData { + var all []*model.ResourceMetaData + for _, v := range f.ipInfo { + all = append(all, v) + } + for _, v := range f.customKeysInfo { + all = append(all, v) + } + for _, v := range f.nodes { + all = append(all, v) + } + return all +} diff --git a/pkg/pipeline/transform/kubernetes/informers/informers.go b/pkg/pipeline/transform/kubernetes/informers/informers.go index 369773e67..977975ad9 100644 --- a/pkg/pipeline/transform/kubernetes/informers/informers.go +++ b/pkg/pipeline/transform/kubernetes/informers/informers.go @@ -1,20 +1,3 @@ -/* - * Copyright (C) 2021 IBM, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - package informers import ( @@ -54,6 +37,7 @@ type Interface interface { IndexLookup([]string, string) *model.ResourceMetaData GetNodeByName(string) (*model.ResourceMetaData, error) InitFromConfig(string, *Config, *operational.Metrics) error + GetAllResources() []*model.ResourceMetaData } type Informers struct { @@ -608,6 +592,42 @@ func (k *Informers) initInformers(client kubernetes.Interface, metaClient metada return nil } +// GetAllResources returns all cached resources (pods, nodes, services) as a snapshot. +// This is used to send initial snapshots to processors when they connect or restart. 
+func (k *Informers) GetAllResources() []*model.ResourceMetaData { + var allResources []*model.ResourceMetaData + + // Get all pods + if k.pods != nil { + for _, obj := range k.pods.GetStore().List() { + if meta, ok := obj.(*model.ResourceMetaData); ok { + allResources = append(allResources, meta) + } + } + } + + // Get all nodes + if k.nodes != nil { + for _, obj := range k.nodes.GetStore().List() { + if meta, ok := obj.(*model.ResourceMetaData); ok { + allResources = append(allResources, meta) + } + } + } + + // Get all services + if k.services != nil { + for _, obj := range k.services.GetStore().List() { + if meta, ok := obj.(*model.ResourceMetaData); ok { + allResources = append(allResources, meta) + } + } + } + + log.WithField("count", len(allResources)).Debug("Retrieved all resources for snapshot") + return allResources +} + func isServiceIPSet(ip string) bool { return ip != v1.ClusterIPNone && ip != "" } diff --git a/pkg/pipeline/transform/kubernetes/informers/informers_test.go b/pkg/pipeline/transform/kubernetes/informers/informers_test.go index d88933fc7..19f420b81 100644 --- a/pkg/pipeline/transform/kubernetes/informers/informers_test.go +++ b/pkg/pipeline/transform/kubernetes/informers/informers_test.go @@ -1,20 +1,3 @@ -/* - * Copyright (C) 2022 IBM, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - */ - package informers import ( diff --git a/pkg/pipeline/transform/kubernetes/k8scache/client.go b/pkg/pipeline/transform/kubernetes/k8scache/client.go new file mode 100644 index 000000000..900b2ede8 --- /dev/null +++ b/pkg/pipeline/transform/kubernetes/k8scache/client.go @@ -0,0 +1,921 @@ +package k8scache + +import ( + "context" + "crypto/tls" + "crypto/x509" + "errors" + "fmt" + "io" + "os" + "sync" + "sync/atomic" + "time" + + "github.com/netobserv/flowlogs-pipeline/pkg/metrics" + "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/model" + log "github.com/sirupsen/logrus" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials" + "google.golang.org/grpc/credentials/insecure" + "google.golang.org/protobuf/proto" +) + +var clog = log.WithField("component", "k8scache.Client") + +const ( + // Reconnection configuration + maxReconnectAttempts = 10 + initialBackoff = 1 * time.Second + maxBackoff = 60 * time.Second + backoffMultiplier = 2.0 + // Default values for configurable parameters + defaultUpdateBufferSize = 100 + defaultSendTimeout = 10 * time.Second + defaultBatchSize = 100 +) + +// ClientConfig holds configuration for the gRPC client +type ClientConfig struct { + // ProcessorID identifies this informer instance + ProcessorID string + // TLS configuration (optional) + TLSEnabled bool + TLSCertPath string + TLSKeyPath string + TLSCAPath string + // TLSServerName is the expected server name for TLS verification (optional). + // If set, this name will be used to validate the server certificate regardless of the connection address. + // Useful when connecting to pods by IP but validating against DNS names in the certificate. 
+ TLSServerName string + // InsecureSkipVerify skips TLS certificate verification (not recommended for production) + InsecureSkipVerify bool + // Cache configuration (optional, defaults used if 0) + UpdateBufferSize int // Size of the update channel buffer (default: 100) + SendTimeout time.Duration // Timeout for sending updates to processors (default: 10s) + BatchSize int // Maximum number of entries to send in a single update (default: 100) +} + +// InformerDataSource defines the interface for getting all resources from informers. +type InformerDataSource interface { + GetAllResources() []*model.ResourceMetaData +} + +// Client manages gRPC connections to FLP processor servers and pushes cache updates. +type Client struct { + // processorID identifies this informer instance + processorID string + // TLS configuration + tlsConfig ClientConfig + // connections tracks active processor connections + connections map[string]*processorConnection + mu sync.RWMutex + // version tracks the current cache version + version atomic.Int64 + // updateChan receives cache updates from informers + updateChan chan *CacheUpdate + // ctx and cancel for lifecycle management + ctx context.Context + cancel context.CancelFunc + // Configurable parameters + sendTimeout time.Duration + batchSize int + // Informer data source for snapshots + informer InformerDataSource + infMu sync.RWMutex +} + +// processorConnection represents a connection to a single FLP processor +type processorConnection struct { + address string + conn *grpc.ClientConn + stream KubernetesCacheService_StreamUpdatesClient + // Track if connection is healthy + healthy atomic.Bool + // Cancel function for this connection's context + cancel context.CancelFunc + // Reconnect tracking + reconnectAttempts int + // Connection timestamp for lifetime metrics + connectedAt time.Time + mu sync.Mutex // Protects conn, stream, cancel, and reconnectAttempts +} + +// getStream returns a copy of the stream pointer under lock. +// The caller must not hold pc.mu when calling this method. +func (pc *processorConnection) getStream() KubernetesCacheService_StreamUpdatesClient { + pc.mu.Lock() + defer pc.mu.Unlock() + return pc.stream +} + +// cancelStream cancels the stream context, which will cause any blocked Send/Recv to return an error. +// This is used when a timeout occurs to unblock operations and trigger reconnection. +// The caller must not hold pc.mu when calling this method. +func (pc *processorConnection) cancelStream() { + pc.mu.Lock() + defer pc.mu.Unlock() + if pc.cancel != nil { + pc.cancel() + } +} + +// closeConnection closes the connection and cancels the context under lock. +// The caller must not hold pc.mu when calling this method. 
+func (pc *processorConnection) closeConnection() { + pc.mu.Lock() + defer pc.mu.Unlock() + if pc.cancel != nil { + pc.cancel() + } + if pc.conn != nil { + pc.conn.Close() + } +} + +// NewClient creates a new cache client (informer side) +func NewClient(config *ClientConfig) *Client { + ctx, cancel := context.WithCancel(context.Background()) + + // Set defaults for configurable parameters if not provided + bufferSize := config.UpdateBufferSize + if bufferSize == 0 { + bufferSize = defaultUpdateBufferSize + } + + sendTimeout := config.SendTimeout + if sendTimeout == 0 { + sendTimeout = defaultSendTimeout + } + + batchSize := config.BatchSize + if batchSize == 0 { + batchSize = defaultBatchSize + } + + clog.WithFields(log.Fields{ + "update_buffer_size": bufferSize, + "send_timeout": sendTimeout, + "batch_size": batchSize, + }).Info("Cache client configuration") + + return &Client{ + processorID: config.ProcessorID, + tlsConfig: *config, + connections: make(map[string]*processorConnection), + updateChan: make(chan *CacheUpdate, bufferSize), + ctx: ctx, + cancel: cancel, + sendTimeout: sendTimeout, + batchSize: batchSize, + } +} + +// getTransportCredentials creates gRPC transport credentials based on TLS configuration +func (c *Client) getTransportCredentials() (credentials.TransportCredentials, error) { + if !c.tlsConfig.TLSEnabled { + clog.Debug("Using insecure credentials (TLS disabled)") + return insecure.NewCredentials(), nil + } + + tlsConfig := &tls.Config{ + InsecureSkipVerify: c.tlsConfig.InsecureSkipVerify, + } + + // Set ServerName if provided (allows connecting by IP while validating against DNS name in certificate) + if c.tlsConfig.TLSServerName != "" { + tlsConfig.ServerName = c.tlsConfig.TLSServerName + clog.WithField("server_name", c.tlsConfig.TLSServerName).Debug("TLS ServerName override configured") + } + + // Load client cert/key if provided + if c.tlsConfig.TLSCertPath != "" && c.tlsConfig.TLSKeyPath != "" { + cert, err := tls.LoadX509KeyPair(c.tlsConfig.TLSCertPath, c.tlsConfig.TLSKeyPath) + if err != nil { + return nil, fmt.Errorf("failed to load client cert/key: %w", err) + } + tlsConfig.Certificates = []tls.Certificate{cert} + clog.WithFields(log.Fields{ + "cert": c.tlsConfig.TLSCertPath, + "key": c.tlsConfig.TLSKeyPath, + }).Debug("Loaded client certificate") + } + + // Load CA cert if provided + if c.tlsConfig.TLSCAPath != "" { + caCert, err := os.ReadFile(c.tlsConfig.TLSCAPath) + if err != nil { + return nil, fmt.Errorf("failed to read CA cert: %w", err) + } + caCertPool := x509.NewCertPool() + if !caCertPool.AppendCertsFromPEM(caCert) { + return nil, fmt.Errorf("failed to append CA cert") + } + tlsConfig.RootCAs = caCertPool + clog.WithField("ca", c.tlsConfig.TLSCAPath).Debug("Loaded CA certificate") + } + + return credentials.NewTLS(tlsConfig), nil +} + +// AddProcessor connects to a new FLP processor server +func (c *Client) AddProcessor(address string) error { + return c.AddProcessorWithTimeout(address, 30*time.Second) +} + +// AddProcessorWithTimeout connects to a new FLP processor server with a timeout +func (c *Client) AddProcessorWithTimeout(address string, timeout time.Duration) error { + // First check: quick lock to see if already connected + c.mu.Lock() + if _, exists := c.connections[address]; exists { + c.mu.Unlock() + clog.WithField("address", address).Debug("processor already connected") + return nil + } + c.mu.Unlock() + + clog.WithField("address", address).Info("connecting to FLP processor") + + // Create a context with timeout for the connection 
attempt + ctx, cancel := context.WithTimeout(c.ctx, timeout) + defer cancel() + + // Perform slow network operations without holding the lock + // Get transport credentials + creds, err := c.getTransportCredentials() + if err != nil { + return fmt.Errorf("failed to create transport credentials: %w", err) + } + + // Create connection with context + conn, err := grpc.NewClient(address, + grpc.WithTransportCredentials(creds), + grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(50*1024*1024)), // 50MB max message size + ) + if err != nil { + return fmt.Errorf("failed to connect to %s: %w", address, err) + } + + // Create stream with timeout context + client := NewKubernetesCacheServiceClient(conn) + streamCtx, streamCancel := context.WithCancel(c.ctx) + + // Use a channel to implement timeout for stream creation + type streamResult struct { + stream KubernetesCacheService_StreamUpdatesClient + err error + } + streamChan := make(chan streamResult, 1) + + go func() { + stream, err := client.StreamUpdates(streamCtx) + select { + case streamChan <- streamResult{stream: stream, err: err}: + case <-ctx.Done(): + // Timeout occurred during stream creation, attempt cleanup + if stream != nil { + if closeErr := stream.CloseSend(); closeErr != nil { + clog.WithError(closeErr).WithField("address", address). + Debug("failed to close stream during timeout cleanup") + } + } + } + }() + + var stream KubernetesCacheService_StreamUpdatesClient + select { + case result := <-streamChan: + if result.err != nil { + streamCancel() + conn.Close() + return fmt.Errorf("failed to create stream to %s: %w", address, result.err) + } + stream = result.stream + case <-ctx.Done(): + streamCancel() + conn.Close() + return fmt.Errorf("timeout connecting to %s: %w", address, ctx.Err()) + } + + // Second check: re-acquire lock and verify no other goroutine added this connection + c.mu.Lock() + if _, exists := c.connections[address]; exists { + c.mu.Unlock() + // Another goroutine added this connection while we were connecting + // Clean up our newly-created resources + streamCancel() + conn.Close() + clog.WithField("address", address).Debug("processor was connected by another goroutine, discarding duplicate") + return nil + } + + // Store the connection + pc := &processorConnection{ + address: address, + conn: conn, + stream: stream, + cancel: streamCancel, + connectedAt: time.Now(), + } + pc.healthy.Store(true) + + c.connections[address] = pc + c.mu.Unlock() + + // Update metrics + c.updateConnectionMetrics("connected") + + // Start receiver goroutine for this connection + go c.receiveFromProcessor(pc) + + clog.WithField("address", address).Info("connected to FLP processor") + return nil +} + +// RemoveProcessor disconnects from a FLP processor +func (c *Client) RemoveProcessor(address string) { + c.mu.Lock() + pc, exists := c.connections[address] + if !exists { + c.mu.Unlock() + return + } + + clog.WithField("address", address).Info("disconnecting from FLP processor") + delete(c.connections, address) + c.mu.Unlock() + + // Update metrics - measure lifetime if we have a connection timestamp + c.updateConnectionMetrics("disconnected") + if !pc.connectedAt.IsZero() { + c.observeProcessorLifetime(pc.connectedAt) + } + + // Close connection after releasing client lock to avoid holding both locks + pc.closeConnection() +} + +// RemoveStaleProcessors removes connections to processors that are no longer in the discovered set. 
+// This is called after discovery to clean up connections to pods that have been deleted or restarted +// with new IPs. Prevents memory leaks and zombie connections. +func (c *Client) RemoveStaleProcessors(discoveredAddresses map[string]bool) { + c.mu.RLock() + var staleAddresses []string + for address := range c.connections { + if !discoveredAddresses[address] { + staleAddresses = append(staleAddresses, address) + } + } + c.mu.RUnlock() + + if len(staleAddresses) > 0 { + clog.WithFields(log.Fields{ + "num_stale": len(staleAddresses), + "stale_addresses": staleAddresses, + }).Info("removing stale processor connections") + + for _, address := range staleAddresses { + c.RemoveProcessor(address) + } + } +} + +// Start begins processing cache updates and sending them to all connected processors +func (c *Client) Start() { + go c.processorLoop() +} + +// Stop shuts down the client +func (c *Client) Stop() { + c.cancel() + c.mu.Lock() + connections := make([]*processorConnection, 0, len(c.connections)) + for _, pc := range c.connections { + connections = append(connections, pc) + } + c.connections = make(map[string]*processorConnection) + c.mu.Unlock() + + // Close all connections after releasing client lock + for _, pc := range connections { + pc.closeConnection() + } +} + +// SendAdd sends an ADD operation to all connected processors +// Entries are sent in batches according to the configured batch size +func (c *Client) SendAdd(entries []*model.ResourceMetaData) error { + return c.sendBatched(entries, OperationType_OPERATION_ADD, false) +} + +// SendUpdate sends an UPDATE operation to all connected processors +// Entries are sent in batches according to the configured batch size +func (c *Client) SendUpdate(entries []*model.ResourceMetaData) error { + return c.sendBatched(entries, OperationType_OPERATION_UPDATE, false) +} + +// SendDelete sends a DELETE operation to all connected processors +// Entries are sent in batches according to the configured batch size +func (c *Client) SendDelete(entries []*model.ResourceMetaData) error { + return c.sendBatched(entries, OperationType_OPERATION_DELETE, false) +} + +// sendBatched sends entries in batches to all connected processors +func (c *Client) sendBatched(entries []*model.ResourceMetaData, operation OperationType, isSnapshot bool) error { + if len(entries) == 0 { + return nil + } + + batchSize := c.batchSize + numBatches := (len(entries) + batchSize - 1) / batchSize + + if numBatches > 1 { + clog.WithFields(log.Fields{ + "total_entries": len(entries), + "batch_size": batchSize, + "num_batches": numBatches, + "operation": operation, + }).Debug("sending entries in batches") + } + + for i := 0; i < len(entries); i += batchSize { + end := i + batchSize + if end > len(entries) { + end = len(entries) + } + + batch := entries[i:end] + version := c.version.Add(1) + update := &CacheUpdate{ + Version: version, + IsSnapshot: isSnapshot, + Operation: operation, + Entries: metaToResourceEntries(batch), + } + + if err := c.sendUpdate(update); err != nil { + return fmt.Errorf("failed to send batch %d/%d: %w", (i/batchSize)+1, numBatches, err) + } + } + + return nil +} + +// SendSnapshot sends a full snapshot to a specific processor. +// This is used when a processor connects/restarts to get the current state. 
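+// Entries are split into batches of at most BatchSize; each batch is sent as a
+// CacheUpdate with IsSnapshot set to true.
+//
+// Usage sketch (hedged; informer, client and the target address are illustrative assumptions):
+//
+//	entries := informer.GetAllResources()
+//	if err := client.SendSnapshot(entries, "10.128.0.12:9999"); err != nil {
+//		clog.WithError(err).Error("failed to push snapshot")
+//	}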
+func (c *Client) SendSnapshot(entries []*model.ResourceMetaData, targetAddress string) error { + batchSize := c.batchSize + numBatches := (len(entries) + batchSize - 1) / batchSize + + clog.WithFields(log.Fields{ + "num_entries": len(entries), + "batch_size": batchSize, + "num_batches": numBatches, + "target": targetAddress, + }).Info("sending snapshot to processor") + + // Split entries into batches if needed + for i := 0; i < len(entries); i += batchSize { + end := i + batchSize + if end > len(entries) { + end = len(entries) + } + + batch := entries[i:end] + version := c.version.Add(1) + update := &CacheUpdate{ + Version: version, + IsSnapshot: true, + Operation: OperationType_OPERATION_ADD, + Entries: metaToResourceEntries(batch), + } + + // Send directly to the target processor + c.mu.RLock() + pc, exists := c.connections[targetAddress] + c.mu.RUnlock() + + if !exists { + return fmt.Errorf("processor %s not found", targetAddress) + } + + if !pc.healthy.Load() { + return fmt.Errorf("processor %s is not healthy", targetAddress) + } + + stream := pc.getStream() + if stream == nil { + return fmt.Errorf("stream is nil for processor %s", targetAddress) + } + + sendCtx, sendCancel := context.WithTimeout(context.Background(), c.sendTimeout) + done := make(chan error, 1) + + go func() { + select { + case <-sendCtx.Done(): + return + case done <- stream.Send(update): + } + }() + + select { + case err := <-done: + sendCancel() + if err != nil { + return fmt.Errorf("failed to send snapshot batch to %s: %w", targetAddress, err) + } + case <-sendCtx.Done(): + sendCancel() + pc.cancelStream() + return fmt.Errorf("timeout sending snapshot batch to %s", targetAddress) + } + + clog.WithFields(log.Fields{ + "batch": (i / batchSize) + 1, + "num_batches": numBatches, + "batch_size": len(batch), + "target": targetAddress, + }).Debug("sent snapshot batch") + } + + clog.WithField("target", targetAddress).Info("snapshot sent successfully") + + if metrics.InformersMetrics != nil { + metrics.InformersMetrics.CacheSnapshotsSentTotal.Inc() + } + + return nil +} + +// sendUpdate sends an update to the update channel (non-blocking with timeout) +func (c *Client) sendUpdate(update *CacheUpdate) error { + select { + case c.updateChan <- update: + return nil + case <-time.After(c.sendTimeout): + return fmt.Errorf("timeout sending update to channel") + case <-c.ctx.Done(): + return c.ctx.Err() + } +} + +// processorLoop reads from updateChan and sends to all connected processors +func (c *Client) processorLoop() { + for { + select { + case <-c.ctx.Done(): + return + case update := <-c.updateChan: + c.broadcastUpdate(update) + } + } +} + +// broadcastUpdate sends an update to all connected processors +func (c *Client) broadcastUpdate(update *CacheUpdate) { + c.mu.RLock() + connections := make([]*processorConnection, 0, len(c.connections)) + for _, pc := range c.connections { + if pc.healthy.Load() { + connections = append(connections, pc) + } + } + c.mu.RUnlock() + + if len(connections) == 0 { + clog.Warn("no healthy processor connections to send update") + return + } + + clog.WithFields(log.Fields{ + "version": update.Version, + "is_snapshot": update.IsSnapshot, + "operation": update.Operation, + "num_entries": len(update.Entries), + "num_targets": len(connections), + }).Debug("broadcasting cache update") + + // Measure message size for metrics + messageSize := proto.Size(update) + + // Send to all processors concurrently + var wg sync.WaitGroup + for _, pc := range connections { + wg.Add(1) + go func(pc 
*processorConnection) { + defer wg.Done() + + // Get stream reference under lock + stream := pc.getStream() + if stream == nil { + clog.WithField("address", pc.address).Warn("stream is nil, skipping send") + pc.healthy.Store(false) + return + } + + // Create a context with timeout for this send operation + sendCtx, sendCancel := context.WithTimeout(context.Background(), c.sendTimeout) + defer sendCancel() + + // Use a channel to signal completion or timeout + done := make(chan error, 1) + go func() { + select { + case <-sendCtx.Done(): + // Context cancelled, exit goroutine to prevent leak + return + case done <- stream.Send(update): + // Send completed (success or error) + } + }() + + select { + case err := <-done: + if err != nil { + clog.WithError(err).WithField("address", pc.address).Error("failed to send update") + pc.healthy.Store(false) + } else { + // Update gRPC metrics on successful send + c.updateGrpcSentMetrics(messageSize) + } + case <-sendCtx.Done(): + clog.WithField("address", pc.address).Error("send operation timed out") + pc.healthy.Store(false) + + // Cancel the stream context to ensure any blocked operations are unblocked + // This will cause receiveFromProcessor to see an error and trigger reconnection + pc.cancelStream() + } + }(pc) + } + wg.Wait() +} + +// reconnect attempts to reconnect to a failed processor with exponential backoff +func (c *Client) reconnect(pc *processorConnection) bool { + backoff := initialBackoff + for attempt := 1; attempt <= maxReconnectAttempts; attempt++ { + // Lock pc.mu for updating reconnect attempts + pc.mu.Lock() + pc.reconnectAttempts = attempt + address := pc.address + pc.mu.Unlock() + + clog.WithFields(log.Fields{ + "address": address, + "attempt": attempt, + "backoff": backoff, + }).Info("attempting to reconnect") + + // Wait before retry (with context cancellation support) + select { + case <-time.After(backoff): + case <-c.ctx.Done(): + clog.WithField("address", address).Info("reconnection cancelled") + return false + } + + // Close old connection under lock + pc.mu.Lock() + if pc.conn != nil { + pc.conn.Close() + } + if pc.cancel != nil { + pc.cancel() + } + pc.mu.Unlock() + + // Get transport credentials (without holding any locks) + creds, err := c.getTransportCredentials() + if err != nil { + clog.WithError(err).WithField("address", address).Warn("reconnect: failed to create transport credentials") + backoff = min(time.Duration(float64(backoff)*backoffMultiplier), maxBackoff) + continue + } + + // Create new connection (without holding any locks) + conn, err := grpc.NewClient(address, + grpc.WithTransportCredentials(creds), + grpc.WithDefaultCallOptions(grpc.MaxCallRecvMsgSize(50*1024*1024)), + ) + if err != nil { + clog.WithError(err).WithField("address", address).Warn("reconnect: failed to connect") + backoff = min(time.Duration(float64(backoff)*backoffMultiplier), maxBackoff) + continue + } + + // Create new stream (without holding any locks) + client := NewKubernetesCacheServiceClient(conn) + ctx, cancel := context.WithCancel(c.ctx) + stream, err := client.StreamUpdates(ctx) + if err != nil { + clog.WithError(err).WithField("address", address).Warn("reconnect: failed to create stream") + cancel() + conn.Close() + backoff = min(time.Duration(float64(backoff)*backoffMultiplier), maxBackoff) + continue + } + + // Update connection under lock + pc.mu.Lock() + pc.conn = conn + pc.stream = stream + pc.cancel = cancel + pc.healthy.Store(true) + pc.reconnectAttempts = 0 + pc.connectedAt = time.Now() // Reset connection timestamp 
on reconnect + pc.mu.Unlock() + + // Update metrics + c.updateConnectionMetrics("reconnected") + + clog.WithFields(log.Fields{ + "address": address, + "attempt": attempt, + }).Info("reconnection successful") + + return true + } + + clog.WithFields(log.Fields{ + "address": pc.address, + "attempts": maxReconnectAttempts, + }).Error("reconnection failed after max attempts, removing processor") + + // Remove from connections map after exhausting retries + // This is safe now as we don't hold pc.mu when acquiring c.mu + c.mu.Lock() + delete(c.connections, pc.address) + c.mu.Unlock() + + return false +} + +// receiveFromProcessor handles incoming messages from a processor (SyncRequest/SyncAck) +func (c *Client) receiveFromProcessor(pc *processorConnection) { + defer func() { + pc.healthy.Store(false) + clog.WithField("address", pc.address).Info("receiver goroutine stopped") + }() + + for { + // Get stream reference under lock + stream := pc.getStream() + if stream == nil { + clog.WithField("address", pc.address).Warn("stream is nil in receiver, exiting") + return + } + + msg, err := stream.Recv() + if err != nil { + if errors.Is(err, io.EOF) { + clog.WithField("address", pc.address).Info("processor disconnected (EOF)") + } else { + clog.WithError(err).WithField("address", pc.address).Warn("error receiving from processor") + } + + // Mark as unhealthy and attempt reconnection + pc.healthy.Store(false) + + // Attempt to reconnect + if c.reconnect(pc) { + // Reconnection successful, restart receiver loop + clog.WithField("address", pc.address).Info("restarting receiver after successful reconnection") + continue + } + + // Reconnection failed, exit + return + } + + // Update gRPC metrics on successful receive + messageSize := proto.Size(msg) + c.updateGrpcRecvMetrics(messageSize) + + switch m := msg.Message.(type) { + case *SyncMessage_Request: + c.handleSyncRequest(pc, m.Request) + case *SyncMessage_Ack: + c.handleSyncAck(pc, m.Ack) + default: + clog.WithField("address", pc.address).Warn("received unknown message type") + } + } +} + +// handleSyncRequest handles a SyncRequest from a processor +func (c *Client) handleSyncRequest(pc *processorConnection, req *SyncRequest) { + clog.WithFields(log.Fields{ + "address": pc.address, + "processor_id": req.ProcessorId, + "last_version": req.LastVersion, + }).Info("received SyncRequest from processor") + + // Only send snapshot if LastVersion is 0 (processor is new or restarted) + // If LastVersion > 0, the processor is reconnecting and will continue receiving + // incremental updates (ADD/UPDATE/DELETE) from where it left off + if req.LastVersion == 0 { + // Get informer data source + c.infMu.RLock() + informer := c.informer + c.infMu.RUnlock() + + if informer == nil { + clog.Warn("informer not set, cannot send snapshot") + return + } + + // Get all resources from informer cache (local, no K8s API query) + allResources := informer.GetAllResources() + + clog.WithFields(log.Fields{ + "address": pc.address, + "processor_id": req.ProcessorId, + "num_resources": len(allResources), + }).Info("sending snapshot to new/restarted processor") + + // Send snapshot to this specific processor + if err := c.SendSnapshot(allResources, pc.address); err != nil { + clog.WithError(err).WithField("address", pc.address).Error("failed to send snapshot") + } else { + clog.WithField("address", pc.address).Info("snapshot sent successfully to processor") + } + } else { + clog.WithFields(log.Fields{ + "address": pc.address, + "processor_id": req.ProcessorId, + "last_version": 
req.LastVersion, + }).Info("processor reconnecting with existing state, continuing incremental updates") + } +} + +// handleSyncAck handles a SyncAck from a processor +func (c *Client) handleSyncAck(pc *processorConnection, ack *SyncAck) { + if ack.Success { + clog.WithFields(log.Fields{ + "address": pc.address, + "processor_id": ack.ProcessorId, + "version": ack.Version, + }).Debug("received ACK from processor") + } else { + clog.WithFields(log.Fields{ + "address": pc.address, + "processor_id": ack.ProcessorId, + "version": ack.Version, + "error": ack.Error, + }).Error("received NACK from processor") + } +} + +// GetVersion returns the current cache version +func (c *Client) GetVersion() int64 { + return c.version.Load() +} + +// SetInformer sets the informer data source for obtaining snapshots +func (c *Client) SetInformer(informer InformerDataSource) { + c.infMu.Lock() + defer c.infMu.Unlock() + c.informer = informer + clog.Info("Informer data source set for snapshot generation") +} + +// updateConnectionMetrics updates processor connection metrics +func (c *Client) updateConnectionMetrics(event string) { + if metrics.InformersMetrics != nil { + metrics.InformersMetrics.ProcessorConnectionsTotal.WithLabelValues(event).Inc() + + // Update connected processors gauge + switch event { + case "connected", "reconnected": + metrics.InformersMetrics.ConnectedProcessors.Inc() + case "disconnected": + metrics.InformersMetrics.ConnectedProcessors.Dec() + } + } +} + +// observeProcessorLifetime records the lifetime of a processor connection +func (c *Client) observeProcessorLifetime(connectedAt time.Time) { + if metrics.InformersMetrics != nil { + lifetime := time.Since(connectedAt).Seconds() + metrics.InformersMetrics.ProcessorLifetimeDuration.Observe(lifetime) + } +} + +// updateGrpcSentMetrics updates gRPC sent metrics +func (c *Client) updateGrpcSentMetrics(messageSize int) { + if metrics.InformersMetrics != nil { + metrics.InformersMetrics.GrpcBytesSentTotal.Add(float64(messageSize)) + metrics.InformersMetrics.GrpcMessagesSentTotal.Inc() + } +} + +// updateGrpcRecvMetrics updates gRPC received metrics +func (c *Client) updateGrpcRecvMetrics(messageSize int) { + if metrics.InformersMetrics != nil { + metrics.InformersMetrics.GrpcBytesRecvTotal.Add(float64(messageSize)) + metrics.InformersMetrics.GrpcMessagesRecvTotal.Inc() + } +} diff --git a/pkg/pipeline/transform/kubernetes/k8scache/converter.go b/pkg/pipeline/transform/kubernetes/k8scache/converter.go new file mode 100644 index 000000000..5806d03f6 --- /dev/null +++ b/pkg/pipeline/transform/kubernetes/k8scache/converter.go @@ -0,0 +1,96 @@ +package k8scache + +import ( + "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/model" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +// resourceEntryToMeta converts a gRPC ResourceEntry to the internal model used by the datasource. 
+func resourceEntryToMeta(entry *ResourceEntry) *model.ResourceMetaData { + if entry == nil { + return nil + } + meta := &model.ResourceMetaData{ + ObjectMeta: metav1.ObjectMeta{ + Name: entry.Name, + Namespace: entry.Namespace, + UID: "", + ResourceVersion: entry.ResourceVersion, + Labels: entry.Labels, + Annotations: entry.Annotations, + }, + Kind: entry.Kind, + OwnerName: entry.OwnerName, + OwnerKind: entry.OwnerKind, + HostName: entry.HostName, + HostIP: entry.HostIp, + NetworkName: entry.NetworkName, + IPs: append([]string(nil), entry.Ips...), + SecondaryNetKeys: append([]string(nil), entry.SecondaryNetKeys...), + SecondaryNetNames: entry.SecondaryNetNames, + } + if entry.Uid != "" { + meta.UID = types.UID(entry.Uid) + } + if entry.CreationTimestamp != 0 { + meta.CreationTimestamp = metav1.Unix(entry.CreationTimestamp, 0) + } + return meta +} + +// resourceEntriesToMeta converts a slice of ResourceEntry to model.ResourceMetaData. +func resourceEntriesToMeta(entries []*ResourceEntry) []*model.ResourceMetaData { + if len(entries) == 0 { + return nil + } + out := make([]*model.ResourceMetaData, 0, len(entries)) + for _, e := range entries { + if m := resourceEntryToMeta(e); m != nil { + out = append(out, m) + } + } + return out +} + +// metaToResourceEntry converts internal model.ResourceMetaData to gRPC ResourceEntry. +func metaToResourceEntry(meta *model.ResourceMetaData) *ResourceEntry { + if meta == nil { + return nil + } + entry := &ResourceEntry{ + Kind: meta.Kind, + Namespace: meta.Namespace, + Name: meta.Name, + Uid: string(meta.UID), + OwnerName: meta.OwnerName, + OwnerKind: meta.OwnerKind, + HostName: meta.HostName, + HostIp: meta.HostIP, + NetworkName: meta.NetworkName, + Ips: append([]string(nil), meta.IPs...), + SecondaryNetKeys: append([]string(nil), meta.SecondaryNetKeys...), + SecondaryNetNames: meta.SecondaryNetNames, + Labels: meta.Labels, + Annotations: meta.Annotations, + ResourceVersion: meta.ResourceVersion, + } + if !meta.CreationTimestamp.IsZero() { + entry.CreationTimestamp = meta.CreationTimestamp.Unix() + } + return entry +} + +// metaToResourceEntries converts a slice of model.ResourceMetaData to ResourceEntry. 
+func metaToResourceEntries(metas []*model.ResourceMetaData) []*ResourceEntry { + if len(metas) == 0 { + return nil + } + out := make([]*ResourceEntry, 0, len(metas)) + for _, m := range metas { + if e := metaToResourceEntry(m); e != nil { + out = append(out, e) + } + } + return out +} diff --git a/pkg/pipeline/transform/kubernetes/k8scache/discovery.go b/pkg/pipeline/transform/kubernetes/k8scache/discovery.go new file mode 100644 index 000000000..c7953ec9d --- /dev/null +++ b/pkg/pipeline/transform/kubernetes/k8scache/discovery.go @@ -0,0 +1,149 @@ +package k8scache + +import ( + "context" + "fmt" + "os" + "time" + + "github.com/netobserv/flowlogs-pipeline/pkg/metrics" + log "github.com/sirupsen/logrus" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/clientcmd" +) + +// DiscoveryConfig holds configuration for processor discovery +type DiscoveryConfig struct { + // Kubeconfig path (empty for in-cluster config) + Kubeconfig string + // ProcessorSelector is the label selector for FLP processor pods (e.g., "app=flowlogs-pipeline") + ProcessorSelector string + // ProcessorPort is the port where FLP processors listen for gRPC + ProcessorPort int + // ResyncInterval is how often to rediscover processors (in seconds) + ResyncInterval int + // ProcessorServiceName is the headless service name for processor pods (optional). + // If set, discovery will use DNS names (e.g., pod-name.service-name.namespace.svc.cluster.local) + // instead of pod IPs. This is required for proper TLS certificate validation. + ProcessorServiceName string +} + +// StartProcessorDiscovery periodically discovers FLP processor pods and connects the client to them. +// It runs in a loop until the context is cancelled, discovering processors at the configured interval. +// +// The discovery process: +// 1. Lists pods matching ProcessorSelector in the current namespace (from POD_NAMESPACE env var) +// 2. Filters for running pods with assigned IPs +// 3. Connects the client to each discovered processor (idempotent - won't duplicate connections) +// +// This function blocks until ctx is cancelled. Run it in a goroutine for background discovery. 
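+//
+// Wiring sketch (hedged; ctx and informers are assumed to exist, and the selector,
+// port and interval below are illustrative values, not defaults):
+//
+//	client := NewClient(&ClientConfig{ProcessorID: "flp-informers-0"})
+//	client.SetInformer(informers)
+//	client.Start()
+//	go func() {
+//		if err := StartProcessorDiscovery(ctx, client, DiscoveryConfig{
+//			ProcessorSelector: "app=flowlogs-pipeline",
+//			ProcessorPort:     9999,
+//			ResyncInterval:    30,
+//		}); err != nil {
+//			log.WithError(err).Error("processor discovery stopped")
+//		}
+//	}()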
+func StartProcessorDiscovery(ctx context.Context, client *Client, cfg DiscoveryConfig) error { + // Validate ResyncInterval before doing any work + if cfg.ResyncInterval <= 0 { + return fmt.Errorf("invalid ResyncInterval: %d (must be positive)", cfg.ResyncInterval) + } + + // Get Kubernetes client + k8sConfig, err := getK8sConfig(cfg.Kubeconfig) + if err != nil { + return fmt.Errorf("failed to get k8s config for processor discovery: %w", err) + } + + clientset, err := kubernetes.NewForConfig(k8sConfig) + if err != nil { + return fmt.Errorf("failed to create k8s clientset: %w", err) + } + + ticker := time.NewTicker(time.Duration(cfg.ResyncInterval) * time.Second) + defer ticker.Stop() + + // Immediate first run + discoverAndConnect(ctx, clientset, client, cfg) + + for { + select { + case <-ctx.Done(): + return ctx.Err() + case <-ticker.C: + discoverAndConnect(ctx, clientset, client, cfg) + } + } +} + +// discoverAndConnect discovers FLP processor pods and connects to them +// Also removes connections to processors that no longer exist (e.g., pods that restarted with new IPs) +func discoverAndConnect(ctx context.Context, clientset *kubernetes.Clientset, client *Client, cfg DiscoveryConfig) { + namespace := os.Getenv("POD_NAMESPACE") + if namespace == "" { + namespace = "default" + } + + pods, err := clientset.CoreV1().Pods(namespace).List(ctx, metav1.ListOptions{ + LabelSelector: cfg.ProcessorSelector, + }) + if err != nil { + log.WithError(err).Error("failed to list processor pods") + if metrics.InformersMetrics != nil { + metrics.InformersMetrics.ErrorsTotal.WithLabelValues("discovery").Inc() + } + return + } + + log.WithField("num_pods", len(pods.Items)).Debug("discovered processor pods") + + // Track discovered addresses in this cycle + discoveredAddresses := make(map[string]bool) + + for i := range pods.Items { + pod := &pods.Items[i] + if pod.Status.Phase != v1.PodRunning { + continue + } + if pod.Status.PodIP == "" { + continue + } + + // Build address: prefer DNS name (for TLS) if service name is configured, otherwise use IP + var address string + if cfg.ProcessorServiceName != "" { + // Use DNS name: ...svc.cluster.local:port + // This is required for proper TLS certificate validation + dnsName := fmt.Sprintf("%s.%s.%s.svc.cluster.local", pod.Name, cfg.ProcessorServiceName, namespace) + address = fmt.Sprintf("%s:%d", dnsName, cfg.ProcessorPort) + log.WithFields(log.Fields{ + "pod": pod.Name, + "dns_name": dnsName, + }).Debug("using DNS name for processor connection (TLS-friendly)") + } else { + // Fallback to IP address (may cause TLS verification issues) + address = fmt.Sprintf("%s:%d", pod.Status.PodIP, cfg.ProcessorPort) + } + + // Mark this address as discovered + discoveredAddresses[address] = true + + // AddProcessorWithTimeout is idempotent (won't duplicate if already connected) + // Use a 10-second timeout to avoid blocking the discovery loop for too long + if err := client.AddProcessorWithTimeout(address, 10*time.Second); err != nil { + log.WithError(err).WithField("pod", pod.Name).Error("failed to connect to processor") + if metrics.InformersMetrics != nil { + metrics.InformersMetrics.ErrorsTotal.WithLabelValues("discovery").Inc() + } + } + } + + // Remove connections to processors that are no longer discovered + // This handles cases where pods restart with new IPs or are deleted + client.RemoveStaleProcessors(discoveredAddresses) +} + +// getK8sConfig returns the Kubernetes client config (in-cluster or from kubeconfig) +func getK8sConfig(kubeconfig string) 
(*rest.Config, error) { + if kubeconfig != "" { + return clientcmd.BuildConfigFromFlags("", kubeconfig) + } + return rest.InClusterConfig() +} diff --git a/pkg/pipeline/transform/kubernetes/k8scache/discovery_test.go b/pkg/pipeline/transform/kubernetes/k8scache/discovery_test.go new file mode 100644 index 000000000..5627f5c34 --- /dev/null +++ b/pkg/pipeline/transform/kubernetes/k8scache/discovery_test.go @@ -0,0 +1,65 @@ +package k8scache + +import ( + "context" + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestStartProcessorDiscovery_InvalidResyncInterval(t *testing.T) { + ctx := context.Background() + client := NewClient(&ClientConfig{ + ProcessorID: "test", + TLSEnabled: false, + }) + + testCases := []struct { + name string + resyncInterval int + shouldFail bool + }{ + { + name: "zero interval", + resyncInterval: 0, + shouldFail: true, + }, + { + name: "negative interval", + resyncInterval: -1, + shouldFail: true, + }, + { + name: "negative interval large", + resyncInterval: -100, + shouldFail: true, + }, + { + name: "positive interval", + resyncInterval: 10, + shouldFail: false, // Will fail for other reasons (no k8s), but not validation + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + cfg := DiscoveryConfig{ + Kubeconfig: "/nonexistent/path/to/kubeconfig", // Will fail on k8s config + ProcessorSelector: "app=test", + ProcessorPort: 9090, + ResyncInterval: tc.resyncInterval, + } + + err := StartProcessorDiscovery(ctx, client, cfg) + + if tc.shouldFail { + assert.Error(t, err, "should fail with invalid ResyncInterval") + assert.Contains(t, err.Error(), "invalid ResyncInterval", "error should mention ResyncInterval") + } else { + // Will fail due to invalid kubeconfig, but not due to ResyncInterval + assert.Error(t, err, "will fail due to kubeconfig") + assert.NotContains(t, err.Error(), "invalid ResyncInterval", "error should not be about ResyncInterval") + } + }) + } +} diff --git a/pkg/pipeline/transform/kubernetes/k8scache/eventhandler.go b/pkg/pipeline/transform/kubernetes/k8scache/eventhandler.go new file mode 100644 index 000000000..045d56fc3 --- /dev/null +++ b/pkg/pipeline/transform/kubernetes/k8scache/eventhandler.go @@ -0,0 +1,109 @@ +package k8scache + +import ( + "github.com/netobserv/flowlogs-pipeline/pkg/metrics" + "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/model" + log "github.com/sirupsen/logrus" + "k8s.io/client-go/tools/cache" +) + +// clientSender defines the interface for sending cache updates. +// This interface allows for easier testing by enabling mock implementations. +type clientSender interface { + SendAdd(entries []*model.ResourceMetaData) error + SendUpdate(entries []*model.ResourceMetaData) error + SendDelete(entries []*model.ResourceMetaData) error +} + +// EventHandler implements informers.EventHandler to push cache updates via gRPC. +// It handles Kubernetes resource events (Add, Update, Delete) and forwards them +// to connected FLP processor pods through the k8scache client. +type EventHandler struct { + client clientSender +} + +// NewEventHandler creates a new event handler that forwards K8s events to the given client +func NewEventHandler(client *Client) *EventHandler { + return &EventHandler{client: client} +} + +// OnAdd is called when a new resource is added to the informer cache. +// It skips resources from the initial list (isInInitialList=true) to avoid +// sending full snapshots, and only forwards incremental additions. 
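+//
+// Registration sketch (hedged; informers and client are assumed to be initialized
+// elsewhere, and registration relies on the Informers.AddEventHandler method added
+// in this change):
+//
+//	handler := NewEventHandler(client)
+//	if err := informers.AddEventHandler(handler); err != nil {
+//		log.WithError(err).Error("failed to register cache event handler")
+//	}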
+func (h *EventHandler) OnAdd(obj interface{}, isInInitialList bool) { + if isInInitialList { + // Skip initial list - we send incremental updates only + return + } + + meta, ok := obj.(*model.ResourceMetaData) + if !ok { + // Kubernetes sometimes sends partial metadata objects for optimization. + // These don't have the full info we need (IPs, etc), so we skip them. + log.Debugf("skipping partial metadata object in OnAdd: %T", obj) + return + } + + if err := h.client.SendAdd([]*model.ResourceMetaData{meta}); err != nil { + log.WithError(err).WithField("resource", meta.Name).Error("failed to send ADD") + } else if metrics.InformersMetrics != nil { + metrics.InformersMetrics.CacheUpdatesTotal.WithLabelValues("ADD").Inc() + } +} + +// OnUpdate is called when a resource is updated in the informer cache. +// It forwards the new state to all connected processors. +func (h *EventHandler) OnUpdate(_, newObj interface{}) { + meta, ok := newObj.(*model.ResourceMetaData) + if !ok { + // Kubernetes sometimes sends partial metadata objects for optimization. + // These don't have the full info we need (IPs, etc), so we skip them. + log.Debugf("skipping partial metadata object in OnUpdate: %T", newObj) + return + } + + if err := h.client.SendUpdate([]*model.ResourceMetaData{meta}); err != nil { + log.WithError(err).WithField("resource", meta.Name).Error("failed to send UPDATE") + } else if metrics.InformersMetrics != nil { + metrics.InformersMetrics.CacheUpdatesTotal.WithLabelValues("UPDATE").Inc() + } +} + +// OnDelete is called when a resource is deleted from the informer cache. +// It handles both normal delete events and tombstones (DeletedFinalStateUnknown). +// +// Tombstones occur when the informer misses a delete event (e.g., due to temporary +// disconnection). In this case, Kubernetes sends a DeletedFinalStateUnknown object +// containing the last known state of the deleted resource. Without proper handling, +// these missed deletes would leave stale entries in the cache. +func (h *EventHandler) OnDelete(obj interface{}) { + var meta *model.ResourceMetaData + var ok bool + + // Handle tombstones: when an informer misses a delete event, it can send a + // DeletedFinalStateUnknown object containing the last known state + if tombstone, isTombstone := obj.(cache.DeletedFinalStateUnknown); isTombstone { + // Extract the actual object from the tombstone + meta, ok = tombstone.Obj.(*model.ResourceMetaData) + if !ok { + // Kubernetes sometimes sends partial metadata objects for optimization. + log.Debugf("tombstone contained partial metadata object in OnDelete: %T", tombstone.Obj) + return + } + log.Debugf("recovered delete event from tombstone for resource: %s", meta.Name) + } else { + // Not a tombstone, try direct conversion + meta, ok = obj.(*model.ResourceMetaData) + if !ok { + // Kubernetes sometimes sends partial metadata objects for optimization. 
+ log.Debugf("skipping partial metadata object in OnDelete: %T", obj) + return + } + } + + if err := h.client.SendDelete([]*model.ResourceMetaData{meta}); err != nil { + log.WithError(err).WithField("resource", meta.Name).Error("failed to send DELETE") + } else if metrics.InformersMetrics != nil { + metrics.InformersMetrics.CacheUpdatesTotal.WithLabelValues("DELETE").Inc() + } +} diff --git a/pkg/pipeline/transform/kubernetes/k8scache/eventhandler_test.go b/pkg/pipeline/transform/kubernetes/k8scache/eventhandler_test.go new file mode 100644 index 000000000..1bc2bfbd3 --- /dev/null +++ b/pkg/pipeline/transform/kubernetes/k8scache/eventhandler_test.go @@ -0,0 +1,189 @@ +package k8scache + +import ( + "sync" + "testing" + + "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/model" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/client-go/tools/cache" +) + +// mockClient captures calls to Send* methods for testing +// It implements the clientSender interface needed by EventHandler +type mockClient struct { + mu sync.Mutex + deletedMeta []*model.ResourceMetaData + addedMeta []*model.ResourceMetaData + updatedMeta []*model.ResourceMetaData + sendError error +} + +func (m *mockClient) SendDelete(entries []*model.ResourceMetaData) error { + m.mu.Lock() + defer m.mu.Unlock() + m.deletedMeta = append(m.deletedMeta, entries...) + return m.sendError +} + +func (m *mockClient) SendAdd(entries []*model.ResourceMetaData) error { + m.mu.Lock() + defer m.mu.Unlock() + m.addedMeta = append(m.addedMeta, entries...) + return m.sendError +} + +func (m *mockClient) SendUpdate(entries []*model.ResourceMetaData) error { + m.mu.Lock() + defer m.mu.Unlock() + m.updatedMeta = append(m.updatedMeta, entries...) 
+ return m.sendError +} + +func (m *mockClient) getDeleted() []*model.ResourceMetaData { + m.mu.Lock() + defer m.mu.Unlock() + return m.deletedMeta +} + +// createTestPod creates a ResourceMetaData for testing +func createTestPod(name, namespace string) *model.ResourceMetaData { + return &model.ResourceMetaData{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: namespace, + }, + Kind: model.KindPod, + } +} + +func TestOnDelete_TombstoneEvent(t *testing.T) { + testPod := createTestPod("test-pod", "default") + + // Create a tombstone wrapping our test pod + tombstone := cache.DeletedFinalStateUnknown{ + Key: "default/test-pod", + Obj: testPod, + } + + // Create mock client and handler + mock := &mockClient{} + handler := &EventHandler{client: mock} + + // Call the real OnDelete method with tombstone + handler.OnDelete(tombstone) + + // Verify the handler extracted and sent the correct metadata + deleted := mock.getDeleted() + require.Len(t, deleted, 1, "should have sent one delete") + assert.Equal(t, "test-pod", deleted[0].Name, "metadata name should match") + assert.Equal(t, "default", deleted[0].Namespace, "metadata namespace should match") + assert.Equal(t, model.KindPod, deleted[0].Kind, "metadata kind should match") +} + +func TestOnDelete_InvalidTombstone(t *testing.T) { + // Create a tombstone with wrong object type + invalidTombstone := cache.DeletedFinalStateUnknown{ + Key: "default/invalid", + Obj: "not-a-resource-metadata", + } + + // Create mock client and handler + mock := &mockClient{} + handler := &EventHandler{client: mock} + + // Call the real OnDelete method with invalid tombstone + // Should log warning and not send anything + handler.OnDelete(invalidTombstone) + + // Verify nothing was sent (handler gracefully handled invalid tombstone) + deleted := mock.getDeleted() + assert.Empty(t, deleted, "should not send delete for invalid tombstone") +} + +func TestOnDelete_InvalidDirectObject(t *testing.T) { + // Test with a completely wrong object type (not a tombstone, not ResourceMetaData) + var invalidObj interface{} = "not-a-resource-metadata" + + // Create mock client and handler + mock := &mockClient{} + handler := &EventHandler{client: mock} + + // Call the real OnDelete method with invalid object + // Should log warning and not send anything + handler.OnDelete(invalidObj) + + // Verify nothing was sent (handler gracefully handled invalid object) + deleted := mock.getDeleted() + assert.Empty(t, deleted, "should not send delete for invalid object") +} + +// Integration test that verifies the full OnDelete flow with tombstone +func TestOnDelete_TombstoneIntegration(t *testing.T) { + // This test verifies that the OnDelete method can handle both: + // 1. Normal delete events (direct ResourceMetaData object) + // 2. 
Tombstone events (DeletedFinalStateUnknown wrapping ResourceMetaData) + + testCases := []struct { + name string + obj interface{} + shouldSend bool + expectedName string + expectedNS string + }{ + { + name: "normal delete", + obj: createTestPod("pod1", "default"), + shouldSend: true, + expectedName: "pod1", + expectedNS: "default", + }, + { + name: "tombstone delete", + obj: cache.DeletedFinalStateUnknown{ + Key: "default/pod2", + Obj: createTestPod("pod2", "default"), + }, + shouldSend: true, + expectedName: "pod2", + expectedNS: "default", + }, + { + name: "invalid tombstone", + obj: cache.DeletedFinalStateUnknown{ + Key: "invalid", + Obj: "wrong-type", + }, + shouldSend: false, + }, + { + name: "invalid object", + obj: "not-a-resource", + shouldSend: false, + }, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + // Create mock client and handler for each test case + mock := &mockClient{} + handler := &EventHandler{client: mock} + + // Call the real OnDelete method + handler.OnDelete(tc.obj) + + // Verify the result + deleted := mock.getDeleted() + if tc.shouldSend { + require.Len(t, deleted, 1, "should have sent one delete") + assert.Equal(t, tc.expectedName, deleted[0].Name, "name should match") + assert.Equal(t, tc.expectedNS, deleted[0].Namespace, "namespace should match") + assert.Equal(t, model.KindPod, deleted[0].Kind, "kind should match") + } else { + assert.Empty(t, deleted, "should not send delete for invalid input") + } + }) + } +} diff --git a/pkg/pipeline/transform/kubernetes/k8scache/integration_test.go b/pkg/pipeline/transform/kubernetes/k8scache/integration_test.go new file mode 100644 index 000000000..b24f1b2af --- /dev/null +++ b/pkg/pipeline/transform/kubernetes/k8scache/integration_test.go @@ -0,0 +1,333 @@ +package k8scache + +import ( + "context" + "fmt" + "net" + "testing" + "time" + + "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/datasource" + inf "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/informers" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "google.golang.org/grpc" + "google.golang.org/grpc/credentials/insecure" +) + +// TestIntegration_ServerReceivesAdd tests that the server +// can receive an ADD update from a real gRPC client +func TestIntegration_ServerReceivesAdd(t *testing.T) { + // Setup test datasource + _, informers := inf.SetupStubs(testIPInfo, nil, testNodes) + ds := &datasource.Datasource{Informers: informers} + ds.SetKubernetesStore(datasource.NewKubernetesStore()) + + // Create cache server + cacheServer := NewKubernetesCacheServer(ds) + + // Create gRPC server + grpcServer := grpc.NewServer() + RegisterKubernetesCacheServiceServer(grpcServer, cacheServer) + + // Start server on random port + listener, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + defer listener.Close() + + address := listener.Addr().String() + + // Start server in background + go func() { + _ = grpcServer.Serve(listener) + }() + defer grpcServer.Stop() + + // Give server time to start + time.Sleep(100 * time.Millisecond) + + // Create client connection + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + conn, err := grpc.NewClient(address, + grpc.WithTransportCredentials(insecure.NewCredentials()), + ) + require.NoError(t, err, "Failed to connect to server") + defer conn.Close() + + // Create client + client := NewKubernetesCacheServiceClient(conn) + + // Open bidirectional stream + stream, err 
:= client.StreamUpdates(ctx) + require.NoError(t, err) + + // Client should receive SyncRequest from server first + syncMsg, err := stream.Recv() + require.NoError(t, err) + require.NotNil(t, syncMsg) + + req, ok := syncMsg.Message.(*SyncMessage_Request) + require.True(t, ok, "Expected SyncRequest from server") + assert.Equal(t, int64(0), req.Request.LastVersion) + + // Client sends ADD update + addUpdate := &CacheUpdate{ + Version: 1, + IsSnapshot: false, + Operation: OperationType_OPERATION_ADD, + Entries: []*ResourceEntry{ + { + Kind: "Pod", + Namespace: "default", + Name: "test-pod", + Uid: "test-uid", + Ips: []string{"10.0.0.100"}, + }, + }, + } + + err = stream.Send(addUpdate) + require.NoError(t, err) + + // Client should receive ACK from server + ackMsg, err := stream.Recv() + require.NoError(t, err) + require.NotNil(t, ackMsg) + + ack, ok := ackMsg.Message.(*SyncMessage_Ack) + require.True(t, ok, "Expected SyncAck from server") + assert.True(t, ack.Ack.Success, "Server should ACK successfully") + assert.Equal(t, int64(1), ack.Ack.Version) + + // Close stream + err = stream.CloseSend() + assert.NoError(t, err) + + fmt.Printf("✓ Integration test passed: server on %s received and acknowledged ADD update\n", address) +} + +// TestIntegration_MultipleUpdatesFlow tests a realistic update flow +func TestIntegration_MultipleUpdatesFlow(t *testing.T) { + // Setup + _, informers := inf.SetupStubs(testIPInfo, nil, testNodes) + ds := &datasource.Datasource{Informers: informers} + cacheServer := NewKubernetesCacheServer(ds) + + grpcServer := grpc.NewServer() + RegisterKubernetesCacheServiceServer(grpcServer, cacheServer) + + listener, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + defer listener.Close() + + address := listener.Addr().String() + + go func() { + _ = grpcServer.Serve(listener) + }() + defer grpcServer.Stop() + + time.Sleep(100 * time.Millisecond) + + // Create client + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + conn, err := grpc.NewClient(address, + grpc.WithTransportCredentials(insecure.NewCredentials()), + ) + require.NoError(t, err) + defer conn.Close() + + client := NewKubernetesCacheServiceClient(conn) + stream, err := client.StreamUpdates(ctx) + require.NoError(t, err) + + // Receive initial SyncRequest + syncMsg, err := stream.Recv() + require.NoError(t, err) + _, ok := syncMsg.Message.(*SyncMessage_Request) + require.True(t, ok) + + // Send first ADD + err = stream.Send(&CacheUpdate{ + Version: 1, + IsSnapshot: false, + Operation: OperationType_OPERATION_ADD, + Entries: []*ResourceEntry{ + {Kind: "Pod", Name: "pod1", Namespace: "default"}, + }, + }) + require.NoError(t, err) + + // Receive ACK + ackMsg, err := stream.Recv() + require.NoError(t, err) + ack, ok := ackMsg.Message.(*SyncMessage_Ack) + require.True(t, ok) + assert.True(t, ack.Ack.Success) + assert.Equal(t, int64(1), ack.Ack.Version) + + // Send second ADD + err = stream.Send(&CacheUpdate{ + Version: 2, + IsSnapshot: false, + Operation: OperationType_OPERATION_ADD, + Entries: []*ResourceEntry{{Kind: "Pod", Name: "pod2", Namespace: "default"}}, + }) + require.NoError(t, err) + + // Receive ACK + ackMsg, err = stream.Recv() + require.NoError(t, err) + ack, ok = ackMsg.Message.(*SyncMessage_Ack) + require.True(t, ok) + assert.True(t, ack.Ack.Success) + assert.Equal(t, int64(2), ack.Ack.Version) + + // Send DELETE + err = stream.Send(&CacheUpdate{ + Version: 3, + IsSnapshot: false, + Operation: OperationType_OPERATION_DELETE, + Entries: 
[]*ResourceEntry{{Kind: "Pod", Name: "pod1", Namespace: "default"}}, + }) + require.NoError(t, err) + + // Receive ACK + ackMsg, err = stream.Recv() + require.NoError(t, err) + ack, ok = ackMsg.Message.(*SyncMessage_Ack) + require.True(t, ok) + assert.True(t, ack.Ack.Success) + assert.Equal(t, int64(3), ack.Ack.Version) + + // Verify server state + assert.Equal(t, int64(3), cacheServer.GetCurrentVersion()) + + err = stream.CloseSend() + assert.NoError(t, err) + + fmt.Printf("✓ Integration test passed: full update flow (ADD + ADD + DELETE)\n") +} + +// TestIntegration_MultipleClientsConnect tests that multiple informer clients +// can connect to the same server +func TestIntegration_MultipleClientsConnect(t *testing.T) { + // Setup + _, informers := inf.SetupStubs(testIPInfo, nil, testNodes) + ds := &datasource.Datasource{Informers: informers} + cacheServer := NewKubernetesCacheServer(ds) + + grpcServer := grpc.NewServer() + RegisterKubernetesCacheServiceServer(grpcServer, cacheServer) + + listener, err := net.Listen("tcp", "127.0.0.1:0") + require.NoError(t, err) + defer listener.Close() + + address := listener.Addr().String() + + go func() { + _ = grpcServer.Serve(listener) + }() + defer grpcServer.Stop() + + time.Sleep(100 * time.Millisecond) + + // Create 3 clients concurrently + numClients := 3 + done := make(chan bool, numClients) + errors := make(chan error, numClients) + + for i := range numClients { + go func(clientID int) { + ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + defer cancel() + + conn, err := grpc.NewClient(address, + grpc.WithTransportCredentials(insecure.NewCredentials()), + ) + if err != nil { + errors <- fmt.Errorf("client %d failed to connect: %w", clientID, err) + return + } + defer conn.Close() + + client := NewKubernetesCacheServiceClient(conn) + stream, err := client.StreamUpdates(ctx) + if err != nil { + errors <- fmt.Errorf("client %d failed to open stream: %w", clientID, err) + return + } + + // Receive SyncRequest + syncMsg, err := stream.Recv() + if err != nil { + errors <- fmt.Errorf("client %d failed to receive SyncRequest: %w", clientID, err) + return + } + req, ok := syncMsg.Message.(*SyncMessage_Request) + if !ok { + errors <- fmt.Errorf("client %d expected SyncRequest but got %T", clientID, syncMsg.Message) + return + } + if req.Request.ProcessorId == "" { + errors <- fmt.Errorf("client %d received SyncRequest with empty ProcessorId", clientID) + return + } + + // Send ADD update + expectedVersion := int64(clientID + 1) + err = stream.Send(&CacheUpdate{ + Version: expectedVersion, + IsSnapshot: false, + Operation: OperationType_OPERATION_ADD, + Entries: []*ResourceEntry{{Kind: "Pod", Name: fmt.Sprintf("pod-%d", clientID), Namespace: "default"}}, + }) + if err != nil { + errors <- fmt.Errorf("client %d failed to send ADD update: %w", clientID, err) + return + } + + // Receive ACK + ackMsg, err := stream.Recv() + if err != nil { + errors <- fmt.Errorf("client %d failed to receive ACK: %w", clientID, err) + return + } + ack, ok := ackMsg.Message.(*SyncMessage_Ack) + if !ok { + errors <- fmt.Errorf("client %d expected SyncAck but got %T", clientID, ackMsg.Message) + return + } + if !ack.Ack.Success { + errors <- fmt.Errorf("client %d received ACK with Success=false, error: %s", clientID, ack.Ack.Error) + return + } + if ack.Ack.Version != expectedVersion { + errors <- fmt.Errorf("client %d expected ACK version %d but got %d", clientID, expectedVersion, ack.Ack.Version) + return + } + + _ = stream.CloseSend() + done <- true + 
}(i) + } + + // Wait for all clients + for range numClients { + select { + case <-done: + // Success + case err := <-errors: + t.Fatal(err) + case <-time.After(10 * time.Second): + t.Fatal("Timeout waiting for clients") + } + } + + fmt.Printf("✓ Integration test passed: %d clients connected successfully\n", numClients) +} diff --git a/pkg/pipeline/transform/kubernetes/k8scache/k8scache.pb.go b/pkg/pipeline/transform/kubernetes/k8scache/k8scache.pb.go new file mode 100644 index 000000000..7e7c95218 --- /dev/null +++ b/pkg/pipeline/transform/kubernetes/k8scache/k8scache.pb.go @@ -0,0 +1,648 @@ +// Code generated by protoc-gen-go. DO NOT EDIT. +// versions: +// protoc-gen-go v1.36.11 +// protoc v3.19.6 +// source: proto/k8scache.proto + +package k8scache + +import ( + protoreflect "google.golang.org/protobuf/reflect/protoreflect" + protoimpl "google.golang.org/protobuf/runtime/protoimpl" + reflect "reflect" + sync "sync" + unsafe "unsafe" +) + +const ( + // Verify that this generated code is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(20 - protoimpl.MinVersion) + // Verify that runtime/protoimpl is sufficiently up-to-date. + _ = protoimpl.EnforceVersion(protoimpl.MaxVersion - 20) +) + +// OperationType defines the kind of cache operation +type OperationType int32 + +const ( + OperationType_OPERATION_UNSPECIFIED OperationType = 0 + OperationType_OPERATION_ADD OperationType = 1 + OperationType_OPERATION_UPDATE OperationType = 2 + OperationType_OPERATION_DELETE OperationType = 3 +) + +// Enum value maps for OperationType. +var ( + OperationType_name = map[int32]string{ + 0: "OPERATION_UNSPECIFIED", + 1: "OPERATION_ADD", + 2: "OPERATION_UPDATE", + 3: "OPERATION_DELETE", + } + OperationType_value = map[string]int32{ + "OPERATION_UNSPECIFIED": 0, + "OPERATION_ADD": 1, + "OPERATION_UPDATE": 2, + "OPERATION_DELETE": 3, + } +) + +func (x OperationType) Enum() *OperationType { + p := new(OperationType) + *p = x + return p +} + +func (x OperationType) String() string { + return protoimpl.X.EnumStringOf(x.Descriptor(), protoreflect.EnumNumber(x)) +} + +func (OperationType) Descriptor() protoreflect.EnumDescriptor { + return file_proto_k8scache_proto_enumTypes[0].Descriptor() +} + +func (OperationType) Type() protoreflect.EnumType { + return &file_proto_k8scache_proto_enumTypes[0] +} + +func (x OperationType) Number() protoreflect.EnumNumber { + return protoreflect.EnumNumber(x) +} + +// Deprecated: Use OperationType.Descriptor instead. 
+func (OperationType) EnumDescriptor() ([]byte, []int) { + return file_proto_k8scache_proto_rawDescGZIP(), []int{0} +} + +// SyncMessage is sent by FLP processors (server) to the informers (client) +type SyncMessage struct { + state protoimpl.MessageState `protogen:"open.v1"` + // Types that are valid to be assigned to Message: + // + // *SyncMessage_Request + // *SyncMessage_Ack + Message isSyncMessage_Message `protobuf_oneof:"message"` + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SyncMessage) Reset() { + *x = SyncMessage{} + mi := &file_proto_k8scache_proto_msgTypes[0] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SyncMessage) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SyncMessage) ProtoMessage() {} + +func (x *SyncMessage) ProtoReflect() protoreflect.Message { + mi := &file_proto_k8scache_proto_msgTypes[0] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SyncMessage.ProtoReflect.Descriptor instead. +func (*SyncMessage) Descriptor() ([]byte, []int) { + return file_proto_k8scache_proto_rawDescGZIP(), []int{0} +} + +func (x *SyncMessage) GetMessage() isSyncMessage_Message { + if x != nil { + return x.Message + } + return nil +} + +func (x *SyncMessage) GetRequest() *SyncRequest { + if x != nil { + if x, ok := x.Message.(*SyncMessage_Request); ok { + return x.Request + } + } + return nil +} + +func (x *SyncMessage) GetAck() *SyncAck { + if x != nil { + if x, ok := x.Message.(*SyncMessage_Ack); ok { + return x.Ack + } + } + return nil +} + +type isSyncMessage_Message interface { + isSyncMessage_Message() +} + +type SyncMessage_Request struct { + Request *SyncRequest `protobuf:"bytes,1,opt,name=request,proto3,oneof"` +} + +type SyncMessage_Ack struct { + Ack *SyncAck `protobuf:"bytes,2,opt,name=ack,proto3,oneof"` +} + +func (*SyncMessage_Request) isSyncMessage_Message() {} + +func (*SyncMessage_Ack) isSyncMessage_Message() {} + +// SyncRequest is sent when an FLP processor connects or reconnects +type SyncRequest struct { + state protoimpl.MessageState `protogen:"open.v1"` + ProcessorId string `protobuf:"bytes,1,opt,name=processor_id,json=processorId,proto3" json:"processor_id,omitempty"` // Unique identifier for the FLP processor pod + LastVersion int64 `protobuf:"varint,2,opt,name=last_version,json=lastVersion,proto3" json:"last_version,omitempty"` // Last cache version seen (0 if requesting full snapshot) + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SyncRequest) Reset() { + *x = SyncRequest{} + mi := &file_proto_k8scache_proto_msgTypes[1] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SyncRequest) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SyncRequest) ProtoMessage() {} + +func (x *SyncRequest) ProtoReflect() protoreflect.Message { + mi := &file_proto_k8scache_proto_msgTypes[1] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SyncRequest.ProtoReflect.Descriptor instead. 
+func (*SyncRequest) Descriptor() ([]byte, []int) { + return file_proto_k8scache_proto_rawDescGZIP(), []int{1} +} + +func (x *SyncRequest) GetProcessorId() string { + if x != nil { + return x.ProcessorId + } + return "" +} + +func (x *SyncRequest) GetLastVersion() int64 { + if x != nil { + return x.LastVersion + } + return 0 +} + +// SyncAck acknowledges receipt of cache updates +type SyncAck struct { + state protoimpl.MessageState `protogen:"open.v1"` + ProcessorId string `protobuf:"bytes,1,opt,name=processor_id,json=processorId,proto3" json:"processor_id,omitempty"` + Version int64 `protobuf:"varint,2,opt,name=version,proto3" json:"version,omitempty"` // The version that was successfully applied + Success bool `protobuf:"varint,3,opt,name=success,proto3" json:"success,omitempty"` // Whether the update was applied successfully + Error string `protobuf:"bytes,4,opt,name=error,proto3" json:"error,omitempty"` // Error message if success = false + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *SyncAck) Reset() { + *x = SyncAck{} + mi := &file_proto_k8scache_proto_msgTypes[2] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *SyncAck) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*SyncAck) ProtoMessage() {} + +func (x *SyncAck) ProtoReflect() protoreflect.Message { + mi := &file_proto_k8scache_proto_msgTypes[2] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use SyncAck.ProtoReflect.Descriptor instead. +func (*SyncAck) Descriptor() ([]byte, []int) { + return file_proto_k8scache_proto_rawDescGZIP(), []int{2} +} + +func (x *SyncAck) GetProcessorId() string { + if x != nil { + return x.ProcessorId + } + return "" +} + +func (x *SyncAck) GetVersion() int64 { + if x != nil { + return x.Version + } + return 0 +} + +func (x *SyncAck) GetSuccess() bool { + if x != nil { + return x.Success + } + return false +} + +func (x *SyncAck) GetError() string { + if x != nil { + return x.Error + } + return "" +} + +// CacheUpdate represents an incremental change to the cache +type CacheUpdate struct { + state protoimpl.MessageState `protogen:"open.v1"` + Version int64 `protobuf:"varint,1,opt,name=version,proto3" json:"version,omitempty"` // Monotonically increasing version number + IsSnapshot bool `protobuf:"varint,2,opt,name=is_snapshot,json=isSnapshot,proto3" json:"is_snapshot,omitempty"` // True if this is a full snapshot, false for incremental + Entries []*ResourceEntry `protobuf:"bytes,3,rep,name=entries,proto3" json:"entries,omitempty"` + Operation OperationType `protobuf:"varint,4,opt,name=operation,proto3,enum=k8scache.OperationType" json:"operation,omitempty"` // Only relevant when is_snapshot = false + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *CacheUpdate) Reset() { + *x = CacheUpdate{} + mi := &file_proto_k8scache_proto_msgTypes[3] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *CacheUpdate) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*CacheUpdate) ProtoMessage() {} + +func (x *CacheUpdate) ProtoReflect() protoreflect.Message { + mi := &file_proto_k8scache_proto_msgTypes[3] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + 
return mi.MessageOf(x) +} + +// Deprecated: Use CacheUpdate.ProtoReflect.Descriptor instead. +func (*CacheUpdate) Descriptor() ([]byte, []int) { + return file_proto_k8scache_proto_rawDescGZIP(), []int{3} +} + +func (x *CacheUpdate) GetVersion() int64 { + if x != nil { + return x.Version + } + return 0 +} + +func (x *CacheUpdate) GetIsSnapshot() bool { + if x != nil { + return x.IsSnapshot + } + return false +} + +func (x *CacheUpdate) GetEntries() []*ResourceEntry { + if x != nil { + return x.Entries + } + return nil +} + +func (x *CacheUpdate) GetOperation() OperationType { + if x != nil { + return x.Operation + } + return OperationType_OPERATION_UNSPECIFIED +} + +// ResourceEntry represents a single Kubernetes resource in the cache +type ResourceEntry struct { + state protoimpl.MessageState `protogen:"open.v1"` + Kind string `protobuf:"bytes,1,opt,name=kind,proto3" json:"kind,omitempty"` // "Pod", "Node", or "Service" + Namespace string `protobuf:"bytes,2,opt,name=namespace,proto3" json:"namespace,omitempty"` + Name string `protobuf:"bytes,3,opt,name=name,proto3" json:"name,omitempty"` + Uid string `protobuf:"bytes,4,opt,name=uid,proto3" json:"uid,omitempty"` + // Resource-specific metadata + OwnerName string `protobuf:"bytes,5,opt,name=owner_name,json=ownerName,proto3" json:"owner_name,omitempty"` + OwnerKind string `protobuf:"bytes,6,opt,name=owner_kind,json=ownerKind,proto3" json:"owner_kind,omitempty"` + HostName string `protobuf:"bytes,7,opt,name=host_name,json=hostName,proto3" json:"host_name,omitempty"` // For Pods: the node name + HostIp string `protobuf:"bytes,8,opt,name=host_ip,json=hostIp,proto3" json:"host_ip,omitempty"` // For Pods: the node IP + NetworkName string `protobuf:"bytes,9,opt,name=network_name,json=networkName,proto3" json:"network_name,omitempty"` // For multi-network support + Ips []string `protobuf:"bytes,10,rep,name=ips,proto3" json:"ips,omitempty"` // IP addresses associated with this resource + SecondaryNetKeys []string `protobuf:"bytes,11,rep,name=secondary_net_keys,json=secondaryNetKeys,proto3" json:"secondary_net_keys,omitempty"` // Secondary network keys for CNI plugins + SecondaryNetNames map[string]string `protobuf:"bytes,16,rep,name=secondary_net_names,json=secondaryNetNames,proto3" json:"secondary_net_names,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` // Secondary network names mapping (key -> network name) + Labels map[string]string `protobuf:"bytes,12,rep,name=labels,proto3" json:"labels,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + Annotations map[string]string `protobuf:"bytes,13,rep,name=annotations,proto3" json:"annotations,omitempty" protobuf_key:"bytes,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + // Kubernetes metadata + CreationTimestamp int64 `protobuf:"varint,14,opt,name=creation_timestamp,json=creationTimestamp,proto3" json:"creation_timestamp,omitempty"` // Unix timestamp + ResourceVersion string `protobuf:"bytes,15,opt,name=resource_version,json=resourceVersion,proto3" json:"resource_version,omitempty"` // K8s resource version + unknownFields protoimpl.UnknownFields + sizeCache protoimpl.SizeCache +} + +func (x *ResourceEntry) Reset() { + *x = ResourceEntry{} + mi := &file_proto_k8scache_proto_msgTypes[4] + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + ms.StoreMessageInfo(mi) +} + +func (x *ResourceEntry) String() string { + return protoimpl.X.MessageStringOf(x) +} + +func (*ResourceEntry) ProtoMessage() {} + +func (x 
*ResourceEntry) ProtoReflect() protoreflect.Message { + mi := &file_proto_k8scache_proto_msgTypes[4] + if x != nil { + ms := protoimpl.X.MessageStateOf(protoimpl.Pointer(x)) + if ms.LoadMessageInfo() == nil { + ms.StoreMessageInfo(mi) + } + return ms + } + return mi.MessageOf(x) +} + +// Deprecated: Use ResourceEntry.ProtoReflect.Descriptor instead. +func (*ResourceEntry) Descriptor() ([]byte, []int) { + return file_proto_k8scache_proto_rawDescGZIP(), []int{4} +} + +func (x *ResourceEntry) GetKind() string { + if x != nil { + return x.Kind + } + return "" +} + +func (x *ResourceEntry) GetNamespace() string { + if x != nil { + return x.Namespace + } + return "" +} + +func (x *ResourceEntry) GetName() string { + if x != nil { + return x.Name + } + return "" +} + +func (x *ResourceEntry) GetUid() string { + if x != nil { + return x.Uid + } + return "" +} + +func (x *ResourceEntry) GetOwnerName() string { + if x != nil { + return x.OwnerName + } + return "" +} + +func (x *ResourceEntry) GetOwnerKind() string { + if x != nil { + return x.OwnerKind + } + return "" +} + +func (x *ResourceEntry) GetHostName() string { + if x != nil { + return x.HostName + } + return "" +} + +func (x *ResourceEntry) GetHostIp() string { + if x != nil { + return x.HostIp + } + return "" +} + +func (x *ResourceEntry) GetNetworkName() string { + if x != nil { + return x.NetworkName + } + return "" +} + +func (x *ResourceEntry) GetIps() []string { + if x != nil { + return x.Ips + } + return nil +} + +func (x *ResourceEntry) GetSecondaryNetKeys() []string { + if x != nil { + return x.SecondaryNetKeys + } + return nil +} + +func (x *ResourceEntry) GetSecondaryNetNames() map[string]string { + if x != nil { + return x.SecondaryNetNames + } + return nil +} + +func (x *ResourceEntry) GetLabels() map[string]string { + if x != nil { + return x.Labels + } + return nil +} + +func (x *ResourceEntry) GetAnnotations() map[string]string { + if x != nil { + return x.Annotations + } + return nil +} + +func (x *ResourceEntry) GetCreationTimestamp() int64 { + if x != nil { + return x.CreationTimestamp + } + return 0 +} + +func (x *ResourceEntry) GetResourceVersion() string { + if x != nil { + return x.ResourceVersion + } + return "" +} + +var File_proto_k8scache_proto protoreflect.FileDescriptor + +const file_proto_k8scache_proto_rawDesc = "" + + "\n" + + "\x14proto/k8scache.proto\x12\bk8scache\"r\n" + + "\vSyncMessage\x121\n" + + "\arequest\x18\x01 \x01(\v2\x15.k8scache.SyncRequestH\x00R\arequest\x12%\n" + + "\x03ack\x18\x02 \x01(\v2\x11.k8scache.SyncAckH\x00R\x03ackB\t\n" + + "\amessage\"S\n" + + "\vSyncRequest\x12!\n" + + "\fprocessor_id\x18\x01 \x01(\tR\vprocessorId\x12!\n" + + "\flast_version\x18\x02 \x01(\x03R\vlastVersion\"v\n" + + "\aSyncAck\x12!\n" + + "\fprocessor_id\x18\x01 \x01(\tR\vprocessorId\x12\x18\n" + + "\aversion\x18\x02 \x01(\x03R\aversion\x12\x18\n" + + "\asuccess\x18\x03 \x01(\bR\asuccess\x12\x14\n" + + "\x05error\x18\x04 \x01(\tR\x05error\"\xb2\x01\n" + + "\vCacheUpdate\x12\x18\n" + + "\aversion\x18\x01 \x01(\x03R\aversion\x12\x1f\n" + + "\vis_snapshot\x18\x02 \x01(\bR\n" + + "isSnapshot\x121\n" + + "\aentries\x18\x03 \x03(\v2\x17.k8scache.ResourceEntryR\aentries\x125\n" + + "\toperation\x18\x04 \x01(\x0e2\x17.k8scache.OperationTypeR\toperation\"\xc2\x06\n" + + "\rResourceEntry\x12\x12\n" + + "\x04kind\x18\x01 \x01(\tR\x04kind\x12\x1c\n" + + "\tnamespace\x18\x02 \x01(\tR\tnamespace\x12\x12\n" + + "\x04name\x18\x03 \x01(\tR\x04name\x12\x10\n" + + "\x03uid\x18\x04 \x01(\tR\x03uid\x12\x1d\n" + + "\n" + + 
"owner_name\x18\x05 \x01(\tR\townerName\x12\x1d\n" + + "\n" + + "owner_kind\x18\x06 \x01(\tR\townerKind\x12\x1b\n" + + "\thost_name\x18\a \x01(\tR\bhostName\x12\x17\n" + + "\ahost_ip\x18\b \x01(\tR\x06hostIp\x12!\n" + + "\fnetwork_name\x18\t \x01(\tR\vnetworkName\x12\x10\n" + + "\x03ips\x18\n" + + " \x03(\tR\x03ips\x12,\n" + + "\x12secondary_net_keys\x18\v \x03(\tR\x10secondaryNetKeys\x12^\n" + + "\x13secondary_net_names\x18\x10 \x03(\v2..k8scache.ResourceEntry.SecondaryNetNamesEntryR\x11secondaryNetNames\x12;\n" + + "\x06labels\x18\f \x03(\v2#.k8scache.ResourceEntry.LabelsEntryR\x06labels\x12J\n" + + "\vannotations\x18\r \x03(\v2(.k8scache.ResourceEntry.AnnotationsEntryR\vannotations\x12-\n" + + "\x12creation_timestamp\x18\x0e \x01(\x03R\x11creationTimestamp\x12)\n" + + "\x10resource_version\x18\x0f \x01(\tR\x0fresourceVersion\x1aD\n" + + "\x16SecondaryNetNamesEntry\x12\x10\n" + + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\x1a9\n" + + "\vLabelsEntry\x12\x10\n" + + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01\x1a>\n" + + "\x10AnnotationsEntry\x12\x10\n" + + "\x03key\x18\x01 \x01(\tR\x03key\x12\x14\n" + + "\x05value\x18\x02 \x01(\tR\x05value:\x028\x01*i\n" + + "\rOperationType\x12\x19\n" + + "\x15OPERATION_UNSPECIFIED\x10\x00\x12\x11\n" + + "\rOPERATION_ADD\x10\x01\x12\x14\n" + + "\x10OPERATION_UPDATE\x10\x02\x12\x14\n" + + "\x10OPERATION_DELETE\x10\x032]\n" + + "\x16KubernetesCacheService\x12C\n" + + "\rStreamUpdates\x12\x15.k8scache.CacheUpdate\x1a\x15.k8scache.SyncMessage\"\x00(\x010\x01B\fZ\n" + + "./k8scacheb\x06proto3" + +var ( + file_proto_k8scache_proto_rawDescOnce sync.Once + file_proto_k8scache_proto_rawDescData []byte +) + +func file_proto_k8scache_proto_rawDescGZIP() []byte { + file_proto_k8scache_proto_rawDescOnce.Do(func() { + file_proto_k8scache_proto_rawDescData = protoimpl.X.CompressGZIP(unsafe.Slice(unsafe.StringData(file_proto_k8scache_proto_rawDesc), len(file_proto_k8scache_proto_rawDesc))) + }) + return file_proto_k8scache_proto_rawDescData +} + +var file_proto_k8scache_proto_enumTypes = make([]protoimpl.EnumInfo, 1) +var file_proto_k8scache_proto_msgTypes = make([]protoimpl.MessageInfo, 8) +var file_proto_k8scache_proto_goTypes = []any{ + (OperationType)(0), // 0: k8scache.OperationType + (*SyncMessage)(nil), // 1: k8scache.SyncMessage + (*SyncRequest)(nil), // 2: k8scache.SyncRequest + (*SyncAck)(nil), // 3: k8scache.SyncAck + (*CacheUpdate)(nil), // 4: k8scache.CacheUpdate + (*ResourceEntry)(nil), // 5: k8scache.ResourceEntry + nil, // 6: k8scache.ResourceEntry.SecondaryNetNamesEntry + nil, // 7: k8scache.ResourceEntry.LabelsEntry + nil, // 8: k8scache.ResourceEntry.AnnotationsEntry +} +var file_proto_k8scache_proto_depIdxs = []int32{ + 2, // 0: k8scache.SyncMessage.request:type_name -> k8scache.SyncRequest + 3, // 1: k8scache.SyncMessage.ack:type_name -> k8scache.SyncAck + 5, // 2: k8scache.CacheUpdate.entries:type_name -> k8scache.ResourceEntry + 0, // 3: k8scache.CacheUpdate.operation:type_name -> k8scache.OperationType + 6, // 4: k8scache.ResourceEntry.secondary_net_names:type_name -> k8scache.ResourceEntry.SecondaryNetNamesEntry + 7, // 5: k8scache.ResourceEntry.labels:type_name -> k8scache.ResourceEntry.LabelsEntry + 8, // 6: k8scache.ResourceEntry.annotations:type_name -> k8scache.ResourceEntry.AnnotationsEntry + 4, // 7: k8scache.KubernetesCacheService.StreamUpdates:input_type -> k8scache.CacheUpdate + 1, // 8: 
k8scache.KubernetesCacheService.StreamUpdates:output_type -> k8scache.SyncMessage + 8, // [8:9] is the sub-list for method output_type + 7, // [7:8] is the sub-list for method input_type + 7, // [7:7] is the sub-list for extension type_name + 7, // [7:7] is the sub-list for extension extendee + 0, // [0:7] is the sub-list for field type_name +} + +func init() { file_proto_k8scache_proto_init() } +func file_proto_k8scache_proto_init() { + if File_proto_k8scache_proto != nil { + return + } + file_proto_k8scache_proto_msgTypes[0].OneofWrappers = []any{ + (*SyncMessage_Request)(nil), + (*SyncMessage_Ack)(nil), + } + type x struct{} + out := protoimpl.TypeBuilder{ + File: protoimpl.DescBuilder{ + GoPackagePath: reflect.TypeOf(x{}).PkgPath(), + RawDescriptor: unsafe.Slice(unsafe.StringData(file_proto_k8scache_proto_rawDesc), len(file_proto_k8scache_proto_rawDesc)), + NumEnums: 1, + NumMessages: 8, + NumExtensions: 0, + NumServices: 1, + }, + GoTypes: file_proto_k8scache_proto_goTypes, + DependencyIndexes: file_proto_k8scache_proto_depIdxs, + EnumInfos: file_proto_k8scache_proto_enumTypes, + MessageInfos: file_proto_k8scache_proto_msgTypes, + }.Build() + File_proto_k8scache_proto = out.File + file_proto_k8scache_proto_goTypes = nil + file_proto_k8scache_proto_depIdxs = nil +} diff --git a/pkg/pipeline/transform/kubernetes/k8scache/k8scache_grpc.pb.go b/pkg/pipeline/transform/kubernetes/k8scache/k8scache_grpc.pb.go new file mode 100644 index 000000000..7fb08f6d1 --- /dev/null +++ b/pkg/pipeline/transform/kubernetes/k8scache/k8scache_grpc.pb.go @@ -0,0 +1,162 @@ +// Code generated by protoc-gen-go-grpc. DO NOT EDIT. +// versions: +// - protoc-gen-go-grpc v1.6.1 +// - protoc v3.19.6 +// source: proto/k8scache.proto + +package k8scache + +import ( + context "context" + grpc "google.golang.org/grpc" + codes "google.golang.org/grpc/codes" + status "google.golang.org/grpc/status" +) + +// This is a compile-time assertion to ensure that this generated file +// is compatible with the grpc package it is being compiled against. +// Requires gRPC-Go v1.64.0 or later. +const _ = grpc.SupportPackageIsVersion9 + +const ( + KubernetesCacheService_StreamUpdates_FullMethodName = "/k8scache.KubernetesCacheService/StreamUpdates" +) + +// KubernetesCacheServiceClient is the client API for KubernetesCacheService service. +// +// For semantics around ctx use and closing/ending streaming RPCs, please refer to https://pkg.go.dev/google.golang.org/grpc/?tab=doc#ClientConn.NewStream. +// +// KubernetesCacheService defines the bidirectional streaming service for cache synchronization +// between the centralized informers pod and the distributed FLP processor pods. +// +// Flow: +// 1. FLP processor (server) accepts connection from informers (client) +// 2. FLP sends SyncRequest indicating what version it has +// 3. Informers sends CacheUpdate (snapshot or incremental) +// 4. FLP sends SyncAck to confirm receipt +// 5. 
Repeat steps 3-4 for ongoing updates +type KubernetesCacheServiceClient interface { + // StreamUpdates establishes a bidirectional stream: + // - Informers (client) send CacheUpdate messages (snapshots and incremental updates) + // - FLP processors (server) send SyncRequest and SyncAck messages + // + // NOTE: In protobuf syntax, "rpc Method(stream A) returns (stream B)": + // - A = what server RECEIVES (what client SENDS) + // - B = what server SENDS (what client RECEIVES) + // + // We want: + // - Server receives: CacheUpdate (from informers client) + // - Server sends: SyncMessage (requests/acks) + // + // Therefore we need: (stream CacheUpdate) returns (stream SyncMessage) + StreamUpdates(ctx context.Context, opts ...grpc.CallOption) (grpc.BidiStreamingClient[CacheUpdate, SyncMessage], error) +} + +type kubernetesCacheServiceClient struct { + cc grpc.ClientConnInterface +} + +func NewKubernetesCacheServiceClient(cc grpc.ClientConnInterface) KubernetesCacheServiceClient { + return &kubernetesCacheServiceClient{cc} +} + +func (c *kubernetesCacheServiceClient) StreamUpdates(ctx context.Context, opts ...grpc.CallOption) (grpc.BidiStreamingClient[CacheUpdate, SyncMessage], error) { + cOpts := append([]grpc.CallOption{grpc.StaticMethod()}, opts...) + stream, err := c.cc.NewStream(ctx, &KubernetesCacheService_ServiceDesc.Streams[0], KubernetesCacheService_StreamUpdates_FullMethodName, cOpts...) + if err != nil { + return nil, err + } + x := &grpc.GenericClientStream[CacheUpdate, SyncMessage]{ClientStream: stream} + return x, nil +} + +// This type alias is provided for backwards compatibility with existing code that references the prior non-generic stream type by name. +type KubernetesCacheService_StreamUpdatesClient = grpc.BidiStreamingClient[CacheUpdate, SyncMessage] + +// KubernetesCacheServiceServer is the server API for KubernetesCacheService service. +// All implementations must embed UnimplementedKubernetesCacheServiceServer +// for forward compatibility. +// +// KubernetesCacheService defines the bidirectional streaming service for cache synchronization +// between the centralized informers pod and the distributed FLP processor pods. +// +// Flow: +// 1. FLP processor (server) accepts connection from informers (client) +// 2. FLP sends SyncRequest indicating what version it has +// 3. Informers sends CacheUpdate (snapshot or incremental) +// 4. FLP sends SyncAck to confirm receipt +// 5. Repeat steps 3-4 for ongoing updates +type KubernetesCacheServiceServer interface { + // StreamUpdates establishes a bidirectional stream: + // - Informers (client) send CacheUpdate messages (snapshots and incremental updates) + // - FLP processors (server) send SyncRequest and SyncAck messages + // + // NOTE: In protobuf syntax, "rpc Method(stream A) returns (stream B)": + // - A = what server RECEIVES (what client SENDS) + // - B = what server SENDS (what client RECEIVES) + // + // We want: + // - Server receives: CacheUpdate (from informers client) + // - Server sends: SyncMessage (requests/acks) + // + // Therefore we need: (stream CacheUpdate) returns (stream SyncMessage) + StreamUpdates(grpc.BidiStreamingServer[CacheUpdate, SyncMessage]) error + mustEmbedUnimplementedKubernetesCacheServiceServer() +} + +// UnimplementedKubernetesCacheServiceServer must be embedded to have +// forward compatible implementations. +// +// NOTE: this should be embedded by value instead of pointer to avoid a nil +// pointer dereference when methods are called. 
+type UnimplementedKubernetesCacheServiceServer struct{} + +func (UnimplementedKubernetesCacheServiceServer) StreamUpdates(grpc.BidiStreamingServer[CacheUpdate, SyncMessage]) error { + return status.Error(codes.Unimplemented, "method StreamUpdates not implemented") +} +func (UnimplementedKubernetesCacheServiceServer) mustEmbedUnimplementedKubernetesCacheServiceServer() { +} +func (UnimplementedKubernetesCacheServiceServer) testEmbeddedByValue() {} + +// UnsafeKubernetesCacheServiceServer may be embedded to opt out of forward compatibility for this service. +// Use of this interface is not recommended, as added methods to KubernetesCacheServiceServer will +// result in compilation errors. +type UnsafeKubernetesCacheServiceServer interface { + mustEmbedUnimplementedKubernetesCacheServiceServer() +} + +func RegisterKubernetesCacheServiceServer(s grpc.ServiceRegistrar, srv KubernetesCacheServiceServer) { + // If the following call panics, it indicates UnimplementedKubernetesCacheServiceServer was + // embedded by pointer and is nil. This will cause panics if an + // unimplemented method is ever invoked, so we test this at initialization + // time to prevent it from happening at runtime later due to I/O. + if t, ok := srv.(interface{ testEmbeddedByValue() }); ok { + t.testEmbeddedByValue() + } + s.RegisterService(&KubernetesCacheService_ServiceDesc, srv) +} + +func _KubernetesCacheService_StreamUpdates_Handler(srv interface{}, stream grpc.ServerStream) error { + return srv.(KubernetesCacheServiceServer).StreamUpdates(&grpc.GenericServerStream[CacheUpdate, SyncMessage]{ServerStream: stream}) +} + +// This type alias is provided for backwards compatibility with existing code that references the prior non-generic stream type by name. +type KubernetesCacheService_StreamUpdatesServer = grpc.BidiStreamingServer[CacheUpdate, SyncMessage] + +// KubernetesCacheService_ServiceDesc is the grpc.ServiceDesc for KubernetesCacheService service. +// It's only intended for direct use with grpc.RegisterService, +// and not to be introspected or modified (even as a copy) +var KubernetesCacheService_ServiceDesc = grpc.ServiceDesc{ + ServiceName: "k8scache.KubernetesCacheService", + HandlerType: (*KubernetesCacheServiceServer)(nil), + Methods: []grpc.MethodDesc{}, + Streams: []grpc.StreamDesc{ + { + StreamName: "StreamUpdates", + Handler: _KubernetesCacheService_StreamUpdates_Handler, + ServerStreams: true, + ClientStreams: true, + }, + }, + Metadata: "proto/k8scache.proto", +} diff --git a/pkg/pipeline/transform/kubernetes/k8scache/server.go b/pkg/pipeline/transform/kubernetes/k8scache/server.go new file mode 100644 index 000000000..62e840a68 --- /dev/null +++ b/pkg/pipeline/transform/kubernetes/k8scache/server.go @@ -0,0 +1,158 @@ +package k8scache + +import ( + "errors" + "fmt" + "io" + "sync/atomic" + "time" + + "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/datasource" + log "github.com/sirupsen/logrus" +) + +var slog = log.WithField("component", "k8scache.Server") + +// KubernetesCacheServer implements the gRPC KubernetesCacheService server. +// It receives cache updates from the centralized informers client. +type KubernetesCacheServer struct { + UnimplementedKubernetesCacheServiceServer + datasource *datasource.Datasource + version atomic.Int64 // Last version received +} + +// NewKubernetesCacheServer creates a new cache synchronization server. 
+func NewKubernetesCacheServer(ds *datasource.Datasource) *KubernetesCacheServer { + server := &KubernetesCacheServer{ + datasource: ds, + } + server.version.Store(0) + return server +} + +// StreamUpdates implements the bidirectional streaming RPC +// The server: +// 1. Sends SyncRequest to ask for data +// 2. Receives CacheUpdate from client +// 3. Sends SyncAck to confirm receipt +// 4. Repeat steps 2-3 +func (s *KubernetesCacheServer) StreamUpdates(stream KubernetesCacheService_StreamUpdatesServer) error { + ctx := stream.Context() + + // Generate a unique ID for this connection (for logging) + connectionID := fmt.Sprintf("flp-%d", time.Now().UnixNano()) + + // Send SyncRequest with our current version so the client knows what to send + // (full snapshot if 0, or incrementals from that version) + lastVersion := s.version.Load() + err := stream.Send(&SyncMessage{ + Message: &SyncMessage_Request{ + Request: &SyncRequest{ + ProcessorId: connectionID, + LastVersion: lastVersion, + }, + }, + }) + if err != nil { + slog.WithError(err).Error("Failed to send initial SyncRequest") + return err + } + + slog.WithFields(log.Fields{ + "connection_id": connectionID, + "last_version": lastVersion, + }).Info("Sent SyncRequest to client") + + // Receive updates from client + for { + select { + case <-ctx.Done(): + slog.WithField("connection_id", connectionID).Info("Connection context cancelled") + return ctx.Err() + default: + update, err := stream.Recv() + if errors.Is(err, io.EOF) { + slog.WithField("connection_id", connectionID).Info("Client disconnected gracefully") + return nil + } + if err != nil { + slog.WithError(err).WithField("connection_id", connectionID).Warn("Error receiving from client") + return err + } + + // Process the update + if err := s.processUpdate(update); err != nil { + slog.WithError(err).WithField("version", update.Version).Error("Failed to process update") + // Send NACK + _ = stream.Send(&SyncMessage{ + Message: &SyncMessage_Ack{ + Ack: &SyncAck{ + ProcessorId: connectionID, + Version: update.Version, + Success: false, + Error: err.Error(), + }, + }, + }) + continue + } + + // Update was processed successfully + s.version.Store(update.Version) + + // Send ACK + err = stream.Send(&SyncMessage{ + Message: &SyncMessage_Ack{ + Ack: &SyncAck{ + ProcessorId: connectionID, + Version: update.Version, + Success: true, + }, + }, + }) + if err != nil { + slog.WithError(err).Error("Failed to send ACK") + return err + } + + slog.WithFields(log.Fields{ + "connection_id": connectionID, + "version": update.Version, + "is_snapshot": update.IsSnapshot, + "num_entries": len(update.Entries), + }).Debug("Processed and acknowledged update") + } + } +} + +// processUpdate applies a cache update to the local datasource (when KubernetesStore is set). 
+func (s *KubernetesCacheServer) processUpdate(update *CacheUpdate) error { + if s.datasource == nil { + return fmt.Errorf("datasource not initialized") + } + + entries := resourceEntriesToMeta(update.Entries) + + // Note: is_snapshot field is ignored - we only support incremental updates + // If a processor restarts, it starts with empty cache and builds up from incoming ADD/UPDATE events + + switch update.Operation { + case OperationType_OPERATION_ADD, OperationType_OPERATION_UPDATE: + slog.WithField("num_entries", len(entries)).Debug("Received ADD/UPDATE") + s.datasource.ApplyCacheAddOrUpdate(entries) + case OperationType_OPERATION_DELETE: + slog.WithField("num_entries", len(entries)).Debug("Received DELETE") + s.datasource.ApplyCacheDelete(entries) + case OperationType_OPERATION_UNSPECIFIED: + return fmt.Errorf("received update with unspecified operation") + default: + return fmt.Errorf("unknown operation type: %v", update.Operation) + } + + return nil +} + +// GetCurrentVersion returns the last version received +func (s *KubernetesCacheServer) GetCurrentVersion() int64 { + return s.version.Load() +} diff --git a/pkg/pipeline/transform/kubernetes/k8scache/server_test.go b/pkg/pipeline/transform/kubernetes/k8scache/server_test.go new file mode 100644 index 000000000..ccc5000ee --- /dev/null +++ b/pkg/pipeline/transform/kubernetes/k8scache/server_test.go @@ -0,0 +1,486 @@ +package k8scache + +import ( + "context" + "io" + "testing" + + "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/datasource" + inf "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/informers" + "github.com/netobserv/flowlogs-pipeline/pkg/pipeline/transform/kubernetes/model" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + "google.golang.org/grpc" + v1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Test data - same pattern as enrich_test.go +var testIPInfo = map[string]*model.ResourceMetaData{ + "10.0.0.1": { + ObjectMeta: v1.ObjectMeta{ + Name: "test-pod-1", + Namespace: "test-ns-1", + UID: "pod-uid-1", + Labels: map[string]string{ + "app": "web", + "version": "v1", + }, + Annotations: map[string]string{ + "description": "test pod", + }, + }, + Kind: "Pod", + OwnerName: "test-deployment", + OwnerKind: "Deployment", + HostName: "node-1", + HostIP: "192.168.1.1", + NetworkName: "primary", + IPs: []string{"10.0.0.1"}, + }, + "10.0.0.2": { + ObjectMeta: v1.ObjectMeta{ + Name: "test-pod-2", + Namespace: "test-ns-2", + UID: "pod-uid-2", + }, + Kind: "Pod", + OwnerName: "test-statefulset", + OwnerKind: "StatefulSet", + HostName: "node-2", + HostIP: "192.168.1.2", + NetworkName: "primary", + IPs: []string{"10.0.0.2"}, + }, + "192.168.1.1": { + ObjectMeta: v1.ObjectMeta{ + Name: "node-1", + UID: "node-uid-1", + }, + Kind: "Node", + OwnerName: "node-1", + OwnerKind: "Node", + IPs: []string{"192.168.1.1"}, + }, +} + +var testNodes = map[string]*model.ResourceMetaData{ + "node-1": { + ObjectMeta: v1.ObjectMeta{ + Name: "node-1", + }, + Kind: "Node", + }, +} + +func setupTestDatasource() *datasource.Datasource { + _, informers := inf.SetupStubs(testIPInfo, nil, testNodes) + return &datasource.Datasource{Informers: informers} +} + +func setupTestDatasourceWithStore() *datasource.Datasource { + _, informers := inf.SetupStubs(testIPInfo, nil, testNodes) + ds := &datasource.Datasource{Informers: informers} + ds.SetKubernetesStore(datasource.NewKubernetesStore()) + return ds +} + +// TestBackwardCompatibility ensures existing datasource functionality is 
not broken +func TestBackwardCompatibility_DatasourceLookup(t *testing.T) { + ds := setupTestDatasource() + + // Test IP lookup - existing functionality + result := ds.IndexLookup(nil, "10.0.0.1") + require.NotNil(t, result) + assert.Equal(t, "test-pod-1", result.Name) + assert.Equal(t, "test-ns-1", result.Namespace) + assert.Equal(t, "Pod", result.Kind) + + // Test node lookup - existing functionality + node, err := ds.GetNodeByName("node-1") + require.NoError(t, err) + require.NotNil(t, node) + assert.Equal(t, "node-1", node.Name) +} + +// TestKubernetesCacheServer_Creation tests server instantiation +func TestKubernetesCacheServer_Creation(t *testing.T) { + ds := setupTestDatasource() + + server := NewKubernetesCacheServer(ds) + + require.NotNil(t, server) + assert.NotNil(t, server.datasource) +} + +// TestKubernetesCacheServer_ReceivesAdd tests that when a client sends an ADD, +// the server processes it correctly +func TestKubernetesCacheServer_ReceivesAdd(t *testing.T) { + ds := setupTestDatasource() + // Attach KubernetesStore so updates are actually stored + ds.SetKubernetesStore(datasource.NewKubernetesStore()) + server := NewKubernetesCacheServer(ds) + + // Create mock stream + mockStream := &mockStreamServer{ + ctx: context.Background(), + sendChan: make(chan *CacheUpdate, 10), + recvMsgs: make([]*SyncMessage, 0), + firstSend: true, + } + + // Client (informers) sends ADD update + addUpdate := &CacheUpdate{ + Version: 1, + IsSnapshot: false, + Operation: OperationType_OPERATION_ADD, + Entries: []*ResourceEntry{ + { + Kind: "Pod", + Namespace: "test-ns-1", + Name: "test-pod-1", + Uid: "pod-uid-1", + Ips: []string{"10.0.0.1"}, + }, + }, + } + mockStream.sendChan <- addUpdate + close(mockStream.sendChan) + + // Run server + err := server.StreamUpdates(mockStream) + require.NoError(t, err) + + // Verify server sent SyncRequest first + require.Greater(t, len(mockStream.recvMsgs), 0, "Should have sent at least SyncRequest") + firstMsg := mockStream.recvMsgs[0] + req, ok := firstMsg.Message.(*SyncMessage_Request) + require.True(t, ok, "First message should be SyncRequest") + assert.NotEmpty(t, req.Request.ProcessorId) + + // Verify server sent ACK + require.Greater(t, len(mockStream.recvMsgs), 1, "Should have sent ACK") + ackMsg := mockStream.recvMsgs[1] + ack, ok := ackMsg.Message.(*SyncMessage_Ack) + require.True(t, ok, "Second message should be ACK") + assert.True(t, ack.Ack.Success) + assert.Equal(t, int64(1), ack.Ack.Version) + + // Verify resource was added to store + meta := ds.IndexLookup(nil, "10.0.0.1") + require.NotNil(t, meta, "Resource should be in store") + assert.Equal(t, "test-pod-1", meta.Name) +} + +// TestKubernetesCacheServer_ReceivesIncrementalUpdate tests incremental updates +func TestKubernetesCacheServer_ReceivesIncrementalUpdate(t *testing.T) { + ds := setupTestDatasource() + server := NewKubernetesCacheServer(ds) + + mockStream := &mockStreamServer{ + ctx: context.Background(), + sendChan: make(chan *CacheUpdate, 10), + recvMsgs: make([]*SyncMessage, 0), + firstSend: true, + } + + // Send incremental update + update := &CacheUpdate{ + Version: 2, + IsSnapshot: false, + Operation: OperationType_OPERATION_ADD, + Entries: []*ResourceEntry{ + { + Kind: "Pod", + Namespace: "default", + Name: "new-pod", + }, + }, + } + mockStream.sendChan <- update + close(mockStream.sendChan) + + err := server.StreamUpdates(mockStream) + require.NoError(t, err) + + // Verify ACK was sent + require.Greater(t, len(mockStream.recvMsgs), 1) + ackMsg := mockStream.recvMsgs[1] + ack, ok 
:= ackMsg.Message.(*SyncMessage_Ack) + require.True(t, ok) + assert.True(t, ack.Ack.Success) + assert.Equal(t, int64(2), ack.Ack.Version) +} + +// TestKubernetesCacheServer_MultipleUpdates tests receiving multiple updates in sequence +func TestKubernetesCacheServer_MultipleUpdates(t *testing.T) { + ds := setupTestDatasource() + server := NewKubernetesCacheServer(ds) + + mockStream := &mockStreamServer{ + ctx: context.Background(), + sendChan: make(chan *CacheUpdate, 10), + recvMsgs: make([]*SyncMessage, 0), + firstSend: true, + } + + // Send ADD + mockStream.sendChan <- &CacheUpdate{ + Version: 1, + IsSnapshot: false, + Operation: OperationType_OPERATION_ADD, + Entries: []*ResourceEntry{{Kind: "Pod", Name: "pod1", Namespace: "default"}}, + } + + // Send another ADD + mockStream.sendChan <- &CacheUpdate{ + Version: 2, + IsSnapshot: false, + Operation: OperationType_OPERATION_ADD, + Entries: []*ResourceEntry{{Kind: "Pod", Name: "pod2", Namespace: "default"}}, + } + + // Send DELETE + mockStream.sendChan <- &CacheUpdate{ + Version: 3, + IsSnapshot: false, + Operation: OperationType_OPERATION_DELETE, + Entries: []*ResourceEntry{{Kind: "Pod", Name: "pod1", Namespace: "default"}}, + } + + close(mockStream.sendChan) + + err := server.StreamUpdates(mockStream) + require.NoError(t, err) + + // Should have: 1 SyncRequest + 3 ACKs + assert.Equal(t, 4, len(mockStream.recvMsgs)) + + // Verify version tracking + assert.Equal(t, int64(3), server.GetCurrentVersion()) +} + +// TestKubernetesCacheServer_ErrorHandling tests error scenarios +func TestKubernetesCacheServer_ErrorHandling(t *testing.T) { + ds := setupTestDatasource() + server := NewKubernetesCacheServer(ds) + + t.Run("client disconnects abruptly", func(t *testing.T) { + mockStream := &mockStreamServer{ + ctx: context.Background(), + sendChan: make(chan *CacheUpdate, 10), + recvMsgs: make([]*SyncMessage, 0), + firstSend: true, + } + + // Close immediately + close(mockStream.sendChan) + + // Should handle gracefully + err := server.StreamUpdates(mockStream) + assert.NoError(t, err) // EOF is normal when client disconnects + }) +} + +// TestKubernetesCacheServer_WithKubernetesStore tests operations using KubernetesStore +func TestKubernetesCacheServer_WithKubernetesStore(t *testing.T) { + ds := setupTestDatasourceWithStore() + server := NewKubernetesCacheServer(ds) + + mockStream := &mockStreamServer{ + ctx: context.Background(), + sendChan: make(chan *CacheUpdate, 10), + recvMsgs: make([]*SyncMessage, 0), + firstSend: true, + } + + // Send ADD + mockStream.sendChan <- &CacheUpdate{ + Version: 1, + IsSnapshot: false, + Operation: OperationType_OPERATION_ADD, + Entries: []*ResourceEntry{ + { + Kind: "Pod", + Namespace: "test-ns", + Name: "store-pod-1", + Uid: "store-pod-uid-1", + Ips: []string{"10.0.1.1"}, + Labels: map[string]string{ + "app": "test", + }, + }, + }, + } + + // Send UPDATE + mockStream.sendChan <- &CacheUpdate{ + Version: 2, + IsSnapshot: false, + Operation: OperationType_OPERATION_UPDATE, + Entries: []*ResourceEntry{ + { + Kind: "Pod", + Namespace: "test-ns", + Name: "store-pod-1", + Uid: "store-pod-uid-1", + Ips: []string{"10.0.1.1"}, + Labels: map[string]string{ + "app": "test", + "version": "v2", // Updated label + }, + }, + }, + } + + close(mockStream.sendChan) + + err := server.StreamUpdates(mockStream) + require.NoError(t, err) + + // Verify resource was added and updated in store + meta := ds.IndexLookup(nil, "10.0.1.1") + require.NotNil(t, meta, "Resource should be in KubernetesStore") + assert.Equal(t, "store-pod-1", 
meta.Name) + assert.Equal(t, "test-ns", meta.Namespace) + assert.Equal(t, "v2", meta.Labels["version"], "Label should be updated") + + // Verify ACKs were sent + require.Equal(t, 3, len(mockStream.recvMsgs)) // SyncRequest + 2 ACKs +} + +// TestKubernetesCacheServer_DeleteFromStore tests DELETE operation on KubernetesStore +func TestKubernetesCacheServer_DeleteFromStore(t *testing.T) { + ds := setupTestDatasourceWithStore() + server := NewKubernetesCacheServer(ds) + + mockStream := &mockStreamServer{ + ctx: context.Background(), + sendChan: make(chan *CacheUpdate, 10), + recvMsgs: make([]*SyncMessage, 0), + firstSend: true, + } + + // First ADD a resource + mockStream.sendChan <- &CacheUpdate{ + Version: 1, + IsSnapshot: false, + Operation: OperationType_OPERATION_ADD, + Entries: []*ResourceEntry{ + { + Kind: "Pod", + Namespace: "test-ns", + Name: "delete-me", + Uid: "delete-pod-uid", + Ips: []string{"10.0.2.1"}, + }, + }, + } + + // Then DELETE it + mockStream.sendChan <- &CacheUpdate{ + Version: 2, + IsSnapshot: false, + Operation: OperationType_OPERATION_DELETE, + Entries: []*ResourceEntry{ + { + Kind: "Pod", + Namespace: "test-ns", + Name: "delete-me", + }, + }, + } + + close(mockStream.sendChan) + + err := server.StreamUpdates(mockStream) + require.NoError(t, err) + + // Verify resource was deleted from store + meta := ds.IndexLookup(nil, "10.0.2.1") + assert.Nil(t, meta, "Resource should be deleted from KubernetesStore") +} + +// TestKubernetesCacheServer_StoreReplacesInformers tests that KubernetesStore replaces Informers +func TestKubernetesCacheServer_StoreReplacesInformers(t *testing.T) { + ds := setupTestDatasourceWithStore() + server := NewKubernetesCacheServer(ds) + + mockStream := &mockStreamServer{ + ctx: context.Background(), + sendChan: make(chan *CacheUpdate, 10), + recvMsgs: make([]*SyncMessage, 0), + firstSend: true, + } + + // Add resource via gRPC (goes to KubernetesStore) + mockStream.sendChan <- &CacheUpdate{ + Version: 1, + IsSnapshot: false, + Operation: OperationType_OPERATION_ADD, + Entries: []*ResourceEntry{ + { + Kind: "Pod", + Namespace: "grpc-ns", + Name: "grpc-pod", + Ips: []string{"10.0.3.1"}, + }, + }, + } + + close(mockStream.sendChan) + + err := server.StreamUpdates(mockStream) + require.NoError(t, err) + + // Should find resource from KubernetesStore + grpcMeta := ds.IndexLookup(nil, "10.0.3.1") + require.NotNil(t, grpcMeta) + assert.Equal(t, "grpc-pod", grpcMeta.Name) + + // When KubernetesStore is set, Informers are bypassed + // (testIPInfo from setupTestDatasourceWithStore is in Informers, not in Store) + informerMeta := ds.IndexLookup(nil, "10.0.0.1") + assert.Nil(t, informerMeta, "KubernetesStore replaces Informers, so Informer data is not accessible") +} + +// TestKubernetesCacheServer_FallbackToInformers verifies fallback when Store is not set +func TestKubernetesCacheServer_FallbackToInformers(t *testing.T) { + // Datasource WITHOUT KubernetesStore - falls back to Informers + ds := setupTestDatasource() + + // Should find resource from Informers (set up in testIPInfo) + informerMeta := ds.IndexLookup(nil, "10.0.0.1") + require.NotNil(t, informerMeta, "Should fallback to Informers when Store is not set") + assert.Equal(t, "test-pod-1", informerMeta.Name) +} + +// mockStreamServer implements the server-side stream for testing +// Note: With the corrected protocol, the server: +// - Receives CacheUpdate (from client) +// - Sends SyncMessage (to client) +type mockStreamServer struct { + grpc.ServerStream + ctx context.Context + sendChan chan 
*CacheUpdate // What client sends + recvMsgs []*SyncMessage // What server sent + firstSend bool +} + +func (m *mockStreamServer) Context() context.Context { + return m.ctx +} + +// Send is called by the server to send SyncMessage to client +func (m *mockStreamServer) Send(msg *SyncMessage) error { + m.recvMsgs = append(m.recvMsgs, msg) + return nil +} + +// Recv is called by the server to receive CacheUpdate from client +func (m *mockStreamServer) Recv() (*CacheUpdate, error) { + update, ok := <-m.sendChan + if !ok { + return nil, io.EOF + } + return update, nil +} diff --git a/proto/README.md b/proto/README.md index 78fbc764a..9d5a26009 100644 --- a/proto/README.md +++ b/proto/README.md @@ -5,4 +5,13 @@ Run the following commands to update `genericmap.pb.go` and `genericmap_grpc.pb. ```bash $ protoc --go_out=./pkg/pipeline/write/grpc ./proto/genericmap.proto $ protoc --go-grpc_out=./pkg/pipeline/write/grpc ./proto/genericmap.proto +``` + +# Update k8scache gRPC + +Run the following commands to update `k8scache.pb.go` and `k8scache_grpc.pb.go`: + +```bash +$ protoc --go_out=./pkg/pipeline/transform/kubernetes/k8scache --go_opt=paths=source_relative ./proto/k8scache.proto +$ protoc --go-grpc_out=./pkg/pipeline/transform/kubernetes/k8scache --go-grpc_opt=paths=source_relative ./proto/k8scache.proto ``` \ No newline at end of file diff --git a/proto/k8scache.proto b/proto/k8scache.proto new file mode 100644 index 000000000..6ba488ee0 --- /dev/null +++ b/proto/k8scache.proto @@ -0,0 +1,95 @@ +syntax = "proto3"; + +option go_package = "./k8scache"; + +package k8scache; + +// KubernetesCacheService defines the bidirectional streaming service for cache synchronization +// between the centralized informers pod and the distributed FLP processor pods. +// +// Flow: +// 1. FLP processor (server) accepts connection from informers (client) +// 2. FLP sends SyncRequest indicating what version it has +// 3. Informers sends CacheUpdate (snapshot or incremental) +// 4. FLP sends SyncAck to confirm receipt +// 5. 
Repeat steps 3-4 for ongoing updates +service KubernetesCacheService { + // StreamUpdates establishes a bidirectional stream: + // - Informers (client) send CacheUpdate messages (snapshots and incremental updates) + // - FLP processors (server) send SyncRequest and SyncAck messages + // + // NOTE: In protobuf syntax, "rpc Method(stream A) returns (stream B)": + // - A = what server RECEIVES (what client SENDS) + // - B = what server SENDS (what client RECEIVES) + // + // We want: + // - Server receives: CacheUpdate (from informers client) + // - Server sends: SyncMessage (requests/acks) + // + // Therefore we need: (stream CacheUpdate) returns (stream SyncMessage) + rpc StreamUpdates(stream CacheUpdate) returns (stream SyncMessage) {} +} + +// SyncMessage is sent by FLP processors (server) to the informers (client) +message SyncMessage { + oneof message { + SyncRequest request = 1; + SyncAck ack = 2; + } +} + +// SyncRequest is sent when an FLP processor connects or reconnects +message SyncRequest { + string processor_id = 1; // Unique identifier for the FLP processor pod + int64 last_version = 2; // Last cache version seen (0 if requesting full snapshot) +} + +// SyncAck acknowledges receipt of cache updates +message SyncAck { + string processor_id = 1; + int64 version = 2; // The version that was successfully applied + bool success = 3; // Whether the update was applied successfully + string error = 4; // Error message if success = false +} + +// CacheUpdate represents an incremental change to the cache +message CacheUpdate { + int64 version = 1; // Monotonically increasing version number + bool is_snapshot = 2; // True if this is a full snapshot, false for incremental + repeated ResourceEntry entries = 3; + OperationType operation = 4; // Only relevant when is_snapshot = false +} + +// OperationType defines the kind of cache operation +enum OperationType { + OPERATION_UNSPECIFIED = 0; + OPERATION_ADD = 1; + OPERATION_UPDATE = 2; + OPERATION_DELETE = 3; +} + +// ResourceEntry represents a single Kubernetes resource in the cache +message ResourceEntry { + string kind = 1; // "Pod", "Node", or "Service" + string namespace = 2; + string name = 3; + string uid = 4; + + // Resource-specific metadata + string owner_name = 5; + string owner_kind = 6; + string host_name = 7; // For Pods: the node name + string host_ip = 8; // For Pods: the node IP + string network_name = 9; // For multi-network support + + repeated string ips = 10; // IP addresses associated with this resource + repeated string secondary_net_keys = 11; // Secondary network keys for CNI plugins + map<string, string> secondary_net_names = 16; // Secondary network names mapping (key -> network name) + + map<string, string> labels = 12; + map<string, string> annotations = 13; + + // Kubernetes metadata + int64 creation_timestamp = 14; // Unix timestamp + string resource_version = 15; // K8s resource version +} diff --git a/vendor/k8s.io/client-go/tools/leaderelection/OWNERS b/vendor/k8s.io/client-go/tools/leaderelection/OWNERS new file mode 100644 index 000000000..70787f2b5 --- /dev/null +++ b/vendor/k8s.io/client-go/tools/leaderelection/OWNERS @@ -0,0 +1,13 @@ +# See the OWNERS docs at https://go.k8s.io/owners + +approvers: + - mikedanese + - jefftree +reviewers: + - wojtek-t + - deads2k + - mikedanese + - ingvagabund + - jefftree +emeritus_approvers: + - timothysc diff --git a/vendor/k8s.io/client-go/tools/leaderelection/healthzadaptor.go b/vendor/k8s.io/client-go/tools/leaderelection/healthzadaptor.go new file mode 100644 index 000000000..b93537291 --- /dev/null +++
b/vendor/k8s.io/client-go/tools/leaderelection/healthzadaptor.go @@ -0,0 +1,69 @@ +/* +Copyright 2015 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package leaderelection + +import ( + "net/http" + "sync" + "time" +) + +// HealthzAdaptor associates the /healthz endpoint with the LeaderElection object. +// It helps deal with the /healthz endpoint being set up prior to the LeaderElection. +// This contains the code needed to act as an adaptor between the leader +// election code the health check code. It allows us to provide health +// status about the leader election. Most specifically about if the leader +// has failed to renew without exiting the process. In that case we should +// report not healthy and rely on the kubelet to take down the process. +type HealthzAdaptor struct { + pointerLock sync.Mutex + le *LeaderElector + timeout time.Duration +} + +// Name returns the name of the health check we are implementing. +func (l *HealthzAdaptor) Name() string { + return "leaderElection" +} + +// Check is called by the healthz endpoint handler. +// It fails (returns an error) if we own the lease but had not been able to renew it. +func (l *HealthzAdaptor) Check(req *http.Request) error { + l.pointerLock.Lock() + defer l.pointerLock.Unlock() + if l.le == nil { + return nil + } + return l.le.Check(l.timeout) +} + +// SetLeaderElection ties a leader election object to a HealthzAdaptor +func (l *HealthzAdaptor) SetLeaderElection(le *LeaderElector) { + l.pointerLock.Lock() + defer l.pointerLock.Unlock() + l.le = le +} + +// NewLeaderHealthzAdaptor creates a basic healthz adaptor to monitor a leader election. +// timeout determines the time beyond the lease expiry to be allowed for timeout. +// checks within the timeout period after the lease expires will still return healthy. +func NewLeaderHealthzAdaptor(timeout time.Duration) *HealthzAdaptor { + result := &HealthzAdaptor{ + timeout: timeout, + } + return result +} diff --git a/vendor/k8s.io/client-go/tools/leaderelection/leaderelection.go b/vendor/k8s.io/client-go/tools/leaderelection/leaderelection.go new file mode 100644 index 000000000..29d34c4e9 --- /dev/null +++ b/vendor/k8s.io/client-go/tools/leaderelection/leaderelection.go @@ -0,0 +1,543 @@ +/* +Copyright 2015 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package leaderelection implements leader election of a set of endpoints. +// It uses an annotation in the endpoints object to store the record of the +// election state. 
This implementation does not guarantee that only one +// client is acting as a leader (a.k.a. fencing). +// +// A client only acts on timestamps captured locally to infer the state of the +// leader election. The client does not consider timestamps in the leader +// election record to be accurate because these timestamps may not have been +// produced by a local clock. The implemention does not depend on their +// accuracy and only uses their change to indicate that another client has +// renewed the leader lease. Thus the implementation is tolerant to arbitrary +// clock skew, but is not tolerant to arbitrary clock skew rate. +// +// However the level of tolerance to skew rate can be configured by setting +// RenewDeadline and LeaseDuration appropriately. The tolerance expressed as a +// maximum tolerated ratio of time passed on the fastest node to time passed on +// the slowest node can be approximately achieved with a configuration that sets +// the same ratio of LeaseDuration to RenewDeadline. For example if a user wanted +// to tolerate some nodes progressing forward in time twice as fast as other nodes, +// the user could set LeaseDuration to 60 seconds and RenewDeadline to 30 seconds. +// +// While not required, some method of clock synchronization between nodes in the +// cluster is highly recommended. It's important to keep in mind when configuring +// this client that the tolerance to skew rate varies inversely to master +// availability. +// +// Larger clusters often have a more lenient SLA for API latency. This should be +// taken into account when configuring the client. The rate of leader transitions +// should be monitored and RetryPeriod and LeaseDuration should be increased +// until the rate is stable and acceptably low. It's important to keep in mind +// when configuring this client that the tolerance to API latency varies inversely +// to master availability. +// +// DISCLAIMER: this is an alpha API. This library will likely change significantly +// or even be removed entirely in subsequent releases. Depend on this API at +// your own risk. 
+package leaderelection + +import ( + "bytes" + "context" + "fmt" + "sync" + "time" + + "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/apimachinery/pkg/util/wait" + rl "k8s.io/client-go/tools/leaderelection/resourcelock" + "k8s.io/klog/v2" + "k8s.io/utils/clock" +) + +const ( + JitterFactor = 1.2 +) + +// NewLeaderElector creates a LeaderElector from a LeaderElectionConfig +func NewLeaderElector(lec LeaderElectionConfig) (*LeaderElector, error) { + if lec.LeaseDuration <= lec.RenewDeadline { + return nil, fmt.Errorf("leaseDuration must be greater than renewDeadline") + } + if lec.RenewDeadline <= time.Duration(JitterFactor*float64(lec.RetryPeriod)) { + return nil, fmt.Errorf("renewDeadline must be greater than retryPeriod*JitterFactor") + } + if lec.LeaseDuration < 1 { + return nil, fmt.Errorf("leaseDuration must be greater than zero") + } + if lec.RenewDeadline < 1 { + return nil, fmt.Errorf("renewDeadline must be greater than zero") + } + if lec.RetryPeriod < 1 { + return nil, fmt.Errorf("retryPeriod must be greater than zero") + } + if lec.Callbacks.OnStartedLeading == nil { + return nil, fmt.Errorf("OnStartedLeading callback must not be nil") + } + if lec.Callbacks.OnStoppedLeading == nil { + return nil, fmt.Errorf("OnStoppedLeading callback must not be nil") + } + + if lec.Lock == nil { + return nil, fmt.Errorf("Lock must not be nil.") + } + id := lec.Lock.Identity() + if id == "" { + return nil, fmt.Errorf("Lock identity is empty") + } + + le := LeaderElector{ + config: lec, + clock: clock.RealClock{}, + metrics: globalMetricsFactory.newLeaderMetrics(), + } + le.metrics.leaderOff(le.config.Name) + return &le, nil +} + +type LeaderElectionConfig struct { + // Lock is the resource that will be used for locking + Lock rl.Interface + + // LeaseDuration is the duration that non-leader candidates will + // wait to force acquire leadership. This is measured against time of + // last observed ack. + // + // A client needs to wait a full LeaseDuration without observing a change to + // the record before it can attempt to take over. When all clients are + // shutdown and a new set of clients are started with different names against + // the same leader record, they must wait the full LeaseDuration before + // attempting to acquire the lease. Thus LeaseDuration should be as short as + // possible (within your tolerance for clock skew rate) to avoid a possible + // long waits in the scenario. + // + // Core clients default this value to 15 seconds. + LeaseDuration time.Duration + // RenewDeadline is the duration that the acting master will retry + // refreshing leadership before giving up. + // + // Core clients default this value to 10 seconds. + RenewDeadline time.Duration + // RetryPeriod is the duration the LeaderElector clients should wait + // between tries of actions. + // + // Core clients default this value to 2 seconds. + RetryPeriod time.Duration + + // Callbacks are callbacks that are triggered during certain lifecycle + // events of the LeaderElector + Callbacks LeaderCallbacks + + // WatchDog is the associated health checker + // WatchDog may be null if it's not needed/configured. + WatchDog *HealthzAdaptor + + // ReleaseOnCancel should be set true if the lock should be released + // when the run context is cancelled. 
If you set this to true, you must + // ensure all code guarded by this lease has successfully completed + // prior to cancelling the context, or you may have two processes + // simultaneously acting on the critical path. + ReleaseOnCancel bool + + // Name is the name of the resource lock for debugging + Name string + + // Coordinated will use the Coordinated Leader Election feature + // WARNING: Coordinated leader election is ALPHA. + Coordinated bool +} + +// LeaderCallbacks are callbacks that are triggered during certain +// lifecycle events of the LeaderElector. These are invoked asynchronously. +// +// possible future callbacks: +// - OnChallenge() +type LeaderCallbacks struct { + // OnStartedLeading is called when a LeaderElector client starts leading + OnStartedLeading func(context.Context) + // OnStoppedLeading is called when a LeaderElector client stops leading. + // This callback is always called when the LeaderElector exits, even if it did not start leading. + // Users should not assume that OnStoppedLeading is only called after OnStartedLeading. + // see: https://github.com/kubernetes/kubernetes/pull/127675#discussion_r1780059887 + OnStoppedLeading func() + // OnNewLeader is called when the client observes a leader that is + // not the previously observed leader. This includes the first observed + // leader when the client starts. + OnNewLeader func(identity string) +} + +// LeaderElector is a leader election client. +type LeaderElector struct { + config LeaderElectionConfig + // internal bookkeeping + observedRecord rl.LeaderElectionRecord + observedRawRecord []byte + observedTime time.Time + // used to implement OnNewLeader(), may lag slightly from the + // value observedRecord.HolderIdentity if the transition has + // not yet been reported. + reportedLeader string + + // clock is wrapper around time to allow for less flaky testing + clock clock.Clock + + // used to lock the observedRecord + observedRecordLock sync.Mutex + + metrics leaderMetricsAdapter +} + +// Run starts the leader election loop. Run will not return +// before leader election loop is stopped by ctx or it has +// stopped holding the leader lease +func (le *LeaderElector) Run(ctx context.Context) { + defer runtime.HandleCrashWithContext(ctx) + defer le.config.Callbacks.OnStoppedLeading() + + if !le.acquire(ctx) { + return // ctx signalled done + } + ctx, cancel := context.WithCancel(ctx) + defer cancel() + go le.config.Callbacks.OnStartedLeading(ctx) + le.renew(ctx) +} + +// RunOrDie starts a client with the provided config or panics if the config +// fails to validate. RunOrDie blocks until leader election loop is +// stopped by ctx or it has stopped holding the leader lease +func RunOrDie(ctx context.Context, lec LeaderElectionConfig) { + le, err := NewLeaderElector(lec) + if err != nil { + panic(err) + } + if lec.WatchDog != nil { + lec.WatchDog.SetLeaderElection(le) + } + le.Run(ctx) +} + +// GetLeader returns the identity of the last observed leader or returns the empty string if +// no leader has yet been observed. +// This function is for informational purposes. (e.g. monitoring, logs, etc.) +func (le *LeaderElector) GetLeader() string { + return le.getObservedRecord().HolderIdentity +} + +// IsLeader returns true if the last observed leader was this client else returns false. +func (le *LeaderElector) IsLeader() bool { + return le.getObservedRecord().HolderIdentity == le.config.Lock.Identity() +} + +// acquire loops calling tryAcquireOrRenew and returns true immediately when tryAcquireOrRenew succeeds. 
+// Returns false if ctx signals done. +func (le *LeaderElector) acquire(ctx context.Context) bool { + ctx, cancel := context.WithCancel(ctx) + defer cancel() + succeeded := false + desc := le.config.Lock.Describe() + logger := klog.FromContext(ctx) + logger.Info("Attempting to acquire leader lease...", "lock", desc) + wait.JitterUntil(func() { + if !le.config.Coordinated { + succeeded = le.tryAcquireOrRenew(ctx) + } else { + succeeded = le.tryCoordinatedRenew(ctx) + } + le.maybeReportTransition() + if !succeeded { + logger.V(4).Info("Failed to acquire lease", "lock", desc) + return + } + le.config.Lock.RecordEvent("became leader") + le.metrics.leaderOn(le.config.Name) + logger.Info("Successfully acquired lease", "lock", desc) + cancel() + }, le.config.RetryPeriod, JitterFactor, true, ctx.Done()) + return succeeded +} + +// renew loops calling tryAcquireOrRenew and returns immediately when tryAcquireOrRenew fails or ctx signals done. +func (le *LeaderElector) renew(ctx context.Context) { + defer le.config.Lock.RecordEvent("stopped leading") + ctx, cancel := context.WithCancel(ctx) + defer cancel() + logger := klog.FromContext(ctx) + wait.Until(func() { + err := wait.PollUntilContextTimeout(ctx, le.config.RetryPeriod, le.config.RenewDeadline, true, func(ctx context.Context) (done bool, err error) { + if !le.config.Coordinated { + return le.tryAcquireOrRenew(ctx), nil + } else { + return le.tryCoordinatedRenew(ctx), nil + } + }) + le.maybeReportTransition() + desc := le.config.Lock.Describe() + if err == nil { + logger.V(5).Info("Successfully renewed lease", "lock", desc) + return + } + le.metrics.leaderOff(le.config.Name) + logger.Info("Failed to renew lease", "lock", desc, "err", err) + cancel() + }, le.config.RetryPeriod, ctx.Done()) + + // if we hold the lease, give it up + if le.config.ReleaseOnCancel { + le.release(logger) + } +} + +// release attempts to release the leader lease if we have acquired it. +func (le *LeaderElector) release(logger klog.Logger) bool { + ctx := context.Background() + timeoutCtx, timeoutCancel := context.WithTimeout(ctx, le.config.RenewDeadline) + defer timeoutCancel() + // update the resourceVersion of lease + oldLeaderElectionRecord, _, err := le.config.Lock.Get(timeoutCtx) + if err != nil { + if !errors.IsNotFound(err) { + logger.Error(err, "error retrieving resource lock", "lock", le.config.Lock.Describe()) + return false + } + logger.Info("lease lock not found", "lock", le.config.Lock.Describe()) + return false + } + + if !le.IsLeader() { + return true + } + now := metav1.NewTime(le.clock.Now()) + leaderElectionRecord := rl.LeaderElectionRecord{ + LeaderTransitions: oldLeaderElectionRecord.LeaderTransitions, + LeaseDurationSeconds: 1, + RenewTime: now, + AcquireTime: now, + } + if err := le.config.Lock.Update(timeoutCtx, leaderElectionRecord); err != nil { + logger.Error(err, "Failed to release lease", "lock", le.config.Lock.Describe()) + return false + } + + le.setObservedRecord(&leaderElectionRecord) + return true +} + +// tryCoordinatedRenew checks if it acquired a lease and tries to renew the +// lease if it has already been acquired. Returns true on success else returns +// false. +func (le *LeaderElector) tryCoordinatedRenew(ctx context.Context) bool { + logger := klog.FromContext(ctx) + now := metav1.NewTime(le.clock.Now()) + leaderElectionRecord := rl.LeaderElectionRecord{ + HolderIdentity: le.config.Lock.Identity(), + LeaseDurationSeconds: int(le.config.LeaseDuration / time.Second), + RenewTime: now, + AcquireTime: now, + } + + // 1. 
obtain the electionRecord + oldLeaderElectionRecord, oldLeaderElectionRawRecord, err := le.config.Lock.Get(ctx) + if err != nil { + if !errors.IsNotFound(err) { + logger.Error(err, "Error retrieving lease lock", "lock", le.config.Lock.Describe()) + return false + } + logger.Info("Lease lock not found", "lock", le.config.Lock.Describe(), "err", err) + return false + } + + // 2. Record obtained, check the Identity & Time + if !bytes.Equal(le.observedRawRecord, oldLeaderElectionRawRecord) { + le.setObservedRecord(oldLeaderElectionRecord) + + le.observedRawRecord = oldLeaderElectionRawRecord + } + + hasExpired := le.observedTime.Add(time.Second * time.Duration(oldLeaderElectionRecord.LeaseDurationSeconds)).Before(now.Time) + if hasExpired { + logger.Info("Lease has expired", "lock", le.config.Lock.Describe()) + return false + } + + if !le.IsLeader() { + logger.V(6).Info("Lease is held and has not yet expired", "lock", le.config.Lock.Describe(), "holder", oldLeaderElectionRecord.HolderIdentity) + return false + } + + // 2b. If the lease has been marked as "end of term", don't renew it + if le.IsLeader() && oldLeaderElectionRecord.PreferredHolder != "" { + logger.V(4).Info("Lease is marked as 'end of term'", "lock", le.config.Lock.Describe()) + // TODO: Instead of letting lease expire, the holder may deleted it directly + // This will not be compatible with all controllers, so it needs to be opt-in behavior. + // We must ensure all code guarded by this lease has successfully completed + // prior to releasing or there may be two processes + // simultaneously acting on the critical path. + // Usually once this returns false, the process is terminated.. + // xref: OnStoppedLeading + return false + } + + // 3. We're going to try to update. The leaderElectionRecord is set to it's default + // here. Let's correct it before updating. + if le.IsLeader() { + leaderElectionRecord.AcquireTime = oldLeaderElectionRecord.AcquireTime + leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions + leaderElectionRecord.Strategy = oldLeaderElectionRecord.Strategy + le.metrics.slowpathExercised(le.config.Name) + } else { + leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions + 1 + } + + // update the lock itself + if err = le.config.Lock.Update(ctx, leaderElectionRecord); err != nil { + logger.Error(err, "Failed to update lock", "lock", le.config.Lock.Describe()) + return false + } + + le.setObservedRecord(&leaderElectionRecord) + return true +} + +// tryAcquireOrRenew tries to acquire a leader lease if it is not already acquired, +// else it tries to renew the lease if it has already been acquired. Returns true +// on success else returns false. +func (le *LeaderElector) tryAcquireOrRenew(ctx context.Context) bool { + logger := klog.FromContext(ctx) + now := metav1.NewTime(le.clock.Now()) + leaderElectionRecord := rl.LeaderElectionRecord{ + HolderIdentity: le.config.Lock.Identity(), + LeaseDurationSeconds: int(le.config.LeaseDuration / time.Second), + RenewTime: now, + AcquireTime: now, + } + + // 1. fast path for the leader to update optimistically assuming that the record observed + // last time is the current version. 
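	// Editorial note: this optimistic update reuses the Lease object cached by the
	// previous Get/Create/Update (see LeaseLock), so a record changed by another
	// client in the meantime surfaces as an update error (typically a conflict) and
	// execution falls through to the slow path below, which re-reads the record.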
+ if le.IsLeader() && le.isLeaseValid(now.Time) { + oldObservedRecord := le.getObservedRecord() + leaderElectionRecord.AcquireTime = oldObservedRecord.AcquireTime + leaderElectionRecord.LeaderTransitions = oldObservedRecord.LeaderTransitions + + err := le.config.Lock.Update(ctx, leaderElectionRecord) + if err == nil { + le.setObservedRecord(&leaderElectionRecord) + return true + } + logger.Error(err, "Failed to update lease optimistically, falling back to slow path", "lock", le.config.Lock.Describe()) + } + + // 2. obtain or create the ElectionRecord + oldLeaderElectionRecord, oldLeaderElectionRawRecord, err := le.config.Lock.Get(ctx) + if err != nil { + if !errors.IsNotFound(err) { + logger.Error(err, "Error retrieving lease lock", "lock", le.config.Lock.Describe()) + return false + } + if err = le.config.Lock.Create(ctx, leaderElectionRecord); err != nil { + logger.Error(err, "Error initially creating lease lock", "lock", le.config.Lock.Describe()) + return false + } + + le.setObservedRecord(&leaderElectionRecord) + + return true + } + + // 3. Record obtained, check the Identity & Time + if !bytes.Equal(le.observedRawRecord, oldLeaderElectionRawRecord) { + le.setObservedRecord(oldLeaderElectionRecord) + + le.observedRawRecord = oldLeaderElectionRawRecord + } + if len(oldLeaderElectionRecord.HolderIdentity) > 0 && le.isLeaseValid(now.Time) && !le.IsLeader() { + logger.V(4).Info("Lease is held by and has not yet expired", "lock", le.config.Lock.Describe(), "holder", oldLeaderElectionRecord.HolderIdentity) + return false + } + + // 4. We're going to try to update. The leaderElectionRecord is set to it's default + // here. Let's correct it before updating. + if le.IsLeader() { + leaderElectionRecord.AcquireTime = oldLeaderElectionRecord.AcquireTime + leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions + le.metrics.slowpathExercised(le.config.Name) + } else { + leaderElectionRecord.LeaderTransitions = oldLeaderElectionRecord.LeaderTransitions + 1 + } + + // update the lock itself + if err = le.config.Lock.Update(ctx, leaderElectionRecord); err != nil { + logger.Error(err, "Failed to update lease", "lock", le.config.Lock.Describe()) + return false + } + + le.setObservedRecord(&leaderElectionRecord) + return true +} + +func (le *LeaderElector) maybeReportTransition() { + if le.observedRecord.HolderIdentity == le.reportedLeader { + return + } + le.reportedLeader = le.observedRecord.HolderIdentity + if le.config.Callbacks.OnNewLeader != nil { + go le.config.Callbacks.OnNewLeader(le.reportedLeader) + } +} + +// Check will determine if the current lease is expired by more than timeout. +func (le *LeaderElector) Check(maxTolerableExpiredLease time.Duration) error { + if !le.IsLeader() { + // Currently not concerned with the case that we are hot standby + return nil + } + // If we are more than timeout seconds after the lease duration that is past the timeout + // on the lease renew. Time to start reporting ourselves as unhealthy. We should have + // died but conditions like deadlock can prevent this. 
(See #70819) + if le.clock.Since(le.observedTime) > le.config.LeaseDuration+maxTolerableExpiredLease { + return fmt.Errorf("failed election to renew leadership on lease %s", le.config.Name) + } + + return nil +} + +func (le *LeaderElector) isLeaseValid(now time.Time) bool { + return le.observedTime.Add(time.Second * time.Duration(le.getObservedRecord().LeaseDurationSeconds)).After(now) +} + +// setObservedRecord will set a new observedRecord and update observedTime to the current time. +// Protect critical sections with lock. +func (le *LeaderElector) setObservedRecord(observedRecord *rl.LeaderElectionRecord) { + le.observedRecordLock.Lock() + defer le.observedRecordLock.Unlock() + + le.observedRecord = *observedRecord + le.observedTime = le.clock.Now() +} + +// getObservedRecord returns observersRecord. +// Protect critical sections with lock. +func (le *LeaderElector) getObservedRecord() rl.LeaderElectionRecord { + le.observedRecordLock.Lock() + defer le.observedRecordLock.Unlock() + + return le.observedRecord +} diff --git a/vendor/k8s.io/client-go/tools/leaderelection/leasecandidate.go b/vendor/k8s.io/client-go/tools/leaderelection/leasecandidate.go new file mode 100644 index 000000000..b2fa14a5f --- /dev/null +++ b/vendor/k8s.io/client-go/tools/leaderelection/leasecandidate.go @@ -0,0 +1,207 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package leaderelection + +import ( + "context" + "reflect" + "time" + + v1 "k8s.io/api/coordination/v1" + v1beta1 "k8s.io/api/coordination/v1beta1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/fields" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + "k8s.io/client-go/informers" + "k8s.io/client-go/kubernetes" + coordinationv1beta1client "k8s.io/client-go/kubernetes/typed/coordination/v1beta1" + "k8s.io/client-go/tools/cache" + "k8s.io/client-go/util/workqueue" + "k8s.io/klog/v2" + "k8s.io/utils/clock" +) + +const requeueInterval = 5 * time.Minute + +type CacheSyncWaiter interface { + WaitForCacheSync(stopCh <-chan struct{}) map[reflect.Type]bool +} + +type LeaseCandidate struct { + leaseClient coordinationv1beta1client.LeaseCandidateInterface + leaseCandidateInformer cache.SharedIndexInformer + informerFactory informers.SharedInformerFactory + hasSynced cache.InformerSynced + + // At most there will be one item in this Queue (since we only watch one item) + queue workqueue.TypedRateLimitingInterface[int] + + name string + namespace string + + // controller lease + leaseName string + + clock clock.Clock + + binaryVersion, emulationVersion string + strategy v1.CoordinatedLeaseStrategy +} + +// NewCandidate creates new LeaseCandidate controller that creates a +// LeaseCandidate object if it does not exist and watches changes +// to the corresponding object and renews if PingTime is set. +// WARNING: This is an ALPHA feature. Ensure that the CoordinatedLeaderElection +// feature gate is on. 
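// Editorial sketch, not part of the vendored upstream file: one way a caller
// might start a LeaseCandidate alongside a coordinated elector using the
// NewCandidate constructor defined just below. The namespace, names, version
// strings, and the choice of the OldestEmulationVersion strategy are
// assumptions made for this illustration.

import (
	"context"

	coordinationv1 "k8s.io/api/coordination/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/tools/leaderelection"
)

func startCandidate(ctx context.Context, clientset kubernetes.Interface) error {
	candidate, syncWaiter, err := leaderelection.NewCandidate(
		clientset,
		"kube-system",       // namespace of the LeaseCandidate object
		"my-controller-abc", // candidate name, usually the instance identity
		"my-controller",     // name of the Lease being contended for
		"1.32.0",            // binary version
		"1.32.0",            // emulation version
		coordinationv1.OldestEmulationVersion,
	)
	if err != nil {
		return err
	}
	go candidate.Run(ctx)
	// Optionally block until the candidate's informer cache has synced.
	_ = syncWaiter.WaitForCacheSync(ctx.Done())
	return nil
}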
+func NewCandidate(clientset kubernetes.Interface, + candidateNamespace string, + candidateName string, + targetLease string, + binaryVersion, emulationVersion string, + strategy v1.CoordinatedLeaseStrategy, +) (*LeaseCandidate, CacheSyncWaiter, error) { + fieldSelector := fields.OneTermEqualSelector("metadata.name", candidateName).String() + // A separate informer factory is required because this must start before informerFactories + // are started for leader elected components + informerFactory := informers.NewSharedInformerFactoryWithOptions( + clientset, 5*time.Minute, + informers.WithTweakListOptions(func(options *metav1.ListOptions) { + options.FieldSelector = fieldSelector + }), + ) + leaseCandidateInformer := informerFactory.Coordination().V1beta1().LeaseCandidates().Informer() + + lc := &LeaseCandidate{ + leaseClient: clientset.CoordinationV1beta1().LeaseCandidates(candidateNamespace), + leaseCandidateInformer: leaseCandidateInformer, + informerFactory: informerFactory, + name: candidateName, + namespace: candidateNamespace, + leaseName: targetLease, + clock: clock.RealClock{}, + binaryVersion: binaryVersion, + emulationVersion: emulationVersion, + strategy: strategy, + } + lc.queue = workqueue.NewTypedRateLimitingQueueWithConfig(workqueue.DefaultTypedControllerRateLimiter[int](), workqueue.TypedRateLimitingQueueConfig[int]{Name: "leasecandidate"}) + + h, err := leaseCandidateInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + UpdateFunc: func(oldObj, newObj interface{}) { + if leasecandidate, ok := newObj.(*v1beta1.LeaseCandidate); ok { + if leasecandidate.Spec.PingTime != nil && leasecandidate.Spec.PingTime.After(leasecandidate.Spec.RenewTime.Time) { + lc.enqueueLease() + } + } + }, + }) + if err != nil { + return nil, nil, err + } + lc.hasSynced = h.HasSynced + + return lc, informerFactory, nil +} + +func (c *LeaseCandidate) Run(ctx context.Context) { + defer c.queue.ShutDown() + + logger := klog.FromContext(ctx) + logger = klog.LoggerWithName(logger, "leasecandidate") + ctx = klog.NewContext(ctx, logger) + + c.informerFactory.Start(ctx.Done()) + if !cache.WaitForNamedCacheSyncWithContext(ctx, c.hasSynced) { + return + } + + c.enqueueLease() + go c.runWorker(ctx) + <-ctx.Done() +} + +func (c *LeaseCandidate) runWorker(ctx context.Context) { + for c.processNextWorkItem(ctx) { + } +} + +func (c *LeaseCandidate) processNextWorkItem(ctx context.Context) bool { + key, shutdown := c.queue.Get() + if shutdown { + return false + } + defer c.queue.Done(key) + + err := c.ensureLease(ctx) + if err == nil { + c.queue.AddAfter(key, requeueInterval) + return true + } + + utilruntime.HandleErrorWithContext(ctx, err, "Ensuring lease failed") + c.queue.AddRateLimited(key) + + return true +} + +func (c *LeaseCandidate) enqueueLease() { + c.queue.Add(0) +} + +// ensureLease creates the lease if it does not exist and renew it if it exists. Returns the lease and +// a bool (true if this call created the lease), or any error that occurs. +func (c *LeaseCandidate) ensureLease(ctx context.Context) error { + logger := klog.FromContext(ctx) + lease, err := c.leaseClient.Get(ctx, c.name, metav1.GetOptions{}) + if apierrors.IsNotFound(err) { + logger.V(2).Info("Creating lease candidate") + // lease does not exist, create it. 
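	// Editorial note: newLeaseCandidate (below) already stamps RenewTime, so the
	// freshly created candidate does not need a separate renewal in this pass.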
+ leaseToCreate := c.newLeaseCandidate() + if _, err := c.leaseClient.Create(ctx, leaseToCreate, metav1.CreateOptions{}); err != nil { + return err + } + logger.V(2).Info("Created lease candidate") + return nil + } else if err != nil { + return err + } + logger.V(2).Info("Lease candidate exists. Renewing.") + clone := lease.DeepCopy() + clone.Spec.RenewTime = &metav1.MicroTime{Time: c.clock.Now()} + _, err = c.leaseClient.Update(ctx, clone, metav1.UpdateOptions{}) + if err != nil { + return err + } + return nil +} + +func (c *LeaseCandidate) newLeaseCandidate() *v1beta1.LeaseCandidate { + lc := &v1beta1.LeaseCandidate{ + ObjectMeta: metav1.ObjectMeta{ + Name: c.name, + Namespace: c.namespace, + }, + Spec: v1beta1.LeaseCandidateSpec{ + LeaseName: c.leaseName, + BinaryVersion: c.binaryVersion, + EmulationVersion: c.emulationVersion, + Strategy: c.strategy, + }, + } + lc.Spec.RenewTime = &metav1.MicroTime{Time: c.clock.Now()} + return lc +} diff --git a/vendor/k8s.io/client-go/tools/leaderelection/metrics.go b/vendor/k8s.io/client-go/tools/leaderelection/metrics.go new file mode 100644 index 000000000..7438345fb --- /dev/null +++ b/vendor/k8s.io/client-go/tools/leaderelection/metrics.go @@ -0,0 +1,119 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package leaderelection + +import ( + "sync" +) + +// This file provides abstractions for setting the provider (e.g., prometheus) +// of metrics. + +type leaderMetricsAdapter interface { + leaderOn(name string) + leaderOff(name string) + slowpathExercised(name string) +} + +// LeaderMetric instruments metrics used in leader election. +type LeaderMetric interface { + On(name string) + Off(name string) + SlowpathExercised(name string) +} + +type noopMetric struct{} + +func (noopMetric) On(name string) {} +func (noopMetric) Off(name string) {} +func (noopMetric) SlowpathExercised(name string) {} + +// defaultLeaderMetrics expects the caller to lock before setting any metrics. +type defaultLeaderMetrics struct { + // leader's value indicates if the current process is the owner of name lease + leader LeaderMetric +} + +func (m *defaultLeaderMetrics) leaderOn(name string) { + if m == nil { + return + } + m.leader.On(name) +} + +func (m *defaultLeaderMetrics) leaderOff(name string) { + if m == nil { + return + } + m.leader.Off(name) +} + +func (m *defaultLeaderMetrics) slowpathExercised(name string) { + if m == nil { + return + } + m.leader.SlowpathExercised(name) +} + +type noMetrics struct{} + +func (noMetrics) leaderOn(name string) {} +func (noMetrics) leaderOff(name string) {} +func (noMetrics) slowpathExercised(name string) {} + +// MetricsProvider generates various metrics used by the leader election. 
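// Editorial sketch, not part of the vendored upstream file: a minimal
// MetricsProvider (the interface is declared just below) that only logs
// leadership changes; a real provider would typically export gauges or
// counters instead. The type names are assumptions made for this sketch.

import (
	"log"

	"k8s.io/client-go/tools/leaderelection"
)

type logLeaderMetric struct{}

func (logLeaderMetric) On(name string)                { log.Printf("acquired leadership of %q", name) }
func (logLeaderMetric) Off(name string)               { log.Printf("lost or released leadership of %q", name) }
func (logLeaderMetric) SlowpathExercised(name string) { log.Printf("slow path exercised for %q", name) }

type logMetricsProvider struct{}

func (logMetricsProvider) NewLeaderMetric() leaderelection.LeaderMetric { return logLeaderMetric{} }

// Register once during start-up, before any LeaderElector is created; only the
// first call to SetProvider takes effect:
//
//	leaderelection.SetProvider(logMetricsProvider{})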
+type MetricsProvider interface { + NewLeaderMetric() LeaderMetric +} + +type noopMetricsProvider struct{} + +func (noopMetricsProvider) NewLeaderMetric() LeaderMetric { + return noopMetric{} +} + +var globalMetricsFactory = leaderMetricsFactory{ + metricsProvider: noopMetricsProvider{}, +} + +type leaderMetricsFactory struct { + metricsProvider MetricsProvider + + onlyOnce sync.Once +} + +func (f *leaderMetricsFactory) setProvider(mp MetricsProvider) { + f.onlyOnce.Do(func() { + f.metricsProvider = mp + }) +} + +func (f *leaderMetricsFactory) newLeaderMetrics() leaderMetricsAdapter { + mp := f.metricsProvider + if mp == (noopMetricsProvider{}) { + return noMetrics{} + } + return &defaultLeaderMetrics{ + leader: mp.NewLeaderMetric(), + } +} + +// SetProvider sets the metrics provider for all subsequently created work +// queues. Only the first call has an effect. +func SetProvider(metricsProvider MetricsProvider) { + globalMetricsFactory.setProvider(metricsProvider) +} diff --git a/vendor/k8s.io/client-go/tools/leaderelection/resourcelock/interface.go b/vendor/k8s.io/client-go/tools/leaderelection/resourcelock/interface.go new file mode 100644 index 000000000..7e5523909 --- /dev/null +++ b/vendor/k8s.io/client-go/tools/leaderelection/resourcelock/interface.go @@ -0,0 +1,154 @@ +/* +Copyright 2016 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package resourcelock + +import ( + "context" + "fmt" + "time" + + v1 "k8s.io/api/coordination/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + clientset "k8s.io/client-go/kubernetes" + coordinationv1 "k8s.io/client-go/kubernetes/typed/coordination/v1" + corev1 "k8s.io/client-go/kubernetes/typed/core/v1" + restclient "k8s.io/client-go/rest" +) + +const ( + LeaderElectionRecordAnnotationKey = "control-plane.alpha.kubernetes.io/leader" + endpointsResourceLock = "endpoints" + configMapsResourceLock = "configmaps" + LeasesResourceLock = "leases" + endpointsLeasesResourceLock = "endpointsleases" + configMapsLeasesResourceLock = "configmapsleases" +) + +// LeaderElectionRecord is the record that is stored in the leader election annotation. +// This information should be used for observational purposes only and could be replaced +// with a random string (e.g. UUID) with only slight modification of this code. +// TODO(mikedanese): this should potentially be versioned +type LeaderElectionRecord struct { + // HolderIdentity is the ID that owns the lease. If empty, no one owns this lease and + // all callers may acquire. Versions of this library prior to Kubernetes 1.14 will not + // attempt to acquire leases with empty identities and will wait for the full lease + // interval to expire before attempting to reacquire. This value is set to empty when + // a client voluntarily steps down. 
+ HolderIdentity string `json:"holderIdentity"` + LeaseDurationSeconds int `json:"leaseDurationSeconds"` + AcquireTime metav1.Time `json:"acquireTime"` + RenewTime metav1.Time `json:"renewTime"` + LeaderTransitions int `json:"leaderTransitions"` + Strategy v1.CoordinatedLeaseStrategy `json:"strategy"` + PreferredHolder string `json:"preferredHolder"` +} + +// EventRecorder records a change in the ResourceLock. +type EventRecorder interface { + Eventf(obj runtime.Object, eventType, reason, message string, args ...interface{}) +} + +// ResourceLockConfig common data that exists across different +// resource locks +type ResourceLockConfig struct { + // Identity is the unique string identifying a lease holder across + // all participants in an election. + Identity string + // EventRecorder is optional. + EventRecorder EventRecorder +} + +// Interface offers a common interface for locking on arbitrary +// resources used in leader election. The Interface is used +// to hide the details on specific implementations in order to allow +// them to change over time. This interface is strictly for use +// by the leaderelection code. +type Interface interface { + // Get returns the LeaderElectionRecord + Get(ctx context.Context) (*LeaderElectionRecord, []byte, error) + + // Create attempts to create a LeaderElectionRecord + Create(ctx context.Context, ler LeaderElectionRecord) error + + // Update will update and existing LeaderElectionRecord + Update(ctx context.Context, ler LeaderElectionRecord) error + + // RecordEvent is used to record events + RecordEvent(string) + + // Identity will return the locks Identity + Identity() string + + // Describe is used to convert details on current resource lock + // into a string + Describe() string +} + +// new will create a lock of a given type according to the input parameters +func new(lockType string, ns string, name string, coreClient corev1.CoreV1Interface, coordinationClient coordinationv1.CoordinationV1Interface, rlc ResourceLockConfig, labels map[string]string) (Interface, error) { + leaseLock := &LeaseLock{ + LeaseMeta: metav1.ObjectMeta{ + Namespace: ns, + Name: name, + }, + Client: coordinationClient, + LockConfig: rlc, + Labels: labels, + } + switch lockType { + case endpointsResourceLock: + return nil, fmt.Errorf("endpoints lock is removed, migrate to %s", LeasesResourceLock) + case configMapsResourceLock: + return nil, fmt.Errorf("configmaps lock is removed, migrate to %s", LeasesResourceLock) + case LeasesResourceLock: + return leaseLock, nil + case endpointsLeasesResourceLock: + return nil, fmt.Errorf("endpointsleases lock is removed, migrate to %s", LeasesResourceLock) + case configMapsLeasesResourceLock: + return nil, fmt.Errorf("configmapsleases lock is removed, migrated to %s", LeasesResourceLock) + default: + return nil, fmt.Errorf("Invalid lock-type %s", lockType) + } +} + +// New will create a lock of a given type according to the input parameters +func New(lockType string, ns string, name string, coreClient corev1.CoreV1Interface, coordinationClient coordinationv1.CoordinationV1Interface, rlc ResourceLockConfig) (Interface, error) { + return new(lockType, ns, name, coreClient, coordinationClient, rlc, nil) +} + +// NewWithLabels will create a lock of a given type according to the input parameters +// When the holder of the lock changes, that holder will apply their labels +func NewWithLabels(lockType string, ns string, name string, coreClient corev1.CoreV1Interface, coordinationClient coordinationv1.CoordinationV1Interface, rlc 
ResourceLockConfig, labels map[string]string) (Interface, error) { + return new(lockType, ns, name, coreClient, coordinationClient, rlc, labels) +} + +// NewFromKubeconfig will create a lock of a given type according to the input parameters. +// Timeout set for a client used to contact to Kubernetes should be lower than +// RenewDeadline to keep a single hung request from forcing a leader loss. +// Setting it to max(time.Second, RenewDeadline/2) as a reasonable heuristic. +func NewFromKubeconfig(lockType string, ns string, name string, rlc ResourceLockConfig, kubeconfig *restclient.Config, renewDeadline time.Duration) (Interface, error) { + // shallow copy, do not modify the kubeconfig + config := *kubeconfig + timeout := renewDeadline / 2 + if timeout < time.Second { + timeout = time.Second + } + config.Timeout = timeout + leaderElectionClient := clientset.NewForConfigOrDie(restclient.AddUserAgent(&config, "leader-election")) + return New(lockType, ns, name, leaderElectionClient.CoreV1(), leaderElectionClient.CoordinationV1(), rlc) +} diff --git a/vendor/k8s.io/client-go/tools/leaderelection/resourcelock/leaselock.go b/vendor/k8s.io/client-go/tools/leaderelection/resourcelock/leaselock.go new file mode 100644 index 000000000..79a748b74 --- /dev/null +++ b/vendor/k8s.io/client-go/tools/leaderelection/resourcelock/leaselock.go @@ -0,0 +1,166 @@ +/* +Copyright 2018 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package resourcelock + +import ( + "context" + "encoding/json" + "errors" + "fmt" + + coordinationv1 "k8s.io/api/coordination/v1" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + coordinationv1client "k8s.io/client-go/kubernetes/typed/coordination/v1" +) + +type LeaseLock struct { + // LeaseMeta should contain a Name and a Namespace of a + // LeaseMeta object that the LeaderElector will attempt to lead. 
+ LeaseMeta metav1.ObjectMeta + Client coordinationv1client.LeasesGetter + LockConfig ResourceLockConfig + lease *coordinationv1.Lease + Labels map[string]string +} + +// Get returns the election record from a Lease spec +func (ll *LeaseLock) Get(ctx context.Context) (*LeaderElectionRecord, []byte, error) { + lease, err := ll.Client.Leases(ll.LeaseMeta.Namespace).Get(ctx, ll.LeaseMeta.Name, metav1.GetOptions{}) + if err != nil { + return nil, nil, err + } + ll.lease = lease + record := LeaseSpecToLeaderElectionRecord(&ll.lease.Spec) + recordByte, err := json.Marshal(*record) + if err != nil { + return nil, nil, err + } + return record, recordByte, nil +} + +// Create attempts to create a Lease +func (ll *LeaseLock) Create(ctx context.Context, ler LeaderElectionRecord) error { + var err error + lease := &coordinationv1.Lease{ + ObjectMeta: metav1.ObjectMeta{ + Name: ll.LeaseMeta.Name, + Namespace: ll.LeaseMeta.Namespace, + Labels: ll.Labels, + }, + Spec: LeaderElectionRecordToLeaseSpec(&ler), + } + + ll.lease, err = ll.Client.Leases(ll.LeaseMeta.Namespace).Create(ctx, lease, metav1.CreateOptions{}) + return err +} + +// Update will update an existing Lease spec. +func (ll *LeaseLock) Update(ctx context.Context, ler LeaderElectionRecord) error { + if ll.lease == nil { + return errors.New("lease not initialized, call get or create first") + } + ll.lease.Spec = LeaderElectionRecordToLeaseSpec(&ler) + + if ll.Labels != nil { + if ll.lease.Labels == nil { + ll.lease.Labels = map[string]string{} + } + // Only overwrite the labels that are specifically set + for k, v := range ll.Labels { + ll.lease.Labels[k] = v + } + } + + lease, err := ll.Client.Leases(ll.LeaseMeta.Namespace).Update(ctx, ll.lease, metav1.UpdateOptions{}) + if err != nil { + return err + } + + ll.lease = lease + return nil +} + +// RecordEvent in leader election while adding meta-data +func (ll *LeaseLock) RecordEvent(s string) { + if ll.LockConfig.EventRecorder == nil { + return + } + events := fmt.Sprintf("%v %v", ll.LockConfig.Identity, s) + subject := &coordinationv1.Lease{ObjectMeta: ll.lease.ObjectMeta} + // Populate the type meta, so we don't have to get it from the schema + subject.Kind = "Lease" + subject.APIVersion = coordinationv1.SchemeGroupVersion.String() + ll.LockConfig.EventRecorder.Eventf(subject, corev1.EventTypeNormal, "LeaderElection", events) +} + +// Describe is used to convert details on current resource lock +// into a string +func (ll *LeaseLock) Describe() string { + return fmt.Sprintf("%v/%v", ll.LeaseMeta.Namespace, ll.LeaseMeta.Name) +} + +// Identity returns the Identity of the lock +func (ll *LeaseLock) Identity() string { + return ll.LockConfig.Identity +} + +func LeaseSpecToLeaderElectionRecord(spec *coordinationv1.LeaseSpec) *LeaderElectionRecord { + var r LeaderElectionRecord + if spec.HolderIdentity != nil { + r.HolderIdentity = *spec.HolderIdentity + } + if spec.LeaseDurationSeconds != nil { + r.LeaseDurationSeconds = int(*spec.LeaseDurationSeconds) + } + if spec.LeaseTransitions != nil { + r.LeaderTransitions = int(*spec.LeaseTransitions) + } + if spec.AcquireTime != nil { + r.AcquireTime = metav1.Time{Time: spec.AcquireTime.Time} + } + if spec.RenewTime != nil { + r.RenewTime = metav1.Time{Time: spec.RenewTime.Time} + } + if spec.PreferredHolder != nil { + r.PreferredHolder = *spec.PreferredHolder + } + if spec.Strategy != nil { + r.Strategy = *spec.Strategy + } + return &r + +} + +func LeaderElectionRecordToLeaseSpec(ler *LeaderElectionRecord) coordinationv1.LeaseSpec { + 
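	// Editorial note: the integer fields are copied into local variables first so
	// that their addresses can be taken for the pointer-typed LeaseSpec fields below.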
leaseDurationSeconds := int32(ler.LeaseDurationSeconds) + leaseTransitions := int32(ler.LeaderTransitions) + spec := coordinationv1.LeaseSpec{ + HolderIdentity: &ler.HolderIdentity, + LeaseDurationSeconds: &leaseDurationSeconds, + AcquireTime: &metav1.MicroTime{Time: ler.AcquireTime.Time}, + RenewTime: &metav1.MicroTime{Time: ler.RenewTime.Time}, + LeaseTransitions: &leaseTransitions, + } + if ler.PreferredHolder != "" { + spec.PreferredHolder = &ler.PreferredHolder + } + if ler.Strategy != "" { + spec.Strategy = &ler.Strategy + } + return spec +} diff --git a/vendor/k8s.io/client-go/tools/leaderelection/resourcelock/multilock.go b/vendor/k8s.io/client-go/tools/leaderelection/resourcelock/multilock.go new file mode 100644 index 000000000..5ee1dcbb5 --- /dev/null +++ b/vendor/k8s.io/client-go/tools/leaderelection/resourcelock/multilock.go @@ -0,0 +1,104 @@ +/* +Copyright 2019 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package resourcelock + +import ( + "bytes" + "context" + "encoding/json" + + apierrors "k8s.io/apimachinery/pkg/api/errors" +) + +const ( + UnknownLeader = "leaderelection.k8s.io/unknown" +) + +// MultiLock is used for lock's migration +type MultiLock struct { + Primary Interface + Secondary Interface +} + +// Get returns the older election record of the lock +func (ml *MultiLock) Get(ctx context.Context) (*LeaderElectionRecord, []byte, error) { + primary, primaryRaw, err := ml.Primary.Get(ctx) + if err != nil { + return nil, nil, err + } + + secondary, secondaryRaw, err := ml.Secondary.Get(ctx) + if err != nil { + // Lock is held by old client + if apierrors.IsNotFound(err) && primary.HolderIdentity != ml.Identity() { + return primary, primaryRaw, nil + } + return nil, nil, err + } + + if primary.HolderIdentity != secondary.HolderIdentity { + primary.HolderIdentity = UnknownLeader + primaryRaw, err = json.Marshal(primary) + if err != nil { + return nil, nil, err + } + } + return primary, ConcatRawRecord(primaryRaw, secondaryRaw), nil +} + +// Create attempts to create both primary lock and secondary lock +func (ml *MultiLock) Create(ctx context.Context, ler LeaderElectionRecord) error { + err := ml.Primary.Create(ctx, ler) + if err != nil && !apierrors.IsAlreadyExists(err) { + return err + } + return ml.Secondary.Create(ctx, ler) +} + +// Update will update and existing annotation on both two resources. 
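// Editorial note: if the secondary record does not exist yet, Update creates it
// instead of failing, so a migration can start from a state where only the
// primary lock exists.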
+func (ml *MultiLock) Update(ctx context.Context, ler LeaderElectionRecord) error { + err := ml.Primary.Update(ctx, ler) + if err != nil { + return err + } + _, _, err = ml.Secondary.Get(ctx) + if err != nil && apierrors.IsNotFound(err) { + return ml.Secondary.Create(ctx, ler) + } + return ml.Secondary.Update(ctx, ler) +} + +// RecordEvent in leader election while adding meta-data +func (ml *MultiLock) RecordEvent(s string) { + ml.Primary.RecordEvent(s) + ml.Secondary.RecordEvent(s) +} + +// Describe is used to convert details on current resource lock +// into a string +func (ml *MultiLock) Describe() string { + return ml.Primary.Describe() +} + +// Identity returns the Identity of the lock +func (ml *MultiLock) Identity() string { + return ml.Primary.Identity() +} + +func ConcatRawRecord(primaryRaw, secondaryRaw []byte) []byte { + return bytes.Join([][]byte{primaryRaw, secondaryRaw}, []byte(",")) +} diff --git a/vendor/modules.txt b/vendor/modules.txt index e2784b108..17c5e0aaa 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1303,6 +1303,8 @@ k8s.io/client-go/tools/clientcmd k8s.io/client-go/tools/clientcmd/api k8s.io/client-go/tools/clientcmd/api/latest k8s.io/client-go/tools/clientcmd/api/v1 +k8s.io/client-go/tools/leaderelection +k8s.io/client-go/tools/leaderelection/resourcelock k8s.io/client-go/tools/metrics k8s.io/client-go/tools/pager k8s.io/client-go/tools/reference