From 5022abb7b79b96ccc7fb9c5c23607a28b0c6c418 Mon Sep 17 00:00:00 2001 From: StartE Date: Fri, 27 Mar 2026 07:17:02 +0000 Subject: [PATCH 01/20] support argo flow --- pkg/helper/k8smeta/k8s_meta_cache.go | 66 ++-- pkg/helper/k8smeta/k8s_meta_const.go | 32 ++ .../k8smeta/k8s_meta_cr_unified_cache.go | 281 ++++++++++++++++++ pkg/helper/k8smeta/k8s_meta_informer_auth.go | 24 ++ pkg/helper/k8smeta/k8s_meta_link.go | 79 +++++ .../input/kubernetesmetav2/meta_collector.go | 21 ++ .../kubernetesmetav2/meta_collector_cr.go | 100 +++++++ .../input/kubernetesmetav2/service_meta.go | 20 +- 8 files changed, 605 insertions(+), 18 deletions(-) create mode 100644 pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go create mode 100644 pkg/helper/k8smeta/k8s_meta_informer_auth.go create mode 100644 plugins/input/kubernetesmetav2/meta_collector_cr.go diff --git a/pkg/helper/k8smeta/k8s_meta_cache.go b/pkg/helper/k8smeta/k8s_meta_cache.go index 6ff2ad926f..53b1cfb4a5 100644 --- a/pkg/helper/k8smeta/k8s_meta_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cache.go @@ -3,6 +3,7 @@ package k8smeta import ( "context" "fmt" + "sync" "time" app "k8s.io/api/apps/v1" @@ -34,6 +35,11 @@ type k8sMetaCache struct { resourceType string schema *runtime.Scheme + + authFailMu sync.Mutex + authFailCount int + authGiveUp chan struct{} + authGiveUpOnce sync.Once } func newK8sMetaCache(stopCh chan struct{}, resourceType string) *k8sMetaCache { @@ -44,6 +50,7 @@ func newK8sMetaCache(stopCh chan struct{}, resourceType string) *k8sMetaCache { m.metaStore = NewDeferredDeletionMetaStore(m.eventCh, m.stopCh, 120, cache.MetaNamespaceKeyFunc, idxRules...) 
m.resourceType = resourceType m.schema = runtime.NewScheme() + m.authGiveUp = make(chan struct{}) _ = v1.AddToScheme(m.schema) _ = batch.AddToScheme(m.schema) _ = batchv1beta1.AddToScheme(m.schema) @@ -90,11 +97,39 @@ func (m *k8sMetaCache) UnRegisterSendFunc(key string) { } func (m *k8sMetaCache) watch(stopCh <-chan struct{}) { + _ = stopCh // MetaCache uses m.stopCh for global shutdown; parameter kept for interface compatibility. defer panicRecover() factory, informer := m.getFactoryInformer() if informer == nil { return } + mergedStop := make(chan struct{}) + go func() { + select { + case <-m.stopCh: + case <-m.authGiveUp: + } + close(mergedStop) + }() + if err := informer.SetWatchErrorHandler(func(_ *cache.Reflector, err error) { + if err != nil { + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "resourceType", m.resourceType, "watchError", err) + if isInformerAuthFailure(err) { + m.authFailMu.Lock() + m.authFailCount++ + n := m.authFailCount + m.authFailMu.Unlock() + if n >= informerAuthFailureStopAfter { + m.authGiveUpOnce.Do(func() { + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "stopping informer after repeated RBAC/auth errors (no further retries)", "resourceType", m.resourceType, "failures", n) + close(m.authGiveUp) + }) + } + } + } + }); err != nil { + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "fail to set watch error handler", err) + } _, _ = informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { defer panicRecover() @@ -137,11 +172,17 @@ func (m *k8sMetaCache) watch(stopCh <-chan struct{}) { metaManager.deleteEventCount.Add(1) }, }) - go factory.Start(stopCh) - // wait infinite for first cache sync success + go factory.Start(mergedStop) + // wait for first cache sync success, or stop when RBAC limit merges stopCh for { - if !cache.WaitForCacheSync(stopCh, informer.HasSynced) { - logger.Error(context.Background(), K8sMetaUnifyErrorCode, "service cache sync timeout") + if 
!cache.WaitForCacheSync(mergedStop, informer.HasSynced) { + select { + case <-mergedStop: + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "informer cache sync aborted", "resourceType", m.resourceType) + return + default: + } + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "service cache sync timeout", "resourceType", m.resourceType) time.Sleep(1 * time.Second) } else { break @@ -171,10 +212,10 @@ func (m *k8sMetaCache) getFactoryInformer() (informers.SharedInformerFactory, ca informer = factory.Apps().V1().StatefulSets().Informer() case DAEMONSET: informer = factory.Apps().V1().DaemonSets().Informer() - case CRONJOB: - informer = m.getCronJobInformer(factory) - case JOB: - informer = factory.Batch().V1().Jobs().Informer() + // case CRONJOB: + // informer = m.getCronJobInformer(factory) + // case JOB: + // informer = factory.Batch().V1().Jobs().Informer() case NODE: informer = factory.Core().V1().Nodes().Informer() case NAMESPACE: @@ -198,15 +239,6 @@ func (m *k8sMetaCache) getFactoryInformer() (informers.SharedInformerFactory, ca if informer == nil { return factory, nil } - // add watch error handler - err := informer.SetWatchErrorHandler(func(r *cache.Reflector, err error) { - if err != nil { - logger.Error(context.Background(), K8sMetaUnifyErrorCode, "resourceType", m.resourceType, "watchError", err) - } - }) - if err != nil { - logger.Error(context.Background(), K8sMetaUnifyErrorCode, "fail to handle watch error handler", err) - } return factory, informer } diff --git a/pkg/helper/k8smeta/k8s_meta_const.go b/pkg/helper/k8smeta/k8s_meta_const.go index 61d38f9fe8..f02df0184a 100644 --- a/pkg/helper/k8smeta/k8s_meta_const.go +++ b/pkg/helper/k8smeta/k8s_meta_const.go @@ -5,6 +5,7 @@ import ( batch "k8s.io/api/batch/v1" v1 "k8s.io/api/core/v1" networking "k8s.io/api/networking/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" ) const ( @@ -27,6 +28,8 @@ const ( STORAGECLASS = "storageclass" INGRESS = "ingress" CONTAINER = 
"container" + // CUSTOM_RESOURCE_ARGO_WORKFLOW is the unified MetaCache key / event ResourceType for Argo Workflow CR entities. + CUSTOM_RESOURCE_ARGO_WORKFLOW = "customresource/argoproj.io/workflow" // entity link type, the direction is from resource which will be trigger to linked resource //revive:disable:var-naming LINK_SPLIT_CHARACTER = "->" @@ -43,8 +46,21 @@ const ( POD_SERVICE = "pod->service" POD_CONTAINER = "pod->container" INGRESS_SERVICE = "ingress->service" + POD_ARGO_WORKFLOW = "pod->customresource/argoproj.io/workflow" //revive:enable:var-naming + // ArgoWorkflowKind is the Kubernetes kind for argoproj.io Workflow CRs. + ArgoWorkflowKind = "Workflow" + + // DefaultArgoWorkflowAPIGroup is the Workflow CRD API group (ownerRef APIVersion match + dynamic informer Group). + DefaultArgoWorkflowAPIGroup = "argoproj.io" + // DefaultArgoWorkflowAPIVersion is the Workflow informer API version. + DefaultArgoWorkflowAPIVersion = "v1alpha1" + // DefaultArgoWorkflowResource is the Workflow informer resource name (plural). + DefaultArgoWorkflowResource = "workflows" + // DefaultArgoWorkflowPodLabelKey is the Pod label used as fallback to resolve Workflow name. + DefaultArgoWorkflowPodLabelKey = "workflows.argoproj.io/workflow" + // add namespace link //revive:disable:var-naming POD_NAMESPACE = "pod->namespace" @@ -202,6 +218,22 @@ type IngressNamespace struct { Namespace *v1.Namespace } +// ArgoWorkflowCollectorOptions configures Pod↔Workflow link matching and the Workflow dynamic informer GVR. +// A zero value means: use DefaultArgoWorkflowAPIGroup, DefaultArgoWorkflowAPIVersion, DefaultArgoWorkflowResource, DefaultArgoWorkflowPodLabelKey. +// Apply via MetaManager.ConfigureArgoWorkflowCollector before the first EnsureArgoWorkflowInformerStarted. +type ArgoWorkflowCollectorOptions struct { + APIGroup string + APIVersion string + Resource string + PodWorkflowLabelKey string +} + +// PodArgoWorkflow links a Pod to an Argo Workflow CR (unstructured). 
+type PodArgoWorkflow struct { + Pod *v1.Pod + Workflow *unstructured.Unstructured +} + const ( EventTypeAdd = "add" EventTypeUpdate = "update" diff --git a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go new file mode 100644 index 0000000000..14876cb8b9 --- /dev/null +++ b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go @@ -0,0 +1,281 @@ +package k8smeta + +import ( + "context" + "fmt" + "sync" + "time" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/dynamic" + "k8s.io/client-go/dynamic/dynamicinformer" + "k8s.io/client-go/kubernetes" + "k8s.io/client-go/rest" + "k8s.io/client-go/tools/cache" + + "github.com/alibaba/ilogtail/pkg/logger" +) + +// ArgoWorkflowGVR is the default GVR for Argo Workflows; override before first informer start via ConfigureArgoWorkflowCollector. +var ArgoWorkflowGVR = schema.GroupVersionResource{ + Group: DefaultArgoWorkflowAPIGroup, + Version: DefaultArgoWorkflowAPIVersion, + Resource: DefaultArgoWorkflowResource, +} + +// crUnifiedCache is a single MetaCache for configured third-party CRs (currently Argo Workflow only). 
+type crUnifiedCache struct { + metaStore *DeferredDeletionMetaStore + eventCh chan *K8sMetaEvent + stopCh chan struct{} + + resourceType string + gvr schema.GroupVersionResource + + mu sync.Mutex + dynamicClient dynamic.Interface + informer cache.SharedIndexInformer + factory dynamicinformer.DynamicSharedInformerFactory + watchStarted bool + watchStartOnce sync.Once + + authFailMu sync.Mutex + authFailCount int + authGiveUp chan struct{} + authGiveUpOnce sync.Once +} + +func newCRUnifiedCache(stopCh chan struct{}, resourceType string, gvr schema.GroupVersionResource) *crUnifiedCache { + c := &crUnifiedCache{ + stopCh: stopCh, + resourceType: resourceType, + gvr: gvr, + eventCh: make(chan *K8sMetaEvent, 100), + } + c.metaStore = NewDeferredDeletionMetaStore(c.eventCh, stopCh, 120, cache.MetaNamespaceKeyFunc, generateCommonKey) + c.authGiveUp = make(chan struct{}) + return c +} + +func (c *crUnifiedCache) init(_ *kubernetes.Clientset) { + // Built-in clientset unused; dynamic client is wired via setRESTConfig from MetaManager.Init. +} + +// SetWorkflowGVRIfNotStarted updates the Workflow informer GVR before the dynamic informer starts; later calls are ignored. +func (c *crUnifiedCache) SetWorkflowGVRIfNotStarted(gvr schema.GroupVersionResource) { + c.mu.Lock() + defer c.mu.Unlock() + if c.watchStarted { + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "argo workflow informer already started; GVR change ignored", "gvr", gvr.String()) + return + } + c.gvr = gvr +} + +func restConfigForDynamicClient(cfg *rest.Config) *rest.Config { + if cfg == nil { + return nil + } + d := *cfg + // Dynamic client + unstructured ListWatch expect JSON; shared *rest.Config uses protobuf for clientset. 
+ d.ContentType = runtime.ContentTypeJSON + d.AcceptContentTypes = runtime.ContentTypeJSON + return &d +} + +func (c *crUnifiedCache) setRESTConfig(cfg *rest.Config) error { + if cfg == nil { + return fmt.Errorf("nil rest.Config") + } + c.mu.Lock() + defer c.mu.Unlock() + dyn, err := dynamic.NewForConfig(restConfigForDynamicClient(cfg)) + if err != nil { + return err + } + c.dynamicClient = dyn + return nil +} + +// EnsureWatchStarted starts the dynamic informer (once) when the dynamic client is ready. +// Important: never enter sync.Once when dynamicClient is nil — Once would still count as done and block forever. +func (c *crUnifiedCache) EnsureWatchStarted() { + c.mu.Lock() + ready := c.dynamicClient != nil + c.mu.Unlock() + if !ready { + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "dynamic client not ready, skip argo workflow informer; ensure MetaManager.Init completed") + return + } + c.watchStartOnce.Do(func() { + c.mu.Lock() + if c.dynamicClient == nil { + c.mu.Unlock() + return + } + c.metaStore.Start() + c.factory = dynamicinformer.NewDynamicSharedInformerFactory(c.dynamicClient, time.Hour) + c.informer = c.factory.ForResource(c.gvr).Informer() + _, _ = c.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + AddFunc: func(obj interface{}) { + defer panicRecover() + u := objectToUnstructured(obj) + if u == nil { + return + } + trimWorkflowObjectForCache(u) + now := time.Now().Unix() + c.eventCh <- &K8sMetaEvent{ + EventType: EventTypeAdd, + Object: &ObjectWrapper{ + ResourceType: c.resourceType, + Raw: u, + FirstObservedTime: now, + LastObservedTime: now, + }, + } + metaManager.addEventCount.Add(1) + }, + UpdateFunc: func(_, obj interface{}) { + defer panicRecover() + u := objectToUnstructured(obj) + if u == nil { + return + } + trimWorkflowObjectForCache(u) + now := time.Now().Unix() + c.eventCh <- &K8sMetaEvent{ + EventType: EventTypeUpdate, + Object: &ObjectWrapper{ + ResourceType: c.resourceType, + Raw: u, + FirstObservedTime: now, 
+ LastObservedTime: now, + }, + } + metaManager.updateEventCount.Add(1) + }, + DeleteFunc: func(obj interface{}) { + defer panicRecover() + u := objectToUnstructured(obj) + if u == nil { + return + } + trimWorkflowObjectForCache(u) + c.eventCh <- &K8sMetaEvent{ + EventType: EventTypeDelete, + Object: &ObjectWrapper{ + ResourceType: c.resourceType, + Raw: u, + LastObservedTime: time.Now().Unix(), + }, + } + metaManager.deleteEventCount.Add(1) + }, + }) + if err := c.informer.SetWatchErrorHandler(func(_ *cache.Reflector, err error) { + if err != nil { + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "resourceType", c.resourceType, "watchError", err) + if isInformerAuthFailure(err) { + c.authFailMu.Lock() + c.authFailCount++ + n := c.authFailCount + c.authFailMu.Unlock() + if n >= informerAuthFailureStopAfter { + c.authGiveUpOnce.Do(func() { + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "stopping dynamic informer after repeated RBAC/auth errors (no further retries)", "resourceType", c.resourceType, "gvr", c.gvr.String(), "failures", n) + close(c.authGiveUp) + }) + } + } + } + }); err != nil { + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "fail to set dynamic informer watch error handler", err) + } + c.watchStarted = true + inf := c.informer + gvr := c.gvr + c.mu.Unlock() + + mergedStop := make(chan struct{}) + go func() { + select { + case <-c.stopCh: + case <-c.authGiveUp: + } + close(mergedStop) + }() + go c.factory.Start(mergedStop) + go func() { + for { + if cache.WaitForCacheSync(mergedStop, inf.HasSynced) { + logger.Info(context.Background(), "dynamic informer cache synced", "gvr", gvr.String()) + return + } + select { + case <-mergedStop: + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "dynamic informer cache sync aborted", "gvr", gvr.String()) + return + default: + } + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "dynamic informer cache sync timeout", "gvr", gvr.String()) + 
time.Sleep(time.Second) + } + }() + }) +} + +func (c *crUnifiedCache) watch(<-chan struct{}) {} + +func (c *crUnifiedCache) Get(key []string) map[string][]*ObjectWrapper { + return c.metaStore.Get(key) +} + +func (c *crUnifiedCache) GetSize() int { + return len(c.metaStore.Items) +} + +func (c *crUnifiedCache) GetQueueSize() int { + return len(c.eventCh) +} + +func (c *crUnifiedCache) List() []*ObjectWrapper { + return c.metaStore.List() +} + +func (c *crUnifiedCache) Filter(filterFunc func(*ObjectWrapper) bool, limit int) []*ObjectWrapper { + return c.metaStore.Filter(filterFunc, limit) +} + +func (c *crUnifiedCache) RegisterSendFunc(key string, sendFunc SendFunc, interval int) { + c.EnsureWatchStarted() + c.metaStore.RegisterSendFunc(key, sendFunc, interval) + logger.Debug(context.Background(), "register send func", c.resourceType) +} + +func (c *crUnifiedCache) UnRegisterSendFunc(key string) { + c.metaStore.UnRegisterSendFunc(key) +} + +func objectToUnstructured(obj interface{}) *unstructured.Unstructured { + if u, ok := obj.(*unstructured.Unstructured); ok { + return u.DeepCopy() + } + if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok { + if u, ok := tombstone.Obj.(*unstructured.Unstructured); ok { + return u.DeepCopy() + } + } + return nil +} + +// trimWorkflowObjectForCache drops spec and managedFields to limit memory; metadata + status remain for linking and whitelisted export. 
+func trimWorkflowObjectForCache(u *unstructured.Unstructured) { + if u == nil { + return + } + unstructured.RemoveNestedField(u.Object, "spec") + unstructured.RemoveNestedField(u.Object, "metadata", "managedFields") +} diff --git a/pkg/helper/k8smeta/k8s_meta_informer_auth.go b/pkg/helper/k8smeta/k8s_meta_informer_auth.go new file mode 100644 index 0000000000..a7ebb061af --- /dev/null +++ b/pkg/helper/k8smeta/k8s_meta_informer_auth.go @@ -0,0 +1,24 @@ +package k8smeta + +import ( + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// Number of consecutive List/Watch errors treated as RBAC/auth failures before stopping this informer only. +const informerAuthFailureStopAfter = 3 + +func isInformerAuthFailure(err error) bool { + if err == nil { + return false + } + if apierrors.IsForbidden(err) || apierrors.IsUnauthorized(err) { + return true + } + switch apierrors.ReasonForError(err) { + case metav1.StatusReasonForbidden, metav1.StatusReasonUnauthorized: + return true + default: + return false + } +} diff --git a/pkg/helper/k8smeta/k8s_meta_link.go b/pkg/helper/k8smeta/k8s_meta_link.go index b533f724d5..2c92e9f8ce 100644 --- a/pkg/helper/k8smeta/k8s_meta_link.go +++ b/pkg/helper/k8smeta/k8s_meta_link.go @@ -7,11 +7,16 @@ import ( batch "k8s.io/api/batch/v1" v1 "k8s.io/api/core/v1" networking "k8s.io/api/networking/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/labels" ) type LinkGenerator struct { metaCache map[string]MetaCache + // ArgoWorkflowAPIGroup is matched as substring of owner reference APIVersion (empty => DefaultArgoWorkflowAPIGroup). + ArgoWorkflowAPIGroup string + // ArgoWorkflowPodLabelKey is the Pod label key for Workflow name fallback (empty => DefaultArgoWorkflowPodLabelKey). 
+ ArgoWorkflowPodLabelKey string } func NewK8sMetaLinkGenerator(metaCache map[string]MetaCache) *LinkGenerator { @@ -56,6 +61,8 @@ func (g *LinkGenerator) GenerateLinks(events []*K8sMetaEvent, linkType string) [ return g.getReplicaSetDeploymentLink(events) case INGRESS_SERVICE: return g.getIngressServiceLink(events) + case POD_ARGO_WORKFLOW: + return g.getPodArgoWorkflowLink(events) case POD_NAMESPACE: return g.getPodNamespaceLink(events) case SERVICE_NAMESPACE: @@ -565,6 +572,78 @@ func (g *LinkGenerator) getIngressServiceLink(ingressList []*K8sMetaEvent) []*K8 return result } +func (g *LinkGenerator) argoWorkflowAPIGroup() string { + if g != nil && g.ArgoWorkflowAPIGroup != "" { + return g.ArgoWorkflowAPIGroup + } + return DefaultArgoWorkflowAPIGroup +} + +func (g *LinkGenerator) argoWorkflowPodLabelKey() string { + if g != nil && g.ArgoWorkflowPodLabelKey != "" { + return g.ArgoWorkflowPodLabelKey + } + return DefaultArgoWorkflowPodLabelKey +} + +// argoWorkflowNameFromPod resolves the Workflow name: prefer ownerReferences (Argo task pods), +// fall back to configurable Pod label when owner ref is absent or non-standard. 
+func (g *LinkGenerator) argoWorkflowNameFromPod(pod *v1.Pod) (namespace, name string, ok bool) { + apiGroup := g.argoWorkflowAPIGroup() + labelKey := g.argoWorkflowPodLabelKey() + for _, ref := range pod.OwnerReferences { + if ref.Kind == ArgoWorkflowKind && strings.Contains(ref.APIVersion, apiGroup) { + return pod.Namespace, ref.Name, true + } + } + if pod.Labels != nil { + if v := pod.Labels[labelKey]; v != "" { + return pod.Namespace, v, true + } + } + return "", "", false +} + +func (g *LinkGenerator) getPodArgoWorkflowLink(podList []*K8sMetaEvent) []*K8sMetaEvent { + crCache := g.metaCache[CUSTOM_RESOURCE_ARGO_WORKFLOW] + if crCache == nil { + return nil + } + result := make([]*K8sMetaEvent, 0) + for _, data := range podList { + pod, ok := data.Object.Raw.(*v1.Pod) + if !ok { + continue + } + ns, wfName, found := g.argoWorkflowNameFromPod(pod) + if !found || wfName == "" { + continue + } + wfList := crCache.Get([]string{generateNameWithNamespaceKey(ns, wfName)}) + for _, workflows := range wfList { + for _, w := range workflows { + u, ok := w.Raw.(*unstructured.Unstructured) + if !ok { + continue + } + result = append(result, &K8sMetaEvent{ + EventType: data.EventType, + Object: &ObjectWrapper{ + ResourceType: POD_ARGO_WORKFLOW, + Raw: &PodArgoWorkflow{ + Pod: pod, + Workflow: u, + }, + FirstObservedTime: data.Object.FirstObservedTime, + LastObservedTime: data.Object.LastObservedTime, + }, + }) + } + } + } + return result +} + func (g *LinkGenerator) getPodNamespaceLink(podList []*K8sMetaEvent) []*K8sMetaEvent { result := make([]*K8sMetaEvent, 0) for _, data := range podList { diff --git a/plugins/input/kubernetesmetav2/meta_collector.go b/plugins/input/kubernetesmetav2/meta_collector.go index fbed1d39cb..b044241f38 100644 --- a/plugins/input/kubernetesmetav2/meta_collector.go +++ b/plugins/input/kubernetesmetav2/meta_collector.go @@ -50,6 +50,7 @@ func (m *metaCollector) Start() error { k8smeta.PERSISTENTVOLUMECLAIM: m.processPersistentVolumeClaimEntity, 
k8smeta.STORAGECLASS: m.processStorageClassEntity, k8smeta.INGRESS: m.processIngressEntity, + k8smeta.CUSTOM_RESOURCE_ARGO_WORKFLOW: m.processCustomResourceEntity, k8smeta.POD_NODE: m.processPodNodeLink, k8smeta.POD_DEPLOYMENT: m.processPodDeploymentLink, k8smeta.POD_REPLICASET: m.processPodReplicaSetLink, @@ -63,6 +64,7 @@ func (m *metaCollector) Start() error { k8smeta.POD_SERVICE: m.processPodServiceLink, k8smeta.POD_CONTAINER: m.processPodContainerLink, k8smeta.INGRESS_SERVICE: m.processIngressServiceLink, + k8smeta.POD_ARGO_WORKFLOW: m.processPodArgoWorkflowLink, // add namespace to xx link processor k8smeta.POD_NAMESPACE: m.processPodNamespaceLink, @@ -77,6 +79,15 @@ func (m *metaCollector) Start() error { k8smeta.INGRESS_NAMESPACE: m.processIngressNamespaceLink, } + if needArgoWorkflowInformer(m.serviceK8sMeta) { + m.serviceK8sMeta.metaManager.ConfigureArgoWorkflowCollector(k8smeta.ArgoWorkflowCollectorOptions{ + APIGroup: m.serviceK8sMeta.ArgoWorkflowAPIGroup, + APIVersion: m.serviceK8sMeta.ArgoWorkflowAPIVersion, + Resource: m.serviceK8sMeta.ArgoWorkflowResource, + PodWorkflowLabelKey: m.serviceK8sMeta.ArgoWorkflowPodLabelKey, + }) + m.serviceK8sMeta.metaManager.EnsureArgoWorkflowInformerStarted() + } if m.serviceK8sMeta.Pod { m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.POD, m.handleEvent, m.serviceK8sMeta.Interval) } @@ -122,6 +133,9 @@ func (m *metaCollector) Start() error { if m.serviceK8sMeta.Ingress { m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.INGRESS, m.handleEvent, m.serviceK8sMeta.Interval) } + if m.serviceK8sMeta.ArgoWorkflow { + m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.CUSTOM_RESOURCE_ARGO_WORKFLOW, m.handleEvent, m.serviceK8sMeta.Interval) + } if m.serviceK8sMeta.Pod && m.serviceK8sMeta.Node && 
m.serviceK8sMeta.Node2Pod != "" { m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.POD_NODE, m.handleEvent, m.serviceK8sMeta.Interval) @@ -162,6 +176,9 @@ func (m *metaCollector) Start() error { if m.serviceK8sMeta.Ingress && m.serviceK8sMeta.Service && m.serviceK8sMeta.Ingress2Service != "" { m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.INGRESS_SERVICE, m.handleEvent, m.serviceK8sMeta.Interval) } + if m.serviceK8sMeta.Pod && m.serviceK8sMeta.Workflow2Pod != "" { + m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.POD_ARGO_WORKFLOW, m.handleEvent, m.serviceK8sMeta.Interval) + } if m.serviceK8sMeta.Namespace && m.serviceK8sMeta.Pod && m.serviceK8sMeta.Namespace2Pod != "" { m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.POD_NAMESPACE, m.handleEvent, m.serviceK8sMeta.Interval) } @@ -477,6 +494,10 @@ func isEntity(resourceType string) bool { return !strings.Contains(resourceType, k8smeta.LINK_SPLIT_CHARACTER) } +func needArgoWorkflowInformer(s *ServiceK8sMeta) bool { + return s.ArgoWorkflow || (s.Pod && s.Workflow2Pod != "") +} + func safeGetInt32String(pointer *int32) string { if pointer == nil { return "" diff --git a/plugins/input/kubernetesmetav2/meta_collector_cr.go b/plugins/input/kubernetesmetav2/meta_collector_cr.go new file mode 100644 index 0000000000..3c9c7ff704 --- /dev/null +++ b/plugins/input/kubernetesmetav2/meta_collector_cr.go @@ -0,0 +1,100 @@ +package kubernetesmetav2 + +import ( + "strings" + "time" + + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + + "github.com/alibaba/ilogtail/pkg/helper/k8smeta" + "github.com/alibaba/ilogtail/pkg/models" +) + +func filterStringMapByAllowList(m map[string]string, allow []string) map[string]string { + if len(allow) 
== 0 || len(m) == 0 { + return nil + } + out := make(map[string]string) + for _, k := range allow { + if k == "" { + continue + } + if v, ok := m[k]; ok { + out[k] = v + } + } + if len(out) == 0 { + return nil + } + return out +} + +func pickUnstructuredFieldCopy(obj map[string]interface{}, paths []string) map[string]interface{} { + if len(paths) == 0 || obj == nil { + return nil + } + out := make(map[string]interface{}) + for _, p := range paths { + if p == "" { + continue + } + parts := strings.Split(p, ".") + if v, found, err := unstructured.NestedFieldCopy(obj, parts...); found && err == nil { + out[p] = v + } + } + if len(out) == 0 { + return nil + } + return out +} + +func (m *metaCollector) processCustomResourceEntity(data *k8smeta.ObjectWrapper, method string) []models.PipelineEvent { + obj, ok := data.Raw.(*unstructured.Unstructured) + if !ok { + return nil + } + log := &models.Log{} + log.Contents = models.NewLogContents() + log.Timestamp = uint64(time.Now().Unix()) + kindKey := k8smeta.CUSTOM_RESOURCE_ARGO_WORKFLOW + m.processEntityCommonPart(log.Contents, kindKey, obj.GetNamespace(), obj.GetName(), method, data.FirstObservedTime, data.LastObservedTime, obj.GetCreationTimestamp()) + log.Contents.Add(entityKindFieldName, k8smeta.ArgoWorkflowKind) + log.Contents.Add("api_version", obj.GetAPIVersion()) + log.Contents.Add("namespace", obj.GetNamespace()) + + // Labels/annotations: same switches as built-in entities (EnableLabels / EnableAnnotations). + // If CustomResourceWorkflow*AllowList is non-empty, only those keys are emitted (subset mode; does not require Enable*). 
+ if len(m.serviceK8sMeta.CustomResourceWorkflowLabelAllowList) > 0 { + if labels := filterStringMapByAllowList(obj.GetLabels(), m.serviceK8sMeta.CustomResourceWorkflowLabelAllowList); labels != nil { + log.Contents.Add("labels", m.processEntityJSONObject(labels)) + } + } else if m.serviceK8sMeta.EnableLabels { + log.Contents.Add("labels", m.processEntityJSONObject(obj.GetLabels())) + } + if len(m.serviceK8sMeta.CustomResourceWorkflowAnnotationAllowList) > 0 { + if annos := filterStringMapByAllowList(obj.GetAnnotations(), m.serviceK8sMeta.CustomResourceWorkflowAnnotationAllowList); annos != nil { + log.Contents.Add("annotations", m.processEntityJSONObject(annos)) + } + } else if m.serviceK8sMeta.EnableAnnotations { + log.Contents.Add("annotations", m.processEntityJSONObject(obj.GetAnnotations())) + } + if statusObj := pickUnstructuredFieldCopy(obj.Object, m.serviceK8sMeta.CustomResourceWorkflowStatusPathAllowList); statusObj != nil { + log.Contents.Add("status", m.processEntityJSONObject(statusObj)) + } + return []models.PipelineEvent{log} +} + +func (m *metaCollector) processPodArgoWorkflowLink(data *k8smeta.ObjectWrapper, method string) []models.PipelineEvent { + obj, ok := data.Raw.(*k8smeta.PodArgoWorkflow) + if !ok { + return nil + } + log := &models.Log{} + log.Contents = models.NewLogContents() + m.processEntityLinkCommonPart(log.Contents, k8smeta.CUSTOM_RESOURCE_ARGO_WORKFLOW, obj.Workflow.GetNamespace(), obj.Workflow.GetName(), + obj.Pod.Kind, obj.Pod.Namespace, obj.Pod.Name, method, data.FirstObservedTime, data.LastObservedTime) + log.Contents.Add(entityLinkRelationTypeFieldName, m.serviceK8sMeta.Workflow2Pod) + log.Timestamp = uint64(time.Now().Unix()) + return []models.PipelineEvent{log} +} diff --git a/plugins/input/kubernetesmetav2/service_meta.go b/plugins/input/kubernetesmetav2/service_meta.go index 1d99c224f2..615058ae98 100644 --- a/plugins/input/kubernetesmetav2/service_meta.go +++ b/plugins/input/kubernetesmetav2/service_meta.go @@ -31,7 +31,25 
@@ type ServiceK8sMeta struct { StorageClass bool Ingress bool Container bool - // labels and annotations switch + // ArgoWorkflow enables collecting argoproj.io/v1alpha1 Workflow CRs as entities (dynamic informer). + ArgoWorkflow bool + // Workflow2Pod sets __relation_type__ for Pod–Argo Workflow entity links (non-empty to enable link collection when Pod is enabled). + Workflow2Pod string + // ArgoWorkflowAPIGroup overrides k8smeta.DefaultArgoWorkflowAPIGroup for ownerRef matching and informer Group (empty = default). + ArgoWorkflowAPIGroup string + // ArgoWorkflowAPIVersion overrides k8smeta.DefaultArgoWorkflowAPIVersion for the Workflow informer (empty = default). + ArgoWorkflowAPIVersion string + // ArgoWorkflowResource overrides k8smeta.DefaultArgoWorkflowResource for the Workflow informer (empty = default). + ArgoWorkflowResource string + // ArgoWorkflowPodLabelKey overrides k8smeta.DefaultArgoWorkflowPodLabelKey for Pod label fallback (empty = default). + ArgoWorkflowPodLabelKey string + // CustomResourceWorkflowLabelAllowList: if non-empty, only these label keys are emitted (subset); when empty, use EnableLabels for full labels like other entities. + CustomResourceWorkflowLabelAllowList []string + // CustomResourceWorkflowAnnotationAllowList: if non-empty, only these annotation keys are emitted (subset); when empty, use EnableAnnotations for full annotations. + CustomResourceWorkflowAnnotationAllowList []string + // CustomResourceWorkflowStatusPathAllowList: JSON paths under the object root (e.g. "status.phase", "status.startedAt"). + CustomResourceWorkflowStatusPathAllowList []string + // EnableLabels / EnableAnnotations: when true, emit full labels/annotations on entities; Argo Workflow CR entities use the same flags when the allow lists above are empty. 
EnableLabels bool EnableAnnotations bool // link switch From 3a828a41f145f974e0024606732ed363bcb7d642 Mon Sep 17 00:00:00 2001 From: StartE Date: Fri, 27 Mar 2026 08:56:00 +0000 Subject: [PATCH 02/20] update --- pkg/helper/k8smeta/k8s_meta_const.go | 66 +++---- .../k8smeta/k8s_meta_cr_unified_cache.go | 17 +- .../k8smeta/k8s_meta_custom_resource.go | 106 +++++++++++ .../k8s_meta_deferred_deletion_meta_store.go | 27 +++ pkg/helper/k8smeta/k8s_meta_link.go | 97 ++++++---- pkg/helper/k8smeta/k8s_meta_manager.go | 94 +++++++++- .../k8smeta/k8s_meta_namespace_policy.go | 168 ++++++++++++++++++ .../k8smeta/k8s_meta_namespace_policy_test.go | 109 ++++++++++++ .../input/kubernetesmetav2/meta_collector.go | 57 +++--- .../kubernetesmetav2/meta_collector_cr.go | 57 ++++-- .../input/kubernetesmetav2/service_meta.go | 49 +++-- 11 files changed, 686 insertions(+), 161 deletions(-) create mode 100644 pkg/helper/k8smeta/k8s_meta_custom_resource.go create mode 100644 pkg/helper/k8smeta/k8s_meta_namespace_policy.go create mode 100644 pkg/helper/k8smeta/k8s_meta_namespace_policy_test.go diff --git a/pkg/helper/k8smeta/k8s_meta_const.go b/pkg/helper/k8smeta/k8s_meta_const.go index f02df0184a..f882cc345d 100644 --- a/pkg/helper/k8smeta/k8s_meta_const.go +++ b/pkg/helper/k8smeta/k8s_meta_const.go @@ -12,25 +12,22 @@ const ( EntityCollectorUserAgent = "loongcollector-singleton" // entity type - POD = "pod" - SERVICE = "service" - DEPLOYMENT = "deployment" - REPLICASET = "replicaset" - STATEFULSET = "statefulset" - DAEMONSET = "daemonset" - CRONJOB = "cronjob" - JOB = "job" - NODE = "node" - NAMESPACE = "namespace" - CONFIGMAP = "configmap" - PERSISTENTVOLUME = "persistentvolume" - PERSISTENTVOLUMECLAIM = "persistentvolumeclaim" - STORAGECLASS = "storageclass" - INGRESS = "ingress" - CONTAINER = "container" - // CUSTOM_RESOURCE_ARGO_WORKFLOW is the unified MetaCache key / event ResourceType for Argo Workflow CR entities. 
- CUSTOM_RESOURCE_ARGO_WORKFLOW = "customresource/argoproj.io/workflow" - // entity link type, the direction is from resource which will be trigger to linked resource + POD = "pod" + SERVICE = "service" + DEPLOYMENT = "deployment" + REPLICASET = "replicaset" + STATEFULSET = "statefulset" + DAEMONSET = "daemonset" + CRONJOB = "cronjob" + JOB = "job" + NODE = "node" + NAMESPACE = "namespace" + CONFIGMAP = "configmap" + PERSISTENTVOLUME = "persistentvolume" + PERSISTENTVOLUMECLAIM = "persistentvolumeclaim" + STORAGECLASS = "storageclass" + INGRESS = "ingress" + CONTAINER = "container" //revive:disable:var-naming LINK_SPLIT_CHARACTER = "->" POD_NODE = "pod->node" @@ -46,21 +43,8 @@ const ( POD_SERVICE = "pod->service" POD_CONTAINER = "pod->container" INGRESS_SERVICE = "ingress->service" - POD_ARGO_WORKFLOW = "pod->customresource/argoproj.io/workflow" //revive:enable:var-naming - // ArgoWorkflowKind is the Kubernetes kind for argoproj.io Workflow CRs. - ArgoWorkflowKind = "Workflow" - - // DefaultArgoWorkflowAPIGroup is the Workflow CRD API group (ownerRef APIVersion match + dynamic informer Group). - DefaultArgoWorkflowAPIGroup = "argoproj.io" - // DefaultArgoWorkflowAPIVersion is the Workflow informer API version. - DefaultArgoWorkflowAPIVersion = "v1alpha1" - // DefaultArgoWorkflowResource is the Workflow informer resource name (plural). - DefaultArgoWorkflowResource = "workflows" - // DefaultArgoWorkflowPodLabelKey is the Pod label used as fallback to resolve Workflow name. - DefaultArgoWorkflowPodLabelKey = "workflows.argoproj.io/workflow" - // add namespace link //revive:disable:var-naming POD_NAMESPACE = "pod->namespace" @@ -218,20 +202,10 @@ type IngressNamespace struct { Namespace *v1.Namespace } -// ArgoWorkflowCollectorOptions configures Pod↔Workflow link matching and the Workflow dynamic informer GVR. -// A zero value means: use DefaultArgoWorkflowAPIGroup, DefaultArgoWorkflowAPIVersion, DefaultArgoWorkflowResource, DefaultArgoWorkflowPodLabelKey. 
-// Apply via MetaManager.ConfigureArgoWorkflowCollector before the first EnsureArgoWorkflowInformerStarted. -type ArgoWorkflowCollectorOptions struct { - APIGroup string - APIVersion string - Resource string - PodWorkflowLabelKey string -} - -// PodArgoWorkflow links a Pod to an Argo Workflow CR (unstructured). -type PodArgoWorkflow struct { - Pod *v1.Pod - Workflow *unstructured.Unstructured +// PodCustomResource links a Pod to an arbitrary CR stored as unstructured. +type PodCustomResource struct { + Pod *v1.Pod + CR *unstructured.Unstructured } const ( diff --git a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go index 14876cb8b9..fba8a614f9 100644 --- a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go @@ -18,14 +18,7 @@ import ( "github.com/alibaba/ilogtail/pkg/logger" ) -// ArgoWorkflowGVR is the default GVR for Argo Workflows; override before first informer start via ConfigureArgoWorkflowCollector. -var ArgoWorkflowGVR = schema.GroupVersionResource{ - Group: DefaultArgoWorkflowAPIGroup, - Version: DefaultArgoWorkflowAPIVersion, - Resource: DefaultArgoWorkflowResource, -} - -// crUnifiedCache is a single MetaCache for configured third-party CRs (currently Argo Workflow only). +// crUnifiedCache is a MetaCache for one third-party API resource (dynamic informer + unstructured objects). type crUnifiedCache struct { metaStore *DeferredDeletionMetaStore eventCh chan *K8sMetaEvent @@ -63,12 +56,12 @@ func (c *crUnifiedCache) init(_ *kubernetes.Clientset) { // Built-in clientset unused; dynamic client is wired via setRESTConfig from MetaManager.Init. } -// SetWorkflowGVRIfNotStarted updates the Workflow informer GVR before the dynamic informer starts; later calls are ignored. 
-func (c *crUnifiedCache) SetWorkflowGVRIfNotStarted(gvr schema.GroupVersionResource) { +// SetGVRIfNotStarted updates the informer GVR before the dynamic informer starts; later calls are ignored. +func (c *crUnifiedCache) SetGVRIfNotStarted(gvr schema.GroupVersionResource) { c.mu.Lock() defer c.mu.Unlock() if c.watchStarted { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "argo workflow informer already started; GVR change ignored", "gvr", gvr.String()) + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "custom resource informer already started; GVR change ignored", "gvr", gvr.String()) return } c.gvr = gvr @@ -106,7 +99,7 @@ func (c *crUnifiedCache) EnsureWatchStarted() { ready := c.dynamicClient != nil c.mu.Unlock() if !ready { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "dynamic client not ready, skip argo workflow informer; ensure MetaManager.Init completed") + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "dynamic client not ready, skip custom resource informer; ensure MetaManager.Init completed") return } c.watchStartOnce.Do(func() { diff --git a/pkg/helper/k8smeta/k8s_meta_custom_resource.go b/pkg/helper/k8smeta/k8s_meta_custom_resource.go new file mode 100644 index 0000000000..31aa2db938 --- /dev/null +++ b/pkg/helper/k8smeta/k8s_meta_custom_resource.go @@ -0,0 +1,106 @@ +package k8smeta + +import ( + "fmt" + "strings" + + "k8s.io/apimachinery/pkg/runtime/schema" +) + +// CustomResourceCollectorConfig describes one third-party API resource collected via a dynamic informer. +// Use MetaManager.RegisterCustomResourceCollector after GetMetaManagerInstance and before or after Init +// (late registration will receive the REST config stored at Init). +// +// YAML-friendly field names match JSON tags when used in pipeline configs. +type CustomResourceCollectorConfig struct { + // EntityType is required: internal cache key and K8sMetaEvent.ResourceType (e.g. customresource/argoproj.io/workflow). 
+ // It drives __entity_type__, __entity_id__, and pod->{EntityType} links — set explicitly in pipeline config. + EntityType string `json:"EntityType,omitempty"` + + APIGroup string `json:"APIGroup,omitempty"` + APIVersion string `json:"APIVersion,omitempty"` + Resource string `json:"Resource,omitempty"` // plural resource name + Kind string `json:"Kind,omitempty"` // Kubernetes kind, for ownerReferences matching and export + + // PodLink, if set, registers a Pod → this CR link generator (link type: PodLinkTypeForEntity(EntityType)). + PodLink *PodToCustomResourceLinkConfig `json:"PodLink,omitempty"` + // CollectEntity registers entity collection (K8sMetaEvent stream) for this CR. + CollectEntity bool `json:"CollectEntity,omitempty"` + // Entity2PodRelation is __relation_type__ on entity_link logs (custom resource → Pod). Required when Pod link export is enabled together with PodLink. + Entity2PodRelation string `json:"Entity2PodRelation,omitempty"` + + // EnableLabels, if true, exports full labels when LabelAllowList is empty. Ignores ServiceK8sMeta.EnableLabels. Default false. + EnableLabels bool `json:"EnableLabels,omitempty"` + // EnableAnnotations, if true, exports full annotations when AnnotationAllowList is empty. Ignores ServiceK8sMeta.EnableAnnotations. Default false. + EnableAnnotations bool `json:"EnableAnnotations,omitempty"` + + // Export lists for entity logs (optional; when non-empty, only listed keys are exported regardless of Enable*). + LabelAllowList []string `json:"LabelAllowList,omitempty"` + AnnotationAllowList []string `json:"AnnotationAllowList,omitempty"` + StatusPathAllowList []string `json:"StatusPathAllowList,omitempty"` +} + +// PodToCustomResourceLinkConfig resolves which Workflow-like object a Pod belongs to. +type PodToCustomResourceLinkConfig struct { + // OwnerKind matches Pod ownerReferences[].Kind (e.g. Workflow). 
+ OwnerKind string `json:"OwnerKind,omitempty"` + // OwnerAPIGroupContains is matched as substring of ownerReferences[].APIVersion (empty => use collector APIGroup). + OwnerAPIGroupContains string `json:"OwnerAPIGroupContains,omitempty"` + // PodLabelKey fallback when no matching ownerRef (e.g. workflows.argoproj.io/workflow). + PodLabelKey string `json:"PodLabelKey,omitempty"` +} + +// PodLinkTypeForEntity returns the link ResourceType for RegisterSendFunc (e.g. pod->customresource/...). +func PodLinkTypeForEntity(entityType string) string { + return POD + LINK_SPLIT_CHARACTER + entityType +} + +// DefaultEntityType returns the conventional type string customresource/{apiGroup}/{kind}, with both segments trimmed and lowercased. +// It does not apply automatically; EntityType must still be set on the config (Normalize requires it). +func DefaultEntityType(apiGroup, kind string) string { + return fmt.Sprintf("customresource/%s/%s", strings.ToLower(strings.TrimSpace(apiGroup)), strings.ToLower(strings.TrimSpace(kind))) +} + +// ToGVR returns the GroupVersionResource for the dynamic informer. +func (c *CustomResourceCollectorConfig) ToGVR() schema.GroupVersionResource { + return schema.GroupVersionResource{ + Group: c.APIGroup, + Version: c.APIVersion, + Resource: c.Resource, + } +} + +// Normalize validates and fills PodLink defaults. EntityType must be non-empty. Call before RegisterCustomResourceCollector. 
+func (c *CustomResourceCollectorConfig) Normalize() error { + c.APIGroup = strings.TrimSpace(c.APIGroup) + c.APIVersion = strings.TrimSpace(c.APIVersion) + c.Resource = strings.TrimSpace(c.Resource) + c.Kind = strings.TrimSpace(c.Kind) + c.EntityType = strings.TrimSpace(c.EntityType) + + if c.APIGroup == "" || c.APIVersion == "" || c.Resource == "" || c.Kind == "" { + return fmt.Errorf("custom resource collector: APIGroup, APIVersion, Resource, and Kind are required") + } + if c.EntityType == "" { + return fmt.Errorf("custom resource collector: EntityType is required") + } + if pl := c.PodLink; pl != nil { + pl.OwnerKind = strings.TrimSpace(pl.OwnerKind) + pl.OwnerAPIGroupContains = strings.TrimSpace(pl.OwnerAPIGroupContains) + pl.PodLabelKey = strings.TrimSpace(pl.PodLabelKey) + if pl.OwnerKind == "" { + pl.OwnerKind = c.Kind + } + if pl.OwnerAPIGroupContains == "" { + pl.OwnerAPIGroupContains = c.APIGroup + } + } + return nil +} + +func firstNonEmpty(val, def string) string { + if strings.TrimSpace(val) != "" { + return val + } + return def +} diff --git a/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go b/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go index 2159566ff7..28b4f424f2 100644 --- a/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go +++ b/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go @@ -215,6 +215,10 @@ func (m *DeferredDeletionMetaStore) handleAddOrUpdateEvent(event *K8sMetaEvent) logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "handle k8s meta with keyFunc error", err) return } + if !GetMetaManagerInstance().MetaObjectPassesNamespacePolicy(event.Object) { + m.purgeKey(key) + return + } newIdxKeys := m.getIdxKeys(event.Object) m.lock.Lock() // should delete oldIdxKeys in two cases: @@ -269,6 +273,10 @@ func (m *DeferredDeletionMetaStore) handleDeleteEvent(event *K8sMetaEvent) { logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "handle k8s meta with keyFunc error", err) return } + 
if !GetMetaManagerInstance().MetaObjectPassesNamespacePolicy(event.Object) { + m.purgeKey(key) + return + } m.lock.Lock() if obj, ok := m.Items[key]; ok { obj.Deleted = true @@ -316,6 +324,25 @@ func (m *DeferredDeletionMetaStore) handleDeferredDeleteEvent(event *K8sMetaEven } } +// purgeKey removes an object from the store and index if present (no SendFunc). +func (m *DeferredDeletionMetaStore) purgeKey(key string) { + m.lock.Lock() + defer m.lock.Unlock() + obj, ok := m.Items[key] + if !ok { + return + } + for _, idxKey := range m.getIdxKeys(obj) { + if item, ok := m.Index[idxKey]; ok { + item.Remove(key) + if len(item.Keys) == 0 { + delete(m.Index, idxKey) + } + } + } + delete(m.Items, key) +} + func (m *DeferredDeletionMetaStore) handleTimerEvent(event *K8sMetaEvent) { timerEvent := event.Object.Raw.(*TimerEvent) m.registerLock.RLock() diff --git a/pkg/helper/k8smeta/k8s_meta_link.go b/pkg/helper/k8smeta/k8s_meta_link.go index 2c92e9f8ce..3e71dff773 100644 --- a/pkg/helper/k8smeta/k8s_meta_link.go +++ b/pkg/helper/k8smeta/k8s_meta_link.go @@ -2,6 +2,7 @@ package k8smeta import ( "strings" + "sync" app "k8s.io/api/apps/v1" batch "k8s.io/api/batch/v1" @@ -13,18 +14,47 @@ import ( type LinkGenerator struct { metaCache map[string]MetaCache - // ArgoWorkflowAPIGroup is matched as substring of owner reference APIVersion (empty => DefaultArgoWorkflowAPIGroup). - ArgoWorkflowAPIGroup string - // ArgoWorkflowPodLabelKey is the Pod label key for Workflow name fallback (empty => DefaultArgoWorkflowPodLabelKey). 
- ArgoWorkflowPodLabelKey string + + podCRMu sync.RWMutex + podCRByLinkType map[string]*podCRLinkRuntime +} + +type podCRLinkRuntime struct { + entityType string + ownerKind string + ownerAPIGroupSubstr string + podLabelKey string } func NewK8sMetaLinkGenerator(metaCache map[string]MetaCache) *LinkGenerator { return &LinkGenerator{ - metaCache: metaCache, + metaCache: metaCache, + podCRByLinkType: make(map[string]*podCRLinkRuntime), } } +func (g *LinkGenerator) registerPodCRLink(linkType string, rt *podCRLinkRuntime) { + if g == nil || linkType == "" || rt == nil { + return + } + g.podCRMu.Lock() + defer g.podCRMu.Unlock() + if g.podCRByLinkType == nil { + g.podCRByLinkType = make(map[string]*podCRLinkRuntime) + } + g.podCRByLinkType[linkType] = rt +} + +func (g *LinkGenerator) podCRRuntimeForLinkType(linkType string) (*podCRLinkRuntime, bool) { + if g == nil { + return nil, false + } + g.podCRMu.RLock() + defer g.podCRMu.RUnlock() + rt, ok := g.podCRByLinkType[linkType] + return rt, ok +} + func (g *LinkGenerator) GenerateLinks(events []*K8sMetaEvent, linkType string) []*K8sMetaEvent { if len(events) == 0 { return nil @@ -34,6 +64,9 @@ func (g *LinkGenerator) GenerateLinks(events []*K8sMetaEvent, linkType string) [ if !strings.HasPrefix(linkType, resourceType) { return nil } + if rt, ok := g.podCRRuntimeForLinkType(linkType); ok { + return g.getPodCustomResourceLink(events, rt, linkType) + } switch linkType { case POD_NODE: return g.getPodNodeLink(events) @@ -61,8 +94,6 @@ func (g *LinkGenerator) GenerateLinks(events []*K8sMetaEvent, linkType string) [ return g.getReplicaSetDeploymentLink(events) case INGRESS_SERVICE: return g.getIngressServiceLink(events) - case POD_ARGO_WORKFLOW: - return g.getPodArgoWorkflowLink(events) case POD_NAMESPACE: return g.getPodNamespaceLink(events) case SERVICE_NAMESPACE: @@ -572,40 +603,28 @@ func (g *LinkGenerator) getIngressServiceLink(ingressList []*K8sMetaEvent) []*K8 return result } -func (g *LinkGenerator) 
argoWorkflowAPIGroup() string { - if g != nil && g.ArgoWorkflowAPIGroup != "" { - return g.ArgoWorkflowAPIGroup - } - return DefaultArgoWorkflowAPIGroup -} - -func (g *LinkGenerator) argoWorkflowPodLabelKey() string { - if g != nil && g.ArgoWorkflowPodLabelKey != "" { - return g.ArgoWorkflowPodLabelKey +func (g *LinkGenerator) crNameFromPod(pod *v1.Pod, rt *podCRLinkRuntime) (namespace, name string, ok bool) { + if pod == nil || rt == nil { + return "", "", false } - return DefaultArgoWorkflowPodLabelKey -} - -// argoWorkflowNameFromPod resolves the Workflow name: prefer ownerReferences (Argo task pods), -// fall back to configurable Pod label when owner ref is absent or non-standard. -func (g *LinkGenerator) argoWorkflowNameFromPod(pod *v1.Pod) (namespace, name string, ok bool) { - apiGroup := g.argoWorkflowAPIGroup() - labelKey := g.argoWorkflowPodLabelKey() for _, ref := range pod.OwnerReferences { - if ref.Kind == ArgoWorkflowKind && strings.Contains(ref.APIVersion, apiGroup) { + if ref.Kind == rt.ownerKind && strings.Contains(ref.APIVersion, rt.ownerAPIGroupSubstr) { return pod.Namespace, ref.Name, true } } - if pod.Labels != nil { - if v := pod.Labels[labelKey]; v != "" { + if rt.podLabelKey != "" && pod.Labels != nil { + if v := pod.Labels[rt.podLabelKey]; v != "" { return pod.Namespace, v, true } } return "", "", false } -func (g *LinkGenerator) getPodArgoWorkflowLink(podList []*K8sMetaEvent) []*K8sMetaEvent { - crCache := g.metaCache[CUSTOM_RESOURCE_ARGO_WORKFLOW] +func (g *LinkGenerator) getPodCustomResourceLink(podList []*K8sMetaEvent, rt *podCRLinkRuntime, linkType string) []*K8sMetaEvent { + if rt == nil { + return nil + } + crCache := g.metaCache[rt.entityType] if crCache == nil { return nil } @@ -615,13 +634,13 @@ func (g *LinkGenerator) getPodArgoWorkflowLink(podList []*K8sMetaEvent) []*K8sMe if !ok { continue } - ns, wfName, found := g.argoWorkflowNameFromPod(pod) - if !found || wfName == "" { + ns, resName, found := g.crNameFromPod(pod, rt) + if 
!found || resName == "" { continue } - wfList := crCache.Get([]string{generateNameWithNamespaceKey(ns, wfName)}) - for _, workflows := range wfList { - for _, w := range workflows { + items := crCache.Get([]string{generateNameWithNamespaceKey(ns, resName)}) + for _, group := range items { + for _, w := range group { u, ok := w.Raw.(*unstructured.Unstructured) if !ok { continue @@ -629,10 +648,10 @@ func (g *LinkGenerator) getPodArgoWorkflowLink(podList []*K8sMetaEvent) []*K8sMe result = append(result, &K8sMetaEvent{ EventType: data.EventType, Object: &ObjectWrapper{ - ResourceType: POD_ARGO_WORKFLOW, - Raw: &PodArgoWorkflow{ - Pod: pod, - Workflow: u, + ResourceType: linkType, + Raw: &PodCustomResource{ + Pod: pod, + CR: u, }, FirstObservedTime: data.Object.FirstObservedTime, LastObservedTime: data.Object.LastObservedTime, diff --git a/pkg/helper/k8smeta/k8s_meta_manager.go b/pkg/helper/k8smeta/k8s_meta_manager.go index 76887b5ad1..6b2a35c045 100644 --- a/pkg/helper/k8smeta/k8s_meta_manager.go +++ b/pkg/helper/k8smeta/k8s_meta_manager.go @@ -41,16 +41,22 @@ type FlushCh struct { type MetaManager struct { clientset *kubernetes.Clientset + restConfig *rest.Config stopCh chan struct{} ready atomic.Bool metadataHandler *metadataHandler cacheMap map[string]MetaCache + cacheMu sync.RWMutex linkGenerator *LinkGenerator linkRegisterMap map[string][]string registerLock sync.RWMutex + nsPolicyMu sync.RWMutex + nsPolicyRegs []nsPolicyReg + nextNsPolicyID int + // self metrics projectNames map[string]int metricRecord selfmonitor.MetricsRecord @@ -81,6 +87,54 @@ func GetMetaManagerInstance() *MetaManager { return metaManager } +// RegisterCustomResourceCollector registers a dynamic informer cache keyed by cfg.EntityType (after Normalize). +// Optional PodLink registers Pod→CR link generation for PodLinkTypeForEntity(EntityType). +// Safe before or after Init; if Init already ran, the dynamic client is attached immediately. 
+func (m *MetaManager) RegisterCustomResourceCollector(cfg CustomResourceCollectorConfig) error { + if err := cfg.Normalize(); err != nil { + return err + } + m.cacheMu.Lock() + if exist, ok := m.cacheMap[cfg.EntityType]; ok { + if uc, isCR := exist.(*crUnifiedCache); isCR { + uc.SetGVRIfNotStarted(cfg.ToGVR()) + } + } else { + m.cacheMap[cfg.EntityType] = newCRUnifiedCache(m.stopCh, cfg.EntityType, cfg.ToGVR()) + if m.restConfig != nil { + if uc, ok := m.cacheMap[cfg.EntityType].(*crUnifiedCache); ok { + if err := uc.setRESTConfig(m.restConfig); err != nil { + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "setRESTConfig for custom resource cache", err, "entityType", cfg.EntityType) + } + } + } + } + m.cacheMu.Unlock() + + if cfg.PodLink != nil { + m.linkGenerator.registerPodCRLink(PodLinkTypeForEntity(cfg.EntityType), &podCRLinkRuntime{ + entityType: cfg.EntityType, + ownerKind: cfg.PodLink.OwnerKind, + ownerAPIGroupSubstr: firstNonEmpty(cfg.PodLink.OwnerAPIGroupContains, cfg.APIGroup), + podLabelKey: cfg.PodLink.PodLabelKey, + }) + } + return nil +} + +// EnsureCustomResourceInformerStarted starts the dynamic informer for an EntityType if the REST config is ready. 
+func (m *MetaManager) EnsureCustomResourceInformerStarted(entityType string) { + m.cacheMu.RLock() + c, ok := m.cacheMap[entityType] + m.cacheMu.RUnlock() + if !ok { + return + } + if uc, ok := c.(*crUnifiedCache); ok { + uc.EnsureWatchStarted() + } +} + func (m *MetaManager) Init(configPath string) (err error) { var config *rest.Config if len(configPath) > 0 { @@ -103,6 +157,17 @@ func (m *MetaManager) Init(configPath string) (err error) { return err } m.clientset = clientset + m.restConfig = config + + m.cacheMu.Lock() + for _, c := range m.cacheMap { + if uc, ok := c.(*crUnifiedCache); ok { + if err := uc.setRESTConfig(config); err != nil { + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "setRESTConfig for custom resource cache at Init", err, "resourceType", uc.resourceType) + } + } + } + m.cacheMu.Unlock() m.metricRecord = selfmonitor.MetricsRecord{} m.addEventCount = selfmonitor.NewCounterMetricAndRegister(&m.metricRecord, selfmonitor.MetricRunnerK8sMetaAddEventTotal) @@ -116,9 +181,21 @@ func (m *MetaManager) Init(configPath string) (err error) { go func() { startTime := time.Now() + m.cacheMu.RLock() + caches := make([]struct { + name string + c MetaCache + }, 0, len(m.cacheMap)) for resourceType, cache := range m.cacheMap { - logger.Info(context.Background(), resourceType, "init success") - cache.init(clientset) + caches = append(caches, struct { + name string + c MetaCache + }{resourceType, cache}) + } + m.cacheMu.RUnlock() + for _, ent := range caches { + logger.Info(context.Background(), ent.name, "init success") + ent.c.init(clientset) } m.ready.Store(true) logger.Info(context.Background(), "init k8s meta manager", "success", "latancy (ms)", fmt.Sprintf("%d", time.Since(startTime).Milliseconds())) @@ -136,7 +213,10 @@ func (m *MetaManager) IsReady() bool { } func (m *MetaManager) RegisterSendFunc(projectName, configName, resourceType string, sendFunc SendFunc, interval int) { - if cache, ok := m.cacheMap[resourceType]; ok { + 
m.cacheMu.RLock() + cache, ok := m.cacheMap[resourceType] + m.cacheMu.RUnlock() + if ok { cache.RegisterSendFunc(configName, func(events []*K8sMetaEvent) { defer panicRecover() sendFunc(events) @@ -174,7 +254,13 @@ func (m *MetaManager) RegisterSendFunc(projectName, configName, resourceType str } func (m *MetaManager) UnRegisterAllSendFunc(projectName, configName string) { + m.cacheMu.RLock() + caches := make([]MetaCache, 0, len(m.cacheMap)) for _, cache := range m.cacheMap { + caches = append(caches, cache) + } + m.cacheMu.RUnlock() + for _, cache := range caches { cache.UnRegisterSendFunc(configName) } m.registerLock.Lock() @@ -197,10 +283,12 @@ func GetMetaManagerMetrics() []map[string]string { // cache queueSize := 0 cacheSize := 0 + manager.cacheMu.RLock() for _, cache := range manager.cacheMap { queueSize += cache.GetQueueSize() cacheSize += cache.GetSize() } + manager.cacheMu.RUnlock() manager.queueSizeGauge.Set(float64(queueSize)) manager.cacheResourceGauge.Set(float64(cacheSize)) // set labels diff --git a/pkg/helper/k8smeta/k8s_meta_namespace_policy.go b/pkg/helper/k8smeta/k8s_meta_namespace_policy.go new file mode 100644 index 0000000000..0b624ff727 --- /dev/null +++ b/pkg/helper/k8smeta/k8s_meta_namespace_policy.go @@ -0,0 +1,168 @@ +package k8smeta + +import ( + "strings" + + "k8s.io/apimachinery/pkg/api/meta" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" +) + +// namespacePolicy is one pipeline's NamespaceBlackList / NamespaceWhiteList. +// - Neither list: allows all namespaces (such a policy is not registered). +// - BlackList only: allow ns ∉ BlackList. +// - WhiteList only: allow ns ∈ WhiteList. +// - Both: union semantics — allow if ns ∉ BlackList OR ns ∈ WhiteList. 
+type namespacePolicy struct { + black map[string]struct{} + white map[string]struct{} + hasBlack bool + hasWhite bool +} + +func newNamespacePolicy(blackList, whiteList []string) *namespacePolicy { + p := &namespacePolicy{} + for _, s := range blackList { + s = strings.TrimSpace(s) + if s == "" { + continue + } + if p.black == nil { + p.black = make(map[string]struct{}) + } + p.black[s] = struct{}{} + p.hasBlack = true + } + for _, s := range whiteList { + s = strings.TrimSpace(s) + if s == "" { + continue + } + if p.white == nil { + p.white = make(map[string]struct{}) + } + p.white[s] = struct{}{} + p.hasWhite = true + } + return p +} + +func (p *namespacePolicy) allowsNamespace(ns string) bool { + if !p.hasBlack && !p.hasWhite { + return true + } + inB := p.hasBlack && p.black != nil && containsSet(p.black, ns) + inW := p.hasWhite && p.white != nil && containsSet(p.white, ns) + if !p.hasBlack { + return inW + } + if !p.hasWhite { + return !inB + } + return !inB || inW +} + +func containsSet(m map[string]struct{}, ns string) bool { + _, ok := m[ns] + return ok +} + +// ObjectMetaNamespaceForFilter returns the namespace string used for policy checks and whether the object is cluster-scoped (no namespace filtering). 
+func ObjectMetaNamespaceForFilter(resourceType string, raw interface{}) (ns string, clusterScoped bool) { + if raw == nil { + return "", true + } + switch t := raw.(type) { + case *unstructured.Unstructured: + if resourceType == NAMESPACE { + return t.GetName(), false + } + n := t.GetNamespace() + if n == "" { + return "", true + } + return n, false + default: + switch resourceType { + case NODE, PERSISTENTVOLUME, STORAGECLASS: + return "", true + case NAMESPACE: + acc, err := meta.Accessor(raw) + if err != nil { + return "", true + } + return acc.GetName(), false + } + acc, err := meta.Accessor(raw) + if err != nil { + return "", true + } + n := acc.GetNamespace() + if n == "" { + return "", true + } + return n, false + } +} + +type nsPolicyReg struct { + id int + p *namespacePolicy +} + +// RegisterNamespacePolicy registers one input's namespace rules. Multiple inputs are combined with OR: +// a namespace passes if any registered policy allows it. Returns -1 when both lists are empty (nothing registered). +// Unregister non-negative ids with UnregisterNamespacePolicy on stop. +func (m *MetaManager) RegisterNamespacePolicy(blackList, whiteList []string) int { + p := newNamespacePolicy(blackList, whiteList) + if !p.hasBlack && !p.hasWhite { + return -1 + } + m.nsPolicyMu.Lock() + defer m.nsPolicyMu.Unlock() + id := m.nextNsPolicyID + m.nextNsPolicyID++ + m.nsPolicyRegs = append(m.nsPolicyRegs, nsPolicyReg{id: id, p: p}) + return id +} + +// UnregisterNamespacePolicy removes a policy registered with RegisterNamespacePolicy. Pass id -1 for no-op. 
+func (m *MetaManager) UnregisterNamespacePolicy(id int) { + if id < 0 { + return + } + m.nsPolicyMu.Lock() + defer m.nsPolicyMu.Unlock() + out := make([]nsPolicyReg, 0, len(m.nsPolicyRegs)) + for _, r := range m.nsPolicyRegs { + if r.id != id { + out = append(out, r) + } + } + m.nsPolicyRegs = out +} + +// MetaObjectPassesNamespacePolicy returns whether the object may enter the meta cache or be broadcast (add/update/delete). +func (m *MetaManager) MetaObjectPassesNamespacePolicy(o *ObjectWrapper) bool { + if o == nil || o.Raw == nil { + return true + } + if _, ok := o.Raw.(*TimerEvent); ok { + return true + } + ns, clusterScoped := ObjectMetaNamespaceForFilter(o.ResourceType, o.Raw) + if clusterScoped { + return true + } + m.nsPolicyMu.RLock() + regs := m.nsPolicyRegs + m.nsPolicyMu.RUnlock() + if len(regs) == 0 { + return true + } + for _, r := range regs { + if r.p.allowsNamespace(ns) { + return true + } + } + return false +} diff --git a/pkg/helper/k8smeta/k8s_meta_namespace_policy_test.go b/pkg/helper/k8smeta/k8s_meta_namespace_policy_test.go new file mode 100644 index 0000000000..9797cc7977 --- /dev/null +++ b/pkg/helper/k8smeta/k8s_meta_namespace_policy_test.go @@ -0,0 +1,109 @@ +package k8smeta + +import ( + "testing" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" +) + +func TestNamespacePolicyAllows(t *testing.T) { + tests := []struct { + name string + black []string + white []string + ns string + wantAllow bool + }{ + {"empty both", nil, nil, "kube-system", true}, + {"black only drop", []string{"kube-system"}, nil, "kube-system", false}, + {"black only allow", []string{"kube-system"}, nil, "app", true}, + {"white only in", nil, []string{"app"}, "app", true}, + {"white only out", nil, []string{"app"}, "kube-system", false}, + {"both union rescue", []string{"kube-system"}, []string{"kube-system"}, "kube-system", true}, + {"both black blocks", []string{"bad"}, []string{"app"}, 
"bad", false}, + {"both allow via white", []string{"bad"}, []string{"app"}, "app", true}, + {"both allow via not black", []string{"bad"}, []string{"app"}, "other", true}, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + p := newNamespacePolicy(tt.black, tt.white) + if !p.hasBlack && !p.hasWhite { + if !p.allowsNamespace(tt.ns) { + t.Fatalf("empty policy should allow") + } + return + } + if got := p.allowsNamespace(tt.ns); got != tt.wantAllow { + t.Fatalf("allowsNamespace(%q) = %v, want %v", tt.ns, got, tt.wantAllow) + } + }) + } +} + +func TestObjectMetaNamespaceForFilter(t *testing.T) { + pod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "app", Name: "p"}, + } + ns, cluster := ObjectMetaNamespaceForFilter(POD, pod) + if cluster || ns != "app" { + t.Fatalf("pod: got ns=%q cluster=%v", ns, cluster) + } + nsObj := &v1.Namespace{ + ObjectMeta: metav1.ObjectMeta{Name: "kube-system"}, + } + ns, cluster = ObjectMetaNamespaceForFilter(NAMESPACE, nsObj) + if cluster || ns != "kube-system" { + t.Fatalf("namespace resource: got ns=%q cluster=%v", ns, cluster) + } + node := &v1.Node{ + ObjectMeta: metav1.ObjectMeta{Name: "n1"}, + } + ns, cluster = ObjectMetaNamespaceForFilter(NODE, node) + if !cluster { + t.Fatalf("node should be cluster scoped") + } + u := &unstructured.Unstructured{} + u.SetNamespace("cr-ns") + u.SetName("wf1") + ns, cluster = ObjectMetaNamespaceForFilter("custom.entity", u) + if cluster || ns != "cr-ns" { + t.Fatalf("unstructured: got ns=%q cluster=%v", ns, cluster) + } +} + +func TestMetaManagerNamespacePolicyOR(t *testing.T) { + m := &MetaManager{} + id1 := m.RegisterNamespacePolicy([]string{"kube-system"}, nil) + if id1 < 0 { + t.Fatal(id1) + } + id2 := m.RegisterNamespacePolicy(nil, []string{"app"}) + if id2 < 0 { + t.Fatal(id2) + } + wrap := func(rt string, raw interface{}) *ObjectWrapper { + return &ObjectWrapper{ResourceType: rt, Raw: raw} + } + if !m.MetaObjectPassesNamespacePolicy(wrap(POD, &v1.Pod{ + ObjectMeta: 
metav1.ObjectMeta{Namespace: "app", Name: "a"}, + })) { + t.Fatal("app should pass whitelist policy") + } + if !m.MetaObjectPassesNamespacePolicy(wrap(POD, &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "a"}, + })) { + t.Fatal("default should pass via only-black policy (not kube-system)") + } + if m.MetaObjectPassesNamespacePolicy(wrap(POD, &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{Namespace: "kube-system", Name: "a"}, + })) { + t.Fatal("kube-system should be blocked by blacklist policy") + } + m.UnregisterNamespacePolicy(id1) + m.UnregisterNamespacePolicy(id2) + if len(m.nsPolicyRegs) != 0 { + t.Fatal("regs should be empty") + } +} diff --git a/plugins/input/kubernetesmetav2/meta_collector.go b/plugins/input/kubernetesmetav2/meta_collector.go index b044241f38..65299f0c65 100644 --- a/plugins/input/kubernetesmetav2/meta_collector.go +++ b/plugins/input/kubernetesmetav2/meta_collector.go @@ -29,11 +29,17 @@ type metaCollector struct { entityBuffer chan models.PipelineEvent entityLinkBuffer chan models.PipelineEvent - stopCh chan struct{} - entityProcessor map[string]ProcessFunc + stopCh chan struct{} + namespacePolicyID int + entityProcessor map[string]ProcessFunc + crConfigs map[string]k8smeta.CustomResourceCollectorConfig } func (m *metaCollector) Start() error { + m.namespacePolicyID = m.serviceK8sMeta.metaManager.RegisterNamespacePolicy( + m.serviceK8sMeta.NamespaceBlackList, + m.serviceK8sMeta.NamespaceWhiteList, + ) m.entityProcessor = map[string]ProcessFunc{ k8smeta.POD: m.processPodEntity, k8smeta.NODE: m.processNodeEntity, @@ -50,7 +56,6 @@ func (m *metaCollector) Start() error { k8smeta.PERSISTENTVOLUMECLAIM: m.processPersistentVolumeClaimEntity, k8smeta.STORAGECLASS: m.processStorageClassEntity, k8smeta.INGRESS: m.processIngressEntity, - k8smeta.CUSTOM_RESOURCE_ARGO_WORKFLOW: m.processCustomResourceEntity, k8smeta.POD_NODE: m.processPodNodeLink, k8smeta.POD_DEPLOYMENT: m.processPodDeploymentLink, k8smeta.POD_REPLICASET: 
m.processPodReplicaSetLink, @@ -64,7 +69,6 @@ func (m *metaCollector) Start() error { k8smeta.POD_SERVICE: m.processPodServiceLink, k8smeta.POD_CONTAINER: m.processPodContainerLink, k8smeta.INGRESS_SERVICE: m.processIngressServiceLink, - k8smeta.POD_ARGO_WORKFLOW: m.processPodArgoWorkflowLink, // add namespace to xx link processor k8smeta.POD_NAMESPACE: m.processPodNamespaceLink, @@ -79,15 +83,26 @@ func (m *metaCollector) Start() error { k8smeta.INGRESS_NAMESPACE: m.processIngressNamespaceLink, } - if needArgoWorkflowInformer(m.serviceK8sMeta) { - m.serviceK8sMeta.metaManager.ConfigureArgoWorkflowCollector(k8smeta.ArgoWorkflowCollectorOptions{ - APIGroup: m.serviceK8sMeta.ArgoWorkflowAPIGroup, - APIVersion: m.serviceK8sMeta.ArgoWorkflowAPIVersion, - Resource: m.serviceK8sMeta.ArgoWorkflowResource, - PodWorkflowLabelKey: m.serviceK8sMeta.ArgoWorkflowPodLabelKey, - }) - m.serviceK8sMeta.metaManager.EnsureArgoWorkflowInformerStarted() + m.crConfigs = make(map[string]k8smeta.CustomResourceCollectorConfig) + for _, cfg := range m.serviceK8sMeta.resolvedCustomResources() { + if err := cfg.Normalize(); err != nil { + logger.Warning(context.Background(), k8smeta.K8sMetaUnifyErrorCode, "invalid CustomResources entry", err, "entity", cfg.EntityType) + continue + } + if err := m.serviceK8sMeta.metaManager.RegisterCustomResourceCollector(cfg); err != nil { + logger.Warning(context.Background(), k8smeta.K8sMetaUnifyErrorCode, "register custom resource collector", err, "entity", cfg.EntityType) + continue + } + m.crConfigs[cfg.EntityType] = cfg + if cfg.CollectEntity { + m.entityProcessor[cfg.EntityType] = m.processCustomResourceEntity + } + if m.serviceK8sMeta.Pod && cfg.PodLink != nil && cfg.Entity2PodRelation != "" { + m.entityProcessor[k8smeta.PodLinkTypeForEntity(cfg.EntityType)] = m.processPodCustomResourceLink + } + m.serviceK8sMeta.metaManager.EnsureCustomResourceInformerStarted(cfg.EntityType) } + if m.serviceK8sMeta.Pod { 
m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.POD, m.handleEvent, m.serviceK8sMeta.Interval) } @@ -133,8 +148,13 @@ func (m *metaCollector) Start() error { if m.serviceK8sMeta.Ingress { m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.INGRESS, m.handleEvent, m.serviceK8sMeta.Interval) } - if m.serviceK8sMeta.ArgoWorkflow { - m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.CUSTOM_RESOURCE_ARGO_WORKFLOW, m.handleEvent, m.serviceK8sMeta.Interval) + for entityType, cfg := range m.crConfigs { + if cfg.CollectEntity { + m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, entityType, m.handleEvent, m.serviceK8sMeta.Interval) + } + if m.serviceK8sMeta.Pod && cfg.PodLink != nil && cfg.Entity2PodRelation != "" { + m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.PodLinkTypeForEntity(entityType), m.handleEvent, m.serviceK8sMeta.Interval) + } } if m.serviceK8sMeta.Pod && m.serviceK8sMeta.Node && m.serviceK8sMeta.Node2Pod != "" { @@ -176,9 +196,6 @@ func (m *metaCollector) Start() error { if m.serviceK8sMeta.Ingress && m.serviceK8sMeta.Service && m.serviceK8sMeta.Ingress2Service != "" { m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.INGRESS_SERVICE, m.handleEvent, m.serviceK8sMeta.Interval) } - if m.serviceK8sMeta.Pod && m.serviceK8sMeta.Workflow2Pod != "" { - m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.POD_ARGO_WORKFLOW, m.handleEvent, m.serviceK8sMeta.Interval) - } if m.serviceK8sMeta.Namespace && m.serviceK8sMeta.Pod && m.serviceK8sMeta.Namespace2Pod != "" { 
m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.POD_NAMESPACE, m.handleEvent, m.serviceK8sMeta.Interval) } @@ -215,6 +232,8 @@ func (m *metaCollector) Start() error { } func (m *metaCollector) Stop() error { + m.serviceK8sMeta.metaManager.UnregisterNamespacePolicy(m.namespacePolicyID) + m.namespacePolicyID = -1 m.serviceK8sMeta.metaManager.UnRegisterAllSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName) close(m.stopCh) return nil @@ -494,10 +513,6 @@ func isEntity(resourceType string) bool { return !strings.Contains(resourceType, k8smeta.LINK_SPLIT_CHARACTER) } -func needArgoWorkflowInformer(s *ServiceK8sMeta) bool { - return s.ArgoWorkflow || (s.Pod && s.Workflow2Pod != "") -} - func safeGetInt32String(pointer *int32) string { if pointer == nil { return "" diff --git a/plugins/input/kubernetesmetav2/meta_collector_cr.go b/plugins/input/kubernetesmetav2/meta_collector_cr.go index 3c9c7ff704..0eb0e8c39e 100644 --- a/plugins/input/kubernetesmetav2/meta_collector_cr.go +++ b/plugins/input/kubernetesmetav2/meta_collector_cr.go @@ -10,6 +10,27 @@ import ( "github.com/alibaba/ilogtail/pkg/models" ) +func (m *metaCollector) customResourceLabelAllowList(cfg k8smeta.CustomResourceCollectorConfig) []string { + if len(cfg.LabelAllowList) > 0 { + return cfg.LabelAllowList + } + return nil +} + +func (m *metaCollector) customResourceAnnotationAllowList(cfg k8smeta.CustomResourceCollectorConfig) []string { + if len(cfg.AnnotationAllowList) > 0 { + return cfg.AnnotationAllowList + } + return nil +} + +func (m *metaCollector) customResourceStatusPaths(cfg k8smeta.CustomResourceCollectorConfig) []string { + if len(cfg.StatusPathAllowList) > 0 { + return cfg.StatusPathAllowList + } + return nil +} + func filterStringMapByAllowList(m map[string]string, allow []string) map[string]string { if len(allow) == 0 || len(m) == 0 { return nil @@ -50,6 +71,10 @@ func pickUnstructuredFieldCopy(obj 
map[string]interface{}, paths []string) map[s } func (m *metaCollector) processCustomResourceEntity(data *k8smeta.ObjectWrapper, method string) []models.PipelineEvent { + cfg, ok := m.crConfigs[data.ResourceType] + if !ok { + return nil + } obj, ok := data.Raw.(*unstructured.Unstructured) if !ok { return nil @@ -57,44 +82,46 @@ func (m *metaCollector) processCustomResourceEntity(data *k8smeta.ObjectWrapper, log := &models.Log{} log.Contents = models.NewLogContents() log.Timestamp = uint64(time.Now().Unix()) - kindKey := k8smeta.CUSTOM_RESOURCE_ARGO_WORKFLOW + kindKey := data.ResourceType m.processEntityCommonPart(log.Contents, kindKey, obj.GetNamespace(), obj.GetName(), method, data.FirstObservedTime, data.LastObservedTime, obj.GetCreationTimestamp()) - log.Contents.Add(entityKindFieldName, k8smeta.ArgoWorkflowKind) + log.Contents.Add(entityKindFieldName, cfg.Kind) log.Contents.Add("api_version", obj.GetAPIVersion()) log.Contents.Add("namespace", obj.GetNamespace()) - // Labels/annotations: same switches as built-in entities (EnableLabels / EnableAnnotations). - // If CustomResourceWorkflow*AllowList is non-empty, only those keys are emitted (subset mode; does not require Enable*). 
- if len(m.serviceK8sMeta.CustomResourceWorkflowLabelAllowList) > 0 { - if labels := filterStringMapByAllowList(obj.GetLabels(), m.serviceK8sMeta.CustomResourceWorkflowLabelAllowList); labels != nil { + labelAllow := m.customResourceLabelAllowList(cfg) + if len(labelAllow) > 0 { + if labels := filterStringMapByAllowList(obj.GetLabels(), labelAllow); labels != nil { log.Contents.Add("labels", m.processEntityJSONObject(labels)) } - } else if m.serviceK8sMeta.EnableLabels { + } else if cfg.EnableLabels { log.Contents.Add("labels", m.processEntityJSONObject(obj.GetLabels())) } - if len(m.serviceK8sMeta.CustomResourceWorkflowAnnotationAllowList) > 0 { - if annos := filterStringMapByAllowList(obj.GetAnnotations(), m.serviceK8sMeta.CustomResourceWorkflowAnnotationAllowList); annos != nil { + annoAllow := m.customResourceAnnotationAllowList(cfg) + if len(annoAllow) > 0 { + if annos := filterStringMapByAllowList(obj.GetAnnotations(), annoAllow); annos != nil { log.Contents.Add("annotations", m.processEntityJSONObject(annos)) } - } else if m.serviceK8sMeta.EnableAnnotations { + } else if cfg.EnableAnnotations { log.Contents.Add("annotations", m.processEntityJSONObject(obj.GetAnnotations())) } - if statusObj := pickUnstructuredFieldCopy(obj.Object, m.serviceK8sMeta.CustomResourceWorkflowStatusPathAllowList); statusObj != nil { + if statusObj := pickUnstructuredFieldCopy(obj.Object, m.customResourceStatusPaths(cfg)); statusObj != nil { log.Contents.Add("status", m.processEntityJSONObject(statusObj)) } return []models.PipelineEvent{log} } -func (m *metaCollector) processPodArgoWorkflowLink(data *k8smeta.ObjectWrapper, method string) []models.PipelineEvent { - obj, ok := data.Raw.(*k8smeta.PodArgoWorkflow) +func (m *metaCollector) processPodCustomResourceLink(data *k8smeta.ObjectWrapper, method string) []models.PipelineEvent { + obj, ok := data.Raw.(*k8smeta.PodCustomResource) if !ok { return nil } + entityType := strings.TrimPrefix(data.ResourceType, 
k8smeta.POD+k8smeta.LINK_SPLIT_CHARACTER) + cfg := m.crConfigs[entityType] log := &models.Log{} log.Contents = models.NewLogContents() - m.processEntityLinkCommonPart(log.Contents, k8smeta.CUSTOM_RESOURCE_ARGO_WORKFLOW, obj.Workflow.GetNamespace(), obj.Workflow.GetName(), + m.processEntityLinkCommonPart(log.Contents, entityType, obj.CR.GetNamespace(), obj.CR.GetName(), obj.Pod.Kind, obj.Pod.Namespace, obj.Pod.Name, method, data.FirstObservedTime, data.LastObservedTime) - log.Contents.Add(entityLinkRelationTypeFieldName, m.serviceK8sMeta.Workflow2Pod) + log.Contents.Add(entityLinkRelationTypeFieldName, cfg.Entity2PodRelation) log.Timestamp = uint64(time.Now().Unix()) return []models.PipelineEvent{log} } diff --git a/plugins/input/kubernetesmetav2/service_meta.go b/plugins/input/kubernetesmetav2/service_meta.go index 615058ae98..580e911d93 100644 --- a/plugins/input/kubernetesmetav2/service_meta.go +++ b/plugins/input/kubernetesmetav2/service_meta.go @@ -31,25 +31,14 @@ type ServiceK8sMeta struct { StorageClass bool Ingress bool Container bool - // ArgoWorkflow enables collecting argoproj.io/v1alpha1 Workflow CRs as entities (dynamic informer). - ArgoWorkflow bool - // Workflow2Pod sets __relation_type__ for Pod–Argo Workflow entity links (non-empty to enable link collection when Pod is enabled). - Workflow2Pod string - // ArgoWorkflowAPIGroup overrides k8smeta.DefaultArgoWorkflowAPIGroup for ownerRef matching and informer Group (empty = default). - ArgoWorkflowAPIGroup string - // ArgoWorkflowAPIVersion overrides k8smeta.DefaultArgoWorkflowAPIVersion for the Workflow informer (empty = default). - ArgoWorkflowAPIVersion string - // ArgoWorkflowResource overrides k8smeta.DefaultArgoWorkflowResource for the Workflow informer (empty = default). - ArgoWorkflowResource string - // ArgoWorkflowPodLabelKey overrides k8smeta.DefaultArgoWorkflowPodLabelKey for Pod label fallback (empty = default). 
- ArgoWorkflowPodLabelKey string - // CustomResourceWorkflowLabelAllowList: if non-empty, only these label keys are emitted (subset); when empty, use EnableLabels for full labels like other entities. - CustomResourceWorkflowLabelAllowList []string - // CustomResourceWorkflowAnnotationAllowList: if non-empty, only these annotation keys are emitted (subset); when empty, use EnableAnnotations for full annotations. - CustomResourceWorkflowAnnotationAllowList []string - // CustomResourceWorkflowStatusPathAllowList: JSON paths under the object root (e.g. "status.phase", "status.startedAt"). - CustomResourceWorkflowStatusPathAllowList []string - // EnableLabels / EnableAnnotations: when true, emit full labels/annotations on entities; Argo Workflow CR entities use the same flags when the allow lists above are empty. + // CustomResources registers third-party CRs (dynamic informer + optional CR→Pod links via PodLink). See k8smeta.CustomResourceCollectorConfig. + CustomResources []k8smeta.CustomResourceCollectorConfig `json:"CustomResources,omitempty"` + // NamespaceBlackList / NamespaceWhiteList: global namespace filter for meta cache and events (Node, PersistentVolume, StorageClass are not filtered). + // If both are set on this input, a namespace is allowed when it is not blacklisted OR whitelisted (union). Multiple pipelines OR their policies. + // Empty both: no restriction from this input. Cluster-scoped objects are always allowed. + NamespaceBlackList []string `json:"NamespaceBlackList,omitempty"` + NamespaceWhiteList []string `json:"NamespaceWhiteList,omitempty"` + // EnableLabels / EnableAnnotations: when true, emit full labels/annotations on built-in entity kinds (not CustomResources; those use CustomResources[].EnableLabels/EnableAnnotations). 
EnableLabels bool EnableAnnotations bool // link switch @@ -130,12 +119,13 @@ func (s *ServiceK8sMeta) Stop() error { func (s *ServiceK8sMeta) Start(collector pipeline.Collector) error { s.collector = collector s.metaCollector = &metaCollector{ - serviceK8sMeta: s, - collector: collector, - entityBuffer: make(chan models.PipelineEvent, 100), - entityLinkBuffer: make(chan models.PipelineEvent, 100), - stopCh: make(chan struct{}), - entityProcessor: make(map[string]ProcessFunc), + serviceK8sMeta: s, + collector: collector, + entityBuffer: make(chan models.PipelineEvent, 100), + entityLinkBuffer: make(chan models.PipelineEvent, 100), + stopCh: make(chan struct{}), + namespacePolicyID: -1, + entityProcessor: make(map[string]ProcessFunc), } return s.metaCollector.Start() } @@ -150,6 +140,15 @@ func (s *ServiceK8sMeta) initDomain() { } +func (s *ServiceK8sMeta) resolvedCustomResources() []k8smeta.CustomResourceCollectorConfig { + if len(s.CustomResources) == 0 { + return nil + } + out := make([]k8smeta.CustomResourceCollectorConfig, len(s.CustomResources)) + copy(out, s.CustomResources) + return out +} + func init() { pipeline.ServiceInputs["service_kubernetes_meta"] = func() pipeline.ServiceInput { return &ServiceK8sMeta{ From d3e4baf81c76f993e754c76ad5226197f4ceb138 Mon Sep 17 00:00:00 2001 From: StartE Date: Fri, 27 Mar 2026 10:37:57 +0000 Subject: [PATCH 03/20] undo --- pkg/helper/k8smeta/k8s_meta_cache.go | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/pkg/helper/k8smeta/k8s_meta_cache.go b/pkg/helper/k8smeta/k8s_meta_cache.go index 53b1cfb4a5..56d0b523c5 100644 --- a/pkg/helper/k8smeta/k8s_meta_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cache.go @@ -212,10 +212,10 @@ func (m *k8sMetaCache) getFactoryInformer() (informers.SharedInformerFactory, ca informer = factory.Apps().V1().StatefulSets().Informer() case DAEMONSET: informer = factory.Apps().V1().DaemonSets().Informer() - // case CRONJOB: - // informer = m.getCronJobInformer(factory) - 
// case JOB: - // informer = factory.Batch().V1().Jobs().Informer() + case CRONJOB: + informer = m.getCronJobInformer(factory) + case JOB: + informer = factory.Batch().V1().Jobs().Informer() case NODE: informer = factory.Core().V1().Nodes().Informer() case NAMESPACE: From 6822b6c50f18b9df0b79108c84fe591ee534e3d0 Mon Sep 17 00:00:00 2001 From: StartE Date: Mon, 30 Mar 2026 06:58:09 +0000 Subject: [PATCH 04/20] suport ns cr link --- pkg/helper/k8smeta/k8s_meta_const.go | 38 +++++++------ .../k8smeta/k8s_meta_custom_resource.go | 16 ++++-- pkg/helper/k8smeta/k8s_meta_link.go | 53 ++++++++++++++++++- .../input/kubernetesmetav2/meta_collector.go | 14 +++-- .../kubernetesmetav2/meta_collector_cr.go | 19 +++++++ 5 files changed, 114 insertions(+), 26 deletions(-) diff --git a/pkg/helper/k8smeta/k8s_meta_const.go b/pkg/helper/k8smeta/k8s_meta_const.go index f882cc345d..8f93ed0723 100644 --- a/pkg/helper/k8smeta/k8s_meta_const.go +++ b/pkg/helper/k8smeta/k8s_meta_const.go @@ -12,22 +12,22 @@ const ( EntityCollectorUserAgent = "loongcollector-singleton" // entity type - POD = "pod" - SERVICE = "service" - DEPLOYMENT = "deployment" - REPLICASET = "replicaset" - STATEFULSET = "statefulset" - DAEMONSET = "daemonset" - CRONJOB = "cronjob" - JOB = "job" - NODE = "node" - NAMESPACE = "namespace" - CONFIGMAP = "configmap" - PERSISTENTVOLUME = "persistentvolume" - PERSISTENTVOLUMECLAIM = "persistentvolumeclaim" - STORAGECLASS = "storageclass" - INGRESS = "ingress" - CONTAINER = "container" + POD = "pod" + SERVICE = "service" + DEPLOYMENT = "deployment" + REPLICASET = "replicaset" + STATEFULSET = "statefulset" + DAEMONSET = "daemonset" + CRONJOB = "cronjob" + JOB = "job" + NODE = "node" + NAMESPACE = "namespace" + CONFIGMAP = "configmap" + PERSISTENTVOLUME = "persistentvolume" + PERSISTENTVOLUMECLAIM = "persistentvolumeclaim" + STORAGECLASS = "storageclass" + INGRESS = "ingress" + CONTAINER = "container" //revive:disable:var-naming LINK_SPLIT_CHARACTER = "->" POD_NODE = "pod->node" 
@@ -208,6 +208,12 @@ type PodCustomResource struct { CR *unstructured.Unstructured } +// NamespaceCustomResource links a Namespace to a namespaced CR (unstructured). +type NamespaceCustomResource struct { + Namespace *v1.Namespace + CR *unstructured.Unstructured +} + const ( EventTypeAdd = "add" EventTypeUpdate = "update" diff --git a/pkg/helper/k8smeta/k8s_meta_custom_resource.go b/pkg/helper/k8smeta/k8s_meta_custom_resource.go index 31aa2db938..09ec548041 100644 --- a/pkg/helper/k8smeta/k8s_meta_custom_resource.go +++ b/pkg/helper/k8smeta/k8s_meta_custom_resource.go @@ -17,10 +17,10 @@ type CustomResourceCollectorConfig struct { // It drives __entity_type__, __entity_id__, and pod->{EntityType} links — set explicitly in pipeline config. EntityType string `json:"EntityType,omitempty"` - APIGroup string `json:"APIGroup,omitempty"` - APIVersion string `json:"APIVersion,omitempty"` - Resource string `json:"Resource,omitempty"` // plural resource name - Kind string `json:"Kind,omitempty"` // Kubernetes kind, for ownerReferences matching and export + APIGroup string `json:"APIGroup,omitempty"` + APIVersion string `json:"APIVersion,omitempty"` + Resource string `json:"Resource,omitempty"` // plural resource name + Kind string `json:"Kind,omitempty"` // Kubernetes kind, for ownerReferences matching and export // PodLink, if set, registers a Pod → this CR link generator (link type: PodLinkTypeForEntity(EntityType)). PodLink *PodToCustomResourceLinkConfig `json:"PodLink,omitempty"` @@ -28,6 +28,8 @@ type CustomResourceCollectorConfig struct { CollectEntity bool `json:"CollectEntity,omitempty"` // Entity2PodRelation is __relation_type__ on entity_link logs (custom resource → Pod). Required when Pod link export is enabled together with PodLink. Entity2PodRelation string `json:"Entity2PodRelation,omitempty"` + // Namespace2EntityRelation is __relation_type__ on entity_link logs (Namespace → this namespaced CR). 
Export when CollectEntity, Namespace input, and this string are all set. Cluster-scoped CRs are skipped. + Namespace2EntityRelation string `json:"Namespace2EntityRelation,omitempty"` // EnableLabels, if true, exports full labels when LabelAllowList is empty. Ignores ServiceK8sMeta.EnableLabels. Default false. EnableLabels bool `json:"EnableLabels,omitempty"` @@ -55,6 +57,11 @@ func PodLinkTypeForEntity(entityType string) string { return POD + LINK_SPLIT_CHARACTER + entityType } +// NamespaceLinkTypeForEntity is the link ResourceType for Namespace → namespaced CR (e.g. argo.workflow->namespace). +func NamespaceLinkTypeForEntity(entityType string) string { + return entityType + LINK_SPLIT_CHARACTER + NAMESPACE +} + // DefaultEntityType returns the conventional type string customresource//. // It does not apply automatically; EntityType must still be set on the config (Normalize requires it). func DefaultEntityType(apiGroup, kind string) string { @@ -77,6 +84,7 @@ func (c *CustomResourceCollectorConfig) Normalize() error { c.Resource = strings.TrimSpace(c.Resource) c.Kind = strings.TrimSpace(c.Kind) c.EntityType = strings.TrimSpace(c.EntityType) + c.Namespace2EntityRelation = strings.TrimSpace(c.Namespace2EntityRelation) if c.APIGroup == "" || c.APIVersion == "" || c.Resource == "" || c.Kind == "" { return fmt.Errorf("custom resource collector: APIGroup, APIVersion, Resource, and Kind are required") diff --git a/pkg/helper/k8smeta/k8s_meta_link.go b/pkg/helper/k8smeta/k8s_meta_link.go index 3e71dff773..45499c92a8 100644 --- a/pkg/helper/k8smeta/k8s_meta_link.go +++ b/pkg/helper/k8smeta/k8s_meta_link.go @@ -64,6 +64,9 @@ func (g *LinkGenerator) GenerateLinks(events []*K8sMetaEvent, linkType string) [ if !strings.HasPrefix(linkType, resourceType) { return nil } + // CustomResource links (third-party CR): + // 1) Pod→CR when linkType is registered via PodLink (before built-in switch). 
+ // 2) namespaced CR→Namespace when linkType is "->namespace" (after switch, so built-in *->namespace kinds stay in cases above). if rt, ok := g.podCRRuntimeForLinkType(linkType); ok { return g.getPodCustomResourceLink(events, rt, linkType) } @@ -114,9 +117,13 @@ func (g *LinkGenerator) GenerateLinks(events []*K8sMetaEvent, linkType string) [ return g.getPVCNamespaceLink(events) case INGRESS_NAMESPACE: return g.getIngressNamespaceLink(events) - default: - return nil } + // CustomResource links (third-party CR): + // 2) namespaced CR→Namespace when linkType is "->namespace" (after switch, so built-in *->namespace kinds stay in cases above). + if strings.HasSuffix(linkType, LINK_SPLIT_CHARACTER+NAMESPACE) { + return g.getCustomResourceNamespaceLink(events) + } + return nil } func (g *LinkGenerator) getPodNodeLink(podList []*K8sMetaEvent) []*K8sMetaEvent { @@ -663,6 +670,48 @@ func (g *LinkGenerator) getPodCustomResourceLink(podList []*K8sMetaEvent, rt *po return result } +func (g *LinkGenerator) getCustomResourceNamespaceLink(events []*K8sMetaEvent) []*K8sMetaEvent { + if len(events) == 0 { + return nil + } + entityType := events[0].Object.ResourceType + nsCache := g.metaCache[NAMESPACE] + if nsCache == nil { + return nil + } + result := make([]*K8sMetaEvent, 0) + for _, data := range events { + u, ok := data.Object.Raw.(*unstructured.Unstructured) + if !ok { + continue + } + nsName := u.GetNamespace() + if nsName == "" { + continue + } + nsList := nsCache.Get([]string{generateNameWithNamespaceKey("", nsName)}) + for _, ns := range nsList { + for _, n := range ns { + if namespace, ok := n.Raw.(*v1.Namespace); ok { + result = append(result, &K8sMetaEvent{ + EventType: data.EventType, + Object: &ObjectWrapper{ + ResourceType: NamespaceLinkTypeForEntity(entityType), + Raw: &NamespaceCustomResource{ + Namespace: namespace, + CR: u, + }, + FirstObservedTime: data.Object.FirstObservedTime, + LastObservedTime: data.Object.LastObservedTime, + }, + }) + } + } + } + } + 
return result +} + func (g *LinkGenerator) getPodNamespaceLink(podList []*K8sMetaEvent) []*K8sMetaEvent { result := make([]*K8sMetaEvent, 0) for _, data := range podList { diff --git a/plugins/input/kubernetesmetav2/meta_collector.go b/plugins/input/kubernetesmetav2/meta_collector.go index 65299f0c65..f33cd0406d 100644 --- a/plugins/input/kubernetesmetav2/meta_collector.go +++ b/plugins/input/kubernetesmetav2/meta_collector.go @@ -29,10 +29,10 @@ type metaCollector struct { entityBuffer chan models.PipelineEvent entityLinkBuffer chan models.PipelineEvent - stopCh chan struct{} - namespacePolicyID int - entityProcessor map[string]ProcessFunc - crConfigs map[string]k8smeta.CustomResourceCollectorConfig + stopCh chan struct{} + namespacePolicyID int + entityProcessor map[string]ProcessFunc + crConfigs map[string]k8smeta.CustomResourceCollectorConfig } func (m *metaCollector) Start() error { @@ -100,6 +100,9 @@ func (m *metaCollector) Start() error { if m.serviceK8sMeta.Pod && cfg.PodLink != nil && cfg.Entity2PodRelation != "" { m.entityProcessor[k8smeta.PodLinkTypeForEntity(cfg.EntityType)] = m.processPodCustomResourceLink } + if m.serviceK8sMeta.Namespace && cfg.CollectEntity && cfg.Namespace2EntityRelation != "" { + m.entityProcessor[k8smeta.NamespaceLinkTypeForEntity(cfg.EntityType)] = m.processNamespaceCustomResourceLink + } m.serviceK8sMeta.metaManager.EnsureCustomResourceInformerStarted(cfg.EntityType) } @@ -155,6 +158,9 @@ func (m *metaCollector) Start() error { if m.serviceK8sMeta.Pod && cfg.PodLink != nil && cfg.Entity2PodRelation != "" { m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, k8smeta.PodLinkTypeForEntity(entityType), m.handleEvent, m.serviceK8sMeta.Interval) } + if m.serviceK8sMeta.Namespace && cfg.CollectEntity && cfg.Namespace2EntityRelation != "" { + m.serviceK8sMeta.metaManager.RegisterSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName, 
k8smeta.NamespaceLinkTypeForEntity(entityType), m.handleEvent, m.serviceK8sMeta.Interval) + } } if m.serviceK8sMeta.Pod && m.serviceK8sMeta.Node && m.serviceK8sMeta.Node2Pod != "" { diff --git a/plugins/input/kubernetesmetav2/meta_collector_cr.go b/plugins/input/kubernetesmetav2/meta_collector_cr.go index 0eb0e8c39e..1ac29c5526 100644 --- a/plugins/input/kubernetesmetav2/meta_collector_cr.go +++ b/plugins/input/kubernetesmetav2/meta_collector_cr.go @@ -110,6 +110,25 @@ func (m *metaCollector) processCustomResourceEntity(data *k8smeta.ObjectWrapper, return []models.PipelineEvent{log} } +func (m *metaCollector) processNamespaceCustomResourceLink(data *k8smeta.ObjectWrapper, method string) []models.PipelineEvent { + obj, ok := data.Raw.(*k8smeta.NamespaceCustomResource) + if !ok { + return nil + } + entityType := strings.TrimSuffix(data.ResourceType, k8smeta.LINK_SPLIT_CHARACTER+k8smeta.NAMESPACE) + cfg := m.crConfigs[entityType] + if cfg.EntityType == "" || cfg.Namespace2EntityRelation == "" { + return nil + } + log := &models.Log{} + log.Contents = models.NewLogContents() + m.processEntityLinkCommonPart(log.Contents, obj.Namespace.Kind, obj.Namespace.Namespace, obj.Namespace.Name, + cfg.EntityType, obj.CR.GetNamespace(), obj.CR.GetName(), method, data.FirstObservedTime, data.LastObservedTime) + log.Contents.Add(entityLinkRelationTypeFieldName, cfg.Namespace2EntityRelation) + log.Timestamp = uint64(time.Now().Unix()) + return []models.PipelineEvent{log} +} + func (m *metaCollector) processPodCustomResourceLink(data *k8smeta.ObjectWrapper, method string) []models.PipelineEvent { obj, ok := data.Raw.(*k8smeta.PodCustomResource) if !ok { From 5c3087a4affb3ba2a359734f1d30c1b3cd897c65 Mon Sep 17 00:00:00 2001 From: StartE Date: Tue, 31 Mar 2026 07:47:39 +0000 Subject: [PATCH 05/20] update doc --- .../extended/service-kubernetesmeta-v2.md | 105 +++++++++++++++++- 1 file changed, 101 insertions(+), 4 deletions(-) diff --git 
a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md index 0e88dcce34..21e3c17259 100644 --- a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md +++ b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md @@ -2,7 +2,7 @@ ## 简介 -`service_kubernetes_meta` 定时采集Kubernetes元数据,包括Pod、Deployment等资源及其之间的关系。并提供HTTP查询接口,支持通过一些字段索引,如Pod IP、Host IP等信息快速查询元数据。 +`service_kubernetes_meta` 定时采集 Kubernetes 元数据,包括 Pod、Deployment 等内置资源及其关系;可通过 **`CustomResources`** 扩展采集第三方 CR(如 Argo Workflow),并生成对应实体与链路日志。提供 HTTP 查询接口,支持通过 Pod IP、Host IP 等索引快速查询元数据。 ## 版本 @@ -14,7 +14,7 @@ ## 配置参数 -**注意:** 本插件需要在Kubernetes集群中运行,且需要有访问Kubernetes API的权限。并且部署模式为单例模式,且配置环境变量`DEPLOY_MODE`为`singleton`,`ENABLE_KUBERNETES_META`为`true`。 +**注意:** 本插件需要在 Kubernetes 集群中(或具备访问 apiserver 的配置)运行,且 ServiceAccount / kubeconfig 必须具备与开启能力及 **CustomResources** 相匹配的 **RBAC**(见下文「Kubernetes RBAC 权限」)。部署为单例时需配置环境变量 `DEPLOY_MODE=singleton`、`ENABLE_KUBERNETES_META=true`。 | 参数 | 类型,默认值 | 说明 | | - | - | - | @@ -34,8 +34,11 @@ | PersistentVolumeClaim | bool, false | 是否采集PersistentVolumeClaim元数据。 | | StorageClass | bool, false | 是否采集StorageClass元数据。 | | Ingress | bool, false | 是否采集Ingress元数据。 | -| EnableLabels | bool, false | 是否采集Kubernetes资源的标签(Labels)信息。启用后会在元数据中包含资源的标签字段。 | -| EnableAnnotations | bool, false | 是否采集Kubernetes资源的注解(Annotations)信息。启用后会在元数据中包含资源的注解字段。 | +| EnableLabels | bool, false | 是否采集**内置**Kubernetes 资源的标签(Labels)。**自定义资源(CustomResources)**是否导出标签由每条 `CustomResources.EnableLabels` 控制,不受本字段影响。 | +| EnableAnnotations | bool, false | 是否采集**内置**资源的注解(Annotations)。自定义资源是否导出注解由每条 `CustomResources.EnableAnnotations` 控制。 | +| NamespaceBlackList | []string,可选 | 全局命名空间过滤:名单内命名空间的对象**不进入 meta 缓存、不投递实体/链路事件**。`Node`、`PersistentVolume`、`StorageClass` 不按命名空间过滤。与 `NamespaceWhiteList` 同时配置时,命名空间允许条件为「未命中黑名单 **或** 在白名单中」(并集语义);多条 Pipeline 各自注册的策略之间为 **OR**。 | +| NamespaceWhiteList | []string,可选 | 全局命名空间白名单,见上。均为空则不做命名空间限制。 | +| 
CustomResources | []object,可选 | 第三方 CR(动态 Informer)采集与链路,见下文「**第三方自定义资源(CustomResources)**」与「**Kubernetes RBAC 权限**」。 | | Node2Pod | string, 无默认值(可选) | Node到Pod的关系名,不填则不生成关系。 | | Deployment2Pod | string, 无默认值(可选) | Deployment到Pod的关系名,不填则不生成关系。 | | ReplicaSet2Pod | string, 无默认值(可选) | ReplicaSet到Pod的关系名,不填则不生成关系。 | @@ -65,6 +68,100 @@ | Cluster2StorageClass | string, 无默认值(可选) | Cluster到StorageClass的关系名,不填则不生成关系。 | +## Kubernetes RBAC 权限 + +本插件通过 **ServiceAccount**(或 kubeconfig 身份)访问 kube-apiserver。除内置资源外,凡在配置中开启的采集与 **CustomResources** 中声明的 GVR,都需要在 **`ClusterRole`(推荐集群级采集)+ `ClusterRoleBinding`** 或对应的 **`Role` + `RoleBinding`** 中授予至少 **`get`、`list`、`watch`**。 + +### 常见错误 + +动态 Informer 对 CR `list/watch` 失败时,日志中可能出现类似: + +`cannot list resource "workflows" in API group "argoproj.io" ... User "system:serviceaccount:..." cannot list ...` + +说明当前身份**缺少该 API 组下对应复数资源**的权限,与 LoongCollector 配置无关,需在 RBAC 中补齐。 + +### 内置资源 + +与配置里打开的内置开关对应即可(如 `pods`、`namespaces`、`deployments` 等),按你集群现有 `ClusterRole` 规则维护。 + +### 自定义资源示例:Argo Workflow + +采集 `argoproj.io/v1alpha1` 的 `Workflow`(复数资源名 **`workflows`**)时,在 **ClusterRole** 的 `rules` 中增加(按需合并到现有角色): + +```yaml + - apiGroups: + - argoproj.io + resources: + - workflows + verbs: + - get + - list + - watch +``` + +* **`resources`** 须为 CRD 中 **`spec.names.plural`**,可用 `kubectl api-resources --api-group=argoproj.io` 核对。 +* 使用 **ClusterRoleBinding** 绑定到运行采集的 ServiceAccount 时,可在全命名空间内 `list/watch` 该资源(Workflow 一般为命名空间作用域资源)。 +* 若还需采集同组其它 CR(如 `workflowtemplates`),在 `resources` 列表中继续追加即可。 +* 不推荐在生产中用 `resources: ["*"]` 放宽整组权限,除非有明确运维规范。 + +应用后等待 Informer 重试或重启相关 Pod 使权限生效。 + + +## 第三方自定义资源(CustomResources) + +`CustomResources` 为数组,每一项对应一种 CR(**GVR + Kind**),由动态客户端监听;可选生成实体日志及 **Pod→CR**、**Namespace→CR** 链路。 + +### 子项字段 + +| 字段 | 类型 | 说明 | +| - | - | - | +| EntityType | string,**必填** | 内部缓存与事件类型键,用于 `__entity_type__`、实体 ID 及链路类型(如 `pod->`)。须在多条 CR 配置间唯一。 | +| APIGroup | string,必填 | CRD 的 API 组,如 `argoproj.io`。 | +| APIVersion | string,必填 | 
版本,如 `v1alpha1`。 | +| Resource | string,必填 | **复数**资源名,如 `workflows`(与 `kubectl api-resources` / CRD `spec.names.plural` 一致)。 | +| Kind | string,必填 | Kubernetes **Kind**,如 `Workflow`;用于 `ownerReferences` 匹配及实体日志中的 `kind` 等。 | +| CollectEntity | bool | 为 true 时投递该 CR 的实体(entity)日志。 | +| PodLink | object,可选 | 配置后,在开启 **Pod** 的前提下可生成 **Pod→CR** 链路;需同时配置 **`Entity2PodRelation`**。 | +| Entity2PodRelation | string,可选 | entity_link 中 **`__relation_type__`**(CR 与 Pod 之间的业务关系名);与 `PodLink` 同时非空时生效。 | +| Namespace2EntityRelation | string,可选 | entity_link 中 **`__relation_type__`**(Namespace 与该 CR);需 **`CollectEntity: true`**、顶层 **`Namespace: true`** 且本字段非空;仅**有命名空间**的 CR 会生成(集群级 CR 跳过)。 | +| EnableLabels | bool | 为 true 且 `LabelAllowList` 为空时导出**全部** labels;不受顶层 `EnableLabels` 影响。默认不导出。 | +| EnableAnnotations | bool | 同上,作用于 annotations。 | +| LabelAllowList | []string | 非空时仅导出所列 label 键(与 `EnableLabels` 组合见实现逻辑)。 | +| AnnotationAllowList | []string | 非空时仅导出所列 annotation 键。 | +| StatusPathAllowList | []string | 可选,实体日志中 `status` 字段的白名单路径。 | + +**PodLink** 子字段: + +| 字段 | 说明 | +| - | - | +| OwnerKind | 匹配 Pod `ownerReferences[].Kind`,默认可与条目的 `Kind` 一致。 | +| OwnerAPIGroupContains | 与 `ownerReferences[].apiVersion` 做子串匹配;空则默认使用条目的 `APIGroup`。 | +| PodLabelKey | 无匹配 owner 时的回退标签,如 Argo 的 `workflows.argoproj.io/workflow`。 | + +### 配置示例:Argo Workflow + +```yaml + CustomResources: + - APIGroup: argoproj.io + APIVersion: v1alpha1 + Resource: workflows + Kind: Workflow + # EntityType 必填:缓存键、__entity_type__、pod->{EntityType} 等 + EntityType: argo.workflow + CollectEntity: true + Entity2PodRelation: "contains" + Namespace2EntityRelation: "contains" + PodLink: + OwnerKind: Workflow + OwnerAPIGroupContains: argoproj.io + PodLabelKey: workflows.argoproj.io/workflow + EnableLabels: true + EnableAnnotations: true +``` + +同时请确保 **RBAC** 已授予对 `argoproj.io` / `workflows` 的 `get、list、watch`(见上文「Kubernetes RBAC 权限」)。 + + ## 环境变量 如需使用HTTP查询接口,需要配置环境变量`KUBERNETES_METADATA_PORT`,指定HTTP查询接口的端口号。 From 
ff73dc5201ddc8a7454a200ae9ee145ce913ff96 Mon Sep 17 00:00:00 2001 From: StartE Date: Mon, 13 Apr 2026 03:35:02 +0000 Subject: [PATCH 06/20] update lint and doc --- .../extended/service-kubernetesmeta-v2.md | 99 +++++++++++++++---- pkg/helper/k8smeta/k8s_meta_manager.go | 4 +- 2 files changed, 82 insertions(+), 21 deletions(-) diff --git a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md index 21e3c17259..58b96fda66 100644 --- a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md +++ b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md @@ -14,7 +14,7 @@ ## 配置参数 -**注意:** 本插件需要在 Kubernetes 集群中(或具备访问 apiserver 的配置)运行,且 ServiceAccount / kubeconfig 必须具备与开启能力及 **CustomResources** 相匹配的 **RBAC**(见下文「Kubernetes RBAC 权限」)。部署为单例时需配置环境变量 `DEPLOY_MODE=singleton`、`ENABLE_KUBERNETES_META=true`。 +**注意:** 本插件需要在 Kubernetes 集群中(或具备访问 apiserver 的配置)运行,且需要有访问Kubernetes API的权限。并且部署模式为单例模式,且配置环境变量 `DEPLOY_MODE=singleton`、`ENABLE_KUBERNETES_META=true`。 | 参数 | 类型,默认值 | 说明 | | - | - | - | @@ -34,9 +34,9 @@ | PersistentVolumeClaim | bool, false | 是否采集PersistentVolumeClaim元数据。 | | StorageClass | bool, false | 是否采集StorageClass元数据。 | | Ingress | bool, false | 是否采集Ingress元数据。 | -| EnableLabels | bool, false | 是否采集**内置**Kubernetes 资源的标签(Labels)。**自定义资源(CustomResources)**是否导出标签由每条 `CustomResources.EnableLabels` 控制,不受本字段影响。 | -| EnableAnnotations | bool, false | 是否采集**内置**资源的注解(Annotations)。自定义资源是否导出注解由每条 `CustomResources.EnableAnnotations` 控制。 | -| NamespaceBlackList | []string,可选 | 全局命名空间过滤:名单内命名空间的对象**不进入 meta 缓存、不投递实体/链路事件**。`Node`、`PersistentVolume`、`StorageClass` 不按命名空间过滤。与 `NamespaceWhiteList` 同时配置时,命名空间允许条件为「未命中黑名单 **或** 在白名单中」(并集语义);多条 Pipeline 各自注册的策略之间为 **OR**。 | +| EnableLabels | bool, false | 是否采集**内置**Kubernetes 资源的标签(Labels)。 | +| EnableAnnotations | bool, false | 是否采集**内置**资源的注解(Annotations)。 | +| NamespaceBlackList | []string,可选 | 全局命名空间过滤:名单内命名空间的对象**不进入 meta 
缓存、不投递实体/链路事件**。`Node`、`PersistentVolume`、`StorageClass` 不按命名空间过滤。与 `NamespaceWhiteList` 同时配置时,命名空间允许条件为「未命中黑名单 **或** 在白名单中」(并集语义)。 | | NamespaceWhiteList | []string,可选 | 全局命名空间白名单,见上。均为空则不做命名空间限制。 | | CustomResources | []object,可选 | 第三方 CR(动态 Informer)采集与链路,见下文「**第三方自定义资源(CustomResources)**」与「**Kubernetes RBAC 权限**」。 | | Node2Pod | string, 无默认值(可选) | Node到Pod的关系名,不填则不生成关系。 | @@ -82,7 +82,57 @@ ### 内置资源 -与配置里打开的内置开关对应即可(如 `pods`、`namespaces`、`deployments` 等),按你集群现有 `ClusterRole` 规则维护。 +与配置里打开的内置开关对应即可(如 `pods`、`namespaces`、`deployments` 等)。请仅授予最小可用的 **`get`、`list`、`watch`** 权限,示例如下。 +```yaml + - apiGroups: [""] + resources: + - configmaps + - nodes + - pods + - services + - persistentvolumeclaims + - persistentvolumes + - namespaces + - endpoints + verbs: ["get", "list", "watch"] + - apiGroups: ["extensions"] + resources: + - daemonsets + - deployments + - replicasets + - ingresses + verbs: ["get", "list", "watch"] + - apiGroups: ["apps"] + resources: + - daemonsets + - deployments + - replicasets + - statefulsets + verbs: ["get", "list", "watch"] + - apiGroups: ["batch"] + resources: + - cronjobs + - jobs + verbs: ["get", "list", "watch"] + - apiGroups: + - networking.k8s.io + resources: + - ingresses + - networkpolicies + verbs: + - get + - list + - watch + - apiGroups: + - storage.k8s.io + resources: + - storageclasses + - volumeattachments + verbs: + - get + - list + - watch +``` ### 自定义资源示例:Argo Workflow @@ -141,22 +191,33 @@ ### 配置示例:Argo Workflow ```yaml +enable: true +inputs: + - Type: service_kubernetes_meta + Interval: 600 + Node: true + Pod: true + NamespaceBlackList: + - kube-system + # NamespaceWhiteList: + # - default + # - production + # Third-party CRs: configure each GVR (and optional PodLink / Entity2PodRelation) under CustomResources. 
CustomResources: - - APIGroup: argoproj.io - APIVersion: v1alpha1 - Resource: workflows - Kind: Workflow - # EntityType 必填:缓存键、__entity_type__、pod->{EntityType} 等 - EntityType: argo.workflow + - APIGroup: argoproj.io # API-Group + APIVersion: v1alpha1 # API version + Resource: workflows # 资源类型 + Kind: Workflow # kind信息 + EntityType: argo.workflow # 实体类型 CollectEntity: true - Entity2PodRelation: "contains" - Namespace2EntityRelation: "contains" - PodLink: - OwnerKind: Workflow - OwnerAPIGroupContains: argoproj.io - PodLabelKey: workflows.argoproj.io/workflow - EnableLabels: true - EnableAnnotations: true + Entity2PodRelation: "contains" + Namespace2EntityRelation: "contains" + PodLink: # 和Pod的关联关系提取 + OwnerKind: Workflow # Pod OwnerReference 对应的资源Kind + OwnerAPIGroupContains: argoproj.io # Pod OwnerReference 对应的 API-Group + PodLabelKey: workflows.argoproj.io/workflow # 兜底逻辑,从label中提取关联关系 + EnableLabels: true # CR粒度配置是否上报标签 + EnableAnnotations: true # CR粒度配置是否上报注解 ``` 同时请确保 **RBAC** 已授予对 `argoproj.io` / `workflows` 的 `get、list、watch`(见上文「Kubernetes RBAC 权限」)。 diff --git a/pkg/helper/k8smeta/k8s_meta_manager.go b/pkg/helper/k8smeta/k8s_meta_manager.go index 6b2a35c045..74eddcca78 100644 --- a/pkg/helper/k8smeta/k8s_meta_manager.go +++ b/pkg/helper/k8smeta/k8s_meta_manager.go @@ -40,9 +40,9 @@ type FlushCh struct { } type MetaManager struct { - clientset *kubernetes.Clientset + clientset *kubernetes.Clientset restConfig *rest.Config - stopCh chan struct{} + stopCh chan struct{} ready atomic.Bool From 48ebc9697db2cab5b4b9ad7d9cc73341f0db3e4e Mon Sep 17 00:00:00 2001 From: StartE Date: Mon, 13 Apr 2026 06:43:51 +0000 Subject: [PATCH 07/20] update doc and add link --- .../extended/service-kubernetesmeta-v2.md | 29 +++ pkg/helper/k8smeta/k8s_meta_cache.go | 2 + .../k8smeta/k8s_meta_cr_unified_cache.go | 12 +- .../k8s_meta_deferred_deletion_meta_store.go | 4 + pkg/helper/k8smeta/k8s_meta_informer_auth.go | 3 +- pkg/helper/k8smeta/k8s_meta_link_test.go | 194 
++++++++++++++++++ pkg/helper/k8smeta/k8s_meta_manager.go | 1 + .../k8smeta/k8s_meta_namespace_policy.go | 1 + 8 files changed, 237 insertions(+), 9 deletions(-) diff --git a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md index 58b96fda66..9140a02351 100644 --- a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md +++ b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md @@ -161,6 +161,23 @@ `CustomResources` 为数组,每一项对应一种 CR(**GVR + Kind**),由动态客户端监听;可选生成实体日志及 **Pod→CR**、**Namespace→CR** 链路。 +与内置资源相比,CR 在代码里走 **`crUnifiedCache`(dynamic informer + `unstructured`)**,内置走 **`k8sMetaCache`(typed informer)**;二者均实现 **`MetaCache`** 并共用 **`DeferredDeletionMetaStore`**,差异主要在客户端与启动时机。对比如下(**供开发与排障参考**)。 + +### 内置缓存与 CR 缓存对比 + +| 维度 | `k8sMetaCache`(内置资源) | `crUnifiedCache`(CustomResources) | +|------|---------------------------|-------------------------------------| +| **K8s 客户端** | `kubernetes.Clientset` | `dynamic.Interface`(`dynamic.NewForConfig`) | +| **Informer** | `informers.SharedInformerFactory` + 各资源 Typed Informer | `dynamicinformer.DynamicSharedInformerFactory` + `ForResource(GVR).Informer()` | +| **资源标识** | 内部 `resourceType` 常量 + `getFactoryInformer` 分支 | `schema.GroupVersionResource`(GVR),`resourceType` 多为配置中的 `EntityType` | +| **对象形态** | 具体 API 类型(如 `*v1.Pod`),经 `preProcess` 等处理 | 统一 `*unstructured.Unstructured`,`objectToUnstructured` | +| **REST / 内容协商** | 跟随 `MetaManager` 为 clientset 设置的配置(含 protobuf 等) | `restConfigForDynamicClient`:**Dynamic 使用 JSON**,与 clientset 的 protobuf 区分 | +| **何时起 watch** | `init(clientset)` 内:`metaStore.Start()` + `watch()` | `init` 不占 clientset;`setRESTConfig` 后由 `EnsureWatchStarted()`(`sync.Once`)**延迟启动** | +| **`watch` 方法** | 完整实现(factory、事件、`WaitForCacheSync` 等) | 空实现;逻辑在 `EnsureWatchStarted` 内 | +| **索引** | `getIdxRules(resourceType)`(如 Host IP 等) | `generateCommonKey` 等 CR 侧规则 | +| **体积优化** | 按资源类型的 `preProcess` | 如 
`trimWorkflowObjectForCache`(裁剪 `spec`、`managedFields` 等) | +| **与 `MetaManager.Init` 顺序 init** | 各内置 cache 的 `watch` 参与**顺序**初始化链 | `init` 轻量;避免 GVR / REST 未就绪即起 Informer | + ### 子项字段 | 字段 | 类型 | 说明 | @@ -259,3 +276,15 @@ inputs: "__time__":"1723276913" } ``` + +## CR 开发说明 + +**实现上内置与 CR 的差异**见上文「**第三方自定义资源(CustomResources)**」中的 **「内置缓存与 CR 缓存对比」** 表;源码分别位于 `pkg/helper/k8smeta/k8s_meta_cache.go` 与 `k8s_meta_cr_unified_cache.go`。 + +### 扩展 CR 时可关注的代码路径 + +* **配置与注册**:`pkg/helper/k8smeta/k8s_meta_custom_resource.go`、`MetaManager.RegisterCustomResourceCollector`(`k8s_meta_manager.go`)。 +* **采集与投递**:`plugins/input/kubernetesmetav2/meta_collector.go` 中对 `EntityType` / 链路类型的处理。 +* **命名空间策略**:`k8s_meta_namespace_policy.go`(`ObjectMetaNamespaceForFilter` 对 `unstructured` 与内置类型均已覆盖)。 + +新增 CR 时一般只需**配置层**声明 GVR 与链路字段;若需新索引、新裁剪或新事件语义,再在 **`crUnifiedCache`** 与 **`meta_collector`** 侧按现有模式扩展即可。 diff --git a/pkg/helper/k8smeta/k8s_meta_cache.go b/pkg/helper/k8smeta/k8s_meta_cache.go index 56d0b523c5..41c8ffcdad 100644 --- a/pkg/helper/k8smeta/k8s_meta_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cache.go @@ -119,6 +119,7 @@ func (m *k8sMetaCache) watch(stopCh <-chan struct{}) { m.authFailCount++ n := m.authFailCount m.authFailMu.Unlock() + // Shut down this informer once RBAC/auth failures reach informerAuthFailureStopAfter. if n >= informerAuthFailureStopAfter { m.authGiveUpOnce.Do(func() { logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "stopping informer after repeated RBAC/auth errors (no further retries)", "resourceType", m.resourceType, "failures", n) @@ -178,6 +179,7 @@ func (m *k8sMetaCache) watch(stopCh <-chan struct{}) { if !cache.WaitForCacheSync(mergedStop, informer.HasSynced) { select { case <-mergedStop: + // Stop was signaled(stop or auth fail): return so MetaManager's sequential cache init does not block other resource types. 
logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "informer cache sync aborted", "resourceType", m.resourceType) return default: diff --git a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go index fba8a614f9..e802f3d430 100644 --- a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go @@ -93,23 +93,19 @@ func (c *crUnifiedCache) setRESTConfig(cfg *rest.Config) error { } // EnsureWatchStarted starts the dynamic informer (once) when the dynamic client is ready. -// Important: never enter sync.Once when dynamicClient is nil — Once would still count as done and block forever. +// Important: never enter sync.Once when dynamicClient is nil. func (c *crUnifiedCache) EnsureWatchStarted() { c.mu.Lock() - ready := c.dynamicClient != nil + dyn := c.dynamicClient c.mu.Unlock() - if !ready { + if dyn == nil { logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "dynamic client not ready, skip custom resource informer; ensure MetaManager.Init completed") return } c.watchStartOnce.Do(func() { c.mu.Lock() - if c.dynamicClient == nil { - c.mu.Unlock() - return - } c.metaStore.Start() - c.factory = dynamicinformer.NewDynamicSharedInformerFactory(c.dynamicClient, time.Hour) + c.factory = dynamicinformer.NewDynamicSharedInformerFactory(dyn, time.Hour) c.informer = c.factory.ForResource(c.gvr).Informer() _, _ = c.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { diff --git a/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go b/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go index 28b4f424f2..2ddd0b2cae 100644 --- a/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go +++ b/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go @@ -215,6 +215,9 @@ func (m *DeferredDeletionMetaStore) handleAddOrUpdateEvent(event *K8sMetaEvent) logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "handle k8s meta 
with keyFunc error", err) return } + // Namespace policy after keyFunc: we need the store key for purgeKey. For standard namespaced + // objects, metadata.namespace is immutable, so MetaNamespaceKeyFunc keeps a stable key across + // updates; purgeKey removes Items[key] and index rows derived from the previously cached object. if !GetMetaManagerInstance().MetaObjectPassesNamespacePolicy(event.Object) { m.purgeKey(key) return @@ -273,6 +276,7 @@ func (m *DeferredDeletionMetaStore) handleDeleteEvent(event *K8sMetaEvent) { logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "handle k8s meta with keyFunc error", err) return } + // Same ordering assumption as handleAddOrUpdateEvent (namespace immutable for namespaced objects). if !GetMetaManagerInstance().MetaObjectPassesNamespacePolicy(event.Object) { m.purgeKey(key) return diff --git a/pkg/helper/k8smeta/k8s_meta_informer_auth.go b/pkg/helper/k8smeta/k8s_meta_informer_auth.go index a7ebb061af..9e9089532f 100644 --- a/pkg/helper/k8smeta/k8s_meta_informer_auth.go +++ b/pkg/helper/k8smeta/k8s_meta_informer_auth.go @@ -5,7 +5,8 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" ) -// Number of consecutive List/Watch errors treated as RBAC/auth failures before stopping this informer only. +// Consecutive RBAC/auth (Forbidden/Unauthorized) watch errors before stopping this informer. +// 3 balances a single flake vs. failing fast on permanent bad credentials/RBAC. 
const informerAuthFailureStopAfter = 3 func isInformerAuthFailure(err error) bool { diff --git a/pkg/helper/k8smeta/k8s_meta_link_test.go b/pkg/helper/k8smeta/k8s_meta_link_test.go index cd490ece04..19842df9ee 100644 --- a/pkg/helper/k8smeta/k8s_meta_link_test.go +++ b/pkg/helper/k8smeta/k8s_meta_link_test.go @@ -4,11 +4,14 @@ import ( "testing" "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" app "k8s.io/api/apps/v1" batch "k8s.io/api/batch/v1" corev1 "k8s.io/api/core/v1" networking "k8s.io/api/networking/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + "k8s.io/apimachinery/pkg/runtime/schema" ) func TestGetPodNodeLink(t *testing.T) { @@ -1647,3 +1650,194 @@ func generateMockPod(index string) *ObjectWrapper { }, } } + +func testArgoWorkflowCR(name string) *unstructured.Unstructured { + u := &unstructured.Unstructured{} + u.SetAPIVersion("argoproj.io/v1alpha1") + u.SetKind("Workflow") + u.SetNamespace("default") + u.SetName(name) + return u +} + +func testPodCRLinkRuntime(entityType string) *podCRLinkRuntime { + return &podCRLinkRuntime{ + entityType: entityType, + ownerKind: "Workflow", + ownerAPIGroupSubstr: "argoproj.io", + podLabelKey: "workflows.argoproj.io/workflow", + } +} + +// TestGetPodCustomResourceLinkViaOwnerReference covers Pod→CR resolution when the Pod has a matching Workflow ownerReference. 
+func TestGetPodCustomResourceLinkViaOwnerReference(t *testing.T) { + stopCh := make(chan struct{}) + defer close(stopCh) + entityType := "argo.workflow" + linkType := PodLinkTypeForEntity(entityType) + + podCache := newK8sMetaCache(stopCh, POD) + crCache := newCRUnifiedCache(stopCh, entityType, schema.GroupVersionResource{Group: "argoproj.io", Version: "v1alpha1", Resource: "workflows"}) + crCache.metaStore.handleAddOrUpdateEvent(&K8sMetaEvent{ + EventType: EventTypeAdd, + Object: &ObjectWrapper{ + ResourceType: entityType, + Raw: testArgoWorkflowCR("my-wf"), + }, + }) + + podW := generateMockPod("1") + podW.ResourceType = POD + podW.Raw.(*corev1.Pod).OwnerReferences = []metav1.OwnerReference{{ + APIVersion: "argoproj.io/v1alpha1", + Kind: "Workflow", + Name: "my-wf", + UID: "uid-wf", + }} + podCache.metaStore.handleAddOrUpdateEvent(&K8sMetaEvent{EventType: EventTypeAdd, Object: podW}) + + lg := NewK8sMetaLinkGenerator(map[string]MetaCache{ + POD: podCache, + entityType: crCache, + }) + lg.registerPodCRLink(linkType, testPodCRLinkRuntime(entityType)) + + podList := []*K8sMetaEvent{{EventType: EventTypeUpdate, Object: podCache.metaStore.Items["default/pod1"]}} + results := lg.GenerateLinks(podList, linkType) + require.Len(t, results, 1) + pcr, ok := results[0].Object.Raw.(*PodCustomResource) + require.True(t, ok) + assert.Equal(t, "my-wf", pcr.CR.GetName()) + assert.Equal(t, "pod1", pcr.Pod.Name) + assert.Equal(t, linkType, results[0].Object.ResourceType) +} + +// TestGetPodCustomResourceLinkViaLabelFallback covers Pod→CR when ownerReferences do not match but PodLabelKey is set. 
+func TestGetPodCustomResourceLinkViaLabelFallback(t *testing.T) { + stopCh := make(chan struct{}) + defer close(stopCh) + entityType := "argo.workflow" + linkType := PodLinkTypeForEntity(entityType) + + podCache := newK8sMetaCache(stopCh, POD) + crCache := newCRUnifiedCache(stopCh, entityType, schema.GroupVersionResource{Group: "argoproj.io", Version: "v1alpha1", Resource: "workflows"}) + crCache.metaStore.handleAddOrUpdateEvent(&K8sMetaEvent{ + EventType: EventTypeAdd, + Object: &ObjectWrapper{ + ResourceType: entityType, + Raw: testArgoWorkflowCR("wf-from-label"), + }, + }) + + podW := generateMockPod("2") + podW.ResourceType = POD + pod := podW.Raw.(*corev1.Pod) + pod.OwnerReferences = []metav1.OwnerReference{{ + APIVersion: "apps/v1", + Kind: "ReplicaSet", + Name: "some-rs", + }} + pod.Labels = map[string]string{"workflows.argoproj.io/workflow": "wf-from-label"} + podCache.metaStore.handleAddOrUpdateEvent(&K8sMetaEvent{EventType: EventTypeAdd, Object: podW}) + + lg := NewK8sMetaLinkGenerator(map[string]MetaCache{ + POD: podCache, + entityType: crCache, + }) + lg.registerPodCRLink(linkType, testPodCRLinkRuntime(entityType)) + + podList := []*K8sMetaEvent{{EventType: EventTypeUpdate, Object: podCache.metaStore.Items["default/pod2"]}} + results := lg.GenerateLinks(podList, linkType) + require.Len(t, results, 1) + pcr := results[0].Object.Raw.(*PodCustomResource) + assert.Equal(t, "wf-from-label", pcr.CR.GetName()) +} + +// TestGetCustomResourceNamespaceLink covers Namespace→namespaced CR links using the namespace cache. 
+func TestGetCustomResourceNamespaceLink(t *testing.T) { + stopCh := make(chan struct{}) + defer close(stopCh) + entityType := "argo.workflow" + linkType := NamespaceLinkTypeForEntity(entityType) + + nsCache := newK8sMetaCache(stopCh, NAMESPACE) + nsCache.metaStore.handleAddOrUpdateEvent(&K8sMetaEvent{ + EventType: EventTypeAdd, + Object: generateMockNamespace("default"), + }) + + wf := testArgoWorkflowCR("ns-linked-wf") + events := []*K8sMetaEvent{{ + EventType: EventTypeAdd, + Object: &ObjectWrapper{ + ResourceType: entityType, + Raw: wf, + }, + }} + + lg := NewK8sMetaLinkGenerator(map[string]MetaCache{NAMESPACE: nsCache}) + results := lg.GenerateLinks(events, linkType) + require.Len(t, results, 1) + ncr, ok := results[0].Object.Raw.(*NamespaceCustomResource) + require.True(t, ok) + assert.Equal(t, "default", ncr.Namespace.Name) + assert.Equal(t, "ns-linked-wf", ncr.CR.GetName()) + assert.Equal(t, linkType, results[0].Object.ResourceType) +} + +// TestGetPodCustomResourceLinkMissingCRCache verifies GenerateLinks returns nil when the CR MetaCache entry is absent. +func TestGetPodCustomResourceLinkMissingCRCache(t *testing.T) { + stopCh := make(chan struct{}) + defer close(stopCh) + entityType := "argo.workflow" + linkType := PodLinkTypeForEntity(entityType) + + podCache := newK8sMetaCache(stopCh, POD) + podW := generateMockPod("3") + podW.ResourceType = POD + podW.Raw.(*corev1.Pod).OwnerReferences = []metav1.OwnerReference{{ + APIVersion: "argoproj.io/v1alpha1", + Kind: "Workflow", + Name: "ghost", + }} + podCache.metaStore.handleAddOrUpdateEvent(&K8sMetaEvent{EventType: EventTypeAdd, Object: podW}) + + // Intentionally omit entityType from metaCache (no CR cache registered). 
+ lg := NewK8sMetaLinkGenerator(map[string]MetaCache{POD: podCache}) + lg.registerPodCRLink(linkType, testPodCRLinkRuntime(entityType)) + + podList := []*K8sMetaEvent{{EventType: EventTypeUpdate, Object: podCache.metaStore.Items["default/pod3"]}} + results := lg.GenerateLinks(podList, linkType) + assert.Nil(t, results) +} + +// TestGetPodCustomResourceLinkCRCacheHitMiss verifies no links when the CR is not present in cache (cache exists, Get empty). +func TestGetPodCustomResourceLinkCRCacheHitMiss(t *testing.T) { + stopCh := make(chan struct{}) + defer close(stopCh) + entityType := "argo.workflow" + linkType := PodLinkTypeForEntity(entityType) + + podCache := newK8sMetaCache(stopCh, POD) + crCache := newCRUnifiedCache(stopCh, entityType, schema.GroupVersionResource{Group: "argoproj.io", Version: "v1alpha1", Resource: "workflows"}) + // CR cache empty: Pod points to a workflow name not in cache. + + podW := generateMockPod("4") + podW.ResourceType = POD + podW.Raw.(*corev1.Pod).OwnerReferences = []metav1.OwnerReference{{ + APIVersion: "argoproj.io/v1alpha1", + Kind: "Workflow", + Name: "not-in-cache", + }} + podCache.metaStore.handleAddOrUpdateEvent(&K8sMetaEvent{EventType: EventTypeAdd, Object: podW}) + + lg := NewK8sMetaLinkGenerator(map[string]MetaCache{ + POD: podCache, + entityType: crCache, + }) + lg.registerPodCRLink(linkType, testPodCRLinkRuntime(entityType)) + + podList := []*K8sMetaEvent{{EventType: EventTypeUpdate, Object: podCache.metaStore.Items["default/pod4"]}} + results := lg.GenerateLinks(podList, linkType) + assert.Empty(t, results) +} diff --git a/pkg/helper/k8smeta/k8s_meta_manager.go b/pkg/helper/k8smeta/k8s_meta_manager.go index 74eddcca78..c1dd8afd42 100644 --- a/pkg/helper/k8smeta/k8s_meta_manager.go +++ b/pkg/helper/k8smeta/k8s_meta_manager.go @@ -159,6 +159,7 @@ func (m *MetaManager) Init(configPath string) (err error) { m.clientset = clientset m.restConfig = config + // CR dynamic client: setRESTConfig errors are logged only; graceful 
degradation, built-in meta still starts. m.cacheMu.Lock() for _, c := range m.cacheMap { if uc, ok := c.(*crUnifiedCache); ok { diff --git a/pkg/helper/k8smeta/k8s_meta_namespace_policy.go b/pkg/helper/k8smeta/k8s_meta_namespace_policy.go index 0b624ff727..2e8f8dc258 100644 --- a/pkg/helper/k8smeta/k8s_meta_namespace_policy.go +++ b/pkg/helper/k8smeta/k8s_meta_namespace_policy.go @@ -58,6 +58,7 @@ func (p *namespacePolicy) allowsNamespace(ns string) bool { if !p.hasWhite { return !inB } + // Allow if: (1) not in blacklist, OR (2) in whitelist return !inB || inW } From 831e2a7432fa7102e300aba9b989baa652333faa Mon Sep 17 00:00:00 2001 From: StartE Date: Mon, 13 Apr 2026 06:56:27 +0000 Subject: [PATCH 08/20] update --- pkg/helper/k8smeta/k8s_meta_link_test.go | 2 + .../meta_collector_cr_test.go | 120 ++++++++++++++++++ 2 files changed, 122 insertions(+) create mode 100644 plugins/input/kubernetesmetav2/meta_collector_cr_test.go diff --git a/pkg/helper/k8smeta/k8s_meta_link_test.go b/pkg/helper/k8smeta/k8s_meta_link_test.go index 19842df9ee..23f6c36126 100644 --- a/pkg/helper/k8smeta/k8s_meta_link_test.go +++ b/pkg/helper/k8smeta/k8s_meta_link_test.go @@ -1651,6 +1651,8 @@ func generateMockPod(index string) *ObjectWrapper { } } +// Namespace policy is not tested here: CR link tests use handleAddOrUpdateEvent on the store only; +// MetaObjectPassesNamespacePolicy runs on the global singleton with no registered policies (allow-all). 
func testArgoWorkflowCR(name string) *unstructured.Unstructured { u := &unstructured.Unstructured{} u.SetAPIVersion("argoproj.io/v1alpha1") diff --git a/plugins/input/kubernetesmetav2/meta_collector_cr_test.go b/plugins/input/kubernetesmetav2/meta_collector_cr_test.go new file mode 100644 index 0000000000..dcb6837b6b --- /dev/null +++ b/plugins/input/kubernetesmetav2/meta_collector_cr_test.go @@ -0,0 +1,120 @@ +package kubernetesmetav2 + +import ( + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" + + "github.com/alibaba/ilogtail/pkg/helper/k8smeta" + "github.com/alibaba/ilogtail/pkg/models" +) + +func TestProcessPodCustomResourceLink(t *testing.T) { + entityType := "argo.workflow" + linkRT := k8smeta.POD + k8smeta.LINK_SPLIT_CHARACTER + entityType + + pod := &corev1.Pod{ + ObjectMeta: metav1.ObjectMeta{Name: "pod1", Namespace: "default"}, + } + cr := &unstructured.Unstructured{} + cr.SetAPIVersion("argoproj.io/v1alpha1") + cr.SetKind("Workflow") + cr.SetNamespace("default") + cr.SetName("wf1") + + m := &metaCollector{ + serviceK8sMeta: &ServiceK8sMeta{}, + crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{ + entityType: { + EntityType: entityType, + Entity2PodRelation: "contains", + }, + }, + } + data := &k8smeta.ObjectWrapper{ + ResourceType: linkRT, + Raw: &k8smeta.PodCustomResource{ + Pod: pod, + CR: cr, + }, + FirstObservedTime: 1, + LastObservedTime: 2, + } + events := m.processPodCustomResourceLink(data, "update") + require.Len(t, events, 1) + log, ok := events[0].(*models.Log) + require.True(t, ok) + rel := log.Contents.Get(entityLinkRelationTypeFieldName) + assert.Equal(t, "contains", rel) +} + +func TestProcessNamespaceCustomResourceLink(t *testing.T) { + entityType := "argo.workflow" + linkRT := entityType + k8smeta.LINK_SPLIT_CHARACTER + k8smeta.NAMESPACE + + ns := 
&corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "production"}} + cr := &unstructured.Unstructured{} + cr.SetAPIVersion("argoproj.io/v1alpha1") + cr.SetKind("Workflow") + cr.SetNamespace("production") + cr.SetName("wf-ns") + + m := &metaCollector{ + serviceK8sMeta: &ServiceK8sMeta{}, + crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{ + entityType: { + EntityType: entityType, + Namespace2EntityRelation: "contains", + }, + }, + } + data := &k8smeta.ObjectWrapper{ + ResourceType: linkRT, + Raw: &k8smeta.NamespaceCustomResource{ + Namespace: ns, + CR: cr, + }, + FirstObservedTime: 10, + LastObservedTime: 20, + } + events := m.processNamespaceCustomResourceLink(data, "update") + require.Len(t, events, 1) + log, ok := events[0].(*models.Log) + require.True(t, ok) + assert.Equal(t, "contains", log.Contents.Get(entityLinkRelationTypeFieldName)) +} + +func TestProcessNamespaceCustomResourceLinkSkipsWhenRelationUnset(t *testing.T) { + entityType := "argo.workflow" + linkRT := entityType + k8smeta.LINK_SPLIT_CHARACTER + k8smeta.NAMESPACE + m := &metaCollector{ + serviceK8sMeta: &ServiceK8sMeta{}, + crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{ + entityType: {EntityType: entityType}, + }, + } + data := &k8smeta.ObjectWrapper{ + ResourceType: linkRT, + Raw: &k8smeta.NamespaceCustomResource{ + Namespace: &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "x"}}, + CR: &unstructured.Unstructured{}, + }, + } + assert.Nil(t, m.processNamespaceCustomResourceLink(data, "update")) +} + +func TestProcessPodCustomResourceLinkWrongRawType(t *testing.T) { + m := &metaCollector{ + serviceK8sMeta: &ServiceK8sMeta{}, + crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{"argo.workflow": {}}, + } + data := &k8smeta.ObjectWrapper{ + ResourceType: k8smeta.POD + k8smeta.LINK_SPLIT_CHARACTER + "argo.workflow", + Raw: &corev1.Pod{}, + } + assert.Nil(t, m.processPodCustomResourceLink(data, "update")) +} From 93f3f0386e08702e010e45fb6fda8821d0e1f0e7 Mon Sep 
17 00:00:00 2001 From: StartE Date: Mon, 13 Apr 2026 07:45:29 +0000 Subject: [PATCH 09/20] fix lint and optimize comment --- .../extended/service-kubernetesmeta-v2.md | 2 +- .../k8smeta/k8s_meta_cr_unified_cache.go | 15 +- .../k8smeta/k8s_meta_custom_resource.go | 7 - pkg/helper/k8smeta/k8s_meta_manager.go | 11 +- .../kubernetesmetav2/meta_collector_cr.go | 18 +- .../meta_collector_cr_test.go | 170 ++++++++++++++++++ 6 files changed, 206 insertions(+), 17 deletions(-) diff --git a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md index 9140a02351..5e20c81823 100644 --- a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md +++ b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md @@ -175,7 +175,7 @@ | **何时起 watch** | `init(clientset)` 内:`metaStore.Start()` + `watch()` | `init` 不占 clientset;`setRESTConfig` 后由 `EnsureWatchStarted()`(`sync.Once`)**延迟启动** | | **`watch` 方法** | 完整实现(factory、事件、`WaitForCacheSync` 等) | 空实现;逻辑在 `EnsureWatchStarted` 内 | | **索引** | `getIdxRules(resourceType)`(如 Host IP 等) | `generateCommonKey` 等 CR 侧规则 | -| **体积优化** | 按资源类型的 `preProcess` | 如 `trimWorkflowObjectForCache`(裁剪 `spec`、`managedFields` 等) | +| **体积优化** | 按资源类型的 `preProcess` | 如 `trimCRObjectForCache`(裁剪 `spec`、`managedFields` 等) | | **与 `MetaManager.Init` 顺序 init** | 各内置 cache 的 `watch` 参与**顺序**初始化链 | `init` 轻量;避免 GVR / REST 未就绪即起 Informer | ### 子项字段 diff --git a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go index e802f3d430..784e2d24ef 100644 --- a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go @@ -107,14 +107,14 @@ func (c *crUnifiedCache) EnsureWatchStarted() { c.metaStore.Start() c.factory = dynamicinformer.NewDynamicSharedInformerFactory(dyn, time.Hour) c.informer = c.factory.ForResource(c.gvr).Informer() - _, _ = c.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ + _, err := 
c.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { defer panicRecover() u := objectToUnstructured(obj) if u == nil { return } - trimWorkflowObjectForCache(u) + trimCRObjectForCache(u) now := time.Now().Unix() c.eventCh <- &K8sMetaEvent{ EventType: EventTypeAdd, @@ -133,7 +133,7 @@ func (c *crUnifiedCache) EnsureWatchStarted() { if u == nil { return } - trimWorkflowObjectForCache(u) + trimCRObjectForCache(u) now := time.Now().Unix() c.eventCh <- &K8sMetaEvent{ EventType: EventTypeUpdate, @@ -152,7 +152,7 @@ func (c *crUnifiedCache) EnsureWatchStarted() { if u == nil { return } - trimWorkflowObjectForCache(u) + trimCRObjectForCache(u) c.eventCh <- &K8sMetaEvent{ EventType: EventTypeDelete, Object: &ObjectWrapper{ @@ -164,6 +164,9 @@ func (c *crUnifiedCache) EnsureWatchStarted() { metaManager.deleteEventCount.Add(1) }, }) + if err != nil { + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "fail to add dynamic informer event handler", err, "resourceType", c.resourceType, "gvr", c.gvr.String()) + } if err := c.informer.SetWatchErrorHandler(func(_ *cache.Reflector, err error) { if err != nil { logger.Error(context.Background(), K8sMetaUnifyErrorCode, "resourceType", c.resourceType, "watchError", err) @@ -260,8 +263,8 @@ func objectToUnstructured(obj interface{}) *unstructured.Unstructured { return nil } -// trimWorkflowObjectForCache drops spec and managedFields to limit memory; metadata + status remain for linking and whitelisted export. -func trimWorkflowObjectForCache(u *unstructured.Unstructured) { +// trimCRObjectForCache drops spec and managedFields to limit memory; metadata + status remain for linking and whitelisted export. 
+func trimCRObjectForCache(u *unstructured.Unstructured) { if u == nil { return } diff --git a/pkg/helper/k8smeta/k8s_meta_custom_resource.go b/pkg/helper/k8smeta/k8s_meta_custom_resource.go index 09ec548041..39ef46bc15 100644 --- a/pkg/helper/k8smeta/k8s_meta_custom_resource.go +++ b/pkg/helper/k8smeta/k8s_meta_custom_resource.go @@ -105,10 +105,3 @@ func (c *CustomResourceCollectorConfig) Normalize() error { } return nil } - -func firstNonEmpty(val, def string) string { - if strings.TrimSpace(val) != "" { - return val - } - return def -} diff --git a/pkg/helper/k8smeta/k8s_meta_manager.go b/pkg/helper/k8smeta/k8s_meta_manager.go index c1dd8afd42..39edd6068c 100644 --- a/pkg/helper/k8smeta/k8s_meta_manager.go +++ b/pkg/helper/k8smeta/k8s_meta_manager.go @@ -104,6 +104,7 @@ func (m *MetaManager) RegisterCustomResourceCollector(cfg CustomResourceCollecto if m.restConfig != nil { if uc, ok := m.cacheMap[cfg.EntityType].(*crUnifiedCache); ok { if err := uc.setRESTConfig(m.restConfig); err != nil { + // Graceful degradation: dynamicClient unset; crUnifiedCache.EnsureWatchStarted skips when client is nil (this CR informer does not run). logger.Error(context.Background(), K8sMetaUnifyErrorCode, "setRESTConfig for custom resource cache", err, "entityType", cfg.EntityType) } } @@ -159,7 +160,8 @@ func (m *MetaManager) Init(configPath string) (err error) { m.clientset = clientset m.restConfig = config - // CR dynamic client: setRESTConfig errors are logged only; graceful degradation, built-in meta still starts. + // CR dynamic client: setRESTConfig errors are logged only (graceful degradation; built-in meta still starts). + // Failed caches keep dynamicClient nil; EnsureWatchStarted skips there (no CR informer until a successful setRESTConfig, e.g. after restart). 
m.cacheMu.Lock() for _, c := range m.cacheMap { if uc, ok := c.(*crUnifiedCache); ok { @@ -328,6 +330,13 @@ func (m *MetaManager) runServer() { go m.metadataHandler.K8sServerRun(m.stopCh) } +func firstNonEmpty(val, def string) string { + if strings.TrimSpace(val) != "" { + return val + } + return def +} + func isEntity(resourceType string) bool { return !strings.Contains(resourceType, LINK_SPLIT_CHARACTER) } diff --git a/plugins/input/kubernetesmetav2/meta_collector_cr.go b/plugins/input/kubernetesmetav2/meta_collector_cr.go index 1ac29c5526..aa2a2aa8ea 100644 --- a/plugins/input/kubernetesmetav2/meta_collector_cr.go +++ b/plugins/input/kubernetesmetav2/meta_collector_cr.go @@ -115,7 +115,14 @@ func (m *metaCollector) processNamespaceCustomResourceLink(data *k8smeta.ObjectW if !ok { return nil } - entityType := strings.TrimSuffix(data.ResourceType, k8smeta.LINK_SPLIT_CHARACTER+k8smeta.NAMESPACE) + nsLinkSuffix := k8smeta.LINK_SPLIT_CHARACTER + k8smeta.NAMESPACE + if !strings.HasSuffix(data.ResourceType, nsLinkSuffix) { + return nil + } + entityType := strings.TrimSuffix(data.ResourceType, nsLinkSuffix) + if entityType == "" { + return nil + } cfg := m.crConfigs[entityType] if cfg.EntityType == "" || cfg.Namespace2EntityRelation == "" { return nil @@ -134,7 +141,14 @@ func (m *metaCollector) processPodCustomResourceLink(data *k8smeta.ObjectWrapper if !ok { return nil } - entityType := strings.TrimPrefix(data.ResourceType, k8smeta.POD+k8smeta.LINK_SPLIT_CHARACTER) + podCRPrefix := k8smeta.POD + k8smeta.LINK_SPLIT_CHARACTER + if !strings.HasPrefix(data.ResourceType, podCRPrefix) { + return nil + } + entityType := strings.TrimPrefix(data.ResourceType, podCRPrefix) + if entityType == "" { + return nil + } cfg := m.crConfigs[entityType] log := &models.Log{} log.Contents = models.NewLogContents() diff --git a/plugins/input/kubernetesmetav2/meta_collector_cr_test.go b/plugins/input/kubernetesmetav2/meta_collector_cr_test.go index dcb6837b6b..6269e85d94 100644 --- 
a/plugins/input/kubernetesmetav2/meta_collector_cr_test.go +++ b/plugins/input/kubernetesmetav2/meta_collector_cr_test.go @@ -13,6 +13,19 @@ import ( "github.com/alibaba/ilogtail/pkg/models" ) +func testWorkflowUnstructured(t *testing.T) *unstructured.Unstructured { + t.Helper() + u := &unstructured.Unstructured{} + u.SetAPIVersion("argoproj.io/v1alpha1") + u.SetKind("Workflow") + u.SetNamespace("default") + u.SetName("wf-entity") + u.SetLabels(map[string]string{"keep": "yes", "drop": "no"}) + u.SetAnnotations(map[string]string{"anno": "v"}) + require.NoError(t, unstructured.SetNestedField(u.Object, "Running", "status", "phase")) + return u +} + func TestProcessPodCustomResourceLink(t *testing.T) { entityType := "argo.workflow" linkRT := k8smeta.POD + k8smeta.LINK_SPLIT_CHARACTER + entityType @@ -107,6 +120,38 @@ func TestProcessNamespaceCustomResourceLinkSkipsWhenRelationUnset(t *testing.T) assert.Nil(t, m.processNamespaceCustomResourceLink(data, "update")) } +func TestProcessNamespaceCustomResourceLinkRejectsNonNamespaceLinkResourceType(t *testing.T) { + m := &metaCollector{ + crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{ + "argo.workflow": {EntityType: "argo.workflow", Namespace2EntityRelation: "contains"}, + }, + } + data := &k8smeta.ObjectWrapper{ + ResourceType: "argo.workflow", + Raw: &k8smeta.NamespaceCustomResource{ + Namespace: &corev1.Namespace{ObjectMeta: metav1.ObjectMeta{Name: "default"}}, + CR: &unstructured.Unstructured{}, + }, + } + assert.Nil(t, m.processNamespaceCustomResourceLink(data, "update")) +} + +func TestProcessPodCustomResourceLinkRejectsNonPodCRResourceType(t *testing.T) { + m := &metaCollector{ + crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{ + "argo.workflow": {EntityType: "argo.workflow", Entity2PodRelation: "contains"}, + }, + } + data := &k8smeta.ObjectWrapper{ + ResourceType: "argo.workflow", + Raw: &k8smeta.PodCustomResource{ + Pod: &corev1.Pod{}, + CR: &unstructured.Unstructured{}, + }, + } + 
assert.Nil(t, m.processPodCustomResourceLink(data, "update")) +} + func TestProcessPodCustomResourceLinkWrongRawType(t *testing.T) { m := &metaCollector{ serviceK8sMeta: &ServiceK8sMeta{}, @@ -118,3 +163,128 @@ func TestProcessPodCustomResourceLinkWrongRawType(t *testing.T) { } assert.Nil(t, m.processPodCustomResourceLink(data, "update")) } + +func stringField(t *testing.T, log *models.Log, key string) string { + t.Helper() + v := log.Contents.Get(key) + require.NotNil(t, v) + s, ok := v.(string) + require.True(t, ok) + return s +} + +func TestProcessCustomResourceEntityUnknownResourceType(t *testing.T) { + m := &metaCollector{ + serviceK8sMeta: &ServiceK8sMeta{Interval: 10}, + crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{}, + } + data := &k8smeta.ObjectWrapper{ + ResourceType: "unknown.type", + Raw: testWorkflowUnstructured(t), + } + assert.Nil(t, m.processCustomResourceEntity(data, "update")) +} + +func TestProcessCustomResourceEntityWrongRawType(t *testing.T) { + entityType := "argo.workflow" + m := &metaCollector{ + serviceK8sMeta: &ServiceK8sMeta{Interval: 10}, + crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{ + entityType: {EntityType: entityType, Kind: "Workflow"}, + }, + } + data := &k8smeta.ObjectWrapper{ResourceType: entityType, Raw: &corev1.Pod{}} + assert.Nil(t, m.processCustomResourceEntity(data, "update")) +} + +func TestProcessCustomResourceEntityCoreFields(t *testing.T) { + entityType := "argo.workflow" + cfg := k8smeta.CustomResourceCollectorConfig{EntityType: entityType, Kind: "Workflow"} + m := &metaCollector{ + serviceK8sMeta: &ServiceK8sMeta{Interval: 10}, + crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{entityType: cfg}, + } + data := &k8smeta.ObjectWrapper{ + ResourceType: entityType, + Raw: testWorkflowUnstructured(t), + FirstObservedTime: 100, + LastObservedTime: 200, + } + events := m.processCustomResourceEntity(data, "update") + require.Len(t, events, 1) + log := events[0].(*models.Log) + 
assert.Equal(t, "Workflow", stringField(t, log, entityKindFieldName)) + assert.Equal(t, "argoproj.io/v1alpha1", stringField(t, log, "api_version")) + assert.Equal(t, "default", stringField(t, log, "namespace")) + assert.False(t, log.Contents.Contains("labels")) + assert.False(t, log.Contents.Contains("annotations")) + assert.False(t, log.Contents.Contains("status")) +} + +func TestProcessCustomResourceEntityEnableLabels(t *testing.T) { + entityType := "argo.workflow" + cfg := k8smeta.CustomResourceCollectorConfig{EntityType: entityType, Kind: "Workflow", EnableLabels: true} + m := &metaCollector{ + serviceK8sMeta: &ServiceK8sMeta{Interval: 10}, + crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{entityType: cfg}, + } + data := &k8smeta.ObjectWrapper{ResourceType: entityType, Raw: testWorkflowUnstructured(t)} + events := m.processCustomResourceEntity(data, "update") + require.Len(t, events, 1) + log := events[0].(*models.Log) + labels := stringField(t, log, "labels") + assert.Contains(t, labels, "keep") + assert.Contains(t, labels, "drop") +} + +func TestProcessCustomResourceEntityLabelAllowList(t *testing.T) { + entityType := "argo.workflow" + cfg := k8smeta.CustomResourceCollectorConfig{ + EntityType: entityType, Kind: "Workflow", + LabelAllowList: []string{"keep"}, + } + m := &metaCollector{ + serviceK8sMeta: &ServiceK8sMeta{Interval: 10}, + crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{entityType: cfg}, + } + data := &k8smeta.ObjectWrapper{ResourceType: entityType, Raw: testWorkflowUnstructured(t)} + events := m.processCustomResourceEntity(data, "update") + require.Len(t, events, 1) + log := events[0].(*models.Log) + labels := stringField(t, log, "labels") + assert.Contains(t, labels, "keep") + assert.NotContains(t, labels, "drop") +} + +func TestProcessCustomResourceEntityStatusPathAllowList(t *testing.T) { + entityType := "argo.workflow" + cfg := k8smeta.CustomResourceCollectorConfig{ + EntityType: entityType, Kind: "Workflow", + 
StatusPathAllowList: []string{"status.phase"}, + } + m := &metaCollector{ + serviceK8sMeta: &ServiceK8sMeta{Interval: 10}, + crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{entityType: cfg}, + } + data := &k8smeta.ObjectWrapper{ResourceType: entityType, Raw: testWorkflowUnstructured(t)} + events := m.processCustomResourceEntity(data, "update") + require.Len(t, events, 1) + log := events[0].(*models.Log) + status := stringField(t, log, "status") + assert.Contains(t, status, "Running") +} + +func TestProcessCustomResourceEntityEnableAnnotations(t *testing.T) { + entityType := "argo.workflow" + cfg := k8smeta.CustomResourceCollectorConfig{EntityType: entityType, Kind: "Workflow", EnableAnnotations: true} + m := &metaCollector{ + serviceK8sMeta: &ServiceK8sMeta{Interval: 10}, + crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{entityType: cfg}, + } + data := &k8smeta.ObjectWrapper{ResourceType: entityType, Raw: testWorkflowUnstructured(t)} + events := m.processCustomResourceEntity(data, "update") + require.Len(t, events, 1) + log := events[0].(*models.Log) + annos := stringField(t, log, "annotations") + assert.Contains(t, annos, "anno") +} From d2e9d3ea5ec6337a955fae008a25288303c92907 Mon Sep 17 00:00:00 2001 From: StartE Date: Mon, 13 Apr 2026 09:15:58 +0000 Subject: [PATCH 10/20] add comment necessary --- pkg/helper/k8smeta/k8s_meta_link_test.go | 11 ++++++----- pkg/helper/k8smeta/k8s_meta_namespace_policy_test.go | 2 +- plugins/input/kubernetesmetav2/meta_collector.go | 2 +- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/pkg/helper/k8smeta/k8s_meta_link_test.go b/pkg/helper/k8smeta/k8s_meta_link_test.go index 23f6c36126..de93b41513 100644 --- a/pkg/helper/k8smeta/k8s_meta_link_test.go +++ b/pkg/helper/k8smeta/k8s_meta_link_test.go @@ -1662,7 +1662,8 @@ func testArgoWorkflowCR(name string) *unstructured.Unstructured { return u } -func testPodCRLinkRuntime(entityType string) *podCRLinkRuntime { +func testPodCRLinkRuntime() 
*podCRLinkRuntime { + const entityType = "argo.workflow" return &podCRLinkRuntime{ entityType: entityType, ownerKind: "Workflow", @@ -1702,7 +1703,7 @@ func TestGetPodCustomResourceLinkViaOwnerReference(t *testing.T) { POD: podCache, entityType: crCache, }) - lg.registerPodCRLink(linkType, testPodCRLinkRuntime(entityType)) + lg.registerPodCRLink(linkType, testPodCRLinkRuntime()) podList := []*K8sMetaEvent{{EventType: EventTypeUpdate, Object: podCache.metaStore.Items["default/pod1"]}} results := lg.GenerateLinks(podList, linkType) @@ -1746,7 +1747,7 @@ func TestGetPodCustomResourceLinkViaLabelFallback(t *testing.T) { POD: podCache, entityType: crCache, }) - lg.registerPodCRLink(linkType, testPodCRLinkRuntime(entityType)) + lg.registerPodCRLink(linkType, testPodCRLinkRuntime()) podList := []*K8sMetaEvent{{EventType: EventTypeUpdate, Object: podCache.metaStore.Items["default/pod2"]}} results := lg.GenerateLinks(podList, linkType) @@ -1806,7 +1807,7 @@ func TestGetPodCustomResourceLinkMissingCRCache(t *testing.T) { // Intentionally omit entityType from metaCache (no CR cache registered). 
lg := NewK8sMetaLinkGenerator(map[string]MetaCache{POD: podCache}) - lg.registerPodCRLink(linkType, testPodCRLinkRuntime(entityType)) + lg.registerPodCRLink(linkType, testPodCRLinkRuntime()) podList := []*K8sMetaEvent{{EventType: EventTypeUpdate, Object: podCache.metaStore.Items["default/pod3"]}} results := lg.GenerateLinks(podList, linkType) @@ -1837,7 +1838,7 @@ func TestGetPodCustomResourceLinkCRCacheHitMiss(t *testing.T) { POD: podCache, entityType: crCache, }) - lg.registerPodCRLink(linkType, testPodCRLinkRuntime(entityType)) + lg.registerPodCRLink(linkType, testPodCRLinkRuntime()) podList := []*K8sMetaEvent{{EventType: EventTypeUpdate, Object: podCache.metaStore.Items["default/pod4"]}} results := lg.GenerateLinks(podList, linkType) diff --git a/pkg/helper/k8smeta/k8s_meta_namespace_policy_test.go b/pkg/helper/k8smeta/k8s_meta_namespace_policy_test.go index 9797cc7977..ff55931240 100644 --- a/pkg/helper/k8smeta/k8s_meta_namespace_policy_test.go +++ b/pkg/helper/k8smeta/k8s_meta_namespace_policy_test.go @@ -60,7 +60,7 @@ func TestObjectMetaNamespaceForFilter(t *testing.T) { node := &v1.Node{ ObjectMeta: metav1.ObjectMeta{Name: "n1"}, } - ns, cluster = ObjectMetaNamespaceForFilter(NODE, node) + _, cluster = ObjectMetaNamespaceForFilter(NODE, node) if !cluster { t.Fatalf("node should be cluster scoped") } diff --git a/plugins/input/kubernetesmetav2/meta_collector.go b/plugins/input/kubernetesmetav2/meta_collector.go index f33cd0406d..f8969b979d 100644 --- a/plugins/input/kubernetesmetav2/meta_collector.go +++ b/plugins/input/kubernetesmetav2/meta_collector.go @@ -30,7 +30,7 @@ type metaCollector struct { entityLinkBuffer chan models.PipelineEvent stopCh chan struct{} - namespacePolicyID int + namespacePolicyID int // -1 when no namespace policy is registered entityProcessor map[string]ProcessFunc crConfigs map[string]k8smeta.CustomResourceCollectorConfig } From b7a742aaff24acbfabfb081f01db9462c8cecf9b Mon Sep 17 00:00:00 2001 From: StartE Date: Tue, 14 Apr 2026 
06:32:45 +0000 Subject: [PATCH 11/20] update --- .../extended/service-kubernetesmeta-v2.md | 14 ++++- .../k8smeta/k8s_meta_cr_unified_cache.go | 60 +++++++++++++------ ..._meta_deferred_deletion_meta_store_test.go | 1 + 3 files changed, 55 insertions(+), 20 deletions(-) diff --git a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md index 5e20c81823..64f4f79248 100644 --- a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md +++ b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md @@ -36,8 +36,8 @@ | Ingress | bool, false | 是否采集Ingress元数据。 | | EnableLabels | bool, false | 是否采集**内置**Kubernetes 资源的标签(Labels)。 | | EnableAnnotations | bool, false | 是否采集**内置**资源的注解(Annotations)。 | -| NamespaceBlackList | []string,可选 | 全局命名空间过滤:名单内命名空间的对象**不进入 meta 缓存、不投递实体/链路事件**。`Node`、`PersistentVolume`、`StorageClass` 不按命名空间过滤。与 `NamespaceWhiteList` 同时配置时,命名空间允许条件为「未命中黑名单 **或** 在白名单中」(并集语义);| -| NamespaceWhiteList | []string,可选 | 全局命名空间白名单,见上。均为空则不做命名空间限制。 | +| NamespaceBlackList | []string,可选 | 全局命名空间过滤,见下文「**命名空间黑名单 / 白名单**」。`Node`、`PersistentVolume`、`StorageClass` 不按命名空间过滤。 | +| NamespaceWhiteList | []string,可选 | 同上。 | | CustomResources | []object,可选 | 第三方 CR(动态 Informer)采集与链路,见下文「**第三方自定义资源(CustomResources)**」与「**Kubernetes RBAC 权限**」。 | | Node2Pod | string, 无默认值(可选) | Node到Pod的关系名,不填则不生成关系。 | | Deployment2Pod | string, 无默认值(可选) | Deployment到Pod的关系名,不填则不生成关系。 | @@ -67,6 +67,14 @@ | Cluster2PersistentVolume | string, 无默认值(可选) | Cluster到PersistentVolume的关系名,不填则不生成关系。 | | Cluster2StorageClass | string, 无默认值(可选) | Cluster到StorageClass的关系名,不填则不生成关系。 | +### 命名空间黑名单 / 白名单 + +对应参数 **`NamespaceBlackList`**、**`NamespaceWhiteList`**(均为可选字符串列表;名单项一般填命名空间名字符串)。 + +1. **未配置限制**:若两项**均未配置**(未出现配置键,或列表解析后等价于「无有效条目」),则**不对命名空间做过滤**,相关命名空间作用域对象可进入 meta 缓存并参与实体/链路事件(集群作用域资源如 `Node`、`PersistentVolume`、`StorageClass` 本身不按命名空间过滤,行为不变)。 +2. 
**已配置策略**:若至少配置了其一(仅黑名单、仅白名单、或黑白名单同时存在),则对该插件实例生效一条命名空间策略——命名空间 **被允许** 的条件为:**(该命名空间不在黑名单中)或(该命名空间在白名单中)**(逻辑或,并集语义;与「先黑后白再取交集」的直觉不同)。仅黑名单时等价于「不在黑名单则允许」;仅白名单时等价于「在白名单则允许」。 + +同一集群上若存在**多个** `service_kubernetes_meta` 配置实例,各自注册的策略在实现上为 **OR**:只要**任一**实例的策略允许该命名空间,对象即可通过(详见 `MetaManager.RegisterNamespacePolicy` 文档注释)。 ## Kubernetes RBAC 权限 @@ -80,6 +88,8 @@ 说明当前身份**缺少该 API 组下对应复数资源**的权限,与 LoongCollector 配置无关,需在 RBAC 中补齐。 +若 **CustomResources** 中某一 CR 的 watch **连续出现 RBAC/鉴权类错误**并达到停止阈值,该 CR 的动态 Informer 会在当前进程内停止且不会自动恢复,修正权限后须**手动重启 LoongCollector 进程**方可继续采集该类型。 + ### 内置资源 与配置里打开的内置开关对应即可(如 `pods`、`namespaces`、`deployments` 等)。权限请保持最小可用**`get`、`list`、`watch`** 权限,下面是示例。 diff --git a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go index 784e2d24ef..e67302437d 100644 --- a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go @@ -110,11 +110,10 @@ func (c *crUnifiedCache) EnsureWatchStarted() { _, err := c.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { defer panicRecover() - u := objectToUnstructured(obj) + u := trimmedCRCopyFromInformer(obj, c.resourceType) if u == nil { return } - trimCRObjectForCache(u) now := time.Now().Unix() c.eventCh <- &K8sMetaEvent{ EventType: EventTypeAdd, @@ -129,11 +128,10 @@ func (c *crUnifiedCache) EnsureWatchStarted() { }, UpdateFunc: func(_, obj interface{}) { defer panicRecover() - u := objectToUnstructured(obj) + u := trimmedCRCopyFromInformer(obj, c.resourceType) if u == nil { return } - trimCRObjectForCache(u) now := time.Now().Unix() c.eventCh <- &K8sMetaEvent{ EventType: EventTypeUpdate, @@ -148,11 +146,10 @@ func (c *crUnifiedCache) EnsureWatchStarted() { }, DeleteFunc: func(obj interface{}) { defer panicRecover() - u := objectToUnstructured(obj) + u := trimmedCRCopyFromInformer(obj, c.resourceType) if u == nil { return } - trimCRObjectForCache(u) c.eventCh <- &K8sMetaEvent{ EventType: 
EventTypeDelete, Object: &ObjectWrapper{ @@ -251,23 +248,50 @@ func (c *crUnifiedCache) UnRegisterSendFunc(key string) { c.metaStore.UnRegisterSendFunc(key) } -func objectToUnstructured(obj interface{}) *unstructured.Unstructured { - if u, ok := obj.(*unstructured.Unstructured); ok { - return u.DeepCopy() - } - if tombstone, ok := obj.(cache.DeletedFinalStateUnknown); ok { - if u, ok := tombstone.Obj.(*unstructured.Unstructured); ok { - return u.DeepCopy() +// trimmedCRCopyFromInformer builds a detached object for the meta cache without full-object DeepCopy: +// copies apiVersion/kind, metadata (without managedFields), and status via NestedFieldCopy — spec is omitted. +// This avoids mutating the informer-shared *unstructured.Unstructured and skips copying large spec blobs. +func trimmedCRCopyFromInformer(obj interface{}, resourceType string) *unstructured.Unstructured { + switch t := obj.(type) { + case *unstructured.Unstructured: + return buildTrimmedCRCopy(t, resourceType) + case cache.DeletedFinalStateUnknown: + if u, ok := t.Obj.(*unstructured.Unstructured); ok { + return buildTrimmedCRCopy(u, resourceType) } } return nil } -// trimCRObjectForCache drops spec and managedFields to limit memory; metadata + status remain for linking and whitelisted export. 
-func trimCRObjectForCache(u *unstructured.Unstructured) { +func buildTrimmedCRCopy(u *unstructured.Unstructured, resourceType string) *unstructured.Unstructured { if u == nil { - return + return nil + } + out := &unstructured.Unstructured{Object: make(map[string]interface{})} + if gv := u.GetAPIVersion(); gv != "" { + out.SetAPIVersion(gv) + } + if k := u.GetKind(); k != "" { + out.SetKind(k) + } + metaVal, metaFound, metaErr := unstructured.NestedFieldCopy(u.Object, "metadata") + if metaErr != nil { + logger.Debug(context.Background(), K8sMetaUnifyErrorCode, "nested copy metadata for CR cache", metaErr, "resourceType", resourceType) + } else if metaFound { + if metaMap, ok := metaVal.(map[string]interface{}); ok { + delete(metaMap, "managedFields") + if err := unstructured.SetNestedMap(out.Object, metaMap, "metadata"); err != nil { + logger.Debug(context.Background(), K8sMetaUnifyErrorCode, "set metadata on trimmed CR", err, "resourceType", resourceType) + } + } + } + statusVal, statusFound, statusErr := unstructured.NestedFieldCopy(u.Object, "status") + if statusErr != nil { + logger.Debug(context.Background(), K8sMetaUnifyErrorCode, "nested copy status for CR cache", statusErr, "resourceType", resourceType) + } else if statusFound { + if err := unstructured.SetNestedField(out.Object, statusVal, "status"); err != nil { + logger.Debug(context.Background(), K8sMetaUnifyErrorCode, "set status on trimmed CR", err, "resourceType", resourceType) + } } - unstructured.RemoveNestedField(u.Object, "spec") - unstructured.RemoveNestedField(u.Object, "metadata", "managedFields") + return out } diff --git a/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store_test.go b/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store_test.go index 5c27400059..b39bd150d0 100644 --- a/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store_test.go +++ b/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store_test.go @@ -32,6 +32,7 @@ func TestDeferredDeletion(t *testing.T) { Raw: 
pod, }, } + time.Sleep(10 * time.Millisecond) cache.lock.RLock() if _, ok := cache.Items["default/test"]; !ok { t.Errorf("failed to add object to cache") From 1efe1ecfccfe70f009de9a74e774e70589c3429d Mon Sep 17 00:00:00 2001 From: StartE Date: Tue, 14 Apr 2026 09:56:58 +0000 Subject: [PATCH 12/20] k8s informer giveup when 1.auth fail 2. no resource --- pkg/helper/k8smeta/k8s_meta_cache.go | 70 +++++++++++++------ .../k8smeta/k8s_meta_cr_unified_cache.go | 70 ++++++++++++------- pkg/helper/k8smeta/k8s_meta_informer_auth.go | 25 ------- 3 files changed, 96 insertions(+), 69 deletions(-) delete mode 100644 pkg/helper/k8smeta/k8s_meta_informer_auth.go diff --git a/pkg/helper/k8smeta/k8s_meta_cache.go b/pkg/helper/k8smeta/k8s_meta_cache.go index 41c8ffcdad..129b0cbeb0 100644 --- a/pkg/helper/k8smeta/k8s_meta_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cache.go @@ -16,6 +16,8 @@ import ( meta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/discovery" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" @@ -36,10 +38,10 @@ type k8sMetaCache struct { resourceType string schema *runtime.Scheme - authFailMu sync.Mutex - authFailCount int - authGiveUp chan struct{} - authGiveUpOnce sync.Once + giveUpMu sync.Mutex + giveUpCount int + giveUpCh chan struct{} + giveUpOnce sync.Once } func newK8sMetaCache(stopCh chan struct{}, resourceType string) *k8sMetaCache { @@ -50,7 +52,7 @@ func newK8sMetaCache(stopCh chan struct{}, resourceType string) *k8sMetaCache { m.metaStore = NewDeferredDeletionMetaStore(m.eventCh, m.stopCh, 120, cache.MetaNamespaceKeyFunc, idxRules...) 
m.resourceType = resourceType m.schema = runtime.NewScheme() - m.authGiveUp = make(chan struct{}) + m.giveUpCh = make(chan struct{}) _ = v1.AddToScheme(m.schema) _ = batch.AddToScheme(m.schema) _ = batchv1beta1.AddToScheme(m.schema) @@ -107,23 +109,22 @@ func (m *k8sMetaCache) watch(stopCh <-chan struct{}) { go func() { select { case <-m.stopCh: - case <-m.authGiveUp: + case <-m.giveUpCh: } close(mergedStop) }() if err := informer.SetWatchErrorHandler(func(_ *cache.Reflector, err error) { if err != nil { logger.Error(context.Background(), K8sMetaUnifyErrorCode, "resourceType", m.resourceType, "watchError", err) - if isInformerAuthFailure(err) { - m.authFailMu.Lock() - m.authFailCount++ - n := m.authFailCount - m.authFailMu.Unlock() - // Shut down this informer once RBAC/auth failures reach informerAuthFailureStopAfter. - if n >= informerAuthFailureStopAfter { - m.authGiveUpOnce.Do(func() { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "stopping informer after repeated RBAC/auth errors (no further retries)", "resourceType", m.resourceType, "failures", n) - close(m.authGiveUp) + if isInformerGiveUpFailure(err) { + m.giveUpMu.Lock() + m.giveUpCount++ + n := m.giveUpCount + m.giveUpMu.Unlock() + if n >= informerGiveUpFailureThreshold { + m.giveUpOnce.Do(func() { + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "stopping informer after repeated errors (RBAC/auth or missing API resource; no further retries)", "resourceType", m.resourceType, "failures", n) + close(m.giveUpCh) }) } } @@ -174,18 +175,26 @@ func (m *k8sMetaCache) watch(stopCh <-chan struct{}) { }, }) go factory.Start(mergedStop) - // wait for first cache sync success, or stop when RBAC limit merges stopCh + // wait for first cache sync success, or stop when give-up merges stopCh + backoff := time.Second + const maxBackoff = 10 * time.Second for { if !cache.WaitForCacheSync(mergedStop, informer.HasSynced) { select { case <-mergedStop: - // Stop was signaled(stop or auth fail): 
return so MetaManager's sequential cache init does not block other resource types. + // Stop was signaled(stop or give-up): return so MetaManager's sequential cache init does not block other resource types. logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "informer cache sync aborted", "resourceType", m.resourceType) return default: } - logger.Error(context.Background(), K8sMetaUnifyErrorCode, "service cache sync timeout", "resourceType", m.resourceType) - time.Sleep(1 * time.Second) + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "service cache sync timeout", "resourceType", m.resourceType, "nextRetryIn", backoff.String()) + time.Sleep(backoff) + if backoff < maxBackoff { + backoff *= 2 + if backoff > maxBackoff { + backoff = maxBackoff + } + } } else { break } @@ -483,6 +492,27 @@ func containsResource(resources []metav1.APIResource, name string) bool { } return false } + +// gvrDiscoveryAvailable checks discovery for a CRD/plural GVR before starting a dynamic informer +// (same idea as getIngressInformer probing ServerResourcesForGroupVersion). 
+func gvrDiscoveryAvailable(d discovery.DiscoveryInterface, gvr schema.GroupVersionResource) bool { + if d == nil { + return true + } + gv := schema.GroupVersion{Group: gvr.Group, Version: gvr.Version}.String() + resourceList, err := d.ServerResourcesForGroupVersion(gv) + if err != nil { + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, + "custom resource API group/version not available on server; skipping informer", "gvr", gvr.String(), "error", err) + return false + } + if !containsResource(resourceList.APIResources, gvr.Resource) { + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, + "custom resource plural not listed for group/version; skipping informer", "gvr", gvr.String()) + return false + } + return true +} func generateNodeKey(obj interface{}) ([]string, error) { node, err := meta.Accessor(obj) if err != nil { diff --git a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go index e67302437d..ff6965892f 100644 --- a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go @@ -9,6 +9,7 @@ import ( "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" + "k8s.io/client-go/discovery" "k8s.io/client-go/dynamic" "k8s.io/client-go/dynamic/dynamicinformer" "k8s.io/client-go/kubernetes" @@ -27,17 +28,18 @@ type crUnifiedCache struct { resourceType string gvr schema.GroupVersionResource - mu sync.Mutex - dynamicClient dynamic.Interface - informer cache.SharedIndexInformer - factory dynamicinformer.DynamicSharedInformerFactory - watchStarted bool - watchStartOnce sync.Once + mu sync.Mutex + dynamicClient dynamic.Interface + discoveryClient discovery.DiscoveryInterface + informer cache.SharedIndexInformer + factory dynamicinformer.DynamicSharedInformerFactory + watchStarted bool + watchStartOnce sync.Once - authFailMu sync.Mutex - authFailCount int - authGiveUp chan struct{} - 
authGiveUpOnce sync.Once + giveUpMu sync.Mutex + giveUpCount int + giveUpCh chan struct{} + giveUpOnce sync.Once } func newCRUnifiedCache(stopCh chan struct{}, resourceType string, gvr schema.GroupVersionResource) *crUnifiedCache { @@ -48,7 +50,7 @@ func newCRUnifiedCache(stopCh chan struct{}, resourceType string, gvr schema.Gro eventCh: make(chan *K8sMetaEvent, 100), } c.metaStore = NewDeferredDeletionMetaStore(c.eventCh, stopCh, 120, cache.MetaNamespaceKeyFunc, generateCommonKey) - c.authGiveUp = make(chan struct{}) + c.giveUpCh = make(chan struct{}) return c } @@ -89,6 +91,13 @@ func (c *crUnifiedCache) setRESTConfig(cfg *rest.Config) error { return err } c.dynamicClient = dyn + disco, derr := discovery.NewDiscoveryClientForConfig(restConfigForDynamicClient(cfg)) + if derr != nil { + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "discovery client for custom resource informer unavailable; will not pre-check GVR", "resourceType", c.resourceType, "error", derr) + c.discoveryClient = nil + } else { + c.discoveryClient = disco + } return nil } @@ -105,6 +114,12 @@ func (c *crUnifiedCache) EnsureWatchStarted() { c.watchStartOnce.Do(func() { c.mu.Lock() c.metaStore.Start() + gvr := c.gvr + if !gvrDiscoveryAvailable(c.discoveryClient, gvr) { + c.watchStarted = true + c.mu.Unlock() + return + } c.factory = dynamicinformer.NewDynamicSharedInformerFactory(dyn, time.Hour) c.informer = c.factory.ForResource(c.gvr).Informer() _, err := c.informer.AddEventHandler(cache.ResourceEventHandlerFuncs{ @@ -167,15 +182,15 @@ func (c *crUnifiedCache) EnsureWatchStarted() { if err := c.informer.SetWatchErrorHandler(func(_ *cache.Reflector, err error) { if err != nil { logger.Error(context.Background(), K8sMetaUnifyErrorCode, "resourceType", c.resourceType, "watchError", err) - if isInformerAuthFailure(err) { - c.authFailMu.Lock() - c.authFailCount++ - n := c.authFailCount - c.authFailMu.Unlock() - if n >= informerAuthFailureStopAfter { - c.authGiveUpOnce.Do(func() { - 
logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "stopping dynamic informer after repeated RBAC/auth errors (no further retries)", "resourceType", c.resourceType, "gvr", c.gvr.String(), "failures", n) - close(c.authGiveUp) + if isInformerGiveUpFailure(err) { + c.giveUpMu.Lock() + c.giveUpCount++ + n := c.giveUpCount + c.giveUpMu.Unlock() + if n >= informerGiveUpFailureThreshold { + c.giveUpOnce.Do(func() { + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "stopping dynamic informer after repeated errors (RBAC/auth or missing API resource; no further retries)", "resourceType", c.resourceType, "gvr", c.gvr.String(), "failures", n) + close(c.giveUpCh) }) } } @@ -185,19 +200,20 @@ func (c *crUnifiedCache) EnsureWatchStarted() { } c.watchStarted = true inf := c.informer - gvr := c.gvr c.mu.Unlock() mergedStop := make(chan struct{}) go func() { select { case <-c.stopCh: - case <-c.authGiveUp: + case <-c.giveUpCh: } close(mergedStop) }() go c.factory.Start(mergedStop) go func() { + backoff := time.Second + const maxBackoff = 10 * time.Second for { if cache.WaitForCacheSync(mergedStop, inf.HasSynced) { logger.Info(context.Background(), "dynamic informer cache synced", "gvr", gvr.String()) @@ -209,8 +225,14 @@ func (c *crUnifiedCache) EnsureWatchStarted() { return default: } - logger.Error(context.Background(), K8sMetaUnifyErrorCode, "dynamic informer cache sync timeout", "gvr", gvr.String()) - time.Sleep(time.Second) + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "dynamic informer cache sync timeout", "gvr", gvr.String(), "nextRetryIn", backoff.String()) + time.Sleep(backoff) + if backoff < maxBackoff { + backoff *= 2 + if backoff > maxBackoff { + backoff = maxBackoff + } + } } }() }) diff --git a/pkg/helper/k8smeta/k8s_meta_informer_auth.go b/pkg/helper/k8smeta/k8s_meta_informer_auth.go deleted file mode 100644 index 9e9089532f..0000000000 --- a/pkg/helper/k8smeta/k8s_meta_informer_auth.go +++ /dev/null @@ -1,25 +0,0 @@ -package k8smeta 
- -import ( - apierrors "k8s.io/apimachinery/pkg/api/errors" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" -) - -// Consecutive RBAC/auth (Forbidden/Unauthorized) watch errors before stopping this informer. -// 3 balances a single flake vs. failing fast on permanent bad credentials/RBAC. -const informerAuthFailureStopAfter = 3 - -func isInformerAuthFailure(err error) bool { - if err == nil { - return false - } - if apierrors.IsForbidden(err) || apierrors.IsUnauthorized(err) { - return true - } - switch apierrors.ReasonForError(err) { - case metav1.StatusReasonForbidden, metav1.StatusReasonUnauthorized: - return true - default: - return false - } -} From 0aa42c05bd2e0f1af25f60a91ecd00e77a84efd6 Mon Sep 17 00:00:00 2001 From: StartE Date: Tue, 14 Apr 2026 11:49:17 +0000 Subject: [PATCH 13/20] remove useless changes --- .../extended/service-kubernetesmeta-v2.md | 25 +-- go.mod | 2 +- .../k8smeta/k8s_meta_custom_resource.go | 9 +- .../k8s_meta_deferred_deletion_meta_store.go | 12 -- .../k8smeta/k8s_meta_informer_giveup.go | 50 ++++++ pkg/helper/k8smeta/k8s_meta_link_test.go | 2 - pkg/helper/k8smeta/k8s_meta_manager.go | 4 - .../k8smeta/k8s_meta_namespace_policy.go | 169 ------------------ .../k8smeta/k8s_meta_namespace_policy_test.go | 109 ----------- .../input/kubernetesmetav2/meta_collector.go | 11 +- .../kubernetesmetav2/meta_collector_cr.go | 77 +------- .../meta_collector_cr_test.go | 37 ---- .../input/kubernetesmetav2/service_meta.go | 10 +- 13 files changed, 61 insertions(+), 456 deletions(-) create mode 100644 pkg/helper/k8smeta/k8s_meta_informer_giveup.go delete mode 100644 pkg/helper/k8smeta/k8s_meta_namespace_policy.go delete mode 100644 pkg/helper/k8smeta/k8s_meta_namespace_policy_test.go diff --git a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md index 64f4f79248..c4cca66c79 100644 --- a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md +++ 
b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md @@ -36,8 +36,6 @@ | Ingress | bool, false | 是否采集Ingress元数据。 | | EnableLabels | bool, false | 是否采集**内置**Kubernetes 资源的标签(Labels)。 | | EnableAnnotations | bool, false | 是否采集**内置**资源的注解(Annotations)。 | -| NamespaceBlackList | []string,可选 | 全局命名空间过滤,见下文「**命名空间黑名单 / 白名单**」。`Node`、`PersistentVolume`、`StorageClass` 不按命名空间过滤。 | -| NamespaceWhiteList | []string,可选 | 同上。 | | CustomResources | []object,可选 | 第三方 CR(动态 Informer)采集与链路,见下文「**第三方自定义资源(CustomResources)**」与「**Kubernetes RBAC 权限**」。 | | Node2Pod | string, 无默认值(可选) | Node到Pod的关系名,不填则不生成关系。 | | Deployment2Pod | string, 无默认值(可选) | Deployment到Pod的关系名,不填则不生成关系。 | @@ -67,15 +65,6 @@ | Cluster2PersistentVolume | string, 无默认值(可选) | Cluster到PersistentVolume的关系名,不填则不生成关系。 | | Cluster2StorageClass | string, 无默认值(可选) | Cluster到StorageClass的关系名,不填则不生成关系。 | -### 命名空间黑名单 / 白名单 - -对应参数 **`NamespaceBlackList`**、**`NamespaceWhiteList`**(均为可选字符串列表;名单项一般填命名空间名字符串)。 - -1. **未配置限制**:若两项**均未配置**(未出现配置键,或列表解析后等价于「无有效条目」),则**不对命名空间做过滤**,相关命名空间作用域对象可进入 meta 缓存并参与实体/链路事件(集群作用域资源如 `Node`、`PersistentVolume`、`StorageClass` 本身不按命名空间过滤,行为不变)。 -2. 
**已配置策略**:若至少配置了其一(仅黑名单、仅白名单、或黑白名单同时存在),则对该插件实例生效一条命名空间策略——命名空间 **被允许** 的条件为:**(该命名空间不在黑名单中)或(该命名空间在白名单中)**(逻辑或,并集语义;与「先黑后白再取交集」的直觉不同)。仅黑名单时等价于「不在黑名单则允许」;仅白名单时等价于「在白名单则允许」。 - -同一集群上若存在**多个** `service_kubernetes_meta` 配置实例,各自注册的策略在实现上为 **OR**:只要**任一**实例的策略允许该命名空间,对象即可通过(详见 `MetaManager.RegisterNamespacePolicy` 文档注释)。 - ## Kubernetes RBAC 权限 本插件通过 **ServiceAccount**(或 kubeconfig 身份)访问 kube-apiserver。除内置资源外,凡在配置中开启的采集与 **CustomResources** 中声明的 GVR,都需要在 **`ClusterRole`(推荐集群级采集)+ `ClusterRoleBinding`** 或对应的 **`Role` + `RoleBinding`** 中授予至少 **`get`、`list`、`watch`**。 @@ -201,11 +190,8 @@ | PodLink | object,可选 | 配置后,在开启 **Pod** 的前提下可生成 **Pod→CR** 链路;需同时配置 **`Entity2PodRelation`**。 | | Entity2PodRelation | string,可选 | entity_link 中 **`__relation_type__`**(CR 与 Pod 之间的业务关系名);与 `PodLink` 同时非空时生效。 | | Namespace2EntityRelation | string,可选 | entity_link 中 **`__relation_type__`**(Namespace 与该 CR);需 **`CollectEntity: true`**、顶层 **`Namespace: true`** 且本字段非空;仅**有命名空间**的 CR 会生成(集群级 CR 跳过)。 | -| EnableLabels | bool | 为 true 且 `LabelAllowList` 为空时导出**全部** labels;不受顶层 `EnableLabels` 影响。默认不导出。 | -| EnableAnnotations | bool | 同上,作用于 annotations。 | -| LabelAllowList | []string | 非空时仅导出所列 label 键(与 `EnableLabels` 组合见实现逻辑)。 | -| AnnotationAllowList | []string | 非空时仅导出所列 annotation 键。 | -| StatusPathAllowList | []string | 可选,实体日志中 `status` 字段的白名单路径。 | +| EnableLabels | bool | 为 true 时导出**全部** labels;不受顶层 `EnableLabels` 影响。默认不导出。 | +| EnableAnnotations | bool | 为 true 时导出**全部** annotations;不受顶层 `EnableAnnotations` 影响。默认不导出。 | **PodLink** 子字段: @@ -224,11 +210,6 @@ inputs: Interval: 600 Node: true Pod: true - NamespaceBlackList: - - kube-system - # NamespaceWhiteList: - # - default - # - production # Third-party CRs: configure each GVR (and optional PodLink / Entity2PodRelation) under CustomResources. 
CustomResources: - APIGroup: argoproj.io # API-Group @@ -295,6 +276,4 @@ inputs: * **配置与注册**:`pkg/helper/k8smeta/k8s_meta_custom_resource.go`、`MetaManager.RegisterCustomResourceCollector`(`k8s_meta_manager.go`)。 * **采集与投递**:`plugins/input/kubernetesmetav2/meta_collector.go` 中对 `EntityType` / 链路类型的处理。 -* **命名空间策略**:`k8s_meta_namespace_policy.go`(`ObjectMetaNamespaceForFilter` 对 `unstructured` 与内置类型均已覆盖)。 - 新增 CR 时一般只需**配置层**声明 GVR 与链路字段;若需新索引、新裁剪或新事件语义,再在 **`crUnifiedCache`** 与 **`meta_collector`** 侧按现有模式扩展即可。 diff --git a/go.mod b/go.mod index 37a6b8e0f6..4303cf26bb 100644 --- a/go.mod +++ b/go.mod @@ -70,6 +70,7 @@ require ( k8s.io/api v0.32.1 k8s.io/apimachinery v0.32.1 k8s.io/client-go v0.32.1 + sigs.k8s.io/controller-runtime v0.12.1 ) require ( @@ -281,7 +282,6 @@ require ( k8s.io/kube-openapi v0.0.0-20241105132330-32ad38e42d3f // indirect k8s.io/utils v0.0.0-20241104100929-3ea5e8cea738 // indirect launchpad.net/gocheck v0.0.0-20140225173054-000000000087 // indirect - sigs.k8s.io/controller-runtime v0.12.1 // indirect sigs.k8s.io/gateway-api v0.6.2 // indirect sigs.k8s.io/json v0.0.0-20241010143419-9aa6b5e7a4b3 // indirect sigs.k8s.io/structured-merge-diff/v4 v4.4.2 // indirect diff --git a/pkg/helper/k8smeta/k8s_meta_custom_resource.go b/pkg/helper/k8smeta/k8s_meta_custom_resource.go index 39ef46bc15..32ec4f1367 100644 --- a/pkg/helper/k8smeta/k8s_meta_custom_resource.go +++ b/pkg/helper/k8smeta/k8s_meta_custom_resource.go @@ -31,15 +31,10 @@ type CustomResourceCollectorConfig struct { // Namespace2EntityRelation is __relation_type__ on entity_link logs (Namespace → this namespaced CR). Export when CollectEntity, Namespace input, and this string are all set. Cluster-scoped CRs are skipped. Namespace2EntityRelation string `json:"Namespace2EntityRelation,omitempty"` - // EnableLabels, if true, exports full labels when LabelAllowList is empty. Ignores ServiceK8sMeta.EnableLabels. Default false. + // EnableLabels, if true, exports full labels on entity logs. 
Ignores ServiceK8sMeta.EnableLabels. Default false. EnableLabels bool `json:"EnableLabels,omitempty"` - // EnableAnnotations, if true, exports full annotations when AnnotationAllowList is empty. Ignores ServiceK8sMeta.EnableAnnotations. Default false. + // EnableAnnotations, if true, exports full annotations on entity logs. Ignores ServiceK8sMeta.EnableAnnotations. Default false. EnableAnnotations bool `json:"EnableAnnotations,omitempty"` - - // Export lists for entity logs (optional; when non-empty, only listed keys are exported regardless of Enable*). - LabelAllowList []string `json:"LabelAllowList,omitempty"` - AnnotationAllowList []string `json:"AnnotationAllowList,omitempty"` - StatusPathAllowList []string `json:"StatusPathAllowList,omitempty"` } // PodToCustomResourceLinkConfig resolves which Workflow-like object a Pod belongs to. diff --git a/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go b/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go index 2ddd0b2cae..f8e60a5e6b 100644 --- a/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go +++ b/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go @@ -215,13 +215,6 @@ func (m *DeferredDeletionMetaStore) handleAddOrUpdateEvent(event *K8sMetaEvent) logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "handle k8s meta with keyFunc error", err) return } - // Namespace policy after keyFunc: we need the store key for purgeKey. For standard namespaced - // objects, metadata.namespace is immutable, so MetaNamespaceKeyFunc keeps a stable key across - // updates; purgeKey removes Items[key] and index rows derived from the previously cached object. 
- if !GetMetaManagerInstance().MetaObjectPassesNamespacePolicy(event.Object) { - m.purgeKey(key) - return - } newIdxKeys := m.getIdxKeys(event.Object) m.lock.Lock() // should delete oldIdxKeys in two cases: @@ -276,11 +269,6 @@ func (m *DeferredDeletionMetaStore) handleDeleteEvent(event *K8sMetaEvent) { logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "handle k8s meta with keyFunc error", err) return } - // Same ordering assumption as handleAddOrUpdateEvent (namespace immutable for namespaced objects). - if !GetMetaManagerInstance().MetaObjectPassesNamespacePolicy(event.Object) { - m.purgeKey(key) - return - } m.lock.Lock() if obj, ok := m.Items[key]; ok { obj.Deleted = true diff --git a/pkg/helper/k8smeta/k8s_meta_informer_giveup.go b/pkg/helper/k8smeta/k8s_meta_informer_giveup.go new file mode 100644 index 0000000000..0a31e1b032 --- /dev/null +++ b/pkg/helper/k8smeta/k8s_meta_informer_giveup.go @@ -0,0 +1,50 @@ +package k8smeta + +import ( + apierrors "k8s.io/apimachinery/pkg/api/errors" + "k8s.io/apimachinery/pkg/api/meta" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// informerGiveUpFailureThreshold is how many consecutive Reflector ListAndWatch errors +// of a “give up” class (RBAC, missing API resource, etc.) trigger stopping the informer factory. +const informerGiveUpFailureThreshold = 3 + +func isInformerAuthFailure(err error) bool { + if err == nil { + return false + } + if apierrors.IsForbidden(err) || apierrors.IsUnauthorized(err) { + return true + } + switch apierrors.ReasonForError(err) { + case metav1.StatusReasonForbidden, metav1.StatusReasonUnauthorized: + return true + default: + return false + } +} + +// isInformerPermanentResourceFailure matches Reflector ListAndWatch errors that will not +// recover without cluster changes (e.g. CRD not installed: "the server could not find the requested resource"). 
+func isInformerPermanentResourceFailure(err error) bool { + if err == nil { + return false + } + if apierrors.IsNotFound(err) { + return true + } + if meta.IsNoMatchError(err) { + return true + } + switch apierrors.ReasonForError(err) { + case metav1.StatusReasonNotFound: + return true + default: + return false + } +} + +func isInformerGiveUpFailure(err error) bool { + return isInformerAuthFailure(err) || isInformerPermanentResourceFailure(err) +} diff --git a/pkg/helper/k8smeta/k8s_meta_link_test.go b/pkg/helper/k8smeta/k8s_meta_link_test.go index de93b41513..f43b2a5190 100644 --- a/pkg/helper/k8smeta/k8s_meta_link_test.go +++ b/pkg/helper/k8smeta/k8s_meta_link_test.go @@ -1651,8 +1651,6 @@ func generateMockPod(index string) *ObjectWrapper { } } -// Namespace policy is not tested here: CR link tests use handleAddOrUpdateEvent on the store only; -// MetaObjectPassesNamespacePolicy runs on the global singleton with no registered policies (allow-all). func testArgoWorkflowCR(name string) *unstructured.Unstructured { u := &unstructured.Unstructured{} u.SetAPIVersion("argoproj.io/v1alpha1") diff --git a/pkg/helper/k8smeta/k8s_meta_manager.go b/pkg/helper/k8smeta/k8s_meta_manager.go index 39edd6068c..a2edae666b 100644 --- a/pkg/helper/k8smeta/k8s_meta_manager.go +++ b/pkg/helper/k8smeta/k8s_meta_manager.go @@ -53,10 +53,6 @@ type MetaManager struct { linkRegisterMap map[string][]string registerLock sync.RWMutex - nsPolicyMu sync.RWMutex - nsPolicyRegs []nsPolicyReg - nextNsPolicyID int - // self metrics projectNames map[string]int metricRecord selfmonitor.MetricsRecord diff --git a/pkg/helper/k8smeta/k8s_meta_namespace_policy.go b/pkg/helper/k8smeta/k8s_meta_namespace_policy.go deleted file mode 100644 index 2e8f8dc258..0000000000 --- a/pkg/helper/k8smeta/k8s_meta_namespace_policy.go +++ /dev/null @@ -1,169 +0,0 @@ -package k8smeta - -import ( - "strings" - - "k8s.io/apimachinery/pkg/api/meta" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" -) - -// 
namespacePolicy is one pipeline's NamespaceBlackList / NamespaceWhiteList. -// - Neither list: allows all namespaces (such a policy is not registered). -// - BlackList only: allow ns ∉ BlackList. -// - WhiteList only: allow ns ∈ WhiteList. -// - Both: union semantics — allow if ns ∉ BlackList OR ns ∈ WhiteList. -type namespacePolicy struct { - black map[string]struct{} - white map[string]struct{} - hasBlack bool - hasWhite bool -} - -func newNamespacePolicy(blackList, whiteList []string) *namespacePolicy { - p := &namespacePolicy{} - for _, s := range blackList { - s = strings.TrimSpace(s) - if s == "" { - continue - } - if p.black == nil { - p.black = make(map[string]struct{}) - } - p.black[s] = struct{}{} - p.hasBlack = true - } - for _, s := range whiteList { - s = strings.TrimSpace(s) - if s == "" { - continue - } - if p.white == nil { - p.white = make(map[string]struct{}) - } - p.white[s] = struct{}{} - p.hasWhite = true - } - return p -} - -func (p *namespacePolicy) allowsNamespace(ns string) bool { - if !p.hasBlack && !p.hasWhite { - return true - } - inB := p.hasBlack && p.black != nil && containsSet(p.black, ns) - inW := p.hasWhite && p.white != nil && containsSet(p.white, ns) - if !p.hasBlack { - return inW - } - if !p.hasWhite { - return !inB - } - // Allow if: (1) not in blacklist, OR (2) in whitelist - return !inB || inW -} - -func containsSet(m map[string]struct{}, ns string) bool { - _, ok := m[ns] - return ok -} - -// ObjectMetaNamespaceForFilter returns the namespace string used for policy checks and whether the object is cluster-scoped (no namespace filtering). 
-func ObjectMetaNamespaceForFilter(resourceType string, raw interface{}) (ns string, clusterScoped bool) { - if raw == nil { - return "", true - } - switch t := raw.(type) { - case *unstructured.Unstructured: - if resourceType == NAMESPACE { - return t.GetName(), false - } - n := t.GetNamespace() - if n == "" { - return "", true - } - return n, false - default: - switch resourceType { - case NODE, PERSISTENTVOLUME, STORAGECLASS: - return "", true - case NAMESPACE: - acc, err := meta.Accessor(raw) - if err != nil { - return "", true - } - return acc.GetName(), false - } - acc, err := meta.Accessor(raw) - if err != nil { - return "", true - } - n := acc.GetNamespace() - if n == "" { - return "", true - } - return n, false - } -} - -type nsPolicyReg struct { - id int - p *namespacePolicy -} - -// RegisterNamespacePolicy registers one input's namespace rules. Multiple inputs are combined with OR: -// a namespace passes if any registered policy allows it. Returns -1 when both lists are empty (nothing registered). -// Unregister non-negative ids with UnregisterNamespacePolicy on stop. -func (m *MetaManager) RegisterNamespacePolicy(blackList, whiteList []string) int { - p := newNamespacePolicy(blackList, whiteList) - if !p.hasBlack && !p.hasWhite { - return -1 - } - m.nsPolicyMu.Lock() - defer m.nsPolicyMu.Unlock() - id := m.nextNsPolicyID - m.nextNsPolicyID++ - m.nsPolicyRegs = append(m.nsPolicyRegs, nsPolicyReg{id: id, p: p}) - return id -} - -// UnregisterNamespacePolicy removes a policy registered with RegisterNamespacePolicy. Pass id -1 for no-op. 
-func (m *MetaManager) UnregisterNamespacePolicy(id int) { - if id < 0 { - return - } - m.nsPolicyMu.Lock() - defer m.nsPolicyMu.Unlock() - out := make([]nsPolicyReg, 0, len(m.nsPolicyRegs)) - for _, r := range m.nsPolicyRegs { - if r.id != id { - out = append(out, r) - } - } - m.nsPolicyRegs = out -} - -// MetaObjectPassesNamespacePolicy returns whether the object may enter the meta cache or be broadcast (add/update/delete). -func (m *MetaManager) MetaObjectPassesNamespacePolicy(o *ObjectWrapper) bool { - if o == nil || o.Raw == nil { - return true - } - if _, ok := o.Raw.(*TimerEvent); ok { - return true - } - ns, clusterScoped := ObjectMetaNamespaceForFilter(o.ResourceType, o.Raw) - if clusterScoped { - return true - } - m.nsPolicyMu.RLock() - regs := m.nsPolicyRegs - m.nsPolicyMu.RUnlock() - if len(regs) == 0 { - return true - } - for _, r := range regs { - if r.p.allowsNamespace(ns) { - return true - } - } - return false -} diff --git a/pkg/helper/k8smeta/k8s_meta_namespace_policy_test.go b/pkg/helper/k8smeta/k8s_meta_namespace_policy_test.go deleted file mode 100644 index ff55931240..0000000000 --- a/pkg/helper/k8smeta/k8s_meta_namespace_policy_test.go +++ /dev/null @@ -1,109 +0,0 @@ -package k8smeta - -import ( - "testing" - - v1 "k8s.io/api/core/v1" - metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/apis/meta/v1/unstructured" -) - -func TestNamespacePolicyAllows(t *testing.T) { - tests := []struct { - name string - black []string - white []string - ns string - wantAllow bool - }{ - {"empty both", nil, nil, "kube-system", true}, - {"black only drop", []string{"kube-system"}, nil, "kube-system", false}, - {"black only allow", []string{"kube-system"}, nil, "app", true}, - {"white only in", nil, []string{"app"}, "app", true}, - {"white only out", nil, []string{"app"}, "kube-system", false}, - {"both union rescue", []string{"kube-system"}, []string{"kube-system"}, "kube-system", true}, - {"both black blocks", []string{"bad"}, 
[]string{"app"}, "bad", false}, - {"both allow via white", []string{"bad"}, []string{"app"}, "app", true}, - {"both allow via not black", []string{"bad"}, []string{"app"}, "other", true}, - } - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - p := newNamespacePolicy(tt.black, tt.white) - if !p.hasBlack && !p.hasWhite { - if !p.allowsNamespace(tt.ns) { - t.Fatalf("empty policy should allow") - } - return - } - if got := p.allowsNamespace(tt.ns); got != tt.wantAllow { - t.Fatalf("allowsNamespace(%q) = %v, want %v", tt.ns, got, tt.wantAllow) - } - }) - } -} - -func TestObjectMetaNamespaceForFilter(t *testing.T) { - pod := &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Namespace: "app", Name: "p"}, - } - ns, cluster := ObjectMetaNamespaceForFilter(POD, pod) - if cluster || ns != "app" { - t.Fatalf("pod: got ns=%q cluster=%v", ns, cluster) - } - nsObj := &v1.Namespace{ - ObjectMeta: metav1.ObjectMeta{Name: "kube-system"}, - } - ns, cluster = ObjectMetaNamespaceForFilter(NAMESPACE, nsObj) - if cluster || ns != "kube-system" { - t.Fatalf("namespace resource: got ns=%q cluster=%v", ns, cluster) - } - node := &v1.Node{ - ObjectMeta: metav1.ObjectMeta{Name: "n1"}, - } - _, cluster = ObjectMetaNamespaceForFilter(NODE, node) - if !cluster { - t.Fatalf("node should be cluster scoped") - } - u := &unstructured.Unstructured{} - u.SetNamespace("cr-ns") - u.SetName("wf1") - ns, cluster = ObjectMetaNamespaceForFilter("custom.entity", u) - if cluster || ns != "cr-ns" { - t.Fatalf("unstructured: got ns=%q cluster=%v", ns, cluster) - } -} - -func TestMetaManagerNamespacePolicyOR(t *testing.T) { - m := &MetaManager{} - id1 := m.RegisterNamespacePolicy([]string{"kube-system"}, nil) - if id1 < 0 { - t.Fatal(id1) - } - id2 := m.RegisterNamespacePolicy(nil, []string{"app"}) - if id2 < 0 { - t.Fatal(id2) - } - wrap := func(rt string, raw interface{}) *ObjectWrapper { - return &ObjectWrapper{ResourceType: rt, Raw: raw} - } - if !m.MetaObjectPassesNamespacePolicy(wrap(POD, &v1.Pod{ 
- ObjectMeta: metav1.ObjectMeta{Namespace: "app", Name: "a"}, - })) { - t.Fatal("app should pass whitelist policy") - } - if !m.MetaObjectPassesNamespacePolicy(wrap(POD, &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Namespace: "default", Name: "a"}, - })) { - t.Fatal("default should pass via only-black policy (not kube-system)") - } - if m.MetaObjectPassesNamespacePolicy(wrap(POD, &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{Namespace: "kube-system", Name: "a"}, - })) { - t.Fatal("kube-system should be blocked by blacklist policy") - } - m.UnregisterNamespacePolicy(id1) - m.UnregisterNamespacePolicy(id2) - if len(m.nsPolicyRegs) != 0 { - t.Fatal("regs should be empty") - } -} diff --git a/plugins/input/kubernetesmetav2/meta_collector.go b/plugins/input/kubernetesmetav2/meta_collector.go index f8969b979d..2ffcc07bae 100644 --- a/plugins/input/kubernetesmetav2/meta_collector.go +++ b/plugins/input/kubernetesmetav2/meta_collector.go @@ -29,17 +29,12 @@ type metaCollector struct { entityBuffer chan models.PipelineEvent entityLinkBuffer chan models.PipelineEvent - stopCh chan struct{} - namespacePolicyID int // -1 when no namespace policy is registered - entityProcessor map[string]ProcessFunc + stopCh chan struct{} + entityProcessor map[string]ProcessFunc crConfigs map[string]k8smeta.CustomResourceCollectorConfig } func (m *metaCollector) Start() error { - m.namespacePolicyID = m.serviceK8sMeta.metaManager.RegisterNamespacePolicy( - m.serviceK8sMeta.NamespaceBlackList, - m.serviceK8sMeta.NamespaceWhiteList, - ) m.entityProcessor = map[string]ProcessFunc{ k8smeta.POD: m.processPodEntity, k8smeta.NODE: m.processNodeEntity, @@ -238,8 +233,6 @@ func (m *metaCollector) Start() error { } func (m *metaCollector) Stop() error { - m.serviceK8sMeta.metaManager.UnregisterNamespacePolicy(m.namespacePolicyID) - m.namespacePolicyID = -1 m.serviceK8sMeta.metaManager.UnRegisterAllSendFunc(m.serviceK8sMeta.context.GetProject(), m.serviceK8sMeta.configName) close(m.stopCh) return nil diff --git 
a/plugins/input/kubernetesmetav2/meta_collector_cr.go b/plugins/input/kubernetesmetav2/meta_collector_cr.go index aa2a2aa8ea..df06299f9a 100644 --- a/plugins/input/kubernetesmetav2/meta_collector_cr.go +++ b/plugins/input/kubernetesmetav2/meta_collector_cr.go @@ -10,66 +10,6 @@ import ( "github.com/alibaba/ilogtail/pkg/models" ) -func (m *metaCollector) customResourceLabelAllowList(cfg k8smeta.CustomResourceCollectorConfig) []string { - if len(cfg.LabelAllowList) > 0 { - return cfg.LabelAllowList - } - return nil -} - -func (m *metaCollector) customResourceAnnotationAllowList(cfg k8smeta.CustomResourceCollectorConfig) []string { - if len(cfg.AnnotationAllowList) > 0 { - return cfg.AnnotationAllowList - } - return nil -} - -func (m *metaCollector) customResourceStatusPaths(cfg k8smeta.CustomResourceCollectorConfig) []string { - if len(cfg.StatusPathAllowList) > 0 { - return cfg.StatusPathAllowList - } - return nil -} - -func filterStringMapByAllowList(m map[string]string, allow []string) map[string]string { - if len(allow) == 0 || len(m) == 0 { - return nil - } - out := make(map[string]string) - for _, k := range allow { - if k == "" { - continue - } - if v, ok := m[k]; ok { - out[k] = v - } - } - if len(out) == 0 { - return nil - } - return out -} - -func pickUnstructuredFieldCopy(obj map[string]interface{}, paths []string) map[string]interface{} { - if len(paths) == 0 || obj == nil { - return nil - } - out := make(map[string]interface{}) - for _, p := range paths { - if p == "" { - continue - } - parts := strings.Split(p, ".") - if v, found, err := unstructured.NestedFieldCopy(obj, parts...); found && err == nil { - out[p] = v - } - } - if len(out) == 0 { - return nil - } - return out -} - func (m *metaCollector) processCustomResourceEntity(data *k8smeta.ObjectWrapper, method string) []models.PipelineEvent { cfg, ok := m.crConfigs[data.ResourceType] if !ok { @@ -88,25 +28,12 @@ func (m *metaCollector) processCustomResourceEntity(data *k8smeta.ObjectWrapper, 
log.Contents.Add("api_version", obj.GetAPIVersion()) log.Contents.Add("namespace", obj.GetNamespace()) - labelAllow := m.customResourceLabelAllowList(cfg) - if len(labelAllow) > 0 { - if labels := filterStringMapByAllowList(obj.GetLabels(), labelAllow); labels != nil { - log.Contents.Add("labels", m.processEntityJSONObject(labels)) - } - } else if cfg.EnableLabels { + if cfg.EnableLabels { log.Contents.Add("labels", m.processEntityJSONObject(obj.GetLabels())) } - annoAllow := m.customResourceAnnotationAllowList(cfg) - if len(annoAllow) > 0 { - if annos := filterStringMapByAllowList(obj.GetAnnotations(), annoAllow); annos != nil { - log.Contents.Add("annotations", m.processEntityJSONObject(annos)) - } - } else if cfg.EnableAnnotations { + if cfg.EnableAnnotations { log.Contents.Add("annotations", m.processEntityJSONObject(obj.GetAnnotations())) } - if statusObj := pickUnstructuredFieldCopy(obj.Object, m.customResourceStatusPaths(cfg)); statusObj != nil { - log.Contents.Add("status", m.processEntityJSONObject(statusObj)) - } return []models.PipelineEvent{log} } diff --git a/plugins/input/kubernetesmetav2/meta_collector_cr_test.go b/plugins/input/kubernetesmetav2/meta_collector_cr_test.go index 6269e85d94..8bf185e11f 100644 --- a/plugins/input/kubernetesmetav2/meta_collector_cr_test.go +++ b/plugins/input/kubernetesmetav2/meta_collector_cr_test.go @@ -237,43 +237,6 @@ func TestProcessCustomResourceEntityEnableLabels(t *testing.T) { assert.Contains(t, labels, "drop") } -func TestProcessCustomResourceEntityLabelAllowList(t *testing.T) { - entityType := "argo.workflow" - cfg := k8smeta.CustomResourceCollectorConfig{ - EntityType: entityType, Kind: "Workflow", - LabelAllowList: []string{"keep"}, - } - m := &metaCollector{ - serviceK8sMeta: &ServiceK8sMeta{Interval: 10}, - crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{entityType: cfg}, - } - data := &k8smeta.ObjectWrapper{ResourceType: entityType, Raw: testWorkflowUnstructured(t)} - events := 
m.processCustomResourceEntity(data, "update") - require.Len(t, events, 1) - log := events[0].(*models.Log) - labels := stringField(t, log, "labels") - assert.Contains(t, labels, "keep") - assert.NotContains(t, labels, "drop") -} - -func TestProcessCustomResourceEntityStatusPathAllowList(t *testing.T) { - entityType := "argo.workflow" - cfg := k8smeta.CustomResourceCollectorConfig{ - EntityType: entityType, Kind: "Workflow", - StatusPathAllowList: []string{"status.phase"}, - } - m := &metaCollector{ - serviceK8sMeta: &ServiceK8sMeta{Interval: 10}, - crConfigs: map[string]k8smeta.CustomResourceCollectorConfig{entityType: cfg}, - } - data := &k8smeta.ObjectWrapper{ResourceType: entityType, Raw: testWorkflowUnstructured(t)} - events := m.processCustomResourceEntity(data, "update") - require.Len(t, events, 1) - log := events[0].(*models.Log) - status := stringField(t, log, "status") - assert.Contains(t, status, "Running") -} - func TestProcessCustomResourceEntityEnableAnnotations(t *testing.T) { entityType := "argo.workflow" cfg := k8smeta.CustomResourceCollectorConfig{EntityType: entityType, Kind: "Workflow", EnableAnnotations: true} diff --git a/plugins/input/kubernetesmetav2/service_meta.go b/plugins/input/kubernetesmetav2/service_meta.go index 580e911d93..5b0d931d8b 100644 --- a/plugins/input/kubernetesmetav2/service_meta.go +++ b/plugins/input/kubernetesmetav2/service_meta.go @@ -33,11 +33,6 @@ type ServiceK8sMeta struct { Container bool // CustomResources registers third-party CRs (dynamic informer + optional CR→Pod links via PodLink). See k8smeta.CustomResourceCollectorConfig. CustomResources []k8smeta.CustomResourceCollectorConfig `json:"CustomResources,omitempty"` - // NamespaceBlackList / NamespaceWhiteList: global namespace filter for meta cache and events (Node, PersistentVolume, StorageClass are not filtered). - // If both are set on this input, a namespace is allowed when it is not blacklisted OR whitelisted (union). Multiple pipelines OR their policies. 
- // Empty both: no restriction from this input. Cluster-scoped objects are always allowed. - NamespaceBlackList []string `json:"NamespaceBlackList,omitempty"` - NamespaceWhiteList []string `json:"NamespaceWhiteList,omitempty"` // EnableLabels / EnableAnnotations: when true, emit full labels/annotations on built-in entity kinds (not CustomResources; those use CustomResources[].EnableLabels/EnableAnnotations). EnableLabels bool EnableAnnotations bool @@ -123,9 +118,8 @@ func (s *ServiceK8sMeta) Start(collector pipeline.Collector) error { collector: collector, entityBuffer: make(chan models.PipelineEvent, 100), entityLinkBuffer: make(chan models.PipelineEvent, 100), - stopCh: make(chan struct{}), - namespacePolicyID: -1, - entityProcessor: make(map[string]ProcessFunc), + stopCh: make(chan struct{}), + entityProcessor: make(map[string]ProcessFunc), } return s.metaCollector.Start() } From 9a496aea6d618578097f76defc4c8c370d8ff950 Mon Sep 17 00:00:00 2001 From: StartE Date: Tue, 14 Apr 2026 11:59:31 +0000 Subject: [PATCH 14/20] optimize --- .../extended/service-kubernetesmeta-v2.md | 10 --------- pkg/helper/k8smeta/k8s_meta_cache.go | 21 ------------------- .../k8smeta/k8s_meta_cr_unified_cache.go | 21 +++++++++++++++++++ .../k8s_meta_deferred_deletion_meta_store.go | 19 ----------------- ..._meta_deferred_deletion_meta_store_test.go | 1 - 5 files changed, 21 insertions(+), 51 deletions(-) diff --git a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md index c4cca66c79..35fb9fd6bf 100644 --- a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md +++ b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md @@ -267,13 +267,3 @@ inputs: "__time__":"1723276913" } ``` - -## CR 开发说明 - -**实现上内置与 CR 的差异**见上文「**第三方自定义资源(CustomResources)**」中的 **「内置缓存与 CR 缓存对比」** 表;源码分别位于 `pkg/helper/k8smeta/k8s_meta_cache.go` 与 `k8s_meta_cr_unified_cache.go`。 - -### 扩展 CR 时可关注的代码路径 - -* 
**配置与注册**:`pkg/helper/k8smeta/k8s_meta_custom_resource.go`、`MetaManager.RegisterCustomResourceCollector`(`k8s_meta_manager.go`)。 -* **采集与投递**:`plugins/input/kubernetesmetav2/meta_collector.go` 中对 `EntityType` / 链路类型的处理。 -新增 CR 时一般只需**配置层**声明 GVR 与链路字段;若需新索引、新裁剪或新事件语义,再在 **`crUnifiedCache`** 与 **`meta_collector`** 侧按现有模式扩展即可。 diff --git a/pkg/helper/k8smeta/k8s_meta_cache.go b/pkg/helper/k8smeta/k8s_meta_cache.go index 129b0cbeb0..9a7db1e459 100644 --- a/pkg/helper/k8smeta/k8s_meta_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cache.go @@ -17,7 +17,6 @@ import ( metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" "k8s.io/apimachinery/pkg/runtime/schema" - "k8s.io/client-go/discovery" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" @@ -493,26 +492,6 @@ func containsResource(resources []metav1.APIResource, name string) bool { return false } -// gvrDiscoveryAvailable checks discovery for a CRD/plural GVR before starting a dynamic informer -// (same idea as getIngressInformer probing ServerResourcesForGroupVersion). 
-func gvrDiscoveryAvailable(d discovery.DiscoveryInterface, gvr schema.GroupVersionResource) bool { - if d == nil { - return true - } - gv := schema.GroupVersion{Group: gvr.Group, Version: gvr.Version}.String() - resourceList, err := d.ServerResourcesForGroupVersion(gv) - if err != nil { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, - "custom resource API group/version not available on server; skipping informer", "gvr", gvr.String(), "error", err) - return false - } - if !containsResource(resourceList.APIResources, gvr.Resource) { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, - "custom resource plural not listed for group/version; skipping informer", "gvr", gvr.String()) - return false - } - return true -} func generateNodeKey(obj interface{}) ([]string, error) { node, err := meta.Accessor(obj) if err != nil { diff --git a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go index ff6965892f..4cfccf99ff 100644 --- a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go @@ -101,6 +101,27 @@ func (c *crUnifiedCache) setRESTConfig(cfg *rest.Config) error { return nil } +// gvrDiscoveryAvailable checks discovery for a CRD/plural GVR before starting a dynamic informer +// (same idea as getIngressInformer probing ServerResourcesForGroupVersion in k8s_meta_cache.go). 
+func gvrDiscoveryAvailable(d discovery.DiscoveryInterface, gvr schema.GroupVersionResource) bool { + if d == nil { + return true + } + gv := schema.GroupVersion{Group: gvr.Group, Version: gvr.Version}.String() + resourceList, err := d.ServerResourcesForGroupVersion(gv) + if err != nil { + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, + "custom resource API group/version not available on server; skipping informer", "gvr", gvr.String(), "error", err) + return false + } + if !containsResource(resourceList.APIResources, gvr.Resource) { + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, + "custom resource plural not listed for group/version; skipping informer", "gvr", gvr.String()) + return false + } + return true +} + // EnsureWatchStarted starts the dynamic informer (once) when the dynamic client is ready. // Important: never enter sync.Once when dynamicClient is nil. func (c *crUnifiedCache) EnsureWatchStarted() { diff --git a/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go b/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go index f8e60a5e6b..2159566ff7 100644 --- a/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go +++ b/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store.go @@ -316,25 +316,6 @@ func (m *DeferredDeletionMetaStore) handleDeferredDeleteEvent(event *K8sMetaEven } } -// purgeKey removes an object from the store and index if present (no SendFunc). 
-func (m *DeferredDeletionMetaStore) purgeKey(key string) { - m.lock.Lock() - defer m.lock.Unlock() - obj, ok := m.Items[key] - if !ok { - return - } - for _, idxKey := range m.getIdxKeys(obj) { - if item, ok := m.Index[idxKey]; ok { - item.Remove(key) - if len(item.Keys) == 0 { - delete(m.Index, idxKey) - } - } - } - delete(m.Items, key) -} - func (m *DeferredDeletionMetaStore) handleTimerEvent(event *K8sMetaEvent) { timerEvent := event.Object.Raw.(*TimerEvent) m.registerLock.RLock() diff --git a/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store_test.go b/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store_test.go index b39bd150d0..5c27400059 100644 --- a/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store_test.go +++ b/pkg/helper/k8smeta/k8s_meta_deferred_deletion_meta_store_test.go @@ -32,7 +32,6 @@ func TestDeferredDeletion(t *testing.T) { Raw: pod, }, } - time.Sleep(10 * time.Millisecond) cache.lock.RLock() if _, ok := cache.Items["default/test"]; !ok { t.Errorf("failed to add object to cache") From d465b2d2530560c379ee414db8241db4133d471b Mon Sep 17 00:00:00 2001 From: StartE Date: Tue, 14 Apr 2026 12:13:39 +0000 Subject: [PATCH 15/20] update --- pkg/helper/k8smeta/k8s_meta_cache.go | 62 +------- .../k8smeta/k8s_meta_cr_unified_cache.go | 62 ++------ .../k8smeta/k8s_meta_informer_lifecycle.go | 134 ++++++++++++++++++ 3 files changed, 150 insertions(+), 108 deletions(-) create mode 100644 pkg/helper/k8smeta/k8s_meta_informer_lifecycle.go diff --git a/pkg/helper/k8smeta/k8s_meta_cache.go b/pkg/helper/k8smeta/k8s_meta_cache.go index 9a7db1e459..e5cd8484d6 100644 --- a/pkg/helper/k8smeta/k8s_meta_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cache.go @@ -3,7 +3,6 @@ package k8smeta import ( "context" "fmt" - "sync" "time" app "k8s.io/api/apps/v1" @@ -16,7 +15,6 @@ import ( meta "k8s.io/apimachinery/pkg/api/meta" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime" - "k8s.io/apimachinery/pkg/runtime/schema" 
"k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" @@ -37,10 +35,7 @@ type k8sMetaCache struct { resourceType string schema *runtime.Scheme - giveUpMu sync.Mutex - giveUpCount int - giveUpCh chan struct{} - giveUpOnce sync.Once + giveUp *informerGiveUp } func newK8sMetaCache(stopCh chan struct{}, resourceType string) *k8sMetaCache { @@ -51,7 +46,7 @@ func newK8sMetaCache(stopCh chan struct{}, resourceType string) *k8sMetaCache { m.metaStore = NewDeferredDeletionMetaStore(m.eventCh, m.stopCh, 120, cache.MetaNamespaceKeyFunc, idxRules...) m.resourceType = resourceType m.schema = runtime.NewScheme() - m.giveUpCh = make(chan struct{}) + m.giveUp = newInformerGiveUp() _ = v1.AddToScheme(m.schema) _ = batch.AddToScheme(m.schema) _ = batchv1beta1.AddToScheme(m.schema) @@ -104,30 +99,10 @@ func (m *k8sMetaCache) watch(stopCh <-chan struct{}) { if informer == nil { return } - mergedStop := make(chan struct{}) - go func() { - select { - case <-m.stopCh: - case <-m.giveUpCh: - } - close(mergedStop) - }() - if err := informer.SetWatchErrorHandler(func(_ *cache.Reflector, err error) { - if err != nil { - logger.Error(context.Background(), K8sMetaUnifyErrorCode, "resourceType", m.resourceType, "watchError", err) - if isInformerGiveUpFailure(err) { - m.giveUpMu.Lock() - m.giveUpCount++ - n := m.giveUpCount - m.giveUpMu.Unlock() - if n >= informerGiveUpFailureThreshold { - m.giveUpOnce.Do(func() { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "stopping informer after repeated errors (RBAC/auth or missing API resource; no further retries)", "resourceType", m.resourceType, "failures", n) - close(m.giveUpCh) - }) - } - } - } + mergedStop := m.giveUp.mergedStop(m.stopCh) + if err := attachWatchErrorHandler(informer, m.giveUp, watchErrorHandlerOpts{ + ResourceType: m.resourceType, + GiveUpStopMsg: "stopping informer after repeated errors (RBAC/auth or missing API resource; no further retries)", }); err != nil { 
logger.Error(context.Background(), K8sMetaUnifyErrorCode, "fail to set watch error handler", err) } @@ -174,30 +149,7 @@ func (m *k8sMetaCache) watch(stopCh <-chan struct{}) { }, }) go factory.Start(mergedStop) - // wait for first cache sync success, or stop when give-up merges stopCh - backoff := time.Second - const maxBackoff = 10 * time.Second - for { - if !cache.WaitForCacheSync(mergedStop, informer.HasSynced) { - select { - case <-mergedStop: - // Stop was signaled(stop or give-up): return so MetaManager's sequential cache init does not block other resource types. - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "informer cache sync aborted", "resourceType", m.resourceType) - return - default: - } - logger.Error(context.Background(), K8sMetaUnifyErrorCode, "service cache sync timeout", "resourceType", m.resourceType, "nextRetryIn", backoff.String()) - time.Sleep(backoff) - if backoff < maxBackoff { - backoff *= 2 - if backoff > maxBackoff { - backoff = maxBackoff - } - } - } else { - break - } - } + waitInformerCacheSync(mergedStop, informer.HasSynced, informerCacheSyncOpts{ResourceType: m.resourceType}) } func (m *k8sMetaCache) getFactoryInformer() (informers.SharedInformerFactory, cache.SharedIndexInformer) { diff --git a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go index 4cfccf99ff..c97b8440ac 100644 --- a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go @@ -36,10 +36,7 @@ type crUnifiedCache struct { watchStarted bool watchStartOnce sync.Once - giveUpMu sync.Mutex - giveUpCount int - giveUpCh chan struct{} - giveUpOnce sync.Once + giveUp *informerGiveUp } func newCRUnifiedCache(stopCh chan struct{}, resourceType string, gvr schema.GroupVersionResource) *crUnifiedCache { @@ -50,7 +47,7 @@ func newCRUnifiedCache(stopCh chan struct{}, resourceType string, gvr schema.Gro eventCh: make(chan *K8sMetaEvent, 100), } c.metaStore = 
NewDeferredDeletionMetaStore(c.eventCh, stopCh, 120, cache.MetaNamespaceKeyFunc, generateCommonKey) - c.giveUpCh = make(chan struct{}) + c.giveUp = newInformerGiveUp() return c } @@ -200,22 +197,10 @@ func (c *crUnifiedCache) EnsureWatchStarted() { if err != nil { logger.Error(context.Background(), K8sMetaUnifyErrorCode, "fail to add dynamic informer event handler", err, "resourceType", c.resourceType, "gvr", c.gvr.String()) } - if err := c.informer.SetWatchErrorHandler(func(_ *cache.Reflector, err error) { - if err != nil { - logger.Error(context.Background(), K8sMetaUnifyErrorCode, "resourceType", c.resourceType, "watchError", err) - if isInformerGiveUpFailure(err) { - c.giveUpMu.Lock() - c.giveUpCount++ - n := c.giveUpCount - c.giveUpMu.Unlock() - if n >= informerGiveUpFailureThreshold { - c.giveUpOnce.Do(func() { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "stopping dynamic informer after repeated errors (RBAC/auth or missing API resource; no further retries)", "resourceType", c.resourceType, "gvr", c.gvr.String(), "failures", n) - close(c.giveUpCh) - }) - } - } - } + if err := attachWatchErrorHandler(c.informer, c.giveUp, watchErrorHandlerOpts{ + ResourceType: c.resourceType, + GVR: c.gvr.String(), + GiveUpStopMsg: "stopping dynamic informer after repeated errors (RBAC/auth or missing API resource; no further retries)", }); err != nil { logger.Error(context.Background(), K8sMetaUnifyErrorCode, "fail to set dynamic informer watch error handler", err) } @@ -223,39 +208,10 @@ func (c *crUnifiedCache) EnsureWatchStarted() { inf := c.informer c.mu.Unlock() - mergedStop := make(chan struct{}) - go func() { - select { - case <-c.stopCh: - case <-c.giveUpCh: - } - close(mergedStop) - }() + mergedStop := c.giveUp.mergedStop(c.stopCh) go c.factory.Start(mergedStop) - go func() { - backoff := time.Second - const maxBackoff = 10 * time.Second - for { - if cache.WaitForCacheSync(mergedStop, inf.HasSynced) { - logger.Info(context.Background(), "dynamic 
informer cache synced", "gvr", gvr.String()) - return - } - select { - case <-mergedStop: - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "dynamic informer cache sync aborted", "gvr", gvr.String()) - return - default: - } - logger.Error(context.Background(), K8sMetaUnifyErrorCode, "dynamic informer cache sync timeout", "gvr", gvr.String(), "nextRetryIn", backoff.String()) - time.Sleep(backoff) - if backoff < maxBackoff { - backoff *= 2 - if backoff > maxBackoff { - backoff = maxBackoff - } - } - } - }() + gvrStr := gvr.String() + go waitInformerCacheSync(mergedStop, inf.HasSynced, informerCacheSyncOpts{ResourceType: c.resourceType, GVR: gvrStr}) }) } diff --git a/pkg/helper/k8smeta/k8s_meta_informer_lifecycle.go b/pkg/helper/k8smeta/k8s_meta_informer_lifecycle.go new file mode 100644 index 0000000000..dcec137e91 --- /dev/null +++ b/pkg/helper/k8smeta/k8s_meta_informer_lifecycle.go @@ -0,0 +1,134 @@ +// Copyright 2021 iLogtail Authors +// +// Shared informer lifecycle helpers: merge global stop with give-up channel, +// Reflector watch-error give-up counting, and cache sync with exponential backoff. + +package k8smeta + +import ( + "context" + "sync" + "time" + + "k8s.io/client-go/tools/cache" + + "github.com/alibaba/ilogtail/pkg/logger" +) + +// informerGiveUp holds state for merging shutdown with “give up” after repeated +// non-recoverable Reflector errors (see informerGiveUpFailureThreshold). +type informerGiveUp struct { + mu sync.Mutex + count int + ch chan struct{} + once sync.Once +} + +func newInformerGiveUp() *informerGiveUp { + return &informerGiveUp{ch: make(chan struct{})} +} + +// mergedStop returns a channel that closes when either globalStop or give-up fires. 
+func (g *informerGiveUp) mergedStop(globalStop <-chan struct{}) chan struct{} { + merged := make(chan struct{}) + go func() { + select { + case <-globalStop: + case <-g.ch: + } + close(merged) + }() + return merged +} + +// watchErrorHandlerOpts configures logging for attachWatchErrorHandler. +type watchErrorHandlerOpts struct { + ResourceType string + GVR string // optional; when non-empty, included in Error and Warning logs + GiveUpStopMsg string // Warning message body on give-up (first kv key in logger.Warning) +} + +// attachWatchErrorHandler registers SetWatchErrorHandler with give-up counting. +func attachWatchErrorHandler(informer cache.SharedIndexInformer, g *informerGiveUp, o watchErrorHandlerOpts) error { + return informer.SetWatchErrorHandler(func(_ *cache.Reflector, err error) { + if err == nil { + return + } + kvs := []interface{}{"resourceType", o.ResourceType, "watchError", err} + if o.GVR != "" { + kvs = append(kvs, "gvr", o.GVR) + } + logger.Error(context.Background(), K8sMetaUnifyErrorCode, kvs...) + if !isInformerGiveUpFailure(err) { + return + } + g.mu.Lock() + g.count++ + n := g.count + g.mu.Unlock() + if n < informerGiveUpFailureThreshold { + return + } + g.once.Do(func() { + var wkvs []interface{} + if o.GVR != "" { + wkvs = []interface{}{ + o.GiveUpStopMsg, "resourceType", o.ResourceType, + "gvr", o.GVR, + "failures", n, + } + } else { + wkvs = []interface{}{ + o.GiveUpStopMsg, "resourceType", o.ResourceType, + "failures", n, + } + } + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, wkvs...) + close(g.ch) + }) + }) +} + +// informerCacheSyncOpts configures logging for waitInformerCacheSync. +type informerCacheSyncOpts struct { + ResourceType string + // GVR non-empty selects CR-style log lines (dynamic informer + gvr) and logs success on sync. 
+ GVR string +} + +// waitInformerCacheSync blocks until hasSynced or mergedStop is closed, with exponential backoff +// between WaitForCacheSync polls (same as previous k8sMetaCache.watch / crUnifiedCache loops). +func waitInformerCacheSync(mergedStop <-chan struct{}, hasSynced cache.InformerSynced, o informerCacheSyncOpts) { + backoff := time.Second + const maxBackoff = 10 * time.Second + for { + if cache.WaitForCacheSync(mergedStop, hasSynced) { + if o.GVR != "" { + logger.Info(context.Background(), "dynamic informer cache synced", "gvr", o.GVR) + } + return + } + select { + case <-mergedStop: + if o.GVR != "" { + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "dynamic informer cache sync aborted", "gvr", o.GVR) + } else { + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "informer cache sync aborted", "resourceType", o.ResourceType) + } + return + default: + } + if o.GVR != "" { + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "dynamic informer cache sync timeout", "gvr", o.GVR, "nextRetryIn", backoff.String()) + } else { + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "service cache sync timeout", "resourceType", o.ResourceType, "nextRetryIn", backoff.String()) + } + time.Sleep(backoff) + if backoff < maxBackoff { + backoff *= 2 + if backoff > maxBackoff { + backoff = maxBackoff + } + } + } +} From 4b2f88efbe6db7beae58829d2adb9a38b1673800 Mon Sep 17 00:00:00 2001 From: StartE Date: Thu, 16 Apr 2026 12:47:54 +0800 Subject: [PATCH 16/20] fix lint --- plugins/input/kubernetesmetav2/meta_collector.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/plugins/input/kubernetesmetav2/meta_collector.go b/plugins/input/kubernetesmetav2/meta_collector.go index 2ffcc07bae..e2dad5570d 100644 --- a/plugins/input/kubernetesmetav2/meta_collector.go +++ b/plugins/input/kubernetesmetav2/meta_collector.go @@ -31,7 +31,7 @@ type metaCollector struct { stopCh chan struct{} entityProcessor map[string]ProcessFunc - 
crConfigs map[string]k8smeta.CustomResourceCollectorConfig + crConfigs map[string]k8smeta.CustomResourceCollectorConfig } func (m *metaCollector) Start() error { From b18f905717bb021fc707f121a2eb3bdff4458550 Mon Sep 17 00:00:00 2001 From: StartE Date: Thu, 16 Apr 2026 07:51:37 +0000 Subject: [PATCH 17/20] fix lint --- plugins/input/kubernetesmetav2/service_meta.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/plugins/input/kubernetesmetav2/service_meta.go b/plugins/input/kubernetesmetav2/service_meta.go index 5b0d931d8b..d788968f49 100644 --- a/plugins/input/kubernetesmetav2/service_meta.go +++ b/plugins/input/kubernetesmetav2/service_meta.go @@ -114,12 +114,12 @@ func (s *ServiceK8sMeta) Stop() error { func (s *ServiceK8sMeta) Start(collector pipeline.Collector) error { s.collector = collector s.metaCollector = &metaCollector{ - serviceK8sMeta: s, - collector: collector, - entityBuffer: make(chan models.PipelineEvent, 100), - entityLinkBuffer: make(chan models.PipelineEvent, 100), - stopCh: make(chan struct{}), - entityProcessor: make(map[string]ProcessFunc), + serviceK8sMeta: s, + collector: collector, + entityBuffer: make(chan models.PipelineEvent, 100), + entityLinkBuffer: make(chan models.PipelineEvent, 100), + stopCh: make(chan struct{}), + entityProcessor: make(map[string]ProcessFunc), } return s.metaCollector.Start() } From 9ba481ce7933e198f3222ad4fe99739b796bc162 Mon Sep 17 00:00:00 2001 From: StartE Date: Thu, 16 Apr 2026 09:25:40 +0000 Subject: [PATCH 18/20] update comments from code-review --- .../extended/service-kubernetesmeta-v2.md | 2 + .../k8smeta/k8s_meta_cr_unified_cache.go | 8 +++ .../input/kubernetesmetav2/meta_collector.go | 39 +++++++++++-- .../kubernetesmetav2/meta_collector_cr.go | 5 +- .../meta_collector_cr_test.go | 55 +++++++++++++++++++ 5 files changed, 103 insertions(+), 6 deletions(-) diff --git a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md 
b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md index 35fb9fd6bf..91085a510d 100644 --- a/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md +++ b/docs/cn/plugins/input/extended/service-kubernetesmeta-v2.md @@ -79,6 +79,8 @@ 若 **CustomResources** 中某一 CR 的 watch **连续出现 RBAC/鉴权类错误**并达到停止阈值,该 CR 的动态 Informer 会在当前进程内停止且不会自动恢复,修正权限后须**手动重启 LoongCollector 进程**方可继续采集该类型。 +同样地,若启动时 discovery 检测到该 CR 的 **APIGroup/APIVersion/Resource 不可用**(例如 CRD 尚未安装),该 CR 的动态 Informer 在当前进程内也不会自动恢复。这是 **by-design** 行为(用于避免重复启动与日志风暴),修正后需**手动重启 LoongCollector**。 + ### 内置资源 与配置里打开的内置开关对应即可(如 `pods`、`namespaces`、`deployments` 等)。权限请保持最小可用**`get`、`list`、`watch`** 权限,下面是示例。 diff --git a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go index c97b8440ac..cc9f4edb04 100644 --- a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go @@ -100,6 +100,11 @@ func (c *crUnifiedCache) setRESTConfig(cfg *rest.Config) error { // gvrDiscoveryAvailable checks discovery for a CRD/plural GVR before starting a dynamic informer // (same idea as getIngressInformer probing ServerResourcesForGroupVersion in k8s_meta_cache.go). +// +// By design: +// - when discovery reports this GVR unavailable at startup (CRD not installed yet, or plural mismatch), +// this process does not auto-retry to start the informer later; +// - operator should fix CRD/Resource and restart LoongCollector to enable this CR watcher. func gvrDiscoveryAvailable(d discovery.DiscoveryInterface, gvr schema.GroupVersionResource) bool { if d == nil { return true @@ -134,6 +139,9 @@ func (c *crUnifiedCache) EnsureWatchStarted() { c.metaStore.Start() gvr := c.gvr if !gvrDiscoveryAvailable(c.discoveryClient, gvr) { + // By design: discovery failure is treated as terminal for this process lifetime. + // We intentionally mark watch as started to avoid repeated start attempts and log storms. 
+ // After CRD/resource is fixed, restart LoongCollector to recover this CR informer. c.watchStarted = true c.mu.Unlock() return diff --git a/plugins/input/kubernetesmetav2/meta_collector.go b/plugins/input/kubernetesmetav2/meta_collector.go index e2dad5570d..87e340eb45 100644 --- a/plugins/input/kubernetesmetav2/meta_collector.go +++ b/plugins/input/kubernetesmetav2/meta_collector.go @@ -34,6 +34,34 @@ type metaCollector struct { crConfigs map[string]k8smeta.CustomResourceCollectorConfig } +func validateCustomResourceEntityTypeUniqueness( + cfg k8smeta.CustomResourceCollectorConfig, seenEntityTypes map[string]struct{}, +) error { + if _, exists := seenEntityTypes[cfg.EntityType]; exists { + return fmt.Errorf("duplicated CustomResources EntityType %q", cfg.EntityType) + } + seenEntityTypes[cfg.EntityType] = struct{}{} + return nil +} + +func prepareNormalizedCustomResourceConfigs( + customResources []k8smeta.CustomResourceCollectorConfig, +) ([]k8smeta.CustomResourceCollectorConfig, error) { + seenCustomResourceEntityTypes := make(map[string]struct{}) + normalizedConfigs := make([]k8smeta.CustomResourceCollectorConfig, 0, len(customResources)) + for _, cfg := range customResources { + if err := cfg.Normalize(); err != nil { + logger.Warning(context.Background(), k8smeta.K8sMetaUnifyErrorCode, "invalid CustomResources entry", err, "entity", cfg.EntityType) + continue + } + if err := validateCustomResourceEntityTypeUniqueness(cfg, seenCustomResourceEntityTypes); err != nil { + return nil, err + } + normalizedConfigs = append(normalizedConfigs, cfg) + } + return normalizedConfigs, nil +} + func (m *metaCollector) Start() error { m.entityProcessor = map[string]ProcessFunc{ k8smeta.POD: m.processPodEntity, @@ -78,12 +106,13 @@ func (m *metaCollector) Start() error { k8smeta.INGRESS_NAMESPACE: m.processIngressNamespaceLink, } + normalizedCRConfigs, err := prepareNormalizedCustomResourceConfigs(m.serviceK8sMeta.resolvedCustomResources()) + if err != nil { + return err + } 
+ m.crConfigs = make(map[string]k8smeta.CustomResourceCollectorConfig) - for _, cfg := range m.serviceK8sMeta.resolvedCustomResources() { - if err := cfg.Normalize(); err != nil { - logger.Warning(context.Background(), k8smeta.K8sMetaUnifyErrorCode, "invalid CustomResources entry", err, "entity", cfg.EntityType) - continue - } + for _, cfg := range normalizedCRConfigs { if err := m.serviceK8sMeta.metaManager.RegisterCustomResourceCollector(cfg); err != nil { logger.Warning(context.Background(), k8smeta.K8sMetaUnifyErrorCode, "register custom resource collector", err, "entity", cfg.EntityType) continue diff --git a/plugins/input/kubernetesmetav2/meta_collector_cr.go b/plugins/input/kubernetesmetav2/meta_collector_cr.go index df06299f9a..8ecc6853cc 100644 --- a/plugins/input/kubernetesmetav2/meta_collector_cr.go +++ b/plugins/input/kubernetesmetav2/meta_collector_cr.go @@ -76,7 +76,10 @@ func (m *metaCollector) processPodCustomResourceLink(data *k8smeta.ObjectWrapper if entityType == "" { return nil } - cfg := m.crConfigs[entityType] + cfg, ok := m.crConfigs[entityType] + if !ok || cfg.Entity2PodRelation == "" { + return nil + } log := &models.Log{} log.Contents = models.NewLogContents() m.processEntityLinkCommonPart(log.Contents, entityType, obj.CR.GetNamespace(), obj.CR.GetName(), diff --git a/plugins/input/kubernetesmetav2/meta_collector_cr_test.go b/plugins/input/kubernetesmetav2/meta_collector_cr_test.go index 8bf185e11f..437b95ad00 100644 --- a/plugins/input/kubernetesmetav2/meta_collector_cr_test.go +++ b/plugins/input/kubernetesmetav2/meta_collector_cr_test.go @@ -26,6 +26,61 @@ func testWorkflowUnstructured(t *testing.T) *unstructured.Unstructured { return u } +func TestValidateCustomResourceEntityTypeUniqueness(t *testing.T) { + seen := make(map[string]struct{}) + cfg := k8smeta.CustomResourceCollectorConfig{EntityType: "customresource/argoproj.io/workflow"} + require.NoError(t, validateCustomResourceEntityTypeUniqueness(cfg, seen)) + err := 
validateCustomResourceEntityTypeUniqueness(cfg, seen) + require.Error(t, err) + assert.Contains(t, err.Error(), "duplicated CustomResources EntityType") +} + +func TestPrepareNormalizedCustomResourceConfigsReturnErrorOnDuplicateEntityType(t *testing.T) { + cfg1 := k8smeta.CustomResourceCollectorConfig{ + EntityType: "customresource/argoproj.io/workflow", + APIGroup: "argoproj.io", + APIVersion: "v1alpha1", + Resource: "workflows", + Kind: "Workflow", + PodLink: &k8smeta.PodToCustomResourceLinkConfig{}, + CollectEntity: true, + } + cfg2 := k8smeta.CustomResourceCollectorConfig{ + EntityType: "customresource/argoproj.io/workflow", + APIGroup: "argoproj.io", + APIVersion: "v1alpha1", + Resource: "workflows", + Kind: "Workflow", + } + + normalized, err := prepareNormalizedCustomResourceConfigs([]k8smeta.CustomResourceCollectorConfig{cfg1, cfg2}) + require.Error(t, err) + assert.Nil(t, normalized) + assert.Contains(t, err.Error(), "duplicated CustomResources EntityType") +} + +func TestPrepareNormalizedCustomResourceConfigsSkipsInvalidEntries(t *testing.T) { + invalid := k8smeta.CustomResourceCollectorConfig{ + EntityType: "customresource/argoproj.io/workflow", + } + valid := k8smeta.CustomResourceCollectorConfig{ + EntityType: "customresource/argoproj.io/rollout", + APIGroup: "argoproj.io", + APIVersion: "v1alpha1", + Resource: "rollouts", + Kind: "Rollout", + PodLink: &k8smeta.PodToCustomResourceLinkConfig{}, + } + + normalized, err := prepareNormalizedCustomResourceConfigs([]k8smeta.CustomResourceCollectorConfig{invalid, valid}) + require.NoError(t, err) + require.Len(t, normalized, 1) + assert.Equal(t, "customresource/argoproj.io/rollout", normalized[0].EntityType) + // Normalize should fill PodLink defaults from CR config. 
+ assert.Equal(t, "Rollout", normalized[0].PodLink.OwnerKind) + assert.Equal(t, "argoproj.io", normalized[0].PodLink.OwnerAPIGroupContains) +} + func TestProcessPodCustomResourceLink(t *testing.T) { entityType := "argo.workflow" linkRT := k8smeta.POD + k8smeta.LINK_SPLIT_CHARACTER + entityType From 0322b00ea802520fc5b3b886cb62f9f9e3702541 Mon Sep 17 00:00:00 2001 From: StartE Date: Wed, 6 May 2026 02:00:36 +0000 Subject: [PATCH 19/20] fix with copilot: msg kvPairs --- pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go | 14 +++++++------- pkg/helper/k8smeta/k8s_meta_informer_lifecycle.go | 12 +++++++----- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go index cc9f4edb04..46ccf35a7c 100644 --- a/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go +++ b/pkg/helper/k8smeta/k8s_meta_cr_unified_cache.go @@ -60,7 +60,7 @@ func (c *crUnifiedCache) SetGVRIfNotStarted(gvr schema.GroupVersionResource) { c.mu.Lock() defer c.mu.Unlock() if c.watchStarted { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "custom resource informer already started; GVR change ignored", "gvr", gvr.String()) + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "msg", "custom resource informer already started; GVR change ignored", "gvr", gvr.String()) return } c.gvr = gvr @@ -90,7 +90,7 @@ func (c *crUnifiedCache) setRESTConfig(cfg *rest.Config) error { c.dynamicClient = dyn disco, derr := discovery.NewDiscoveryClientForConfig(restConfigForDynamicClient(cfg)) if derr != nil { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "discovery client for custom resource informer unavailable; will not pre-check GVR", "resourceType", c.resourceType, "error", derr) + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "msg", "discovery client for custom resource informer unavailable; will not pre-check GVR", "resourceType", c.resourceType, "error", derr) 
c.discoveryClient = nil } else { c.discoveryClient = disco @@ -112,12 +112,12 @@ func gvrDiscoveryAvailable(d discovery.DiscoveryInterface, gvr schema.GroupVersi gv := schema.GroupVersion{Group: gvr.Group, Version: gvr.Version}.String() resourceList, err := d.ServerResourcesForGroupVersion(gv) if err != nil { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "msg", "custom resource API group/version not available on server; skipping informer", "gvr", gvr.String(), "error", err) return false } if !containsResource(resourceList.APIResources, gvr.Resource) { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "msg", "custom resource plural not listed for group/version; skipping informer", "gvr", gvr.String()) return false } @@ -131,7 +131,7 @@ func (c *crUnifiedCache) EnsureWatchStarted() { dyn := c.dynamicClient c.mu.Unlock() if dyn == nil { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "dynamic client not ready, skip custom resource informer; ensure MetaManager.Init completed") + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "msg", "dynamic client not ready, skip custom resource informer; ensure MetaManager.Init completed") return } c.watchStartOnce.Do(func() { @@ -203,14 +203,14 @@ func (c *crUnifiedCache) EnsureWatchStarted() { }, }) if err != nil { - logger.Error(context.Background(), K8sMetaUnifyErrorCode, "fail to add dynamic informer event handler", err, "resourceType", c.resourceType, "gvr", c.gvr.String()) + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "msg", "fail to add dynamic informer event handler", "error", err, "resourceType", c.resourceType, "gvr", c.gvr.String()) } if err := attachWatchErrorHandler(c.informer, c.giveUp, watchErrorHandlerOpts{ ResourceType: c.resourceType, GVR: c.gvr.String(), GiveUpStopMsg: "stopping dynamic informer after repeated errors 
(RBAC/auth or missing API resource; no further retries)", }); err != nil { - logger.Error(context.Background(), K8sMetaUnifyErrorCode, "fail to set dynamic informer watch error handler", err) + logger.Error(context.Background(), K8sMetaUnifyErrorCode, "msg", "fail to set dynamic informer watch error handler", "error", err) } c.watchStarted = true inf := c.informer diff --git a/pkg/helper/k8smeta/k8s_meta_informer_lifecycle.go b/pkg/helper/k8smeta/k8s_meta_informer_lifecycle.go index dcec137e91..4b511f6910 100644 --- a/pkg/helper/k8smeta/k8s_meta_informer_lifecycle.go +++ b/pkg/helper/k8smeta/k8s_meta_informer_lifecycle.go @@ -73,13 +73,15 @@ func attachWatchErrorHandler(informer cache.SharedIndexInformer, g *informerGive var wkvs []interface{} if o.GVR != "" { wkvs = []interface{}{ - o.GiveUpStopMsg, "resourceType", o.ResourceType, + "msg", o.GiveUpStopMsg, + "resourceType", o.ResourceType, "gvr", o.GVR, "failures", n, } } else { wkvs = []interface{}{ - o.GiveUpStopMsg, "resourceType", o.ResourceType, + "msg", o.GiveUpStopMsg, + "resourceType", o.ResourceType, "failures", n, } } @@ -104,16 +106,16 @@ func waitInformerCacheSync(mergedStop <-chan struct{}, hasSynced cache.InformerS for { if cache.WaitForCacheSync(mergedStop, hasSynced) { if o.GVR != "" { - logger.Info(context.Background(), "dynamic informer cache synced", "gvr", o.GVR) + logger.Info(context.Background(), "msg", "dynamic informer cache synced", "gvr", o.GVR) } return } select { case <-mergedStop: if o.GVR != "" { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "dynamic informer cache sync aborted", "gvr", o.GVR) + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "msg", "dynamic informer cache sync aborted", "gvr", o.GVR) } else { - logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "informer cache sync aborted", "resourceType", o.ResourceType) + logger.Warning(context.Background(), K8sMetaUnifyErrorCode, "msg", "informer cache sync aborted", "resourceType", 
o.ResourceType) } return default: From da595b2c8e40455e6a730e99057b09c02faf9917 Mon Sep 17 00:00:00 2001 From: StartE Date: Wed, 6 May 2026 02:22:20 +0000 Subject: [PATCH 20/20] synchronize LinkGenerator metaCache reads with cacheMu --- pkg/helper/k8smeta/k8s_meta_link.go | 10 +++- pkg/helper/k8smeta/k8s_meta_link_test.go | 62 +++++++++++++----------- pkg/helper/k8smeta/k8s_meta_manager.go | 2 +- 3 files changed, 43 insertions(+), 31 deletions(-) diff --git a/pkg/helper/k8smeta/k8s_meta_link.go b/pkg/helper/k8smeta/k8s_meta_link.go index 45499c92a8..72605eb20b 100644 --- a/pkg/helper/k8smeta/k8s_meta_link.go +++ b/pkg/helper/k8smeta/k8s_meta_link.go @@ -13,7 +13,8 @@ import ( ) type LinkGenerator struct { - metaCache map[string]MetaCache + metaCache map[string]MetaCache + metaCacheMu *sync.RWMutex // same mutex as MetaManager.cacheMu; guards map structure for RegisterCustomResourceCollector podCRMu sync.RWMutex podCRByLinkType map[string]*podCRLinkRuntime @@ -26,9 +27,10 @@ type podCRLinkRuntime struct { podLabelKey string } -func NewK8sMetaLinkGenerator(metaCache map[string]MetaCache) *LinkGenerator { +func NewK8sMetaLinkGenerator(metaCache map[string]MetaCache, metaCacheMu *sync.RWMutex) *LinkGenerator { return &LinkGenerator{ metaCache: metaCache, + metaCacheMu: metaCacheMu, podCRByLinkType: make(map[string]*podCRLinkRuntime), } } @@ -64,6 +66,10 @@ func (g *LinkGenerator) GenerateLinks(events []*K8sMetaEvent, linkType string) [ if !strings.HasPrefix(linkType, resourceType) { return nil } + if g.metaCacheMu != nil { + g.metaCacheMu.RLock() + defer g.metaCacheMu.RUnlock() + } // CustomResource links (third-party CR): // 1) Pod→CR when linkType is registered via PodLink (before built-in switch). // 2) namespaced CR→Namespace when linkType is "->namespace" (after switch, so built-in *->namespace kinds stay in cases above). 
diff --git a/pkg/helper/k8smeta/k8s_meta_link_test.go b/pkg/helper/k8smeta/k8s_meta_link_test.go index f43b2a5190..fe91ff9210 100644 --- a/pkg/helper/k8smeta/k8s_meta_link_test.go +++ b/pkg/helper/k8smeta/k8s_meta_link_test.go @@ -14,6 +14,12 @@ import ( "k8s.io/apimachinery/pkg/runtime/schema" ) +// testLinkGenerator returns a LinkGenerator for unit tests (nil metaCacheMu; tests do not +// concurrently mutate the shared cache map). +func testLinkGenerator(metaCache map[string]MetaCache) *LinkGenerator { + return NewK8sMetaLinkGenerator(metaCache, nil) +} + func TestGetPodNodeLink(t *testing.T) { podCache := newK8sMetaCache(make(chan struct{}), POD) nodeCache := newK8sMetaCache(make(chan struct{}), NODE) @@ -49,7 +55,7 @@ func TestGetPodNodeLink(t *testing.T) { EventType: "add", Object: pod2, }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ POD: podCache, NODE: nodeCache, }) @@ -165,7 +171,7 @@ func TestGetPodDeploymentLink(t *testing.T) { EventType: "add", Object: pod2, }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ POD: podCache, REPLICASET: replicasetCache, DEPLOYMENT: deploymentCache, @@ -259,7 +265,7 @@ func TestGetReplicaSetDeploymentLink(t *testing.T) { }, }, }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ REPLICASET: replicasetCache, DEPLOYMENT: deploymentCache, }) @@ -340,7 +346,7 @@ func TestGetPodReplicaSetLink(t *testing.T) { EventType: "add", Object: pod2, }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ POD: podCache, REPLICASET: replicasetCache, }) @@ -407,7 +413,7 @@ func TestGetPodDaemonSetLink(t *testing.T) { EventType: "add", Object: pod2, }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := 
testLinkGenerator(map[string]MetaCache{ POD: podCache, DAEMONSET: daemonsetCache, }) @@ -474,7 +480,7 @@ func TestGetPodStatefulSetLink(t *testing.T) { EventType: "add", Object: pod2, }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ POD: podCache, STATEFULSET: statefulsetCache, }) @@ -541,7 +547,7 @@ func TestGetPodJobLink(t *testing.T) { EventType: "add", Object: pod2, }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ POD: podCache, JOB: jobCache, }) @@ -622,7 +628,7 @@ func TestGetJobCronJobLink(t *testing.T) { EventType: "add", Object: job2, }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ JOB: jobCache, CRONJOB: cronJobCache, }) @@ -697,7 +703,7 @@ func TestGetPodPVCLink(t *testing.T) { EventType: "add", Object: pod2, }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ POD: podCache, PERSISTENTVOLUMECLAIM: pvcCache, }) @@ -776,7 +782,7 @@ func TestGetPodConfigMapLink(t *testing.T) { EventType: "add", Object: pod2, }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ POD: podCache, CONFIGMAP: configMapCache, }) @@ -847,7 +853,7 @@ func TestGetPodServiceLink(t *testing.T) { EventType: "add", Object: pod2, }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ POD: podCache, SERVICE: serviceCache, }) @@ -877,7 +883,7 @@ func TestGetPodContainerLink(t *testing.T) { EventType: "add", Object: generateMockPod("2"), }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ POD: podCache, }) podList := []*K8sMetaEvent{ @@ -991,7 +997,7 @@ func 
TestGetIngressServiceLink(t *testing.T) { }, }, }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ INGRESS: ingressCache, SERVICE: serviceCache, }) @@ -1040,7 +1046,7 @@ func TestGetPodNamespaceLink(t *testing.T) { EventType: "add", Object: pod3, }) - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ POD: podCache, NAMESPACE: namespaceCache, }) @@ -1125,7 +1131,7 @@ func TestGetServiceNamespaceLink(t *testing.T) { Object: service2, }, } - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ SERVICE: serviceCache, NAMESPACE: namespaceCache, }) @@ -1186,7 +1192,7 @@ func TestGetDeploymentNamespaceLink(t *testing.T) { Object: deployment2, }, } - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ DEPLOYMENT: deploymentCache, NAMESPACE: namespaceCache, }) @@ -1247,7 +1253,7 @@ func TestGetDaemonSetNamespaceLink(t *testing.T) { Object: daemonset2, }, } - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ DAEMONSET: daemonSetCache, NAMESPACE: namespaceCache, }) @@ -1307,7 +1313,7 @@ func TestGetStatefulSetNamespaceLink(t *testing.T) { Object: statefulSet2, }, } - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ DAEMONSET: statefulSetCache, NAMESPACE: namespaceCache, }) @@ -1367,7 +1373,7 @@ func TestGetConfigMapNamespaceLink(t *testing.T) { Object: configmap2, }, } - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ CONFIGMAP: configmapCache, NAMESPACE: namespaceCache, }) @@ -1427,7 +1433,7 @@ func TestGetJobNamespaceLink(t *testing.T) { Object: job2, }, } - linkGenerator := 
NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ JOB: jobCache, NAMESPACE: namespaceCache, }) @@ -1487,7 +1493,7 @@ func TestGetCronJobNamespaceLink(t *testing.T) { Object: cronjob2, }, } - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ CRONJOB: cronjobCache, NAMESPACE: namespaceCache, }) @@ -1547,7 +1553,7 @@ func TestGetPVCNamespaceLink(t *testing.T) { Object: pvc2, }, } - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ PERSISTENTVOLUMECLAIM: pvcCache, NAMESPACE: namespaceCache, }) @@ -1607,7 +1613,7 @@ func TestGetIngressNamespaceLink(t *testing.T) { Object: ingress2, }, } - linkGenerator := NewK8sMetaLinkGenerator(map[string]MetaCache{ + linkGenerator := testLinkGenerator(map[string]MetaCache{ INGRESS: ingressCache, NAMESPACE: namespaceCache, }) @@ -1697,7 +1703,7 @@ func TestGetPodCustomResourceLinkViaOwnerReference(t *testing.T) { }} podCache.metaStore.handleAddOrUpdateEvent(&K8sMetaEvent{EventType: EventTypeAdd, Object: podW}) - lg := NewK8sMetaLinkGenerator(map[string]MetaCache{ + lg := testLinkGenerator(map[string]MetaCache{ POD: podCache, entityType: crCache, }) @@ -1741,7 +1747,7 @@ func TestGetPodCustomResourceLinkViaLabelFallback(t *testing.T) { pod.Labels = map[string]string{"workflows.argoproj.io/workflow": "wf-from-label"} podCache.metaStore.handleAddOrUpdateEvent(&K8sMetaEvent{EventType: EventTypeAdd, Object: podW}) - lg := NewK8sMetaLinkGenerator(map[string]MetaCache{ + lg := testLinkGenerator(map[string]MetaCache{ POD: podCache, entityType: crCache, }) @@ -1776,7 +1782,7 @@ func TestGetCustomResourceNamespaceLink(t *testing.T) { }, }} - lg := NewK8sMetaLinkGenerator(map[string]MetaCache{NAMESPACE: nsCache}) + lg := testLinkGenerator(map[string]MetaCache{NAMESPACE: nsCache}) results := lg.GenerateLinks(events, linkType) require.Len(t, results, 1) 
ncr, ok := results[0].Object.Raw.(*NamespaceCustomResource) @@ -1804,7 +1810,7 @@ func TestGetPodCustomResourceLinkMissingCRCache(t *testing.T) { podCache.metaStore.handleAddOrUpdateEvent(&K8sMetaEvent{EventType: EventTypeAdd, Object: podW}) // Intentionally omit entityType from metaCache (no CR cache registered). - lg := NewK8sMetaLinkGenerator(map[string]MetaCache{POD: podCache}) + lg := testLinkGenerator(map[string]MetaCache{POD: podCache}) lg.registerPodCRLink(linkType, testPodCRLinkRuntime()) podList := []*K8sMetaEvent{{EventType: EventTypeUpdate, Object: podCache.metaStore.Items["default/pod3"]}} @@ -1832,7 +1838,7 @@ func TestGetPodCustomResourceLinkCRCacheHitMiss(t *testing.T) { }} podCache.metaStore.handleAddOrUpdateEvent(&K8sMetaEvent{EventType: EventTypeAdd, Object: podW}) - lg := NewK8sMetaLinkGenerator(map[string]MetaCache{ + lg := testLinkGenerator(map[string]MetaCache{ POD: podCache, entityType: crCache, }) diff --git a/pkg/helper/k8smeta/k8s_meta_manager.go b/pkg/helper/k8smeta/k8s_meta_manager.go index a2edae666b..577a29c51e 100644 --- a/pkg/helper/k8smeta/k8s_meta_manager.go +++ b/pkg/helper/k8smeta/k8s_meta_manager.go @@ -76,7 +76,7 @@ func GetMetaManagerInstance() *MetaManager { for _, resource := range AllResources { metaManager.cacheMap[resource] = newK8sMetaCache(metaManager.stopCh, resource) } - metaManager.linkGenerator = NewK8sMetaLinkGenerator(metaManager.cacheMap) + metaManager.linkGenerator = NewK8sMetaLinkGenerator(metaManager.cacheMap, &metaManager.cacheMu) metaManager.linkRegisterMap = make(map[string][]string) metaManager.projectNames = make(map[string]int) })