diff --git a/Documentation/api.md b/Documentation/api.md index 27484af1a5..57e8e25bd4 100644 --- a/Documentation/api.md +++ b/Documentation/api.md @@ -44,6 +44,7 @@ Configuring Cluster Monitoring is optional. If the config does not exist or is e * [PrometheusOperatorConfig](#prometheusoperatorconfig) * [PrometheusRestrictedConfig](#prometheusrestrictedconfig) * [RemoteWriteSpec](#remotewritespec) +* [ResourceLabels](#resourcelabels) * [TLSConfig](#tlsconfig) * [TelemeterClientConfig](#telemeterclientconfig) * [ThanosQuerierConfig](#thanosquerierconfig) @@ -175,6 +176,7 @@ The `KubeStateMetricsConfig` resource defines settings for the `kube-state-metri | resources | *[v1.ResourceRequirements](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#resourcerequirements-v1-core) | Defines resource requests and limits for the KubeStateMetrics container. | | tolerations | [][v1.Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#toleration-v1-core) | Defines tolerations for the pods. | | topologySpreadConstraints | []v1.TopologySpreadConstraint | Defines a pod's topology spread constraints. | +| additionalResourceLabels | [][ResourceLabels](#resourcelabels) | Defines additional resource labels to expose as metrics in addition to the default labels. Currently, only `jobs` and `cronjobs` resources are supported due to cardinality concerns. Each entry specifies a resource name and a list of label names (use `*` to expose all labels). | [Back to TOC](#table-of-contents) @@ -555,6 +557,25 @@ The `RemoteWriteSpec` resource defines the settings for remote write storage. [Back to TOC](#table-of-contents) +## ResourceLabels + +#### Description + +The `ResourceLabels` resource defines which Kubernetes labels to expose as metrics for a given resource type. + +#### Required + - ` resource ` + - ` labels ` + +appears in: [KubeStateMetricsConfig](#kubestatemetricsconfig) + +| Property | Type | Description | +| -------- | ---- | ----------- | +| resource | string | Defines the Kubernetes resource name (for example, `jobs` or `cronjobs`). | +| labels | []string | Defines the list of Kubernetes labels to expose as metrics for this resource. Use `*` to expose all labels. | + +[Back to TOC](#table-of-contents) + ## TLSConfig #### Description diff --git a/Documentation/openshiftdocs/index.adoc b/Documentation/openshiftdocs/index.adoc index cb855df793..38e6c645a8 100644 --- a/Documentation/openshiftdocs/index.adoc +++ b/Documentation/openshiftdocs/index.adoc @@ -64,6 +64,7 @@ The configuration file itself is always defined under the `config.yaml` key in t * link:modules/prometheusoperatorconfig.adoc[PrometheusOperatorConfig] * link:modules/prometheusrestrictedconfig.adoc[PrometheusRestrictedConfig] * link:modules/remotewritespec.adoc[RemoteWriteSpec] +* link:modules/resourcelabels.adoc[ResourceLabels] * link:modules/tlsconfig.adoc[TLSConfig] * link:modules/telemeterclientconfig.adoc[TelemeterClientConfig] * link:modules/thanosquerierconfig.adoc[ThanosQuerierConfig] diff --git a/Documentation/openshiftdocs/modules/kubestatemetricsconfig.adoc b/Documentation/openshiftdocs/modules/kubestatemetricsconfig.adoc index 1872516996..b57d64789d 100644 --- a/Documentation/openshiftdocs/modules/kubestatemetricsconfig.adoc +++ b/Documentation/openshiftdocs/modules/kubestatemetricsconfig.adoc @@ -26,6 +26,8 @@ Appears in: link:clustermonitoringconfiguration.adoc[ClusterMonitoringConfigurat |topologySpreadConstraints|[]v1.TopologySpreadConstraint|Defines a pod's topology spread constraints. +|additionalResourceLabels|[]link:resourcelabels.adoc[ResourceLabels]|Defines additional resource labels to expose as metrics in addition to the default labels. Currently, only `jobs` and `cronjobs` resources are supported due to cardinality concerns. Each entry specifies a resource name and a list of label names (use `*` to expose all labels). + |=== link:../index.adoc[Back to TOC] diff --git a/Documentation/openshiftdocs/modules/resourcelabels.adoc b/Documentation/openshiftdocs/modules/resourcelabels.adoc new file mode 100644 index 0000000000..a03bd18323 --- /dev/null +++ b/Documentation/openshiftdocs/modules/resourcelabels.adoc @@ -0,0 +1,30 @@ +// DO NOT EDIT THE CONTENT IN THIS FILE. It is automatically generated from the + // source code for the Cluster Monitoring Operator. Any changes made to this + // file will be overwritten when the content is re-generated. If you wish to + // make edits, read the docgen utility instructions in the source code for the + // CMO. + :_content-type: ASSEMBLY + +== ResourceLabels + +=== Description + +The `ResourceLabels` resource defines which Kubernetes labels to expose as metrics for a given resource type. + +=== Required +* `resource` +* `labels` + + +Appears in: link:kubestatemetricsconfig.adoc[KubeStateMetricsConfig] + +[options="header"] +|=== +| Property | Type | Description +|resource|string|Defines the Kubernetes resource name (for example, `jobs` or `cronjobs`). + +|labels|[]string|Defines the list of Kubernetes labels to expose as metrics for this resource. Use `*` to expose all labels. + +|=== + +link:../index.adoc[Back to TOC] diff --git a/pkg/manifests/config.go b/pkg/manifests/config.go index 7b92ee664a..f2fec724c2 100644 --- a/pkg/manifests/config.go +++ b/pkg/manifests/config.go @@ -363,9 +363,50 @@ func NewConfigFromString(content string) (*Config, error) { ) } + // Validate additional resource labels for KSM. + if err := validateAdditionalResourceLabels(c.ClusterMonitoringConfiguration.KubeStateMetricsConfig); err != nil { + return nil, err + } + return &c, nil } +var supportedResourceLabelsResources = []string{"jobs", "cronjobs"} + +func validateAdditionalResourceLabels(ksm *KubeStateMetricsConfig) error { + if ksm == nil { + return nil + } + + seenResources := map[string]bool{} + for _, rl := range ksm.AdditionalResourceLabels { + if rl.Resource == "" { + return fmt.Errorf("%w: additionalResourceLabels: resource name must not be empty", ErrConfigValidation) + } + if !slices.Contains(supportedResourceLabelsResources, rl.Resource) { + return fmt.Errorf("%w: additionalResourceLabels: unsupported resource %q, supported resources are: %v", ErrConfigValidation, rl.Resource, supportedResourceLabelsResources) + } + if seenResources[rl.Resource] { + return fmt.Errorf("%w: additionalResourceLabels: duplicate resource %q", ErrConfigValidation, rl.Resource) + } + seenResources[rl.Resource] = true + if len(rl.Labels) == 0 { + return fmt.Errorf("%w: additionalResourceLabels: resource %q must have at least one label", ErrConfigValidation, rl.Resource) + } + if slices.Contains(rl.Labels, "") { + return fmt.Errorf("%w: additionalResourceLabels: resource %q has an empty label value", ErrConfigValidation, rl.Resource) + } + seenLabels := map[string]bool{} + for _, l := range rl.Labels { + if seenLabels[l] { + return fmt.Errorf("%w: additionalResourceLabels: resource %q has duplicate label %q", ErrConfigValidation, rl.Resource, l) + } + seenLabels[l] = true + } + } + return nil +} + func (c *Config) applyDefaults() { if c.Images == nil { c.Images = &Images{} diff --git a/pkg/manifests/config_test.go b/pkg/manifests/config_test.go index 979002115f..9b9c22ca27 100644 --- a/pkg/manifests/config_test.go +++ b/pkg/manifests/config_test.go @@ -753,6 +753,119 @@ func TestCollectionProfileValues(t *testing.T) { } } +func TestAdditionalResourceLabelsValidation(t *testing.T) { + for _, tc := range []struct { + name string + config string + expectError bool + }{ + { + name: "no additional resource labels", + config: "", + expectError: false, + }, + { + name: "valid jobs resource", + config: `kubeStateMetrics: + additionalResourceLabels: + - resource: jobs + labels: + - foo`, + expectError: false, + }, + { + name: "valid cronjobs resource", + config: `kubeStateMetrics: + additionalResourceLabels: + - resource: cronjobs + labels: + - bar`, + expectError: false, + }, + { + name: "valid multiple resources", + config: `kubeStateMetrics: + additionalResourceLabels: + - resource: jobs + labels: + - foo + - resource: cronjobs + labels: + - bar`, + expectError: false, + }, + { + name: "unsupported resource", + config: `kubeStateMetrics: + additionalResourceLabels: + - resource: pods + labels: + - foo`, + expectError: true, + }, + { + name: "empty resource name", + config: `kubeStateMetrics: + additionalResourceLabels: + - resource: "" + labels: + - foo`, + expectError: true, + }, + { + name: "duplicate resource", + config: `kubeStateMetrics: + additionalResourceLabels: + - resource: jobs + labels: + - foo + - resource: jobs + labels: + - bar`, + expectError: true, + }, + { + name: "no labels", + config: `kubeStateMetrics: + additionalResourceLabels: + - resource: jobs + labels: []`, + expectError: true, + }, + { + name: "empty label value", + config: `kubeStateMetrics: + additionalResourceLabels: + - resource: jobs + labels: + - foo + - ""`, + expectError: true, + }, + { + name: "duplicate label", + config: `kubeStateMetrics: + additionalResourceLabels: + - resource: jobs + labels: + - foo + - bar + - foo`, + expectError: true, + }, + } { + t.Run(tc.name, func(t *testing.T) { + _, err := NewConfigFromString(tc.config) + if tc.expectError { + require.Error(t, err) + require.ErrorIs(t, err, ErrConfigValidation) + } else { + require.NoError(t, err) + } + }) + } +} + func TestDeprecatedConfig(t *testing.T) { for _, tc := range []struct { name string diff --git a/pkg/manifests/manifests.go b/pkg/manifests/manifests.go index f6471de312..bc24b5bf76 100644 --- a/pkg/manifests/manifests.go +++ b/pkg/manifests/manifests.go @@ -788,6 +788,18 @@ func (f *Factory) KubeStateMetricsDeployment() (*appsv1.Deployment, error) { if f.config.ClusterMonitoringConfiguration.KubeStateMetricsConfig.Resources != nil { d.Spec.Template.Spec.Containers[i].Resources = *f.config.ClusterMonitoringConfiguration.KubeStateMetricsConfig.Resources } + additionalResourceLabels := f.config.ClusterMonitoringConfiguration.KubeStateMetricsConfig.AdditionalResourceLabels + if len(additionalResourceLabels) > 0 { + for j := range container.Args { + if strings.HasPrefix(container.Args[j], "--metric-labels-allowlist=") { + var parts []string + for _, rl := range additionalResourceLabels { + parts = append(parts, fmt.Sprintf("%s=[%s]", rl.Resource, strings.Join(rl.Labels, ","))) + } + container.Args[j] += "," + strings.Join(parts, ",") + } + } + } } } diff --git a/pkg/manifests/manifests_test.go b/pkg/manifests/manifests_test.go index 8d9be2fff0..873e233764 100644 --- a/pkg/manifests/manifests_test.go +++ b/pkg/manifests/manifests_test.go @@ -3834,6 +3834,83 @@ func TestKubeStateMetrics(t *testing.T) { } } +func TestKubeStateMetricsAdditionalResourceLabels(t *testing.T) { + defaultAllowList := "--metric-labels-allowlist=pods=[*],nodes=[*],namespaces=[*],persistentvolumes=[*],persistentvolumeclaims=[*],poddisruptionbudgets=[*]" + + tests := []struct { + name string + config string + expectedArg string + }{ + { + name: "no additional resource labels", + config: "", + expectedArg: defaultAllowList, + }, + { + name: "single resource with specific labels", + config: `kubeStateMetrics: + additionalResourceLabels: + - resource: jobs + labels: + - foo + - bar`, + expectedArg: defaultAllowList + ",jobs=[foo,bar]", + }, + { + name: "multiple resources", + config: `kubeStateMetrics: + additionalResourceLabels: + - resource: jobs + labels: + - foo + - resource: cronjobs + labels: + - bar + - baz`, + expectedArg: defaultAllowList + ",jobs=[foo],cronjobs=[bar,baz]", + }, + { + name: "wildcard labels", + config: `kubeStateMetrics: + additionalResourceLabels: + - resource: cronjobs + labels: + - "*"`, + expectedArg: defaultAllowList + ",cronjobs=[*]", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + c, err := NewConfigFromString(tc.config) + require.NoError(t, err) + c.SetImages(map[string]string{ + "kube-state-metrics": "docker.io/openshift/origin-kube-state-metrics:latest", + "kube-rbac-proxy": "docker.io/openshift/origin-kube-rbac-proxy:latest", + }) + + f := NewFactory("openshift-monitoring", "openshift-user-workload-monitoring", c, defaultInfrastructureReader(), &fakeProxyReader{}, NewAssets(assetsPath), &APIServerConfig{}, &configv1.Console{}) + + d, err := f.KubeStateMetricsDeployment() + require.NoError(t, err) + + found := false + for _, container := range d.Spec.Template.Spec.Containers { + if container.Name == "kube-state-metrics" { + for _, arg := range container.Args { + if strings.HasPrefix(arg, "--metric-labels-allowlist=") { + found = true + require.Equal(t, tc.expectedArg, arg) + } + } + } + } + require.True(t, found, "--metric-labels-allowlist arg not found in kube-state-metrics container") + }) + } +} + func TestOpenShiftStateMetrics(t *testing.T) { config := `openshiftStateMetrics: resources: diff --git a/pkg/manifests/types.go b/pkg/manifests/types.go index dd920285d0..7d43bb156d 100644 --- a/pkg/manifests/types.go +++ b/pkg/manifests/types.go @@ -182,6 +182,16 @@ type DedicatedServiceMonitors struct { Enabled bool `json:"enabled,omitempty"` } +// The `ResourceLabels` resource defines which Kubernetes labels to expose +// as metrics for a given resource type. +type ResourceLabels struct { + // Defines the Kubernetes resource name (for example, `jobs` or `cronjobs`). + Resource string `json:"resource"` + // Defines the list of Kubernetes labels to expose as metrics for this + // resource. Use `*` to expose all labels. + Labels []string `json:"labels"` +} + // The `KubeStateMetricsConfig` resource defines settings for the // `kube-state-metrics` agent. type KubeStateMetricsConfig struct { @@ -193,6 +203,11 @@ type KubeStateMetricsConfig struct { Tolerations []v1.Toleration `json:"tolerations,omitempty"` // Defines a pod's topology spread constraints. TopologySpreadConstraints []v1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"` + // Defines additional resource labels to expose as metrics in addition to + // the default labels. Currently, only `jobs` and `cronjobs` resources are + // supported due to cardinality concerns. Each entry specifies a resource + // name and a list of label names (use `*` to expose all labels). + AdditionalResourceLabels []ResourceLabels `json:"additionalResourceLabels,omitempty"` } // The `PrometheusK8sConfig` resource defines settings for the Prometheus