Skip to content

Commit 24b8d6b

Browse files
committed
MON-4115: expose label metrics for jobs and cronjobs
Adds `AdditionalResourceLabels` to KSM config. Signed-off-by: Pranshu Srivastava <rexagod@gmail.com>
1 parent 94d1c3d commit 24b8d6b

9 files changed

Lines changed: 289 additions & 0 deletions

File tree

Documentation/api.md

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ Configuring Cluster Monitoring is optional. If the config does not exist or is e
4444
* [PrometheusOperatorConfig](#prometheusoperatorconfig)
4545
* [PrometheusRestrictedConfig](#prometheusrestrictedconfig)
4646
* [RemoteWriteSpec](#remotewritespec)
47+
* [ResourceLabels](#resourcelabels)
4748
* [TLSConfig](#tlsconfig)
4849
* [TelemeterClientConfig](#telemeterclientconfig)
4950
* [ThanosQuerierConfig](#thanosquerierconfig)
@@ -175,6 +176,7 @@ The `KubeStateMetricsConfig` resource defines settings for the `kube-state-metri
175176
| resources | *[v1.ResourceRequirements](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#resourcerequirements-v1-core) | Defines resource requests and limits for the KubeStateMetrics container. |
176177
| tolerations | [][v1.Toleration](https://kubernetes.io/docs/reference/generated/kubernetes-api/v1.35/#toleration-v1-core) | Defines tolerations for the pods. |
177178
| topologySpreadConstraints | []v1.TopologySpreadConstraint | Defines a pod's topology spread constraints. |
179+
| additionalResourceLabels | [][ResourceLabels](#resourcelabels) | Defines additional resource labels to expose as metrics in addition to the default labels. Currently, only `jobs` and `cronjobs` resources are supported due to cardinality concerns. Each entry specifies a resource name and a list of label names (use `*` to expose all labels). |
178180

179181
[Back to TOC](#table-of-contents)
180182

@@ -555,6 +557,25 @@ The `RemoteWriteSpec` resource defines the settings for remote write storage.
555557

556558
[Back to TOC](#table-of-contents)
557559

560+
## ResourceLabels
561+
562+
#### Description
563+
564+
The `ResourceLabels` resource defines which Kubernetes labels to expose as metrics for a given resource type.
565+
566+
#### Required
567+
- ` resource `
568+
- ` labels `
569+
570+
<em>appears in: [KubeStateMetricsConfig](#kubestatemetricsconfig)</em>
571+
572+
| Property | Type | Description |
573+
| -------- | ---- | ----------- |
574+
| resource | string | Defines the Kubernetes resource name (for example, `jobs` or `cronjobs`). |
575+
| labels | []string | Defines the list of Kubernetes labels to expose as metrics for this resource. Use `*` to expose all labels. |
576+
577+
[Back to TOC](#table-of-contents)
578+
558579
## TLSConfig
559580

560581
#### Description

Documentation/openshiftdocs/index.adoc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ The configuration file itself is always defined under the `config.yaml` key in t
6464
* link:modules/prometheusoperatorconfig.adoc[PrometheusOperatorConfig]
6565
* link:modules/prometheusrestrictedconfig.adoc[PrometheusRestrictedConfig]
6666
* link:modules/remotewritespec.adoc[RemoteWriteSpec]
67+
* link:modules/resourcelabels.adoc[ResourceLabels]
6768
* link:modules/tlsconfig.adoc[TLSConfig]
6869
* link:modules/telemeterclientconfig.adoc[TelemeterClientConfig]
6970
* link:modules/thanosquerierconfig.adoc[ThanosQuerierConfig]

Documentation/openshiftdocs/modules/kubestatemetricsconfig.adoc

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@ Appears in: link:clustermonitoringconfiguration.adoc[ClusterMonitoringConfigurat
2626

2727
|topologySpreadConstraints|[]v1.TopologySpreadConstraint|Defines a pod's topology spread constraints.
2828

29+
|additionalResourceLabels|[]link:resourcelabels.adoc[ResourceLabels]|Defines additional resource labels to expose as metrics in addition to the default labels. Currently, only `jobs` and `cronjobs` resources are supported due to cardinality concerns. Each entry specifies a resource name and a list of label names (use `*` to expose all labels).
30+
2931
|===
3032

3133
link:../index.adoc[Back to TOC]
Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
// DO NOT EDIT THE CONTENT IN THIS FILE. It is automatically generated from the
2+
// source code for the Cluster Monitoring Operator. Any changes made to this
3+
// file will be overwritten when the content is re-generated. If you wish to
4+
// make edits, read the docgen utility instructions in the source code for the
5+
// CMO.
6+
:_content-type: ASSEMBLY
7+
8+
== ResourceLabels
9+
10+
=== Description
11+
12+
The `ResourceLabels` resource defines which Kubernetes labels to expose as metrics for a given resource type.
13+
14+
=== Required
15+
* `resource`
16+
* `labels`
17+
18+
19+
Appears in: link:kubestatemetricsconfig.adoc[KubeStateMetricsConfig]
20+
21+
[options="header"]
22+
|===
23+
| Property | Type | Description
24+
|resource|string|Defines the Kubernetes resource name (for example, `jobs` or `cronjobs`).
25+
26+
|labels|[]string|Defines the list of Kubernetes labels to expose as metrics for this resource. Use `*` to expose all labels.
27+
28+
|===
29+
30+
link:../index.adoc[Back to TOC]

pkg/manifests/config.go

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -363,9 +363,40 @@ func NewConfigFromString(content string) (*Config, error) {
363363
)
364364
}
365365

366+
// Validate additional resource labels for KSM.
367+
if err := validateAdditionalResourceLabels(c.ClusterMonitoringConfiguration.KubeStateMetricsConfig); err != nil {
368+
return nil, err
369+
}
370+
366371
return &c, nil
367372
}
368373

374+
var supportedResourceLabelsResources = []string{"jobs", "cronjobs"}
375+
376+
func validateAdditionalResourceLabels(ksm *KubeStateMetricsConfig) error {
377+
if ksm == nil {
378+
return nil
379+
}
380+
381+
seen := map[string]bool{}
382+
for _, rl := range ksm.AdditionalResourceLabels {
383+
if rl.Resource == "" {
384+
return fmt.Errorf("%w: additionalResourceLabels: resource name must not be empty", ErrConfigValidation)
385+
}
386+
if !slices.Contains(supportedResourceLabelsResources, rl.Resource) {
387+
return fmt.Errorf("%w: additionalResourceLabels: unsupported resource %q, supported resources are: %v", ErrConfigValidation, rl.Resource, supportedResourceLabelsResources)
388+
}
389+
if seen[rl.Resource] {
390+
return fmt.Errorf("%w: additionalResourceLabels: duplicate resource %q", ErrConfigValidation, rl.Resource)
391+
}
392+
seen[rl.Resource] = true
393+
if len(rl.Labels) == 0 {
394+
return fmt.Errorf("%w: additionalResourceLabels: resource %q must have at least one label", ErrConfigValidation, rl.Resource)
395+
}
396+
}
397+
return nil
398+
}
399+
369400
func (c *Config) applyDefaults() {
370401
if c.Images == nil {
371402
c.Images = &Images{}

pkg/manifests/config_test.go

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -753,6 +753,98 @@ func TestCollectionProfileValues(t *testing.T) {
753753
}
754754
}
755755

756+
// TestAdditionalResourceLabelsValidation feeds YAML configurations to
// NewConfigFromString and checks whether loading succeeds. The accepted cases
// cover an empty config and `jobs`/`cronjobs` entries; the rejected cases cover
// an unsupported resource, an empty resource name, a duplicated resource and
// an empty label list.
func TestAdditionalResourceLabelsValidation(t *testing.T) {
757+
for _, tc := range []struct {
758+
name string
759+
config string
760+
expectError bool
761+
}{
762+
{
763+
name: "no additional resource labels",
764+
config: "",
765+
expectError: false,
766+
},
767+
{
768+
name: "valid jobs resource",
769+
config: `kubeStateMetrics:
770+
additionalResourceLabels:
771+
- resource: jobs
772+
labels:
773+
- foo`,
774+
expectError: false,
775+
},
776+
{
777+
name: "valid cronjobs resource",
778+
config: `kubeStateMetrics:
779+
additionalResourceLabels:
780+
- resource: cronjobs
781+
labels:
782+
- bar`,
783+
expectError: false,
784+
},
785+
{
786+
name: "valid multiple resources",
787+
config: `kubeStateMetrics:
788+
additionalResourceLabels:
789+
- resource: jobs
790+
labels:
791+
- foo
792+
- resource: cronjobs
793+
labels:
794+
- bar`,
795+
expectError: false,
796+
},
797+
{
798+
name: "unsupported resource",
799+
config: `kubeStateMetrics:
800+
additionalResourceLabels:
801+
- resource: pods
802+
labels:
803+
- foo`,
804+
expectError: true,
805+
},
806+
{
807+
name: "empty resource name",
808+
config: `kubeStateMetrics:
809+
additionalResourceLabels:
810+
- resource: ""
811+
labels:
812+
- foo`,
813+
expectError: true,
814+
},
815+
{
816+
name: "duplicate resource",
817+
config: `kubeStateMetrics:
818+
additionalResourceLabels:
819+
- resource: jobs
820+
labels:
821+
- foo
822+
- resource: jobs
823+
labels:
824+
- bar`,
825+
expectError: true,
826+
},
827+
{
828+
name: "no labels",
829+
config: `kubeStateMetrics:
830+
additionalResourceLabels:
831+
- resource: jobs
832+
labels: []`,
833+
expectError: true,
834+
},
835+
} {
836+
t.Run(tc.name, func(t *testing.T) {
837+
_, err := NewConfigFromString(tc.config)
838+
if tc.expectError {
839+
require.Error(t, err)
840+
// A rejected config must be reported as a validation failure, i.e. the
// returned error has to match ErrConfigValidation.
require.ErrorIs(t, err, ErrConfigValidation)
841+
} else {
842+
require.NoError(t, err)
843+
}
844+
})
845+
}
846+
}
847+
756848
func TestDeprecatedConfig(t *testing.T) {
757849
for _, tc := range []struct {
758850
name string

pkg/manifests/manifests.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -788,6 +788,18 @@ func (f *Factory) KubeStateMetricsDeployment() (*appsv1.Deployment, error) {
788788
if f.config.ClusterMonitoringConfiguration.KubeStateMetricsConfig.Resources != nil {
789789
d.Spec.Template.Spec.Containers[i].Resources = *f.config.ClusterMonitoringConfiguration.KubeStateMetricsConfig.Resources
790790
}
791+
additionalResourceLabels := f.config.ClusterMonitoringConfiguration.KubeStateMetricsConfig.AdditionalResourceLabels
792+
if len(additionalResourceLabels) > 0 {
793+
for j := range container.Args {
794+
if strings.HasPrefix(container.Args[j], "--metric-labels-allowlist=") {
795+
var parts []string
796+
for _, rl := range additionalResourceLabels {
797+
parts = append(parts, fmt.Sprintf("%s=[%s]", rl.Resource, strings.Join(rl.Labels, ",")))
798+
}
799+
container.Args[j] += "," + strings.Join(parts, ",")
800+
}
801+
}
802+
}
791803
}
792804
}
793805

pkg/manifests/manifests_test.go

Lines changed: 85 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3834,6 +3834,91 @@ func TestKubeStateMetrics(t *testing.T) {
38343834
}
38353835
}
38363836

3837+
// TestKubeStateMetricsAdditionalResourceLabels builds the kube-state-metrics
// deployment from several configurations and compares the resulting
// `--metric-labels-allowlist` argument with the expected value. With no
// additionalResourceLabels the argument must equal defaultAllowList; otherwise
// each configured entry is expected as a `resource=[label,...]` suffix, in
// configuration order.
func TestKubeStateMetricsAdditionalResourceLabels(t *testing.T) {
3838+
// Allowlist value expected when no additional resource labels are configured.
defaultAllowList := "--metric-labels-allowlist=pods=[*],nodes=[*],namespaces=[*],persistentvolumes=[*],persistentvolumeclaims=[*],poddisruptionbudgets=[*]"
3839+
3840+
tests := []struct {
3841+
name string
3842+
config string
3843+
expectedArg string
3844+
}{
3845+
{
3846+
name: "no additional resource labels",
3847+
config: "",
3848+
expectedArg: defaultAllowList,
3849+
},
3850+
{
3851+
name: "single resource with specific labels",
3852+
config: `kubeStateMetrics:
3853+
additionalResourceLabels:
3854+
- resource: jobs
3855+
labels:
3856+
- foo
3857+
- bar`,
3858+
expectedArg: defaultAllowList + ",jobs=[foo,bar]",
3859+
},
3860+
{
3861+
name: "multiple resources",
3862+
config: `kubeStateMetrics:
3863+
additionalResourceLabels:
3864+
- resource: jobs
3865+
labels:
3866+
- foo
3867+
- resource: cronjobs
3868+
labels:
3869+
- bar
3870+
- baz`,
3871+
expectedArg: defaultAllowList + ",jobs=[foo],cronjobs=[bar,baz]",
3872+
},
3873+
{
3874+
name: "wildcard labels",
3875+
config: `kubeStateMetrics:
3876+
additionalResourceLabels:
3877+
- resource: cronjobs
3878+
labels:
3879+
- "*"`,
3880+
expectedArg: defaultAllowList + ",cronjobs=[*]",
3881+
},
3882+
}
3883+
3884+
for _, tc := range tests {
3885+
t.Run(tc.name, func(t *testing.T) {
3886+
c, err := NewConfigFromString(tc.config)
3887+
if err != nil {
3888+
t.Fatal(err)
3889+
}
3890+
c.SetImages(map[string]string{
3891+
"kube-state-metrics": "docker.io/openshift/origin-kube-state-metrics:latest",
3892+
"kube-rbac-proxy": "docker.io/openshift/origin-kube-rbac-proxy:latest",
3893+
})
3894+
3895+
f := NewFactory("openshift-monitoring", "openshift-user-workload-monitoring", c, defaultInfrastructureReader(), &fakeProxyReader{}, NewAssets(assetsPath), &APIServerConfig{}, &configv1.Console{})
3896+
3897+
d, err := f.KubeStateMetricsDeployment()
3898+
if err != nil {
3899+
t.Fatal(err)
3900+
}
3901+
3902+
// Find the allowlist argument on the kube-state-metrics container and
// compare it with the expectation; the test fails if it is never seen.
found := false
3903+
for _, container := range d.Spec.Template.Spec.Containers {
3904+
if container.Name == "kube-state-metrics" {
3905+
for _, arg := range container.Args {
3906+
if strings.HasPrefix(arg, "--metric-labels-allowlist=") {
3907+
found = true
3908+
if arg != tc.expectedArg {
3909+
t.Fatalf("expected arg %q, got %q", tc.expectedArg, arg)
3910+
}
3911+
}
3912+
}
3913+
}
3914+
}
3915+
if !found {
3916+
t.Fatal("--metric-labels-allowlist arg not found in kube-state-metrics container")
3917+
}
3918+
})
3919+
}
3920+
}
3921+
38373922
func TestOpenShiftStateMetrics(t *testing.T) {
38383923
config := `openshiftStateMetrics:
38393924
resources:

pkg/manifests/types.go

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,16 @@ type DedicatedServiceMonitors struct {
182182
Enabled bool `json:"enabled,omitempty"`
183183
}
184184

185+
// The `ResourceLabels` resource defines which Kubernetes labels to expose
186+
// as metrics for a given resource type.
187+
type ResourceLabels struct {
188+
// Defines the Kubernetes resource name (for example, `jobs` or `cronjobs`).
// Configuration validation rejects an empty name and any resource other
// than `jobs` or `cronjobs`.
189+
Resource string `json:"resource"`
190+
// Defines the list of Kubernetes labels to expose as metrics for this
191+
// resource. Use `*` to expose all labels.
// Configuration validation requires at least one entry.
192+
Labels []string `json:"labels"`
193+
}
194+
185195
// The `KubeStateMetricsConfig` resource defines settings for the
186196
// `kube-state-metrics` agent.
187197
type KubeStateMetricsConfig struct {
@@ -193,6 +203,11 @@ type KubeStateMetricsConfig struct {
193203
Tolerations []v1.Toleration `json:"tolerations,omitempty"`
194204
// Defines a pod's topology spread constraints.
195205
TopologySpreadConstraints []v1.TopologySpreadConstraint `json:"topologySpreadConstraints,omitempty"`
206+
// Defines additional resource labels to expose as metrics in addition to
207+
// the default labels. Currently, only `jobs` and `cronjobs` resources are
208+
// supported due to cardinality concerns. Each entry specifies a resource
209+
// name and a list of label names (use `*` to expose all labels).
210+
AdditionalResourceLabels []ResourceLabels `json:"additionalResourceLabels,omitempty"`
196211
}
197212

198213
// The `PrometheusK8sConfig` resource defines settings for the Prometheus

0 commit comments

Comments
 (0)