Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion bindata/network/multus/multus.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ metadata:
data:
daemon-config.json: |
{
"cniVersion": "0.3.1",
"cniVersion": "1.1.0",
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Multus was updated to the new version which enables CNI status

"chrootDir": "/hostroot",
"logToStderr": true,
"logLevel": "verbose",
Expand Down
13 changes: 12 additions & 1 deletion bindata/network/ovn-kubernetes/common/008-script-lib.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -540,6 +540,9 @@ data:
# Ensure allow_icmp_network_policy_flag is always defined
allow_icmp_network_policy_flag=

# Ensure dpu_lease_flags is always defined
dpu_lease_flags=

if [[ $# -ne 3 ]]; then
echo "Expected three arguments but got $#"
exit 1
Expand Down Expand Up @@ -600,6 +603,13 @@ data:
multi_external_gateway_enable_flag=""
fi

if [[ -n "${OVNKUBE_NODE_LEASE_RENEW_INTERVAL}" ]]; then
dpu_lease_flags="--dpu-node-lease-renew-interval ${OVNKUBE_NODE_LEASE_RENEW_INTERVAL}"
fi
if [[ -n "${OVNKUBE_NODE_LEASE_DURATION}" ]]; then
dpu_lease_flags="${dpu_lease_flags} --dpu-node-lease-duration ${OVNKUBE_NODE_LEASE_DURATION}"
fi

if [ "{{.OVN_GATEWAY_MODE}}" == "shared" ]; then
gateway_mode_flags="--gateway-mode shared --gateway-interface ${gateway_interface}"
elif [ "{{.OVN_GATEWAY_MODE}}" == "local" ]; then
Expand Down Expand Up @@ -797,5 +807,6 @@ data:
${ovn_v4_transit_switch_subnet_opt} \
${ovn_v6_transit_switch_subnet_opt} \
${egress_features_enable_flag} \
${multi_external_gateway_enable_flag}
${multi_external_gateway_enable_flag} \
${dpu_lease_flags}
}
6 changes: 6 additions & 0 deletions bindata/network/ovn-kubernetes/managed/ovnkube-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -437,6 +437,12 @@ spec:
- name: OVNKUBE_NODE_MGMT_PORT_DP_RESOURCE_NAME
value: {{ .MgmtPortResourceName }}
{{ end }}
{{ if and (or (eq .OVN_NODE_MODE "dpu-host") (eq .OVN_NODE_MODE "dpu")) (ne .DpuNodeLeaseRenewInterval "0") }}
- name: OVNKUBE_NODE_LEASE_RENEW_INTERVAL
value: "{{.DpuNodeLeaseRenewInterval}}"
- name: OVNKUBE_NODE_LEASE_DURATION
value: "{{.DpuNodeLeaseDuration}}"
{{ end }}
{{ if .HTTP_PROXY }}
- name: "HTTP_PROXY"
value: "{{ .HTTP_PROXY}}"
Expand Down
6 changes: 6 additions & 0 deletions bindata/network/ovn-kubernetes/self-hosted/ovnkube-node.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -473,6 +473,12 @@ spec:
- name: OVNKUBE_NODE_MGMT_PORT_DP_RESOURCE_NAME
value: {{ .MgmtPortResourceName }}
{{ end }}
{{ if and (or (eq .OVN_NODE_MODE "dpu-host") (eq .OVN_NODE_MODE "dpu")) (ne .DpuNodeLeaseRenewInterval "0") }}
- name: OVNKUBE_NODE_LEASE_RENEW_INTERVAL
value: "{{.DpuNodeLeaseRenewInterval}}"
- name: OVNKUBE_NODE_LEASE_DURATION
value: "{{.DpuNodeLeaseDuration}}"
{{ end }}
- name: K8S_NODE
valueFrom:
fieldRef:
Expand Down
2 changes: 2 additions & 0 deletions hack/hardware-offload-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,5 @@ data:
dpu-mode-label: "network.operator.openshift.io/dpu="
smart-nic-mode-label: "network.operator.openshift.io/smart-nic="
mgmt-port-resource-name: "openshift.io/mgmtvf"
dpu-node-lease-renew-interval-in-seconds: "10"
dpu-node-lease-duration-in-seconds: "40"
26 changes: 14 additions & 12 deletions pkg/bootstrap/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -30,18 +30,20 @@ type OVNHyperShiftBootstrapResult struct {
}

type OVNConfigBoostrapResult struct {
GatewayMode string
HyperShiftConfig *OVNHyperShiftBootstrapResult
DisableUDPAggregation bool
DpuHostModeLabel string
DpuHostModeNodes []string
DpuHostModeValue string
DpuModeLabel string
DpuModeNodes []string
SmartNicModeLabel string
SmartNicModeNodes []string
SmartNicModeValue string
MgmtPortResourceName string
GatewayMode string
HyperShiftConfig *OVNHyperShiftBootstrapResult
DisableUDPAggregation bool
DpuHostModeLabel string
DpuHostModeNodes []string
DpuHostModeValue string
DpuModeLabel string
DpuModeNodes []string
SmartNicModeLabel string
SmartNicModeNodes []string
SmartNicModeValue string
MgmtPortResourceName string
DpuNodeLeaseRenewInterval int
DpuNodeLeaseDuration int
// ConfigOverrides contains the overrides for the OVN Kubernetes configuration
// This is used to set the hidden OVN Kubernetes configuration in the cluster
// It is a map of key-value pairs where the key is the configuration option and the
Expand Down
10 changes: 6 additions & 4 deletions pkg/network/kube_proxy_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -374,10 +374,12 @@ func TestFillKubeProxyDefaults(t *testing.T) {
var FakeKubeProxyBootstrapResult = bootstrap.BootstrapResult{
OVN: bootstrap.OVNBootstrapResult{
OVNKubernetesConfig: &bootstrap.OVNConfigBoostrapResult{
DpuHostModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU_HOST,
DpuModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU,
SmartNicModeLabel: OVN_NODE_SELECTOR_DEFAULT_SMART_NIC,
MgmtPortResourceName: "",
DpuHostModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU_HOST,
DpuModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU,
SmartNicModeLabel: OVN_NODE_SELECTOR_DEFAULT_SMART_NIC,
MgmtPortResourceName: "",
DpuNodeLeaseRenewInterval: DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT,
DpuNodeLeaseDuration: DPU_NODE_LEASE_DURATION_DEFAULT,
},
},
}
Expand Down
44 changes: 40 additions & 4 deletions pkg/network/ovn_kubernetes.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,11 @@ const OVN_NODE_SELECTOR_DEFAULT_DPU = "network.operator.openshift.io/dpu="
const OVN_NODE_SELECTOR_DEFAULT_SMART_NIC = "network.operator.openshift.io/smart-nic="
const OVN_NODE_IDENTITY_CERT_DURATION = "24h"

// Default DPU health check lease configuration.
// Setting renew-interval to 0 disables the health check.
const DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT = 10
const DPU_NODE_LEASE_DURATION_DEFAULT = 40

// gRPC healthcheck port. See: https://github.com/openshift/enhancements/pull/1209
const OVN_EGRESSIP_HEALTHCHECK_PORT = "9107"

Expand Down Expand Up @@ -232,6 +237,8 @@ func renderOVNKubernetes(conf *operv1.NetworkSpec, bootstrapResult *bootstrap.Bo
data.Data["SmartNicModeLabel"] = bootstrapResult.OVN.OVNKubernetesConfig.SmartNicModeLabel
data.Data["SmartNicModeValue"] = bootstrapResult.OVN.OVNKubernetesConfig.SmartNicModeValue
data.Data["MgmtPortResourceName"] = bootstrapResult.OVN.OVNKubernetesConfig.MgmtPortResourceName
data.Data["DpuNodeLeaseRenewInterval"] = strconv.Itoa(bootstrapResult.OVN.OVNKubernetesConfig.DpuNodeLeaseRenewInterval)
data.Data["DpuNodeLeaseDuration"] = strconv.Itoa(bootstrapResult.OVN.OVNKubernetesConfig.DpuNodeLeaseDuration)
data.Data["OVN_CONTROLLER_INACTIVITY_PROBE"] = os.Getenv("OVN_CONTROLLER_INACTIVITY_PROBE")
controller_inactivity_probe := os.Getenv("OVN_CONTROLLER_INACTIVITY_PROBE")
if len(controller_inactivity_probe) == 0 {
Expand Down Expand Up @@ -1015,10 +1022,12 @@ func findCommonNode(nodeLists ...[]string) (bool, string) {
// if it exists, otherwise returns default configuration for OCP clusters using OVN-Kubernetes
func bootstrapOVNConfig(conf *operv1.Network, kubeClient cnoclient.Client, hc *hypershift.HyperShiftConfig, infraStatus *bootstrap.InfraStatus) (*bootstrap.OVNConfigBoostrapResult, error) {
ovnConfigResult := &bootstrap.OVNConfigBoostrapResult{
DpuHostModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU_HOST,
DpuModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU,
SmartNicModeLabel: OVN_NODE_SELECTOR_DEFAULT_SMART_NIC,
MgmtPortResourceName: "",
DpuHostModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU_HOST,
DpuModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU,
SmartNicModeLabel: OVN_NODE_SELECTOR_DEFAULT_SMART_NIC,
MgmtPortResourceName: "",
DpuNodeLeaseRenewInterval: DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT,
DpuNodeLeaseDuration: DPU_NODE_LEASE_DURATION_DEFAULT,
}
if conf.Spec.DefaultNetwork.OVNKubernetesConfig.GatewayConfig == nil {
bootstrapOVNGatewayConfig(conf, kubeClient.ClientFor("").CRClient())
Expand Down Expand Up @@ -1064,6 +1073,33 @@ func bootstrapOVNConfig(conf *operv1.Network, kubeClient cnoclient.Client, hc *h
if exists {
ovnConfigResult.MgmtPortResourceName = mgmtPortresourceName
}

if val, exists := cm.Data["dpu-node-lease-renew-interval-in-seconds"]; exists {
parsed, err := strconv.Atoi(val)
if err == nil && parsed >= 0 {
ovnConfigResult.DpuNodeLeaseRenewInterval = parsed
} else {
klog.Warningf("Invalid dpu-node-lease-renew-interval-in-seconds %q, using default %d", val, DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT)
}
}
if val, exists := cm.Data["dpu-node-lease-duration-in-seconds"]; exists {
parsed, err := strconv.Atoi(val)
if err == nil && parsed >= 0 {
ovnConfigResult.DpuNodeLeaseDuration = parsed
} else {
klog.Warningf("Invalid dpu-node-lease-duration-in-seconds %q, using default %d", val, DPU_NODE_LEASE_DURATION_DEFAULT)
}
}

// Setting either value to 0 disables the DPU health check.
// When both are non-zero, duration must be greater than interval.
if ovnConfigResult.DpuNodeLeaseRenewInterval != 0 && ovnConfigResult.DpuNodeLeaseDuration != 0 &&
ovnConfigResult.DpuNodeLeaseDuration <= ovnConfigResult.DpuNodeLeaseRenewInterval {
klog.Warningf("dpu-node-lease-duration-in-seconds (%d) must be greater than dpu-node-lease-renew-interval-in-seconds (%d), using defaults",
ovnConfigResult.DpuNodeLeaseDuration, ovnConfigResult.DpuNodeLeaseRenewInterval)
ovnConfigResult.DpuNodeLeaseRenewInterval = DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT
ovnConfigResult.DpuNodeLeaseDuration = DPU_NODE_LEASE_DURATION_DEFAULT
}
Comment on lines +1094 to +1102
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Normalize disable semantics when either lease value is 0.

Line 1094 says either value disables health check, but current flow can still render flags when renew interval is non-zero and duration is 0. Please normalize both fields to 0 when either is 0 so behavior is consistent and not dependent on downstream flag parsing behavior.

Proposed fix
-		// Setting either value to 0 disables the DPU health check.
-		// When both are non-zero, duration must be greater than interval.
-		if ovnConfigResult.DpuNodeLeaseRenewInterval != 0 && ovnConfigResult.DpuNodeLeaseDuration != 0 &&
-			ovnConfigResult.DpuNodeLeaseDuration <= ovnConfigResult.DpuNodeLeaseRenewInterval {
+		// Setting either value to 0 disables the DPU health check.
+		if ovnConfigResult.DpuNodeLeaseRenewInterval == 0 || ovnConfigResult.DpuNodeLeaseDuration == 0 {
+			ovnConfigResult.DpuNodeLeaseRenewInterval = 0
+			ovnConfigResult.DpuNodeLeaseDuration = 0
+		} else if ovnConfigResult.DpuNodeLeaseDuration <= ovnConfigResult.DpuNodeLeaseRenewInterval {
+			// When both are non-zero, duration must be greater than interval.
 			klog.Warningf("dpu-node-lease-duration-in-seconds (%d) must be greater than dpu-node-lease-renew-interval-in-seconds (%d), using defaults",
 				ovnConfigResult.DpuNodeLeaseDuration, ovnConfigResult.DpuNodeLeaseRenewInterval)
 			ovnConfigResult.DpuNodeLeaseRenewInterval = DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT
 			ovnConfigResult.DpuNodeLeaseDuration = DPU_NODE_LEASE_DURATION_DEFAULT
 		}
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
// Setting either value to 0 disables the DPU health check.
// When both are non-zero, duration must be greater than interval.
if ovnConfigResult.DpuNodeLeaseRenewInterval != 0 && ovnConfigResult.DpuNodeLeaseDuration != 0 &&
ovnConfigResult.DpuNodeLeaseDuration <= ovnConfigResult.DpuNodeLeaseRenewInterval {
klog.Warningf("dpu-node-lease-duration-in-seconds (%d) must be greater than dpu-node-lease-renew-interval-in-seconds (%d), using defaults",
ovnConfigResult.DpuNodeLeaseDuration, ovnConfigResult.DpuNodeLeaseRenewInterval)
ovnConfigResult.DpuNodeLeaseRenewInterval = DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT
ovnConfigResult.DpuNodeLeaseDuration = DPU_NODE_LEASE_DURATION_DEFAULT
}
// Setting either value to 0 disables the DPU health check.
if ovnConfigResult.DpuNodeLeaseRenewInterval == 0 || ovnConfigResult.DpuNodeLeaseDuration == 0 {
ovnConfigResult.DpuNodeLeaseRenewInterval = 0
ovnConfigResult.DpuNodeLeaseDuration = 0
} else if ovnConfigResult.DpuNodeLeaseDuration <= ovnConfigResult.DpuNodeLeaseRenewInterval {
// When both are non-zero, duration must be greater than interval.
klog.Warningf("dpu-node-lease-duration-in-seconds (%d) must be greater than dpu-node-lease-renew-interval-in-seconds (%d), using defaults",
ovnConfigResult.DpuNodeLeaseDuration, ovnConfigResult.DpuNodeLeaseRenewInterval)
ovnConfigResult.DpuNodeLeaseRenewInterval = DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT
ovnConfigResult.DpuNodeLeaseDuration = DPU_NODE_LEASE_DURATION_DEFAULT
}
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@pkg/network/ovn_kubernetes.go` around lines 1094 - 1102, If either
ovnConfigResult.DpuNodeLeaseRenewInterval or
ovnConfigResult.DpuNodeLeaseDuration is 0 we should normalize both to 0 so the
disable semantics are consistent; update the logic around the current checks to
first detect if either field == 0 and set both
ovnConfigResult.DpuNodeLeaseRenewInterval = 0 and
ovnConfigResult.DpuNodeLeaseDuration = 0, otherwise keep the existing validation
that when both are non-zero and DpuNodeLeaseDuration <=
DpuNodeLeaseRenewInterval you log the warning and reset to
DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT and DPU_NODE_LEASE_DURATION_DEFAULT.

}

// We want to see if there are any nodes that are labeled for specific modes such as Full/SmartNIC/DPU Host/DPU
Expand Down
91 changes: 91 additions & 0 deletions pkg/network/ovn_kubernetes_dpu_host_test.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package network

import (
"strconv"
"testing"

"github.com/ghodss/yaml"
Expand Down Expand Up @@ -160,6 +161,8 @@ func createTestRenderData(ovnNodeMode string) render.RenderData {
data.Data["SmartNicModeValue"] = ""
data.Data["DpuModeLabel"] = ""
data.Data["MgmtPortResourceName"] = ""
data.Data["DpuNodeLeaseRenewInterval"] = strconv.Itoa(DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT)
data.Data["DpuNodeLeaseDuration"] = strconv.Itoa(DPU_NODE_LEASE_DURATION_DEFAULT)
data.Data["HTTP_PROXY"] = ""
data.Data["HTTPS_PROXY"] = ""
data.Data["NO_PROXY"] = ""
Expand Down Expand Up @@ -211,6 +214,94 @@ func getMatchExpression(g *WithT, ds *appsv1.DaemonSet, label string) (corev1.No
return corev1.NodeSelectorOpDoesNotExist, ""
}

// TestOVNKubernetesLeaseEnvVars tests that DPU lease env vars are set
// for DPU and DPU-host modes but not for full mode
func TestOVNKubernetesLeaseEnvVars(t *testing.T) {
templates := []struct {
name string
templatePath string
}{
{
name: "managed",
templatePath: "../../bindata/network/ovn-kubernetes/managed/ovnkube-node.yaml",
},
{
name: "self-hosted",
templatePath: "../../bindata/network/ovn-kubernetes/self-hosted/ovnkube-node.yaml",
},
}

testCases := []struct {
name string
ovnNodeMode string
expectSet bool
}{
{
name: "full mode should not have lease env vars",
ovnNodeMode: "full",
expectSet: false,
},
{
name: "dpu-host mode should have lease env vars",
ovnNodeMode: "dpu-host",
expectSet: true,
},
{
name: "dpu mode should have lease env vars",
ovnNodeMode: "dpu",
expectSet: true,
},
}

// Env vars with literal values
leaseEnvVars := map[string]string{
"OVNKUBE_NODE_LEASE_RENEW_INTERVAL": strconv.Itoa(DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT),
"OVNKUBE_NODE_LEASE_DURATION": strconv.Itoa(DPU_NODE_LEASE_DURATION_DEFAULT),
}
for _, template := range templates {
for _, tc := range testCases {
testName := template.name + "_" + tc.name
t.Run(testName, func(t *testing.T) {
g := NewGomegaWithT(t)

data := createTestRenderData(tc.ovnNodeMode)

objs, err := render.RenderTemplate(template.templatePath, &data)
g.Expect(err).NotTo(HaveOccurred())
g.Expect(objs).To(HaveLen(1))

yamlBytes, err := yaml.Marshal(objs[0])
g.Expect(err).NotTo(HaveOccurred())

ds := &appsv1.DaemonSet{}
err = yaml.Unmarshal(yamlBytes, ds)
g.Expect(err).NotTo(HaveOccurred())

for envName, expectedValue := range leaseEnvVars {
found := false
for _, container := range ds.Spec.Template.Spec.Containers {
for _, env := range container.Env {
if env.Name == envName {
found = true
g.Expect(env.Value).To(Equal(expectedValue),
"%s should be set to %s", envName, expectedValue)
}
}
}

if tc.expectSet {
g.Expect(found).To(BeTrue(),
"%s should be set for %s mode", envName, tc.ovnNodeMode)
} else {
g.Expect(found).To(BeFalse(),
"%s should not be set for %s mode", envName, tc.ovnNodeMode)
}
}
})
}
}
}

// TestOVNKubernetesNodeSelectorOperator tests that the node selector operator works correctly with label values of different Full/SmartNIC/DPU modes
func TestOVNKubernetesNodeSelectorOperator(t *testing.T) {
templates := []struct {
Expand Down
Loading