diff --git a/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/defaults/main.yml b/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/defaults/main.yml
index 713c1d38029..89b50971aaf 100644
--- a/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/defaults/main.yml
+++ b/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/defaults/main.yml
@@ -24,10 +24,13 @@ nvidia_gpu_operator_automatic_install_plan_approval: true
 nvidia_gpu_operator_starting_csv: gpu-operator-certified.v23.6.0
 nvidia_gpu_operator_wait_for_deploy: true
 nvidia_gpu_operator_use_catalog_snapshot: false
-nvidia_gpu_operator_catalogsource_name: ""
-nvidia_gpu_operator_catalog_snapshot_image: ""
-nvidia_gpu_operator_catalog_snapshot_image_tag: ""
+# Snapshot catalog settings consumed by the CatalogSource and Subscription
+# templates when nvidia_gpu_operator_use_catalog_snapshot is enabled.
+nvidia_gpu_operator_catalogsource_name: "redhat-certified-nv-snap"
+nvidia_gpu_operator_catalog_snapshot_image: "quay.io/gpte-devops-automation/olm_snapshot_certified_catalog"
+nvidia_gpu_operator_catalog_snapshot_image_tag: "v4.20_2026_02_11"
 
 ocp4_workload_nvidia_gpu_setup_create_dashboard: false
-ocp4_workload_nvidia_gpu_setup_dcgm_exporter_dashboard_url: >
-  https://github.com/NVIDIA/dcgm-exporter/raw/main/grafana/dcgm-exporter-dashboard.json
\ No newline at end of file
+# ">-" strips the trailing newline that plain ">" appends to the URL once the
+# file ends with a line break.
+ocp4_workload_nvidia_gpu_setup_dcgm_exporter_dashboard_url: >-
+  https://github.com/NVIDIA/dcgm-exporter/raw/main/grafana/dcgm-exporter-dashboard.json
diff --git a/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/tasks/nvidia_gpu_operator.yml b/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/tasks/nvidia_gpu_operator.yml
index 44d5f04e761..0196aa5af4e 100644
--- a/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/tasks/nvidia_gpu_operator.yml
+++ b/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/tasks/nvidia_gpu_operator.yml
@@ -6,6 +6,19 @@
     kind: Namespace
     name: "{{ nvidia_gpu_operator_namespace }}"
 
+# Optional: install a CatalogSource backed by a snapshot image. The Subscription
+# template selects it when nvidia_gpu_operator_use_catalog_snapshot is true.
+- name: Create catalog source snapshot
+  when: nvidia_gpu_operator_use_catalog_snapshot | default(false) | bool
+  kubernetes.core.k8s:
+    state: present
+    definition: "{{ lookup('template', 'certified-operators-index.yaml.j2') | from_yaml }}"
+  register: snapshot_result
+  # Explicit success condition so the retries apply on every ansible-core release.
+  until: snapshot_result is not failed
+  retries: 30
+  delay: 5
+
 - name: Create NVIDIA GPU operatorgroup
   kubernetes.core.k8s:
     state: present
@@ -22,6 +35,38 @@
   retries: 40
   delay: 6
 
-- name: 120 second pause for NVIDIA GPU operator setup
+# Manual approval path: poll the Subscription until its status names an
+# install plan, then approve that plan in the following task.
+- name: Wait for the status of the subscription to not be empty
+  when: not (nvidia_gpu_operator_automatic_install_plan_approval | default(true) | bool)
+  kubernetes.core.k8s_info:
+    api_version: operators.coreos.com/v1alpha1
+    kind: Subscription
+    name: gpu-operator-certified
+    namespace: nvidia-gpu-operator
+  register: nv_subscription_out
+  until:
+    - nv_subscription_out is defined
+    - nv_subscription_out.resources is defined
+    - nv_subscription_out.resources[0] is defined
+    - nv_subscription_out.resources[0].status is defined
+    - nv_subscription_out.resources[0].status.installplan is defined
+    - nv_subscription_out.resources[0].status.installplan.name is defined
+  retries: 30
+  delay: 20
+
+- name: Patch the installplan to approve it
+  when: not (nvidia_gpu_operator_automatic_install_plan_approval | default(true) | bool)
+  kubernetes.core.k8s:
+    definition:
+      apiVersion: operators.coreos.com/v1alpha1
+      kind: InstallPlan
+      metadata:
+        namespace: nvidia-gpu-operator
+        name: "{{ nv_subscription_out.resources[0].status.installplan.name }}"
+      spec:
+        approved: true
+
+- name: 180 second pause for NVIDIA GPU operator setup
   pause:
     seconds: 180
diff --git a/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/templates/certified-operators-index.yaml.j2 b/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/templates/certified-operators-index.yaml.j2
new file mode 100644
index 00000000000..7d63054a001
--- /dev/null
+++ b/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/templates/certified-operators-index.yaml.j2
@@ -0,0 +1,12 @@
+# CatalogSource serving the catalog snapshot image configured through the
+# nvidia_gpu_operator_catalog_snapshot_image and ..._image_tag variables.
+apiVersion: operators.coreos.com/v1alpha1
+kind: CatalogSource
+metadata:
+  name: "{{ nvidia_gpu_operator_catalogsource_name }}"
+  namespace: openshift-marketplace
+spec:
+  displayName: "Certified Operators Index {{ nvidia_gpu_operator_catalog_snapshot_image_tag }}"
+  image: "{{ nvidia_gpu_operator_catalog_snapshot_image }}:{{ nvidia_gpu_operator_catalog_snapshot_image_tag }}"
+  publisher: redhat-cop-agnosticd
+  sourceType: grpc
diff --git a/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/templates/nvidia_gpu_sub.yaml.j2 b/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/templates/nvidia_gpu_sub.yaml.j2
index e1e692071ae..4eae93f62e9 100644
--- a/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/templates/nvidia_gpu_sub.yaml.j2
+++ b/ansible/roles_ocp_workloads/ocp4_workload_nvidia_gpu_setup/templates/nvidia_gpu_sub.yaml.j2
@@ -6,8 +6,8 @@ metadata:
   namespace: nvidia-gpu-operator
 spec:
   channel: '{{ nvidia_gpu_operator_channel }}'
-  installPlanApproval: Automatic
+  installPlanApproval: "{{ ( nvidia_gpu_operator_automatic_install_plan_approval | default(true) | bool ) | ternary( 'Automatic', 'Manual') }}"
   name: gpu-operator-certified
-  source: certified-operators
+  source: "{{ ( nvidia_gpu_operator_use_catalog_snapshot | default(false) | bool ) | ternary( nvidia_gpu_operator_catalogsource_name, 'certified-operators') }}"
   sourceNamespace: openshift-marketplace
   startingCSV: '{{ nvidia_gpu_operator_starting_csv }}'