diff --git a/buildkite/pipeline_generator/buildkite_step.py b/buildkite/pipeline_generator/buildkite_step.py index 4b31fd0b..19e068a0 100644 --- a/buildkite/pipeline_generator/buildkite_step.py +++ b/buildkite/pipeline_generator/buildkite_step.py @@ -88,7 +88,7 @@ def get_agent_queue(step: Step): elif step.device == DeviceType.H100: # Route multi-GPU H100 tests to RedHat Frankfurt queue if step.num_devices is not None and step.num_devices >= 4: - return AgentQueue.MITHRIL_H100 + return AgentQueue.REDHAT_H100_FRANKFURT else: return AgentQueue.MITHRIL_H100 elif step.device == DeviceType.H200: diff --git a/buildkite/pipeline_generator/plugin/k8s_plugin.py b/buildkite/pipeline_generator/plugin/k8s_plugin.py index 8048d60c..3b2c2f38 100644 --- a/buildkite/pipeline_generator/plugin/k8s_plugin.py +++ b/buildkite/pipeline_generator/plugin/k8s_plugin.py @@ -176,7 +176,11 @@ def get_k8s_plugin(step: Step, image: str): plugin = None if step.device == DeviceType.H100: - plugin = copy.deepcopy(h100_plugin_template) + # Use RedHat template for multi-GPU tests (4+ GPUs) + if step.num_devices is not None and step.num_devices >= 4: + plugin = copy.deepcopy(h100_rh_plugin_template) + else: + plugin = copy.deepcopy(h100_plugin_template) elif step.device == DeviceType.H200: plugin = copy.deepcopy(nebius_h200_plugin_template) elif step.device == DeviceType.A100.value: