From 9b1ac4eabfb332e1eb6ae05f87a7d1672fb8f692 Mon Sep 17 00:00:00 2001 From: khluu Date: Sun, 29 Mar 2026 03:02:37 -0700 Subject: [PATCH] Reduce Docker plugin template duplication Replace three nearly-identical template dicts with shared constants (_COMMON_ENV, _DEVICE_CONFIGS, _DEFAULT_CONFIG) and a config-driven get_docker_plugin() function. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../plugin/docker_plugin.py | 95 ++++++--------- .../pipeline_generator/tests/__init__.py | 0 .../tests/test_docker_plugin.py | 112 ++++++++++++++++++ 3 files changed, 147 insertions(+), 60 deletions(-) create mode 100644 buildkite/pipeline_generator/tests/__init__.py create mode 100644 buildkite/pipeline_generator/tests/test_docker_plugin.py diff --git a/buildkite/pipeline_generator/plugin/docker_plugin.py b/buildkite/pipeline_generator/plugin/docker_plugin.py index 8e54d915..7b47016b 100644 --- a/buildkite/pipeline_generator/plugin/docker_plugin.py +++ b/buildkite/pipeline_generator/plugin/docker_plugin.py @@ -1,79 +1,54 @@ from step import Step from constants import DeviceType -import copy -docker_plugin_template = { - "image": "", +_COMMON_FIELDS = { "always-pull": True, "propagate-environment": True, - "gpus": "all", - "environment": [ - "VLLM_USAGE_SOURCE=ci-test", - "NCCL_CUMEM_HOST_ENABLE=0", - "HF_HOME=/fsx/hf_cache", - "HF_TOKEN", - "CODECOV_TOKEN", - "BUILDKITE_ANALYTICS_TOKEN", - "RAY_COMPAT_SLACK_WEBHOOK_URL", - ], - "volumes": [ - "/dev/shm:/dev/shm", - "/fsx/hf_cache:/fsx/hf_cache", - ], } -h200_plugin_template = { - "image": "", - "always-pull": True, - "propagate-environment": True, - "gpus": "all", - "environment": [ - "VLLM_USAGE_SOURCE=ci-test", - "NCCL_CUMEM_HOST_ENABLE=0", - "HF_TOKEN", - "HF_HOME", - "CODECOV_TOKEN", - "BUILDKITE_ANALYTICS_TOKEN", - ], - "volumes": [ - "/dev/shm:/dev/shm", - "/mnt/vllm-ci:/mnt/vllm-ci", - ], +_COMMON_ENV = [ + "VLLM_USAGE_SOURCE=ci-test", + "NCCL_CUMEM_HOST_ENABLE=0", + "HF_TOKEN", + "CODECOV_TOKEN", + "BUILDKITE_ANALYTICS_TOKEN", +] + +_DEVICE_CONFIGS = { + DeviceType.H200: { + "environment": _COMMON_ENV + ["HF_HOME"], + "volumes": ["/dev/shm:/dev/shm", "/mnt/vllm-ci:/mnt/vllm-ci"], + "gpus": "all", + }, + DeviceType.B200: { + "environment": _COMMON_ENV + ["HF_HOME"], + "volumes": ["/dev/shm:/dev/shm", "/raid:/raid", "/mnt/shared:/mnt/shared"], + }, } -b200_plugin_template = { - "image": "", - "always-pull": True, - "propagate-environment": True, - "environment": [ - "VLLM_USAGE_SOURCE=ci-test", - "NCCL_CUMEM_HOST_ENABLE=0", - "HF_HOME", - "HF_TOKEN", - "CODECOV_TOKEN", - "BUILDKITE_ANALYTICS_TOKEN", - ], - "volumes": [ - "/dev/shm:/dev/shm", - "/raid:/raid", - "/mnt/shared:/mnt/shared", +_DEFAULT_CONFIG = { + "environment": _COMMON_ENV + [ + "HF_HOME=/fsx/hf_cache", + "RAY_COMPAT_SLACK_WEBHOOK_URL", ], + "volumes": ["/dev/shm:/dev/shm", "/fsx/hf_cache:/fsx/hf_cache"], + "gpus": "all", } def get_docker_plugin(step: Step, image: str): - plugin = None - if step.device == DeviceType.H200: - plugin = copy.deepcopy(h200_plugin_template) - elif step.device == DeviceType.B200: - plugin = copy.deepcopy(b200_plugin_template) - else: - plugin = copy.deepcopy(docker_plugin_template) - plugin["image"] = image + config = _DEVICE_CONFIGS.get(step.device, _DEFAULT_CONFIG) + plugin = { + "image": image, + **_COMMON_FIELDS, + "environment": list(config["environment"]), + "volumes": list(config["volumes"]), + } + if "gpus" in config: + plugin["gpus"] = config["gpus"] if step.label == "Benchmarks" or step.mount_buildkite_agent: plugin["mount_buildkite_agent"] = True - if step.device in (DeviceType.CPU, DeviceType.CPU_SMALL, DeviceType.CPU_MEDIUM) and plugin.get("gpus"): + if step.device in (DeviceType.CPU, DeviceType.CPU_SMALL, DeviceType.CPU_MEDIUM) and "gpus" in plugin: del plugin["gpus"] - # TODO: Add BUILDKITE_ANALYTICS_TOKEN and pytest addopts for fail_fast return plugin diff --git a/buildkite/pipeline_generator/tests/__init__.py b/buildkite/pipeline_generator/tests/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/buildkite/pipeline_generator/tests/test_docker_plugin.py b/buildkite/pipeline_generator/tests/test_docker_plugin.py new file mode 100644 index 00000000..35ce0348 --- /dev/null +++ b/buildkite/pipeline_generator/tests/test_docker_plugin.py @@ -0,0 +1,112 @@ +import sys +import os + +# Add the pipeline_generator directory to the path so imports resolve correctly. +sys.path.insert(0, os.path.join(os.path.dirname(__file__), "..")) + +from plugin.docker_plugin import get_docker_plugin, _COMMON_ENV +from constants import DeviceType +from step import Step + + +def _make_step(**kwargs): + defaults = {"label": "Test"} + defaults.update(kwargs) + return Step(**defaults) + + +class TestDefaultDevice: + def test_gpus_all(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.H100), "img") + assert plugin["gpus"] == "all" + + def test_environment_includes_hf_home_fsx(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.H100), "img") + assert "HF_HOME=/fsx/hf_cache" in plugin["environment"] + + def test_environment_includes_ray_compat(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.H100), "img") + assert "RAY_COMPAT_SLACK_WEBHOOK_URL" in plugin["environment"] + + def test_volumes(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.H100), "img") + assert "/fsx/hf_cache:/fsx/hf_cache" in plugin["volumes"] + assert "/dev/shm:/dev/shm" in plugin["volumes"] + + +class TestH200: + def test_volumes(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.H200), "img") + assert "/mnt/vllm-ci:/mnt/vllm-ci" in plugin["volumes"] + assert "/dev/shm:/dev/shm" in plugin["volumes"] + + def test_gpus_all(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.H200), "img") + assert plugin["gpus"] == "all" + + +class TestB200: + def test_volumes(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.B200), "img") + assert "/raid:/raid" in plugin["volumes"] + assert "/mnt/shared:/mnt/shared" in plugin["volumes"] + assert "/dev/shm:/dev/shm" in plugin["volumes"] + + def test_no_gpus_key(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.B200), "img") + assert "gpus" not in plugin + + +class TestCPU: + def test_gpus_removed(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.CPU), "img") + assert "gpus" not in plugin + + def test_cpu_small_gpus_removed(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.CPU_SMALL), "img") + assert "gpus" not in plugin + + def test_cpu_medium_gpus_removed(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.CPU_MEDIUM), "img") + assert "gpus" not in plugin + + +class TestCommonEnv: + def test_all_common_env_in_default(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.H100), "img") + for var in _COMMON_ENV: + assert var in plugin["environment"] + + def test_all_common_env_in_h200(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.H200), "img") + for var in _COMMON_ENV: + assert var in plugin["environment"] + + def test_all_common_env_in_b200(self): + plugin = get_docker_plugin(_make_step(device=DeviceType.B200), "img") + for var in _COMMON_ENV: + assert var in plugin["environment"] + + +class TestMountBuildkiteAgent: + def test_benchmarks_label(self): + plugin = get_docker_plugin(_make_step(label="Benchmarks"), "img") + assert plugin["mount_buildkite_agent"] is True + + def test_mount_flag(self): + plugin = get_docker_plugin(_make_step(mount_buildkite_agent=True), "img") + assert plugin["mount_buildkite_agent"] is True + + def test_no_mount_by_default(self): + plugin = get_docker_plugin(_make_step(), "img") + assert "mount_buildkite_agent" not in plugin + + +class TestNoSharedMutation: + def test_two_calls_return_independent_dicts(self): + p1 = get_docker_plugin(_make_step(device=DeviceType.H100), "img1") + p2 = get_docker_plugin(_make_step(device=DeviceType.H100), "img2") + p1["environment"].append("EXTRA=1") + assert "EXTRA=1" not in p2["environment"] + assert p1["image"] == "img1" + assert p2["image"] == "img2"