Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 35 additions & 60 deletions buildkite/pipeline_generator/plugin/docker_plugin.py
Original file line number Diff line number Diff line change
@@ -1,79 +1,54 @@
from step import Step
from constants import DeviceType
import copy

docker_plugin_template = {
"image": "",
_COMMON_FIELDS = {
"always-pull": True,
"propagate-environment": True,
"gpus": "all",
"environment": [
"VLLM_USAGE_SOURCE=ci-test",
"NCCL_CUMEM_HOST_ENABLE=0",
"HF_HOME=/fsx/hf_cache",
"HF_TOKEN",
"CODECOV_TOKEN",
"BUILDKITE_ANALYTICS_TOKEN",
"RAY_COMPAT_SLACK_WEBHOOK_URL",
],
"volumes": [
"/dev/shm:/dev/shm",
"/fsx/hf_cache:/fsx/hf_cache",
],
}

h200_plugin_template = {
"image": "",
"always-pull": True,
"propagate-environment": True,
"gpus": "all",
"environment": [
"VLLM_USAGE_SOURCE=ci-test",
"NCCL_CUMEM_HOST_ENABLE=0",
"HF_TOKEN",
"HF_HOME",
"CODECOV_TOKEN",
"BUILDKITE_ANALYTICS_TOKEN",
],
"volumes": [
"/dev/shm:/dev/shm",
"/mnt/vllm-ci:/mnt/vllm-ci",
],
_COMMON_ENV = [
"VLLM_USAGE_SOURCE=ci-test",
"NCCL_CUMEM_HOST_ENABLE=0",
"HF_TOKEN",
"CODECOV_TOKEN",
"BUILDKITE_ANALYTICS_TOKEN",
]

_DEVICE_CONFIGS = {
DeviceType.H200: {
"environment": _COMMON_ENV + ["HF_HOME"],
"volumes": ["/dev/shm:/dev/shm", "/mnt/vllm-ci:/mnt/vllm-ci"],
"gpus": "all",
},
DeviceType.B200: {
"environment": _COMMON_ENV + ["HF_HOME"],
"volumes": ["/dev/shm:/dev/shm", "/raid:/raid", "/mnt/shared:/mnt/shared"],
},
}

b200_plugin_template = {
"image": "",
"always-pull": True,
"propagate-environment": True,
"environment": [
"VLLM_USAGE_SOURCE=ci-test",
"NCCL_CUMEM_HOST_ENABLE=0",
"HF_HOME",
"HF_TOKEN",
"CODECOV_TOKEN",
"BUILDKITE_ANALYTICS_TOKEN",
],
"volumes": [
"/dev/shm:/dev/shm",
"/raid:/raid",
"/mnt/shared:/mnt/shared",
_DEFAULT_CONFIG = {
"environment": _COMMON_ENV + [
"HF_HOME=/fsx/hf_cache",
"RAY_COMPAT_SLACK_WEBHOOK_URL",
],
"volumes": ["/dev/shm:/dev/shm", "/fsx/hf_cache:/fsx/hf_cache"],
"gpus": "all",
}


def get_docker_plugin(step: Step, image: str):
plugin = None
if step.device == DeviceType.H200:
plugin = copy.deepcopy(h200_plugin_template)
elif step.device == DeviceType.B200:
plugin = copy.deepcopy(b200_plugin_template)
else:
plugin = copy.deepcopy(docker_plugin_template)
plugin["image"] = image
config = _DEVICE_CONFIGS.get(step.device, _DEFAULT_CONFIG)
plugin = {
"image": image,
**_COMMON_FIELDS,
"environment": list(config["environment"]),
"volumes": list(config["volumes"]),
}
if "gpus" in config:
plugin["gpus"] = config["gpus"]

if step.label == "Benchmarks" or step.mount_buildkite_agent:
plugin["mount_buildkite_agent"] = True
if step.device in (DeviceType.CPU, DeviceType.CPU_SMALL, DeviceType.CPU_MEDIUM) and plugin.get("gpus"):
if step.device in (DeviceType.CPU, DeviceType.CPU_SMALL, DeviceType.CPU_MEDIUM) and "gpus" in plugin:
del plugin["gpus"]
# TODO: Add BUILDKITE_ANALYTICS_TOKEN and pytest addopts for fail_fast
return plugin
Empty file.
112 changes: 112 additions & 0 deletions buildkite/pipeline_generator/tests/test_docker_plugin.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import sys
import os

# Add the pipeline_generator directory to the path so imports resolve correctly.
sys.path.insert(0, os.path.join(os.path.dirname(__file__), ".."))

from plugin.docker_plugin import get_docker_plugin, _COMMON_ENV
from constants import DeviceType
from step import Step


def _make_step(**kwargs):
defaults = {"label": "Test"}
defaults.update(kwargs)
return Step(**defaults)


class TestDefaultDevice:
def test_gpus_all(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.H100), "img")
assert plugin["gpus"] == "all"

def test_environment_includes_hf_home_fsx(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.H100), "img")
assert "HF_HOME=/fsx/hf_cache" in plugin["environment"]

def test_environment_includes_ray_compat(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.H100), "img")
assert "RAY_COMPAT_SLACK_WEBHOOK_URL" in plugin["environment"]

def test_volumes(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.H100), "img")
assert "/fsx/hf_cache:/fsx/hf_cache" in plugin["volumes"]
assert "/dev/shm:/dev/shm" in plugin["volumes"]


class TestH200:
def test_volumes(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.H200), "img")
assert "/mnt/vllm-ci:/mnt/vllm-ci" in plugin["volumes"]
assert "/dev/shm:/dev/shm" in plugin["volumes"]

def test_gpus_all(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.H200), "img")
assert plugin["gpus"] == "all"


class TestB200:
def test_volumes(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.B200), "img")
assert "/raid:/raid" in plugin["volumes"]
assert "/mnt/shared:/mnt/shared" in plugin["volumes"]
assert "/dev/shm:/dev/shm" in plugin["volumes"]

def test_no_gpus_key(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.B200), "img")
assert "gpus" not in plugin


class TestCPU:
def test_gpus_removed(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.CPU), "img")
assert "gpus" not in plugin

def test_cpu_small_gpus_removed(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.CPU_SMALL), "img")
assert "gpus" not in plugin

def test_cpu_medium_gpus_removed(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.CPU_MEDIUM), "img")
assert "gpus" not in plugin


class TestCommonEnv:
def test_all_common_env_in_default(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.H100), "img")
for var in _COMMON_ENV:
assert var in plugin["environment"]

def test_all_common_env_in_h200(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.H200), "img")
for var in _COMMON_ENV:
assert var in plugin["environment"]

def test_all_common_env_in_b200(self):
plugin = get_docker_plugin(_make_step(device=DeviceType.B200), "img")
for var in _COMMON_ENV:
assert var in plugin["environment"]


class TestMountBuildkiteAgent:
def test_benchmarks_label(self):
plugin = get_docker_plugin(_make_step(label="Benchmarks"), "img")
assert plugin["mount_buildkite_agent"] is True

def test_mount_flag(self):
plugin = get_docker_plugin(_make_step(mount_buildkite_agent=True), "img")
assert plugin["mount_buildkite_agent"] is True

def test_no_mount_by_default(self):
plugin = get_docker_plugin(_make_step(), "img")
assert "mount_buildkite_agent" not in plugin


class TestNoSharedMutation:
def test_two_calls_return_independent_dicts(self):
p1 = get_docker_plugin(_make_step(device=DeviceType.H100), "img1")
p2 = get_docker_plugin(_make_step(device=DeviceType.H100), "img2")
p1["environment"].append("EXTRA=1")
assert "EXTRA=1" not in p2["environment"]
assert p1["image"] == "img1"
assert p2["image"] == "img2"