diff --git a/.gitignore b/.gitignore index 797af49a..41113993 100644 --- a/.gitignore +++ b/.gitignore @@ -44,3 +44,5 @@ __pycache__/ .cache *.log + +.pipeline_test_results.json diff --git a/buildkite/pipeline_generator/.coveragerc b/buildkite/pipeline_generator/.coveragerc new file mode 100644 index 00000000..20799300 --- /dev/null +++ b/buildkite/pipeline_generator/.coveragerc @@ -0,0 +1,21 @@ +[run] +source = . +omit = + */tests/* + */test_* + */__pycache__/* + */venv/* + */build/* + setup.py + +[report] +exclude_lines = + pragma: no cover + def __repr__ + raise AssertionError + raise NotImplementedError + if __name__ == .__main__.: + if TYPE_CHECKING: + @abstractmethod + + diff --git a/buildkite/pipeline_generator/.gitignore b/buildkite/pipeline_generator/.gitignore new file mode 100644 index 00000000..347745a6 --- /dev/null +++ b/buildkite/pipeline_generator/.gitignore @@ -0,0 +1,35 @@ +# Test artifacts +tests/.pipeline_debug/ +tests/.pipeline_test_results.json +.pipeline_test_results.json +.pipeline_debug/ + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python + +# Testing +.pytest_cache/ +.coverage +.coverage.* +htmlcov/ +.coveragerc + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + + + + + diff --git a/buildkite/pipeline_generator/README.md b/buildkite/pipeline_generator/README.md new file mode 100644 index 00000000..17a96678 --- /dev/null +++ b/buildkite/pipeline_generator/README.md @@ -0,0 +1,86 @@ +# Pipeline Generator + +Simple Python replacement for Jinja templates that generate Buildkite CI pipelines for vLLM. 
+ +## Quick Start + +```bash +# CI mode (default) +python -m pipeline_generator --pipeline_mode ci + +# Fastcheck mode +python -m pipeline_generator --pipeline_mode fastcheck + +# AMD mode +python -m pipeline_generator --pipeline_mode amd +``` + +## Architecture + +Simple, readable code matching Jinja template complexity: + +``` +pipeline_generator/ +├── pipeline_generator.py # Main entry point +├── config.py # All constants and configuration +├── models.py # TestStep input model only +│ +├── modes/ # One file per mode (simple dict generation) +│ ├── ci.py # CI pipeline (~630 lines) +│ ├── fastcheck.py # Fastcheck pipeline (~520 lines) +│ └── amd.py # AMD pipeline (~60 lines) +│ +└── helpers/ # Simple utilities + ├── builds.py # Build step dicts + ├── commands.py # Command normalization + ├── coverage.py # Coverage injection (complex) + └── test_selection.py # Intelligent test targeting (complex) +``` + +## Design Philosophy + +- **Simple over clever**: Each mode file reads top-to-bottom like its Jinja template +- **Direct dict construction**: Use f-strings to build YAML dicts, no abstraction layers +- **Helper functions only where complex**: Coverage and test selection logic is genuinely complex (exists in Jinja too) +- **No Pydantic output models**: Only use Pydantic for input parsing (TestStep) + +## Example Code + +```python +def generate_test_step(test, config): + """Generate a test step - simple dict construction.""" + return { + "label": test.label, + "agents": {"queue": get_queue(test)}, + "plugins": [{ + "docker#v5.2.0": { + "image": config.container_image, + "command": ["bash", "-xc", build_command(test, config)], + "environment": ["VLLM_USAGE_SOURCE=ci-test", "HF_TOKEN"], + } + }], + "depends_on": "image-build", + } +``` + +## Testing + +All integration tests verify byte-for-byte YAML compatibility with Jinja templates: + +```bash +# All integration tests (64 scenarios) +pytest tests/test_integration_comprehensive.py tests/test_integration_fastcheck.py + 
+# Unit tests +pytest tests/ -k "not integration" +``` + +**Status**: ✅ 100% YAML compatibility verified (64/64 scenarios pass) + +## How It Works + +1. Read `test-pipeline.yaml` → Parse into TestStep objects +2. Generate mode-specific pipeline → Simple dicts with f-strings +3. Write `pipeline.yaml` → Direct YAML dump + +No plugin builders, no converters, no abstraction - just straightforward code. diff --git a/buildkite/pipeline_generator/__init__.py b/buildkite/pipeline_generator/__init__.py index e69de29b..22742ce0 100644 --- a/buildkite/pipeline_generator/__init__.py +++ b/buildkite/pipeline_generator/__init__.py @@ -0,0 +1,13 @@ +"""Pipeline generator for vLLM Buildkite CI.""" + +# Export key functions and classes +from .config import PipelineGeneratorConfig +from .pipeline_generator import PipelineGenerator, read_test_steps, write_buildkite_pipeline, write_pipeline + +__all__ = [ + "PipelineGenerator", + "PipelineGeneratorConfig", + "read_test_steps", + "write_buildkite_pipeline", + "write_pipeline", +] diff --git a/buildkite/pipeline_generator/__main__.py b/buildkite/pipeline_generator/__main__.py new file mode 100644 index 00000000..a6584d70 --- /dev/null +++ b/buildkite/pipeline_generator/__main__.py @@ -0,0 +1,7 @@ +"""Entry point for running pipeline_generator as a module.""" + +from .pipeline_generator import main + +if __name__ == "__main__": + main() + diff --git a/buildkite/pipeline_generator/config.py b/buildkite/pipeline_generator/config.py new file mode 100644 index 00000000..a9dcdbb6 --- /dev/null +++ b/buildkite/pipeline_generator/config.py @@ -0,0 +1,378 @@ +"""Configuration and constants for pipeline generation.""" + +import enum +import re +from dataclasses import dataclass, field +from typing import Any, Dict, List, Optional + +# ============================================================================== +# ENUMS AND MODES +# ============================================================================== + + +class PipelineMode(str, 
enum.Enum): + """Pipeline generation mode.""" + CI = "ci" + FASTCHECK = "fastcheck" + AMD = "amd" + + +class GPUType(str, enum.Enum): + """GPU types.""" + A100 = "a100" + H100 = "h100" + H200 = "h200" + B200 = "b200" + + +# ============================================================================== +# CONFIGURATION CLASS +# ============================================================================== + + +class PipelineGeneratorConfig: + """Configuration for the pipeline generator.""" + + def __init__( + self, + container_registry: str, + container_registry_repo: str, + commit: str, + branch: str, + list_file_diff: list, + run_all: bool = False, + nightly: bool = False, + mirror_hw: str = "amdexperimental", + fail_fast: bool = False, + vllm_use_precompiled: str = "0", + cov_enabled: bool = False, + vllm_ci_branch: str = "main", + pipeline_mode: PipelineMode = PipelineMode.CI, + ): + self.run_all = run_all + self.nightly = nightly + self.list_file_diff = list_file_diff + self.container_registry = container_registry + self.container_registry_repo = container_registry_repo + self.commit = commit + self.branch = branch + self.mirror_hw = mirror_hw + self.fail_fast = fail_fast + self.vllm_use_precompiled = vllm_use_precompiled + self.cov_enabled = cov_enabled + self.vllm_ci_branch = vllm_ci_branch + self.pipeline_mode = pipeline_mode + + def _get_repo_suffix(self) -> str: + """Get repository suffix based on branch (postmerge for main, test otherwise).""" + return "postmerge" if self.branch == "main" else "test" + + @property + def container_image(self): + """Get the main CUDA container image.""" + if self.pipeline_mode in [PipelineMode.FASTCHECK, PipelineMode.AMD]: + return "public.ecr.aws/q9t5s3a7/vllm-ci-test-repo:$BUILDKITE_COMMIT" + return f"public.ecr.aws/q9t5s3a7/vllm-ci-{self._get_repo_suffix()}-repo:$BUILDKITE_COMMIT" + + @property + def container_image_torch_nightly(self): + """Get the torch nightly container image.""" + return 
f"public.ecr.aws/q9t5s3a7/vllm-ci-{self._get_repo_suffix()}-repo:$BUILDKITE_COMMIT-torch-nightly" + + @property + def container_image_cu118(self): + """Get the CUDA 11.8 container image.""" + return f"public.ecr.aws/q9t5s3a7/vllm-ci-{self._get_repo_suffix()}-repo:$BUILDKITE_COMMIT-cu118" + + @property + def container_image_cpu(self): + """Get the CPU container image.""" + return f"public.ecr.aws/q9t5s3a7/vllm-ci-{self._get_repo_suffix()}-repo:$BUILDKITE_COMMIT-cpu" + + @property + def container_image_amd(self): + """Get the AMD container image.""" + return "rocm/vllm-ci:$BUILDKITE_COMMIT" + + def validate(self): + """Validate the configuration.""" + pattern = r"^[0-9a-f]{40}$" + if not re.match(pattern, self.commit): + raise ValueError(f"Commit {self.commit} is not a valid Git commit hash") + + +# ============================================================================== +# CONSTANTS +# ============================================================================== + +# ECR and Images +VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7" +VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo" +AMD_REPO = "rocm/vllm-ci" + +# Paths +HF_HOME = "/root/.cache/huggingface" +HF_HOME_FSX = "/fsx/hf_cache" +DEFAULT_WORKING_DIR = "/vllm-workspace/tests" + +# Build Step Keys +BUILD_KEY_MAIN = "image-build" +BUILD_KEY_CPU = "image-build-cpu" +BUILD_KEY_CU118 = "image-build-cu118" +BUILD_KEY_AMD = "amd-build" +BUILD_KEY_TORCH_NIGHTLY = "image-build-torch-nightly" + +# Agent Queues +QUEUE_CPU = "cpu_queue" +QUEUE_CPU_PREMERGE = "cpu_queue_premerge" +QUEUE_CPU_PREMERGE_US_EAST_1 = "cpu_queue_premerge_us_east_1" +QUEUE_CPU_POSTMERGE_US_EAST_1 = "cpu_queue_postmerge_us_east_1" +QUEUE_SMALL_CPU = "small_cpu_queue" +QUEUE_SMALL_CPU_PREMERGE = "small_cpu_queue_premerge" +QUEUE_GPU_1 = "gpu_1_queue" +QUEUE_GPU_4 = "gpu_4_queue" +QUEUE_A100 = "a100_queue" +QUEUE_H100 = "mithril-h100-pool" +QUEUE_H200 = "skylab-h200" +QUEUE_B200 = "B200" +QUEUE_AMD = "amd" +QUEUE_AMD_CPU = "amd-cpu" 
+QUEUE_AMD_MI300_1 = "amd_mi300_1" +QUEUE_AMD_MI325_1 = "amd_mi325_1" +QUEUE_AMD_MI325_2 = "amd_mi325_2" +QUEUE_AMD_MI325_4 = "amd_mi325_4" +QUEUE_AMD_MI325_8 = "amd_mi325_8" +QUEUE_NEURON = "neuron" +QUEUE_INTEL_CPU = "intel-cpu" +QUEUE_INTEL_GPU = "intel-gpu" +QUEUE_INTEL_HPU = "intel-hpu" +QUEUE_TPU_V5 = "tpu_v5_queue" +QUEUE_TPU_V6E = "tpu_v6e_queue" +QUEUE_GH200 = "gh200_queue" +QUEUE_IBM_PPC64LE = "ibm-ppc64le" +QUEUE_IBM_S390X = "ibm_s390x" +QUEUE_ASCEND = "ascend" + +# Docker Plugin +DOCKER_PLUGIN = "docker#v5.2.0" + +# Retry Configuration +RETRY_EXIT_STATUS_AGENT_LOST = -1 +RETRY_EXIT_STATUS_AGENT_TERMINATED = -10 + +# Priority Values +PRIORITY_AMD = 100 +PRIORITY_A100 = 10000 + +# Scripts +SCRIPT_RUN_MULTI_NODE = "./.buildkite/scripts/run-multi-node-test.sh" +SCRIPT_RUN_NEURON = ".buildkite/scripts/hardware_ci/run-neuron-test.sh" +SCRIPT_RUN_AMD = ".buildkite/scripts/hardware_ci/run-amd-test.sh" +SCRIPT_RUN_INTEL_CPU = ".buildkite/scripts/hardware_ci/run-cpu-test.sh" +SCRIPT_RUN_INTEL_GPU = ".buildkite/scripts/hardware_ci/run-xpu-test.sh" +SCRIPT_RUN_INTEL_HPU = ".buildkite/scripts/hardware_ci/run-hpu-test.sh" +SCRIPT_RUN_TPU = ".buildkite/scripts/hardware_ci/run-tpu-test.sh" +SCRIPT_RUN_TPU_V1 = ".buildkite/scripts/hardware_ci/run-tpu-v1-test.sh" +SCRIPT_RUN_TPU_V1_PART2 = ".buildkite/scripts/hardware_ci/run-tpu-v1-test-part2.sh" +SCRIPT_RUN_GH200 = ".buildkite/scripts/hardware_ci/run-gh200-test.sh" +SCRIPT_RUN_IBM_POWER = ".buildkite/scripts/hardware_ci/run-cpu-test-ppc64le.sh" +SCRIPT_RUN_IBM_S390X = ".buildkite/scripts/hardware_ci/run-cpu-test-s390x.sh" +SCRIPT_RUN_ASCEND = ".buildkite/scripts/hardware_ci/run-ascend-test.sh" +SCRIPT_TPU_CLEANUP = "bash .buildkite/scripts/tpu/cleanup_docker.sh" +SCRIPT_TPU_DOCKER_RUN_BM = "bash .buildkite/scripts/tpu/docker_run_bm.sh" + +# Dockerfiles +DOCKERFILE = "docker/Dockerfile" +DOCKERFILE_ROCM = "docker/Dockerfile.rocm" + +# Test Labels +LABEL_DOC_BUILD = "Documentation Build" +LABEL_BENCHMARKS = "Benchmarks" 
+LABEL_BASIC_CORRECTNESS = "Basic Correctness Test" +LABEL_SPEC_DECODE = "Speculative decoding tests" + +# Kubernetes +K8S_NVIDIA_GPU_RESOURCE = "nvidia.com/gpu" +K8S_NVIDIA_GPU_PRODUCT = "nvidia.com/gpu.product" +K8S_NVIDIA_A100_PRODUCT = "NVIDIA-A100-SXM4-80GB" +K8S_HF_TOKEN_SECRET = "hf-token-secret" +K8S_HF_TOKEN_KEY = "token" +K8S_PRIORITY_CLASS = "ci" +K8S_DEVSHM_VOLUME = "devshm" +K8S_HF_CACHE_VOLUME = "hf-cache" +K8S_DEV_SHM_PATH = "/dev/shm" + + +# ============================================================================== +# HARDWARE TEST DATACLASSES +# ============================================================================== + + +@dataclass +class HardwareTestConfig: + """Configuration for a hardware-specific test.""" + label: str + queue: str + script_path: str + depends_on: Optional[str] = None + soft_fail: bool = True + timeout_in_minutes: Optional[int] = None + extra_commands: List[str] = field(default_factory=list) + env: Optional[Dict[str, str]] = None + key: Optional[str] = None + + def to_dict(self) -> Dict[str, Any]: + """Convert to Buildkite step dictionary.""" + step: Dict[str, Any] = { + "label": self.label, + "agents": {"queue": self.queue}, + "soft_fail": self.soft_fail, + } + + if self.depends_on is not None: + step["depends_on"] = self.depends_on + + if self.timeout_in_minutes: + step["timeout_in_minutes"] = self.timeout_in_minutes + + if self.key: + step["key"] = self.key + + # Build command + if self.label == "GH200 Test": + step["command"] = f"nvidia-smi && bash {self.script_path}" + elif self.extra_commands: + step["commands"] = self.extra_commands + [f"bash {self.script_path}"] + else: + step["command"] = f"bash {self.script_path}" + + if self.env: + step["env"] = self.env + + return step + + +@dataclass +class TPUTestConfig: + """Configuration for TPU tests.""" + label: str + key: str + timeout_in_minutes: int + script_path: Optional[str] = None + extra_docker_build: Optional[str] = None + extra_scripts: List[str] = 
field(default_factory=list) + + def to_dict(self) -> Dict[str, Any]: + """Convert to Buildkite step.""" + commands = [SCRIPT_TPU_CLEANUP] + + if self.extra_docker_build: + commands.append(self.extra_docker_build) + + if self.script_path: + commands.append(f'if [[ -f "{self.script_path}" ]]; then bash {self.script_path}; fi') + + commands.extend(self.extra_scripts) + + return { + "label": self.label, + "soft_fail": True, + "depends_on": None, + "key": self.key, + "timeout_in_minutes": self.timeout_in_minutes, + "agents": {"queue": QUEUE_TPU_V6E}, + "commands": commands, + } + + +# ============================================================================== +# HARDWARE TEST DEFINITIONS +# ============================================================================== + +# Simple hardware tests +NEURON_TEST = HardwareTestConfig( + label="Neuron Test", + queue=QUEUE_NEURON, + script_path=SCRIPT_RUN_NEURON, +) + +INTEL_HPU_TEST = HardwareTestConfig( + label="Intel HPU Test", + queue=QUEUE_INTEL_HPU, + script_path=SCRIPT_RUN_INTEL_HPU, +) + +INTEL_GPU_TEST = HardwareTestConfig( + label="Intel GPU Test", + queue=QUEUE_INTEL_GPU, + script_path=SCRIPT_RUN_INTEL_GPU, +) + +ASCEND_TEST = HardwareTestConfig( + label="Ascend NPU Test", + queue=QUEUE_ASCEND, + script_path=".buildkite/scripts/hardware_ci/run-npu-test.sh", + timeout_in_minutes=20, +) + +GH200_TEST = HardwareTestConfig( + label="GH200 Test", + queue=QUEUE_GH200, + script_path=SCRIPT_RUN_GH200, + extra_commands=[], +) + +# TPU tests +TPU_V1_TEST = TPUTestConfig( + label="TPU V1 Test", + key="run-tpu-v1-test", + timeout_in_minutes=180, + script_path=SCRIPT_RUN_TPU_V1, +) + +TPU_V1_TEST_PART2 = TPUTestConfig( + label="TPU V1 Test Part2", + key="run-tpu-v1-test-part2", + timeout_in_minutes=90, + script_path=SCRIPT_RUN_TPU_V1_PART2, +) + +TPU_V1_BENCHMARK = TPUTestConfig( + label="TPU V1 Benchmark Test", + key="run-tpu-v1-benchmark-test", + timeout_in_minutes=60, + extra_docker_build=( + "DOCKER_BUILDKIT=1 docker build 
--build-arg max_jobs=16 --build-arg USE_SCCACHE=1 " + "--build-arg GIT_REPO_CHECK=0 --tag vllm/vllm-tpu-bm --progress plain -f docker/Dockerfile.tpu ." + ), + extra_scripts=[ + "bash .buildkite/scripts/tpu/docker_run_bm.sh .buildkite/scripts/tpu/config_v6e_1.env", + "bash .buildkite/scripts/tpu/docker_run_bm.sh .buildkite/scripts/tpu/quantized_v6e_1.env", + ], +) + + +# AMD Queue Label Lists +AMD_MI325_8_LABELS = [ + LABEL_BENCHMARKS, + "Kernels Attention Test %N", + "LoRA Test %N", + "Kernels Quantization Test %N", +] + +AMD_MI325_4_LABELS = [ + "Distributed Tests (4 GPUs)", + "2 Node Tests (4 GPUs in total)", + "Multi-step Tests (4 GPUs)", + "Pipeline Parallelism Test", + "LoRA TP Test (Distributed)", +] + +AMD_MI325_2_LABELS = [ + "Distributed Comm Ops Test", + "Distributed Tests (2 GPUs)", + "Plugin Tests (2 GPUs)", + "Weight Loading Multiple GPU Test", + "Weight Loading Multiple GPU Test - Large Models", +] + diff --git a/buildkite/pipeline_generator/helpers/__init__.py b/buildkite/pipeline_generator/helpers/__init__.py new file mode 100644 index 00000000..5aca881c --- /dev/null +++ b/buildkite/pipeline_generator/helpers/__init__.py @@ -0,0 +1,2 @@ +"""Helper functions for pipeline generation.""" + diff --git a/buildkite/pipeline_generator/helpers/builds.py b/buildkite/pipeline_generator/helpers/builds.py new file mode 100644 index 00000000..c4063c8d --- /dev/null +++ b/buildkite/pipeline_generator/helpers/builds.py @@ -0,0 +1,316 @@ +"""Build step generation - returns simple dicts.""" + +from typing import Any, Dict, List, Optional + +from ..config import ( + BUILD_KEY_AMD, + BUILD_KEY_CPU, + BUILD_KEY_CU118, + BUILD_KEY_MAIN, + BUILD_KEY_TORCH_NIGHTLY, + DOCKERFILE, + DOCKERFILE_ROCM, + QUEUE_AMD_CPU, + RETRY_EXIT_STATUS_AGENT_LOST, + RETRY_EXIT_STATUS_AGENT_TERMINATED, +) + + +def _build_docker_build_cmd(dockerfile: str, build_args: Dict[str, str], image_tag: str, target: str, fastcheck_format: bool = False) -> str: + """Build docker build command with 
proper formatting.""" + if fastcheck_format: + # Fastcheck uses folded scalar (>) with specific spacing + lines = [f"docker build --file {dockerfile} "] + + args_list = list(build_args.items()) + for idx, (key, value) in enumerate(args_list): + # First 3 args always get double space + if idx < 3: + suffix = " " + # Last 3 args before --tag get single space + elif idx >= len(args_list) - 3: + suffix = " " + else: + suffix = " " + + if key == "buildkite_commit": + lines.append(f"--build-arg {key}=$BUILDKITE_COMMIT{suffix}") + else: + if " " in str(value): + lines.append(f'--build-arg {key}="{value}"{suffix}') + else: + lines.append(f"--build-arg {key}={value}{suffix}") + + lines.append(f"--tag {image_tag} ") + lines.append(f"--target {target} ") + lines.append("--progress plain .\n") + return "".join(lines) + else: + # CI mode uses simple space-separated + cmd_parts = ["docker build", f"--file {dockerfile}"] + + for key, value in build_args.items(): + if key == "buildkite_commit": + cmd_parts.append(f"--build-arg {key}=$BUILDKITE_COMMIT") + elif " " in str(value): + cmd_parts.append(f'--build-arg {key}="{value}"') + else: + cmd_parts.append(f"--build-arg {key}={value}") + + cmd_parts.extend([f"--tag {image_tag}", f"--target {target}", "--progress plain ."]) + return " ".join(cmd_parts) + + +def _build_image_check_cmd(image_tag: str, fastcheck_format: bool = False) -> str: + """Build command to check if image exists.""" + suffix = "\n" if fastcheck_format else "" + return f"""#!/bin/bash +if [[ -z $(docker manifest inspect {image_tag}) ]]; then + echo "Image not found, proceeding with build..." 
+else + echo "Image found" + exit 0 +fi{suffix}""" + + +def build_main_image(config) -> Dict[str, Any]: + """Build main CUDA image step.""" + is_fastcheck = config.pipeline_mode.value == "fastcheck" + retry_limit = 5 if is_fastcheck else 2 + + build_args = { + "max_jobs": "16", + "buildkite_commit": config.commit, + "USE_SCCACHE": "1", + } + + # Add CUDA arch lists for CI + if not is_fastcheck: + build_args["TORCH_CUDA_ARCH_LIST"] = "8.0 8.9 9.0 10.0" + build_args["FI_TORCH_CUDA_ARCH_LIST"] = "8.0 8.9 9.0a 10.0a" + + # Add precompiled args if needed (order matters!) + if is_fastcheck: + # Fastcheck has different order than CI + build_args["VLLM_DOCKER_BUILD_CONTEXT"] = "1" + build_args["VLLM_USE_PRECOMPILED"] = config.vllm_use_precompiled + if config.vllm_use_precompiled == "1": + build_args["USE_FLASHINFER_PREBUILT_WHEEL"] = "true" + elif config.branch != "main": + # CI non-main branch + build_args["VLLM_USE_PRECOMPILED"] = config.vllm_use_precompiled + build_args["VLLM_DOCKER_BUILD_CONTEXT"] = "1" + if config.vllm_use_precompiled == "1": + build_args["USE_FLASHINFER_PREBUILT_WHEEL"] = "true" + + image_tag = config.container_image + + commands = [ + "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7", + _build_image_check_cmd(image_tag, is_fastcheck), + _build_docker_build_cmd(DOCKERFILE, build_args, image_tag, "test", is_fastcheck), + f"docker push {image_tag}", + ] + + # Add latest tag push for main branch in CI mode + if config.branch == "main" and not is_fastcheck: + latest_tag = image_tag.replace("$BUILDKITE_COMMIT", "latest") + commands.extend([f"docker tag {image_tag} {latest_tag}", f"docker push {latest_tag}"]) + + # Determine queue + if is_fastcheck: + queue = "cpu_queue_premerge" + elif config.branch == "main": + queue = "cpu_queue_postmerge_us_east_1" + else: + queue = "cpu_queue_premerge_us_east_1" + + step = { + "label": ":docker: build image", + "key": BUILD_KEY_MAIN, + 
"agents": {"queue": queue}, + "commands": commands, + "env": {"DOCKER_BUILDKIT": "1"}, + "retry": { + "automatic": [ + {"exit_status": RETRY_EXIT_STATUS_AGENT_LOST, "limit": retry_limit}, + {"exit_status": RETRY_EXIT_STATUS_AGENT_TERMINATED, "limit": retry_limit}, + ] + }, + } + + # CI mode includes depends_on: null, fastcheck doesn't + if not is_fastcheck: + step["depends_on"] = None + + return step + + +def build_cu118_image(config) -> List[Dict[str, Any]]: + """Build CUDA 11.8 image (CI only, with block).""" + build_args = { + "max_jobs": "16", + "buildkite_commit": config.commit, + "USE_SCCACHE": "1", + "CUDA_VERSION": "11.8.0", + } + + if config.branch != "main": + build_args["VLLM_USE_PRECOMPILED"] = config.vllm_use_precompiled + build_args["VLLM_DOCKER_BUILD_CONTEXT"] = "1" + if config.vllm_use_precompiled == "1": + build_args["USE_FLASHINFER_PREBUILT_WHEEL"] = "true" + + queue = "cpu_queue_postmerge_us_east_1" if config.branch == "main" else "cpu_queue_premerge_us_east_1" + image_tag = config.container_image_cu118 + + commands = [ + "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7", + _build_image_check_cmd(image_tag), + _build_docker_build_cmd(DOCKERFILE, build_args, image_tag, "test"), + f"docker push {image_tag}", + ] + + return [ + { + "block": "Build CUDA 11.8 image", + "key": "block-build-cu118", + "depends_on": None, + }, + { + "label": ":docker: build image CUDA 11.8", + "key": BUILD_KEY_CU118, + "depends_on": "block-build-cu118", + "agents": {"queue": queue}, + "commands": commands, + "env": {"DOCKER_BUILDKIT": "1"}, + "retry": { + "automatic": [ + {"exit_status": RETRY_EXIT_STATUS_AGENT_LOST, "limit": 2}, + {"exit_status": RETRY_EXIT_STATUS_AGENT_TERMINATED, "limit": 2}, + ] + }, + }, + ] + + +def build_cpu_image(config) -> Dict[str, Any]: + """Build CPU image (CI only).""" + build_args = { + "max_jobs": "16", + "buildkite_commit": config.commit, + "VLLM_CPU_AVX512BF16": 
"true", + "VLLM_CPU_AVX512VNNI": "true", + } + + queue = "cpu_queue_postmerge_us_east_1" if config.branch == "main" else "cpu_queue_premerge_us_east_1" + image_tag = config.container_image_cpu + + commands = [ + "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7", + _build_image_check_cmd(image_tag), + _build_docker_build_cmd("docker/Dockerfile.cpu", build_args, image_tag, "vllm-test"), + f"docker push {image_tag}", + ] + + # CI only (always has depends_on: null) + return { + "label": ":docker: build image CPU", + "key": BUILD_KEY_CPU, + "depends_on": None, + "agents": {"queue": queue}, + "commands": commands, + "env": {"DOCKER_BUILDKIT": "1"}, + "retry": { + "automatic": [ + {"exit_status": RETRY_EXIT_STATUS_AGENT_LOST, "limit": 2}, + {"exit_status": RETRY_EXIT_STATUS_AGENT_TERMINATED, "limit": 2}, + ] + }, + } + + +def build_torch_nightly_image(config, depends_on: Optional[str]) -> Dict[str, Any]: + """Build torch nightly image (CI only).""" + build_args = { + "max_jobs": "16", + "buildkite_commit": config.commit, + "USE_SCCACHE": "1", + } + + queue = "cpu_queue_postmerge_us_east_1" if config.branch == "main" else "cpu_queue_premerge_us_east_1" + image_tag = config.container_image_torch_nightly + + commands = [ + "aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7", + _build_image_check_cmd(image_tag), + _build_docker_build_cmd("docker/Dockerfile.nightly_torch", build_args, image_tag, "test"), + f"docker push {image_tag}", + ] + + step = { + "label": ":docker: build image torch nightly", + "key": BUILD_KEY_TORCH_NIGHTLY, + "agents": {"queue": queue}, + "commands": commands, + "env": {"DOCKER_BUILDKIT": "1"}, + "soft_fail": True, + "timeout_in_minutes": 360, + "retry": { + "automatic": [ + {"exit_status": RETRY_EXIT_STATUS_AGENT_LOST, "limit": 2}, + {"exit_status": RETRY_EXIT_STATUS_AGENT_TERMINATED, "limit": 2}, + ] + 
}, + } + + if depends_on is not None: + step["depends_on"] = depends_on + + return step + + +def build_amd_image(config) -> Dict[str, Any]: + """Build AMD image.""" + is_fastcheck = config.pipeline_mode.value == "fastcheck" + image_tag = config.container_image_amd + + # CI mode has trailing newline, fastcheck doesn't + trailing = "\n" if not is_fastcheck else "" + + build_cmd = ( + "docker build " + "--build-arg max_jobs=16 " + "--build-arg REMOTE_VLLM=1 " + "--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942' " + "--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT " + f"--tag {image_tag} " + f"-f {DOCKERFILE_ROCM} " + "--target test " + "--no-cache " + f"--progress plain .{trailing}" + ) + + # Fastcheck includes mirror_hw in label + if is_fastcheck and config.mirror_hw: + label = f"AMD: :docker: build image with {config.mirror_hw}" + else: + label = "AMD: :docker: build image" + + return { + "label": label, + "key": BUILD_KEY_AMD, + "depends_on": None, + "agents": {"queue": QUEUE_AMD_CPU}, + "env": {"DOCKER_BUILDKIT": "1"}, + "soft_fail": is_fastcheck, # true for fastcheck, false for CI + "retry": { + "automatic": [ + {"exit_status": RETRY_EXIT_STATUS_AGENT_LOST, "limit": 1}, + {"exit_status": RETRY_EXIT_STATUS_AGENT_TERMINATED, "limit": 1}, + {"exit_status": 1, "limit": 1}, + ] + }, + "commands": [build_cmd, f"docker push {image_tag}"], + } + diff --git a/buildkite/pipeline_generator/helpers/commands.py b/buildkite/pipeline_generator/helpers/commands.py new file mode 100644 index 00000000..1800e203 --- /dev/null +++ b/buildkite/pipeline_generator/helpers/commands.py @@ -0,0 +1,29 @@ +"""Command normalization and flattening utilities.""" + +from typing import List, Union + + +def normalize_command(command: str) -> str: + """Normalize a single command by removing YAML line continuations (backslashes).""" + return command.replace(" \\", " ").replace("\\\n", " ").replace("\\", "") + + +def normalize_commands(commands: List[str]) -> List[str]: + """Normalize a list of 
commands.""" + return [normalize_command(cmd) for cmd in commands] + + +def flatten_commands(commands: Union[List[str], List[List[str]]]) -> List[str]: + """ + Flatten nested command lists into a simple list. + For multi-node tests, returns only the first node's commands. + """ + if not commands: + return [] + + # Check if it's a nested list (multi-node format) + if isinstance(commands[0], list): + return commands[0] # type: ignore[return-value] + + return commands # type: ignore[return-value] + diff --git a/buildkite/pipeline_generator/helpers/coverage.py b/buildkite/pipeline_generator/helpers/coverage.py new file mode 100644 index 00000000..4671c05d --- /dev/null +++ b/buildkite/pipeline_generator/helpers/coverage.py @@ -0,0 +1,40 @@ +"""Coverage injection for pytest commands.""" + +from typing import List + + +def inject_coverage_into_command(cmd: str, coverage_file: str) -> str: + """Inject coverage flags into pytest commands.""" + if "pytest " in cmd: + replacement = "pytest --cov=vllm --cov-report= --cov-append --durations=0 " + return f"COVERAGE_FILE={coverage_file} {cmd.replace('pytest ', replacement)} || true" + return cmd + + +def get_coverage_file_id(step_label: str) -> str: + """Compute coverage file identifier for a step.""" + step_length = len(step_label) + step_first = step_label[0] if step_label else "x" + return f".coverage.{step_length}_{step_first}" + + +def inject_coverage(commands: List[str], step_label: str, vllm_ci_branch: str) -> str: + """ + Inject coverage into commands and return combined command string. 
+ """ + coverage_file = get_coverage_file_id(step_label) + injected_commands = [inject_coverage_into_command(cmd, coverage_file) for cmd in commands] + + # Check if any pytest commands were found + has_pytest = any("pytest " in cmd for cmd in commands) + result = " && ".join(injected_commands) + + if has_pytest: + upload_script = ( + f" && curl -sSL https://raw.githubusercontent.com/vllm-project/ci-infra/{vllm_ci_branch}" + f'/buildkite/scripts/upload_codecov.sh | bash -s -- "{step_label}"' + ) + result += upload_script + + return result + diff --git a/buildkite/pipeline_generator/helpers/test_selection.py b/buildkite/pipeline_generator/helpers/test_selection.py new file mode 100644 index 00000000..81a6cb7e --- /dev/null +++ b/buildkite/pipeline_generator/helpers/test_selection.py @@ -0,0 +1,149 @@ +"""Intelligent test selection and filtering.""" + +from typing import List + + +def get_changed_tests(file_diff: List[str]) -> List[str]: + """Extract changed test files from file diff (relative to tests/ directory).""" + changed_tests = [] + for file in file_diff: + if file.startswith("tests/") and "/test_" in file and file.endswith(".py"): + changed_tests.append(file[6:]) # Remove tests/ prefix + return changed_tests + + +def are_only_tests_changed(file_diff: List[str]) -> bool: + """Check if only test files have changed.""" + if not file_diff: + return False + + for file in file_diff: + if not (file.startswith("tests/") and "/test_" in file and file.endswith(".py")): + return False + + return True + + +def extract_covered_test_paths(commands) -> List[str]: + """Extract test paths that are covered by pytest commands.""" + covered_paths: List[str] = [] + + if not commands: + return covered_paths + + for cmd in commands: + if "pytest " not in cmd: + continue + + # Parse pytest arguments + cmd_parts = cmd.split(" ") + in_pytest = False + + for part in cmd_parts: + if part == "pytest": + in_pytest = True + continue + + if not in_pytest or part.startswith("-") or "/" not 
in part or "::" in part: + continue + + covered_paths.append(part) + + # If it's a file, also add its parent directory + if part.endswith(".py"): + path_parts = part.split("/") + if len(path_parts) > 2: + dir_path = "/".join(path_parts[:-1]) + covered_paths.append(dir_path) + + return covered_paths + + +def extract_pytest_markers(commands) -> str: + """Extract pytest markers from commands.""" + if not commands: + return "" + + for cmd in commands: + if "pytest " not in cmd or " -m " not in cmd: + continue + + parts = cmd.split(" -m ") + if len(parts) <= 1: + continue + + after_m = parts[1] + + # Handle different quote styles + if after_m.startswith("'"): + marker = after_m[1:].split("'")[0] + return f" -m '{marker}'" + elif after_m.startswith('"'): + marker = after_m[1:].split('"')[0] + return f' -m "{marker}"' + else: + marker = after_m.split(" ")[0] + return f" -m {marker}" + + return "" + + +def get_intelligent_test_targets(test_step, changed_tests: List[str]) -> List[str]: + """Get specific test targets when only test files changed.""" + if not test_step.source_file_dependencies: + return [] + + matched_targets = [] + + for dep in test_step.source_file_dependencies: + if not dep.startswith("tests/"): + continue + + dep_rel = dep[6:] # Remove tests/ prefix + + # Handle deps that end with '/' (directories) + if dep_rel.endswith("/"): + dep_dir_prefix = dep_rel + dep_file_name = dep_rel[:-1] + ".py" + else: + dep_dir_prefix = dep_rel + "/" + dep_file_name = dep_rel + ".py" + + # Check changed tests + for t in changed_tests: + if t.startswith(dep_dir_prefix) or t == dep_file_name: + matched_targets.append(t) + + # Filter matched targets to only include those covered by step commands + covered_paths = extract_covered_test_paths(test_step.commands or []) + filtered_targets = [] + + for target in matched_targets: + is_covered = any( + target.startswith(covered_path) and (len(target) == len(covered_path) or target[len(covered_path)] == "/") + for covered_path in 
covered_paths + ) + if is_covered: + filtered_targets.append(target) + + return filtered_targets + + +def apply_intelligent_test_targeting(commands: List[str], test_step, config) -> str: + """ + Apply intelligent test targeting when only test files changed. + Returns targeted command if applicable, or joined commands otherwise. + """ + if not are_only_tests_changed(config.list_file_diff): + return " && ".join(commands) + + changed_tests = get_changed_tests(config.list_file_diff) + matched_targets = get_intelligent_test_targets(test_step, changed_tests) + + if not matched_targets: + return " && ".join(commands) + + # Build targeted pytest command + markers = extract_pytest_markers(commands) + return f"pytest -v -s{markers} {' '.join(matched_targets)}" + diff --git a/buildkite/pipeline_generator/models.py b/buildkite/pipeline_generator/models.py new file mode 100644 index 00000000..01fa1485 --- /dev/null +++ b/buildkite/pipeline_generator/models.py @@ -0,0 +1,62 @@ +"""Data models for test step input parsing.""" + +from typing import List, Optional, Union + +from pydantic import BaseModel, model_validator +from typing_extensions import Self + +from .config import DEFAULT_WORKING_DIR, GPUType + + +class TestStep(BaseModel): + """Test step defined in test-pipeline.yaml.""" + + label: str + working_dir: Optional[str] = DEFAULT_WORKING_DIR + optional: Optional[bool] = False + fast_check: Optional[bool] = None + fast_check_only: Optional[bool] = None + torch_nightly: Optional[bool] = None + mirror_hardwares: Optional[List[str]] = None + no_gpu: Optional[bool] = None + gpu: Optional[GPUType] = None + num_gpus: Optional[int] = None + num_nodes: Optional[int] = None + source_file_dependencies: Optional[List[str]] = None + soft_fail: Optional[bool] = None + parallelism: Optional[int] = None + timeout_in_minutes: Optional[int] = None + mount_buildkite_agent: Optional[bool] = None + command: Optional[str] = None + commands: Optional[Union[List[str], List[List[str]]]] = None + + 
    @model_validator(mode="before")
    @classmethod
    def validate_and_convert_command(cls, values):
        """Validate that either 'command' or 'commands' is defined and convert command to commands."""
        # mode="before": runs on the raw input dict, before field parsing.
        if not values.get("command") and not values.get("commands"):
            raise ValueError("Either 'command' or 'commands' must be defined.")
        if values.get("command") and values.get("commands"):
            raise ValueError("Only one of 'command' or 'commands' can be defined.")
        if values.get("command"):
            values["commands"] = [values["command"]]
            del values["command"]
        return values

    @model_validator(mode="after")
    def validate_gpu(self) -> Self:
        # A step cannot both request a specific GPU type and declare itself CPU-only.
        if self.gpu and self.no_gpu:
            raise ValueError("Both 'gpu' and 'no_gpu' cannot be defined together.")
        return self

    @model_validator(mode="after")
    def validate_multi_node(self) -> Self:
        if self.num_nodes and not self.num_gpus:
            raise ValueError("'num_gpus' must be defined if 'num_nodes' is defined.")
        # For the multi-node form (list of per-node command lists), the outer
        # list length must equal the node count.
        if self.num_nodes and self.commands:
            if isinstance(self.commands, list) and len(self.commands) > 0:
                if isinstance(self.commands[0], list):
                    if len(self.commands) != self.num_nodes:
                        raise ValueError("Number of command lists must match the number of nodes.")
        return self

diff --git a/buildkite/pipeline_generator/modes/__init__.py b/buildkite/pipeline_generator/modes/__init__.py
new file mode 100644
index 00000000..eef03ecb
--- /dev/null
+++ b/buildkite/pipeline_generator/modes/__init__.py
"""Pipeline generation modes - one file per mode."""

diff --git a/buildkite/pipeline_generator/modes/amd.py b/buildkite/pipeline_generator/modes/amd.py
new file mode 100644
index 00000000..f40dfa13
--- /dev/null
+++ b/buildkite/pipeline_generator/modes/amd.py
"""AMD mode pipeline generation - only AMD tests."""

from typing import Any, Dict, List

from ..config import *  # noqa: F403, F405
from ..helpers.builds import build_amd_image
from ..helpers.commands import flatten_commands


def get_amd_queue(label: str) -> str:
    """Determine AMD queue based on label."""
    # Label sets map a test to the 8/4/2-GPU MI325 queues; anything
    # unlisted falls through to the single-GPU queue.
    if label in AMD_MI325_8_LABELS:
        return QUEUE_AMD_MI325_8
    elif label in AMD_MI325_4_LABELS:
        return QUEUE_AMD_MI325_4
    elif label in AMD_MI325_2_LABELS:
        return QUEUE_AMD_MI325_2
    else:
        return QUEUE_AMD_MI325_1


def generate_amd_pipeline(test_steps, config) -> List[Dict[str, Any]]:
    """
    Generate AMD-only pipeline.
    Returns a single AMD test group containing build + all AMD tests.
    """
    group_steps = []

    # Add AMD build
    group_steps.append(build_amd_image(config))

    # Add all AMD mirror tests
    for test in test_steps:
        # Only steps that explicitly mirror onto this AMD hardware run here.
        if not test.mirror_hardwares or config.mirror_hw not in test.mirror_hardwares:
            continue

        # Format commands for AMD
        commands = flatten_commands(test.commands or [])
        commands_str = " && ".join(commands)
        working_dir = test.working_dir or DEFAULT_WORKING_DIR

        # rocm-smi is best-effort (|| true) so a missing tool doesn't fail the step.
        full_command = f"(command rocm-smi || true) && cd {working_dir} && {commands_str}"

        step = {
            "label": f"AMD MI300: {test.label}",
            "depends_on": BUILD_KEY_AMD,
            "agents": {"queue": get_amd_queue(test.label)},
            "env": {"DOCKER_BUILDKIT": "1"},
            "soft_fail": False,
            "priority": PRIORITY_AMD,
            "command": full_command,
        }

        group_steps.append(step)

    # Return as a single-item list containing the AMD group
    return [{
        "group": "AMD Tests",
        "depends_on": None,
        "steps": group_steps,
    }]

diff --git a/buildkite/pipeline_generator/modes/ci.py b/buildkite/pipeline_generator/modes/ci.py
new file mode 100644
index 00000000..1b577dc1
--- /dev/null
+++ b/buildkite/pipeline_generator/modes/ci.py
"""CI mode pipeline generation - simple dict generation with large inline structures."""

from typing import Any, Dict, List

from ..config import *  # noqa: F403, F405
from ..helpers.builds import (
    build_amd_image,
    build_cpu_image,
    build_cu118_image,
    build_main_image,
    build_torch_nightly_image,
)
from ..helpers.commands import flatten_commands, normalize_commands
from ..helpers.coverage import inject_coverage
from ..helpers.test_selection import apply_intelligent_test_targeting

# ==============================================================================
# UTILITIES
# ==============================================================================


def get_step_key(label: str) -> str:
    """Generate step key from label (matching Jinja logic)."""
    return (
        label.lower()
        .replace(" ", "-")
        .replace("(", "")
        .replace(")", "")
        .replace("%", "")
        .replace(",", "-")
        .replace("+", "-")
    )


def create_notification_command(
    check_step: str,
    notification_label: str,
    queue: str,
    message: str,
    slack_channel: str,
    soft_fail: bool = False,
) -> str:
    """
    Create a shell command that uploads a notification step on failure.

    Args:
        check_step: Step name to check outcome for
        notification_label: Label for the notification step
        queue: Agent queue for notification
        message: Echo message for the command
        slack_channel: Slack channel to notify
        soft_fail: Whether notification step should soft fail
    """
    # NOTE(review): the YAML indentation inside this heredoc (including
    # soft_fail_line) must match the Jinja template byte-for-byte — confirm spacing.
    # "$$" escapes Buildkite's own interpolation so the agent evaluates it at runtime.
    soft_fail_line = "\n      soft_fail: true" if soft_fail else ""
    return f'''if [ $$(buildkite-agent step get "outcome" --step "{check_step}") != "passed" ]; then
  cat <<- YAML | buildkite-agent pipeline upload
  steps:
    - label: "{notification_label}"{soft_fail_line}
      agents:
        queue: {queue}
      command: echo "{message}"
      notify:
        - slack:
            channels:
              - "{slack_channel}"
YAML
fi '''


def create_multi_step_notification_command(
    check_steps: List[str],
    notification_label: str,
    queue: str,
    message: str,
    slack_channel: str,
) -> str:
    """Create a notification command that checks multiple steps."""
    # Notify if ANY of the checked steps did not pass.
    conditions = ' || '.join(
        f'$$(buildkite-agent step get "outcome" --step "{step}") != "passed"'
        for step in check_steps
    )
    return f'''if [[ {conditions} ]]; then
  cat <<- YAML | buildkite-agent pipeline upload
  steps:
    - label: "{notification_label}"
      agents:
        queue: {queue}
      command: echo "{message}"
      notify:
        - slack:
            channels:
              - "{slack_channel}"
YAML
fi'''


def get_agent_queue(test, config) -> str:
    """Determine agent queue for a test."""
    if test.label == LABEL_DOC_BUILD:
        return QUEUE_SMALL_CPU_PREMERGE
    elif test.no_gpu:
        return QUEUE_CPU_PREMERGE_US_EAST_1
    elif test.gpu == GPUType.A100:
        return QUEUE_A100
    elif test.gpu == GPUType.H100:
        return QUEUE_H100
    elif test.gpu == GPUType.H200:
        return QUEUE_H200
    elif test.gpu == GPUType.B200:
        return QUEUE_B200
    elif test.num_gpus and test.num_gpus >= 2:
        # Any multi-GPU request (2+) lands on the 4-GPU queue.
        return QUEUE_GPU_4
    else:
        return QUEUE_GPU_1


def should_run_test(test, config) -> bool:
    """Check if test should run based on file changes."""
    if config.run_all or config.nightly:
        return True

    # Substring match: a dependency path matching anywhere in a changed
    # file's path triggers the test.
    if test.source_file_dependencies:
        for source_file in test.source_file_dependencies:
            for changed_file in config.list_file_diff:
                if source_file in changed_file:
                    return True
        return False

    # No declared dependencies: always run.
    return True


def should_block_test(test, config) -> bool:
    """
    Check if test should have a block step.

    Jinja logic: block if (ns.blocked == 1 OR (step.optional AND nightly != "1"))
    Where ns.blocked is set to 0 if run_all or nightly or test matches file changes
    """
    # Optional tests are always blocked except in nightly mode
    if test.optional and not config.nightly:
        return True

    # In nightly or run_all mode, non-optional tests are not blocked
    if config.nightly or config.run_all:
        return False

    # Tests that shouldn't run are blocked
    if not should_run_test(test, config):
        return True

    return False


# ==============================================================================
# COMMAND BUILDING
# ==============================================================================


def build_docker_command(test, config) -> str:
    """Build command that runs inside docker container."""
    commands = flatten_commands(test.commands or [])
    commands = normalize_commands(commands)

    # Try intelligent test targeting first
    targeted = apply_intelligent_test_targeting(commands, test, config)

    # If targeting didn't apply and coverage is enabled, inject coverage.
    # NOTE(review): "didn't apply" is detected by comparing against the joined
    # commands string — fragile if targeting ever returns an identical string.
    if targeted == " && ".join(commands) and config.cov_enabled:
        command_str = inject_coverage(commands, test.label, config.vllm_ci_branch)
    else:
        command_str = targeted

    working_dir = test.working_dir or DEFAULT_WORKING_DIR

    # CI mode adds trailing space
    if command_str and not command_str.endswith(" "):
        command_str += " "

    return f"(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {working_dir} && {command_str}"


def build_multi_node_command(test, config) -> str:
    """Build multi-node test command."""
    working_dir = test.working_dir or DEFAULT_WORKING_DIR

    # Extract commands for each node: either one list per node, or a single
    # flat list replicated across all nodes (default 2).
    if test.commands and len(test.commands) > 0 and isinstance(test.commands[0], list):
        node_commands = test.commands
    else:
        simple_commands = test.commands if test.commands else []
        node_commands = [simple_commands] * (test.num_nodes or 2)

    # Build quoted node commands
    quoted_node_commands = []
    for node_cmds in node_commands:
        node_cmd_str = " && ".join(node_cmds)
        quoted_node_commands.append(f'"{node_cmd_str}"')

    image = config.container_image_cpu if test.no_gpu else config.container_image
    return f"{SCRIPT_RUN_MULTI_NODE} {working_dir} {test.num_nodes} {test.num_gpus or 1} {image} {' '.join(quoted_node_commands)}"




# ==============================================================================
# TEST STEP GENERATION
# ==============================================================================


def generate_test_step(test, config) -> Dict[str, Any]:
    """Generate a single test step with inline plugin construction."""
    # Multi-node test
    if test.num_nodes and test.num_nodes >= 2:
        return {
            "label": test.label,
            "agents": {"queue": get_agent_queue(test, config)},
            "soft_fail": test.soft_fail or False,
            "depends_on": BUILD_KEY_MAIN,
            "retry": {
                "automatic": [
                    {"exit_status": RETRY_EXIT_STATUS_AGENT_LOST, "limit": 1},
                    {"exit_status": RETRY_EXIT_STATUS_AGENT_TERMINATED, "limit": 1},
                ]
            },
            "commands": [build_multi_node_command(test, config)],
        }

    image = config.container_image_cpu if test.no_gpu else config.container_image
    command = build_docker_command(test, config)
    # -e makes the shell exit on first failing command when fail-fast is requested.
    bash_flag = "-xce" if config.fail_fast else "-xc"

    # Kubernetes plugin for A100/H100 (large inline structure)
    if test.gpu in [GPUType.H100, GPUType.A100]:
        commands = flatten_commands(test.commands or [])
        commands = normalize_commands(commands)
        command_str = " && ".join(commands)
        working_dir = test.working_dir or DEFAULT_WORKING_DIR
        full_command = f"(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {working_dir} && {command_str}"
        num_gpus = test.num_gpus or 1
        # H100 (and non-A100 8+-GPU) nodes mount the local NVMe HF cache;
        # everything else uses the shared HF_HOME path.
        hf_cache_path = "/mnt/hf-cache" if test.gpu == GPUType.H100 or (test.num_gpus and test.num_gpus >= 8 and test.gpu != GPUType.A100) else HF_HOME

        pod_spec = {
            "containers": [{
                "image": image,
                "command": [f'bash -c "{full_command}"'],
                "resources": {"limits": {K8S_NVIDIA_GPU_RESOURCE: num_gpus}},
                "volumeMounts": [
                    {"name": K8S_DEVSHM_VOLUME, "mountPath": K8S_DEV_SHM_PATH},
                    {"name": K8S_HF_CACHE_VOLUME, "mountPath": HF_HOME},
                ],
                "env": [
                    {"name": "VLLM_USAGE_SOURCE", "value": "ci-test"},
                    {"name": "NCCL_CUMEM_HOST_ENABLE", "value": "0"},
                    {"name": "HF_HOME", "value": HF_HOME},
                    {
                        "name": "HF_TOKEN",
                        "valueFrom": {
                            "secretKeyRef": {
                                "name": K8S_HF_TOKEN_SECRET,
                                "key": K8S_HF_TOKEN_KEY,
                            }
                        },
                    },
                ],
            }],
            "volumes": [
                {"name": K8S_DEVSHM_VOLUME, "emptyDir": {"medium": "Memory"}},
                {"name": K8S_HF_CACHE_VOLUME, "hostPath": {"path": hf_cache_path, "type": "Directory"}},
            ],
        }

        if test.gpu == GPUType.A100:
            pod_spec["priorityClassName"] = K8S_PRIORITY_CLASS  # type: ignore[assignment]
            pod_spec["nodeSelector"] = {K8S_NVIDIA_GPU_PRODUCT: K8S_NVIDIA_A100_PRODUCT}  # type: ignore[assignment]
        elif test.gpu == GPUType.H100 or (test.num_gpus and test.num_gpus >= 8):
            pod_spec["nodeSelector"] = {K8S_NVIDIA_GPU_PRODUCT: "NVIDIA-H100-80GB-HBM3"}  # type: ignore[assignment]

        step = {
            "label": test.label,
            "agents": {"queue": get_agent_queue(test, config)},
            "soft_fail": test.soft_fail or False,
            "plugins": [{"kubernetes": {"podSpec": pod_spec}}],
            "depends_on": BUILD_KEY_MAIN,
            "retry": {
                "automatic": [
                    {"exit_status": RETRY_EXIT_STATUS_AGENT_LOST, "limit": 1},
                    {"exit_status": RETRY_EXIT_STATUS_AGENT_TERMINATED, "limit": 1},
                ]
            },
        }
        if test.parallelism:
            step["parallelism"] = test.parallelism
        return step

    # Special GPU plugin for H200/B200 (inline)
    if test.gpu in [GPUType.H200, GPUType.B200]:
        env_vars: List[str] = [  # type: ignore[annotation-unchecked]
            "VLLM_USAGE_SOURCE=ci-test",
            "NCCL_CUMEM_HOST_ENABLE=0",
            "HF_HOME=/benchmark-hf-cache",
            "HF_TOKEN",
            "CODECOV_TOKEN",
        ]
        if config.fail_fast:
            env_vars.append("PYTEST_ADDOPTS=-x")
        if config.branch == "main":
            env_vars.append("BUILDKITE_ANALYTICS_TOKEN")

        step = {
            "label": test.label,
            "agents": {"queue": get_agent_queue(test, config)},
            "soft_fail": test.soft_fail or False,
            "plugins": [{
                DOCKER_PLUGIN: {
                    "image": image,
                    "always-pull": True,
                    "propagate-environment": True,
                    # B200 gets no "gpus" key (None is deleted below).
                    "gpus": "all" if test.gpu == GPUType.H200 else None,
                    "command": ["bash", bash_flag, command],
                    "environment": env_vars,
                    "volumes": [
                        "/dev/shm:/dev/shm",
                        "/data/benchmark-hf-cache:/benchmark-hf-cache",
                        "/data/benchmark-vllm-cache:/root/.cache/vllm",
                    ],
                }
            }],
            "depends_on": BUILD_KEY_MAIN,
            "retry": {
                "automatic": [
                    {"exit_status": RETRY_EXIT_STATUS_AGENT_LOST, "limit": 1},
                    {"exit_status": RETRY_EXIT_STATUS_AGENT_TERMINATED, "limit": 1},
                ]
            },
        }
        # Remove None gpus key if present
        if step["plugins"][0][DOCKER_PLUGIN]["gpus"] is None:  # type: ignore[index]
            del step["plugins"][0][DOCKER_PLUGIN]["gpus"]  # type: ignore[index]
        if test.parallelism:
            step["parallelism"] = test.parallelism
        return step

    # Standard Docker plugin (inline)
    plugin_env = [
        "VLLM_USAGE_SOURCE=ci-test",
        "NCCL_CUMEM_HOST_ENABLE=0",
        f"HF_HOME={HF_HOME_FSX}",
        "HF_TOKEN",
        "CODECOV_TOKEN",
    ]
    if config.fail_fast:
        plugin_env.append("PYTEST_ADDOPTS=-x")
    if config.branch == "main":
        plugin_env.append("BUILDKITE_ANALYTICS_TOKEN")
    if test.label == LABEL_SPEC_DECODE:
        plugin_env.append("VLLM_ATTENTION_BACKEND=XFORMERS")

    step = {
        "label": test.label,
        "agents": {"queue": get_agent_queue(test, config)},
        "soft_fail": test.soft_fail or False,
        "plugins": [{
            DOCKER_PLUGIN: {
                "image": image,
                "always-pull": True,
                "propagate-environment": True,
                "gpus": "all" if not test.no_gpu else None,
                "mount-buildkite-agent": True if (test.label == LABEL_BENCHMARKS or test.mount_buildkite_agent or config.cov_enabled) else None,
                "command": ["bash", bash_flag, command],
                "environment": plugin_env,
                "volumes": ["/dev/shm:/dev/shm", f"{HF_HOME_FSX}:{HF_HOME_FSX}"],
            }
        }],
        "depends_on": BUILD_KEY_MAIN,
        "retry": {
            "automatic": [
                {"exit_status": RETRY_EXIT_STATUS_AGENT_LOST, "limit": 1},
                {"exit_status": RETRY_EXIT_STATUS_AGENT_TERMINATED, "limit": 1},
            ]
        },
    }

    # Clean up None values in plugin
    plugin_dict = step["plugins"][0][DOCKER_PLUGIN]
    if plugin_dict.get("gpus") is None:
        del plugin_dict["gpus"]
    if plugin_dict.get("mount-buildkite-agent") is None:
        del plugin_dict["mount-buildkite-agent"]

    if test.parallelism:
        step["parallelism"] = test.parallelism

    return step


def generate_tests(test_steps, config) -> List[Dict[str, Any]]:
    """Generate all test steps for CI mode."""
    steps = []

    for test in test_steps:
        # Skip fast_check_only tests
        if test.fast_check_only:
            continue

        # Generate block if needed
        if should_block_test(test, config):
            block_key = f"block-{get_step_key(test.label)}"
            steps.append({
                "block": f"Run {test.label}",
                "key": block_key,
                "depends_on": BUILD_KEY_MAIN,
            })

            # NOTE(review): blocked tests depend only on the block step (which
            # itself depends on the main build) — blocked no_gpu tests therefore
            # do not depend on the CPU build, unlike the unblocked path below.
            # Presumably matches the Jinja template; verify.
            test_step = generate_test_step(test, config)
            test_step["depends_on"] = block_key
            steps.append(test_step)
        else:
            # No block needed
            base_dependency = BUILD_KEY_CPU if test.no_gpu else BUILD_KEY_MAIN
            test_step = generate_test_step(test, config)
            test_step["depends_on"] = base_dependency
            steps.append(test_step)

    return steps


# ==============================================================================
# TORCH NIGHTLY GROUP
# ==============================================================================


def generate_torch_nightly_group(test_steps, config) -> Dict[str, Any]:
    """Generate torch nightly test group."""
    group_steps = []

    # Add block for torch nightly build (not in nightly mode)
    if not config.nightly:
        group_steps.append({
            "block": "Build torch nightly image",
            "key": "block-build-torch-nightly",
            "depends_on": None,
        })

    # Add torch nightly build
    build_depends_on = "block-build-torch-nightly" if not config.nightly else None
    group_steps.append(build_torch_nightly_image(config, build_depends_on))

    # Add torch nightly tests
    for test in test_steps:
        if not test.torch_nightly:
            continue

        # Skip multi-node tests
        if test.num_nodes and test.num_nodes >= 2:
            continue

        # Check if torch nightly test should be blocked
        # Torch nightly uses different logic than main tests (doesn't check run_all)
        should_block = True  # Start blocked

        # Set to not blocked if nightly mode
        if config.nightly:
            should_block = False
        # Set to not blocked if no source deps
        elif not test.source_file_dependencies:
            should_block = False
        # Set to not blocked if file dependencies match
        elif test.source_file_dependencies:
            for source_file in test.source_file_dependencies:
                for changed_file in config.list_file_diff:
                    if source_file in changed_file:
                        should_block = False
                        break
                if not should_block:
                    break

        # Also block if optional (and not nightly)
        if test.optional and not config.nightly:
            should_block = True

        # Add block if needed
        if should_block:
            block_key = f"block-torch-nightly-{get_step_key(test.label)}"
            group_steps.append({
                "block": f"Run Torch Nightly {test.label}",
                "key": block_key,
                "depends_on": BUILD_KEY_TORCH_NIGHTLY,
            })
            test_depends_on = block_key
        else:
            test_depends_on = BUILD_KEY_TORCH_NIGHTLY

        image = config.container_image_torch_nightly
        command = build_docker_command(test, config)
        bash_flag = "-xce" if config.fail_fast else "-xc"

        # Build plugin inline (same logic as generate_test_step but for torch nightly image)
        plugin = {
            DOCKER_PLUGIN: {
                "image": image,
                "always-pull": True,
                "propagate-environment": True,
                "command": ["bash", bash_flag, command],
                "environment": (
                    ["VLLM_USAGE_SOURCE=ci-test", "NCCL_CUMEM_HOST_ENABLE=0", f"HF_HOME={HF_HOME_FSX}", "HF_TOKEN", "CODECOV_TOKEN"]
                    + (["PYTEST_ADDOPTS=-x"] if config.fail_fast else [])
                    + (["BUILDKITE_ANALYTICS_TOKEN"] if config.branch == "main" else [])
                    + (["VLLM_ATTENTION_BACKEND=XFORMERS"] if test.label == LABEL_SPEC_DECODE else [])
                ),
                "volumes": ["/dev/shm:/dev/shm", f"{HF_HOME_FSX}:{HF_HOME_FSX}"],
            }
        }

        # Add gpus if not no_gpu
        if not test.no_gpu:
            plugin[DOCKER_PLUGIN]["gpus"] = "all"

        # Add mount-buildkite-agent if needed
        if test.label == LABEL_BENCHMARKS or test.mount_buildkite_agent or config.cov_enabled:
            plugin[DOCKER_PLUGIN]["mount-buildkite-agent"] = True

        step = {
            "label": f"Torch Nightly {test.label}",
            "agents": {"queue": get_agent_queue(test, config)},
            "soft_fail": True,  # Torch nightly tests are soft_fail
            "plugins": [plugin],
            "depends_on": test_depends_on,
            "retry": {
                "automatic": [
                    {"exit_status": RETRY_EXIT_STATUS_AGENT_LOST, "limit": 1},
                    {"exit_status": RETRY_EXIT_STATUS_AGENT_TERMINATED, "limit": 1},
                ]
            },
        }

        if test.parallelism:
            step["parallelism"] = test.parallelism

        group_steps.append(step)  # type: ignore[arg-type]

    return {
        "group": "vllm against torch nightly",
        "depends_on": None,
        "steps": group_steps,
    }


# ==============================================================================
# AMD GROUP
# ==============================================================================


def get_amd_queue(label: str) -> str:
    """Determine AMD queue based on label."""
    # NOTE(review): duplicate of modes/amd.py get_amd_queue — keep the two in sync.
    if label in AMD_MI325_8_LABELS:
        return QUEUE_AMD_MI325_8
    elif label in AMD_MI325_4_LABELS:
        return QUEUE_AMD_MI325_4
    elif label in AMD_MI325_2_LABELS:
        return QUEUE_AMD_MI325_2
    else:
        return QUEUE_AMD_MI325_1


def format_commands_for_amd(commands) -> str:
    """Format commands for AMD, handling multi-node structure."""
    if not commands:
        return ""

    # Check if it's multi-node (list of lists)
    if isinstance(commands[0], list):
        # Multi-node: convert each node's commands to JSON array representation
        node_parts = []
        for node_cmds in commands:
            # Format as JSON array
            formatted_cmds = ", ".join(f'"{cmd}"' for cmd in node_cmds)
            node_parts.append(f"[{formatted_cmds}]")
        return " && ".join(node_parts)
    else:
        # Single node: just join with &&
        return " && ".join(commands)


def generate_amd_group(test_steps, config) -> Dict[str, Any]:
    """Generate AMD test group."""
    group_steps = []

    # Add AMD build
    group_steps.append(build_amd_image(config))

    # Add AMD tests
    for test in test_steps:
        if not test.mirror_hardwares or config.mirror_hw not in test.mirror_hardwares:
            continue

        # Format commands for AMD (handles multi-node)
        commands_str = format_commands_for_amd(test.commands or [])
        working_dir = test.working_dir or DEFAULT_WORKING_DIR

        # AMD tests use a wrapper script with commands as argument.
        # Note the ";" (not "&&") after cd — the commands run even if cd fails,
        # matching the existing template behavior.
        inner_command = f"(command rocm-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {working_dir} ; {commands_str}"
        full_command = f'bash .buildkite/scripts/hardware_ci/run-amd-test.sh "{inner_command}"'

        step = {
            "label": f"AMD MI300: {test.label}",
            "depends_on": BUILD_KEY_AMD,
            "agents": {"queue": get_amd_queue(test.label)},
            "env": {"DOCKER_BUILDKIT": "1"},
            "soft_fail": False,
            "priority": PRIORITY_AMD,
            "command": full_command,
        }

        group_steps.append(step)

    return {
        "group": "AMD Tests",
        "depends_on": None,
        "steps": group_steps,
    }


# ==============================================================================
# HARDWARE TESTS
# ==============================================================================


def generate_hardware_tests(config) -> List[Dict[str, Any]]:
    """Generate all hardware-specific tests - large inline structures."""
    steps = []

    # Neuron test (inline)
    steps.append({
        "label": "Neuron Test",
        "agents": {"queue": QUEUE_NEURON},
        "command": f"bash {SCRIPT_RUN_NEURON}",
        "soft_fail": True,
    })

    # Intel CPU (always has block)
    # NOTE(review): the block step is appended unconditionally, but on main the
    # test's depends_on is None — the block is then orphaned. Confirm against Jinja.
    steps.append({
        "block": "Run Intel CPU test",
        "key": "block-intel-cpu",
        "depends_on": None,
    })
    steps.append({
        "label": "Intel CPU Test",
        "agents": {"queue": QUEUE_INTEL_CPU},
        "command": f"bash {SCRIPT_RUN_INTEL_CPU}",
        "soft_fail": True,
        "depends_on": None if config.branch == "main" else "block-intel-cpu",
    })

    # Intel HPU and GPU (inline)
    steps.append({
        "label": "Intel HPU Test",
        "agents": {"queue": QUEUE_INTEL_HPU},
        "command": f"bash {SCRIPT_RUN_INTEL_HPU}",
        "soft_fail": True,
    })
    steps.append({
        "label": "Intel GPU Test",
        "agents": {"queue": QUEUE_INTEL_GPU},
        "command": f"bash {SCRIPT_RUN_INTEL_GPU}",
        "soft_fail": True,
    })

    # Ascend NPU (inline)
    steps.append({
        "label": "Ascend NPU Test",
        "agents": {"queue": QUEUE_ASCEND},
        "command": "bash .buildkite/scripts/hardware_ci/run-npu-test.sh",
        "soft_fail": True,
        "timeout_in_minutes": 20,
        "depends_on": None,
    })

    # IBM Power
    if config.branch == "main":
        steps.append({
            "block": "Run IBM Power CPU test",
            "key": "block-ibm-power",
            "depends_on": None,
        })
        steps.append({
            "label": "IBM Power(ppc64le) CPU Test",
            "key": "ibm-ppc64-test",
            "depends_on": "block-ibm-power",
            "agents": {"queue": QUEUE_IBM_PPC64LE},
            "command": f"bash {SCRIPT_RUN_IBM_POWER}",
            "soft_fail": True,
        })
        # Add notification
        # NOTE(review): "commands" (plural) receives a single string here, not a
        # list — Buildkite may accept it, but it differs from the usual shape; verify.
        steps.append({
            "label": "IBM Power(ppc64le) Build Failure Notification",
            "depends_on": "ibm-ppc64-test",
            "soft_fail": True,
            "agents": {"queue": QUEUE_IBM_PPC64LE},
            "commands": create_notification_command(
                check_step="IBM Power(ppc64le) CPU Test",
                notification_label="Notify owners about failing test",
                queue="ibm-ppc64le",
                message="IBM Power(ppc64le) Build/Test failed",
                slack_channel="vllm#vllm-ci-on-power",
                soft_fail=True,
            ),
        })
    else:
        steps.append({
            "block": "Run IBM Power(ppc64le) CPU Test",
            "key": "block-ibm-ppc64-test",
            "depends_on": None,
        })
        steps.append({
            "label": "IBM Power(ppc64le) CPU Test",
            "depends_on": "block-ibm-ppc64-test",
            "agents": {"queue": QUEUE_IBM_PPC64LE},
            "command": f"bash {SCRIPT_RUN_IBM_POWER}",
            "soft_fail": True,
        })

    # IBM S390X
    if not config.nightly:
        steps.append({
            "block": 'Run "IBM Z (s390x) CPU Test"',
            "key": "block-ibm-s390x",
            "depends_on": None,
        })
    steps.append({
        "label": "IBM Z (s390x) CPU Test",
        "depends_on": None if config.nightly else "block-ibm-s390x",
        "agents": {"queue": QUEUE_IBM_S390X},
        "command": f"bash {SCRIPT_RUN_IBM_S390X}",
        "soft_fail": True,
    })

    # GH200 (nightly only) - inline
    if config.nightly:
        steps.append({
            "label": "GH200 Test",
            "agents": {"queue": QUEUE_GH200},
            "command": f"nvidia-smi && bash {SCRIPT_RUN_GH200}",
            "soft_fail": True,
        })

    # TPU tests (inline with all commands)
    steps.append({
        "label": "TPU V1 Test",
        "soft_fail": True,
        "depends_on": None,
        "key": "run-tpu-v1-test",
        "timeout_in_minutes": 180,
        "agents": {"queue": QUEUE_TPU_V6E},
        "commands": [
            SCRIPT_TPU_CLEANUP,
            f'if [[ -f "{SCRIPT_RUN_TPU_V1}" ]]; then bash {SCRIPT_RUN_TPU_V1}; fi',
        ],
    })
    steps.append({
        "label": "TPU V1 Test Part2",
        "soft_fail": True,
        "depends_on": None,
        "key": "run-tpu-v1-test-part2",
        "timeout_in_minutes": 90,
        "agents": {"queue": QUEUE_TPU_V6E},
        "commands": [
            SCRIPT_TPU_CLEANUP,
            f'if [[ -f "{SCRIPT_RUN_TPU_V1_PART2}" ]]; then bash {SCRIPT_RUN_TPU_V1_PART2}; fi',
        ],
    })
    steps.append({
        "label": "TPU V1 Benchmark Test",
        "soft_fail": True,
        "depends_on": None,
        "key": "run-tpu-v1-benchmark-test",
        "timeout_in_minutes": 60,
        "agents": {"queue": QUEUE_TPU_V6E},
        "commands": [
            SCRIPT_TPU_CLEANUP,
            (
                "DOCKER_BUILDKIT=1 docker build --build-arg max_jobs=16 --build-arg USE_SCCACHE=1 "
                "--build-arg GIT_REPO_CHECK=0 --tag vllm/vllm-tpu-bm --progress plain -f docker/Dockerfile.tpu ."
            ),
            "bash .buildkite/scripts/tpu/docker_run_bm.sh .buildkite/scripts/tpu/config_v6e_1.env",
            "bash .buildkite/scripts/tpu/docker_run_bm.sh .buildkite/scripts/tpu/quantized_v6e_1.env",
        ],
    })

    # TPU notification for main branch
    steps_check = None  # placeholder removed — see NOTE below
    if config.branch == "main":
        # NOTE(review): as with IBM Power, "commands" receives a string, not a list.
        steps.append({
            "label": "TPU V1 Test Notification",
            "depends_on": ["run-tpu-v1-test", "run-tpu-v1-test-part2"],
            "soft_fail": True,
            "agents": {"queue": QUEUE_TPU_V6E},
            "commands": create_multi_step_notification_command(
                check_steps=["run-tpu-v1-test", "run-tpu-v1-test-part2"],
                notification_label="Notify owners about failing test",
                queue="tpu_v6e_queue",
                message="TPU V1 Test failed",
                slack_channel="vllm#tpu-ci-notifications",
            ),
        })

    return steps


# ==============================================================================
# MAIN CI PIPELINE
# ==============================================================================


def generate_ci_pipeline(test_steps, config) -> List[Dict[str, Any]]:
    """Generate complete CI pipeline."""
    steps = []

    # Main CUDA build
    steps.append(build_main_image(config))

    # CUDA 11.8 build (inline)
    steps.extend(build_cu118_image(config))

    # CPU build (inline)
    steps.append(build_cpu_image(config))

    # Test steps
    steps.extend(generate_tests(test_steps, config))

    # Torch nightly group
    steps.append(generate_torch_nightly_group(test_steps, config))

    # AMD group
    steps.append(generate_amd_group(test_steps, config))

    # Hardware tests
    steps.extend(generate_hardware_tests(config))

    return steps

diff --git a/buildkite/pipeline_generator/modes/fastcheck.py b/buildkite/pipeline_generator/modes/fastcheck.py
new file mode 100644
index 00000000..ba222e46
--- /dev/null
+++ b/buildkite/pipeline_generator/modes/fastcheck.py
"""Fastcheck mode pipeline generation - simple dict generation with inline structures."""

from typing import Any, Dict, List

from ..config import *  # noqa: F403, F405
from ..helpers.builds import build_amd_image, build_main_image
from ..helpers.commands import flatten_commands, normalize_commands

# ==============================================================================
# UTILITIES
# ==============================================================================


def get_step_key(label: str) -> str:
    """Generate step key from label."""
    # NOTE(review): duplicate of modes/ci.py get_step_key — keep in sync.
    return (
        label.lower()
        .replace(" ", "-")
        .replace("(", "")
        .replace(")", "")
        .replace("%", "")
        .replace(",", "-")
        .replace("+", "-")
    )


def create_notification_command(
    check_step: str,
    notification_label: str,
    queue: str,
    message: str,
    slack_channel: str,
) -> str:
    """Create a shell command that uploads a notification step on failure."""
    # NOTE(review): YAML indentation in this heredoc must match the Jinja
    # template byte-for-byte — confirm spacing. Unlike the ci.py variant this
    # one ends with a trailing newline after "fi".
    return f'''if [ $$(buildkite-agent step get "outcome" --step "{check_step}") != "passed" ]; then
  cat <<- YAML | buildkite-agent pipeline upload
  steps:
    - label: "{notification_label}"
      agents:
        queue: {queue}
      command: echo "{message}"
      notify:
        - slack:
            channels:
              - "{slack_channel}"
YAML
fi
'''


def get_agent_queue(test) -> str:
    """Determine agent queue for fastcheck test."""
    if test.label == LABEL_DOC_BUILD:
        return QUEUE_SMALL_CPU_PREMERGE
    elif test.no_gpu:
        return QUEUE_CPU_PREMERGE
    elif test.gpu == GPUType.A100:
        return QUEUE_A100
    elif test.num_gpus in [2, 4]:
        return QUEUE_GPU_4
    else:
        return QUEUE_GPU_1


# ==============================================================================
# COMMAND BUILDING
# ==============================================================================


def build_docker_command(test, config) -> str:
    """Build command for docker container."""
    # Unlike ci.py, fastcheck applies no coverage injection or test targeting.
    commands = flatten_commands(test.commands or [])
    commands = normalize_commands(commands)
    command_str = " && ".join(commands)

    working_dir = test.working_dir or DEFAULT_WORKING_DIR
    return f"(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {working_dir} && {command_str}"


def build_multi_node_command(test, config) -> str:
    """Build multi-node test command."""
    working_dir = test.working_dir or DEFAULT_WORKING_DIR

    # Extract commands for each node
    if test.commands and len(test.commands) > 0 and isinstance(test.commands[0], list):
        node_commands = test.commands
    else:
        simple_commands = test.commands if test.commands else []
        node_commands = [simple_commands] * (test.num_nodes or 2)

    # Build quoted node commands
    quoted_node_commands = []
    for node_cmds in node_commands:
        node_cmd_str = " && ".join(node_cmds)
        quoted_node_commands.append(f'"{node_cmd_str}"')

    image = config.container_image
    return f"{SCRIPT_RUN_MULTI_NODE} {working_dir} {test.num_nodes} {test.num_gpus or 1} {image} {' '.join(quoted_node_commands)}"


# ==============================================================================
# PLUGIN GENERATION
# ==============================================================================


def build_docker_plugin(test, image, config) -> Dict:
    """Build Docker plugin for fastcheck."""
    command = build_docker_command(test, config)
    bash_flag = "-xce" if config.fail_fast else "-xc"

    plugin = {
        "image": image,
        "always-pull": True,
        "propagate-environment": True,
        "command": ["bash", bash_flag, command],
        "environment": [
            "VLLM_USAGE_SOURCE=ci-test",
            "NCCL_CUMEM_HOST_ENABLE=0",
            f"HF_HOME={HF_HOME_FSX}",
            "HF_TOKEN",
        ],
        "volumes": ["/dev/shm:/dev/shm", f"{HF_HOME_FSX}:{HF_HOME_FSX}"],
    }

    if not test.no_gpu:
        plugin["gpus"] = "all"

    if config.fail_fast:
        plugin["environment"].append("PYTEST_ADDOPTS=-x")

    if test.label == LABEL_SPEC_DECODE:
        plugin["environment"].append("VLLM_ATTENTION_BACKEND=XFORMERS")

    if test.label == LABEL_BENCHMARKS:
        plugin["mount-buildkite-agent"] = True

    return {DOCKER_PLUGIN: plugin}


def build_a100_kubernetes_plugin(test, image, config) -> Dict:
    """Build Kubernetes plugin for A100 in fastcheck."""
    commands = flatten_commands(test.commands or [])
    commands = normalize_commands(commands)
    command_str = " && ".join(commands)

    working_dir = test.working_dir or DEFAULT_WORKING_DIR
    full_command = f"(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {working_dir} && {command_str}"

    num_gpus = test.num_gpus or 1

    pod_spec = {
        "priorityClassName": K8S_PRIORITY_CLASS,
        "containers": [{
            "image": image,
            # Note: uses command=["bash"] + args=["-c", "'...'"] (single-quoted),
            # whereas ci.py uses a single 'bash -c "..."' command string —
            # presumably each matches its respective Jinja template.
            "command": ["bash"],
            "args": ["-c", f"'{full_command}'"],
            "resources": {"limits": {K8S_NVIDIA_GPU_RESOURCE: num_gpus}},
            "volumeMounts": [
                {"name": K8S_DEVSHM_VOLUME, "mountPath": K8S_DEV_SHM_PATH},
                {"name": K8S_HF_CACHE_VOLUME, "mountPath": HF_HOME},
            ],
            "env": [
                {"name": "VLLM_USAGE_SOURCE", "value": "ci-test"},
                # NOTE(review): integer 0 here, while ci.py emits the string "0";
                # Kubernetes EnvVar.value must be a string — confirm against the
                # Jinja output before changing.
                {"name": "NCCL_CUMEM_HOST_ENABLE", "value": 0},
                {"name": "HF_HOME", "value": HF_HOME},
                {
                    "name": "HF_TOKEN",
                    "valueFrom": {
                        "secretKeyRef": {
                            "name": K8S_HF_TOKEN_SECRET,
                            "key": K8S_HF_TOKEN_KEY,
                        }
                    },
                },
            ],
        }],
        "nodeSelector": {K8S_NVIDIA_GPU_PRODUCT: K8S_NVIDIA_A100_PRODUCT},
        "volumes": [
            {"name": K8S_DEVSHM_VOLUME, "emptyDir": {"medium": "Memory"}},
            {"name": K8S_HF_CACHE_VOLUME, "hostPath": {"path": HF_HOME, "type": "Directory"}},
        ],
    }

    return {"kubernetes": {"podSpec": pod_spec}}


# ==============================================================================
# TEST GENERATION
# ==============================================================================


def generate_test_step(test, config) -> Dict[str, Any]:
    """Generate a single test step."""
    # Multi-node test (NO retry or soft_fail in fastcheck)
    if test.num_nodes and test.num_nodes >= 2:
        return {
            "label": test.label,
            "agents": {"queue": get_agent_queue(test)},
            "depends_on": BUILD_KEY_MAIN,
            "commands": [build_multi_node_command(test, config)],
        }

    # A100 test (uses kubernetes)
    # Retry limits are 5 here vs 1 in ci.py — presumably intentional per mode.
    if test.gpu == GPUType.A100:
        step = {
            "label": test.label,
            "agents": {"queue": QUEUE_A100},
            "soft_fail": test.soft_fail or False,
            "plugins": [build_a100_kubernetes_plugin(test, config.container_image, config)],
            "retry": {
                "automatic": [
                    {"exit_status": RETRY_EXIT_STATUS_AGENT_LOST, "limit": 5},
                    {"exit_status": RETRY_EXIT_STATUS_AGENT_TERMINATED, "limit": 5},
                ]
            },
            "priority": PRIORITY_A100,
        }
        if test.parallelism:
            step["parallelism"] = test.parallelism
        return step

    # Regular test
    step = {
        "label": test.label,
        "agents": {"queue": get_agent_queue(test)},
        "soft_fail": test.soft_fail or False,
        "plugins": [build_docker_plugin(test, config.container_image, config)],
        "depends_on": BUILD_KEY_MAIN,
        "retry": {
            "automatic": [
                {"exit_status": RETRY_EXIT_STATUS_AGENT_LOST, "limit": 5},
                {"exit_status": RETRY_EXIT_STATUS_AGENT_TERMINATED, "limit": 5},
            ]
        },
    }

    if test.parallelism:
        step["parallelism"] = test.parallelism

    return step


def generate_fast_check_tests(test_steps, config) -> List[Dict[str, Any]]:
    """Generate tests that run immediately (fast_check=true, not multi-node, not A100)."""
    steps = []

    for test in test_steps:
        if not test.fast_check:
            continue
        if test.num_nodes and test.num_nodes >= 2:
            continue
        if test.gpu == GPUType.A100:
            continue

        steps.append(generate_test_step(test, config))

    return steps


def generate_blocked_tests(test_steps, config) -> List[Dict[str, Any]]:
    """Generate blocked tests (non-fast-check, multi-node, A100)."""
    steps = []

    # Regular non-fast-check tests
    for test in test_steps:
        if test.fast_check:
            continue
        if test.num_nodes and test.num_nodes >= 2:
            continue
        if test.gpu == GPUType.A100:
            continue

        block_key = f"block-{get_step_key(test.label)}"
        steps.append({
            "block": f"Run {test.label}",
            "key": block_key,
            "depends_on": BUILD_KEY_MAIN,
        })

        test_step = generate_test_step(test, config)
        test_step["depends_on"] = block_key
        steps.append(test_step)

    # Multi-node tests
    for test in
test_steps: + if not (test.num_nodes and test.num_nodes >= 2): + continue + + block_key = f"block-{get_step_key(test.label)}" + steps.append({ + "block": f"Run {test.label}", + "key": block_key, + "depends_on": BUILD_KEY_MAIN, + }) + + test_step = generate_test_step(test, config) + test_step["depends_on"] = block_key + steps.append(test_step) + + # A100 tests (all behind single block) + a100_tests = [t for t in test_steps if t.gpu == GPUType.A100] + if a100_tests: + steps.append({ + "block": "Run A100 tests", + "depends_on": BUILD_KEY_MAIN, + }) + for test in a100_tests: + steps.append(generate_test_step(test, config)) + + return steps + + +# ============================================================================== +# HARDWARE TESTS +# ============================================================================== + + +def generate_hardware_tests(config) -> List[Dict[str, Any]]: + """Generate hardware tests for fastcheck.""" + steps = [] + + # TPU V0 + steps.append({ + "block": "Run TPU V0 Test", + "depends_on": None, + "key": "block-tpu-v0", + }) + steps.append({ + "label": "TPU V0 Test", + "key": "run-tpu-v0-test", + "depends_on": "block-tpu-v0", + "soft_fail": True, # type: ignore + "agents": {"queue": QUEUE_TPU_V5}, # type: ignore + "commands": [ + f'if [[ -f "{SCRIPT_RUN_TPU}" ]]; then bash {SCRIPT_RUN_TPU}; fi', + "yes | docker system prune -a", + ], # type: ignore + }) + steps.append({ + "label": "TPU V0 Test Notification", + "depends_on": "run-tpu-v0-test", + "soft_fail": True, # type: ignore + "agents": {"queue": QUEUE_TPU_V5}, # type: ignore + "commands": create_notification_command( + check_step="run-tpu-v0-test", + notification_label="Notify owners about failing test", + queue="tpu_v5_queue", + message="TPU V0 Test failed", + slack_channel="#collab-google-ci", + ), + }) + + # TPU V1 + steps.append({ + "block": "Run TPU V1 Test", + "key": "block-tpu-v1", + "depends_on": None, + }) + steps.append({ + "label": "TPU V1 Test", + "key": "run-tpu-v1-test", 
+ "depends_on": "block-tpu-v1", + "agents": {"queue": QUEUE_TPU_V6E}, # type: ignore + "commands": [ + SCRIPT_TPU_CLEANUP, + f'if [[ -f "{SCRIPT_RUN_TPU_V1}" ]]; then bash {SCRIPT_RUN_TPU_V1}; fi', + ], # type: ignore + }) + steps.append({ + "label": "TPU V1 Test Notification", + "depends_on": "run-tpu-v1-test", + "soft_fail": True, # type: ignore + "agents": {"queue": QUEUE_TPU_V5}, # type: ignore + "commands": create_notification_command( + check_step="run-tpu-v1-test", + notification_label="Notify owners about failing test", + queue="tpu_v5_queue", + message="TPU V1 Test failed", + slack_channel="#tpu-ci-notifications", + ), + }) + + # GH200 + steps.append({ + "block": "Run GH200 Test", + "depends_on": None, + "key": "block-gh200", + }) + steps.append({ + "label": "GH200 Test", + "depends_on": "block-gh200", + "agents": {"queue": QUEUE_GH200}, # type: ignore + "command": f"nvidia-smi && bash {SCRIPT_RUN_GH200}", + "soft_fail": True, # type: ignore + }) + + # Intel CPU + steps.append({ + "block": "Run Intel CPU test", + "key": "block-intel-cpu", + "depends_on": None, + }) + steps.append({ + "label": "Intel CPU Test", + "depends_on": "block-intel-cpu", + "agents": {"queue": QUEUE_INTEL_CPU}, # type: ignore + "command": f"bash {SCRIPT_RUN_INTEL_CPU}", + "soft_fail": True, # type: ignore + }) + + # Intel GPU + steps.append({ + "block": "Run Intel GPU test", + "key": "block-intel-gpu", + "depends_on": None, + }) + steps.append({ + "label": "Intel GPU Test", + "depends_on": "block-intel-gpu", + "agents": {"queue": QUEUE_INTEL_GPU}, # type: ignore + "command": f"bash {SCRIPT_RUN_INTEL_GPU}", + "soft_fail": True, # type: ignore + }) + + return steps + + +# ============================================================================== +# AMD GROUP (FASTCHECK VERSION) +# ============================================================================== + + +def generate_amd_group(test_steps, config) -> Dict[str, Any]: + """Generate AMD test group for fastcheck.""" + 
group_steps = [] + + # AMD build + amd_build = build_amd_image(config) + # Fastcheck AMD build has depends_on: null inside the group + if "depends_on" not in amd_build: + amd_build["depends_on"] = None + group_steps.append(amd_build) + + # Only Basic Correctness Test in fastcheck + for test in test_steps: + if not test.mirror_hardwares or config.mirror_hw not in test.mirror_hardwares: + continue + + if test.label != LABEL_BASIC_CORRECTNESS: + continue + + # Add block + block_key = f"block-amd-{get_step_key(test.label)}" + group_steps.append({ + "block": f"Run AMD MI300: {test.label} with {config.mirror_hw}", + "key": block_key, + "depends_on": BUILD_KEY_AMD, + }) + + # Format commands + commands = flatten_commands(test.commands or []) + commands_str = " && ".join(commands) + working_dir = test.working_dir or DEFAULT_WORKING_DIR + + # AMD tests use a wrapper script (same as CI) + inner_command = f"(command rocm-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd {working_dir} ; {commands_str}" + full_command = f'bash .buildkite/scripts/hardware_ci/run-amd-test.sh "{inner_command}"' + + group_steps.append({ + "label": f"AMD MI300: {test.label} with {config.mirror_hw}", + "depends_on": block_key, + "agents": {"queue": QUEUE_AMD_MI300_1}, + "env": {"DOCKER_BUILDKIT": "1"}, + "soft_fail": True, + "priority": PRIORITY_AMD, + "command": full_command, + }) + + return { + "group": "AMD Tests", + "depends_on": None, + "steps": group_steps, + } + + +# ============================================================================== +# MAIN FASTCHECK PIPELINE +# ============================================================================== + + +def generate_fastcheck_pipeline(test_steps, config) -> List[Dict[str, Any]]: + """Generate complete fastcheck pipeline.""" + steps = [] + + # Main build only + steps.append(build_main_image(config)) + + # Neuron test (at the top, before regular tests) + steps.append({ + "block": "Run Neuron Test", + "depends_on": None, + 
"key": "run-neuron-test", + }) + steps.append({ + "label": "Neuron Test", + "depends_on": "run-neuron-test", + "agents": {"queue": QUEUE_NEURON}, + "command": f"bash {SCRIPT_RUN_NEURON}", + "soft_fail": False, + }) + + # Fast-check tests (run immediately) + steps.extend(generate_fast_check_tests(test_steps, config)) + + # Blocked tests + steps.extend(generate_blocked_tests(test_steps, config)) + + # Hardware tests (TPU V0, TPU V1, GH200 - without Intel) + hw_steps = generate_hardware_tests(config) + # Hardware tests are: TPU V0 (3 steps), TPU V1 (3 steps), GH200 (2 steps), Intel CPU (2 steps), Intel GPU (2 steps) + # We want TPU V0, TPU V1, GH200 before AMD, then Intel after AMD + # So take first 8 steps (TPU V0 + TPU V1 + GH200) + steps.extend(hw_steps[:8]) + + # AMD group + steps.append(generate_amd_group(test_steps, config)) + + # Intel tests (remaining hardware tests) + steps.extend(hw_steps[8:]) + + return steps + diff --git a/buildkite/pipeline_generator/pipeline_generator.py b/buildkite/pipeline_generator/pipeline_generator.py index 24611220..109e8dcb 100644 --- a/buildkite/pipeline_generator/pipeline_generator.py +++ b/buildkite/pipeline_generator/pipeline_generator.py @@ -1,91 +1,127 @@ -import click -import os -import re -from typing import List, Optional, Union -import yaml -from pydantic import BaseModel, field_validator +"""Main pipeline generator - simplified entry point.""" -from .step import BuildkiteStep, BuildkiteBlockStep, TestStep -from .utils import VLLM_ECR_URL, VLLM_ECR_REPO, AgentQueue -from .pipeline_generator_helper import get_build_commands +import os +from typing import Any, Dict, List -class PipelineGeneratorConfig: - def __init__( - self, - container_registry: str, - container_registry_repo: str, - commit: str, - list_file_diff: List[str], - run_all: bool = False, - ): - self.run_all = run_all - self.list_file_diff = list_file_diff - self.container_registry = container_registry - self.container_registry_repo = container_registry_repo - 
self.commit = commit +import click +import yaml - @property - def container_image(self): - return f"{self.container_registry}/{self.container_registry_repo}:{self.commit}" - - def validate(self): - """Validate the configuration.""" - # Check if commit is a valid Git commit hash - pattern = r"^[0-9a-f]{40}$" - if not re.match(pattern, self.commit): - raise ValueError(f"Commit {self.commit} is not a valid Git commit hash") +from .config import VLLM_ECR_REPO, VLLM_ECR_URL, PipelineGeneratorConfig, PipelineMode +from .models import TestStep +from .modes.amd import generate_amd_pipeline +from .modes.ci import generate_ci_pipeline +from .modes.fastcheck import generate_fastcheck_pipeline class PipelineGenerator: - def __init__( - self, - config: PipelineGeneratorConfig - ): + """Compatibility wrapper for old PipelineGenerator interface.""" + + def __init__(self, config: PipelineGeneratorConfig): config.validate() self.config = config + + def generate(self, test_steps: List[TestStep]) -> List[Dict[str, Any]]: + """Generate pipeline based on mode.""" + if self.config.pipeline_mode == PipelineMode.AMD: + return generate_amd_pipeline(test_steps, self.config) + elif self.config.pipeline_mode == PipelineMode.FASTCHECK: + return generate_fastcheck_pipeline(test_steps, self.config) + else: + return generate_ci_pipeline(test_steps, self.config) - def generate_build_step(self) -> BuildkiteStep: - """Build the Docker image and push it to container registry.""" - build_commands = get_build_commands(self.config.container_registry, self.config.commit, self.config.container_image) - - return BuildkiteStep( - label=":docker: build image", - key="build", - agents={"queue": AgentQueue.AWS_CPU.value}, - env={"DOCKER_BUILDKIT": "1"}, - retry={ - "automatic": [ - {"exit_status": -1, "limit": 2}, - {"exit_status": -10, "limit": 2} - ] - }, - commands=build_commands, - depends_on=None, - ) def read_test_steps(file_path: str) -> List[TestStep]: - """Read test steps from test pipeline yaml and 
parse them into TestStep objects.""" + """Read test steps from test pipeline yaml.""" with open(file_path, "r") as f: content = yaml.safe_load(f) return [TestStep(**step) for step in content["steps"]] -def write_buildkite_steps(steps: List[Union[BuildkiteStep, BuildkiteBlockStep]], file_path: str) -> None: - """Write the buildkite steps to the Buildkite pipeline yaml file.""" - buildkite_steps_dict = {"steps": [step.dict(exclude_none=True) for step in steps]} + +def write_pipeline(steps: List[Dict[str, Any]], file_path: str) -> None: + """Write pipeline steps to yaml file.""" + pipeline = {"steps": steps} with open(file_path, "w") as f: - yaml.dump(buildkite_steps_dict, f, sort_keys=False) + yaml.dump(pipeline, f, sort_keys=False, default_flow_style=False, allow_unicode=True) + + +# Alias for backward compatibility +write_buildkite_pipeline = write_pipeline + @click.command() -@click.option("--test_path", type=str, required=True, help="Path to the test pipeline yaml file") -@click.option("--run_all", type=str, help="If set to 1, run all tests") -@click.option("--list_file_diff", type=str, help="List of files in the diff between current branch and main") -def main(test_path: str, external_hardware_test_path: str, run_all: str, list_file_diff: str): +@click.option("--test_path", type=str, default=".buildkite/test-pipeline.yaml", help="Path to test pipeline yaml") +@click.option("--run_all", type=str, default="0", help="Run all tests") +@click.option("--nightly", type=str, default="0", help="Run nightly tests") +@click.option("--list_file_diff", type=str, default="", help="List of changed files (pipe-separated)") +@click.option("--mirror_hw", type=str, default="amdexperimental", help="Mirror hardware") +@click.option("--fail_fast", type=str, default="false", help="Enable fail fast mode") +@click.option("--vllm_use_precompiled", type=str, default="0", help="Use precompiled wheels") +@click.option("--cov_enabled", type=str, default="0", help="Enable coverage") 
+@click.option("--vllm_ci_branch", type=str, default="main", help="CI branch") +@click.option("--pipeline_mode", type=str, default="ci", help="Pipeline mode: ci, fastcheck, or amd") +@click.option("--output", type=str, default=".buildkite/pipeline.yaml", help="Output path") +def main( + test_path: str, + run_all: str, + nightly: str, + list_file_diff: str, + mirror_hw: str, + fail_fast: str, + vllm_use_precompiled: str, + cov_enabled: str, + vllm_ci_branch: str, + pipeline_mode: str, + output: str, +): + """Generate Buildkite pipeline from test configuration.""" test_steps = read_test_steps(test_path) - - pipeline_generator_config = PipelineGeneratorConfig( + + # Get environment variables + commit = os.getenv("BUILDKITE_COMMIT", "0" * 40) + branch = os.getenv("BUILDKITE_BRANCH", "main") + + # Parse file diff + file_diff = list_file_diff.split("|") if list_file_diff else [] + + # Parse pipeline mode + if pipeline_mode == "fastcheck": + mode = PipelineMode.FASTCHECK + elif pipeline_mode == "amd": + mode = PipelineMode.AMD + else: + mode = PipelineMode.CI + + # Create config + config = PipelineGeneratorConfig( run_all=run_all == "1", - list_file_diff=list_file_diff, + nightly=nightly == "1", + list_file_diff=file_diff, container_registry=VLLM_ECR_URL, container_registry_repo=VLLM_ECR_REPO, - commit=os.getenv("BUILDKITE_COMMIT"), + commit=commit, + branch=branch, + mirror_hw=mirror_hw, + fail_fast=fail_fast == "true", + vllm_use_precompiled=vllm_use_precompiled, + cov_enabled=cov_enabled == "1", + vllm_ci_branch=vllm_ci_branch, + pipeline_mode=mode, ) + + config.validate() + + # Generate pipeline based on mode + if mode == PipelineMode.AMD: + steps = generate_amd_pipeline(test_steps, config) + elif mode == PipelineMode.FASTCHECK: + steps = generate_fastcheck_pipeline(test_steps, config) + else: + steps = generate_ci_pipeline(test_steps, config) + + # Write pipeline + write_pipeline(steps, output) + print(f"Pipeline generated at {output}") + + +if __name__ == 
"__main__": + main() diff --git a/buildkite/pipeline_generator/pipeline_generator_helper.py b/buildkite/pipeline_generator/pipeline_generator_helper.py deleted file mode 100644 index e9165030..00000000 --- a/buildkite/pipeline_generator/pipeline_generator_helper.py +++ /dev/null @@ -1,74 +0,0 @@ -from typing import Dict, List, Optional - -from .utils import GPUType, get_agent_queue, get_full_test_command, get_multi_node_test_command -from .step import TestStep, BuildkiteStep, get_step_key -from .plugin import get_docker_plugin_config, get_kubernetes_plugin_config - -def get_plugin_config( - container_image: str, - no_gpu: Optional[bool] = None, - gpu_type: Optional[GPUType] = None, - num_gpus: Optional[int] = None - ) -> Dict: - """Returns the plugin configuration for the Buildkite step.""" - if gpu_type and gpu_type == GPUType.A100 and num_gpus: - return get_kubernetes_plugin_config( - container_image, - num_gpus - ) - return get_docker_plugin_config( - container_image, - no_gpu or False, - ) - - -def convert_test_step_to_buildkite_step(step: TestStep, container_image: str) -> BuildkiteStep: - """Convert TestStep into BuildkiteStep.""" - buildkite_step = BuildkiteStep( - label=step.label, - key=get_step_key(step.label), - commands=step.commands, - parallelism=step.parallelism, - soft_fail=step.soft_fail, - plugins=[get_plugin_config(container_image, step.no_gpu, step.gpu, step.num_gpus)], - agents={"queue": get_agent_queue(step.no_gpu, step.gpu, step.num_gpus).value} - ) - # If test is multi-node, configure step to run with custom script - if step.num_nodes and step.num_nodes > 1: - buildkite_step.commands = [get_multi_node_test_command( - step.commands, - step.working_dir, - step.num_nodes, - step.num_gpus, - container_image - ) - ] - buildkite_step.plugins = None - return buildkite_step - -def get_build_commands(container_registry: str, buildkite_commit: str, container_image: str) -> List[str]: - ecr_login_command = ( - "aws ecr-public get-login-password 
--region us-east-1 | " - f"docker login --username AWS --password-stdin {container_registry}" - ) - image_check_command = f"""#!/bin/bash -if [[ -z $(docker manifest inspect {container_image}) ]]; then -echo "Image not found, proceeding with build..." -else -echo "Image found" -exit 0 -fi -""" - docker_build_command = ( - f"docker build " - f"--file docker/Dockerfile " - f"--build-arg max_jobs=64 " - f"--build-arg buildkite_commit={buildkite_commit} " - f"--build-arg USE_SCCACHE=1 " - f"--tag {container_image} " - f"--target test " - f"--progress plain ." - ) - # TODO: Stop using . in docker build command - docker_push_command = f"docker push {container_image}" - return [ecr_login_command, image_check_command, docker_build_command, docker_push_command] diff --git a/buildkite/pipeline_generator/plugin.py b/buildkite/pipeline_generator/plugin.py deleted file mode 100644 index ac7a29b2..00000000 --- a/buildkite/pipeline_generator/plugin.py +++ /dev/null @@ -1,114 +0,0 @@ -from pydantic import BaseModel, Field -from typing import List, Dict, Any, Optional - -from .utils import HF_HOME - -DOCKER_PLUGIN_NAME = "docker#v5.2.0" -KUBERNETES_PLUGIN_NAME = "kubernetes" - -DEFAULT_DOCKER_ENVIRONMENT_VARIBLES = [ - f"HF_HOME={HF_HOME}", - "VLLM_USAGE_SOURCE=ci-test", - "HF_TOKEN", - "BUILDKITE_ANALYTICS_TOKEN" -] -DEFAULT_DOCKER_VOLUMES = [ - "/dev/shm:/dev/shm", - f"{HF_HOME}:{HF_HOME}" -] -DEFAULT_KUBERNETES_CONTAINER_VOLUME_MOUNTS = [ - {"name": "devshm", "mountPath": "/dev/shm"}, - {"name": "hf-cache", "mountPath": HF_HOME} -] -DEFAULT_KUBERNETES_CONTAINER_ENVIRONMENT_VARIABLES = [ - {"name": "HF_HOME", "value": HF_HOME}, - {"name": "VLLM_USAGE_SOURCE", "value": "ci-test"}, - { - "name": "HF_TOKEN", - "valueFrom": { - "secretKeyRef": { - "name": "hf-token-secret", - "key": "token" - } - } - }, -] -DEFAULT_KUBERNETES_POD_VOLUMES = [ - {"name": "devshm", "emptyDir": {"medium": "Memory"}}, - {"name": "hf-cache", "hostPath": {"path": HF_HOME, "type": "Directory"}} -] 
-DEFAULT_KUBERNETES_NODE_SELECTOR = {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-80GB"} - - -class DockerPluginConfig(BaseModel): - """ - Configuration for Docker plugin running in a Buildkite step. - The specification is based on: - https://github.com/buildkite-plugins/docker-buildkite-plugin?tab=readme-ov-file#configuration - """ - image: str = "" - always_pull: bool = Field(default=True, alias="always-pull") - propagate_environment: bool = Field(default=True, alias="propagate-environment") - gpus: Optional[str] = "all" - mount_buildkite_agent: Optional[bool] = Field(default=False, alias="mount-buildkite-agent") - environment: List[str] = DEFAULT_DOCKER_ENVIRONMENT_VARIBLES - volumes: List[str] = DEFAULT_DOCKER_VOLUMES - shell: List[str] = ["/bin/bash", "-c"] - - -class KubernetesPodContainerConfig(BaseModel): - """ - Configuration for a container running in a Kubernetes pod. - """ - image: str - resources: Dict[str, Dict[str, int]] - volume_mounts: List[Dict[str, str]] = Field( - alias="volumeMounts", - default=DEFAULT_KUBERNETES_CONTAINER_VOLUME_MOUNTS - ) - env: List[Dict[str, str]] = Field( - default=DEFAULT_KUBERNETES_CONTAINER_ENVIRONMENT_VARIABLES, - ) - - -class KubernetesPodSpec(BaseModel): - """ - Configuration for a Kubernetes pod running in a Buildkite step. - """ - containers: List[KubernetesPodContainerConfig] - priority_class_name: str = Field(default="ci", alias="priorityClassName") - node_selector: Dict[str, Any] = Field( - default=DEFAULT_KUBERNETES_NODE_SELECTOR, - alias="nodeSelector" - ) - volumes: List[Dict[str, Any]] = Field( - default=DEFAULT_KUBERNETES_POD_VOLUMES - ) - - -class KubernetesPluginConfig(BaseModel): - """ - Configuration for Kubernetes plugin running in a Buildkite step. 
- """ - pod_spec: KubernetesPodSpec = Field(alias="podSpec") - - -def get_kubernetes_plugin_config(container_image: str, num_gpus: int) -> Dict: - pod_spec = KubernetesPodSpec( - containers=[ - KubernetesPodContainerConfig( - image=container_image, - resources={"limits": {"nvidia.com/gpu": num_gpus}} - ) - ] - ) - return {KUBERNETES_PLUGIN_NAME: KubernetesPluginConfig(podSpec=pod_spec).dict(by_alias=True)} - - -def get_docker_plugin_config(docker_image_path: str, no_gpu: bool) -> Dict: - docker_plugin_config = DockerPluginConfig( - image=docker_image_path, - ) - if no_gpu: - docker_plugin_config.gpus = None - return {DOCKER_PLUGIN_NAME: docker_plugin_config.dict(exclude_none=True, by_alias=True)} diff --git a/buildkite/pipeline_generator/pyproject.toml b/buildkite/pipeline_generator/pyproject.toml new file mode 100644 index 00000000..40784d46 --- /dev/null +++ b/buildkite/pipeline_generator/pyproject.toml @@ -0,0 +1,38 @@ +[tool.ruff] +# Exclude common directories +exclude = [ + ".git", + "__pycache__", + ".pytest_cache", + "*.egg-info", +] + +line-length = 160 # Allow longer lines for type annotations and strings + +[tool.ruff.lint] +# Enable pycodestyle (E), Pyflakes (F), and isort (I) rules +select = ["E", "F", "I"] + +# Ignore specific rules for certain files +[tool.ruff.lint.per-file-ignores] +# Integration tests need to modify sys.path before importing (E402) +"tests/test_integration_*.py" = ["E402"] +# Modes use star imports for simplicity (F405, F403) +"modes/*.py" = ["F405", "F403"] +# Config and helpers have long strings (E501) +"config.py" = ["E501"] +"helpers/coverage.py" = ["E501"] +"tests/test_integration_comprehensive.py" = ["E402", "E501"] +"tests/test_integration_fastcheck.py" = ["E402", "E501"] +"tests/test_integration_amd.py" = ["E402", "E501"] + +[tool.mypy] +python_version = "3.9" +warn_return_any = true +warn_unused_configs = true +disallow_untyped_defs = false # Allow gradual typing +exclude = [ + "tests/", + "__pycache__/", +] + diff --git 
a/buildkite/pipeline_generator/step.py b/buildkite/pipeline_generator/step.py deleted file mode 100644 index 4590331c..00000000 --- a/buildkite/pipeline_generator/step.py +++ /dev/null @@ -1,98 +0,0 @@ -from pydantic import BaseModel, Field, root_validator, model_validator -from typing import List, Dict, Any, Optional -from typing_extensions import Self - -from .utils import AgentQueue, GPUType - -BUILD_STEP_KEY = "build" -DEFAULT_TEST_WORKING_DIR = "/vllm-workspace/tests" - -class TestStep(BaseModel): - """This class represents a test step defined in the test configuration file.""" - label: str - working_dir: Optional[str] = DEFAULT_TEST_WORKING_DIR - optional: Optional[bool] = False - fast_check: Optional[bool] = None - mirror_hardwares: Optional[List[str]] = None - no_gpu: Optional[bool] = None - gpu: Optional[GPUType] = None - num_gpus: Optional[int] = None - num_nodes: Optional[int] = None - source_file_dependencies: Optional[List[str]] = None - soft_fail: Optional[bool] = None - parallelism: Optional[int] = None - command: Optional[str] = None - commands: Optional[List[str]] = None - - @model_validator(mode="before") - @classmethod - def validate_and_convert_command(cls, values) -> Any: - """ - Validate that either 'command' or 'commands' is defined. - If 'command' is defined, convert it to 'commands'. 
- """ - if not values.get("command") and not values.get("commands"): - raise ValueError("Either 'command' or 'commands' must be defined.") - if values.get("command") and values.get("commands"): - raise ValueError("Only one of 'command' or 'commands' can be defined.") - if values.get("command"): - values["commands"] = [values["command"]] - del values["command"] - return values - - @model_validator(mode="after") - def validate_gpu(self) -> Self: - if self.gpu and self.no_gpu: - raise ValueError("Both 'gpu' and 'no_gpu' cannot be defined together.") - return self - - @model_validator(mode="after") - def validate_multi_node(self) -> Self: - if self.num_nodes and not self.num_gpus: - raise ValueError("'num_gpus' must be defined if 'num_nodes' is defined.") - if self.num_nodes and len(self.commands) != self.num_nodes: - raise ValueError("Number of commands must match the number of nodes.") - return self - - -class BuildkiteStep(BaseModel): - """This class represents a step in Buildkite format.""" - label: str - agents: Dict[str, str] = {"queue": AgentQueue.AWS_CPU.value} - commands: List[str] - key: Optional[str] = None - plugins: Optional[List[Dict]] = None - parallelism: Optional[int] = None - soft_fail: Optional[bool] = None - depends_on: Optional[str] = "build" - env: Optional[Dict[str, str]] = None - retry: Optional[Dict[str, Any]] = None - - @model_validator(mode="after") - def validate_agent_queue(self) -> Self: - queue = self.agents.get("queue") - if not AgentQueue(queue): - raise ValueError(f"Invalid agent queue: {queue}") - - -class BuildkiteBlockStep(BaseModel): - """This class represents a block step in Buildkite format.""" - block: str - key: str - depends_on: Optional[str] = BUILD_STEP_KEY - - -def get_step_key(step_label: str) -> str: - step_key = "" - skip_chars = "()% " - for char in step_label.lower(): - if char in ", " and step_key[-1] != "-": - step_key += "-" - elif char not in skip_chars: - step_key += char - - return step_key - - -def 
get_block_step(step_label: str) -> BuildkiteBlockStep: - return BuildkiteBlockStep(block=f"Run {step_label}", key=f"block-{get_step_key(step_label)}") diff --git a/buildkite/pipeline_generator/tests/__init__.py b/buildkite/pipeline_generator/tests/__init__.py new file mode 100644 index 00000000..f74686ff --- /dev/null +++ b/buildkite/pipeline_generator/tests/__init__.py @@ -0,0 +1 @@ +"""Tests for pipeline generator.""" diff --git a/buildkite/pipeline_generator/tests/conftest.py b/buildkite/pipeline_generator/tests/conftest.py new file mode 100644 index 00000000..f162ba7b --- /dev/null +++ b/buildkite/pipeline_generator/tests/conftest.py @@ -0,0 +1,64 @@ +"""Shared test fixtures and configuration.""" + +import pytest + +from ..config import PipelineGeneratorConfig, PipelineMode +from ..models import TestStep + +TEST_COMMIT = "abcdef0123456789abcdef0123456789abcdef01" +TEST_CONTAINER_REGISTRY = "container.registry" +TEST_CONTAINER_REGISTRY_REPO = "test" + + +@pytest.fixture +def pipeline_config(): + """Create a basic pipeline configuration for testing.""" + return PipelineGeneratorConfig( + container_registry=TEST_CONTAINER_REGISTRY, + container_registry_repo=TEST_CONTAINER_REGISTRY_REPO, + commit=TEST_COMMIT, + branch="main", + list_file_diff=[], + run_all=False, + nightly=False, + pipeline_mode=PipelineMode.CI, + ) + + +@pytest.fixture +def simple_test_step(): + """Create a simple test step for testing.""" + return TestStep( + label="Test Step", + commands=["pytest -v test_sample.py"], + working_dir="/vllm-workspace/tests", + ) + + +@pytest.fixture +def multi_node_test_step(): + """Create a multi-node test step for testing.""" + return TestStep( + label="Multi-Node Test", + commands=[["pytest test1.py"], ["pytest test2.py"]], + num_nodes=2, + num_gpus=4, + working_dir="/vllm-workspace/tests", + ) + + +@pytest.fixture +def gpu_test_step(): + """Create a GPU test step for testing.""" + return TestStep(label="GPU Test", commands=["pytest -v test_gpu.py"], gpu="a100", 
num_gpus=2) + + +@pytest.fixture +def optional_test_step(): + """Create an optional test step for testing.""" + return TestStep( + label="Optional Test", + commands=["pytest -v test_optional.py"], + optional=True, + source_file_dependencies=["vllm/engine/engine.py"], + ) diff --git a/buildkite/pipeline_generator/tests/run_tests.sh b/buildkite/pipeline_generator/tests/run_tests.sh new file mode 100755 index 00000000..e1d7650e --- /dev/null +++ b/buildkite/pipeline_generator/tests/run_tests.sh @@ -0,0 +1,41 @@ +#!/bin/bash +# Run all pipeline generator unit tests + +set -e + +# Get the directory of this script +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$(dirname "$SCRIPT_DIR")" + +cd "$PROJECT_ROOT" + +echo "==================================" +echo "Pipeline Generator Unit Tests" +echo "==================================" +echo "" + +# Check if pytest is installed +if ! python -m pytest --version > /dev/null 2>&1; then + echo "ERROR: pytest not installed" + echo "Install with: pip install pytest pytest-cov" + exit 1 +fi + +# Run tests with coverage +echo "Running unit tests with coverage..." +python -m pytest tests/ \ + -v \ + --cov=. 
\ + --cov-report=term \ + --cov-report=html:htmlcov \ + --cov-config=.coveragerc \ + "$@" + +echo "" +echo "==================================" +echo "Test Results" +echo "==================================" +echo "HTML coverage report: htmlcov/index.html" +echo "" + + diff --git a/buildkite/tests/pipeline_generator/test_files/expected_pipeline.yaml b/buildkite/pipeline_generator/tests/test_files/expected_pipeline.yaml similarity index 100% rename from buildkite/tests/pipeline_generator/test_files/expected_pipeline.yaml rename to buildkite/pipeline_generator/tests/test_files/expected_pipeline.yaml diff --git a/buildkite/tests/pipeline_generator/test_files/test-pipeline.yaml b/buildkite/pipeline_generator/tests/test_files/test-pipeline.yaml similarity index 100% rename from buildkite/tests/pipeline_generator/test_files/test-pipeline.yaml rename to buildkite/pipeline_generator/tests/test_files/test-pipeline.yaml diff --git a/buildkite/pipeline_generator/tests/test_integration_comprehensive.py b/buildkite/pipeline_generator/tests/test_integration_comprehensive.py new file mode 100755 index 00000000..9af686cd --- /dev/null +++ b/buildkite/pipeline_generator/tests/test_integration_comprehensive.py @@ -0,0 +1,988 @@ +#!/usr/bin/env python3 +""" +Comprehensive test suite for pipeline generator. +Tests all scenarios, flags, and edge cases with detailed YAML diff output. 
+""" + +import difflib +import os +import subprocess +import sys +from dataclasses import dataclass +from typing import Any, Dict, List, Tuple, cast + +import pytest +import yaml + +from buildkite.pipeline_generator.config import VLLM_ECR_REPO, VLLM_ECR_URL, PipelineGeneratorConfig +from buildkite.pipeline_generator.pipeline_generator import ( + PipelineGenerator, + read_test_steps, + write_buildkite_pipeline, +) + +# Add parent directories to path for imports +# We're in pipeline_generator/tests/, need to go up to ci-infra/buildkite +current_dir = os.path.dirname(os.path.abspath(__file__)) # tests/ +pipeline_gen_dir = os.path.dirname(current_dir) # pipeline_generator/ +buildkite_dir = os.path.dirname(pipeline_gen_dir) # buildkite/ +ci_infra_dir = os.path.dirname(buildkite_dir) # ci-infra/ +sys.path.insert(0, ci_infra_dir) + + +@dataclass +class Scenario: + """Represents a test scenario with specific configuration.""" + + name: str + branch: str = "main" + run_all: str = "0" + nightly: str = "0" + list_file_diff: str = "" + mirror_hw: str = "amdexperimental" + fail_fast: str = "false" + vllm_use_precompiled: str = "0" + cov_enabled: str = "0" + vllm_ci_branch: str = "main" + commit: str = "0" * 40 + description: str = "" + + +def get_all_test_scenarios() -> List[Scenario]: + """Get comprehensive list of all test scenarios.""" + scenarios = [] + + # ==================== Branch Variations ==================== + scenarios.append( + Scenario( + name="main_branch_default", + description="Main branch with default settings", + branch="main", + ) + ) + + scenarios.append( + Scenario( + name="pr_branch_default", + description="PR branch with default settings", + branch="feature-branch", + ) + ) + + # ==================== Run All Mode ==================== + scenarios.append( + Scenario( + name="run_all_main", + description="Run all tests on main branch", + branch="main", + run_all="1", + ) + ) + + scenarios.append( + Scenario( + name="run_all_pr", + description="Run all 
tests on PR branch", + branch="feature-branch", + run_all="1", + ) + ) + + # ==================== Nightly Mode ==================== + scenarios.append( + Scenario( + name="nightly_main", + description="Nightly build on main branch", + branch="main", + nightly="1", + ) + ) + + scenarios.append( + Scenario( + name="nightly_pr", + description="Nightly build on PR branch", + branch="feature-branch", + nightly="1", + ) + ) + + scenarios.append( + Scenario( + name="nightly_run_all", + description="Nightly with run_all enabled", + branch="main", + nightly="1", + run_all="1", + ) + ) + + # ==================== File Changes ==================== + scenarios.append( + Scenario( + name="vllm_core_changes", + description="Changes to vllm core code", + branch="feature-branch", + list_file_diff="vllm/engine/llm_engine.py|vllm/worker/worker.py|vllm/attention/backends/flash_attn.py", + ) + ) + + scenarios.append( + Scenario( + name="test_files_only", + description="Only test files changed (intelligent targeting)", + branch="feature-branch", + list_file_diff="tests/engine/test_engine.py|tests/test_config.py|tests/test_sequence.py", + ) + ) + + scenarios.append( + Scenario( + name="csrc_changes", + description="Changes to csrc (should trigger more tests)", + branch="feature-branch", + list_file_diff="csrc/attention/attention_kernels.cu|csrc/quantization/fp8/fp8_kernels.cu", + ) + ) + + scenarios.append( + Scenario( + name="multimodal_changes", + description="Changes to multimodal code", + branch="feature-branch", + list_file_diff="vllm/multimodal/base.py|tests/multimodal/test_mapper.py", + ) + ) + + scenarios.append( + Scenario( + name="distributed_changes", + description="Changes to distributed code", + branch="feature-branch", + list_file_diff="vllm/distributed/parallel_state.py|tests/distributed/test_pynccl.py", + ) + ) + + # ==================== Build Configuration ==================== + scenarios.append( + Scenario( + name="fail_fast_enabled", + description="Fail fast mode 
enabled", + branch="feature-branch", + run_all="1", + fail_fast="true", + ) + ) + + scenarios.append( + Scenario( + name="precompiled_wheels", + description="Use precompiled wheels", + branch="feature-branch", + vllm_use_precompiled="1", + ) + ) + + scenarios.append( + Scenario( + name="no_precompiled_wheels", + description="Build from source", + branch="feature-branch", + vllm_use_precompiled="0", + ) + ) + + # ==================== Coverage ==================== + scenarios.append( + Scenario( + name="coverage_enabled", + description="Coverage collection enabled", + branch="feature-branch", + run_all="1", + cov_enabled="1", + ) + ) + + scenarios.append( + Scenario( + name="coverage_with_tests_only", + description="Coverage with only test changes", + branch="feature-branch", + cov_enabled="1", + list_file_diff="tests/models/test_transformers.py|tests/engine/test_engine.py", + ) + ) + + # ==================== CI Branch Variations ==================== + scenarios.append( + Scenario( + name="custom_ci_branch", + description="Custom CI branch", + branch="main", + vllm_ci_branch="dev", + ) + ) + + # ==================== Combined Scenarios ==================== + scenarios.append( + Scenario( + name="nightly_coverage", + description="Nightly with coverage", + branch="main", + nightly="1", + cov_enabled="1", + ) + ) + + scenarios.append( + Scenario( + name="run_all_fail_fast_precompiled", + description="Run all + fail fast + precompiled", + branch="feature-branch", + run_all="1", + fail_fast="true", + vllm_use_precompiled="1", + ) + ) + + scenarios.append( + Scenario( + name="nightly_fail_fast_coverage", + description="Nightly + fail fast + coverage", + branch="main", + nightly="1", + fail_fast="true", + cov_enabled="1", + ) + ) + + # ==================== Edge Cases ==================== + scenarios.append( + Scenario( + name="empty_file_diff", + description="Empty file diff list", + branch="feature-branch", + list_file_diff="", + ) + ) + + scenarios.append( + Scenario( + 
name="docs_only_changes", + description="Only documentation changes", + branch="feature-branch", + list_file_diff="docs/source/index.md|README.md", + ) + ) + + scenarios.append( + Scenario( + name="docker_changes", + description="Docker file changes", + branch="feature-branch", + list_file_diff="docker/Dockerfile|docker/Dockerfile.cpu", + ) + ) + + scenarios.append( + Scenario( + name="requirements_changes", + description="Requirements file changes", + branch="feature-branch", + list_file_diff="requirements/common.txt|requirements/cuda.txt", + ) + ) + + # ==================== Intelligent Test Filtering - CRITICAL ============= + scenarios.append( + Scenario( + name="intelligent_filter_engine_test", + description="Only engine test file changed - should run only that test", + branch="feature-branch", + list_file_diff="tests/engine/test_engine.py", + ) + ) + + scenarios.append( + Scenario( + name="intelligent_filter_multimodal_test", + description="Only multimodal test changed - targeted execution", + branch="feature-branch", + list_file_diff="tests/multimodal/test_mapper.py", + ) + ) + + scenarios.append( + Scenario( + name="intelligent_filter_multiple_tests", + description="Multiple test files in same directory", + branch="feature-branch", + list_file_diff="tests/engine/test_engine.py|tests/engine/test_llm_engine.py|tests/tokenization/test_tokenizers.py", + ) + ) + + scenarios.append( + Scenario( + name="intelligent_filter_v1_tests", + description="V1 test directory changes", + branch="feature-branch", + list_file_diff="tests/v1/engine/test_engine_core.py|tests/v1/core/test_scheduler.py", + ) + ) + + scenarios.append( + Scenario( + name="intelligent_filter_with_markers", + description="Test with pytest markers (should preserve markers)", + branch="feature-branch", + list_file_diff="tests/models/language/generation/test_llama.py", + ) + ) + + scenarios.append( + Scenario( + name="intelligent_filter_distributed", + description="Distributed test changes", + 
branch="feature-branch", + list_file_diff="tests/distributed/test_pynccl.py|tests/distributed/test_comm_ops.py", + ) + ) + + scenarios.append( + Scenario( + name="intelligent_filter_kernels", + description="Kernel test changes", + branch="feature-branch", + list_file_diff="tests/kernels/attention/test_flashinfer.py|tests/kernels/quantization/test_fp8.py", + ) + ) + + scenarios.append( + Scenario( + name="intelligent_filter_mixed_changes", + description="Mix of test and non-test files (should disable intelligent filtering)", + branch="feature-branch", + list_file_diff="tests/engine/test_engine.py|vllm/engine/llm_engine.py|tests/test_config.py", + ) + ) + + scenarios.append( + Scenario( + name="intelligent_filter_with_coverage", + description="Intelligent filtering + coverage", + branch="feature-branch", + cov_enabled="1", + list_file_diff="tests/samplers/test_sampling.py|tests/samplers/test_logits.py", + ) + ) + + # ==================== Pytest Integration Tests ==================== + scenarios.append( + Scenario( + name="pytest_with_sharding", + description="Tests with parallelism/sharding", + branch="feature-branch", + run_all="1", + list_file_diff="tests/lora/test_lora.py", + ) + ) + + scenarios.append( + Scenario( + name="pytest_with_markers_core_model", + description="Tests using pytest markers (core_model)", + branch="feature-branch", + run_all="1", + list_file_diff="tests/models/language/generation/test_llama.py", + ) + ) + + scenarios.append( + Scenario( + name="pytest_multimodal_markers", + description="Multimodal tests with markers", + branch="feature-branch", + run_all="1", + list_file_diff="tests/models/multimodal/generation/test_common.py", + ) + ) + + scenarios.append( + Scenario( + name="pytest_ignore_patterns", + description="Tests with --ignore patterns", + branch="feature-branch", + run_all="1", + list_file_diff="tests/entrypoints/llm/test_llm.py", + ) + ) + + scenarios.append( + Scenario( + name="pytest_specific_test_selection", + 
description="Tests selecting specific test functions", + branch="feature-branch", + run_all="1", + list_file_diff="tests/v1/engine/test_engine_core_client.py", + ) + ) + + # ==================== Multi-Node and Special Configurations ============= + scenarios.append( + Scenario( + name="multi_node_tests", + description="Multi-node test execution", + branch="feature-branch", + run_all="1", + list_file_diff="tests/distributed/test_pipeline_parallel.py", + ) + ) + + scenarios.append( + Scenario( + name="multi_gpu_tests", + description="Multi-GPU tests (2 and 4 GPU)", + branch="feature-branch", + run_all="1", + list_file_diff="tests/distributed/test_comm_ops.py", + ) + ) + + scenarios.append( + Scenario( + name="special_gpu_tests", + description="A100, H100, H200, B200 tests", + branch="feature-branch", + run_all="1", + list_file_diff="tests/quantization/test_blackwell_moe.py", + ) + ) + + # ==================== Timeout and Optional Tests ==================== + scenarios.append( + Scenario( + name="optional_tests_pr", + description="Optional tests on PR (should be blocked)", + branch="feature-branch", + run_all="0", + list_file_diff="tests/models/language/generation_ppl_test/test_ppl.py", + ) + ) + + scenarios.append( + Scenario( + name="optional_tests_nightly", + description="Optional tests on nightly (should run)", + branch="feature-branch", + nightly="1", + list_file_diff="tests/models/language/generation_ppl_test/test_ppl.py", + ) + ) + + # ==================== Source File Dependencies Edge Cases =============== + scenarios.append( + Scenario( + name="no_source_deps", + description="Tests with no source_file_dependencies (always run)", + branch="feature-branch", + list_file_diff="vllm/engine/llm_engine.py", + ) + ) + + scenarios.append( + Scenario( + name="exact_match_source_deps", + description="Exact match on source dependencies", + branch="feature-branch", + list_file_diff="vllm/entrypoints/llm.py", + ) + ) + + scenarios.append( + Scenario( + 
name="prefix_match_source_deps", + description="Prefix match on source dependencies", + branch="feature-branch", + list_file_diff="vllm/model_executor/models/llama.py", + ) + ) + + # ==================== Coverage Edge Cases ==================== + scenarios.append( + Scenario( + name="coverage_no_pytest", + description="Coverage on steps without pytest commands", + branch="feature-branch", + run_all="1", + cov_enabled="1", + list_file_diff="benchmarks/benchmark_latency.py", + ) + ) + + scenarios.append( + Scenario( + name="coverage_mixed_commands", + description="Coverage with mixed pytest and non-pytest commands", + branch="feature-branch", + run_all="1", + cov_enabled="1", + ) + ) + + # ==================== AMD Mirror Hardware ==================== + scenarios.append( + Scenario( + name="amd_mirror_disabled", + description="AMD mirror hardware disabled", + branch="feature-branch", + run_all="1", + mirror_hw="none", + ) + ) + + scenarios.append( + Scenario( + name="amd_mirror_production", + description="AMD mirror with production hardware", + branch="feature-branch", + run_all="1", + mirror_hw="amdproduction", + ) + ) + + # ==================== Torch Nightly Specific ==================== + scenarios.append( + Scenario( + name="torch_nightly_tests_only", + description="Only torch_nightly marked tests", + branch="feature-branch", + list_file_diff="tests/compile/test_fusion.py", + ) + ) + + scenarios.append( + Scenario( + name="torch_nightly_with_nightly_mode", + description="Torch nightly tests in nightly mode", + branch="main", + nightly="1", + list_file_diff="tests/compile/test_basic_correctness.py", + ) + ) + + # ==================== Fast Check Tests ==================== + scenarios.append( + Scenario( + name="fast_check_tests", + description="Tests marked with fast_check", + branch="feature-branch", + list_file_diff="tests/basic_correctness/test_basic_correctness.py", + ) + ) + + # ==================== Complex Combined Scenarios ==================== + 
scenarios.append( + Scenario( + name="complex_all_flags", + description="All flags enabled together", + branch="main", + run_all="1", + nightly="1", + fail_fast="true", + cov_enabled="1", + vllm_use_precompiled="1", + ) + ) + + scenarios.append( + Scenario( + name="intelligent_filter_coverage_fail_fast", + description="Intelligent filtering + coverage + fail-fast", + branch="feature-branch", + fail_fast="true", + cov_enabled="1", + list_file_diff="tests/engine/test_engine.py|tests/test_config.py", + ) + ) + + return scenarios + + +def run_python_pipeline(scenario: Scenario, test_pipeline_path: str, output_path: str) -> Tuple[bool, str]: + """Run Python pipeline generator.""" + os.environ["BUILDKITE_COMMIT"] = scenario.commit + os.environ["BUILDKITE_BRANCH"] = scenario.branch + + try: + test_steps = read_test_steps(test_pipeline_path) + file_diff = scenario.list_file_diff.split("|") if scenario.list_file_diff else [] + + config = PipelineGeneratorConfig( + run_all=scenario.run_all == "1", + nightly=scenario.nightly == "1", + list_file_diff=file_diff, + container_registry=VLLM_ECR_URL, + container_registry_repo=VLLM_ECR_REPO, + commit=scenario.commit, + branch=scenario.branch, + mirror_hw=scenario.mirror_hw, + fail_fast=scenario.fail_fast == "true", + vllm_use_precompiled=scenario.vllm_use_precompiled, + cov_enabled=scenario.cov_enabled == "1", + vllm_ci_branch=scenario.vllm_ci_branch, + ) + + generator = PipelineGenerator(config) + steps = generator.generate(test_steps) + write_buildkite_pipeline(steps, output_path) + return True, "" + except Exception as e: + import traceback + + return False, f"Error: {e}\n{traceback.format_exc()}" + + +def run_jinja_pipeline(scenario: Scenario, template_path: str, test_pipeline_path: str, output_path: str) -> Tuple[bool, str]: + """Run minijinja-cli to generate pipeline from jinja template.""" + cmd = [ + "minijinja-cli", + template_path, + test_pipeline_path, + "-D", + f"branch={scenario.branch}", + "-D", + 
f"list_file_diff={scenario.list_file_diff}", + "-D", + f"run_all={scenario.run_all}", + "-D", + f"nightly={scenario.nightly}", + "-D", + f"mirror_hw={scenario.mirror_hw}", + "-D", + f"fail_fast={scenario.fail_fast}", + "-D", + f"vllm_use_precompiled={scenario.vllm_use_precompiled}", + "-D", + f"cov_enabled={scenario.cov_enabled}", + "-D", + f"vllm_ci_branch={scenario.vllm_ci_branch}", + ] + + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + if result.returncode != 0: + return False, f"minijinja error: {result.stderr}" + + # Remove blank lines + lines = [line for line in result.stdout.split("\n") if line.strip()] + with open(output_path, "w") as f: + f.write("\n".join(lines)) + return True, "" + except subprocess.TimeoutExpired: + return False, "Timeout" + except Exception as e: + return False, str(e) + + +def show_yaml_diff(jinja_file: str, python_file: str, max_lines: int = 100): + """Show detailed YAML diff between two files.""" + with open(jinja_file, "r") as f: + jinja_lines = f.readlines() + with open(python_file, "r") as f: + python_lines = f.readlines() + + diff = list( + difflib.unified_diff( + jinja_lines, + python_lines, + fromfile="jinja_output", + tofile="python_output", + lineterm="\n", + ) + ) + + if len(diff) > max_lines: + print(f" Showing first {max_lines} diff lines (total: {len(diff)}):") + for line in diff[:max_lines]: + print(f" {line}", end="") + print(f" ... 
({len(diff) - max_lines} more lines)") + else: + for line in diff: + print(f" {line}", end="") + + +def analyze_step_differences(jinja_file: str, python_file: str) -> Dict[str, Any]: + """Analyze differences between pipelines at step level.""" + with open(jinja_file, "r") as f: + jinja_data = yaml.safe_load(f) + with open(python_file, "r") as f: + python_data = yaml.safe_load(f) + + jinja_steps = jinja_data.get("steps", []) + python_steps = python_data.get("steps", []) + + analysis = { + "total_steps": {"jinja": len(jinja_steps), "python": len(python_steps)}, + "step_labels": {"jinja": [], "python": []}, + "missing_in_python": [], + "missing_in_jinja": [], + "different_fields": [], + } + + # Collect labels + jinja_label_map = {} + python_label_map: Dict[str, int] = {} + + for i, step_obj in enumerate(jinja_steps): + step = cast(Dict[str, Any], step_obj) + if "label" in step: + analysis["step_labels"]["jinja"].append(step["label"]) # type: ignore + jinja_label_map[step["label"]] = i # type: ignore + elif "group" in step: + label = f"GROUP:{step['group']}" # type: ignore + analysis["step_labels"]["jinja"].append(label) # type: ignore + jinja_label_map[label] = i + elif "block" in step: + label = f"BLOCK:{step['block']}" # type: ignore + analysis["step_labels"]["jinja"].append(label) # type: ignore + jinja_label_map[label] = i + + for i, step_obj in enumerate(python_steps): + step = cast(Dict[str, Any], step_obj) + if "label" in step: + analysis["step_labels"]["python"].append(step["label"]) # type: ignore + python_label_map[step["label"]] = i # type: ignore + elif "group" in step: + label = f"GROUP:{step['group']}" # type: ignore + analysis["step_labels"]["python"].append(label) # type: ignore + python_label_map[label] = i + elif "block" in step: + label = f"BLOCK:{step['block']}" # type: ignore + analysis["step_labels"]["python"].append(label) # type: ignore + python_label_map[label] = i + + # Find missing steps + jinja_labels = set(analysis["step_labels"]["jinja"]) 
# type: ignore + python_labels = set(analysis["step_labels"]["python"]) # type: ignore + + analysis["missing_in_python"] = sorted(jinja_labels - python_labels) + analysis["missing_in_jinja"] = sorted(python_labels - jinja_labels) + + # Find steps with different fields + common_labels = jinja_labels & python_labels + for label in sorted(common_labels): + jinja_step = jinja_steps[jinja_label_map[label]] + python_step = python_steps[python_label_map[label]] + + # Normalize for comparison + jinja_normalized = yaml.dump(jinja_step, default_flow_style=False) + python_normalized = yaml.dump(python_step, default_flow_style=False) + + if jinja_normalized != python_normalized: + analysis["different_fields"].append(label) # type: ignore + + return analysis + + +def normalize_value(value): + """Recursively normalize a value for deep comparison.""" + if value is None: + return None + elif isinstance(value, dict): + # Recursively normalize dict, removing None values and empty + # collections + result = {} + for k, v in value.items(): + normalized = normalize_value(v) + # Only include if not None and not empty list + if normalized is not None and normalized != []: + result[k] = normalized + return result if result else None + elif isinstance(value, list): + # Recursively normalize list items + normalized_list = [normalize_value(item) for item in value] + # Treat empty list as None for comparison + return normalized_list if normalized_list else None + elif isinstance(value, str): + # Normalize whitespace in strings + return value.strip() + elif isinstance(value, (int, float, bool)): + return value + else: + return value + + +def deep_equal(obj1, obj2) -> bool: + """ + Deep equality check for YAML trees. 
+ - Normalizes whitespace + - Treats None and missing keys as equivalent + - Treats empty list and None as equivalent + """ + # Handle both None + if obj1 is None and obj2 is None: + return True + if obj1 is None or obj2 is None: + return False + + # Handle dicts + if isinstance(obj1, dict) and isinstance(obj2, dict): + # Get all keys from both, treating None values as missing + keys1 = {k for k, v in obj1.items() if v is not None and v != []} + keys2 = {k for k, v in obj2.items() if v is not None and v != []} + + if keys1 != keys2: + return False + + # Compare values for common keys + for key in keys1: + if not deep_equal(obj1[key], obj2[key]): + return False + return True + + # Handle lists + if isinstance(obj1, list) and isinstance(obj2, list): + # Empty lists should be treated as equal to None + if len(obj1) == 0 and len(obj2) == 0: + return True + if len(obj1) != len(obj2): + return False + return all(deep_equal(a, b) for a, b in zip(obj1, obj2)) + + # Handle strings (normalize whitespace) + if isinstance(obj1, str) and isinstance(obj2, str): + return obj1.strip() == obj2.strip() + + # Handle other types + return obj1 == obj2 + + +def compare_pipelines(jinja_file: str, python_file: str, scenario_name: str, show_diff: bool = False) -> Tuple[bool, Dict[str, Any]]: + """Compare two pipeline files using deep equality of parsed YAML trees.""" + with open(jinja_file, "r") as f: + jinja_content = f.read() + with open(python_file, "r") as f: + python_content = f.read() + + try: + jinja_data = yaml.safe_load(jinja_content) + python_data = yaml.safe_load(python_content) + except yaml.YAMLError as e: + return False, {"error": f"YAML parse error: {e}"} + + # Deep equality check on parsed YAML trees + yaml_trees_equal = deep_equal(jinja_data, python_data) + + # Get detailed analysis + analysis = analyze_step_differences(jinja_file, python_file) + + # String comparison (for reference) + jinja_normalized_str = yaml.dump(jinja_data, sort_keys=False, default_flow_style=False) 
+ python_normalized_str = yaml.dump(python_data, sort_keys=False, default_flow_style=False) + exact_string_match = jinja_normalized_str == python_normalized_str + + # Find actual data differences if trees don't match + different_steps = [] + if not yaml_trees_equal: + jinja_steps = jinja_data.get("steps", []) + python_steps = python_data.get("steps", []) + + if len(jinja_steps) == len(python_steps): + for i, (jinja_step, python_step) in enumerate(zip(jinja_steps, python_steps)): + if not deep_equal(jinja_step, python_step): + step_id = jinja_step.get("label") or jinja_step.get("block") or jinja_step.get("group") or f"Step {i}" + # Find specific differences + jinja_keys = set(jinja_step.keys()) + python_keys = set(python_step.keys()) + + missing_in_python = jinja_keys - python_keys + missing_in_jinja = python_keys - jinja_keys + + diff_info = f"Step {i} ({step_id}): " + if missing_in_python: + diff_info += f"missing in Python: {missing_in_python} " + if missing_in_jinja: + diff_info += f"extra in Python: {missing_in_jinja} " + + # Check field value differences + common_keys = jinja_keys & python_keys + for key in common_keys: + if not deep_equal(jinja_step[key], python_step[key]): + diff_info += f"{key} differs " + + different_steps.append(diff_info) + else: + different_steps.append(f"Step count mismatch: {len(jinja_steps)} vs {len(python_steps)}") + + result = { + "yaml_trees_equal": yaml_trees_equal, + "exact_string_match": exact_string_match, + "analysis": analysis, + "different_steps": different_steps[:20] if different_steps else [], + } + + if not yaml_trees_equal and show_diff: + print("\n " + "─" * 76) + print(" YAML DATA DIFFERENCES:") + print(" " + "─" * 76) + for diff in different_steps[:10]: + print(f" {diff}") + if len(different_steps) > 10: + print(f" ... 
and {len(different_steps) - 10} more") + print(" " + "─" * 76) + + return yaml_trees_equal, result + + +# ============================================================================ +# PYTEST TEST FUNCTIONS +# ============================================================================ + + +@pytest.fixture(scope="module") +def template_path(): + """Path to CI Jinja template.""" + ci_infra_path = "/Users/rezabarazesh/Documents/test/ci-infra" + return os.path.join(ci_infra_path, "buildkite/test-template-ci.j2") + + +@pytest.fixture(scope="module") +def test_pipeline_path(): + """Path to test pipeline YAML.""" + vllm_path = "/Users/rezabarazesh/Documents/test/vllm" + return os.path.join(vllm_path, ".buildkite/test-pipeline.yaml") + + +@pytest.fixture(scope="module") +def check_minijinja(): + """Check that minijinja-cli is available.""" + try: + subprocess.run(["minijinja-cli", "--version"], capture_output=True, check=True) + except (subprocess.CalledProcessError, FileNotFoundError): + pytest.skip( + "minijinja-cli not found. 
Install: curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh" + ) + + +@pytest.mark.parametrize("scenario", get_all_test_scenarios(), ids=lambda s: s.name) +def test_ci_pipeline_scenario(scenario, template_path, test_pipeline_path, check_minijinja, tmp_path): + """Test that Python generator produces identical output to Jinja template for each scenario.""" + jinja_output = tmp_path / f"jinja_{scenario.name}.yaml" + python_output = tmp_path / f"python_{scenario.name}.yaml" + + # Generate with Jinja + jinja_success, jinja_error = run_jinja_pipeline(scenario, template_path, test_pipeline_path, str(jinja_output)) + assert jinja_success, f"Jinja generation failed: {jinja_error}" + + # Generate with Python + python_success, python_error = run_python_pipeline(scenario, test_pipeline_path, str(python_output)) + assert python_success, f"Python generation failed: {python_error}" + + # Compare outputs + matches, comparison = compare_pipelines(str(jinja_output), str(python_output), scenario.name, show_diff=True) + assert matches, f"Pipeline outputs don't match:\n{comparison}" + + +# Run with: pytest tests/test_integration_comprehensive.py -v +# To run specific scenario: pytest tests/test_integration_comprehensive.py +# -k "main_branch_default" diff --git a/buildkite/pipeline_generator/tests/test_integration_fastcheck.py b/buildkite/pipeline_generator/tests/test_integration_fastcheck.py new file mode 100755 index 00000000..593c6fef --- /dev/null +++ b/buildkite/pipeline_generator/tests/test_integration_fastcheck.py @@ -0,0 +1,245 @@ +#!/usr/bin/env python3 +""" +Integration test suite for fastcheck pipeline mode. 
+Compares Python generator output against test-template-fastcheck.j2 +""" + +import os +import subprocess +import sys +from dataclasses import dataclass +from typing import List, Tuple + +import pytest +import yaml + +from buildkite.pipeline_generator.config import VLLM_ECR_REPO, VLLM_ECR_URL, PipelineGeneratorConfig, PipelineMode +from buildkite.pipeline_generator.pipeline_generator import ( + PipelineGenerator, + read_test_steps, + write_buildkite_pipeline, +) + +# Add parent directories to path for imports +current_dir = os.path.dirname(os.path.abspath(__file__)) +pipeline_gen_dir = os.path.dirname(current_dir) +buildkite_dir = os.path.dirname(pipeline_gen_dir) +ci_infra_dir = os.path.dirname(buildkite_dir) +sys.path.insert(0, ci_infra_dir) + + +@dataclass +class Scenario: + """Test scenario for fastcheck mode.""" + + name: str + branch: str = "main" + vllm_use_precompiled: str = "0" + fail_fast: str = "false" + mirror_hw: str = "amdexperimental" + commit: str = "0" * 40 + description: str = "" + + +def get_fastcheck_scenarios() -> List[Scenario]: + """Get test scenarios for fastcheck mode.""" + return [ + Scenario(name="fastcheck_default", description="Default fastcheck configuration"), + Scenario(name="fastcheck_main_branch", branch="main", description="Fastcheck on main branch"), + Scenario( + name="fastcheck_pr_branch", + branch="feature-branch", + description="Fastcheck on PR branch", + ), + Scenario( + name="fastcheck_precompiled", + vllm_use_precompiled="1", + description="Fastcheck with precompiled wheels", + ), + Scenario( + name="fastcheck_no_precompiled", + vllm_use_precompiled="0", + description="Fastcheck building from source", + ), + Scenario( + name="fastcheck_fail_fast", + fail_fast="true", + description="Fastcheck with fail-fast enabled", + ), + Scenario( + name="fastcheck_amd_production", + mirror_hw="amdproduction", + description="Fastcheck with AMD production hardware", + ), + Scenario( + name="fastcheck_amd_experimental", + 
mirror_hw="amdexperimental", + description="Fastcheck with AMD experimental hardware", + ), + ] + + +def run_jinja_fastcheck(scenario: Scenario, template_path: str, test_pipeline_path: str, output_path: str) -> Tuple[bool, str]: + """Run Jinja template to generate pipeline.""" + try: + cmd = [ + "minijinja-cli", + template_path, + test_pipeline_path, + "-D", + f"vllm_use_precompiled={scenario.vllm_use_precompiled}", + "-D", + f"fail_fast={scenario.fail_fast}", + "-D", + f"mirror_hw={scenario.mirror_hw}", + ] + + result = subprocess.run(cmd, capture_output=True, text=True, check=True) + + # Remove blank lines like bootstrap.sh does + lines = [line for line in result.stdout.split("\n") if line.strip()] + output = "\n".join(lines) + "\n" + + with open(output_path, "w") as f: + f.write(output) + + return True, "" + except Exception as e: + return False, str(e) + + +def run_python_fastcheck(scenario: Scenario, test_pipeline_path: str, output_path: str) -> Tuple[bool, str]: + """Run Python generator to generate pipeline.""" + try: + test_steps = read_test_steps(test_pipeline_path) + + config = PipelineGeneratorConfig( + run_all=False, + nightly=False, + list_file_diff=[], + container_registry=VLLM_ECR_URL, + container_registry_repo=VLLM_ECR_REPO, + commit=scenario.commit, + branch=scenario.branch, + mirror_hw=scenario.mirror_hw, + fail_fast=scenario.fail_fast == "true", + vllm_use_precompiled=scenario.vllm_use_precompiled, + cov_enabled=False, + vllm_ci_branch="main", + pipeline_mode=PipelineMode.FASTCHECK, + ) + + generator = PipelineGenerator(config) + steps = generator.generate(test_steps) + write_buildkite_pipeline(steps, output_path) + + return True, "" + except Exception as e: + return False, str(e) + + +def normalize_for_comparison(obj): + """Recursively normalize data structure for comparison (order-independent).""" + if isinstance(obj, dict): + # Convert dict to sorted tuple of items for comparison + return tuple(sorted((k, normalize_for_comparison(v)) for k, v 
in obj.items())) + elif isinstance(obj, list): + # Lists maintain order, normalize each element + return tuple(normalize_for_comparison(item) for item in obj) + elif isinstance(obj, (str, int, float, bool, type(None))): + return obj + else: + return str(obj) + + +def compare_yaml_trees(jinja_path: str, python_path: str) -> Tuple[bool, str]: + """Compare two YAML files for structural equality (field order independent).""" + with open(jinja_path, "r") as f: + jinja_data = yaml.safe_load(f) + with open(python_path, "r") as f: + python_data = yaml.safe_load(f) + + # Normalize both structures for order-independent comparison + jinja_normalized = normalize_for_comparison(jinja_data) + python_normalized = normalize_for_comparison(python_data) + + matches = jinja_normalized == python_normalized + + if not matches: + # Show diff using sorted YAML for readability + import difflib + jinja_str = yaml.dump(jinja_data, default_flow_style=False, sort_keys=True) + python_str = yaml.dump(python_data, default_flow_style=False, sort_keys=True) + + diff = list(difflib.unified_diff( + jinja_str.splitlines(keepends=True), + python_str.splitlines(keepends=True), + fromfile='jinja', + tofile='python', + lineterm='' + )) + + diff_msg = "YAML structures don't match (semantically different data)\n" + "".join(diff[:50]) + else: + diff_msg = "" + + return matches, diff_msg + + +# ============================================================================ +# PYTEST TEST FUNCTIONS +# ============================================================================ + + +@pytest.fixture(scope="module") +def template_path(): + """Path to Fastcheck Jinja template.""" + ci_infra_path = "/Users/rezabarazesh/Documents/test/ci-infra" + return os.path.join(ci_infra_path, "buildkite/test-template-fastcheck.j2") + + +@pytest.fixture(scope="module") +def test_pipeline_path(): + """Path to test pipeline YAML.""" + vllm_path = "/Users/rezabarazesh/Documents/test/vllm" + return os.path.join(vllm_path, 
".buildkite/test-pipeline.yaml") + + +@pytest.fixture(scope="module") +def check_minijinja(): + """Check that minijinja-cli is available.""" + try: + subprocess.run(["minijinja-cli", "--version"], capture_output=True, check=True) + except (subprocess.CalledProcessError, FileNotFoundError): + pytest.skip( + "minijinja-cli not found. Install: curl -sSfL https://github.com/mitsuhiko/minijinja/releases/download/2.3.1/minijinja-cli-installer.sh | sh" + ) + + +@pytest.mark.parametrize("scenario", get_fastcheck_scenarios(), ids=lambda s: s.name) +def test_fastcheck_pipeline_scenario(scenario, template_path, test_pipeline_path, check_minijinja, tmp_path): + """Test that Python generator produces identical output to Jinja template for fastcheck scenarios.""" + jinja_output = tmp_path / f"jinja_{scenario.name}.yaml" + python_output = tmp_path / f"python_{scenario.name}.yaml" + + # Generate with Jinja + jinja_success, jinja_error = run_jinja_fastcheck(scenario, template_path, test_pipeline_path, str(jinja_output)) + assert jinja_success, f"Jinja generation failed: {jinja_error}" + + # Generate with Python + python_success, python_error = run_python_fastcheck(scenario, test_pipeline_path, str(python_output)) + assert python_success, f"Python generation failed: {python_error}" + + # Also save to /tmp for debugging + import shutil + shutil.copy(str(jinja_output), f'/tmp/jinja_{scenario.name}.yaml') + shutil.copy(str(python_output), f'/tmp/python_{scenario.name}.yaml') + + # Compare outputs + matches, diff = compare_yaml_trees(str(jinja_output), str(python_output)) + assert matches, f"Pipeline outputs don't match:\n{diff}" + + +# Run with: pytest tests/test_integration_fastcheck.py -v +# To run specific scenario: pytest tests/test_integration_fastcheck.py -k +# "precompiled" diff --git a/buildkite/pipeline_generator/utils.py b/buildkite/pipeline_generator/utils.py deleted file mode 100644 index dc1615ae..00000000 --- a/buildkite/pipeline_generator/utils.py +++ /dev/null @@ -1,74 
+0,0 @@ -import enum -from typing import Optional, List - -# Constants -HF_HOME = "/root/.cache/huggingface" -DEFAULT_WORKING_DIR = "/vllm-workspace/tests" -VLLM_ECR_URL = "public.ecr.aws/q9t5s3a7" -VLLM_ECR_REPO = f"{VLLM_ECR_URL}/vllm-ci-test-repo" -AMD_REPO = "rocm/vllm-ci" - -# File paths -TEST_PATH = ".buildkite/test-pipeline.yaml" -EXTERNAL_HARDWARE_TEST_PATH = ".buildkite/external-tests.yaml" -PIPELINE_FILE_PATH = ".buildkite/pipeline.yaml" -MULTI_NODE_TEST_SCRIPT = ".buildkite/run-multi-node-test.sh" - -TEST_DEFAULT_COMMANDS = [ - "(command nvidia-smi || true)", # Sanity check for Nvidia GPU setup - "export VLLM_LOGGING_LEVEL=DEBUG", - "export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1", -] - -STEPS_TO_BLOCK = [] - -class GPUType(str, enum.Enum): - A100 = "a100" - -class AgentQueue(str, enum.Enum): - AWS_CPU = "cpu_queue" - AWS_SMALL_CPU = "small_cpu_queue" - AWS_1xL4 = "gpu_1_queue" - AWS_4xL4 = "gpu_4_queue" - A100 = "a100-queue" - AMD_GPU = "amd" - AMD_CPU = "amd-cpu" - - -def get_agent_queue(no_gpu: Optional[bool], gpu_type: Optional[str], num_gpus: Optional[int]) -> AgentQueue: - if no_gpu: - return AgentQueue.AWS_SMALL_CPU - if gpu_type == GPUType.A100.value: - return AgentQueue.A100 - return AgentQueue.AWS_1xL4 if not num_gpus or num_gpus == 1 else AgentQueue.AWS_4xL4 - - -def get_full_test_command(test_commands: List[str], step_working_dir: str) -> str: - """Convert test commands into one-line command with the right directory.""" - working_dir = step_working_dir or DEFAULT_WORKING_DIR - test_commands_str = ";\n".join(test_commands) - full_test_commands = [ - *TEST_DEFAULT_COMMANDS, - f"cd {working_dir}", - test_commands_str - ] - return ";\n".join(full_test_commands) - - -def get_multi_node_test_command( - test_commands: List[str], - working_dir: str, - num_nodes: int, - num_gpus: int, - docker_image_path: str - ) -> str: - quoted_commands = [f"'{command}'" for command in test_commands] - multi_node_command = [ - MULTI_NODE_TEST_SCRIPT, - working_dir or 
DEFAULT_WORKING_DIR, - str(num_nodes), - str(num_gpus), - docker_image_path, - *quoted_commands - ] - return " ".join(map(str, multi_node_command)) diff --git a/buildkite/tests/__init__.py b/buildkite/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/buildkite/tests/pipeline_generator/__init__.py b/buildkite/tests/pipeline_generator/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/buildkite/tests/pipeline_generator/test_pipeline_generator.py b/buildkite/tests/pipeline_generator/test_pipeline_generator.py deleted file mode 100644 index 43b5baaa..00000000 --- a/buildkite/tests/pipeline_generator/test_pipeline_generator.py +++ /dev/null @@ -1,88 +0,0 @@ -import pytest -import sys -import os -import tempfile -import yaml - -from buildkite.pipeline_generator.pipeline_generator import PipelineGeneratorConfig, PipelineGenerator, read_test_steps, write_buildkite_steps -from buildkite.pipeline_generator.step import BuildkiteStep, BuildkiteBlockStep, DEFAULT_TEST_WORKING_DIR -from buildkite.pipeline_generator.utils import AgentQueue - -TEST_COMMIT = "abcdef0123456789abcdef0123456789abcdef01" -TEST_CONTAINER_REGISTRY = "container.registry" -TEST_CONTAINER_REGISTRY_REPO = "test" - - -def _get_pipeline_generator_config(): - return PipelineGeneratorConfig( - container_registry=TEST_CONTAINER_REGISTRY, - container_registry_repo=TEST_CONTAINER_REGISTRY_REPO, - commit=TEST_COMMIT, - list_file_diff=[], - ) - - -def test_pipeline_generator_config_get_container_image(): - config = _get_pipeline_generator_config() - config.validate() - assert config.container_image == "container.registry/test:abcdef0123456789abcdef0123456789abcdef01" - - -@pytest.mark.parametrize( - "commit", - [ - "abcdefghijklmnopqrstuvwxyz1234567890abcd", # Invalid, not in a-f 0-9 - "1234567890abcdef", # Invalid, not 40 characters - ] -) -def test_get_pipeline_generator_config_invalid_commit(commit): - config = _get_pipeline_generator_config() - config.commit 
= commit - with pytest.raises(ValueError, match="not a valid Git commit hash"): - config.validate() - - -def test_read_test_steps(): - current_dir = os.path.dirname(os.path.abspath(__file__)) - test_path = os.path.join(current_dir, "test_files/test-pipeline.yaml") - test_steps = read_test_steps(test_path) - assert len(test_steps) == 4 - assert test_steps[0].commands == ['echo "Test 1"'] - assert test_steps[0].command is None - assert test_steps[0].working_dir == DEFAULT_TEST_WORKING_DIR - - assert test_steps[1].working_dir == "/tests2/" - assert test_steps[1].no_gpu is True - - assert test_steps[2].commands == ['echo "Test 3"', 'echo "Test 3.1"'] - assert test_steps[2].source_file_dependencies == ["file1", "src/file2"] - - assert test_steps[3].commands == ['echo "Test 4.1"', 'echo "Test 4.2"'] - assert test_steps[3].num_nodes == 2 - assert test_steps[3].num_gpus == 4 - - -def test_write_buildkite_steps(): - current_dir = os.path.dirname(os.path.abspath(__file__)) - expected_output_path = os.path.join(current_dir, "test_files/expected_pipeline.yaml") - with open(expected_output_path, "r") as f: - expected_output = yaml.safe_load(f) - - steps = [ - BuildkiteStep(label="Test 1", commands=['echo "Test1.1"', 'echo "Test1.2"']), - BuildkiteStep(label="Test 2", commands=["command3"], agents = {"queue": AgentQueue.AWS_1xL4.value}), - BuildkiteBlockStep(block="Run Test 3", key="block-test-3"), - BuildkiteStep(label="Test 3", commands=["command4"], depends_on="block-test-3"), - ] - with tempfile.TemporaryDirectory() as temp_dir: - output_file_path = os.path.join(temp_dir, "output.yaml") - write_buildkite_steps(steps, output_file_path) - with open(output_file_path, "r") as f: - content = f.read() - with open(expected_output_path, "r") as f: - expected_content = f.read() - assert content == expected_content - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) diff --git a/buildkite/tests/pipeline_generator/test_pipeline_generator_helper.py 
b/buildkite/tests/pipeline_generator/test_pipeline_generator_helper.py deleted file mode 100644 index a67d140f..00000000 --- a/buildkite/tests/pipeline_generator/test_pipeline_generator_helper.py +++ /dev/null @@ -1,76 +0,0 @@ -import pytest -import sys -from unittest import mock - -from buildkite.pipeline_generator.pipeline_generator_helper import get_plugin_config, convert_test_step_to_buildkite_step -from buildkite.pipeline_generator.utils import GPUType -from buildkite.pipeline_generator.step import TestStep, BuildkiteStep - -@mock.patch("buildkite.pipeline_generator.pipeline_generator_helper.get_kubernetes_plugin_config") -@mock.patch("buildkite.pipeline_generator.pipeline_generator_helper.get_docker_plugin_config") -def test_get_plugin_config(mock_get_docker_plugin_config, mock_get_kubernetes_plugin_config): - mock_get_docker_plugin_config.return_value = {"docker": "plugin"} - mock_get_kubernetes_plugin_config.return_value = {"kubernetes": "plugin"} - plugin = get_plugin_config("image:latest") - assert plugin == {"docker": "plugin"} - assert mock_get_docker_plugin_config.call_count == 1 - assert mock_get_docker_plugin_config.call_args[0] == ("image:latest", False) - -@mock.patch("buildkite.pipeline_generator.pipeline_generator_helper.get_kubernetes_plugin_config") -@mock.patch("buildkite.pipeline_generator.pipeline_generator_helper.get_docker_plugin_config") -def test_get_plugin_config_kubernetes(mock_get_docker_plugin_config, mock_get_kubernetes_plugin_config): - mock_get_docker_plugin_config.return_value = {"docker": "plugin"} - mock_get_kubernetes_plugin_config.return_value = {"kubernetes": "plugin"} - plugin = get_plugin_config(container_image="image:latest", gpu_type=GPUType.A100, num_gpus=4) - assert plugin == {"kubernetes": "plugin"} - assert mock_get_kubernetes_plugin_config.call_count == 1 - assert mock_get_kubernetes_plugin_config.call_args[0] == ("image:latest", 4) - -@pytest.mark.parametrize( - ("test_step", "expected_buildkite_step"), - [ - # 
Regular test with plugin - ( - TestStep( - label="First test", - commands=["echo A", "echo B", "echo C"], - num_gpus=1, - ), - BuildkiteStep( - label="First test", - key="first-test", - commands=["echo A", "echo B", "echo C"], - plugins=[{"plugin": "config"}], - agents={"queue": "gpu_1_queue"} - ) - ), - # Multi node test without plugin and custom command for multi-node - ( - TestStep( - label="Second test", - commands=["echo D", "echo E"], - num_nodes=2, - num_gpus=2, - ), - BuildkiteStep( - label="Second test", - key="second-test", - commands=["multi-node-command"], - plugins=None, - agents={"queue": "gpu_4_queue"} - ) - ) - ] -) -@mock.patch("buildkite.pipeline_generator.pipeline_generator_helper.get_multi_node_test_command") -@mock.patch("buildkite.pipeline_generator.pipeline_generator_helper.get_plugin_config") -def test_convert_test_step(mock_get_plugin_config, mock_get_multi_node_test_command, test_step, expected_buildkite_step): - mock_get_plugin_config.return_value = {"plugin": "config"} - mock_get_multi_node_test_command.return_value = "multi-node-command" - - buildkite_step = convert_test_step_to_buildkite_step(test_step, "image:latest") - assert buildkite_step == expected_buildkite_step - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) diff --git a/buildkite/tests/pipeline_generator/test_plugin.py b/buildkite/tests/pipeline_generator/test_plugin.py deleted file mode 100644 index b62bb285..00000000 --- a/buildkite/tests/pipeline_generator/test_plugin.py +++ /dev/null @@ -1,112 +0,0 @@ -import pytest -import sys - -from buildkite.pipeline_generator.plugin import ( - get_kubernetes_plugin_config, - get_docker_plugin_config, - DOCKER_PLUGIN_NAME, - KUBERNETES_PLUGIN_NAME, -) - - -def test_get_kubernetes_plugin_config(): - docker_image_path = "test_image:latest" - num_gpus = 1 - - expected_config = { - KUBERNETES_PLUGIN_NAME: { - "podSpec": { - "containers": [ - { - "image": docker_image_path, - "resources": {"limits": 
{"nvidia.com/gpu": num_gpus}}, - "volumeMounts": [ - {"name": "devshm", "mountPath": "/dev/shm"}, - {"name": "hf-cache", "mountPath": "/root/.cache/huggingface"} - ], - "env": [ - {"name": "HF_HOME", "value": "/root/.cache/huggingface"}, - {"name": "VLLM_USAGE_SOURCE", "value": "ci-test"}, - { - "name": "HF_TOKEN", - "valueFrom": { - "secretKeyRef": { - "name": "hf-token-secret", - "key": "token" - } - } - }, - ], - } - ], - "priorityClassName": "ci", - "nodeSelector": {"nvidia.com/gpu.product": "NVIDIA-A100-SXM4-80GB"}, - "volumes": [ - {"name": "devshm", "emptyDir": {"medium": "Memory"}}, - {"name": "hf-cache", "hostPath": {"path": "/root/.cache/huggingface", "type": "Directory"}} - ] - } - } - } - - assert get_kubernetes_plugin_config(docker_image_path, num_gpus) == expected_config - - -@pytest.mark.parametrize( - "docker_image_path, no_gpu, expected_config", - [ - ( - "test_image:latest", - False, - { - DOCKER_PLUGIN_NAME: { - "image": "test_image:latest", - "always-pull": True, - "propagate-environment": True, - "shell": ["/bin/bash", "-c"], - "gpus": "all", - "environment": [ - "HF_HOME=/root/.cache/huggingface", - "VLLM_USAGE_SOURCE=ci-test", - "HF_TOKEN", - "BUILDKITE_ANALYTICS_TOKEN" - ], - "mount-buildkite-agent": False, - "volumes": [ - "/dev/shm:/dev/shm", - "/root/.cache/huggingface:/root/.cache/huggingface" - ] - } - } - ), - ( - "cpu_image:latest", - True, - { - DOCKER_PLUGIN_NAME: { - "image": "cpu_image:latest", - "always-pull": True, - "propagate-environment": True, - "shell": ["/bin/bash", "-c"], - "environment": [ - "HF_HOME=/root/.cache/huggingface", - "VLLM_USAGE_SOURCE=ci-test", - "HF_TOKEN", - "BUILDKITE_ANALYTICS_TOKEN" - ], - "mount-buildkite-agent": False, - "volumes": [ - "/dev/shm:/dev/shm", - "/root/.cache/huggingface:/root/.cache/huggingface" - ] - } - } - ), - ] -) -def test_get_docker_plugin_config(docker_image_path, no_gpu, expected_config): - assert get_docker_plugin_config(docker_image_path, no_gpu) == expected_config - - -if 
__name__ == "__main__": - sys.exit(pytest.main(["-v", __file__])) diff --git a/buildkite/tests/pipeline_generator/test_step.py b/buildkite/tests/pipeline_generator/test_step.py deleted file mode 100644 index c2dc32cc..00000000 --- a/buildkite/tests/pipeline_generator/test_step.py +++ /dev/null @@ -1,126 +0,0 @@ -import pytest -import sys -from pydantic import ValidationError - -from buildkite.pipeline_generator.step import get_step_key, get_block_step, BuildkiteBlockStep, TestStep, DEFAULT_TEST_WORKING_DIR, BuildkiteStep -from buildkite.pipeline_generator.utils import AgentQueue, GPUType - -@pytest.mark.parametrize( - ("step_label", "expected_result"), - [ - ("Test Step", "test-step"), - ("Test Step 2", "test-step-2"), - ("Test (Step)", "test-step"), - ("Test A, B, C", "test-a-b-c"), - ], -) -def test_get_step_key(step_label: str, expected_result: str): - assert get_step_key(step_label) == expected_result - - -@pytest.mark.parametrize( - ("step_label", "expected_result"), - [ - ("Test Step", BuildkiteBlockStep(block="Run Test Step", key="block-test-step")), - ("Test Step 2", BuildkiteBlockStep(block="Run Test Step 2", key="block-test-step-2")), - ("Test (Step)", BuildkiteBlockStep(block="Run Test (Step)", key="block-test-step")), - ("Test A, B, C", BuildkiteBlockStep(block="Run Test A, B, C", key="block-test-a-b-c")), - ], -) -def test_get_block_step(step_label: str, expected_result: BuildkiteBlockStep): - assert get_block_step(step_label) == expected_result - -def test_create_test_step_with_command(): - test_step = TestStep( - label="Test Step", - command="echo 'hello'", - ) - assert test_step.label == "Test Step" - # Check default values - assert test_step.working_dir == DEFAULT_TEST_WORKING_DIR - assert test_step.optional is False - assert test_step.commands == ["echo 'hello'"] - assert test_step.command is None - - -def test_create_test_step_fail_duplicate_command(): - with pytest.raises(ValueError): - test_step = TestStep( - label="Test Step", - command="echo 
'hello'", - commands=["echo 'hello'"], - ) - -def test_create_test_step_fail_gpu_and_no_gpu(): - with pytest.raises(ValueError, match="cannot be defined together"): - test_step = TestStep( - label="Test Step", - command="echo 'hello'", - gpu="a100", - no_gpu=True, - ) - -def test_create_test_step_fail_gpu(): - with pytest.raises(ValidationError): - test_step = TestStep( - label="Test Step", - command="echo 'hello'", - gpu="abc100", - ) - -def test_create_test_step_multi_node(): - with pytest.raises(ValueError, match="'num_gpus' must be defined if 'num_nodes' is defined."): - test_step = TestStep( - label="Test Step", - command="echo 'hello'", - num_nodes=2, - ) - - with pytest.raises(ValueError, match="Number of commands must match the number of nodes."): - test_step = TestStep( - label="Test Step", - num_nodes=2, - num_gpus=2, - commands=["echo 'hello1'", "echo 'hello2'", "echo 'hello3'"], - ) - - test_step = TestStep( - label="Test Step", - num_nodes=2, - num_gpus=2, - commands=["echo 'hello1'", "echo 'hello2'"], - ) - assert test_step.label == "Test Step" - assert test_step.num_nodes == 2 - assert test_step.num_gpus == 2 - assert test_step.commands == ["echo 'hello1'", "echo 'hello2'"] - -def test_create_buildkite_step(): - buildkite_step = BuildkiteStep( - label="Test Step", - key="test-step", - commands = ["echo 'hello'"], - ) - assert buildkite_step.label == "Test Step" - assert buildkite_step.key == "test-step" - assert buildkite_step.agents == {"queue": AgentQueue.AWS_CPU} - assert buildkite_step.depends_on == "build" - -def test_create_buildkite_step_fail_no_command(): - with pytest.raises(ValidationError): - buildkite_step = BuildkiteStep( - label="Test Step", - key="test-step", - ) - -def test_create_buildkite_step_fail_wrong_agent_queue(): - with pytest.raises(ValidationError): - buildkite_step = BuildkiteStep( - label="Test Step", - key="test-step", - agents={"queue": "wrong-queue"}, - ) - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", 
__file__])) diff --git a/buildkite/tests/pipeline_generator/test_utils.py b/buildkite/tests/pipeline_generator/test_utils.py deleted file mode 100644 index c395adbf..00000000 --- a/buildkite/tests/pipeline_generator/test_utils.py +++ /dev/null @@ -1,68 +0,0 @@ -import pytest -import sys -from typing import List - -from buildkite.pipeline_generator.utils import ( - get_agent_queue, - get_full_test_command, - get_multi_node_test_command, - AgentQueue, - MULTI_NODE_TEST_SCRIPT, - TEST_DEFAULT_COMMANDS, -) - -TEST_DEFAULT_COMMANDS_STR = ";\n".join(TEST_DEFAULT_COMMANDS) - -@pytest.mark.parametrize( - ("no_gpu", "gpu_type", "num_gpus", "expected_result"), - [ - (True, None, None, AgentQueue.AWS_SMALL_CPU), - (False, "a100", None, AgentQueue.A100), - (False, None, 1, AgentQueue.AWS_1xL4), - (False, None, 4, AgentQueue.AWS_4xL4), - ], -) -def test_get_agent_queue(no_gpu: bool, gpu_type: str, num_gpus: int, expected_result: AgentQueue): - assert get_agent_queue(no_gpu, gpu_type, num_gpus) == expected_result - - -@pytest.mark.parametrize( - ("test_commands", "step_working_dir", "expected_result"), - [ - (["echo 'hello'"], None, f"{TEST_DEFAULT_COMMANDS_STR};\ncd /vllm-workspace/tests;\necho 'hello'"), - (["echo 'hello'"], "/vllm-workspace/tests", f"{TEST_DEFAULT_COMMANDS_STR};\ncd /vllm-workspace/tests;\necho 'hello'"), - (["echo 'hello1'", "echo 'hello2'"], "/sample_tests", f"{TEST_DEFAULT_COMMANDS_STR};\ncd /sample_tests;\necho 'hello1';\necho 'hello2'"), - ], -) -def test_get_full_test_command(test_commands: List[str], step_working_dir: str, expected_result: str): - assert get_full_test_command(test_commands, step_working_dir) == expected_result - - -def test_get_multi_node_test_command(): - test_commands = [ - ( - "distributed/test_same_node.py;" - "pytest -v -s distributed/test_multi_node_assignment.py;" - "pytest -v -s distributed/test_pipeline_parallel.py" - ), - "distributed/test_same_node.py", - ] - working_dir = "/vllm-workspace/tests" - num_nodes = 2 - num_gpus = 
4 - docker_image_path = "ecr-path/vllm-ci-test-repo:latest" - expected_multi_node_command = [ - MULTI_NODE_TEST_SCRIPT, - working_dir, - num_nodes, - num_gpus, - docker_image_path, - f"'{test_commands[0]}'", - f"'{test_commands[1]}'", - ] - expected_result = " ".join(map(str, expected_multi_node_command)) - assert get_multi_node_test_command(test_commands, working_dir, num_nodes, num_gpus, docker_image_path) == expected_result - - -if __name__ == "__main__": - sys.exit(pytest.main(["-v", __file__]))