diff --git a/buildkite/test-template-ci.j2 b/buildkite/test-template-ci.j2
index f088ab61..b9f29eda 100644
--- a/buildkite/test-template-ci.j2
+++ b/buildkite/test-template-ci.j2
@@ -18,6 +18,7 @@
 {% set hf_home = "/root/.cache/huggingface" %}
 {% set hf_home_efs = "/mnt/efs/hf_cache" %}
 {% set hf_home_fsx = "/fsx/hf_cache" %}
+{% set hf_home_ibm = "/model-cache/huggingface" %}
 {% set list_file_diff = list_file_diff | split("|") %}
 
 {% macro add_pytest_coverage(cmd, coverage_file) %}
@@ -58,7 +59,11 @@ agents:
     {% elif step.gpu == "a100" %}
     queue: a100_queue
     {% elif step.gpu == "h100" %}
+    {% if step.num_gpus == 4 %}
+    queue: IBM-H100-OpenShift
+    {% else %}
     queue: mithril-h100-pool
+    {% endif %}
     {% elif step.gpu == "h200" %}
     queue: skylab-h200
     {% elif step.gpu == "b200" %}
@@ -164,7 +169,60 @@ plugins:
       - /dev/shm:/dev/shm
       - /data/benchmark-hf-cache:/benchmark-hf-cache
       - /data/benchmark-vllm-cache:/root/.cache/vllm
-    {% elif step.gpu == "h100" %}
+    {% elif step.gpu == "h100" and step.num_gpus == 4 %} # IBM H100
+    - kubernetes:
+        podSpec:
+          serviceAccountName: buildkite-h100
+          securityContext:
+            runAsUser: 0
+            runAsGroup: 0
+            fsGroup: 0
+          containers:
+          - image: {{ image }}
+            command:
+            - bash -c "{{ '(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd ' ~ ((step.working_dir or default_working_dir) | safe) ~ ' && ' ~ (step.command or (step.commands | join(" && ")) | safe) }}"
+            resources:
+              limits:
+                nvidia.com/gpu: {{ step.num_gpus or 1 }}
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: {{ hf_home_ibm }}
+            - name: hf-cache
+              mountPath: /model-cache/.cache
+              subPath: model-cache
+            - name: hf-cache
+              mountPath: /model-cache/.config
+              subPath: model-config
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: NCCL_CUMEM_HOST_ENABLE
+              value: "0"
+            - name: HF_HOME
+              value: {{ hf_home_ibm }}
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+            - name: XDG_CACHE_HOME
+              value: /model-cache/.cache
+            - name: XDG_CONFIG_HOME
+              value: /model-cache/.config
+            - name: FLASHINFER_WORKSPACE_DIR
+              value: /model-cache/.cache/flashinfer
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            persistentVolumeClaim:
+              claimName: hf-cache-pvc
+    {% elif step.gpu == "h100" %} # Nebius H100
     - kubernetes:
         podSpec:
           containers: