Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
60 changes: 59 additions & 1 deletion buildkite/test-template-ci.j2
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
# Hugging Face cache directories referenced by the per-queue plugin branches.
# Default cache location under the root user's home directory.
{% set hf_home = "/root/.cache/huggingface" %}
# NOTE(review): path suggests an EFS-backed cache mount - confirm against the branch that uses it.
{% set hf_home_efs = "/mnt/efs/hf_cache" %}
# NOTE(review): path suggests an FSx-backed cache mount - confirm against the branch that uses it.
{% set hf_home_fsx = "/fsx/hf_cache" %}
# Cache path for the IBM H100 pod: used there as both the HF_HOME value and
# the mountPath of the hf-cache volume.
{% set hf_home_ibm = "/model-cache/huggingface" %}
# list_file_diff is passed in as one pipe-separated string; rebind it to a list.
{% set list_file_diff = list_file_diff | split("|") %}

{% macro add_pytest_coverage(cmd, coverage_file) %}
Expand Down Expand Up @@ -58,7 +59,11 @@ agents:
{% elif step.gpu == "a100" %}
queue: a100_queue
{% elif step.gpu == "h100" %}
# H100 steps are routed to one of two queues based on the requested GPU count.
{% if step.num_gpus == 4 %}
# Exactly four GPUs requested: IBM OpenShift queue.
queue: IBM-H100-OpenShift
{% else %}
# Any other GPU count (including unset): Mithril pool.
queue: mithril-h100-pool
{% endif %}
{% elif step.gpu == "h200" %}
queue: skylab-h200
{% elif step.gpu == "b200" %}
Expand Down Expand Up @@ -164,7 +169,60 @@ plugins:
- /dev/shm:/dev/shm
- /data/benchmark-hf-cache:/benchmark-hf-cache
- /data/benchmark-vllm-cache:/root/.cache/vllm
{% elif step.gpu == "h100" and step.num_gpus == 4 %} # IBM H100
# Kubernetes plugin for 4-GPU H100 steps. This branch has to come before the
# generic h100 branch: an elif chain takes the first match, so a bare h100
# test placed ahead of it would swallow every H100 step and leave this pod
# spec unreachable.
- kubernetes:
    podSpec:
      serviceAccountName: buildkite-h100
      # Pod-level security context (fsGroup only exists at pod level): the
      # container runs as uid 0 / gid 0 and mounted volumes use group 0.
      securityContext:
        runAsUser: 0
        runAsGroup: 0
        fsGroup: 0
      containers:
        - image: {{ image }}
          command:
            # Single shell line: best-effort nvidia-smi, then cd into the
            # step's working directory and run its command(s) joined by &&.
            - bash -c "{{ '(command nvidia-smi || true) && export VLLM_ALLOW_DEPRECATED_BEAM_SEARCH=1 && cd ' ~ ((step.working_dir or default_working_dir) | safe) ~ ' && ' ~ (step.command or (step.commands | join(" && ")) | safe) }}"
          resources:
            limits:
              # Evaluates to 4 in this branch; the fallback to 1 never fires here.
              nvidia.com/gpu: {{ step.num_gpus or 1 }}
          volumeMounts:
            - name: devshm
              mountPath: /dev/shm
            # The hf-cache claim is mounted three times: its root at the
            # Hugging Face cache path, plus two subPaths that back the XDG
            # cache and config directories set in env below.
            - name: hf-cache
              mountPath: {{ hf_home_ibm }}
            - name: hf-cache
              mountPath: /model-cache/.cache
              subPath: model-cache
            - name: hf-cache
              mountPath: /model-cache/.config
              subPath: model-config
          env:
            - name: VLLM_USAGE_SOURCE
              value: ci-test
            # Quoted so the value stays the string "0" rather than an integer.
            - name: NCCL_CUMEM_HOST_ENABLE
              value: "0"
            - name: HF_HOME
              value: {{ hf_home_ibm }}
            # Token is read from a cluster secret instead of being templated in.
            - name: HF_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token-secret
                  key: token
            - name: XDG_CACHE_HOME
              value: /model-cache/.cache
            - name: XDG_CONFIG_HOME
              value: /model-cache/.config
            - name: FLASHINFER_WORKSPACE_DIR
              value: /model-cache/.cache/flashinfer
      nodeSelector:
        nvidia.com/gpu.product: NVIDIA-H100-80GB-HBM3
      volumes:
        # Memory-backed emptyDir mounted at /dev/shm.
        - name: devshm
          emptyDir:
            medium: Memory
        - name: hf-cache
          persistentVolumeClaim:
            claimName: hf-cache-pvc
{% elif step.gpu == "h100" %} # Nebius H100
- kubernetes:
podSpec:
containers:
Expand Down