diff --git a/terraform/gcp_old/tpu-inference/modules/ci_v7x/main.tf b/terraform/gcp_old/tpu-inference/modules/ci_v7x/main.tf
new file mode 100644
index 00000000..77bbbb86
--- /dev/null
+++ b/terraform/gcp_old/tpu-inference/modules/ci_v7x/main.tf
@@ -0,0 +1,105 @@
+# 1 TPU device each
+# Runtime: v2-alpha-tpu7-ubuntu2404
+
+# Exposes the provider's configured zone, which is embedded in the resource names below.
+data "google_client_config" "config" {
+  provider = google-beta
+}
+
+# One hyperdisk-balanced data disk per TPU VM, paired with it by count.index.
+resource "google_compute_disk" "tpu_disk" {
+  provider = google-beta
+  count    = var.instance_count
+  name     = "${var.accelerator_type}-ci-${count.index}-${var.project_short_name}-${data.google_client_config.config.zone}-disk"
+  size     = 2048
+  type     = "hyperdisk-balanced"
+}
+
+# TPU VMs whose startup script installs and starts a Buildkite agent.
+resource "google_tpu_v2_vm" "tpu_v7x_ci" {
+  provider = google-beta
+  count    = var.instance_count
+  name     = "${var.accelerator_type}-ci-${count.index}-${var.project_short_name}-${data.google_client_config.config.zone}"
+
+  runtime_version  = "v2-alpha-tpu7-ubuntu2404"
+  accelerator_type = var.accelerator_type
+
+  # Emit scheduling_config only when reserved capacity is requested.
+  dynamic "scheduling_config" {
+    for_each = var.reserved ? [1] : []
+    content {
+      reserved = var.reserved
+    }
+  }
+
+  network_config {
+    network             = "projects/${var.project_id}/global/networks/default"
+    enable_external_ips = true
+  }
+
+  data_disks {
+    source_disk = google_compute_disk.tpu_disk[count.index].id
+    mode        = "READ_WRITE"
+  }
+
+  # NOTE(review): the Buildkite and Hugging Face tokens are rendered into the
+  # startup script below, so they end up in the VM metadata in plain text.
+  metadata = {
+    "startup-script" = <<-EOF
+      #!/bin/bash
+
+      apt-get update
+      apt-get install -y curl build-essential jq
+
+      curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
+      /root/.cargo/bin/cargo install minijinja-cli
+      cp /root/.cargo/bin/minijinja-cli /usr/bin/minijinja-cli
+      chmod 755 /usr/bin/minijinja-cli
+
+      curl -fsSL https://keys.openpgp.org/vks/v1/by-fingerprint/32A37959C2FA5C3C99EFBC32A79206696452D198 | sudo gpg --dearmor -o /usr/share/keyrings/buildkite-agent-archive-keyring.gpg
+      echo "deb [signed-by=/usr/share/keyrings/buildkite-agent-archive-keyring.gpg] https://apt.buildkite.com/buildkite-agent stable main" | sudo tee /etc/apt/sources.list.d/buildkite-agent.list
+      apt-get update
+      apt-get install -y buildkite-agent
+
+      sudo usermod -a -G docker buildkite-agent
+      sudo -u buildkite-agent gcloud auth configure-docker us-central1-docker.pkg.dev --quiet
+
+      sudo sed -i "s/xxx/${var.buildkite_token_value}/g" /etc/buildkite-agent/buildkite-agent.cfg
+      sudo sed -i 's/name="%hostname-%spawn"/name="${var.accelerator_type}-ci-${count.index}-${var.project_short_name}-${data.google_client_config.config.zone}"/' /etc/buildkite-agent/buildkite-agent.cfg
+      # Drop entries left by a previous run before appending, so a re-run does not duplicate them
+      sudo sed -i '/^tags=/d' /etc/buildkite-agent/buildkite-agent.cfg
+      echo 'tags="queue=${var.buildkite_queue_name}"' | sudo tee -a /etc/buildkite-agent/buildkite-agent.cfg
+      sudo sed -i '/^HF_TOKEN=/d' /etc/environment
+      echo 'HF_TOKEN=${var.huggingface_token_value}' | sudo tee -a /etc/environment
+
+      sudo mkdir -p /mnt/disks/persist
+
+      # Format if not already formatted
+      if ! blkid /dev/nvme0n2; then
+        echo "Formatting /dev/nvme0n2 as ext4..."
+        sudo mkfs.ext4 -m 0 -E lazy_itable_init=0,lazy_journal_init=0,discard /dev/nvme0n2
+      fi
+
+      # Add to /etc/fstab using UUID
+      disk_uuid=$(blkid -s UUID -o value /dev/nvme0n2)
+      if ! grep -q "/mnt/disks/persist" /etc/fstab; then
+        echo "UUID=$disk_uuid /mnt/disks/persist ext4 defaults,discard 0 2" | sudo tee -a /etc/fstab
+      fi
+
+      # Only mount if not already mounted (first boot or recovery)
+      if ! mountpoint -q /mnt/disks/persist; then
+        sudo mount /mnt/disks/persist
+      fi
+
+      jq ". + {\"data-root\": \"/mnt/disks/persist\"}" /etc/docker/daemon.json > /tmp/daemon.json.tmp && mv /tmp/daemon.json.tmp /etc/docker/daemon.json
+      systemctl stop docker
+      systemctl daemon-reload
+      systemctl start docker
+
+      sudo chmod 777 /mnt/disks/persist
+
+      systemctl enable buildkite-agent
+      systemctl start buildkite-agent
+    EOF
+  }
+}
diff --git a/terraform/gcp_old/tpu-inference/modules/ci_v7x/variables.tf b/terraform/gcp_old/tpu-inference/modules/ci_v7x/variables.tf
new file mode 100644
index 00000000..0647c801
--- /dev/null
+++ b/terraform/gcp_old/tpu-inference/modules/ci_v7x/variables.tf
@@ -0,0 +1,42 @@
+variable "accelerator_type" {
+  type        = string
+  description = "Accelerator type of TPU"
+}
+
+variable "reserved" {
+  description = "if use reserved tpu resource"
+  type        = bool
+  default     = true
+}
+
+variable "instance_count" {
+  type        = number
+  description = "Number of TPU instance"
+}
+
+variable "buildkite_queue_name" {
+  type        = string
+  description = "The Buildkite agent queue name that the agents will join."
+}
+
+variable "project_id" {
+  type        = string
+  description = "The project ID for creating TPU agents"
+}
+
+variable "project_short_name" {
+  type        = string
+  description = "Short name for improved readability"
+}
+
+variable "buildkite_token_value" {
+  type        = string
+  description = "Agent token used to connect to Buildkite."
+  sensitive   = true
+}
+
+variable "huggingface_token_value" {
+  type        = string
+  description = "Hugging Face token for vLLM model serving usage."
+  sensitive   = true
+}
diff --git a/terraform/gcp_old/tpu-inference/modules/ci_v7x/versions.tf b/terraform/gcp_old/tpu-inference/modules/ci_v7x/versions.tf
new file mode 100644
index 00000000..1207bb8e
--- /dev/null
+++ b/terraform/gcp_old/tpu-inference/modules/ci_v7x/versions.tf
@@ -0,0 +1,7 @@
+terraform {
+  required_providers {
+    google-beta = {
+      source = "hashicorp/google-beta"
+    }
+  }
+}