From 95444360660f8adbba3cdd1a282140d5bf5b9d77 Mon Sep 17 00:00:00 2001
From: Amr Mahdi
Date: Sat, 3 Jan 2026 10:21:42 -0800
Subject: [PATCH] [6/N][AMI BuildKit Cache] Add network tuning for high-throughput operations

Optimizes network and Docker daemon settings in custom AMIs for
high-throughput ECR operations. These settings help maximize network
utilization during cache import/export and registry push/pull
operations.

We configure BBR congestion control, which works better for sustained
ECR transfers compared to the default. TCP buffers are increased to
16MB to accommodate the higher throughput, and
tcp_slow_start_after_idle is disabled since CI builds make frequent
connections to ECR. The Docker daemon is configured with
max-concurrent-downloads/uploads=16 to parallelize registry operations.

During testing, we found that cache export time dropped from 162.7s to
118.0s with these network optimizations combined with 1000 MB/s EBS
throughput. We tested 2000 MB/s EBS throughput but saw diminishing
returns - the bottleneck shifted to ECR upload rate (~60-100 MB/s) and
zstd compression (CPU-bound), so 1000 MB/s is the sweet spot.
Signed-off-by: Amr Mahdi
---
Reviewer note: the configure-network.sh hunk below was revised by hand
(create /etc/docker before writing, empty-file-safe jq merge). Hunk and
diffstat counts were updated to match; the post-image blob id on that
hunk's index line has not been recomputed.

 packer/cpu/buildkite-cpu-ami.pkr.hcl    |  5 +++
 packer/cpu/scripts/configure-network.sh | 54 +++++++++++++++++++++++++
 2 files changed, 59 insertions(+)
 create mode 100755 packer/cpu/scripts/configure-network.sh

diff --git a/packer/cpu/buildkite-cpu-ami.pkr.hcl b/packer/cpu/buildkite-cpu-ami.pkr.hcl
index 4b95e11e..f75842f3 100644
--- a/packer/cpu/buildkite-cpu-ami.pkr.hcl
+++ b/packer/cpu/buildkite-cpu-ami.pkr.hcl
@@ -124,6 +124,11 @@ build {
     source = "vllm-cache-source"
   }
 
+  # Configure network settings for high-throughput operations
+  provisioner "shell" {
+    script = "scripts/configure-network.sh"
+  }
+
   # Install BuildKit as standalone systemd service (runs as ec2-user with sudo)
   provisioner "shell" {
     script = "scripts/install-build-tools.sh"
diff --git a/packer/cpu/scripts/configure-network.sh b/packer/cpu/scripts/configure-network.sh
new file mode 100755
index 00000000..a8660aa8
--- /dev/null
+++ b/packer/cpu/scripts/configure-network.sh
@@ -0,0 +1,54 @@
+#!/bin/bash
+set -eu -o pipefail
+
+# Network tuning for high-throughput container image operations
+# Optimized for r6in instances with 100Gbps networking
+
+echo "=== Configuring network sysctl settings ==="
+
+cat <<'EOF' | sudo tee /etc/sysctl.d/99-vllm-network.conf
+# Network tuning for high-throughput Docker builds
+# Reference: https://docs.aws.amazon.com/datatransferterminal/latest/userguide/tech-requirements.html
+
+# BBR congestion control - helps sustained ECR transfers
+net.core.default_qdisc = fq
+net.ipv4.tcp_congestion_control = bbr
+
+# Avoid slow start after idle - helps frequent connections
+net.ipv4.tcp_slow_start_after_idle = 0
+
+# Reasonable buffers (enough for ECR rate)
+net.core.rmem_max = 16777216
+net.core.wmem_max = 16777216
+net.ipv4.tcp_rmem = 4096 1048576 16777216
+net.ipv4.tcp_wmem = 4096 1048576 16777216
+EOF
+
+# Apply sysctl settings
+sudo sysctl -p /etc/sysctl.d/99-vllm-network.conf
+
+# -----------------------------------------------------------------------------
+# Docker daemon configuration for high-throughput registry operations
+# -----------------------------------------------------------------------------
+echo "=== Configuring Docker daemon ==="
+
+# Update Docker daemon config to increase concurrent downloads/uploads.
+# /etc/docker may not exist yet if the daemon has never written its config.
+sudo mkdir -p /etc/docker
+if [[ -f /etc/docker/daemon.json ]]; then
+    # Merge with existing config. `jq -s` slurps the input into an array, so an
+    # empty daemon.json yields [] and falls back to {}; without it jq would emit
+    # nothing and the merge would replace the file with an empty, invalid config.
+    sudo jq -s '(.[0] // {}) + {"max-concurrent-downloads": 16, "max-concurrent-uploads": 16}' \
+        /etc/docker/daemon.json | sudo tee /etc/docker/daemon.json.tmp
+    sudo mv /etc/docker/daemon.json.tmp /etc/docker/daemon.json
+else
+    # Create new config
+    echo '{"max-concurrent-downloads": 16, "max-concurrent-uploads": 16}' | sudo tee /etc/docker/daemon.json
+fi
+
+# Restart Docker to apply new config
+sudo systemctl restart docker
+echo "Docker daemon configured and restarted"
+
+echo "=== Network configuration complete ==="