Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions tests/e2e/kubetest2-kops/deployer/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -249,6 +249,10 @@ func (d *deployer) env() []string {
"KOPS_RUN_TOO_NEW_VERSION=1",
}...)

if d.ClusterName != "" {
vars = append(vars, fmt.Sprintf("CLUSTER_NAME=%v", d.ClusterName))
}

if d.BuildOptions.TargetBuildArch != "" {
vars = append(vars, fmt.Sprintf("KOPS_ARCH=%s", strings.Trim(d.BuildOptions.TargetBuildArch, "linux/")))
}
Expand Down
44 changes: 31 additions & 13 deletions tests/e2e/scenarios/ai-conformance/run-test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,21 @@ set -o pipefail
REPO_ROOT=$(git rev-parse --show-toplevel)
source "${REPO_ROOT}"/tests/e2e/scenarios/lib/common.sh

# Install binaries onto path: helm
BIN_DIR=${REPO_ROOT}/.build/bin
mkdir -p "${BIN_DIR}"
PATH="${BIN_DIR}:$PATH"
export PATH

echo "Installing Helm..."
curl -fsSL -o ${BIN_DIR}/get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fyi, we now have helm installed in the kubekins-e2e-v2 images

ref: kubernetes/test-infra#36421

chmod 700 ${BIN_DIR}/get_helm.sh
USE_SUDO=false HELM_INSTALL_DIR="${BIN_DIR}" ${BIN_DIR}/get_helm.sh

# Setup helm repos
helm repo add nvidia https://helm.ngc.nvidia.com/nvidia
helm repo update

# AI Conformance requirements:
# - Kubernetes 1.35
# - NVIDIA L4 Instances (g6.xlarge on AWS)
Expand All @@ -39,11 +54,12 @@ SCENARIO_ROOT="${REPO_ROOT}/tests/e2e/scenarios/ai-conformance"
# Check for g6.xlarge availability in the region
echo "Checking availability of g6.xlarge in ${AWS_REGION}..."
(cd "${SCENARIO_ROOT}/tools/check-aws-availability" && go build -o check-aws-availability main.go)
AVAILABILITY=$("${SCENARIO_ROOT}/tools/check-aws-availability/check-aws-availability" -region "${AWS_REGION}" -instance-type g6.xlarge)
if [[ "${AVAILABILITY}" == "false" ]]; then
echo "Error: g6.xlarge instances are not available in ${AWS_REGION}. Please choose a region with L4 GPU support."
exit 1
fi
# Run and source the output to get the ZONES variable
source <("${SCENARIO_ROOT}/tools/check-aws-availability/check-aws-availability" -region "${AWS_REGION}" -instance-type g6.xlarge)

echo "ZONES=${ZONES}"
export ZONES

rm -f "${SCENARIO_ROOT}/tools/check-aws-availability/check-aws-availability"


Expand Down Expand Up @@ -85,6 +101,9 @@ EOF

${KOPS} update cluster --name "${CLUSTER_NAME}" --yes --admin

# TODO: Can we delay this until later?
${KOPS} validate cluster --wait=10m

echo "----------------------------------------------------------------"
echo "Deploying AI Conformance Components"
echo "----------------------------------------------------------------"
Expand All @@ -110,9 +129,6 @@ helm upgrade -i nvidia-gpu-operator --wait \
--version=v25.10.1 \
--wait

PATH="$(pwd):$PATH"
export PATH

# NVIDIA DRA Driver
# Uses the driver installed by GPU Operator at /run/nvidia/driver (the default).
echo "Installing NVIDIA DRA Driver..."
Expand Down Expand Up @@ -153,18 +169,20 @@ echo "----------------------------------------------------------------"
# Wait for kOps validation
"${KOPS}" validate cluster --wait=15m

# Verify Components
echo "Verifying NVIDIA Device Plugin..."
kubectl rollout status daemonset -n kube-system nvidia-device-plugin-daemonset --timeout=5m || echo "Warning: NVIDIA Device Plugin not ready yet"
echo "Verifying GPU Operator device plugin..."
kubectl rollout status daemonset -n gpu-operator nvidia-device-plugin-daemonset --timeout=5m || echo "Warning: GPU Operator device plugin not ready yet"

echo "Verifying Kueue..."
kubectl rollout status deployment -n kueue-system kueue-controller-manager --timeout=5m || echo "Warning: Kueue not ready yet"

echo "Verifying KubeRay..."
kubectl rollout status deployment -n kuberay-system kuberay-operator --timeout=5m || echo "Warning: KubeRay not ready yet"
kubectl rollout status deployment -n ray-system kuberay-operator --timeout=5m || echo "Warning: KubeRay not ready yet"

# echo "Verifying node-feature-discovery..."
# kubectl rollout status deployment -n node-feature-discovery nfd-master --timeout=5m || echo "Warning: node-feature-discovery not ready yet"

echo "Verifying Gateway API..."
kubectl get gatewayclass || echo "Warning: GatewayClass not found"
kubectl get crd gatewayclasses.gateway.networking.k8s.io || echo "Warning: GatewayClass CRD not found"

echo "Verifying Allocatable GPUs..."
# Wait a bit for nodes to report resources
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import (
"flag"
"fmt"
"os"
"strings"

"github.com/aws/aws-sdk-go-v2/aws"
"github.com/aws/aws-sdk-go-v2/config"
Expand All @@ -29,6 +30,14 @@ import (
)

// main is the program entry point. It invokes run with a background context
// and, if run reports an error, writes that error to stderr and exits with
// status 1.
func main() {
	err := run(context.Background())
	if err == nil {
		return
	}
	fmt.Fprintf(os.Stderr, "Error: %v\n", err)
	os.Exit(1)
}

func run(ctx context.Context) error {
var region string
var instanceType string

Expand All @@ -37,15 +46,12 @@ func main() {
flag.Parse()

if region == "" || instanceType == "" {
fmt.Println("Usage: check-aws-availability -region <region> -instance-type <type>")
os.Exit(1)
return fmt.Errorf("Usage: check-aws-availability -region <region> -instance-type <type>")
}

ctx := context.TODO()
cfg, err := config.LoadDefaultConfig(ctx, config.WithRegion(region))
if err != nil {
fmt.Printf("Error loading configuration: %v\n", err)
os.Exit(1)
return fmt.Errorf("Error loading configuration: %w", err)
}

client := ec2.NewFromConfig(cfg)
Expand All @@ -62,13 +68,21 @@ func main() {

result, err := client.DescribeInstanceTypeOfferings(ctx, input)
if err != nil {
fmt.Printf("Error describing instance type offerings: %v\n", err)
os.Exit(1)
return fmt.Errorf("Error describing instance type offerings: %w", err)
}

if len(result.InstanceTypeOfferings) > 0 {
fmt.Println("true")
} else {
fmt.Println("false")
var zones []string

// Gather the availability zones where the instance type is offered
for _, offering := range result.InstanceTypeOfferings {
zone := aws.ToString(offering.Location)
zones = append(zones, zone)
}

if len(zones) == 0 {
return fmt.Errorf("Instance type %s is not available in any availability zones in region %s", instanceType, region)
}

fmt.Fprintf(os.Stdout, "ZONES=%s\n", strings.Join(zones, ","))
return nil
}
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
/*
/*
Copyright The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
Expand Down
Loading