diff --git a/tests/e2e/kubetest2-kops/deployer/common.go b/tests/e2e/kubetest2-kops/deployer/common.go index bb14aa132f1f4..574ff0c512965 100644 --- a/tests/e2e/kubetest2-kops/deployer/common.go +++ b/tests/e2e/kubetest2-kops/deployer/common.go @@ -249,6 +249,10 @@ func (d *deployer) env() []string { "KOPS_RUN_TOO_NEW_VERSION=1", }...) + if d.ClusterName != "" { + vars = append(vars, fmt.Sprintf("CLUSTER_NAME=%v", d.ClusterName)) + } + if d.BuildOptions.TargetBuildArch != "" { vars = append(vars, fmt.Sprintf("KOPS_ARCH=%s", strings.Trim(d.BuildOptions.TargetBuildArch, "linux/"))) } diff --git a/tests/e2e/scenarios/ai-conformance/run-test.sh b/tests/e2e/scenarios/ai-conformance/run-test.sh index 5fb245e14db33..f187bc6ed1a21 100755 --- a/tests/e2e/scenarios/ai-conformance/run-test.sh +++ b/tests/e2e/scenarios/ai-conformance/run-test.sh @@ -21,6 +21,21 @@ set -o pipefail REPO_ROOT=$(git rev-parse --show-toplevel) source "${REPO_ROOT}"/tests/e2e/scenarios/lib/common.sh +# Install binaries onto path: helm +BIN_DIR=${REPO_ROOT}/.build/bin +mkdir -p "${BIN_DIR}" +PATH="${BIN_DIR}:$PATH" +export PATH + +echo "Installing Helm..." +curl -fsSL -o ${BIN_DIR}/get_helm.sh https://raw.githubusercontent.com/helm/helm/main/scripts/get-helm-3 +chmod 700 ${BIN_DIR}/get_helm.sh +USE_SUDO=false HELM_INSTALL_DIR="${BIN_DIR}" ${BIN_DIR}/get_helm.sh + +# Setup helm repos +helm repo add nvidia https://helm.ngc.nvidia.com/nvidia +helm repo update + # AI Conformance requirements: # - Kubernetes 1.35 # - NVIDIA L4 Instances (g6.xlarge on AWS) @@ -39,11 +54,12 @@ SCENARIO_ROOT="${REPO_ROOT}/tests/e2e/scenarios/ai-conformance" # Check for g6.xlarge availability in the region echo "Checking availability of g6.xlarge in ${AWS_REGION}..." 
(cd "${SCENARIO_ROOT}/tools/check-aws-availability" && go build -o check-aws-availability main.go) -AVAILABILITY=$("${SCENARIO_ROOT}/tools/check-aws-availability/check-aws-availability" -region "${AWS_REGION}" -instance-type g6.xlarge) -if [[ "${AVAILABILITY}" == "false" ]]; then - echo "Error: g6.xlarge instances are not available in ${AWS_REGION}. Please choose a region with L4 GPU support." - exit 1 -fi +# Run and source the output to get the ZONES variable +source <("${SCENARIO_ROOT}/tools/check-aws-availability/check-aws-availability" -region "${AWS_REGION}" -instance-type g6.xlarge) + +echo "ZONES=${ZONES}" +export ZONES + rm -f "${SCENARIO_ROOT}/tools/check-aws-availability/check-aws-availability" @@ -85,6 +101,9 @@ EOF ${KOPS} update cluster --name "${CLUSTER_NAME}" --yes --admin +# TODO: Can we delay this until later? +${KOPS} validate cluster --wait=10m + echo "----------------------------------------------------------------" echo "Deploying AI Conformance Components" echo "----------------------------------------------------------------" @@ -110,9 +129,6 @@ helm upgrade -i nvidia-gpu-operator --wait \ --version=v25.10.1 \ --wait -PATH="$(pwd):$PATH" -export PATH - # NVIDIA DRA Driver # Uses the driver installed by GPU Operator at /run/nvidia/driver (the default). echo "Installing NVIDIA DRA Driver..." @@ -153,18 +169,20 @@ echo "----------------------------------------------------------------" # Wait for kOps validation "${KOPS}" validate cluster --wait=15m -# Verify Components -echo "Verifying NVIDIA Device Plugin..." -kubectl rollout status daemonset -n kube-system nvidia-device-plugin-daemonset --timeout=5m || echo "Warning: NVIDIA Device Plugin not ready yet" +echo "Verifying GPU Operator device plugin..." +kubectl rollout status daemonset -n gpu-operator nvidia-device-plugin-daemonset --timeout=5m || echo "Warning: GPU Operator device plugin not ready yet" echo "Verifying Kueue..." 
kubectl rollout status deployment -n kueue-system kueue-controller-manager --timeout=5m || echo "Warning: Kueue not ready yet" echo "Verifying KubeRay..." -kubectl rollout status deployment -n kuberay-system kuberay-operator --timeout=5m || echo "Warning: KubeRay not ready yet" +kubectl rollout status deployment -n ray-system kuberay-operator --timeout=5m || echo "Warning: KubeRay not ready yet" + +# echo "Verifying node-feature-discovery..." +# kubectl rollout status deployment -n node-feature-discovery nfd-master --timeout=5m || echo "Warning: node-feature-discovery not ready yet" echo "Verifying Gateway API..." -kubectl get gatewayclass || echo "Warning: GatewayClass not found" +kubectl get crd gatewayclasses.gateway.networking.k8s.io || echo "Warning: GatewayClass CRD not found" echo "Verifying Allocatable GPUs..." # Wait a bit for nodes to report resources diff --git a/tests/e2e/scenarios/ai-conformance/tools/check-aws-availability/main.go b/tests/e2e/scenarios/ai-conformance/tools/check-aws-availability/main.go index 48bf2099a8bc0..ca15103536ed1 100644 --- a/tests/e2e/scenarios/ai-conformance/tools/check-aws-availability/main.go +++ b/tests/e2e/scenarios/ai-conformance/tools/check-aws-availability/main.go @@ -21,6 +21,7 @@ import ( "flag" "fmt" "os" + "strings" "github.com/aws/aws-sdk-go-v2/aws" "github.com/aws/aws-sdk-go-v2/config" @@ -29,6 +30,14 @@ import ( ) func main() { + ctx := context.Background() + if err := run(ctx); err != nil { + fmt.Fprintf(os.Stderr, "Error: %v\n", err) + os.Exit(1) + } +} + +func run(ctx context.Context) error { var region string var instanceType string @@ -37,15 +46,12 @@ func main() { flag.Parse() if region == "" || instanceType == "" { - fmt.Println("Usage: check-aws-availability -region -instance-type ") - os.Exit(1) + return fmt.Errorf("Usage: check-aws-availability -region -instance-type ") } - ctx := context.TODO() cfg, err := config.LoadDefaultConfig(ctx, config.WithRegion(region)) if err != nil { - fmt.Printf("Error 
loading configuration: %v\n", err) - os.Exit(1) + return fmt.Errorf("Error loading configuration: %w", err) } client := ec2.NewFromConfig(cfg) @@ -62,13 +68,21 @@ func main() { result, err := client.DescribeInstanceTypeOfferings(ctx, input) if err != nil { - fmt.Printf("Error describing instance type offerings: %v\n", err) - os.Exit(1) + return fmt.Errorf("Error describing instance type offerings: %w", err) } - if len(result.InstanceTypeOfferings) > 0 { - fmt.Println("true") - } else { - fmt.Println("false") + var zones []string + + // Gather the availability zones where the instance type is offered + for _, offering := range result.InstanceTypeOfferings { + zone := aws.ToString(offering.Location) + zones = append(zones, zone) } + + if len(zones) == 0 { + return fmt.Errorf("Instance type %s is not available in any availability zones in region %s", instanceType, region) + } + + fmt.Fprintf(os.Stdout, "ZONES=%s\n", strings.Join(zones, ",")) + return nil } diff --git a/tests/e2e/scenarios/ai-conformance/validators/accelerators/dra_support/apiversions_test.go b/tests/e2e/scenarios/ai-conformance/validators/accelerators/dra_support/apiversions_test.go index 0d49c8acb3b69..318bf885277e6 100644 --- a/tests/e2e/scenarios/ai-conformance/validators/accelerators/dra_support/apiversions_test.go +++ b/tests/e2e/scenarios/ai-conformance/validators/accelerators/dra_support/apiversions_test.go @@ -1,4 +1,5 @@ /* + /* Copyright The Kubernetes Authors. Licensed under the Apache License, Version 2.0 (the "License");