From 3aff1b7891878ac59cc7ad2617f74dec2a0ce8ac Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Fri, 11 Jul 2025 16:01:54 +0200 Subject: [PATCH 01/38] test(e2e): introduces e2e test skeleton and single-node cpu model Enables end-to-end (aka fvt) pytests for LLMInferenceService. This change includes: - flexible, parameterized-based test skeleton that allows deploying models by simply combining LLMInferenceServiceConfig refs - can be mixed with partial spec (as another parameterized test) - simple happy path test as a basis for further cases - supports pytest parallelism - gh-action: support for `metallb` addon for minikube > [!IMPORTANT] > See test/e2e/llmisvc/README.md for test design ideas. > Refer to pkg/controller/llmisvc/DEV.md for an updated description of how to run the tests. Fixes [RHOAIENG-30183](https://issues.redhat.com/browse/RHOAIENG-30183) Signed-off-by: Bartosz Majsak ; Conflicts: ; config/default/kustomization.yaml Signed-off-by: Bartosz Majsak --- .github/actions/kserve-dep-setup/action.yml | 6 +- .github/actions/minikube-setup/action.yml | 29 +- .github/workflows/e2e-test-llmisvc.yml | 172 +++++++++++ .../test/clusterresources/kustomization.yaml | 1 + pkg/controller/llmisvc/DEV.md | 97 +++--- test/e2e/llmisvc/README.md | 28 ++ test/e2e/llmisvc/__init__.py | 0 test/e2e/llmisvc/conftest.py | 26 ++ test/e2e/llmisvc/pytest.ini | 14 + test/e2e/llmisvc/test_configs.py | 179 +++++++++++ .../e2e/llmisvc/test_llm_inference_service.py | 289 ++++++++++++++++++ test/e2e/pytest.ini | 1 + test/scripts/gh-actions/run-e2e-tests.sh | 1 + 13 files changed, 804 insertions(+), 39 deletions(-) create mode 100644 .github/workflows/e2e-test-llmisvc.yml create mode 100644 test/e2e/llmisvc/README.md create mode 100644 test/e2e/llmisvc/__init__.py create mode 100644 test/e2e/llmisvc/conftest.py create mode 100644 test/e2e/llmisvc/pytest.ini create mode 100644 test/e2e/llmisvc/test_configs.py create mode 100644 test/e2e/llmisvc/test_llm_inference_service.py diff --git 
a/.github/actions/kserve-dep-setup/action.yml b/.github/actions/kserve-dep-setup/action.yml index de7f22dc411..3cf3da3958d 100644 --- a/.github/actions/kserve-dep-setup/action.yml +++ b/.github/actions/kserve-dep-setup/action.yml @@ -13,6 +13,10 @@ inputs: description: 'Enable KEDA for autoscaling' required: false default: 'false' + enable-lws: + description: 'Enable Leader Worker Set' + required: false + default: 'false' runs: using: "composite" steps: @@ -26,7 +30,7 @@ runs: ./test/scripts/gh-actions/setup-kourier.sh else echo "Selected network layer ${{ inputs.network-layer }}" - ./test/scripts/gh-actions/setup-deps.sh ${{ inputs.deployment-mode }} "${{ inputs.network-layer }}" "${{ inputs.enable-keda }}" + ./test/scripts/gh-actions/setup-deps.sh ${{ inputs.deployment-mode }} "${{ inputs.network-layer }}" "${{ inputs.enable-keda }}" "${{ inputs.enable-lws }}" fi - name: Update test overlays diff --git a/.github/actions/minikube-setup/action.yml b/.github/actions/minikube-setup/action.yml index f05e9678793..f38da79b2fa 100644 --- a/.github/actions/minikube-setup/action.yml +++ b/.github/actions/minikube-setup/action.yml @@ -14,6 +14,10 @@ inputs: description: 'Additional arguments to pass to minikube start' required: false default: '' + addons: + description: 'Choose optional addons to install. Valid options: metallb, ingress, gcp-auth, registry ...' 
+ required: false + default: '' runs: using: "composite" @@ -29,11 +33,34 @@ runs: minikube-version: '1.35.0' kubernetes-version: 'v1.30.7' driver: ${{ inputs.driver }} + addons: ${{ inputs.addons }} wait: 'all' cpus: 'max' memory: 'max' start-args: --wait-timeout=6m0s --nodes=${{ inputs.nodes }} ${{ inputs.start-args }} - + - name: Configure MetalLB for Minikube + if: ${{ contains(inputs.addons, 'metallb') }} + shell: bash + run: | + IP=$(minikube ip) + PREFIX=${IP%.*} + START=${PREFIX}.200 + END=${PREFIX}.235 + + kubectl apply -f - < /dev/null 2>&1 & +cloud-provider-kind > /dev/null 2>&1 & ``` ##### Using `minikube` ```shell -minikube start --cpus='12' --memory='16G' +minikube start --cpus='12' --memory='16G' --kubernetes-version=v1.33.1 minikube addons enable metallb -# You need to configure metallb with an IP range. This depends on the minikube network. -# You can find your current minikube ip with: -# $ minikube ip -# 192.168.39.118 -# -# With the previous sample output, you would configure metallb with a range not including -# the minikube IP (change only the last entry). E.g: -minikube addons configure metallb -# Minikube will ask two prompts. Notice the configured range 192.168.39.200-192.168.39.235 is -# not including minikube IP: -# -- Enter Load Balancer Start IP: 192.168.39.200 -# -- Enter Load Balancer End IP: 192.168.39.235 +IP=$(minikube ip) +PREFIX=${IP%.*} +START=${PREFIX}.200 +END=${PREFIX}.235 + +kubectl apply -f - < ``` -#### Creating simple CPU model +#### Validation + +##### pytest + +Set up pytest +```shell +cd python/kserve +python -m venv .venv +pip install -e . +pip install pytest pytest-asyncio requests portforward Jinja2 pytest-xdist +cd - +``` + +Run the test + +```shell +./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice(type='cpu')" 1 "istio-gatewayapi-ext" + +Starting E2E functional tests ... +No parallelism requested for pytest. 
Will use default value of 1 +pytest -m 'llminferenceservice(type='cpu')' --ignore=qpext --log-cli-level=INFO -n 1 --dist worksteal --network-layer istio-gatewayapi-ext +===================================================================================== test session starts ===================================================================================== +platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 +rootdir: /home/bartek/code/redhat/model-serving/kserve/kserve-test/test/e2e +configfile: pytest.ini +plugins: anyio-4.9.0, xdist-3.8.0, asyncio-1.1.0 +asyncio: mode=Mode.STRICT, asyncio_default_fixture_loop_scope=None, asyncio_default_test_loop_scope=function +1 worker [1 item]s / 1 error +scheduling tests via WorkStealingScheduling + +llmisvc/test_llm_inference_service.py::test_llm_inference_service[managed-single-cpu-fb-opt-125m] +[gw0] [100%] PASSED llmisvc/test_llm_inference_service.py::test_llm_inference_service[managed-single-cpu-fb-opt-125m] +``` +> [!NOTE] +> Ignore error from ERROR collecting graph/test_inference_graph.py, but we should fix it! 
+ +##### Manual + +Create LLMInferenceService, e.g.: ```shell NS=llm-test kubectl create namespace ${NS} || true -kubectl apply -f - <" + f"create_namespaced_custom_object for LLMInferenceServiceConfig: {e}" + ) from e + + +def delete_llmisvc_config( + kserve_client, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION +): + try: + return kserve_client.api_instance.delete_namespaced_custom_object( + constants.KSERVE_GROUP, + version, + namespace, + KSERVE_PLURAL_LLMINFERENCESERVICECONFIG, + name, + ) + except client.rest.ApiException as e: + raise RuntimeError( + f"Exception when calling CustomObjectsApi->" + f"delete_namespaced_custom_object for LLMInferenceServiceConfig: {e}" + ) from e + + +def get_llmisvc_config( + kserve_client, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION +): + try: + return kserve_client.api_instance.get_namespaced_custom_object( + constants.KSERVE_GROUP, + version, + namespace, + KSERVE_PLURAL_LLMINFERENCESERVICECONFIG, + name, + ) + except client.rest.ApiException as e: + raise RuntimeError( + f"Exception when calling CustomObjectsApi->" + f"get_namespaced_custom_object for LLMInferenceServiceConfig: {e}" + ) from e diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py new file mode 100644 index 00000000000..9a3ddd2f06d --- /dev/null +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -0,0 +1,289 @@ +# Copyright 2025 The KServe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import time + +import pytest +import requests +from kubernetes import client +from kserve import KServeClient, V1alpha1LLMInferenceService, constants + +from .test_configs import ( + LLMINFERENCESERVICE_CONFIGS, + generate_test_id, + llm_config_factory, + KSERVE_TEST_NAMESPACE, +) + +KSERVE_PLURAL_LLMINFERENCESERVICE = "llminferenceservices" + + +@pytest.mark.llminferenceservice(type="cpu") +@pytest.mark.asyncio(scope="session") +@pytest.mark.parametrize( + "config_names", + [ + ["router-managed", "workload-single-cpu", "model-fb-opt-125m"], + ], + indirect=True, + ids=generate_test_id, +) +async def test_llm_inference_service(request, llm_config_factory, config_names): + created_config_names = llm_config_factory(config_names) + service_name = generate_service_name(request.node.name, config_names) + + llm_isvc = V1alpha1LLMInferenceService( + api_version="serving.kserve.io/v1alpha1", + kind="LLMInferenceService", + metadata=client.V1ObjectMeta( + name=service_name, namespace=KSERVE_TEST_NAMESPACE + ), + spec={ + "replicas": 1, + "baseRefs": [{"name": config_name} for config_name in created_config_names], + }, + ) + + kserve_client = KServeClient( + config_file=os.environ.get("KUBECONFIG", "~/.kube/config") + ) + + try: + create_llmisvc(kserve_client, llm_isvc) + wait_for_model_response( + kserve_client, + service_name, + KSERVE_TEST_NAMESPACE, + model_name=get_model_name_from_configs(config_names), + ) + except Exception as e: + print(f"ERROR: Failed to call llm inference service {service_name}: {e}") + collect_diagnostics(service_name, KSERVE_TEST_NAMESPACE) + raise + finally: + try: + delete_llmisvc(kserve_client, service_name, KSERVE_TEST_NAMESPACE) + except Exception as e: + print(f"Warning: Failed to cleanup service {service_name}: {e}") + + +def create_llmisvc(kserve_client, llm_isvc, namespace=None): + from kserve.utils import 
utils + + version = llm_isvc.api_version.split("/")[1] + + if namespace is None: + namespace = utils.get_isvc_namespace(llm_isvc) + + try: + outputs = kserve_client.api_instance.create_namespaced_custom_object( + constants.KSERVE_GROUP, + version, + namespace, + KSERVE_PLURAL_LLMINFERENCESERVICE, + llm_isvc, + ) + return outputs + except client.rest.ApiException as e: + raise RuntimeError( + f"Exception when calling CustomObjectsApi->" + f"create_namespaced_custom_object for LLMInferenceService: {e}" + ) from e + + +def delete_llmisvc( + kserve_client, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION +): + try: + return kserve_client.api_instance.delete_namespaced_custom_object( + constants.KSERVE_GROUP, + version, + namespace, + KSERVE_PLURAL_LLMINFERENCESERVICE, + name, + ) + except client.rest.ApiException as e: + raise RuntimeError( + f"Exception when calling CustomObjectsApi->" + f"delete_namespaced_custom_object for LLMInferenceService: {e}" + ) from e + + +def get_llmisvc( + kserve_client, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION +): + try: + return kserve_client.api_instance.get_namespaced_custom_object( + constants.KSERVE_GROUP, + version, + namespace, + KSERVE_PLURAL_LLMINFERENCESERVICE, + name, + ) + except client.rest.ApiException as e: + raise RuntimeError( + f"Exception when calling CustomObjectsApi->" + f"get_namespaced_custom_object for LLMInferenceService: {e}" + ) from e + + +def wait_for_model_response( + kserve_client, + name, + namespace, + timeout_seconds=600, + version=constants.KSERVE_V1ALPHA1_VERSION, + model_name=None, +): + if model_name is None: + model_name = "default-model" + + service_url = None + + def assert_model_responds(): + nonlocal service_url + + try: + service_url = get_llm_service_url(kserve_client, name, namespace, version) + except Exception as e: + raise AssertionError(f"Failed to get service URL: {e}") from e + + completion_url = f"{service_url}/v1/completions" + test_payload = {"model": 
model_name, "prompt": "test", "max_tokens": 1} + + try: + response = requests.post( + completion_url, + headers={"Content-Type": "application/json"}, + json=test_payload, + timeout=30, + ) + except Exception as e: + raise AssertionError(f"Failed to call model: {e}") from e + + assert ( + response.status_code == 200 + ), f"Service returned {response.status_code}: {response.text}" + return service_url + + return wait_for(assert_model_responds, timeout=timeout_seconds, interval=10.0) + + +def get_llm_service_url( + kserve_client, service_name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION +): + try: + llm_isvc = get_llmisvc(kserve_client, service_name, namespace, version) + + if "status" not in llm_isvc: + raise ValueError(f"No status found in LLM inference service {service_name}") + + status = llm_isvc["status"] + + if "url" in status and status["url"]: + return status["url"] + + if ( + "addresses" in status + and status["addresses"] + and len(status["addresses"]) > 0 + ): + first_address = status["addresses"][0] + if "url" in first_address: + return first_address["url"] + + raise ValueError(f"No URL found in LLM inference service {service_name} status") + + except Exception as e: + raise ValueError( + f"Failed to get URL for LLM inference service {service_name}: {e}" + ) from e + + +def wait_for(assertion_fn, timeout: float = 5.0, interval: float = 0.1): + deadline = time.time() + timeout + while True: + try: + return assertion_fn() + except AssertionError: + if time.time() >= deadline: + raise + time.sleep(interval) + + +def get_model_name_from_configs(config_names): + """Extract model name from model config.""" + for config_name in config_names: + if config_name.startswith("model-"): + config = LLMINFERENCESERVICE_CONFIGS[config_name] + if "model" in config and "name" in config["model"]: + return config["model"]["name"] + return "default-model" + + +def generate_service_name(test_name, config_names): + base_name = test_name.split("[")[0] # Remove everything 
after [ + base_name = base_name.replace("test_", "").replace("_", "-") + config_suffix = "-".join(sorted(config_names)) + service_name = f"{base_name}-{config_suffix}" + service_name = service_name.lower() + service_name = service_name[:63].rstrip("-") + return service_name + + +def collect_diagnostics(service_name, namespace): + try: + kserve_client = KServeClient( + config_file=os.environ.get("KUBECONFIG", "~/.kube/config") + ) + + print(f"\n{'='*60}") + print(f"DIAGNOSTIC INFORMATION FOR {service_name} in {namespace}") + print(f"{'='*60}") + + print("\n--- LLM Inference Service ---") + try: + llm_isvc = get_llmisvc(kserve_client, service_name, namespace) + print(json.dumps(llm_isvc, indent=2, default=str)) + except Exception as e: + print(f"Failed to get LLM inference service: {e}") + + print("\n--- Events ---") + try: + core_v1 = client.CoreV1Api() + events = core_v1.list_namespaced_event( + namespace=namespace, + field_selector=f"involvedObject.name={service_name}", + ) + if events.items: + sorted_events = sorted( + events.items, + key=lambda x: x.last_timestamp or x.first_timestamp, + reverse=True, + ) + for event in sorted_events[:5]: + timestamp = event.last_timestamp or event.first_timestamp + print(f" {event.type}: {event.reason} - {event.message}") + print(f" Time: {timestamp}") + else: + print(" No events found") + except Exception as e: + print(f"Failed to list events: {e}") + + print(f"\n{'='*60}") + + except Exception as e: + print(f"Failed to collect diagnostics: {e}") diff --git a/test/e2e/pytest.ini b/test/e2e/pytest.ini index 3407bbfbc27..9aba5ff4e3e 100644 --- a/test/e2e/pytest.ini +++ b/test/e2e/pytest.ini @@ -17,4 +17,5 @@ markers = path_based_routing: e2e tests for path based routing llm: e2e tests for huggingface runtime vllm: e2e tests for huggingface runtime with vllm-openvino backend + llminferenceservice: e2e tests for llm inference service controller modelcache: e2e tests for model caching \ No newline at end of file diff --git 
a/test/scripts/gh-actions/run-e2e-tests.sh b/test/scripts/gh-actions/run-e2e-tests.sh index 98d6af06f6e..ee8794c8d23 100755 --- a/test/scripts/gh-actions/run-e2e-tests.sh +++ b/test/scripts/gh-actions/run-e2e-tests.sh @@ -35,6 +35,7 @@ pushd test/e2e >/dev/null echo "Skipping explainer tests for raw deployment with ingress" pytest -m "$MARKER" --ignore=qpext --log-cli-level=INFO -n $PARALLELISM --dist worksteal --network-layer $NETWORK_LAYER --ignore=explainer/ else + echo "pytest -m '$MARKER' --ignore=qpext --log-cli-level=INFO -n $PARALLELISM --dist worksteal --network-layer $NETWORK_LAYER" pytest -m "$MARKER" --ignore=qpext --log-cli-level=INFO -n $PARALLELISM --dist worksteal --network-layer $NETWORK_LAYER fi popd From d3b53d27c572effec46870888776baa7011b80f0 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Thu, 24 Jul 2025 17:01:55 +0200 Subject: [PATCH 02/38] fix: apply crds first and wait for them to be ready Signed-off-by: Bartosz Majsak --- Makefile | 2 ++ config/overlays/test/clusterresources/kustomization.yaml | 1 - 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index bc670f76435..5aa8fca8d9c 100644 --- a/Makefile +++ b/Makefile @@ -253,6 +253,8 @@ deploy-dev-llm: ./hack/deploy_dev_llm.sh deploy-ci: manifests + kubectl apply --server-side=true -k config/crd || true + kubectl wait --for=condition=established --timeout=60s crd/llminferenceserviceconfigs.serving.kserve.io kubectl apply --server-side=true -k config/overlays/test # TODO: Add runtimes as part of default deployment kubectl wait --for=condition=ready pod -l control-plane=kserve-controller-manager -n kserve --timeout=300s diff --git a/config/overlays/test/clusterresources/kustomization.yaml b/config/overlays/test/clusterresources/kustomization.yaml index 520ae05cd67..ab1e56e51f6 100644 --- a/config/overlays/test/clusterresources/kustomization.yaml +++ b/config/overlays/test/clusterresources/kustomization.yaml @@ -4,7 +4,6 @@ kind: Kustomization resources: - 
../../../runtimes - ../../../storagecontainers -- ../../../llmisvc images: From c7e76bd196eac7ad157dd3de7d59a2906215aa9c Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Thu, 24 Jul 2025 18:58:16 +0200 Subject: [PATCH 03/38] chore: limit gh-action tests to cpu Signed-off-by: Bartosz Majsak --- .github/workflows/e2e-test-llmisvc.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-llmisvc.yml b/.github/workflows/e2e-test-llmisvc.yml index 09ec0603d40..576a5aae993 100644 --- a/.github/workflows/e2e-test-llmisvc.yml +++ b/.github/workflows/e2e-test-llmisvc.yml @@ -96,7 +96,8 @@ jobs: - name: Run E2E tests timeout-minutes: 30 run: | - ./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice" "1" "istio-gatewayapi-ext" + # Run only CPU tests for now + ./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice(type='cpu')" 2 "istio-gatewayapi-ext" - name: Check system status if: always() From d2adcf3817ef50b62ec363191e73234bec3b025c Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Thu, 24 Jul 2025 20:28:06 +0200 Subject: [PATCH 04/38] fix: single worker job Signed-off-by: Bartosz Majsak --- .github/workflows/e2e-test-llmisvc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-llmisvc.yml b/.github/workflows/e2e-test-llmisvc.yml index 576a5aae993..2249a6bde5c 100644 --- a/.github/workflows/e2e-test-llmisvc.yml +++ b/.github/workflows/e2e-test-llmisvc.yml @@ -97,7 +97,7 @@ jobs: timeout-minutes: 30 run: | # Run only CPU tests for now - ./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice(type='cpu')" 2 "istio-gatewayapi-ext" + ./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice(type='cpu')" 1 "istio-gatewayapi-ext" - name: Check system status if: always() From 5e0e87492f109c8af71b465725893e9fd4b14f99 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Fri, 25 Jul 2025 09:36:15 +0200 Subject: [PATCH 05/38] feat: introduces cluster capability 
markers Signed-off-by: Bartosz Majsak --- .github/workflows/e2e-test-llmisvc.yml | 5 ++- pkg/controller/llmisvc/DEV.md | 19 ++++++++- test/e2e/llmisvc/README.md | 41 ++++++++++++++++--- test/e2e/llmisvc/conftest.py | 22 ++++++++++ test/e2e/llmisvc/pytest.ini | 6 ++- test/e2e/llmisvc/test_configs.py | 17 +++----- .../e2e/llmisvc/test_llm_inference_service.py | 13 +++--- test/e2e/pytest.ini | 6 ++- 8 files changed, 101 insertions(+), 28 deletions(-) diff --git a/.github/workflows/e2e-test-llmisvc.yml b/.github/workflows/e2e-test-llmisvc.yml index 2249a6bde5c..65a82d93279 100644 --- a/.github/workflows/e2e-test-llmisvc.yml +++ b/.github/workflows/e2e-test-llmisvc.yml @@ -96,8 +96,9 @@ jobs: - name: Run E2E tests timeout-minutes: 30 run: | - # Run only CPU tests for now - ./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice(type='cpu')" 1 "istio-gatewayapi-ext" + # Run only CPU tests for now using pytest markers + # Available GPU vendors: amd, nvidia, intel + ./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice and cpu" 1 "istio-gatewayapi-ext" - name: Check system status if: always() diff --git a/pkg/controller/llmisvc/DEV.md b/pkg/controller/llmisvc/DEV.md index b0d893e46e9..722faf660cf 100644 --- a/pkg/controller/llmisvc/DEV.md +++ b/pkg/controller/llmisvc/DEV.md @@ -64,11 +64,26 @@ cd - Run the test ```shell -./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice(type='cpu')" 1 "istio-gatewayapi-ext" +# Use pytest markers for filtering + +# Run only CPU tests +./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice and cluster_cpu" 1 "istio-gatewayapi-ext" + +# Run only NVIDIA GPU tests +./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice and cluster_nvidia" 1 "istio-gatewayapi-ext" + +# Run all GPU tests (any vendor: amd, nvidia, intel) +./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice and (cluster_amd or cluster_nvidia or cluster_intel)" 1 "istio-gatewayapi-ext" + +# Run CPU and AMD GPU tests only 
+./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice and (cluster_cpu or cluster_amd)" 1 "istio-gatewayapi-ext" + +# Run all LLM inference service tests +./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice" 1 "istio-gatewayapi-ext" Starting E2E functional tests ... No parallelism requested for pytest. Will use default value of 1 -pytest -m 'llminferenceservice(type='cpu')' --ignore=qpext --log-cli-level=INFO -n 1 --dist worksteal --network-layer istio-gatewayapi-ext +pytest -m 'llminferenceservice and cluster_cpu' --ignore=qpext --log-cli-level=INFO -n 1 --dist worksteal --network-layer istio-gatewayapi-ext ===================================================================================== test session starts ===================================================================================== platform linux -- Python 3.12.11, pytest-8.4.1, pluggy-1.6.0 rootdir: /home/bartek/code/redhat/model-serving/kserve/kserve-test/test/e2e diff --git a/test/e2e/llmisvc/README.md b/test/e2e/llmisvc/README.md index 2d2fdefb5d8..ba0efc76ca2 100644 --- a/test/e2e/llmisvc/README.md +++ b/test/e2e/llmisvc/README.md @@ -4,15 +4,38 @@ Tests combine config fragments from different categories to create complete scenarios: ```python -["router-managed", "workload-single-cpu", "model-fb-opt-125m"] +pytest.param(["router-managed", "workload-single-cpu", "model-fb-opt-125m"], marks=pytest.mark.cluster_cpu) ``` The `llm_config_factory` fixture automatically creates/cleans up `LLMInferenceServiceConfig` objects. 
-## Markers +## Test Filtering -- `@pytest.mark.llminferenceservice(type="cpu")` - Resource type for selective test execution -- Use `pytest -m "llminferenceservice and cpu"` to run specific resource tests +Tests are marked with both general and cluster-specific capability markers: + +- `@pytest.mark.llminferenceservice` - All LLM inference service tests +- `@pytest.mark.cluster_cpu` - CPU-only tests +- `@pytest.mark.cluster_amd` - AMD GPU tests +- `@pytest.mark.cluster_nvidia` - NVIDIA GPU tests +- `@pytest.mark.cluster_intel` - Intel GPU tests + +Examples: +```bash +# Run all LLM inference service tests +pytest -m "llminferenceservice" test/e2e/llmisvc/ + +# Run only CPU tests +pytest -m "llminferenceservice and cluster_cpu" test/e2e/llmisvc/ + +# Run only NVIDIA GPU tests +pytest -m "llminferenceservice and cluster_nvidia" test/e2e/llmisvc/ + +# Run all GPU tests (any vendor) +pytest -m "llminferenceservice and (cluster_amd or cluster_nvidia or cluster_intel)" test/e2e/llmisvc/ + +# Run CPU and AMD GPU tests only +pytest -m "llminferenceservice and (cluster_cpu or cluster_amd)" test/e2e/llmisvc/ +``` ## Config Naming Convention @@ -22,7 +45,15 @@ Use prefixed categories that get composed together: - **`model-*`**: Model sources (e.g., `model-fb-opt-125m`, `model-gpt2`) - **`router-*`**: Routing configs (e.g., `router-managed`, `router-with-scheduler`) +Test IDs are generated by combining the cluster capability from pytest marks with all config names: +- Test ID format: `{cluster_capability}-{config1}-{config2}-{config3}` +- Example: `cluster_cpu-router-managed-workload-single-cpu-model-fb-opt-125m` + ## Adding New Configs 1. Add to `LLMINFERENCESERVICE_CONFIGS` in `test_configs.py` -2. Follow `category-descriptor` naming (prefix automatically stripped from test IDs) \ No newline at end of file +2. Follow `category-descriptor` naming (prefix automatically stripped from test IDs) +3. 
Add new cluster capability test cases using `pytest.param` with appropriate marks: + ```python + pytest.param(["router-managed", "workload-nvidia-a100-gpu", "model-llama-70b"], marks=pytest.mark.cluster_qualcomm), + ``` \ No newline at end of file diff --git a/test/e2e/llmisvc/conftest.py b/test/e2e/llmisvc/conftest.py index 8722251fc6f..cdbad281060 100644 --- a/test/e2e/llmisvc/conftest.py +++ b/test/e2e/llmisvc/conftest.py @@ -19,6 +19,28 @@ def config_names(request): return request.param +def pytest_collection_modifyitems(config, items): + for item in items: + # only touch parameterized tests + if not hasattr(item, "callspec"): + continue + + # if there's no [...] suffix (i.e. not parametrized), skip + if "[" not in item.nodeid: + continue + base, rest = item.nodeid.split("[", 1) + rest = rest.rstrip("]") + + cluster_marks = [ + m.name + for m in item.iter_markers() + if m.name.startswith("cluster_") + ] + if not cluster_marks: + continue + + new_id = "-".join(cluster_marks + [rest]) + item._nodeid = f"{base}[{new_id}]" def pytest_configure(config): config.addinivalue_line( diff --git a/test/e2e/llmisvc/pytest.ini b/test/e2e/llmisvc/pytest.ini index c1a0cbaa961..3c88282f9bc 100644 --- a/test/e2e/llmisvc/pytest.ini +++ b/test/e2e/llmisvc/pytest.ini @@ -1,4 +1,4 @@ -[tool:pytest] +[pytest] testpaths = . 
python_files = test_*.py python_functions = test_* @@ -10,5 +10,9 @@ addopts = --disable-warnings markers = llminferenceservice: LLM inference service tests + cluster.cpu: CPU tests + cluster.amd: AMD tests + cluster.intel: Intel tests + cluster.nvidia: NVIDIA tests asyncio: AsyncIO tests asyncio_mode = auto \ No newline at end of file diff --git a/test/e2e/llmisvc/test_configs.py b/test/e2e/llmisvc/test_configs.py index 87e6da0a89a..259ed056e78 100644 --- a/test/e2e/llmisvc/test_configs.py +++ b/test/e2e/llmisvc/test_configs.py @@ -58,17 +58,6 @@ }, } - -def generate_test_id(config_names): - """Generate a test ID from config names by removing prefixes.""" - parts = [] - for config in config_names: - if "-" in config: - parts.append(config.split("-", 1)[1]) # Remove first prefix- - else: - parts.append(config) - return "-".join(parts) - @pytest.fixture(scope="session") def llm_config_factory(): """Factory for creating/cleaning LLMInferenceServiceConfig once per session.""" @@ -120,6 +109,12 @@ def _create_configs(names, namespace=KSERVE_TEST_NAMESPACE): except Exception: pass +def generate_test_id(config_names): + """Generate a test ID from config names by removing prefixes.""" + parts = [] + for config in config_names: + parts.append(config) + return "-".join(parts) def create_llmisvc_config(kserve_client, llm_config, namespace=None): version = llm_config["apiVersion"].split("/")[1] diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index 9a3ddd2f06d..64f32a54bf0 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -31,18 +31,19 @@ KSERVE_PLURAL_LLMINFERENCESERVICE = "llminferenceservices" -@pytest.mark.llminferenceservice(type="cpu") +@pytest.mark.llminferenceservice @pytest.mark.asyncio(scope="session") @pytest.mark.parametrize( "config_names", [ - ["router-managed", "workload-single-cpu", "model-fb-opt-125m"], + 
pytest.param(["router-managed", "workload-single-cpu", "model-fb-opt-125m"], marks=pytest.mark.cluster_cpu), + pytest.param(["router-managed", "workload-amd-gpu", "model-fb-opt-125m"], marks=pytest.mark.cluster_amd), ], - indirect=True, + indirect=["config_names"], ids=generate_test_id, ) async def test_llm_inference_service(request, llm_config_factory, config_names): - created_config_names = llm_config_factory(config_names) + created_service_configs = llm_config_factory(config_names) service_name = generate_service_name(request.node.name, config_names) llm_isvc = V1alpha1LLMInferenceService( @@ -52,8 +53,8 @@ async def test_llm_inference_service(request, llm_config_factory, config_names): name=service_name, namespace=KSERVE_TEST_NAMESPACE ), spec={ - "replicas": 1, - "baseRefs": [{"name": config_name} for config_name in created_config_names], + + "baseRefs": [{"name": config_name} for config_name in created_service_configs], }, ) diff --git a/test/e2e/pytest.ini b/test/e2e/pytest.ini index 9aba5ff4e3e..92c33eb8a26 100644 --- a/test/e2e/pytest.ini +++ b/test/e2e/pytest.ini @@ -18,4 +18,8 @@ markers = llm: e2e tests for huggingface runtime vllm: e2e tests for huggingface runtime with vllm-openvino backend llminferenceservice: e2e tests for llm inference service controller - modelcache: e2e tests for model caching \ No newline at end of file + modelcache: e2e tests for model caching + cluster_cpu: test targeting cluster with CPU + cluster_amd: test targeting cluster with AMD + cluster_intel: test targeting cluster with Intel + cluster_nvidia: test targeting cluster with NVIDIA \ No newline at end of file From 2752ce177d747863eaa72a1e0a1227ba0c01fedc Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Thu, 24 Jul 2025 18:43:45 +0200 Subject: [PATCH 06/38] chore: excludes tests from flake8 Signed-off-by: Bartosz Majsak --- .flake8 | 1 + 1 file changed, 1 insertion(+) diff --git a/.flake8 b/.flake8 index 83642179983..1037518bd0e 100644 --- a/.flake8 +++ b/.flake8 @@ 
-16,3 +16,4 @@ exclude = python/*_pb2.py docs/**/*.py python/kserve/kserve/protocol/rest/openai/types/openapi.py + test/e2e/llmisvc/**.py From 20e58693f053eb2f30274269c54a3b94ac06106c Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Thu, 24 Jul 2025 18:58:57 +0200 Subject: [PATCH 07/38] chore: precommit fixes Signed-off-by: Bartosz Majsak ; Conflicts: ; test/e2e/llmisvc/test_configs.py Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/test_configs.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/test/e2e/llmisvc/test_configs.py b/test/e2e/llmisvc/test_configs.py index 259ed056e78..8b55eba58ed 100644 --- a/test/e2e/llmisvc/test_configs.py +++ b/test/e2e/llmisvc/test_configs.py @@ -75,16 +75,20 @@ def _create_configs(names, namespace=KSERVE_TEST_NAMESPACE): get_llmisvc_config(client, name, namespace) continue except Exception as e: - is_404_api = isinstance(e, ApiException) and getattr(e, "status", None) == 404 - is_404_runtime = isinstance(e, RuntimeError) and "not found" in str(e).lower() + is_404_api = ( + isinstance(e, ApiException) and getattr(e, "status", None) == 404 + ) + is_404_runtime = ( + isinstance(e, RuntimeError) and "not found" in str(e).lower() + ) if not (is_404_api or is_404_runtime): raise body = { "apiVersion": "serving.kserve.io/v1alpha1", - "kind": "LLMInferenceServiceConfig", - "metadata": {"name": name, "namespace": namespace}, - "spec": spec, + "kind": "LLMInferenceServiceConfig", + "metadata": {"name": name, "namespace": namespace}, + "spec": spec, } try: From 5aceb3bda68540213ceb2aca5bfb1ada6861c8b7 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Fri, 25 Jul 2025 09:46:05 +0200 Subject: [PATCH 08/38] chore: no need for local pytest.ini as its ignored anyway Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/pytest.ini | 18 ------------------ 1 file changed, 18 deletions(-) delete mode 100644 test/e2e/llmisvc/pytest.ini diff --git a/test/e2e/llmisvc/pytest.ini b/test/e2e/llmisvc/pytest.ini deleted 
file mode 100644 index 3c88282f9bc..00000000000 --- a/test/e2e/llmisvc/pytest.ini +++ /dev/null @@ -1,18 +0,0 @@ -[pytest] -testpaths = . -python_files = test_*.py -python_functions = test_* -python_classes = Test* -addopts = - -v - --tb=short - --strict-markers - --disable-warnings -markers = - llminferenceservice: LLM inference service tests - cluster.cpu: CPU tests - cluster.amd: AMD tests - cluster.intel: Intel tests - cluster.nvidia: NVIDIA tests - asyncio: AsyncIO tests -asyncio_mode = auto \ No newline at end of file From 5299d05b418099af930c4acb4acc5f3b7436a756 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Fri, 25 Jul 2025 09:52:15 +0200 Subject: [PATCH 09/38] lint: adds possibility to ignore unused warnings When test factories are imported from another pkg, the unused-import/redefinition checks yield false positives. #upstream Signed-off-by: Bartosz Majsak --- .flake8 | 3 +-- test/e2e/llmisvc/test_llm_inference_service.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.flake8 b/.flake8 index 1037518bd0e..84fed34e69c 100644 --- a/.flake8 +++ b/.flake8 @@ -1,6 +1,6 @@ [flake8] max-line-length = 120 -extend-ignore = E203,E501,E701, B017 +extend-ignore = E203,E501,E701,B017,F401,F811 exclude = .venv, venv, @@ -16,4 +16,3 @@ exclude = python/*_pb2.py docs/**/*.py python/kserve/kserve/protocol/rest/openai/types/openapi.py - test/e2e/llmisvc/**.py diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index 64f32a54bf0..d9182bdda8f 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -24,7 +24,7 @@ from .test_configs import ( LLMINFERENCESERVICE_CONFIGS, generate_test_id, - llm_config_factory, + llm_config_factory, # noqa: F401,F811 KSERVE_TEST_NAMESPACE, ) From 2b3a4c60c20b46a08e2a4268843f6a2d33bbdc84 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Fri, 25 Jul 2025 09:52:24 +0200 Subject: [PATCH 10/38] precommit fixes Signed-off-by: Bartosz Majsak ---
test/e2e/llmisvc/conftest.py | 6 +++--- test/e2e/llmisvc/test_configs.py | 3 +++ test/e2e/llmisvc/test_llm_inference_service.py | 15 +++++++++++---- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/test/e2e/llmisvc/conftest.py b/test/e2e/llmisvc/conftest.py index cdbad281060..a22fdb37b55 100644 --- a/test/e2e/llmisvc/conftest.py +++ b/test/e2e/llmisvc/conftest.py @@ -19,6 +19,7 @@ def config_names(request): return request.param + def pytest_collection_modifyitems(config, items): for item in items: # only touch parameterized tests @@ -32,9 +33,7 @@ def pytest_collection_modifyitems(config, items): rest = rest.rstrip("]") cluster_marks = [ - m.name - for m in item.iter_markers() - if m.name.startswith("cluster_") + m.name for m in item.iter_markers() if m.name.startswith("cluster_") ] if not cluster_marks: continue @@ -42,6 +41,7 @@ def pytest_collection_modifyitems(config, items): new_id = "-".join(cluster_marks + [rest]) item._nodeid = f"{base}[{new_id}]" + def pytest_configure(config): config.addinivalue_line( "markers", "llminferenceservice: mark test as an LLM inference service test" diff --git a/test/e2e/llmisvc/test_configs.py b/test/e2e/llmisvc/test_configs.py index 8b55eba58ed..774fb94047e 100644 --- a/test/e2e/llmisvc/test_configs.py +++ b/test/e2e/llmisvc/test_configs.py @@ -58,6 +58,7 @@ }, } + @pytest.fixture(scope="session") def llm_config_factory(): """Factory for creating/cleaning LLMInferenceServiceConfig once per session.""" @@ -113,6 +114,7 @@ def _create_configs(names, namespace=KSERVE_TEST_NAMESPACE): except Exception: pass + def generate_test_id(config_names): """Generate a test ID from config names by removing prefixes.""" parts = [] @@ -120,6 +122,7 @@ def generate_test_id(config_names): parts.append(config) return "-".join(parts) + def create_llmisvc_config(kserve_client, llm_config, namespace=None): version = llm_config["apiVersion"].split("/")[1] diff --git a/test/e2e/llmisvc/test_llm_inference_service.py 
b/test/e2e/llmisvc/test_llm_inference_service.py index d9182bdda8f..8dcb292062c 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -36,8 +36,14 @@ @pytest.mark.parametrize( "config_names", [ - pytest.param(["router-managed", "workload-single-cpu", "model-fb-opt-125m"], marks=pytest.mark.cluster_cpu), - pytest.param(["router-managed", "workload-amd-gpu", "model-fb-opt-125m"], marks=pytest.mark.cluster_amd), + pytest.param( + ["router-managed", "workload-single-cpu", "model-fb-opt-125m"], + marks=pytest.mark.cluster_cpu, + ), + pytest.param( + ["router-managed", "workload-amd-gpu", "model-fb-opt-125m"], + marks=pytest.mark.cluster_amd, + ), ], indirect=["config_names"], ids=generate_test_id, @@ -53,8 +59,9 @@ async def test_llm_inference_service(request, llm_config_factory, config_names): name=service_name, namespace=KSERVE_TEST_NAMESPACE ), spec={ - - "baseRefs": [{"name": config_name} for config_name in created_service_configs], + "baseRefs": [ + {"name": config_name} for config_name in created_service_configs + ], }, ) From 7b6f752b73d705ddcb6a9983c148906146601691 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Fri, 25 Jul 2025 10:00:20 +0200 Subject: [PATCH 11/38] Fail early on CRDs Co-authored-by: coderabbitai[bot] <136622811+coderabbitai[bot]@users.noreply.github.com> Signed-off-by: Bartosz Majsak --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 5aa8fca8d9c..4b3efdc226d 100644 --- a/Makefile +++ b/Makefile @@ -253,7 +253,7 @@ deploy-dev-llm: ./hack/deploy_dev_llm.sh deploy-ci: manifests - kubectl apply --server-side=true -k config/crd || true + kubectl apply --server-side=true -k config/crd kubectl wait --for=condition=established --timeout=60s crd/llminferenceserviceconfigs.serving.kserve.io kubectl apply --server-side=true -k config/overlays/test # TODO: Add runtimes as part of default deployment From 
1b0747198de8e042df0c82d61c725ea7341b6f7c Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Fri, 25 Jul 2025 10:03:51 +0200 Subject: [PATCH 12/38] chore: test/e2e/llmisvc/README.md Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/llmisvc/README.md b/test/e2e/llmisvc/README.md index ba0efc76ca2..d9c7f8ebddb 100644 --- a/test/e2e/llmisvc/README.md +++ b/test/e2e/llmisvc/README.md @@ -55,5 +55,5 @@ Test IDs are generated by combining the cluster capability from pytest marks wit 2. Follow `category-descriptor` naming (prefix automatically stripped from test IDs) 3. Add new cluster capability test cases using `pytest.param` with appropriate marks: ```python - pytest.param(["router-managed", "workload-nvidia-a100-gpu", "model-llama-70b"], marks=pytest.mark.cluster_qualcomm), + pytest.param(["router-managed", "workload-nvidia-a100-gpu", "model-llama-70b"], marks=pytest.mark.cluster_nvidia), ``` \ No newline at end of file From 1121cd42f782bf299a0956d850dc1fba67acefd9 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Fri, 25 Jul 2025 19:54:22 +0200 Subject: [PATCH 13/38] chore: simplifies test fixtures Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/conftest.py | 9 +- .../llmisvc/{test_configs.py => fixtures.py} | 72 ++++++-- .../e2e/llmisvc/test_llm_inference_service.py | 172 ++++++++---------- 3 files changed, 133 insertions(+), 120 deletions(-) rename test/e2e/llmisvc/{test_configs.py => fixtures.py} (74%) diff --git a/test/e2e/llmisvc/conftest.py b/test/e2e/llmisvc/conftest.py index a22fdb37b55..ad950aedf93 100644 --- a/test/e2e/llmisvc/conftest.py +++ b/test/e2e/llmisvc/conftest.py @@ -12,14 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import pytest - - -@pytest.fixture -def config_names(request): - return request.param - +# This hook is used to ensure that the test names are unique and to ensure that +# the test names are consistent with the cluster marks. def pytest_collection_modifyitems(config, items): for item in items: # only touch parameterized tests diff --git a/test/e2e/llmisvc/test_configs.py b/test/e2e/llmisvc/fixtures.py similarity index 74% rename from test/e2e/llmisvc/test_configs.py rename to test/e2e/llmisvc/fixtures.py index 774fb94047e..c8f0a3f8886 100644 --- a/test/e2e/llmisvc/test_configs.py +++ b/test/e2e/llmisvc/fixtures.py @@ -12,11 +12,13 @@ # See the License for the specific language governing permissions and # limitations under the License. +import uuid import os import pytest +from typing import List from kubernetes import client from kubernetes.client.rest import ApiException -from kserve import KServeClient, constants +from kserve import KServeClient, constants, V1alpha1LLMInferenceService KSERVE_PLURAL_LLMINFERENCESERVICECONFIG = "llminferenceserviceconfigs" KSERVE_TEST_NAMESPACE = "kserve-ci-e2e-test" @@ -51,7 +53,7 @@ }, "router-with-scheduler": { "router": { - "scheduler": {"pool": {}, "template": {}}, + "scheduler": {}, "route": {}, "gateway": {}, }, @@ -59,14 +61,36 @@ } -@pytest.fixture(scope="session") +@pytest.fixture(scope="function") +def test_case(request): + tc = request.param + + service_name = generate_service_name(request.node.name, tc.base_refs) + tc.model_name = get_model_name_from_configs(tc.base_refs) + + tc.llm_service = V1alpha1LLMInferenceService( + api_version="serving.kserve.io/v1alpha1", + kind="LLMInferenceService", + metadata=client.V1ObjectMeta( + name=service_name, namespace=KSERVE_TEST_NAMESPACE + ), + spec={ + "baseRefs": [ + {"name": base_ref} for base_ref in tc.base_refs + ], + }, + ) + + return tc + +@pytest.fixture(scope="session", autouse=True) def llm_config_factory(): """Factory for creating/cleaning LLMInferenceServiceConfig 
once per session.""" created = [] client = KServeClient(config_file=os.environ.get("KUBECONFIG", "~/.kube/config")) - def _create_configs(names, namespace=KSERVE_TEST_NAMESPACE): - for name in names: + def _create_configs(namespace=KSERVE_TEST_NAMESPACE): + for name in LLMINFERENCESERVICE_CONFIGS: if name not in LLMINFERENCESERVICE_CONFIGS: raise ValueError(f"Unknown config name: {name}") @@ -103,9 +127,8 @@ def _create_configs(names, namespace=KSERVE_TEST_NAMESPACE): # otherwise, real error raise - return names - yield _create_configs + yield _create_configs() # teardown: best‑effort cleanup for name, namespace in created: @@ -114,13 +137,34 @@ def _create_configs(names, namespace=KSERVE_TEST_NAMESPACE): except Exception: pass - -def generate_test_id(config_names): - """Generate a test ID from config names by removing prefixes.""" - parts = [] - for config in config_names: - parts.append(config) - return "-".join(parts) +def get_model_name_from_configs(config_names): + """Extract model name from model config.""" + for config_name in config_names: + if config_name.startswith("model-"): + config = LLMINFERENCESERVICE_CONFIGS[config_name] + if "model" in config and "name" in config["model"]: + return config["model"]["name"] + return "default-model" + +def generate_service_name(test_name: str, base_refs: List[str]) -> str: + base_name = test_name.split("[", 1)[0] + base_name = base_name.replace("test_", "") + base_name = base_name.replace("_", "-") + config_suffix = "-".join(sorted(base_refs)) + test_case = f"{base_name}-{config_suffix}".lower() + + uid = uuid.uuid4().hex[:8] + + max_total = 63 + sep = "-" + max_test_case = max_total - len(sep) - len(uid) + test_case = test_case[:max_test_case].rstrip(sep) + + return f"{test_case}{sep}{uid}" + +def generate_test_id(test_case) -> str: + """Generate a test ID from base refs.""" + return "-".join(test_case.base_refs) def create_llmisvc_config(kserve_client, llm_config, namespace=None): diff --git 
a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index 8dcb292062c..b60d09531d9 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -15,92 +15,98 @@ import json import os import time +from dataclasses import dataclass +from typing import Any, Callable, List, Optional import pytest import requests from kubernetes import client from kserve import KServeClient, V1alpha1LLMInferenceService, constants -from .test_configs import ( +from .fixtures import ( LLMINFERENCESERVICE_CONFIGS, generate_test_id, llm_config_factory, # noqa: F401,F811 + test_case, # noqa: F401,F811 KSERVE_TEST_NAMESPACE, ) KSERVE_PLURAL_LLMINFERENCESERVICE = "llminferenceservices" +def assert_200(response: requests.Response) -> None: + """Default response assertion that checks for 200 status code.""" + assert ( + response.status_code == 200 + ), f"Service returned {response.status_code}: {response.text}" + + +@dataclass +class Case: + """Test case configuration for LLM inference service tests.""" + base_refs: List[str] + prompt: str = "Boston is a" + max_tokens: int = 1 + response_assertion: Callable[[requests.Response], None] = assert_200 + llm_service: V1alpha1LLMInferenceService = None # Generated by llm_service_factory + model_name: str = "default/model" + + @pytest.mark.llminferenceservice -@pytest.mark.asyncio(scope="session") +@pytest.mark.asyncio(loop_scope="session") @pytest.mark.parametrize( - "config_names", + "test_case", [ pytest.param( - ["router-managed", "workload-single-cpu", "model-fb-opt-125m"], + Case(["router-managed", "workload-single-cpu", "model-fb-opt-125m"]), marks=pytest.mark.cluster_cpu, ), pytest.param( - ["router-managed", "workload-amd-gpu", "model-fb-opt-125m"], - marks=pytest.mark.cluster_amd, + Case( + base_refs=["router-managed", "workload-single-cpu", "model-fb-opt-125m"], + prompt="What is the capital of France?", + response_assertion=lambda response: ( 
+ response.status_code == 200 + and response.json().get("choices") is not None + and len(response.json().get("choices", [])) > 0 + ), + ), + marks=pytest.mark.cluster_cpu, ), ], - indirect=["config_names"], + indirect=["test_case"], ids=generate_test_id, ) -async def test_llm_inference_service(request, llm_config_factory, config_names): - created_service_configs = llm_config_factory(config_names) - service_name = generate_service_name(request.node.name, config_names) - - llm_isvc = V1alpha1LLMInferenceService( - api_version="serving.kserve.io/v1alpha1", - kind="LLMInferenceService", - metadata=client.V1ObjectMeta( - name=service_name, namespace=KSERVE_TEST_NAMESPACE - ), - spec={ - "baseRefs": [ - {"name": config_name} for config_name in created_service_configs - ], - }, - ) - +def test_llm_inference_service(test_case: Case): + kserve_client = KServeClient( config_file=os.environ.get("KUBECONFIG", "~/.kube/config") ) + service_name = test_case.llm_service.metadata.name + try: - create_llmisvc(kserve_client, llm_isvc) - wait_for_model_response( - kserve_client, - service_name, - KSERVE_TEST_NAMESPACE, - model_name=get_model_name_from_configs(config_names), - ) + create_llmisvc(kserve_client, test_case.llm_service) + wait_for_model_response(kserve_client, test_case) except Exception as e: print(f"ERROR: Failed to call llm inference service {service_name}: {e}") - collect_diagnostics(service_name, KSERVE_TEST_NAMESPACE) + collect_diagnostics(kserve_client, test_case.llm_service) raise finally: try: - delete_llmisvc(kserve_client, service_name, KSERVE_TEST_NAMESPACE) + delete_llmisvc(kserve_client, test_case.llm_service) except Exception as e: print(f"Warning: Failed to cleanup service {service_name}: {e}") -def create_llmisvc(kserve_client, llm_isvc, namespace=None): +def create_llmisvc(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceService): from kserve.utils import utils - version = llm_isvc.api_version.split("/")[1] - - if namespace is None: - namespace 
= utils.get_isvc_namespace(llm_isvc) - try: outputs = kserve_client.api_instance.create_namespaced_custom_object( constants.KSERVE_GROUP, - version, - namespace, + llm_isvc.api_version.split("/")[1], + llm_isvc.metadata.namespace, KSERVE_PLURAL_LLMINFERENCESERVICE, llm_isvc, ) @@ -112,16 +118,14 @@ def create_llmisvc(kserve_client, llm_isvc, namespace=None): ) from e -def delete_llmisvc( - kserve_client, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION -): +def delete_llmisvc(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceService): try: return kserve_client.api_instance.delete_namespaced_custom_object( constants.KSERVE_GROUP, - version, - namespace, + llm_isvc.api_version.split("/")[1], + llm_isvc.metadata.namespace, KSERVE_PLURAL_LLMINFERENCESERVICE, - name, + llm_isvc.metadata.name, ) except client.rest.ApiException as e: raise RuntimeError( @@ -130,9 +134,7 @@ def delete_llmisvc( ) from e -def get_llmisvc( - kserve_client, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION -): +def get_llmisvc(kserve_client: KServeClient, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION): try: return kserve_client.api_instance.get_namespaced_custom_object( constants.KSERVE_GROUP, @@ -149,55 +151,48 @@ def get_llmisvc( def wait_for_model_response( - kserve_client, - name, - namespace, - timeout_seconds=600, - version=constants.KSERVE_V1ALPHA1_VERSION, - model_name=None, -): - if model_name is None: - model_name = "default-model" - + kserve_client: KServeClient, + test_case: Case, + timeout_seconds: int = 300, # TODO Make it configurable in Case +) -> str: + service_url = None def assert_model_responds(): nonlocal service_url try: - service_url = get_llm_service_url(kserve_client, name, namespace, version) + service_url = get_llm_service_url(kserve_client, test_case.llm_service) except Exception as e: raise AssertionError(f"Failed to get service URL: {e}") from e completion_url = f"{service_url}/v1/completions" - test_payload = {"model": 
model_name, "prompt": "test", "max_tokens": 1} - + test_payload = {"model": test_case.model_name, "prompt": test_case.prompt, "max_tokens": test_case.max_tokens} + print(f"Calling LLM service at {completion_url} with payload {test_payload}") try: response = requests.post( completion_url, headers={"Content-Type": "application/json"}, json=test_payload, - timeout=30, + timeout=30, # TODO Make it configurable ) except Exception as e: raise AssertionError(f"Failed to call model: {e}") from e - assert ( - response.status_code == 200 - ), f"Service returned {response.status_code}: {response.text}" + test_case.response_assertion(response) return service_url return wait_for(assert_model_responds, timeout=timeout_seconds, interval=10.0) -def get_llm_service_url( - kserve_client, service_name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION -): +def get_llm_service_url(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceService): + service_name = llm_isvc.metadata.name + try: - llm_isvc = get_llmisvc(kserve_client, service_name, namespace, version) + llm_isvc = get_llmisvc(kserve_client, llm_isvc.metadata.name, llm_isvc.metadata.namespace, llm_isvc.api_version.split("/")[1]) if "status" not in llm_isvc: - raise ValueError(f"No status found in LLM inference service {service_name}") + raise ValueError(f"No status found in LLM inference service {service_name} status: {llm_isvc}") status = llm_isvc["status"] @@ -221,7 +216,8 @@ def get_llm_service_url( ) from e -def wait_for(assertion_fn, timeout: float = 5.0, interval: float = 0.1): +def wait_for(assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1) -> Any: + """Wait for assertion function to succeed within timeout.""" deadline = time.time() + timeout while True: try: @@ -231,33 +227,11 @@ def wait_for(assertion_fn, timeout: float = 5.0, interval: float = 0.1): raise time.sleep(interval) - -def get_model_name_from_configs(config_names): - """Extract model name from model config.""" - for 
config_name in config_names: - if config_name.startswith("model-"): - config = LLMINFERENCESERVICE_CONFIGS[config_name] - if "model" in config and "name" in config["model"]: - return config["model"]["name"] - return "default-model" - - -def generate_service_name(test_name, config_names): - base_name = test_name.split("[")[0] # Remove everything after [ - base_name = base_name.replace("test_", "").replace("_", "-") - config_suffix = "-".join(sorted(config_names)) - service_name = f"{base_name}-{config_suffix}" - service_name = service_name.lower() - service_name = service_name[:63].rstrip("-") - return service_name - - -def collect_diagnostics(service_name, namespace): +def collect_diagnostics(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceService): try: - kserve_client = KServeClient( - config_file=os.environ.get("KUBECONFIG", "~/.kube/config") - ) - + + service_name = llm_isvc.metadata.name + namespace = llm_isvc.metadata.namespace print(f"\n{'='*60}") print(f"DIAGNOSTIC INFORMATION FOR {service_name} in {namespace}") print(f"{'='*60}") From 12664771c68deccfcb3c62fc7f58212768458fdd Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Fri, 25 Jul 2025 20:26:40 +0200 Subject: [PATCH 14/38] fix: adjusts gh action to run on cpu cluster Signed-off-by: Bartosz Majsak --- .github/workflows/e2e-test-llmisvc.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/e2e-test-llmisvc.yml b/.github/workflows/e2e-test-llmisvc.yml index 65a82d93279..d7a6c43fc2e 100644 --- a/.github/workflows/e2e-test-llmisvc.yml +++ b/.github/workflows/e2e-test-llmisvc.yml @@ -96,9 +96,9 @@ jobs: - name: Run E2E tests timeout-minutes: 30 run: | - # Run only CPU tests for now using pytest markers + # Run only CPU tests for now using pytest markers (cluster_) # Available GPU vendors: amd, nvidia, intel - ./test/scripts/gh-actions/run-e2e-tests.sh "llminferenceservice and cpu" 1 "istio-gatewayapi-ext" + ./test/scripts/gh-actions/run-e2e-tests.sh 
"llminferenceservice and cluster_cpu" 2 "istio-gatewayapi-ext" - name: Check system status if: always() From f270cfa8e23ab66ab8df887b266fa3b726547701 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Fri, 25 Jul 2025 20:44:07 +0200 Subject: [PATCH 15/38] feat: adds logging decorator Signed-off-by: Bartosz Majsak --- test/__init__.py | 0 test/e2e/llmisvc/logging.py | 45 +++++++++++++++++++ .../e2e/llmisvc/test_llm_inference_service.py | 35 ++++++++------- 3 files changed, 64 insertions(+), 16 deletions(-) create mode 100644 test/__init__.py create mode 100644 test/e2e/llmisvc/logging.py diff --git a/test/__init__.py b/test/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/test/e2e/llmisvc/logging.py b/test/e2e/llmisvc/logging.py new file mode 100644 index 00000000000..7da44364fa2 --- /dev/null +++ b/test/e2e/llmisvc/logging.py @@ -0,0 +1,45 @@ +# Copyright 2025 The KServe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import functools +import logging +import time +from datetime import datetime + +# Set up logging +logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logger = logging.getLogger(__name__) + +def log_execution(func): + """Decorator to log function start/end with timestamps and duration.""" + @functools.wraps(func) + def wrapper(*args, **kwargs): + func_name = func.__name__ + + timestamp_start = datetime.now().isoformat() + logger.info(f"[{func_name}] [{timestamp_start}] start") + start_time = time.time() + + try: + result = func(*args, **kwargs) + duration = time.time() - start_time + timestamp_end = datetime.now().isoformat() + logger.info(f"[{func_name}] [{timestamp_end}] end - SUCCESS in {duration:.3f}s") + return result + except Exception as e: + duration = time.time() - start_time + timestamp_end = datetime.now().isoformat() + logger.error(f"[{func_name}] [{timestamp_end}] end - FAILED in {duration:.3f}s: {e}") + raise + return wrapper diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index b60d09531d9..826b1e4181b 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -16,7 +16,7 @@ import os import time from dataclasses import dataclass -from typing import Any, Callable, List, Optional +from typing import Any, Callable, List import pytest import requests @@ -24,12 +24,10 @@ from kserve import KServeClient, V1alpha1LLMInferenceService, constants from .fixtures import ( - LLMINFERENCESERVICE_CONFIGS, generate_test_id, - llm_config_factory, # noqa: F401,F811 test_case, # noqa: F401,F811 - KSERVE_TEST_NAMESPACE, ) +from .logging import log_execution KSERVE_PLURAL_LLMINFERENCESERVICE = "llminferenceservices" @@ -61,22 +59,23 @@ class Case: Case(["router-managed", "workload-single-cpu", "model-fb-opt-125m"]), marks=pytest.mark.cluster_cpu, ), - pytest.param( - Case( - base_refs=["router-managed", 
"workload-single-cpu", "model-fb-opt-125m"], - prompt="What is the capital of France?", - response_assertion=lambda response: ( - response.status_code == 200 - and response.json().get("choices") is not None - and len(response.json().get("choices", [])) > 0 - ), - ), - marks=pytest.mark.cluster_cpu, - ), + # pytest.param( + # Case( + # base_refs=["router-managed", "workload-single-cpu", "model-fb-opt-125m"], + # prompt="What is the capital of France?", + # response_assertion=lambda response: ( + # response.status_code == 200 + # and response.json().get("choices") is not None + # and len(response.json().get("choices", [])) > 0 + # ), + # ), + # marks=pytest.mark.cluster_cpu, + # ), ], indirect=["test_case"], ids=generate_test_id, ) +@log_execution def test_llm_inference_service(test_case: Case): kserve_client = KServeClient( @@ -99,6 +98,7 @@ def test_llm_inference_service(test_case: Case): print(f"Warning: Failed to cleanup service {service_name}: {e}") +@log_execution def create_llmisvc(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceService): from kserve.utils import utils @@ -118,6 +118,7 @@ def create_llmisvc(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceSe ) from e +@log_execution def delete_llmisvc(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceService): try: return kserve_client.api_instance.delete_namespaced_custom_object( @@ -134,6 +135,7 @@ def delete_llmisvc(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceSe ) from e +@log_execution def get_llmisvc(kserve_client: KServeClient, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION): try: return kserve_client.api_instance.get_namespaced_custom_object( @@ -150,6 +152,7 @@ def get_llmisvc(kserve_client: KServeClient, name, namespace, version=constants. 
) from e +@log_execution def wait_for_model_response( kserve_client: KServeClient, test_case: Case, From ba5834562440b6334f2d890b3f6c02d2d3173768 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Sat, 26 Jul 2025 00:25:33 +0200 Subject: [PATCH 16/38] feat: logging and cr dump Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/README.md | 31 ++- test/e2e/llmisvc/diagnostic.py | 100 ++++++++++ test/e2e/llmisvc/fixtures.py | 15 +- test/e2e/llmisvc/logging.py | 18 +- .../e2e/llmisvc/test_llm_inference_service.py | 182 +++++++++++------- 5 files changed, 257 insertions(+), 89 deletions(-) create mode 100644 test/e2e/llmisvc/diagnostic.py diff --git a/test/e2e/llmisvc/README.md b/test/e2e/llmisvc/README.md index d9c7f8ebddb..c08b3c5fcef 100644 --- a/test/e2e/llmisvc/README.md +++ b/test/e2e/llmisvc/README.md @@ -4,7 +4,10 @@ Tests combine config fragments from different categories to create complete scenarios: ```python -pytest.param(["router-managed", "workload-single-cpu", "model-fb-opt-125m"], marks=pytest.mark.cluster_cpu) +pytest.param( + TestCase(["router-managed", "workload-single-cpu", "model-fb-opt-125m"]), + marks=pytest.mark.cluster_cpu, +) ``` The `llm_config_factory` fixture automatically creates/cleans up `LLMInferenceServiceConfig` objects. @@ -51,9 +54,29 @@ Test IDs are generated by combining the cluster capability from pytest marks wit ## Adding New Configs -1. Add to `LLMINFERENCESERVICE_CONFIGS` in `test_configs.py` +1. Add to `LLMINFERENCESERVICE_CONFIGS` in `fixtures.py` 2. Follow `category-descriptor` naming (prefix automatically stripped from test IDs) 3. 
Add new cluster capability test cases using `pytest.param` with appropriate marks: ```python - pytest.param(["router-managed", "workload-nvidia-a100-gpu", "model-llama-70b"], marks=pytest.mark.cluster_nvidia), - ``` \ No newline at end of file + pytest.param( + TestCase(["router-managed", "workload-nvidia-a100-gpu", "model-llama-70b"]), + marks=pytest.mark.cluster_nvidia, + ), + ``` + + You can also customize test behavior with additional LlmDTestCase parameters: + ```python + pytest.param( + TestCase( + base_refs=["router-managed", "workload-single-cpu", "model-fb-opt-125m"], + prompt="What is the capital of France?", + max_tokens=50, + response_assertion=lambda response: ( + response.status_code == 200 + and response.json().get("choices") is not None + and len(response.json().get("choices", [])) > 0 + ), + ), + marks=pytest.mark.cluster_cpu, + ), + ``` diff --git a/test/e2e/llmisvc/diagnostic.py b/test/e2e/llmisvc/diagnostic.py new file mode 100644 index 00000000000..47fa19392c2 --- /dev/null +++ b/test/e2e/llmisvc/diagnostic.py @@ -0,0 +1,100 @@ +# Copyright 2025 The KServe Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import itertools +from datetime import datetime +import pytest +from kubernetes import client, config, dynamic +from kubernetes.client import api_client +from kubernetes.client.exceptions import ApiException +from kserve import KServeClient, V1alpha1LLMInferenceService, constants + + +def print_all_events_table(namespace: str, max_events: int = 50): + """ + Print the most recent `max_events` events in `namespace` as a nice table. + """ + core = client.CoreV1Api() + + try: + evs = core.list_namespaced_event(namespace=namespace).items + + if not evs: + print("ℹ️ # No events found in namespace", namespace) + return + + evs = sorted( + evs, key=lambda e: e.last_timestamp or e.first_timestamp, reverse=True + )[:max_events] + + # print header + header = f"{'TIME':<25} {'NAMESPACE':<12} {'SOURCE':<20} {'TYPE':<8} {'REASON':<20} MESSAGE" + print(header) + print("-" * len(header)) + + for ev in evs: + ts = ev.last_timestamp or ev.first_timestamp + ts_str = ( + ts.strftime("%Y-%m-%d %H:%M:%S") + if isinstance(ts, datetime) + else str(ts) + ) + src = f"{ev.source.component or ''}/{ev.source.host or ''}".strip("/") + msg = (ev.message or "").replace("\n", " ") + print( + f"{ts_str:<25} {ev.metadata.namespace:<12} {src:<20} {ev.type or '':<8} " + f"{ev.reason or '':<20} {msg}" + ) + + except Exception as e: + print(f"# ❌ failed to list events: {e}") + + +def kinds_matching_by_labels(namespace: str, labels, api_kinds): + """ + List all namespaced objects in `namespace` matching `labels` + whose kind is in `api_kinds`. 
+ + :param namespace: kube namespace to search + :param labels: either a dict of {k: v} or a raw selector string + :param api_kinds: an iterable of Resource.kind strings to include + :return: list of Unstructured objects + """ + config.load_kube_config() + dyn = dynamic.DynamicClient(api_client.ApiClient()) + + selector = ( + ",".join(f"{k}={v}" for k, v in labels.items()) + if isinstance(labels, dict) + else labels + ) + + all_resources = itertools.chain.from_iterable(dyn.resources) + + found = [] + for rsrc in all_resources: + if not rsrc.namespaced or "list" not in rsrc.verbs: + continue + if rsrc.kind not in api_kinds: + continue + + try: + resp = rsrc.get(namespace=namespace, label_selector=selector) + except ApiException: + continue + + items = getattr(resp, "items", []) + found.extend(items) + + return found diff --git a/test/e2e/llmisvc/fixtures.py b/test/e2e/llmisvc/fixtures.py index c8f0a3f8886..09892cdbb62 100644 --- a/test/e2e/llmisvc/fixtures.py +++ b/test/e2e/llmisvc/fixtures.py @@ -64,10 +64,12 @@ @pytest.fixture(scope="function") def test_case(request): tc = request.param - + service_name = generate_service_name(request.node.name, tc.base_refs) tc.model_name = get_model_name_from_configs(tc.base_refs) + # TODO fail early if base_refs does not exist (e.g. mistyped)? 
+ tc.llm_service = V1alpha1LLMInferenceService( api_version="serving.kserve.io/v1alpha1", kind="LLMInferenceService", @@ -75,14 +77,13 @@ def test_case(request): name=service_name, namespace=KSERVE_TEST_NAMESPACE ), spec={ - "baseRefs": [ - {"name": base_ref} for base_ref in tc.base_refs - ], + "baseRefs": [{"name": base_ref} for base_ref in tc.base_refs], }, ) - + return tc + @pytest.fixture(scope="session", autouse=True) def llm_config_factory(): """Factory for creating/cleaning LLMInferenceServiceConfig once per session.""" @@ -127,7 +128,6 @@ def _create_configs(namespace=KSERVE_TEST_NAMESPACE): # otherwise, real error raise - yield _create_configs() # teardown: best‑effort cleanup @@ -137,6 +137,7 @@ def _create_configs(namespace=KSERVE_TEST_NAMESPACE): except Exception: pass + def get_model_name_from_configs(config_names): """Extract model name from model config.""" for config_name in config_names: @@ -146,6 +147,7 @@ def get_model_name_from_configs(config_names): return config["model"]["name"] return "default-model" + def generate_service_name(test_name: str, base_refs: List[str]) -> str: base_name = test_name.split("[", 1)[0] base_name = base_name.replace("test_", "") @@ -162,6 +164,7 @@ def generate_service_name(test_name: str, base_refs: List[str]) -> str: return f"{test_case}{sep}{uid}" + def generate_test_id(test_case) -> str: """Generate a test ID from base refs.""" return "-".join(test_case.base_refs) diff --git a/test/e2e/llmisvc/logging.py b/test/e2e/llmisvc/logging.py index 7da44364fa2..31f6822113e 100644 --- a/test/e2e/llmisvc/logging.py +++ b/test/e2e/llmisvc/logging.py @@ -17,29 +17,35 @@ import time from datetime import datetime -# Set up logging -logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s') +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) logger = logging.getLogger(__name__) + def log_execution(func): """Decorator to log function start/end with 
timestamps and duration.""" + @functools.wraps(func) def wrapper(*args, **kwargs): func_name = func.__name__ - + timestamp_start = datetime.now().isoformat() logger.info(f"[{func_name}] [{timestamp_start}] start") start_time = time.time() - + try: result = func(*args, **kwargs) duration = time.time() - start_time timestamp_end = datetime.now().isoformat() - logger.info(f"[{func_name}] [{timestamp_end}] end - SUCCESS in {duration:.3f}s") + logger.info(f"[{func_name}] [{timestamp_end}] end - βœ… in {duration:.3f}s") return result except Exception as e: duration = time.time() - start_time timestamp_end = datetime.now().isoformat() - logger.error(f"[{func_name}] [{timestamp_end}] end - FAILED in {duration:.3f}s: {e}") + logger.error( + f"[{func_name}] [{timestamp_end}] end - ❌ {duration:.3f}s: {e}" + ) raise + return wrapper diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index 826b1e4181b..67d21e92b6d 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -12,17 +12,19 @@ # See the License for the specific language governing permissions and # limitations under the License. -import json import os import time from dataclasses import dataclass from typing import Any, Callable, List - import pytest import requests from kubernetes import client +import yaml from kserve import KServeClient, V1alpha1LLMInferenceService, constants - +from .diagnostic import ( + print_all_events_table, + kinds_matching_by_labels, +) from .fixtures import ( generate_test_id, test_case, # noqa: F401,F811 @@ -40,14 +42,17 @@ def assert_200(response: requests.Response) -> None: @dataclass -class Case: +class TestCase: + __test__ = False # So pytest will not try to execute it. 
"""Test case configuration for LLM inference service tests.""" base_refs: List[str] - prompt: str = "Boston is a" + prompt: str = "KServe is a" max_tokens: int = 1 response_assertion: Callable[[requests.Response], None] = assert_200 - llm_service: V1alpha1LLMInferenceService = None # Generated by llm_service_factory - model_name: str = "default/model" + wait_timeout: int = 300 + # Factory provided + llm_service: V1alpha1LLMInferenceService = None # Generated by llm_service_factory + model_name: str = "default/model" # This will be generated by the factory @pytest.mark.llminferenceservice @@ -56,15 +61,16 @@ class Case: "test_case", [ pytest.param( - Case(["router-managed", "workload-single-cpu", "model-fb-opt-125m"]), + TestCase(["router-managed", "workload-single-cpu", "model-fb-opt-125m"]), marks=pytest.mark.cluster_cpu, ), + # Example test case # pytest.param( # Case( # base_refs=["router-managed", "workload-single-cpu", "model-fb-opt-125m"], # prompt="What is the capital of France?", # response_assertion=lambda response: ( - # response.status_code == 200 + # response.status_code == 200 # and response.json().get("choices") is not None # and len(response.json().get("choices", [])) > 0 # ), @@ -76,26 +82,27 @@ class Case: ids=generate_test_id, ) @log_execution -def test_llm_inference_service(test_case: Case): - +def test_llm_inference_service(test_case: TestCase): + kserve_client = KServeClient( config_file=os.environ.get("KUBECONFIG", "~/.kube/config") ) service_name = test_case.llm_service.metadata.name - + try: create_llmisvc(kserve_client, test_case.llm_service) - wait_for_model_response(kserve_client, test_case) + wait_for_model_response(kserve_client, test_case, test_case.wait_timeout) + print(f"πŸŽ‰ Test completed successfully for service {service_name}") except Exception as e: - print(f"ERROR: Failed to call llm inference service {service_name}: {e}") + print(f"❌ ERROR: Failed to call llm inference service {service_name}: {e}") 
collect_diagnostics(kserve_client, test_case.llm_service) raise finally: try: delete_llmisvc(kserve_client, test_case.llm_service) except Exception as e: - print(f"Warning: Failed to cleanup service {service_name}: {e}") + print(f"⚠️ Warning: Failed to cleanup service {service_name}: {e}") @log_execution @@ -110,10 +117,11 @@ def create_llmisvc(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceSe KSERVE_PLURAL_LLMINFERENCESERVICE, llm_isvc, ) + print(f"βœ… LLM inference service {llm_isvc.metadata.name} created successfully") return outputs except client.rest.ApiException as e: raise RuntimeError( - f"Exception when calling CustomObjectsApi->" + f"❌ Exception when calling CustomObjectsApi->" f"create_namespaced_custom_object for LLMInferenceService: {e}" ) from e @@ -121,22 +129,29 @@ def create_llmisvc(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceSe @log_execution def delete_llmisvc(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceService): try: - return kserve_client.api_instance.delete_namespaced_custom_object( + result = kserve_client.api_instance.delete_namespaced_custom_object( constants.KSERVE_GROUP, llm_isvc.api_version.split("/")[1], llm_isvc.metadata.namespace, KSERVE_PLURAL_LLMINFERENCESERVICE, llm_isvc.metadata.name, ) + print(f"βœ… LLM inference service {llm_isvc.metadata.name} deleted successfully") + return result except client.rest.ApiException as e: raise RuntimeError( - f"Exception when calling CustomObjectsApi->" + f"❌ Exception when calling CustomObjectsApi->" f"delete_namespaced_custom_object for LLMInferenceService: {e}" ) from e @log_execution -def get_llmisvc(kserve_client: KServeClient, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION): +def get_llmisvc( + kserve_client: KServeClient, + name, + namespace, + version=constants.KSERVE_V1ALPHA1_VERSION, +): try: return kserve_client.api_instance.get_namespaced_custom_object( constants.KSERVE_GROUP, @@ -147,7 +162,7 @@ def get_llmisvc(kserve_client: 
KServeClient, name, namespace, version=constants. ) except client.rest.ApiException as e: raise RuntimeError( - f"Exception when calling CustomObjectsApi->" + f"❌ Exception when calling CustomObjectsApi->" f"get_namespaced_custom_object for LLMInferenceService: {e}" ) from e @@ -155,10 +170,10 @@ def get_llmisvc(kserve_client: KServeClient, name, namespace, version=constants. @log_execution def wait_for_model_response( kserve_client: KServeClient, - test_case: Case, - timeout_seconds: int = 300, # TODO Make it configurable in Case + test_case: TestCase, + timeout_seconds: int = 600, ) -> str: - + service_url = None def assert_model_responds(): @@ -167,39 +182,54 @@ def assert_model_responds(): try: service_url = get_llm_service_url(kserve_client, test_case.llm_service) except Exception as e: - raise AssertionError(f"Failed to get service URL: {e}") from e + raise AssertionError(f"❌ Failed to get service URL: {e}") from e completion_url = f"{service_url}/v1/completions" - test_payload = {"model": test_case.model_name, "prompt": test_case.prompt, "max_tokens": test_case.max_tokens} - print(f"Calling LLM service at {completion_url} with payload {test_payload}") + test_payload = { + "model": test_case.model_name, + "prompt": test_case.prompt, + "max_tokens": test_case.max_tokens, + } + print(f"πŸ“ž Calling LLM service at {completion_url} with payload {test_payload}") try: response = requests.post( completion_url, headers={"Content-Type": "application/json"}, json=test_payload, - timeout=30, # TODO Make it configurable + timeout=30, # TODO Make it configurable ) except Exception as e: - raise AssertionError(f"Failed to call model: {e}") from e + raise AssertionError(f"❌ Failed to call model: {e}") from e test_case.response_assertion(response) + print("βœ… LLM service responded successfully") return service_url return wait_for(assert_model_responds, timeout=timeout_seconds, interval=10.0) -def get_llm_service_url(kserve_client: KServeClient, llm_isvc: 
V1alpha1LLMInferenceService): +def get_llm_service_url( + kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceService +): service_name = llm_isvc.metadata.name - + try: - llm_isvc = get_llmisvc(kserve_client, llm_isvc.metadata.name, llm_isvc.metadata.namespace, llm_isvc.api_version.split("/")[1]) + llm_isvc = get_llmisvc( + kserve_client, + llm_isvc.metadata.name, + llm_isvc.metadata.namespace, + llm_isvc.api_version.split("/")[1], + ) if "status" not in llm_isvc: - raise ValueError(f"No status found in LLM inference service {service_name} status: {llm_isvc}") + raise ValueError( + f"❌ No status found in LLM inference service {service_name} status: {llm_isvc}" + ) status = llm_isvc["status"] if "url" in status and status["url"]: + print(f"βœ… Found service URL: {status['url']}") return status["url"] if ( @@ -209,17 +239,22 @@ def get_llm_service_url(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInfere ): first_address = status["addresses"][0] if "url" in first_address: + print(f"βœ… Found service URL in addresses: {first_address['url']}") return first_address["url"] - raise ValueError(f"No URL found in LLM inference service {service_name} status") + raise ValueError( + f"❌ No URL found in LLM inference service {service_name} status" + ) except Exception as e: raise ValueError( - f"Failed to get URL for LLM inference service {service_name}: {e}" + f"❌ Failed to get URL for LLM inference service {service_name}: {e}" ) from e -def wait_for(assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1) -> Any: +def wait_for( + assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1 +) -> Any: """Wait for assertion function to succeed within timeout.""" deadline = time.time() + timeout while True: @@ -230,45 +265,46 @@ def wait_for(assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: fl raise time.sleep(interval) -def collect_diagnostics(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceService): - try: 
- - service_name = llm_isvc.metadata.name - namespace = llm_isvc.metadata.namespace - print(f"\n{'='*60}") - print(f"DIAGNOSTIC INFORMATION FOR {service_name} in {namespace}") - print(f"{'='*60}") - - print("\n--- LLM Inference Service ---") - try: - llm_isvc = get_llmisvc(kserve_client, service_name, namespace) - print(json.dumps(llm_isvc, indent=2, default=str)) - except Exception as e: - print(f"Failed to get LLM inference service: {e}") - print("\n--- Events ---") - try: - core_v1 = client.CoreV1Api() - events = core_v1.list_namespaced_event( - namespace=namespace, - field_selector=f"involvedObject.name={service_name}", - ) - if events.items: - sorted_events = sorted( - events.items, - key=lambda x: x.last_timestamp or x.first_timestamp, - reverse=True, - ) - for event in sorted_events[:5]: - timestamp = event.last_timestamp or event.first_timestamp - print(f" {event.type}: {event.reason} - {event.message}") - print(f" Time: {timestamp}") - else: - print(" No events found") - except Exception as e: - print(f"Failed to list events: {e}") +def collect_diagnostics( + kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceService +): + print("πŸ” # Collecting diagnostics...") + name = llm_isvc.metadata.name + ns = llm_isvc.metadata.namespace + + svc = get_llmisvc(kserve_client, name, ns) - print(f"\n{'='*60}") + labels = { + "app.kubernetes.io/part-of": "llminferenceservice", + "app.kubernetes.io/name": svc["metadata"].get("name"), + } + print(f"πŸ” # Diagnostics for {name!r} in {ns!r}") + print("---") + print(f"# LLMInferenceService {name}") + try: + print(yaml.safe_dump(svc, sort_keys=False)) except Exception as e: - print(f"Failed to collect diagnostics: {e}") + print(f"# ❌ failed to dump LLMInferenceService: {e}") + + print_all_events_table(ns) + + all_resources = kinds_matching_by_labels( + ns, + labels, + api_kinds={ + "HTTPRoute", + "InferencePool", + "InferenceModel", + "Service", + "Deployment", + "LeaderWorkerSet", + "DestinationRule", + 
"StatefulSet", + }, + ) + + for obj in all_resources: + print("---") + print(yaml.safe_dump(obj.to_dict(), sort_keys=False)) From 405cad40cffeebe4673b49e8c89d6fb809ac1a3d Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Sat, 26 Jul 2025 00:26:26 +0200 Subject: [PATCH 17/38] chore: bumps python to 3.12 for e2e job Signed-off-by: Bartosz Majsak --- .github/workflows/e2e-test-llmisvc.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/e2e-test-llmisvc.yml b/.github/workflows/e2e-test-llmisvc.yml index d7a6c43fc2e..7b08f7fa6e7 100644 --- a/.github/workflows/e2e-test-llmisvc.yml +++ b/.github/workflows/e2e-test-llmisvc.yml @@ -70,7 +70,7 @@ jobs: - name: Setup Python uses: actions/setup-python@v5 with: - python-version: "3.9" + python-version: "3.12" - name: Setup Minikube uses: ./.github/actions/minikube-setup From 9587f189ab3c30673791dbe4e59a2d5dce4d3cc4 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Sat, 26 Jul 2025 00:49:15 +0200 Subject: [PATCH 18/38] fix: mismatched class name in example comment Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/test_llm_inference_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index 67d21e92b6d..94d3bf19a68 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -66,7 +66,7 @@ class TestCase: ), # Example test case # pytest.param( - # Case( + # TestCase( # base_refs=["router-managed", "workload-single-cpu", "model-fb-opt-125m"], # prompt="What is the capital of France?", # response_assertion=lambda response: ( From dfade037b562eef152f3ddcecf2de15061121543 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Sat, 26 Jul 2025 00:52:04 +0200 Subject: [PATCH 19/38] fix: clarifies workload preset Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/test/e2e/llmisvc/README.md b/test/e2e/llmisvc/README.md index c08b3c5fcef..b605afe4eed 100644 --- a/test/e2e/llmisvc/README.md +++ b/test/e2e/llmisvc/README.md @@ -44,7 +44,7 @@ pytest -m "llminferenceservice and (cluster_cpu or cluster_amd)" test/e2e/llmisv Use prefixed categories that get composed together: -- **`workload-*`**: Container specs and resources (e.g., `workload-single-cpu`, `workload-multi-node-gpu`) +- **`workload-*`**: workload topology, container specs and resource specs (e.g., `workload-single-cpu`, `workload-multi-node-gpu`) - **`model-*`**: Model sources (e.g., `model-fb-opt-125m`, `model-gpt2`) - **`router-*`**: Routing configs (e.g., `router-managed`, `router-with-scheduler`) From 176da0354a91203bf6a11e3ab7b0bf250f89e256 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Sat, 26 Jul 2025 01:00:14 +0200 Subject: [PATCH 20/38] chore: removes noise aka poor man debugging tool Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/diagnostic.py | 1 - test/e2e/llmisvc/fixtures.py | 1 - test/e2e/llmisvc/test_llm_inference_service.py | 4 ---- test/scripts/gh-actions/run-e2e-tests.sh | 1 - 4 files changed, 7 deletions(-) diff --git a/test/e2e/llmisvc/diagnostic.py b/test/e2e/llmisvc/diagnostic.py index 47fa19392c2..509b68548e3 100644 --- a/test/e2e/llmisvc/diagnostic.py +++ b/test/e2e/llmisvc/diagnostic.py @@ -38,7 +38,6 @@ def print_all_events_table(namespace: str, max_events: int = 50): evs, key=lambda e: e.last_timestamp or e.first_timestamp, reverse=True )[:max_events] - # print header header = f"{'TIME':<25} {'NAMESPACE':<12} {'SOURCE':<20} {'TYPE':<8} {'REASON':<20} MESSAGE" print(header) print("-" * len(header)) diff --git a/test/e2e/llmisvc/fixtures.py b/test/e2e/llmisvc/fixtures.py index 09892cdbb62..be4dcdee731 100644 --- a/test/e2e/llmisvc/fixtures.py +++ b/test/e2e/llmisvc/fixtures.py @@ -130,7 +130,6 @@ def _create_configs(namespace=KSERVE_TEST_NAMESPACE): yield _create_configs() - # teardown: best‑effort cleanup for name, 
namespace in created: try: delete_llmisvc_config(client, name, namespace) diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index 94d3bf19a68..6f91f1604d4 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -93,7 +93,6 @@ def test_llm_inference_service(test_case: TestCase): try: create_llmisvc(kserve_client, test_case.llm_service) wait_for_model_response(kserve_client, test_case, test_case.wait_timeout) - print(f"πŸŽ‰ Test completed successfully for service {service_name}") except Exception as e: print(f"❌ ERROR: Failed to call llm inference service {service_name}: {e}") collect_diagnostics(kserve_client, test_case.llm_service) @@ -229,7 +228,6 @@ def get_llm_service_url( status = llm_isvc["status"] if "url" in status and status["url"]: - print(f"βœ… Found service URL: {status['url']}") return status["url"] if ( @@ -239,7 +237,6 @@ def get_llm_service_url( ): first_address = status["addresses"][0] if "url" in first_address: - print(f"βœ… Found service URL in addresses: {first_address['url']}") return first_address["url"] raise ValueError( @@ -269,7 +266,6 @@ def wait_for( def collect_diagnostics( kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceService ): - print("πŸ” # Collecting diagnostics...") name = llm_isvc.metadata.name ns = llm_isvc.metadata.namespace diff --git a/test/scripts/gh-actions/run-e2e-tests.sh b/test/scripts/gh-actions/run-e2e-tests.sh index ee8794c8d23..98d6af06f6e 100755 --- a/test/scripts/gh-actions/run-e2e-tests.sh +++ b/test/scripts/gh-actions/run-e2e-tests.sh @@ -35,7 +35,6 @@ pushd test/e2e >/dev/null echo "Skipping explainer tests for raw deployment with ingress" pytest -m "$MARKER" --ignore=qpext --log-cli-level=INFO -n $PARALLELISM --dist worksteal --network-layer $NETWORK_LAYER --ignore=explainer/ else - echo "pytest -m '$MARKER' --ignore=qpext --log-cli-level=INFO -n $PARALLELISM --dist worksteal 
--network-layer $NETWORK_LAYER" pytest -m "$MARKER" --ignore=qpext --log-cli-level=INFO -n $PARALLELISM --dist worksteal --network-layer $NETWORK_LAYER fi popd From 8210c4533a84f0eddd927a0520850929c4ef84dc Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Mon, 28 Jul 2025 10:36:43 +0200 Subject: [PATCH 21/38] fix: imports inference service config factory Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/test_llm_inference_service.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index 6f91f1604d4..7c453bc3486 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -27,7 +27,9 @@ ) from .fixtures import ( generate_test_id, + # Factory functions are not called explicitly, but they need to be imported to work test_case, # noqa: F401,F811 + llm_config_factory, # noqa: F401,F811 ) from .logging import log_execution From e75b15eb2899957f73676d0d11d889081e32ea30 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Mon, 28 Jul 2025 10:58:38 +0200 Subject: [PATCH 22/38] chore: bumps resource limits Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/fixtures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/e2e/llmisvc/fixtures.py b/test/e2e/llmisvc/fixtures.py index be4dcdee731..37db7fb1889 100644 --- a/test/e2e/llmisvc/fixtures.py +++ b/test/e2e/llmisvc/fixtures.py @@ -32,8 +32,8 @@ "image": "quay.io/pierdipi/vllm-cpu:latest", "env": [{"name": "VLLM_LOGGING_LEVEL", "value": "DEBUG"}], "resources": { - "limits": {"cpu": "1", "memory": "10Gi"}, - "requests": {"cpu": "100m", "memory": "8Gi"}, + "limits": {"cpu": "2", "memory": "10Gi"}, + "requests": {"cpu": "1", "memory": "8Gi"}, }, "livenessProbe": { "initialDelaySeconds": 30, From 6be1e7cb7bf76fb49a2d233c3eae324ee631ecef Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Mon, 28 Jul 2025 10:59:56 +0200 Subject: [PATCH 23/38] chore: 
cleanup Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/fixtures.py | 10 ---------- test/e2e/llmisvc/test_llm_inference_service.py | 4 ++-- 2 files changed, 2 insertions(+), 12 deletions(-) diff --git a/test/e2e/llmisvc/fixtures.py b/test/e2e/llmisvc/fixtures.py index 37db7fb1889..f6de3793ea4 100644 --- a/test/e2e/llmisvc/fixtures.py +++ b/test/e2e/llmisvc/fixtures.py @@ -51,16 +51,8 @@ "router-managed": { "router": {"scheduler": {}, "route": {}, "gateway": {}}, }, - "router-with-scheduler": { - "router": { - "scheduler": {}, - "route": {}, - "gateway": {}, - }, - }, } - @pytest.fixture(scope="function") def test_case(request): tc = request.param @@ -92,8 +84,6 @@ def llm_config_factory(): def _create_configs(namespace=KSERVE_TEST_NAMESPACE): for name in LLMINFERENCESERVICE_CONFIGS: - if name not in LLMINFERENCESERVICE_CONFIGS: - raise ValueError(f"Unknown config name: {name}") spec = LLMINFERENCESERVICE_CONFIGS[name] diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index 7c453bc3486..39d314435bd 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -191,7 +191,7 @@ def assert_model_responds(): "prompt": test_case.prompt, "max_tokens": test_case.max_tokens, } - print(f"πŸ“ž Calling LLM service at {completion_url} with payload {test_payload}") + print(f"Calling LLM service at {completion_url} with payload {test_payload}") try: response = requests.post( completion_url, @@ -254,7 +254,7 @@ def get_llm_service_url( def wait_for( assertion_fn: Callable[[], Any], timeout: float = 5.0, interval: float = 0.1 ) -> Any: - """Wait for assertion function to succeed within timeout.""" + """Wait for the assertion to succeed within timeout.""" deadline = time.time() + timeout while True: try: From e7b015cd18580f93515fd8620328fb022a3d1a0a Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Mon, 28 Jul 2025 11:00:31 +0200 Subject: [PATCH 24/38] feat: adds 
simple p/d deployment Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/fixtures.py | 45 +++++++++++++++++++ .../e2e/llmisvc/test_llm_inference_service.py | 34 +++++++------- test/e2e/pytest.ini | 3 +- 3 files changed, 66 insertions(+), 16 deletions(-) diff --git a/test/e2e/llmisvc/fixtures.py b/test/e2e/llmisvc/fixtures.py index f6de3793ea4..555d24853ca 100644 --- a/test/e2e/llmisvc/fixtures.py +++ b/test/e2e/llmisvc/fixtures.py @@ -45,6 +45,51 @@ ] }, }, + "workload-pd-cpu": { + "model": { + "uri": "hf://facebook/opt-125m", + "name": "facebook/opt-125m" + }, + "router": { + "scheduler": {}, + "route": {}, + "gateway": {} + }, + "template": { + "initContainers": [ + { + "name": "llm-d-routing-sidecar", + "image": "ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0" + } + ], + "containers": [ + { + "name": "main", + "image": "quay.io/pierdipi/vllm-cpu:latest", + "env": [{"name": "VLLM_LOGGING_LEVEL", "value": "DEBUG"}], + "resources": { + "limits": {"cpu": "2", "memory": "10Gi"}, + "requests": {"cpu": "1", "memory": "8Gi"}, + } + } + ] + }, + "prefill": { + "template": { + "containers": [ + { + "name": "main", + "image": "quay.io/pierdipi/vllm-cpu:latest", + "env": [{"name": "VLLM_LOGGING_LEVEL", "value": "DEBUG"}], + "resources": { + "limits": {"cpu": "2", "memory": "10Gi"}, + "requests": {"cpu": "1", "memory": "8Gi"}, + } + } + ] + } + } + }, "model-fb-opt-125m": { "model": {"uri": "hf://facebook/opt-125m", "name": "facebook/opt-125m"}, }, diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index 39d314435bd..ea4a44d548f 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -63,22 +63,26 @@ class TestCase: "test_case", [ pytest.param( - TestCase(["router-managed", "workload-single-cpu", "model-fb-opt-125m"]), - marks=pytest.mark.cluster_cpu, + TestCase( + base_refs=["router-managed", "workload-single-cpu", "model-fb-opt-125m"], + prompt = "KServe is a", 
+ ), + marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], + ), + pytest.param( + TestCase( + base_refs=["router-managed", "workload-pd-cpu", "model-fb-opt-125m"], + prompt="You are an expert in Kubernetes-native machine learning serving platforms, with deep knowledge of the KServe project. " + "Explain the challenges of serving large-scale models, GPU scheduling, and how KServe integrates with capabilities like multi-model serving. " + "Provide a detailed comparison with open source alternatives, focusing on operational trade-offs.", + response_assertion=lambda response: ( + response.status_code == 200 + and response.json().get("choices") is not None + and len(response.json().get("choices", [])) > 0 + ), + ), + marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), - # Example test case - # pytest.param( - # TestCase( - # base_refs=["router-managed", "workload-single-cpu", "model-fb-opt-125m"], - # prompt="What is the capital of France?", - # response_assertion=lambda response: ( - # response.status_code == 200 - # and response.json().get("choices") is not None - # and len(response.json().get("choices", [])) > 0 - # ), - # ), - # marks=pytest.mark.cluster_cpu, - # ), ], indirect=["test_case"], ids=generate_test_id, diff --git a/test/e2e/pytest.ini b/test/e2e/pytest.ini index 92c33eb8a26..32b2ab1b408 100644 --- a/test/e2e/pytest.ini +++ b/test/e2e/pytest.ini @@ -22,4 +22,5 @@ markers = cluster_cpu: test targeting cluster with CPU cluster_amd: test targeting cluster with AMD cluster_intel: test targeting cluster with Intel - cluster_nvidia: test targeting cluster with NVIDIA \ No newline at end of file + cluster_nvidia: test targeting cluster with NVIDIA + cluster_single_node: test targeting single node cluster \ No newline at end of file From a4ca343a662d6da15b4f0a3835ec9f7eb0d1f31f Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Mon, 28 Jul 2025 11:01:08 +0200 Subject: [PATCH 25/38] feat: makes response timeout configurable with 60s 
default Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/test_llm_inference_service.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index ea4a44d548f..ecf2b0583ba 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -48,10 +48,11 @@ class TestCase: __test__ = False # So pytest will not try to execute it. """Test case configuration for LLM inference service tests.""" base_refs: List[str] - prompt: str = "KServe is a" + prompt: str max_tokens: int = 1 response_assertion: Callable[[requests.Response], None] = assert_200 wait_timeout: int = 300 + response_timeout: int = 60 # Factory provided llm_service: V1alpha1LLMInferenceService = None # Generated by llm_service_factory model_name: str = "default/model" # This will be generated by the factory @@ -201,7 +202,7 @@ def assert_model_responds(): completion_url, headers={"Content-Type": "application/json"}, json=test_payload, - timeout=30, # TODO Make it configurable + timeout=test_case.response_timeout, ) except Exception as e: raise AssertionError(f"❌ Failed to call model: {e}") from e From 19279316b3e709c4c34abc9f5d23714d6641be47 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Mon, 28 Jul 2025 11:19:08 +0200 Subject: [PATCH 26/38] midstream: disable gh-action kustomize manifests are significantly different making it impossible to re-use upstream e2e github action without major restructuring Signed-off-by: Bartosz Majsak --- .github/workflows/e2e-test-llmisvc.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/e2e-test-llmisvc.yml b/.github/workflows/e2e-test-llmisvc.yml index 7b08f7fa6e7..6943f568e9a 100644 --- a/.github/workflows/e2e-test-llmisvc.yml +++ b/.github/workflows/e2e-test-llmisvc.yml @@ -53,6 +53,7 @@ concurrency: jobs: test-llmisvc: + if: false runs-on: ubuntu-22.04 needs: [ kserve-image-build ] 
steps: @@ -106,6 +107,7 @@ jobs: ./test/scripts/gh-actions/status-check.sh kserve-image-build: + if: false runs-on: ubuntu-latest steps: - name: Checkout source From 0623b620a228af8fcc982d7ad1b2c32515512182 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Mon, 28 Jul 2025 13:25:45 +0200 Subject: [PATCH 27/38] chore: uses name variable Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/test_llm_inference_service.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index ecf2b0583ba..7a4949ac882 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -280,7 +280,7 @@ def collect_diagnostics( labels = { "app.kubernetes.io/part-of": "llminferenceservice", - "app.kubernetes.io/name": svc["metadata"].get("name"), + "app.kubernetes.io/name": name, } print(f"πŸ” # Diagnostics for {name!r} in {ns!r}") From 8d62551e7cabe689b29cdbb4b670ff5921350e69 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Mon, 28 Jul 2025 13:29:21 +0200 Subject: [PATCH 28/38] chore: removes redundant preset Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/fixtures.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/test/e2e/llmisvc/fixtures.py b/test/e2e/llmisvc/fixtures.py index 555d24853ca..e38bcd89434 100644 --- a/test/e2e/llmisvc/fixtures.py +++ b/test/e2e/llmisvc/fixtures.py @@ -46,15 +46,6 @@ }, }, "workload-pd-cpu": { - "model": { - "uri": "hf://facebook/opt-125m", - "name": "facebook/opt-125m" - }, - "router": { - "scheduler": {}, - "route": {}, - "gateway": {} - }, "template": { "initContainers": [ { From 48d6b0b9bb0ce252bebca9bead301b6dcb90c059 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Mon, 28 Jul 2025 14:01:35 +0200 Subject: [PATCH 29/38] fix: minor precommit linter findings Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/fixtures.py | 1 + test/e2e/llmisvc/test_llm_inference_service.py | 8 
++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/test/e2e/llmisvc/fixtures.py b/test/e2e/llmisvc/fixtures.py index e38bcd89434..13ce5090f52 100644 --- a/test/e2e/llmisvc/fixtures.py +++ b/test/e2e/llmisvc/fixtures.py @@ -89,6 +89,7 @@ }, } + @pytest.fixture(scope="function") def test_case(request): tc = request.param diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index 7a4949ac882..57819705e2f 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -29,7 +29,7 @@ generate_test_id, # Factory functions are not called explicitly, but they need to be imported to work test_case, # noqa: F401,F811 - llm_config_factory, # noqa: F401,F811 + llm_config_factory, # noqa: F401,F811 ) from .logging import log_execution @@ -66,7 +66,7 @@ class TestCase: pytest.param( TestCase( base_refs=["router-managed", "workload-single-cpu", "model-fb-opt-125m"], - prompt = "KServe is a", + prompt="KServe is a", ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), @@ -80,7 +80,7 @@ class TestCase: response.status_code == 200 and response.json().get("choices") is not None and len(response.json().get("choices", [])) > 0 - ), + ), ), marks=[pytest.mark.cluster_cpu, pytest.mark.cluster_single_node], ), @@ -202,7 +202,7 @@ def assert_model_responds(): completion_url, headers={"Content-Type": "application/json"}, json=test_payload, - timeout=test_case.response_timeout, + timeout=test_case.response_timeout, ) except Exception as e: raise AssertionError(f"❌ Failed to call model: {e}") from e From b56022a4bbc4baff8d03c733497b0ec0f7718f41 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Mon, 28 Jul 2025 16:08:09 +0200 Subject: [PATCH 30/38] chore: removes leftover empty file Signed-off-by: Bartosz Majsak --- test/__init__.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 test/__init__.py diff --git 
a/test/__init__.py b/test/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 From a878d7556f32dc87214560f43443d9779d30a6a6 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Mon, 28 Jul 2025 20:04:33 +0200 Subject: [PATCH 31/38] chore: reworks related resources dump to exclude certain kinds with default exclusion for secrets Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/diagnostic.py | 11 ++++++----- test/e2e/llmisvc/test_llm_inference_service.py | 18 ++---------------- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/test/e2e/llmisvc/diagnostic.py b/test/e2e/llmisvc/diagnostic.py index 509b68548e3..bd6984e2a1b 100644 --- a/test/e2e/llmisvc/diagnostic.py +++ b/test/e2e/llmisvc/diagnostic.py @@ -60,14 +60,14 @@ def print_all_events_table(namespace: str, max_events: int = 50): print(f"# ❌ failed to list events: {e}") -def kinds_matching_by_labels(namespace: str, labels, api_kinds): +def kinds_matching_by_labels(namespace: str, labels, skip_api_kinds={"Secret"}): """ List all namespaced objects in `namespace` matching `labels` - whose kind is in `api_kinds`. + whose kinds are not in `skip_api_kinds`. 
:param namespace: kube namespace to search :param labels: either a dict of {k: v} or a raw selector string - :param api_kinds: an iterable of Resource.kind strings to include + :param skip_api_kinds: an iterable of Resource.kind strings to exclude :return: list of Unstructured objects """ config.load_kube_config() @@ -85,12 +85,13 @@ def kinds_matching_by_labels(namespace: str, labels, api_kinds): for rsrc in all_resources: if not rsrc.namespaced or "list" not in rsrc.verbs: continue - if rsrc.kind not in api_kinds: + if rsrc.kind in skip_api_kinds: continue try: resp = rsrc.get(namespace=namespace, label_selector=selector) - except ApiException: + except Exception as e: + print(f"failed to get {rsrc.kind}, skipping: {e}") continue items = getattr(resp, "items", []) diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index 57819705e2f..94e0a094fba 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -49,7 +49,7 @@ class TestCase: """Test case configuration for LLM inference service tests.""" base_refs: List[str] prompt: str - max_tokens: int = 1 + max_tokens: int = None response_assertion: Callable[[requests.Response], None] = assert_200 wait_timeout: int = 300 response_timeout: int = 60 @@ -293,21 +293,7 @@ def collect_diagnostics( print_all_events_table(ns) - all_resources = kinds_matching_by_labels( - ns, - labels, - api_kinds={ - "HTTPRoute", - "InferencePool", - "InferenceModel", - "Service", - "Deployment", - "LeaderWorkerSet", - "DestinationRule", - "StatefulSet", - }, - ) - + all_resources = kinds_matching_by_labels(ns, labels) for obj in all_resources: print("---") print(yaml.safe_dump(obj.to_dict(), sort_keys=False)) From 492d47d30128d63ff7324ebaaae7eb0540c90f5a Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Tue, 29 Jul 2025 11:02:03 +0200 Subject: [PATCH 32/38] chore: minor fixes in README Signed-off-by: Bartosz Majsak --- 
test/e2e/llmisvc/README.md | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/test/e2e/llmisvc/README.md b/test/e2e/llmisvc/README.md index b605afe4eed..d9f4a56f2bd 100644 --- a/test/e2e/llmisvc/README.md +++ b/test/e2e/llmisvc/README.md @@ -40,22 +40,10 @@ pytest -m "llminferenceservice and (cluster_amd or cluster_nvidia or cluster_int pytest -m "llminferenceservice and (cluster_cpu or cluster_amd)" test/e2e/llmisvc/ ``` -## Config Naming Convention - -Use prefixed categories that get composed together: - -- **`workload-*`**: workload topology, container specs and resource specs (e.g., `workload-single-cpu`, `workload-multi-node-gpu`) -- **`model-*`**: Model sources (e.g., `model-fb-opt-125m`, `model-gpt2`) -- **`router-*`**: Routing configs (e.g., `router-managed`, `router-with-scheduler`) - -Test IDs are generated by combining the cluster capability from pytest marks with all config names: -- Test ID format: `{cluster_capability}-{config1}-{config2}-{config3}` -- Example: `cluster_cpu-router-managed-workload-single-cpu-model-fb-opt-125m` - ## Adding New Configs 1. Add to `LLMINFERENCESERVICE_CONFIGS` in `fixtures.py` -2. Follow `category-descriptor` naming (prefix automatically stripped from test IDs) +2. Follow `category-descriptor` naming (described in the subsequent section) 3. 
Add new cluster capability test cases using `pytest.param` with appropriate marks: ```python pytest.param( @@ -80,3 +68,16 @@ Test IDs are generated by combining the cluster capability from pytest marks wit marks=pytest.mark.cluster_cpu, ), ``` + +## Config Naming Convention + +Use prefixed categories that get composed together: + +- **`workload-*`**: workload topology, container specs and resource specs (e.g., `workload-single-cpu`, `workload-multi-node-gpu`) +- **`model-*`**: Model sources (e.g., `model-fb-opt-125m`, `model-gpt2`) +- **`router-*`**: Routing configs (e.g., `router-managed`, `router-with-scheduler`) + +Test IDs are generated by combining the cluster capability from pytest marks with all config names: +- Test ID format: `{cluster_capability}-{config1}-{config2}-{config3}` +- Example: `cluster_cpu-router-managed-workload-single-cpu-model-fb-opt-125m` + From 01e714a4f00ef7951b4765e0df4f5a49d32a37d7 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Tue, 29 Jul 2025 11:04:27 +0200 Subject: [PATCH 33/38] fix: filters out *List resources Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/diagnostic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/llmisvc/diagnostic.py b/test/e2e/llmisvc/diagnostic.py index bd6984e2a1b..11d255bc50d 100644 --- a/test/e2e/llmisvc/diagnostic.py +++ b/test/e2e/llmisvc/diagnostic.py @@ -85,7 +85,7 @@ def kinds_matching_by_labels(namespace: str, labels, skip_api_kinds={"Secret"}): for rsrc in all_resources: if not rsrc.namespaced or "list" not in rsrc.verbs: continue - if rsrc.kind in skip_api_kinds: + if rsrc.kind.endswith("List") or rsrc.kind in skip_api_kinds: continue try: From 6392bcd4c587a0e863e4494a999a8fc7838af6ab Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Tue, 29 Jul 2025 11:06:26 +0200 Subject: [PATCH 34/38] chore: removes redundant init-container Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/fixtures.py | 6 ------ 1 file changed, 6 deletions(-) diff --git 
a/test/e2e/llmisvc/fixtures.py b/test/e2e/llmisvc/fixtures.py index 13ce5090f52..93c2a056dd1 100644 --- a/test/e2e/llmisvc/fixtures.py +++ b/test/e2e/llmisvc/fixtures.py @@ -47,12 +47,6 @@ }, "workload-pd-cpu": { "template": { - "initContainers": [ - { - "name": "llm-d-routing-sidecar", - "image": "ghcr.io/llm-d/llm-d-routing-sidecar:v0.2.0" - } - ], "containers": [ { "name": "main", From 66552698fde51a37fb94f565676ba00599b0ad10 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Tue, 29 Jul 2025 11:29:24 +0200 Subject: [PATCH 35/38] fix: makes test params xdist-friendly Previous design of test fixtures did not work for xdist when multiple worker nodes were executing the same parametrized test due to a limitation of session-scope fixtures in such a setup: https://github.com/pytest-dev/pytest-xdist/issues/271 This was leading to race conditions and flaky tests when shared LLMInferenceServiceConfigs were removed prematurely, leading to failures of subsequent tests. This approach creates per-test clones of base_refs, making them localized and avoids introducing weird workarounds that the author spent too much time on. Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/fixtures.py | 174 ++++++++++-------- .../e2e/llmisvc/test_llm_inference_service.py | 4 +- 2 files changed, 100 insertions(+), 78 deletions(-) diff --git a/test/e2e/llmisvc/fixtures.py b/test/e2e/llmisvc/fixtures.py index 93c2a056dd1..d679ebf3bcf 100644 --- a/test/e2e/llmisvc/fixtures.py +++ b/test/e2e/llmisvc/fixtures.py @@ -12,14 +12,16 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-import uuid import os import pytest +import hashlib from typing import List from kubernetes import client from kubernetes.client.rest import ApiException from kserve import KServeClient, constants, V1alpha1LLMInferenceService +from .logging import logger + KSERVE_PLURAL_LLMINFERENCESERVICECONFIG = "llminferenceserviceconfigs" KSERVE_TEST_NAMESPACE = "kserve-ci-e2e-test" @@ -87,79 +89,61 @@ @pytest.fixture(scope="function") def test_case(request): tc = request.param + created_configs = [] + kserve_client = KServeClient(config_file=os.environ.get("KUBECONFIG", "~/.kube/config")) - service_name = generate_service_name(request.node.name, tc.base_refs) - tc.model_name = get_model_name_from_configs(tc.base_refs) - - # TODO fail early if base_refs does not exist (e.g. mistyped)? - - tc.llm_service = V1alpha1LLMInferenceService( - api_version="serving.kserve.io/v1alpha1", - kind="LLMInferenceService", - metadata=client.V1ObjectMeta( - name=service_name, namespace=KSERVE_TEST_NAMESPACE - ), - spec={ - "baseRefs": [{"name": base_ref} for base_ref in tc.base_refs], - }, - ) - - return tc - + try: + # Validate base_refs defined in the test fixture exist in LLMINFERENCESERVICE_CONFIGS + missing_refs = [ref for ref in tc.base_refs if ref not in LLMINFERENCESERVICE_CONFIGS] + if missing_refs: + raise ValueError(f"Missing base_refs in LLMINFERENCESERVICE_CONFIGS: {missing_refs}") -@pytest.fixture(scope="session", autouse=True) -def llm_config_factory(): - """Factory for creating/cleaning LLMInferenceServiceConfig once per session.""" - created = [] - client = KServeClient(config_file=os.environ.get("KUBECONFIG", "~/.kube/config")) + service_name = generate_service_name(request.node.name, tc.base_refs) + tc.model_name = get_model_name_from_configs(tc.base_refs) - def _create_configs(namespace=KSERVE_TEST_NAMESPACE): - for name in LLMINFERENCESERVICE_CONFIGS: + # Create unique configs for this test + unique_base_refs = [] + for base_ref in tc.base_refs: + unique_config_name = 
generate_k8s_safe_suffix(base_ref, [service_name]) + unique_base_refs.append(unique_config_name) - spec = LLMINFERENCESERVICE_CONFIGS[name] + original_spec = LLMINFERENCESERVICE_CONFIGS[base_ref] - try: - get_llmisvc_config(client, name, namespace) - continue - except Exception as e: - is_404_api = ( - isinstance(e, ApiException) and getattr(e, "status", None) == 404 - ) - is_404_runtime = ( - isinstance(e, RuntimeError) and "not found" in str(e).lower() - ) - if not (is_404_api or is_404_runtime): - raise - - body = { + unique_config_body = { "apiVersion": "serving.kserve.io/v1alpha1", "kind": "LLMInferenceServiceConfig", - "metadata": {"name": name, "namespace": namespace}, - "spec": spec, + "metadata": {"name": unique_config_name, "namespace": KSERVE_TEST_NAMESPACE}, + "spec": original_spec, } - try: - create_llmisvc_config(client, body, namespace) - created.append((name, namespace)) - except Exception as e: - if isinstance(e, ApiException) and getattr(e, "status", None) == 409: - continue - if isinstance(e, RuntimeError) and "already exists" in str(e).lower(): - continue - # otherwise, real error - raise + create_or_update_llmisvc_config(kserve_client, unique_config_body, KSERVE_TEST_NAMESPACE) + created_configs.append(unique_config_name) + + tc.llm_service = V1alpha1LLMInferenceService( + api_version="serving.kserve.io/v1alpha1", + kind="LLMInferenceService", + metadata=client.V1ObjectMeta( + name=service_name, namespace=KSERVE_TEST_NAMESPACE + ), + spec={ + "baseRefs": [{"name": base_ref} for base_ref in unique_base_refs], + }, + ) - yield _create_configs() + yield tc - for name, namespace in created: - try: - delete_llmisvc_config(client, name, namespace) - except Exception: - pass + finally: + for config_name in created_configs: + try: + logger.info(f"Cleaning up unique LLMInferenceServiceConfig {config_name}") + delete_llmisvc_config(kserve_client, config_name, KSERVE_TEST_NAMESPACE) + logger.info(f"βœ“ Deleted unique LLMInferenceServiceConfig 
{config_name}") + except Exception as e: + logger.warning(f"Failed to cleanup LLMInferenceServiceConfig {config_name}: {e}") def get_model_name_from_configs(config_names): - """Extract model name from model config.""" + """Extract the model name from model config.""" for config_name in config_names: if config_name.startswith("model-"): config = LLMINFERENCESERVICE_CONFIGS[config_name] @@ -168,21 +152,29 @@ def get_model_name_from_configs(config_names): return "default-model" -def generate_service_name(test_name: str, base_refs: List[str]) -> str: - base_name = test_name.split("[", 1)[0] - base_name = base_name.replace("test_", "") - base_name = base_name.replace("_", "-") - config_suffix = "-".join(sorted(base_refs)) - test_case = f"{base_name}-{config_suffix}".lower() +def generate_k8s_safe_suffix(base_name: str, extra_parts: List[str] = None) -> str: + """Generate a Kubernetes-safe name suffix with hash.""" + if extra_parts: + full_name = f"{base_name}-{'-'.join(sorted(extra_parts))}" + else: + full_name = base_name - uid = uuid.uuid4().hex[:8] + full_name = full_name.lower().replace("_", "-") + + name_hash = hashlib.md5(full_name.encode()).hexdigest()[:8] max_total = 63 sep = "-" - max_test_case = max_total - len(sep) - len(uid) - test_case = test_case[:max_test_case].rstrip(sep) + max_base = max_total - len(sep) - len(name_hash) + safe_base = full_name[:max_base].rstrip(sep) + + return f"{safe_base}{sep}{name_hash}" - return f"{test_case}{sep}{uid}" + +def generate_service_name(test_name: str, base_refs: List[str]) -> str: + base_name = test_name.split("[", 1)[0] + base_name = base_name.replace("test_", "") + return generate_k8s_safe_suffix(base_name, base_refs) def generate_test_id(test_case) -> str: @@ -190,32 +182,62 @@ def generate_test_id(test_case) -> str: return "-".join(test_case.base_refs) -def create_llmisvc_config(kserve_client, llm_config, namespace=None): +def create_or_update_llmisvc_config(kserve_client, llm_config, namespace=None): + """Create 
or update an LLMInferenceServiceConfig resource.""" version = llm_config["apiVersion"].split("/")[1] if namespace is None: namespace = llm_config.get("metadata", {}).get("namespace", "default") + name = llm_config.get("metadata", {}).get("name") + if not name: + raise ValueError("LLMInferenceServiceConfig must have a name in metadata") + + logger.info(f"Checking LLMInferenceServiceConfig {name} in namespace {namespace}") + try: - outputs = kserve_client.api_instance.create_namespaced_custom_object( + existing_config = kserve_client.api_instance.get_namespaced_custom_object( constants.KSERVE_GROUP, version, namespace, KSERVE_PLURAL_LLMINFERENCESERVICECONFIG, + name, + ) + + llm_config["metadata"] = existing_config["metadata"] + + outputs = kserve_client.api_instance.replace_namespaced_custom_object( + constants.KSERVE_GROUP, + version, + namespace, + KSERVE_PLURAL_LLMINFERENCESERVICECONFIG, + name, llm_config, ) + logger.info(f"βœ“ Successfully updated LLMInferenceServiceConfig {name}") return outputs + except client.rest.ApiException as e: - raise RuntimeError( - f"Exception when calling CustomObjectsApi->" - f"create_namespaced_custom_object for LLMInferenceServiceConfig: {e}" - ) from e + if e.status == 404: # Not found - create it + logger.info(f"Resource not found, creating LLMInferenceServiceConfig {name}") + outputs = kserve_client.api_instance.create_namespaced_custom_object( + constants.KSERVE_GROUP, + version, + namespace, + KSERVE_PLURAL_LLMINFERENCESERVICECONFIG, + llm_config, + ) + logger.info(f"βœ“ Successfully created LLMInferenceServiceConfig {name}") + return outputs + else: + raise RuntimeError(f"Failed to get/create LLMInferenceServiceConfig {name}: {e}") from e def delete_llmisvc_config( kserve_client, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION ): try: + print(f"Deleting LLMInferenceServiceConfig {name} in namespace {namespace}") return kserve_client.api_instance.delete_namespaced_custom_object( constants.KSERVE_GROUP, version, 
diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index 94e0a094fba..c09d5582520 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -15,6 +15,7 @@ import os import time from dataclasses import dataclass +from operator import truediv from typing import Any, Callable, List import pytest import requests @@ -29,7 +30,6 @@ generate_test_id, # Factory functions are not called explicitly, but they need to be imported to work test_case, # noqa: F401,F811 - llm_config_factory, # noqa: F401,F811 ) from .logging import log_execution @@ -49,7 +49,7 @@ class TestCase: """Test case configuration for LLM inference service tests.""" base_refs: List[str] prompt: str - max_tokens: int = None + max_tokens: int = 10 response_assertion: Callable[[requests.Response], None] = assert_200 wait_timeout: int = 300 response_timeout: int = 60 From cc45a4684929b9c225466945a98ff8cabdd7ee90 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Tue, 29 Jul 2025 11:40:13 +0200 Subject: [PATCH 36/38] chore: clean up Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/diagnostic.py | 12 +++++------- test/e2e/llmisvc/fixtures.py | 14 +++++++------- test/e2e/llmisvc/test_llm_inference_service.py | 5 ++--- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/test/e2e/llmisvc/diagnostic.py b/test/e2e/llmisvc/diagnostic.py index 11d255bc50d..59fc75dc595 100644 --- a/test/e2e/llmisvc/diagnostic.py +++ b/test/e2e/llmisvc/diagnostic.py @@ -14,10 +14,8 @@ import itertools from datetime import datetime -import pytest from kubernetes import client, config, dynamic from kubernetes.client import api_client -from kubernetes.client.exceptions import ApiException from kserve import KServeClient, V1alpha1LLMInferenceService, constants @@ -28,21 +26,21 @@ def print_all_events_table(namespace: str, max_events: int = 50): core = client.CoreV1Api() try: - evs = 
core.list_namespaced_event(namespace=namespace).items + events = core.list_namespaced_event(namespace=namespace).items - if not evs: + if not events: print("ℹ️ # No events found in namespace", namespace) return - evs = sorted( - evs, key=lambda e: e.last_timestamp or e.first_timestamp, reverse=True + events = sorted( + events, key=lambda e: e.last_timestamp or e.first_timestamp, reverse=True )[:max_events] header = f"{'TIME':<25} {'NAMESPACE':<12} {'SOURCE':<20} {'TYPE':<8} {'REASON':<20} MESSAGE" print(header) print("-" * len(header)) - for ev in evs: + for ev in events: ts = ev.last_timestamp or ev.first_timestamp ts_str = ( ts.strftime("%Y-%m-%d %H:%M:%S") diff --git a/test/e2e/llmisvc/fixtures.py b/test/e2e/llmisvc/fixtures.py index d679ebf3bcf..72a66e9f3e1 100644 --- a/test/e2e/llmisvc/fixtures.py +++ b/test/e2e/llmisvc/fixtures.py @@ -99,7 +99,7 @@ def test_case(request): raise ValueError(f"Missing base_refs in LLMINFERENCESERVICE_CONFIGS: {missing_refs}") service_name = generate_service_name(request.node.name, tc.base_refs) - tc.model_name = get_model_name_from_configs(tc.base_refs) + tc.model_name = _get_model_name_from_configs(tc.base_refs) # Create unique configs for this test unique_base_refs = [] @@ -116,7 +116,7 @@ def test_case(request): "spec": original_spec, } - create_or_update_llmisvc_config(kserve_client, unique_config_body, KSERVE_TEST_NAMESPACE) + _create_or_update_llmisvc_config(kserve_client, unique_config_body, KSERVE_TEST_NAMESPACE) created_configs.append(unique_config_name) tc.llm_service = V1alpha1LLMInferenceService( @@ -136,13 +136,13 @@ def test_case(request): for config_name in created_configs: try: logger.info(f"Cleaning up unique LLMInferenceServiceConfig {config_name}") - delete_llmisvc_config(kserve_client, config_name, KSERVE_TEST_NAMESPACE) + _delete_llmisvc_config(kserve_client, config_name, KSERVE_TEST_NAMESPACE) logger.info(f"βœ“ Deleted unique LLMInferenceServiceConfig {config_name}") except Exception as e: 
logger.warning(f"Failed to cleanup LLMInferenceServiceConfig {config_name}: {e}") -def get_model_name_from_configs(config_names): +def _get_model_name_from_configs(config_names): """Extract the model name from model config.""" for config_name in config_names: if config_name.startswith("model-"): @@ -182,7 +182,7 @@ def generate_test_id(test_case) -> str: return "-".join(test_case.base_refs) -def create_or_update_llmisvc_config(kserve_client, llm_config, namespace=None): +def _create_or_update_llmisvc_config(kserve_client, llm_config, namespace=None): """Create or update an LLMInferenceServiceConfig resource.""" version = llm_config["apiVersion"].split("/")[1] @@ -233,7 +233,7 @@ def create_or_update_llmisvc_config(kserve_client, llm_config, namespace=None): raise RuntimeError(f"Failed to get/create LLMInferenceServiceConfig {name}: {e}") from e -def delete_llmisvc_config( +def _delete_llmisvc_config( kserve_client, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION ): try: @@ -252,7 +252,7 @@ def delete_llmisvc_config( ) from e -def get_llmisvc_config( +def _get_llmisvc_config( kserve_client, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION ): try: diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index c09d5582520..1f3bc0405d0 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -15,7 +15,6 @@ import os import time from dataclasses import dataclass -from operator import truediv from typing import Any, Callable, List import pytest import requests @@ -102,7 +101,7 @@ def test_llm_inference_service(test_case: TestCase): wait_for_model_response(kserve_client, test_case, test_case.wait_timeout) except Exception as e: print(f"❌ ERROR: Failed to call llm inference service {service_name}: {e}") - collect_diagnostics(kserve_client, test_case.llm_service) + _collect_diagnostics(kserve_client, test_case.llm_service) raise finally: try: 
@@ -270,7 +269,7 @@ def wait_for( time.sleep(interval) -def collect_diagnostics( +def _collect_diagnostics( kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceService ): name = llm_isvc.metadata.name From 1ba1e0cf09a31efa7a2efe2fafeb02e719a9af46 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Tue, 29 Jul 2025 12:14:29 +0200 Subject: [PATCH 37/38] review Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/diagnostic.py | 9 ++++++--- test/e2e/llmisvc/test_llm_inference_service.py | 2 -- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/test/e2e/llmisvc/diagnostic.py b/test/e2e/llmisvc/diagnostic.py index 59fc75dc595..8ced640c637 100644 --- a/test/e2e/llmisvc/diagnostic.py +++ b/test/e2e/llmisvc/diagnostic.py @@ -58,16 +58,19 @@ def print_all_events_table(namespace: str, max_events: int = 50): print(f"# ❌ failed to list events: {e}") -def kinds_matching_by_labels(namespace: str, labels, skip_api_kinds={"Secret"}): +def kinds_matching_by_labels(namespace: str, labels, skip_api_kinds=None): """ List all namespaced objects in `namespace` matching `labels` whose kinds are not in `skip_api_kinds`. 
- :param namespace: kube namespace to search + :param namespace: Namespace to search :param labels: either a dict of {k: v} or a raw selector string :param skip_api_kinds: an iterable of Resource.kind strings to exclude - :return: list of Unstructured objects + :return: a list of Unstructured objects """ + if skip_api_kinds is None: + skip_api_kinds = {"Secret"} + config.load_kube_config() dyn = dynamic.DynamicClient(api_client.ApiClient()) diff --git a/test/e2e/llmisvc/test_llm_inference_service.py b/test/e2e/llmisvc/test_llm_inference_service.py index 1f3bc0405d0..c8f6fcfa38d 100644 --- a/test/e2e/llmisvc/test_llm_inference_service.py +++ b/test/e2e/llmisvc/test_llm_inference_service.py @@ -112,8 +112,6 @@ def test_llm_inference_service(test_case: TestCase): @log_execution def create_llmisvc(kserve_client: KServeClient, llm_isvc: V1alpha1LLMInferenceService): - from kserve.utils import utils - try: outputs = kserve_client.api_instance.create_namespaced_custom_object( constants.KSERVE_GROUP, From 03b15b574c8a0fff597abcfecf4397bdec418a81 Mon Sep 17 00:00:00 2001 From: Bartosz Majsak Date: Tue, 29 Jul 2025 12:21:51 +0200 Subject: [PATCH 38/38] precommit fix Signed-off-by: Bartosz Majsak --- test/e2e/llmisvc/diagnostic.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/e2e/llmisvc/diagnostic.py b/test/e2e/llmisvc/diagnostic.py index 8ced640c637..a1458db91de 100644 --- a/test/e2e/llmisvc/diagnostic.py +++ b/test/e2e/llmisvc/diagnostic.py @@ -70,7 +70,7 @@ def kinds_matching_by_labels(namespace: str, labels, skip_api_kinds=None): """ if skip_api_kinds is None: skip_api_kinds = {"Secret"} - + config.load_kube_config() dyn = dynamic.DynamicClient(api_client.ApiClient())