Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions python/kserve/kserve/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,14 @@
from .models.v1alpha1_inference_router import V1alpha1InferenceRouter
from .models.v1alpha1_inference_step import V1alpha1InferenceStep
from .models.v1alpha1_inference_target import V1alpha1InferenceTarget
from .models.v1alpha1_llm_inference_service import V1alpha1LLMInferenceService
from .models.v1alpha1_llm_inference_service_config import (
V1alpha1LLMInferenceServiceConfig,
)
from .models.v1alpha1_llm_inference_service_config_list import (
V1alpha1LLMInferenceServiceConfigList,
)
from .models.v1alpha1_llm_inference_service_list import V1alpha1LLMInferenceServiceList
from .models.v1alpha1_model_spec import V1alpha1ModelSpec
from .models.v1alpha1_serving_runtime import V1alpha1ServingRuntime
from .models.v1alpha1_serving_runtime_list import V1alpha1ServingRuntimeList
Expand Down
260 changes: 260 additions & 0 deletions test/e2e/llmisvc/test_llm_inference_service.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,260 @@
# Copyright 2025 The KServe Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import pytest
import requests
import time
from kserve import KServeClient, constants, V1alpha1LLMInferenceService
from kubernetes import client

# TODO both KServe Gateway and DestinationRule have to exist,
# latter can be created in the test namespace. It should be automated.
KSERVE_TEST_NAMESPACE = (
"llm-test" # Aligned with DEV.md - expects to have DestinationRule created
)

KSERVE_PLURAL_LLMINFERENCESERVICE = "llminferenceservices"


@pytest.mark.llminferenceservice
@pytest.mark.asyncio(scope="session")
async def test_llm_inference_service_facebook_opt():
Comment on lines +31 to +33
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Remove unnecessary async marker or add async operations.

The test function is marked with @pytest.mark.asyncio but doesn't contain any await statements or async operations. This creates unnecessary overhead.

Apply this diff to remove the async marker:

 @pytest.mark.llminferenceservice
-@pytest.mark.asyncio(scope="session")
-async def test_llm_inference_service_facebook_opt():
+def test_llm_inference_service_facebook_opt():
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
@pytest.mark.llminferenceservice
@pytest.mark.asyncio(scope="session")
async def test_llm_inference_service_facebook_opt():
@pytest.mark.llminferenceservice
def test_llm_inference_service_facebook_opt():
🤖 Prompt for AI Agents
In test/e2e/llmisvc/test_llm_inference_service.py around lines 31 to 33, the
test function is marked as async with @pytest.mark.asyncio but contains no await
or async operations. Remove the @pytest.mark.asyncio decorator and the async
keyword from the function definition to eliminate unnecessary async overhead.

service_name = "facebook-opt-125m-single"

llm_isvc = V1alpha1LLMInferenceService(
api_version="serving.kserve.io/v1alpha1",
kind="LLMInferenceService",
metadata=client.V1ObjectMeta(
name=service_name, namespace=KSERVE_TEST_NAMESPACE
),
spec={
"model": {"uri": "hf://facebook/opt-125m", "name": "facebook/opt-125m"},
"replicas": 1,
"router": {"scheduler": {}, "route": {}, "gateway": {}},
"template": {
"containers": [
{
"name": "main",
"image": "quay.io/pierdipi/vllm-cpu:latest",
"env": [{"name": "VLLM_LOGGING_LEVEL", "value": "DEBUG"}],
"resources": {
"limits": {"cpu": "2", "memory": "16Gi"},
"requests": {"cpu": "1", "memory": "8Gi"},
},
"livenessProbe": {
"initialDelaySeconds": 30,
"periodSeconds": 30,
"timeoutSeconds": 30,
"failureThreshold": 5,
},
}
]
},
},
)

kserve_client = KServeClient(
config_file=os.environ.get("KUBECONFIG", "~/.kube/config")
)

try:
create_llmisvc(kserve_client, llm_isvc)

wait_for_model_response(kserve_client, service_name, KSERVE_TEST_NAMESPACE)

finally:
try:
delete_llmisvc(kserve_client, service_name, KSERVE_TEST_NAMESPACE)
except Exception as e:
print(f"Warning: Failed to cleanup service {service_name}: {e}")


# TODO this can be moved to kserve_client, keeping localized for now


def create_llmisvc(kserve_client, llm_isvc, namespace=None, timeout_seconds=600):
"""
Create LLM inference service - based on KServeClient.create() but for LLMInferenceService
:param kserve_client: KServe client instance
:param llm_isvc: LLM inference service object
:param namespace: defaults to current or default namespace
:param timeout_seconds: timeout seconds for create operation, default to 600s
:return: created LLM inference service
"""
from kserve.utils import utils

version = llm_isvc.api_version.split("/")[1]

if namespace is None:
namespace = utils.get_isvc_namespace(llm_isvc)

try:
outputs = kserve_client.api_instance.create_namespaced_custom_object(
constants.KSERVE_GROUP,
version,
namespace,
KSERVE_PLURAL_LLMINFERENCESERVICE,
llm_isvc,
)
return outputs
except client.rest.ApiException as e:
raise RuntimeError(
f"Exception when calling CustomObjectsApi->create_namespaced_custom_object for LLMInferenceService: {e}"
)
Comment on lines +113 to +115
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🛠️ Refactor suggestion

Improve exception handling by preserving the exception chain.

Use raise ... from e to maintain the exception chain for better debugging and error tracing.

Apply these diffs to fix exception handling:

In create_llmisvc:

     except client.rest.ApiException as e:
         raise RuntimeError(
             f"Exception when calling CustomObjectsApi->create_namespaced_custom_object for LLMInferenceService: {e}"
-        )
+        ) from e

In delete_llmisvc:

     except client.rest.ApiException as e:
         raise RuntimeError(
             f"Exception when calling CustomObjectsApi->delete_namespaced_custom_object for LLMInferenceService: {e}"
-        )
+        ) from e

In get_llmisvc:

     except client.rest.ApiException as e:
         raise RuntimeError(
             f"Exception when calling CustomObjectsApi->get_namespaced_custom_object for LLMInferenceService: {e}"
-        )
+        ) from e

In get_llm_service_url:

     except Exception as e:
-        raise ValueError(f"Failed to get URL for LLM inference service {service_name}: {e}")
+        raise ValueError(f"Failed to get URL for LLM inference service {service_name}: {e}") from e

Also applies to: 175-177, 197-199, 245-245

🧰 Tools
🪛 Ruff (0.11.9)

143-145: Within an except clause, raise exceptions with raise ... from err or raise ... from None to distinguish them from errors in exception handling

(B904)

🤖 Prompt for AI Agents
In test/e2e/llmisvc/test_llm_inference_service.py at lines 143-145 and similarly
at lines 175-177, 197-199, and 245, improve exception handling by changing the
raise statements to use "raise ... from e" syntax. This preserves the original
exception context and stack trace. Locate each raise RuntimeError statement that
includes the exception variable e and modify it to "raise RuntimeError(...) from
e" to maintain the exception chain for better debugging.



def delete_llmisvc(
kserve_client, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION
):
"""
Delete LLM inference service - based on KServeClient.delete() but for LLMInferenceService
:param kserve_client: KServe client instance
:param name: LLM inference service name
:param namespace: namespace
:param version: api group version
:return: delete response
"""
try:
return kserve_client.api_instance.delete_namespaced_custom_object(
constants.KSERVE_GROUP,
version,
namespace,
KSERVE_PLURAL_LLMINFERENCESERVICE,
name,
)
except client.rest.ApiException as e:
raise RuntimeError(
f"Exception when calling CustomObjectsApi->delete_namespaced_custom_object for LLMInferenceService: {e}"
)


def get_llmisvc(
kserve_client, name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION
):
"""
Get LLM inference service - based on KServeClient.get() but for LLMInferenceService
:param kserve_client: KServe client instance
:param name: LLM inference service name
:param namespace: namespace
:param version: api group version
:return: LLM inference service object
"""
try:
return kserve_client.api_instance.get_namespaced_custom_object(
constants.KSERVE_GROUP,
version,
namespace,
KSERVE_PLURAL_LLMINFERENCESERVICE,
name,
)
except client.rest.ApiException as e:
raise RuntimeError(
f"Exception when calling CustomObjectsApi->get_namespaced_custom_object for LLMInferenceService: {e}"
)


def wait_for_model_response(
kserve_client,
name,
namespace,
timeout_seconds=600,
version=constants.KSERVE_V1ALPHA1_VERSION,
):
"""
Wait for LLM model to respond successfully by polling the endpoint
:param kserve_client: KServe client instance
:param name: LLM inference service name
:param namespace: namespace
:param timeout_seconds: timeout seconds for waiting, default to 600s
:param version: api group version
:return: service URL when model responds successfully
"""

def assert_model_responds():
service_url = None

try:
service_url = get_llm_service_url(kserve_client, name, namespace, version)
except Exception as e:
raise AssertionError(f"Failed to get service URL: {e}")

completion_url = f"{service_url}/v1/completions"
test_payload = {"model": "facebook/opt-125m", "prompt": "test", "max_tokens": 1}

try:
response = requests.post(
completion_url,
headers={"Content-Type": "application/json"},
json=test_payload,
timeout=30,
)
except Exception as e:
raise AssertionError(f"Failed to call model: {e}")

assert (
response.status_code == 200
), f"Service returned {response.status_code}: {response.text}"

response_data = response.json()
assert "choices" in response_data, "Response should contain 'choices' field"

wait_for(assert_model_responds, timeout=timeout_seconds, interval=10.0)


def wait_for(assertion_fn, timeout: float = 5.0, interval: float = 0.1):
"""
Repeatedly calls assertion_fn() until it returns without AssertionError
or until timeout (in seconds) is reached.
"""
deadline = time.time() + timeout
while True:
try:
return assertion_fn()
except AssertionError:
if time.time() >= deadline:
# Re-raise the last AssertionError so pytest reports it
raise
time.sleep(interval)


def get_llm_service_url(
kserve_client, service_name, namespace, version=constants.KSERVE_V1ALPHA1_VERSION
):
try:
llm_isvc = get_llmisvc(kserve_client, service_name, namespace, version)

if "status" not in llm_isvc:
raise ValueError(f"No status found in LLM inference service {service_name}")

status = llm_isvc["status"]

if "url" in status and status["url"]:
return status["url"]

if (
"addresses" in status
and status["addresses"]
and len(status["addresses"]) > 0
):
first_address = status["addresses"][0]
if "url" in first_address:
return first_address["url"]

raise ValueError(f"No URL found in LLM inference service {service_name} status")

except Exception as e:
raise ValueError(
f"Failed to get URL for LLM inference service {service_name}: {e}"
)
1 change: 1 addition & 0 deletions test/e2e/pytest.ini
Original file line number Diff line number Diff line change
Expand Up @@ -17,4 +17,5 @@ markers =
path_based_routing: e2e tests for path based routing
llm: e2e tests for huggingface runtime
vllm: e2e tests for huggingface runtime with vllm-openvino backend
llminferenceservice: e2e tests for llm inference service controller
modelcache: e2e tests for model caching
Loading