From 33efcce917a15b7a20ca7e7916136ea9f1b18db7 Mon Sep 17 00:00:00 2001 From: "Li, Tianmu" Date: Tue, 19 May 2026 20:21:35 +0000 Subject: [PATCH] Allow tokenizer_name override Signed-off-by: Li, Tianmu --- src/inference_endpoint/commands/benchmark/execute.py | 5 ++++- src/inference_endpoint/config/schema.py | 4 ++++ .../config/templates/concurrency_template_full.yaml | 1 + .../config/templates/offline_template_full.yaml | 1 + .../config/templates/online_template_full.yaml | 1 + tests/unit/config/test_schema.py | 9 +++++++++ 6 files changed, 20 insertions(+), 1 deletion(-) diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py index 1a569f60..33c8fa0f 100644 --- a/src/inference_endpoint/commands/benchmark/execute.py +++ b/src/inference_endpoint/commands/benchmark/execute.py @@ -386,7 +386,10 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo # Tokenizer check (light API call, no download) model_name = config.model_params.name - tokenizer_name = model_name if _check_tokenizer_exists(model_name) else None + tokenizer_source = config.model_params.tokenizer_name or model_name + tokenizer_name = ( + tokenizer_source if _check_tokenizer_exists(tokenizer_source) else None + ) # Streaming logger.info( diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py index d3390483..2175c6fa 100644 --- a/src/inference_endpoint/config/schema.py +++ b/src/inference_endpoint/config/schema.py @@ -197,6 +197,10 @@ class ModelParams(BaseModel): StreamingMode, cyclopts.Parameter(alias="--streaming", help="Streaming mode: auto/on/off"), ] = StreamingMode.AUTO + tokenizer_name: str | None = Field( + None, + description="HuggingFace tokenizer repo ID. Overrides model name for tokenizer loading.", + ) class SubmissionReference(BaseModel): diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml index 24e2f337..64f9e22e 100644 --- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml +++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml @@ -12,6 +12,7 @@ model_params: max_new_tokens: 1024 # Max output tokens osl_distribution: null # Output sequence length distribution streaming: 'on' # Streaming mode: auto/on/off | options: auto, on, off + tokenizer_name: null # HuggingFace tokenizer repo ID. Overrides model name for tokenizer loading. datasets: # Dataset configs - name: perf type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml index 460e9493..1b5969d0 100644 --- a/src/inference_endpoint/config/templates/offline_template_full.yaml +++ b/src/inference_endpoint/config/templates/offline_template_full.yaml @@ -12,6 +12,7 @@ model_params: max_new_tokens: 1024 # Max output tokens osl_distribution: null # Output sequence length distribution streaming: 'off' # Streaming mode: auto/on/off | options: auto, on, off + tokenizer_name: null # HuggingFace tokenizer repo ID. Overrides model name for tokenizer loading. datasets: # Dataset configs - name: perf type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml index c03ae175..226f0a64 100644 --- a/src/inference_endpoint/config/templates/online_template_full.yaml +++ b/src/inference_endpoint/config/templates/online_template_full.yaml @@ -12,6 +12,7 @@ model_params: max_new_tokens: 1024 # Max output tokens osl_distribution: null # Output sequence length distribution streaming: 'on' # Streaming mode: auto/on/off | options: auto, on, off + tokenizer_name: null # HuggingFace tokenizer repo ID. Overrides model name for tokenizer loading. datasets: # Dataset configs - name: perf type: performance # Dataset purpose: performance or accuracy | options: performance, accuracy diff --git a/tests/unit/config/test_schema.py b/tests/unit/config/test_schema.py index ea7dc9ee..0aaa44c2 100644 --- a/tests/unit/config/test_schema.py +++ b/tests/unit/config/test_schema.py @@ -68,6 +68,7 @@ def test_defaults(self): params = ModelParams(name="test") assert params.temperature is None assert params.max_new_tokens == 1024 + assert params.tokenizer_name is None @pytest.mark.unit def test_with_osl_distribution(self): @@ -84,6 +85,14 @@ def test_with_osl_distribution(self): assert params.temperature == 0.5 assert params.osl_distribution.type == OSLDistributionType.NORMAL + @pytest.mark.unit + def test_tokenizer_name_override(self): + params = ModelParams( + name="qwen/qwen3.6-35b-a3b", tokenizer_name="Qwen/Qwen3.6-35B-A3B" + ) + assert params.tokenizer_name == "Qwen/Qwen3.6-35B-A3B" + assert params.name == "qwen/qwen3.6-35b-a3b" + class TestAPIType: @pytest.mark.unit