From 33efcce917a15b7a20ca7e7916136ea9f1b18db7 Mon Sep 17 00:00:00 2001
From: "Li, Tianmu" <tianmu.li@intel.com>
Date: Tue, 19 May 2026 20:21:35 +0000
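Subject: [PATCH] Allow tokenizer_name override

Add an optional `tokenizer_name` field to `ModelParams`. When it is set,
`setup_benchmark` runs the tokenizer existence check against it instead
of the model name; when it is unset, the model name is used as before.
If the check fails, the resolved tokenizer name is still None.

The full config templates list the new field with a null default, and
the schema unit tests cover both the default and the override.
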
Signed-off-by: Li, Tianmu <tianmu.li@intel.com>
---
 src/inference_endpoint/commands/benchmark/execute.py     | 5 ++++-
 src/inference_endpoint/config/schema.py                  | 4 ++++
 .../config/templates/concurrency_template_full.yaml      | 1 +
 .../config/templates/offline_template_full.yaml          | 1 +
 .../config/templates/online_template_full.yaml           | 1 +
 tests/unit/config/test_schema.py                         | 9 +++++++++
 6 files changed, 20 insertions(+), 1 deletion(-)

diff --git a/src/inference_endpoint/commands/benchmark/execute.py b/src/inference_endpoint/commands/benchmark/execute.py
index 1a569f60..33c8fa0f 100644
--- a/src/inference_endpoint/commands/benchmark/execute.py
+++ b/src/inference_endpoint/commands/benchmark/execute.py
@@ -386,7 +386,10 @@ def setup_benchmark(config: BenchmarkConfig, test_mode: TestMode) -> BenchmarkCo
 
     # Tokenizer check (light API call, no download)
     model_name = config.model_params.name
-    tokenizer_name = model_name if _check_tokenizer_exists(model_name) else None
+    tokenizer_source = config.model_params.tokenizer_name or model_name
+    tokenizer_name = (
+        tokenizer_source if _check_tokenizer_exists(tokenizer_source) else None
+    )
 
     # Streaming
     logger.info(
diff --git a/src/inference_endpoint/config/schema.py b/src/inference_endpoint/config/schema.py
index d3390483..2175c6fa 100644
--- a/src/inference_endpoint/config/schema.py
+++ b/src/inference_endpoint/config/schema.py
@@ -197,6 +197,10 @@ class ModelParams(BaseModel):
         StreamingMode,
         cyclopts.Parameter(alias="--streaming", help="Streaming mode: auto/on/off"),
     ] = StreamingMode.AUTO
+    tokenizer_name: str | None = Field(
+        None,
+        description="HuggingFace tokenizer repo ID. Overrides model name for tokenizer loading.",
+    )
 
 
 class SubmissionReference(BaseModel):
diff --git a/src/inference_endpoint/config/templates/concurrency_template_full.yaml b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
index 24e2f337..64f9e22e 100644
--- a/src/inference_endpoint/config/templates/concurrency_template_full.yaml
+++ b/src/inference_endpoint/config/templates/concurrency_template_full.yaml
@@ -12,6 +12,7 @@ model_params:
   max_new_tokens: 1024  # Max output tokens
   osl_distribution: null  # Output sequence length distribution
   streaming: 'on'  # Streaming mode: auto/on/off | options: auto, on, off
+  tokenizer_name: null  # HuggingFace tokenizer repo ID. Overrides model name for tokenizer loading.
 datasets:  # Dataset configs
 - name: perf
   type: performance  # Dataset purpose: performance or accuracy | options: performance, accuracy
diff --git a/src/inference_endpoint/config/templates/offline_template_full.yaml b/src/inference_endpoint/config/templates/offline_template_full.yaml
index 460e9493..1b5969d0 100644
--- a/src/inference_endpoint/config/templates/offline_template_full.yaml
+++ b/src/inference_endpoint/config/templates/offline_template_full.yaml
@@ -12,6 +12,7 @@ model_params:
   max_new_tokens: 1024  # Max output tokens
   osl_distribution: null  # Output sequence length distribution
   streaming: 'off'  # Streaming mode: auto/on/off | options: auto, on, off
+  tokenizer_name: null  # HuggingFace tokenizer repo ID. Overrides model name for tokenizer loading.
 datasets:  # Dataset configs
 - name: perf
   type: performance  # Dataset purpose: performance or accuracy | options: performance, accuracy
diff --git a/src/inference_endpoint/config/templates/online_template_full.yaml b/src/inference_endpoint/config/templates/online_template_full.yaml
index c03ae175..226f0a64 100644
--- a/src/inference_endpoint/config/templates/online_template_full.yaml
+++ b/src/inference_endpoint/config/templates/online_template_full.yaml
@@ -12,6 +12,7 @@ model_params:
   max_new_tokens: 1024  # Max output tokens
   osl_distribution: null  # Output sequence length distribution
   streaming: 'on'  # Streaming mode: auto/on/off | options: auto, on, off
+  tokenizer_name: null  # HuggingFace tokenizer repo ID. Overrides model name for tokenizer loading.
 datasets:  # Dataset configs
 - name: perf
   type: performance  # Dataset purpose: performance or accuracy | options: performance, accuracy
diff --git a/tests/unit/config/test_schema.py b/tests/unit/config/test_schema.py
index ea7dc9ee..0aaa44c2 100644
--- a/tests/unit/config/test_schema.py
+++ b/tests/unit/config/test_schema.py
@@ -68,6 +68,7 @@ def test_defaults(self):
         params = ModelParams(name="test")
         assert params.temperature is None
         assert params.max_new_tokens == 1024
+        assert params.tokenizer_name is None
 
     @pytest.mark.unit
     def test_with_osl_distribution(self):
@@ -84,6 +85,14 @@ def test_with_osl_distribution(self):
         assert params.temperature == 0.5
         assert params.osl_distribution.type == OSLDistributionType.NORMAL
 
+    @pytest.mark.unit
+    def test_tokenizer_name_override(self):
+        params = ModelParams(
+            name="qwen/qwen3.6-35b-a3b", tokenizer_name="Qwen/Qwen3.6-35B-A3B"
+        )
+        assert params.tokenizer_name == "Qwen/Qwen3.6-35B-A3B"
+        assert params.name == "qwen/qwen3.6-35b-a3b"
+
 
 class TestAPIType:
     @pytest.mark.unit