ModelTC · sufubao · May 6, 2026 · May 6, 2026 · May 6, 2026 · May 6, 2026
diff --git a/docs/EN/source/cookbook/qwen35_deployment.rst b/docs/EN/source/cookbook/qwen35_deployment.rst
@@ -233,3 +233,12 @@ Hardware Requirements
 - ``--tp 8`` required to fit model weights across GPUs
 - Reduce ``--max_req_total_len`` or ``--graph_max_batch_size`` if encountering OOM errors
 - Use ``--data_type fp8_e4m3`` for FP8 KV quantization to further reduce memory pressure
+- For multimodal OOMs driven by dynamic-resolution images or video, cap the
+  per-step ViT workload with ``--visual_batch_max_tokens`` (e.g. ``16384``).
+  This bounds peak ViT memory the same way ``--batch_max_tokens`` bounds the
+  LLM prefill. Setting this alone also derives a default single-image cap
+  (``--visual_image_max_tokens`` is implicitly set to the same value), which
+  rejects any single image that couldn't fit in one batch — closing the "first
+  image always admitted" deadlock-avoidance hole. Override
+  ``--visual_image_max_tokens`` separately only if you need a stricter
+  single-image limit.
diff --git a/docs/EN/source/tutorial/api_server_args.rst b/docs/EN/source/tutorial/api_server_args.rst
@@ -272,6 +272,35 @@ Multimodal Parameters
 
     Number of images processed in each inference batch, default is ``1``
 
+.. option:: --visual_batch_max_tokens
+
+    Per-step ViT admission budget, measured in image output tokens (post
+    spatial_merge). The multimodal analogue of ``--batch_max_tokens``: the
+    ViT scheduler stops adding images to the current batch once their
+    cumulative ``token_num`` would exceed this value. Useful for bounding
+    peak ViT memory on dynamic-resolution models (Qwen2.5/3/3.5-VL, etc.)
+    where one 4K image or long video can contain more patches than many small images combined.
+    One image is always admitted per step to avoid deadlock when a single request is larger than
+    the budget. Default is ``None``; when left unset, the normal/PD launch path with ``--enable_multimodal``
+    derives it from ``--batch_max_tokens``, otherwise it stays disabled and only ``--visual_infer_batch_size`` applies.
+
+.. option:: --visual_image_max_tokens
+
+    Per-image hard cap, measured in image output tokens (post spatial_merge).
+    The multimodal analogue of ``--max_req_total_len``: a single image whose
+    ``token_num`` exceeds this value is rejected with a ``ValueError`` before
+    reaching the ViT. Pairs with ``--visual_batch_max_tokens`` to close the
+    "first image always admitted" hole — without this cap, one 4K image can
+    still OOM the ViT on its own.
+
+    If not specified, defaults to ``--visual_batch_max_tokens`` (a single image
+    must fit in one batch; this is the implicit precondition of the
+    first-image-always-admitted rule). Set explicitly to a smaller value only if
+    your ViT cannot handle a batch-sized image alone. Must satisfy
+    ``visual_image_max_tokens <= visual_batch_max_tokens``; startup fails with a ``ValueError`` otherwise.
+    The cap is disabled (``None``) only when no batch budget is in effect either, i.e.
+    ``--visual_batch_max_tokens`` is unset and was not derived from ``--batch_max_tokens``.
+
 .. option:: --visual_gpu_ids
 
     List of GPU IDs to use, e.g., 0 1 2

diff --git a/lightllm/models/qwen2_5_vl/qwen2_5_visual.py b/lightllm/models/qwen2_5_vl/qwen2_5_visual.py
@@ -8,7 +8,8 @@
 from io import BytesIO
 import torch.nn as nn
 from transformers.activations import ACT2FN
-from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor
+from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor, clamp_processor_max_pixels
+from lightllm.utils.envs_utils import get_env_start_args
 from safetensors import safe_open
 from lightllm.server.multimodal_params import ImageItem
 from lightllm.models.qwen2_vl.qwen2_visual import PatchEmbed, VisionRotaryEmbedding
@@ -208,6 +209,9 @@ def __init__(
         with open(processor_config_path, "r") as f:
             processor_config_dict = json.load(f)
         self.processor = Qwen2VLImageProcessor(**processor_config_dict)
+        clamp_processor_max_pixels(
+            self.processor, get_env_start_args().visual_image_max_tokens, processor_name="qwen2_5_vl-vit"
+        )
 
         self._init_datatype()
 

diff --git a/lightllm/models/qwen2_vl/qwen2_visual.py b/lightllm/models/qwen2_vl/qwen2_visual.py
@@ -33,7 +33,8 @@
 from safetensors import safe_open
 from lightllm.server.multimodal_params import ImageItem
 from lightllm.server.visualserver import get_vit_attn_backend
-from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor
+from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor, clamp_processor_max_pixels
+from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.common.basemodel.layer_infer.cache_tensor_manager import g_cache_manager
 from lightllm.models.qwen2_vl.triton_kernel.rotary_pos_emb import apply_rotary_pos_emb_triton
 
@@ -244,6 +245,9 @@ def load_model(self, weight_dir):
         with open(processor_config_path, "r") as f:
             processor_config_dict = json.load(f)
         self.processor = Qwen2VLImageProcessor(**processor_config_dict)
+        clamp_processor_max_pixels(
+            self.processor, get_env_start_args().visual_image_max_tokens, processor_name="qwen2_vl-vit"
+        )
 
         bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")]
         if bin_weight_files:

diff --git a/lightllm/models/qwen2_vl/vision_process.py b/lightllm/models/qwen2_vl/vision_process.py
@@ -27,6 +27,38 @@
 logger = init_logger(__name__)
 
 
+def clamp_processor_max_pixels(processor, visual_image_max_tokens, processor_name: str = "") -> None:
+    """Tighten ``processor.max_pixels`` to fit the per-image token budget.
+
+    One output token is treated as a ``unit x unit`` pixel tile, where
+    ``unit = patch_size * merge_size``, so the pixel cap written back is
+    ``visual_image_max_tokens * unit * unit``. Only ever lowers the cap: an
+    existing smaller ``max_pixels`` is kept; a missing/None one is replaced.
+    NOTE(review): the budget only holds if the resize path honors
+    ``max_pixels`` and ``min_pixels`` is not above the new cap - confirm.
+
+    No-op when ``visual_image_max_tokens`` is None. Raises ``ValueError`` when
+    the budget is below one token (pixel cap smaller than one tile).
+    """
+    if visual_image_max_tokens is None:
+        return
+    unit = processor.patch_size * processor.merge_size
+    allowed_max_pixels = visual_image_max_tokens * unit * unit
+    if allowed_max_pixels < unit * unit:
+        raise ValueError(
+            f"visual_image_max_tokens={visual_image_max_tokens} is too small; "
+            f"need at least 1 patch's worth (={unit * unit} pixels) for {processor_name or 'processor'}."
+        )
+    current_max_pixels = getattr(processor, "max_pixels", None)
+    if current_max_pixels is None or allowed_max_pixels < current_max_pixels:
+        logger.info(
+            f"{processor_name or 'processor'}: clamping max_pixels "
+            f"{current_max_pixels} -> {allowed_max_pixels} "
+            f"(visual_image_max_tokens={visual_image_max_tokens}, unit={unit})"
+        )
+        processor.max_pixels = allowed_max_pixels
+
+
 IMAGE_FACTOR = 28
 MIN_PIXELS = 4 * 28 * 28
 MAX_PIXELS = 16384 * 28 * 28

diff --git a/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py b/lightllm/models/qwen3_omni_moe_thinker/qwen3_omni_visual.py
@@ -27,7 +27,8 @@
 
 from lightllm.server.multimodal_params import ImageItem
 from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
-from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor
+from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor, clamp_processor_max_pixels
+from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.models.qwen2_vl.qwen2_visual import VisionRotaryEmbedding, VisionFlashAttention
 
 
@@ -225,6 +226,9 @@ def load_model(self, weight_dir):
         with open(processor_config_path, "r") as f:
             processor_config_dict = json.load(f)
         self.processor = Qwen2VLImageProcessor(**processor_config_dict)
+        clamp_processor_max_pixels(
+            self.processor, get_env_start_args().visual_image_max_tokens, processor_name="qwen3_omni-vit"
+        )
 
         bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")]
         if bin_weight_files:

diff --git a/lightllm/models/qwen3_vl/qwen3_visual.py b/lightllm/models/qwen3_vl/qwen3_visual.py
@@ -27,7 +27,8 @@
 
 from lightllm.server.multimodal_params import ImageItem
 from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
-from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor
+from lightllm.models.qwen2_vl.vision_process import resize_image, Qwen2VLImageProcessor, clamp_processor_max_pixels
+from lightllm.utils.envs_utils import get_env_start_args
 from lightllm.models.qwen2_vl.qwen2_visual import VisionRotaryEmbedding, VisionFlashAttention
 from lightllm.utils.log_utils import init_logger
 
@@ -220,6 +221,9 @@ def load_model(self, weight_dir):
         with open(processor_config_path, "r") as f:
             processor_config_dict = json.load(f)
         self.processor = Qwen2VLImageProcessor(**processor_config_dict)
+        clamp_processor_max_pixels(
+            self.processor, get_env_start_args().visual_image_max_tokens, processor_name="qwen3_vl-vit"
+        )
 
         bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")]
         if bin_weight_files:

diff --git a/lightllm/models/tarsier2/tarsier2_visual.py b/lightllm/models/tarsier2/tarsier2_visual.py
@@ -16,7 +16,8 @@
 from lightllm.models.qwen2_vl.qwen2_visual import Qwen2VisionTransformerPretrainedModel
 from lightllm.server.embed_cache.utils import read_shm, get_shm_name_data
 from lightllm.server.multimodal_params import ImageItem
-from lightllm.models.qwen2_vl.vision_process import Qwen2VLImageProcessor, resize_image
+from lightllm.models.qwen2_vl.vision_process import Qwen2VLImageProcessor, resize_image, clamp_processor_max_pixels
+from lightllm.utils.envs_utils import get_env_start_args
 
 
 def add_split_tokens(image_features, image_newline_embed, image_new_embed):
@@ -221,6 +222,9 @@ def load_model(self, weight_dir):
         with open(processor_config_path, "r") as f:
             processor_config_dict = json.load(f)
         self.processor = Qwen2VLImageProcessor(**processor_config_dict)
+        clamp_processor_max_pixels(
+            self.processor, get_env_start_args().visual_image_max_tokens, processor_name="tarsier2-vit"
+        )
 
         bin_weight_files = [file_ for file_ in os.listdir(weight_dir) if file_.endswith(".bin")]
         if bin_weight_files:

diff --git a/lightllm/server/api_cli.py b/lightllm/server/api_cli.py
@@ -472,6 +472,35 @@ def make_argument_parser() -> argparse.ArgumentParser:
     parser.add_argument(
         "--visual_infer_batch_size", type=int, default=None, help="number of images to process in each inference batch"
     )
+    parser.add_argument(
+        "--visual_batch_max_tokens",
+        type=int,
+        default=None,
+        help="""
+        Per-step ViT admission budget measured in image output tokens (post
+        spatial_merge). When set, the ViT scheduler stops adding images to the
+        current batch once their cumulative token_num would exceed this value.
+        Acts as the multimodal analogue of --batch_max_tokens and caps peak ViT
+        memory/compute for dynamic-resolution models (Qwen2.5/3/3.5-VL, etc.).
+        One image is always admitted per step to avoid deadlock when a single
+        request is larger than the budget. Defaults to None (disabled; only the
+        image-count cap applies).
+        """,
+    )
+    parser.add_argument(
+        "--visual_image_max_tokens",
+        type=int,
+        default=None,
+        help="""
+        Per-image hard cap, measured in image output tokens (post spatial_merge).
+        The multimodal analogue of --max_req_total_len: a single image whose
+        token_num exceeds this value is rejected with a ValueError instead of
+        being forwarded to the ViT. Pairs with --visual_batch_max_tokens to
+        close the "first image always admitted" hole — without this cap, one
+        4K image or long-aspect-ratio image can still OOM the ViT by itself.
+        Defaults to None (disabled; any size is accepted).
+        """,
+    )
     parser.add_argument(
         "--visual_send_batch_size",
         type=int,

diff --git a/lightllm/server/api_start.py b/lightllm/server/api_start.py
@@ -272,6 +272,24 @@ def normal_or_p_d_start(args):
         args.cpu_cache_token_page_size = args.linear_att_hash_page_size * args.linear_att_page_block_num
         logger.info(f"set cpu_cache_token_page_size to {args.cpu_cache_token_page_size} for linear hybrid att model")
 
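+    # Multimodal budget defaults:
+    # - visual_batch_max_tokens defaults to batch_max_tokens (the LLM and the ViT share one budget unit)
+    # - visual_image_max_tokens defaults to visual_batch_max_tokens (a single image must fit in one
+    #   batch, the implicit precondition of the "first image is always admitted" rule)
+    # An explicit user-provided value for either one overrides its default.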
+    if args.enable_multimodal:
+        if args.visual_batch_max_tokens is None:
+            args.visual_batch_max_tokens = args.batch_max_tokens
+            logger.info(f"visual_batch_max_tokens auto-derived from batch_max_tokens = {args.batch_max_tokens}")
+        if args.visual_image_max_tokens is None:
+            args.visual_image_max_tokens = args.visual_batch_max_tokens
+        if args.visual_image_max_tokens > args.visual_batch_max_tokens:
+            raise ValueError(
+                f"visual_image_max_tokens ({args.visual_image_max_tokens}) must be "
+                f"<= visual_batch_max_tokens ({args.visual_batch_max_tokens}); otherwise "
+                f"a single 'valid' image can always exceed the batch budget alone."
+            )
+
     # help to manage data stored on Ceph
     if "s3://" in args.model_dir:
         from lightllm.utils.petrel_helper import s3_model_prepare
@@ -563,6 +581,17 @@ def visual_only_start(args):
         args.visual_gpu_ids = list(range(args.visual_dp * args.visual_tp))
     if args.visual_infer_batch_size is None:
         args.visual_infer_batch_size = args.visual_dp
+    if args.visual_image_max_tokens is None and args.visual_batch_max_tokens is not None:
+        args.visual_image_max_tokens = args.visual_batch_max_tokens
+    if (
+        args.visual_image_max_tokens is not None
+        and args.visual_batch_max_tokens is not None
+        and args.visual_image_max_tokens > args.visual_batch_max_tokens
+    ):
+        raise ValueError(
+            f"visual_image_max_tokens ({args.visual_image_max_tokens}) must be "
+            f"<= visual_batch_max_tokens ({args.visual_batch_max_tokens})"
+        )
     if args.data_type is None:
         from lightllm.utils.config_utils import get_dtype
 

diff --git a/lightllm/server/core/objs/start_args_type.py b/lightllm/server/core/objs/start_args_type.py
@@ -107,6 +107,8 @@ class StartArgs:
     push_interval: int = field(default=10)
     visual_node_id: int = field(default=None)
     visual_infer_batch_size: int = field(default=None)
+    visual_batch_max_tokens: Optional[int] = field(default=None)
+    visual_image_max_tokens: Optional[int] = field(default=None)
     visual_send_batch_size: int = field(default=1)
     visual_gpu_ids: List[int] = field(default_factory=lambda: [0])
     visual_tp: int = field(default=1)

diff --git a/lightllm/server/httpserver/manager.py b/lightllm/server/httpserver/manager.py
@@ -35,6 +35,7 @@
 from lightllm.utils.config_utils import get_vocab_size
 from lightllm.utils.envs_utils import get_unique_server_name
 from lightllm.utils.error_utils import NixlPrefillNodeStopGenToken
+from lightllm.utils.multimodal_utils import enforce_image_token_budget
 from rpyc.utils.classic import obtain
 
 logger = init_logger(__name__)
@@ -179,11 +180,12 @@ async def _alloc_multimodal_resources(self, multimodal_params: MultimodalParams,
         # 只有 P 和 NORMAL 节点需要真的管理多模态资源
         if self.pd_mode.is_P_or_NORMAL():
             items, md5sums, tokens_nums, datas = [], [], [], []
-            for img in multimodal_params.images:
+            for img_index, img in enumerate(multimodal_params.images):
                 self.tokenizer.init_imageitem_extral_params(img, multimodal_params, sampling_params)
                 data = img.read()
                 # must after init_imageitem_extral_params
                 token_num = self.tokenizer.get_image_token_length(img)
+                enforce_image_token_budget(token_num, self.args.visual_image_max_tokens, image_index=img_index)
                 md5sum = hashlib.md5(data).hexdigest() + "_" + str(hash(frozendict(img.extra_params)))
                 md5sums.append(md5sum)
                 img.md5 = md5sum
@@ -236,10 +238,12 @@ def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwar
         img_count = 0
         audio_tokens = 0
         audio_count = 0
-        for img in multimodal_params.images:
+        for img_index, img in enumerate(multimodal_params.images):
             img_count += 1
             self.tokenizer.init_imageitem_extral_params(img, multimodal_params, samping_params)
-            image_tokens += self.tokenizer.get_image_token_length(img)
+            token_num = self.tokenizer.get_image_token_length(img)
+            enforce_image_token_budget(token_num, self.args.visual_image_max_tokens, image_index=img_index)
+            image_tokens += token_num
         for audio in multimodal_params.audios:
             audio_count += 1
             self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, samping_params)

diff --git a/lightllm/server/httpserver_for_pd_master/manager.py b/lightllm/server/httpserver_for_pd_master/manager.py
@@ -21,6 +21,7 @@
 from lightllm.server.httpserver.manager import AsyncQueue
 from lightllm.utils.error_utils import ServerBusyError
 from lightllm.utils.envs_utils import get_pd_split_max_new_tokens
+from lightllm.utils.multimodal_utils import enforce_image_token_budget
 from .pd_selector import create_selector
 
 logger = init_logger(__name__)
@@ -73,10 +74,12 @@ def tokens(self, prompt, multimodal_params, samping_params: SamplingParams, kwar
         img_count = 0
         audio_tokens = 0
         audio_count = 0
-        for img in multimodal_params.images:
+        for img_index, img in enumerate(multimodal_params.images):
             img_count += 1
             self.tokenizer.init_imageitem_extral_params(img, multimodal_params, samping_params)
-            image_tokens += self.tokenizer.get_image_token_length(img)
+            token_num = self.tokenizer.get_image_token_length(img)
+            enforce_image_token_budget(token_num, self.args.visual_image_max_tokens, image_index=img_index)
+            image_tokens += token_num
         for audio in multimodal_params.audios:
             audio_count += 1
             self.tokenizer.init_audioitem_extral_params(audio, multimodal_params, samping_params)