Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
21 commits
Select commit Hold shift + click to select a range
e3ed0b8
argument for freezing vision encoder parameters added
ehsk Nov 26, 2025
97943e2
freezing vision tower code simplified
ehsk Nov 26, 2025
0e336be
minor issue fixed
ehsk Nov 26, 2025
7d17b5a
removed unnecessary logs
ehsk Nov 27, 2025
691073d
non-trainable parameters excluded from grouped_parameters
ehsk Nov 27, 2025
32d8985
replace "python" with current executable python depending on current env
ehsk Nov 28, 2025
71db4a8
add processor args to vllm
ehsk Nov 28, 2025
14b017b
epsilons for chartqa added
ehsk Dec 2, 2025
fe00bea
max_stream_size added for redis to avoid OOM
ehsk Dec 2, 2025
84c4349
mini-batch size can be greater than 1
ehsk Dec 5, 2025
3b5e2a0
a fix for configs in VLMs like Qwen3-VL or Apriel where there's a tex…
ehsk Dec 18, 2025
c3a3039
refactorings and improvements
ehsk Jan 26, 2026
8e0c285
Merge branch 'main' of github.com:ServiceNow/PipelineRL into multimod…
ehsk Jan 26, 2026
1bd9eca
Merge remote-tracking branch 'origin/main' into multimodal-tweaks
ehsk Apr 9, 2026
d274131
merged with main and conflicts resolved
ehsk May 8, 2026
a10ccd4
mm_processor_kwargs moved inside llm in configs
ehsk May 8, 2026
e3e2ecf
improved error message for samples discarded while reading from redis
ehsk May 11, 2026
8299e4a
AutoModel for vision models updated to match latest transformers
ehsk May 11, 2026
5ea03d1
use queue.Queue instead of multiprocessing.Queue for raw_chunk_queue
ehsk May 11, 2026
d185900
stream size reduced and processor args updated for Qwen3-VL
ehsk May 11, 2026
acfc3fb
new task, visual math, added
ehsk May 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion conf/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ actor:
result_queue_size: 64
throughput_window_size: 50
shared_memory_entry_size: 10000000
# Maximum number of entries to retain in the actor data stream (Redis only for now)
max_stream_size: 1000000
environment: null
preprocess:
input: actor
Expand All @@ -42,6 +44,8 @@ preprocess:
pop_old_data: ${..pop_old_data}
shared_memory_entry_size: 100000000
log_every_n_samples: 128
# Maximum number of entries to retain in the training data stream (Redis only for now)
max_stream_size: 1000000

llm:
parameters:
Expand All @@ -50,7 +54,7 @@ llm:
# changed
temperature: 1.0
test_llm:
parameters:
parameters:
max_tokens: 8192
temperature: 1.0
top_p: 0.95
Expand Down
23 changes: 21 additions & 2 deletions conf/chartqa.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,29 +8,42 @@ finetune:
seq_length: 8000
gradient_accumulation_passes: 512
seq_packing: false
rl:
epsilon_high: 4.0
epsilon_low: 0.0

llm:
parameters:
max_tokens: 2048
temperature: 0.7
# Processor configuration for vision-language models (shared between training and inference)
# Defaults are for Qwen3-VL (patch=16, merge=2 -> effective 32x32 per visual token).
# For Qwen2.5-VL (effective 28x28 per visual token), use: min_pixels=784, max_pixels=1003520.
mm_processor_kwargs:
min_pixels: 1024 # 32*32
max_pixels: 1310720 # 1280*32*32
use_fast: true

test_llm:
parameters:
max_tokens: 2048
temperature: 0.7
mm_processor_kwargs: ${llm.mm_processor_kwargs}

actor:
rollout_policy: pipelinerl.domains.chartqa.generate_chartqa_rollout
system_prompt: You are an expert at analyzing charts and graphs. Please examine the chart carefully and answer the question accurately. Remember to provide your final answer in a boxed format, like \\boxed{{your answer}}.
task_template: |-
Question: {question}

Please analyze the chart step by step and put your final answer within \\boxed{{}}.
llm_max_rollouts: 16
shared_memory_entry_size: 2000000000
max_stream_size: 1000

preprocess:
shared_memory_entry_size: 2000000000
max_stream_size: 1000

environment: null

Expand All @@ -43,10 +56,16 @@ test_dataset_names:
- chartqa_test

# Use vision-language model for multimodal support
model_path: Qwen/Qwen2.5-VL-3B-Instruct
model_path: Qwen/Qwen3-VL-4B-Instruct
# model_path: Qwen/Qwen2.5-VL-3B-Instruct

eval_every_n_versions: 12500

# Override vLLM config for multimodal support
vllm_config:
vllm_kwargs:
max-num-seqs: 64
max-num-batched-tokens: 32768
max_model_len: 8000
# Qwen3-VL defaults (effective 32x32 per visual token); for Qwen2.5-VL use min_pixels=784, max_pixels=1003520
mm-processor-kwargs: '{"min_pixels": 1024, "max_pixels": 1310720, "use_fast": true}' # 32*32 to 1280*32*32
2 changes: 2 additions & 0 deletions conf/finetune/base.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@ data: null
model_class: causal-language-modeling
# Model name or path of model to be trained.
config_name: ${..model_path}
# Freeze vision tower for vision-language models (only applicable for vision2seq-language-modeling)
freeze_vision_tower: false
# Optimizer type, supported: adamw_torch, adafactor, cpuadam, lion
optim: adamw_torch
# use half precision training, full bf16 without mixed precision copies at all
Expand Down
71 changes: 71 additions & 0 deletions conf/mathv.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
defaults:
- base
- override streams: redis
- _self_

finetune:
model_class: vision2seq-language-modeling
seq_length: 12000
gradient_accumulation_passes: 512
seq_packing: false
rl:
epsilon_high: 4.0
epsilon_low: 0.0

llm:
parameters:
max_tokens: 4096
temperature: 0.7
# Processor configuration for vision-language models (shared between training and inference)
# Defaults are for Qwen3-VL (patch=16, merge=2 -> effective 32x32 per visual token).
# For Qwen2.5-VL (effective 28x28 per visual token), use: min_pixels=784, max_pixels=1003520.
mm_processor_kwargs:
min_pixels: 1024 # 32*32
max_pixels: 1310720 # 1280*32*32
use_fast: true

test_llm:
parameters:
max_tokens: 4096
temperature: 0.7
mm_processor_kwargs: ${llm.mm_processor_kwargs}

actor:
rollout_policy: pipelinerl.domains.mathv.generate_mathv_rollout
system_prompt: You are an expert at solving math problems involving geometric figures, charts, and diagrams. Examine the image carefully and reason step by step. Provide your final answer in a boxed format, like \\boxed{{your answer}}.
task_template: |-
Question: {question}

Solve step by step and put your final answer within \\boxed{{}}.
llm_max_rollouts: 16
shared_memory_entry_size: 2000000000
max_stream_size: 1000

preprocess:
shared_memory_entry_size: 2000000000
max_stream_size: 1000

environment: null

dataset_loader: pipelinerl.domains.mathv.load_problems

train_dataset_names:
- geometry3k_train

test_dataset_names:
- mathvista_testmini

# Use vision-language model for multimodal support
model_path: Qwen/Qwen3-VL-4B-Instruct
# model_path: Qwen/Qwen2.5-VL-3B-Instruct

eval_every_n_versions: 12500

# Override vLLM config for multimodal support
vllm_config:
vllm_kwargs:
max-num-seqs: 64
max_model_len: 12000
max-num-batched-tokens: 32768
# Qwen3-VL defaults (effective 32x32 per visual token); for Qwen2.5-VL use min_pixels=784, max_pixels=1003520
mm-processor-kwargs: '{"min_pixels": 1024, "max_pixels": 1310720, "use_fast": true}' # 32*32 to 1280*32*32
4 changes: 3 additions & 1 deletion pipelinerl/actor.py
Original file line number Diff line number Diff line change
Expand Up @@ -535,7 +535,7 @@ def _run(self, dataset: list[tuple[str, dict]]):

logger.info(f"Start {'train' if self.is_training else 'test'} actor loop")
with (
write_to_streams(self.data_stream, "a") as data_stream_writer,
write_to_streams(self.data_stream, "a", max_stream_size=self.cfg.actor.max_stream_size) as data_stream_writer,
write_to_streams(self.stats_stream, "a") as stats_writer,
):
while True:
Expand Down Expand Up @@ -825,6 +825,7 @@ def run_actor_loop(cfg: DictConfig):
parameters=cfg.llm.parameters,
collect_logprobs=True,
chat_template_kwargs=cfg.llm.get("chat_template_kwargs", {}),
mm_processor_kwargs=cfg.llm.get("mm_processor_kwargs", {}),
)
for url in llm_urls
]
Expand All @@ -836,6 +837,7 @@ def run_actor_loop(cfg: DictConfig):
parameters=cfg.test_llm.parameters,
collect_logprobs=True,
chat_template_kwargs=cfg.test_llm.get("chat_template_kwargs", {}),
mm_processor_kwargs=cfg.test_llm.get("mm_processor_kwargs", {}),
)
for url in llm_urls
]
Expand Down
11 changes: 6 additions & 5 deletions pipelinerl/async_llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@

import aiohttp
import numpy as np
import torch
from PIL import Image
from pipelinerl.llm import LLMCall, LLMOutput, Prompt, TokenLogprob, TrainableLLM

from pipelinerl.finetune.data import MASKED_TOKEN_ID
from pipelinerl.rollouts import TrainingText
from pipelinerl.processor_factory import get_processor
from pipelinerl.vision_processor_utils import get_mm_processor
from omegaconf import DictConfig, ListConfig, OmegaConf

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -221,7 +222,7 @@ def make_training_text(llm: TrainableLLM, llm_call: LLMCall) -> TrainingText:

if use_processor:
# Use processor for vision-language models
processor = get_processor(llm.model_name)
processor = get_mm_processor(llm.model_name, mm_processor_kwargs=llm.mm_processor_kwargs)

try:
# Apply chat template using processor for proper image token handling
Expand Down Expand Up @@ -253,11 +254,11 @@ def make_training_text(llm: TrainableLLM, llm_call: LLMCall) -> TrainingText:
processed = processor(
text=[prompt_text], images=images, padding=True, return_tensors=None
)
# Convert PyTorch tensors to numpy arrays
visual_features = {
key: value
key: value.cpu().numpy() if torch.is_tensor(value) else value
for key, value in processed.items()
if isinstance(value, np.ndarray)
and key not in ["input_ids", "attention_mask"]
if key not in ["input_ids", "attention_mask"]
}

except Exception as e:
Expand Down
12 changes: 12 additions & 0 deletions pipelinerl/domains/mathv/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Math Visual Reasoning (mathv)

A Vision Language Model (VLM) RL example for math reasoning over images.
Trains on [Geometry3K](https://huggingface.co/datasets/hiyouga/geometry3k)
and evaluates on [MathVista](https://huggingface.co/datasets/AI4Math/MathVista)
(`testmini` split).

## Usage

```bash
python -m pipelinerl.launch output_dir=results/mathv --config-name mathv
```
6 changes: 6 additions & 0 deletions pipelinerl/domains/mathv/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
"""Math visual reasoning domain (Geometry3K for training, MathVista for eval)."""

from .mathv import generate_mathv_rollout
from .load_datasets import load_problems

__all__ = ["generate_mathv_rollout", "load_problems"]
56 changes: 56 additions & 0 deletions pipelinerl/domains/mathv/evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
import re
from typing import Optional


def relaxed_correctness(target: str,
                        prediction: str,
                        max_relative_change: float = 0.05) -> bool:
    """Return True if *prediction* matches *target* under relaxed criteria.

    Numeric answers are accepted when their relative error with respect to
    the target is at most ``max_relative_change`` (the standard "relaxed
    accuracy" tolerance). When either side is non-numeric, fall back to
    case-insensitive exact string match (handles letter answers like
    "A"/"B"/"C"/"D" and short strings).

    Args:
        target: Ground-truth answer string.
        prediction: Model-predicted answer string.
        max_relative_change: Maximum allowed relative error for numeric
            comparison (default 5%).

    Returns:
        True when the prediction is accepted as correct.
    """

    def _to_float(text: str) -> Optional[float]:
        # Percentages like "45%" are normalized to fractions (0.45) so they
        # compare correctly against fractional ground truths.
        try:
            if text.endswith("%"):
                return float(text.rstrip("%")) / 100.0
            return float(text)
        except ValueError:
            return None

    prediction_float = _to_float(prediction)
    target_float = _to_float(target)
    if prediction_float is not None and target_float is not None:
        if target_float == 0.0:
            # Relative error is undefined for a zero target; require the
            # prediction to be numerically zero (e.g. "0", "0.0", "0%").
            # The previous truthiness check (`and target_float`) wrongly
            # fell back to string comparison here, rejecting "0.0" vs "0".
            return prediction_float == 0.0
        relative_change = abs(prediction_float - target_float) / abs(target_float)
        return relative_change <= max_relative_change
    return prediction.strip().lower() == target.strip().lower()


def extract_boxed_answer(text: str) -> str | None:
"""Extract answer from \\boxed{} format."""
boxed_pattern = r'\\boxed\{([^}]*)\}'
matches = re.findall(boxed_pattern, text, re.IGNORECASE)
if matches:
return matches[-1].strip()
return None


def evaluate_answer(predicted: str, ground_truth: str) -> str:
    """
    Evaluate math-visual answer and return status.

    Returns:
        - "correct": Answer is correct
        - "wrong": Answer is incorrect
        - "no_answer": No \\boxed{} found
        - "unparsable": Could not parse answer
    """
    try:
        extracted = extract_boxed_answer(predicted)
        # An absent or empty \boxed{} both count as "no answer given".
        if not extracted:
            return "no_answer"
        return "correct" if relaxed_correctness(ground_truth, extracted) else "wrong"
    except Exception:
        # Defensive catch-all: any parsing failure is reported, not raised.
        return "unparsable"
Loading