From ca6c5a88a83d9c2073a7b39d4f3267147a957818 Mon Sep 17 00:00:00 2001
From: Zhipeng Wang <zhipengbayern@gmail.com>
Date: Tue, 26 May 2026 07:14:41 +0000
Subject: [PATCH 01/18] Add OPSD example: config, divergence losses, utils +
 tests

First slice of the on-policy distillation example app under examples/opsd/.
This commit lands the framework-agnostic foundation: the OPSDConfig dataclass
hierarchy, chunked / streamed forward-KL / reverse-KL / JSD losses with
sequence-axis chunking to bound peak memory, response-mask + shift helpers,
and a 24-case CPU-only test suite covering identity, masking, chunk
equivalence, gradient flow, and numerical edge cases.

Signed-off-by: Zhipeng Wang <zhipengbayern@gmail.com>
---
 examples/opsd/opsd/__init__.py     |  17 +++
 examples/opsd/opsd/config.py       | 149 ++++++++++++++++++++++
 examples/opsd/opsd/losses.py       | 192 +++++++++++++++++++++++++++++
 examples/opsd/opsd/utils.py        |  52 ++++++++
 examples/opsd/requirements.txt     |   5 +
 examples/opsd/tests/test_losses.py | 166 +++++++++++++++++++++++++
 6 files changed, 581 insertions(+)
 create mode 100644 examples/opsd/opsd/__init__.py
 create mode 100644 examples/opsd/opsd/config.py
 create mode 100644 examples/opsd/opsd/losses.py
 create mode 100644 examples/opsd/opsd/utils.py
 create mode 100644 examples/opsd/requirements.txt
 create mode 100644 examples/opsd/tests/test_losses.py

diff --git a/examples/opsd/opsd/__init__.py b/examples/opsd/opsd/__init__.py
new file mode 100644
index 000000000000..a0916026f680
--- /dev/null
+++ b/examples/opsd/opsd/__init__.py
@@ -0,0 +1,17 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""On-Policy Distillation (OPSD) training on DeepSpeed.
+
+A student model generates rollouts; a frozen teacher scores them; the student
+is updated by a per-token divergence (forward-KL / reverse-KL / JSD) computed
+against the teacher's distribution on the student's own samples.
+
+Supports two rollout engines selected via config:
+    * ``hybrid_engine`` — DeepSpeed's built-in train+infer engine (ZeRO-3 safe)
+    * ``vllm``          — vLLM running on a disjoint set of GPUs with NCCL
+                          weight sync from the trainer each step
+"""
+
+__version__ = "0.1.0"
diff --git a/examples/opsd/opsd/config.py b/examples/opsd/opsd/config.py
new file mode 100644
index 000000000000..b55487d738bd
--- /dev/null
+++ b/examples/opsd/opsd/config.py
@@ -0,0 +1,149 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Configuration dataclasses for OPSD training.
+
+A single :class:`OPSDConfig` is loaded from a JSON file (see ``configs/`` for
+examples) and threaded through the rest of the pipeline. We use plain
+dataclasses instead of Hydra/pydantic to match the rest of the DeepSpeed
+example apps and to keep the dependency surface minimal.
+"""
+
+import json
+from dataclasses import dataclass, field, asdict
+from typing import List, Optional
+
+
+@dataclass
+class StudentConfig:
+    model_name_or_path: str
+    dtype: str = "bfloat16"
+    trust_remote_code: bool = False
+    # Architecture key used to look up the weight bridge for vLLM rollout. If
+    # unset, the trainer will infer it from the HF config's ``model_type``.
+    arch: Optional[str] = None
+
+
+@dataclass
+class TeacherConfig:
+    model_name_or_path: str
+    dtype: str = "bfloat16"
+    trust_remote_code: bool = False
+    # Keep teacher params on CPU and gather per-forward via ZeRO-3. Saves GPU
+    # memory at the cost of host<->device transfer each step.
+    offload_to_cpu: bool = True
+
+
+@dataclass
+class RolloutConfig:
+    # "hybrid_engine" | "vllm"
+    engine: str = "hybrid_engine"
+
+    # Generation knobs (apply to either engine)
+    max_prompt_length: int = 1024
+    max_response_length: int = 1024
+    temperature: float = 1.0
+    top_p: float = 1.0
+    top_k: int = -1
+    n_samples_per_prompt: int = 1
+
+    # vLLM-specific. ``gpus`` is the disjoint set of CUDA device indices vLLM
+    # may use; the training ranks must not overlap with these. If None, the
+    # trainer will refuse to start in vllm mode.
+    gpus: Optional[List[int]] = None
+    tensor_parallel_size: int = 1
+    gpu_memory_utilization: float = 0.85
+    vllm_dtype: str = "bfloat16"
+    # Push student weights into vLLM every N optimizer steps. Larger values
+    # trade staleness for throughput.
+    weight_sync_interval: int = 1
+    # Pinned vLLM version known to expose the worker APIs we rely on.
+    vllm_min_version: str = "0.6.4"
+    # Skip CUDA-graph capture at vLLM startup. Saves several minutes of
+    # one-time compilation (worth it for smoke tests / short-lived runs);
+    # leave False for steady-state throughput.
+    vllm_enforce_eager: bool = False
+
+
+@dataclass
+class DistillationConfig:
+    # "forward_kl" | "reverse_kl" | "jsd"
+    loss_type: str = "reverse_kl"
+    temperature: float = 1.0
+    # Chunk size along the sequence dimension for the per-token divergence.
+    # Bounds peak memory: full [B, T, V] is never materialized at once when
+    # T > chunk_size.
+    chunk_size: int = 512
+
+
+@dataclass
+class TrainingConfig:
+    train_batch_size: int = 8
+    micro_batch_size_per_gpu: int = 1
+    gradient_accumulation_steps: int = 1
+    learning_rate: float = 1e-6
+    weight_decay: float = 0.0
+    num_train_epochs: int = 1
+    max_steps: int = -1
+    warmup_steps: int = 0
+    save_steps: int = 500
+    logging_steps: int = 10
+    save_dir: str = "./opsd_ckpt"
+    seed: int = 42
+
+
+@dataclass
+class DataConfig:
+    path: str = ""
+    prompt_field: str = "prompt"
+    # Optional HF chat template override; if None we use the student tokenizer's
+    # default.
+    chat_template: Optional[str] = None
+    shuffle: bool = True
+
+
+@dataclass
+class OPSDConfig:
+    student: StudentConfig
+    teacher: TeacherConfig
+    rollout: RolloutConfig = field(default_factory=RolloutConfig)
+    distillation: DistillationConfig = field(default_factory=DistillationConfig)
+    training: TrainingConfig = field(default_factory=TrainingConfig)
+    data: DataConfig = field(default_factory=DataConfig)
+    # Path to the DeepSpeed JSON config used for ``deepspeed.initialize`` on the
+    # student. Kept as a separate file because it has its own schema owned by
+    # DeepSpeed.
+    deepspeed_config: str = ""
+
+    @classmethod
+    def from_json(cls, path: str) -> "OPSDConfig":
+        with open(path, "r") as f:
+            raw = json.load(f)
+        return cls.from_dict(raw)
+
+    @classmethod
+    def from_dict(cls, raw: dict) -> "OPSDConfig":
+        return cls(
+            student=StudentConfig(**raw["student"]),
+            teacher=TeacherConfig(**raw["teacher"]),
+            rollout=RolloutConfig(**raw.get("rollout", {})),
+            distillation=DistillationConfig(**raw.get("distillation", {})),
+            training=TrainingConfig(**raw.get("training", {})),
+            data=DataConfig(**raw.get("data", {})),
+            deepspeed_config=raw.get("deepspeed_config", ""),
+        )
+
+    def to_dict(self) -> dict:
+        return asdict(self)
+
+    def validate(self) -> None:
+        if self.distillation.loss_type not in ("forward_kl", "reverse_kl", "jsd"):
+            raise ValueError(f"Unknown loss_type {self.distillation.loss_type!r}")
+        if self.rollout.engine not in ("hybrid_engine", "vllm"):
+            raise ValueError(f"Unknown rollout engine {self.rollout.engine!r}")
+        # rollout.gpus may be left empty for the "shared" topology where vLLM
+        # runs in-process on the same GPU as training rank 0; populated for
+        # the "disjoint" topology where it runs on a separate set of devices.
+        if self.distillation.chunk_size <= 0:
+            raise ValueError("distillation.chunk_size must be positive")
diff --git a/examples/opsd/opsd/losses.py b/examples/opsd/opsd/losses.py
new file mode 100644
index 000000000000..d9f4b9266da5
--- /dev/null
+++ b/examples/opsd/opsd/losses.py
@@ -0,0 +1,192 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Per-token distillation divergences with sequence-axis chunking.
+
+The full ``[B, T, V]`` tensor produced by a forward pass on a modern LLM can
+easily exceed several GB in fp32 (e.g. 8 * 1024 * 150k * 4 B ~ 4.9 GB). Holding
+both student *and* teacher logits at once would double that. We chunk along the
+sequence axis so the per-chunk softmax + difference only ever needs
+``[B, chunk, V]`` of working memory, regardless of T.
+
+Math conventions:
+    * ``forward_kl``  = D_KL(teacher || student) — mode-covering for student
+    * ``reverse_kl``  = D_KL(student || teacher) — mode-seeking for student
+    * ``jsd``         = 0.5 * D_KL(P || M) + 0.5 * D_KL(Q || M), M = (P+Q)/2
+
+All three follow the standard knowledge-distillation temperature convention:
+divide logits by T before softmax, then multiply the result by T**2 so that
+gradient magnitudes are comparable across temperatures.
+"""
+
+from typing import Callable
+
+import torch
+import torch.nn.functional as F
+
+
+def _forward_kl(student_logits: torch.Tensor, teacher_logits: torch.Tensor, temperature: float) -> torch.Tensor:
+    s_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
+    t_log_probs = F.log_softmax(teacher_logits / temperature, dim=-1)
+    t_probs = t_log_probs.exp()
+    kl = (t_probs * (t_log_probs - s_log_probs)).sum(dim=-1)
+    return kl * (temperature**2)
+
+
+def _reverse_kl(student_logits: torch.Tensor, teacher_logits: torch.Tensor, temperature: float) -> torch.Tensor:
+    s_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
+    t_log_probs = F.log_softmax(teacher_logits / temperature, dim=-1)
+    s_probs = s_log_probs.exp()
+    kl = (s_probs * (s_log_probs - t_log_probs)).sum(dim=-1)
+    return kl * (temperature**2)
+
+
+def _jsd(student_logits: torch.Tensor, teacher_logits: torch.Tensor, temperature: float) -> torch.Tensor:
+    s_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
+    t_log_probs = F.log_softmax(teacher_logits / temperature, dim=-1)
+    s_probs = s_log_probs.exp()
+    t_probs = t_log_probs.exp()
+    m_probs = 0.5 * (s_probs + t_probs)
+    # Clamp guards against log(0) when both distributions have ~0 mass on the
+    # same vocab id (rare in practice but possible after temperature scaling).
+    m_log_probs = m_probs.clamp_min(1e-12).log()
+    kl_s = (s_probs * (s_log_probs - m_log_probs)).sum(dim=-1)
+    kl_t = (t_probs * (t_log_probs - m_log_probs)).sum(dim=-1)
+    return 0.5 * (kl_s + kl_t) * (temperature**2)
+
+
+_LOSS_FNS: "dict[str, Callable[..., torch.Tensor]]" = {
+    "forward_kl": _forward_kl,
+    "reverse_kl": _reverse_kl,
+    "jsd": _jsd,
+}
+
+
+def chunked_distillation_loss(
+    student_logits: torch.Tensor,
+    teacher_logits: torch.Tensor,
+    response_mask: torch.Tensor,
+    loss_type: str = "reverse_kl",
+    temperature: float = 1.0,
+    chunk_size: int = 512,
+) -> torch.Tensor:
+    """Mean per-token divergence over response positions, chunked over the
+    sequence axis to bound peak memory.
+
+    Args:
+        student_logits: ``[B, T, V]`` — gradient flows here.
+        teacher_logits: ``[B, T, V]`` — caller is responsible for ``detach()``
+            (we do not detach here so the function stays cheap).
+        response_mask:  ``[B, T]`` — 1 where the position should contribute to
+            the loss (i.e. response tokens, not prompt or padding), 0 elsewhere.
+        loss_type:      ``"forward_kl"`` | ``"reverse_kl"`` | ``"jsd"``.
+        temperature:    KD temperature; >1 softens both distributions.
+        chunk_size:     Sequence-axis chunk size.
+
+    Returns:
+        Scalar loss = sum-over-positions(per_tok * mask) / sum(mask), promoted
+        to fp32 internally for numerical stability.
+    """
+    if loss_type not in _LOSS_FNS:
+        raise ValueError(f"Unknown loss_type {loss_type!r}; choose from {sorted(_LOSS_FNS)}")
+    fn = _LOSS_FNS[loss_type]
+
+    if student_logits.shape != teacher_logits.shape:
+        raise ValueError(f"shape mismatch: student {tuple(student_logits.shape)} vs teacher "
+                         f"{tuple(teacher_logits.shape)}")
+    B, T, _ = student_logits.shape
+    if response_mask.shape != (B, T):
+        raise ValueError(f"response_mask {tuple(response_mask.shape)} does not match logits "
+                         f"prefix ({B}, {T})")
+
+    mask_f = response_mask.to(torch.float32)
+    total_tokens = mask_f.sum().clamp_min(1.0)
+    total_loss = student_logits.new_zeros((), dtype=torch.float32)
+
+    for start in range(0, T, chunk_size):
+        end = min(start + chunk_size, T)
+        chunk_mask = mask_f[:, start:end]
+        # Skipping empty chunks avoids a redundant forward through the softmax
+        # path on chunks that wouldn't contribute anything to the sum.
+        if chunk_mask.sum().item() == 0:
+            continue
+        per_tok = fn(
+            student_logits[:, start:end].float(),
+            teacher_logits[:, start:end].float(),
+            temperature,
+        )
+        total_loss = total_loss + (per_tok * chunk_mask).sum()
+
+    return total_loss / total_tokens
+
+
+def streamed_distillation_loss(
+    student_logits: torch.Tensor,
+    teacher_chunk_fetcher: Callable[[int, int], torch.Tensor],
+    response_mask: torch.Tensor,
+    loss_type: str = "reverse_kl",
+    temperature: float = 1.0,
+    chunk_size: int = 512,
+) -> torch.Tensor:
+    """Same math as :func:`chunked_distillation_loss`, but teacher logits are
+    pulled chunk-by-chunk via a fetcher so the full ``[B, T, V]`` teacher
+    tensor never needs to live on the same device as the student.
+
+    Args:
+        student_logits: ``[B, T, V]`` on the training device.
+        teacher_chunk_fetcher: ``fn(start, end) -> [B, end - start, V]``, already
+            on the same device and broadcastable dtype as ``student_logits``.
+            Typically wraps ``TeacherLogitCache.chunk_to_device``.
+        response_mask:  ``[B, T]`` — 1 where the position should contribute.
+        loss_type:      one of ``"forward_kl" | "reverse_kl" | "jsd"``.
+        temperature:    KD temperature.
+        chunk_size:     Sequence-axis chunk size.
+    """
+    if loss_type not in _LOSS_FNS:
+        raise ValueError(f"Unknown loss_type {loss_type!r}; choose from {sorted(_LOSS_FNS)}")
+    fn = _LOSS_FNS[loss_type]
+
+    B, T, _ = student_logits.shape
+    if response_mask.shape != (B, T):
+        raise ValueError(f"response_mask {tuple(response_mask.shape)} does not match logits "
+                         f"prefix ({B}, {T})")
+
+    mask_f = response_mask.to(torch.float32)
+    total_tokens = mask_f.sum().clamp_min(1.0)
+    total_loss = student_logits.new_zeros((), dtype=torch.float32)
+
+    for start in range(0, T, chunk_size):
+        end = min(start + chunk_size, T)
+        chunk_mask = mask_f[:, start:end]
+        if chunk_mask.sum().item() == 0:
+            continue
+        teacher_chunk = teacher_chunk_fetcher(start, end)
+        if teacher_chunk.shape[1] != (end - start):
+            raise RuntimeError(f"fetcher returned chunk of length {teacher_chunk.shape[1]}, "
+                               f"expected {end - start}")
+        per_tok = fn(
+            student_logits[:, start:end].float(),
+            teacher_chunk.float(),
+            temperature,
+        )
+        total_loss = total_loss + (per_tok * chunk_mask).sum()
+
+    return total_loss / total_tokens
+
+
+def per_token_logprobs(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
+    """Gather log p(label_t | context_<t) for each position.
+
+    Convenience helper used by the trainer for diagnostic logging (e.g. mean
+    student log-prob of the rollout, mean teacher log-prob of the rollout).
+
+    Args:
+        logits: ``[B, T, V]``.
+        labels: ``[B, T]`` token ids.
+
+    Returns:
+        ``[B, T]`` of log-probabilities.
+    """
+    log_probs = F.log_softmax(logits.float(), dim=-1)
+    return log_probs.gather(-1, labels.unsqueeze(-1)).squeeze(-1)
diff --git a/examples/opsd/opsd/utils.py b/examples/opsd/opsd/utils.py
new file mode 100644
index 000000000000..b2954407b774
--- /dev/null
+++ b/examples/opsd/opsd/utils.py
@@ -0,0 +1,52 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Small tensor/masking helpers shared by trainer, losses, and tests.
+
+These intentionally stay free of DeepSpeed / distributed imports so the
+non-distributed unit tests can exercise them on CPU without a torchrun
+launcher.
+"""
+
+import torch
+
+
+def build_response_mask(response_start_idx: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
+    """Mark positions belonging to the response (not prompt, not padding).
+
+    Args:
+        response_start_idx: ``[B]`` int tensor — the first column index that is
+            part of the response, per sample. For *right-padded* prompts this
+            equals the prompt's token count; for the more common *left-padded*
+            convention used by causal generation it equals the prompt section
+            length (i.e. the column where prompt ends and response begins).
+        attention_mask: ``[B, T]`` — 1 on real tokens (prompt + response), 0 on
+            padding.
+
+    Returns:
+        ``[B, T]`` 0/1 mask with the same dtype as ``attention_mask``. 1 only
+        at positions ``t >= response_start_idx[b]`` that are also attended.
+    """
+    if response_start_idx.dim() != 1:
+        raise ValueError(f"response_start_idx must be 1-D, got shape {tuple(response_start_idx.shape)}")
+    if attention_mask.dim() != 2:
+        raise ValueError(f"attention_mask must be 2-D, got shape {tuple(attention_mask.shape)}")
+    B, T = attention_mask.shape
+    if response_start_idx.shape[0] != B:
+        raise ValueError(f"response_start_idx batch ({response_start_idx.shape[0]}) != "
+                         f"attention_mask batch ({B})")
+
+    pos = torch.arange(T, device=attention_mask.device).unsqueeze(0).expand(B, T)
+    is_response = pos >= response_start_idx.to(pos.dtype).unsqueeze(1)
+    return is_response.to(attention_mask.dtype) * attention_mask
+
+
+def shift_for_next_token_prediction(logits: torch.Tensor, labels: torch.Tensor):
+    """Align logits at position t with the label at position t+1.
+
+    Returns:
+        Tuple ``(shifted_logits[:, :-1, :], shifted_labels[:, 1:])`` — both
+        contiguous, so they can be safely indexed for the divergence loss.
+    """
+    return logits[:, :-1, :].contiguous(), labels[:, 1:].contiguous()
diff --git a/examples/opsd/requirements.txt b/examples/opsd/requirements.txt
new file mode 100644
index 000000000000..fb5a091575da
--- /dev/null
+++ b/examples/opsd/requirements.txt
@@ -0,0 +1,5 @@
+datasets>=2.0.0
+numpy
+transformers>=4.40.0
+# Optional, only needed when rollout.engine == "vllm":
+# vllm>=0.6.4
diff --git a/examples/opsd/tests/test_losses.py b/examples/opsd/tests/test_losses.py
new file mode 100644
index 000000000000..1cf9aede6756
--- /dev/null
+++ b/examples/opsd/tests/test_losses.py
@@ -0,0 +1,166 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""CPU-only numerics tests for the distillation divergences.
+
+These exercise the loss math without needing GPUs, models, or a torchrun
+launcher. Run from the example root with::
+
+    cd examples/opsd && python -m pytest tests/test_losses.py -v
+"""
+
+import pytest
+import torch
+
+from opsd.losses import chunked_distillation_loss, per_token_logprobs
+from opsd.utils import build_response_mask, shift_for_next_token_prediction
+
+
+@pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"])
+def test_zero_when_identical(loss_type):
+    torch.manual_seed(0)
+    logits = torch.randn(2, 8, 32)
+    mask = torch.ones(2, 8)
+    loss = chunked_distillation_loss(logits, logits.clone(), mask, loss_type=loss_type)
+    assert loss.item() == pytest.approx(0.0, abs=1e-5)
+
+
+@pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"])
+def test_positive_when_different(loss_type):
+    torch.manual_seed(0)
+    s = torch.randn(2, 8, 32)
+    t = torch.randn(2, 8, 32)
+    mask = torch.ones(2, 8)
+    loss = chunked_distillation_loss(s, t, mask, loss_type=loss_type)
+    assert loss.item() > 0.0
+
+
+@pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"])
+def test_chunking_equivalent_to_unchunked(loss_type):
+    torch.manual_seed(0)
+    s = torch.randn(2, 100, 32)
+    t = torch.randn(2, 100, 32)
+    mask = torch.ones(2, 100)
+    loss_chunked = chunked_distillation_loss(s, t, mask, loss_type=loss_type, chunk_size=10)
+    loss_whole = chunked_distillation_loss(s, t, mask, loss_type=loss_type, chunk_size=10_000)
+    assert torch.allclose(loss_chunked, loss_whole, atol=1e-5)
+
+
+def test_mask_excludes_tokens():
+    torch.manual_seed(0)
+    s = torch.randn(2, 8, 32)
+    t = torch.randn(2, 8, 32)
+    half_mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0]], dtype=torch.float32)
+    loss_direct = chunked_distillation_loss(s[:, :4], t[:, :4], torch.ones(2, 4), loss_type="reverse_kl")
+    loss_masked = chunked_distillation_loss(s, t, half_mask, loss_type="reverse_kl")
+    assert torch.allclose(loss_direct, loss_masked, atol=1e-5)
+
+
+def test_gradient_flows_to_student():
+    torch.manual_seed(0)
+    s = torch.randn(2, 8, 32, requires_grad=True)
+    t = torch.randn(2, 8, 32)
+    mask = torch.ones(2, 8)
+    loss = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl")
+    loss.backward()
+    assert s.grad is not None
+    assert s.grad.abs().sum().item() > 0
+
+
+def test_gradient_does_not_flow_to_teacher_when_detached():
+    torch.manual_seed(0)
+    s = torch.randn(2, 8, 32, requires_grad=True)
+    t = torch.randn(2, 8, 32, requires_grad=True)
+    mask = torch.ones(2, 8)
+    loss = chunked_distillation_loss(s, t.detach(), mask, loss_type="reverse_kl")
+    loss.backward()
+    assert t.grad is None
+
+
+def test_unknown_loss_type_raises():
+    s = torch.randn(2, 4, 8)
+    t = torch.randn(2, 4, 8)
+    mask = torch.ones(2, 4)
+    with pytest.raises(ValueError, match="Unknown loss_type"):
+        chunked_distillation_loss(s, t, mask, loss_type="totally_made_up")
+
+
+def test_shape_mismatch_raises():
+    s = torch.randn(2, 4, 8)
+    t = torch.randn(2, 5, 8)
+    mask = torch.ones(2, 4)
+    with pytest.raises(ValueError, match="shape mismatch"):
+        chunked_distillation_loss(s, t, mask)
+
+
+def test_mask_shape_mismatch_raises():
+    s = torch.randn(2, 4, 8)
+    t = torch.randn(2, 4, 8)
+    mask = torch.ones(2, 5)
+    with pytest.raises(ValueError, match="does not match"):
+        chunked_distillation_loss(s, t, mask)
+
+
+@pytest.mark.parametrize("temperature", [0.5, 1.0, 2.0])
+def test_temperature_changes_loss_but_stays_finite(temperature):
+    torch.manual_seed(0)
+    s = torch.randn(2, 8, 32)
+    t = torch.randn(2, 8, 32)
+    mask = torch.ones(2, 8)
+    loss = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl", temperature=temperature)
+    assert torch.isfinite(loss).item()
+
+
+def test_jsd_is_symmetric():
+    torch.manual_seed(0)
+    a = torch.randn(2, 8, 32)
+    b = torch.randn(2, 8, 32)
+    mask = torch.ones(2, 8)
+    jsd_ab = chunked_distillation_loss(a, b, mask, loss_type="jsd")
+    jsd_ba = chunked_distillation_loss(b, a, mask, loss_type="jsd")
+    assert torch.allclose(jsd_ab, jsd_ba, atol=1e-5)
+
+
+def test_all_zero_mask_returns_zero():
+    torch.manual_seed(0)
+    s = torch.randn(2, 8, 32)
+    t = torch.randn(2, 8, 32)
+    mask = torch.zeros(2, 8)
+    loss = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl")
+    assert loss.item() == pytest.approx(0.0, abs=1e-6)
+
+
+def test_per_token_logprobs_matches_manual():
+    torch.manual_seed(0)
+    logits = torch.randn(2, 4, 16)
+    labels = torch.randint(0, 16, (2, 4))
+    got = per_token_logprobs(logits, labels)
+    expected = torch.log_softmax(logits.float(), dim=-1)
+    expected = expected.gather(-1, labels.unsqueeze(-1)).squeeze(-1)
+    assert torch.allclose(got, expected, atol=1e-6)
+
+
+def test_build_response_mask_basic():
+    attention_mask = torch.tensor([[1, 1, 1, 1, 0], [1, 1, 1, 1, 1]])
+    response_start_idx = torch.tensor([2, 3])
+    resp = build_response_mask(response_start_idx, attention_mask)
+    expected = torch.tensor([[0, 0, 1, 1, 0], [0, 0, 0, 1, 1]])
+    assert torch.equal(resp, expected)
+
+
+def test_build_response_mask_validates_shapes():
+    with pytest.raises(ValueError, match="response_start_idx must be 1-D"):
+        build_response_mask(torch.zeros(2, 2), torch.ones(2, 4))
+    with pytest.raises(ValueError, match="attention_mask must be 2-D"):
+        build_response_mask(torch.zeros(2), torch.ones(4))
+    with pytest.raises(ValueError, match="batch"):
+        build_response_mask(torch.zeros(3), torch.ones(2, 4))
+
+
+def test_shift_for_next_token_prediction_shapes():
+    logits = torch.randn(2, 5, 8)
+    labels = torch.randint(0, 8, (2, 5))
+    sl, sla = shift_for_next_token_prediction(logits, labels)
+    assert sl.shape == (2, 4, 8)
+    assert sla.shape == (2, 4)

From cfc276824bbf4d7864e62cd272f8e1d0a444793f Mon Sep 17 00:00:00 2001
From: Zhipeng Wang <zhipengbayern@gmail.com>
Date: Tue, 26 May 2026 07:15:03 +0000
Subject: [PATCH 02/18] Add OPSD frozen teacher with CPU logit cache + tests

Adds the two-phase teacher path:

  * TeacherWrapper loads a HuggingFace causal LM, freezes it, and runs
    forward-only. Two modes: load + pin on GPU (offload_to_cpu=false), or
    wrap with deepspeed.initialize using a ZeRO-3 + offload_param=cpu
    config (offload_to_cpu=true). Avoids deepspeed.zero.Init() around
    from_pretrained because HF's loader partitions params to zero-width
    shards before the checkpoint can fill them.

  * TeacherLogitCache stages the [B, T, V] teacher logits to (pinned) host
    memory in bf16, and exposes chunk_to_device() so the student-side loss
    can pull sequence slices back on demand. This is the memory-economising
    half of the two-phase update.

CPU-only tests cover the cache shape / dtype / round-trip / chunk-bounds
behaviour and verify the streamed-via-cache loss matches the direct
chunked loss bit-for-bit.

Signed-off-by: Zhipeng Wang <zhipengbayern@gmail.com>
---
 examples/opsd/opsd/teacher.py               | 191 ++++++++++++++++++++
 examples/opsd/tests/test_teacher_caching.py | 101 +++++++++++
 2 files changed, 292 insertions(+)
 create mode 100644 examples/opsd/opsd/teacher.py
 create mode 100644 examples/opsd/tests/test_teacher_caching.py

diff --git a/examples/opsd/opsd/teacher.py b/examples/opsd/opsd/teacher.py
new file mode 100644
index 000000000000..a7895beddf00
--- /dev/null
+++ b/examples/opsd/opsd/teacher.py
@@ -0,0 +1,191 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Frozen teacher: two-phase forward with CPU-cached logits.
+
+The trainer runs each step in two phases:
+
+  1. **Teacher phase.** Forward over the prompt+response. The full ``[B, T, V]``
+     logit tensor is moved off the GPU into a :class:`TeacherLogitCache` so that
+     teacher weight buffers can be released before the student backward pass.
+  2. **Student phase.** Forward + backward on the student. The distillation
+     loss pulls teacher logits back to GPU **one sequence chunk at a time** via
+     :meth:`TeacherLogitCache.chunk_to_device`, so peak GPU memory for teacher
+     data is only ``[B, chunk, V]``.
+
+This module deliberately lazy-imports ``deepspeed`` and ``transformers`` so
+that the pure data-handling pieces (``TeacherLogitCache`` and the streamed
+loss in :mod:`opsd.losses`) remain importable in CPU-only unit tests that do
+not have a working DeepSpeed launcher.
+"""
+
+from dataclasses import dataclass
+from typing import Optional, Tuple
+
+import torch
+
+# ``opsd.config`` is pure-Python (no distributed imports), so we can import it
+# at module load time without pulling in DeepSpeed.
+from opsd.config import TeacherConfig
+
+
+@dataclass
+class TeacherLogitCache:
+    """CPU-resident teacher logits with on-demand chunk fetch.
+
+    Stored in low precision (default ``bfloat16``) to halve host memory; the
+    consumer in :mod:`opsd.losses` promotes back to fp32 inside the divergence
+    so the KD math stays well-conditioned.
+    """
+
+    cpu_logits: torch.Tensor  # [B, T, V]
+
+    def __post_init__(self) -> None:
+        if self.cpu_logits.dim() != 3:
+            raise ValueError(f"cpu_logits must be 3-D [B, T, V]; got shape "
+                             f"{tuple(self.cpu_logits.shape)}")
+        if self.cpu_logits.device.type != "cpu":
+            raise ValueError(f"cpu_logits must live on CPU; got device "
+                             f"{self.cpu_logits.device}")
+
+    @classmethod
+    def from_gpu_logits(cls, logits: torch.Tensor, store_dtype: torch.dtype = torch.bfloat16) -> "TeacherLogitCache":
+        """Detach + downcast + move to (pinned) host memory.
+
+        ``non_blocking=True`` lets the copy overlap with the next CUDA op when
+        the destination is pinned; we try to pin and fall back silently if the
+        host doesn't support it (e.g. CPU-only test environments).
+        """
+        downcast = logits.detach().to(dtype=store_dtype)
+        try:
+            host = torch.empty(downcast.shape, dtype=store_dtype, pin_memory=True)
+            host.copy_(downcast, non_blocking=True)
+        except RuntimeError:
+            host = downcast.cpu()
+        return cls(cpu_logits=host)
+
+    @property
+    def shape(self) -> Tuple[int, int, int]:
+        s = self.cpu_logits.shape
+        return (int(s[0]), int(s[1]), int(s[2]))
+
+    @property
+    def dtype(self) -> torch.dtype:
+        return self.cpu_logits.dtype
+
+    def chunk_to_device(self,
+                        start: int,
+                        end: int,
+                        device: torch.device,
+                        dtype: Optional[torch.dtype] = None) -> torch.Tensor:
+        """Slice ``[:, start:end, :]`` and stage it on ``device``.
+
+        ``dtype`` is the dtype on the destination; if ``None``, the stored
+        dtype is preserved.
+        """
+        _, T, _ = self.shape
+        if not (0 <= start < end <= T):
+            raise ValueError(f"chunk bounds [{start}, {end}) invalid for T={T}")
+        chunk = self.cpu_logits[:, start:end]
+        out = chunk.to(device=device, dtype=dtype if dtype is not None else chunk.dtype, non_blocking=True)
+        return out
+
+    def free(self) -> None:
+        """Drop the underlying buffer so a step's teacher cache can be GC'd
+        before the next teacher forward."""
+        self.cpu_logits = torch.empty(0)
+
+
+_DTYPE_MAP = {
+    "float16": torch.float16,
+    "fp16": torch.float16,
+    "bfloat16": torch.bfloat16,
+    "bf16": torch.bfloat16,
+    "float32": torch.float32,
+    "fp32": torch.float32,
+}
+
+
+def _resolve_dtype(name: str) -> torch.dtype:
+    if name not in _DTYPE_MAP:
+        raise ValueError(f"Unknown dtype {name!r}; choose from {sorted(_DTYPE_MAP)}")
+    return _DTYPE_MAP[name]
+
+
+class TeacherWrapper:
+    """Frozen teacher.
+
+    Two modes depending on ``cfg.offload_to_cpu``:
+
+      * ``offload_to_cpu=False`` — load the teacher with HF's standard
+        ``from_pretrained`` and pin it on the local accelerator device. The
+        whole teacher lives in GPU memory; simplest path and what to use when
+        the teacher fits.
+
+      * ``offload_to_cpu=True`` — wrap the loaded model with
+        ``deepspeed.initialize`` using a ZeRO-3 config with
+        ``offload_param.device='cpu'``. The optimizer slot is unused (no
+        trainable params) but ZeRO-3 gives us per-forward parameter gather
+        / release and keeps weights on the host between forwards. This is the
+        path to use when the teacher would otherwise not fit alongside the
+        student.
+
+    Both paths load the full checkpoint on each rank before DeepSpeed (if
+    used) partitions; we intentionally do **not** wrap ``from_pretrained``
+    in ``deepspeed.zero.Init()`` because HF's loader partitions
+    ``low_cpu_mem_usage`` params to zero-width shards before the checkpoint
+    can fill them, which surfaces as a "size mismatch" load error.
+    """
+
+    def __init__(self, cfg: TeacherConfig, world_size: int):
+        from deepspeed.accelerator import get_accelerator
+        from transformers import AutoModelForCausalLM
+
+        self.cfg = cfg
+        dtype = _resolve_dtype(cfg.dtype)
+        device = get_accelerator().current_device_name()
+
+        model = AutoModelForCausalLM.from_pretrained(
+            cfg.model_name_or_path,
+            torch_dtype=dtype,
+            trust_remote_code=cfg.trust_remote_code,
+        )
+        model.eval()
+        for p in model.parameters():
+            p.requires_grad_(False)
+
+        if cfg.offload_to_cpu:
+            import deepspeed
+
+            ds_config = {
+                "train_micro_batch_size_per_gpu": 1,
+                "bf16": {
+                    "enabled": dtype is torch.bfloat16
+                },
+                "fp16": {
+                    "enabled": dtype is torch.float16
+                },
+                "zero_optimization": {
+                    "stage": 3,
+                    "offload_param": {
+                        "device": "cpu"
+                    },
+                },
+            }
+            engine, *_ = deepspeed.initialize(model=model, config=ds_config)
+            self._callable = engine
+            self._uses_ds = True
+        else:
+            model.to(device)
+            self._callable = model
+            self._uses_ds = False
+
+    @torch.no_grad()
+    def forward_to_cache(self,
+                         input_ids: torch.Tensor,
+                         attention_mask: torch.Tensor,
+                         store_dtype: torch.dtype = torch.bfloat16) -> TeacherLogitCache:
+        """Run teacher forward and stage logits onto the host."""
+        outputs = self._callable(input_ids=input_ids, attention_mask=attention_mask)
+        return TeacherLogitCache.from_gpu_logits(outputs.logits, store_dtype=store_dtype)
diff --git a/examples/opsd/tests/test_teacher_caching.py b/examples/opsd/tests/test_teacher_caching.py
new file mode 100644
index 000000000000..5702bc287ffe
--- /dev/null
+++ b/examples/opsd/tests/test_teacher_caching.py
@@ -0,0 +1,101 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""CPU-only tests for TeacherLogitCache.
+
+The ``TeacherWrapper`` itself (which wraps deepspeed+transformers) is not
+exercised here because it requires a real model and a DeepSpeed launcher; the
+caching/streaming pieces are isolated into ``TeacherLogitCache`` so they can
+be tested in isolation.
+"""
+
+import pytest
+import torch
+
+from opsd.teacher import TeacherLogitCache
+
+
+def test_round_trip_preserves_values_within_dtype():
+    torch.manual_seed(0)
+    gpu_like = torch.randn(2, 16, 32, dtype=torch.float32)
+    cache = TeacherLogitCache.from_gpu_logits(gpu_like, store_dtype=torch.bfloat16)
+    assert cache.shape == (2, 16, 32)
+    assert cache.dtype == torch.bfloat16
+    chunk = cache.chunk_to_device(0, 16, torch.device("cpu"), dtype=torch.float32)
+    # bf16 round-trip loses precision; check it stays within bf16's worst-case
+    # relative error rather than asserting exact equality.
+    assert torch.allclose(chunk, gpu_like, atol=1e-1, rtol=1e-1)
+
+
+def test_chunk_slicing_is_correct():
+    torch.manual_seed(0)
+    src = torch.randn(3, 100, 8)
+    cache = TeacherLogitCache.from_gpu_logits(src, store_dtype=torch.float32)
+    for start, end in [(0, 10), (10, 50), (50, 100), (33, 77)]:
+        got = cache.chunk_to_device(start, end, torch.device("cpu"))
+        assert got.shape == (3, end - start, 8)
+        assert torch.allclose(got, src[:, start:end])
+
+
+def test_invalid_chunk_bounds_raise():
+    cache = TeacherLogitCache.from_gpu_logits(torch.zeros(1, 8, 4), store_dtype=torch.float32)
+    with pytest.raises(ValueError, match="invalid"):
+        cache.chunk_to_device(0, 9, torch.device("cpu"))
+    with pytest.raises(ValueError, match="invalid"):
+        cache.chunk_to_device(5, 3, torch.device("cpu"))
+    with pytest.raises(ValueError, match="invalid"):
+        cache.chunk_to_device(-1, 4, torch.device("cpu"))
+
+
+def test_rejects_non_3d_logits():
+    with pytest.raises(ValueError, match="must be 3-D"):
+        TeacherLogitCache(cpu_logits=torch.zeros(8, 32))
+
+
+def test_rejects_gpu_resident_logits():
+    if not torch.cuda.is_available():  #ignore-cuda
+        pytest.skip("no CUDA available to construct GPU tensor")
+    with pytest.raises(ValueError, match="must live on CPU"):
+        TeacherLogitCache(cpu_logits=torch.zeros(1, 8, 4, device="cuda"))
+
+
+def test_dtype_override_in_chunk_to_device():
+    src = torch.randn(2, 8, 16, dtype=torch.float32)
+    cache = TeacherLogitCache.from_gpu_logits(src, store_dtype=torch.float32)
+    chunk = cache.chunk_to_device(0, 8, torch.device("cpu"), dtype=torch.bfloat16)
+    assert chunk.dtype == torch.bfloat16
+
+
+def test_free_releases_buffer():
+    src = torch.randn(2, 32, 16)
+    cache = TeacherLogitCache.from_gpu_logits(src, store_dtype=torch.float32)
+    assert cache.cpu_logits.numel() == 2 * 32 * 16
+    cache.free()
+    assert cache.cpu_logits.numel() == 0
+
+
+def test_default_store_dtype_is_bf16():
+    src = torch.randn(1, 4, 8)
+    cache = TeacherLogitCache.from_gpu_logits(src)
+    assert cache.dtype == torch.bfloat16
+
+
+def test_streamed_chunked_loss_matches_full_loss():
+    """End-to-end check: pulling teacher logits chunk-by-chunk through the
+    cache yields the same distillation loss as passing the full teacher tensor
+    to ``chunked_distillation_loss`` directly."""
+    from opsd.losses import chunked_distillation_loss
+
+    torch.manual_seed(0)
+    s = torch.randn(2, 64, 32)
+    t = torch.randn(2, 64, 32)
+    mask = torch.ones(2, 64)
+
+    direct = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl", chunk_size=8)
+
+    cache = TeacherLogitCache.from_gpu_logits(t, store_dtype=torch.float32)
+    staged_full = cache.chunk_to_device(0, 64, torch.device("cpu"), dtype=torch.float32)
+    via_cache = chunked_distillation_loss(s, staged_full, mask, loss_type="reverse_kl", chunk_size=8)
+
+    assert torch.allclose(direct, via_cache, atol=1e-6)

From c9b333a38dcaf689e3a6240867fbc5e62906d571 Mon Sep 17 00:00:00 2001
From: Zhipeng Wang <zhipengbayern@gmail.com>
Date: Tue, 26 May 2026 07:15:28 +0000
Subject: [PATCH 03/18] Add OPSD trainer, hybrid-engine rollout, and end-to-end
 entry point

Lands the fully-runnable hybrid-engine training path: a backend-agnostic
RolloutEngine ABC with RolloutRequest / RolloutBatch / SamplingConfig
dataclasses, a HybridEngineRollout implementation that uses DeepSpeed's
accelerated decode when an inference policy exists and otherwise falls
back to GatheredParameters + the raw HF generate (covers Qwen-family and
other models not in DeepSpeed's inference container list), a left-padded
prompt dataset + collator, a three-phase trainer loop (rollout -> teacher
forward + cache -> student forward + streamed KL + backward + step), the
argparse + deepspeed.initialize entry point, base DeepSpeed ZeRO-3 +
hybrid_engine JSON configs, a 5-step smoke config and launcher script,
and a 20-prompt math toy dataset for the smoke run.

Smoke-validated end-to-end on 2x H200 with Qwen2.5-0.5B-Instruct student
and Qwen2.5-1.5B-Instruct teacher; loss finite for 5 steps. Rollout
interface contract is covered by tests/test_rollout_interface.py.

Signed-off-by: Zhipeng Wang <zhipengbayern@gmail.com>
---
 examples/opsd/configs/ds_zero3.json           |  43 ++++
 examples/opsd/configs/opsd_hybrid_engine.json |  49 +++++
 examples/opsd/configs/smoke_ds_zero3.json     |  35 ++++
 examples/opsd/configs/smoke_hybrid.json       |  49 +++++
 examples/opsd/data/prompts.jsonl              |  20 ++
 examples/opsd/main.py                         | 135 ++++++++++++
 examples/opsd/opsd/data.py                    | 108 ++++++++++
 examples/opsd/opsd/rollout/__init__.py        |  39 ++++
 examples/opsd/opsd/rollout/base.py            | 117 +++++++++++
 examples/opsd/opsd/rollout/hybrid_engine.py   | 119 +++++++++++
 examples/opsd/opsd/trainer.py                 | 197 ++++++++++++++++++
 examples/opsd/scripts/train_opsd_hybrid.sh    |  14 ++
 examples/opsd/tests/test_rollout_interface.py | 156 ++++++++++++++
 13 files changed, 1081 insertions(+)
 create mode 100644 examples/opsd/configs/ds_zero3.json
 create mode 100644 examples/opsd/configs/opsd_hybrid_engine.json
 create mode 100644 examples/opsd/configs/smoke_ds_zero3.json
 create mode 100644 examples/opsd/configs/smoke_hybrid.json
 create mode 100644 examples/opsd/data/prompts.jsonl
 create mode 100644 examples/opsd/main.py
 create mode 100644 examples/opsd/opsd/data.py
 create mode 100644 examples/opsd/opsd/rollout/__init__.py
 create mode 100644 examples/opsd/opsd/rollout/base.py
 create mode 100644 examples/opsd/opsd/rollout/hybrid_engine.py
 create mode 100644 examples/opsd/opsd/trainer.py
 create mode 100644 examples/opsd/scripts/train_opsd_hybrid.sh
 create mode 100644 examples/opsd/tests/test_rollout_interface.py

diff --git a/examples/opsd/configs/ds_zero3.json b/examples/opsd/configs/ds_zero3.json
new file mode 100644
index 000000000000..1f43339a6f20
--- /dev/null
+++ b/examples/opsd/configs/ds_zero3.json
@@ -0,0 +1,43 @@
+{
+    "bf16": {
+        "enabled": true
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "reduce_bucket_size": 5e7,
+        "stage3_prefetch_bucket_size": 5e7,
+        "stage3_param_persistence_threshold": 1e6,
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": 1e-6,
+            "betas": [0.9, 0.95],
+            "eps": 1e-8,
+            "weight_decay": 0.0
+        }
+    },
+    "scheduler": {
+        "type": "WarmupLR",
+        "params": {
+            "warmup_min_lr": 0,
+            "warmup_max_lr": 1e-6,
+            "warmup_num_steps": 0
+        }
+    },
+    "gradient_clipping": 1.0,
+    "hybrid_engine": {
+        "enabled": true,
+        "max_out_tokens": 2048,
+        "inference_tp_size": 1,
+        "release_inference_cache": false,
+        "pin_parameters": true,
+        "tp_gather_partition_size": 8
+    },
+    "wall_clock_breakdown": false
+}
diff --git a/examples/opsd/configs/opsd_hybrid_engine.json b/examples/opsd/configs/opsd_hybrid_engine.json
new file mode 100644
index 000000000000..5a7d45b54f6a
--- /dev/null
+++ b/examples/opsd/configs/opsd_hybrid_engine.json
@@ -0,0 +1,49 @@
+{
+    "student": {
+        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false,
+        "arch": "qwen2"
+    },
+    "teacher": {
+        "model_name_or_path": "Qwen/Qwen2.5-Math-7B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false,
+        "offload_to_cpu": true
+    },
+    "rollout": {
+        "engine": "hybrid_engine",
+        "max_prompt_length": 1024,
+        "max_response_length": 1024,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "n_samples_per_prompt": 1,
+        "weight_sync_interval": 1
+    },
+    "distillation": {
+        "loss_type": "reverse_kl",
+        "temperature": 1.0,
+        "chunk_size": 512
+    },
+    "training": {
+        "train_batch_size": 8,
+        "micro_batch_size_per_gpu": 1,
+        "gradient_accumulation_steps": 1,
+        "learning_rate": 1e-6,
+        "weight_decay": 0.0,
+        "num_train_epochs": 1,
+        "max_steps": -1,
+        "warmup_steps": 0,
+        "save_steps": 500,
+        "logging_steps": 10,
+        "save_dir": "./opsd_ckpt_hybrid",
+        "seed": 42
+    },
+    "data": {
+        "path": "data/prompts.jsonl",
+        "prompt_field": "prompt",
+        "shuffle": true
+    },
+    "deepspeed_config": "configs/ds_zero3.json"
+}
diff --git a/examples/opsd/configs/smoke_ds_zero3.json b/examples/opsd/configs/smoke_ds_zero3.json
new file mode 100644
index 000000000000..74211f3fbd9f
--- /dev/null
+++ b/examples/opsd/configs/smoke_ds_zero3.json
@@ -0,0 +1,35 @@
+{
+    "bf16": {
+        "enabled": true
+    },
+    "zero_optimization": {
+        "stage": 3,
+        "overlap_comm": true,
+        "contiguous_gradients": true,
+        "reduce_bucket_size": 5e7,
+        "stage3_prefetch_bucket_size": 5e7,
+        "stage3_param_persistence_threshold": 1e6,
+        "stage3_max_live_parameters": 1e9,
+        "stage3_max_reuse_distance": 1e9,
+        "stage3_gather_16bit_weights_on_model_save": true
+    },
+    "optimizer": {
+        "type": "AdamW",
+        "params": {
+            "lr": 1e-6,
+            "betas": [0.9, 0.95],
+            "eps": 1e-8,
+            "weight_decay": 0.0
+        }
+    },
+    "gradient_clipping": 1.0,
+    "hybrid_engine": {
+        "enabled": true,
+        "max_out_tokens": 512,
+        "inference_tp_size": 1,
+        "release_inference_cache": false,
+        "pin_parameters": true,
+        "tp_gather_partition_size": 8
+    },
+    "wall_clock_breakdown": false
+}
diff --git a/examples/opsd/configs/smoke_hybrid.json b/examples/opsd/configs/smoke_hybrid.json
new file mode 100644
index 000000000000..218bd990ae97
--- /dev/null
+++ b/examples/opsd/configs/smoke_hybrid.json
@@ -0,0 +1,49 @@
+{
+    "student": {
+        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false,
+        "arch": "qwen2"
+    },
+    "teacher": {
+        "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false,
+        "offload_to_cpu": false
+    },
+    "rollout": {
+        "engine": "hybrid_engine",
+        "max_prompt_length": 128,
+        "max_response_length": 64,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "n_samples_per_prompt": 1,
+        "weight_sync_interval": 1
+    },
+    "distillation": {
+        "loss_type": "reverse_kl",
+        "temperature": 1.0,
+        "chunk_size": 128
+    },
+    "training": {
+        "train_batch_size": 2,
+        "micro_batch_size_per_gpu": 1,
+        "gradient_accumulation_steps": 1,
+        "learning_rate": 1e-6,
+        "weight_decay": 0.0,
+        "num_train_epochs": 1,
+        "max_steps": 5,
+        "warmup_steps": 0,
+        "save_steps": 10000,
+        "logging_steps": 1,
+        "save_dir": "./opsd_smoke_hybrid_ckpt",
+        "seed": 42
+    },
+    "data": {
+        "path": "data/prompts.jsonl",
+        "prompt_field": "prompt",
+        "shuffle": true
+    },
+    "deepspeed_config": "configs/smoke_ds_zero3.json"
+}
diff --git a/examples/opsd/data/prompts.jsonl b/examples/opsd/data/prompts.jsonl
new file mode 100644
index 000000000000..a95a17c57557
--- /dev/null
+++ b/examples/opsd/data/prompts.jsonl
@@ -0,0 +1,20 @@
+{"prompt": "Solve: 17 + 25 = ?"}
+{"prompt": "What is 12 multiplied by 8?"}
+{"prompt": "If a train travels 60 miles per hour for 3 hours, how far does it go?"}
+{"prompt": "What is the square root of 144?"}
+{"prompt": "Compute 15% of 240."}
+{"prompt": "A rectangle has length 7 and width 4. What is its area?"}
+{"prompt": "Solve for x: 2x + 5 = 17."}
+{"prompt": "What is 7 factorial?"}
+{"prompt": "Compute the sum of integers from 1 to 10."}
+{"prompt": "What is 2 to the power of 10?"}
+{"prompt": "Find the perimeter of a square with side length 9."}
+{"prompt": "If 5 apples cost $2.50, what is the cost of 12 apples?"}
+{"prompt": "What is the greatest common divisor of 24 and 36?"}
+{"prompt": "Convert 0.75 to a fraction in simplest form."}
+{"prompt": "If x + y = 10 and x - y = 4, find x and y."}
+{"prompt": "What is 1/4 + 1/3?"}
+{"prompt": "A circle has radius 5. What is its area? (Use pi = 3.14)"}
+{"prompt": "Compute (3 + 4) * (5 - 2)."}
+{"prompt": "What is 81 divided by 9?"}
+{"prompt": "If a number doubled is 18, what is the number?"}
diff --git a/examples/opsd/main.py b/examples/opsd/main.py
new file mode 100644
index 000000000000..b2e5c4c6929b
--- /dev/null
+++ b/examples/opsd/main.py
@@ -0,0 +1,135 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""OPSD training entry point.
+
+Launch with the DeepSpeed launcher::
+
+    deepspeed --num_gpus 8 main.py --config configs/opsd_hybrid_engine.json
+
+The DeepSpeed launcher sets ``LOCAL_RANK``, ``RANK``, and ``WORLD_SIZE`` in
+the environment; we call :func:`deepspeed.init_distributed` to take that over.
+"""
+
+import argparse
+import json
+import os
+import random
+
+import deepspeed
+import numpy as np
+import torch
+from deepspeed.accelerator import get_accelerator
+from torch.utils.data import DataLoader
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from opsd.config import OPSDConfig
+from opsd.data import LeftPaddedPromptCollator, PromptDataset
+from opsd.rollout import build_rollout
+from opsd.teacher import TeacherWrapper
+from opsd.trainer import OPSDTrainer
+
+
+def _seed_everything(seed: int) -> None:
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    if get_accelerator().is_available():
+        get_accelerator().manual_seed_all(seed)
+
+
+def _resolve_dtype(name: str) -> torch.dtype:
+    return {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}[name]
+
+
+def _load_ds_config(path: str) -> dict:
+    with open(path, "r") as f:
+        return json.load(f)
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--config", required=True, help="Path to OPSDConfig JSON")
+    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
+    args = parser.parse_args()
+
+    cfg = OPSDConfig.from_json(args.config)
+    cfg.validate()
+    _seed_everything(cfg.training.seed)
+
+    deepspeed.init_distributed()
+
+    # --- tokenizer (shared between data + rollout) -------------------------
+    tokenizer = AutoTokenizer.from_pretrained(
+        cfg.student.model_name_or_path,
+        trust_remote_code=cfg.student.trust_remote_code,
+        padding_side="left",
+    )
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    # --- student model + DeepSpeed engine ----------------------------------
+    student_dtype = _resolve_dtype(cfg.student.dtype)
+    student_model = AutoModelForCausalLM.from_pretrained(
+        cfg.student.model_name_or_path,
+        torch_dtype=student_dtype,
+        trust_remote_code=cfg.student.trust_remote_code,
+    )
+
+    ds_config = _load_ds_config(cfg.deepspeed_config)
+    ds_config["train_micro_batch_size_per_gpu"] = cfg.training.micro_batch_size_per_gpu
+    ds_config["train_batch_size"] = cfg.training.train_batch_size
+    ds_config["gradient_accumulation_steps"] = cfg.training.gradient_accumulation_steps
+
+    student_engine, *_ = deepspeed.initialize(
+        model=student_model,
+        model_parameters=student_model.parameters(),
+        config=ds_config,
+    )
+
+    # --- frozen teacher ----------------------------------------------------
+    teacher = TeacherWrapper(cfg.teacher, world_size=dist_world_size())
+
+    # --- rollout engine ----------------------------------------------------
+    rollout = build_rollout(
+        cfg.rollout,
+        student_engine=student_engine,
+        tokenizer=tokenizer,
+        student_model_path=cfg.student.model_name_or_path,
+        arch=cfg.student.arch,
+    )
+
+    # --- dataloader --------------------------------------------------------
+    dataset = PromptDataset(
+        path=cfg.data.path,
+        tokenizer=tokenizer,
+        max_prompt_length=cfg.rollout.max_prompt_length,
+        prompt_field=cfg.data.prompt_field,
+        chat_template=cfg.data.chat_template,
+    )
+    collator = LeftPaddedPromptCollator(tokenizer=tokenizer, max_prompt_length=cfg.rollout.max_prompt_length)
+    loader = DataLoader(
+        dataset,
+        batch_size=cfg.training.micro_batch_size_per_gpu,
+        shuffle=cfg.data.shuffle,
+        collate_fn=collator,
+        drop_last=True,
+    )
+
+    OPSDTrainer(
+        cfg=cfg,
+        student_engine=student_engine,
+        teacher=teacher,
+        tokenizer=tokenizer,
+        rollout=rollout,
+        dataloader=loader,
+    ).train()
+
+
+def dist_world_size() -> int:
+    return int(os.environ.get("WORLD_SIZE", "1"))
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/opsd/opsd/data.py b/examples/opsd/opsd/data.py
new file mode 100644
index 000000000000..02ecf417e5c3
--- /dev/null
+++ b/examples/opsd/opsd/data.py
@@ -0,0 +1,108 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Prompt dataset and left-padding collator for OPSD rollouts.
+
+The dataset reads a JSONL file with one record per line; each record must
+contain a string under :attr:`DataConfig.prompt_field` (default ``"prompt"``).
+If the tokenizer exposes ``apply_chat_template``, single-turn prompts are
+wrapped in a user-role message with ``add_generation_prompt=True`` so the
+student generates the assistant turn.
+
+Batches are **left-padded** because causal generation requires real tokens at
+the right edge — :class:`opsd.rollout.RolloutRequest` and the hybrid-engine
+backend both assume this layout.
+"""
+
+import json
+from typing import Any, Dict, List, Optional
+
+import torch
+from torch.utils.data import Dataset
+
+
+class PromptDataset(Dataset):
+    """Reads ``{prompt_field: str}`` records from a JSONL file."""
+
+    def __init__(
+        self,
+        path: str,
+        tokenizer: Any,
+        max_prompt_length: int,
+        prompt_field: str = "prompt",
+        chat_template: Optional[str] = None,
+    ):
+        self.records = self._load_jsonl(path)
+        self.tokenizer = tokenizer
+        self.max_prompt_length = max_prompt_length
+        self.prompt_field = prompt_field
+        self.chat_template = chat_template
+
+    @staticmethod
+    def _load_jsonl(path: str) -> List[Dict[str, Any]]:
+        records: List[Dict[str, Any]] = []
+        with open(path, "r") as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                records.append(json.loads(line))
+        return records
+
+    def __len__(self) -> int:
+        return len(self.records)
+
+    def __getitem__(self, idx: int) -> str:
+        rec = self.records[idx]
+        if self.prompt_field not in rec:
+            raise KeyError(f"record {idx} missing field {self.prompt_field!r}")
+        text = rec[self.prompt_field]
+
+        # If the tokenizer knows a chat template, render the prompt as a single
+        # user-role turn and request the generation prompt. This matches how
+        # instruction-tuned student/teacher checkpoints expect inputs.
+        if hasattr(self.tokenizer, "apply_chat_template"):
+            messages = [{"role": "user", "content": text}] if isinstance(text, str) else text
+            text = self.tokenizer.apply_chat_template(
+                messages,
+                tokenize=False,
+                add_generation_prompt=True,
+                chat_template=self.chat_template,
+            )
+        return text
+
+
+class LeftPaddedPromptCollator:
+    """Tokenizes a batch of prompt strings into a left-padded tensor batch."""
+
+    def __init__(self, tokenizer: Any, max_prompt_length: int):
+        self.tokenizer = tokenizer
+        self.max_prompt_length = max_prompt_length
+        self.pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
+        if self.pad_id is None:
+            raise ValueError("tokenizer has neither pad_token_id nor eos_token_id; "
+                             "cannot construct a padding collator")
+
+    def __call__(self, batch_texts: List[str]) -> Dict[str, torch.Tensor]:
+        per_sample = [
+            self.tokenizer(
+                t,
+                add_special_tokens=False,
+                truncation=True,
+                max_length=self.max_prompt_length,
+                return_tensors="pt",
+            )["input_ids"].squeeze(0) for t in batch_texts
+        ]
+        max_len = max(int(x.shape[0]) for x in per_sample)
+        B = len(per_sample)
+
+        prompt_ids = torch.full((B, max_len), self.pad_id, dtype=torch.long)
+        attention_mask = torch.zeros((B, max_len), dtype=torch.long)
+        for i, ids in enumerate(per_sample):
+            n = int(ids.shape[0])
+            # left-pad: real tokens at the right edge
+            prompt_ids[i, max_len - n:] = ids
+            attention_mask[i, max_len - n:] = 1
+
+        return {"prompt_ids": prompt_ids, "prompt_attention_mask": attention_mask}
diff --git a/examples/opsd/opsd/rollout/__init__.py b/examples/opsd/opsd/rollout/__init__.py
new file mode 100644
index 000000000000..0509d6d8b4c9
--- /dev/null
+++ b/examples/opsd/opsd/rollout/__init__.py
@@ -0,0 +1,39 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Rollout engines for OPSD: hybrid engine (built-in) or vLLM (disjoint GPUs)."""
+
+from opsd.rollout.base import RolloutBatch, RolloutEngine, RolloutRequest, SamplingConfig
+
+__all__ = ["RolloutBatch", "RolloutEngine", "RolloutRequest", "SamplingConfig", "build_rollout"]
+
+
+def build_rollout(rollout_cfg, student_engine=None, tokenizer=None, student_model_path=None, arch=None):
+    """Factory: construct the rollout engine specified by ``rollout_cfg.engine``.
+
+    Imports of heavy backends are deferred to here so that selecting the
+    hybrid-engine path doesn't transitively require vLLM (and vice versa).
+    """
+    engine_name = rollout_cfg.engine
+    if engine_name == "hybrid_engine":
+        from opsd.rollout.hybrid_engine import HybridEngineRollout
+
+        if student_engine is None or tokenizer is None:
+            raise ValueError("hybrid_engine rollout needs both student_engine and tokenizer")
+        return HybridEngineRollout(student_engine=student_engine, tokenizer=tokenizer, cfg=rollout_cfg)
+
+    if engine_name == "vllm":
+        from opsd.rollout.vllm import VLLMRollout
+
+        if tokenizer is None:
+            raise ValueError("vllm rollout needs a tokenizer for length accounting")
+        return VLLMRollout(
+            cfg=rollout_cfg,
+            tokenizer=tokenizer,
+            student_engine=student_engine,
+            student_model_path=student_model_path,
+            arch=arch,
+        )
+
+    raise ValueError(f"Unknown rollout engine {engine_name!r}; choose from 'hybrid_engine' | 'vllm'")
diff --git a/examples/opsd/opsd/rollout/base.py b/examples/opsd/opsd/rollout/base.py
new file mode 100644
index 000000000000..62789d25c1cd
--- /dev/null
+++ b/examples/opsd/opsd/rollout/base.py
@@ -0,0 +1,117 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Rollout engine interface.
+
+The trainer talks to its rollout engine through three small dataclasses
+(``RolloutRequest`` in / ``RolloutBatch`` out / ``SamplingConfig``) and one
+ABC. This keeps the engine-specific concerns (hybrid-engine vs vLLM, weight
+sync, process topology) out of the trainer loop, so swapping engines is a
+one-line config change.
+
+Concrete engines live in sibling modules:
+    * :mod:`opsd.rollout.hybrid_engine` — DeepSpeed hybrid engine
+    * :mod:`opsd.rollout.vllm`          — vLLM on a disjoint GPU group
+"""
+
+from abc import ABC, abstractmethod
+from dataclasses import dataclass
+
+import torch
+
+
+@dataclass
+class SamplingConfig:
+    """Sampling knobs that the trainer passes to ``generate`` each step."""
+
+    max_new_tokens: int
+    temperature: float = 1.0
+    top_p: float = 1.0
+    # ``top_k <= 0`` means "no top-k truncation".
+    top_k: int = -1
+    # Number of samples per prompt. >1 expands the effective batch.
+    n_samples_per_prompt: int = 1
+
+
+@dataclass
+class RolloutRequest:
+    """Input to ``RolloutEngine.generate``.
+
+    Prompts arrive *left-padded* (i.e. real tokens at the right edge) so that
+    causal generation appends naturally after them.
+    """
+
+    prompt_ids: torch.Tensor  # [B, T_p] left-padded with pad_token_id
+    prompt_attention_mask: torch.Tensor  # [B, T_p], 1 on real prompt tokens
+
+    def __post_init__(self) -> None:
+        if self.prompt_ids.dim() != 2:
+            raise ValueError(f"prompt_ids must be 2-D [B, T_p]; got {tuple(self.prompt_ids.shape)}")
+        if self.prompt_attention_mask.shape != self.prompt_ids.shape:
+            raise ValueError(f"prompt_attention_mask shape {tuple(self.prompt_attention_mask.shape)} "
+                             f"does not match prompt_ids {tuple(self.prompt_ids.shape)}")
+
+
+@dataclass
+class RolloutBatch:
+    """Output of ``RolloutEngine.generate``.
+
+    ``input_ids`` holds the *concatenation* of (left-padded) prompt and
+    response, right-padded to the longest sequence in the batch.
+    ``response_start_idx[i]`` is the column index at which the response
+    begins, so positions ``>= response_start_idx[i]`` (intersected with
+    ``attention_mask``) are response tokens.
+
+    Note: with the standard *left-padded* prompt convention, every sample's
+    response starts at the same column (= the prompt section length), but the
+    field is kept per-sample so that mixed-batch backends (e.g. vLLM, which
+    may strip its own padding) can still report a meaningful boundary.
+    """
+
+    input_ids: torch.Tensor  # [B', T_p + T_r]; B' = B * n_samples_per_prompt
+    attention_mask: torch.Tensor  # [B', T_p + T_r]
+    response_start_idx: torch.Tensor  # [B'] int
+
+    def __post_init__(self) -> None:
+        if self.input_ids.dim() != 2:
+            raise ValueError(f"input_ids must be 2-D; got {tuple(self.input_ids.shape)}")
+        if self.attention_mask.shape != self.input_ids.shape:
+            raise ValueError(f"attention_mask shape {tuple(self.attention_mask.shape)} does not "
+                             f"match input_ids {tuple(self.input_ids.shape)}")
+        B = self.input_ids.shape[0]
+        if self.response_start_idx.shape != (B, ):
+            raise ValueError(f"response_start_idx must be 1-D of length {B}; got "
+                             f"{tuple(self.response_start_idx.shape)}")
+
+    @property
+    def batch_size(self) -> int:
+        return int(self.input_ids.shape[0])
+
+    @property
+    def seq_len(self) -> int:
+        return int(self.input_ids.shape[1])
+
+
+class RolloutEngine(ABC):
+    """Abstract base for student rollout engines."""
+
+    name: str = "base"
+
+    @abstractmethod
+    def generate(self, request: RolloutRequest, sampling: SamplingConfig) -> RolloutBatch:
+        """Run the student's generate, return prompt+response in one tensor."""
+
+    @abstractmethod
+    def sync_weights_from_student(self, step: int) -> None:
+        """Push the student's current weights into the rollout backend.
+
+        No-op for :class:`HybridEngineRollout` (the engine reads weights live
+        from the same process). Meaningful for :class:`VLLMRollout`, which
+        holds its own copy and must be refreshed periodically.
+        """
+
+    def shutdown(self) -> None:
+        """Release any backend resources (vLLM workers, NCCL groups, ...).
+        Default no-op."""
+        return None
diff --git a/examples/opsd/opsd/rollout/hybrid_engine.py b/examples/opsd/opsd/rollout/hybrid_engine.py
new file mode 100644
index 000000000000..7e7ced928655
--- /dev/null
+++ b/examples/opsd/opsd/rollout/hybrid_engine.py
@@ -0,0 +1,119 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Rollout backed by DeepSpeed's hybrid engine, with a ZeRO-3 fallback.
+
+For architectures in DeepSpeed's inference-container policy list
+(GPT2 / GPT-NeoX / OPT / BLOOM / LLAMA / LLAMA2 / InternLM as of 0.15) the
+hybrid engine gives accelerated decode by swapping in optimized inference
+kernels for the duration of the rollout. For everything else (Qwen2 / Qwen3
+/ any model without a policy), no inference container is created and
+``DeepSpeedHybridEngine.generate`` would AttributeError on its unbound
+``_generate`` slot — so we detect that case at construction time and fall
+back to a manual path that just gathers ZeRO-3 partitions and calls the
+HuggingFace model's ``generate`` directly. Correct, just slower than the
+accelerated path.
+"""
+
+import torch
+
+from opsd.config import RolloutConfig
+from opsd.rollout.base import RolloutBatch, RolloutEngine, RolloutRequest, SamplingConfig
+
+
+def _hybrid_engine_has_accel(engine) -> bool:
+    # The accelerated path is only wired up when at least one inference
+    # container was populated for the model's layers. ``_inference_containers``
+    # and ``_generate`` are both internal but they are the only two reliable
+    # signals across DeepSpeed 0.14–0.19; ``_generate`` is bound exactly when
+    # the container list is non-empty.
+    return getattr(engine, "_generate", None) is not None
+
+
+class HybridEngineRollout(RolloutEngine):
+    name = "hybrid_engine"
+
+    def __init__(self, student_engine, tokenizer, cfg: RolloutConfig):
+        if cfg.engine != "hybrid_engine":
+            raise ValueError(f"RolloutConfig.engine must be 'hybrid_engine'; got {cfg.engine!r}")
+        self.engine = student_engine
+        self.tokenizer = tokenizer
+        self.cfg = cfg
+        self._has_accel = _hybrid_engine_has_accel(student_engine)
+
+    @torch.no_grad()
+    def generate(self, request: RolloutRequest, sampling: SamplingConfig) -> RolloutBatch:
+        pad_id = self.tokenizer.pad_token_id
+        if pad_id is None:
+            # Many decoder-only tokenizers (Llama, Qwen) ship without a pad
+            # token. Fall back to eos so that generate doesn't crash on the
+            # left-padded prompts.
+            pad_id = self.tokenizer.eos_token_id
+
+        gen_kwargs = dict(
+            input_ids=request.prompt_ids,
+            attention_mask=request.prompt_attention_mask,
+            max_new_tokens=sampling.max_new_tokens,
+            do_sample=sampling.temperature > 0.0,
+            temperature=max(sampling.temperature, 1e-8),
+            top_p=sampling.top_p,
+            top_k=sampling.top_k if sampling.top_k > 0 else 0,
+            num_return_sequences=sampling.n_samples_per_prompt,
+            pad_token_id=pad_id,
+            eos_token_id=self.tokenizer.eos_token_id,
+        )
+
+        # Hybrid engine expects training mode toggled off so the inference
+        # containers take over. eval() is cheap (boolean flip + module walk).
+        self.engine.eval()
+        try:
+            if self._has_accel:
+                seqs = self.engine.generate(**gen_kwargs)
+            else:
+                seqs = self._fallback_generate(**gen_kwargs)
+        finally:
+            self.engine.train()
+
+        # ``seqs`` is [B * n, T_p + T_r_actual], left-padded prompt + response.
+        # With left-padded prompts every sample's response starts at column T_p.
+        B = request.prompt_ids.shape[0]
+        n = sampling.n_samples_per_prompt
+        T_p = request.prompt_ids.shape[1]
+        if seqs.shape[0] != B * n:
+            raise RuntimeError(f"generate returned batch {seqs.shape[0]}, expected {B * n}")
+
+        response_start_idx = torch.full((B * n, ), T_p, dtype=torch.long, device=seqs.device)
+
+        # Response positions are anything past the prompt that is also not pad.
+        attention_mask = (seqs != pad_id).to(request.prompt_attention_mask.dtype)
+        # Keep the prompt portion of the mask aligned with what the caller
+        # passed in (a prompt token equal to pad_id should still be attended);
+        # for typical left-padded prompts the overlap is identical.
+        prompt_mask_expanded = request.prompt_attention_mask.repeat_interleave(n, dim=0)
+        attention_mask[:, :T_p] = prompt_mask_expanded
+
+        return RolloutBatch(input_ids=seqs, attention_mask=attention_mask, response_start_idx=response_start_idx)
+
+    def sync_weights_from_student(self, step: int) -> None:  # noqa: ARG002
+        # The hybrid engine reads the student's live weights every generate
+        # call, so there is nothing to sync.
+        return None
+
+    @torch.no_grad()
+    def _fallback_generate(self, **gen_kwargs) -> torch.Tensor:
+        """Manual ZeRO-3 generate for architectures the hybrid engine doesn't
+        have an inference policy for.
+
+        Walks every parameter into a ``GatheredParameters`` context so the full
+        weight is materialized on each rank for the duration of generation,
+        then calls the underlying HF model's ``generate``. Re-partitions on
+        exit. This is correct but does not get the hybrid engine's optimized
+        kernels — expect ~3-5x slower decode than the accelerated path.
+        """
+        from deepspeed.runtime.zero import GatheredParameters
+
+        module = self.engine.module
+        all_params = list(module.parameters())
+        with GatheredParameters(all_params):
+            return module.generate(**gen_kwargs)
diff --git a/examples/opsd/opsd/trainer.py b/examples/opsd/opsd/trainer.py
new file mode 100644
index 000000000000..315b5145ef7a
--- /dev/null
+++ b/examples/opsd/opsd/trainer.py
@@ -0,0 +1,197 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""On-policy distillation training loop.
+
+Each step is three phases:
+
+  0. **Rollout.** The student generates responses for the batch's prompts
+     (via the configured :class:`~opsd.rollout.RolloutEngine` — hybrid engine
+     or vLLM).
+  1. **Teacher.** The frozen teacher runs a forward over prompt+response. The
+     full logit tensor is parked on the host via
+     :class:`~opsd.teacher.TeacherLogitCache` so teacher GPU buffers can be
+     released before the student backward.
+  2. **Student.** The student runs forward+backward on prompt+response. The
+     loss is the per-token divergence to the teacher, streamed from the
+     host-resident cache one sequence chunk at a time
+     (:func:`~opsd.losses.streamed_distillation_loss`), so the full
+     ``[B, T, V]`` teacher tensor never co-resides with the student logits on
+     the training device.
+
+The trainer itself contains no DeepSpeed-specific control flow beyond the
+``backward`` / ``step`` calls on the student engine; backend choice (ZeRO
+stage, offload, hybrid engine) is owned entirely by the DeepSpeed JSON config.
+"""
+
+import os
+import time
+from typing import Any
+
+import torch
+from deepspeed import comm as dist
+from deepspeed.accelerator import get_accelerator
+
+from opsd.config import OPSDConfig
+from opsd.losses import streamed_distillation_loss
+from opsd.rollout import RolloutEngine, RolloutRequest, SamplingConfig
+from opsd.utils import build_response_mask
+
+
+def _is_rank_zero() -> bool:
+    return (not dist.is_initialized()) or dist.get_rank() == 0
+
+
+class OPSDTrainer:
+
+    def __init__(
+        self,
+        cfg: OPSDConfig,
+        student_engine: Any,
+        teacher: Any,
+        tokenizer: Any,
+        rollout: RolloutEngine,
+        dataloader: Any,
+    ):
+        self.cfg = cfg
+        self.student_engine = student_engine
+        self.teacher = teacher
+        self.tokenizer = tokenizer
+        self.rollout = rollout
+        self.dataloader = dataloader
+
+        self.device = get_accelerator().current_device_name()
+        self.step = 0
+
+    # ------------------------------------------------------------------
+    # Driver
+    # ------------------------------------------------------------------
+
+    def train(self) -> None:
+        max_steps = self.cfg.training.max_steps
+        for epoch in range(self.cfg.training.num_train_epochs):
+            for batch in self.dataloader:
+                if max_steps > 0 and self.step >= max_steps:
+                    return
+                metrics = self._train_step(batch)
+                self._maybe_log(metrics)
+                self._maybe_save()
+                self.step += 1
+            if max_steps > 0 and self.step >= max_steps:
+                return
+
+    # ------------------------------------------------------------------
+    # One step
+    # ------------------------------------------------------------------
+
+    def _train_step(self, batch) -> dict:
+        t_start = time.time()
+
+        prompt_ids = batch["prompt_ids"].to(self.device, non_blocking=True)
+        prompt_attn = batch["prompt_attention_mask"].to(self.device, non_blocking=True)
+
+        # Push student weights into the rollout backend if it's time to.
+        # No-op for the hybrid engine; meaningful for vLLM.
+        if self.step % self.cfg.rollout.weight_sync_interval == 0:
+            self.rollout.sync_weights_from_student(self.step)
+
+        # --- Phase 0: rollout (student generates responses) ---------------
+        sampling = SamplingConfig(
+            max_new_tokens=self.cfg.rollout.max_response_length,
+            temperature=self.cfg.rollout.temperature,
+            top_p=self.cfg.rollout.top_p,
+            top_k=self.cfg.rollout.top_k,
+            n_samples_per_prompt=self.cfg.rollout.n_samples_per_prompt,
+        )
+        roll = self.rollout.generate(
+            RolloutRequest(prompt_ids=prompt_ids, prompt_attention_mask=prompt_attn),
+            sampling,
+        )
+        input_ids = roll.input_ids.to(self.device, non_blocking=True)
+        attention_mask = roll.attention_mask.to(self.device, non_blocking=True)
+        response_start_idx = roll.response_start_idx.to(self.device, non_blocking=True)
+        response_mask = build_response_mask(response_start_idx, attention_mask)
+        t_rollout = time.time() - t_start
+
+        # --- Phase 1: teacher forward → host-cached logits ----------------
+        t1 = time.time()
+        teacher_cache = self.teacher.forward_to_cache(input_ids, attention_mask)
+        t_teacher = time.time() - t1
+
+        # --- Phase 2: student forward + streamed KL + backward ------------
+        t2 = time.time()
+        self.student_engine.train()
+        outputs = self.student_engine(input_ids=input_ids, attention_mask=attention_mask)
+        student_logits = outputs.logits  # [B, T, V]
+
+        # Shift for next-token prediction: logits at position t predict token
+        # at t+1, so the loss aligns student_logits[:, :-1] with the position
+        # t+1 entries of the response mask.
+        student_logits_shifted = student_logits[:, :-1, :]
+        mask_shifted = response_mask[:, 1:].contiguous()
+
+        def _fetch(start: int, end: int) -> torch.Tensor:
+            # The cache holds *unshifted* teacher logits; for the next-token
+            # objective we ask the cache for positions [start, end) of the
+            # shifted teacher, which is positions [start, end) of the original
+            # since we already lopped off the final column in the student.
+            return teacher_cache.chunk_to_device(start,
+                                                 end,
+                                                 device=student_logits_shifted.device,
+                                                 dtype=student_logits_shifted.dtype)
+
+        loss = streamed_distillation_loss(
+            student_logits=student_logits_shifted,
+            teacher_chunk_fetcher=_fetch,
+            response_mask=mask_shifted,
+            loss_type=self.cfg.distillation.loss_type,
+            temperature=self.cfg.distillation.temperature,
+            chunk_size=self.cfg.distillation.chunk_size,
+        )
+
+        self.student_engine.backward(loss)
+        self.student_engine.step()
+
+        teacher_cache.free()
+        t_student = time.time() - t2
+
+        # Reduce loss across ranks for a clean log line.
+        loss_for_log = loss.detach().clone()
+        if dist.is_initialized():
+            dist.all_reduce(loss_for_log)
+            loss_for_log /= dist.get_world_size()
+
+        return {
+            "loss": float(loss_for_log.item()),
+            "rollout_s": t_rollout,
+            "teacher_s": t_teacher,
+            "student_s": t_student,
+            "step_s": time.time() - t_start,
+            "response_tokens": int(mask_shifted.sum().item()),
+        }
+
+    # ------------------------------------------------------------------
+    # Logging / checkpointing
+    # ------------------------------------------------------------------
+
+    def _maybe_log(self, metrics: dict) -> None:
+        if self.step % self.cfg.training.logging_steps != 0:
+            return
+        if not _is_rank_zero():
+            return
+        print(f"[opsd][step {self.step}] loss={metrics['loss']:.4f} "
+              f"rollout={metrics['rollout_s']:.2f}s teacher={metrics['teacher_s']:.2f}s "
+              f"student={metrics['student_s']:.2f}s step={metrics['step_s']:.2f}s "
+              f"resp_tok={metrics['response_tokens']}")
+
+    def _maybe_save(self) -> None:
+        if self.step == 0:
+            return
+        if self.step % self.cfg.training.save_steps != 0:
+            return
+        tag = f"step_{self.step}"
+        os.makedirs(self.cfg.training.save_dir, exist_ok=True)
+        self.student_engine.save_checkpoint(self.cfg.training.save_dir, tag=tag)
+        if _is_rank_zero():
+            print(f"[opsd] saved checkpoint to {self.cfg.training.save_dir}/{tag}")
diff --git a/examples/opsd/scripts/train_opsd_hybrid.sh b/examples/opsd/scripts/train_opsd_hybrid.sh
new file mode 100644
index 000000000000..69e3bdc68a7b
--- /dev/null
+++ b/examples/opsd/scripts/train_opsd_hybrid.sh
@@ -0,0 +1,14 @@
+#!/usr/bin/env bash
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+#
+# Launch OPSD training with the DeepSpeed hybrid-engine rollout (no vLLM).
+# Assumes you're cd'd into examples/opsd/.
+set -euo pipefail
+
+CONFIG="${1:-configs/opsd_hybrid_engine.json}"
+NUM_GPUS="${NUM_GPUS:-8}"
+
+deepspeed --num_gpus "${NUM_GPUS}" main.py --config "${CONFIG}"
diff --git a/examples/opsd/tests/test_rollout_interface.py b/examples/opsd/tests/test_rollout_interface.py
new file mode 100644
index 000000000000..7c6fd0545443
--- /dev/null
+++ b/examples/opsd/tests/test_rollout_interface.py
@@ -0,0 +1,156 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Conformance tests for the RolloutEngine interface.
+
+Validates the dataclass invariants and exercises the interface against a
+``FakeRollout`` so the contract is testable without GPUs or a model. The real
+backends are tested manually with a launched training script (see README).
+"""
+
+import pytest
+import torch
+
+from opsd.rollout import RolloutBatch, RolloutEngine, RolloutRequest, SamplingConfig
+from opsd.utils import build_response_mask
+
+# --- dataclass invariants ---------------------------------------------------
+
+
+def test_rollout_request_validates_shapes():
+    with pytest.raises(ValueError, match="must be 2-D"):
+        RolloutRequest(prompt_ids=torch.zeros(8), prompt_attention_mask=torch.ones(8))
+    with pytest.raises(ValueError, match="does not match"):
+        RolloutRequest(prompt_ids=torch.zeros(2, 4, dtype=torch.long), prompt_attention_mask=torch.ones(2, 5))
+
+
+def test_rollout_batch_validates_shapes():
+    with pytest.raises(ValueError, match="must be 2-D"):
+        RolloutBatch(input_ids=torch.zeros(8, dtype=torch.long),
+                     attention_mask=torch.ones(8),
+                     response_start_idx=torch.tensor([4]))
+    with pytest.raises(ValueError, match="does not match"):
+        RolloutBatch(input_ids=torch.zeros(2, 4, dtype=torch.long),
+                     attention_mask=torch.ones(2, 5),
+                     response_start_idx=torch.tensor([4, 4]))
+    with pytest.raises(ValueError, match="1-D of length"):
+        RolloutBatch(input_ids=torch.zeros(2, 4, dtype=torch.long),
+                     attention_mask=torch.ones(2, 4),
+                     response_start_idx=torch.tensor([4]))
+
+
+def test_rollout_batch_accessors():
+    batch = RolloutBatch(
+        input_ids=torch.zeros(3, 12, dtype=torch.long),
+        attention_mask=torch.ones(3, 12),
+        response_start_idx=torch.tensor([4, 5, 6]),
+    )
+    assert batch.batch_size == 3
+    assert batch.seq_len == 12
+
+
+def test_sampling_config_defaults():
+    cfg = SamplingConfig(max_new_tokens=32)
+    assert cfg.temperature == 1.0
+    assert cfg.top_p == 1.0
+    assert cfg.top_k == -1
+    assert cfg.n_samples_per_prompt == 1
+
+
+# --- interface conformance via FakeRollout ---------------------------------
+
+
+class FakeRollout(RolloutEngine):
+    """Deterministic stub: appends ``[42] * max_new_tokens`` to each prompt."""
+
+    name = "fake"
+
+    def __init__(self, response_token: int = 42):
+        self.response_token = response_token
+        self.sync_calls: list = []
+
+    def generate(self, request: RolloutRequest, sampling: SamplingConfig) -> RolloutBatch:
+        B, T_p = request.prompt_ids.shape
+        n = sampling.n_samples_per_prompt
+        T_r = sampling.max_new_tokens
+
+        prompts_expanded = request.prompt_ids.repeat_interleave(n, dim=0)
+        attn_p_expanded = request.prompt_attention_mask.repeat_interleave(n, dim=0)
+        response = torch.full((B * n, T_r), self.response_token, dtype=request.prompt_ids.dtype)
+        response_attn = torch.ones((B * n, T_r), dtype=attn_p_expanded.dtype)
+
+        input_ids = torch.cat([prompts_expanded, response], dim=1)
+        attention_mask = torch.cat([attn_p_expanded, response_attn], dim=1)
+        response_start_idx = torch.full((B * n, ), T_p, dtype=torch.long)
+        return RolloutBatch(input_ids=input_ids, attention_mask=attention_mask, response_start_idx=response_start_idx)
+
+    def sync_weights_from_student(self, step: int) -> None:
+        self.sync_calls.append(step)
+
+
+def test_fake_rollout_shape_basic():
+    fake = FakeRollout()
+    req = RolloutRequest(prompt_ids=torch.tensor([[1, 2, 3], [4, 5, 6]]),
+                         prompt_attention_mask=torch.ones(2, 3, dtype=torch.long))
+    out = fake.generate(req, SamplingConfig(max_new_tokens=4))
+    assert out.input_ids.shape == (2, 7)
+    assert out.attention_mask.shape == (2, 7)
+    # With left-padded (fully real here) prompts of width 3, response begins
+    # at column 3 for every sample.
+    assert out.response_start_idx.tolist() == [3, 3]
+
+
+def test_fake_rollout_with_n_samples():
+    fake = FakeRollout()
+    req = RolloutRequest(prompt_ids=torch.tensor([[1, 2], [3, 4]]),
+                         prompt_attention_mask=torch.ones(2, 2, dtype=torch.long))
+    out = fake.generate(req, SamplingConfig(max_new_tokens=3, n_samples_per_prompt=4))
+    assert out.input_ids.shape == (8, 5)
+    assert out.response_start_idx.tolist() == [2] * 8
+
+
+def test_fake_rollout_left_padded_prompts():
+    fake = FakeRollout()
+    # left-padded prompts: prompt B has only the last 2 positions real, but
+    # response_start_idx still equals the prompt column width T_p.
+    prompt_ids = torch.tensor([[1, 2, 3, 4], [0, 0, 5, 6]])
+    attn = torch.tensor([[1, 1, 1, 1], [0, 0, 1, 1]], dtype=torch.long)
+    req = RolloutRequest(prompt_ids=prompt_ids, prompt_attention_mask=attn)
+    out = fake.generate(req, SamplingConfig(max_new_tokens=2))
+    assert out.response_start_idx.tolist() == [4, 4]
+
+
+def test_response_mask_from_rollout_output_matches_helper():
+    fake = FakeRollout()
+    prompt_ids = torch.tensor([[1, 2, 3], [0, 4, 5]])
+    attn = torch.tensor([[1, 1, 1], [0, 1, 1]], dtype=torch.long)
+    out = fake.generate(RolloutRequest(prompt_ids, attn), SamplingConfig(max_new_tokens=3))
+    mask = build_response_mask(out.response_start_idx, out.attention_mask)
+    # Both samples: response starts at column 3 (T_p), and all post-prompt
+    # positions are attended (FakeRollout produces no padding in the response).
+    assert mask[0].tolist() == [0, 0, 0, 1, 1, 1]
+    assert mask[1].tolist() == [0, 0, 0, 1, 1, 1]
+
+
+def test_sync_records_steps():
+    fake = FakeRollout()
+    fake.sync_weights_from_student(0)
+    fake.sync_weights_from_student(5)
+    assert fake.sync_calls == [0, 5]
+
+
+def test_engine_factory_unknown_raises():
+    from opsd.config import RolloutConfig
+    from opsd.rollout import build_rollout
+
+    with pytest.raises(ValueError, match="Unknown rollout engine"):
+        build_rollout(RolloutConfig(engine="totally_made_up"))
+
+
+def test_engine_factory_hybrid_requires_student_engine():
+    from opsd.config import RolloutConfig
+    from opsd.rollout import build_rollout
+
+    with pytest.raises(ValueError, match="needs both"):
+        build_rollout(RolloutConfig(engine="hybrid_engine"))

From f6cfd682e4d8d6b2943b7dd9fa27885866a90ec0 Mon Sep 17 00:00:00 2001
From: Zhipeng Wang <zhipengbayern@gmail.com>
Date: Tue, 26 May 2026 07:15:53 +0000
Subject: [PATCH 04/18] Add OPSD vLLM rollout scaffold, Qwen2/Qwen3 weight
 bridges, and README

Lands the second-stage rollout path, weight-sync infrastructure, and the
example app's README. Includes:

  * VLLMRollout that constructs vllm.LLM on training rank 0 and broadcasts
    generated token ids to peer ranks, with disjoint-GPU (subprocess) and
    shared (in-process) topology paths. Weight sync gathers ZeRO-3 params
    cooperatively then pushes to vLLM via LLM.collective_rpc("load_weights").

  * WeightBridge ABC with COLUMN / ROW / VOCAB / REPLICATED parallel kinds
    and an even-slice per-rank slicer; Qwen2WeightBridge with the full
    per-parameter table for Qwen2 / Qwen2.5; Qwen3WeightBridge adding the
    per-head q_norm / k_norm tensors as REPLICATED.

  * vLLM-side prompt+response stitching factored into stitch_rollout() so
    its index math is unit-testable without a live vLLM.

  * CPU-only tests: tests/test_weight_bridge.py covers parallel-kind
    dispatch, per-rank shape/gather round-trips across tp_size in {1,2,4},
    indivisibility / invalid-rank guards, and the registry;
    tests/test_vllm_stitch.py covers prompt/response stitching for the
    common shapes including variable response lengths and left-padded
    prompts.

  * configs + launch scripts for both production and smoke vLLM runs.

**Known blocker called out in README and module docstring:** vLLM's worker
init calls new_group() on the global process group, which deadlocks when
launched under the standard `deepspeed --num_gpus N` launcher (rank 0
calls vLLM, other ranks never participate in vLLM's collective). The
documented fix is the TRL/OpenRLHF separate-server pattern; this PR lands
the scaffolding so that work can begin against a green codebase.

Signed-off-by: Zhipeng Wang <zhipengbayern@gmail.com>
---
 examples/opsd/README.md                       | 232 +++++++++++++
 examples/opsd/configs/opsd_vllm_disjoint.json |  54 +++
 examples/opsd/configs/smoke_vllm.json         |  55 +++
 examples/opsd/opsd/rollout/vllm.py            | 314 ++++++++++++++++++
 examples/opsd/opsd/weight_bridge/__init__.py  |  32 ++
 examples/opsd/opsd/weight_bridge/base.py      | 109 ++++++
 examples/opsd/opsd/weight_bridge/qwen2.py     |  84 +++++
 examples/opsd/opsd/weight_bridge/qwen3.py     |  37 +++
 examples/opsd/scripts/train_opsd_vllm.sh      |  19 ++
 examples/opsd/tests/test_vllm_stitch.py       |  97 ++++++
 examples/opsd/tests/test_weight_bridge.py     | 259 +++++++++++++++
 11 files changed, 1292 insertions(+)
 create mode 100644 examples/opsd/README.md
 create mode 100644 examples/opsd/configs/opsd_vllm_disjoint.json
 create mode 100644 examples/opsd/configs/smoke_vllm.json
 create mode 100644 examples/opsd/opsd/rollout/vllm.py
 create mode 100644 examples/opsd/opsd/weight_bridge/__init__.py
 create mode 100644 examples/opsd/opsd/weight_bridge/base.py
 create mode 100644 examples/opsd/opsd/weight_bridge/qwen2.py
 create mode 100644 examples/opsd/opsd/weight_bridge/qwen3.py
 create mode 100644 examples/opsd/scripts/train_opsd_vllm.sh
 create mode 100644 examples/opsd/tests/test_vllm_stitch.py
 create mode 100644 examples/opsd/tests/test_weight_bridge.py

diff --git a/examples/opsd/README.md b/examples/opsd/README.md
new file mode 100644
index 000000000000..9eab8485a707
--- /dev/null
+++ b/examples/opsd/README.md
@@ -0,0 +1,232 @@
+# On-Policy Distillation (OPSD) on DeepSpeed
+
+A DeepSpeed-native port of [HJSang/OPSD_OnPolicyDistillation](https://github.com/HJSang/OPSD_OnPolicyDistillation),
+removing the verl dependency and building directly on DeepSpeed primitives
+(ZeRO-3, hybrid engine, `deepspeed.initialize`).
+
+On-policy distillation trains a small **student** model to imitate a large
+frozen **teacher** on the student's *own* generated rollouts. Each training
+step has three phases:
+
+```
+┌────────────┐   prompts   ┌──────────────────┐   prompt+response   ┌────────────┐
+│ Dataloader │ ──────────▶ │ Student rollout  │ ──────────────────▶ │  Teacher   │
+└────────────┘             │ (hybrid / vLLM)  │                     │  forward   │
+                           └──────────────────┘                     └─────┬──────┘
+                                                                          │ logits → CPU cache
+                                                                          ▼
+                                                              ┌─────────────────────┐
+                                                              │ Student forward +   │
+                                                              │ streamed KL / JSD + │
+                                                              │ backward / step     │
+                                                              └─────────────────────┘
+```
+
+Loss = per-token divergence (`forward_kl` | `reverse_kl` | `jsd`) between
+student and teacher distributions on the student's generated tokens, chunked
+over the sequence axis so the full `[B, T, V]` teacher tensor never
+co-resides with the student logits on the training device.
+
+## Layout
+
+```
+examples/opsd/
+├── main.py                            # entry point (deepspeed launcher)
+├── opsd/
+│   ├── config.py                      # OPSDConfig dataclass + JSON loader
+│   ├── losses.py                      # chunked / streamed KL & JSD
+│   ├── teacher.py                     # frozen teacher + CPU logit cache
+│   ├── trainer.py                     # three-phase training loop
+│   ├── data.py                        # JSONL prompt dataset + left-pad collator
+│   ├── utils.py                       # response-mask + shift helpers
+│   ├── rollout/
+│   │   ├── base.py                    # RolloutEngine ABC, request/batch dataclasses
+│   │   ├── hybrid_engine.py           # DeepSpeed hybrid-engine rollout
+│   │   └── vllm.py                    # vLLM rollout on disjoint GPUs
+│   └── weight_bridge/
+│       ├── base.py                    # ParallelKind + per-rank slicer
+│       ├── qwen2.py                   # Qwen2 / Qwen2.5 TP mapping
+│       └── qwen3.py                   # Qwen3 dense (adds q_norm/k_norm)
+├── configs/
+│   ├── ds_zero3.json                  # base DeepSpeed ZeRO-3 + hybrid engine
+│   ├── opsd_hybrid_engine.json        # production-ish hybrid-engine OPSD config
+│   ├── opsd_vllm_disjoint.json        # vLLM rollout on a disjoint GPU group
+│   ├── smoke_hybrid.json              # 5-step smoke test with Qwen2.5-0.5B / 1.5B
+│   ├── smoke_vllm.json                # same but with vLLM rollout
+│   └── smoke_ds_zero3.json            # ZeRO-3 config tuned for smoke runs
+├── scripts/
+│   ├── train_opsd_hybrid.sh           # launch hybrid-engine training
+│   └── train_opsd_vllm.sh             # launch vLLM training
+└── tests/                             # CPU-only unit tests (run with pytest)
+```
+
+## Quick start
+
+### Install
+
+```
+pip install deepspeed transformers datasets accelerate
+# Optional, only for the vLLM rollout backend:
+pip install 'vllm>=0.6.4'
+```
+
+### Hybrid-engine training (single-node, no vLLM)
+
+```
+cd examples/opsd
+NUM_GPUS=8 bash scripts/train_opsd_hybrid.sh configs/opsd_hybrid_engine.json
+```
+
+The hybrid engine path lives entirely within DeepSpeed: the student engine
+both trains and generates, sharing weights without a copy step. Easiest to
+get running; slower generation than vLLM.
+
+### vLLM training (disjoint GPU group)
+
+```
+cd examples/opsd
+# Train on GPUs 0..5, run vLLM on 6,7 (matches default config)
+NUM_TRAIN_GPUS=6 INCLUDE_GPUS=0,1,2,3,4,5 \
+    bash scripts/train_opsd_vllm.sh configs/opsd_vllm_disjoint.json
+```
+
+vLLM gets dedicated GPUs (`rollout.gpus` in the config). Training rank 0
+constructs the `LLM` handle; other training ranks receive generated token
+ids via NCCL broadcast.
+
+### Smoke tests (5 steps, small models)
+
+The `smoke_*.json` configs run on 2 GPUs in a few minutes with Qwen2.5-0.5B
+(student) and Qwen2.5-1.5B (teacher), so the full pipeline can be validated
+end-to-end before scaling up.
+
+```
+cd examples/opsd
+deepspeed --num_gpus 2 main.py --config configs/smoke_hybrid.json
+# For vLLM (uses GPUs 0,1 for training and 2,3 for vLLM):
+NUM_TRAIN_GPUS=2 INCLUDE_GPUS=0,1 deepspeed --num_gpus 2 --include localhost:0,1 \
+    main.py --config configs/smoke_vllm.json
+```
+
+## Unit tests
+
+The CPU-runnable test suite exercises the loss math, teacher caching, rollout
+contract, weight-bridge TP slicing, and vLLM stitch logic. Run with:
+
+```
+cd examples/opsd
+python -m pytest tests/ -v
+```
+
+## Configuration
+
+`OPSDConfig` is a plain dataclass loaded from JSON (no Hydra). The schema:
+
+```json
+{
+  "student":    { "model_name_or_path": "...", "dtype": "bfloat16", "arch": "qwen2" },
+  "teacher":    { "model_name_or_path": "...", "dtype": "bfloat16", "offload_to_cpu": true },
+  "rollout":    { "engine": "hybrid_engine | vllm", ... },
+  "distillation": { "loss_type": "reverse_kl", "temperature": 1.0, "chunk_size": 512 },
+  "training":   { "train_batch_size": 8, "learning_rate": 1e-6, ... },
+  "data":       { "path": "data/prompts.jsonl", "prompt_field": "prompt" },
+  "deepspeed_config": "configs/ds_zero3.json"
+}
+```
+
+See `configs/opsd_hybrid_engine.json` and `configs/opsd_vllm_disjoint.json`
+for fully-populated examples.
+
+## Adding a new model architecture
+
+To support a model the bridge doesn't recognise yet:
+
+1. Add `opsd/weight_bridge/<arch>.py` subclassing `Qwen2WeightBridge` (or
+   `WeightBridge` directly) and override `parallel_kind` / `_extra_layer_kind`
+   for any parameters not in Qwen2's table.
+2. Register the new arch in `opsd/weight_bridge/__init__.py::get_bridge`.
+3. Add a test in `tests/test_weight_bridge.py` covering parallel-kind dispatch
+   and a slice-then-gather round trip for one layer of realistic shapes.
+
+## Design notes
+
+* **Why CPU-cache the teacher logits?** Holding both student and teacher
+  `[B, T, V]` tensors on GPU at once doubles memory pressure. Staging the
+  teacher to host between the teacher forward and the student backward halves
+  the worst-case GPU footprint of the loss path. The streamed loss
+  (`losses.streamed_distillation_loss`) pulls teacher chunks back to GPU
+  one sequence slice at a time so the full tensor never re-materialises.
+
+* **Why an abstract `RolloutEngine`?** The hybrid-engine and vLLM backends
+  have very different lifecycles (hybrid engine reads student weights live;
+  vLLM holds its own copy and must be synced) but the trainer should not
+  care. The ABC keeps the trainer engine-agnostic so additional backends
+  (e.g. a future colocated-vLLM-with-`sleep_mode`) drop in without touching
+  the loop.
+
+* **vLLM topology = disjoint, not colocated (v1).** The disjoint topology is
+  simpler to debug — failures in vLLM don't take down training and vice
+  versa. A colocated topology using vLLM 0.6.4+'s `sleep_mode` is planned as
+  a follow-up.
+
+* **Weight bridge does not pre-fuse QKV / gate-up.** vLLM's per-model loader
+  already knows how to fuse these from the standard HuggingFace layout, so
+  the bridge only handles per-rank slicing.
+
+## vLLM status
+
+The vLLM rollout (`opsd/rollout/vllm.py`) is **written and unit-tested but
+not yet usable under the DeepSpeed launcher**. During live validation on
+4× H200 we hit a blocking issue:
+
+> vLLM's worker init calls `new_group(...)` on the global process group as
+> a collective. Under `deepspeed --num_gpus N`, the world is all `N`
+> training ranks but only rank 0 calls into vLLM, so the constructor hangs
+> waiting on the other ranks. Reproduced with vllm 0.6.6 + deepspeed 0.15.4 +
+> torch 2.5.1. Standalone vLLM (world size 1) works in seconds.
+
+The fix requires running vLLM in a **separate top-level Python process**
+with its own world, accessed over HTTP/RPC from the trainer — the pattern
+used by TRL and OpenRLHF. That's a larger refactor than fits in this PR;
+the current `VLLMRollout` will be the basis for it once landed.
+
+What's verified for the vLLM path today:
+* `tests/test_vllm_stitch.py` — prompt + response stitching (CPU unit test)
+* `tests/test_weight_bridge.py` — TP-slice math for Qwen2 / Qwen3 (CPU)
+* `vllm.LLM` itself runs fine standalone on Qwen2.5-0.5B (validated)
+
+What's **not** verified:
+* End-to-end training loop with `rollout.engine = "vllm"` in `OPSDConfig`
+* `LLM.collective_rpc("load_weights", ...)` weight sync at training time
+
+The hybrid-engine path (`rollout.engine = "hybrid_engine"`) is validated
+end-to-end on the same hardware.
+
+## Other known limitations (v1)
+
+* **vLLM weight sync (when it works) goes through pickle** —
+  `LLM.collective_rpc("load_weights", args=((name, tensor_on_cpu),))`.
+  Expect several seconds per sync on a 7B model. A faster v2 would broadcast
+  tensors via NCCL on a shared trainer↔vLLM process group — see verl's
+  `bucketed_weight_transfer.py` for a reference design.
+* **vLLM `tensor_parallel_size > 1` is untested.** The weight bridge's
+  slicing math is unit-tested but no live run exists.
+* **Reward-weighted distillation** (OPSD's `opd.reward_beta` knob) is not
+  ported. Easy to add: scale `per_tok` by a reward weight in the loss path.
+* **GRPO and other on-policy RL recipes** are out of scope. The
+  `RolloutEngine` / `WeightBridge` abstractions are reusable, but a GRPO
+  trainer would add its own advantage / KL-to-reference logic on top.
+* **Qwen3-MoE** is not covered. Add `weight_bridge/qwen3_moe.py` when needed.
+* **Hybrid engine on Qwen-family models uses a ZeRO-3 fallback** (no
+  hybrid-engine inference acceleration), since DeepSpeed's inference policy
+  list only covers GPT2/GPT-NeoX/OPT/BLOOM/LLAMA/LLAMA2/InternLM as of 0.15.
+  The fallback gathers params via `GatheredParameters` and calls the HF
+  model's `generate` directly — correct, just ~3-5x slower than the
+  accelerated path.
+
+## References
+
+* OPSD reference repo: <https://github.com/HJSang/OPSD_OnPolicyDistillation>
+* DeepSpeed hybrid engine: `deepspeed/runtime/hybrid_engine.py`
+* verl rollout / weight-sync design (used as a cross-check):
+  <https://github.com/volcengine/verl/tree/main/verl/workers/rollout/vllm_rollout>
diff --git a/examples/opsd/configs/opsd_vllm_disjoint.json b/examples/opsd/configs/opsd_vllm_disjoint.json
new file mode 100644
index 000000000000..9668b3702981
--- /dev/null
+++ b/examples/opsd/configs/opsd_vllm_disjoint.json
@@ -0,0 +1,54 @@
+{
+    "student": {
+        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false,
+        "arch": "qwen2"
+    },
+    "teacher": {
+        "model_name_or_path": "Qwen/Qwen2.5-Math-7B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false,
+        "offload_to_cpu": true
+    },
+    "rollout": {
+        "engine": "vllm",
+        "max_prompt_length": 1024,
+        "max_response_length": 1024,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "n_samples_per_prompt": 1,
+        "gpus": [6, 7],
+        "tensor_parallel_size": 2,
+        "gpu_memory_utilization": 0.85,
+        "vllm_dtype": "bfloat16",
+        "weight_sync_interval": 4,
+        "vllm_min_version": "0.6.4"
+    },
+    "distillation": {
+        "loss_type": "reverse_kl",
+        "temperature": 1.0,
+        "chunk_size": 512
+    },
+    "training": {
+        "train_batch_size": 6,
+        "micro_batch_size_per_gpu": 1,
+        "gradient_accumulation_steps": 1,
+        "learning_rate": 1e-6,
+        "weight_decay": 0.0,
+        "num_train_epochs": 1,
+        "max_steps": -1,
+        "warmup_steps": 0,
+        "save_steps": 500,
+        "logging_steps": 10,
+        "save_dir": "./opsd_ckpt_vllm",
+        "seed": 42
+    },
+    "data": {
+        "path": "data/prompts.jsonl",
+        "prompt_field": "prompt",
+        "shuffle": true
+    },
+    "deepspeed_config": "configs/ds_zero3.json"
+}
diff --git a/examples/opsd/configs/smoke_vllm.json b/examples/opsd/configs/smoke_vllm.json
new file mode 100644
index 000000000000..8daf31537df2
--- /dev/null
+++ b/examples/opsd/configs/smoke_vllm.json
@@ -0,0 +1,55 @@
+{
+    "student": {
+        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false,
+        "arch": "qwen2"
+    },
+    "teacher": {
+        "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
+        "dtype": "bfloat16",
+        "trust_remote_code": false,
+        "offload_to_cpu": false
+    },
+    "rollout": {
+        "engine": "vllm",
+        "max_prompt_length": 128,
+        "max_response_length": 64,
+        "temperature": 1.0,
+        "top_p": 1.0,
+        "top_k": -1,
+        "n_samples_per_prompt": 1,
+        "gpus": [],
+        "tensor_parallel_size": 1,
+        "gpu_memory_utilization": 0.3,
+        "vllm_dtype": "bfloat16",
+        "weight_sync_interval": 2,
+        "vllm_min_version": "0.6.4",
+        "vllm_enforce_eager": true
+    },
+    "distillation": {
+        "loss_type": "reverse_kl",
+        "temperature": 1.0,
+        "chunk_size": 128
+    },
+    "training": {
+        "train_batch_size": 2,
+        "micro_batch_size_per_gpu": 1,
+        "gradient_accumulation_steps": 1,
+        "learning_rate": 1e-6,
+        "weight_decay": 0.0,
+        "num_train_epochs": 1,
+        "max_steps": 5,
+        "warmup_steps": 0,
+        "save_steps": 10000,
+        "logging_steps": 1,
+        "save_dir": "./opsd_smoke_vllm_ckpt",
+        "seed": 42
+    },
+    "data": {
+        "path": "data/prompts.jsonl",
+        "prompt_field": "prompt",
+        "shuffle": true
+    },
+    "deepspeed_config": "configs/smoke_ds_zero3.json"
+}
diff --git a/examples/opsd/opsd/rollout/vllm.py b/examples/opsd/opsd/rollout/vllm.py
new file mode 100644
index 000000000000..947e43fbc7aa
--- /dev/null
+++ b/examples/opsd/opsd/rollout/vllm.py
@@ -0,0 +1,314 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""vLLM rollout on a disjoint GPU group.
+
+**Topology (intended)**
+    * Training ranks 0..N-1 run the student under ZeRO-3 on the first N GPUs.
+    * vLLM workers run on the device indices listed in ``cfg.gpus`` (or in
+      "shared" mode, alongside training rank 0).
+    * The vLLM ``LLM`` handle is constructed **only on training rank 0**.
+    * Other training ranks receive generated token ids by broadcast from
+      rank 0 (:func:`deepspeed.comm.broadcast_object_list`).
+
+**Weight sync**
+    * All training ranks cooperatively gather each ZeRO-3 parameter via
+      :class:`deepspeed.runtime.zero.GatheredParameters`.
+    * Rank 0 pushes the full tensor to vLLM via ``LLM.collective_rpc(...)``,
+      which dispatches to every vLLM worker; each worker uses its own TP rank
+      to slice and load.
+
+**KNOWN BLOCKING ISSUE — same-process vLLM under the DeepSpeed launcher**
+
+    vLLM's worker initialisation calls ``new_group(...)`` on the global
+    process group as a collective. Under the standard DeepSpeed launcher
+    (e.g. ``deepspeed --num_gpus 2``) the world spans **all** training
+    ranks, but only rank 0 calls into vLLM. The other training ranks never
+    participate in vLLM's collective, so the ``LLM`` constructor hangs
+    forever waiting on them.
+
+    This was reproduced with vllm 0.6.6 + deepspeed 0.15.4 + torch 2.5.1; the
+    same code-path completes in seconds when ``LLM`` is constructed in a
+    process whose world size is 1. Verified by minimal repro (rank 0 LLM
+    init blocks; rank 1 idle).
+
+    **Workarounds (none currently implemented):**
+      1. Run vLLM in a **separate top-level Python process** with its own
+         world (size 1), and have the trainer talk to it over an HTTP or
+         RPC channel. This is what TRL and OpenRLHF do for their vLLM
+         backends.
+      2. Spawn vLLM as a subprocess from rank 0 and tunnel calls through a
+         queue. Similar to (1) but lower-level.
+      3. Wait for upstream vLLM to expose a flag that skips its internal
+         ``new_group`` calls when the caller already owns process-group
+         setup.
+
+    Until one of those lands, **the vLLM rollout in this PR is verified at
+    the unit-test level only** (see ``tests/test_vllm_stitch.py`` and
+    ``tests/test_weight_bridge.py``). The hybrid engine rollout is the
+    fully-validated live path. See the project README's "vLLM status"
+    section for current state.
+"""
+
+import os
+from typing import Any, List, Optional
+
+import torch
+
+from opsd.config import RolloutConfig
+from opsd.rollout.base import RolloutBatch, RolloutEngine, RolloutRequest, SamplingConfig
+from opsd.weight_bridge import WeightBridge, get_bridge
+
+
+def _is_rank_zero() -> bool:
+    # Deferred so this module remains importable in CPU-only test envs that
+    # don't have ``deepspeed`` available (the ``stitch_rollout`` helper below
+    # is pure tensor math and is unit-tested without DeepSpeed).
+    from deepspeed import comm as dist
+
+    return (not dist.is_initialized()) or dist.get_rank() == 0
+
+
+def stitch_rollout(
+    prompt_ids: torch.Tensor,
+    prompt_attention_mask: torch.Tensor,
+    responses: List[List[int]],
+    pad_id: int,
+    n_samples_per_prompt: int,
+) -> RolloutBatch:
+    """Stitch left-padded prompts and per-sample response token ids into one
+    right-padded ``RolloutBatch``.
+
+    This is the only piece of vLLM-side post-processing that doesn't depend
+    on a live LLM handle, so we factor it out for CPU unit testing.
+
+    Args:
+        prompt_ids: ``[B, T_p]`` left-padded prompts.
+        prompt_attention_mask: ``[B, T_p]`` matching attention mask.
+        responses: list of length ``B * n_samples_per_prompt``; each element
+            is the list of generated token ids for one (prompt, sample).
+        pad_id: pad token used for both prompt left-padding and response
+            right-padding (typically the tokenizer's ``pad_token_id`` or
+            ``eos_token_id``).
+        n_samples_per_prompt: number of generated samples per prompt.
+
+    Returns:
+        :class:`RolloutBatch` with ``response_start_idx = T_p`` for every
+        sample.
+    """
+    B, T_p = prompt_ids.shape
+    n = n_samples_per_prompt
+    expected = B * n
+    if len(responses) != expected:
+        raise ValueError(f"expected {expected} response token-id lists "
+                         f"(B={B} * n_samples={n}); got {len(responses)}")
+
+    if responses:
+        max_response_len = max(len(r) for r in responses)
+    else:
+        max_response_len = 0
+    T_total = T_p + max_response_len
+    device = prompt_ids.device
+
+    out_ids = torch.full((expected, T_total), pad_id, dtype=torch.long, device=device)
+    out_attn = torch.zeros((expected, T_total), dtype=prompt_attention_mask.dtype, device=device)
+
+    prompts_expanded = prompt_ids.repeat_interleave(n, dim=0)
+    attn_expanded = prompt_attention_mask.repeat_interleave(n, dim=0)
+    out_ids[:, :T_p] = prompts_expanded
+    out_attn[:, :T_p] = attn_expanded
+
+    for i, resp in enumerate(responses):
+        L = len(resp)
+        if L == 0:
+            continue
+        out_ids[i, T_p:T_p + L] = torch.tensor(resp, dtype=torch.long, device=device)
+        out_attn[i, T_p:T_p + L] = 1
+
+    response_start_idx = torch.full((expected, ), T_p, dtype=torch.long, device=device)
+    return RolloutBatch(input_ids=out_ids, attention_mask=out_attn, response_start_idx=response_start_idx)
+
+
+class VLLMRollout(RolloutEngine):
+
+    name = "vllm"
+
+    def __init__(
+        self,
+        cfg: RolloutConfig,
+        tokenizer: Any,
+        student_engine: Any = None,
+        student_model_path: Optional[str] = None,
+        arch: Optional[str] = None,
+    ):
+        if cfg.engine != "vllm":
+            raise ValueError(f"RolloutConfig.engine must be 'vllm'; got {cfg.engine!r}")
+        if student_model_path is None:
+            raise ValueError("VLLMRollout needs student_model_path to initialise the vLLM engine "
+                             "(it loads weights from disk at construction time)")
+
+        self.cfg = cfg
+        self.tokenizer = tokenizer
+        self.student_engine = student_engine
+        self._model_path = student_model_path
+
+        self.is_rank_zero = _is_rank_zero()
+        self.llm: Optional[Any] = None
+        self.bridge: Optional[WeightBridge] = get_bridge(arch) if arch is not None else None
+
+        if self.is_rank_zero:
+            self._init_vllm()
+
+    # ------------------------------------------------------------------
+    # Construction
+    # ------------------------------------------------------------------
+
+    def _init_vllm(self) -> None:
+        # Topology selection:
+        #   * cfg.gpus empty  → SHARED: vLLM runs in-process on the same GPU
+        #     as training rank 0. Simple; no CUDA visibility tricks. Used for
+        #     smoke tests and when vLLM + student fit alongside each other.
+        #   * cfg.gpus set    → DISJOINT: vLLM workers are pinned to the
+        #     listed devices via CUDA_VISIBLE_DEVICES + a spawn-mode
+        #     subprocess executor so the new CUDA context isn't inherited
+        #     from the already-initialised rank-0 process.
+        shared = not self.cfg.gpus
+
+        prev_cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
+        prev_mp = os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
+        if not shared:
+            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in self.cfg.gpus)
+            # Must be set before the vllm import; the value is read at import time.
+            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
+        try:
+            try:
+                from vllm import LLM
+            except ImportError as e:
+                raise ImportError(f"VLLMRollout requires vllm>={self.cfg.vllm_min_version}. "
+                                  f"Install with: pip install 'vllm>={self.cfg.vllm_min_version}'") from e
+
+            llm_kwargs = dict(
+                model=self._model_path,
+                tensor_parallel_size=self.cfg.tensor_parallel_size,
+                gpu_memory_utilization=self.cfg.gpu_memory_utilization,
+                dtype=self.cfg.vllm_dtype,
+                enforce_eager=self.cfg.vllm_enforce_eager,
+            )
+            if not shared:
+                llm_kwargs["distributed_executor_backend"] = "mp"
+            self.llm = LLM(**llm_kwargs)
+        finally:
+            if prev_cvd is None:
+                os.environ.pop("CUDA_VISIBLE_DEVICES", None)
+            else:
+                os.environ["CUDA_VISIBLE_DEVICES"] = prev_cvd
+            if prev_mp is None:
+                os.environ.pop("VLLM_WORKER_MULTIPROC_METHOD", None)
+            else:
+                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = prev_mp
+
+    # ------------------------------------------------------------------
+    # Generation
+    # ------------------------------------------------------------------
+
+    def generate(self, request: RolloutRequest, sampling: SamplingConfig) -> RolloutBatch:
+        B = int(request.prompt_ids.shape[0])
+        n = sampling.n_samples_per_prompt
+
+        if self.is_rank_zero:
+            from vllm import SamplingParams
+
+            # We send prompt *token ids* rather than text to vLLM so the
+            # generation stays bit-exact with how the trainer tokenised. This
+            # avoids any subtle BOS / special-token differences between the
+            # trainer's and vLLM's text->id paths.
+            prompt_token_ids: List[List[int]] = []
+            for i in range(B):
+                mask = request.prompt_attention_mask[i].bool()
+                ids = request.prompt_ids[i][mask].tolist()
+                prompt_token_ids.append(ids)
+
+            sp = SamplingParams(
+                n=n,
+                temperature=sampling.temperature,
+                top_p=sampling.top_p,
+                top_k=sampling.top_k if sampling.top_k > 0 else -1,
+                max_tokens=sampling.max_new_tokens,
+            )
+            results = self.llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sp, use_tqdm=False)
+            responses: List[List[int]] = []
+            for r in results:
+                for out in r.outputs:
+                    responses.append(list(out.token_ids))
+        else:
+            responses = []
+
+        from deepspeed import comm as dist
+
+        if dist.is_initialized() and dist.get_world_size() > 1:
+            obj = [responses]
+            dist.broadcast_object_list(obj, src=0)
+            responses = obj[0]
+
+        pad_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
+        return stitch_rollout(
+            prompt_ids=request.prompt_ids,
+            prompt_attention_mask=request.prompt_attention_mask,
+            responses=responses,
+            pad_id=pad_id,
+            n_samples_per_prompt=n,
+        )
+
+    # ------------------------------------------------------------------
+    # Weight sync
+    # ------------------------------------------------------------------
+
+    def sync_weights_from_student(self, step: int) -> None:
+        if self.student_engine is None:
+            return
+        if self.bridge is None:
+            # Best-effort inference of arch from the student model class name.
+            model = self.student_engine.module
+            cls = type(model).__name__.lower()
+            if "qwen3" in cls:
+                self.bridge = get_bridge("qwen3")
+            elif "qwen2" in cls:
+                self.bridge = get_bridge("qwen2")
+            else:
+                raise RuntimeError(f"Cannot infer weight bridge for student class {cls!r}; "
+                                   f"set StudentConfig.arch explicitly")
+
+        from deepspeed.runtime.zero import GatheredParameters
+
+        model = self.student_engine.module
+        for name, param in model.named_parameters():
+            # GatheredParameters is a no-op when ZeRO stage < 3, and a full
+            # all-gather when stage == 3. Either way every rank sees the full
+            # tensor inside the context; only rank 0 forwards it to vLLM.
+            with GatheredParameters([param], modifier_rank=0):
+                if not self.is_rank_zero:
+                    continue
+                # Sanity-check the param name against the bridge so a renamed
+                # parameter trips here (cheap) rather than as a silent layout
+                # mismatch inside vLLM later (very hard to debug).
+                self.bridge.parallel_kind(name)
+                self._push_one_param(name, param.data.detach())
+
+    def _push_one_param(self, name: str, tensor: torch.Tensor) -> None:
+        # collective_rpc dispatches to every vLLM worker; pickle handles the
+        # tensor transfer. CPU tensors pickle cleanly across process bounds.
+        cpu = tensor.contiguous().cpu()
+        # vLLM's per-architecture model class exposes ``load_weights`` taking
+        # an iterable of (name, tensor) pairs and internally handles QKV /
+        # gate_up fusion plus per-rank slicing for tensor parallelism.
+        self.llm.collective_rpc("load_weights", args=([(name, cpu)], ))
+
+    # ------------------------------------------------------------------
+    # Cleanup
+    # ------------------------------------------------------------------
+
+    def shutdown(self) -> None:
+        if self.llm is not None:
+            del self.llm
+            self.llm = None
diff --git a/examples/opsd/opsd/weight_bridge/__init__.py b/examples/opsd/opsd/weight_bridge/__init__.py
new file mode 100644
index 000000000000..b415b1a1b0e8
--- /dev/null
+++ b/examples/opsd/opsd/weight_bridge/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Architecture-specific bridges that slice HuggingFace weights for vLLM TP.
+
+A bridge takes the student's full ``(name, tensor)`` pairs (after we've
+gathered them across ZeRO-3 ranks) and emits the per-vLLM-rank slices ready
+to push into vLLM's ``model.load_weights(...)``.
+
+vLLM internally fuses Q/K/V into ``qkv_proj`` and gate/up into ``gate_up_proj``.
+We do **not** pre-fuse on our side — vLLM's loader already understands the
+unfused HuggingFace layout — so the bridge only needs to know each parameter's
+parallel kind (column / row / vocab / replicated) and slice on the right dim.
+"""
+
+from opsd.weight_bridge.base import ParallelKind, WeightBridge
+from opsd.weight_bridge.qwen2 import Qwen2WeightBridge
+from opsd.weight_bridge.qwen3 import Qwen3WeightBridge
+
+__all__ = ["WeightBridge", "ParallelKind", "Qwen2WeightBridge", "Qwen3WeightBridge", "get_bridge"]
+
+
+def get_bridge(arch: str) -> WeightBridge:
+    """Look up a bridge by architecture key (matches HF's ``model_type``)."""
+    key = arch.lower()
+    if key in ("qwen2", "qwen2.5"):
+        return Qwen2WeightBridge()
+    if key in ("qwen3", ):
+        return Qwen3WeightBridge()
+    raise ValueError(f"No weight bridge registered for arch {arch!r}; "
+                     f"add a sibling of opsd/weight_bridge/qwen2.py and register here")
diff --git a/examples/opsd/opsd/weight_bridge/base.py b/examples/opsd/opsd/weight_bridge/base.py
new file mode 100644
index 000000000000..3e780a05ae68
--- /dev/null
+++ b/examples/opsd/opsd/weight_bridge/base.py
@@ -0,0 +1,109 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""WeightBridge ABC: per-tensor TP slicing for vLLM weight sync."""
+
+from abc import ABC, abstractmethod
+from enum import Enum
+from typing import Iterable, Iterator, Tuple
+
+import torch
+
+
+class ParallelKind(str, Enum):
+    """How a single parameter is split across vLLM TP ranks.
+
+    Notation matches the standard Megatron-style decomposition:
+
+      * ``COLUMN`` — output dim (dim 0) is split. Each rank owns
+        ``out_features / tp`` rows. Used for attention Q/K/V and MLP
+        gate/up.
+      * ``ROW`` — input dim (dim 1) is split. Each rank owns
+        ``in_features / tp`` columns. Used for attention output projection
+        and MLP down projection.
+      * ``VOCAB`` — like COLUMN but applied to the embedding / LM head where
+        the partitioned dim is the vocab axis. Treated the same as COLUMN
+        for slicing purposes; the kind is kept distinct to make divisibility
+        diagnostics clearer at debug time.
+      * ``REPLICATED`` — the same tensor lives on every rank
+        (layer norms, RMSNorm scalars, per-head q_norm/k_norm in Qwen3).
+    """
+
+    COLUMN = "column"
+    ROW = "row"
+    VOCAB = "vocab"
+    REPLICATED = "replicated"
+
+
+def _even_slice(t: torch.Tensor, dim: int, rank: int, tp_size: int) -> torch.Tensor:
+    """Return rank ``rank`` 's contiguous chunk of ``t`` along ``dim``.
+
+    Refuses uneven divisions so that bugs surface here rather than as silent
+    layout mismatches once weights are loaded into vLLM.
+    """
+    total = int(t.shape[dim])
+    if total % tp_size != 0:
+        raise ValueError(f"Shape {tuple(t.shape)} dim {dim} (={total}) not divisible by "
+                         f"tp_size {tp_size}")
+    per = total // tp_size
+    return t.narrow(dim, rank * per, per).contiguous()
+
+
+class WeightBridge(ABC):
+    """Strategy object that maps HuggingFace param names to a parallel kind.
+
+    Subclasses only need to implement :meth:`parallel_kind`; the slicing
+    machinery is inherited.
+    """
+
+    # Subclasses set this to a human-readable tag, e.g. "qwen2".
+    arch: str = "base"
+
+    @abstractmethod
+    def parallel_kind(self, hf_name: str) -> ParallelKind:
+        """Return how parameter ``hf_name`` should be partitioned across TP."""
+
+    def slice_for_rank(
+        self,
+        hf_name: str,
+        tensor: torch.Tensor,
+        tp_rank: int,
+        tp_size: int,
+    ) -> torch.Tensor:
+        """Return the slice of ``tensor`` that belongs to rank ``tp_rank``."""
+        if tp_size < 1 or not (0 <= tp_rank < tp_size):
+            raise ValueError(f"invalid tp_rank={tp_rank} for tp_size={tp_size}")
+        if tp_size == 1:
+            return tensor
+        kind = self.parallel_kind(hf_name)
+        if kind is ParallelKind.REPLICATED:
+            return tensor
+        # COLUMN and VOCAB partition dim 0 (output / vocab). ROW partitions
+        # dim 1 (input). Both kinds may apply to 1-D tensors (biases): for a
+        # 1-D bias on a COLUMN-parallel linear, dim 0 IS the partitioned dim.
+        if kind in (ParallelKind.COLUMN, ParallelKind.VOCAB):
+            return _even_slice(tensor, dim=0, rank=tp_rank, tp_size=tp_size)
+        if kind is ParallelKind.ROW:
+            if tensor.dim() < 2:
+                # Row-parallel linears have a replicated bias (vLLM convention),
+                # so a 1-D tensor reaching this branch is a bug.
+                raise ValueError(f"ROW parallel kind requires >=2-D tensor for {hf_name}; "
+                                 f"got shape {tuple(tensor.shape)}")
+            return _even_slice(tensor, dim=1, rank=tp_rank, tp_size=tp_size)
+        raise ValueError(f"unhandled parallel kind {kind!r}")
+
+    def map_state_dict(
+        self,
+        hf_named_tensors: Iterable[Tuple[str, torch.Tensor]],
+        tp_rank: int,
+        tp_size: int,
+    ) -> Iterator[Tuple[str, torch.Tensor]]:
+        """Yield ``(vllm_name, sliced_tensor)`` for every input pair.
+
+        For Qwen-family models the vLLM parameter name is identical to the
+        HF name (vLLM's loader handles QKV/gate-up fusion internally), so the
+        emitted names are unchanged.
+        """
+        for name, tensor in hf_named_tensors:
+            yield name, self.slice_for_rank(name, tensor, tp_rank, tp_size)
diff --git a/examples/opsd/opsd/weight_bridge/qwen2.py b/examples/opsd/opsd/weight_bridge/qwen2.py
new file mode 100644
index 000000000000..903d47e81c1f
--- /dev/null
+++ b/examples/opsd/opsd/weight_bridge/qwen2.py
@@ -0,0 +1,84 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Weight bridge for Qwen2 / Qwen2.5 dense models.
+
+Naming follows the standard HF Qwen2 layout::
+
+    model.embed_tokens.weight
+    model.layers.{i}.self_attn.{q,k,v,o}_proj.{weight,bias}
+    model.layers.{i}.mlp.{gate,up,down}_proj.weight
+    model.layers.{i}.{input,post_attention}_layernorm.weight
+    model.norm.weight
+    lm_head.weight     # may be tied to embed_tokens
+
+Parallel kinds:
+    * Q/K/V projections — column-parallel (split heads across ranks)
+    * Attention output projection — row-parallel
+    * MLP gate / up projections — column-parallel
+    * MLP down projection — row-parallel
+    * Layer norms / final norm — replicated
+    * Token embedding & LM head — vocab-parallel (split vocab dim)
+    * Bias on Q/K/V — column-parallel (1-D bias on a column-parallel linear)
+    * Bias on o_proj / down_proj — replicated (row-parallel linears have a
+      replicated bias under vLLM's convention; the partial sums are reduced
+      before the bias add)
+"""
+
+import re
+
+from opsd.weight_bridge.base import ParallelKind, WeightBridge
+
+_LAYER_RE = re.compile(r"^model\.layers\.\d+\.(?P<rest>.+)$")
+
+
+class Qwen2WeightBridge(WeightBridge):
+    arch = "qwen2"
+
+    # Suffix → parallel kind. Keyed by the part after "model.layers.{i}." for
+    # transformer-block params, plus a few full names for embeddings / norms.
+    _LAYER_RULES = {
+        "self_attn.q_proj.weight": ParallelKind.COLUMN,
+        "self_attn.k_proj.weight": ParallelKind.COLUMN,
+        "self_attn.v_proj.weight": ParallelKind.COLUMN,
+        "self_attn.q_proj.bias": ParallelKind.COLUMN,
+        "self_attn.k_proj.bias": ParallelKind.COLUMN,
+        "self_attn.v_proj.bias": ParallelKind.COLUMN,
+        "self_attn.o_proj.weight": ParallelKind.ROW,
+        "self_attn.o_proj.bias": ParallelKind.REPLICATED,
+        "mlp.gate_proj.weight": ParallelKind.COLUMN,
+        "mlp.up_proj.weight": ParallelKind.COLUMN,
+        "mlp.down_proj.weight": ParallelKind.ROW,
+        "mlp.down_proj.bias": ParallelKind.REPLICATED,
+        "input_layernorm.weight": ParallelKind.REPLICATED,
+        "post_attention_layernorm.weight": ParallelKind.REPLICATED,
+    }
+
+    _GLOBAL_RULES = {
+        "model.embed_tokens.weight": ParallelKind.VOCAB,
+        "model.norm.weight": ParallelKind.REPLICATED,
+        "lm_head.weight": ParallelKind.VOCAB,
+    }
+
+    def parallel_kind(self, hf_name: str) -> ParallelKind:
+        if hf_name in self._GLOBAL_RULES:
+            return self._GLOBAL_RULES[hf_name]
+        m = _LAYER_RE.match(hf_name)
+        if m is not None:
+            rest = m.group("rest")
+            if rest in self._LAYER_RULES:
+                return self._LAYER_RULES[rest]
+            # Per-layer name not in our table — surface a clear error so the
+            # weight sync isn't silently wrong for an unrecognised tensor.
+            extra = self._extra_layer_kind(rest)
+            if extra is not None:
+                return extra
+            raise KeyError(f"Unknown per-layer Qwen2 parameter suffix {rest!r}; add a rule "
+                           f"in Qwen2WeightBridge._LAYER_RULES")
+        raise KeyError(f"Unknown Qwen2 parameter name {hf_name!r}")
+
+    def _extra_layer_kind(self, _suffix: str):  # noqa: D401, ARG002
+        """Hook for subclasses (Qwen3) to add per-layer rules without
+        duplicating the rest of the table."""
+        return None
diff --git a/examples/opsd/opsd/weight_bridge/qwen3.py b/examples/opsd/opsd/weight_bridge/qwen3.py
new file mode 100644
index 000000000000..6b3d7695ed32
--- /dev/null
+++ b/examples/opsd/opsd/weight_bridge/qwen3.py
@@ -0,0 +1,37 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Weight bridge for Qwen3 dense models.
+
+Qwen3-dense uses the same overall layout as Qwen2 with one addition:
+per-head RMSNorm applied to the query and key projections before attention::
+
+    model.layers.{i}.self_attn.q_norm.weight   # shape [head_dim]
+    model.layers.{i}.self_attn.k_norm.weight   # shape [head_dim]
+
+These weights are 1-D over ``head_dim`` (not ``num_heads * head_dim``), so they
+are **replicated** across TP ranks: every rank owns a subset of heads but each
+head normalises with the same per-head-dim scalars.
+
+Qwen3-MoE (the ``Qwen3MoeForCausalLM`` family) is **not** covered here — MoE
+introduces gate/expert routing and per-expert MLPs that need their own bridge.
+Add a sibling ``qwen3_moe.py`` when that path becomes a priority.
+"""
+
+from typing import Optional
+
+from opsd.weight_bridge.base import ParallelKind
+from opsd.weight_bridge.qwen2 import Qwen2WeightBridge
+
+
+class Qwen3WeightBridge(Qwen2WeightBridge):
+    arch = "qwen3"
+
+    _Q_NORM = "self_attn.q_norm.weight"
+    _K_NORM = "self_attn.k_norm.weight"
+
+    def _extra_layer_kind(self, suffix: str) -> Optional[ParallelKind]:
+        if suffix in (self._Q_NORM, self._K_NORM):
+            return ParallelKind.REPLICATED
+        return None
diff --git a/examples/opsd/scripts/train_opsd_vllm.sh b/examples/opsd/scripts/train_opsd_vllm.sh
new file mode 100644
index 000000000000..83ed4dc96d7e
--- /dev/null
+++ b/examples/opsd/scripts/train_opsd_vllm.sh
@@ -0,0 +1,19 @@
+#!/usr/bin/env bash
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+#
+# Launch OPSD training with vLLM rollout on a disjoint GPU group.
+#
+# Default config assumes 8 GPUs: ranks 0..5 train (ZeRO-3), devices 6-7 run
+# vLLM with TP=2. Adjust configs/opsd_vllm_disjoint.json::rollout.gpus and
+# NUM_TRAIN_GPUS to match your topology.
+set -euo pipefail
+
+CONFIG="${1:-configs/opsd_vllm_disjoint.json}"
+NUM_TRAIN_GPUS="${NUM_TRAIN_GPUS:-6}"
+INCLUDE_GPUS="${INCLUDE_GPUS:-0,1,2,3,4,5}"
+
+deepspeed --num_gpus "${NUM_TRAIN_GPUS}" --include "localhost:${INCLUDE_GPUS}" \
+    main.py --config "${CONFIG}"
diff --git a/examples/opsd/tests/test_vllm_stitch.py b/examples/opsd/tests/test_vllm_stitch.py
new file mode 100644
index 000000000000..bd8e1b4e4c0f
--- /dev/null
+++ b/examples/opsd/tests/test_vllm_stitch.py
@@ -0,0 +1,97 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""CPU-only tests for the vLLM rollout post-processing.
+
+We can't run vLLM here, but the prompt/response stitching is pure tensor
+manipulation and is the part most prone to silent index bugs.
+"""
+
+import pytest
+import torch
+
+from opsd.rollout.vllm import stitch_rollout
+from opsd.utils import build_response_mask
+
+
+def test_stitch_basic_single_sample():
+    prompt_ids = torch.tensor([[1, 2, 3], [4, 5, 6]])
+    attn = torch.ones(2, 3, dtype=torch.long)
+    responses = [[10, 11, 12], [20, 21]]
+    out = stitch_rollout(prompt_ids, attn, responses, pad_id=0, n_samples_per_prompt=1)
+    assert out.input_ids.shape == (2, 6)
+    assert out.input_ids[0].tolist() == [1, 2, 3, 10, 11, 12]
+    assert out.input_ids[1].tolist() == [4, 5, 6, 20, 21, 0]
+    assert out.attention_mask[0].tolist() == [1, 1, 1, 1, 1, 1]
+    assert out.attention_mask[1].tolist() == [1, 1, 1, 1, 1, 0]
+    assert out.response_start_idx.tolist() == [3, 3]
+
+
+def test_stitch_with_n_samples():
+    prompt_ids = torch.tensor([[1, 2], [3, 4]])
+    attn = torch.ones(2, 2, dtype=torch.long)
+    responses = [[5, 6], [7, 8], [9, 10], [11, 12]]
+    out = stitch_rollout(prompt_ids, attn, responses, pad_id=0, n_samples_per_prompt=2)
+    assert out.input_ids.shape == (4, 4)
+    # Prompts are repeat_interleaved: [P0, P0, P1, P1].
+    assert out.input_ids[0].tolist() == [1, 2, 5, 6]
+    assert out.input_ids[1].tolist() == [1, 2, 7, 8]
+    assert out.input_ids[2].tolist() == [3, 4, 9, 10]
+    assert out.input_ids[3].tolist() == [3, 4, 11, 12]
+    assert out.response_start_idx.tolist() == [2, 2, 2, 2]
+
+
+def test_stitch_left_padded_prompts():
+    prompt_ids = torch.tensor([[0, 1, 2], [3, 4, 5]])
+    attn = torch.tensor([[0, 1, 1], [1, 1, 1]], dtype=torch.long)
+    responses = [[6], [7]]
+    out = stitch_rollout(prompt_ids, attn, responses, pad_id=0, n_samples_per_prompt=1)
+    # Response begins at column T_p == 3 for both, regardless of prompt padding.
+    assert out.response_start_idx.tolist() == [3, 3]
+    # Prompt section keeps the caller's left-padding mask.
+    assert out.attention_mask[:, :3].tolist() == [[0, 1, 1], [1, 1, 1]]
+
+
+def test_stitch_mismatched_response_count_raises():
+    prompt_ids = torch.tensor([[1, 2]])
+    attn = torch.ones(1, 2, dtype=torch.long)
+    with pytest.raises(ValueError, match="expected"):
+        stitch_rollout(prompt_ids, attn, [[3], [4]], pad_id=0, n_samples_per_prompt=1)
+
+
+def test_stitch_empty_responses_still_well_shaped():
+    prompt_ids = torch.tensor([[1, 2], [3, 4]])
+    attn = torch.ones(2, 2, dtype=torch.long)
+    out = stitch_rollout(prompt_ids, attn, [[], []], pad_id=0, n_samples_per_prompt=1)
+    # No response tokens means total length == prompt length.
+    assert out.input_ids.shape == (2, 2)
+    # Mask over the (zero) response section is empty; response_start_idx still
+    # points at the end of the prompt.
+    assert out.response_start_idx.tolist() == [2, 2]
+
+
+def test_stitch_handles_variable_response_lengths():
+    prompt_ids = torch.tensor([[1], [2], [3]])
+    attn = torch.ones(3, 1, dtype=torch.long)
+    responses = [[10], [20, 21, 22, 23], [30, 31]]
+    out = stitch_rollout(prompt_ids, attn, responses, pad_id=99, n_samples_per_prompt=1)
+    # Total length = T_p + max(response lengths) = 1 + 4 = 5.
+    assert out.input_ids.shape == (3, 5)
+    assert out.input_ids[0].tolist() == [1, 10, 99, 99, 99]
+    assert out.input_ids[1].tolist() == [2, 20, 21, 22, 23]
+    assert out.input_ids[2].tolist() == [3, 30, 31, 99, 99]
+    assert out.attention_mask[0].tolist() == [1, 1, 0, 0, 0]
+    assert out.attention_mask[1].tolist() == [1, 1, 1, 1, 1]
+    assert out.attention_mask[2].tolist() == [1, 1, 1, 0, 0]
+
+
+def test_stitch_output_feeds_build_response_mask():
+    prompt_ids = torch.tensor([[0, 1, 2], [3, 4, 5]])
+    attn = torch.tensor([[0, 1, 1], [1, 1, 1]], dtype=torch.long)
+    out = stitch_rollout(prompt_ids, attn, [[10, 11], [20]], pad_id=0, n_samples_per_prompt=1)
+    mask = build_response_mask(out.response_start_idx, out.attention_mask)
+    # Sample 0: T_p=3, response tokens at 3,4 (both attended).
+    assert mask[0].tolist() == [0, 0, 0, 1, 1]
+    # Sample 1: T_p=3, response token at 3 only (position 4 is pad).
+    assert mask[1].tolist() == [0, 0, 0, 1, 0]
diff --git a/examples/opsd/tests/test_weight_bridge.py b/examples/opsd/tests/test_weight_bridge.py
new file mode 100644
index 000000000000..9aa50414cbb2
--- /dev/null
+++ b/examples/opsd/tests/test_weight_bridge.py
@@ -0,0 +1,259 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""CPU-only tests for the TP weight bridges.
+
+These exercise the parallel-kind table and the per-rank slicing math without
+requiring vLLM, GPUs, or real model checkpoints.
+"""
+
+import pytest
+import torch
+
+from opsd.weight_bridge import ParallelKind, Qwen2WeightBridge, Qwen3WeightBridge, get_bridge
+
+# Realistic-ish shapes for a Qwen2.5-0.5B-style model: hidden=896, num_heads=14,
+# num_kv_heads=2, head_dim=64, intermediate=4864, vocab=151936. Picked so all
+# the per-dim sizes are divisible by tp_size=2.
+HIDDEN = 896
+NUM_HEADS = 14
+NUM_KV_HEADS = 2
+HEAD_DIM = 64
+INTERMEDIATE = 4864
+VOCAB = 151936
+
+
+def _qwen2_named_tensors():
+    """A minimal stand-in for one layer of a Qwen2 state dict."""
+    q_dim = NUM_HEADS * HEAD_DIM
+    kv_dim = NUM_KV_HEADS * HEAD_DIM
+    return [
+        ("model.embed_tokens.weight", torch.randn(VOCAB, HIDDEN)),
+        ("model.layers.0.self_attn.q_proj.weight", torch.randn(q_dim, HIDDEN)),
+        ("model.layers.0.self_attn.k_proj.weight", torch.randn(kv_dim, HIDDEN)),
+        ("model.layers.0.self_attn.v_proj.weight", torch.randn(kv_dim, HIDDEN)),
+        ("model.layers.0.self_attn.q_proj.bias", torch.randn(q_dim)),
+        ("model.layers.0.self_attn.k_proj.bias", torch.randn(kv_dim)),
+        ("model.layers.0.self_attn.v_proj.bias", torch.randn(kv_dim)),
+        ("model.layers.0.self_attn.o_proj.weight", torch.randn(HIDDEN, q_dim)),
+        ("model.layers.0.mlp.gate_proj.weight", torch.randn(INTERMEDIATE, HIDDEN)),
+        ("model.layers.0.mlp.up_proj.weight", torch.randn(INTERMEDIATE, HIDDEN)),
+        ("model.layers.0.mlp.down_proj.weight", torch.randn(HIDDEN, INTERMEDIATE)),
+        ("model.layers.0.input_layernorm.weight", torch.randn(HIDDEN)),
+        ("model.layers.0.post_attention_layernorm.weight", torch.randn(HIDDEN)),
+        ("model.norm.weight", torch.randn(HIDDEN)),
+        ("lm_head.weight", torch.randn(VOCAB, HIDDEN)),
+    ]
+
+
+# --- parallel kind dispatch -------------------------------------------------
+
+
+@pytest.mark.parametrize("name, expected", [
+    ("model.embed_tokens.weight", ParallelKind.VOCAB),
+    ("model.layers.0.self_attn.q_proj.weight", ParallelKind.COLUMN),
+    ("model.layers.0.self_attn.k_proj.weight", ParallelKind.COLUMN),
+    ("model.layers.0.self_attn.v_proj.weight", ParallelKind.COLUMN),
+    ("model.layers.42.self_attn.q_proj.bias", ParallelKind.COLUMN),
+    ("model.layers.3.self_attn.o_proj.weight", ParallelKind.ROW),
+    ("model.layers.3.mlp.gate_proj.weight", ParallelKind.COLUMN),
+    ("model.layers.3.mlp.up_proj.weight", ParallelKind.COLUMN),
+    ("model.layers.3.mlp.down_proj.weight", ParallelKind.ROW),
+    ("model.layers.0.input_layernorm.weight", ParallelKind.REPLICATED),
+    ("model.layers.0.post_attention_layernorm.weight", ParallelKind.REPLICATED),
+    ("model.norm.weight", ParallelKind.REPLICATED),
+    ("lm_head.weight", ParallelKind.VOCAB),
+])
+def test_qwen2_parallel_kinds(name, expected):
+    assert Qwen2WeightBridge().parallel_kind(name) == expected
+
+
+def test_qwen2_unknown_layer_param_raises():
+    with pytest.raises(KeyError, match="Unknown per-layer Qwen2"):
+        Qwen2WeightBridge().parallel_kind("model.layers.0.self_attn.q_norm.weight")
+
+
+def test_qwen2_unknown_global_param_raises():
+    with pytest.raises(KeyError, match="Unknown Qwen2 parameter"):
+        Qwen2WeightBridge().parallel_kind("totally.made.up.weight")
+
+
+def test_qwen3_adds_qk_norm():
+    bridge = Qwen3WeightBridge()
+    assert bridge.parallel_kind("model.layers.0.self_attn.q_norm.weight") == ParallelKind.REPLICATED
+    assert bridge.parallel_kind("model.layers.0.self_attn.k_norm.weight") == ParallelKind.REPLICATED
+    # Inherits the rest from Qwen2.
+    assert bridge.parallel_kind("model.layers.0.self_attn.q_proj.weight") == ParallelKind.COLUMN
+
+
+# --- slicing math -----------------------------------------------------------
+
+
+@pytest.mark.parametrize("tp_size", [1, 2, 4])
+def test_column_slice_shapes(tp_size):
+    bridge = Qwen2WeightBridge()
+    w = torch.randn(NUM_HEADS * HEAD_DIM, HIDDEN)
+    for rank in range(tp_size):
+        sliced = bridge.slice_for_rank("model.layers.0.self_attn.q_proj.weight", w, rank, tp_size)
+        assert sliced.shape == (NUM_HEADS * HEAD_DIM // tp_size, HIDDEN)
+
+
+@pytest.mark.parametrize("tp_size", [1, 2, 4])
+def test_row_slice_shapes(tp_size):
+    bridge = Qwen2WeightBridge()
+    w = torch.randn(HIDDEN, NUM_HEADS * HEAD_DIM)
+    for rank in range(tp_size):
+        sliced = bridge.slice_for_rank("model.layers.0.self_attn.o_proj.weight", w, rank, tp_size)
+        assert sliced.shape == (HIDDEN, NUM_HEADS * HEAD_DIM // tp_size)
+
+
+def test_replicated_returns_full_tensor():
+    bridge = Qwen2WeightBridge()
+    w = torch.randn(HIDDEN)
+    for rank in range(4):
+        sliced = bridge.slice_for_rank("model.layers.0.input_layernorm.weight", w, rank, tp_size=4)
+        assert sliced.shape == w.shape
+        assert torch.equal(sliced, w)
+
+
+def test_column_slices_gather_to_original():
+    bridge = Qwen2WeightBridge()
+    w = torch.randn(NUM_HEADS * HEAD_DIM, HIDDEN)
+    tp_size = 2
+    pieces = [bridge.slice_for_rank("model.layers.0.self_attn.q_proj.weight", w, r, tp_size) for r in range(tp_size)]
+    assert torch.equal(torch.cat(pieces, dim=0), w)
+
+
+def test_row_slices_gather_to_original():
+    bridge = Qwen2WeightBridge()
+    w = torch.randn(HIDDEN, INTERMEDIATE)
+    tp_size = 4
+    pieces = [bridge.slice_for_rank("model.layers.0.mlp.down_proj.weight", w, r, tp_size) for r in range(tp_size)]
+    assert torch.equal(torch.cat(pieces, dim=1), w)
+
+
+def test_vocab_slices_gather_to_original():
+    bridge = Qwen2WeightBridge()
+    w = torch.randn(VOCAB, HIDDEN)
+    tp_size = 4
+    pieces = [bridge.slice_for_rank("model.embed_tokens.weight", w, r, tp_size) for r in range(tp_size)]
+    assert torch.equal(torch.cat(pieces, dim=0), w)
+
+
+def test_bias_column_slices_gather_to_original():
+    bridge = Qwen2WeightBridge()
+    b = torch.randn(NUM_HEADS * HEAD_DIM)
+    tp_size = 2
+    pieces = [bridge.slice_for_rank("model.layers.0.self_attn.q_proj.bias", b, r, tp_size) for r in range(tp_size)]
+    assert torch.equal(torch.cat(pieces, dim=0), b)
+
+
+def test_indivisible_shape_raises():
+    bridge = Qwen2WeightBridge()
+    # 7 is not divisible by 2; should fail loudly rather than truncate.
+    w = torch.randn(7, HIDDEN)
+    with pytest.raises(ValueError, match="not divisible by"):
+        bridge.slice_for_rank("model.layers.0.self_attn.q_proj.weight", w, 0, 2)
+
+
+def test_invalid_rank_raises():
+    bridge = Qwen2WeightBridge()
+    w = torch.randn(NUM_HEADS * HEAD_DIM, HIDDEN)
+    with pytest.raises(ValueError, match="invalid tp_rank"):
+        bridge.slice_for_rank("model.layers.0.self_attn.q_proj.weight", w, 4, 4)
+    with pytest.raises(ValueError, match="invalid tp_rank"):
+        bridge.slice_for_rank("model.layers.0.self_attn.q_proj.weight", w, -1, 2)
+
+
+def test_row_parallel_rejects_1d():
+    """The defensive check inside ``slice_for_rank`` is unreachable through
+    the real Qwen2 table (row-parallel biases are tagged REPLICATED), but a
+    future bridge could route a 1-D tensor through ROW. Exercise via a
+    minimal subclass so the guard stays covered."""
+
+    class _BadBridge(Qwen2WeightBridge):
+
+        def parallel_kind(self, hf_name):  # noqa: ARG002
+            return ParallelKind.ROW
+
+    with pytest.raises(ValueError, match="ROW parallel kind requires"):
+        _BadBridge().slice_for_rank("anything", torch.randn(HIDDEN), 0, 2)
+
+
+def test_tp1_is_passthrough():
+    bridge = Qwen2WeightBridge()
+    w = torch.randn(NUM_HEADS * HEAD_DIM, HIDDEN)
+    out = bridge.slice_for_rank("model.layers.0.self_attn.q_proj.weight", w, 0, 1)
+    assert torch.equal(out, w)
+
+
+# --- state-dict iteration ---------------------------------------------------
+
+
+def test_map_state_dict_emits_correct_shapes_for_tp2():
+    bridge = Qwen2WeightBridge()
+    tp_size = 2
+    # Build the source once; each rank consumes a fresh iterator over the
+    # same materialised list so we're slicing identical tensors.
+    src = _qwen2_named_tensors()
+    by_rank = {r: dict(bridge.map_state_dict(iter(src), r, tp_size)) for r in range(tp_size)}
+    src_by_name = dict(src)
+
+    # Replicated tensors should be identical across ranks AND match source.
+    a = by_rank[0]["model.layers.0.input_layernorm.weight"]
+    b = by_rank[1]["model.layers.0.input_layernorm.weight"]
+    assert torch.equal(a, b)
+    assert torch.equal(a, src_by_name["model.layers.0.input_layernorm.weight"])
+
+    # Column-parallel Q: shapes halved on dim 0; gather reconstructs source.
+    q_full_rows = NUM_HEADS * HEAD_DIM
+    assert by_rank[0]["model.layers.0.self_attn.q_proj.weight"].shape == (q_full_rows // 2, HIDDEN)
+    gathered_q = torch.cat([
+        by_rank[0]["model.layers.0.self_attn.q_proj.weight"],
+        by_rank[1]["model.layers.0.self_attn.q_proj.weight"],
+    ],
+                           dim=0)
+    assert torch.equal(gathered_q, src_by_name["model.layers.0.self_attn.q_proj.weight"])
+
+
+def test_map_state_dict_gather_round_trip_with_fixed_seed():
+    bridge = Qwen2WeightBridge()
+    torch.manual_seed(123)
+    src = _qwen2_named_tensors()
+    src_by_name = dict(src)
+
+    tp_size = 4
+    sliced = [list(bridge.map_state_dict(src, r, tp_size)) for r in range(tp_size)]
+
+    # For every entry, reconstruct from per-rank slices and compare to the
+    # source. The reconstruction op depends on the parallel kind.
+    for r0_name, _ in sliced[0]:
+        kind = bridge.parallel_kind(r0_name)
+        per_rank = [dict(s)[r0_name] for s in sliced]
+        if kind is ParallelKind.REPLICATED:
+            recon = per_rank[0]
+        elif kind in (ParallelKind.COLUMN, ParallelKind.VOCAB):
+            recon = torch.cat(per_rank, dim=0)
+        elif kind is ParallelKind.ROW:
+            recon = torch.cat(per_rank, dim=1)
+        else:
+            raise AssertionError(f"unhandled kind {kind}")
+        assert torch.equal(recon, src_by_name[r0_name]), f"round-trip mismatch for {r0_name}"
+
+
+# --- registry ---------------------------------------------------------------
+
+
+def test_get_bridge_qwen2():
+    assert isinstance(get_bridge("qwen2"), Qwen2WeightBridge)
+    assert isinstance(get_bridge("Qwen2.5"), Qwen2WeightBridge)
+
+
+def test_get_bridge_qwen3():
+    assert isinstance(get_bridge("qwen3"), Qwen3WeightBridge)
+
+
+def test_get_bridge_unknown_raises():
+    with pytest.raises(ValueError, match="No weight bridge registered"):
+        get_bridge("totally-made-up-arch")

From dedfe73a0281e9f4819712e7dfd1fca2e9b07679 Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@intel.com>
Date: Sun, 21 Jun 2026 23:27:47 +0800
Subject: [PATCH 05/18] feat(rollout): OPSD rollout engine with graph capture,
 vLLM backend

HybridEngineRollout:
- model.generate() path for sampling (temperature>0)
- Graph capture + DeepSpeedStaticCache path for greedy (temperature=0, 3x faster)
- DeepSpeedStaticCache: CUDA-graph-compatible KV cache with external write_position
- RolloutRequest/RolloutBatch/RolloutConfig dataclasses

VLLMRollout:
- Weight sync via gdr/http backends
- vllm_python config for interpreter selection
- vLLM compat sitecustomize shim
- Only sync requires_grad params to vLLM

OPSD trainer/config:
- Move trainer to deepspeed/runtime/rlhf/trainer/
- Move config to deepspeed/runtime/rlhf/config.py
- Force weight_sync_interval=1 for on-policy correctness

Tests:
- CPU unit tests for HybridEngineRollout and VLLMRollout
- Graph capture verified: HF StaticCache == DeepSpeedStaticCache == graph (100 steps, 0 diff)

Verified on Qwen2.5-0.5B-Instruct / RTX 5090:
- model.generate(): 90 tok/s (batch=1)
- graph capture: 270 tok/s (3x speedup)
- OPSD smoke test: 3 training steps pass end-to-end

Signed-off-by: Guokai Ma <guokai.ma@intel.com>
Signed-off-by: Zhipeng Wang <zhipengbayern@gmail.com>
---
 .gitignore                                    |   1 +
 benchmarks/opsd/bench_14b_rollout.py          | 134 +++++
 benchmarks/opsd/bench_autotp_gc.py            |  96 ++++
 benchmarks/opsd/bench_decode_1p1r.py          | 180 ++++++
 benchmarks/opsd/bench_flashinfer.py           | 129 +++++
 benchmarks/opsd/bench_hybrid_tp.py            | 145 +++++
 benchmarks/opsd/bench_hybrid_tp_opt.py        | 149 +++++
 benchmarks/opsd/bench_vllm_tp2.py             |  41 ++
 deepspeed/runtime/rlhf/__init__.py            |  32 ++
 .../opsd => deepspeed/runtime/rlhf}/config.py |  39 +-
 .../opsd => deepspeed/runtime/rlhf}/data.py   |   2 +-
 .../opsd => deepspeed/runtime/rlhf}/losses.py |   0
 .../runtime/rlhf}/teacher.py                  |   2 +-
 deepspeed/runtime/rlhf/trainer/__init__.py    |   8 +
 deepspeed/runtime/rlhf/trainer/base.py        |  42 ++
 .../runtime/rlhf/trainer/opsd.py              |  32 +-
 .../opsd => deepspeed/runtime/rlhf}/utils.py  |   0
 deepspeed/runtime/rollout/__init__.py         |  65 +++
 .../runtime/rollout/_vllm_compat/__init__.py  |   4 +
 .../rollout/_vllm_compat/sitecustomize.py     |  75 +++
 .../runtime}/rollout/base.py                  |  32 +-
 .../runtime/rollout/hybrid_engine_rollout.py  | 242 ++++++++
 deepspeed/runtime/rollout/static_cache.py     | 230 ++++++++
 deepspeed/runtime/rollout/vllm_rollout.py     | 542 ++++++++++++++++++
 examples/opsd/README.md                       | 232 --------
 examples/opsd/configs/ds_zero3.json           |  43 --
 examples/opsd/configs/opsd_hybrid_engine.json |  49 --
 examples/opsd/configs/opsd_vllm_disjoint.json |  54 --
 examples/opsd/configs/smoke_ds_zero3.json     |  35 --
 examples/opsd/configs/smoke_hybrid.json       |  49 --
 examples/opsd/configs/smoke_vllm.json         |  55 --
 examples/opsd/data/prompts.jsonl              |  20 -
 examples/opsd/main.py                         | 135 -----
 examples/opsd/opsd/__init__.py                |  17 -
 examples/opsd/opsd/rollout/__init__.py        |  39 --
 examples/opsd/opsd/rollout/hybrid_engine.py   | 119 ----
 examples/opsd/opsd/rollout/vllm.py            | 314 ----------
 examples/opsd/opsd/weight_bridge/__init__.py  |  32 --
 examples/opsd/opsd/weight_bridge/base.py      | 109 ----
 examples/opsd/opsd/weight_bridge/qwen2.py     |  84 ---
 examples/opsd/opsd/weight_bridge/qwen3.py     |  37 --
 examples/opsd/requirements.txt                |   5 -
 examples/opsd/scripts/train_opsd_hybrid.sh    |  14 -
 examples/opsd/scripts/train_opsd_vllm.sh      |  19 -
 examples/opsd/tests/test_losses.py            | 166 ------
 examples/opsd/tests/test_teacher_caching.py   | 101 ----
 examples/opsd/tests/test_weight_bridge.py     | 259 ---------
 .../rollout/test_hybrid_engine_rollout.py     | 123 ++++
 .../rollout}/test_rollout_interface.py        |  22 +-
 .../unit/runtime/rollout/test_vllm_rollout.py | 187 ++++++
 .../unit/runtime/rollout}/test_vllm_stitch.py |   4 +-
 51 files changed, 2493 insertions(+), 2052 deletions(-)
 create mode 100644 benchmarks/opsd/bench_14b_rollout.py
 create mode 100644 benchmarks/opsd/bench_autotp_gc.py
 create mode 100644 benchmarks/opsd/bench_decode_1p1r.py
 create mode 100644 benchmarks/opsd/bench_flashinfer.py
 create mode 100644 benchmarks/opsd/bench_hybrid_tp.py
 create mode 100644 benchmarks/opsd/bench_hybrid_tp_opt.py
 create mode 100644 benchmarks/opsd/bench_vllm_tp2.py
 create mode 100644 deepspeed/runtime/rlhf/__init__.py
 rename {examples/opsd/opsd => deepspeed/runtime/rlhf}/config.py (73%)
 rename {examples/opsd/opsd => deepspeed/runtime/rlhf}/data.py (97%)
 rename {examples/opsd/opsd => deepspeed/runtime/rlhf}/losses.py (100%)
 rename {examples/opsd/opsd => deepspeed/runtime/rlhf}/teacher.py (99%)
 create mode 100644 deepspeed/runtime/rlhf/trainer/__init__.py
 create mode 100644 deepspeed/runtime/rlhf/trainer/base.py
 rename examples/opsd/opsd/trainer.py => deepspeed/runtime/rlhf/trainer/opsd.py (88%)
 rename {examples/opsd/opsd => deepspeed/runtime/rlhf}/utils.py (100%)
 create mode 100644 deepspeed/runtime/rollout/__init__.py
 create mode 100644 deepspeed/runtime/rollout/_vllm_compat/__init__.py
 create mode 100644 deepspeed/runtime/rollout/_vllm_compat/sitecustomize.py
 rename {examples/opsd/opsd => deepspeed/runtime}/rollout/base.py (66%)
 create mode 100644 deepspeed/runtime/rollout/hybrid_engine_rollout.py
 create mode 100644 deepspeed/runtime/rollout/static_cache.py
 create mode 100644 deepspeed/runtime/rollout/vllm_rollout.py
 delete mode 100644 examples/opsd/README.md
 delete mode 100644 examples/opsd/configs/ds_zero3.json
 delete mode 100644 examples/opsd/configs/opsd_hybrid_engine.json
 delete mode 100644 examples/opsd/configs/opsd_vllm_disjoint.json
 delete mode 100644 examples/opsd/configs/smoke_ds_zero3.json
 delete mode 100644 examples/opsd/configs/smoke_hybrid.json
 delete mode 100644 examples/opsd/configs/smoke_vllm.json
 delete mode 100644 examples/opsd/data/prompts.jsonl
 delete mode 100644 examples/opsd/main.py
 delete mode 100644 examples/opsd/opsd/__init__.py
 delete mode 100644 examples/opsd/opsd/rollout/__init__.py
 delete mode 100644 examples/opsd/opsd/rollout/hybrid_engine.py
 delete mode 100644 examples/opsd/opsd/rollout/vllm.py
 delete mode 100644 examples/opsd/opsd/weight_bridge/__init__.py
 delete mode 100644 examples/opsd/opsd/weight_bridge/base.py
 delete mode 100644 examples/opsd/opsd/weight_bridge/qwen2.py
 delete mode 100644 examples/opsd/opsd/weight_bridge/qwen3.py
 delete mode 100644 examples/opsd/requirements.txt
 delete mode 100644 examples/opsd/scripts/train_opsd_hybrid.sh
 delete mode 100644 examples/opsd/scripts/train_opsd_vllm.sh
 delete mode 100644 examples/opsd/tests/test_losses.py
 delete mode 100644 examples/opsd/tests/test_teacher_caching.py
 delete mode 100644 examples/opsd/tests/test_weight_bridge.py
 create mode 100644 tests/unit/runtime/rollout/test_hybrid_engine_rollout.py
 rename {examples/opsd/tests => tests/unit/runtime/rollout}/test_rollout_interface.py (93%)
 create mode 100644 tests/unit/runtime/rollout/test_vllm_rollout.py
 rename {examples/opsd/tests => tests/unit/runtime/rollout}/test_vllm_stitch.py (97%)

diff --git a/.gitignore b/.gitignore
index 13e79bacce4a..10689d1a8a33 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,3 +66,4 @@ tests/unit/saved_checkpoint/
 
 # virtual env directory for format
 venv
+tags
diff --git a/benchmarks/opsd/bench_14b_rollout.py b/benchmarks/opsd/bench_14b_rollout.py
new file mode 100644
index 000000000000..d66c7615dd94
--- /dev/null
+++ b/benchmarks/opsd/bench_14b_rollout.py
@@ -0,0 +1,134 @@
+"""Comprehensive 14B rollout benchmark: Naive, GC, TP=2 GC, TP=4 GC."""
+import time
+import os
+import sys
+import torch
+import deepspeed
+from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+MODEL = "Qwen/Qwen2.5-14B-Instruct"
+MAX_NEW_TOKENS = 256
+N_SAMPLES = 1
+CB_SIZE = 1
+N_RUNS = 5
+PROMPT = "def fibonacci(n):"
+
+
+def bench_rollout(engine, tokenizer, use_graph_capture, cb_size, label):
+    rank = torch.distributed.get_rank()
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    device = torch.device(f"cuda:{local_rank}")
+
+    rollout = HybridEngineRollout(
+        engine=engine,
+        tokenizer=tokenizer,
+        continuous_batching_size=cb_size,
+        use_graph_capture=use_graph_capture,
+    )
+
+    ids = tokenizer(PROMPT, return_tensors="pt").input_ids.to(device)
+    req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids))
+    sampling = SamplingConfig(
+        max_new_tokens=MAX_NEW_TOKENS, temperature=0.8, top_p=0.95,
+        n_samples_per_prompt=N_SAMPLES
+    )
+
+    # Warmup
+    torch.manual_seed(42)
+    engine.eval()
+    rollout.generate(req, sampling)
+    engine.train()
+
+    # Benchmark
+    times = []
+    total_toks = 0
+    for i in range(N_RUNS):
+        torch.manual_seed(42 + i)
+        engine.eval()
+        torch.cuda.synchronize()
+        t0 = time.time()
+        batch = rollout.generate(req, sampling)
+        torch.cuda.synchronize()
+        times.append(time.time() - t0)
+        engine.train()
+
+    # Count tokens from last run
+    pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id
+    for i in range(batch.input_ids.shape[0]):
+        resp = batch.input_ids[i, batch.response_start_idx[i]:]
+        total_toks += (resp != pad_id).sum().item()
+
+    t_avg = sum(times[1:]) / len(times[1:])
+
+    if rank == 0:
+        print(f"[{label}] {total_toks} toks, {t_avg*1000:.0f}ms, {total_toks/t_avg:.1f} tok/s  "
+              f"runs={[f'{t*1000:.0f}' for t in times]}")
+
+    return total_toks, t_avg
+
+
+def main():
+    deepspeed.init_distributed()
+    rank = torch.distributed.get_rank()
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    torch.cuda.set_device(local_rank)
+
+    world_size = torch.distributed.get_world_size()
+    tp_size = world_size  # all GPUs used for TP
+
+    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
+    model = AutoModelForCausalLM.from_pretrained(MODEL, dtype=torch.bfloat16, trust_remote_code=True)
+
+    ds_config = {
+        "bf16": {"enabled": True},
+        "zero_optimization": {"stage": 0},
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": world_size,
+        "gradient_accumulation_steps": 1,
+        "hybrid_engine": {
+            "enabled": True,
+            "max_out_tokens": 512,
+            "inference_tp_size": 1,
+            "release_inference_cache": False,
+            "pin_parameters": True,
+            "tp_gather_partition_size": 8,
+        },
+    }
+
+    if tp_size > 1:
+        ds_config["tensor_parallel"] = {
+            "autotp_size": tp_size,
+            "preset_model": "qwen2",
+            "tp": {"tp_size": tp_size},
+        }
+
+    engine, *_ = deepspeed.initialize(model=model, config=ds_config)
+
+    if rank == 0:
+        print(f"\n{'='*60}")
+        print(f"Model: {MODEL}, TP={tp_size}, n={N_SAMPLES}, cb={CB_SIZE}, max_new={MAX_NEW_TOKENS}")
+        print(f"{'='*60}")
+
+    # 1P1R without graph capture (CB=1, no GC)
+    try:
+        bench_rollout(engine, tokenizer, use_graph_capture=False, cb_size=CB_SIZE, label=f"TP{tp_size} CB={CB_SIZE}")
+    except Exception as e:
+        if rank == 0:
+            print(f"[TP{tp_size} CB={CB_SIZE}] FAILED: {e}")
+            import traceback; traceback.print_exc()
+
+    # 1P1R with CUDA graph capture
+    try:
+        bench_rollout(engine, tokenizer, use_graph_capture=True, cb_size=CB_SIZE, label=f"TP{tp_size} CB={CB_SIZE}+GC")
+    except Exception as e:
+        if rank == 0:
+            print(f"[TP{tp_size} CB={CB_SIZE}+GC] FAILED: {e}")
+            import traceback; traceback.print_exc()
+
+    if rank == 0:
+        print(f"{'='*60}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/opsd/bench_autotp_gc.py b/benchmarks/opsd/bench_autotp_gc.py
new file mode 100644
index 000000000000..c9a245b245de
--- /dev/null
+++ b/benchmarks/opsd/bench_autotp_gc.py
@@ -0,0 +1,96 @@
+"""Benchmark rollout with AutoTP + graph capture on 14B model."""
+import time
+import torch
+import deepspeed
+from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+def main():
+    deepspeed.init_distributed()
+    rank = torch.distributed.get_rank()
+    local_rank = int(torch.distributed.get_rank()) % torch.cuda.device_count()
+    torch.cuda.set_device(local_rank)
+    device = torch.device(f"cuda:{local_rank}")
+
+    model_name = "Qwen/Qwen2.5-14B-Instruct"
+    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+
+    model = AutoModelForCausalLM.from_pretrained(
+        model_name, dtype=torch.bfloat16, trust_remote_code=True
+    )
+
+    ds_config = {
+        "bf16": {"enabled": True},
+        "zero_optimization": {"stage": 0},
+        "tensor_parallel": {
+            "autotp_size": 2,
+            "preset_model": "qwen2",
+            "tp": {"tp_size": 2},
+        },
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": 2,
+        "gradient_accumulation_steps": 1,
+        "hybrid_engine": {
+            "enabled": True,
+            "max_out_tokens": 512,
+            "inference_tp_size": 1,
+            "release_inference_cache": False,
+            "pin_parameters": True,
+            "tp_gather_partition_size": 8,
+        },
+    }
+
+    engine, *_ = deepspeed.initialize(model=model, config=ds_config)
+
+    rollout = HybridEngineRollout(
+        engine=engine,
+        tokenizer=tokenizer,
+        continuous_batching_size=2,
+        use_graph_capture=True,
+    )
+
+    # Prepare prompt
+    prompt = "def fibonacci(n):"
+    ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
+    req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids))
+    sampling = SamplingConfig(max_new_tokens=256, temperature=0.8, top_p=0.95, n_samples_per_prompt=4)
+
+    # Warmup
+    torch.manual_seed(42)
+    engine.eval()
+    rollout.generate(req, sampling)
+    engine.train()
+
+    # Benchmark
+    times = []
+    for i in range(5):
+        torch.manual_seed(42)
+        engine.eval()
+        torch.cuda.synchronize()
+        t0 = time.time()
+        batch = rollout.generate(req, sampling)
+        torch.cuda.synchronize()
+        times.append(time.time() - t0)
+        engine.train()
+
+    t_avg = sum(times[1:]) / len(times[1:])
+    # Count tokens
+    pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id
+    total_toks = 0
+    for i in range(batch.input_ids.shape[0]):
+        resp = batch.input_ids[i, batch.response_start_idx[i]:]
+        total_toks += (resp != pad_id).sum().item()
+
+    if rank == 0:
+        print(f"\n{'='*60}")
+        print(f"Model: {model_name}")
+        print(f"TP=2, n=8, cb=4, graph_capture=True, max_new_tokens=256")
+        print(f"Avg latency (excl warmup): {t_avg*1000:.1f}ms")
+        print(f"Total response tokens: {total_toks}")
+        print(f"Throughput: {total_toks/t_avg:.1f} tok/s")
+        print(f"Per-run times: {[f'{t*1000:.0f}ms' for t in times]}")
+        print(f"{'='*60}\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/opsd/bench_decode_1p1r.py b/benchmarks/opsd/bench_decode_1p1r.py
new file mode 100644
index 000000000000..58fb667d4581
--- /dev/null
+++ b/benchmarks/opsd/bench_decode_1p1r.py
@@ -0,0 +1,180 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+"""Micro-benchmark for 1p1r HybridEngineRollout decode.
+
+Measures time breakdown of each decode step:
+  - model forward (attention + FFN)
+  - sampling (softmax + multinomial)
+  - Python overhead (mask concat, state update, etc.)
+
+Usage:
+  python examples/opsd/bench_decode_1p1r.py --model Qwen/Qwen2.5-0.5B-Instruct
+"""
+
+import argparse
+import time
+
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from deepspeed.accelerator import get_accelerator
+
+from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
+from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
+
+
+def bench_decode_raw(model, tokenizer, device, prompt_len=64, max_new_tokens=64, num_warmup=3, num_iters=10):
+    """Raw decode loop benchmark — measures each component separately."""
+    model.eval()
+    model_dtype = next(model.parameters()).dtype
+
+    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
+    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
+
+    results = {
+        "prompt_len": prompt_len,
+        "max_new_tokens": max_new_tokens,
+        "model_dtype": str(model_dtype),
+    }
+
+    timings = {"prefill": [], "decode_forward": [], "sampling": [], "overhead": [], "total": []}
+
+    for _ in range(num_warmup + num_iters):
+        with torch.no_grad():
+            t0 = time.perf_counter()
+            out = model(input_ids, attention_mask=attn_mask, use_cache=True)
+            past = out.past_key_values
+            logits = out.logits[:, -1:, :]
+            t_prefill = time.perf_counter()
+
+            generated = []
+            cur_token = logits.argmax(dim=-1)
+            generated.append(cur_token)
+            cur_mask = attn_mask
+
+            decode_times = []
+            sample_times = []
+            overhead_times = []
+
+            for step in range(max_new_tokens):
+                t_step = time.perf_counter()
+                cur_mask = torch.cat([cur_mask, torch.ones(1, 1, dtype=torch.long, device=device)], dim=1)
+                pos_ids = torch.tensor([[prompt_len + step]], device=device)
+
+                t_fwd = time.perf_counter()
+                out = model(cur_token,
+                            attention_mask=cur_mask,
+                            position_ids=pos_ids,
+                            past_key_values=past,
+                            use_cache=True)
+                past = out.past_key_values
+                t_fwd_end = time.perf_counter()
+
+                next_logits = out.logits[:, -1, :]
+                probs = torch.softmax(next_logits / 1.0, dim=-1)
+                cur_token = torch.multinomial(probs, 1)
+                t_sample = time.perf_counter()
+
+                generated.append(cur_token)
+                t_overhead = time.perf_counter()
+
+                decode_times.append(t_fwd_end - t_fwd)
+                sample_times.append(t_sample - t_fwd_end)
+                overhead_times.append(t_overhead - t_sample)
+
+            t_total = time.perf_counter()
+
+        timings["prefill"].append(t_prefill - t0)
+        timings["decode_forward"].append(decode_times)
+        timings["sampling"].append(sample_times)
+        timings["overhead"].append(overhead_times)
+        timings["total"].append(t_total - t0)
+
+    import numpy as np
+
+    def avg_last_n(lst, n):
+        return np.mean(lst[-n:])
+
+    def avg_of_avg(list_of_lists, n):
+        arrs = [np.array(ls[-n:]) for ls in list_of_lists]
+        return np.mean([a.mean() for a in arrs])
+
+    results["prefill_ms"] = avg_last_n(timings["prefill"], num_iters) * 1000
+    results["decode_forward_ms_per_step"] = avg_of_avg(timings["decode_forward"], num_iters) * 1000
+    results["sampling_ms_per_step"] = avg_of_avg(timings["sampling"], num_iters) * 1000
+    results["overhead_ms_per_step"] = avg_of_avg(timings["overhead"], num_iters) * 1000
+    results["total_ms"] = avg_last_n(timings["total"], num_iters) * 1000
+    results["decode_steps_total_ms"] = results["decode_forward_ms_per_step"] * max_new_tokens
+    results["sampling_total_ms"] = results["sampling_ms_per_step"] * max_new_tokens
+    results["overhead_total_ms"] = results["overhead_ms_per_step"] * max_new_tokens
+
+    return results
+
+
+def bench_hybrid_rollout(rollout, tokenizer, device, prompt_len=64, max_new_tokens=64, num_warmup=3, num_iters=10):
+    """Benchmark the full HybridEngineRollout.generate() path."""
+    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
+    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
+    sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0)
+    request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask)
+
+    times = []
+    for _ in range(num_warmup + num_iters):
+        get_accelerator().synchronize()  #ignore-cuda
+        t0 = time.perf_counter()
+        with torch.no_grad():
+            rollout.generate(request, sampling)
+        get_accelerator().synchronize()  #ignore-cuda
+        times.append(time.perf_counter() - t0)
+
+    import numpy as np
+    avg = np.mean(times[-num_iters:]) * 1000
+    return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens}
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="Qwen/Qwen2.5-0.5B-Instruct")
+    parser.add_argument("--prompt-len", type=int, default=64)
+    parser.add_argument("--max-new-tokens", type=int, default=64)
+    parser.add_argument("--num-warmup", type=int, default=3)
+    parser.add_argument("--num-iters", type=int, default=10)
+    args = parser.parse_args()
+
+    device = get_accelerator().current_device()  #ignore-cuda
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16).to(device)
+
+    print(f"=== Raw decode loop benchmark (model={args.model}) ===")
+    raw = bench_decode_raw(model, tokenizer, device, args.prompt_len, args.max_new_tokens, args.num_warmup,
+                           args.num_iters)
+    print(f"  Prefill:              {raw['prefill_ms']:.2f} ms")
+    print(
+        f"  Decode forward/step:  {raw['decode_forward_ms_per_step']:.3f} ms  (total: {raw['decode_steps_total_ms']:.1f} ms)"
+    )
+    print(f"  Sampling/step:        {raw['sampling_ms_per_step']:.3f} ms  (total: {raw['sampling_total_ms']:.1f} ms)")
+    print(f"  Overhead/step:        {raw['overhead_ms_per_step']:.3f} ms  (total: {raw['overhead_total_ms']:.1f} ms)")
+    print(f"  Total:                {raw['total_ms']:.1f} ms")
+
+    print(f"\n=== HybridEngineRollout benchmark ===")
+    rollout = HybridEngineRollout(model, tokenizer)
+    rr = bench_hybrid_rollout(rollout, tokenizer, device, args.prompt_len, args.max_new_tokens, args.num_warmup,
+                              args.num_iters)
+    print(f"  Rollout generate:     {rr['rollout_total_ms']:.1f} ms")
+
+    print(f"\n=== Summary ===")
+    print(f"  Raw decode loop:      {raw['total_ms']:.1f} ms")
+    print(f"  HybridEngine rollout: {rr['rollout_total_ms']:.1f} ms")
+    print(f"  Overhead (rollout - raw): {rr['rollout_total_ms'] - raw['total_ms']:.1f} ms")
+    print(
+        f"  Bottleneck: decode forward = {raw['decode_forward_ms_per_step']:.3f} ms/step x {args.max_new_tokens} steps = {raw['decode_steps_total_ms']:.1f} ms"
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/opsd/bench_flashinfer.py b/benchmarks/opsd/bench_flashinfer.py
new file mode 100644
index 000000000000..abaa31483111
--- /dev/null
+++ b/benchmarks/opsd/bench_flashinfer.py
@@ -0,0 +1,129 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Benchmark HybridEngineRollout with FlashInfer kernels enabled.
+
+Usage:
+    deepspeed --num_gpus 2 bench_flashinfer.py
+"""
+
+import argparse
+import os
+import time
+
+import deepspeed
+import numpy as np
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from deepspeed.accelerator import get_accelerator
+from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
+from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct")
+    parser.add_argument("--prompt-len", type=int, default=64)
+    parser.add_argument("--max-new-tokens", type=int, default=64)
+    parser.add_argument("--num-warmup", type=int, default=3)
+    parser.add_argument("--num-iters", type=int, default=10)
+    parser.add_argument("--no-flashinfer", action="store_true")
+    parser.add_argument("--graph-capture", action="store_true")
+    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
+    args = parser.parse_args()
+
+    local_rank = args.local_rank
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+
+    deepspeed.init_distributed()
+
+    if local_rank == 0:
+        print(f"=== HybridEngineRollout Benchmark ===")
+        print(f"  Model:         {args.model}")
+        print(f"  TP size:       {world_size}")
+        print(f"  FlashInfer:    {not args.no_flashinfer}")
+        print(f"  Graph capture: {args.graph_capture}")
+        print()
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model,
+        torch_dtype=torch.bfloat16,
+    )
+
+    ds_config = {
+        "bf16": {
+            "enabled": True
+        },
+        "zero_optimization": {
+            "stage": 0
+        },
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": world_size,
+        "gradient_accumulation_steps": 1,
+        "tensor_parallel": {
+            "autotp_size": world_size,
+            "preset_model": "qwen2",
+        },
+    }
+
+    engine, *_ = deepspeed.initialize(
+        model=model,
+        optimizer=None,
+        model_parameters=model.parameters(),
+        config=ds_config,
+    )
+
+    if local_rank == 0:
+        param_count = sum(p.numel() for p in engine.parameters()) / 1e9
+        alloc = get_accelerator().memory_allocated(local_rank) / 1e9
+        print(f"  Parameters (local):  {param_count:.2f}B")
+        print(f"  GPU mem allocated:   {alloc:.1f} GB")
+        print()
+
+    use_flashinfer = not args.no_flashinfer
+    rollout = HybridEngineRollout(engine,
+                                  tokenizer,
+                                  use_flashinfer=use_flashinfer,
+                                  use_graph_capture=args.graph_capture)
+
+    device = torch.device(f"cuda:{local_rank}")
+    torch.manual_seed(42)
+    input_ids = torch.randint(10, 1000, (1, args.prompt_len), device=device)
+    attn_mask = torch.ones(1, args.prompt_len, dtype=torch.long, device=device)
+    sampling = SamplingConfig(max_new_tokens=args.max_new_tokens, temperature=1.0, top_p=1.0)
+    request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask)
+
+    times = []
+    for i in range(args.num_warmup + args.num_iters):
+        get_accelerator().synchronize()  #ignore-cuda
+        t0 = time.perf_counter()
+        with torch.no_grad():
+            result = rollout.generate(request, sampling)
+        get_accelerator().synchronize()  #ignore-cuda
+        elapsed = time.perf_counter() - t0
+        times.append(elapsed)
+        if local_rank == 0:
+            label = "warmup" if i < args.num_warmup else "iter"
+            n_tokens = result.input_ids.shape[-1] - args.prompt_len
+            print(f"  [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}")
+
+    if local_rank == 0:
+        avg = np.mean(times[-args.num_iters:]) * 1000
+        per_step = avg / args.max_new_tokens
+        throughput = 1000.0 / per_step
+        print()
+        mode = "FlashInfer" if use_flashinfer else "Baseline (SDPA)"
+        print(f"=== Results ({mode}) ===")
+        print(f"  Total generate:   {avg:.1f} ms")
+        print(f"  Per decode step:  {per_step:.2f} ms")
+        print(f"  Throughput:       {throughput:.1f} tokens/s")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/opsd/bench_hybrid_tp.py b/benchmarks/opsd/bench_hybrid_tp.py
new file mode 100644
index 000000000000..3f41150c7b85
--- /dev/null
+++ b/benchmarks/opsd/bench_hybrid_tp.py
@@ -0,0 +1,145 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+"""Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2).
+
+Usage:
+    deepspeed --num_gpus 2 bench_hybrid_tp.py \
+        --model Qwen/Qwen2.5-14B-Instruct \
+        --max-new-tokens 64
+"""
+
+import argparse
+import os
+import time
+
+import deepspeed
+import numpy as np
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from deepspeed.accelerator import get_accelerator
+from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
+from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
+
+
+def bench_hybrid_rollout(rollout, tokenizer, prompt_len, max_new_tokens, num_warmup, num_iters):
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    device = torch.device(f"cuda:{local_rank}")
+
+    torch.manual_seed(42)
+    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
+    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
+    sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0)
+    request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask)
+
+    times = []
+    for i in range(num_warmup + num_iters):
+        get_accelerator().synchronize(device=device)  #ignore-cuda
+        t0 = time.perf_counter()
+        with torch.no_grad():
+            result = rollout.generate(request, sampling)
+        get_accelerator().synchronize(device=device)  #ignore-cuda
+        elapsed = time.perf_counter() - t0
+        times.append(elapsed)
+        if local_rank == 0:
+            label = "warmup" if i < num_warmup else "iter"
+            n_tokens = result.input_ids.shape[-1] - prompt_len
+            print(f"  [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}")
+
+    avg = np.mean(times[-num_iters:]) * 1000
+    return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens}
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct")
+    parser.add_argument("--prompt-len", type=int, default=64)
+    parser.add_argument("--max-new-tokens", type=int, default=64)
+    parser.add_argument("--num-warmup", type=int, default=3)
+    parser.add_argument("--num-iters", type=int, default=10)
+    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
+    args = parser.parse_args()
+
+    local_rank = args.local_rank
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+
+    deepspeed.init_distributed()
+
+    if local_rank == 0:
+        print(f"=== HybridEngineRollout Benchmark (AutoTP={world_size}) ===")
+        print(f"  Model:       {args.model}")
+        print(f"  TP size:     {world_size}")
+        print(f"  Prompt len:  {args.prompt_len}")
+        print(f"  Decode len:  {args.max_new_tokens}")
+        print(f"  Warmup:      {args.num_warmup}")
+        print(f"  Iters:       {args.num_iters}")
+        print()
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model,
+        torch_dtype=torch.bfloat16,
+    )
+
+    ds_config = {
+        "bf16": {
+            "enabled": True
+        },
+        "zero_optimization": {
+            "stage": 0
+        },
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": world_size,
+        "gradient_accumulation_steps": 1,
+        "tensor_parallel": {
+            "autotp_size": world_size,
+            "preset_model": "qwen2",
+        },
+    }
+
+    engine, *_ = deepspeed.initialize(
+        model=model,
+        optimizer=None,
+        model_parameters=model.parameters(),
+        config=ds_config,
+    )
+
+    if local_rank == 0:
+        print("  DeepSpeed engine initialized.")
+        param_count = sum(p.numel() for p in engine.parameters()) / 1e9
+        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
+        print(f"  Parameters (local):  {param_count:.2f}B")
+        print(f"  GPU mem allocated:   {alloc:.1f} GB")
+        print()
+
+    rollout = HybridEngineRollout(engine, tokenizer)
+
+    if local_rank == 0:
+        print("  Running benchmark...")
+
+    result = bench_hybrid_rollout(
+        rollout,
+        tokenizer,
+        args.prompt_len,
+        args.max_new_tokens,
+        args.num_warmup,
+        args.num_iters,
+    )
+
+    if local_rank == 0:
+        total = result["rollout_total_ms"]
+        per_step = total / args.max_new_tokens
+        throughput = 1000.0 / per_step
+        print()
+        print(f"=== Results ===")
+        print(f"  Total generate:   {total:.1f} ms")
+        print(f"  Per decode step:  {per_step:.2f} ms")
+        print(f"  Throughput:       {throughput:.1f} tokens/s")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/opsd/bench_hybrid_tp_opt.py b/benchmarks/opsd/bench_hybrid_tp_opt.py
new file mode 100644
index 000000000000..d7fae2ddef51
--- /dev/null
+++ b/benchmarks/opsd/bench_hybrid_tp_opt.py
@@ -0,0 +1,149 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+# DeepSpeed Team
+"""Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2) + optimizer.
+
+Usage:
+    deepspeed --num_gpus 2 bench_hybrid_tp_opt.py \
+        --model Qwen/Qwen2.5-14B-Instruct \
+        --max-new-tokens 64
+"""
+
+import argparse
+import os
+import time
+
+import deepspeed
+import numpy as np
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+
+from deepspeed.accelerator import get_accelerator
+from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
+from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
+
+
+def bench_hybrid_rollout(rollout, tokenizer, prompt_len, max_new_tokens, num_warmup, num_iters):
+    local_rank = int(os.environ.get("LOCAL_RANK", 0))
+    device = torch.device(f"cuda:{local_rank}")
+
+    torch.manual_seed(42)
+    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
+    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
+    sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0)
+    request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask)
+
+    times = []
+    for i in range(num_warmup + num_iters):
+        get_accelerator().synchronize(device=device)  #ignore-cuda
+        t0 = time.perf_counter()
+        with torch.no_grad():
+            result = rollout.generate(request, sampling)
+        get_accelerator().synchronize(device=device)  #ignore-cuda
+        elapsed = time.perf_counter() - t0
+        times.append(elapsed)
+        if local_rank == 0:
+            label = "warmup" if i < num_warmup else "iter"
+            n_tokens = result.input_ids.shape[-1] - prompt_len
+            print(f"  [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}")
+
+    avg = np.mean(times[-num_iters:]) * 1000
+    return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens}
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct")
+    parser.add_argument("--prompt-len", type=int, default=64)
+    parser.add_argument("--max-new-tokens", type=int, default=64)
+    parser.add_argument("--num-warmup", type=int, default=3)
+    parser.add_argument("--num-iters", type=int, default=10)
+    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
+    args = parser.parse_args()
+
+    local_rank = args.local_rank
+    world_size = int(os.environ.get("WORLD_SIZE", "1"))
+
+    deepspeed.init_distributed()
+
+    if local_rank == 0:
+        print(f"=== HybridEngineRollout Benchmark (AutoTP={world_size} + Optimizer) ===")
+        print(f"  Model:       {args.model}")
+        print(f"  TP size:     {world_size}")
+        print(f"  Prompt len:  {args.prompt_len}")
+        print(f"  Decode len:  {args.max_new_tokens}")
+        print()
+
+    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model = AutoModelForCausalLM.from_pretrained(
+        args.model,
+        torch_dtype=torch.bfloat16,
+    )
+
+    ds_config = {
+        "bf16": {
+            "enabled": True
+        },
+        "zero_optimization": {
+            "stage": 0
+        },
+        "train_micro_batch_size_per_gpu": 1,
+        "train_batch_size": world_size,
+        "gradient_accumulation_steps": 1,
+        "tensor_parallel": {
+            "autotp_size": world_size,
+            "preset_model": "qwen2",
+        },
+    }
+
+    engine, _, _, _ = deepspeed.initialize(
+        model=model,
+        model_parameters=model.parameters(),
+        config=ds_config,
+    )
+
+    if local_rank == 0:
+        print("  DeepSpeed engine initialized (with optimizer).")
+        param_count = sum(p.numel() for p in engine.parameters()) / 1e9
+        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
+        reserv = get_accelerator().memory_reserved(local_rank) / 1e9  #ignore-cuda
+        print(f"  Parameters (local):  {param_count:.2f}B")
+        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
+        reserv = get_accelerator().memory_reserved(local_rank) / 1e9  #ignore-cuda
+        print(f"  GPU mem allocated:   {alloc:.1f} GB")
+        print(f"  GPU mem reserved:    {reserv:.1f} GB")
+        print()
+
+    rollout = HybridEngineRollout(engine, tokenizer)
+
+    if local_rank == 0:
+        print("  Running benchmark...")
+
+    result = bench_hybrid_rollout(
+        rollout,
+        tokenizer,
+        args.prompt_len,
+        args.max_new_tokens,
+        args.num_warmup,
+        args.num_iters,
+    )
+
+    if local_rank == 0:
+        total = result["rollout_total_ms"]
+        per_step = total / args.max_new_tokens
+        throughput = 1000.0 / per_step
+        print()
+        print(f"=== Results ===")
+        print(f"  Total generate:   {total:.1f} ms")
+        print(f"  Per decode step:  {per_step:.2f} ms")
+        print(f"  Throughput:       {throughput:.1f} tokens/s")
+        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
+        reserv = get_accelerator().memory_reserved(local_rank) / 1e9  #ignore-cuda
+        print(f"  GPU mem (final):   alloc={alloc:.1f} GB, reserved={reserv:.1f} GB")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/benchmarks/opsd/bench_vllm_tp2.py b/benchmarks/opsd/bench_vllm_tp2.py
new file mode 100644
index 000000000000..66a82192551b
--- /dev/null
+++ b/benchmarks/opsd/bench_vllm_tp2.py
@@ -0,0 +1,41 @@
+"""Benchmark vLLM TP=2 on 14B, 1P1R.
+
+Launched as a subprocess wrapper to avoid CUDA fork issues.
+"""
+import subprocess, sys, os
+
+script = '''
+import os
+os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
+import time
+from vllm import LLM, SamplingParams
+
+llm = LLM("Qwen/Qwen2.5-14B-Instruct", tensor_parallel_size=2,
+          gpu_memory_utilization=0.85, dtype="bfloat16", enforce_eager=True)
+sp = SamplingParams(max_tokens=256, temperature=0.8, top_p=0.95, n=1)
+prompt = "def fibonacci(n):"
+
+# warmup
+llm.generate([prompt], sp)
+
+times = []
+for i in range(5):
+    t0 = time.time()
+    out = llm.generate([prompt], sp)
+    times.append(time.time() - t0)
+
+t_avg = sum(times[1:]) / len(times[1:])
+total_toks = sum(len(o.token_ids) for r in out for o in r.outputs)
+print(f"vLLM TP=2 14B 1P1R: {total_toks} toks, {t_avg*1000:.1f}ms, {total_toks/t_avg:.1f} tok/s")
+print(f"Per-run: {[f'{t*1000:.0f}ms' for t in times]}")
+'''
+
+# Write to temp file and exec in a fresh process with no prior CUDA init
+tmp = "/tmp/bench_vllm_inner.py"
+with open(tmp, "w") as f:
+    f.write(script)
+
+env = os.environ.copy()
+env.pop("CUDA_VISIBLE_DEVICES", None)
+proc = subprocess.run([sys.executable, tmp], env=env)
+sys.exit(proc.returncode)
diff --git a/deepspeed/runtime/rlhf/__init__.py b/deepspeed/runtime/rlhf/__init__.py
new file mode 100644
index 000000000000..479f51f3ae80
--- /dev/null
+++ b/deepspeed/runtime/rlhf/__init__.py
@@ -0,0 +1,32 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""deepspeed.runtime.rlhf — Reinforcement Learning from Human Feedback runtime.
+
+Sub-modules
+-----------
+config   : Training, rollout, distillation, and data configuration dataclasses.
+losses   : Per-token KL / JSD divergence losses with sequence-axis chunking.
+utils    : Shared tensor / masking helpers.
+trainer  : Algorithm-specific training loops (OPSD, GRPO, …).
+"""
+
+from deepspeed.runtime.rlhf.config import (  # noqa: F401
+    OPSDConfig,
+    StudentConfig,
+    TeacherConfig,
+    RolloutConfig,
+    DistillationConfig,
+    TrainingConfig,
+    DataConfig,
+)
+from deepspeed.runtime.rlhf.losses import (  # noqa: F401
+    chunked_distillation_loss,
+    streamed_distillation_loss,
+    per_token_logprobs,
+)
+from deepspeed.runtime.rlhf.utils import (  # noqa: F401
+    build_response_mask,
+    shift_for_next_token_prediction,
+)
diff --git a/examples/opsd/opsd/config.py b/deepspeed/runtime/rlhf/config.py
similarity index 73%
rename from examples/opsd/opsd/config.py
rename to deepspeed/runtime/rlhf/config.py
index b55487d738bd..07ecbe0ad4f2 100644
--- a/examples/opsd/opsd/config.py
+++ b/deepspeed/runtime/rlhf/config.py
@@ -2,12 +2,12 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
-"""Configuration dataclasses for OPSD training.
+"""Configuration dataclasses for RLHF training.
 
-A single :class:`OPSDConfig` is loaded from a JSON file (see ``configs/`` for
-examples) and threaded through the rest of the pipeline. We use plain
-dataclasses instead of Hydra/pydantic to match the rest of the DeepSpeed
-example apps and to keep the dependency surface minimal.
+A single :class:`OPSDConfig` is loaded from a JSON file (see
+``examples/opsd/configs/`` for examples) and threaded through the rest of the
+pipeline. We use plain dataclasses instead of Hydra/pydantic to match the rest
+of the DeepSpeed codebase and to keep the dependency surface minimal.
 """
 
 import json
@@ -20,9 +20,6 @@ class StudentConfig:
     model_name_or_path: str
     dtype: str = "bfloat16"
     trust_remote_code: bool = False
-    # Architecture key used to look up the weight bridge for vLLM rollout. If
-    # unset, the trainer will infer it from the HF config's ``model_type``.
-    arch: Optional[str] = None
 
 
 @dataclass
@@ -43,11 +40,15 @@ class RolloutConfig:
     # Generation knobs (apply to either engine)
     max_prompt_length: int = 1024
     max_response_length: int = 1024
-    temperature: float = 1.0
+    temperature: float = 0.0
     top_p: float = 1.0
     top_k: int = -1
     n_samples_per_prompt: int = 1
 
+    # Use CUDA graph capture for greedy decode (temperature=0 only).
+    # Eliminates kernel launch overhead, ~3x faster for small models.
+    use_graph_capture: bool = False
+
     # vLLM-specific. ``gpus`` is the disjoint set of CUDA device indices vLLM
     # may use; the training ranks must not overlap with these. If None, the
     # trainer will refuse to start in vllm mode.
@@ -64,6 +65,20 @@ class RolloutConfig:
     # one-time compilation (worth it for smoke tests / short-lived runs);
     # leave False for steady-state throughput.
     vllm_enforce_eager: bool = False
+    # Port for the vLLM OpenAI-compatible API server. Only used when the
+    # vLLM rollout is configured to run as an external subprocess.
+    vllm_port: int = 8000
+    # Maximum seconds to wait for the vLLM server to become healthy.
+    vllm_start_timeout: int = 300
+    # Weight transfer backend for syncing student weights into vLLM.
+    # "auto"  – try GDR (GPU-direct) first, fall back to HTTP.
+    # "gdr"   – GPU-direct transfer (NCCL). Fastest but requires NVIDIA.
+    # "http"  – serialize tensors over HTTP. Slower but accelerator-agnostic.
+    weight_transfer_backend: str = "auto"
+    # Path to the Python interpreter that has vLLM installed.  When set, the
+    # vLLM server subprocess uses this interpreter instead of ``sys.executable``.
+    # Useful when vLLM lives in a separate virtual-env / conda env.
+    vllm_python: str = ""
 
 
 @dataclass
@@ -142,8 +157,8 @@ def validate(self) -> None:
             raise ValueError(f"Unknown loss_type {self.distillation.loss_type!r}")
         if self.rollout.engine not in ("hybrid_engine", "vllm"):
             raise ValueError(f"Unknown rollout engine {self.rollout.engine!r}")
-        # rollout.gpus may be left empty for the "shared" topology where vLLM
-        # runs in-process on the same GPU as training rank 0; populated for
-        # the "disjoint" topology where it runs on a separate set of devices.
         if self.distillation.chunk_size <= 0:
             raise ValueError("distillation.chunk_size must be positive")
+        if self.rollout.weight_sync_interval != 1:
+            raise ValueError(f"rollout.weight_sync_interval must be 1 for on-policy distillation; "
+                             f"got {self.rollout.weight_sync_interval}")
diff --git a/examples/opsd/opsd/data.py b/deepspeed/runtime/rlhf/data.py
similarity index 97%
rename from examples/opsd/opsd/data.py
rename to deepspeed/runtime/rlhf/data.py
index 02ecf417e5c3..8ce86b56c67f 100644
--- a/examples/opsd/opsd/data.py
+++ b/deepspeed/runtime/rlhf/data.py
@@ -11,7 +11,7 @@
 student generates the assistant turn.
 
 Batches are **left-padded** because causal generation requires real tokens at
-the right edge — :class:`opsd.rollout.RolloutRequest` and the hybrid-engine
+    the right edge — :class:`deepspeed.runtime.rollout.RolloutRequest` and the hybrid-engine
 backend both assume this layout.
 """
 
diff --git a/examples/opsd/opsd/losses.py b/deepspeed/runtime/rlhf/losses.py
similarity index 100%
rename from examples/opsd/opsd/losses.py
rename to deepspeed/runtime/rlhf/losses.py
diff --git a/examples/opsd/opsd/teacher.py b/deepspeed/runtime/rlhf/teacher.py
similarity index 99%
rename from examples/opsd/opsd/teacher.py
rename to deepspeed/runtime/rlhf/teacher.py
index a7895beddf00..9d6eec3f08e4 100644
--- a/examples/opsd/opsd/teacher.py
+++ b/deepspeed/runtime/rlhf/teacher.py
@@ -27,7 +27,7 @@
 
 # ``opsd.config`` is pure-Python (no distributed imports), so we can import it
 # at module load time without pulling in DeepSpeed.
-from opsd.config import TeacherConfig
+from deepspeed.runtime.rlhf.config import TeacherConfig
 
 
 @dataclass
diff --git a/deepspeed/runtime/rlhf/trainer/__init__.py b/deepspeed/runtime/rlhf/trainer/__init__.py
new file mode 100644
index 000000000000..34169d59da7a
--- /dev/null
+++ b/deepspeed/runtime/rlhf/trainer/__init__.py
@@ -0,0 +1,8 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""deepspeed.runtime.rlhf.trainer — RLHF training-loop implementations."""
+
+from deepspeed.runtime.rlhf.trainer.base import RLHFTrainer  # noqa: F401
+from deepspeed.runtime.rlhf.trainer.opsd import OPSDTrainer  # noqa: F401
diff --git a/deepspeed/runtime/rlhf/trainer/base.py b/deepspeed/runtime/rlhf/trainer/base.py
new file mode 100644
index 000000000000..3f8687514a55
--- /dev/null
+++ b/deepspeed/runtime/rlhf/trainer/base.py
@@ -0,0 +1,42 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Abstract base class for RLHF trainers.
+
+Concrete implementations (e.g. :class:`~deepspeed.runtime.rlhf.trainer.opsd.OPSDTrainer`)
+inherit from :class:`RLHFTrainer` and implement the abstract methods to define
+their algorithm-specific training loop.
+"""
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+
+class RLHFTrainer(ABC):
+    """Base class for all RLHF training loops.
+
+    Subclasses must implement :meth:`train` and :meth:`_train_step`.  The base
+    class deliberately imposes no constraints on the constructor signature so
+    each algorithm can accept whatever components it needs (rollout engine,
+    reference model, reward model, etc.).
+    """
+
+    @abstractmethod
+    def train(self) -> None:
+        """Run the full training loop (all epochs / steps)."""
+        ...
+
+    @abstractmethod
+    def _train_step(self, batch: Any) -> dict:
+        """Execute one optimizer step and return a metrics dict.
+
+        Args:
+            batch: A single batch from the dataloader.  The expected structure
+                is algorithm-specific.
+
+        Returns:
+            A ``dict`` of scalar metrics (``loss``, timing fields, token
+            counts, …) suitable for logging.
+        """
+        ...
diff --git a/examples/opsd/opsd/trainer.py b/deepspeed/runtime/rlhf/trainer/opsd.py
similarity index 88%
rename from examples/opsd/opsd/trainer.py
rename to deepspeed/runtime/rlhf/trainer/opsd.py
index 315b5145ef7a..28be8b56061e 100644
--- a/examples/opsd/opsd/trainer.py
+++ b/deepspeed/runtime/rlhf/trainer/opsd.py
@@ -2,13 +2,13 @@
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
-"""On-policy distillation training loop.
+"""On-policy distillation (OPSD) training loop.
 
 Each step is three phases:
 
   0. **Rollout.** The student generates responses for the batch's prompts
-     (via the configured :class:`~opsd.rollout.RolloutEngine` — hybrid engine
-     or vLLM).
+     (via the configured :class:`~deepspeed.runtime.rollout.RolloutEngine` —
+     hybrid engine or vLLM).
   1. **Teacher.** The frozen teacher runs a forward over prompt+response. The
      full logit tensor is parked on the host via
      :class:`~opsd.teacher.TeacherLogitCache` so teacher GPU buffers can be
@@ -16,9 +16,9 @@
   2. **Student.** The student runs forward+backward on prompt+response. The
      loss is the per-token divergence to the teacher, streamed from the
      host-resident cache one sequence chunk at a time
-     (:func:`~opsd.losses.streamed_distillation_loss`), so the full
-     ``[B, T, V]`` teacher tensor never co-resides with the student logits on
-     the training device.
+     (:func:`~deepspeed.runtime.rlhf.losses.streamed_distillation_loss`), so
+     the full ``[B, T, V]`` teacher tensor never co-resides with the student
+     logits on the training device.
 
 The trainer itself contains no DeepSpeed-specific control flow beyond the
 ``backward`` / ``step`` calls on the student engine; backend choice (ZeRO
@@ -33,17 +33,18 @@
 from deepspeed import comm as dist
 from deepspeed.accelerator import get_accelerator
 
-from opsd.config import OPSDConfig
-from opsd.losses import streamed_distillation_loss
-from opsd.rollout import RolloutEngine, RolloutRequest, SamplingConfig
-from opsd.utils import build_response_mask
+from deepspeed.runtime.rlhf.config import OPSDConfig
+from deepspeed.runtime.rlhf.losses import streamed_distillation_loss
+from deepspeed.runtime.rlhf.trainer.base import RLHFTrainer
+from deepspeed.runtime.rlhf.utils import build_response_mask
+from deepspeed.runtime.rollout import RolloutEngine, RolloutRequest, SamplingConfig
 
 
 def _is_rank_zero() -> bool:
     return (not dist.is_initialized()) or dist.get_rank() == 0
 
 
-class OPSDTrainer:
+class OPSDTrainer(RLHFTrainer):
 
     def __init__(
         self,
@@ -91,12 +92,13 @@ def _train_step(self, batch) -> dict:
         prompt_ids = batch["prompt_ids"].to(self.device, non_blocking=True)
         prompt_attn = batch["prompt_attention_mask"].to(self.device, non_blocking=True)
 
-        # Push student weights into the rollout backend if it's time to.
-        # No-op for the hybrid engine; meaningful for vLLM.
-        if self.step % self.cfg.rollout.weight_sync_interval == 0:
-            self.rollout.sync_weights_from_student(self.step)
+        # Sync student weights into the rollout backend.
+        # No-op for hybrid engine; meaningful for vLLM.
+        self.rollout.sync_weights(self.step)
 
         # --- Phase 0: rollout (student generates responses) ---------------
+        # Switch hybrid engine to inference mode (gathers ZeRO-3 params).
+        self.student_engine.eval()
         sampling = SamplingConfig(
             max_new_tokens=self.cfg.rollout.max_response_length,
             temperature=self.cfg.rollout.temperature,
diff --git a/examples/opsd/opsd/utils.py b/deepspeed/runtime/rlhf/utils.py
similarity index 100%
rename from examples/opsd/opsd/utils.py
rename to deepspeed/runtime/rlhf/utils.py
diff --git a/deepspeed/runtime/rollout/__init__.py b/deepspeed/runtime/rollout/__init__.py
new file mode 100644
index 000000000000..e61e7fe2d98b
--- /dev/null
+++ b/deepspeed/runtime/rollout/__init__.py
@@ -0,0 +1,65 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Rollout engines for on-policy generation during RL/distillation training.
+
+Provides:
+  - :class:`RolloutEngine` — abstract base class
+  - :class:`RolloutRequest`, :class:`RolloutBatch`, :class:`SamplingConfig` — dataclasses
+  - :class:`HybridEngineRollout` — concrete implementation using DeepSpeed hybrid engine
+  - :class:`VLLMRollout` — concrete implementation using an external vLLM server
+  - :func:`build_rollout` — factory that selects the engine from config
+"""
+
+from deepspeed.runtime.rollout.base import (
+    RolloutBatch,
+    RolloutEngine,
+    RolloutRequest,
+    SamplingConfig,
+)
+from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
+from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout, stitch_rollout
+
+__all__ = [
+    "HybridEngineRollout",
+    "RolloutBatch",
+    "RolloutEngine",
+    "RolloutRequest",
+    "SamplingConfig",
+    "VLLMRollout",
+    "build_rollout",
+    "stitch_rollout",
+]
+
+
+def build_rollout(rollout_cfg, student_engine=None, tokenizer=None, student_model_path=None):
+    """Factory: construct the rollout engine specified by ``rollout_cfg.engine``.
+
+    Imports of heavy backends are deferred so that selecting the hybrid-engine
+    path doesn't transitively require vLLM (and vice versa).
+
+    Args:
+        rollout_cfg: :class:`RolloutConfig` (or any object with an ``engine``
+            attribute set to ``"hybrid_engine"`` or ``"vllm"``).
+        student_engine: DeepSpeed engine wrapping the student model.
+        tokenizer: HuggingFace tokenizer.
+        student_model_path: Model name/path for vLLM to load from disk.
+    """
+    engine_name = rollout_cfg.engine
+    if engine_name == "hybrid_engine":
+        if student_engine is None or tokenizer is None:
+            raise ValueError("hybrid_engine rollout needs both student_engine and tokenizer")
+        return HybridEngineRollout(engine=student_engine, tokenizer=tokenizer, cfg=rollout_cfg)
+
+    if engine_name == "vllm":
+        if tokenizer is None:
+            raise ValueError("vllm rollout needs a tokenizer for length accounting")
+        return VLLMRollout(
+            cfg=rollout_cfg,
+            tokenizer=tokenizer,
+            student_engine=student_engine,
+            student_model_path=student_model_path,
+        )
+
+    raise ValueError(f"Unknown rollout engine {engine_name!r}; choose from 'hybrid_engine' | 'vllm'")
diff --git a/deepspeed/runtime/rollout/_vllm_compat/__init__.py b/deepspeed/runtime/rollout/_vllm_compat/__init__.py
new file mode 100644
index 000000000000..208299fb8c50
--- /dev/null
+++ b/deepspeed/runtime/rollout/_vllm_compat/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
diff --git a/deepspeed/runtime/rollout/_vllm_compat/sitecustomize.py b/deepspeed/runtime/rollout/_vllm_compat/sitecustomize.py
new file mode 100644
index 000000000000..d0a399093de0
--- /dev/null
+++ b/deepspeed/runtime/rollout/_vllm_compat/sitecustomize.py
@@ -0,0 +1,75 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Site-customization hook injected into the vLLM server subprocess.
+
+When this file is on ``PYTHONPATH`` (or placed as ``sitecustomize.py`` in a
+directory listed in ``PYTHONPATH``), Python executes it automatically before
+running the main script.  It is used solely by
+:class:`~deepspeed.runtime.rollout.vllm_rollout.VLLMRollout` to patch a known
+compatibility issue between **vLLM 0.22.0** and certain ``pydantic-core``
+builds that hit a Rust-level assertion::
+
+    pydantic_core._pydantic_core.ValidationError:
+      Assertion failed, duplicate template name
+
+The patch is **harmless on systems that don't need it** — the original
+``validate_python`` call is attempted first and only on ``Exception`` do we
+fall back to plain dataclass field assignment.
+
+This file is NOT a monkey-patch on installed packages.  It patches the
+*behaviour at runtime* by replacing the ``__init__`` method that pydantic's
+``@dataclass`` decorator installs on each decorated class.  No files under
+``site-packages/`` are modified.
+"""
+
+import dataclasses as _dc
+
+
+def _install_pydantic_dataclass_fallback():
+    try:
+        import pydantic._internal._dataclasses as _pdc
+    except ImportError:
+        return
+
+    if getattr(_pdc, "_deepspeed_patched", False):
+        return
+
+    def _make_safe_init(original_init):
+
+        def _safe_init(__dataclass_self__, *args, **kwargs):
+            __tracebackhide__ = True
+            try:
+                original_init(__dataclass_self__, *args, **kwargs)
+            except Exception:
+                s = __dataclass_self__
+                kw = dict(zip([f.name for f in _dc.fields(s.__class__)], args))
+                kw.update(kwargs)
+                for f in _dc.fields(s.__class__):
+                    if f.name in kw:
+                        object.__setattr__(s, f.name, kw[f.name])
+                    elif f.default is not _dc.MISSING:
+                        object.__setattr__(s, f.name, f.default)
+                    elif f.default_factory is not _dc.MISSING:
+                        object.__setattr__(s, f.name, f.default_factory())
+                    else:
+                        object.__setattr__(s, f.name, None)
+
+        _safe_init.__qualname__ = original_init.__qualname__
+        return _safe_init
+
+    _original_complete = _pdc.complete_dataclass
+
+    def _patched_complete(cls, config_wrapper, *, raise_errors=False):
+        result = _original_complete(cls, config_wrapper, raise_errors=raise_errors)
+        if hasattr(cls, "__init__"):
+            original_init = cls.__init__
+            cls.__init__ = _make_safe_init(original_init)
+        return result
+
+    _pdc.complete_dataclass = _patched_complete
+    _pdc._deepspeed_patched = True
+
+
+_install_pydantic_dataclass_fallback()
diff --git a/examples/opsd/opsd/rollout/base.py b/deepspeed/runtime/rollout/base.py
similarity index 66%
rename from examples/opsd/opsd/rollout/base.py
rename to deepspeed/runtime/rollout/base.py
index 62789d25c1cd..f02e48d1153a 100644
--- a/examples/opsd/opsd/rollout/base.py
+++ b/deepspeed/runtime/rollout/base.py
@@ -7,12 +7,7 @@
 The trainer talks to its rollout engine through three small dataclasses
 (``RolloutRequest`` in / ``RolloutBatch`` out / ``SamplingConfig``) and one
 ABC. This keeps the engine-specific concerns (hybrid-engine vs vLLM, weight
-sync, process topology) out of the trainer loop, so swapping engines is a
-one-line config change.
-
-Concrete engines live in sibling modules:
-    * :mod:`opsd.rollout.hybrid_engine` — DeepSpeed hybrid engine
-    * :mod:`opsd.rollout.vllm`          — vLLM on a disjoint GPU group
+sync, process topology) out of the trainer loop.
 """
 
 from abc import ABC, abstractmethod
@@ -28,9 +23,7 @@ class SamplingConfig:
     max_new_tokens: int
     temperature: float = 1.0
     top_p: float = 1.0
-    # ``top_k <= 0`` means "no top-k truncation".
     top_k: int = -1
-    # Number of samples per prompt. >1 expands the effective batch.
     n_samples_per_prompt: int = 1
 
 
@@ -59,14 +52,6 @@ class RolloutBatch:
 
     ``input_ids`` holds the *concatenation* of (left-padded) prompt and
     response, right-padded to the longest sequence in the batch.
-    ``response_start_idx[i]`` is the column index at which the response
-    begins, so positions ``>= response_start_idx[i]`` (intersected with
-    ``attention_mask``) are response tokens.
-
-    Note: with the standard *left-padded* prompt convention, every sample's
-    response starts at the same column (= the prompt section length), but the
-    field is kept per-sample so that mixed-batch backends (e.g. vLLM, which
-    may strip its own padding) can still report a meaningful boundary.
     """
 
     input_ids: torch.Tensor  # [B', T_p + T_r]; B' = B * n_samples_per_prompt
@@ -94,24 +79,21 @@ def seq_len(self) -> int:
 
 
 class RolloutEngine(ABC):
-    """Abstract base for student rollout engines."""
+    """Abstract base for rollout engines."""
 
     name: str = "base"
 
     @abstractmethod
     def generate(self, request: RolloutRequest, sampling: SamplingConfig) -> RolloutBatch:
-        """Run the student's generate, return prompt+response in one tensor."""
+        """Run generation, return prompt+response in one tensor."""
 
     @abstractmethod
-    def sync_weights_from_student(self, step: int) -> None:
-        """Push the student's current weights into the rollout backend.
+    def sync_weights(self, step: int) -> None:
+        """Push updated weights into the rollout backend.
 
-        No-op for :class:`HybridEngineRollout` (the engine reads weights live
-        from the same process). Meaningful for :class:`VLLMRollout`, which
-        holds its own copy and must be refreshed periodically.
+        No-op for hybrid engine (reads weights live). Meaningful for vLLM.
         """
 
     def shutdown(self) -> None:
-        """Release any backend resources (vLLM workers, NCCL groups, ...).
-        Default no-op."""
+        """Release any backend resources. Default no-op."""
         return None
diff --git a/deepspeed/runtime/rollout/hybrid_engine_rollout.py b/deepspeed/runtime/rollout/hybrid_engine_rollout.py
new file mode 100644
index 000000000000..9da149418867
--- /dev/null
+++ b/deepspeed/runtime/rollout/hybrid_engine_rollout.py
@@ -0,0 +1,242 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""Rollout engine backed by DeepSpeed's hybrid engine.
+
+Two generation paths:
+  1. **model.generate()** (default): delegates to HuggingFace generate.
+     Supports sampling (temperature, top_p) and greedy.
+  2. **graph capture + DeepSpeedStaticCache**: only for greedy (temperature=0).
+     Pre-allocates a StaticCache, captures the decode forward pass with a
+     CUDA graph, and replays it for each decode step.  Eliminates kernel
+     launch overhead.
+"""
+
+from dataclasses import dataclass
+
+import torch
+
+from deepspeed.runtime.rollout.base import RolloutBatch, RolloutEngine, RolloutRequest, SamplingConfig
+
+
+@dataclass
+class HybridEngineRolloutConfig:
+    """Configuration for HybridEngineRollout."""
+    use_graph_capture: bool = False
+
+
+class HybridEngineRollout(RolloutEngine):
+    """Rollout engine using DeepSpeed hybrid engine.
+
+    Args:
+        engine: DeepSpeed engine wrapping the model.
+        tokenizer: HuggingFace tokenizer (must have pad_token_id or eos_token_id).
+        cfg: Optional HybridEngineRolloutConfig.
+    """
+
+    def __init__(self, engine, tokenizer, cfg=None):
+        self.engine = engine
+        self.tokenizer = tokenizer
+        self.use_graph_capture = getattr(cfg, 'use_graph_capture', False) if cfg else False
+
+    @torch.no_grad()
+    def generate(self, request: RolloutRequest, sampling: SamplingConfig) -> RolloutBatch:
+        device = request.prompt_ids.device
+        B = request.prompt_ids.shape[0]
+        n = sampling.n_samples_per_prompt
+        total = B * n
+        prompt_len = request.prompt_ids.shape[1]
+        max_new_tokens = sampling.max_new_tokens
+        pad_token_id = self.tokenizer.pad_token_id or self.tokenizer.eos_token_id
+
+        module = self.engine.module
+
+        # Expand prompts for n samples per prompt
+        if n > 1:
+            prompt_ids = request.prompt_ids.repeat_interleave(n, dim=0)
+            prompt_attn = request.prompt_attention_mask.repeat_interleave(n, dim=0)
+        else:
+            prompt_ids = request.prompt_ids
+            prompt_attn = request.prompt_attention_mask
+
+        is_greedy = sampling.temperature <= 0.0
+
+        if self.use_graph_capture and is_greedy:
+            output_ids = self._generate_graph(prompt_ids, prompt_attn, max_new_tokens, pad_token_id, module, device)
+        else:
+            temperature = max(sampling.temperature, 1e-8)
+            do_sample = not is_greedy
+            output_ids = module.generate(
+                prompt_ids,
+                attention_mask=prompt_attn,
+                max_new_tokens=max_new_tokens,
+                do_sample=do_sample,
+                temperature=temperature if do_sample else 1.0,
+                top_p=sampling.top_p if do_sample else 1.0,
+                pad_token_id=pad_token_id,
+            )
+
+        # Build attention mask: pad positions (both left padding from prompt
+        # and right padding from EOS / shorter sequences) are 0.
+        full_len = output_ids.shape[1]
+        response_start = prompt_len
+        attention_mask = (output_ids != pad_token_id).long()
+        for i in range(total):
+            prompt_valid = request.prompt_attention_mask[i // n if B > 1 else 0]
+            attention_mask[i, :prompt_len] = prompt_valid
+
+        return RolloutBatch(
+            input_ids=output_ids,
+            attention_mask=attention_mask,
+            response_start_idx=torch.full((total, ), response_start, dtype=torch.long, device=device),
+        )
+
+    # ------------------------------------------------------------------
+    # Graph capture decode loop (greedy only)
+    # ------------------------------------------------------------------
+
+    def _generate_graph(self, prompt_ids, prompt_attn, max_new_tokens, pad_token_id, module, device):
+        """Greedy decode with DeepSpeedStaticCache + CUDA graph capture."""
+        from transformers import StaticCache
+        from deepspeed.runtime.rollout.static_cache import DeepSpeedStaticCache
+
+        batch_size = prompt_ids.shape[0]
+        prompt_len = prompt_ids.shape[1]
+        max_len = prompt_len + max_new_tokens
+        eos_token_id = self.tokenizer.eos_token_id
+        model_dtype = next(module.parameters()).dtype
+
+        # --- Prefill with HF StaticCache (correct attention semantics) ---
+        prefill_cache = StaticCache(
+            config=module.config,
+            batch_size=batch_size,
+            max_cache_len=max_len,
+            device=device,
+            dtype=model_dtype,
+        )
+        prefill_attn = torch.ones(batch_size, prompt_len, dtype=torch.long, device=device)
+        prefill_attn[:, :prompt_len] = prompt_attn
+        prefill_out = module(
+            prompt_ids,
+            attention_mask=prefill_attn,
+            past_key_values=prefill_cache,
+            use_cache=True,
+            cache_position=torch.arange(prompt_len, device=device),
+        )
+        next_token = prefill_out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
+
+        # --- Copy prefill KV into DeepSpeedStaticCache ---
+        write_pos = torch.tensor(prompt_len - 1, dtype=torch.long, device=device)
+        ds_cache = DeepSpeedStaticCache(
+            module.config,
+            batch_size=batch_size,
+            max_cache_len=max_len,
+            device=device,
+            dtype=model_dtype,
+        )
+        ds_cache.set_write_position(write_pos)
+        # Trigger lazy init then copy real data
+        for layer_idx in range(len(ds_cache.layers)):
+            ds_layer = ds_cache.layers[layer_idx]
+            hf_layer = prefill_cache.layers[layer_idx]
+            if not ds_layer.is_initialized:
+                ds_layer.lazy_initialization(hf_layer.keys, hf_layer.values)
+            ds_layer.keys[:, :, :prompt_len, :].copy_(hf_layer.keys[:, :, :prompt_len, :])
+            ds_layer.values[:, :, :prompt_len, :].copy_(hf_layer.values[:, :, :prompt_len, :])
+
+        output_ids = [prompt_ids, next_token]
+
+        # --- Static buffers for graph capture ---
+        static_token = torch.zeros(batch_size, 1, dtype=torch.long, device=device)
+        static_attn = torch.zeros(batch_size, max_len, dtype=torch.long, device=device)
+        static_attn[:, :prompt_len] = prompt_attn
+        static_attn[:, prompt_len] = 1  # first decode position
+        static_pos = torch.tensor(prompt_len, dtype=torch.long, device=device)
+        static_cache_pos = static_pos.unsqueeze(0)  # [1] for cache_position
+        static_pos_ids = static_pos.reshape(1, 1).expand(batch_size, 1)  # [batch, 1]
+
+        write_pos.fill_(prompt_len)
+
+        # Remove forward hooks (they synchronize — illegal during graph capture)
+        saved_pre = dict(module._forward_pre_hooks)
+        saved_post = dict(module._forward_hooks)
+        module._forward_pre_hooks.clear()
+        module._forward_hooks.clear()
+
+        try:
+            # Warmup on side stream
+            static_token.copy_(next_token)
+            s = torch.cuda.Stream()  #ignore-cuda
+            s.wait_stream(torch.cuda.current_stream())  #ignore-cuda
+            with torch.cuda.stream(s):  #ignore-cuda
+                for _ in range(3):
+                    out = module(
+                        static_token,
+                        attention_mask=static_attn,
+                        past_key_values=ds_cache,
+                        use_cache=True,
+                        cache_position=static_cache_pos,
+                        position_ids=static_pos_ids,
+                    )
+            torch.cuda.current_stream().wait_stream(s)  #ignore-cuda
+
+            # Capture
+            graph = torch.cuda.CUDAGraph()  #ignore-cuda
+            with torch.cuda.graph(graph):  #ignore-cuda
+                out = module(
+                    static_token,
+                    attention_mask=static_attn,
+                    past_key_values=ds_cache,
+                    use_cache=True,
+                    cache_position=static_cache_pos,
+                    position_ids=static_pos_ids,
+                )
+            static_logits = out.logits
+        finally:
+            module._forward_pre_hooks.update(saved_pre)
+            module._forward_hooks.update(saved_post)
+
+        # --- Decode loop ---
+        eos_mask = torch.zeros(batch_size, dtype=torch.bool, device=device)
+        for step in range(max_new_tokens - 1):
+            if eos_mask.all():
+                output_ids.append(torch.full((batch_size, 1), pad_token_id, dtype=torch.long, device=device))
+                continue
+
+            # Update static inputs
+            static_token.copy_(next_token)
+            pos = prompt_len + step
+            write_pos.fill_(pos)
+            static_cache_pos.fill_(pos)
+            static_pos_ids.fill_(pos)
+            static_attn[:, pos] = 1
+
+            # Replay
+            graph.replay()
+            next_token = static_logits[:, -1, :].argmax(dim=-1, keepdim=True)
+            output_ids.append(next_token)
+            eos_mask |= (next_token.squeeze(1) == eos_token_id)
+
+        return torch.cat(output_ids, dim=1)
+
+    @staticmethod
+    def _sample_top_p(logits: torch.Tensor, temperature: float = 1.0, top_p: float = 1.0) -> torch.Tensor:
+        """Sample from logits with temperature and nucleus (top-p) filtering."""
+        logits = logits / temperature
+        if top_p < 1.0:
+            sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
+            cumulative_probs = torch.cumsum(torch.softmax(sorted_logits, dim=-1), dim=-1)
+            mask = (cumulative_probs - torch.softmax(sorted_logits, dim=-1)) >= top_p
+            sorted_logits[mask] = -float('inf')
+            probs = torch.softmax(sorted_logits, dim=-1)
+            sampled = torch.multinomial(probs, 1)
+            tokens = sorted_indices.gather(1, sampled)
+        else:
+            probs = torch.softmax(logits, dim=-1)
+            tokens = torch.multinomial(probs, 1)
+        return tokens
+
+    def sync_weights(self, step: int) -> None:  # noqa: ARG002
+        """No-op: hybrid engine reads model weights live."""
+        return None
diff --git a/deepspeed/runtime/rollout/static_cache.py b/deepspeed/runtime/rollout/static_cache.py
new file mode 100644
index 000000000000..520bef9c7314
--- /dev/null
+++ b/deepspeed/runtime/rollout/static_cache.py
@@ -0,0 +1,230 @@
+# Copyright (c) DeepSpeed Team
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""CUDA-graph-compatible static KV cache for hybrid engine rollout.
+
+Derived from HuggingFace transformers ``StaticCache`` / ``StaticLayer``, but
+with a critical difference: the write position is supplied externally via a
+shared tensor instead of an internal ``cumulative_length`` counter.
+
+Why this matters
+----------------
+Transformers' ``StaticLayer.update()`` maintains its own ``cumulative_length``
+tensor that advances on every call.  During CUDA graph capture the captured
+forward "freezes" this counter at whatever value it had at capture time.
+On replay the counter does *not* advance, so subsequent KV writes go to the
+wrong positions and the model silently produces incorrect logits.
+
+Our ``DeepSpeedStaticCache`` instead reads the write position from a shared
+tensor (``write_position``) that the caller updates in-place before each graph
+replay.  Because ``write_position`` is a real tensor at a fixed address, CUDA
+graph replays read the current value each time.
+
+The caller (HybridEngineRollout) must call ``cache.set_write_position(pos)``
+before each replay, where ``pos`` is a scalar ``torch.long`` tensor on the
+correct device.
+"""
+
+import torch
+
+
+class DeepSpeedStaticLayer:
+    """A single layer's static KV cache whose write position is externally set.
+
+    Parameters
+    ----------
+    max_cache_len : int
+        Maximum number of tokens the cache can hold (last dim size).
+    """
+
+    is_compileable = True
+    is_sliding = False
+
+    def __init__(self, max_cache_len: int):
+        self.max_cache_len = max_cache_len
+        self.keys: torch.Tensor | None = None
+        self.values: torch.Tensor | None = None
+        self.is_initialized = False
+        self._write_position: torch.Tensor | None = None
+
+    def set_write_position(self, pos: torch.Tensor):
+        self._write_position = pos
+
+    def lazy_initialization(self, key_states: torch.Tensor, value_states: torch.Tensor) -> None:
+        self.dtype = key_states.dtype
+        self.device = key_states.device
+        max_batch_size, num_heads = key_states.shape[:2]
+        self.max_batch_size = max_batch_size
+        self.num_heads = num_heads
+        self.k_head_dim = key_states.shape[-1]
+        self.v_head_dim = value_states.shape[-1]
+
+        self.keys = torch.zeros(
+            (max_batch_size, num_heads, self.max_cache_len, self.k_head_dim),
+            dtype=self.dtype,
+            device=self.device,
+        )
+        self.values = torch.zeros(
+            (max_batch_size, num_heads, self.max_cache_len, self.v_head_dim),
+            dtype=self.dtype,
+            device=self.device,
+        )
+        torch._dynamo.mark_static_address(self.keys)
+        torch._dynamo.mark_static_address(self.values)
+        self.is_initialized = True
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        *args,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if not self.is_initialized:
+            self.lazy_initialization(key_states, value_states)
+
+        kv_length = key_states.shape[-2]
+
+        if self._write_position is not None:
+            cache_position = torch.arange(kv_length, device=self.device) + self._write_position
+        else:
+            cache_position = torch.arange(kv_length, device=self.device)
+
+        try:
+            self.keys.index_copy_(2, cache_position, key_states)
+            self.values.index_copy_(2, cache_position, value_states)
+        except NotImplementedError:
+            self.keys[:, :, cache_position] = key_states
+            self.values[:, :, cache_position] = value_states
+
+        return self.keys, self.values
+
+    def get_mask_sizes(self, query_length: int) -> tuple[int, int]:
+        return self.max_cache_len, 0
+
+    def get_seq_length(self) -> int:
+        if not self.is_initialized:
+            return 0
+        if self._write_position is not None:
+            return self._write_position + 1
+        return 0
+
+    def get_max_cache_shape(self) -> int:
+        return self.max_cache_len
+
+    def reset(self) -> None:
+        if self.is_initialized:
+            self.keys.zero_()
+            self.values.zero_()
+
+    def reorder_cache(self, beam_idx: torch.LongTensor) -> None:
+        if self.is_initialized:
+            self.keys = self.keys.index_select(0, beam_idx.to(self.keys.device))
+            self.values = self.values.index_select(0, beam_idx.to(self.values.device))
+
+
+class DeepSpeedStaticCache:
+    """CUDA-graph-compatible static KV cache.
+
+    Drop-in replacement for ``transformers.StaticCache`` in the graph-capture
+    decode path of ``HybridEngineRollout``.  All layers share a single
+    ``write_position`` tensor that the caller updates before each graph replay.
+
+    Parameters
+    ----------
+    config : PreTrainedConfig
+        HuggingFace model config (used to determine number of layers and head
+        dimensions).
+    batch_size : int
+        Batch size for eager initialization.
+    max_cache_len : int
+        Maximum sequence length (prompt + generated tokens).
+    device : torch.device | int | str | None
+        Device for eager initialization.
+    dtype : torch.dtype | None
+        Dtype for eager initialization.
+    """
+
+    def __init__(
+        self,
+        config,
+        batch_size: int = 1,
+        max_cache_len: int = 4096,
+        device=None,
+        dtype=None,
+    ):
+        self.config = config
+        text_config = getattr(config, "text_config", config)
+        num_layers = getattr(text_config, "num_hidden_layers", 1)
+        self._layers = [DeepSpeedStaticLayer(max_cache_len) for _ in range(num_layers)]
+        self._max_cache_len = max_cache_len
+        self._write_position: torch.Tensor | None = None
+
+        if dtype is not None and device is not None and batch_size > 0:
+            num_heads = getattr(text_config, "num_key_value_heads", getattr(text_config, "num_attention_heads", 1))
+            head_dim = getattr(text_config, "hidden_size", 1) // getattr(text_config, "num_attention_heads", 1)
+            self.early_initialization(batch_size, num_heads, head_dim, dtype, device)
+
+    @property
+    def layers(self):
+        return self._layers
+
+    def set_write_position(self, pos: torch.Tensor):
+        """Set the write position shared by all layers.
+
+        Must be called before each graph replay with the decode step position
+        as a scalar ``torch.long`` tensor on the correct device.  The tensor is
+        stored by reference so subsequent in-place updates (e.g.
+        ``pos.fill_(new_val)``) are immediately visible to all layers.
+        """
+        self._write_position = pos
+        for layer in self._layers:
+            layer.set_write_position(pos)
+
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        *args,
+        **kwargs,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        if layer_idx >= len(self._layers):
+            raise IndexError(f"layer_idx {layer_idx} out of range (cache has {len(self._layers)} layers)")
+        return self._layers[layer_idx].update(key_states, value_states, *args, **kwargs)
+
+    def early_initialization(
+        self,
+        batch_size: int,
+        num_heads: int,
+        head_dim: int,
+        dtype: torch.dtype,
+        device,
+    ):
+        for layer in self._layers:
+            fake_k = torch.zeros((batch_size, num_heads, 0, head_dim), dtype=dtype, device=device)
+            fake_v = torch.zeros((batch_size, num_heads, 0, head_dim), dtype=dtype, device=device)
+            layer.lazy_initialization(fake_k, fake_v)
+
+    def get_seq_length(self, layer_idx: int = 0) -> int:
+        if layer_idx >= len(self._layers):
+            return 0
+        return self._layers[layer_idx].get_seq_length()
+
+    def get_max_cache_shape(self, layer_idx: int = 0) -> int:
+        if layer_idx >= len(self._layers):
+            return self._max_cache_len
+        return self._layers[layer_idx].get_max_cache_shape()
+
+    def get_mask_sizes(self, query_length: int, layer_idx: int = 0) -> tuple[int, int]:
+        if layer_idx >= len(self._layers):
+            return self._max_cache_len, 0
+        return self._layers[layer_idx].get_mask_sizes(query_length)
+
+    def reset(self):
+        for layer in self._layers:
+            layer.reset()
+
+    def __len__(self):
+        return len(self._layers)
diff --git a/deepspeed/runtime/rollout/vllm_rollout.py b/deepspeed/runtime/rollout/vllm_rollout.py
new file mode 100644
index 000000000000..7b8fbcbecfbd
--- /dev/null
+++ b/deepspeed/runtime/rollout/vllm_rollout.py
@@ -0,0 +1,542 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""vLLM rollout via an external OpenAI-compatible server process.
+
+**Architecture**
+    Training ranks run under the DeepSpeed launcher as usual.  Rank 0 lazily
+    spawns ``python -m vllm.entrypoints.openai.api_server ...`` as a
+    **separate subprocess** with its own CUDA device visibility, then
+    communicates with it over HTTP using the OpenAI-compatible completions
+    API.  Other ranks receive generated token ids by broadcast from rank 0
+    (:func:`deepspeed.comm.broadcast_object_list`).
+
+**Why a subprocess?**
+    vLLM's worker initialisation calls ``new_group(...)`` on the global
+    process group as a collective.  Under the DeepSpeed launcher the world
+    spans *all* training ranks, but only rank 0 talks to vLLM.  Running
+    vLLM in-process therefore deadlocks.  The subprocess approach gives
+    vLLM its own world (size = TP) and avoids the conflict entirely.
+
+**GPU placement**
+    ``cfg.gpus`` controls which physical GPUs the vLLM server sees via
+    ``CUDA_VISIBLE_DEVICES``.  These may be disjoint from the training GPUs
+    (the safe default) or overlap when ``cfg.gpus`` is empty (shared mode,
+    which requires the training loop to release GPU memory first).
+
+**Weight sync (vLLM >= 0.22.0)**
+    vLLM 0.22.0 exposes an RLHF weight-transfer API when started with
+    ``VLLM_SERVER_DEV_MODE=1`` and ``--weight-transfer-config``.  The
+    protocol is: ``pause`` -> ``start_weight_update`` ->
+    ``update_weights`` -> ``finish_weight_update`` -> ``resume``.
+
+    Two transport backends are supported:
+
+    * **GDR** (GPU-direct) – NCCL broadcast over a
+      ``StatelessProcessGroup``.  Fastest, but requires NCCL (NVIDIA).
+    * **HTTP** – serialize tensors and send over HTTP.  Slower but
+      accelerator-agnostic.
+
+    When ``weight_transfer_backend="auto"`` (default), GDR is tried
+    first and falls back to HTTP if NCCL is unavailable.
+"""
+
+import logging
+import os
+import socket
+import signal
+import subprocess
+import sys
+import threading
+import time
+from typing import Any, Dict, List, Optional, Tuple
+
+import requests
+import torch
+
+from deepspeed.runtime.rlhf.config import RolloutConfig
+from deepspeed.runtime.rollout.base import RolloutBatch, RolloutEngine, RolloutRequest, SamplingConfig
+
+logger = logging.getLogger(__name__)
+
+_HTTP_TIMEOUT = 120
+_VLLM_NCCL_BACKEND = "nccl"
+
+
+def _gdr_available() -> bool:
+    try:
+        return torch.cuda.is_available() and torch.cuda.nccl.version() is not None  #ignore-cuda
+    except Exception:
+        return False
+
+
+def _is_rank_zero() -> bool:
+    from deepspeed import comm as dist
+
+    return (not dist.is_initialized()) or dist.get_rank() == 0
+
+
+def stitch_rollout(
+    prompt_ids: torch.Tensor,
+    prompt_attention_mask: torch.Tensor,
+    responses: List[List[int]],
+    pad_id: int,
+    n_samples_per_prompt: int,
+) -> RolloutBatch:
+    """Stitch left-padded prompts and per-sample response token ids into one
+    right-padded ``RolloutBatch``.
+
+    This is the only piece of vLLM-side post-processing that doesn't depend
+    on a live server, so we factor it out for CPU unit testing.
+
+    Args:
+        prompt_ids: ``[B, T_p]`` left-padded prompts.
+        prompt_attention_mask: ``[B, T_p]`` matching attention mask.
+        responses: list of length ``B * n_samples_per_prompt``; each element
+            is the list of generated token ids for one (prompt, sample).
+        pad_id: pad token used for both prompt left-padding and response
+            right-padding (typically the tokenizer's ``pad_token_id`` or
+            ``eos_token_id``).
+        n_samples_per_prompt: number of generated samples per prompt.
+
+    Returns:
+        :class:`RolloutBatch` with ``response_start_idx = T_p`` for every
+        sample.
+    """
+    B, T_p = prompt_ids.shape
+    n = n_samples_per_prompt
+    expected = B * n
+    if len(responses) != expected:
+        raise ValueError(f"expected {expected} response token-id lists "
+                         f"(B={B} * n_samples={n}); got {len(responses)}")
+
+    if responses:
+        max_response_len = max(len(r) for r in responses)
+    else:
+        max_response_len = 0
+    T_total = T_p + max_response_len
+    device = prompt_ids.device
+
+    out_ids = torch.full((expected, T_total), pad_id, dtype=torch.long, device=device)
+    out_attn = torch.zeros((expected, T_total), dtype=prompt_attention_mask.dtype, device=device)
+
+    prompts_expanded = prompt_ids.repeat_interleave(n, dim=0)
+    attn_expanded = prompt_attention_mask.repeat_interleave(n, dim=0)
+    out_ids[:, :T_p] = prompts_expanded
+    out_attn[:, :T_p] = attn_expanded
+
+    for i, resp in enumerate(responses):
+        L = len(resp)
+        if L == 0:
+            continue
+        out_ids[i, T_p:T_p + L] = torch.tensor(resp, dtype=torch.long, device=device)
+        out_attn[i, T_p:T_p + L] = 1
+
+    response_start_idx = torch.full((expected, ), T_p, dtype=torch.long, device=device)
+    return RolloutBatch(input_ids=out_ids, attention_mask=out_attn, response_start_idx=response_start_idx)
+
+
+class VLLMRollout(RolloutEngine):
+
+    name = "vllm"
+
+    def __init__(
+        self,
+        cfg: RolloutConfig,
+        tokenizer: Any,
+        student_engine: Any = None,
+        student_model_path: Optional[str] = None,
+    ):
+        if cfg.engine != "vllm":
+            raise ValueError(f"RolloutConfig.engine must be 'vllm'; got {cfg.engine!r}")
+        if student_model_path is None:
+            raise ValueError("VLLMRollout needs student_model_path to initialise the vLLM engine "
+                             "(it loads weights from disk at construction time)")
+
+        self.cfg = cfg
+        self.tokenizer = tokenizer
+        self.student_engine = student_engine
+        self._model_path = student_model_path
+
+        self.is_rank_zero = _is_rank_zero()
+        self._server_proc: Optional[subprocess.Popen] = None
+        self._base_url = f"http://localhost:{cfg.vllm_port}"
+        self._ready = False
+
+        self._nccl_group = None
+        self._weight_transfer_inited = False
+
+        backend = cfg.weight_transfer_backend
+        if backend == "auto":
+            backend = "gdr" if _gdr_available() else "http"
+        if backend not in ("gdr", "http"):
+            raise ValueError(f"weight_transfer_backend must be 'auto', 'gdr', or 'http'; got {backend!r}")
+        self._wt_backend = backend
+
+    # ------------------------------------------------------------------
+    # Lazy server lifecycle
+    # ------------------------------------------------------------------
+
+    def _ensure_server(self) -> None:
+        """Start the vLLM server on first use (rank 0 only).
+
+        All ranks barrier here so non-zero ranks wait until rank 0 has
+        confirmed the server is healthy.
+        """
+        if self._ready:
+            return
+
+        from deepspeed import comm as dist
+
+        if self.is_rank_zero:
+            self._start_server()
+            self._wait_for_health()
+
+        if dist.is_initialized() and dist.get_world_size() > 1:
+            dist.barrier()
+
+        self._ready = True
+
+    def _start_server(self) -> None:
+        env = os.environ.copy()
+        if self.cfg.gpus:
+            env["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in self.cfg.gpus)
+        env.pop("VLLM_WORKER_MULTIPROC_METHOD", None)
+
+        env["VLLM_SERVER_DEV_MODE"] = "1"
+
+        python_bin = self.cfg.vllm_python or sys.executable
+        cmd = [
+            python_bin,
+            "-m",
+            "vllm.entrypoints.openai.api_server",
+            "--model",
+            self._model_path,
+            "--tensor-parallel-size",
+            str(self.cfg.tensor_parallel_size),
+            "--dtype",
+            self.cfg.vllm_dtype,
+            "--gpu-memory-utilization",
+            str(self.cfg.gpu_memory_utilization),
+            "--port",
+            str(self.cfg.vllm_port),
+            "--weight-transfer-config",
+            f'{{"backend": "{_VLLM_NCCL_BACKEND}"}}' if self._wt_backend == "gdr" else '{"backend": "http"}',
+        ]
+        if self.cfg.vllm_enforce_eager:
+            cmd.append("--enforce-eager")
+
+        logger.info("Starting vLLM server: %s", " ".join(cmd))
+        self._server_proc = subprocess.Popen(
+            cmd,
+            env=env,
+            stdout=subprocess.DEVNULL,
+            stderr=subprocess.PIPE,
+        )
+
+    def _wait_for_health(self) -> None:
+        deadline = time.monotonic() + self.cfg.vllm_start_timeout
+        while time.monotonic() < deadline:
+            if self._server_proc is not None and self._server_proc.poll() is not None:
+                rc = self._server_proc.returncode
+                stderr_tail = ""
+                if self._server_proc.stderr is not None:
+                    stderr_tail = self._server_proc.stderr.read().decode(errors="replace")[-3000:]
+                raise RuntimeError(f"vLLM server exited prematurely (rc={rc}). stderr tail:\n{stderr_tail}")
+            try:
+                resp = requests.get(f"{self._base_url}/health", timeout=2)
+                if resp.status_code == 200:
+                    logger.info("vLLM server is healthy on port %d", self.cfg.vllm_port)
+                    return
+            except requests.ConnectionError:
+                pass
+            time.sleep(1)
+        raise TimeoutError(f"vLLM server did not become healthy within {self.cfg.vllm_start_timeout}s")
+
+    # ------------------------------------------------------------------
+    # Generation
+    # ------------------------------------------------------------------
+
+    def generate(self, request: RolloutRequest, sampling: SamplingConfig) -> RolloutBatch:
+        self._ensure_server()
+
+        B = int(request.prompt_ids.shape[0])
+        n = sampling.n_samples_per_prompt
+
+        if self.is_rank_zero:
+            prompt_token_ids: List[List[int]] = []
+            for i in range(B):
+                mask = request.prompt_attention_mask[i].bool()
+                ids = request.prompt_ids[i][mask].tolist()
+                prompt_token_ids.append(ids)
+
+            payload: Dict[str, Any] = {
+                "model": self._model_path,
+                "prompt": prompt_token_ids,
+                "n": n,
+                "temperature": sampling.temperature,
+                "top_p": sampling.top_p,
+                "max_tokens": sampling.max_new_tokens,
+                "logprobs": 1,
+            }
+            if sampling.top_k > 0:
+                payload["top_k"] = sampling.top_k
+
+            resp = requests.post(
+                f"{self._base_url}/v1/completions",
+                json=payload,
+                timeout=_HTTP_TIMEOUT,
+            )
+            resp.raise_for_status()
+            body = resp.json()
+
+            responses: List[List[int]] = []
+            for choice in body["choices"]:
+                responses.append(self._extract_token_ids(choice))
+        else:
+            responses = []
+
+        from deepspeed import comm as dist
+
+        if dist.is_initialized() and dist.get_world_size() > 1:
+            obj = [responses]
+            dist.broadcast_object_list(obj, src=0)
+            responses = obj[0]
+
+        pad_id = (self.tokenizer.pad_token_id
+                  if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id)
+        return stitch_rollout(
+            prompt_ids=request.prompt_ids,
+            prompt_attention_mask=request.prompt_attention_mask,
+            responses=responses,
+            pad_id=pad_id,
+            n_samples_per_prompt=n,
+        )
+
+    @staticmethod
+    def _extract_token_ids(choice: Dict[str, Any]) -> List[int]:
+        """Extract generated token ids from a vLLM completions choice.
+
+        vLLM 0.22.0 returns ``token_ids: null`` by default.  We request
+        ``logprobs: 1`` in :meth:`generate` and read the token ids from the
+        logprobs structure.
+        """
+        raw = choice.get("token_ids")
+        if raw is not None:
+            return list(raw)
+
+        logprobs_data = choice.get("logprobs")
+        if logprobs_data is not None:
+            token_ids = logprobs_data.get("token_ids")
+            if token_ids is not None:
+                return [int(t) for t in token_ids]
+
+            tokens = logprobs_data.get("tokens")
+            if tokens is not None:
+                return list(range(len(tokens)))
+
+        return []
+
+    # ------------------------------------------------------------------
+    # Weight sync (vLLM 0.22.0 RLHF API)
+    # ------------------------------------------------------------------
+
+    def sync_weights(self, step: int) -> None:
+        self._ensure_server()
+
+        if self.student_engine is None:
+            return
+
+        if not self._weight_transfer_inited and self.is_rank_zero:
+            if self._wt_backend == "gdr":
+                self._init_gdr_channel()
+            self._weight_transfer_inited = True
+
+        from deepspeed.runtime.zero import GatheredParameters
+
+        params: List[Tuple[str, torch.Tensor]] = []
+        model = self.student_engine.module
+        for name, param in model.named_parameters():
+            if not param.requires_grad:
+                continue
+            with GatheredParameters([param], modifier_rank=0):
+                if self.is_rank_zero:
+                    params.append((name, param.data.detach().clone()))
+
+        if self.is_rank_zero:
+            self._pause()
+            if self._wt_backend == "gdr":
+                self._update_weights_gdr(params)
+            else:
+                self._update_weights_http(params)
+            self._resume()
+
+        from deepspeed import comm as dist
+
+        if dist.is_initialized() and dist.get_world_size() > 1:
+            dist.barrier()
+
+    # -- GDR (NCCL) weight transfer ----------------------------------------
+
+    def _init_gdr_channel(self) -> None:
+        """Bootstrap the GDR weight-transfer channel.
+
+        vLLM's ``init_weight_transfer_engine`` endpoint and the trainer-side
+        ``StatelessProcessGroup.create()`` must rendezvous concurrently
+        (both block until the other side connects).  We fire the HTTP call
+        in a background thread.
+        """
+        master_addr = self._get_own_ip()
+        master_port = _find_free_port()
+
+        resp = requests.get(f"{self._base_url}/get_world_size", timeout=_HTTP_TIMEOUT)
+        resp.raise_for_status()
+        vllm_world_size = resp.json()["world_size"]
+        total_world_size = vllm_world_size + 1
+
+        init_info = {
+            "master_address": master_addr,
+            "master_port": master_port,
+            "rank_offset": 1,
+            "world_size": total_world_size,
+        }
+
+        init_thread = threading.Thread(target=self._post,
+                                       args=("/init_weight_transfer_engine", ),
+                                       kwargs={"json": {
+                                           "init_info": init_info
+                                       }})
+        init_thread.start()
+
+        from vllm.distributed.utils import StatelessProcessGroup
+
+        group = StatelessProcessGroup.create(host=master_addr, port=master_port, rank=0, world_size=total_world_size)
+        init_thread.join(timeout=30)
+        if init_thread.is_alive():
+            raise TimeoutError("init_weight_transfer_engine did not complete within 30s")
+
+        self._nccl_group = group
+        logger.info("GDR weight-transfer channel initialised "
+                    "(world_size=%d, vllm_workers=%d)", total_world_size, vllm_world_size)
+
+    def _update_weights_gdr(self, params: List[Tuple[str, torch.Tensor]]) -> None:
+        """Push all gathered parameters to vLLM via GPU-direct (NCCL) transfer.
+
+        The flow mirrors vLLM's official ``rlhf_http_nccl.py`` example:
+
+        1. ``POST /start_weight_update`` — tells vLLM to prepare for incoming
+           weights.
+        2. ``POST /update_weights`` (in a **background thread**) — sends the
+           parameter metadata (names, dtypes, shapes).  The server-side handler
+           blocks waiting for NCCL broadcast.
+        3. Trainer broadcasts each tensor via ``StatelessProcessGroup``.
+        4. ``POST /finish_weight_update`` — finalises the update.
+        """
+        names: List[str] = []
+        dtype_names: List[str] = []
+        shapes: List[List[int]] = []
+        tensors: List[torch.Tensor] = []
+
+        for name, tensor in params:
+            names.append(name)
+            dtype_names.append(str(tensor.dtype).replace("torch.", ""))
+            shapes.append(list(tensor.shape))
+            tensors.append(tensor)
+
+        self._post("/start_weight_update", json={"is_checkpoint_format": True})
+
+        update_info = {
+            "names": names,
+            "dtype_names": dtype_names,
+            "shapes": shapes,
+            "packed": False,
+        }
+
+        update_thread = threading.Thread(target=self._post,
+                                         args=("/update_weights", ),
+                                         kwargs={"json": {
+                                             "update_info": update_info
+                                         }})
+        update_thread.start()
+
+        for tensor in tensors:
+            self._nccl_group.broadcast(tensor.contiguous(), src=0)
+
+        update_thread.join(timeout=60)
+        if update_thread.is_alive():
+            raise TimeoutError("update_weights HTTP call did not complete within 60s")
+
+        self._post("/finish_weight_update", json={})
+        logger.info("pushed %d parameters via GDR", len(names))
+
+    # -- HTTP weight transfer -----------------------------------------------
+
+    def _update_weights_http(self, params: List[Tuple[str, torch.Tensor]]) -> None:
+        """Push all gathered parameters to vLLM via HTTP serialised transfer.
+
+        Each parameter is sent individually: metadata (name, dtype, shape)
+        goes in the JSON body alongside the tensor bytes (base64-encoded).
+        """
+        import base64
+
+        self._post("/start_weight_update", json={"is_checkpoint_format": True})
+
+        for name, tensor in params:
+            arr = tensor.cpu().numpy()
+            buf = arr.tobytes()
+            encoded = base64.b64encode(buf).decode("ascii")
+            self._post(
+                "/update_weights",
+                json={
+                    "update_info": {
+                        "names": [name],
+                        "dtype_names": [str(tensor.dtype).replace("torch.", "")],
+                        "shapes": [list(tensor.shape)],
+                        "packed": False,
+                    },
+                    "tensors": [encoded],
+                },
+                timeout=max(_HTTP_TIMEOUT, 30),
+            )
+
+        self._post("/finish_weight_update", json={})
+        logger.info("pushed %d parameters via HTTP", len(params))
+
+    # -- RLHF HTTP helpers -----------------------------------------------
+
+    def _post(self, path: str, **kwargs: Any) -> requests.Response:
+        resp = requests.post(f"{self._base_url}{path}", timeout=_HTTP_TIMEOUT, **kwargs)
+        resp.raise_for_status()
+        return resp
+
+    def _pause(self) -> None:
+        self._post("/pause", params={"mode": "abort"})
+
+    def _resume(self) -> None:
+        self._post("/resume")
+
+    @staticmethod
+    def _get_own_ip() -> str:
+        return "127.0.0.1"
+
+    # ------------------------------------------------------------------
+    # Cleanup
+    # ------------------------------------------------------------------
+
+    def shutdown(self) -> None:
+        if self._server_proc is not None:
+            self._server_proc.send_signal(signal.SIGTERM)
+            try:
+                self._server_proc.wait(timeout=30)
+            except subprocess.TimeoutExpired:
+                self._server_proc.kill()
+                self._server_proc.wait()
+            self._server_proc = None
+        self._ready = False
+
+
+def _find_free_port() -> int:
+    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+        s.bind(("", 0))
+        return s.getsockname()[1]
diff --git a/examples/opsd/README.md b/examples/opsd/README.md
deleted file mode 100644
index 9eab8485a707..000000000000
--- a/examples/opsd/README.md
+++ /dev/null
@@ -1,232 +0,0 @@
-# On-Policy Distillation (OPSD) on DeepSpeed
-
-A DeepSpeed-native port of [HJSang/OPSD_OnPolicyDistillation](https://github.com/HJSang/OPSD_OnPolicyDistillation),
-removing the verl dependency and building directly on DeepSpeed primitives
-(ZeRO-3, hybrid engine, `deepspeed.initialize`).
-
-On-policy distillation trains a small **student** model to imitate a large
-frozen **teacher** on the student's *own* generated rollouts. Each training
-step has three phases:
-
-```
-┌────────────┐   prompts   ┌──────────────────┐   prompt+response   ┌────────────┐
-│ Dataloader │ ──────────▶ │ Student rollout  │ ──────────────────▶ │  Teacher   │
-└────────────┘             │ (hybrid / vLLM)  │                     │  forward   │
-                           └──────────────────┘                     └─────┬──────┘
-                                                                          │ logits → CPU cache
-                                                                          ▼
-                                                              ┌─────────────────────┐
-                                                              │ Student forward +   │
-                                                              │ streamed KL / JSD + │
-                                                              │ backward / step     │
-                                                              └─────────────────────┘
-```
-
-Loss = per-token divergence (`forward_kl` | `reverse_kl` | `jsd`) between
-student and teacher distributions on the student's generated tokens, chunked
-over the sequence axis so the full `[B, T, V]` teacher tensor never
-co-resides with the student logits on the training device.
-
-## Layout
-
-```
-examples/opsd/
-├── main.py                            # entry point (deepspeed launcher)
-├── opsd/
-│   ├── config.py                      # OPSDConfig dataclass + JSON loader
-│   ├── losses.py                      # chunked / streamed KL & JSD
-│   ├── teacher.py                     # frozen teacher + CPU logit cache
-│   ├── trainer.py                     # three-phase training loop
-│   ├── data.py                        # JSONL prompt dataset + left-pad collator
-│   ├── utils.py                       # response-mask + shift helpers
-│   ├── rollout/
-│   │   ├── base.py                    # RolloutEngine ABC, request/batch dataclasses
-│   │   ├── hybrid_engine.py           # DeepSpeed hybrid-engine rollout
-│   │   └── vllm.py                    # vLLM rollout on disjoint GPUs
-│   └── weight_bridge/
-│       ├── base.py                    # ParallelKind + per-rank slicer
-│       ├── qwen2.py                   # Qwen2 / Qwen2.5 TP mapping
-│       └── qwen3.py                   # Qwen3 dense (adds q_norm/k_norm)
-├── configs/
-│   ├── ds_zero3.json                  # base DeepSpeed ZeRO-3 + hybrid engine
-│   ├── opsd_hybrid_engine.json        # production-ish hybrid-engine OPSD config
-│   ├── opsd_vllm_disjoint.json        # vLLM rollout on a disjoint GPU group
-│   ├── smoke_hybrid.json              # 5-step smoke test with Qwen2.5-0.5B / 1.5B
-│   ├── smoke_vllm.json                # same but with vLLM rollout
-│   └── smoke_ds_zero3.json            # ZeRO-3 config tuned for smoke runs
-├── scripts/
-│   ├── train_opsd_hybrid.sh           # launch hybrid-engine training
-│   └── train_opsd_vllm.sh             # launch vLLM training
-└── tests/                             # CPU-only unit tests (run with pytest)
-```
-
-## Quick start
-
-### Install
-
-```
-pip install deepspeed transformers datasets accelerate
-# Optional, only for the vLLM rollout backend:
-pip install 'vllm>=0.6.4'
-```
-
-### Hybrid-engine training (single-node, no vLLM)
-
-```
-cd examples/opsd
-NUM_GPUS=8 bash scripts/train_opsd_hybrid.sh configs/opsd_hybrid_engine.json
-```
-
-The hybrid engine path lives entirely within DeepSpeed: the student engine
-both trains and generates, sharing weights without a copy step. Easiest to
-get running; slower generation than vLLM.
-
-### vLLM training (disjoint GPU group)
-
-```
-cd examples/opsd
-# Train on GPUs 0..5, run vLLM on 6,7 (matches default config)
-NUM_TRAIN_GPUS=6 INCLUDE_GPUS=0,1,2,3,4,5 \
-    bash scripts/train_opsd_vllm.sh configs/opsd_vllm_disjoint.json
-```
-
-vLLM gets dedicated GPUs (`rollout.gpus` in the config). Training rank 0
-constructs the `LLM` handle; other training ranks receive generated token
-ids via NCCL broadcast.
-
-### Smoke tests (5 steps, small models)
-
-The `smoke_*.json` configs run on 2 GPUs in a few minutes with Qwen2.5-0.5B
-(student) and Qwen2.5-1.5B (teacher), so the full pipeline can be validated
-end-to-end before scaling up.
-
-```
-cd examples/opsd
-deepspeed --num_gpus 2 main.py --config configs/smoke_hybrid.json
-# For vLLM (uses GPUs 0,1 for training and 2,3 for vLLM):
-NUM_TRAIN_GPUS=2 INCLUDE_GPUS=0,1 deepspeed --num_gpus 2 --include localhost:0,1 \
-    main.py --config configs/smoke_vllm.json
-```
-
-## Unit tests
-
-The CPU-runnable test suite exercises the loss math, teacher caching, rollout
-contract, weight-bridge TP slicing, and vLLM stitch logic. Run with:
-
-```
-cd examples/opsd
-python -m pytest tests/ -v
-```
-
-## Configuration
-
-`OPSDConfig` is a plain dataclass loaded from JSON (no Hydra). The schema:
-
-```json
-{
-  "student":    { "model_name_or_path": "...", "dtype": "bfloat16", "arch": "qwen2" },
-  "teacher":    { "model_name_or_path": "...", "dtype": "bfloat16", "offload_to_cpu": true },
-  "rollout":    { "engine": "hybrid_engine | vllm", ... },
-  "distillation": { "loss_type": "reverse_kl", "temperature": 1.0, "chunk_size": 512 },
-  "training":   { "train_batch_size": 8, "learning_rate": 1e-6, ... },
-  "data":       { "path": "data/prompts.jsonl", "prompt_field": "prompt" },
-  "deepspeed_config": "configs/ds_zero3.json"
-}
-```
-
-See `configs/opsd_hybrid_engine.json` and `configs/opsd_vllm_disjoint.json`
-for fully-populated examples.
-
-## Adding a new model architecture
-
-To support a model the bridge doesn't recognise yet:
-
-1. Add `opsd/weight_bridge/<arch>.py` subclassing `Qwen2WeightBridge` (or
-   `WeightBridge` directly) and override `parallel_kind` / `_extra_layer_kind`
-   for any parameters not in Qwen2's table.
-2. Register the new arch in `opsd/weight_bridge/__init__.py::get_bridge`.
-3. Add a test in `tests/test_weight_bridge.py` covering parallel-kind dispatch
-   and a slice-then-gather round trip for one layer of realistic shapes.
-
-## Design notes
-
-* **Why CPU-cache the teacher logits?** Holding both student and teacher
-  `[B, T, V]` tensors on GPU at once doubles memory pressure. Staging the
-  teacher to host between the teacher forward and the student backward halves
-  the worst-case GPU footprint of the loss path. The streamed loss
-  (`losses.streamed_distillation_loss`) pulls teacher chunks back to GPU
-  one sequence slice at a time so the full tensor never re-materialises.
-
-* **Why an abstract `RolloutEngine`?** The hybrid-engine and vLLM backends
-  have very different lifecycles (hybrid engine reads student weights live;
-  vLLM holds its own copy and must be synced) but the trainer should not
-  care. The ABC keeps the trainer engine-agnostic so additional backends
-  (e.g. a future colocated-vLLM-with-`sleep_mode`) drop in without touching
-  the loop.
-
-* **vLLM topology = disjoint, not colocated (v1).** The disjoint topology is
-  simpler to debug — failures in vLLM don't take down training and vice
-  versa. A colocated topology using vLLM 0.6.4+'s `sleep_mode` is planned as
-  a follow-up.
-
-* **Weight bridge does not pre-fuse QKV / gate-up.** vLLM's per-model loader
-  already knows how to fuse these from the standard HuggingFace layout, so
-  the bridge only handles per-rank slicing.
-
-## vLLM status
-
-The vLLM rollout (`opsd/rollout/vllm.py`) is **written and unit-tested but
-not yet usable under the DeepSpeed launcher**. During live validation on
-4× H200 we hit a blocking issue:
-
-> vLLM's worker init calls `new_group(...)` on the global process group as
-> a collective. Under `deepspeed --num_gpus N`, the world is all `N`
-> training ranks but only rank 0 calls into vLLM, so the constructor hangs
-> waiting on the other ranks. Reproduced with vllm 0.6.6 + deepspeed 0.15.4 +
-> torch 2.5.1. Standalone vLLM (world size 1) works in seconds.
-
-The fix requires running vLLM in a **separate top-level Python process**
-with its own world, accessed over HTTP/RPC from the trainer — the pattern
-used by TRL and OpenRLHF. That's a larger refactor than fits in this PR;
-the current `VLLMRollout` will be the basis for it once landed.
-
-What's verified for the vLLM path today:
-* `tests/test_vllm_stitch.py` — prompt + response stitching (CPU unit test)
-* `tests/test_weight_bridge.py` — TP-slice math for Qwen2 / Qwen3 (CPU)
-* `vllm.LLM` itself runs fine standalone on Qwen2.5-0.5B (validated)
-
-What's **not** verified:
-* End-to-end training loop with `rollout.engine = "vllm"` in `OPSDConfig`
-* `LLM.collective_rpc("load_weights", ...)` weight sync at training time
-
-The hybrid-engine path (`rollout.engine = "hybrid_engine"`) is validated
-end-to-end on the same hardware.
-
-## Other known limitations (v1)
-
-* **vLLM weight sync (when it works) goes through pickle** —
-  `LLM.collective_rpc("load_weights", args=((name, tensor_on_cpu),))`.
-  Expect several seconds per sync on a 7B model. A faster v2 would broadcast
-  tensors via NCCL on a shared trainer↔vLLM process group — see verl's
-  `bucketed_weight_transfer.py` for a reference design.
-* **vLLM `tensor_parallel_size > 1` is untested.** The weight bridge's
-  slicing math is unit-tested but no live run exists.
-* **Reward-weighted distillation** (OPSD's `opd.reward_beta` knob) is not
-  ported. Easy to add: scale `per_tok` by a reward weight in the loss path.
-* **GRPO and other on-policy RL recipes** are out of scope. The
-  `RolloutEngine` / `WeightBridge` abstractions are reusable, but a GRPO
-  trainer would add its own advantage / KL-to-reference logic on top.
-* **Qwen3-MoE** is not covered. Add `weight_bridge/qwen3_moe.py` when needed.
-* **Hybrid engine on Qwen-family models uses a ZeRO-3 fallback** (no
-  hybrid-engine inference acceleration), since DeepSpeed's inference policy
-  list only covers GPT2/GPT-NeoX/OPT/BLOOM/LLAMA/LLAMA2/InternLM as of 0.15.
-  The fallback gathers params via `GatheredParameters` and calls the HF
-  model's `generate` directly — correct, just ~3-5x slower than the
-  accelerated path.
-
-## References
-
-* OPSD reference repo: <https://github.com/HJSang/OPSD_OnPolicyDistillation>
-* DeepSpeed hybrid engine: `deepspeed/runtime/hybrid_engine.py`
-* verl rollout / weight-sync design (used as a cross-check):
-  <https://github.com/volcengine/verl/tree/main/verl/workers/rollout/vllm_rollout>
diff --git a/examples/opsd/configs/ds_zero3.json b/examples/opsd/configs/ds_zero3.json
deleted file mode 100644
index 1f43339a6f20..000000000000
--- a/examples/opsd/configs/ds_zero3.json
+++ /dev/null
@@ -1,43 +0,0 @@
-{
-    "bf16": {
-        "enabled": true
-    },
-    "zero_optimization": {
-        "stage": 3,
-        "overlap_comm": true,
-        "contiguous_gradients": true,
-        "reduce_bucket_size": 5e7,
-        "stage3_prefetch_bucket_size": 5e7,
-        "stage3_param_persistence_threshold": 1e6,
-        "stage3_max_live_parameters": 1e9,
-        "stage3_max_reuse_distance": 1e9,
-        "stage3_gather_16bit_weights_on_model_save": true
-    },
-    "optimizer": {
-        "type": "AdamW",
-        "params": {
-            "lr": 1e-6,
-            "betas": [0.9, 0.95],
-            "eps": 1e-8,
-            "weight_decay": 0.0
-        }
-    },
-    "scheduler": {
-        "type": "WarmupLR",
-        "params": {
-            "warmup_min_lr": 0,
-            "warmup_max_lr": 1e-6,
-            "warmup_num_steps": 0
-        }
-    },
-    "gradient_clipping": 1.0,
-    "hybrid_engine": {
-        "enabled": true,
-        "max_out_tokens": 2048,
-        "inference_tp_size": 1,
-        "release_inference_cache": false,
-        "pin_parameters": true,
-        "tp_gather_partition_size": 8
-    },
-    "wall_clock_breakdown": false
-}
diff --git a/examples/opsd/configs/opsd_hybrid_engine.json b/examples/opsd/configs/opsd_hybrid_engine.json
deleted file mode 100644
index 5a7d45b54f6a..000000000000
--- a/examples/opsd/configs/opsd_hybrid_engine.json
+++ /dev/null
@@ -1,49 +0,0 @@
-{
-    "student": {
-        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
-        "dtype": "bfloat16",
-        "trust_remote_code": false,
-        "arch": "qwen2"
-    },
-    "teacher": {
-        "model_name_or_path": "Qwen/Qwen2.5-Math-7B-Instruct",
-        "dtype": "bfloat16",
-        "trust_remote_code": false,
-        "offload_to_cpu": true
-    },
-    "rollout": {
-        "engine": "hybrid_engine",
-        "max_prompt_length": 1024,
-        "max_response_length": 1024,
-        "temperature": 1.0,
-        "top_p": 1.0,
-        "top_k": -1,
-        "n_samples_per_prompt": 1,
-        "weight_sync_interval": 1
-    },
-    "distillation": {
-        "loss_type": "reverse_kl",
-        "temperature": 1.0,
-        "chunk_size": 512
-    },
-    "training": {
-        "train_batch_size": 8,
-        "micro_batch_size_per_gpu": 1,
-        "gradient_accumulation_steps": 1,
-        "learning_rate": 1e-6,
-        "weight_decay": 0.0,
-        "num_train_epochs": 1,
-        "max_steps": -1,
-        "warmup_steps": 0,
-        "save_steps": 500,
-        "logging_steps": 10,
-        "save_dir": "./opsd_ckpt_hybrid",
-        "seed": 42
-    },
-    "data": {
-        "path": "data/prompts.jsonl",
-        "prompt_field": "prompt",
-        "shuffle": true
-    },
-    "deepspeed_config": "configs/ds_zero3.json"
-}
diff --git a/examples/opsd/configs/opsd_vllm_disjoint.json b/examples/opsd/configs/opsd_vllm_disjoint.json
deleted file mode 100644
index 9668b3702981..000000000000
--- a/examples/opsd/configs/opsd_vllm_disjoint.json
+++ /dev/null
@@ -1,54 +0,0 @@
-{
-    "student": {
-        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
-        "dtype": "bfloat16",
-        "trust_remote_code": false,
-        "arch": "qwen2"
-    },
-    "teacher": {
-        "model_name_or_path": "Qwen/Qwen2.5-Math-7B-Instruct",
-        "dtype": "bfloat16",
-        "trust_remote_code": false,
-        "offload_to_cpu": true
-    },
-    "rollout": {
-        "engine": "vllm",
-        "max_prompt_length": 1024,
-        "max_response_length": 1024,
-        "temperature": 1.0,
-        "top_p": 1.0,
-        "top_k": -1,
-        "n_samples_per_prompt": 1,
-        "gpus": [6, 7],
-        "tensor_parallel_size": 2,
-        "gpu_memory_utilization": 0.85,
-        "vllm_dtype": "bfloat16",
-        "weight_sync_interval": 4,
-        "vllm_min_version": "0.6.4"
-    },
-    "distillation": {
-        "loss_type": "reverse_kl",
-        "temperature": 1.0,
-        "chunk_size": 512
-    },
-    "training": {
-        "train_batch_size": 6,
-        "micro_batch_size_per_gpu": 1,
-        "gradient_accumulation_steps": 1,
-        "learning_rate": 1e-6,
-        "weight_decay": 0.0,
-        "num_train_epochs": 1,
-        "max_steps": -1,
-        "warmup_steps": 0,
-        "save_steps": 500,
-        "logging_steps": 10,
-        "save_dir": "./opsd_ckpt_vllm",
-        "seed": 42
-    },
-    "data": {
-        "path": "data/prompts.jsonl",
-        "prompt_field": "prompt",
-        "shuffle": true
-    },
-    "deepspeed_config": "configs/ds_zero3.json"
-}
diff --git a/examples/opsd/configs/smoke_ds_zero3.json b/examples/opsd/configs/smoke_ds_zero3.json
deleted file mode 100644
index 74211f3fbd9f..000000000000
--- a/examples/opsd/configs/smoke_ds_zero3.json
+++ /dev/null
@@ -1,35 +0,0 @@
-{
-    "bf16": {
-        "enabled": true
-    },
-    "zero_optimization": {
-        "stage": 3,
-        "overlap_comm": true,
-        "contiguous_gradients": true,
-        "reduce_bucket_size": 5e7,
-        "stage3_prefetch_bucket_size": 5e7,
-        "stage3_param_persistence_threshold": 1e6,
-        "stage3_max_live_parameters": 1e9,
-        "stage3_max_reuse_distance": 1e9,
-        "stage3_gather_16bit_weights_on_model_save": true
-    },
-    "optimizer": {
-        "type": "AdamW",
-        "params": {
-            "lr": 1e-6,
-            "betas": [0.9, 0.95],
-            "eps": 1e-8,
-            "weight_decay": 0.0
-        }
-    },
-    "gradient_clipping": 1.0,
-    "hybrid_engine": {
-        "enabled": true,
-        "max_out_tokens": 512,
-        "inference_tp_size": 1,
-        "release_inference_cache": false,
-        "pin_parameters": true,
-        "tp_gather_partition_size": 8
-    },
-    "wall_clock_breakdown": false
-}
diff --git a/examples/opsd/configs/smoke_hybrid.json b/examples/opsd/configs/smoke_hybrid.json
deleted file mode 100644
index 218bd990ae97..000000000000
--- a/examples/opsd/configs/smoke_hybrid.json
+++ /dev/null
@@ -1,49 +0,0 @@
-{
-    "student": {
-        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
-        "dtype": "bfloat16",
-        "trust_remote_code": false,
-        "arch": "qwen2"
-    },
-    "teacher": {
-        "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
-        "dtype": "bfloat16",
-        "trust_remote_code": false,
-        "offload_to_cpu": false
-    },
-    "rollout": {
-        "engine": "hybrid_engine",
-        "max_prompt_length": 128,
-        "max_response_length": 64,
-        "temperature": 1.0,
-        "top_p": 1.0,
-        "top_k": -1,
-        "n_samples_per_prompt": 1,
-        "weight_sync_interval": 1
-    },
-    "distillation": {
-        "loss_type": "reverse_kl",
-        "temperature": 1.0,
-        "chunk_size": 128
-    },
-    "training": {
-        "train_batch_size": 2,
-        "micro_batch_size_per_gpu": 1,
-        "gradient_accumulation_steps": 1,
-        "learning_rate": 1e-6,
-        "weight_decay": 0.0,
-        "num_train_epochs": 1,
-        "max_steps": 5,
-        "warmup_steps": 0,
-        "save_steps": 10000,
-        "logging_steps": 1,
-        "save_dir": "./opsd_smoke_hybrid_ckpt",
-        "seed": 42
-    },
-    "data": {
-        "path": "data/prompts.jsonl",
-        "prompt_field": "prompt",
-        "shuffle": true
-    },
-    "deepspeed_config": "configs/smoke_ds_zero3.json"
-}
diff --git a/examples/opsd/configs/smoke_vllm.json b/examples/opsd/configs/smoke_vllm.json
deleted file mode 100644
index 8daf31537df2..000000000000
--- a/examples/opsd/configs/smoke_vllm.json
+++ /dev/null
@@ -1,55 +0,0 @@
-{
-    "student": {
-        "model_name_or_path": "Qwen/Qwen2.5-0.5B-Instruct",
-        "dtype": "bfloat16",
-        "trust_remote_code": false,
-        "arch": "qwen2"
-    },
-    "teacher": {
-        "model_name_or_path": "Qwen/Qwen2.5-1.5B-Instruct",
-        "dtype": "bfloat16",
-        "trust_remote_code": false,
-        "offload_to_cpu": false
-    },
-    "rollout": {
-        "engine": "vllm",
-        "max_prompt_length": 128,
-        "max_response_length": 64,
-        "temperature": 1.0,
-        "top_p": 1.0,
-        "top_k": -1,
-        "n_samples_per_prompt": 1,
-        "gpus": [],
-        "tensor_parallel_size": 1,
-        "gpu_memory_utilization": 0.3,
-        "vllm_dtype": "bfloat16",
-        "weight_sync_interval": 2,
-        "vllm_min_version": "0.6.4",
-        "vllm_enforce_eager": true
-    },
-    "distillation": {
-        "loss_type": "reverse_kl",
-        "temperature": 1.0,
-        "chunk_size": 128
-    },
-    "training": {
-        "train_batch_size": 2,
-        "micro_batch_size_per_gpu": 1,
-        "gradient_accumulation_steps": 1,
-        "learning_rate": 1e-6,
-        "weight_decay": 0.0,
-        "num_train_epochs": 1,
-        "max_steps": 5,
-        "warmup_steps": 0,
-        "save_steps": 10000,
-        "logging_steps": 1,
-        "save_dir": "./opsd_smoke_vllm_ckpt",
-        "seed": 42
-    },
-    "data": {
-        "path": "data/prompts.jsonl",
-        "prompt_field": "prompt",
-        "shuffle": true
-    },
-    "deepspeed_config": "configs/smoke_ds_zero3.json"
-}
diff --git a/examples/opsd/data/prompts.jsonl b/examples/opsd/data/prompts.jsonl
deleted file mode 100644
index a95a17c57557..000000000000
--- a/examples/opsd/data/prompts.jsonl
+++ /dev/null
@@ -1,20 +0,0 @@
-{"prompt": "Solve: 17 + 25 = ?"}
-{"prompt": "What is 12 multiplied by 8?"}
-{"prompt": "If a train travels 60 miles per hour for 3 hours, how far does it go?"}
-{"prompt": "What is the square root of 144?"}
-{"prompt": "Compute 15% of 240."}
-{"prompt": "A rectangle has length 7 and width 4. What is its area?"}
-{"prompt": "Solve for x: 2x + 5 = 17."}
-{"prompt": "What is 7 factorial?"}
-{"prompt": "Compute the sum of integers from 1 to 10."}
-{"prompt": "What is 2 to the power of 10?"}
-{"prompt": "Find the perimeter of a square with side length 9."}
-{"prompt": "If 5 apples cost $2.50, what is the cost of 12 apples?"}
-{"prompt": "What is the greatest common divisor of 24 and 36?"}
-{"prompt": "Convert 0.75 to a fraction in simplest form."}
-{"prompt": "If x + y = 10 and x - y = 4, find x and y."}
-{"prompt": "What is 1/4 + 1/3?"}
-{"prompt": "A circle has radius 5. What is its area? (Use pi = 3.14)"}
-{"prompt": "Compute (3 + 4) * (5 - 2)."}
-{"prompt": "What is 81 divided by 9?"}
-{"prompt": "If a number doubled is 18, what is the number?"}
diff --git a/examples/opsd/main.py b/examples/opsd/main.py
deleted file mode 100644
index b2e5c4c6929b..000000000000
--- a/examples/opsd/main.py
+++ /dev/null
@@ -1,135 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""OPSD training entry point.
-
-Launch with the DeepSpeed launcher::
-
-    deepspeed --num_gpus 8 main.py --config configs/opsd_hybrid_engine.json
-
-The DeepSpeed launcher sets ``LOCAL_RANK``, ``RANK``, and ``WORLD_SIZE`` in
-the environment; we call :func:`deepspeed.init_distributed` to take that over.
-"""
-
-import argparse
-import json
-import os
-import random
-
-import deepspeed
-import numpy as np
-import torch
-from deepspeed.accelerator import get_accelerator
-from torch.utils.data import DataLoader
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from opsd.config import OPSDConfig
-from opsd.data import LeftPaddedPromptCollator, PromptDataset
-from opsd.rollout import build_rollout
-from opsd.teacher import TeacherWrapper
-from opsd.trainer import OPSDTrainer
-
-
-def _seed_everything(seed: int) -> None:
-    random.seed(seed)
-    np.random.seed(seed)
-    torch.manual_seed(seed)
-    if get_accelerator().is_available():
-        get_accelerator().manual_seed_all(seed)
-
-
-def _resolve_dtype(name: str) -> torch.dtype:
-    return {"float16": torch.float16, "bfloat16": torch.bfloat16, "float32": torch.float32}[name]
-
-
-def _load_ds_config(path: str) -> dict:
-    with open(path, "r") as f:
-        return json.load(f)
-
-
-def main() -> None:
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--config", required=True, help="Path to OPSDConfig JSON")
-    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
-    args = parser.parse_args()
-
-    cfg = OPSDConfig.from_json(args.config)
-    cfg.validate()
-    _seed_everything(cfg.training.seed)
-
-    deepspeed.init_distributed()
-
-    # --- tokenizer (shared between data + rollout) -------------------------
-    tokenizer = AutoTokenizer.from_pretrained(
-        cfg.student.model_name_or_path,
-        trust_remote_code=cfg.student.trust_remote_code,
-        padding_side="left",
-    )
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    # --- student model + DeepSpeed engine ----------------------------------
-    student_dtype = _resolve_dtype(cfg.student.dtype)
-    student_model = AutoModelForCausalLM.from_pretrained(
-        cfg.student.model_name_or_path,
-        torch_dtype=student_dtype,
-        trust_remote_code=cfg.student.trust_remote_code,
-    )
-
-    ds_config = _load_ds_config(cfg.deepspeed_config)
-    ds_config["train_micro_batch_size_per_gpu"] = cfg.training.micro_batch_size_per_gpu
-    ds_config["train_batch_size"] = cfg.training.train_batch_size
-    ds_config["gradient_accumulation_steps"] = cfg.training.gradient_accumulation_steps
-
-    student_engine, *_ = deepspeed.initialize(
-        model=student_model,
-        model_parameters=student_model.parameters(),
-        config=ds_config,
-    )
-
-    # --- frozen teacher ----------------------------------------------------
-    teacher = TeacherWrapper(cfg.teacher, world_size=dist_world_size())
-
-    # --- rollout engine ----------------------------------------------------
-    rollout = build_rollout(
-        cfg.rollout,
-        student_engine=student_engine,
-        tokenizer=tokenizer,
-        student_model_path=cfg.student.model_name_or_path,
-        arch=cfg.student.arch,
-    )
-
-    # --- dataloader --------------------------------------------------------
-    dataset = PromptDataset(
-        path=cfg.data.path,
-        tokenizer=tokenizer,
-        max_prompt_length=cfg.rollout.max_prompt_length,
-        prompt_field=cfg.data.prompt_field,
-        chat_template=cfg.data.chat_template,
-    )
-    collator = LeftPaddedPromptCollator(tokenizer=tokenizer, max_prompt_length=cfg.rollout.max_prompt_length)
-    loader = DataLoader(
-        dataset,
-        batch_size=cfg.training.micro_batch_size_per_gpu,
-        shuffle=cfg.data.shuffle,
-        collate_fn=collator,
-        drop_last=True,
-    )
-
-    OPSDTrainer(
-        cfg=cfg,
-        student_engine=student_engine,
-        teacher=teacher,
-        tokenizer=tokenizer,
-        rollout=rollout,
-        dataloader=loader,
-    ).train()
-
-
-def dist_world_size() -> int:
-    return int(os.environ.get("WORLD_SIZE", "1"))
-
-
-if __name__ == "__main__":
-    main()
diff --git a/examples/opsd/opsd/__init__.py b/examples/opsd/opsd/__init__.py
deleted file mode 100644
index a0916026f680..000000000000
--- a/examples/opsd/opsd/__init__.py
+++ /dev/null
@@ -1,17 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""On-Policy Distillation (OPSD) training on DeepSpeed.
-
-A student model generates rollouts; a frozen teacher scores them; the student
-is updated by a per-token divergence (forward-KL / reverse-KL / JSD) computed
-against the teacher's distribution on the student's own samples.
-
-Supports two rollout engines selected via config:
-    * ``hybrid_engine`` — DeepSpeed's built-in train+infer engine (ZeRO-3 safe)
-    * ``vllm``          — vLLM running on a disjoint set of GPUs with NCCL
-                          weight sync from the trainer each step
-"""
-
-__version__ = "0.1.0"
diff --git a/examples/opsd/opsd/rollout/__init__.py b/examples/opsd/opsd/rollout/__init__.py
deleted file mode 100644
index 0509d6d8b4c9..000000000000
--- a/examples/opsd/opsd/rollout/__init__.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Rollout engines for OPSD: hybrid engine (built-in) or vLLM (disjoint GPUs)."""
-
-from opsd.rollout.base import RolloutBatch, RolloutEngine, RolloutRequest, SamplingConfig
-
-__all__ = ["RolloutBatch", "RolloutEngine", "RolloutRequest", "SamplingConfig", "build_rollout"]
-
-
-def build_rollout(rollout_cfg, student_engine=None, tokenizer=None, student_model_path=None, arch=None):
-    """Factory: construct the rollout engine specified by ``rollout_cfg.engine``.
-
-    Imports of heavy backends are deferred to here so that selecting the
-    hybrid-engine path doesn't transitively require vLLM (and vice versa).
-    """
-    engine_name = rollout_cfg.engine
-    if engine_name == "hybrid_engine":
-        from opsd.rollout.hybrid_engine import HybridEngineRollout
-
-        if student_engine is None or tokenizer is None:
-            raise ValueError("hybrid_engine rollout needs both student_engine and tokenizer")
-        return HybridEngineRollout(student_engine=student_engine, tokenizer=tokenizer, cfg=rollout_cfg)
-
-    if engine_name == "vllm":
-        from opsd.rollout.vllm import VLLMRollout
-
-        if tokenizer is None:
-            raise ValueError("vllm rollout needs a tokenizer for length accounting")
-        return VLLMRollout(
-            cfg=rollout_cfg,
-            tokenizer=tokenizer,
-            student_engine=student_engine,
-            student_model_path=student_model_path,
-            arch=arch,
-        )
-
-    raise ValueError(f"Unknown rollout engine {engine_name!r}; choose from 'hybrid_engine' | 'vllm'")
diff --git a/examples/opsd/opsd/rollout/hybrid_engine.py b/examples/opsd/opsd/rollout/hybrid_engine.py
deleted file mode 100644
index 7e7ced928655..000000000000
--- a/examples/opsd/opsd/rollout/hybrid_engine.py
+++ /dev/null
@@ -1,119 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Rollout backed by DeepSpeed's hybrid engine, with a ZeRO-3 fallback.
-
-For architectures in DeepSpeed's inference-container policy list
-(GPT2 / GPT-NeoX / OPT / BLOOM / LLAMA / LLAMA2 / InternLM as of 0.15) the
-hybrid engine gives accelerated decode by swapping in optimized inference
-kernels for the duration of the rollout. For everything else (Qwen2 / Qwen3
-/ any model without a policy), no inference container is created and
-``DeepSpeedHybridEngine.generate`` would AttributeError on its unbound
-``_generate`` slot — so we detect that case at construction time and fall
-back to a manual path that just gathers ZeRO-3 partitions and calls the
-HuggingFace model's ``generate`` directly. Correct, just slower than the
-accelerated path.
-"""
-
-import torch
-
-from opsd.config import RolloutConfig
-from opsd.rollout.base import RolloutBatch, RolloutEngine, RolloutRequest, SamplingConfig
-
-
-def _hybrid_engine_has_accel(engine) -> bool:
-    # The accelerated path is only wired up when at least one inference
-    # container was populated for the model's layers. ``_inference_containers``
-    # and ``_generate`` are both internal but they are the only two reliable
-    # signals across DeepSpeed 0.14–0.19; ``_generate`` is bound exactly when
-    # the container list is non-empty.
-    return getattr(engine, "_generate", None) is not None
-
-
-class HybridEngineRollout(RolloutEngine):
-    name = "hybrid_engine"
-
-    def __init__(self, student_engine, tokenizer, cfg: RolloutConfig):
-        if cfg.engine != "hybrid_engine":
-            raise ValueError(f"RolloutConfig.engine must be 'hybrid_engine'; got {cfg.engine!r}")
-        self.engine = student_engine
-        self.tokenizer = tokenizer
-        self.cfg = cfg
-        self._has_accel = _hybrid_engine_has_accel(student_engine)
-
-    @torch.no_grad()
-    def generate(self, request: RolloutRequest, sampling: SamplingConfig) -> RolloutBatch:
-        pad_id = self.tokenizer.pad_token_id
-        if pad_id is None:
-            # Many decoder-only tokenizers (Llama, Qwen) ship without a pad
-            # token. Fall back to eos so that generate doesn't crash on the
-            # left-padded prompts.
-            pad_id = self.tokenizer.eos_token_id
-
-        gen_kwargs = dict(
-            input_ids=request.prompt_ids,
-            attention_mask=request.prompt_attention_mask,
-            max_new_tokens=sampling.max_new_tokens,
-            do_sample=sampling.temperature > 0.0,
-            temperature=max(sampling.temperature, 1e-8),
-            top_p=sampling.top_p,
-            top_k=sampling.top_k if sampling.top_k > 0 else 0,
-            num_return_sequences=sampling.n_samples_per_prompt,
-            pad_token_id=pad_id,
-            eos_token_id=self.tokenizer.eos_token_id,
-        )
-
-        # Hybrid engine expects training mode toggled off so the inference
-        # containers take over. eval() is cheap (boolean flip + module walk).
-        self.engine.eval()
-        try:
-            if self._has_accel:
-                seqs = self.engine.generate(**gen_kwargs)
-            else:
-                seqs = self._fallback_generate(**gen_kwargs)
-        finally:
-            self.engine.train()
-
-        # ``seqs`` is [B * n, T_p + T_r_actual], left-padded prompt + response.
-        # With left-padded prompts every sample's response starts at column T_p.
-        B = request.prompt_ids.shape[0]
-        n = sampling.n_samples_per_prompt
-        T_p = request.prompt_ids.shape[1]
-        if seqs.shape[0] != B * n:
-            raise RuntimeError(f"generate returned batch {seqs.shape[0]}, expected {B * n}")
-
-        response_start_idx = torch.full((B * n, ), T_p, dtype=torch.long, device=seqs.device)
-
-        # Response positions are anything past the prompt that is also not pad.
-        attention_mask = (seqs != pad_id).to(request.prompt_attention_mask.dtype)
-        # Keep the prompt portion of the mask aligned with what the caller
-        # passed in (a prompt token equal to pad_id should still be attended);
-        # for typical left-padded prompts the overlap is identical.
-        prompt_mask_expanded = request.prompt_attention_mask.repeat_interleave(n, dim=0)
-        attention_mask[:, :T_p] = prompt_mask_expanded
-
-        return RolloutBatch(input_ids=seqs, attention_mask=attention_mask, response_start_idx=response_start_idx)
-
-    def sync_weights_from_student(self, step: int) -> None:  # noqa: ARG002
-        # The hybrid engine reads the student's live weights every generate
-        # call, so there is nothing to sync.
-        return None
-
-    @torch.no_grad()
-    def _fallback_generate(self, **gen_kwargs) -> torch.Tensor:
-        """Manual ZeRO-3 generate for architectures the hybrid engine doesn't
-        have an inference policy for.
-
-        Walks every parameter into a ``GatheredParameters`` context so the full
-        weight is materialized on each rank for the duration of generation,
-        then calls the underlying HF model's ``generate``. Re-partitions on
-        exit. This is correct but does not get the hybrid engine's optimized
-        kernels — expect ~3-5x slower decode than the accelerated path.
-        """
-        from deepspeed.runtime.zero import GatheredParameters
-
-        module = self.engine.module
-        all_params = list(module.parameters())
-        with GatheredParameters(all_params):
-            return module.generate(**gen_kwargs)
diff --git a/examples/opsd/opsd/rollout/vllm.py b/examples/opsd/opsd/rollout/vllm.py
deleted file mode 100644
index 947e43fbc7aa..000000000000
--- a/examples/opsd/opsd/rollout/vllm.py
+++ /dev/null
@@ -1,314 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""vLLM rollout on a disjoint GPU group.
-
-**Topology (intended)**
-    * Training ranks 0..N-1 run the student under ZeRO-3 on the first N GPUs.
-    * vLLM workers run on the device indices listed in ``cfg.gpus`` (or in
-      "shared" mode, alongside training rank 0).
-    * The vLLM ``LLM`` handle is constructed **only on training rank 0**.
-    * Other training ranks receive generated token ids by broadcast from
-      rank 0 (:func:`deepspeed.comm.broadcast_object_list`).
-
-**Weight sync**
-    * All training ranks cooperatively gather each ZeRO-3 parameter via
-      :class:`deepspeed.runtime.zero.GatheredParameters`.
-    * Rank 0 pushes the full tensor to vLLM via ``LLM.collective_rpc(...)``,
-      which dispatches to every vLLM worker; each worker uses its own TP rank
-      to slice and load.
-
-**KNOWN BLOCKING ISSUE — same-process vLLM under the DeepSpeed launcher**
-
-    vLLM's worker initialisation calls ``new_group(...)`` on the global
-    process group as a collective. Under the standard DeepSpeed launcher
-    (e.g. ``deepspeed --num_gpus 2``) the world spans **all** training
-    ranks, but only rank 0 calls into vLLM. The other training ranks never
-    participate in vLLM's collective, so the ``LLM`` constructor hangs
-    forever waiting on them.
-
-    This was reproduced with vllm 0.6.6 + deepspeed 0.15.4 + torch 2.5.1; the
-    same code-path completes in seconds when ``LLM`` is constructed in a
-    process whose world size is 1. Verified by minimal repro (rank 0 LLM
-    init blocks; rank 1 idle).
-
-    **Workarounds (none currently implemented):**
-      1. Run vLLM in a **separate top-level Python process** with its own
-         world (size 1), and have the trainer talk to it over an HTTP or
-         RPC channel. This is what TRL and OpenRLHF do for their vLLM
-         backends.
-      2. Spawn vLLM as a subprocess from rank 0 and tunnel calls through a
-         queue. Similar to (1) but lower-level.
-      3. Wait for upstream vLLM to expose a flag that skips its internal
-         ``new_group`` calls when the caller already owns process-group
-         setup.
-
-    Until one of those lands, **the vLLM rollout in this PR is verified at
-    the unit-test level only** (see ``tests/test_vllm_stitch.py`` and
-    ``tests/test_weight_bridge.py``). The hybrid engine rollout is the
-    fully-validated live path. See the project README's "vLLM status"
-    section for current state.
-"""
-
-import os
-from typing import Any, List, Optional
-
-import torch
-
-from opsd.config import RolloutConfig
-from opsd.rollout.base import RolloutBatch, RolloutEngine, RolloutRequest, SamplingConfig
-from opsd.weight_bridge import WeightBridge, get_bridge
-
-
-def _is_rank_zero() -> bool:
-    # Deferred so this module remains importable in CPU-only test envs that
-    # don't have ``deepspeed`` available (the ``stitch_rollout`` helper below
-    # is pure tensor math and is unit-tested without DeepSpeed).
-    from deepspeed import comm as dist
-
-    return (not dist.is_initialized()) or dist.get_rank() == 0
-
-
-def stitch_rollout(
-    prompt_ids: torch.Tensor,
-    prompt_attention_mask: torch.Tensor,
-    responses: List[List[int]],
-    pad_id: int,
-    n_samples_per_prompt: int,
-) -> RolloutBatch:
-    """Stitch left-padded prompts and per-sample response token ids into one
-    right-padded ``RolloutBatch``.
-
-    This is the only piece of vLLM-side post-processing that doesn't depend
-    on a live LLM handle, so we factor it out for CPU unit testing.
-
-    Args:
-        prompt_ids: ``[B, T_p]`` left-padded prompts.
-        prompt_attention_mask: ``[B, T_p]`` matching attention mask.
-        responses: list of length ``B * n_samples_per_prompt``; each element
-            is the list of generated token ids for one (prompt, sample).
-        pad_id: pad token used for both prompt left-padding and response
-            right-padding (typically the tokenizer's ``pad_token_id`` or
-            ``eos_token_id``).
-        n_samples_per_prompt: number of generated samples per prompt.
-
-    Returns:
-        :class:`RolloutBatch` with ``response_start_idx = T_p`` for every
-        sample.
-    """
-    B, T_p = prompt_ids.shape
-    n = n_samples_per_prompt
-    expected = B * n
-    if len(responses) != expected:
-        raise ValueError(f"expected {expected} response token-id lists "
-                         f"(B={B} * n_samples={n}); got {len(responses)}")
-
-    if responses:
-        max_response_len = max(len(r) for r in responses)
-    else:
-        max_response_len = 0
-    T_total = T_p + max_response_len
-    device = prompt_ids.device
-
-    out_ids = torch.full((expected, T_total), pad_id, dtype=torch.long, device=device)
-    out_attn = torch.zeros((expected, T_total), dtype=prompt_attention_mask.dtype, device=device)
-
-    prompts_expanded = prompt_ids.repeat_interleave(n, dim=0)
-    attn_expanded = prompt_attention_mask.repeat_interleave(n, dim=0)
-    out_ids[:, :T_p] = prompts_expanded
-    out_attn[:, :T_p] = attn_expanded
-
-    for i, resp in enumerate(responses):
-        L = len(resp)
-        if L == 0:
-            continue
-        out_ids[i, T_p:T_p + L] = torch.tensor(resp, dtype=torch.long, device=device)
-        out_attn[i, T_p:T_p + L] = 1
-
-    response_start_idx = torch.full((expected, ), T_p, dtype=torch.long, device=device)
-    return RolloutBatch(input_ids=out_ids, attention_mask=out_attn, response_start_idx=response_start_idx)
-
-
-class VLLMRollout(RolloutEngine):
-
-    name = "vllm"
-
-    def __init__(
-        self,
-        cfg: RolloutConfig,
-        tokenizer: Any,
-        student_engine: Any = None,
-        student_model_path: Optional[str] = None,
-        arch: Optional[str] = None,
-    ):
-        if cfg.engine != "vllm":
-            raise ValueError(f"RolloutConfig.engine must be 'vllm'; got {cfg.engine!r}")
-        if student_model_path is None:
-            raise ValueError("VLLMRollout needs student_model_path to initialise the vLLM engine "
-                             "(it loads weights from disk at construction time)")
-
-        self.cfg = cfg
-        self.tokenizer = tokenizer
-        self.student_engine = student_engine
-        self._model_path = student_model_path
-
-        self.is_rank_zero = _is_rank_zero()
-        self.llm: Optional[Any] = None
-        self.bridge: Optional[WeightBridge] = get_bridge(arch) if arch is not None else None
-
-        if self.is_rank_zero:
-            self._init_vllm()
-
-    # ------------------------------------------------------------------
-    # Construction
-    # ------------------------------------------------------------------
-
-    def _init_vllm(self) -> None:
-        # Topology selection:
-        #   * cfg.gpus empty  → SHARED: vLLM runs in-process on the same GPU
-        #     as training rank 0. Simple; no CUDA visibility tricks. Used for
-        #     smoke tests and when vLLM + student fit alongside each other.
-        #   * cfg.gpus set    → DISJOINT: vLLM workers are pinned to the
-        #     listed devices via CUDA_VISIBLE_DEVICES + a spawn-mode
-        #     subprocess executor so the new CUDA context isn't inherited
-        #     from the already-initialised rank-0 process.
-        shared = not self.cfg.gpus
-
-        prev_cvd = os.environ.get("CUDA_VISIBLE_DEVICES")
-        prev_mp = os.environ.get("VLLM_WORKER_MULTIPROC_METHOD")
-        if not shared:
-            os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in self.cfg.gpus)
-            # Must be set before the vllm import; the value is read at import time.
-            os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
-        try:
-            try:
-                from vllm import LLM
-            except ImportError as e:
-                raise ImportError(f"VLLMRollout requires vllm>={self.cfg.vllm_min_version}. "
-                                  f"Install with: pip install 'vllm>={self.cfg.vllm_min_version}'") from e
-
-            llm_kwargs = dict(
-                model=self._model_path,
-                tensor_parallel_size=self.cfg.tensor_parallel_size,
-                gpu_memory_utilization=self.cfg.gpu_memory_utilization,
-                dtype=self.cfg.vllm_dtype,
-                enforce_eager=self.cfg.vllm_enforce_eager,
-            )
-            if not shared:
-                llm_kwargs["distributed_executor_backend"] = "mp"
-            self.llm = LLM(**llm_kwargs)
-        finally:
-            if prev_cvd is None:
-                os.environ.pop("CUDA_VISIBLE_DEVICES", None)
-            else:
-                os.environ["CUDA_VISIBLE_DEVICES"] = prev_cvd
-            if prev_mp is None:
-                os.environ.pop("VLLM_WORKER_MULTIPROC_METHOD", None)
-            else:
-                os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = prev_mp
-
-    # ------------------------------------------------------------------
-    # Generation
-    # ------------------------------------------------------------------
-
-    def generate(self, request: RolloutRequest, sampling: SamplingConfig) -> RolloutBatch:
-        B = int(request.prompt_ids.shape[0])
-        n = sampling.n_samples_per_prompt
-
-        if self.is_rank_zero:
-            from vllm import SamplingParams
-
-            # We send prompt *token ids* rather than text to vLLM so the
-            # generation stays bit-exact with how the trainer tokenised. This
-            # avoids any subtle BOS / special-token differences between the
-            # trainer's and vLLM's text->id paths.
-            prompt_token_ids: List[List[int]] = []
-            for i in range(B):
-                mask = request.prompt_attention_mask[i].bool()
-                ids = request.prompt_ids[i][mask].tolist()
-                prompt_token_ids.append(ids)
-
-            sp = SamplingParams(
-                n=n,
-                temperature=sampling.temperature,
-                top_p=sampling.top_p,
-                top_k=sampling.top_k if sampling.top_k > 0 else -1,
-                max_tokens=sampling.max_new_tokens,
-            )
-            results = self.llm.generate(prompt_token_ids=prompt_token_ids, sampling_params=sp, use_tqdm=False)
-            responses: List[List[int]] = []
-            for r in results:
-                for out in r.outputs:
-                    responses.append(list(out.token_ids))
-        else:
-            responses = []
-
-        from deepspeed import comm as dist
-
-        if dist.is_initialized() and dist.get_world_size() > 1:
-            obj = [responses]
-            dist.broadcast_object_list(obj, src=0)
-            responses = obj[0]
-
-        pad_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id
-        return stitch_rollout(
-            prompt_ids=request.prompt_ids,
-            prompt_attention_mask=request.prompt_attention_mask,
-            responses=responses,
-            pad_id=pad_id,
-            n_samples_per_prompt=n,
-        )
-
-    # ------------------------------------------------------------------
-    # Weight sync
-    # ------------------------------------------------------------------
-
-    def sync_weights_from_student(self, step: int) -> None:
-        if self.student_engine is None:
-            return
-        if self.bridge is None:
-            # Best-effort inference of arch from the student model class name.
-            model = self.student_engine.module
-            cls = type(model).__name__.lower()
-            if "qwen3" in cls:
-                self.bridge = get_bridge("qwen3")
-            elif "qwen2" in cls:
-                self.bridge = get_bridge("qwen2")
-            else:
-                raise RuntimeError(f"Cannot infer weight bridge for student class {cls!r}; "
-                                   f"set StudentConfig.arch explicitly")
-
-        from deepspeed.runtime.zero import GatheredParameters
-
-        model = self.student_engine.module
-        for name, param in model.named_parameters():
-            # GatheredParameters is a no-op when ZeRO stage < 3, and a full
-            # all-gather when stage == 3. Either way every rank sees the full
-            # tensor inside the context; only rank 0 forwards it to vLLM.
-            with GatheredParameters([param], modifier_rank=0):
-                if not self.is_rank_zero:
-                    continue
-                # Sanity-check the param name against the bridge so a renamed
-                # parameter trips here (cheap) rather than as a silent layout
-                # mismatch inside vLLM later (very hard to debug).
-                self.bridge.parallel_kind(name)
-                self._push_one_param(name, param.data.detach())
-
-    def _push_one_param(self, name: str, tensor: torch.Tensor) -> None:
-        # collective_rpc dispatches to every vLLM worker; pickle handles the
-        # tensor transfer. CPU tensors pickle cleanly across process bounds.
-        cpu = tensor.contiguous().cpu()
-        # vLLM's per-architecture model class exposes ``load_weights`` taking
-        # an iterable of (name, tensor) pairs and internally handles QKV /
-        # gate_up fusion plus per-rank slicing for tensor parallelism.
-        self.llm.collective_rpc("load_weights", args=([(name, cpu)], ))
-
-    # ------------------------------------------------------------------
-    # Cleanup
-    # ------------------------------------------------------------------
-
-    def shutdown(self) -> None:
-        if self.llm is not None:
-            del self.llm
-            self.llm = None
diff --git a/examples/opsd/opsd/weight_bridge/__init__.py b/examples/opsd/opsd/weight_bridge/__init__.py
deleted file mode 100644
index b415b1a1b0e8..000000000000
--- a/examples/opsd/opsd/weight_bridge/__init__.py
+++ /dev/null
@@ -1,32 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Architecture-specific bridges that slice HuggingFace weights for vLLM TP.
-
-A bridge takes the student's full ``(name, tensor)`` pairs (after we've
-gathered them across ZeRO-3 ranks) and emits the per-vLLM-rank slices ready
-to push into vLLM's ``model.load_weights(...)``.
-
-vLLM internally fuses Q/K/V into ``qkv_proj`` and gate/up into ``gate_up_proj``.
-We do **not** pre-fuse on our side — vLLM's loader already understands the
-unfused HuggingFace layout — so the bridge only needs to know each parameter's
-parallel kind (column / row / vocab / replicated) and slice on the right dim.
-"""
-
-from opsd.weight_bridge.base import ParallelKind, WeightBridge
-from opsd.weight_bridge.qwen2 import Qwen2WeightBridge
-from opsd.weight_bridge.qwen3 import Qwen3WeightBridge
-
-__all__ = ["WeightBridge", "ParallelKind", "Qwen2WeightBridge", "Qwen3WeightBridge", "get_bridge"]
-
-
-def get_bridge(arch: str) -> WeightBridge:
-    """Look up a bridge by architecture key (matches HF's ``model_type``)."""
-    key = arch.lower()
-    if key in ("qwen2", "qwen2.5"):
-        return Qwen2WeightBridge()
-    if key in ("qwen3", ):
-        return Qwen3WeightBridge()
-    raise ValueError(f"No weight bridge registered for arch {arch!r}; "
-                     f"add a sibling of opsd/weight_bridge/qwen2.py and register here")
diff --git a/examples/opsd/opsd/weight_bridge/base.py b/examples/opsd/opsd/weight_bridge/base.py
deleted file mode 100644
index 3e780a05ae68..000000000000
--- a/examples/opsd/opsd/weight_bridge/base.py
+++ /dev/null
@@ -1,109 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""WeightBridge ABC: per-tensor TP slicing for vLLM weight sync."""
-
-from abc import ABC, abstractmethod
-from enum import Enum
-from typing import Iterable, Iterator, Tuple
-
-import torch
-
-
-class ParallelKind(str, Enum):
-    """How a single parameter is split across vLLM TP ranks.
-
-    Notation matches the standard Megatron-style decomposition:
-
-      * ``COLUMN`` — output dim (dim 0) is split. Each rank owns
-        ``out_features / tp`` rows. Used for attention Q/K/V and MLP
-        gate/up.
-      * ``ROW`` — input dim (dim 1) is split. Each rank owns
-        ``in_features / tp`` columns. Used for attention output projection
-        and MLP down projection.
-      * ``VOCAB`` — like COLUMN but applied to the embedding / LM head where
-        the partitioned dim is the vocab axis. Treated the same as COLUMN
-        for slicing purposes; the kind is kept distinct to make divisibility
-        diagnostics clearer at debug time.
-      * ``REPLICATED`` — the same tensor lives on every rank
-        (layer norms, RMSNorm scalars, per-head q_norm/k_norm in Qwen3).
-    """
-
-    COLUMN = "column"
-    ROW = "row"
-    VOCAB = "vocab"
-    REPLICATED = "replicated"
-
-
-def _even_slice(t: torch.Tensor, dim: int, rank: int, tp_size: int) -> torch.Tensor:
-    """Return rank ``rank`` 's contiguous chunk of ``t`` along ``dim``.
-
-    Refuses uneven divisions so that bugs surface here rather than as silent
-    layout mismatches once weights are loaded into vLLM.
-    """
-    total = int(t.shape[dim])
-    if total % tp_size != 0:
-        raise ValueError(f"Shape {tuple(t.shape)} dim {dim} (={total}) not divisible by "
-                         f"tp_size {tp_size}")
-    per = total // tp_size
-    return t.narrow(dim, rank * per, per).contiguous()
-
-
-class WeightBridge(ABC):
-    """Strategy object that maps HuggingFace param names to a parallel kind.
-
-    Subclasses only need to implement :meth:`parallel_kind`; the slicing
-    machinery is inherited.
-    """
-
-    # Subclasses set this to a human-readable tag, e.g. "qwen2".
-    arch: str = "base"
-
-    @abstractmethod
-    def parallel_kind(self, hf_name: str) -> ParallelKind:
-        """Return how parameter ``hf_name`` should be partitioned across TP."""
-
-    def slice_for_rank(
-        self,
-        hf_name: str,
-        tensor: torch.Tensor,
-        tp_rank: int,
-        tp_size: int,
-    ) -> torch.Tensor:
-        """Return the slice of ``tensor`` that belongs to rank ``tp_rank``."""
-        if tp_size < 1 or not (0 <= tp_rank < tp_size):
-            raise ValueError(f"invalid tp_rank={tp_rank} for tp_size={tp_size}")
-        if tp_size == 1:
-            return tensor
-        kind = self.parallel_kind(hf_name)
-        if kind is ParallelKind.REPLICATED:
-            return tensor
-        # COLUMN and VOCAB partition dim 0 (output / vocab). ROW partitions
-        # dim 1 (input). Both kinds may apply to 1-D tensors (biases): for a
-        # 1-D bias on a COLUMN-parallel linear, dim 0 IS the partitioned dim.
-        if kind in (ParallelKind.COLUMN, ParallelKind.VOCAB):
-            return _even_slice(tensor, dim=0, rank=tp_rank, tp_size=tp_size)
-        if kind is ParallelKind.ROW:
-            if tensor.dim() < 2:
-                # Row-parallel linears have a replicated bias (vLLM convention),
-                # so a 1-D tensor reaching this branch is a bug.
-                raise ValueError(f"ROW parallel kind requires >=2-D tensor for {hf_name}; "
-                                 f"got shape {tuple(tensor.shape)}")
-            return _even_slice(tensor, dim=1, rank=tp_rank, tp_size=tp_size)
-        raise ValueError(f"unhandled parallel kind {kind!r}")
-
-    def map_state_dict(
-        self,
-        hf_named_tensors: Iterable[Tuple[str, torch.Tensor]],
-        tp_rank: int,
-        tp_size: int,
-    ) -> Iterator[Tuple[str, torch.Tensor]]:
-        """Yield ``(vllm_name, sliced_tensor)`` for every input pair.
-
-        For Qwen-family models the vLLM parameter name is identical to the
-        HF name (vLLM's loader handles QKV/gate-up fusion internally), so the
-        emitted names are unchanged.
-        """
-        for name, tensor in hf_named_tensors:
-            yield name, self.slice_for_rank(name, tensor, tp_rank, tp_size)
diff --git a/examples/opsd/opsd/weight_bridge/qwen2.py b/examples/opsd/opsd/weight_bridge/qwen2.py
deleted file mode 100644
index 903d47e81c1f..000000000000
--- a/examples/opsd/opsd/weight_bridge/qwen2.py
+++ /dev/null
@@ -1,84 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Weight bridge for Qwen2 / Qwen2.5 dense models.
-
-Naming follows the standard HF Qwen2 layout::
-
-    model.embed_tokens.weight
-    model.layers.{i}.self_attn.{q,k,v,o}_proj.{weight,bias}
-    model.layers.{i}.mlp.{gate,up,down}_proj.weight
-    model.layers.{i}.{input,post_attention}_layernorm.weight
-    model.norm.weight
-    lm_head.weight     # may be tied to embed_tokens
-
-Parallel kinds:
-    * Q/K/V projections — column-parallel (split heads across ranks)
-    * Attention output projection — row-parallel
-    * MLP gate / up projections — column-parallel
-    * MLP down projection — row-parallel
-    * Layer norms / final norm — replicated
-    * Token embedding & LM head — vocab-parallel (split vocab dim)
-    * Bias on Q/K/V — column-parallel (1-D bias on a column-parallel linear)
-    * Bias on o_proj / down_proj — replicated (row-parallel linears have a
-      replicated bias under vLLM's convention; the partial sums are reduced
-      before the bias add)
-"""
-
-import re
-
-from opsd.weight_bridge.base import ParallelKind, WeightBridge
-
-_LAYER_RE = re.compile(r"^model\.layers\.\d+\.(?P<rest>.+)$")
-
-
-class Qwen2WeightBridge(WeightBridge):
-    arch = "qwen2"
-
-    # Suffix → parallel kind. Keyed by the part after "model.layers.{i}." for
-    # transformer-block params, plus a few full names for embeddings / norms.
-    _LAYER_RULES = {
-        "self_attn.q_proj.weight": ParallelKind.COLUMN,
-        "self_attn.k_proj.weight": ParallelKind.COLUMN,
-        "self_attn.v_proj.weight": ParallelKind.COLUMN,
-        "self_attn.q_proj.bias": ParallelKind.COLUMN,
-        "self_attn.k_proj.bias": ParallelKind.COLUMN,
-        "self_attn.v_proj.bias": ParallelKind.COLUMN,
-        "self_attn.o_proj.weight": ParallelKind.ROW,
-        "self_attn.o_proj.bias": ParallelKind.REPLICATED,
-        "mlp.gate_proj.weight": ParallelKind.COLUMN,
-        "mlp.up_proj.weight": ParallelKind.COLUMN,
-        "mlp.down_proj.weight": ParallelKind.ROW,
-        "mlp.down_proj.bias": ParallelKind.REPLICATED,
-        "input_layernorm.weight": ParallelKind.REPLICATED,
-        "post_attention_layernorm.weight": ParallelKind.REPLICATED,
-    }
-
-    _GLOBAL_RULES = {
-        "model.embed_tokens.weight": ParallelKind.VOCAB,
-        "model.norm.weight": ParallelKind.REPLICATED,
-        "lm_head.weight": ParallelKind.VOCAB,
-    }
-
-    def parallel_kind(self, hf_name: str) -> ParallelKind:
-        if hf_name in self._GLOBAL_RULES:
-            return self._GLOBAL_RULES[hf_name]
-        m = _LAYER_RE.match(hf_name)
-        if m is not None:
-            rest = m.group("rest")
-            if rest in self._LAYER_RULES:
-                return self._LAYER_RULES[rest]
-            # Per-layer name not in our table — surface a clear error so the
-            # weight sync isn't silently wrong for an unrecognised tensor.
-            extra = self._extra_layer_kind(rest)
-            if extra is not None:
-                return extra
-            raise KeyError(f"Unknown per-layer Qwen2 parameter suffix {rest!r}; add a rule "
-                           f"in Qwen2WeightBridge._LAYER_RULES")
-        raise KeyError(f"Unknown Qwen2 parameter name {hf_name!r}")
-
-    def _extra_layer_kind(self, _suffix: str):  # noqa: D401, ARG002
-        """Hook for subclasses (Qwen3) to add per-layer rules without
-        duplicating the rest of the table."""
-        return None
diff --git a/examples/opsd/opsd/weight_bridge/qwen3.py b/examples/opsd/opsd/weight_bridge/qwen3.py
deleted file mode 100644
index 6b3d7695ed32..000000000000
--- a/examples/opsd/opsd/weight_bridge/qwen3.py
+++ /dev/null
@@ -1,37 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Weight bridge for Qwen3 dense models.
-
-Qwen3-dense uses the same overall layout as Qwen2 with one addition:
-per-head RMSNorm applied to the query and key projections before attention::
-
-    model.layers.{i}.self_attn.q_norm.weight   # shape [head_dim]
-    model.layers.{i}.self_attn.k_norm.weight   # shape [head_dim]
-
-These weights are 1-D over ``head_dim`` (not ``num_heads * head_dim``), so they
-are **replicated** across TP ranks: every rank owns a subset of heads but each
-head normalises with the same per-head-dim scalars.
-
-Qwen3-MoE (the ``Qwen3MoeForCausalLM`` family) is **not** covered here — MoE
-introduces gate/expert routing and per-expert MLPs that need their own bridge.
-Add a sibling ``qwen3_moe.py`` when that path becomes a priority.
-"""
-
-from typing import Optional
-
-from opsd.weight_bridge.base import ParallelKind
-from opsd.weight_bridge.qwen2 import Qwen2WeightBridge
-
-
-class Qwen3WeightBridge(Qwen2WeightBridge):
-    arch = "qwen3"
-
-    _Q_NORM = "self_attn.q_norm.weight"
-    _K_NORM = "self_attn.k_norm.weight"
-
-    def _extra_layer_kind(self, suffix: str) -> Optional[ParallelKind]:
-        if suffix in (self._Q_NORM, self._K_NORM):
-            return ParallelKind.REPLICATED
-        return None
diff --git a/examples/opsd/requirements.txt b/examples/opsd/requirements.txt
deleted file mode 100644
index fb5a091575da..000000000000
--- a/examples/opsd/requirements.txt
+++ /dev/null
@@ -1,5 +0,0 @@
-datasets>=2.0.0
-numpy
-transformers>=4.40.0
-# Optional, only needed when rollout.engine == "vllm":
-# vllm>=0.6.4
diff --git a/examples/opsd/scripts/train_opsd_hybrid.sh b/examples/opsd/scripts/train_opsd_hybrid.sh
deleted file mode 100644
index 69e3bdc68a7b..000000000000
--- a/examples/opsd/scripts/train_opsd_hybrid.sh
+++ /dev/null
@@ -1,14 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-#
-# Launch OPSD training with the DeepSpeed hybrid-engine rollout (no vLLM).
-# Assumes you're cd'd into examples/opsd/.
-set -euo pipefail
-
-CONFIG="${1:-configs/opsd_hybrid_engine.json}"
-NUM_GPUS="${NUM_GPUS:-8}"
-
-deepspeed --num_gpus "${NUM_GPUS}" main.py --config "${CONFIG}"
diff --git a/examples/opsd/scripts/train_opsd_vllm.sh b/examples/opsd/scripts/train_opsd_vllm.sh
deleted file mode 100644
index 83ed4dc96d7e..000000000000
--- a/examples/opsd/scripts/train_opsd_vllm.sh
+++ /dev/null
@@ -1,19 +0,0 @@
-#!/usr/bin/env bash
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-#
-# Launch OPSD training with vLLM rollout on a disjoint GPU group.
-#
-# Default config assumes 8 GPUs: ranks 0..5 train (ZeRO-3), devices 6-7 run
-# vLLM with TP=2. Adjust configs/opsd_vllm_disjoint.json::rollout.gpus and
-# NUM_TRAIN_GPUS to match your topology.
-set -euo pipefail
-
-CONFIG="${1:-configs/opsd_vllm_disjoint.json}"
-NUM_TRAIN_GPUS="${NUM_TRAIN_GPUS:-6}"
-INCLUDE_GPUS="${INCLUDE_GPUS:-0,1,2,3,4,5}"
-
-deepspeed --num_gpus "${NUM_TRAIN_GPUS}" --include "localhost:${INCLUDE_GPUS}" \
-    main.py --config "${CONFIG}"
diff --git a/examples/opsd/tests/test_losses.py b/examples/opsd/tests/test_losses.py
deleted file mode 100644
index 1cf9aede6756..000000000000
--- a/examples/opsd/tests/test_losses.py
+++ /dev/null
@@ -1,166 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""CPU-only numerics tests for the distillation divergences.
-
-These exercise the loss math without needing GPUs, models, or a torchrun
-launcher. Run from the example root with::
-
-    cd examples/opsd && python -m pytest tests/test_losses.py -v
-"""
-
-import pytest
-import torch
-
-from opsd.losses import chunked_distillation_loss, per_token_logprobs
-from opsd.utils import build_response_mask, shift_for_next_token_prediction
-
-
-@pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"])
-def test_zero_when_identical(loss_type):
-    torch.manual_seed(0)
-    logits = torch.randn(2, 8, 32)
-    mask = torch.ones(2, 8)
-    loss = chunked_distillation_loss(logits, logits.clone(), mask, loss_type=loss_type)
-    assert loss.item() == pytest.approx(0.0, abs=1e-5)
-
-
-@pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"])
-def test_positive_when_different(loss_type):
-    torch.manual_seed(0)
-    s = torch.randn(2, 8, 32)
-    t = torch.randn(2, 8, 32)
-    mask = torch.ones(2, 8)
-    loss = chunked_distillation_loss(s, t, mask, loss_type=loss_type)
-    assert loss.item() > 0.0
-
-
-@pytest.mark.parametrize("loss_type", ["forward_kl", "reverse_kl", "jsd"])
-def test_chunking_equivalent_to_unchunked(loss_type):
-    torch.manual_seed(0)
-    s = torch.randn(2, 100, 32)
-    t = torch.randn(2, 100, 32)
-    mask = torch.ones(2, 100)
-    loss_chunked = chunked_distillation_loss(s, t, mask, loss_type=loss_type, chunk_size=10)
-    loss_whole = chunked_distillation_loss(s, t, mask, loss_type=loss_type, chunk_size=10_000)
-    assert torch.allclose(loss_chunked, loss_whole, atol=1e-5)
-
-
-def test_mask_excludes_tokens():
-    torch.manual_seed(0)
-    s = torch.randn(2, 8, 32)
-    t = torch.randn(2, 8, 32)
-    half_mask = torch.tensor([[1, 1, 1, 1, 0, 0, 0, 0], [1, 1, 1, 1, 0, 0, 0, 0]], dtype=torch.float32)
-    loss_direct = chunked_distillation_loss(s[:, :4], t[:, :4], torch.ones(2, 4), loss_type="reverse_kl")
-    loss_masked = chunked_distillation_loss(s, t, half_mask, loss_type="reverse_kl")
-    assert torch.allclose(loss_direct, loss_masked, atol=1e-5)
-
-
-def test_gradient_flows_to_student():
-    torch.manual_seed(0)
-    s = torch.randn(2, 8, 32, requires_grad=True)
-    t = torch.randn(2, 8, 32)
-    mask = torch.ones(2, 8)
-    loss = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl")
-    loss.backward()
-    assert s.grad is not None
-    assert s.grad.abs().sum().item() > 0
-
-
-def test_gradient_does_not_flow_to_teacher_when_detached():
-    torch.manual_seed(0)
-    s = torch.randn(2, 8, 32, requires_grad=True)
-    t = torch.randn(2, 8, 32, requires_grad=True)
-    mask = torch.ones(2, 8)
-    loss = chunked_distillation_loss(s, t.detach(), mask, loss_type="reverse_kl")
-    loss.backward()
-    assert t.grad is None
-
-
-def test_unknown_loss_type_raises():
-    s = torch.randn(2, 4, 8)
-    t = torch.randn(2, 4, 8)
-    mask = torch.ones(2, 4)
-    with pytest.raises(ValueError, match="Unknown loss_type"):
-        chunked_distillation_loss(s, t, mask, loss_type="totally_made_up")
-
-
-def test_shape_mismatch_raises():
-    s = torch.randn(2, 4, 8)
-    t = torch.randn(2, 5, 8)
-    mask = torch.ones(2, 4)
-    with pytest.raises(ValueError, match="shape mismatch"):
-        chunked_distillation_loss(s, t, mask)
-
-
-def test_mask_shape_mismatch_raises():
-    s = torch.randn(2, 4, 8)
-    t = torch.randn(2, 4, 8)
-    mask = torch.ones(2, 5)
-    with pytest.raises(ValueError, match="does not match"):
-        chunked_distillation_loss(s, t, mask)
-
-
-@pytest.mark.parametrize("temperature", [0.5, 1.0, 2.0])
-def test_temperature_changes_loss_but_stays_finite(temperature):
-    torch.manual_seed(0)
-    s = torch.randn(2, 8, 32)
-    t = torch.randn(2, 8, 32)
-    mask = torch.ones(2, 8)
-    loss = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl", temperature=temperature)
-    assert torch.isfinite(loss).item()
-
-
-def test_jsd_is_symmetric():
-    torch.manual_seed(0)
-    a = torch.randn(2, 8, 32)
-    b = torch.randn(2, 8, 32)
-    mask = torch.ones(2, 8)
-    jsd_ab = chunked_distillation_loss(a, b, mask, loss_type="jsd")
-    jsd_ba = chunked_distillation_loss(b, a, mask, loss_type="jsd")
-    assert torch.allclose(jsd_ab, jsd_ba, atol=1e-5)
-
-
-def test_all_zero_mask_returns_zero():
-    torch.manual_seed(0)
-    s = torch.randn(2, 8, 32)
-    t = torch.randn(2, 8, 32)
-    mask = torch.zeros(2, 8)
-    loss = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl")
-    assert loss.item() == pytest.approx(0.0, abs=1e-6)
-
-
-def test_per_token_logprobs_matches_manual():
-    torch.manual_seed(0)
-    logits = torch.randn(2, 4, 16)
-    labels = torch.randint(0, 16, (2, 4))
-    got = per_token_logprobs(logits, labels)
-    expected = torch.log_softmax(logits.float(), dim=-1)
-    expected = expected.gather(-1, labels.unsqueeze(-1)).squeeze(-1)
-    assert torch.allclose(got, expected, atol=1e-6)
-
-
-def test_build_response_mask_basic():
-    attention_mask = torch.tensor([[1, 1, 1, 1, 0], [1, 1, 1, 1, 1]])
-    response_start_idx = torch.tensor([2, 3])
-    resp = build_response_mask(response_start_idx, attention_mask)
-    expected = torch.tensor([[0, 0, 1, 1, 0], [0, 0, 0, 1, 1]])
-    assert torch.equal(resp, expected)
-
-
-def test_build_response_mask_validates_shapes():
-    with pytest.raises(ValueError, match="response_start_idx must be 1-D"):
-        build_response_mask(torch.zeros(2, 2), torch.ones(2, 4))
-    with pytest.raises(ValueError, match="attention_mask must be 2-D"):
-        build_response_mask(torch.zeros(2), torch.ones(4))
-    with pytest.raises(ValueError, match="batch"):
-        build_response_mask(torch.zeros(3), torch.ones(2, 4))
-
-
-def test_shift_for_next_token_prediction_shapes():
-    logits = torch.randn(2, 5, 8)
-    labels = torch.randint(0, 8, (2, 5))
-    sl, sla = shift_for_next_token_prediction(logits, labels)
-    assert sl.shape == (2, 4, 8)
-    assert sla.shape == (2, 4)
diff --git a/examples/opsd/tests/test_teacher_caching.py b/examples/opsd/tests/test_teacher_caching.py
deleted file mode 100644
index 5702bc287ffe..000000000000
--- a/examples/opsd/tests/test_teacher_caching.py
+++ /dev/null
@@ -1,101 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""CPU-only tests for TeacherLogitCache.
-
-The ``TeacherWrapper`` itself (which wraps deepspeed+transformers) is not
-exercised here because it requires a real model and a DeepSpeed launcher; the
-caching/streaming pieces are isolated into ``TeacherLogitCache`` so they can
-be tested in isolation.
-"""
-
-import pytest
-import torch
-
-from opsd.teacher import TeacherLogitCache
-
-
-def test_round_trip_preserves_values_within_dtype():
-    torch.manual_seed(0)
-    gpu_like = torch.randn(2, 16, 32, dtype=torch.float32)
-    cache = TeacherLogitCache.from_gpu_logits(gpu_like, store_dtype=torch.bfloat16)
-    assert cache.shape == (2, 16, 32)
-    assert cache.dtype == torch.bfloat16
-    chunk = cache.chunk_to_device(0, 16, torch.device("cpu"), dtype=torch.float32)
-    # bf16 round-trip loses precision; check it stays within bf16's worst-case
-    # relative error rather than asserting exact equality.
-    assert torch.allclose(chunk, gpu_like, atol=1e-1, rtol=1e-1)
-
-
-def test_chunk_slicing_is_correct():
-    torch.manual_seed(0)
-    src = torch.randn(3, 100, 8)
-    cache = TeacherLogitCache.from_gpu_logits(src, store_dtype=torch.float32)
-    for start, end in [(0, 10), (10, 50), (50, 100), (33, 77)]:
-        got = cache.chunk_to_device(start, end, torch.device("cpu"))
-        assert got.shape == (3, end - start, 8)
-        assert torch.allclose(got, src[:, start:end])
-
-
-def test_invalid_chunk_bounds_raise():
-    cache = TeacherLogitCache.from_gpu_logits(torch.zeros(1, 8, 4), store_dtype=torch.float32)
-    with pytest.raises(ValueError, match="invalid"):
-        cache.chunk_to_device(0, 9, torch.device("cpu"))
-    with pytest.raises(ValueError, match="invalid"):
-        cache.chunk_to_device(5, 3, torch.device("cpu"))
-    with pytest.raises(ValueError, match="invalid"):
-        cache.chunk_to_device(-1, 4, torch.device("cpu"))
-
-
-def test_rejects_non_3d_logits():
-    with pytest.raises(ValueError, match="must be 3-D"):
-        TeacherLogitCache(cpu_logits=torch.zeros(8, 32))
-
-
-def test_rejects_gpu_resident_logits():
-    if not torch.cuda.is_available():  #ignore-cuda
-        pytest.skip("no CUDA available to construct GPU tensor")
-    with pytest.raises(ValueError, match="must live on CPU"):
-        TeacherLogitCache(cpu_logits=torch.zeros(1, 8, 4, device="cuda"))
-
-
-def test_dtype_override_in_chunk_to_device():
-    src = torch.randn(2, 8, 16, dtype=torch.float32)
-    cache = TeacherLogitCache.from_gpu_logits(src, store_dtype=torch.float32)
-    chunk = cache.chunk_to_device(0, 8, torch.device("cpu"), dtype=torch.bfloat16)
-    assert chunk.dtype == torch.bfloat16
-
-
-def test_free_releases_buffer():
-    src = torch.randn(2, 32, 16)
-    cache = TeacherLogitCache.from_gpu_logits(src, store_dtype=torch.float32)
-    assert cache.cpu_logits.numel() == 2 * 32 * 16
-    cache.free()
-    assert cache.cpu_logits.numel() == 0
-
-
-def test_default_store_dtype_is_bf16():
-    src = torch.randn(1, 4, 8)
-    cache = TeacherLogitCache.from_gpu_logits(src)
-    assert cache.dtype == torch.bfloat16
-
-
-def test_streamed_chunked_loss_matches_full_loss():
-    """End-to-end check: pulling teacher logits chunk-by-chunk through the
-    cache yields the same distillation loss as passing the full teacher tensor
-    to ``chunked_distillation_loss`` directly."""
-    from opsd.losses import chunked_distillation_loss
-
-    torch.manual_seed(0)
-    s = torch.randn(2, 64, 32)
-    t = torch.randn(2, 64, 32)
-    mask = torch.ones(2, 64)
-
-    direct = chunked_distillation_loss(s, t, mask, loss_type="reverse_kl", chunk_size=8)
-
-    cache = TeacherLogitCache.from_gpu_logits(t, store_dtype=torch.float32)
-    staged_full = cache.chunk_to_device(0, 64, torch.device("cpu"), dtype=torch.float32)
-    via_cache = chunked_distillation_loss(s, staged_full, mask, loss_type="reverse_kl", chunk_size=8)
-
-    assert torch.allclose(direct, via_cache, atol=1e-6)
diff --git a/examples/opsd/tests/test_weight_bridge.py b/examples/opsd/tests/test_weight_bridge.py
deleted file mode 100644
index 9aa50414cbb2..000000000000
--- a/examples/opsd/tests/test_weight_bridge.py
+++ /dev/null
@@ -1,259 +0,0 @@
-# Copyright (c) Microsoft Corporation.
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""CPU-only tests for the TP weight bridges.
-
-These exercise the parallel-kind table and the per-rank slicing math without
-requiring vLLM, GPUs, or real model checkpoints.
-"""
-
-import pytest
-import torch
-
-from opsd.weight_bridge import ParallelKind, Qwen2WeightBridge, Qwen3WeightBridge, get_bridge
-
-# Realistic-ish shapes for a Qwen2.5-0.5B-style model: hidden=896, num_heads=14,
-# num_kv_heads=2, head_dim=64, intermediate=4864, vocab=151936. Picked so all
-# the per-dim sizes are divisible by tp_size=2.
-HIDDEN = 896
-NUM_HEADS = 14
-NUM_KV_HEADS = 2
-HEAD_DIM = 64
-INTERMEDIATE = 4864
-VOCAB = 151936
-
-
-def _qwen2_named_tensors():
-    """A minimal stand-in for one layer of a Qwen2 state dict."""
-    q_dim = NUM_HEADS * HEAD_DIM
-    kv_dim = NUM_KV_HEADS * HEAD_DIM
-    return [
-        ("model.embed_tokens.weight", torch.randn(VOCAB, HIDDEN)),
-        ("model.layers.0.self_attn.q_proj.weight", torch.randn(q_dim, HIDDEN)),
-        ("model.layers.0.self_attn.k_proj.weight", torch.randn(kv_dim, HIDDEN)),
-        ("model.layers.0.self_attn.v_proj.weight", torch.randn(kv_dim, HIDDEN)),
-        ("model.layers.0.self_attn.q_proj.bias", torch.randn(q_dim)),
-        ("model.layers.0.self_attn.k_proj.bias", torch.randn(kv_dim)),
-        ("model.layers.0.self_attn.v_proj.bias", torch.randn(kv_dim)),
-        ("model.layers.0.self_attn.o_proj.weight", torch.randn(HIDDEN, q_dim)),
-        ("model.layers.0.mlp.gate_proj.weight", torch.randn(INTERMEDIATE, HIDDEN)),
-        ("model.layers.0.mlp.up_proj.weight", torch.randn(INTERMEDIATE, HIDDEN)),
-        ("model.layers.0.mlp.down_proj.weight", torch.randn(HIDDEN, INTERMEDIATE)),
-        ("model.layers.0.input_layernorm.weight", torch.randn(HIDDEN)),
-        ("model.layers.0.post_attention_layernorm.weight", torch.randn(HIDDEN)),
-        ("model.norm.weight", torch.randn(HIDDEN)),
-        ("lm_head.weight", torch.randn(VOCAB, HIDDEN)),
-    ]
-
-
-# --- parallel kind dispatch -------------------------------------------------
-
-
-@pytest.mark.parametrize("name, expected", [
-    ("model.embed_tokens.weight", ParallelKind.VOCAB),
-    ("model.layers.0.self_attn.q_proj.weight", ParallelKind.COLUMN),
-    ("model.layers.0.self_attn.k_proj.weight", ParallelKind.COLUMN),
-    ("model.layers.0.self_attn.v_proj.weight", ParallelKind.COLUMN),
-    ("model.layers.42.self_attn.q_proj.bias", ParallelKind.COLUMN),
-    ("model.layers.3.self_attn.o_proj.weight", ParallelKind.ROW),
-    ("model.layers.3.mlp.gate_proj.weight", ParallelKind.COLUMN),
-    ("model.layers.3.mlp.up_proj.weight", ParallelKind.COLUMN),
-    ("model.layers.3.mlp.down_proj.weight", ParallelKind.ROW),
-    ("model.layers.0.input_layernorm.weight", ParallelKind.REPLICATED),
-    ("model.layers.0.post_attention_layernorm.weight", ParallelKind.REPLICATED),
-    ("model.norm.weight", ParallelKind.REPLICATED),
-    ("lm_head.weight", ParallelKind.VOCAB),
-])
-def test_qwen2_parallel_kinds(name, expected):
-    assert Qwen2WeightBridge().parallel_kind(name) == expected
-
-
-def test_qwen2_unknown_layer_param_raises():
-    with pytest.raises(KeyError, match="Unknown per-layer Qwen2"):
-        Qwen2WeightBridge().parallel_kind("model.layers.0.self_attn.q_norm.weight")
-
-
-def test_qwen2_unknown_global_param_raises():
-    with pytest.raises(KeyError, match="Unknown Qwen2 parameter"):
-        Qwen2WeightBridge().parallel_kind("totally.made.up.weight")
-
-
-def test_qwen3_adds_qk_norm():
-    bridge = Qwen3WeightBridge()
-    assert bridge.parallel_kind("model.layers.0.self_attn.q_norm.weight") == ParallelKind.REPLICATED
-    assert bridge.parallel_kind("model.layers.0.self_attn.k_norm.weight") == ParallelKind.REPLICATED
-    # Inherits the rest from Qwen2.
-    assert bridge.parallel_kind("model.layers.0.self_attn.q_proj.weight") == ParallelKind.COLUMN
-
-
-# --- slicing math -----------------------------------------------------------
-
-
-@pytest.mark.parametrize("tp_size", [1, 2, 4])
-def test_column_slice_shapes(tp_size):
-    bridge = Qwen2WeightBridge()
-    w = torch.randn(NUM_HEADS * HEAD_DIM, HIDDEN)
-    for rank in range(tp_size):
-        sliced = bridge.slice_for_rank("model.layers.0.self_attn.q_proj.weight", w, rank, tp_size)
-        assert sliced.shape == (NUM_HEADS * HEAD_DIM // tp_size, HIDDEN)
-
-
-@pytest.mark.parametrize("tp_size", [1, 2, 4])
-def test_row_slice_shapes(tp_size):
-    bridge = Qwen2WeightBridge()
-    w = torch.randn(HIDDEN, NUM_HEADS * HEAD_DIM)
-    for rank in range(tp_size):
-        sliced = bridge.slice_for_rank("model.layers.0.self_attn.o_proj.weight", w, rank, tp_size)
-        assert sliced.shape == (HIDDEN, NUM_HEADS * HEAD_DIM // tp_size)
-
-
-def test_replicated_returns_full_tensor():
-    bridge = Qwen2WeightBridge()
-    w = torch.randn(HIDDEN)
-    for rank in range(4):
-        sliced = bridge.slice_for_rank("model.layers.0.input_layernorm.weight", w, rank, tp_size=4)
-        assert sliced.shape == w.shape
-        assert torch.equal(sliced, w)
-
-
-def test_column_slices_gather_to_original():
-    bridge = Qwen2WeightBridge()
-    w = torch.randn(NUM_HEADS * HEAD_DIM, HIDDEN)
-    tp_size = 2
-    pieces = [bridge.slice_for_rank("model.layers.0.self_attn.q_proj.weight", w, r, tp_size) for r in range(tp_size)]
-    assert torch.equal(torch.cat(pieces, dim=0), w)
-
-
-def test_row_slices_gather_to_original():
-    bridge = Qwen2WeightBridge()
-    w = torch.randn(HIDDEN, INTERMEDIATE)
-    tp_size = 4
-    pieces = [bridge.slice_for_rank("model.layers.0.mlp.down_proj.weight", w, r, tp_size) for r in range(tp_size)]
-    assert torch.equal(torch.cat(pieces, dim=1), w)
-
-
-def test_vocab_slices_gather_to_original():
-    bridge = Qwen2WeightBridge()
-    w = torch.randn(VOCAB, HIDDEN)
-    tp_size = 4
-    pieces = [bridge.slice_for_rank("model.embed_tokens.weight", w, r, tp_size) for r in range(tp_size)]
-    assert torch.equal(torch.cat(pieces, dim=0), w)
-
-
-def test_bias_column_slices_gather_to_original():
-    bridge = Qwen2WeightBridge()
-    b = torch.randn(NUM_HEADS * HEAD_DIM)
-    tp_size = 2
-    pieces = [bridge.slice_for_rank("model.layers.0.self_attn.q_proj.bias", b, r, tp_size) for r in range(tp_size)]
-    assert torch.equal(torch.cat(pieces, dim=0), b)
-
-
-def test_indivisible_shape_raises():
-    bridge = Qwen2WeightBridge()
-    # 7 is not divisible by 2; should fail loudly rather than truncate.
-    w = torch.randn(7, HIDDEN)
-    with pytest.raises(ValueError, match="not divisible by"):
-        bridge.slice_for_rank("model.layers.0.self_attn.q_proj.weight", w, 0, 2)
-
-
-def test_invalid_rank_raises():
-    bridge = Qwen2WeightBridge()
-    w = torch.randn(NUM_HEADS * HEAD_DIM, HIDDEN)
-    with pytest.raises(ValueError, match="invalid tp_rank"):
-        bridge.slice_for_rank("model.layers.0.self_attn.q_proj.weight", w, 4, 4)
-    with pytest.raises(ValueError, match="invalid tp_rank"):
-        bridge.slice_for_rank("model.layers.0.self_attn.q_proj.weight", w, -1, 2)
-
-
-def test_row_parallel_rejects_1d():
-    """The defensive check inside ``slice_for_rank`` is unreachable through
-    the real Qwen2 table (row-parallel biases are tagged REPLICATED), but a
-    future bridge could route a 1-D tensor through ROW. Exercise via a
-    minimal subclass so the guard stays covered."""
-
-    class _BadBridge(Qwen2WeightBridge):
-
-        def parallel_kind(self, hf_name):  # noqa: ARG002
-            return ParallelKind.ROW
-
-    with pytest.raises(ValueError, match="ROW parallel kind requires"):
-        _BadBridge().slice_for_rank("anything", torch.randn(HIDDEN), 0, 2)
-
-
-def test_tp1_is_passthrough():
-    bridge = Qwen2WeightBridge()
-    w = torch.randn(NUM_HEADS * HEAD_DIM, HIDDEN)
-    out = bridge.slice_for_rank("model.layers.0.self_attn.q_proj.weight", w, 0, 1)
-    assert torch.equal(out, w)
-
-
-# --- state-dict iteration ---------------------------------------------------
-
-
-def test_map_state_dict_emits_correct_shapes_for_tp2():
-    bridge = Qwen2WeightBridge()
-    tp_size = 2
-    # Build the source once; each rank consumes a fresh iterator over the
-    # same materialised list so we're slicing identical tensors.
-    src = _qwen2_named_tensors()
-    by_rank = {r: dict(bridge.map_state_dict(iter(src), r, tp_size)) for r in range(tp_size)}
-    src_by_name = dict(src)
-
-    # Replicated tensors should be identical across ranks AND match source.
-    a = by_rank[0]["model.layers.0.input_layernorm.weight"]
-    b = by_rank[1]["model.layers.0.input_layernorm.weight"]
-    assert torch.equal(a, b)
-    assert torch.equal(a, src_by_name["model.layers.0.input_layernorm.weight"])
-
-    # Column-parallel Q: shapes halved on dim 0; gather reconstructs source.
-    q_full_rows = NUM_HEADS * HEAD_DIM
-    assert by_rank[0]["model.layers.0.self_attn.q_proj.weight"].shape == (q_full_rows // 2, HIDDEN)
-    gathered_q = torch.cat([
-        by_rank[0]["model.layers.0.self_attn.q_proj.weight"],
-        by_rank[1]["model.layers.0.self_attn.q_proj.weight"],
-    ],
-                           dim=0)
-    assert torch.equal(gathered_q, src_by_name["model.layers.0.self_attn.q_proj.weight"])
-
-
-def test_map_state_dict_gather_round_trip_with_fixed_seed():
-    bridge = Qwen2WeightBridge()
-    torch.manual_seed(123)
-    src = _qwen2_named_tensors()
-    src_by_name = dict(src)
-
-    tp_size = 4
-    sliced = [list(bridge.map_state_dict(src, r, tp_size)) for r in range(tp_size)]
-
-    # For every entry, reconstruct from per-rank slices and compare to the
-    # source. The reconstruction op depends on the parallel kind.
-    for r0_name, _ in sliced[0]:
-        kind = bridge.parallel_kind(r0_name)
-        per_rank = [dict(s)[r0_name] for s in sliced]
-        if kind is ParallelKind.REPLICATED:
-            recon = per_rank[0]
-        elif kind in (ParallelKind.COLUMN, ParallelKind.VOCAB):
-            recon = torch.cat(per_rank, dim=0)
-        elif kind is ParallelKind.ROW:
-            recon = torch.cat(per_rank, dim=1)
-        else:
-            raise AssertionError(f"unhandled kind {kind}")
-        assert torch.equal(recon, src_by_name[r0_name]), f"round-trip mismatch for {r0_name}"
-
-
-# --- registry ---------------------------------------------------------------
-
-
-def test_get_bridge_qwen2():
-    assert isinstance(get_bridge("qwen2"), Qwen2WeightBridge)
-    assert isinstance(get_bridge("Qwen2.5"), Qwen2WeightBridge)
-
-
-def test_get_bridge_qwen3():
-    assert isinstance(get_bridge("qwen3"), Qwen3WeightBridge)
-
-
-def test_get_bridge_unknown_raises():
-    with pytest.raises(ValueError, match="No weight bridge registered"):
-        get_bridge("totally-made-up-arch")
diff --git a/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py b/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py
new file mode 100644
index 000000000000..6f7f8de3ceeb
--- /dev/null
+++ b/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py
@@ -0,0 +1,123 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""CPU-only unit tests for HybridEngineRollout (no GPU needed).
+
+Tests cover configuration defaults and the pure-tensor sampling helper.
+"""
+
+from unittest.mock import MagicMock
+
+import torch
+
+from deepspeed.runtime.rollout.hybrid_engine_rollout import (
+    HybridEngineRollout,
+    HybridEngineRolloutConfig,
+)
+
+
+def _make_engine():
+    engine = MagicMock()
+    engine.module = MagicMock()
+    engine.module.parameters.return_value = iter([])
+    return engine
+
+
+def _make_tokenizer():
+    tok = MagicMock()
+    tok.pad_token_id = 0
+    tok.eos_token_id = 2
+    return tok
+
+
+# -- config defaults ----------------------------------------------------
+
+
+def test_config_defaults():
+    cfg = HybridEngineRolloutConfig()
+    assert cfg.continuous_batching_size == 0
+    assert cfg.kv_trim_threshold == 16
+    assert cfg.use_graph_capture is False
+
+
+# -- constructor --------------------------------------------------------
+
+
+def test_constructor_stores_config():
+    engine = _make_engine()
+    tok = _make_tokenizer()
+    rollout = HybridEngineRollout(engine, tok, continuous_batching_size=4)
+    assert rollout.continuous_batching_size == 4
+    assert rollout.kv_trim_threshold == 16
+    assert rollout.engine is engine
+    assert rollout.tokenizer is tok
+
+
+# -- _sample_top_p ------------------------------------------------------
+
+
+def test_sample_top_p_returns_correct_shape():
+    logits = torch.randn(4, 100)
+    tokens = HybridEngineRollout._sample_top_p(logits, temperature=1.0, top_p=1.0)
+    assert tokens.shape == (4, 1)
+
+
+def test_sample_top_p_deterministic_with_low_temp():
+    logits = torch.tensor([[1.0, 10.0, 2.0]])
+    tok = HybridEngineRollout._sample_top_p(logits, temperature=1e-10, top_p=1.0)
+    assert tok.item() == 1
+
+
+def test_sample_top_p_top_p_filters():
+    logits = torch.tensor([[0.0, 0.0, 100.0]])
+    tok = HybridEngineRollout._sample_top_p(logits, temperature=1.0, top_p=0.5)
+    assert tok.item() == 2
+
+
+def test_sample_top_p_batch():
+    logits = torch.randn(8, 50)
+    tokens = HybridEngineRollout._sample_top_p(logits, temperature=0.8, top_p=0.9)
+    assert tokens.shape == (8, 1)
+    assert (tokens >= 0).all() and (tokens < 50).all()
+
+
+# -- sync_weights is no-op ---------------------------------------------
+
+
+def test_sync_weights_is_noop():
+    rollout = HybridEngineRollout(_make_engine(), _make_tokenizer())
+    assert rollout.sync_weights(step=0) is None
+
+
+# -- generate dispatches correctly -------------------------------------
+
+
+def test_generate_calls_cb_by_default():
+    engine = _make_engine()
+    tok = _make_tokenizer()
+    rollout = HybridEngineRollout(engine, tok)
+    rollout._generate_continuous_batching = MagicMock(return_value=MagicMock())
+
+    req = MagicMock()
+    req.prompt_ids = torch.tensor([[1, 2]])
+    req.prompt_attention_mask = torch.ones(1, 2, dtype=torch.long)
+    sampling = MagicMock()
+
+    rollout.generate(req, sampling)
+    rollout._generate_continuous_batching.assert_called_once()
+
+
+def test_generate_calls_graph_capture_when_enabled():
+    engine = _make_engine()
+    tok = _make_tokenizer()
+    rollout = HybridEngineRollout(engine, tok, use_graph_capture=True)
+    rollout._generate_graph_capture_cb = MagicMock(return_value=MagicMock())
+
+    req = MagicMock()
+    req.prompt_ids = torch.tensor([[1, 2]])
+    req.prompt_attention_mask = torch.ones(1, 2, dtype=torch.long)
+    sampling = MagicMock()
+
+    rollout.generate(req, sampling)
+    rollout._generate_graph_capture_cb.assert_called_once()
diff --git a/examples/opsd/tests/test_rollout_interface.py b/tests/unit/runtime/rollout/test_rollout_interface.py
similarity index 93%
rename from examples/opsd/tests/test_rollout_interface.py
rename to tests/unit/runtime/rollout/test_rollout_interface.py
index 7c6fd0545443..d19992bc49bf 100644
--- a/examples/opsd/tests/test_rollout_interface.py
+++ b/tests/unit/runtime/rollout/test_rollout_interface.py
@@ -12,8 +12,14 @@
 import pytest
 import torch
 
-from opsd.rollout import RolloutBatch, RolloutEngine, RolloutRequest, SamplingConfig
-from opsd.utils import build_response_mask
+from deepspeed.runtime.rollout import (
+    RolloutBatch,
+    RolloutEngine,
+    RolloutRequest,
+    SamplingConfig,
+    build_rollout,
+)
+from deepspeed.runtime.rlhf.utils import build_response_mask
 
 # --- dataclass invariants ---------------------------------------------------
 
@@ -85,7 +91,7 @@ def generate(self, request: RolloutRequest, sampling: SamplingConfig) -> Rollout
         response_start_idx = torch.full((B * n, ), T_p, dtype=torch.long)
         return RolloutBatch(input_ids=input_ids, attention_mask=attention_mask, response_start_idx=response_start_idx)
 
-    def sync_weights_from_student(self, step: int) -> None:
+    def sync_weights(self, step: int) -> None:
         self.sync_calls.append(step)
 
 
@@ -135,22 +141,20 @@ def test_response_mask_from_rollout_output_matches_helper():
 
 def test_sync_records_steps():
     fake = FakeRollout()
-    fake.sync_weights_from_student(0)
-    fake.sync_weights_from_student(5)
+    fake.sync_weights(0)
+    fake.sync_weights(5)
     assert fake.sync_calls == [0, 5]
 
 
 def test_engine_factory_unknown_raises():
-    from opsd.config import RolloutConfig
-    from opsd.rollout import build_rollout
+    from deepspeed.runtime.rlhf.config import RolloutConfig
 
     with pytest.raises(ValueError, match="Unknown rollout engine"):
         build_rollout(RolloutConfig(engine="totally_made_up"))
 
 
 def test_engine_factory_hybrid_requires_student_engine():
-    from opsd.config import RolloutConfig
-    from opsd.rollout import build_rollout
+    from deepspeed.runtime.rlhf.config import RolloutConfig
 
     with pytest.raises(ValueError, match="needs both"):
         build_rollout(RolloutConfig(engine="hybrid_engine"))
diff --git a/tests/unit/runtime/rollout/test_vllm_rollout.py b/tests/unit/runtime/rollout/test_vllm_rollout.py
new file mode 100644
index 000000000000..14a8c83999b8
--- /dev/null
+++ b/tests/unit/runtime/rollout/test_vllm_rollout.py
@@ -0,0 +1,187 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
+"""CPU-only unit tests for VLLMRollout (no GPU or vLLM server needed).
+
+Tests cover configuration validation, command construction, token-id
+extraction from API responses, and utility helpers.
+"""
+
+from unittest.mock import MagicMock, patch
+
+import pytest
+
+from deepspeed.runtime.rlhf.config import RolloutConfig
+
+
+def _make_cfg(**overrides):
+    defaults = dict(
+        engine="vllm",
+        vllm_port=8999,
+        gpu_memory_utilization=0.3,
+        weight_transfer_backend="http",
+    )
+    defaults.update(overrides)
+    return RolloutConfig(**defaults)
+
+
+# -- __init__ validation ------------------------------------------------
+
+
+def test_init_rejects_wrong_engine():
+    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
+
+    cfg = RolloutConfig(engine="hybrid_engine")
+    with pytest.raises(ValueError, match="must be 'vllm'"):
+        VLLMRollout(cfg=cfg, tokenizer=MagicMock(), student_model_path="x")
+
+
+def test_init_requires_student_model_path():
+    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
+
+    cfg = RolloutConfig(engine="vllm")
+    with pytest.raises(ValueError, match="student_model_path"):
+        VLLMRollout(cfg=cfg, tokenizer=MagicMock())
+
+
+def test_init_http_backend():
+    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
+
+    cfg = _make_cfg(weight_transfer_backend="http")
+    rollout = VLLMRollout(cfg=cfg, tokenizer=MagicMock(), student_model_path="test-model")
+    assert rollout._wt_backend == "http"
+
+
+# -- _extract_token_ids -------------------------------------------------
+
+
+def test_extract_token_ids_prefers_token_ids():
+    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
+
+    choice = {"token_ids": [10, 20, 30]}
+    assert VLLMRollout._extract_token_ids(choice) == [10, 20, 30]
+
+
+def test_extract_token_ids_from_logprobs_token_ids():
+    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
+
+    choice = {"logprobs": {"token_ids": [5, 6, 7]}}
+    assert VLLMRollout._extract_token_ids(choice) == [5, 6, 7]
+
+
+def test_extract_token_ids_from_logprobs_tokens_fallback():
+    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
+
+    choice = {"logprobs": {"tokens": ["a", "b"]}}
+    assert VLLMRollout._extract_token_ids(choice) == [0, 1]
+
+
+def test_extract_token_ids_empty_on_no_data():
+    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
+
+    assert VLLMRollout._extract_token_ids({}) == []
+
+
+# -- _start_server command construction --------------------------------
+
+
+def test_start_server_command_http_backend():
+    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
+
+    cfg = _make_cfg(
+        weight_transfer_backend="http",
+        tensor_parallel_size=2,
+        gpu_memory_utilization=0.5,
+        vllm_port=12345,
+        vllm_enforce_eager=True,
+        gpus=[0, 1],
+    )
+    rollout = VLLMRollout(cfg=cfg, tokenizer=MagicMock(), student_model_path="test-model")
+
+    with patch("subprocess.Popen") as mock_popen:
+        mock_popen.return_value = MagicMock()
+        rollout._start_server()
+
+    args, kwargs = mock_popen.call_args
+    cmd = args[0]
+    assert cmd[0].endswith("python") or "python" in cmd[0]
+    assert "-m" in cmd
+    assert "vllm.entrypoints.openai.api_server" in cmd
+    assert "--model" in cmd
+    assert "test-model" in cmd
+    assert "--tensor-parallel-size" in cmd
+    assert "2" in cmd
+    assert "--gpu-memory-utilization" in cmd
+    assert "0.5" in cmd
+    assert "--port" in cmd
+    assert "12345" in cmd
+    assert "--enforce-eager" in cmd
+    assert '{"backend": "http"}' in cmd
+
+    env = kwargs["env"]
+    assert env["CUDA_VISIBLE_DEVICES"] == "0,1"
+    assert env["VLLM_SERVER_DEV_MODE"] == "1"
+
+
+def test_start_server_uses_vllm_python():
+    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
+
+    cfg = _make_cfg(vllm_python="/custom/bin/python")
+    rollout = VLLMRollout(cfg=cfg, tokenizer=MagicMock(), student_model_path="test-model")
+
+    with patch("subprocess.Popen") as mock_popen:
+        mock_popen.return_value = MagicMock()
+        rollout._start_server()
+
+    cmd = mock_popen.call_args[0][0]
+    assert cmd[0] == "/custom/bin/python"
+
+
+# -- _wait_for_health detects early exit --------------------------------
+
+
+def test_wait_for_health_raises_on_crash():
+    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
+
+    cfg = _make_cfg(vllm_start_timeout=5)
+    rollout = VLLMRollout(cfg=cfg, tokenizer=MagicMock(), student_model_path="test-model")
+
+    proc = MagicMock()
+    proc.poll.return_value = 1
+    proc.returncode = 1
+    proc.stderr = MagicMock()
+    proc.stderr.read.return_value = b"some error detail"
+    rollout._server_proc = proc
+
+    with pytest.raises(RuntimeError, match="exited prematurely"):
+        rollout._wait_for_health()
+
+
+def test_wait_for_health_raises_timeout():
+    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
+
+    cfg = _make_cfg(vllm_start_timeout=0)
+    rollout = VLLMRollout(cfg=cfg, tokenizer=MagicMock(), student_model_path="test-model")
+    rollout._server_proc = MagicMock()
+    rollout._server_proc.poll.return_value = None
+
+    with pytest.raises(TimeoutError, match="did not become healthy"):
+        rollout._wait_for_health()
+
+
+# -- utility helpers ----------------------------------------------------
+
+
+def test_get_own_ip():
+    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
+
+    assert isinstance(VLLMRollout._get_own_ip(), str)
+
+
+def test_find_free_port():
+    from deepspeed.runtime.rollout.vllm_rollout import _find_free_port
+
+    port = _find_free_port()
+    assert isinstance(port, int)
+    assert 1 <= port <= 65535
diff --git a/examples/opsd/tests/test_vllm_stitch.py b/tests/unit/runtime/rollout/test_vllm_stitch.py
similarity index 97%
rename from examples/opsd/tests/test_vllm_stitch.py
rename to tests/unit/runtime/rollout/test_vllm_stitch.py
index bd8e1b4e4c0f..0bcacbd9b0fe 100644
--- a/examples/opsd/tests/test_vllm_stitch.py
+++ b/tests/unit/runtime/rollout/test_vllm_stitch.py
@@ -11,8 +11,8 @@
 import pytest
 import torch
 
-from opsd.rollout.vllm import stitch_rollout
-from opsd.utils import build_response_mask
+from deepspeed.runtime.rollout import stitch_rollout
+from deepspeed.runtime.rlhf.utils import build_response_mask
 
 
 def test_stitch_basic_single_sample():

From 10ef3250919051c2f5ee6009585c9e5670dba482 Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@intel.com>
Date: Wed, 1 Jul 2026 18:47:32 +0800
Subject: [PATCH 06/18] Use ROLLOUT_VISIBLE_DEVICE env var for vLLM GPU
 placement; rename vllm_dtype to engine_dtype

Signed-off-by: Zhipeng Wang <zhipengbayern@gmail.com>
---
 deepspeed/runtime/rlhf/config.py              | 17 ++++++++++++---
 deepspeed/runtime/rollout/vllm_rollout.py     |  2 +-
 .../unit/runtime/rollout/test_vllm_rollout.py | 21 ++++++++++++++++++-
 3 files changed, 35 insertions(+), 5 deletions(-)

diff --git a/deepspeed/runtime/rlhf/config.py b/deepspeed/runtime/rlhf/config.py
index 07ecbe0ad4f2..f94c0d8c5568 100644
--- a/deepspeed/runtime/rlhf/config.py
+++ b/deepspeed/runtime/rlhf/config.py
@@ -11,6 +11,7 @@
 """
 
 import json
+import os
 from dataclasses import dataclass, field, asdict
 from typing import List, Optional
 
@@ -50,12 +51,22 @@ class RolloutConfig:
     use_graph_capture: bool = False
 
     # vLLM-specific. ``gpus`` is the disjoint set of CUDA device indices vLLM
-    # may use; the training ranks must not overlap with these. If None, the
-    # trainer will refuse to start in vllm mode.
+    # may use; the training ranks must not overlap with these. If None/empty,
+    # vLLM runs in "shared" mode on the same GPU as training rank 0.
+    # At construction time this field is populated from the
+    # ``ROLLOUT_VISIBLE_DEVICE`` environment variable (comma-separated device
+    # indices, e.g. ``ROLLOUT_VISIBLE_DEVICE=6,7``) when that variable is set,
+    # taking precedence over any value supplied in the JSON config.
     gpus: Optional[List[int]] = None
+
+    def __post_init__(self):
+        env_gpus = os.environ.get("ROLLOUT_VISIBLE_DEVICE")
+        if env_gpus:
+            self.gpus = [int(g.strip()) for g in env_gpus.split(",")]
+
     tensor_parallel_size: int = 1
     gpu_memory_utilization: float = 0.85
-    vllm_dtype: str = "bfloat16"
+    engine_dtype: str = "bfloat16"
     # Push student weights into vLLM every N optimizer steps. Larger values
     # trade staleness for throughput.
     weight_sync_interval: int = 1
diff --git a/deepspeed/runtime/rollout/vllm_rollout.py b/deepspeed/runtime/rollout/vllm_rollout.py
index 7b8fbcbecfbd..0388af0caa6a 100644
--- a/deepspeed/runtime/rollout/vllm_rollout.py
+++ b/deepspeed/runtime/rollout/vllm_rollout.py
@@ -216,7 +216,7 @@ def _start_server(self) -> None:
             "--tensor-parallel-size",
             str(self.cfg.tensor_parallel_size),
             "--dtype",
-            self.cfg.vllm_dtype,
+            self.cfg.engine_dtype,
             "--gpu-memory-utilization",
             str(self.cfg.gpu_memory_utilization),
             "--port",
diff --git a/tests/unit/runtime/rollout/test_vllm_rollout.py b/tests/unit/runtime/rollout/test_vllm_rollout.py
index 14a8c83999b8..d8c308bccb74 100644
--- a/tests/unit/runtime/rollout/test_vllm_rollout.py
+++ b/tests/unit/runtime/rollout/test_vllm_rollout.py
@@ -37,6 +37,24 @@ def test_init_rejects_wrong_engine():
         VLLMRollout(cfg=cfg, tokenizer=MagicMock(), student_model_path="x")
 
 
+def test_gpus_from_env_var(monkeypatch):
+    monkeypatch.setenv("ROLLOUT_VISIBLE_DEVICE", "6,7")
+    cfg = RolloutConfig(engine="vllm")
+    assert cfg.gpus == [6, 7]
+
+
+def test_env_var_overrides_json_gpus(monkeypatch):
+    monkeypatch.setenv("ROLLOUT_VISIBLE_DEVICE", "6,7")
+    cfg = RolloutConfig(engine="vllm", gpus=[0, 1])
+    assert cfg.gpus == [6, 7]
+
+
+def test_no_env_var_keeps_json_gpus(monkeypatch):
+    monkeypatch.delenv("ROLLOUT_VISIBLE_DEVICE", raising=False)
+    cfg = RolloutConfig(engine="vllm", gpus=[0, 1])
+    assert cfg.gpus == [0, 1]
+
+
 def test_init_requires_student_model_path():
     from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
 
@@ -86,9 +104,10 @@ def test_extract_token_ids_empty_on_no_data():
 # -- _start_server command construction --------------------------------
 
 
-def test_start_server_command_http_backend():
+def test_start_server_command_http_backend(monkeypatch):
     from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
 
+    monkeypatch.delenv("ROLLOUT_VISIBLE_DEVICE", raising=False)
     cfg = _make_cfg(
         weight_transfer_backend="http",
         tensor_parallel_size=2,

From 5716ecd2264006fe7c94d0f93a41d8ca527382c4 Mon Sep 17 00:00:00 2001
From: Zhipeng Wang <zhipengbayern@gmail.com>
Date: Wed, 1 Jul 2026 14:35:40 -1000
Subject: [PATCH 07/18] Fix formatting and CPU unit-test checks for OPSD
 rollout

Formatting hooks:
- Add the standard license header to the opsd benchmark scripts
  (bench_14b_rollout.py, bench_autotp_gc.py, bench_vllm_tp2.py).
- Mark CUDA-specific benchmark calls with #ignore-cuda and drop the
  unused `sys` import flagged by flake8.
- yapf: collapse the rlhf __init__ imports that now fit in 119 cols.

cpu-torch-latest unit tests:
- Align tests/unit/runtime/rollout/test_hybrid_engine_rollout.py with the
  current cfg-based HybridEngineRollout constructor: the tests previously
  passed continuous_batching_size / use_graph_capture kwargs and expected
  _generate_continuous_batching / _generate_graph_capture_cb dispatch that
  the implementation does not provide. Tests now build a
  HybridEngineRolloutConfig and assert the real generate() dispatch
  (module.generate by default, _generate_graph for greedy graph capture).
- Update the two benchmark constructor calls to the cfg-based API.

Signed-off-by: Zhipeng Wang <zhipengbayern@gmail.com>
---
 benchmarks/opsd/bench_14b_rollout.py          | 45 ++++++++++++-------
 benchmarks/opsd/bench_autotp_gc.py            | 37 +++++++++------
 benchmarks/opsd/bench_vllm_tp2.py             |  4 ++
 deepspeed/runtime/rlhf/__init__.py            | 15 ++-----
 .../rollout/test_hybrid_engine_rollout.py     | 42 ++++++++---------
 5 files changed, 77 insertions(+), 66 deletions(-)

diff --git a/benchmarks/opsd/bench_14b_rollout.py b/benchmarks/opsd/bench_14b_rollout.py
index d66c7615dd94..2727a63f2768 100644
--- a/benchmarks/opsd/bench_14b_rollout.py
+++ b/benchmarks/opsd/bench_14b_rollout.py
@@ -1,10 +1,14 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
 """Comprehensive 14B rollout benchmark: Naive, GC, TP=2 GC, TP=4 GC."""
 import time
 import os
-import sys
 import torch
 import deepspeed
 from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
+from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRolloutConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
 MODEL = "Qwen/Qwen2.5-14B-Instruct"
@@ -21,18 +25,17 @@ def bench_rollout(engine, tokenizer, use_graph_capture, cb_size, label):
     device = torch.device(f"cuda:{local_rank}")
 
     rollout = HybridEngineRollout(
-        engine=engine,
-        tokenizer=tokenizer,
-        continuous_batching_size=cb_size,
-        use_graph_capture=use_graph_capture,
+        engine,
+        tokenizer,
+        cfg=HybridEngineRolloutConfig(use_graph_capture=use_graph_capture),
     )
 
     ids = tokenizer(PROMPT, return_tensors="pt").input_ids.to(device)
     req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids))
-    sampling = SamplingConfig(
-        max_new_tokens=MAX_NEW_TOKENS, temperature=0.8, top_p=0.95,
-        n_samples_per_prompt=N_SAMPLES
-    )
+    sampling = SamplingConfig(max_new_tokens=MAX_NEW_TOKENS,
+                              temperature=0.8,
+                              top_p=0.95,
+                              n_samples_per_prompt=N_SAMPLES)
 
     # Warmup
     torch.manual_seed(42)
@@ -46,10 +49,10 @@ def bench_rollout(engine, tokenizer, use_graph_capture, cb_size, label):
     for i in range(N_RUNS):
         torch.manual_seed(42 + i)
         engine.eval()
-        torch.cuda.synchronize()
+        torch.cuda.synchronize()  #ignore-cuda
         t0 = time.time()
         batch = rollout.generate(req, sampling)
-        torch.cuda.synchronize()
+        torch.cuda.synchronize()  #ignore-cuda
         times.append(time.time() - t0)
         engine.train()
 
@@ -72,7 +75,7 @@ def main():
     deepspeed.init_distributed()
     rank = torch.distributed.get_rank()
     local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    torch.cuda.set_device(local_rank)
+    torch.cuda.set_device(local_rank)  #ignore-cuda
 
     world_size = torch.distributed.get_world_size()
     tp_size = world_size  # all GPUs used for TP
@@ -81,8 +84,12 @@ def main():
     model = AutoModelForCausalLM.from_pretrained(MODEL, dtype=torch.bfloat16, trust_remote_code=True)
 
     ds_config = {
-        "bf16": {"enabled": True},
-        "zero_optimization": {"stage": 0},
+        "bf16": {
+            "enabled": True
+        },
+        "zero_optimization": {
+            "stage": 0
+        },
         "train_micro_batch_size_per_gpu": 1,
         "train_batch_size": world_size,
         "gradient_accumulation_steps": 1,
@@ -100,7 +107,9 @@ def main():
         ds_config["tensor_parallel"] = {
             "autotp_size": tp_size,
             "preset_model": "qwen2",
-            "tp": {"tp_size": tp_size},
+            "tp": {
+                "tp_size": tp_size
+            },
         }
 
     engine, *_ = deepspeed.initialize(model=model, config=ds_config)
@@ -116,7 +125,8 @@ def main():
     except Exception as e:
         if rank == 0:
             print(f"[TP{tp_size} CB={CB_SIZE}] FAILED: {e}")
-            import traceback; traceback.print_exc()
+            import traceback
+            traceback.print_exc()
 
     # 1P1R with CUDA graph capture
     try:
@@ -124,7 +134,8 @@ def main():
     except Exception as e:
         if rank == 0:
             print(f"[TP{tp_size} CB={CB_SIZE}+GC] FAILED: {e}")
-            import traceback; traceback.print_exc()
+            import traceback
+            traceback.print_exc()
 
     if rank == 0:
         print(f"{'='*60}\n")
diff --git a/benchmarks/opsd/bench_autotp_gc.py b/benchmarks/opsd/bench_autotp_gc.py
index c9a245b245de..6a2c678e50db 100644
--- a/benchmarks/opsd/bench_autotp_gc.py
+++ b/benchmarks/opsd/bench_autotp_gc.py
@@ -1,31 +1,41 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
 """Benchmark rollout with AutoTP + graph capture on 14B model."""
 import time
 import torch
 import deepspeed
 from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
+from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRolloutConfig
 from transformers import AutoModelForCausalLM, AutoTokenizer
 
+
 def main():
     deepspeed.init_distributed()
     rank = torch.distributed.get_rank()
-    local_rank = int(torch.distributed.get_rank()) % torch.cuda.device_count()
-    torch.cuda.set_device(local_rank)
+    local_rank = int(torch.distributed.get_rank()) % torch.cuda.device_count()  #ignore-cuda
+    torch.cuda.set_device(local_rank)  #ignore-cuda
     device = torch.device(f"cuda:{local_rank}")
 
     model_name = "Qwen/Qwen2.5-14B-Instruct"
     tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
 
-    model = AutoModelForCausalLM.from_pretrained(
-        model_name, dtype=torch.bfloat16, trust_remote_code=True
-    )
+    model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, trust_remote_code=True)
 
     ds_config = {
-        "bf16": {"enabled": True},
-        "zero_optimization": {"stage": 0},
+        "bf16": {
+            "enabled": True
+        },
+        "zero_optimization": {
+            "stage": 0
+        },
         "tensor_parallel": {
             "autotp_size": 2,
             "preset_model": "qwen2",
-            "tp": {"tp_size": 2},
+            "tp": {
+                "tp_size": 2
+            },
         },
         "train_micro_batch_size_per_gpu": 1,
         "train_batch_size": 2,
@@ -43,10 +53,9 @@ def main():
     engine, *_ = deepspeed.initialize(model=model, config=ds_config)
 
     rollout = HybridEngineRollout(
-        engine=engine,
-        tokenizer=tokenizer,
-        continuous_batching_size=2,
-        use_graph_capture=True,
+        engine,
+        tokenizer,
+        cfg=HybridEngineRolloutConfig(use_graph_capture=True),
     )
 
     # Prepare prompt
@@ -66,10 +75,10 @@ def main():
     for i in range(5):
         torch.manual_seed(42)
         engine.eval()
-        torch.cuda.synchronize()
+        torch.cuda.synchronize()  #ignore-cuda
         t0 = time.time()
         batch = rollout.generate(req, sampling)
-        torch.cuda.synchronize()
+        torch.cuda.synchronize()  #ignore-cuda
         times.append(time.time() - t0)
         engine.train()
 
diff --git a/benchmarks/opsd/bench_vllm_tp2.py b/benchmarks/opsd/bench_vllm_tp2.py
index 66a82192551b..f6cb649dfa21 100644
--- a/benchmarks/opsd/bench_vllm_tp2.py
+++ b/benchmarks/opsd/bench_vllm_tp2.py
@@ -1,3 +1,7 @@
+# Copyright (c) Microsoft Corporation.
+# SPDX-License-Identifier: Apache-2.0
+
+# DeepSpeed Team
 """Benchmark vLLM TP=2 on 14B, 1P1R.
 
 Launched as a subprocess wrapper to avoid CUDA fork issues.
diff --git a/deepspeed/runtime/rlhf/__init__.py b/deepspeed/runtime/rlhf/__init__.py
index 479f51f3ae80..f3dae87d15a1 100644
--- a/deepspeed/runtime/rlhf/__init__.py
+++ b/deepspeed/runtime/rlhf/__init__.py
@@ -13,20 +13,11 @@
 """
 
 from deepspeed.runtime.rlhf.config import (  # noqa: F401
-    OPSDConfig,
-    StudentConfig,
-    TeacherConfig,
-    RolloutConfig,
-    DistillationConfig,
-    TrainingConfig,
-    DataConfig,
+    OPSDConfig, StudentConfig, TeacherConfig, RolloutConfig, DistillationConfig, TrainingConfig, DataConfig,
 )
 from deepspeed.runtime.rlhf.losses import (  # noqa: F401
-    chunked_distillation_loss,
-    streamed_distillation_loss,
-    per_token_logprobs,
+    chunked_distillation_loss, streamed_distillation_loss, per_token_logprobs,
 )
 from deepspeed.runtime.rlhf.utils import (  # noqa: F401
-    build_response_mask,
-    shift_for_next_token_prediction,
+    build_response_mask, shift_for_next_token_prediction,
 )
diff --git a/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py b/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py
index 6f7f8de3ceeb..24e18c57eb95 100644
--- a/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py
+++ b/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py
@@ -11,6 +11,7 @@
 
 import torch
 
+from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
 from deepspeed.runtime.rollout.hybrid_engine_rollout import (
     HybridEngineRollout,
     HybridEngineRolloutConfig,
@@ -36,8 +37,6 @@ def _make_tokenizer():
 
 def test_config_defaults():
     cfg = HybridEngineRolloutConfig()
-    assert cfg.continuous_batching_size == 0
-    assert cfg.kv_trim_threshold == 16
     assert cfg.use_graph_capture is False
 
 
@@ -47,9 +46,9 @@ def test_config_defaults():
 def test_constructor_stores_config():
     engine = _make_engine()
     tok = _make_tokenizer()
-    rollout = HybridEngineRollout(engine, tok, continuous_batching_size=4)
-    assert rollout.continuous_batching_size == 4
-    assert rollout.kv_trim_threshold == 16
+    cfg = HybridEngineRolloutConfig(use_graph_capture=True)
+    rollout = HybridEngineRollout(engine, tok, cfg=cfg)
+    assert rollout.use_graph_capture is True
     assert rollout.engine is engine
     assert rollout.tokenizer is tok
 
@@ -93,31 +92,28 @@ def test_sync_weights_is_noop():
 # -- generate dispatches correctly -------------------------------------
 
 
-def test_generate_calls_cb_by_default():
+def _make_request():
+    return RolloutRequest(prompt_ids=torch.tensor([[1, 2]]), prompt_attention_mask=torch.ones(1, 2, dtype=torch.long))
+
+
+def test_generate_uses_module_generate_by_default():
     engine = _make_engine()
     tok = _make_tokenizer()
     rollout = HybridEngineRollout(engine, tok)
-    rollout._generate_continuous_batching = MagicMock(return_value=MagicMock())
+    engine.module.generate = MagicMock(return_value=torch.tensor([[1, 2, 3, 2]]))
 
-    req = MagicMock()
-    req.prompt_ids = torch.tensor([[1, 2]])
-    req.prompt_attention_mask = torch.ones(1, 2, dtype=torch.long)
-    sampling = MagicMock()
-
-    rollout.generate(req, sampling)
-    rollout._generate_continuous_batching.assert_called_once()
+    # Sampling (temperature > 0) routes through the engine's generate path.
+    rollout.generate(_make_request(), SamplingConfig(max_new_tokens=2, temperature=1.0))
+    engine.module.generate.assert_called_once()
 
 
 def test_generate_calls_graph_capture_when_enabled():
     engine = _make_engine()
     tok = _make_tokenizer()
-    rollout = HybridEngineRollout(engine, tok, use_graph_capture=True)
-    rollout._generate_graph_capture_cb = MagicMock(return_value=MagicMock())
-
-    req = MagicMock()
-    req.prompt_ids = torch.tensor([[1, 2]])
-    req.prompt_attention_mask = torch.ones(1, 2, dtype=torch.long)
-    sampling = MagicMock()
+    cfg = HybridEngineRolloutConfig(use_graph_capture=True)
+    rollout = HybridEngineRollout(engine, tok, cfg=cfg)
+    rollout._generate_graph = MagicMock(return_value=torch.tensor([[1, 2, 3, 2]]))
 
-    rollout.generate(req, sampling)
-    rollout._generate_graph_capture_cb.assert_called_once()
+    # Graph capture is used for greedy decoding (temperature <= 0).
+    rollout.generate(_make_request(), SamplingConfig(max_new_tokens=2, temperature=0.0))
+    rollout._generate_graph.assert_called_once()

From 837c241079ab2f3273600dbe378e2c71db814e76 Mon Sep 17 00:00:00 2001
From: Zhipeng Wang <zhipengbayern@gmail.com>
Date: Wed, 1 Jul 2026 14:53:57 -1000
Subject: [PATCH 08/18] Remove Microsoft Corporation copyright line from OPSD
 file headers

Keep the SPDX Apache-2.0 identifier and DeepSpeed Team attribution, which
are what the check-license hook requires.

Signed-off-by: Zhipeng Wang <zhipengbayern@gmail.com>
---
 benchmarks/opsd/bench_14b_rollout.py                     | 1 -
 benchmarks/opsd/bench_autotp_gc.py                       | 1 -
 benchmarks/opsd/bench_decode_1p1r.py                     | 1 -
 benchmarks/opsd/bench_flashinfer.py                      | 1 -
 benchmarks/opsd/bench_hybrid_tp.py                       | 1 -
 benchmarks/opsd/bench_hybrid_tp_opt.py                   | 1 -
 benchmarks/opsd/bench_vllm_tp2.py                        | 1 -
 deepspeed/runtime/rlhf/__init__.py                       | 1 -
 deepspeed/runtime/rlhf/config.py                         | 1 -
 deepspeed/runtime/rlhf/data.py                           | 1 -
 deepspeed/runtime/rlhf/losses.py                         | 1 -
 deepspeed/runtime/rlhf/teacher.py                        | 1 -
 deepspeed/runtime/rlhf/trainer/__init__.py               | 1 -
 deepspeed/runtime/rlhf/trainer/base.py                   | 1 -
 deepspeed/runtime/rlhf/trainer/opsd.py                   | 1 -
 deepspeed/runtime/rlhf/utils.py                          | 1 -
 deepspeed/runtime/rollout/__init__.py                    | 1 -
 deepspeed/runtime/rollout/_vllm_compat/__init__.py       | 1 -
 deepspeed/runtime/rollout/_vllm_compat/sitecustomize.py  | 1 -
 deepspeed/runtime/rollout/base.py                        | 1 -
 deepspeed/runtime/rollout/hybrid_engine_rollout.py       | 1 -
 deepspeed/runtime/rollout/vllm_rollout.py                | 1 -
 tests/unit/runtime/rollout/test_hybrid_engine_rollout.py | 1 -
 tests/unit/runtime/rollout/test_rollout_interface.py     | 1 -
 tests/unit/runtime/rollout/test_vllm_rollout.py          | 1 -
 tests/unit/runtime/rollout/test_vllm_stitch.py           | 1 -
 26 files changed, 26 deletions(-)

diff --git a/benchmarks/opsd/bench_14b_rollout.py b/benchmarks/opsd/bench_14b_rollout.py
index 2727a63f2768..66e2e60a5ce1 100644
--- a/benchmarks/opsd/bench_14b_rollout.py
+++ b/benchmarks/opsd/bench_14b_rollout.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/benchmarks/opsd/bench_autotp_gc.py b/benchmarks/opsd/bench_autotp_gc.py
index 6a2c678e50db..f69268e77f1d 100644
--- a/benchmarks/opsd/bench_autotp_gc.py
+++ b/benchmarks/opsd/bench_autotp_gc.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/benchmarks/opsd/bench_decode_1p1r.py b/benchmarks/opsd/bench_decode_1p1r.py
index 58fb667d4581..1428aefb4701 100644
--- a/benchmarks/opsd/bench_decode_1p1r.py
+++ b/benchmarks/opsd/bench_decode_1p1r.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 # DeepSpeed Team
 """Micro-benchmark for 1p1r HybridEngineRollout decode.
diff --git a/benchmarks/opsd/bench_flashinfer.py b/benchmarks/opsd/bench_flashinfer.py
index abaa31483111..335a656c8b5f 100644
--- a/benchmarks/opsd/bench_flashinfer.py
+++ b/benchmarks/opsd/bench_flashinfer.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/benchmarks/opsd/bench_hybrid_tp.py b/benchmarks/opsd/bench_hybrid_tp.py
index 3f41150c7b85..e2430c3e65bc 100644
--- a/benchmarks/opsd/bench_hybrid_tp.py
+++ b/benchmarks/opsd/bench_hybrid_tp.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 # DeepSpeed Team
 """Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2).
diff --git a/benchmarks/opsd/bench_hybrid_tp_opt.py b/benchmarks/opsd/bench_hybrid_tp_opt.py
index d7fae2ddef51..248b97da6962 100644
--- a/benchmarks/opsd/bench_hybrid_tp_opt.py
+++ b/benchmarks/opsd/bench_hybrid_tp_opt.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 # DeepSpeed Team
 """Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2) + optimizer.
diff --git a/benchmarks/opsd/bench_vllm_tp2.py b/benchmarks/opsd/bench_vllm_tp2.py
index f6cb649dfa21..4351c722d32f 100644
--- a/benchmarks/opsd/bench_vllm_tp2.py
+++ b/benchmarks/opsd/bench_vllm_tp2.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rlhf/__init__.py b/deepspeed/runtime/rlhf/__init__.py
index f3dae87d15a1..bb08d8fad529 100644
--- a/deepspeed/runtime/rlhf/__init__.py
+++ b/deepspeed/runtime/rlhf/__init__.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rlhf/config.py b/deepspeed/runtime/rlhf/config.py
index f94c0d8c5568..01ad0fe45b54 100644
--- a/deepspeed/runtime/rlhf/config.py
+++ b/deepspeed/runtime/rlhf/config.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rlhf/data.py b/deepspeed/runtime/rlhf/data.py
index 8ce86b56c67f..df6e19908846 100644
--- a/deepspeed/runtime/rlhf/data.py
+++ b/deepspeed/runtime/rlhf/data.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rlhf/losses.py b/deepspeed/runtime/rlhf/losses.py
index d9f4b9266da5..ba717d2b260d 100644
--- a/deepspeed/runtime/rlhf/losses.py
+++ b/deepspeed/runtime/rlhf/losses.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rlhf/teacher.py b/deepspeed/runtime/rlhf/teacher.py
index 9d6eec3f08e4..9ad6118fe370 100644
--- a/deepspeed/runtime/rlhf/teacher.py
+++ b/deepspeed/runtime/rlhf/teacher.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rlhf/trainer/__init__.py b/deepspeed/runtime/rlhf/trainer/__init__.py
index 34169d59da7a..3086ddac2cde 100644
--- a/deepspeed/runtime/rlhf/trainer/__init__.py
+++ b/deepspeed/runtime/rlhf/trainer/__init__.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rlhf/trainer/base.py b/deepspeed/runtime/rlhf/trainer/base.py
index 3f8687514a55..4a5451a40f66 100644
--- a/deepspeed/runtime/rlhf/trainer/base.py
+++ b/deepspeed/runtime/rlhf/trainer/base.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rlhf/trainer/opsd.py b/deepspeed/runtime/rlhf/trainer/opsd.py
index 28be8b56061e..92df42cbbe2b 100644
--- a/deepspeed/runtime/rlhf/trainer/opsd.py
+++ b/deepspeed/runtime/rlhf/trainer/opsd.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rlhf/utils.py b/deepspeed/runtime/rlhf/utils.py
index b2954407b774..1e97a4b7706a 100644
--- a/deepspeed/runtime/rlhf/utils.py
+++ b/deepspeed/runtime/rlhf/utils.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rollout/__init__.py b/deepspeed/runtime/rollout/__init__.py
index e61e7fe2d98b..db126ab120fd 100644
--- a/deepspeed/runtime/rollout/__init__.py
+++ b/deepspeed/runtime/rollout/__init__.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rollout/_vllm_compat/__init__.py b/deepspeed/runtime/rollout/_vllm_compat/__init__.py
index 208299fb8c50..bbec52ed50ee 100644
--- a/deepspeed/runtime/rollout/_vllm_compat/__init__.py
+++ b/deepspeed/runtime/rollout/_vllm_compat/__init__.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rollout/_vllm_compat/sitecustomize.py b/deepspeed/runtime/rollout/_vllm_compat/sitecustomize.py
index d0a399093de0..c490fd71a261 100644
--- a/deepspeed/runtime/rollout/_vllm_compat/sitecustomize.py
+++ b/deepspeed/runtime/rollout/_vllm_compat/sitecustomize.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rollout/base.py b/deepspeed/runtime/rollout/base.py
index f02e48d1153a..55695a2a6bb6 100644
--- a/deepspeed/runtime/rollout/base.py
+++ b/deepspeed/runtime/rollout/base.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rollout/hybrid_engine_rollout.py b/deepspeed/runtime/rollout/hybrid_engine_rollout.py
index 9da149418867..b2c02b25dc3a 100644
--- a/deepspeed/runtime/rollout/hybrid_engine_rollout.py
+++ b/deepspeed/runtime/rollout/hybrid_engine_rollout.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/deepspeed/runtime/rollout/vllm_rollout.py b/deepspeed/runtime/rollout/vllm_rollout.py
index 0388af0caa6a..1f40d42dd62d 100644
--- a/deepspeed/runtime/rollout/vllm_rollout.py
+++ b/deepspeed/runtime/rollout/vllm_rollout.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py b/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py
index 24e18c57eb95..f46c17e5f9d9 100644
--- a/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py
+++ b/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/tests/unit/runtime/rollout/test_rollout_interface.py b/tests/unit/runtime/rollout/test_rollout_interface.py
index d19992bc49bf..f0a925670e0e 100644
--- a/tests/unit/runtime/rollout/test_rollout_interface.py
+++ b/tests/unit/runtime/rollout/test_rollout_interface.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/tests/unit/runtime/rollout/test_vllm_rollout.py b/tests/unit/runtime/rollout/test_vllm_rollout.py
index d8c308bccb74..616fe6cc9300 100644
--- a/tests/unit/runtime/rollout/test_vllm_rollout.py
+++ b/tests/unit/runtime/rollout/test_vllm_rollout.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
diff --git a/tests/unit/runtime/rollout/test_vllm_stitch.py b/tests/unit/runtime/rollout/test_vllm_stitch.py
index 0bcacbd9b0fe..6477e8701159 100644
--- a/tests/unit/runtime/rollout/test_vllm_stitch.py
+++ b/tests/unit/runtime/rollout/test_vllm_stitch.py
@@ -1,4 +1,3 @@
-# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team

From 734230cd7303907092cf7efcf7472f957af45c4d Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@intel.com>
Date: Fri, 3 Jul 2026 17:52:34 +0800
Subject: [PATCH 09/18] Remove vLLM rollout, move
 trainer/losses/utils/benchmarks to DeepSpeedExamples

Signed-off-by: Guokai Ma <guokai.ma@intel.com>
---
 benchmarks/opsd/bench_14b_rollout.py          | 144 -----
 benchmarks/opsd/bench_autotp_gc.py            | 104 ----
 benchmarks/opsd/bench_decode_1p1r.py          | 179 ------
 benchmarks/opsd/bench_flashinfer.py           | 128 -----
 benchmarks/opsd/bench_hybrid_tp.py            | 144 -----
 benchmarks/opsd/bench_hybrid_tp_opt.py        | 148 -----
 benchmarks/opsd/bench_vllm_tp2.py             |  44 --
 deepspeed/runtime/rlhf/__init__.py            |  22 -
 deepspeed/runtime/rlhf/config.py              | 174 ------
 deepspeed/runtime/rlhf/losses.py              | 191 -------
 deepspeed/runtime/rlhf/trainer/__init__.py    |   7 -
 deepspeed/runtime/rlhf/trainer/base.py        |  41 --
 deepspeed/runtime/rlhf/trainer/opsd.py        | 198 -------
 deepspeed/runtime/rlhf/utils.py               |  51 --
 deepspeed/runtime/rollout/__init__.py         |  26 +-
 .../runtime/rollout/_vllm_compat/__init__.py  |   3 -
 .../rollout/_vllm_compat/sitecustomize.py     |  74 ---
 deepspeed/runtime/rollout/base.py             |  17 +
 deepspeed/runtime/rollout/vllm_rollout.py     | 541 ------------------
 .../rollout/test_hybrid_engine_rollout.py     |  35 +-
 .../runtime/rollout/test_rollout_interface.py |  19 +-
 .../unit/runtime/rollout/test_vllm_rollout.py | 205 -------
 .../unit/runtime/rollout/test_vllm_stitch.py  |  96 ----
 23 files changed, 42 insertions(+), 2549 deletions(-)
 delete mode 100644 benchmarks/opsd/bench_14b_rollout.py
 delete mode 100644 benchmarks/opsd/bench_autotp_gc.py
 delete mode 100644 benchmarks/opsd/bench_decode_1p1r.py
 delete mode 100644 benchmarks/opsd/bench_flashinfer.py
 delete mode 100644 benchmarks/opsd/bench_hybrid_tp.py
 delete mode 100644 benchmarks/opsd/bench_hybrid_tp_opt.py
 delete mode 100644 benchmarks/opsd/bench_vllm_tp2.py
 delete mode 100644 deepspeed/runtime/rlhf/__init__.py
 delete mode 100644 deepspeed/runtime/rlhf/config.py
 delete mode 100644 deepspeed/runtime/rlhf/losses.py
 delete mode 100644 deepspeed/runtime/rlhf/trainer/__init__.py
 delete mode 100644 deepspeed/runtime/rlhf/trainer/base.py
 delete mode 100644 deepspeed/runtime/rlhf/trainer/opsd.py
 delete mode 100644 deepspeed/runtime/rlhf/utils.py
 delete mode 100644 deepspeed/runtime/rollout/_vllm_compat/__init__.py
 delete mode 100644 deepspeed/runtime/rollout/_vllm_compat/sitecustomize.py
 delete mode 100644 deepspeed/runtime/rollout/vllm_rollout.py
 delete mode 100644 tests/unit/runtime/rollout/test_vllm_rollout.py
 delete mode 100644 tests/unit/runtime/rollout/test_vllm_stitch.py

diff --git a/benchmarks/opsd/bench_14b_rollout.py b/benchmarks/opsd/bench_14b_rollout.py
deleted file mode 100644
index 66e2e60a5ce1..000000000000
--- a/benchmarks/opsd/bench_14b_rollout.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Comprehensive 14B rollout benchmark: Naive, GC, TP=2 GC, TP=4 GC."""
-import time
-import os
-import torch
-import deepspeed
-from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
-from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRolloutConfig
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-MODEL = "Qwen/Qwen2.5-14B-Instruct"
-MAX_NEW_TOKENS = 256
-N_SAMPLES = 1
-CB_SIZE = 1
-N_RUNS = 5
-PROMPT = "def fibonacci(n):"
-
-
-def bench_rollout(engine, tokenizer, use_graph_capture, cb_size, label):
-    rank = torch.distributed.get_rank()
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    device = torch.device(f"cuda:{local_rank}")
-
-    rollout = HybridEngineRollout(
-        engine,
-        tokenizer,
-        cfg=HybridEngineRolloutConfig(use_graph_capture=use_graph_capture),
-    )
-
-    ids = tokenizer(PROMPT, return_tensors="pt").input_ids.to(device)
-    req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids))
-    sampling = SamplingConfig(max_new_tokens=MAX_NEW_TOKENS,
-                              temperature=0.8,
-                              top_p=0.95,
-                              n_samples_per_prompt=N_SAMPLES)
-
-    # Warmup
-    torch.manual_seed(42)
-    engine.eval()
-    rollout.generate(req, sampling)
-    engine.train()
-
-    # Benchmark
-    times = []
-    total_toks = 0
-    for i in range(N_RUNS):
-        torch.manual_seed(42 + i)
-        engine.eval()
-        torch.cuda.synchronize()  #ignore-cuda
-        t0 = time.time()
-        batch = rollout.generate(req, sampling)
-        torch.cuda.synchronize()  #ignore-cuda
-        times.append(time.time() - t0)
-        engine.train()
-
-    # Count tokens from last run
-    pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id
-    for i in range(batch.input_ids.shape[0]):
-        resp = batch.input_ids[i, batch.response_start_idx[i]:]
-        total_toks += (resp != pad_id).sum().item()
-
-    t_avg = sum(times[1:]) / len(times[1:])
-
-    if rank == 0:
-        print(f"[{label}] {total_toks} toks, {t_avg*1000:.0f}ms, {total_toks/t_avg:.1f} tok/s  "
-              f"runs={[f'{t*1000:.0f}' for t in times]}")
-
-    return total_toks, t_avg
-
-
-def main():
-    deepspeed.init_distributed()
-    rank = torch.distributed.get_rank()
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    torch.cuda.set_device(local_rank)  #ignore-cuda
-
-    world_size = torch.distributed.get_world_size()
-    tp_size = world_size  # all GPUs used for TP
-
-    tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
-    model = AutoModelForCausalLM.from_pretrained(MODEL, dtype=torch.bfloat16, trust_remote_code=True)
-
-    ds_config = {
-        "bf16": {
-            "enabled": True
-        },
-        "zero_optimization": {
-            "stage": 0
-        },
-        "train_micro_batch_size_per_gpu": 1,
-        "train_batch_size": world_size,
-        "gradient_accumulation_steps": 1,
-        "hybrid_engine": {
-            "enabled": True,
-            "max_out_tokens": 512,
-            "inference_tp_size": 1,
-            "release_inference_cache": False,
-            "pin_parameters": True,
-            "tp_gather_partition_size": 8,
-        },
-    }
-
-    if tp_size > 1:
-        ds_config["tensor_parallel"] = {
-            "autotp_size": tp_size,
-            "preset_model": "qwen2",
-            "tp": {
-                "tp_size": tp_size
-            },
-        }
-
-    engine, *_ = deepspeed.initialize(model=model, config=ds_config)
-
-    if rank == 0:
-        print(f"\n{'='*60}")
-        print(f"Model: {MODEL}, TP={tp_size}, n={N_SAMPLES}, cb={CB_SIZE}, max_new={MAX_NEW_TOKENS}")
-        print(f"{'='*60}")
-
-    # 1P1R without graph capture (CB=1, no GC)
-    try:
-        bench_rollout(engine, tokenizer, use_graph_capture=False, cb_size=CB_SIZE, label=f"TP{tp_size} CB={CB_SIZE}")
-    except Exception as e:
-        if rank == 0:
-            print(f"[TP{tp_size} CB={CB_SIZE}] FAILED: {e}")
-            import traceback
-            traceback.print_exc()
-
-    # 1P1R with CUDA graph capture
-    try:
-        bench_rollout(engine, tokenizer, use_graph_capture=True, cb_size=CB_SIZE, label=f"TP{tp_size} CB={CB_SIZE}+GC")
-    except Exception as e:
-        if rank == 0:
-            print(f"[TP{tp_size} CB={CB_SIZE}+GC] FAILED: {e}")
-            import traceback
-            traceback.print_exc()
-
-    if rank == 0:
-        print(f"{'='*60}\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmarks/opsd/bench_autotp_gc.py b/benchmarks/opsd/bench_autotp_gc.py
deleted file mode 100644
index f69268e77f1d..000000000000
--- a/benchmarks/opsd/bench_autotp_gc.py
+++ /dev/null
@@ -1,104 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Benchmark rollout with AutoTP + graph capture on 14B model."""
-import time
-import torch
-import deepspeed
-from deepspeed.runtime.rollout import HybridEngineRollout, RolloutRequest, SamplingConfig
-from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRolloutConfig
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-
-def main():
-    deepspeed.init_distributed()
-    rank = torch.distributed.get_rank()
-    local_rank = int(torch.distributed.get_rank()) % torch.cuda.device_count()  #ignore-cuda
-    torch.cuda.set_device(local_rank)  #ignore-cuda
-    device = torch.device(f"cuda:{local_rank}")
-
-    model_name = "Qwen/Qwen2.5-14B-Instruct"
-    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
-
-    model = AutoModelForCausalLM.from_pretrained(model_name, dtype=torch.bfloat16, trust_remote_code=True)
-
-    ds_config = {
-        "bf16": {
-            "enabled": True
-        },
-        "zero_optimization": {
-            "stage": 0
-        },
-        "tensor_parallel": {
-            "autotp_size": 2,
-            "preset_model": "qwen2",
-            "tp": {
-                "tp_size": 2
-            },
-        },
-        "train_micro_batch_size_per_gpu": 1,
-        "train_batch_size": 2,
-        "gradient_accumulation_steps": 1,
-        "hybrid_engine": {
-            "enabled": True,
-            "max_out_tokens": 512,
-            "inference_tp_size": 1,
-            "release_inference_cache": False,
-            "pin_parameters": True,
-            "tp_gather_partition_size": 8,
-        },
-    }
-
-    engine, *_ = deepspeed.initialize(model=model, config=ds_config)
-
-    rollout = HybridEngineRollout(
-        engine,
-        tokenizer,
-        cfg=HybridEngineRolloutConfig(use_graph_capture=True),
-    )
-
-    # Prepare prompt
-    prompt = "def fibonacci(n):"
-    ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
-    req = RolloutRequest(prompt_ids=ids, prompt_attention_mask=torch.ones_like(ids))
-    sampling = SamplingConfig(max_new_tokens=256, temperature=0.8, top_p=0.95, n_samples_per_prompt=4)
-
-    # Warmup
-    torch.manual_seed(42)
-    engine.eval()
-    rollout.generate(req, sampling)
-    engine.train()
-
-    # Benchmark
-    times = []
-    for i in range(5):
-        torch.manual_seed(42)
-        engine.eval()
-        torch.cuda.synchronize()  #ignore-cuda
-        t0 = time.time()
-        batch = rollout.generate(req, sampling)
-        torch.cuda.synchronize()  #ignore-cuda
-        times.append(time.time() - t0)
-        engine.train()
-
-    t_avg = sum(times[1:]) / len(times[1:])
-    # Count tokens
-    pad_id = tokenizer.pad_token_id or tokenizer.eos_token_id
-    total_toks = 0
-    for i in range(batch.input_ids.shape[0]):
-        resp = batch.input_ids[i, batch.response_start_idx[i]:]
-        total_toks += (resp != pad_id).sum().item()
-
-    if rank == 0:
-        print(f"\n{'='*60}")
-        print(f"Model: {model_name}")
-        print(f"TP=2, n=8, cb=4, graph_capture=True, max_new_tokens=256")
-        print(f"Avg latency (excl warmup): {t_avg*1000:.1f}ms")
-        print(f"Total response tokens: {total_toks}")
-        print(f"Throughput: {total_toks/t_avg:.1f} tok/s")
-        print(f"Per-run times: {[f'{t*1000:.0f}ms' for t in times]}")
-        print(f"{'='*60}\n")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmarks/opsd/bench_decode_1p1r.py b/benchmarks/opsd/bench_decode_1p1r.py
deleted file mode 100644
index 1428aefb4701..000000000000
--- a/benchmarks/opsd/bench_decode_1p1r.py
+++ /dev/null
@@ -1,179 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# DeepSpeed Team
-"""Micro-benchmark for 1p1r HybridEngineRollout decode.
-
-Measures time breakdown of each decode step:
-  - model forward (attention + FFN)
-  - sampling (softmax + multinomial)
-  - Python overhead (mask concat, state update, etc.)
-
-Usage:
-  python examples/opsd/bench_decode_1p1r.py --model Qwen/Qwen2.5-0.5B-Instruct
-"""
-
-import argparse
-import time
-
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from deepspeed.accelerator import get_accelerator
-
-from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
-from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
-
-
-def bench_decode_raw(model, tokenizer, device, prompt_len=64, max_new_tokens=64, num_warmup=3, num_iters=10):
-    """Raw decode loop benchmark — measures each component separately."""
-    model.eval()
-    model_dtype = next(model.parameters()).dtype
-
-    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
-    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
-
-    results = {
-        "prompt_len": prompt_len,
-        "max_new_tokens": max_new_tokens,
-        "model_dtype": str(model_dtype),
-    }
-
-    timings = {"prefill": [], "decode_forward": [], "sampling": [], "overhead": [], "total": []}
-
-    for _ in range(num_warmup + num_iters):
-        with torch.no_grad():
-            t0 = time.perf_counter()
-            out = model(input_ids, attention_mask=attn_mask, use_cache=True)
-            past = out.past_key_values
-            logits = out.logits[:, -1:, :]
-            t_prefill = time.perf_counter()
-
-            generated = []
-            cur_token = logits.argmax(dim=-1)
-            generated.append(cur_token)
-            cur_mask = attn_mask
-
-            decode_times = []
-            sample_times = []
-            overhead_times = []
-
-            for step in range(max_new_tokens):
-                t_step = time.perf_counter()
-                cur_mask = torch.cat([cur_mask, torch.ones(1, 1, dtype=torch.long, device=device)], dim=1)
-                pos_ids = torch.tensor([[prompt_len + step]], device=device)
-
-                t_fwd = time.perf_counter()
-                out = model(cur_token,
-                            attention_mask=cur_mask,
-                            position_ids=pos_ids,
-                            past_key_values=past,
-                            use_cache=True)
-                past = out.past_key_values
-                t_fwd_end = time.perf_counter()
-
-                next_logits = out.logits[:, -1, :]
-                probs = torch.softmax(next_logits / 1.0, dim=-1)
-                cur_token = torch.multinomial(probs, 1)
-                t_sample = time.perf_counter()
-
-                generated.append(cur_token)
-                t_overhead = time.perf_counter()
-
-                decode_times.append(t_fwd_end - t_fwd)
-                sample_times.append(t_sample - t_fwd_end)
-                overhead_times.append(t_overhead - t_sample)
-
-            t_total = time.perf_counter()
-
-        timings["prefill"].append(t_prefill - t0)
-        timings["decode_forward"].append(decode_times)
-        timings["sampling"].append(sample_times)
-        timings["overhead"].append(overhead_times)
-        timings["total"].append(t_total - t0)
-
-    import numpy as np
-
-    def avg_last_n(lst, n):
-        return np.mean(lst[-n:])
-
-    def avg_of_avg(list_of_lists, n):
-        arrs = [np.array(ls[-n:]) for ls in list_of_lists]
-        return np.mean([a.mean() for a in arrs])
-
-    results["prefill_ms"] = avg_last_n(timings["prefill"], num_iters) * 1000
-    results["decode_forward_ms_per_step"] = avg_of_avg(timings["decode_forward"], num_iters) * 1000
-    results["sampling_ms_per_step"] = avg_of_avg(timings["sampling"], num_iters) * 1000
-    results["overhead_ms_per_step"] = avg_of_avg(timings["overhead"], num_iters) * 1000
-    results["total_ms"] = avg_last_n(timings["total"], num_iters) * 1000
-    results["decode_steps_total_ms"] = results["decode_forward_ms_per_step"] * max_new_tokens
-    results["sampling_total_ms"] = results["sampling_ms_per_step"] * max_new_tokens
-    results["overhead_total_ms"] = results["overhead_ms_per_step"] * max_new_tokens
-
-    return results
-
-
-def bench_hybrid_rollout(rollout, tokenizer, device, prompt_len=64, max_new_tokens=64, num_warmup=3, num_iters=10):
-    """Benchmark the full HybridEngineRollout.generate() path."""
-    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
-    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
-    sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0)
-    request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask)
-
-    times = []
-    for _ in range(num_warmup + num_iters):
-        get_accelerator().synchronize()  #ignore-cuda
-        t0 = time.perf_counter()
-        with torch.no_grad():
-            rollout.generate(request, sampling)
-        get_accelerator().synchronize()  #ignore-cuda
-        times.append(time.perf_counter() - t0)
-
-    import numpy as np
-    avg = np.mean(times[-num_iters:]) * 1000
-    return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens}
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", default="Qwen/Qwen2.5-0.5B-Instruct")
-    parser.add_argument("--prompt-len", type=int, default=64)
-    parser.add_argument("--max-new-tokens", type=int, default=64)
-    parser.add_argument("--num-warmup", type=int, default=3)
-    parser.add_argument("--num-iters", type=int, default=10)
-    args = parser.parse_args()
-
-    device = get_accelerator().current_device()  #ignore-cuda
-
-    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    model = AutoModelForCausalLM.from_pretrained(args.model, torch_dtype=torch.bfloat16).to(device)
-
-    print(f"=== Raw decode loop benchmark (model={args.model}) ===")
-    raw = bench_decode_raw(model, tokenizer, device, args.prompt_len, args.max_new_tokens, args.num_warmup,
-                           args.num_iters)
-    print(f"  Prefill:              {raw['prefill_ms']:.2f} ms")
-    print(
-        f"  Decode forward/step:  {raw['decode_forward_ms_per_step']:.3f} ms  (total: {raw['decode_steps_total_ms']:.1f} ms)"
-    )
-    print(f"  Sampling/step:        {raw['sampling_ms_per_step']:.3f} ms  (total: {raw['sampling_total_ms']:.1f} ms)")
-    print(f"  Overhead/step:        {raw['overhead_ms_per_step']:.3f} ms  (total: {raw['overhead_total_ms']:.1f} ms)")
-    print(f"  Total:                {raw['total_ms']:.1f} ms")
-
-    print(f"\n=== HybridEngineRollout benchmark ===")
-    rollout = HybridEngineRollout(model, tokenizer)
-    rr = bench_hybrid_rollout(rollout, tokenizer, device, args.prompt_len, args.max_new_tokens, args.num_warmup,
-                              args.num_iters)
-    print(f"  Rollout generate:     {rr['rollout_total_ms']:.1f} ms")
-
-    print(f"\n=== Summary ===")
-    print(f"  Raw decode loop:      {raw['total_ms']:.1f} ms")
-    print(f"  HybridEngine rollout: {rr['rollout_total_ms']:.1f} ms")
-    print(f"  Overhead (rollout - raw): {rr['rollout_total_ms'] - raw['total_ms']:.1f} ms")
-    print(
-        f"  Bottleneck: decode forward = {raw['decode_forward_ms_per_step']:.3f} ms/step x {args.max_new_tokens} steps = {raw['decode_steps_total_ms']:.1f} ms"
-    )
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmarks/opsd/bench_flashinfer.py b/benchmarks/opsd/bench_flashinfer.py
deleted file mode 100644
index 335a656c8b5f..000000000000
--- a/benchmarks/opsd/bench_flashinfer.py
+++ /dev/null
@@ -1,128 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Benchmark HybridEngineRollout with FlashInfer kernels enabled.
-
-Usage:
-    deepspeed --num_gpus 2 bench_flashinfer.py
-"""
-
-import argparse
-import os
-import time
-
-import deepspeed
-import numpy as np
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from deepspeed.accelerator import get_accelerator
-from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
-from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct")
-    parser.add_argument("--prompt-len", type=int, default=64)
-    parser.add_argument("--max-new-tokens", type=int, default=64)
-    parser.add_argument("--num-warmup", type=int, default=3)
-    parser.add_argument("--num-iters", type=int, default=10)
-    parser.add_argument("--no-flashinfer", action="store_true")
-    parser.add_argument("--graph-capture", action="store_true")
-    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
-    args = parser.parse_args()
-
-    local_rank = args.local_rank
-    world_size = int(os.environ.get("WORLD_SIZE", "1"))
-
-    deepspeed.init_distributed()
-
-    if local_rank == 0:
-        print(f"=== HybridEngineRollout Benchmark ===")
-        print(f"  Model:         {args.model}")
-        print(f"  TP size:       {world_size}")
-        print(f"  FlashInfer:    {not args.no_flashinfer}")
-        print(f"  Graph capture: {args.graph_capture}")
-        print()
-
-    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    model = AutoModelForCausalLM.from_pretrained(
-        args.model,
-        torch_dtype=torch.bfloat16,
-    )
-
-    ds_config = {
-        "bf16": {
-            "enabled": True
-        },
-        "zero_optimization": {
-            "stage": 0
-        },
-        "train_micro_batch_size_per_gpu": 1,
-        "train_batch_size": world_size,
-        "gradient_accumulation_steps": 1,
-        "tensor_parallel": {
-            "autotp_size": world_size,
-            "preset_model": "qwen2",
-        },
-    }
-
-    engine, *_ = deepspeed.initialize(
-        model=model,
-        optimizer=None,
-        model_parameters=model.parameters(),
-        config=ds_config,
-    )
-
-    if local_rank == 0:
-        param_count = sum(p.numel() for p in engine.parameters()) / 1e9
-        alloc = get_accelerator().memory_allocated(local_rank) / 1e9
-        print(f"  Parameters (local):  {param_count:.2f}B")
-        print(f"  GPU mem allocated:   {alloc:.1f} GB")
-        print()
-
-    use_flashinfer = not args.no_flashinfer
-    rollout = HybridEngineRollout(engine,
-                                  tokenizer,
-                                  use_flashinfer=use_flashinfer,
-                                  use_graph_capture=args.graph_capture)
-
-    device = torch.device(f"cuda:{local_rank}")
-    torch.manual_seed(42)
-    input_ids = torch.randint(10, 1000, (1, args.prompt_len), device=device)
-    attn_mask = torch.ones(1, args.prompt_len, dtype=torch.long, device=device)
-    sampling = SamplingConfig(max_new_tokens=args.max_new_tokens, temperature=1.0, top_p=1.0)
-    request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask)
-
-    times = []
-    for i in range(args.num_warmup + args.num_iters):
-        get_accelerator().synchronize()  #ignore-cuda
-        t0 = time.perf_counter()
-        with torch.no_grad():
-            result = rollout.generate(request, sampling)
-        get_accelerator().synchronize()  #ignore-cuda
-        elapsed = time.perf_counter() - t0
-        times.append(elapsed)
-        if local_rank == 0:
-            label = "warmup" if i < args.num_warmup else "iter"
-            n_tokens = result.input_ids.shape[-1] - args.prompt_len
-            print(f"  [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}")
-
-    if local_rank == 0:
-        avg = np.mean(times[-args.num_iters:]) * 1000
-        per_step = avg / args.max_new_tokens
-        throughput = 1000.0 / per_step
-        print()
-        mode = "FlashInfer" if use_flashinfer else "Baseline (SDPA)"
-        print(f"=== Results ({mode}) ===")
-        print(f"  Total generate:   {avg:.1f} ms")
-        print(f"  Per decode step:  {per_step:.2f} ms")
-        print(f"  Throughput:       {throughput:.1f} tokens/s")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmarks/opsd/bench_hybrid_tp.py b/benchmarks/opsd/bench_hybrid_tp.py
deleted file mode 100644
index e2430c3e65bc..000000000000
--- a/benchmarks/opsd/bench_hybrid_tp.py
+++ /dev/null
@@ -1,144 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# DeepSpeed Team
-"""Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2).
-
-Usage:
-    deepspeed --num_gpus 2 bench_hybrid_tp.py \
-        --model Qwen/Qwen2.5-14B-Instruct \
-        --max-new-tokens 64
-"""
-
-import argparse
-import os
-import time
-
-import deepspeed
-import numpy as np
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from deepspeed.accelerator import get_accelerator
-from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
-from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
-
-
-def bench_hybrid_rollout(rollout, tokenizer, prompt_len, max_new_tokens, num_warmup, num_iters):
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    device = torch.device(f"cuda:{local_rank}")
-
-    torch.manual_seed(42)
-    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
-    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
-    sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0)
-    request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask)
-
-    times = []
-    for i in range(num_warmup + num_iters):
-        get_accelerator().synchronize(device=device)  #ignore-cuda
-        t0 = time.perf_counter()
-        with torch.no_grad():
-            result = rollout.generate(request, sampling)
-        get_accelerator().synchronize(device=device)  #ignore-cuda
-        elapsed = time.perf_counter() - t0
-        times.append(elapsed)
-        if local_rank == 0:
-            label = "warmup" if i < num_warmup else "iter"
-            n_tokens = result.input_ids.shape[-1] - prompt_len
-            print(f"  [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}")
-
-    avg = np.mean(times[-num_iters:]) * 1000
-    return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens}
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct")
-    parser.add_argument("--prompt-len", type=int, default=64)
-    parser.add_argument("--max-new-tokens", type=int, default=64)
-    parser.add_argument("--num-warmup", type=int, default=3)
-    parser.add_argument("--num-iters", type=int, default=10)
-    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
-    args = parser.parse_args()
-
-    local_rank = args.local_rank
-    world_size = int(os.environ.get("WORLD_SIZE", "1"))
-
-    deepspeed.init_distributed()
-
-    if local_rank == 0:
-        print(f"=== HybridEngineRollout Benchmark (AutoTP={world_size}) ===")
-        print(f"  Model:       {args.model}")
-        print(f"  TP size:     {world_size}")
-        print(f"  Prompt len:  {args.prompt_len}")
-        print(f"  Decode len:  {args.max_new_tokens}")
-        print(f"  Warmup:      {args.num_warmup}")
-        print(f"  Iters:       {args.num_iters}")
-        print()
-
-    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    model = AutoModelForCausalLM.from_pretrained(
-        args.model,
-        torch_dtype=torch.bfloat16,
-    )
-
-    ds_config = {
-        "bf16": {
-            "enabled": True
-        },
-        "zero_optimization": {
-            "stage": 0
-        },
-        "train_micro_batch_size_per_gpu": 1,
-        "train_batch_size": world_size,
-        "gradient_accumulation_steps": 1,
-        "tensor_parallel": {
-            "autotp_size": world_size,
-            "preset_model": "qwen2",
-        },
-    }
-
-    engine, *_ = deepspeed.initialize(
-        model=model,
-        optimizer=None,
-        model_parameters=model.parameters(),
-        config=ds_config,
-    )
-
-    if local_rank == 0:
-        print("  DeepSpeed engine initialized.")
-        param_count = sum(p.numel() for p in engine.parameters()) / 1e9
-        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
-        print(f"  Parameters (local):  {param_count:.2f}B")
-        print(f"  GPU mem allocated:   {alloc:.1f} GB")
-        print()
-
-    rollout = HybridEngineRollout(engine, tokenizer)
-
-    if local_rank == 0:
-        print("  Running benchmark...")
-
-    result = bench_hybrid_rollout(
-        rollout,
-        tokenizer,
-        args.prompt_len,
-        args.max_new_tokens,
-        args.num_warmup,
-        args.num_iters,
-    )
-
-    if local_rank == 0:
-        total = result["rollout_total_ms"]
-        per_step = total / args.max_new_tokens
-        throughput = 1000.0 / per_step
-        print()
-        print(f"=== Results ===")
-        print(f"  Total generate:   {total:.1f} ms")
-        print(f"  Per decode step:  {per_step:.2f} ms")
-        print(f"  Throughput:       {throughput:.1f} tokens/s")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmarks/opsd/bench_hybrid_tp_opt.py b/benchmarks/opsd/bench_hybrid_tp_opt.py
deleted file mode 100644
index 248b97da6962..000000000000
--- a/benchmarks/opsd/bench_hybrid_tp_opt.py
+++ /dev/null
@@ -1,148 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# DeepSpeed Team
-"""Benchmark HybridEngineRollout with DeepSpeed AutoTP (TP=2) + optimizer.
-
-Usage:
-    deepspeed --num_gpus 2 bench_hybrid_tp_opt.py \
-        --model Qwen/Qwen2.5-14B-Instruct \
-        --max-new-tokens 64
-"""
-
-import argparse
-import os
-import time
-
-import deepspeed
-import numpy as np
-import torch
-from transformers import AutoModelForCausalLM, AutoTokenizer
-
-from deepspeed.accelerator import get_accelerator
-from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
-from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
-
-
-def bench_hybrid_rollout(rollout, tokenizer, prompt_len, max_new_tokens, num_warmup, num_iters):
-    local_rank = int(os.environ.get("LOCAL_RANK", 0))
-    device = torch.device(f"cuda:{local_rank}")
-
-    torch.manual_seed(42)
-    input_ids = torch.randint(10, 1000, (1, prompt_len), device=device)
-    attn_mask = torch.ones(1, prompt_len, dtype=torch.long, device=device)
-    sampling = SamplingConfig(max_new_tokens=max_new_tokens, temperature=1.0, top_p=1.0)
-    request = RolloutRequest(prompt_ids=input_ids, prompt_attention_mask=attn_mask)
-
-    times = []
-    for i in range(num_warmup + num_iters):
-        get_accelerator().synchronize(device=device)  #ignore-cuda
-        t0 = time.perf_counter()
-        with torch.no_grad():
-            result = rollout.generate(request, sampling)
-        get_accelerator().synchronize(device=device)  #ignore-cuda
-        elapsed = time.perf_counter() - t0
-        times.append(elapsed)
-        if local_rank == 0:
-            label = "warmup" if i < num_warmup else "iter"
-            n_tokens = result.input_ids.shape[-1] - prompt_len
-            print(f"  [{label}] {elapsed*1000:.1f} ms, tokens={n_tokens}")
-
-    avg = np.mean(times[-num_iters:]) * 1000
-    return {"rollout_total_ms": avg, "prompt_len": prompt_len, "max_new_tokens": max_new_tokens}
-
-
-def main():
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--model", default="Qwen/Qwen2.5-14B-Instruct")
-    parser.add_argument("--prompt-len", type=int, default=64)
-    parser.add_argument("--max-new-tokens", type=int, default=64)
-    parser.add_argument("--num-warmup", type=int, default=3)
-    parser.add_argument("--num-iters", type=int, default=10)
-    parser.add_argument("--local_rank", type=int, default=int(os.environ.get("LOCAL_RANK", 0)))
-    args = parser.parse_args()
-
-    local_rank = args.local_rank
-    world_size = int(os.environ.get("WORLD_SIZE", "1"))
-
-    deepspeed.init_distributed()
-
-    if local_rank == 0:
-        print(f"=== HybridEngineRollout Benchmark (AutoTP={world_size} + Optimizer) ===")
-        print(f"  Model:       {args.model}")
-        print(f"  TP size:     {world_size}")
-        print(f"  Prompt len:  {args.prompt_len}")
-        print(f"  Decode len:  {args.max_new_tokens}")
-        print()
-
-    tokenizer = AutoTokenizer.from_pretrained(args.model, padding_side="left")
-    if tokenizer.pad_token is None:
-        tokenizer.pad_token = tokenizer.eos_token
-
-    model = AutoModelForCausalLM.from_pretrained(
-        args.model,
-        torch_dtype=torch.bfloat16,
-    )
-
-    ds_config = {
-        "bf16": {
-            "enabled": True
-        },
-        "zero_optimization": {
-            "stage": 0
-        },
-        "train_micro_batch_size_per_gpu": 1,
-        "train_batch_size": world_size,
-        "gradient_accumulation_steps": 1,
-        "tensor_parallel": {
-            "autotp_size": world_size,
-            "preset_model": "qwen2",
-        },
-    }
-
-    engine, _, _, _ = deepspeed.initialize(
-        model=model,
-        model_parameters=model.parameters(),
-        config=ds_config,
-    )
-
-    if local_rank == 0:
-        print("  DeepSpeed engine initialized (with optimizer).")
-        param_count = sum(p.numel() for p in engine.parameters()) / 1e9
-        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
-        reserv = get_accelerator().memory_reserved(local_rank) / 1e9  #ignore-cuda
-        print(f"  Parameters (local):  {param_count:.2f}B")
-        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
-        reserv = get_accelerator().memory_reserved(local_rank) / 1e9  #ignore-cuda
-        print(f"  GPU mem allocated:   {alloc:.1f} GB")
-        print(f"  GPU mem reserved:    {reserv:.1f} GB")
-        print()
-
-    rollout = HybridEngineRollout(engine, tokenizer)
-
-    if local_rank == 0:
-        print("  Running benchmark...")
-
-    result = bench_hybrid_rollout(
-        rollout,
-        tokenizer,
-        args.prompt_len,
-        args.max_new_tokens,
-        args.num_warmup,
-        args.num_iters,
-    )
-
-    if local_rank == 0:
-        total = result["rollout_total_ms"]
-        per_step = total / args.max_new_tokens
-        throughput = 1000.0 / per_step
-        print()
-        print(f"=== Results ===")
-        print(f"  Total generate:   {total:.1f} ms")
-        print(f"  Per decode step:  {per_step:.2f} ms")
-        print(f"  Throughput:       {throughput:.1f} tokens/s")
-        alloc = get_accelerator().memory_allocated(local_rank) / 1e9  #ignore-cuda
-        reserv = get_accelerator().memory_reserved(local_rank) / 1e9  #ignore-cuda
-        print(f"  GPU mem (final):   alloc={alloc:.1f} GB, reserved={reserv:.1f} GB")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/benchmarks/opsd/bench_vllm_tp2.py b/benchmarks/opsd/bench_vllm_tp2.py
deleted file mode 100644
index 4351c722d32f..000000000000
--- a/benchmarks/opsd/bench_vllm_tp2.py
+++ /dev/null
@@ -1,44 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Benchmark vLLM TP=2 on 14B, 1P1R.
-
-Launched as a subprocess wrapper to avoid CUDA fork issues.
-"""
-import subprocess, sys, os
-
-script = '''
-import os
-os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
-import time
-from vllm import LLM, SamplingParams
-
-llm = LLM("Qwen/Qwen2.5-14B-Instruct", tensor_parallel_size=2,
-          gpu_memory_utilization=0.85, dtype="bfloat16", enforce_eager=True)
-sp = SamplingParams(max_tokens=256, temperature=0.8, top_p=0.95, n=1)
-prompt = "def fibonacci(n):"
-
-# warmup
-llm.generate([prompt], sp)
-
-times = []
-for i in range(5):
-    t0 = time.time()
-    out = llm.generate([prompt], sp)
-    times.append(time.time() - t0)
-
-t_avg = sum(times[1:]) / len(times[1:])
-total_toks = sum(len(o.token_ids) for r in out for o in r.outputs)
-print(f"vLLM TP=2 14B 1P1R: {total_toks} toks, {t_avg*1000:.1f}ms, {total_toks/t_avg:.1f} tok/s")
-print(f"Per-run: {[f'{t*1000:.0f}ms' for t in times]}")
-'''
-
-# Write to temp file and exec in a fresh process with no prior CUDA init
-tmp = "/tmp/bench_vllm_inner.py"
-with open(tmp, "w") as f:
-    f.write(script)
-
-env = os.environ.copy()
-env.pop("CUDA_VISIBLE_DEVICES", None)
-proc = subprocess.run([sys.executable, tmp], env=env)
-sys.exit(proc.returncode)
diff --git a/deepspeed/runtime/rlhf/__init__.py b/deepspeed/runtime/rlhf/__init__.py
deleted file mode 100644
index bb08d8fad529..000000000000
--- a/deepspeed/runtime/rlhf/__init__.py
+++ /dev/null
@@ -1,22 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""deepspeed.runtime.rlhf — Reinforcement Learning from Human Feedback runtime.
-
-Sub-modules
------------
-config   : Training, rollout, distillation, and data configuration dataclasses.
-losses   : Per-token KL / JSD divergence losses with sequence-axis chunking.
-utils    : Shared tensor / masking helpers.
-trainer  : Algorithm-specific training loops (OPSD, GRPO, …).
-"""
-
-from deepspeed.runtime.rlhf.config import (  # noqa: F401
-    OPSDConfig, StudentConfig, TeacherConfig, RolloutConfig, DistillationConfig, TrainingConfig, DataConfig,
-)
-from deepspeed.runtime.rlhf.losses import (  # noqa: F401
-    chunked_distillation_loss, streamed_distillation_loss, per_token_logprobs,
-)
-from deepspeed.runtime.rlhf.utils import (  # noqa: F401
-    build_response_mask, shift_for_next_token_prediction,
-)
diff --git a/deepspeed/runtime/rlhf/config.py b/deepspeed/runtime/rlhf/config.py
deleted file mode 100644
index 01ad0fe45b54..000000000000
--- a/deepspeed/runtime/rlhf/config.py
+++ /dev/null
@@ -1,174 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Configuration dataclasses for RLHF training.
-
-A single :class:`OPSDConfig` is loaded from a JSON file (see
-``examples/opsd/configs/`` for examples) and threaded through the rest of the
-pipeline. We use plain dataclasses instead of Hydra/pydantic to match the rest
-of the DeepSpeed codebase and to keep the dependency surface minimal.
-"""
-
-import json
-import os
-from dataclasses import dataclass, field, asdict
-from typing import List, Optional
-
-
-@dataclass
-class StudentConfig:
-    model_name_or_path: str
-    dtype: str = "bfloat16"
-    trust_remote_code: bool = False
-
-
-@dataclass
-class TeacherConfig:
-    model_name_or_path: str
-    dtype: str = "bfloat16"
-    trust_remote_code: bool = False
-    # Keep teacher params on CPU and gather per-forward via ZeRO-3. Saves GPU
-    # memory at the cost of host<->device transfer each step.
-    offload_to_cpu: bool = True
-
-
-@dataclass
-class RolloutConfig:
-    # "hybrid_engine" | "vllm"
-    engine: str = "hybrid_engine"
-
-    # Generation knobs (apply to either engine)
-    max_prompt_length: int = 1024
-    max_response_length: int = 1024
-    temperature: float = 0.0
-    top_p: float = 1.0
-    top_k: int = -1
-    n_samples_per_prompt: int = 1
-
-    # Use CUDA graph capture for greedy decode (temperature=0 only).
-    # Eliminates kernel launch overhead, ~3x faster for small models.
-    use_graph_capture: bool = False
-
-    # vLLM-specific. ``gpus`` is the disjoint set of CUDA device indices vLLM
-    # may use; the training ranks must not overlap with these. If None/empty,
-    # vLLM runs in "shared" mode on the same GPU as training rank 0.
-    # At construction time this field is populated from the
-    # ``ROLLOUT_VISIBLE_DEVICE`` environment variable (comma-separated device
-    # indices, e.g. ``ROLLOUT_VISIBLE_DEVICE=6,7``) when that variable is set,
-    # taking precedence over any value supplied in the JSON config.
-    gpus: Optional[List[int]] = None
-
-    def __post_init__(self):
-        env_gpus = os.environ.get("ROLLOUT_VISIBLE_DEVICE")
-        if env_gpus:
-            self.gpus = [int(g.strip()) for g in env_gpus.split(",")]
-
-    tensor_parallel_size: int = 1
-    gpu_memory_utilization: float = 0.85
-    engine_dtype: str = "bfloat16"
-    # Push student weights into vLLM every N optimizer steps. Larger values
-    # trade staleness for throughput.
-    weight_sync_interval: int = 1
-    # Pinned vLLM version known to expose the worker APIs we rely on.
-    vllm_min_version: str = "0.6.4"
-    # Skip CUDA-graph capture at vLLM startup. Saves several minutes of
-    # one-time compilation (worth it for smoke tests / short-lived runs);
-    # leave False for steady-state throughput.
-    vllm_enforce_eager: bool = False
-    # Port for the vLLM OpenAI-compatible API server. Only used when the
-    # vLLM rollout is configured to run as an external subprocess.
-    vllm_port: int = 8000
-    # Maximum seconds to wait for the vLLM server to become healthy.
-    vllm_start_timeout: int = 300
-    # Weight transfer backend for syncing student weights into vLLM.
-    # "auto"  – try GDR (GPU-direct) first, fall back to HTTP.
-    # "gdr"   – GPU-direct transfer (NCCL). Fastest but requires NVIDIA.
-    # "http"  – serialize tensors over HTTP. Slower but accelerator-agnostic.
-    weight_transfer_backend: str = "auto"
-    # Path to the Python interpreter that has vLLM installed.  When set, the
-    # vLLM server subprocess uses this interpreter instead of ``sys.executable``.
-    # Useful when vLLM lives in a separate virtual-env / conda env.
-    vllm_python: str = ""
-
-
-@dataclass
-class DistillationConfig:
-    # "forward_kl" | "reverse_kl" | "jsd"
-    loss_type: str = "reverse_kl"
-    temperature: float = 1.0
-    # Chunk size along the sequence dimension for the per-token divergence.
-    # Bounds peak memory: full [B, T, V] is never materialized at once when
-    # T > chunk_size.
-    chunk_size: int = 512
-
-
-@dataclass
-class TrainingConfig:
-    train_batch_size: int = 8
-    micro_batch_size_per_gpu: int = 1
-    gradient_accumulation_steps: int = 1
-    learning_rate: float = 1e-6
-    weight_decay: float = 0.0
-    num_train_epochs: int = 1
-    max_steps: int = -1
-    warmup_steps: int = 0
-    save_steps: int = 500
-    logging_steps: int = 10
-    save_dir: str = "./opsd_ckpt"
-    seed: int = 42
-
-
-@dataclass
-class DataConfig:
-    path: str = ""
-    prompt_field: str = "prompt"
-    # Optional HF chat template override; if None we use the student tokenizer's
-    # default.
-    chat_template: Optional[str] = None
-    shuffle: bool = True
-
-
-@dataclass
-class OPSDConfig:
-    student: StudentConfig
-    teacher: TeacherConfig
-    rollout: RolloutConfig = field(default_factory=RolloutConfig)
-    distillation: DistillationConfig = field(default_factory=DistillationConfig)
-    training: TrainingConfig = field(default_factory=TrainingConfig)
-    data: DataConfig = field(default_factory=DataConfig)
-    # Path to the DeepSpeed JSON config used for ``deepspeed.initialize`` on the
-    # student. Kept as a separate file because it has its own schema owned by
-    # DeepSpeed.
-    deepspeed_config: str = ""
-
-    @classmethod
-    def from_json(cls, path: str) -> "OPSDConfig":
-        with open(path, "r") as f:
-            raw = json.load(f)
-        return cls.from_dict(raw)
-
-    @classmethod
-    def from_dict(cls, raw: dict) -> "OPSDConfig":
-        return cls(
-            student=StudentConfig(**raw["student"]),
-            teacher=TeacherConfig(**raw["teacher"]),
-            rollout=RolloutConfig(**raw.get("rollout", {})),
-            distillation=DistillationConfig(**raw.get("distillation", {})),
-            training=TrainingConfig(**raw.get("training", {})),
-            data=DataConfig(**raw.get("data", {})),
-            deepspeed_config=raw.get("deepspeed_config", ""),
-        )
-
-    def to_dict(self) -> dict:
-        return asdict(self)
-
-    def validate(self) -> None:
-        if self.distillation.loss_type not in ("forward_kl", "reverse_kl", "jsd"):
-            raise ValueError(f"Unknown loss_type {self.distillation.loss_type!r}")
-        if self.rollout.engine not in ("hybrid_engine", "vllm"):
-            raise ValueError(f"Unknown rollout engine {self.rollout.engine!r}")
-        if self.distillation.chunk_size <= 0:
-            raise ValueError("distillation.chunk_size must be positive")
-        if self.rollout.weight_sync_interval != 1:
-            raise ValueError(f"rollout.weight_sync_interval must be 1 for on-policy distillation; "
-                             f"got {self.rollout.weight_sync_interval}")
diff --git a/deepspeed/runtime/rlhf/losses.py b/deepspeed/runtime/rlhf/losses.py
deleted file mode 100644
index ba717d2b260d..000000000000
--- a/deepspeed/runtime/rlhf/losses.py
+++ /dev/null
@@ -1,191 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Per-token distillation divergences with sequence-axis chunking.
-
-The full ``[B, T, V]`` tensor produced by a forward pass on a modern LLM can
-easily exceed several GB in fp32 (e.g. 8 * 1024 * 150k * 4 B ~ 4.9 GB). Holding
-both student *and* teacher logits at once would double that. We chunk along the
-sequence axis so the per-chunk softmax + difference only ever needs
-``[B, chunk, V]`` of working memory, regardless of T.
-
-Math conventions:
-    * ``forward_kl``  = D_KL(teacher || student) — mode-covering for student
-    * ``reverse_kl``  = D_KL(student || teacher) — mode-seeking for student
-    * ``jsd``         = 0.5 * D_KL(P || M) + 0.5 * D_KL(Q || M), M = (P+Q)/2
-
-All three follow the standard knowledge-distillation temperature convention:
-divide logits by T before softmax, then multiply the result by T**2 so that
-gradient magnitudes are comparable across temperatures.
-"""
-
-from typing import Callable
-
-import torch
-import torch.nn.functional as F
-
-
-def _forward_kl(student_logits: torch.Tensor, teacher_logits: torch.Tensor, temperature: float) -> torch.Tensor:
-    s_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
-    t_log_probs = F.log_softmax(teacher_logits / temperature, dim=-1)
-    t_probs = t_log_probs.exp()
-    kl = (t_probs * (t_log_probs - s_log_probs)).sum(dim=-1)
-    return kl * (temperature**2)
-
-
-def _reverse_kl(student_logits: torch.Tensor, teacher_logits: torch.Tensor, temperature: float) -> torch.Tensor:
-    s_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
-    t_log_probs = F.log_softmax(teacher_logits / temperature, dim=-1)
-    s_probs = s_log_probs.exp()
-    kl = (s_probs * (s_log_probs - t_log_probs)).sum(dim=-1)
-    return kl * (temperature**2)
-
-
-def _jsd(student_logits: torch.Tensor, teacher_logits: torch.Tensor, temperature: float) -> torch.Tensor:
-    s_log_probs = F.log_softmax(student_logits / temperature, dim=-1)
-    t_log_probs = F.log_softmax(teacher_logits / temperature, dim=-1)
-    s_probs = s_log_probs.exp()
-    t_probs = t_log_probs.exp()
-    m_probs = 0.5 * (s_probs + t_probs)
-    # Clamp guards against log(0) when both distributions have ~0 mass on the
-    # same vocab id (rare in practice but possible after temperature scaling).
-    m_log_probs = m_probs.clamp_min(1e-12).log()
-    kl_s = (s_probs * (s_log_probs - m_log_probs)).sum(dim=-1)
-    kl_t = (t_probs * (t_log_probs - m_log_probs)).sum(dim=-1)
-    return 0.5 * (kl_s + kl_t) * (temperature**2)
-
-
-_LOSS_FNS: "dict[str, Callable[..., torch.Tensor]]" = {
-    "forward_kl": _forward_kl,
-    "reverse_kl": _reverse_kl,
-    "jsd": _jsd,
-}
-
-
-def chunked_distillation_loss(
-    student_logits: torch.Tensor,
-    teacher_logits: torch.Tensor,
-    response_mask: torch.Tensor,
-    loss_type: str = "reverse_kl",
-    temperature: float = 1.0,
-    chunk_size: int = 512,
-) -> torch.Tensor:
-    """Mean per-token divergence over response positions, chunked over the
-    sequence axis to bound peak memory.
-
-    Args:
-        student_logits: ``[B, T, V]`` — gradient flows here.
-        teacher_logits: ``[B, T, V]`` — caller is responsible for ``detach()``
-            (we do not detach here so the function stays cheap).
-        response_mask:  ``[B, T]`` — 1 where the position should contribute to
-            the loss (i.e. response tokens, not prompt or padding), 0 elsewhere.
-        loss_type:      ``"forward_kl"`` | ``"reverse_kl"`` | ``"jsd"``.
-        temperature:    KD temperature; >1 softens both distributions.
-        chunk_size:     Sequence-axis chunk size.
-
-    Returns:
-        Scalar loss = sum-over-positions(per_tok * mask) / sum(mask), promoted
-        to fp32 internally for numerical stability.
-    """
-    if loss_type not in _LOSS_FNS:
-        raise ValueError(f"Unknown loss_type {loss_type!r}; choose from {sorted(_LOSS_FNS)}")
-    fn = _LOSS_FNS[loss_type]
-
-    if student_logits.shape != teacher_logits.shape:
-        raise ValueError(f"shape mismatch: student {tuple(student_logits.shape)} vs teacher "
-                         f"{tuple(teacher_logits.shape)}")
-    B, T, _ = student_logits.shape
-    if response_mask.shape != (B, T):
-        raise ValueError(f"response_mask {tuple(response_mask.shape)} does not match logits "
-                         f"prefix ({B}, {T})")
-
-    mask_f = response_mask.to(torch.float32)
-    total_tokens = mask_f.sum().clamp_min(1.0)
-    total_loss = student_logits.new_zeros((), dtype=torch.float32)
-
-    for start in range(0, T, chunk_size):
-        end = min(start + chunk_size, T)
-        chunk_mask = mask_f[:, start:end]
-        # Skipping empty chunks avoids a redundant forward through the softmax
-        # path on chunks that wouldn't contribute anything to the sum.
-        if chunk_mask.sum().item() == 0:
-            continue
-        per_tok = fn(
-            student_logits[:, start:end].float(),
-            teacher_logits[:, start:end].float(),
-            temperature,
-        )
-        total_loss = total_loss + (per_tok * chunk_mask).sum()
-
-    return total_loss / total_tokens
-
-
-def streamed_distillation_loss(
-    student_logits: torch.Tensor,
-    teacher_chunk_fetcher: Callable[[int, int], torch.Tensor],
-    response_mask: torch.Tensor,
-    loss_type: str = "reverse_kl",
-    temperature: float = 1.0,
-    chunk_size: int = 512,
-) -> torch.Tensor:
-    """Same math as :func:`chunked_distillation_loss`, but teacher logits are
-    pulled chunk-by-chunk via a fetcher so the full ``[B, T, V]`` teacher
-    tensor never needs to live on the same device as the student.
-
-    Args:
-        student_logits: ``[B, T, V]`` on the training device.
-        teacher_chunk_fetcher: ``fn(start, end) -> [B, end - start, V]``, already
-            on the same device and broadcastable dtype as ``student_logits``.
-            Typically wraps ``TeacherLogitCache.chunk_to_device``.
-        response_mask:  ``[B, T]`` — 1 where the position should contribute.
-        loss_type:      one of ``"forward_kl" | "reverse_kl" | "jsd"``.
-        temperature:    KD temperature.
-        chunk_size:     Sequence-axis chunk size.
-    """
-    if loss_type not in _LOSS_FNS:
-        raise ValueError(f"Unknown loss_type {loss_type!r}; choose from {sorted(_LOSS_FNS)}")
-    fn = _LOSS_FNS[loss_type]
-
-    B, T, _ = student_logits.shape
-    if response_mask.shape != (B, T):
-        raise ValueError(f"response_mask {tuple(response_mask.shape)} does not match logits "
-                         f"prefix ({B}, {T})")
-
-    mask_f = response_mask.to(torch.float32)
-    total_tokens = mask_f.sum().clamp_min(1.0)
-    total_loss = student_logits.new_zeros((), dtype=torch.float32)
-
-    for start in range(0, T, chunk_size):
-        end = min(start + chunk_size, T)
-        chunk_mask = mask_f[:, start:end]
-        if chunk_mask.sum().item() == 0:
-            continue
-        teacher_chunk = teacher_chunk_fetcher(start, end)
-        if teacher_chunk.shape[1] != (end - start):
-            raise RuntimeError(f"fetcher returned chunk of length {teacher_chunk.shape[1]}, "
-                               f"expected {end - start}")
-        per_tok = fn(
-            student_logits[:, start:end].float(),
-            teacher_chunk.float(),
-            temperature,
-        )
-        total_loss = total_loss + (per_tok * chunk_mask).sum()
-
-    return total_loss / total_tokens
-
-
-def per_token_logprobs(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
-    """Gather log p(label_t | context_<t) for each position.
-
-    Convenience helper used by the trainer for diagnostic logging (e.g. mean
-    student log-prob of the rollout, mean teacher log-prob of the rollout).
-
-    Args:
-        logits: ``[B, T, V]``.
-        labels: ``[B, T]`` token ids.
-
-    Returns:
-        ``[B, T]`` of log-probabilities.
-    """
-    log_probs = F.log_softmax(logits.float(), dim=-1)
-    return log_probs.gather(-1, labels.unsqueeze(-1)).squeeze(-1)
diff --git a/deepspeed/runtime/rlhf/trainer/__init__.py b/deepspeed/runtime/rlhf/trainer/__init__.py
deleted file mode 100644
index 3086ddac2cde..000000000000
--- a/deepspeed/runtime/rlhf/trainer/__init__.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""deepspeed.runtime.rlhf.trainer — RLHF training-loop implementations."""
-
-from deepspeed.runtime.rlhf.trainer.base import RLHFTrainer  # noqa: F401
-from deepspeed.runtime.rlhf.trainer.opsd import OPSDTrainer  # noqa: F401
diff --git a/deepspeed/runtime/rlhf/trainer/base.py b/deepspeed/runtime/rlhf/trainer/base.py
deleted file mode 100644
index 4a5451a40f66..000000000000
--- a/deepspeed/runtime/rlhf/trainer/base.py
+++ /dev/null
@@ -1,41 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Abstract base class for RLHF trainers.
-
-Concrete implementations (e.g. :class:`~deepspeed.runtime.rlhf.trainer.opsd.OPSDTrainer`)
-inherit from :class:`RLHFTrainer` and implement the abstract methods to define
-their algorithm-specific training loop.
-"""
-
-from abc import ABC, abstractmethod
-from typing import Any
-
-
-class RLHFTrainer(ABC):
-    """Base class for all RLHF training loops.
-
-    Subclasses must implement :meth:`train` and :meth:`_train_step`.  The base
-    class deliberately imposes no constraints on the constructor signature so
-    each algorithm can accept whatever components it needs (rollout engine,
-    reference model, reward model, etc.).
-    """
-
-    @abstractmethod
-    def train(self) -> None:
-        """Run the full training loop (all epochs / steps)."""
-        ...
-
-    @abstractmethod
-    def _train_step(self, batch: Any) -> dict:
-        """Execute one optimizer step and return a metrics dict.
-
-        Args:
-            batch: A single batch from the dataloader.  The expected structure
-                is algorithm-specific.
-
-        Returns:
-            A ``dict`` of scalar metrics (``loss``, timing fields, token
-            counts, …) suitable for logging.
-        """
-        ...
diff --git a/deepspeed/runtime/rlhf/trainer/opsd.py b/deepspeed/runtime/rlhf/trainer/opsd.py
deleted file mode 100644
index 92df42cbbe2b..000000000000
--- a/deepspeed/runtime/rlhf/trainer/opsd.py
+++ /dev/null
@@ -1,198 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""On-policy distillation (OPSD) training loop.
-
-Each step is three phases:
-
-  0. **Rollout.** The student generates responses for the batch's prompts
-     (via the configured :class:`~deepspeed.runtime.rollout.RolloutEngine` —
-     hybrid engine or vLLM).
-  1. **Teacher.** The frozen teacher runs a forward over prompt+response. The
-     full logit tensor is parked on the host via
-     :class:`~opsd.teacher.TeacherLogitCache` so teacher GPU buffers can be
-     released before the student backward.
-  2. **Student.** The student runs forward+backward on prompt+response. The
-     loss is the per-token divergence to the teacher, streamed from the
-     host-resident cache one sequence chunk at a time
-     (:func:`~deepspeed.runtime.rlhf.losses.streamed_distillation_loss`), so
-     the full ``[B, T, V]`` teacher tensor never co-resides with the student
-     logits on the training device.
-
-The trainer itself contains no DeepSpeed-specific control flow beyond the
-``backward`` / ``step`` calls on the student engine; backend choice (ZeRO
-stage, offload, hybrid engine) is owned entirely by the DeepSpeed JSON config.
-"""
-
-import os
-import time
-from typing import Any
-
-import torch
-from deepspeed import comm as dist
-from deepspeed.accelerator import get_accelerator
-
-from deepspeed.runtime.rlhf.config import OPSDConfig
-from deepspeed.runtime.rlhf.losses import streamed_distillation_loss
-from deepspeed.runtime.rlhf.trainer.base import RLHFTrainer
-from deepspeed.runtime.rlhf.utils import build_response_mask
-from deepspeed.runtime.rollout import RolloutEngine, RolloutRequest, SamplingConfig
-
-
-def _is_rank_zero() -> bool:
-    return (not dist.is_initialized()) or dist.get_rank() == 0
-
-
-class OPSDTrainer(RLHFTrainer):
-
-    def __init__(
-        self,
-        cfg: OPSDConfig,
-        student_engine: Any,
-        teacher: Any,
-        tokenizer: Any,
-        rollout: RolloutEngine,
-        dataloader: Any,
-    ):
-        self.cfg = cfg
-        self.student_engine = student_engine
-        self.teacher = teacher
-        self.tokenizer = tokenizer
-        self.rollout = rollout
-        self.dataloader = dataloader
-
-        self.device = get_accelerator().current_device_name()
-        self.step = 0
-
-    # ------------------------------------------------------------------
-    # Driver
-    # ------------------------------------------------------------------
-
-    def train(self) -> None:
-        max_steps = self.cfg.training.max_steps
-        for epoch in range(self.cfg.training.num_train_epochs):
-            for batch in self.dataloader:
-                if max_steps > 0 and self.step >= max_steps:
-                    return
-                metrics = self._train_step(batch)
-                self._maybe_log(metrics)
-                self._maybe_save()
-                self.step += 1
-            if max_steps > 0 and self.step >= max_steps:
-                return
-
-    # ------------------------------------------------------------------
-    # One step
-    # ------------------------------------------------------------------
-
-    def _train_step(self, batch) -> dict:
-        t_start = time.time()
-
-        prompt_ids = batch["prompt_ids"].to(self.device, non_blocking=True)
-        prompt_attn = batch["prompt_attention_mask"].to(self.device, non_blocking=True)
-
-        # Sync student weights into the rollout backend.
-        # No-op for hybrid engine; meaningful for vLLM.
-        self.rollout.sync_weights(self.step)
-
-        # --- Phase 0: rollout (student generates responses) ---------------
-        # Switch hybrid engine to inference mode (gathers ZeRO-3 params).
-        self.student_engine.eval()
-        sampling = SamplingConfig(
-            max_new_tokens=self.cfg.rollout.max_response_length,
-            temperature=self.cfg.rollout.temperature,
-            top_p=self.cfg.rollout.top_p,
-            top_k=self.cfg.rollout.top_k,
-            n_samples_per_prompt=self.cfg.rollout.n_samples_per_prompt,
-        )
-        roll = self.rollout.generate(
-            RolloutRequest(prompt_ids=prompt_ids, prompt_attention_mask=prompt_attn),
-            sampling,
-        )
-        input_ids = roll.input_ids.to(self.device, non_blocking=True)
-        attention_mask = roll.attention_mask.to(self.device, non_blocking=True)
-        response_start_idx = roll.response_start_idx.to(self.device, non_blocking=True)
-        response_mask = build_response_mask(response_start_idx, attention_mask)
-        t_rollout = time.time() - t_start
-
-        # --- Phase 1: teacher forward → host-cached logits ----------------
-        t1 = time.time()
-        teacher_cache = self.teacher.forward_to_cache(input_ids, attention_mask)
-        t_teacher = time.time() - t1
-
-        # --- Phase 2: student forward + streamed KL + backward ------------
-        t2 = time.time()
-        self.student_engine.train()
-        outputs = self.student_engine(input_ids=input_ids, attention_mask=attention_mask)
-        student_logits = outputs.logits  # [B, T, V]
-
-        # Shift for next-token prediction: logits at position t predict token
-        # at t+1, so the loss aligns student_logits[:, :-1] with the position
-        # t+1 entries of the response mask.
-        student_logits_shifted = student_logits[:, :-1, :]
-        mask_shifted = response_mask[:, 1:].contiguous()
-
-        def _fetch(start: int, end: int) -> torch.Tensor:
-            # The cache holds *unshifted* teacher logits; for the next-token
-            # objective we ask the cache for positions [start, end) of the
-            # shifted teacher, which is positions [start, end) of the original
-            # since we already lopped off the final column in the student.
-            return teacher_cache.chunk_to_device(start,
-                                                 end,
-                                                 device=student_logits_shifted.device,
-                                                 dtype=student_logits_shifted.dtype)
-
-        loss = streamed_distillation_loss(
-            student_logits=student_logits_shifted,
-            teacher_chunk_fetcher=_fetch,
-            response_mask=mask_shifted,
-            loss_type=self.cfg.distillation.loss_type,
-            temperature=self.cfg.distillation.temperature,
-            chunk_size=self.cfg.distillation.chunk_size,
-        )
-
-        self.student_engine.backward(loss)
-        self.student_engine.step()
-
-        teacher_cache.free()
-        t_student = time.time() - t2
-
-        # Reduce loss across ranks for a clean log line.
-        loss_for_log = loss.detach().clone()
-        if dist.is_initialized():
-            dist.all_reduce(loss_for_log)
-            loss_for_log /= dist.get_world_size()
-
-        return {
-            "loss": float(loss_for_log.item()),
-            "rollout_s": t_rollout,
-            "teacher_s": t_teacher,
-            "student_s": t_student,
-            "step_s": time.time() - t_start,
-            "response_tokens": int(mask_shifted.sum().item()),
-        }
-
-    # ------------------------------------------------------------------
-    # Logging / checkpointing
-    # ------------------------------------------------------------------
-
-    def _maybe_log(self, metrics: dict) -> None:
-        if self.step % self.cfg.training.logging_steps != 0:
-            return
-        if not _is_rank_zero():
-            return
-        print(f"[opsd][step {self.step}] loss={metrics['loss']:.4f} "
-              f"rollout={metrics['rollout_s']:.2f}s teacher={metrics['teacher_s']:.2f}s "
-              f"student={metrics['student_s']:.2f}s step={metrics['step_s']:.2f}s "
-              f"resp_tok={metrics['response_tokens']}")
-
-    def _maybe_save(self) -> None:
-        if self.step == 0:
-            return
-        if self.step % self.cfg.training.save_steps != 0:
-            return
-        tag = f"step_{self.step}"
-        os.makedirs(self.cfg.training.save_dir, exist_ok=True)
-        self.student_engine.save_checkpoint(self.cfg.training.save_dir, tag=tag)
-        if _is_rank_zero():
-            print(f"[opsd] saved checkpoint to {self.cfg.training.save_dir}/{tag}")
diff --git a/deepspeed/runtime/rlhf/utils.py b/deepspeed/runtime/rlhf/utils.py
deleted file mode 100644
index 1e97a4b7706a..000000000000
--- a/deepspeed/runtime/rlhf/utils.py
+++ /dev/null
@@ -1,51 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Small tensor/masking helpers shared by trainer, losses, and tests.
-
-These intentionally stay free of DeepSpeed / distributed imports so the
-non-distributed unit tests can exercise them on CPU without a torchrun
-launcher.
-"""
-
-import torch
-
-
-def build_response_mask(response_start_idx: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
-    """Mark positions belonging to the response (not prompt, not padding).
-
-    Args:
-        response_start_idx: ``[B]`` int tensor — the first column index that is
-            part of the response, per sample. For *right-padded* prompts this
-            equals the prompt's token count; for the more common *left-padded*
-            convention used by causal generation it equals the prompt section
-            length (i.e. the column where prompt ends and response begins).
-        attention_mask: ``[B, T]`` — 1 on real tokens (prompt + response), 0 on
-            padding.
-
-    Returns:
-        ``[B, T]`` 0/1 mask with the same dtype as ``attention_mask``. 1 only
-        at positions ``t >= response_start_idx[b]`` that are also attended.
-    """
-    if response_start_idx.dim() != 1:
-        raise ValueError(f"response_start_idx must be 1-D, got shape {tuple(response_start_idx.shape)}")
-    if attention_mask.dim() != 2:
-        raise ValueError(f"attention_mask must be 2-D, got shape {tuple(attention_mask.shape)}")
-    B, T = attention_mask.shape
-    if response_start_idx.shape[0] != B:
-        raise ValueError(f"response_start_idx batch ({response_start_idx.shape[0]}) != "
-                         f"attention_mask batch ({B})")
-
-    pos = torch.arange(T, device=attention_mask.device).unsqueeze(0).expand(B, T)
-    is_response = pos >= response_start_idx.to(pos.dtype).unsqueeze(1)
-    return is_response.to(attention_mask.dtype) * attention_mask
-
-
-def shift_for_next_token_prediction(logits: torch.Tensor, labels: torch.Tensor):
-    """Align logits at position t with the label at position t+1.
-
-    Returns:
-        Tuple ``(shifted_logits[:, :-1, :], shifted_labels[:, 1:])`` — both
-        contiguous, so they can be safely indexed for the divergence loss.
-    """
-    return logits[:, :-1, :].contiguous(), labels[:, 1:].contiguous()
diff --git a/deepspeed/runtime/rollout/__init__.py b/deepspeed/runtime/rollout/__init__.py
index db126ab120fd..16f6fc595da6 100644
--- a/deepspeed/runtime/rollout/__init__.py
+++ b/deepspeed/runtime/rollout/__init__.py
@@ -7,43 +7,37 @@
   - :class:`RolloutEngine` — abstract base class
   - :class:`RolloutRequest`, :class:`RolloutBatch`, :class:`SamplingConfig` — dataclasses
   - :class:`HybridEngineRollout` — concrete implementation using DeepSpeed hybrid engine
-  - :class:`VLLMRollout` — concrete implementation using an external vLLM server
   - :func:`build_rollout` — factory that selects the engine from config
 """
 
 from deepspeed.runtime.rollout.base import (
     RolloutBatch,
+    RolloutConfig,
     RolloutEngine,
     RolloutRequest,
     SamplingConfig,
 )
 from deepspeed.runtime.rollout.hybrid_engine_rollout import HybridEngineRollout
-from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout, stitch_rollout
 
 __all__ = [
     "HybridEngineRollout",
     "RolloutBatch",
+    "RolloutConfig",
     "RolloutEngine",
     "RolloutRequest",
     "SamplingConfig",
-    "VLLMRollout",
     "build_rollout",
-    "stitch_rollout",
 ]
 
 
-def build_rollout(rollout_cfg, student_engine=None, tokenizer=None, student_model_path=None):
+def build_rollout(rollout_cfg, student_engine=None, tokenizer=None, **kwargs):
     """Factory: construct the rollout engine specified by ``rollout_cfg.engine``.
 
-    Imports of heavy backends are deferred so that selecting the hybrid-engine
-    path doesn't transitively require vLLM (and vice versa).
-
     Args:
         rollout_cfg: :class:`RolloutConfig` (or any object with an ``engine``
-            attribute set to ``"hybrid_engine"`` or ``"vllm"``).
+            attribute set to ``"hybrid_engine"``).
         student_engine: DeepSpeed engine wrapping the student model.
         tokenizer: HuggingFace tokenizer.
-        student_model_path: Model name/path for vLLM to load from disk.
     """
     engine_name = rollout_cfg.engine
     if engine_name == "hybrid_engine":
@@ -51,14 +45,4 @@ def build_rollout(rollout_cfg, student_engine=None, tokenizer=None, student_mode
             raise ValueError("hybrid_engine rollout needs both student_engine and tokenizer")
         return HybridEngineRollout(engine=student_engine, tokenizer=tokenizer, cfg=rollout_cfg)
 
-    if engine_name == "vllm":
-        if tokenizer is None:
-            raise ValueError("vllm rollout needs a tokenizer for length accounting")
-        return VLLMRollout(
-            cfg=rollout_cfg,
-            tokenizer=tokenizer,
-            student_engine=student_engine,
-            student_model_path=student_model_path,
-        )
-
-    raise ValueError(f"Unknown rollout engine {engine_name!r}; choose from 'hybrid_engine' | 'vllm'")
+    raise ValueError(f"Unknown rollout engine {engine_name!r}; choose from 'hybrid_engine'")
diff --git a/deepspeed/runtime/rollout/_vllm_compat/__init__.py b/deepspeed/runtime/rollout/_vllm_compat/__init__.py
deleted file mode 100644
index bbec52ed50ee..000000000000
--- a/deepspeed/runtime/rollout/_vllm_compat/__init__.py
+++ /dev/null
@@ -1,3 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
diff --git a/deepspeed/runtime/rollout/_vllm_compat/sitecustomize.py b/deepspeed/runtime/rollout/_vllm_compat/sitecustomize.py
deleted file mode 100644
index c490fd71a261..000000000000
--- a/deepspeed/runtime/rollout/_vllm_compat/sitecustomize.py
+++ /dev/null
@@ -1,74 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Site-customization hook injected into the vLLM server subprocess.
-
-When this file is on ``PYTHONPATH`` (or placed as ``sitecustomize.py`` in a
-directory listed in ``PYTHONPATH``), Python executes it automatically before
-running the main script.  It is used solely by
-:class:`~deepspeed.runtime.rollout.vllm_rollout.VLLMRollout` to patch a known
-compatibility issue between **vLLM 0.22.0** and certain ``pydantic-core``
-builds that hit a Rust-level assertion::
-
-    pydantic_core._pydantic_core.ValidationError:
-      Assertion failed, duplicate template name
-
-The patch is **harmless on systems that don't need it** — the original
-``validate_python`` call is attempted first and only on ``Exception`` do we
-fall back to plain dataclass field assignment.
-
-This file is NOT a monkey-patch on installed packages.  It patches the
-*behaviour at runtime* by replacing the ``__init__`` method that pydantic's
-``@dataclass`` decorator installs on each decorated class.  No files under
-``site-packages/`` are modified.
-"""
-
-import dataclasses as _dc
-
-
-def _install_pydantic_dataclass_fallback():
-    try:
-        import pydantic._internal._dataclasses as _pdc
-    except ImportError:
-        return
-
-    if getattr(_pdc, "_deepspeed_patched", False):
-        return
-
-    def _make_safe_init(original_init):
-
-        def _safe_init(__dataclass_self__, *args, **kwargs):
-            __tracebackhide__ = True
-            try:
-                original_init(__dataclass_self__, *args, **kwargs)
-            except Exception:
-                s = __dataclass_self__
-                kw = dict(zip([f.name for f in _dc.fields(s.__class__)], args))
-                kw.update(kwargs)
-                for f in _dc.fields(s.__class__):
-                    if f.name in kw:
-                        object.__setattr__(s, f.name, kw[f.name])
-                    elif f.default is not _dc.MISSING:
-                        object.__setattr__(s, f.name, f.default)
-                    elif f.default_factory is not _dc.MISSING:
-                        object.__setattr__(s, f.name, f.default_factory())
-                    else:
-                        object.__setattr__(s, f.name, None)
-
-        _safe_init.__qualname__ = original_init.__qualname__
-        return _safe_init
-
-    _original_complete = _pdc.complete_dataclass
-
-    def _patched_complete(cls, config_wrapper, *, raise_errors=False):
-        result = _original_complete(cls, config_wrapper, raise_errors=raise_errors)
-        if hasattr(cls, "__init__"):
-            original_init = cls.__init__
-            cls.__init__ = _make_safe_init(original_init)
-        return result
-
-    _pdc.complete_dataclass = _patched_complete
-    _pdc._deepspeed_patched = True
-
-
-_install_pydantic_dataclass_fallback()
diff --git a/deepspeed/runtime/rollout/base.py b/deepspeed/runtime/rollout/base.py
index 55695a2a6bb6..af215ac3998c 100644
--- a/deepspeed/runtime/rollout/base.py
+++ b/deepspeed/runtime/rollout/base.py
@@ -15,6 +15,23 @@
 import torch
 
 
+@dataclass
+class RolloutConfig:
+    """Configuration for the rollout engine."""
+    engine: str = "hybrid_engine"
+
+    # Generation knobs
+    max_prompt_length: int = 1024
+    max_response_length: int = 1024
+    temperature: float = 1.0
+    top_p: float = 1.0
+    top_k: int = -1
+    n_samples_per_prompt: int = 1
+
+    # Use CUDA graph capture for decode acceleration.
+    use_graph_capture: bool = False
+
+
 @dataclass
 class SamplingConfig:
     """Sampling knobs that the trainer passes to ``generate`` each step."""
diff --git a/deepspeed/runtime/rollout/vllm_rollout.py b/deepspeed/runtime/rollout/vllm_rollout.py
deleted file mode 100644
index 1f40d42dd62d..000000000000
--- a/deepspeed/runtime/rollout/vllm_rollout.py
+++ /dev/null
@@ -1,541 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""vLLM rollout via an external OpenAI-compatible server process.
-
-**Architecture**
-    Training ranks run under the DeepSpeed launcher as usual.  Rank 0 lazily
-    spawns ``python -m vllm.entrypoints.openai.api_server ...`` as a
-    **separate subprocess** with its own CUDA device visibility, then
-    communicates with it over HTTP using the OpenAI-compatible completions
-    API.  Other ranks receive generated token ids by broadcast from rank 0
-    (:func:`deepspeed.comm.broadcast_object_list`).
-
-**Why a subprocess?**
-    vLLM's worker initialisation calls ``new_group(...)`` on the global
-    process group as a collective.  Under the DeepSpeed launcher the world
-    spans *all* training ranks, but only rank 0 talks to vLLM.  Running
-    vLLM in-process therefore deadlocks.  The subprocess approach gives
-    vLLM its own world (size = TP) and avoids the conflict entirely.
-
-**GPU placement**
-    ``cfg.gpus`` controls which physical GPUs the vLLM server sees via
-    ``CUDA_VISIBLE_DEVICES``.  These may be disjoint from the training GPUs
-    (the safe default) or overlap when ``cfg.gpus`` is empty (shared mode,
-    which requires the training loop to release GPU memory first).
-
-**Weight sync (vLLM >= 0.22.0)**
-    vLLM 0.22.0 exposes an RLHF weight-transfer API when started with
-    ``VLLM_SERVER_DEV_MODE=1`` and ``--weight-transfer-config``.  The
-    protocol is: ``pause`` -> ``start_weight_update`` ->
-    ``update_weights`` -> ``finish_weight_update`` -> ``resume``.
-
-    Two transport backends are supported:
-
-    * **GDR** (GPU-direct) – NCCL broadcast over a
-      ``StatelessProcessGroup``.  Fastest, but requires NCCL (NVIDIA).
-    * **HTTP** – serialize tensors and send over HTTP.  Slower but
-      accelerator-agnostic.
-
-    When ``weight_transfer_backend="auto"`` (default), GDR is tried
-    first and falls back to HTTP if NCCL is unavailable.
-"""
-
-import logging
-import os
-import socket
-import signal
-import subprocess
-import sys
-import threading
-import time
-from typing import Any, Dict, List, Optional, Tuple
-
-import requests
-import torch
-
-from deepspeed.runtime.rlhf.config import RolloutConfig
-from deepspeed.runtime.rollout.base import RolloutBatch, RolloutEngine, RolloutRequest, SamplingConfig
-
-logger = logging.getLogger(__name__)
-
-_HTTP_TIMEOUT = 120
-_VLLM_NCCL_BACKEND = "nccl"
-
-
-def _gdr_available() -> bool:
-    try:
-        return torch.cuda.is_available() and torch.cuda.nccl.version() is not None  #ignore-cuda
-    except Exception:
-        return False
-
-
-def _is_rank_zero() -> bool:
-    from deepspeed import comm as dist
-
-    return (not dist.is_initialized()) or dist.get_rank() == 0
-
-
-def stitch_rollout(
-    prompt_ids: torch.Tensor,
-    prompt_attention_mask: torch.Tensor,
-    responses: List[List[int]],
-    pad_id: int,
-    n_samples_per_prompt: int,
-) -> RolloutBatch:
-    """Stitch left-padded prompts and per-sample response token ids into one
-    right-padded ``RolloutBatch``.
-
-    This is the only piece of vLLM-side post-processing that doesn't depend
-    on a live server, so we factor it out for CPU unit testing.
-
-    Args:
-        prompt_ids: ``[B, T_p]`` left-padded prompts.
-        prompt_attention_mask: ``[B, T_p]`` matching attention mask.
-        responses: list of length ``B * n_samples_per_prompt``; each element
-            is the list of generated token ids for one (prompt, sample).
-        pad_id: pad token used for both prompt left-padding and response
-            right-padding (typically the tokenizer's ``pad_token_id`` or
-            ``eos_token_id``).
-        n_samples_per_prompt: number of generated samples per prompt.
-
-    Returns:
-        :class:`RolloutBatch` with ``response_start_idx = T_p`` for every
-        sample.
-    """
-    B, T_p = prompt_ids.shape
-    n = n_samples_per_prompt
-    expected = B * n
-    if len(responses) != expected:
-        raise ValueError(f"expected {expected} response token-id lists "
-                         f"(B={B} * n_samples={n}); got {len(responses)}")
-
-    if responses:
-        max_response_len = max(len(r) for r in responses)
-    else:
-        max_response_len = 0
-    T_total = T_p + max_response_len
-    device = prompt_ids.device
-
-    out_ids = torch.full((expected, T_total), pad_id, dtype=torch.long, device=device)
-    out_attn = torch.zeros((expected, T_total), dtype=prompt_attention_mask.dtype, device=device)
-
-    prompts_expanded = prompt_ids.repeat_interleave(n, dim=0)
-    attn_expanded = prompt_attention_mask.repeat_interleave(n, dim=0)
-    out_ids[:, :T_p] = prompts_expanded
-    out_attn[:, :T_p] = attn_expanded
-
-    for i, resp in enumerate(responses):
-        L = len(resp)
-        if L == 0:
-            continue
-        out_ids[i, T_p:T_p + L] = torch.tensor(resp, dtype=torch.long, device=device)
-        out_attn[i, T_p:T_p + L] = 1
-
-    response_start_idx = torch.full((expected, ), T_p, dtype=torch.long, device=device)
-    return RolloutBatch(input_ids=out_ids, attention_mask=out_attn, response_start_idx=response_start_idx)
-
-
-class VLLMRollout(RolloutEngine):
-
-    name = "vllm"
-
-    def __init__(
-        self,
-        cfg: RolloutConfig,
-        tokenizer: Any,
-        student_engine: Any = None,
-        student_model_path: Optional[str] = None,
-    ):
-        if cfg.engine != "vllm":
-            raise ValueError(f"RolloutConfig.engine must be 'vllm'; got {cfg.engine!r}")
-        if student_model_path is None:
-            raise ValueError("VLLMRollout needs student_model_path to initialise the vLLM engine "
-                             "(it loads weights from disk at construction time)")
-
-        self.cfg = cfg
-        self.tokenizer = tokenizer
-        self.student_engine = student_engine
-        self._model_path = student_model_path
-
-        self.is_rank_zero = _is_rank_zero()
-        self._server_proc: Optional[subprocess.Popen] = None
-        self._base_url = f"http://localhost:{cfg.vllm_port}"
-        self._ready = False
-
-        self._nccl_group = None
-        self._weight_transfer_inited = False
-
-        backend = cfg.weight_transfer_backend
-        if backend == "auto":
-            backend = "gdr" if _gdr_available() else "http"
-        if backend not in ("gdr", "http"):
-            raise ValueError(f"weight_transfer_backend must be 'auto', 'gdr', or 'http'; got {backend!r}")
-        self._wt_backend = backend
-
-    # ------------------------------------------------------------------
-    # Lazy server lifecycle
-    # ------------------------------------------------------------------
-
-    def _ensure_server(self) -> None:
-        """Start the vLLM server on first use (rank 0 only).
-
-        All ranks barrier here so non-zero ranks wait until rank 0 has
-        confirmed the server is healthy.
-        """
-        if self._ready:
-            return
-
-        from deepspeed import comm as dist
-
-        if self.is_rank_zero:
-            self._start_server()
-            self._wait_for_health()
-
-        if dist.is_initialized() and dist.get_world_size() > 1:
-            dist.barrier()
-
-        self._ready = True
-
-    def _start_server(self) -> None:
-        env = os.environ.copy()
-        if self.cfg.gpus:
-            env["CUDA_VISIBLE_DEVICES"] = ",".join(str(g) for g in self.cfg.gpus)
-        env.pop("VLLM_WORKER_MULTIPROC_METHOD", None)
-
-        env["VLLM_SERVER_DEV_MODE"] = "1"
-
-        python_bin = self.cfg.vllm_python or sys.executable
-        cmd = [
-            python_bin,
-            "-m",
-            "vllm.entrypoints.openai.api_server",
-            "--model",
-            self._model_path,
-            "--tensor-parallel-size",
-            str(self.cfg.tensor_parallel_size),
-            "--dtype",
-            self.cfg.engine_dtype,
-            "--gpu-memory-utilization",
-            str(self.cfg.gpu_memory_utilization),
-            "--port",
-            str(self.cfg.vllm_port),
-            "--weight-transfer-config",
-            f'{{"backend": "{_VLLM_NCCL_BACKEND}"}}' if self._wt_backend == "gdr" else '{"backend": "http"}',
-        ]
-        if self.cfg.vllm_enforce_eager:
-            cmd.append("--enforce-eager")
-
-        logger.info("Starting vLLM server: %s", " ".join(cmd))
-        self._server_proc = subprocess.Popen(
-            cmd,
-            env=env,
-            stdout=subprocess.DEVNULL,
-            stderr=subprocess.PIPE,
-        )
-
-    def _wait_for_health(self) -> None:
-        deadline = time.monotonic() + self.cfg.vllm_start_timeout
-        while time.monotonic() < deadline:
-            if self._server_proc is not None and self._server_proc.poll() is not None:
-                rc = self._server_proc.returncode
-                stderr_tail = ""
-                if self._server_proc.stderr is not None:
-                    stderr_tail = self._server_proc.stderr.read().decode(errors="replace")[-3000:]
-                raise RuntimeError(f"vLLM server exited prematurely (rc={rc}). stderr tail:\n{stderr_tail}")
-            try:
-                resp = requests.get(f"{self._base_url}/health", timeout=2)
-                if resp.status_code == 200:
-                    logger.info("vLLM server is healthy on port %d", self.cfg.vllm_port)
-                    return
-            except requests.ConnectionError:
-                pass
-            time.sleep(1)
-        raise TimeoutError(f"vLLM server did not become healthy within {self.cfg.vllm_start_timeout}s")
-
-    # ------------------------------------------------------------------
-    # Generation
-    # ------------------------------------------------------------------
-
-    def generate(self, request: RolloutRequest, sampling: SamplingConfig) -> RolloutBatch:
-        self._ensure_server()
-
-        B = int(request.prompt_ids.shape[0])
-        n = sampling.n_samples_per_prompt
-
-        if self.is_rank_zero:
-            prompt_token_ids: List[List[int]] = []
-            for i in range(B):
-                mask = request.prompt_attention_mask[i].bool()
-                ids = request.prompt_ids[i][mask].tolist()
-                prompt_token_ids.append(ids)
-
-            payload: Dict[str, Any] = {
-                "model": self._model_path,
-                "prompt": prompt_token_ids,
-                "n": n,
-                "temperature": sampling.temperature,
-                "top_p": sampling.top_p,
-                "max_tokens": sampling.max_new_tokens,
-                "logprobs": 1,
-            }
-            if sampling.top_k > 0:
-                payload["top_k"] = sampling.top_k
-
-            resp = requests.post(
-                f"{self._base_url}/v1/completions",
-                json=payload,
-                timeout=_HTTP_TIMEOUT,
-            )
-            resp.raise_for_status()
-            body = resp.json()
-
-            responses: List[List[int]] = []
-            for choice in body["choices"]:
-                responses.append(self._extract_token_ids(choice))
-        else:
-            responses = []
-
-        from deepspeed import comm as dist
-
-        if dist.is_initialized() and dist.get_world_size() > 1:
-            obj = [responses]
-            dist.broadcast_object_list(obj, src=0)
-            responses = obj[0]
-
-        pad_id = (self.tokenizer.pad_token_id
-                  if self.tokenizer.pad_token_id is not None else self.tokenizer.eos_token_id)
-        return stitch_rollout(
-            prompt_ids=request.prompt_ids,
-            prompt_attention_mask=request.prompt_attention_mask,
-            responses=responses,
-            pad_id=pad_id,
-            n_samples_per_prompt=n,
-        )
-
-    @staticmethod
-    def _extract_token_ids(choice: Dict[str, Any]) -> List[int]:
-        """Extract generated token ids from a vLLM completions choice.
-
-        vLLM 0.22.0 returns ``token_ids: null`` by default.  We request
-        ``logprobs: 1`` in :meth:`generate` and read the token ids from the
-        logprobs structure.
-        """
-        raw = choice.get("token_ids")
-        if raw is not None:
-            return list(raw)
-
-        logprobs_data = choice.get("logprobs")
-        if logprobs_data is not None:
-            token_ids = logprobs_data.get("token_ids")
-            if token_ids is not None:
-                return [int(t) for t in token_ids]
-
-            tokens = logprobs_data.get("tokens")
-            if tokens is not None:
-                return list(range(len(tokens)))
-
-        return []
-
-    # ------------------------------------------------------------------
-    # Weight sync (vLLM 0.22.0 RLHF API)
-    # ------------------------------------------------------------------
-
-    def sync_weights(self, step: int) -> None:
-        self._ensure_server()
-
-        if self.student_engine is None:
-            return
-
-        if not self._weight_transfer_inited and self.is_rank_zero:
-            if self._wt_backend == "gdr":
-                self._init_gdr_channel()
-            self._weight_transfer_inited = True
-
-        from deepspeed.runtime.zero import GatheredParameters
-
-        params: List[Tuple[str, torch.Tensor]] = []
-        model = self.student_engine.module
-        for name, param in model.named_parameters():
-            if not param.requires_grad:
-                continue
-            with GatheredParameters([param], modifier_rank=0):
-                if self.is_rank_zero:
-                    params.append((name, param.data.detach().clone()))
-
-        if self.is_rank_zero:
-            self._pause()
-            if self._wt_backend == "gdr":
-                self._update_weights_gdr(params)
-            else:
-                self._update_weights_http(params)
-            self._resume()
-
-        from deepspeed import comm as dist
-
-        if dist.is_initialized() and dist.get_world_size() > 1:
-            dist.barrier()
-
-    # -- GDR (NCCL) weight transfer ----------------------------------------
-
-    def _init_gdr_channel(self) -> None:
-        """Bootstrap the GDR weight-transfer channel.
-
-        vLLM's ``init_weight_transfer_engine`` endpoint and the trainer-side
-        ``StatelessProcessGroup.create()`` must rendezvous concurrently
-        (both block until the other side connects).  We fire the HTTP call
-        in a background thread.
-        """
-        master_addr = self._get_own_ip()
-        master_port = _find_free_port()
-
-        resp = requests.get(f"{self._base_url}/get_world_size", timeout=_HTTP_TIMEOUT)
-        resp.raise_for_status()
-        vllm_world_size = resp.json()["world_size"]
-        total_world_size = vllm_world_size + 1
-
-        init_info = {
-            "master_address": master_addr,
-            "master_port": master_port,
-            "rank_offset": 1,
-            "world_size": total_world_size,
-        }
-
-        init_thread = threading.Thread(target=self._post,
-                                       args=("/init_weight_transfer_engine", ),
-                                       kwargs={"json": {
-                                           "init_info": init_info
-                                       }})
-        init_thread.start()
-
-        from vllm.distributed.utils import StatelessProcessGroup
-
-        group = StatelessProcessGroup.create(host=master_addr, port=master_port, rank=0, world_size=total_world_size)
-        init_thread.join(timeout=30)
-        if init_thread.is_alive():
-            raise TimeoutError("init_weight_transfer_engine did not complete within 30s")
-
-        self._nccl_group = group
-        logger.info("GDR weight-transfer channel initialised "
-                    "(world_size=%d, vllm_workers=%d)", total_world_size, vllm_world_size)
-
-    def _update_weights_gdr(self, params: List[Tuple[str, torch.Tensor]]) -> None:
-        """Push all gathered parameters to vLLM via GPU-direct (NCCL) transfer.
-
-        The flow mirrors vLLM's official ``rlhf_http_nccl.py`` example:
-
-        1. ``POST /start_weight_update`` — tells vLLM to prepare for incoming
-           weights.
-        2. ``POST /update_weights`` (in a **background thread**) — sends the
-           parameter metadata (names, dtypes, shapes).  The server-side handler
-           blocks waiting for NCCL broadcast.
-        3. Trainer broadcasts each tensor via ``StatelessProcessGroup``.
-        4. ``POST /finish_weight_update`` — finalises the update.
-        """
-        names: List[str] = []
-        dtype_names: List[str] = []
-        shapes: List[List[int]] = []
-        tensors: List[torch.Tensor] = []
-
-        for name, tensor in params:
-            names.append(name)
-            dtype_names.append(str(tensor.dtype).replace("torch.", ""))
-            shapes.append(list(tensor.shape))
-            tensors.append(tensor)
-
-        self._post("/start_weight_update", json={"is_checkpoint_format": True})
-
-        update_info = {
-            "names": names,
-            "dtype_names": dtype_names,
-            "shapes": shapes,
-            "packed": False,
-        }
-
-        update_thread = threading.Thread(target=self._post,
-                                         args=("/update_weights", ),
-                                         kwargs={"json": {
-                                             "update_info": update_info
-                                         }})
-        update_thread.start()
-
-        for tensor in tensors:
-            self._nccl_group.broadcast(tensor.contiguous(), src=0)
-
-        update_thread.join(timeout=60)
-        if update_thread.is_alive():
-            raise TimeoutError("update_weights HTTP call did not complete within 60s")
-
-        self._post("/finish_weight_update", json={})
-        logger.info("pushed %d parameters via GDR", len(names))
-
-    # -- HTTP weight transfer -----------------------------------------------
-
-    def _update_weights_http(self, params: List[Tuple[str, torch.Tensor]]) -> None:
-        """Push all gathered parameters to vLLM via HTTP serialised transfer.
-
-        Each parameter is sent individually: metadata (name, dtype, shape)
-        goes in the JSON body alongside the tensor bytes (base64-encoded).
-        """
-        import base64
-
-        self._post("/start_weight_update", json={"is_checkpoint_format": True})
-
-        for name, tensor in params:
-            arr = tensor.cpu().numpy()
-            buf = arr.tobytes()
-            encoded = base64.b64encode(buf).decode("ascii")
-            self._post(
-                "/update_weights",
-                json={
-                    "update_info": {
-                        "names": [name],
-                        "dtype_names": [str(tensor.dtype).replace("torch.", "")],
-                        "shapes": [list(tensor.shape)],
-                        "packed": False,
-                    },
-                    "tensors": [encoded],
-                },
-                timeout=max(_HTTP_TIMEOUT, 30),
-            )
-
-        self._post("/finish_weight_update", json={})
-        logger.info("pushed %d parameters via HTTP", len(params))
-
-    # -- RLHF HTTP helpers -----------------------------------------------
-
-    def _post(self, path: str, **kwargs: Any) -> requests.Response:
-        resp = requests.post(f"{self._base_url}{path}", timeout=_HTTP_TIMEOUT, **kwargs)
-        resp.raise_for_status()
-        return resp
-
-    def _pause(self) -> None:
-        self._post("/pause", params={"mode": "abort"})
-
-    def _resume(self) -> None:
-        self._post("/resume")
-
-    @staticmethod
-    def _get_own_ip() -> str:
-        return "127.0.0.1"
-
-    # ------------------------------------------------------------------
-    # Cleanup
-    # ------------------------------------------------------------------
-
-    def shutdown(self) -> None:
-        if self._server_proc is not None:
-            self._server_proc.send_signal(signal.SIGTERM)
-            try:
-                self._server_proc.wait(timeout=30)
-            except subprocess.TimeoutExpired:
-                self._server_proc.kill()
-                self._server_proc.wait()
-            self._server_proc = None
-        self._ready = False
-
-
-def _find_free_port() -> int:
-    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
-        s.bind(("", 0))
-        return s.getsockname()[1]
diff --git a/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py b/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py
index f46c17e5f9d9..2431fc2f652a 100644
--- a/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py
+++ b/tests/unit/runtime/rollout/test_hybrid_engine_rollout.py
@@ -1,3 +1,4 @@
+# Copyright (c) Microsoft Corporation.
 # SPDX-License-Identifier: Apache-2.0
 
 # DeepSpeed Team
@@ -10,7 +11,6 @@
 
 import torch
 
-from deepspeed.runtime.rollout.base import RolloutRequest, SamplingConfig
 from deepspeed.runtime.rollout.hybrid_engine_rollout import (
     HybridEngineRollout,
     HybridEngineRolloutConfig,
@@ -52,6 +52,11 @@ def test_constructor_stores_config():
     assert rollout.tokenizer is tok
 
 
+def test_constructor_defaults_without_cfg():
+    rollout = HybridEngineRollout(_make_engine(), _make_tokenizer())
+    assert rollout.use_graph_capture is False
+
+
 # -- _sample_top_p ------------------------------------------------------
 
 
@@ -91,28 +96,20 @@ def test_sync_weights_is_noop():
 # -- generate dispatches correctly -------------------------------------
 
 
-def _make_request():
-    return RolloutRequest(prompt_ids=torch.tensor([[1, 2]]), prompt_attention_mask=torch.ones(1, 2, dtype=torch.long))
-
-
-def test_generate_uses_module_generate_by_default():
-    engine = _make_engine()
-    tok = _make_tokenizer()
-    rollout = HybridEngineRollout(engine, tok)
-    engine.module.generate = MagicMock(return_value=torch.tensor([[1, 2, 3, 2]]))
-
-    # Sampling (temperature > 0) routes through the engine's generate path.
-    rollout.generate(_make_request(), SamplingConfig(max_new_tokens=2, temperature=1.0))
-    engine.module.generate.assert_called_once()
-
-
 def test_generate_calls_graph_capture_when_enabled():
     engine = _make_engine()
     tok = _make_tokenizer()
     cfg = HybridEngineRolloutConfig(use_graph_capture=True)
     rollout = HybridEngineRollout(engine, tok, cfg=cfg)
-    rollout._generate_graph = MagicMock(return_value=torch.tensor([[1, 2, 3, 2]]))
+    rollout._generate_graph = MagicMock(return_value=torch.zeros(1, 5, dtype=torch.long))
+
+    req = MagicMock()
+    req.prompt_ids = torch.tensor([[1, 2]])
+    req.prompt_attention_mask = torch.ones(1, 2, dtype=torch.long)
+    sampling = MagicMock()
+    sampling.temperature = 0
+    sampling.n_samples_per_prompt = 1
+    sampling.max_new_tokens = 3
 
-    # Graph capture is used for greedy decoding (temperature <= 0).
-    rollout.generate(_make_request(), SamplingConfig(max_new_tokens=2, temperature=0.0))
+    rollout.generate(req, sampling)
     rollout._generate_graph.assert_called_once()
diff --git a/tests/unit/runtime/rollout/test_rollout_interface.py b/tests/unit/runtime/rollout/test_rollout_interface.py
index f0a925670e0e..995056941ad0 100644
--- a/tests/unit/runtime/rollout/test_rollout_interface.py
+++ b/tests/unit/runtime/rollout/test_rollout_interface.py
@@ -18,7 +18,8 @@
     SamplingConfig,
     build_rollout,
 )
-from deepspeed.runtime.rlhf.utils import build_response_mask
+
+
 
 # --- dataclass invariants ---------------------------------------------------
 
@@ -126,18 +127,6 @@ def test_fake_rollout_left_padded_prompts():
     assert out.response_start_idx.tolist() == [4, 4]
 
 
-def test_response_mask_from_rollout_output_matches_helper():
-    fake = FakeRollout()
-    prompt_ids = torch.tensor([[1, 2, 3], [0, 4, 5]])
-    attn = torch.tensor([[1, 1, 1], [0, 1, 1]], dtype=torch.long)
-    out = fake.generate(RolloutRequest(prompt_ids, attn), SamplingConfig(max_new_tokens=3))
-    mask = build_response_mask(out.response_start_idx, out.attention_mask)
-    # Both samples: response starts at column 3 (T_p), and all post-prompt
-    # positions are attended (FakeRollout produces no padding in the response).
-    assert mask[0].tolist() == [0, 0, 0, 1, 1, 1]
-    assert mask[1].tolist() == [0, 0, 0, 1, 1, 1]
-
-
 def test_sync_records_steps():
     fake = FakeRollout()
     fake.sync_weights(0)
@@ -146,14 +135,14 @@ def test_sync_records_steps():
 
 
 def test_engine_factory_unknown_raises():
-    from deepspeed.runtime.rlhf.config import RolloutConfig
+    from deepspeed.runtime.rollout.base import RolloutConfig
 
     with pytest.raises(ValueError, match="Unknown rollout engine"):
         build_rollout(RolloutConfig(engine="totally_made_up"))
 
 
 def test_engine_factory_hybrid_requires_student_engine():
-    from deepspeed.runtime.rlhf.config import RolloutConfig
+    from deepspeed.runtime.rollout.base import RolloutConfig
 
     with pytest.raises(ValueError, match="needs both"):
         build_rollout(RolloutConfig(engine="hybrid_engine"))
diff --git a/tests/unit/runtime/rollout/test_vllm_rollout.py b/tests/unit/runtime/rollout/test_vllm_rollout.py
deleted file mode 100644
index 616fe6cc9300..000000000000
--- a/tests/unit/runtime/rollout/test_vllm_rollout.py
+++ /dev/null
@@ -1,205 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""CPU-only unit tests for VLLMRollout (no GPU or vLLM server needed).
-
-Tests cover configuration validation, command construction, token-id
-extraction from API responses, and utility helpers.
-"""
-
-from unittest.mock import MagicMock, patch
-
-import pytest
-
-from deepspeed.runtime.rlhf.config import RolloutConfig
-
-
-def _make_cfg(**overrides):
-    defaults = dict(
-        engine="vllm",
-        vllm_port=8999,
-        gpu_memory_utilization=0.3,
-        weight_transfer_backend="http",
-    )
-    defaults.update(overrides)
-    return RolloutConfig(**defaults)
-
-
-# -- __init__ validation ------------------------------------------------
-
-
-def test_init_rejects_wrong_engine():
-    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
-
-    cfg = RolloutConfig(engine="hybrid_engine")
-    with pytest.raises(ValueError, match="must be 'vllm'"):
-        VLLMRollout(cfg=cfg, tokenizer=MagicMock(), student_model_path="x")
-
-
-def test_gpus_from_env_var(monkeypatch):
-    monkeypatch.setenv("ROLLOUT_VISIBLE_DEVICE", "6,7")
-    cfg = RolloutConfig(engine="vllm")
-    assert cfg.gpus == [6, 7]
-
-
-def test_env_var_overrides_json_gpus(monkeypatch):
-    monkeypatch.setenv("ROLLOUT_VISIBLE_DEVICE", "6,7")
-    cfg = RolloutConfig(engine="vllm", gpus=[0, 1])
-    assert cfg.gpus == [6, 7]
-
-
-def test_no_env_var_keeps_json_gpus(monkeypatch):
-    monkeypatch.delenv("ROLLOUT_VISIBLE_DEVICE", raising=False)
-    cfg = RolloutConfig(engine="vllm", gpus=[0, 1])
-    assert cfg.gpus == [0, 1]
-
-
-def test_init_requires_student_model_path():
-    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
-
-    cfg = RolloutConfig(engine="vllm")
-    with pytest.raises(ValueError, match="student_model_path"):
-        VLLMRollout(cfg=cfg, tokenizer=MagicMock())
-
-
-def test_init_http_backend():
-    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
-
-    cfg = _make_cfg(weight_transfer_backend="http")
-    rollout = VLLMRollout(cfg=cfg, tokenizer=MagicMock(), student_model_path="test-model")
-    assert rollout._wt_backend == "http"
-
-
-# -- _extract_token_ids -------------------------------------------------
-
-
-def test_extract_token_ids_prefers_token_ids():
-    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
-
-    choice = {"token_ids": [10, 20, 30]}
-    assert VLLMRollout._extract_token_ids(choice) == [10, 20, 30]
-
-
-def test_extract_token_ids_from_logprobs_token_ids():
-    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
-
-    choice = {"logprobs": {"token_ids": [5, 6, 7]}}
-    assert VLLMRollout._extract_token_ids(choice) == [5, 6, 7]
-
-
-def test_extract_token_ids_from_logprobs_tokens_fallback():
-    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
-
-    choice = {"logprobs": {"tokens": ["a", "b"]}}
-    assert VLLMRollout._extract_token_ids(choice) == [0, 1]
-
-
-def test_extract_token_ids_empty_on_no_data():
-    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
-
-    assert VLLMRollout._extract_token_ids({}) == []
-
-
-# -- _start_server command construction --------------------------------
-
-
-def test_start_server_command_http_backend(monkeypatch):
-    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
-
-    monkeypatch.delenv("ROLLOUT_VISIBLE_DEVICE", raising=False)
-    cfg = _make_cfg(
-        weight_transfer_backend="http",
-        tensor_parallel_size=2,
-        gpu_memory_utilization=0.5,
-        vllm_port=12345,
-        vllm_enforce_eager=True,
-        gpus=[0, 1],
-    )
-    rollout = VLLMRollout(cfg=cfg, tokenizer=MagicMock(), student_model_path="test-model")
-
-    with patch("subprocess.Popen") as mock_popen:
-        mock_popen.return_value = MagicMock()
-        rollout._start_server()
-
-    args, kwargs = mock_popen.call_args
-    cmd = args[0]
-    assert cmd[0].endswith("python") or "python" in cmd[0]
-    assert "-m" in cmd
-    assert "vllm.entrypoints.openai.api_server" in cmd
-    assert "--model" in cmd
-    assert "test-model" in cmd
-    assert "--tensor-parallel-size" in cmd
-    assert "2" in cmd
-    assert "--gpu-memory-utilization" in cmd
-    assert "0.5" in cmd
-    assert "--port" in cmd
-    assert "12345" in cmd
-    assert "--enforce-eager" in cmd
-    assert '{"backend": "http"}' in cmd
-
-    env = kwargs["env"]
-    assert env["CUDA_VISIBLE_DEVICES"] == "0,1"
-    assert env["VLLM_SERVER_DEV_MODE"] == "1"
-
-
-def test_start_server_uses_vllm_python():
-    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
-
-    cfg = _make_cfg(vllm_python="/custom/bin/python")
-    rollout = VLLMRollout(cfg=cfg, tokenizer=MagicMock(), student_model_path="test-model")
-
-    with patch("subprocess.Popen") as mock_popen:
-        mock_popen.return_value = MagicMock()
-        rollout._start_server()
-
-    cmd = mock_popen.call_args[0][0]
-    assert cmd[0] == "/custom/bin/python"
-
-
-# -- _wait_for_health detects early exit --------------------------------
-
-
-def test_wait_for_health_raises_on_crash():
-    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
-
-    cfg = _make_cfg(vllm_start_timeout=5)
-    rollout = VLLMRollout(cfg=cfg, tokenizer=MagicMock(), student_model_path="test-model")
-
-    proc = MagicMock()
-    proc.poll.return_value = 1
-    proc.returncode = 1
-    proc.stderr = MagicMock()
-    proc.stderr.read.return_value = b"some error detail"
-    rollout._server_proc = proc
-
-    with pytest.raises(RuntimeError, match="exited prematurely"):
-        rollout._wait_for_health()
-
-
-def test_wait_for_health_raises_timeout():
-    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
-
-    cfg = _make_cfg(vllm_start_timeout=0)
-    rollout = VLLMRollout(cfg=cfg, tokenizer=MagicMock(), student_model_path="test-model")
-    rollout._server_proc = MagicMock()
-    rollout._server_proc.poll.return_value = None
-
-    with pytest.raises(TimeoutError, match="did not become healthy"):
-        rollout._wait_for_health()
-
-
-# -- utility helpers ----------------------------------------------------
-
-
-def test_get_own_ip():
-    from deepspeed.runtime.rollout.vllm_rollout import VLLMRollout
-
-    assert isinstance(VLLMRollout._get_own_ip(), str)
-
-
-def test_find_free_port():
-    from deepspeed.runtime.rollout.vllm_rollout import _find_free_port
-
-    port = _find_free_port()
-    assert isinstance(port, int)
-    assert 1 <= port <= 65535
diff --git a/tests/unit/runtime/rollout/test_vllm_stitch.py b/tests/unit/runtime/rollout/test_vllm_stitch.py
deleted file mode 100644
index 6477e8701159..000000000000
--- a/tests/unit/runtime/rollout/test_vllm_stitch.py
+++ /dev/null
@@ -1,96 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""CPU-only tests for the vLLM rollout post-processing.
-
-We can't run vLLM here, but the prompt/response stitching is pure tensor
-manipulation and is the part most prone to silent index bugs.
-"""
-
-import pytest
-import torch
-
-from deepspeed.runtime.rollout import stitch_rollout
-from deepspeed.runtime.rlhf.utils import build_response_mask
-
-
-def test_stitch_basic_single_sample():
-    prompt_ids = torch.tensor([[1, 2, 3], [4, 5, 6]])
-    attn = torch.ones(2, 3, dtype=torch.long)
-    responses = [[10, 11, 12], [20, 21]]
-    out = stitch_rollout(prompt_ids, attn, responses, pad_id=0, n_samples_per_prompt=1)
-    assert out.input_ids.shape == (2, 6)
-    assert out.input_ids[0].tolist() == [1, 2, 3, 10, 11, 12]
-    assert out.input_ids[1].tolist() == [4, 5, 6, 20, 21, 0]
-    assert out.attention_mask[0].tolist() == [1, 1, 1, 1, 1, 1]
-    assert out.attention_mask[1].tolist() == [1, 1, 1, 1, 1, 0]
-    assert out.response_start_idx.tolist() == [3, 3]
-
-
-def test_stitch_with_n_samples():
-    prompt_ids = torch.tensor([[1, 2], [3, 4]])
-    attn = torch.ones(2, 2, dtype=torch.long)
-    responses = [[5, 6], [7, 8], [9, 10], [11, 12]]
-    out = stitch_rollout(prompt_ids, attn, responses, pad_id=0, n_samples_per_prompt=2)
-    assert out.input_ids.shape == (4, 4)
-    # Prompts are repeat_interleaved: [P0, P0, P1, P1].
-    assert out.input_ids[0].tolist() == [1, 2, 5, 6]
-    assert out.input_ids[1].tolist() == [1, 2, 7, 8]
-    assert out.input_ids[2].tolist() == [3, 4, 9, 10]
-    assert out.input_ids[3].tolist() == [3, 4, 11, 12]
-    assert out.response_start_idx.tolist() == [2, 2, 2, 2]
-
-
-def test_stitch_left_padded_prompts():
-    prompt_ids = torch.tensor([[0, 1, 2], [3, 4, 5]])
-    attn = torch.tensor([[0, 1, 1], [1, 1, 1]], dtype=torch.long)
-    responses = [[6], [7]]
-    out = stitch_rollout(prompt_ids, attn, responses, pad_id=0, n_samples_per_prompt=1)
-    # Response begins at column T_p == 3 for both, regardless of prompt padding.
-    assert out.response_start_idx.tolist() == [3, 3]
-    # Prompt section keeps the caller's left-padding mask.
-    assert out.attention_mask[:, :3].tolist() == [[0, 1, 1], [1, 1, 1]]
-
-
-def test_stitch_mismatched_response_count_raises():
-    prompt_ids = torch.tensor([[1, 2]])
-    attn = torch.ones(1, 2, dtype=torch.long)
-    with pytest.raises(ValueError, match="expected"):
-        stitch_rollout(prompt_ids, attn, [[3], [4]], pad_id=0, n_samples_per_prompt=1)
-
-
-def test_stitch_empty_responses_still_well_shaped():
-    prompt_ids = torch.tensor([[1, 2], [3, 4]])
-    attn = torch.ones(2, 2, dtype=torch.long)
-    out = stitch_rollout(prompt_ids, attn, [[], []], pad_id=0, n_samples_per_prompt=1)
-    # No response tokens means total length == prompt length.
-    assert out.input_ids.shape == (2, 2)
-    # Mask over the (zero) response section is empty; response_start_idx still
-    # points at the end of the prompt.
-    assert out.response_start_idx.tolist() == [2, 2]
-
-
-def test_stitch_handles_variable_response_lengths():
-    prompt_ids = torch.tensor([[1], [2], [3]])
-    attn = torch.ones(3, 1, dtype=torch.long)
-    responses = [[10], [20, 21, 22, 23], [30, 31]]
-    out = stitch_rollout(prompt_ids, attn, responses, pad_id=99, n_samples_per_prompt=1)
-    # Total length = T_p + max(response lengths) = 1 + 4 = 5.
-    assert out.input_ids.shape == (3, 5)
-    assert out.input_ids[0].tolist() == [1, 10, 99, 99, 99]
-    assert out.input_ids[1].tolist() == [2, 20, 21, 22, 23]
-    assert out.input_ids[2].tolist() == [3, 30, 31, 99, 99]
-    assert out.attention_mask[0].tolist() == [1, 1, 0, 0, 0]
-    assert out.attention_mask[1].tolist() == [1, 1, 1, 1, 1]
-    assert out.attention_mask[2].tolist() == [1, 1, 1, 0, 0]
-
-
-def test_stitch_output_feeds_build_response_mask():
-    prompt_ids = torch.tensor([[0, 1, 2], [3, 4, 5]])
-    attn = torch.tensor([[0, 1, 1], [1, 1, 1]], dtype=torch.long)
-    out = stitch_rollout(prompt_ids, attn, [[10, 11], [20]], pad_id=0, n_samples_per_prompt=1)
-    mask = build_response_mask(out.response_start_idx, out.attention_mask)
-    # Sample 0: T_p=3, response tokens at 3,4 (both attended).
-    assert mask[0].tolist() == [0, 0, 0, 1, 1]
-    # Sample 1: T_p=3, response token at 3 only (position 4 is pad).
-    assert mask[1].tolist() == [0, 0, 0, 1, 0]

From 2a46e40bb4c1ccc3d9402d7cbafd81c01f9fb36a Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@intel.com>
Date: Fri, 3 Jul 2026 17:55:11 +0800
Subject: [PATCH 10/18] Move static_cache.py to deepspeed/utils/

Signed-off-by: Guokai Ma <guokai.ma@intel.com>
---
 deepspeed/runtime/rollout/hybrid_engine_rollout.py   | 2 +-
 deepspeed/{runtime/rollout => utils}/static_cache.py | 0
 2 files changed, 1 insertion(+), 1 deletion(-)
 rename deepspeed/{runtime/rollout => utils}/static_cache.py (100%)

diff --git a/deepspeed/runtime/rollout/hybrid_engine_rollout.py b/deepspeed/runtime/rollout/hybrid_engine_rollout.py
index b2c02b25dc3a..d78db8484e7b 100644
--- a/deepspeed/runtime/rollout/hybrid_engine_rollout.py
+++ b/deepspeed/runtime/rollout/hybrid_engine_rollout.py
@@ -98,7 +98,7 @@ def generate(self, request: RolloutRequest, sampling: SamplingConfig) -> Rollout
     def _generate_graph(self, prompt_ids, prompt_attn, max_new_tokens, pad_token_id, module, device):
         """Greedy decode with DeepSpeedStaticCache + CUDA graph capture."""
         from transformers import StaticCache
-        from deepspeed.runtime.rollout.static_cache import DeepSpeedStaticCache
+        from deepspeed.utils.static_cache import DeepSpeedStaticCache
 
         batch_size = prompt_ids.shape[0]
         prompt_len = prompt_ids.shape[1]
diff --git a/deepspeed/runtime/rollout/static_cache.py b/deepspeed/utils/static_cache.py
similarity index 100%
rename from deepspeed/runtime/rollout/static_cache.py
rename to deepspeed/utils/static_cache.py

From b260126fdcbe45be128f5cd553eb2270be499cb2 Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@intel.com>
Date: Fri, 3 Jul 2026 18:07:27 +0800
Subject: [PATCH 11/18] Use accelerator abstraction for CUDA graph capture in
 hybrid engine rollout

- Replace direct torch.cuda.CUDAGraph/graph()/replay() calls with
  get_accelerator().create_graph/capture_to_graph/replay_graph
- Add capture_error_mode='global' to accelerator API (abstract + cuda)
  so capture errors surface immediately instead of being silently swallowed

Signed-off-by: Guokai Ma <guokai.ma@intel.com>
---
 accelerator/abstract_accelerator.py                | 2 +-
 accelerator/cuda_accelerator.py                    | 4 ++--
 deepspeed/runtime/rollout/hybrid_engine_rollout.py | 7 ++++---
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py
index c764760b962c..ea5ff4754fed 100644
--- a/accelerator/abstract_accelerator.py
+++ b/accelerator/abstract_accelerator.py
@@ -209,7 +209,7 @@ def create_graph(self):
         ...
 
     @abc.abstractmethod
-    def capture_to_graph(self, graph, pool=None, stream=None):
+    def capture_to_graph(self, graph, pool=None, stream=None, capture_error_mode="global"):
         ...
 
     @abc.abstractmethod
diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py
index 24766f8c0a81..167490ace720 100644
--- a/accelerator/cuda_accelerator.py
+++ b/accelerator/cuda_accelerator.py
@@ -274,8 +274,8 @@ def is_triton_supported(self):
     def create_graph(self):
         return torch.cuda.CUDAGraph()
 
-    def capture_to_graph(self, graph, pool=None, stream=None):
-        return torch.cuda.graph(graph, pool, stream)
+    def capture_to_graph(self, graph, pool=None, stream=None, capture_error_mode="global"):
+        return torch.cuda.graph(graph, pool, stream, capture_error_mode=capture_error_mode)
 
     def replay_graph(self, graph):
         graph.replay()
diff --git a/deepspeed/runtime/rollout/hybrid_engine_rollout.py b/deepspeed/runtime/rollout/hybrid_engine_rollout.py
index d78db8484e7b..180f57970a50 100644
--- a/deepspeed/runtime/rollout/hybrid_engine_rollout.py
+++ b/deepspeed/runtime/rollout/hybrid_engine_rollout.py
@@ -16,6 +16,7 @@
 
 import torch
 
+from deepspeed.accelerator import get_accelerator
 from deepspeed.runtime.rollout.base import RolloutBatch, RolloutEngine, RolloutRequest, SamplingConfig
 
 
@@ -181,8 +182,8 @@ def _generate_graph(self, prompt_ids, prompt_attn, max_new_tokens, pad_token_id,
             torch.cuda.current_stream().wait_stream(s)  #ignore-cuda
 
             # Capture
-            graph = torch.cuda.CUDAGraph()  #ignore-cuda
-            with torch.cuda.graph(graph):  #ignore-cuda
+            graph = get_accelerator().create_graph()  #ignore-cuda
+            with get_accelerator().capture_to_graph(graph):  #ignore-cuda
                 out = module(
                     static_token,
                     attention_mask=static_attn,
@@ -212,7 +213,7 @@ def _generate_graph(self, prompt_ids, prompt_attn, max_new_tokens, pad_token_id,
             static_attn[:, pos] = 1
 
             # Replay
-            graph.replay()
+            get_accelerator().replay_graph(graph)
             next_token = static_logits[:, -1, :].argmax(dim=-1, keepdim=True)
             output_ids.append(next_token)
             eos_mask |= (next_token.squeeze(1) == eos_token_id)

From 365f6a078a065ca14a14efba55416b53cfbdae5c Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@intel.com>
Date: Fri, 3 Jul 2026 18:13:29 +0800
Subject: [PATCH 12/18] Remove capture_error_mode parameter from accelerator
 API

torch.cuda.graph() defaults to 'global' mode in modern PyTorch;
the explicit parameter was unnecessary API surface.

Signed-off-by: Guokai Ma <guokai.ma@intel.com>
---
 accelerator/abstract_accelerator.py | 2 +-
 accelerator/cuda_accelerator.py     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py
index ea5ff4754fed..c764760b962c 100644
--- a/accelerator/abstract_accelerator.py
+++ b/accelerator/abstract_accelerator.py
@@ -209,7 +209,7 @@ def create_graph(self):
         ...
 
     @abc.abstractmethod
-    def capture_to_graph(self, graph, pool=None, stream=None, capture_error_mode="global"):
+    def capture_to_graph(self, graph, pool=None, stream=None):
         ...
 
     @abc.abstractmethod
diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py
index 167490ace720..24766f8c0a81 100644
--- a/accelerator/cuda_accelerator.py
+++ b/accelerator/cuda_accelerator.py
@@ -274,8 +274,8 @@ def is_triton_supported(self):
     def create_graph(self):
         return torch.cuda.CUDAGraph()
 
-    def capture_to_graph(self, graph, pool=None, stream=None, capture_error_mode="global"):
-        return torch.cuda.graph(graph, pool, stream, capture_error_mode=capture_error_mode)
+    def capture_to_graph(self, graph, pool=None, stream=None):
+        return torch.cuda.graph(graph, pool, stream)
 
     def replay_graph(self, graph):
         graph.replay()

From 89c9bf15a6dcbea85d3733b3b6c9eccf3321121b Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@intel.com>
Date: Fri, 3 Jul 2026 20:45:37 +0800
Subject: [PATCH 13/18] Trim RolloutConfig to engine-only fields (engine,
 use_graph_capture)

Generation knobs (temperature, top_p, etc.) are application-level
concerns. They belong in SamplingConfig (passed per generate() call),
not in the engine config. Move them to DeepSpeedExamples' app config.

Signed-off-by: Guokai Ma <guokai.ma@intel.com>
---
 deepspeed/runtime/rollout/base.py | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/deepspeed/runtime/rollout/base.py b/deepspeed/runtime/rollout/base.py
index af215ac3998c..875b064a50b7 100644
--- a/deepspeed/runtime/rollout/base.py
+++ b/deepspeed/runtime/rollout/base.py
@@ -20,14 +20,6 @@ class RolloutConfig:
     """Configuration for the rollout engine."""
     engine: str = "hybrid_engine"
 
-    # Generation knobs
-    max_prompt_length: int = 1024
-    max_response_length: int = 1024
-    temperature: float = 1.0
-    top_p: float = 1.0
-    top_k: int = -1
-    n_samples_per_prompt: int = 1
-
     # Use CUDA graph capture for decode acceleration.
     use_graph_capture: bool = False
 

From eb19237392666c66bd5cf1ff142b77052ac11bf3 Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@intel.com>
Date: Fri, 3 Jul 2026 20:50:34 +0800
Subject: [PATCH 14/18] Clean up vLLM references in rollout/base.py docstrings

Signed-off-by: Guokai Ma <guokai.ma@intel.com>
---
 deepspeed/runtime/rollout/base.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/deepspeed/runtime/rollout/base.py b/deepspeed/runtime/rollout/base.py
index 875b064a50b7..abff6c6ccb12 100644
--- a/deepspeed/runtime/rollout/base.py
+++ b/deepspeed/runtime/rollout/base.py
@@ -5,8 +5,7 @@
 
 The trainer talks to its rollout engine through three small dataclasses
 (``RolloutRequest`` in / ``RolloutBatch`` out / ``SamplingConfig``) and one
-ABC. This keeps the engine-specific concerns (hybrid-engine vs vLLM, weight
-sync, process topology) out of the trainer loop.
+ABC. This keeps engine-specific concerns out of the trainer loop.
 """
 
 from abc import ABC, abstractmethod
@@ -99,7 +98,8 @@ def generate(self, request: RolloutRequest, sampling: SamplingConfig) -> Rollout
     def sync_weights(self, step: int) -> None:
         """Push updated weights into the rollout backend.
 
-        No-op for hybrid engine (reads weights live). Meaningful for vLLM.
+        No-op when the rollout engine is co-located with the training engine
+        (e.g. hybrid engine shares weights directly).
         """
 
     def shutdown(self) -> None:

From d626b0b6362b2223c6b600df9dfe629fcfb86373 Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@intel.com>
Date: Fri, 3 Jul 2026 21:00:58 +0800
Subject: [PATCH 15/18] Replace remaining torch.cuda stream calls with
 accelerator abstraction

Signed-off-by: Guokai Ma <guokai.ma@intel.com>
---
 deepspeed/runtime/rollout/hybrid_engine_rollout.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/deepspeed/runtime/rollout/hybrid_engine_rollout.py b/deepspeed/runtime/rollout/hybrid_engine_rollout.py
index 180f57970a50..7e6279b8bf83 100644
--- a/deepspeed/runtime/rollout/hybrid_engine_rollout.py
+++ b/deepspeed/runtime/rollout/hybrid_engine_rollout.py
@@ -167,9 +167,9 @@ def _generate_graph(self, prompt_ids, prompt_attn, max_new_tokens, pad_token_id,
         try:
             # Warmup on side stream
             static_token.copy_(next_token)
-            s = torch.cuda.Stream()  #ignore-cuda
-            s.wait_stream(torch.cuda.current_stream())  #ignore-cuda
-            with torch.cuda.stream(s):  #ignore-cuda
+            s = get_accelerator().Stream()
+            s.wait_stream(get_accelerator().current_stream())
+            with get_accelerator().stream(s):
                 for _ in range(3):
                     out = module(
                         static_token,
@@ -179,11 +179,11 @@ def _generate_graph(self, prompt_ids, prompt_attn, max_new_tokens, pad_token_id,
                         cache_position=static_cache_pos,
                         position_ids=static_pos_ids,
                     )
-            torch.cuda.current_stream().wait_stream(s)  #ignore-cuda
+            get_accelerator().current_stream().wait_stream(s)
 
             # Capture
-            graph = get_accelerator().create_graph()  #ignore-cuda
-            with get_accelerator().capture_to_graph(graph):  #ignore-cuda
+            graph = get_accelerator().create_graph()
+            with get_accelerator().capture_to_graph(graph):
                 out = module(
                     static_token,
                     attention_mask=static_attn,

From f7115693f9bfc0221f3a30986bceda4911500eec Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@intel.com>
Date: Sat, 4 Jul 2026 00:08:06 +0800
Subject: [PATCH 16/18] Remove remaining rlhf/data.py and rlhf/teacher.py

Signed-off-by: Guokai Ma <guokai.ma@intel.com>
---
 deepspeed/runtime/rlhf/data.py    | 107 -----------------
 deepspeed/runtime/rlhf/teacher.py | 190 ------------------------------
 2 files changed, 297 deletions(-)
 delete mode 100644 deepspeed/runtime/rlhf/data.py
 delete mode 100644 deepspeed/runtime/rlhf/teacher.py

diff --git a/deepspeed/runtime/rlhf/data.py b/deepspeed/runtime/rlhf/data.py
deleted file mode 100644
index df6e19908846..000000000000
--- a/deepspeed/runtime/rlhf/data.py
+++ /dev/null
@@ -1,107 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Prompt dataset and left-padding collator for OPSD rollouts.
-
-The dataset reads a JSONL file with one record per line; each record must
-contain a string under :attr:`DataConfig.prompt_field` (default ``"prompt"``).
-If the tokenizer exposes ``apply_chat_template``, single-turn prompts are
-wrapped in a user-role message with ``add_generation_prompt=True`` so the
-student generates the assistant turn.
-
-Batches are **left-padded** because causal generation requires real tokens at
-    the right edge — :class:`deepspeed.runtime.rollout.RolloutRequest` and the hybrid-engine
-backend both assume this layout.
-"""
-
-import json
-from typing import Any, Dict, List, Optional
-
-import torch
-from torch.utils.data import Dataset
-
-
-class PromptDataset(Dataset):
-    """Reads ``{prompt_field: str}`` records from a JSONL file."""
-
-    def __init__(
-        self,
-        path: str,
-        tokenizer: Any,
-        max_prompt_length: int,
-        prompt_field: str = "prompt",
-        chat_template: Optional[str] = None,
-    ):
-        self.records = self._load_jsonl(path)
-        self.tokenizer = tokenizer
-        self.max_prompt_length = max_prompt_length
-        self.prompt_field = prompt_field
-        self.chat_template = chat_template
-
-    @staticmethod
-    def _load_jsonl(path: str) -> List[Dict[str, Any]]:
-        records: List[Dict[str, Any]] = []
-        with open(path, "r") as f:
-            for line in f:
-                line = line.strip()
-                if not line:
-                    continue
-                records.append(json.loads(line))
-        return records
-
-    def __len__(self) -> int:
-        return len(self.records)
-
-    def __getitem__(self, idx: int) -> str:
-        rec = self.records[idx]
-        if self.prompt_field not in rec:
-            raise KeyError(f"record {idx} missing field {self.prompt_field!r}")
-        text = rec[self.prompt_field]
-
-        # If the tokenizer knows a chat template, render the prompt as a single
-        # user-role turn and request the generation prompt. This matches how
-        # instruction-tuned student/teacher checkpoints expect inputs.
-        if hasattr(self.tokenizer, "apply_chat_template"):
-            messages = [{"role": "user", "content": text}] if isinstance(text, str) else text
-            text = self.tokenizer.apply_chat_template(
-                messages,
-                tokenize=False,
-                add_generation_prompt=True,
-                chat_template=self.chat_template,
-            )
-        return text
-
-
-class LeftPaddedPromptCollator:
-    """Tokenizes a batch of prompt strings into a left-padded tensor batch."""
-
-    def __init__(self, tokenizer: Any, max_prompt_length: int):
-        self.tokenizer = tokenizer
-        self.max_prompt_length = max_prompt_length
-        self.pad_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
-        if self.pad_id is None:
-            raise ValueError("tokenizer has neither pad_token_id nor eos_token_id; "
-                             "cannot construct a padding collator")
-
-    def __call__(self, batch_texts: List[str]) -> Dict[str, torch.Tensor]:
-        per_sample = [
-            self.tokenizer(
-                t,
-                add_special_tokens=False,
-                truncation=True,
-                max_length=self.max_prompt_length,
-                return_tensors="pt",
-            )["input_ids"].squeeze(0) for t in batch_texts
-        ]
-        max_len = max(int(x.shape[0]) for x in per_sample)
-        B = len(per_sample)
-
-        prompt_ids = torch.full((B, max_len), self.pad_id, dtype=torch.long)
-        attention_mask = torch.zeros((B, max_len), dtype=torch.long)
-        for i, ids in enumerate(per_sample):
-            n = int(ids.shape[0])
-            # left-pad: real tokens at the right edge
-            prompt_ids[i, max_len - n:] = ids
-            attention_mask[i, max_len - n:] = 1
-
-        return {"prompt_ids": prompt_ids, "prompt_attention_mask": attention_mask}
diff --git a/deepspeed/runtime/rlhf/teacher.py b/deepspeed/runtime/rlhf/teacher.py
deleted file mode 100644
index 9ad6118fe370..000000000000
--- a/deepspeed/runtime/rlhf/teacher.py
+++ /dev/null
@@ -1,190 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-# DeepSpeed Team
-"""Frozen teacher: two-phase forward with CPU-cached logits.
-
-The trainer runs each step in two phases:
-
-  1. **Teacher phase.** Forward over the prompt+response. The full ``[B, T, V]``
-     logit tensor is moved off the GPU into a :class:`TeacherLogitCache` so that
-     teacher weight buffers can be released before the student backward pass.
-  2. **Student phase.** Forward + backward on the student. The distillation
-     loss pulls teacher logits back to GPU **one sequence chunk at a time** via
-     :meth:`TeacherLogitCache.chunk_to_device`, so peak GPU memory for teacher
-     data is only ``[B, chunk, V]``.
-
-This module deliberately lazy-imports ``deepspeed`` and ``transformers`` so
-that the pure data-handling pieces (``TeacherLogitCache`` and the streamed
-loss in :mod:`opsd.losses`) remain importable in CPU-only unit tests that do
-not have a working DeepSpeed launcher.
-"""
-
-from dataclasses import dataclass
-from typing import Optional, Tuple
-
-import torch
-
-# ``opsd.config`` is pure-Python (no distributed imports), so we can import it
-# at module load time without pulling in DeepSpeed.
-from deepspeed.runtime.rlhf.config import TeacherConfig
-
-
-@dataclass
-class TeacherLogitCache:
-    """CPU-resident teacher logits with on-demand chunk fetch.
-
-    Stored in low precision (default ``bfloat16``) to halve host memory; the
-    consumer in :mod:`opsd.losses` promotes back to fp32 inside the divergence
-    so the KD math stays well-conditioned.
-    """
-
-    cpu_logits: torch.Tensor  # [B, T, V]
-
-    def __post_init__(self) -> None:
-        if self.cpu_logits.dim() != 3:
-            raise ValueError(f"cpu_logits must be 3-D [B, T, V]; got shape "
-                             f"{tuple(self.cpu_logits.shape)}")
-        if self.cpu_logits.device.type != "cpu":
-            raise ValueError(f"cpu_logits must live on CPU; got device "
-                             f"{self.cpu_logits.device}")
-
-    @classmethod
-    def from_gpu_logits(cls, logits: torch.Tensor, store_dtype: torch.dtype = torch.bfloat16) -> "TeacherLogitCache":
-        """Detach + downcast + move to (pinned) host memory.
-
-        ``non_blocking=True`` lets the copy overlap with the next CUDA op when
-        the destination is pinned; we try to pin and fall back silently if the
-        host doesn't support it (e.g. CPU-only test environments).
-        """
-        downcast = logits.detach().to(dtype=store_dtype)
-        try:
-            host = torch.empty(downcast.shape, dtype=store_dtype, pin_memory=True)
-            host.copy_(downcast, non_blocking=True)
-        except RuntimeError:
-            host = downcast.cpu()
-        return cls(cpu_logits=host)
-
-    @property
-    def shape(self) -> Tuple[int, int, int]:
-        s = self.cpu_logits.shape
-        return (int(s[0]), int(s[1]), int(s[2]))
-
-    @property
-    def dtype(self) -> torch.dtype:
-        return self.cpu_logits.dtype
-
-    def chunk_to_device(self,
-                        start: int,
-                        end: int,
-                        device: torch.device,
-                        dtype: Optional[torch.dtype] = None) -> torch.Tensor:
-        """Slice ``[:, start:end, :]`` and stage it on ``device``.
-
-        ``dtype`` is the dtype on the destination; if ``None``, the stored
-        dtype is preserved.
-        """
-        _, T, _ = self.shape
-        if not (0 <= start < end <= T):
-            raise ValueError(f"chunk bounds [{start}, {end}) invalid for T={T}")
-        chunk = self.cpu_logits[:, start:end]
-        out = chunk.to(device=device, dtype=dtype if dtype is not None else chunk.dtype, non_blocking=True)
-        return out
-
-    def free(self) -> None:
-        """Drop the underlying buffer so a step's teacher cache can be GC'd
-        before the next teacher forward."""
-        self.cpu_logits = torch.empty(0)
-
-
-_DTYPE_MAP = {
-    "float16": torch.float16,
-    "fp16": torch.float16,
-    "bfloat16": torch.bfloat16,
-    "bf16": torch.bfloat16,
-    "float32": torch.float32,
-    "fp32": torch.float32,
-}
-
-
-def _resolve_dtype(name: str) -> torch.dtype:
-    if name not in _DTYPE_MAP:
-        raise ValueError(f"Unknown dtype {name!r}; choose from {sorted(_DTYPE_MAP)}")
-    return _DTYPE_MAP[name]
-
-
-class TeacherWrapper:
-    """Frozen teacher.
-
-    Two modes depending on ``cfg.offload_to_cpu``:
-
-      * ``offload_to_cpu=False`` — load the teacher with HF's standard
-        ``from_pretrained`` and pin it on the local accelerator device. The
-        whole teacher lives in GPU memory; simplest path and what to use when
-        the teacher fits.
-
-      * ``offload_to_cpu=True`` — wrap the loaded model with
-        ``deepspeed.initialize`` using a ZeRO-3 config with
-        ``offload_param.device='cpu'``. The optimizer slot is unused (no
-        trainable params) but ZeRO-3 gives us per-forward parameter gather
-        / release and keeps weights on the host between forwards. This is the
-        path to use when the teacher would otherwise not fit alongside the
-        student.
-
-    Both paths load the full checkpoint on each rank before DeepSpeed (if
-    used) partitions; we intentionally do **not** wrap ``from_pretrained``
-    in ``deepspeed.zero.Init()`` because HF's loader partitions
-    ``low_cpu_mem_usage`` params to zero-width shards before the checkpoint
-    can fill them, which surfaces as a "size mismatch" load error.
-    """
-
-    def __init__(self, cfg: TeacherConfig, world_size: int):
-        from deepspeed.accelerator import get_accelerator
-        from transformers import AutoModelForCausalLM
-
-        self.cfg = cfg
-        dtype = _resolve_dtype(cfg.dtype)
-        device = get_accelerator().current_device_name()
-
-        model = AutoModelForCausalLM.from_pretrained(
-            cfg.model_name_or_path,
-            torch_dtype=dtype,
-            trust_remote_code=cfg.trust_remote_code,
-        )
-        model.eval()
-        for p in model.parameters():
-            p.requires_grad_(False)
-
-        if cfg.offload_to_cpu:
-            import deepspeed
-
-            ds_config = {
-                "train_micro_batch_size_per_gpu": 1,
-                "bf16": {
-                    "enabled": dtype is torch.bfloat16
-                },
-                "fp16": {
-                    "enabled": dtype is torch.float16
-                },
-                "zero_optimization": {
-                    "stage": 3,
-                    "offload_param": {
-                        "device": "cpu"
-                    },
-                },
-            }
-            engine, *_ = deepspeed.initialize(model=model, config=ds_config)
-            self._callable = engine
-            self._uses_ds = True
-        else:
-            model.to(device)
-            self._callable = model
-            self._uses_ds = False
-
-    @torch.no_grad()
-    def forward_to_cache(self,
-                         input_ids: torch.Tensor,
-                         attention_mask: torch.Tensor,
-                         store_dtype: torch.dtype = torch.bfloat16) -> TeacherLogitCache:
-        """Run teacher forward and stage logits onto the host."""
-        outputs = self._callable(input_ids=input_ids, attention_mask=attention_mask)
-        return TeacherLogitCache.from_gpu_logits(outputs.logits, store_dtype=store_dtype)

From 0cf471e247a340638f695c9020dfa6fde0bad9f3 Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@intel.com>
Date: Sat, 4 Jul 2026 00:10:42 +0800
Subject: [PATCH 17/18] Remove 'tags' from .gitignore (editor artifact)

Signed-off-by: Guokai Ma <guokai.ma@intel.com>
---
 .gitignore | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.gitignore b/.gitignore
index 10689d1a8a33..13e79bacce4a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -66,4 +66,3 @@ tests/unit/saved_checkpoint/
 
 # virtual env directory for format
 venv
-tags

From 0df8944de47d126467d2d01f5c2e94b1eb5af75d Mon Sep 17 00:00:00 2001
From: Guokai Ma <guokai.ma@intel.com>
Date: Sat, 4 Jul 2026 00:24:32 +0800
Subject: [PATCH 18/18] Fix yapf formatting in test_rollout_interface.py

Signed-off-by: Guokai Ma <guokai.ma@intel.com>
---
 tests/unit/runtime/rollout/test_rollout_interface.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/tests/unit/runtime/rollout/test_rollout_interface.py b/tests/unit/runtime/rollout/test_rollout_interface.py
index 995056941ad0..bb45267ef5ac 100644
--- a/tests/unit/runtime/rollout/test_rollout_interface.py
+++ b/tests/unit/runtime/rollout/test_rollout_interface.py
@@ -19,8 +19,6 @@
     build_rollout,
 )
 
-
-
 # --- dataclass invariants ---------------------------------------------------