diff --git a/docs/design/auto-compaction-threshold-redesign.md b/docs/design/auto-compaction-threshold-redesign.md
new file mode 100644
index 0000000000..79bd6a8afc
--- /dev/null
+++ b/docs/design/auto-compaction-threshold-redesign.md
@@ -0,0 +1,418 @@
+# Auto-Compaction Threshold Redesign
+
+**Status:** Draft · 2026-05-14
+
+## 背景
+
+当前 qwen-code 的自动压缩仅使用单一比例阈值 `COMPRESSION_TOKEN_THRESHOLD = 0.7`（`chatCompressionService.ts:33`），所有窗口大小共用同一比例。对比 claude-code 的「绝对 token 梯子」（autoCompact.ts:62-65），qwen-code 存在五个具体问题：
+
+1. **大窗口下预留过多**：1M 模型 70% 阈值在 700K 触发，剩余 300K 远超摘要 + 输出实际所需的 ~33K
+2. **失败 1 次永久锁**：`hasFailedCompressionAttempt = true` 之后整个 session 不再尝试 auto-compact（geminiChat.ts:504），比 claude-code 的「连续 3 次熔断」更严苛
+3. **tip 系统与 auto 阈值脱钩**：`tipRegistry.ts` 里的三条 `context-*` tip 使用固定的 50/80/95 百分比，与 auto-compact 阈值（70%）完全独立。这意味着在「auto 正常工作」的主路径上 80% / 95% tip 极少触发，而在「auto 失败 / 反应式兜底」的边缘路径上又缺乏与阈值对齐的语义
+4. **压缩调用本身没有输出预算控制**：[chatCompressionService.ts:374-376](packages/core/src/services/chatCompressionService.ts:374) 显式开启 `thinkingConfig.includeThoughts = true`（注释：「Compression quality drives every subsequent main turn」），同时 sideQuery 调用未设 `maxOutputTokens` 上限。代码注释（[:436-437](packages/core/src/services/chatCompressionService.ts:436)）也承认 `compressionOutputTokenCount may include non-persisted tokens (thoughts)`。在压缩接近窗口顶时，总输出可能膨胀，使 buffer 预留缺乏可预测上限。<br/><br/>更糟糕的是 thinking 开启时各 provider 对输出上限的切分方式不一致：Anthropic 的 thinking `budget_tokens` 必须小于 `max_tokens`，thinking tokens 计入同一上限；OpenAI 的 `max_completion_tokens` 同时覆盖 reasoning tokens 与可见输出；Gemini 的行为又因模型版本而异。这意味着在 thinking 开启的前提下，「单靠加 maxOutputTokens」虽然能约束总输出，却无法保证留给 summary 正文的空间，在 qwen-code 这种多 provider 项目里得不到可预测的结果
+
+5. **阈值判断使用的 `lastPromptTokenCount` 系统性下偏。** [geminiChat.ts:1217-1232](packages/core/src/core/geminiChat.ts:1217) 表明这个数来自上一轮 API response 的 `usageMetadata.totalTokenCount`。两个 gap：(a) 不包含本轮即将加入的 user message，每次 cheap-gate 判断都比真实 prompt 小一段；(b) 首轮初始值是 0，`--continue` 恢复巨大 session / sub-agent 继承大量历史时第一次 send 永远绕过所有阈值。对比 claude-code 的 `tokenCountWithEstimation`（[query.ts:638](src/query.ts:638)）走「最后一条 assistant API usage + 之后新增 message 估算」的双轨制能闭合这两个 gap
+
+## 设计目标
+
+- 引入「比例 + 绝对」混合阈值，让大窗口模型由绝对值接管，小窗口仍走比例兜底
+- 新增 warn / hard 两层（auto 保留为主触发点），形成三层梯子
+- 把 tip 系统重写为跟随新阈值的触发条件
+- 失败处理从「1 次永久锁」升级为「3 次熔断 + 自动恢复」
+- **压缩调用关闭 thinking 并加 `maxOutputTokens` 上限**：与 claude-code 对齐，让总输出受单一参数约束、buffer 预算可预测；接受压缩质量可能下降的代价
+- **加 token 估算补偿**：消除 `lastPromptTokenCount` 的「滞后一轮」和「首轮为 0」两个系统性下偏，让阈值判断更贴近真实 prompt 大小
+- 删除 settings 里的 `contextPercentageThreshold` 配置入口（内部 PCT 常量保留）
+- **不引入** env 覆盖通道、**不**新增显式 enabled 开关
+
+## 三层阈值梯子
+
+```
+                       window  (raw context window)
+                          │
+                          │  ← SUMMARY_RESERVE = 20K
+                          ▼
+                    effectiveWindow
+                          │
+                          │  ← HARD_BUFFER = 3K
+                          ▼
+              hard_threshold = effectiveWindow - 3K
+                          │
+                          │  ← (AUTOCOMPACT_BUFFER - HARD_BUFFER) = 10K
+                          ▼
+auto_threshold = max(PCT * window, effectiveWindow - AUTOCOMPACT_BUFFER)
+                          │
+                          │  ← WARN_BUFFER = 20K
+                          ▼
+warn_threshold = max((PCT - WARN_PCT_OFFSET) * window, auto_threshold - WARN_BUFFER)
+                          │
+                          ▼
+                          0
+```
+
+### 三层语义
+
+| 层       | 触发条件                       | 行为                                                     |
+| -------- | ------------------------------ | -------------------------------------------------------- |
+| **warn** | `tokenCount >= warn_threshold` | UI 提示「距自动压缩还剩 X tokens」，不改变 send 行为     |
+| **auto** | `tokenCount >= auto_threshold` | 在 send 前 `tryCompress(force=false)`，正常压缩流程      |
+| **hard** | `tokenCount >= hard_threshold` | 在 send 前 `tryCompress(force=true)`，重置失败锁强制压缩 |
+
+`hard` 层等同于把现有 reactive overflow（geminiChat.ts:711）的兜底逻辑提前到 send 前，避免一次失败的 oversized request round-trip。
+
+## 内部常量
+
+```ts
+// chatCompressionService.ts
+const DEFAULT_PCT = 0.7; // auto 比例兜底
+const WARN_PCT_OFFSET = 0.1; // warn 比例 = DEFAULT_PCT - WARN_PCT_OFFSET = 0.6
+const COMPACT_MAX_OUTPUT_TOKENS = 20_000; // 压缩 sideQuery 输出硬上限（thinking + summary 合计）
+const SUMMARY_RESERVE = 20_000; // 阈值梯子从窗口顶减去的输出预留 = maxOutput
+const AUTOCOMPACT_BUFFER = 13_000; // auto 与 effectiveWindow 间距
+const WARN_BUFFER = 20_000; // warn 与 auto 间距
+const HARD_BUFFER = 3_000; // hard 与 effectiveWindow 间距
+const MAX_CONSECUTIVE_FAILURES = 3; // 失败熔断阈值
+```
+
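+数值来源：绝对值常量与熔断次数沿用 claude-code 的实测值（[autoCompact.ts:30,62-65](src/services/compact/autoCompact.ts:30)）；`DEFAULT_PCT` 沿用 qwen-code 现有的 `COMPRESSION_TOKEN_THRESHOLD = 0.7`，`WARN_PCT_OFFSET` 为本设计新增。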
+
+`SUMMARY_RESERVE = COMPACT_MAX_OUTPUT_TOKENS` 是关键关系：模型受 `maxOutputTokens` 硬限制约束，输出不可能超出 20K，因此 reserve 不需要额外 safety margin。注意：本设计关闭 thinking 后该等式成立（output budget 全部给 summary）；若保留 thinking，`thinking + summary` 共享预算（Gemini SDK / 多数 provider 的 `maxOutputTokens` 语义），模型自行在两者间分配，此时 summary 的实际可用空间小于 20K（见「风险与注意事项」第 1、2 条）。
+
+## 计算函数
+
+```ts
+export interface CompactionThresholds {
+  warn: number;
+  auto: number;
+  hard: number; // 当 rawHard（effectiveWindow - HARD_BUFFER）< auto 时取 auto（小窗口退化）
+  effectiveWindow: number;
+}
+
+export function computeThresholds(window: number): CompactionThresholds {
+  const effectiveWindow = window - SUMMARY_RESERVE;
+
+  const absAuto = effectiveWindow - AUTOCOMPACT_BUFFER;
+  const auto = Math.max(DEFAULT_PCT * window, absAuto);
+
+  const absWarn = auto - WARN_BUFFER;
+  const warn = Math.max((DEFAULT_PCT - WARN_PCT_OFFSET) * window, absWarn);
+
+  const rawHard = effectiveWindow - HARD_BUFFER;
+  const hard = Math.max(rawHard, auto); // 小窗口下退化为 auto
+
+  return { warn, auto, hard, effectiveWindow };
+}
+```
+
+### 实测数据
+
+| 窗口 | warn        | auto        | hard         | 备注                            |
+| ---- | ----------- | ----------- | ------------ | ------------------------------- |
+| 32K  | 19.2K (pct) | 22.4K (pct) | 22.4K (退化) | 比例兜底                        |
+| 64K  | 38.4K (pct) | 44.8K (pct) | 44.8K (退化) | 比例兜底                        |
+| 128K | 76.8K (pct) | 95K (abs)   | 105K (abs)   | 混合（warn=pct, auto/hard=abs） |
+| 200K | 147K (abs)  | 167K (abs)  | 177K (abs)   | 绝对接管                        |
+| 256K | 203K (abs)  | 223K (abs)  | 233K (abs)   | 绝对接管                        |
+| 1M   | 947K (abs)  | 967K (abs)  | 977K (abs)   | 全绝对                          |
+
+`(pct)` 表示该层由比例公式决定，`(abs)` 表示由绝对值公式决定。
+
+## 用户配置
+
+### ChatCompressionSettings 变更
+
+```ts
+// packages/core/src/config/config.ts:217
+export interface ChatCompressionSettings {
+  /** 保留（与本设计无关，由 compactionInputSlimming 使用） */
+  imageTokenEstimate?: number;
+}
+```
+
+**删除：** `contextPercentageThreshold` 字段。理由：
+
+1. 新公式下，对主流窗口（>= 128K）该字段几乎无影响——绝对值接管
+2. 小窗口下用户配置反而可能让阈值"更早"压缩，与节省 token 直觉相反
+3. claude-code 没有暴露此字段，无类似的用户面配置先例
+
+### Breaking change 处理
+
+启动时 `Config` 加载发现 `chatCompression.contextPercentageThreshold` 存在：
+
+- 写入 stderr 一行警告：`"chatCompression.contextPercentageThreshold has been removed and is now controlled by built-in thresholds."`
+- **不**报错、**不**阻塞启动
+- 字段值被忽略
+
+## Token 估算补偿
+
+qwen-code 的 `lastPromptTokenCount` 来自上一轮 API response 的 `usageMetadata.totalTokenCount`（[geminiChat.ts:1217-1232](packages/core/src/core/geminiChat.ts:1217)）。这导致：
+
+1. **滞后一轮**：cheap-gate 用 `lastPromptTokenCount` 判断，但本次 send 实际 prompt = 它 + 本轮 user message。少算的部分可能让阈值判断 false-negative
+2. **首轮为 0**：初始值是 0，第一次 send 时无论历史多大都不会触发任何阈值（含 `--continue` 恢复 / sub-agent 继承场景）
+
+引入轻量本地估算函数 `estimatePromptTokens`，在 send 前 cheap-gate / hard 判断时补足这两段缺失：
+
+```ts
+// chatCompressionService.ts（或新文件 packages/core/src/services/tokenEstimation.ts）
+
+const BYTES_PER_TOKEN = 4; // 通用 char/4 估算（claude-code 同此）
+const BYTES_PER_TOKEN_JSON = 2; // JSON / tool_call input 更密集
+
+/**
+ * 估算一组 Content 的 token 数，用于补偿 API usage metadata 的滞后。
+ * 对 image / document 复用现有 imageTokenEstimate（默认 1600）。
+ */
+export function estimateContentTokens(
+  contents: Content[],
+  imageTokenEstimate = DEFAULT_IMAGE_TOKEN_ESTIMATE,
+): number {
+  // 复用 estimateContentChars（compactionInputSlimming.ts），再除以 bytesPerToken
+  // 内部对 functionCall / functionResponse 用 BYTES_PER_TOKEN_JSON
+  // ...
+}
+
+/**
+ * cheap-gate 与 hard 判断的统一入口。
+ * 主路径：lastPromptTokenCount 准 + 本轮 user message 估算
+ * 首轮路径：full history 估算
+ */
+export function estimatePromptTokens(
+  history: Content[],
+  userMessage: Content,
+  lastPromptTokenCount: number,
+): number {
+  if (lastPromptTokenCount > 0) {
+    return lastPromptTokenCount + estimateContentTokens([userMessage]);
+  }
+  return estimateContentTokens([...history, userMessage]);
+}
+```
+
+应用位置：
+
+- `chatCompressionService.compress()` 的 cheap-gate：把 `originalTokenCount` 来源换成 `estimatePromptTokens(history, userMessage, lastPromptTokenCount)`
+- `geminiChat.sendMessageStream` 入口的 hard 判断（见下一节）
+
+**估算只用于提前触发，不用于「跳过触发」。** 因为 char/4 只是粗略近似（相对真实 tokenizer 可能高估也可能低估，并非严格下界）：用它把计数往上补、让压缩更早触发，最坏只是早压一点（false-positive 方向，安全）；用它来调低计数、跳过压缩，则可能漏掉真实越界（false-negative 方向，不可靠）。
+
+## 触发链路改动
+
+### chatCompressionService.ts
+
+1. **导出 `computeThresholds`**，供 cheap-gate / UI / 命令复用
+2. **`compress()` cheap-gate** (line 221-249)：
+   ```ts
+   if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES && !force) {
+     return NOOP;
+   }
+   const { auto } = computeThresholds(contextLimit);
+   const effectiveTokens = estimatePromptTokens(
+     curatedHistory,
+     userMessage,
+     originalTokenCount,
+   );
+   if (!force && effectiveTokens < auto) return NOOP;
+   ```
+3. **`compress()` 的 runSideQuery 调用** (line 356-380)：关闭 thinking + 加 `maxOutputTokens`：
+
+   ```ts
+   const summaryResult = await runSideQuery(config, {
+     // ...
+     config: {
+       thinkingConfig: { includeThoughts: false }, // 关闭 thinking（与 claude-code 一致）
+       maxOutputTokens: COMPACT_MAX_OUTPUT_TOKENS, // 硬上限 20K
+     },
+     // ...
+   });
+   ```
+
+   或者直接删掉 `thinkingConfig` 让 `runSideQuery` 默认值（[sideQuery.ts:118](packages/core/src/utils/sideQuery.ts:118) 默认 `includeThoughts: false`）接管。
+
+   关 thinking 后，`maxOutputTokens` 直接约束总输出（不存在 thinking 单独 budget 的问题），`SUMMARY_RESERVE = maxOutput = 20K` 是干净的硬关系。
+
+   同时更新 [chatCompressionService.ts:374-376](packages/core/src/services/chatCompressionService.ts:374) 的注释，从「Compression quality drives every subsequent main turn — keep reasoning on」改为说明「为保证跨 provider 可预测的输出上限，与 claude-code 设计对齐」。
+
+   token math 一段（[:436-437](packages/core/src/services/chatCompressionService.ts:436)）的 "may include non-persisted tokens (thoughts)" 注释也可以同步清理
+
+### geminiChat.ts: `sendMessageStream` 入口（line 562）
+
+```ts
+// 替换前：tryCompress(force=false)
+// 替换后：用估算 token 判断是否触发 hard，决定 force 标志
+
+const { hard } = computeThresholds(contextLimit);
+const effectiveTokens = estimatePromptTokens(
+  this.getHistory(true),
+  createUserContent(params.message),
+  this.lastPromptTokenCount,
+);
+const shouldForceFromHard = effectiveTokens >= hard;
+
+if (shouldForceFromHard) {
+  // 重置熔断器，等同 force compress
+  this.consecutiveFailures = 0;
+}
+
+compressionInfo = await this.tryCompress(
+  prompt_id,
+  model,
+  shouldForceFromHard,
+  params.config?.abortSignal,
+);
+```
+
+### 失败处理升级 (`geminiChat.ts:504-510`)
+
+```ts
+// 替换前
+hasFailedCompressionAttempt: boolean;
+
+// 替换后
+consecutiveFailures: number;  // 默认 0
+
+// 失败分支
+} else if (isCompressionFailureStatus(info.compressionStatus)) {
+  if (!force) {
+    this.consecutiveFailures += 1;
+  }
+}
+
+// 成功分支
+this.consecutiveFailures = 0;
+```
+
+`force=true` 调用失败不计入计数（保持现有 reactive / manual 不"占额"的语义）。
+
+## UI 改动
+
+### tipRegistry.ts 重写三条 context-\* tip
+
+三层阈值正好与三条 tip 一一对应。映射关系（按 token 数从低到高）：
+
+| Tip ID             | 当前条件                                      | 新条件                                                              | 文案变化                                                          |
+| ------------------ | --------------------------------------------- | ------------------------------------------------------------------- | ----------------------------------------------------------------- |
+| `compress-intro`   | `pct >= 50 && < 80 && sessionPromptCount > 5` | `tokenCount >= warn && tokenCount < auto && sessionPromptCount > 5` | 保持不变                                                          |
+| `context-high`     | `pct >= 80 && < 95`                           | `tokenCount >= auto && tokenCount < hard`                           | 保持不变                                                          |
+| `context-critical` | `pct >= 95`                                   | `tokenCount >= hard`                                                | 加一句「Auto-compact will force on next send.」反映新 hard 层行为 |
+
+**对触发频率的影响：**
+
+- 主路径（auto 正常工作）：`tokenCount` 跨越 auto 后立即触发压缩，下一轮 tokenCount 回落，所以 `context-high` 仅在「触发到压缩生效之间」短暂可见
+- 边缘路径（auto 失败 / 熔断 / reactive 来不及）：`tokenCount` 持续上涨，会依次穿过 warn → auto → hard 触发三条 tip，跟用户视角的"上下文越来越紧"一致
+- `context-critical` 触发时 hard 层已经在 send 前 force compress（spec 触发链路改动一节），所以这条 tip 实际上是「post-rescue 告知」而非「pre-rescue 警告」，文案补一句说明
+
+`TipContext` 接口增加：
+
+```ts
+export interface TipContext {
+  lastPromptTokenCount: number;
+  contextWindowSize: number;
+  sessionPromptCount: number;
+  sessionCount: number;
+  platform: string;
+  // 新增：让 isRelevant 函数能拿到阈值。
+  // computeThresholds 在调用方算好后注入，避免 tipRegistry 直接依赖 core。
+  thresholds?: CompactionThresholds;
+}
+```
+
+`AppContainer.tsx:1150` 构造 `TipContext` 时同步注入。
+
+### /context 命令同步 (`contextCommand.ts:177-183`)
+
+```ts
+// 替换硬编码 (1 - threshold) * contextWindowSize
+const { warn, auto, hard, effectiveWindow } =
+  computeThresholds(contextWindowSize);
+
+// 显示四行：
+//   Effective window:   180K   (window − 20K reserve)
+//   Warn threshold:     147K   (...)
+//   Auto threshold:     167K   ← 当前位置
+//   Hard threshold:     177K
+// 标记当前 token count 落在哪个 tier
+```
+
+### Footer 持续提示（可选 follow-up）
+
+本 spec 不强制实现 footer 持续提示，理由：
+
+- 现有 tip 系统已经能在 history 里给出提示
+- Footer 持续提示需要改 ink 渲染、增加重绘频率
+- 可作为本 spec 后置 follow-up（独立 PR）
+
+如果后续要做，建议触发条件 `tokenCount >= warn && tokenCount < auto`，超过 auto 后隐藏（压缩已开始）。
+
+## 测试覆盖
+
+### 单元测试（chatCompressionService.test.ts）
+
+- `computeThresholds(32K)` → 比例兜底分支（warn/auto 均 pct，hard 退化）
+- `computeThresholds(128K)` → 混合分支（warn=pct，auto=abs，hard=abs）
+- `computeThresholds(200K)` → 绝对接管分支（warn/auto/hard 均 abs）
+- `computeThresholds(1M)` → 全绝对分支
+- `computeThresholds(window=10K)` → 极小窗口（绝对值全负），公式不崩
+- 三层阈值始终满足 `warn <= auto <= hard`
+- max() 公式在边界点（pct \* window == abs）稳定
+
+### 单元测试（tokenEstimation.test.ts）
+
+- `estimateContentTokens` 对纯文本 / json / functionCall / functionResponse / image / document 分别走对应 bytesPerToken
+- `estimatePromptTokens` 在 `lastPromptTokenCount > 0` 时走「主路径」，等于 0 时走「首轮路径」
+- 大 user message 在 cheap-gate 阶段被加上去后能跨越 auto 阈值
+- 估算与真实 API usage 的偏差在 ±30% 以内（用真实历史样本回归）
+
+### 集成测试（geminiChat.test.ts / chatCompressionService.test.ts）
+
+- 3 次连续失败后 cheap-gate NOOP；下一次 force 后恢复
+- 单次失败不再永久锁
+- 估算 token 跨越 hard 后 send 自动 force compress
+- 压缩 sideQuery 调用 `maxOutputTokens = COMPACT_MAX_OUTPUT_TOKENS` 正确透传到 `runSideQuery`，`thinkingConfig.includeThoughts` 为 `false`（或被 sideQuery 默认值接管）
+- **首轮覆盖**：构造一个 `lastPromptTokenCount = 0` 但 history 巨大的 chat（模拟 `--continue` 恢复），首次 send 时 auto 阈值能被估算路径触发
+
+### 兼容性测试
+
+- 设置 `contextPercentageThreshold = 0.5` 启动 → stderr 警告 + 字段被忽略，行为以内部 PCT 常量为准
+
+### Tip 系统测试（tipRegistry.test.ts）
+
+- 三条 context-\* tip 在跨越 warn/auto/hard 时正确触发，且区间不重叠
+- 主路径下 auto 阈值触发压缩后 `context-high` 不持续可见
+- 边缘路径（熔断 + token 继续涨）下三条 tip 依次触发
+- TipContext 缺 `thresholds` 时（fallback）行为合理
+
+## 实施分阶段
+
+| Phase | 内容                                                                                         | 独立性             |
+| ----- | -------------------------------------------------------------------------------------------- | ------------------ |
+| 1     | 内部常量 + `computeThresholds` + cheap-gate 改动（不含估算补偿）                             | 可独立合并         |
+| 2     | 失败处理升级（1 → 3 熔断）                                                                   | 可独立合并         |
+| 3     | hard 层 force compress 提前                                                                  | 依赖 P1 + P7       |
+| 4     | 配置面变更 + breaking change 警告                                                            | 依赖 P1            |
+| 5     | UI（tip 重写 + /context）                                                                    | 依赖 P1            |
+| 6     | 压缩 sideQuery 关 thinking + 加 `maxOutputTokens` 上限                                       | 独立可先于 P1 落地 |
+| 7     | Token 估算补偿（`estimateContentTokens` + `estimatePromptTokens`，应用到 cheap-gate / hard） | 独立可与 P1 并行   |
+
+每个 Phase 可独立 PR。建议合并顺序 **P6 → P7 → P1 → P2 → P4 → P3 → P5**：先给压缩调用打上 `maxOutputTokens` 上限（让 buffer 假设可信）；再加估算补偿（让 token 数判断更可靠）；再把阈值基础设施落地；再做失败熔断、配置面变更；最后才打开 hard 层主动救场（这时已有可靠的 token 数 + 熔断器）。每个 PR 都能独立验证、独立回滚。
+
+## 风险与注意事项
+
+1. **关 thinking 可能影响摘要质量。** 原作者注释 "Compression quality drives every subsequent main turn — keep reasoning on" 表达过对此的担忧。本 spec 的判断是「可预测的 token 上限」优先于「最大化质量」，但落地后需要观察 telemetry 里 `compression_input_token_count` / `compression_output_token_count` 的分布，以及主对话在压缩后的质量变化（用户反馈、`COMPRESSION_FAILED_*` 状态率）。如果质量下降明显，再考虑回退到 thinking 开启 + provider-specific thinkingBudget 控制。
+
+2. **`maxOutputTokens` 触顶可能导致 summary 被截断。** 关 thinking 后，20K 直接限制 summary 主体；claude-code 实测 p99.99 ≈ 17K，留 ~3K 安全冗余。但 qwen-code 的压缩 prompt 与 claude-code 不同，分布需要观测。建议在压缩失败分支（[chatCompressionService.ts:464-491](packages/core/src/services/chatCompressionService.ts:464)）追加「检测到 finish_reason = MAX_TOKENS」的 NOOP 路径，避免持久化半截 summary。
+
+3. **跨 provider 的 maxOutputTokens 映射差异。** OpenAI compat (dashscope) → `max_tokens`、Anthropic → `max_tokens`、Gemini SDK → `maxOutputTokens`。当前 qwen-code 已有这层映射（[contentGenerator.ts:94](packages/core/src/core/contentGenerator.ts:94) 等），需要在 P6 实现时验证 sideQuery 路径上 `maxOutputTokens` 字段确实贯穿到所有 provider 的请求体。
+
+4. **Token 估算只是粗略近似，不应反向用作"跳过触发"的依据。** `char/4` 与各 provider 真实 tokenizer 偏差可能 ±30%，既可能高估也可能低估，并非严格下界。本 spec 只用估算来「让阈值更早触发」（false-positive 方向，宁可早压不可晚压）。所有「降低 token 计数 / 跳过压缩」的代码路径仍应使用 `lastPromptTokenCount`（API 权威值）。
+
+5. **估算函数与现有 `estimateContentChars` 的关系。** [compactionInputSlimming.ts](packages/core/src/services/compactionInputSlimming.ts) 已经有 `estimateContentChars`（用于压缩 split point 计算），新增的 `estimateContentTokens` 应复用它（除以 bytesPerToken）而非新写一套，避免两套估算口径出现分歧。
+
+## 不在本 spec 范围
+
+- Env 变量覆盖通道（D 方案）：维持「配置面最小」原则
+- Footer 常驻可视化：留作 follow-up
+- 摘要 prompt 改进、`MIN_COMPRESSION_FRACTION` 调整：与阈值设计正交
+
+## 开放问题（等 review）
+
+1. **breaking change 强度**：警告 + 忽略字段 vs 启动报错。当前选警告，需要确认对企业部署/团队配置是否够友好
+2. **小窗口（32K）下 hard 与 auto 退化为同一值**：用户视角是否需要在 `/context` 明示「该窗口下 hard 已退化」
diff --git a/docs/e2e-tests/2026-05-18-qwen-memory-benchmark-report.md b/docs/e2e-tests/2026-05-18-qwen-memory-benchmark-report.md
new file mode 100644
index 0000000000..1a7aaf3253
--- /dev/null
+++ b/docs/e2e-tests/2026-05-18-qwen-memory-benchmark-report.md
@@ -0,0 +1,286 @@
+# Qwen Code Runtime Memory Benchmark Report
+
+Date: 2026-05-18
+
+## Summary
+
+This report records local memory benchmarks for Qwen Code runtime behavior. It
+compares Qwen Code across models and compares Qwen Code with Claude Code on the
+same task shapes where equivalent model endpoints were available.
+
+The headline result is consistent across the latest matrix (single run per cell,
+not statistically repeated):
+
+- Qwen Code process-tree RSS peak: about `852-1062 MiB` (`0.83-1.04 GiB`).
+- Claude Code process-tree RSS peak: about `279-366 MiB` (`0.27-0.36 GiB`).
+- Qwen Code was about `2.3x-3.6x` higher in the tested
+  non-interactive CLI task benchmarks.
+
+Note: process-tree RSS includes MCP child processes (~350 MiB overhead on the
+Qwen side). This inflates the absolute numbers but the relative comparison
+remains informative since both CLIs were measured the same way.
+
+The difference reproduced in small PR review, code navigation, and synthetic
+diff workloads. It is therefore unlikely to be explained only by one large PR
+or by one model provider.
+
+This report is intended to make the current performance investigation visible:
+what has been measured, what conclusion is already supported, what remains
+unknown, and what diagnostics should be added next.
+
+## Test Environment
+
+| Item                                          | Value                                      |
+| --------------------------------------------- | ------------------------------------------ |
+| Date                                          | 2026-05-18                                 |
+| Platform                                      | macOS local development machine            |
+| Qwen Code version                             | `0.15.11`                                  |
+| Qwen Code binary                              | PATH-resolved `qwen` binary                |
+| Claude Code version used in the latest matrix | `2.1.129`                                  |
+| Claude Code binary used in the latest matrix  | PATH-resolved `claude` binary              |
+| Node.js version                               | v22.x (default system install)             |
+| Sampling method                               | External `ps` RSS sampling once per second |
+| Headline metric                               | Process-tree RSS peak                      |
+
+Process-tree RSS is used as the headline metric because Qwen Code launches a
+root wrapper and a child Node/Qwen worker. Looking only at the root process can
+understate the memory footprint seen by users.
+
+Temporary CLI config directories were used for matrix runs so the benchmarks
+did not depend on global CLI state.
+
+## Benchmark Artifacts
+
+Five local reports were produced before this consolidated report:
+
+1. Qwen Code PR review memory run.
+2. Qwen Code model comparison run.
+3. Strict Qwen Code vs Claude Code comparison with `pai/glm-5`.
+4. Qwen Code vs Claude Code, two CLIs by two models.
+5. Qwen Code vs Claude Code, five-case matrix.
+
+This consolidated report covers the conclusions and headline metrics from all
+five reports. It does not embed every raw sample row, terminal transcript, or
+temporary runner artifact. Those raw artifacts stayed in local `tmp/`
+directories because they are experiment outputs rather than stable repository
+fixtures.
+
+The latest matrix is the strongest evidence because it covers multiple task
+shapes rather than only one PR review workload.
+
+## Preliminary Conclusion
+
+The current data is strong enough to say that Qwen Code has a higher runtime
+memory footprint than Claude Code in these local non-interactive CLI task
+benchmarks. It is not strong enough to name one final root cause yet.
+
+The leading explanation is a Qwen Code runtime/path difference rather than a
+model provider difference:
+
+- the gap reproduces with both `pai/glm-5` and `qwen3.6-plus`;
+- the gap reproduces in small PR and code-navigation tasks, not only in large
+  diff tasks;
+- Qwen Code repeatedly sends or accounts for more tokens than Claude Code for
+  similar work;
+- Qwen Code's largest observed component is the child Node/Qwen worker process,
+  which points toward task-time process footprint, module loading, context
+  assembly, live history, tool-result retention, or subagent/saved-output
+  paths.
+
+The most useful next measurement is therefore not another external RSS-only
+run. The next measurement should split RSS into V8 heap, native memory,
+session/history size, retained tool-result size, and subagent/process-tree
+activity.
+
+## Initial Cause Analysis
+
+The benchmark does not yet prove one root cause, but it does narrow the likely
+problem area.
+
+| Signal                                                                                       | What it suggests                                                                           | What it does not prove                                                                                  |
+| -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------- |
+| Qwen remains near `1 GiB` in small PR and code-navigation cases                              | A high non-interactive task-time runtime cost is likely involved                           | It does not identify whether the footprint is V8 heap, native memory, module loading, or retained state |
+| RSS does not scale linearly with diff size from 100 KiB to 5 MiB                             | Raw diff bytes alone are probably not the primary driver                                   | Large outputs can still amplify memory in real PR review flows                                          |
+| Qwen uses more tokens than Claude in every matrix cell                                       | Qwen likely constructs or retains larger prompt/context/tool-result state for similar work | Token count is not the same as process memory and may be an effect rather than the cause                |
+| Tool call counts are similar, and Claude sometimes uses more turns/tool calls with lower RSS | A longer tool-call chain is unlikely to be the main explanation by itself                  | Tool output size and retention still need to be measured                                                |
+| Earlier large PR runs showed saved-output recovery and subagent amplification                | Tool-output truncation and saved-output paths are likely heavy-workload amplifiers         | They do not explain the entire small-task execution footprint                                           |
+
+The current best explanation is therefore:
+
+1. **Task-time runtime cost first**: Qwen Code likely initializes or retains
+   more runtime state during non-interactive CLI task execution than Claude
+   Code. This may include agent runtime, tool registry, provider adapters,
+   session services, or UI/history structures that are not strictly needed for
+   a short non-interactive task.
+2. **Context/tool-result volume second**: Qwen Code appears to carry larger
+   model-facing or session-facing context for similar work. The token gap makes
+   context assembly, tool result normalization, and history retention important
+   suspects.
+3. **Large-output amplification third**: Large PR review can trigger additional
+   saved-output and subagent paths. These are probably not the only cause, but
+   they can make memory and token pressure worse in realistic review tasks.
+
+The next diagnostic run should answer where the `~1 GiB` sits:
+
+- high immediately after startup: module/runtime startup cost;
+- jumps after tool execution: tool-output retention or result normalization;
+- jumps during request assembly: context construction or duplicated histories;
+- grows after streaming/compression: response retention or compression state;
+- mostly RSS outside V8 heap: native buffers, loaded modules, or external
+  memory.
+
+## Latest Matrix
+
+The latest benchmark ran:
+
+- 2 CLIs: Qwen Code and Claude Code.
+- 2 model labels: `pai/glm-5` and `qwen3.6-plus`.
+- 5 cases:
+  - small PR review: PR `#4268`, one-line change
+  - code navigation: `rg` plus `sed` on compression-related files
+  - synthetic local diff, about 100 KiB
+  - synthetic local diff, about 1 MiB
+  - synthetic local diff, about 5 MiB
+
+All 20 runs exited `0` with no timeout.
+
+## Matrix Results
+
+| Case             | Model          | Qwen tree peak | Claude tree peak | Qwen / Claude |
+| ---------------- | -------------- | -------------: | ---------------: | ------------: |
+| small PR `#4268` | `pai/glm-5`    |     1032.7 MiB |        357.8 MiB |         2.89x |
+| small PR `#4268` | `qwen3.6-plus` |      852.2 MiB |        365.5 MiB |         2.33x |
+| code navigation  | `pai/glm-5`    |      993.1 MiB |        359.6 MiB |         2.76x |
+| code navigation  | `qwen3.6-plus` |      996.9 MiB |        349.0 MiB |         2.86x |
+| diff 100 KiB     | `pai/glm-5`    |     1012.1 MiB |        350.8 MiB |         2.89x |
+| diff 100 KiB     | `qwen3.6-plus` |     1001.1 MiB |        336.2 MiB |         2.98x |
+| diff 1 MiB       | `pai/glm-5`    |     1008.3 MiB |        278.8 MiB |         3.62x |
+| diff 1 MiB       | `qwen3.6-plus` |     1003.3 MiB |        340.5 MiB |         2.95x |
+| diff 5 MiB       | `pai/glm-5`    |      858.8 MiB |        323.2 MiB |         2.66x |
+| diff 5 MiB       | `qwen3.6-plus` |     1062.0 MiB |        331.2 MiB |         3.21x |
+
+Average process-tree RSS peak by case:
+
+| Case             | Avg Qwen tree peak | Avg Claude tree peak |
+| ---------------- | -----------------: | -------------------: |
+| small PR `#4268` |          942.5 MiB |            361.6 MiB |
+| code navigation  |          995.0 MiB |            354.3 MiB |
+| diff 100 KiB     |         1006.6 MiB |            343.5 MiB |
+| diff 1 MiB       |         1005.8 MiB |            309.6 MiB |
+| diff 5 MiB       |          960.4 MiB |            327.2 MiB |
+
+## Runtime And Token Signals
+
+The same matrix also showed Qwen Code using more model-side tokens in every
+tested case.
+
+Selected examples:
+
+| Case            | Model          | CLI    | Duration | Turns | Total tokens | Tool calls |
+| --------------- | -------------- | ------ | -------: | ----: | -----------: | ---------: |
+| small PR        | `pai/glm-5`    | Qwen   |    25.2s |     2 |       32,567 |          3 |
+| small PR        | `pai/glm-5`    | Claude |    21.1s |     4 |        7,899 |          3 |
+| code navigation | `qwen3.6-plus` | Qwen   |    25.2s |     2 |       38,151 |          3 |
+| code navigation | `qwen3.6-plus` | Claude |    46.9s |     6 |       25,861 |          5 |
+| diff 100 KiB    | `qwen3.6-plus` | Qwen   |    16.5s |     3 |       57,185 |          2 |
+| diff 100 KiB    | `qwen3.6-plus` | Claude |    17.2s |     3 |        6,377 |          2 |
+| diff 5 MiB      | `pai/glm-5`    | Qwen   |    23.2s |     2 |       38,574 |          2 |
+| diff 5 MiB      | `pai/glm-5`    | Claude |     9.8s |     3 |        5,285 |          2 |
+
+This token gap does not prove that token volume is the memory root cause, but it
+does suggest that context assembly, tool result retention, or response
+normalization should be measured alongside RSS and V8 heap statistics.
+
+## Token Usage Analysis
+
+The token gap is one of the strongest clues, but it needs internal request
+metrics before it can be treated as a root cause.
+
+What the data supports today:
+
+- Qwen Code used more total tokens than Claude Code in every matrix cell.
+- The gap appears even when tool-call counts are similar.
+- Claude sometimes used more turns or tool calls while still using less memory.
+
+What this suggests:
+
+- The token delta is unlikely to come only from a longer tool-call chain.
+- Qwen may be carrying larger static prompt/context state, larger tool schemas,
+  larger serialized tool results, or more retained conversation/session content.
+- Large-output flows may add another layer through truncation, saved-output
+  recovery, or subagent paths.
+
+What is still missing:
+
+- per-request input token breakdown;
+- system prompt and tool schema token sizes;
+- retained message and tool-result sizes before each model request;
+- whether large outputs are retained in multiple places, such as model history,
+  UI history, session recording, or saved-output storage.
+
+Those missing metrics are why the next step should add internal diagnostics
+rather than only repeat the external RSS benchmark.
+
+## Earlier Large PR Review Signal
+
+An earlier strict PR review benchmark used PR `#4186` and showed the same broad
+shape:
+
+| Model          | CLI         | Process-tree RSS peak |
+| -------------- | ----------- | --------------------: |
+| `pai/glm-5`    | Qwen Code   |            1000.7 MiB |
+| `pai/glm-5`    | Claude Code |             349.0 MiB |
+| `qwen3.6-plus` | Qwen Code   |            1095.8 MiB |
+| `qwen3.6-plus` | Claude Code |             341.1 MiB |
+
+That earlier run was not enough by itself because a large PR can trigger unusual
+tool-output and saved-output paths. The latest five-case matrix makes the
+finding stronger because small PR and code-navigation tasks also reproduce the
+gap.
+
+## Working Hypothesis
+
+The current evidence supports these hypotheses, in priority order:
+
+1. Qwen Code has a higher non-interactive task-time process footprint than
+   Claude Code. The Qwen child Node worker was typically the largest process in
+   local sampling, often around `0.7-0.8 GiB`.
+2. Model choice is not the main explanation. Both `pai/glm-5` and
+   `qwen3.6-plus` showed the same broad Qwen-vs-Claude gap.
+3. Large diff size alone is not the main explanation. RSS did not scale
+   linearly as the synthetic diff grew from 100 KiB to 5 MiB, likely because
+   tool-output truncation caps how much output reaches the model.
+4. Context/tool-result handling is still a likely contributor. Qwen Code used
+   more tokens than Claude Code in every matrix cell, and earlier large-PR runs
+   showed saved tool-output recovery and subagent amplification paths.
+5. The next diagnostic layer should separate V8 heap, native RSS, loaded
+   module/runtime startup cost, session history, UI history, tool-result
+   retention, and subagent activity. External RSS alone cannot distinguish
+   those causes.
+
+## Caveats
+
+- These are single runs per matrix cell, not repeated statistical samples.
+- RSS is external process RSS. It cannot distinguish V8 heap, native buffers,
+  module loading, retained tool output, UI state, or session history.
+- Claude Code and Qwen Code use different runtime implementations and protocol
+  adapters, even when the model labels are the same.
+- The benchmark was run locally on macOS. Linux servers should be tested before
+  drawing deployment-specific conclusions.
+
+## Recommended Follow-Up Measurements
+
+The next local investigation branch should add or use diagnostics for:
+
+- `process.memoryUsage()` before and after startup, tool execution, streaming,
+  compression, and session finalization.
+- V8 heap statistics and heap spaces.
+- Active handles and requests.
+- Session message count and approximate retained character/token volume.
+- Tool result count, total retained tool-result size, largest tool-result size,
+  and whether large outputs are retained by UI history or model history.
+- Subagent count and child process/process-tree RSS.
+- Tool-output truncation and saved-output recovery events.
+
+These measurements should be collected with the same benchmark matrix so the
+current RSS comparison can be connected to internal Qwen Code state.
diff --git a/docs/e2e-tests/2026-05-19-oom-reproduction-report.md b/docs/e2e-tests/2026-05-19-oom-reproduction-report.md
new file mode 100644
index 0000000000..8716e208f5
--- /dev/null
+++ b/docs/e2e-tests/2026-05-19-oom-reproduction-report.md
@@ -0,0 +1,437 @@
+# OOM 压力测试与长任务 Replay 报告
+
+**日期**: 2026-05-19
+**分支**: `codex/memory-diagnostics-local-run`
+**测试人**: yiliang114
+**结论**: 成功复现并定位根因。v0.15.7 (#3735) 引入的 auto-compaction 使 `structuredClone`
+调用频率倍增，在高 heap 压力时形成正反馈死循环导致 OOM。真实 debug 日志完整佐证了该机制。
+
+---
+
+## 一、背景
+
+多个 issue（#4309, #4276, #4185, #4315, #4322, #2868）报告 qwen-code 在长会话中出现 V8 heap OOM crash：
+
+```
+FATAL ERROR: Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory
+```
+
+用户报告的崩溃特征：
+| Issue | 崩溃时 Heap | 运行时长 | 平台 |
+|-------|------------|---------|------|
+| #4276 | 4014 MB | ~110 分钟 | Linux x64 |
+| #4315 | 2027 MB | ~19.6 小时 | macOS (默认 2GB limit) |
+| #4322 | 4023 MB | ~7 小时 | Windows |
+| #2868 | 2035 MB | ~1.7 分钟 | Linux |
+| #4309 | 7020 MB | 未知 | Windows (设了 8GB limit 仍崩) |
+
+---
+
+## 二、方法论修正
+
+本报告区分两类测试：
+
+1. **低 heap 压力测试**：通过降低 `--max-old-space-size` 放大问题，用于快速定位
+   “history 很大时整段复制导致瞬时峰值”的代码路径。它是诊断工具，不等价于用户真实
+   4G/8G OOM 复现。
+2. **默认 heap 长任务 replay**：不设置 `NODE_OPTIONS`，使用真实 JSONL 历史恢复并
+   继续执行 review 任务，同时从进程外采样 process-tree RSS。这类结果才用于判断
+   用户侧实际内存量级。
+
+因此，低 heap 结果不能单独作为“真实 OOM 已修复”的证明。它只能说明某条路径在
+history 足够大时会产生峰值放大，需要再用默认 heap 长任务验证。
+
+## 三、低 heap 压力测试条件
+
+| 参数                     | 值                                                           |
+| ------------------------ | ------------------------------------------------------------ |
+| CLI 版本                 | 0.15.11 (从 `codex/memory-diagnostics-local-run` 分支 build) |
+| Model                    | `qwen3.6-plus` (128K context window)                         |
+| Heap limit               | `--max-old-space-size=512`                                   |
+| Heap-pressure safety net | **禁用** (HEAP_PRESSURE_COMPRESSION_RATIO 设为 99.0)         |
+| 操作模式                 | YOLO + 自动化多轮 Read 文件任务                              |
+| 工作目录                 | qwen-code monorepo (3538 .ts files, 1.26M lines)             |
+
+### 关键配置修改
+
+`packages/core/src/core/geminiChat.ts` 中将 heap-pressure compaction 阈值从 0.7 改为 99.0（使其永远不触发），模拟 #4186 修复前的状态。
+
+---
+
+## 四、低 heap 压力测试结果
+
+### 崩溃时间线
+
+```
+[21:26:59] #1 RSS:193.6MB Ctx:0%   → Read geminiChat.ts (1500 行)
+[21:27:46] #2 RSS:270.4MB Ctx:4.2% → Read agent.ts
+[21:28:32] #3 RSS:397.5MB Ctx:4.3% → grep + Read 3 个文件
+[21:29:18] #4 RSS:452.7MB Ctx:5.7% → Read slashCommandProcessor.ts
+[21:30:04] #5 RSS:515.0MB Ctx:5.9% → Read chatCompressionService.ts
+[21:30:50] #6 RSS:649.1MB Ctx:4.0% ← TOKEN COMPACTION 触发 (5.9%→4.0%)
+                                       RSS 反增 134MB (structuredClone 峰值)
+[21:31:36] #7 RSS:666.7MB Ctx:3.2% ← 再次 compaction, RSS 继续涨
+[21:32:22] CRASH — FATAL ERROR: Ineffective mark-compacts near heap limit
+```
+
+**总耗时**: ~5.5 分钟，7 轮任务后崩溃。
+
+这证明在受限 heap 下，长 history + compaction/history clone 可以触发 V8 heap OOM。
+但该结果不代表默认 heap 下的真实用户 OOM 已经被完整复现。
+
+### 更大 heap 的 synthetic 复现
+
+为避免只依赖 512 MiB 低 heap 结论，补充了更大 heap 的 synthetic runtime
+pressure 测试。该测试不调用模型，而是构造类似长 review/subagent 任务的历史：
+
+- root review turns: 10
+- subagent calls: 30
+- subagent transcript records: 780
+- retained tool result bytes: 193,986,560
+- serialized history bytes: 195,620,061
+- pressure mode: retained `structuredClone(history)` copies
+
+| Heap limit |     Clone pressure | 结果                                     | 关键 GC / stack                                              |
+| ---------- | -----------------: | ---------------------------------------- | ------------------------------------------------------------ |
+| 2 GiB      |  8 retained clones | 未崩溃，RSS 2.42 GiB，heap used 1.87 GiB | 接近 heap limit                                              |
+| 2 GiB      | 10 retained clones | OOM                                      | `Reached heap limit`, `ValueDeserializer`, `StructuredClone` |
+| 4 GiB      | 20 retained clones | OOM                                      | `Reached heap limit`, `ValueDeserializer`, `StructuredClone` |
+
+2 GiB 复现的 GC 摘要：
+
+```
+Mark-Compact 2042.9 (2081.9) -> 2042.9 (2081.1) MB
+Mark-Compact 2048.9 (2087.2) -> 2048.9 (2087.2) MB
+FATAL ERROR: Reached heap limit Allocation failed - JavaScript heap out of memory
+...
+node::worker::(anonymous namespace)::StructuredClone
+```
+
+4 GiB 复现的 GC 摘要：
+
+```
+Mark-Compact 4082.5 (4126.8) -> 4082.5 (4126.3) MB
+Mark-Compact 4095.1 (4139.0) -> 4095.1 (4139.0) MB
+FATAL ERROR: Reached heap limit Allocation failed - JavaScript heap out of memory
+...
+node::worker::(anonymous namespace)::StructuredClone
+```
+
+这组结果比 512 MiB 压力测试更接近用户报告的 2 GiB / 4 GiB heap OOM：
+只要 history 中保留足够多的大 tool result / subagent transcript，对整段 history
+做 retained 或瞬时 clone 都可以在 2-4 GiB heap 下触发 V8 OOM。它仍然是 synthetic
+复现，不等价于完整业务长任务 replay，但能直接证明问题不是“小 heap 人为制造”的。
+
+### 崩溃时 GC 状态
+
+```
+[41381:0x130008000] 342468 ms: Mark-Compact 508.6 (526.7) -> 507.0 (526.9) MB,
+  pooled: 1 MB, 86.42 / 0.00 ms  (average mu = 0.175, current mu = 0.150)
+  task; scavenge might not succeed
+
+[41381:0x130008000] 342568 ms: Mark-Compact 509.1 (526.9) -> 507.1 (528.2) MB,
+  pooled: 0 MB, 93.79 / 0.12 ms  (average mu = 0.121, current mu = 0.068)
+  allocation failure; scavenge might not succeed
+
+FATAL ERROR: Ineffective mark-compacts near heap limit
+Allocation failed - JavaScript heap out of memory
+```
+
+Mark-Compact 只能回收 1-2 MB（几乎所有对象都是 reachable），证明内存确实被合法持有的对象占满。
+
+---
+
+## 五、默认 heap 长任务 replay
+
+为了避免低 heap 结论过度外推，补充了默认 heap 的真实 JSONL replay：
+
+- 不设置 `NODE_OPTIONS`
+- 不启用内部 runtime profiler，避免采样器自身影响 heap
+- 每个 CLI 从同一份 rewound JSONL 复制出 fresh session
+- 使用临时 `QWEN_HOME`，禁用 MCP 和 hooks，避免本地全局配置污染
+- 只用进程外采样统计 process-tree RSS
+
+| CLI                  | 结果 |   时长 | Tree RSS 峰值 | Root RSS 峰值 | Worker RSS 峰值 | 备注                                                        |
+| -------------------- | ---- | -----: | ------------: | ------------: | --------------: | ----------------------------------------------------------- |
+| installed `qwen`     | 成功 | 167.3s |     838.0 MiB |     230.2 MiB |       566.3 MiB | 第一次 fresh run 遇到模型服务端错误，未纳入结论；retry 成功 |
+| local rebuilt bundle | 成功 | 106.3s |     527.5 MiB |     182.1 MiB |       345.4 MiB | 包含本地 clone 热路径修复                                   |
+
+默认 heap replay 的结论：
+
+1. 当前这份 review JSONL 可以稳定跑出数百 MiB 到约 0.8 GiB 的 process-tree RSS，
+   但没有复现 4G/8G OOM。
+2. 本地 rebuilt bundle 在同起点 replay 上的峰值低于 installed CLI，说明减少
+   history clone 热路径有实际收益。
+3. 这还不能证明所有用户 OOM 都已解决。真实 4G/8G OOM 仍需要更长任务、更大
+   tool-result 累积，或保留 MCP/tool schema 压力的 replay 继续验证。
+
+## 六、根因分析
+
+### OOM 的三层机制
+
+```
+┌─────────────────────────────────────────────────────────┐
+│ Layer 3: V8 Heap Limit (512MB/2GB/4GB)                  │ ← 用户最终撞到这里
+├─────────────────────────────────────────────────────────┤
+│ Layer 2: structuredClone() 峰值放大 (瞬时 ~2x)         │ ← 直接诱因
+├─────────────────────────────────────────────────────────┤
+│ Layer 1: History 中 tool result 累积 (线性增长)         │ ← 基础增长
+├─────────────────────────────────────────────────────────┤
+│ Layer 0: Token compaction 触发时机                      │ ← 控制点
+└─────────────────────────────────────────────────────────┘
+```
+
+### 精确崩溃路径
+
+```
+sendMessage()
+  → tryCompress()
+    → heapPressureRatio < threshold (safety net disabled)
+    → ChatCompressionService.compress()
+      → chat.getHistory(true)
+        → structuredClone(this._history)   ← 峰值分配！
+          → V8 需要额外 ~N MB 来容纳 clone
+          → 如果 existing heap + N > limit → OOM
+```
+
+### 关键证据
+
+| 观察                                    | 含义                                           |
+| --------------------------------------- | ---------------------------------------------- |
+| Task #5→#6: Context 5.9%→4.0% (降了)    | Token compaction **成功执行**了                |
+| Task #5→#6: RSS 515→649 MB (涨了 134MB) | Compaction 过程的 `structuredClone` 制造了峰值 |
+| GC 只能回收 1-2 MB                      | 所有对象都是 live（history + clone 都在）      |
+| #4309 设 8GB limit 仍崩                 | history 足够大时，clone 峰值可超任何 limit     |
+
+需要注意：以上证据来自低 heap 压力测试和 issue 现象的组合推断。默认 heap replay
+目前支持“clone 热路径会显著影响峰值 RSS”，但尚未单独复现 4G/8G OOM。
+
+### 为什么 128K context window 更容易触发
+
+- 128K × 70% = ~90K tokens 触发 compaction
+- 大 context window (1M) 的 70% = 700K tokens，几乎不会触发
+- **compaction 越频繁 → structuredClone 越频繁 → OOM 风险越高**
+- DeepSeek 等未配置 contextWindowSize 的模型默认 128K，更易触发
+
+---
+
+## 六.5、真实运行日志佐证
+
+以下日志提取自本地 crash session 的 debug 输出。为避免泄露本地路径和 session id，
+报告只保留时间线和关键日志内容。
+
+该 session 启动于 `2026-05-19T13:26:35Z` (本地 21:26:35)，crash 于
+`2026-05-19T13:32:10Z` (本地 21:32:10)。
+
+### Heap Pressure 与 Auto-Compaction 事件时间线
+
+```
+13:29:43 [WARN]  Heap pressure at 74.9%; attempting auto-compaction before token threshold.
+13:30:06 [DEBUG] [FILE_READ_CACHE] clear after auto tryCompress    ← compaction #1 执行成功
+13:30:13 [WARN]  Heap pressure at 70.7%; attempting auto-compaction before token threshold.
+                 ← 刚压完 heap 从 74.9% 仅降到 70.7%，仍超阈值，立即再次尝试
+13:30:52 [DEBUG] Heap pressure at 86.0%; skipping heap-pressure auto-compaction during cooldown.
+                 ← 30s cooldown 期间拒绝执行
+13:30:56 [WARN]  Heap pressure at 85.3%; attempting auto-compaction before token threshold.
+                 ← cooldown 过期，heap 已升至 85.3%
+13:31:21 [DEBUG] [FILE_READ_CACHE] clear after auto tryCompress    ← compaction #2 执行成功
+13:31:37 [WARN]  Heap pressure at 88.8%; attempting auto-compaction before token threshold.
+                 ← 压完后 heap 反弹至 88.8%
+13:32:09 [DEBUG] Heap pressure at 90.2%; skipping heap-pressure auto-compaction during cooldown.
+                 ← heap 已达 90.2%，cooldown 中无法执行
+13:32:10 ← 日志终止（进程 OOM crash）
+```
+
+### 日志证据解读
+
+| 日志观察                                                                              | 含义                                                      |
+| ------------------------------------------------------------------------------------- | --------------------------------------------------------- |
+| 2.5 分钟内触发 **4 次** heap-pressure auto-compaction 尝试（另有 2 次 cooldown 拒绝） | #3735 引入的 `tryCompress` 在高压时频繁触发               |
+| 每次 compaction 执行后 heap 占比仍 >70%                                               | `structuredClone()` 制造的临时峰值抵消了压缩收益          |
+| 74.9% → 70.7% → 86% → 85.3% → 88.8% → 90.2% → crash                                   | 正反馈循环：压缩→clone 峰值→heap 更高→再压缩→更高         |
+| 日志在 90.2% 后 1 秒内断裂                                                            | 下一次 `getHistory(true)` 的 `structuredClone()` 瞬间超限 |
+| `[FILE_READ_CACHE] clear after auto tryCompress` 出现 2 次                            | 证实 compaction 确实走了完整的 compress → setHistory 路径 |
+
+### 正反馈死循环机制
+
+```
+heap 占比高 (>70%)
+  → 触发 heap-pressure auto-compaction
+    → tryCompress() 内部调用 getHistory(true)
+      → structuredClone(this._history)  ← 瞬时 heap 峰值 +30~40%
+        → compaction 成功，释放旧 history
+          → 但 clone 峰值已经把 heap 推高到更危险的水位
+            → 下一轮 send 继续累积
+              → heap 占比更高 → 更频繁触发 → crash
+```
+
+---
+
+## 六.6、版本归因：为什么 0.15.7 ~ 0.15.11 期间 OOM 报告增多
+
+### 关键 commit 时间线
+
+| 版本         | PR                                                   | 改动                                                                                | 对 `structuredClone` 调用频率的影响 |
+| ------------ | ---------------------------------------------------- | ----------------------------------------------------------------------------------- | ----------------------------------- |
+| **v0.15.6**  | —                                                    | `getHistory(true)` 仅在 `sendMessage` 入口调用 1 次                                 | 基线：每次 send 1 次 clone          |
+| **v0.15.7**  | **#3735** `auto-compact subagent context`            | 将 `tryCompress()` 下沉到 `GeminiChat`，**每次 send 前**先执行一次 compaction 检查  | **+1 次**：send 前 compress 检查    |
+| **v0.15.10** | **#3879** `reactive compression on context overflow` | 当 provider 返回 context overflow 时，再次触发 `tryCompress()` + `getHistory(true)` | **+1~2 次**：overflow retry 路径    |
+| **v0.15.10** | **#3985** `harden reactive compression`              | 强化 reactive compression 重试逻辑                                                  | 同上                                |
+
+### v0.15.6 vs v0.15.11 的 `getHistory(true)` 调用点对比
+
+**v0.15.6** (2 处)：
+
+```
+L367: const requestContents = this.getHistory(true);          ← send 构造 request
+L618: const recoveryContents = self.getHistory(true);         ← MAX_TOKENS escalation (极少触发)
+```
+
+**v0.15.11** (5 处)：
+
+```
+L467: ChatCompressionService.compress() 内部调用              ← #3735: 每次 send 前的 auto-compact
+L574: requestContents = this.getHistory(true);                ← send 构造 request
+L724: reactive tryCompress() 内部调用                         ← #3879: context overflow 后 retry
+L739: requestContents = self.getHistory(true);                ← #3879: retry 构造新 request
+L943: const recoveryContents = self.getHistory(true);         ← MAX_TOKENS escalation
+```
+
+### 最坏路径：一次 send 可触发 4 次 `structuredClone`
+
+```
+sendMessage()
+  → tryCompress()              ← #3735: getHistory(true) [clone #1]
+  → getHistory(true)           ← 构造 request [clone #2]
+  → API 返回 context overflow
+    → reactive tryCompress()   ← #3879: getHistory(true) [clone #3]
+    → getHistory(true)         ← retry request [clone #4]
+```
+
+### 结论
+
+**#3735 (v0.15.7)** 是 OOM 频率显著上升的最可能触发因素（非唯一根因）——它使每次
+`sendMessage` 都会先跑一次 `tryCompress()`，而 `tryCompress` 内部通过
+`ChatCompressionService.compress()` → `chat.getHistory(true)` 做全量 `structuredClone`。
+在 history 较大时，这个 “先 clone 再判断是否需要压缩” 的设计让内存峰值从 ~1.3x 升至 ~2x+。
+注：issue history 显示 OOM 报告在 #3735 之前就已存在，但 #3735 大幅增加了 structuredClone
+的调用频率，从而显著提高了 OOM 的触发概率。
+
+**#3879 (v0.15.10)** 进一步恶化了问题——在已经处于 heap 边界时 (provider 返回 context overflow)
+再触发一次全量 clone，使原本就危险的 session 更容易 crash。
+
+---
+
+## 七、#4186 修复效果验证（对比测试）
+
+启用 heap-pressure safety net (HEAP_PRESSURE_COMPRESSION_RATIO = 0.7) 后的对比测试：
+
+| 指标            | 禁用 safety net    | 启用 safety net           |
+| --------------- | ------------------ | ------------------------- |
+| OOM 发生        | 是（7 轮后 crash） | 否（持续运行 >10 分钟）   |
+| RSS 峰值        | 666 MB → crash     | 555 MB → GC 回收到 280 MB |
+| Compaction 触发 | 仅 token threshold | heap 70% 时提前触发       |
+| Context 行为    | 5.9%→4.0%→crash    | 22.7%→17.0%（安全回落）   |
+
+**结论**: #4186 的 heap-pressure safety net 有效防止了 OOM，但它是一个**缓解**而非根治：
+
+- 如果 history 本身已经占了 heap 的 60%+，即使提前 compact，clone 的峰值仍然可能超限
+- 这解释了为什么 #4309 用户设了 8GB limit 后仍然 crash
+
+---
+
+## 八、内存占用分布
+
+基于测试中的 RSS 增长模式估算：
+
+| 内存位置                         | 占比   | 增长特征                    |
+| -------------------------------- | ------ | --------------------------- |
+| `this._history[]` (tool results) | 40-50% | 线性累积，每轮 +30-100MB    |
+| `structuredClone()` 临时拷贝     | 30-40% | 瞬时峰值，compaction 时出现 |
+| V8 runtime (GC metadata, code)   | ~15%   | 基本恒定                    |
+| UI/logging/stream buffers        | ~5%    | 缓慢增长                    |
+
+---
+
+## 九、复现脚本与环境
+
+### 自动化驱动脚本
+
+```bash
+#!/bin/bash
+# /tmp/oom-simple-driver.sh <tmux-session-name>
+SESSION="$1"
+
+TASKS=(
+  "用 Read 工具完整读取 packages/core/src/core/geminiChat.ts"
+  "用 Read 工具完整读取 packages/core/src/tools/agent/agent.ts"
+  "用 grep -rn structuredClone packages/core/src 然后 Read 前 3 个文件"
+  "用 Read 完整读取 packages/cli/src/ui/hooks/slashCommandProcessor.ts"
+  "用 Read 完整读取 packages/core/src/services/chatCompressionService.ts"
+  "用 find packages/cli/src/ui/commands -name '*.ts' 然后逐一 Read"
+  "用 Read 完整读取 packages/core/src/core/turn.ts"
+  # ... 更多任务
+)
+
+i=0
+while true; do
+  TASK="${TASKS[$((i % ${#TASKS[@]}))]}"
+  i=$((i + 1))
+
+  QWEN_PID=$(ps aux | grep "dist/index.js" | grep -v grep | awk '{print $2}' | sort -rn | head -1)
+  RSS=$(ps -o rss= -p $QWEN_PID 2>/dev/null)
+  [ -z "$RSS" ] && { echo "CRASH after $((i-1)) tasks!"; exit 0; }
+
+  RSS_MB=$(echo "scale=1; $RSS/1024" | bc)
+  CTX=$(tmux capture-pane -t "$SESSION:1" -p 2>/dev/null | grep -oE "[0-9]+\.[0-9]+% 已用" | tail -1)
+  echo "[$(date +%H:%M:%S)] #$i RSS:${RSS_MB}MB Ctx:$CTX | ${TASK:0:55}"
+
+  tmux send-keys -t "$SESSION:1" C-u
+  sleep 0.2
+  tmux send-keys -t "$SESSION:1" "$TASK" Enter
+  sleep 0.5
+  tmux send-keys -t "$SESSION:1" Enter
+  sleep 45
+done
+```
+
+### 启动命令
+
+```bash
+# 1. 禁用 heap-pressure safety net
+# geminiChat.ts: HEAP_PRESSURE_COMPRESSION_RATIO = 99.0
+
+# 2. Build
+npm run build --workspace=packages/core && npm run build --workspace=packages/cli
+
+# 3. 启动 qwen (128K context model, 512MB heap)
+SESSION="oom-test"
+tmux new-session -d -s "$SESSION" -c "$REPO_DIR"
+tmux send-keys -t "$SESSION" \
+  "NODE_OPTIONS='--max-old-space-size=512' node packages/cli/dist/index.js --model 'qwen3.6-plus'" Enter
+
+# 4. 等待启动后运行驱动
+sleep 10
+bash /tmp/oom-simple-driver.sh "$SESSION"
+```
+
+---
+
+## 十、后续建议
+
+### 短期缓解（已有）
+
+- [x] #4186: heap-pressure auto-compaction safety net (0.7 threshold)
+- [x] #4188: fileReadCache / crawlCache 上限
+
+### 中期修复（建议）
+
+- [ ] 减少 `structuredClone()` 调用 — `nextSpeakerChecker` 只需最后一条消息，不需 clone 全量
+- [ ] Compaction 使用 slice + 引用替代全量 deep clone
+- [ ] 大 tool result (>100KB) 写入临时文件，history 中只保留摘要引用
+
+### 长期方向
+
+- [ ] Tool result offload 到磁盘 + lazy load (#4184)
+- [ ] 基于 RSS 的分级压缩策略（不仅是 token count）
+- [ ] History 分段存储，避免单次全量操作
diff --git a/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md b/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md
new file mode 100644
index 0000000000..e482f0f94c
--- /dev/null
+++ b/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md
@@ -0,0 +1,904 @@
+# Qwen Code Runtime Diagnostics Benchmark Report
+
+Date: 2026-05-19
+
+## Scope
+
+This run repeats the previous Qwen Code benchmark shapes with the new opt-in
+runtime diagnostics enabled. It only tests Qwen Code, not Claude Code.
+
+Initial model matrix:
+
+- `pai/glm-5`
+- `qwen3.6-plus`
+
+Additional PR-size follow-up:
+
+- `DeepSeek/deepseek-v4-pro` through Anthropic-compatible protocol
+
+Cases:
+
+- small GitHub PR review: PR `#4268`
+- code navigation: compression / compaction related code search and reads
+- synthetic local diff: about 94.6 KiB
+- synthetic local diff: about 968.5 KiB
+- synthetic local diff: about 4.84 MiB
+
+The run used the local bundled CLI from the diagnostics branch, with
+`QWEN_CODE_PROFILE_RUNTIME=1` and a temporary CLI home. Global MCP servers and
+hooks were not loaded for this benchmark.
+
+Important caveat: these absolute RSS numbers are lower than the previous
+PATH-resolved `qwen` runs because this run used `node dist/cli.js` from the
+local branch plus a stripped temporary config. Treat this report as an internal
+diagnostics distribution run, not a direct replacement for the earlier installed
+CLI RSS comparison.
+
+## Installed CLI vs Local Bundle Sanity Check
+
+A follow-up sanity check used the same minimal prompt, model, and non-interactive
+mode across the installed CLI and the local diagnostics bundle. The only
+intentional variable was whether Qwen Code loaded a stripped temporary CLI home
+or the normal user config.
+
+| CLI                 | Config mode     | Total tokens | Tree RSS peak | Root RSS peak | Process count peak | Runtime diagnostics |
+| ------------------- | --------------- | -----------: | ------------: | ------------: | -----------------: | ------------------- |
+| PATH `qwen`         | stripped config |       33,965 |     542.4 MiB |     249.9 MiB |                  3 | no                  |
+| local `dist/cli.js` | stripped config |       47,281 |     455.2 MiB |     214.2 MiB |                  4 | yes                 |
+| PATH `qwen`         | normal config   |       97,615 |   1,099.9 MiB |     250.1 MiB |                  6 | no                  |
+| local `dist/cli.js` | normal config   |       97,954 |   1,105.4 MiB |     212.7 MiB |                  8 | yes                 |
+
+This check changes the attribution: the earlier 1 GiB user-visible peak is
+reproducible with the normal config even on the local diagnostics bundle. It is
+therefore not primarily explained by the local branch including PR `#4186`.
+
+At the normal-config peak, the local process-tree sample was dominated by
+multiple Node/MCP processes rather than the Qwen root process alone:
+
+| Role  | Command shape             | RSS at tree peak |
+| ----- | ------------------------- | ---------------: |
+| child | Node process              |        252.9 MiB |
+| child | Chrome DevTools MCP       |        219.7 MiB |
+| child | Node process              |        219.2 MiB |
+| root  | Qwen Node process         |        215.1 MiB |
+| child | Chrome DevTools MCP setup |        175.2 MiB |
+
+PR `#4186` is present in the local diagnostics branch, but it is a V8 heap
+pressure auto-compaction safety net. It triggers at about 70% V8 heap pressure;
+on this environment the Node heap limit is about 4.1 GiB, while the stripped
+benchmark end heap was about 99-143 MiB. Based on these numbers, the lower
+stripped-config RSS is not caused by `#4186` actively compressing context during
+these benchmark runs.
+
+### Bare Mode Config Attribution Check
+
+A second follow-up used `qwen3.6-plus` with the same PR-review prompt shape on
+both the installed CLI and the local bundle. This is not a normal end-to-end
+business benchmark. It is a controlled attribution check for startup/config
+memory only.
+
+`--bare` changes the runtime inputs: it skips normal global settings discovery,
+MCP startup, hooks, implicit context, skills, and other startup integrations. It
+can therefore fail or behave differently when a model provider is configured
+only in global settings. For this run, model credentials were supplied only
+through the child-process environment because bare mode intentionally does not
+load the normal provider settings. Nothing was written back to the user's global
+config.
+
+This run did not produce useful token/tool-call statistics: the model completed
+in one turn and did not call the requested shell command. Do not use these rows
+as normal task benchmark results, and do not compare their token/tool-call
+behavior with the matrix above. They are only useful for estimating how much
+process-tree RSS comes from normal config and configured child processes.
+
+| CLI                 | Mode     | Wall | Turns | Tool uses | Tree RSS peak | Root RSS peak | Process count peak |
+| ------------------- | -------- | ---: | ----: | --------: | ------------: | ------------: | -----------------: |
+| PATH `qwen`         | normal   | 5.5s |     1 |         0 |   1,021.3 MiB |     251.5 MiB |                  5 |
+| PATH `qwen`         | `--bare` | 2.4s |     1 |         0 |     525.7 MiB |     246.4 MiB |                  2 |
+| local `dist/cli.js` | normal   | 4.9s |     1 |         0 |   1,046.2 MiB |     213.3 MiB |                  5 |
+| local `dist/cli.js` | `--bare` | 2.3s |     1 |         0 |     454.3 MiB |     216.5 MiB |                  3 |
+
+The result confirms the process-tree hypothesis for startup/config attribution.
+On this machine, normal config adds roughly 496-592 MiB of user-visible
+process-tree RSS over `--bare`, while root RSS stays in the same 213-252 MiB
+band. At the normal-config peak, the extra RSS again came from additional
+Node/MCP child processes, including a Chrome DevTools MCP process and its setup
+wrapper. `--bare` removes those startup/config children and brings
+installed/local runs back into the 454-526 MiB tree-RSS range.
+
+### Temporary Settings MCP / Hooks Isolation
+
+Because `--bare` changes too many runtime inputs to be treated as a normal
+benchmark, a follow-up used temporary `QWEN_HOME` directories with generated
+settings files derived from the normal settings. The run stayed on the normal
+settings-loading path, but toggled only two config dimensions:
+
+- MCP disabled: `mcpServers` cleared and MCP allow/exclude lists emptied.
+- Hooks disabled: `disableAllHooks` set to true.
+
+No global settings were modified. The case used `qwen3.6-plus` and a minimal
+startup prompt, so it measures startup/config process-tree cost, not task
+reasoning quality.
+
+| CLI                 | Temporary config     | MCP servers | Tools | Tree RSS peak | Root RSS peak | Process count peak |
+| ------------------- | -------------------- | ----------: | ----: | ------------: | ------------: | -----------------: |
+| PATH `qwen`         | full                 |           4 |    46 |   1,017.4 MiB |     249.8 MiB |                  5 |
+| PATH `qwen`         | MCP disabled         |           0 |    17 |     548.7 MiB |     252.4 MiB |                  2 |
+| PATH `qwen`         | hooks disabled       |           4 |    46 |   1,003.8 MiB |     246.4 MiB |                  5 |
+| PATH `qwen`         | MCP + hooks disabled |           0 |    17 |     542.5 MiB |     248.0 MiB |                  2 |
+| local `dist/cli.js` | full                 |           4 |    48 |     865.9 MiB |     220.4 MiB |                  6 |
+| local `dist/cli.js` | MCP disabled         |           0 |    19 |     442.9 MiB |     209.6 MiB |                  2 |
+| local `dist/cli.js` | hooks disabled       |           4 |    48 |     848.3 MiB |     212.6 MiB |                  5 |
+| local `dist/cli.js` | MCP + hooks disabled |           0 |    19 |     447.2 MiB |     217.8 MiB |                  2 |
+
+Interpretation:
+
+1. Disabling MCP is the dominant change. It removes 4 MCP servers, reduces the
+   advertised tool count by 29 tools, and lowers process-tree RSS by about
+   423-469 MiB (full vs. MCP disabled) in this startup/config case.
+2. Disabling hooks alone barely changes RSS in this case. That is expected
+   because the prompt did not produce tool calls, so `PreToolUse` /
+   `PostToolUse` hooks were not executed.
+3. The root process stays around 0.21-0.25 GiB across all rows. The large
+   difference is again process-tree composition, not root Qwen RSS.
+
+Two attempted code-navigation follow-ups with `qwen3.6-plus` and `pai/glm-5`
+also reproduced the same MCP-vs-no-MCP memory split, but neither model produced
+tool calls in those runs. Those rows are therefore not used as hooks execution
+evidence. A valid hooks benchmark still needs a task/model combination that
+reliably emits tool calls.
+
+### Per-MCP Isolation
+
+The previous section showed that MCP as a group is the dominant startup/config
+memory factor. A follow-up isolated each configured MCP server while keeping
+hooks disabled for all rows. This keeps the test on the normal settings-loading
+path but changes only the MCP server subset.
+
+Configured MCP server names:
+
+- `approval-bridge`
+- `env-center`
+- `chrome-devtools`
+- `code`
+
+Single-pass isolation:
+
+| Variant                   | Enabled MCPs                                       | Tools | MCP servers | Tree RSS peak | Root RSS peak | Interpretation                       |
+| ------------------------- | -------------------------------------------------- | ----: | ----------: | ------------: | ------------: | ------------------------------------ |
+| none                      | none                                               |    19 |           0 |     444.4 MiB |     211.7 MiB | baseline without MCP                 |
+| full                      | all 4                                              |    48 |           4 |     857.3 MiB |     215.9 MiB | full MCP startup shape               |
+| only `approval-bridge`    | `approval-bridge`                                  |    19 |           1 |     455.5 MiB |     214.0 MiB | near baseline                        |
+| only `env-center`         | `env-center`                                       |    19 |           1 |     452.3 MiB |     214.4 MiB | near baseline                        |
+| only `chrome-devtools`    | `chrome-devtools`                                  |    48 |           1 |     824.4 MiB |     209.5 MiB | large RSS increase and tool increase |
+| only `code`               | `code`                                             |    19 |           1 |     452.1 MiB |     216.6 MiB | near baseline                        |
+| without `approval-bridge` | `env-center`, `chrome-devtools`, `code`            |    48 |           3 |     997.1 MiB |     215.4 MiB | still high; run showed variance      |
+| without `env-center`      | `approval-bridge`, `chrome-devtools`, `code`       |    48 |           3 |     863.8 MiB |     220.9 MiB | still high                           |
+| without `chrome-devtools` | `approval-bridge`, `env-center`, `code`            |    19 |           3 |     463.4 MiB |     221.6 MiB | returns near baseline                |
+| without `code`            | `approval-bridge`, `env-center`, `chrome-devtools` |    48 |           3 |     858.1 MiB |     219.5 MiB | still high                           |
+
+Because startup RSS has some variance, the key variants were repeated twice:
+
+| Variant                   | Samples | Tree RSS range      | Avg tree RSS | Result                         |
+| ------------------------- | ------: | ------------------- | -----------: | ------------------------------ |
+| none                      |       2 | 443.3-451.9 MiB     |    447.6 MiB | stable no-MCP baseline         |
+| full                      |       2 | 856.1-922.8 MiB     |    889.5 MiB | stable high-MCP range          |
+| only `chrome-devtools`    |       2 | 1,007.1-1,021.2 MiB |  1,014.2 MiB | enough alone to reproduce high |
+| without `chrome-devtools` |       2 | 461.1-461.6 MiB     |    461.4 MiB | removes the high RSS           |
+| only `approval-bridge`    |       2 | 449.1-449.9 MiB     |    449.5 MiB | near baseline                  |
+| only `env-center`         |       2 | 438.7-449.5 MiB     |    444.1 MiB | near baseline                  |
+| only `code`               |       2 | 450.6-451.3 MiB     |    451.0 MiB | near baseline                  |
+
+Interpretation:
+
+1. `chrome-devtools` is the dominant MCP contributor in this environment. It is
+   sufficient by itself to reproduce the high process-tree RSS.
+2. Removing `chrome-devtools` from the full MCP set returns RSS to the no-MCP
+   band. Removing other MCPs while keeping `chrome-devtools` does not.
+3. The advertised tool count follows the same pattern: baseline is 19 tools,
+   while `chrome-devtools` raises the tool count to 48. That means this MCP is
+   also likely to increase request tool schema size and token pressure, not just
+   process-tree RSS.
+4. `approval-bridge`, `env-center`, and `code` individually stay near the
+   no-MCP baseline in these startup/config runs. They emitted startup warnings
+   in this environment, so this result should be interpreted as "no persistent
+   startup RSS owner observed" rather than proof that they have zero cost in all
+   workflows.
+
+## Runtime Summary
+
+| Case             | Model          |  Wall | Turns | Total tokens | Tree RSS peak | Root RSS peak |  End heap |   End RSS |
+| ---------------- | -------------- | ----: | ----: | -----------: | ------------: | ------------: | --------: | --------: |
+| small PR `#4268` | `pai/glm-5`    | 20.1s |     7 |      173,216 |     362.1 MiB |     359.8 MiB | 103.1 MiB | 216.5 MiB |
+| code navigation  | `pai/glm-5`    | 18.4s |     2 |       49,127 |     378.0 MiB |     376.0 MiB | 102.4 MiB | 313.4 MiB |
+| diff 94.6 KiB    | `pai/glm-5`    | 16.6s |     6 |      135,716 |     367.9 MiB |     366.0 MiB |  99.1 MiB | 295.0 MiB |
+| diff 968.5 KiB   | `pai/glm-5`    | 11.4s |     2 |       42,590 |     373.2 MiB |     362.5 MiB | 106.4 MiB | 345.6 MiB |
+| diff 4.84 MiB    | `pai/glm-5`    | 12.0s |     4 |       95,119 |     414.2 MiB |     412.0 MiB | 123.6 MiB | 410.7 MiB |
+| small PR `#4268` | `qwen3.6-plus` | 35.0s |     6 |      156,556 |     358.9 MiB |     356.9 MiB | 102.6 MiB | 293.1 MiB |
+| code navigation  | `qwen3.6-plus` | 28.9s |     4 |       99,800 |     370.3 MiB |     368.3 MiB | 105.8 MiB | 298.2 MiB |
+| diff 94.6 KiB    | `qwen3.6-plus` | 28.3s |     4 |       90,808 |     358.8 MiB |     356.9 MiB | 105.9 MiB | 307.0 MiB |
+| diff 968.5 KiB   | `qwen3.6-plus` | 30.9s |     6 |      151,782 |     366.1 MiB |     364.1 MiB | 101.0 MiB | 316.9 MiB |
+| diff 4.84 MiB    | `qwen3.6-plus` | 24.1s |     4 |       93,271 |     372.8 MiB |     366.0 MiB | 142.8 MiB | 366.0 MiB |
+
+Average by model:
+
+| Model          | Avg tree RSS peak | Avg root RSS peak | Avg turns | Avg total tokens | Avg max wire body | Avg total tool result |
+| -------------- | ----------------: | ----------------: | --------: | ---------------: | ----------------: | --------------------: |
+| `pai/glm-5`    |         379.1 MiB |         375.3 MiB |       4.2 |           99,154 |         111.8 KiB |             335.1 KiB |
+| `qwen3.6-plus` |         365.4 MiB |         362.4 MiB |       4.8 |          118,443 |         119.3 KiB |             344.3 KiB |
+
+Overlapping small PR `#4268` model snapshot:
+
+| Model                      | Protocol  |  Wall | Turns | Total tokens | Tree RSS peak | Root RSS peak | Max wire body |
+| -------------------------- | --------- | ----: | ----: | -----------: | ------------: | ------------: | ------------: |
+| `pai/glm-5`                | OpenAI    | 20.1s |     7 |      173,216 |     362.1 MiB |     359.8 MiB |     113.8 KiB |
+| `qwen3.6-plus`             | OpenAI    | 35.0s |     6 |      156,556 |     358.9 MiB |     356.9 MiB |     134.1 KiB |
+| `DeepSeek/deepseek-v4-pro` | Anthropic | 39.7s |     2 |       43,362 |     346.9 MiB |     344.8 MiB |     103.0 KiB |
+
+## Request And Tool Diagnostics
+
+| Case             | Model          | Requests | Max wire body | Max system prompt | Max tool schema | Tool calls | Total tool result | Max tool result | Max function response in request |
+| ---------------- | -------------- | -------: | ------------: | ----------------: | --------------: | ---------: | ----------------: | --------------: | -------------------------------: |
+| small PR `#4268` | `pai/glm-5`    |        7 |     113.8 KiB |          51.4 KiB |        40.2 KiB |          9 |           4.7 KiB |         3.9 KiB |                         15.3 KiB |
+| code navigation  | `pai/glm-5`    |        2 |     114.6 KiB |          51.5 KiB |        40.2 KiB |          3 |          17.5 KiB |         6.2 KiB |                         18.4 KiB |
+| diff 94.6 KiB    | `pai/glm-5`    |        6 |     111.2 KiB |          39.1 KiB |        37.2 KiB |          9 |          94.9 KiB |        92.6 KiB |                         29.2 KiB |
+| diff 968.5 KiB   | `pai/glm-5`    |        2 |     104.8 KiB |          39.1 KiB |        37.2 KiB |          2 |         772.1 KiB |       771.9 KiB |                         25.6 KiB |
+| diff 4.84 MiB    | `pai/glm-5`    |        4 |     114.7 KiB |          39.1 KiB |        37.2 KiB |          4 |         786.3 KiB |       783.2 KiB |                         34.7 KiB |
+| small PR `#4268` | `qwen3.6-plus` |        6 |     134.1 KiB |          51.4 KiB |        40.2 KiB |          5 |          34.6 KiB |        15.6 KiB |                         36.6 KiB |
+| code navigation  | `qwen3.6-plus` |        4 |     114.9 KiB |          51.5 KiB |        40.2 KiB |          3 |          17.5 KiB |         6.2 KiB |                         18.4 KiB |
+| diff 94.6 KiB    | `qwen3.6-plus` |        4 |     112.8 KiB |          39.1 KiB |        37.2 KiB |          3 |          92.9 KiB |        92.6 KiB |                         33.0 KiB |
+| diff 968.5 KiB   | `qwen3.6-plus` |        6 |     113.1 KiB |          39.1 KiB |        37.2 KiB |          5 |         778.0 KiB |       771.9 KiB |                         32.1 KiB |
+| diff 4.84 MiB    | `qwen3.6-plus` |        4 |     121.5 KiB |          39.1 KiB |        37.2 KiB |          4 |         798.5 KiB |       783.2 KiB |                         41.3 KiB |
+
+## Observations
+
+1. Process-tree RSS is almost the same as root RSS in this local bundle run.
+   The root/tree gap is usually below 10 MiB. That means these runs did not
+   show a persistent child-process memory owner. The dominant process is the
+   main Node process.
+2. The local bundle run peaks around 0.36-0.41 GiB, not the earlier
+   0.83-1.04 GiB, because the matrix used a stripped temporary config. A
+   follow-up normal-config sanity check reproduced about 1.1 GiB tree RSS on
+   both PATH `qwen` and local `dist/cli.js`, with the extra memory coming from
+   child MCP/Node processes in the process tree.
+3. V8 heap is much smaller than RSS. End heap is about 99-143 MiB while end RSS
+   is about 216-411 MiB. The remaining footprint is likely loaded modules,
+   native allocations, external buffers, or runtime overhead outside live JS
+   heap.
+4. Static request overhead is large and repeated. The system prompt is about
+   39-51 KiB per request, and tool schema is about 37-40 KiB per request. This
+   explains why even small tasks can produce high accumulated token counts when
+   the model takes several turns.
+5. Large diff output is capped before it reaches the model request. The 968 KiB
+   and 4.84 MiB diff cases produced around 772-799 KiB of captured tool result,
+   but the largest model-facing function response in a request stayed around
+   25-41 KiB, and max wire body stayed around 105-122 KiB. This points to
+   truncation / saved-output handling working on the model-facing path.
+6. Memory still increases on large-output cases even though wire body remains
+   bounded. For example, the 4.84 MiB GLM run reached 414.2 MiB tree RSS and
+   410.7 MiB end RSS, and the 4.84 MiB qwen3.6-plus run ended with 142.8 MiB
+   heap. That suggests large tool output can still affect local capture,
+   normalization, or retained runtime state even when the final request payload
+   is capped.
+7. Model choice changed turns and token totals more than RSS in this run.
+   `qwen3.6-plus` averaged more tokens and turns than `pai/glm-5`, but its
+   average tree RSS peak was slightly lower. This supports the earlier
+   conclusion that model choice is not the main explanation for process memory.
+
+## Updated Working Inference
+
+The new diagnostics make the earlier hypothesis more precise:
+
+- The installed-CLI user-visible 1 GiB peak is now reproducible with the normal
+  config on the local diagnostics bundle. The stripped run should be used for
+  internal Qwen runtime attribution; the normal-config run should be used for
+  user-visible process-tree attribution.
+- The largest observed difference between stripped and normal config is
+  process-tree shape: normal config starts additional MCP/Node child processes.
+  Those children explain most of the absolute jump from about 0.35-0.55 GiB to
+  about 1.1 GiB in the minimal prompt sanity check.
+- The `--bare` follow-up confirms the same direction on `qwen3.6-plus`: normal
+  config costs about 0.50-0.59 GiB more process-tree RSS than bare mode for the
+  same prompt shape, while root RSS changes only slightly.
+- The temporary-settings isolation is a better attribution test than `--bare`:
+  disabling MCP alone reduces process-tree RSS by about 0.42-0.47 GiB while
+  keeping the normal settings-loading path. Disabling hooks alone does not show
+  a meaningful RSS change in no-tool-call cases.
+- Per-MCP isolation points to `chrome-devtools` as the dominant MCP contributor:
+  it is enough by itself to reproduce the high RSS band, and removing it returns
+  the run near the no-MCP baseline.
+- Within the local Qwen runtime, the most suspicious areas are no longer "raw
+  diff bytes sent to the model". The model-facing request body is bounded.
+- The stronger suspects are static per-request context cost, repeated request
+  rounds, tool schema size, and local retention/capture of large tool outputs
+  before or outside model-facing truncation.
+- Because RSS remains much higher than V8 heap, the next profiling layer should
+  include module/startup accounting, external memory, and heap snapshots around
+  tool execution and final response emission.
+
+## RSS Attribution From Current Diagnostics
+
+The current counters do not identify an exact retained object or source file,
+but they do narrow what is and is not driving RSS in these local runs:
+
+| Signal                       | Current evidence                                                                                                            | RSS implication                                                                                                                           |
+| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
+| Root RSS vs process-tree RSS | Root and tree peaks are usually within about 2-10 MiB; DeepSeek large PR is the widest gap at about 23.6 MiB                | No persistent child process explains the RSS in this local bundle run; the main Node process dominates                                    |
+| Normal config process tree   | Minimal-prompt normal-config runs reach about 1.1 GiB tree RSS while root RSS stays about 213-250 MiB                       | User-visible 1 GiB peaks can be dominated by MCP/Node child processes rather than Qwen root RSS alone                                     |
+| `--bare` comparison          | `qwen3.6-plus` normal runs peak around 1.02-1.05 GiB tree RSS; bare runs peak around 0.45-0.53 GiB                          | Loading normal config adds about 0.50-0.59 GiB process-tree RSS in this environment                                                       |
+| Temporary MCP isolation      | Clearing MCP servers drops startup/config tree RSS from 865-1,017 MiB to 443-549 MiB                                        | MCP startup and MCP child processes explain about 0.42-0.47 GiB of process-tree RSS in the controlled config check                        |
+| Per-MCP isolation            | `chrome-devtools` alone reaches about 1.0 GiB in repeated samples; without it the run stays around 461 MiB                  | `chrome-devtools` is the dominant MCP process-tree RSS contributor in this environment                                                    |
+| Temporary hooks isolation    | `disableAllHooks=true` with MCP still enabled changes tree RSS by only about 13-18 MiB in no-tool-call cases                | Hook config alone is not a visible startup RSS driver here; hook execution still needs a tool-call benchmark                              |
+| V8 heap vs RSS               | End heap is about 99-143 MiB while end RSS is about 216-411 MiB                                                             | Live JS heap is not the whole footprint; loaded modules, native allocations, external buffers, or runtime overhead are likely significant |
+| PR/diff size vs RSS          | DeepSeek small/medium/large PRs scale from 1 to 4,750 changed lines, but tree RSS stays in a narrow 340.7-360.0 MiB band    | Raw PR size is not linearly driving RSS once tool output is bounded                                                                       |
+| Tool output size             | Large diff runs capture about 772-799 KiB tool results and show some higher end RSS / heap, but RSS does not scale linearly | Tool result capture/normalization contributes pressure, especially large-output cases, but is unlikely to be the only RSS driver          |
+| Request body size            | Max model-facing body ranges from about 103-289 KiB while RSS stays near the same band                                      | Request serialization size affects tokens and latency more clearly than RSS peak                                                          |
+| Static per-request context   | System prompt is about 39-51 KiB and tool schema about 37-48 KiB per request                                                | Repeated rounds are a token/cost amplifier; this alone does not explain RSS but is a likely optimization target for token pressure        |
+
+Working attribution: in the stripped local bundle benchmark, the RSS floor looks
+mostly like task-time runtime/module/native footprint, with large tool output
+adding incremental pressure. In the normal-config run, the user-visible 1 GiB
+tree peak is mostly process-tree composition: Qwen root plus MCP/Node child
+processes. The next targeted measurement should split Qwen root diagnostics
+from configured MCP server diagnostics, then add startup/module/external-memory
+checkpoints inside the Qwen root process.
+
+## Progress Snapshot
+
+Current confirmed signals:
+
+1. The user-visible 1 GiB startup/config peak is reproducible with both the
+   installed CLI and the local diagnostics bundle when the normal config is
+   loaded. It is not primarily explained by the diagnostics branch or PR `#4186`.
+2. In this environment, that 1 GiB peak is mostly process-tree composition:
+   Qwen root process plus relaunch child process plus MCP child processes.
+3. `chrome-devtools` is the dominant configured MCP contributor in the current
+   config. It is enough by itself to reproduce the high process-tree RSS band,
+   even when the prompt does not explicitly use that MCP.
+4. The no-MCP normal relaunch shape still sits around 0.45 GiB process-tree RSS.
+   A single Qwen runtime process without the relaunch parent is closer to
+   0.22-0.24 GiB in the startup attribution check. This means the 0.45 GiB
+   baseline is not a single-process root RSS number.
+5. In stripped non-interactive task runs, model choice changes turns, token
+   totals, latency, and request sizes more clearly than RSS. RSS stayed in a
+   relatively narrow range across `pai/glm-5`, `qwen3.6-plus`, and
+   `DeepSeek/deepseek-v4-pro`.
+6. Current short-task diagnostics show model-facing tool/function responses are
+   bounded, but local tool-result capture and runtime state can still increase
+   heap/RSS on large-output cases. This keeps large-output retention on the
+   investigation path.
+
+Current gaps:
+
+1. The short-task benchmark matrix only covers short-lived runs. A later
+   interactive long-review run did reproduce a failure after 41.9 min, but it
+   is still one sample and needs repeat runs plus heap/object attribution.
+2. The current counters are enough to attribute process-tree RSS and request
+   size, but not enough to name the retained JS object graph during long
+   sessions.
+3. Startup/config RSS and long-session OOM must remain separate tracks. MCP and
+   relaunch explain a large idle/startup RSS band; they do not by themselves
+   explain V8 heap OOM after long tasks.
+4. Interactive TUI memory still needs a separate run from non-interactive mode,
+   because UI history and Ink static output are not exercised the same way.
+
+## Long-Task OOM Evidence From Issues And PRs
+
+Issue/PR evidence points to several different OOM shapes, not one single
+failure mode:
+
+| Source                                                                                                                 | Evidence summary                                                                                                                                      | Hypothesis to test                                                                                                               |
+| ---------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| [`#4309`](https://github.com/QwenLM/qwen-code/issues/4309)                                                             | User reports 5.84 GiB memory usage / 7.02 GiB warning with YOLO mode and DeepSeek backend; increasing Node memory to 8 GiB did not remove the symptom | Long autonomous tool loops can retain enough state that simply raising old-space limit is not a root fix                         |
+| [`#4149`](https://github.com/QwenLM/qwen-code/issues/4149)                                                             | Multiple reports show `Ineffective mark-compacts near heap limit`, including 4 GiB and much larger heap-limit cases                                   | A large fraction of heap is reachable application state, not immediately collectible garbage                                     |
+| [`#4116`](https://github.com/QwenLM/qwen-code/issues/4116)                                                             | OOM occurred while context display was around 9.5%; analysis points to `structuredClone`, UI history, Ink static tree, and large context windows      | Token usage can be low while JS heap pressure is high; token threshold alone is not a reliable memory guard                      |
+| [`#4167`](https://github.com/QwenLM/qwen-code/issues/4167)                                                             | User says the crash happened while compressing; analysis identifies compression peak memory as a distinct shape                                       | Compression can itself create a peak when heap is already high, especially if history is cloned/stringified around the same time |
+| [`#2128`](https://github.com/QwenLM/qwen-code/issues/2128)                                                             | Report identifies unbounded UI history, retained file diffs / terminal output, string-width caches, and checkpoint serialization                      | Interactive TUI long sessions may retain memory outside model history and outside non-interactive benchmarks                     |
+| [`#2562`](https://github.com/QwenLM/qwen-code/issues/2562)                                                             | Report focuses on `GeminiChat.getHistory()` deep-cloning full history in long sessions                                                                | Full-history cloning can amplify memory peaks and should be measured separately from retained steady-state size                  |
+| [`#4185`](https://github.com/QwenLM/qwen-code/issues/4185)                                                             | Tracks V8 heap pressure exceeding limit before token-based compaction runs                                                                            | Heap-pressure guard is necessary, but it only mitigates symptoms if retained data remains large                                  |
+| [`#4184`](https://github.com/QwenLM/qwen-code/issues/4184)                                                             | Proposes diagnostics and offload/preview for large retained tool results                                                                              | Large tool output may be bounded for model requests while still retained in local hot memory                                     |
+| [`#4186`](https://github.com/QwenLM/qwen-code/pull/4186)                                                               | Merged heap-pressure auto-compaction safety net and O(1) last-history access for `nextSpeakerChecker`                                                 | Covers part of heap-pressure and clone amplification, but does not claim to solve all OOM classes                                |
+| [`#4127`](https://github.com/QwenLM/qwen-code/pull/4127), [`#4168`](https://github.com/QwenLM/qwen-code/pull/4168)     | Open compaction-threshold PRs; one uses fixed heap thresholds, the other redesigns token thresholds and compression behavior                          | Useful related work, but long-task testing must verify whether heap, token, and compression signals line up in real runs         |
+| [`#3000`](https://github.com/QwenLM/qwen-code/issues/3000), [`#4183`](https://github.com/QwenLM/qwen-code/issues/4183) | Diagnostic roadmap calls out `/doctor memory`, heap snapshot, and bounded memory timeline                                                             | Snapshot/timeline support is needed to move from RSS attribution to retained-object attribution                                  |
+
+Initial interpretation:
+
+- Unused configured MCP can consume memory because normal startup connects to
+  configured MCP servers and advertises their tools before the task needs them.
+  In the measured config, `chrome-devtools` starts extra Node/npm MCP processes
+  and also increases the tool schema count from 19 to 48. This explains a large
+  startup/config RSS band and can also increase repeated request overhead.
+- The long-session OOM reports are a different layer. GC logs where
+  Mark-Compact frees very little memory suggest the heap is full of reachable
+  state. The strongest candidates are retained history/tool/UI objects,
+  full-history clones, compression intermediates, and streaming/logging
+  accumulators.
+- PR `#4186` is a useful mitigation because it can compact based on heap
+  pressure before token thresholds trigger, and it removes one unnecessary
+  full-history clone. It should not be treated as proof that large tool-output
+  retention, UI history retention, or compression peak memory is already solved.
+
+## Long-Task Validation Plan
+
+The next benchmark should keep two tracks separate:
+
+1. Startup/config attribution: normal config vs MCP-disabled vs
+   `chrome-devtools`-only vs no-relaunch attribution. This explains what users
+   see before meaningful work begins.
+2. Long-task runtime growth: repeated tool calls, large outputs, compression,
+   resume, and interactive UI history. This explains OOM after real work.
+
+Recommended long-task cases:
+
+| Case                          | Shape                                                                                                | Why it matters                                                                                    |
+| ----------------------------- | ---------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------- |
+| Long PR review loop           | Repeat medium/large PR review prompts for 30, 60, and 120 minutes, with fixed model and fixed config | Closest to reported agent workflows; captures turns, tool calls, token growth, and RSS/heap trend |
+| Large tool-output retention   | Repeatedly produce bounded 1 MiB / 5 MiB / 20 MiB command outputs, then ask follow-up questions      | Tests whether raw output is retained locally after model-facing truncation                        |
+| Compression pressure          | Use a lower controlled old-space limit and large-context prompts to trigger heap-pressure compaction | Verifies PR `#4186` triggers before OOM and whether compression itself creates a new peak         |
+| Interactive TUI history       | Run the same long loop in tmux TUI mode and compare with non-interactive mode                        | Isolates UI history, Ink static output, rendered diffs, and terminal-output display retention     |
+| Resume stress                 | Resume a large saved session and immediately continue work                                           | Targets `/resume` OOM reports and session reconstruction cost                                     |
+| Streaming/logging accumulator | Force long streamed responses with telemetry/logging enabled vs disabled                             | Tests the suspected `collected responses` / logging-retention path from issue analysis            |
+| MCP idle vs MCP active        | Run no-MCP, `chrome-devtools` configured-but-unused, and `chrome-devtools` actively used variants    | Separates idle MCP child RSS from actual MCP tool execution and tool schema/token overhead        |
+
+Metrics that should be recorded per turn or per sampling interval:
+
+- Root RSS current/peak and process-tree RSS current/peak.
+- Child process count and top child command shapes.
+- V8 `heapUsed`, `heapTotal`, `heap_size_limit`, `external`, and
+  `arrayBuffers`.
+- Turn count, request count, tool-call count, and tool-call rounds.
+- Input/output/cache/total tokens by request and by whole task.
+- Request body bytes, system prompt bytes, tool schema bytes, and function
+  response bytes.
+- Tool-result count, total captured tool-result bytes, max tool-result bytes,
+  and retained tool-result bytes if available.
+- Conversation history message count and approximate history byte size.
+- Interactive-only UI history item count and approximate retained display size.
+- Compression attempts, compression trigger reason, tokens before/after, heap
+  pressure before/after, and compression failure status.
+- Heap snapshot or bounded memory timeline artifacts when heap pressure crosses
+  a configured threshold.
+
+Validation criteria:
+
+1. Run each key long-task case at least twice. Startup RSS has visible
+   variance, so single-run conclusions should be avoided.
+2. Report root RSS and process-tree RSS separately. User-facing memory pressure
+   can come from child processes, while V8 OOM comes from the Qwen root heap.
+3. Treat a flat RSS line as important evidence. If tokens and tool calls grow
+   but heap/RSS stays flat, token and tool-call volume alone is unlikely to be
+   the memory driver, and the cause should be sought elsewhere.
+4. When RSS or heap grows, correlate the growth with a specific signal:
+   tool-result bytes, history bytes, UI history count, compression event,
+   streaming accumulator size, or MCP process start.
+5. If a heap snapshot is taken, write a structured diagnostics JSON first, then
+   the snapshot. Heap snapshots may be large and can contain sensitive strings,
+   so they should remain opt-in and local.
+
+## Interactive Long-Review Reproduction
+
+After the short non-interactive prompts kept finishing before the target window,
+an interactive TUI benchmark was run with remote input. The CLI process stayed
+alive in one session while a controller submitted one real PR-review turn at a
+time. The next turn was only submitted after the assistant emitted that turn's
+completion marker. This avoids treating a short one-shot prompt as a long-task
+reproduction.
+
+Setup:
+
+- Installed Qwen Code `0.15.11`, model `qwen-latest-series-invite-beta-v28`.
+- Temporary CLI home derived from the normal settings, with MCP and hook config
+  removed. No global config was modified.
+- Interactive TUI mode with dual JSON event output and remote JSONL input.
+- Static PR review only. The prompt disallowed dependency install, build, test,
+  Playwright, Docker, and other long external build commands.
+- External RSS samplers recorded both process-tree RSS and the Qwen Node root
+  RSS every 5 seconds.
+
+Outcome:
+
+| Signal                        |       Value |
+| ----------------------------- | ----------: |
+| Wall time before exit         |    41.9 min |
+| Exit status                   |           1 |
+| Completed PR-review turns     |           6 |
+| Main chat records             |       1,076 |
+| API response telemetry        |         335 |
+| Tool-call telemetry           |         607 |
+| MCP tool-call telemetry       |           0 |
+| Main/root API responses       |          36 |
+| Subagent API responses        |         299 |
+| Root total tokens             |       2.08M |
+| Subagent total tokens         |      17.24M |
+| Total API telemetry tokens    |      19.32M |
+| Max root input tokens         |      85,655 |
+| Max subagent input tokens     |     215,207 |
+| `/usr/bin/time -l` max RSS    | 1,072.4 MiB |
+| Sampled Qwen root RSS peak    | 1,028.2 MiB |
+| Sampled process-tree RSS peak | 1,038.1 MiB |
+
+The process exited with:
+
+```text
+libc++abi: terminating due to uncaught exception of type std::__1::system_error: thread constructor failed: Resource temporarily unavailable
+```
+
+This is a **thread exhaustion** error, not a V8 heap OOM. The failure mechanism
+is distinct: the OS refused to create a new thread, likely because a
+system-imposed thread/process limit was reached (for example the per-user
+`RLIMIT_NPROC`) or because memory pressure or fragmentation prevented
+allocation of the new thread's stack. It is still relevant because it occurred
+in a disabled-MCP, no-build/test, interactive long-session review where the
+Qwen Node process itself crossed about 1 GiB RSS.
+The failure happened during the final summary phase, after the controller had
+already completed six review turns.
+
+Turn timeline and sampled Qwen root RSS:
+
+| Window        | Turn state           | Qwen root RSS max | Qwen root RSS at window end |
+| ------------- | -------------------- | ----------------: | --------------------------: |
+| 0.0-9.0 min   | turn 1 completed     |         701.2 MiB |                   255.3 MiB |
+| 9.0-15.1 min  | turn 2 completed     |         503.2 MiB |                   494.4 MiB |
+| 15.1-24.1 min | turn 3 completed     |         468.7 MiB |                   457.5 MiB |
+| 24.1-31.9 min | turn 4 completed     |         619.3 MiB |                   602.3 MiB |
+| 31.9-40.3 min | turn 5 completed     |         955.5 MiB |                   955.5 MiB |
+| 40.3-40.4 min | turn 6 completed     |         988.6 MiB |                   988.6 MiB |
+| 40.4-41.9 min | final summary / exit |       1,028.2 MiB |                 1,028.2 MiB |
+
+Token and tool distribution:
+
+| Owner        | API responses | Input tokens | Output tokens | Total tokens | Max input |
+| ------------ | ------------: | -----------: | ------------: | -----------: | --------: |
+| Root session |            36 |        2.06M |         22.2K |        2.08M |    85,655 |
+| Subagents    |           299 |       17.08M |        154.6K |       17.24M |   215,207 |
+
+Tool-call telemetry by function:
+
+| Tool                | Calls | Captured content length |
+| ------------------- | ----: | ----------------------: |
+| `read_file`         |   271 |                 1.46 MB |
+| `run_shell_command` |   181 |                164.4 KB |
+| `web_fetch`         |    80 |                846.3 KB |
+| `grep_search`       |    25 |                 15.0 KB |
+| `glob`              |    15 |                 27.8 KB |
+| `todo_write`        |    16 |                 16.1 KB |
+| `list_directory`    |     8 |                  6.2 KB |
+| `agent`             |    10 |                       0 |
+| `tool_search`       |     1 |                  2.1 KB |
+
+The top visible TUI token counter for a single agent reached about 3.83M
+tokens. Telemetry also shows the heaviest subagent at about 4.05M total tokens
+with a 215K-token max input request. That makes subagent amplification the
+dominant signal in this reproduction.
+
+Interpretation:
+
+1. This run separates long-session growth from MCP startup/config memory. MCP
+   was disabled and there were no MCP tool calls, yet the Qwen root process
+   still reached about 1 GiB RSS.
+2. The late memory peak aligns with subagent-heavy review turns and final
+   summary/merge-back, not with external build/test child processes.
+3. The RSS curve is not a simple linear leak. It falls after early turns, then
+   rises sharply after later subagent turns and remains high near exit.
+4. The failure mode is native resource exhaustion rather than a V8 heap-limit
+   OOM, so the next run should add heap/external/arrayBuffers/thread-count
+   sampling. RSS alone cannot distinguish JS heap from native allocations or
+   thread-resource pressure.
+5. The strongest code paths to inspect remain subagent transcript retention,
+   agent-result merge-back, full-history cloning, checkpoint/session recording,
+   and final summary/history assembly.
+
+## Deterministic Huge-Task Clone-Pressure Reproduction
+
+A deterministic stress harness was added as
+`scripts/memory-pressure-repro.mjs`. It does not call a model. Instead, it
+constructs a Qwen-like long-session object graph with root review turns,
+subagent transcripts, large tool results, checkpoint JSON, and retained
+`structuredClone()` copies. This gives a repeatable reproduction for the clone
+and checkpoint peak suspected from the user-provided OOM stack.
+
+The harness has a lightweight script test:
+
+```bash
+npx vitest run --config ./scripts/tests/vitest.config.ts \
+  scripts/tests/memory-pressure-repro.test.js
+```
+
+Result: passed, 1 test.
+
+Controlled runs used `node --max-old-space-size=256` unless otherwise noted.
+
+| Case                                              | History shape                                                           | Clone/checkpoint pressure                          | Result                                       |   Max RSS |
+| ------------------------------------------------- | ----------------------------------------------------------------------- | -------------------------------------------------- | -------------------------------------------- | --------: |
+| Small sanity                                      | 2 turns, 2 KiB tool result, 1 subagent                                  | 1 clone + 1 checkpoint                             | passed; 2.6 MiB history JSON                 |  89.7 MiB |
+| Huge build only                                   | 12 turns, 256 KiB tool result, 2 subagents x 12 subagent turns          | no retained clone/checkpoint                       | passed; 76.2 MiB history JSON                | 491.5 MiB |
+| Huge + 1 clone                                    | same as above                                                           | 1 retained `structuredClone()`                     | passed                                       | 569.6 MiB |
+| Huge + 2 clones                                   | same as above                                                           | 2 retained `structuredClone()` copies              | OOM, exit 134                                | 496.5 MiB |
+| Huge + 1 checkpoint                               | same as above                                                           | one checkpoint with original + cloned history JSON | passed; 152.5 MiB checkpoint JSON            | 926.9 MiB |
+| Huge + 2 checkpoints                              | same as above                                                           | two checkpoint copies                              | OOM, exit 134                                | 920.1 MiB |
+| Huge + 2 clones, no retained subagent transcripts | same generated subagent output, but parent history keeps only summaries | 2 retained `structuredClone()` copies              | passed; parent history JSON drops to 3.8 MiB | 136.8 MiB |
+
+The failing huge-clone run produced:
+
+```text
+FATAL ERROR: Reached heap limit Allocation failed - JavaScript heap out of memory
+```
+
+The native stack included:
+
+- `v8::internal::ValueDeserializer::ReadObjectInternal`
+- `v8::internal::ValueDeserializer::ReadDenseJSArray`
+- `node::worker::Message::Deserialize`
+- `node::worker::StructuredClone`
+
+This matches the same stack family as the user-provided OOM log. The controlled
+reproduction also shows why 4 GiB / 8 GiB user reports are plausible: the
+failure is not caused by a single large object, but by large retained
+history/tool-result/subagent state plus one or more full-history clone or
+checkpoint copies. Raising `--max-old-space-size` can delay the crash while
+preserving the same amplification pattern.
+
+Important attribution from this deterministic run:
+
+1. Building a 76.2 MiB parent history JSON can succeed under the reduced heap.
+   The OOM appears when additional full-history clone/checkpoint copies are
+   retained.
+2. A single checkpoint copy can push RSS close to 1 GiB even before OOM.
+3. Removing retained subagent transcripts from the parent hot history changes
+   the same generated workload from OOM to a small 136.8 MiB RSS run. That is
+   the clearest mitigation signal so far.
+4. This reproducer is synthetic and intentionally adversarial, but it exercises
+   the same object-graph shape as the long interactive review: parent session,
+   subagents, large tool outputs, transcript merge-back, and full-history clone
+   pressure.
+
+## DeepSeek PR-Size Follow-Up
+
+After the initial model matrix, an additional Qwen Code-only run tested
+`DeepSeek/deepseek-v4-pro` across three real PR sizes. This model is configured
+through the Anthropic-compatible protocol; OpenAI-compatible execution returned
+404 in a smoke check, so the successful benchmark uses `--auth-type anthropic`.
+
+The diagnostics branch was extended to record Anthropic wire request summaries
+with the same privacy rule as the OpenAI path: aggregate counts and byte sizes
+only, no prompt text, diff content, tool arguments, headers, base URL, or API
+key.
+
+PR sizes:
+
+| Size   | PR      | State  | Files | Changed lines | Title                                                                   |
+| ------ | ------- | ------ | ----: | ------------: | ----------------------------------------------------------------------- |
+| small  | `#4268` | merged |     1 |             1 | fix(serve): add mcp_guardrails to E2E capabilities expectation          |
+| medium | `#4186` | merged |     6 |           494 | fix(core): add heap-pressure auto-compaction safety net                 |
+| large  | `#4168` | open   |    25 |         4,750 | feat(core)!: redesign auto-compaction thresholds with three-tier ladder |
+
+Runtime:
+
+| Size   | PR      |   Wall | Turns | Total tokens | Cache-read tokens | Tree RSS peak | Root RSS peak |  End heap |   End RSS |
+| ------ | ------- | -----: | ----: | -----------: | ----------------: | ------------: | ------------: | --------: | --------: |
+| small  | `#4268` |  39.7s |     2 |       43,362 |            28,672 |     346.9 MiB |     344.8 MiB | 115.2 MiB | 304.3 MiB |
+| medium | `#4186` | 142.6s |     4 |      135,120 |           115,840 |     340.7 MiB |     337.3 MiB | 103.5 MiB | 285.6 MiB |
+| large  | `#4168` | 191.1s |     8 |      386,891 |           332,928 |     360.0 MiB |     336.3 MiB | 119.3 MiB | 237.9 MiB |
+
+Request and tool diagnostics:
+
+| Size   | PR      | Requests | Anthropic wire requests | Max Anthropic body | Max system | Max tool schema | Tool calls | Total tool result | Max tool result | Max function response in request |
+| ------ | ------- | -------: | ----------------------: | -----------------: | ---------: | --------------: | ---------: | ----------------: | --------------: | -------------------------------: |
+| small  | `#4268` |        2 |                       2 |          103.0 KiB |   50.8 KiB |        47.6 KiB |          3 |           0.6 KiB |         0.5 KiB |                          1.1 KiB |
+| medium | `#4186` |        4 |                       4 |          159.8 KiB |   50.8 KiB |        47.6 KiB |          5 |          30.2 KiB |        29.3 KiB |                         56.7 KiB |
+| large  | `#4168` |        8 |                       8 |          289.5 KiB |   50.8 KiB |        47.6 KiB |         11 |         235.0 KiB |       232.1 KiB |                        182.4 KiB |
+
+DeepSeek observations:
+
+1. PR size scaled turns, tokens, Anthropic wire body size, and tool result size
+   clearly, but did not scale RSS proportionally. The small/medium/large tree
+   RSS peaks stayed in a narrow `340.7-360.0 MiB` band.
+2. The large PR was expensive mostly in model rounds and token volume:
+   8 requests and 386,891 total tokens. Its max Anthropic body was 289.5 KiB,
+   much larger than the OpenAI-compatible runs, but RSS still stayed near the
+   same local-bundle band.
+3. The static Anthropic request cost is also visible: system prompt is about
+   50.8 KiB and tool schema about 47.6 KiB per request. Repeated rounds are
+   therefore a major token amplifier.
+4. The large PR produced 235.0 KiB of captured tool results and 182.4 KiB max
+   function response in a request. This is higher than the earlier small PR /
+   code-navigation cases and shows large PRs still put pressure on local
+   tool-result handling and request assembly, even when RSS does not spike.
+5. The DeepSeek run reinforces the model-choice conclusion: provider/model
+   choice strongly changes turns, latency, token volume, and wire payload shape,
+   but the local bundle RSS peak remains dominated by Qwen Code runtime shape
+   rather than scaling linearly with PR size.
+
+## Long-Review JSONL Replay: History Clone Pressure
+
+A recent long PR-review chat record was analyzed as a post-mortem shape for
+the reported OOM class. The raw JSONL is not included here because it contains
+prompt and tool output text. The aggregate shape is:
+
+| Signal                  | Value                         |
+| ----------------------- | ----------------------------- |
+| Duration                | 87.0 min                      |
+| Qwen Code version       | 0.15.10                       |
+| Model                   | qwen-latest-series beta model |
+| API responses           | 380                           |
+| Tool-call telemetry     | 507 events                    |
+| MCP tool-call telemetry | 4 events                      |
+| Subagent API responses  | 313                           |
+| Root API responses      | 67                            |
+| Root prompt growth      | 38,622 -> 168,555 tokens      |
+| Max prompt tokens       | 168,555                       |
+| Total response tokens   | 31.28M                        |
+
+This shape does not support MCP as the primary OOM cause for this case. Only
+4 of 507 tool-call telemetry events were MCP, and all four recorded
+`content_length=0`. The dominant shape is long-session/subagent amplification:
+15 `agent` calls produced 313 subagent API responses and 403 subagent tool-call
+events.
+
+The replay then rebuilt the chat `Content[]` message shape from the JSONL and
+ran controlled clone/stringify pressure tests. The base retained message payload
+is small, so it is not itself enough to OOM:
+
+| Replay scale | Retained clones | History JSON | Checkpoint JSON | End heap |  End RSS |
+| ------------ | --------------: | -----------: | --------------: | -------: | -------: |
+| 1x           |               8 |      0.54 MB |         1.08 MB |  18.0 MB |  88.8 MB |
+| 30x          |               8 |     14.46 MB |        28.92 MB | 260.0 MB | 577.8 MB |
+| 60x          |               8 |     28.86 MB |        57.71 MB | 510.3 MB | 960.8 MB |
+
+The scaled replay is not a user-data claim; it is a controlled amplification of
+the observed JSONL shape to test whether full-history clone and checkpoint
+serialization can create the same failure mode as the reports.
+
+A low-heap reproduction with `--max-old-space-size=256` confirms the mechanism:
+
+| Case                      | History JSON | Result                                                |
+| ------------------------- | -----------: | ----------------------------------------------------- |
+| Build history only        |      38.4 MB | Succeeded; heap 131.6 MB, RSS 378.2 MB                |
+| Build + one clone         |      38.4 MB | Succeeded; heap 183.3 MB, RSS 463.4 MB                |
+| Build + repeated clones   |      38.4 MB | OOM after several retained `structuredClone()` copies |
+| Checkpoint double-history |      38.4 MB | OOM while holding history plus cloned client history  |
+
+The repeated-clone OOM stack contains `ValueDeserializer::ReadObjectInternal`,
+`ValueDeserializer::ReadDenseJSArray`,
+`node::worker::Message::Deserialize`, and
+`node::worker::StructuredClone`, matching the same stack family seen in the
+user-provided OOM log. This proves that full-history `structuredClone()` can be
+the immediate OOM trigger without any MCP server involvement.
+
+Current working hypothesis for this JSONL class:
+
+1. MCP can explain normal-config startup RSS in separate benchmarks, but it is
+   not the likely trigger for this long-review OOM shape.
+2. Long task growth comes from retained chat history, large tool outputs,
+   subagent histories, observable agent messages, and UI/tool-result state.
+3. The immediate OOM trigger can be a full-history clone or checkpoint-style
+   double serialization after the heap is already high.
+4. Compression can mitigate retained history, but compression itself may create
+   a temporary peak if it first clones or serializes large history.
+
+### Local Mitigation Validation: Disabled-MCP PR Review Case
+
+Three targeted mitigations were applied locally and validated before rerunning a
+disabled-MCP PR review case:
+
+1. `checkNextSpeaker()` now reads only the last curated message with
+   `getHistoryTail(1, true)` and sends only that message to the next-speaker
+   side query. The next-speaker prompt only asks about the immediately previous
+   model response, so sending full history was unnecessary clone and token
+   pressure.
+2. `AgentToolInvocation` no longer retains full `responseParts` arrays inside
+   the live `task_execution.toolCalls` display. The real response parts still
+   flow through transcript/history paths, but the parent UI display now keeps
+   only a bounded text summary for nested tool-result streaming instead of
+   holding another full copy of large subagent tool outputs during long runs.
+3. `GeminiChat.sendMessageStream()` now builds model request contents through
+   an internal curated-history view instead of calling public
+   `getHistory(true)`. Public `getHistory()` still returns a defensive
+   `structuredClone()` for external callers, but the request hot path no longer
+   deep-clones the whole retained chat history before every model call.
+
+TDD checks added for these mitigations:
+
+| Test                                                                                                           | Expected protection                                                                      |
+| -------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------- |
+| `checkNextSpeaker > should send only the last curated model message to the side query`                         | Prevents full-history clone/send in next-speaker checks                                  |
+| `AgentTool > should not retain responseParts in live tool call display after TOOL_RESULT`                      | Prevents live subagent display from retaining large tool responses                       |
+| `AgentTool > should keep only a bounded result summary in live tool call display`                              | Preserves nested result readability without retaining the full response body             |
+| `GeminiChat > sendMessageStream > does not deep-clone the full curated history when building request contents` | Prevents request setup from hitting the `ValueDeserializer` / `StructuredClone` OOM path |
+
+Additional reproduction and fix validation:
+
+| Step                                 | Command shape                                                                                                                                | Result                                                                                                                                |
+| ------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| Pre-fix deterministic clone pressure | `node --max-old-space-size=256 scripts/memory-pressure-repro.mjs ... --clone-count=2 --mode=clone`                                           | OOM, exit 134; stderr contained `Reached heap limit` and `ValueDeserializer` / `StructuredClone`; max RSS 528.1 MiB in the repeat run |
+| Red test                             | targeted `GeminiChat` test with `structuredClone` forced to throw during request setup                                                       | failed at `GeminiChat.getHistory()` before the mitigation                                                                             |
+| Green test                           | same targeted `GeminiChat` test after the mitigation                                                                                         | passed                                                                                                                                |
+| Built-code smoke                     | `node --max-old-space-size=256` against the built core package, with a 96-entry / about 48 MiB history and `structuredClone` forced to throw | passed; request had 97 contents; process RSS 161.4 MiB, `/usr/bin/time -l` max RSS 161.6 MiB                                          |
+
+This narrows the earlier "same stack family" statement: the deterministic
+synthetic OOM still proves retained full-history clones can fail in the same V8
+stack family as the user log, while the new `GeminiChat` red/green test proves
+one real production request-setup path no longer reaches that clone point.
+Checkpoint/resume and compression internals still need separate long-run
+validation because they can legitimately need durable copied history.
+
+Verification commands:
+
+| Command                                                                                                | Result                                                                                                                                      |
+| ------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `npx vitest run src/core/geminiChat.test.ts`                                                           | passed, 89 tests                                                                                                                            |
+| `npx vitest run src/utils/nextSpeakerChecker.test.ts --coverage=false`                                 | passed, 13 tests                                                                                                                            |
+| `npx vitest run src/tools/agent/agent.test.ts --coverage=false`                                        | passed, 77 tests                                                                                                                            |
+| `npx vitest run --config ./scripts/tests/vitest.config.ts scripts/tests/memory-pressure-repro.test.js` | passed, 1 test                                                                                                                              |
+| `npm run build --workspace=packages/core`                                                              | passed                                                                                                                                      |
+| `npm run build --workspace=packages/cli`                                                               | passed                                                                                                                                      |
+| `npm run typecheck --workspace=packages/core`                                                          | passed                                                                                                                                      |
+| `npm run typecheck --workspace=packages/cli`                                                           | passed                                                                                                                                      |
+| `npm run bundle`                                                                                       | passed                                                                                                                                      |
+| `npm run build`                                                                                        | failed in `packages/vscode-ide-companion` lint on existing internal-module import rules; core, CLI, bundle, and targeted tests above passed |
+
+The full root `npm run build` was not clean in this worktree because the
+`vscode-ide-companion` package hit pre-existing `import/no-internal-modules`
+lint errors. The core/CLI build and bundle needed for the local runtime test
+completed successfully.
+
+The same PR review prompt was then run with a temporary config where MCP and
+hooks were disabled. Both rows were interrupted after a bounded long-run window
+instead of waiting for a full review to finish. **Caveat**: the two runs are
+confounded by workload size (79K vs 390K tokens) and cannot be compared as a
+controlled experiment. The comparison only shows directional evidence.
+
+| Variant           | Runtime | MCP servers | Tools | Assistant messages | Tool use/result blocks | Parent tool ids | Total tokens | Max input tokens | Root max RSS |
+| ----------------- | ------: | ----------: | ----: | -----------------: | ---------------------: | --------------: | -----------: | ---------------: | -----------: |
+| before mitigation | 365.08s |           0 |    19 |                 42 |                42 / 42 |               3 |       79,439 |           26,807 |    357.7 MiB |
+| after mitigation  | 404.52s |           0 |    19 |                 58 |                52 / 42 |               2 |      390,339 |           54,000 |    310.5 MiB |
+
+This is not a deterministic apples-to-apples model benchmark: the patched run
+did more work and consumed substantially more total tokens before the manual
+cutoff. The useful signal is narrower: under a disabled-MCP review case with
+more observed work, root max RSS did not increase and was about 47.2 MiB lower.
+That supports the mitigation direction, but it does not prove the whole
+long-task OOM class is fixed.
+
+Remaining high-risk clone/retention paths to inspect next:
+
+1. Compression still calls full `getHistory(true)` before summarization. If the
+   heap is already high, the compression attempt can create the peak that trips
+   OOM.
+2. Checkpoint creation can hold original history, cloned client history, and a
+   serialized checkpoint payload at the same time.
+3. Fork subagents still seed from parent history with `getHistory(true)`.
+4. ACP/history export/summary/copy paths still call full `getHistory()` and
+   should be audited separately from the normal review loop.
+
+Version timing:
+
+| Issue | Created    | Reported version         | Signal                                   |
+| ----- | ---------- | ------------------------ | ---------------------------------------- |
+| #2128 | 2026-03-05 | not specified            | Long-session UI memory growth            |
+| #2562 | 2026-03-21 | not specified            | `structuredClone` OOM in long sessions   |
+| #2868 | 2026-04-03 | 0.13.2                   | Heap OOM                                 |
+| #2945 | 2026-04-07 | 0.14.0                   | V8 heap OOM                              |
+| #4116 | 2026-05-13 | 0.15.11                  | OOM with structured-clone-style analysis |
+| #4134 | 2026-05-14 | 0.15.11                  | OOM                                      |
+| #4149 | 2026-05-14 | 0.15.10-nightly.20260513 | V8 heap OOM                              |
+| #4167 | 2026-05-15 | 0.15.11                  | Crash near compression                   |
+| #4185 | 2026-05-15 | 0.15.11                  | Heap pressure before token compaction    |
+| #4254 | 2026-05-17 | not specified            | Memory keeps rising                      |
+| #4276 | 2026-05-18 | 0.15.11                  | V8 heap OOM                              |
+| #4309 | 2026-05-19 | 0.15.11                  | High memory warning around 7 GiB         |
+
+The issue history does not prove that 0.15.10 introduced the OOM class; similar
+reports existed in March and April. It does support a recent cluster beginning
+around 2026-05-13, overlapping `v0.15.10`/`v0.15.11` releases. The relevant
+diff between `v0.15.9` and `v0.15.10` touched subagent runtime,
+non-interactive execution, `GeminiChat`, and compression code heavily, so this
+range is a reasonable first bisect window.
+
+## Notes
+
+- The first code-navigation prompt allowed open-ended exploration and hit
+  `maxSessionTurns`; the successful rows above use a constrained command list.
+- The first synthetic-diff attempt used a relative bundle path from inside the
+  temporary repositories; those failed immediately and are excluded from the
+  tables. The successful rows use the absolute local bundle path.
+- Raw JSONL streams are not committed because they contain prompts, tool
+  commands, and tool output. The report only includes aggregate diagnostics.
diff --git a/docs/e2e-tests/2026-05-21-qwen-0.15.11-default-heap-oom-stress-report.md b/docs/e2e-tests/2026-05-21-qwen-0.15.11-default-heap-oom-stress-report.md
new file mode 100644
index 0000000000..e9579dee1b
--- /dev/null
+++ b/docs/e2e-tests/2026-05-21-qwen-0.15.11-default-heap-oom-stress-report.md
@@ -0,0 +1,338 @@
+# Qwen Code 0.15.11 默认 Heap OOM 压测报告
+
+日期：2026-05-21
+
+## 测试范围
+
+本报告记录了针对 Qwen Code `0.15.11` 最新本地构建的一轮默认 heap 压测。
+这轮测试的目标是验证：在不人为降低内存上限的情况下，当前代码是否还能复现
+issue 中提到的长会话 OOM，以及在更极端的大输出场景下还有没有新的风险。
+
+本轮覆盖三个模型：
+
+- `pai/glm-5`
+- `qwen3.6-plus`
+- `DeepSeek/deepseek-v4-pro`
+
+测试分为两部分：
+
+1. 真实长任务、多 agent 并发 review 循环。
+2. amplified foreground stdout 压测，即用大规模前台 shell stdout 放大
+   tool-output 路径压力。
+
+## 测试环境
+
+| 项目                        | 值                                            |
+| --------------------------- | --------------------------------------------- |
+| 分支                        | `codex/memory-investigation-draft-pr`         |
+| Commit                      | `c161e0aa4`                                   |
+| CLI                         | 本地 `dist/cli.js`                            |
+| CLI 版本                    | `0.15.11`                                     |
+| Node 默认 heap limit        | `4144 MiB`                                    |
+| `NODE_OPTIONS`              | 未设置                                        |
+| 显式 `--max-old-space-size` | 未设置                                        |
+| runner `ulimit`             | runner 未设置                                 |
+| 配置模式                    | 临时复制 `~/.qwen`，并隔离 `QWEN_RUNTIME_DIR` |
+| MCP / 正常配置              | 尽量按复制后的正常配置加载                    |
+
+注意：这里的 CLI 版本显示为 `0.15.11`，是因为 package version 尚未 bump。
+实际测试对象是 commit `c161e0aa4` 下本地编译出的 `dist/cli.js`，不是 PATH
+里的全局 `qwen` 可执行文件。
+
+本轮没有修改全局 Qwen 配置。原始 runtime artifacts 在：
+
+- `.qwen/runtime-bench/2026-05-20T13-51-58-731Z-oom-stress`
+- `.qwen/runtime-bench/2026-05-20T15-20-37-790Z-oom-amplified`
+
+注意：本轮里 `env-center` MCP server 启动失败，但其他内置工具和部分
+MCP/child process 仍然加载。因此这些结果代表当前本地环境，不是完全 stripped
+的 `--bare` 环境。
+
+## 核心结论
+
+最新本地构建在 issue 最关心的“长会话 V8 heap OOM”路径上表现明显更好。
+基于这轮默认 heap、多模型、多 agent、长任务压测，可以认为本 PR 对此前遇到的
+long-session heap OOM 问题已经基本解决，至少在当前复现维度下已经不能再复现
+原始 heap OOM。
+
+真实长任务、多 agent 并发测试一共执行了：
+
+- 23 个 worker turn
+- 约 `719,094,118` reported total tokens
+- 77 次 agent tool call
+- 856 次总 tool call
+
+这部分没有复现任何传统 V8 heap OOM 特征：
+
+- `JavaScript heap out of memory`
+- `Reached heap limit`
+- `Ineffective mark-compacts near heap limit`
+- `Allocation failed`
+
+真实长任务阶段最高 process-tree RSS 为 `874.7 MiB`，最高 root-process RSS 为
+`219.1 MiB`。这说明在默认 heap 下，当前代码没有轻易复现原 issue 中那种长任务
+跑挂的 heap OOM。
+
+第二阶段 amplified stdout 压测更激进。它一共执行了 18 个 payload attempt，
+覆盖三个模型和 `128 MiB` 到 `2048 MiB` 的 foreground stdout payload。
+
+结果是：
+
+- 三个模型都成功跑过 `1536 MiB` payload。
+- 最高成功 process-tree RSS 是 `5964.7 MiB`，出现在 `qwen3.6-plus`
+  的 `1536 MiB` payload。
+- 到 `2048 MiB` payload 时，出现了一个新的 extreme large-output failure。
+
+`2048 MiB` 的结果：
+
+- `pai/glm-5`：`exit=1`，stdout 为空，没有标准 OOM 文本。
+- `qwen3.6-plus`：`exit=1`，stdout 为空，没有标准 OOM 文本。
+- `DeepSeek/deepseek-v4-pro`：出现 V8 fatal：
+  `Check failed: i::kMaxInt >= len`，栈在
+  `v8::String::NewFromOneByte` / `node::StringBytes::Encode` /
+  `DecodeUTF8`。
+
+这个新问题不是原 issue 中的传统 long-session heap OOM。它更像是
+multi-GiB foreground stdout 被解码/构造成 JS string 时触发的 V8 字符串长度
+限制或大输出处理问题。建议作为 large-output follow-up 跟踪，而不是把它当作
+当前长会话 heap-pressure 修复失败。
+
+## Phase 1：真实长任务、多 Agent 并发压测
+
+### 测试形态
+
+每个模型 worker 都复用同一个 session，不断 `--resume`。每一轮要求 Qwen Code：
+
+- 进行只读代码审查和代码搜索；
+- 在同一轮中并发启动至少 4 个 `agent` tool call；
+- 重点检查 chat history、compaction、subagent runtime、non-interactive
+  streaming、provider adapters 等 memory 相关区域；
+- 保留足够详细的最终回答，让 session history 自然增长。
+
+runner 每秒采样 process-tree RSS，没有设置任何额外 heap cap。
+
+这部分在观察到内存比较稳定后用 `SIGTERM` 主动停止，以便切换到第二阶段的
+amplified stdout 压测。因此表里的 `SIGTERM` 不是 OOM。
+
+### 汇总结果
+
+| Model                      | Worker turns |    Total tokens | Agent calls | Tool calls | Peak tree RSS | Peak root RSS | Last exit | OOM    |
+| -------------------------- | -----------: | --------------: | ----------: | ---------: | ------------: | ------------: | --------- | ------ |
+| `pai/glm-5`                |            9 |     444,614,704 |          36 |        362 |     874.7 MiB |     217.4 MiB | `SIGTERM` | no     |
+| `qwen3.6-plus`             |            7 |     101,425,927 |          17 |        346 |     862.7 MiB |     219.1 MiB | `SIGTERM` | no     |
+| `DeepSeek/deepseek-v4-pro` |            7 |     173,053,487 |          24 |        148 |     864.5 MiB |     213.8 MiB | `SIGTERM` | no     |
+| **Total / max**            |       **23** | **719,094,118** |      **77** |    **856** | **874.7 MiB** | **219.1 MiB** | -         | **no** |
+
+### 分轮结果
+
+| Model                      | Turn | Exit      | Timed out | OOM | Peak tree RSS | Peak root RSS | Total tokens | Agent calls | Tool calls |
+| -------------------------- | ---: | --------- | --------- | --- | ------------: | ------------: | -----------: | ----------: | ---------: |
+| `DeepSeek/deepseek-v4-pro` |    1 | `0`       | no        | no  |     709.1 MiB |     167.3 MiB |    5,565,147 |           4 |         37 |
+| `DeepSeek/deepseek-v4-pro` |    2 | `0`       | no        | no  |     674.5 MiB |     118.8 MiB |   13,989,721 |           4 |         29 |
+| `DeepSeek/deepseek-v4-pro` |    3 | `0`       | no        | no  |     734.1 MiB |     148.0 MiB |   22,621,542 |           4 |         24 |
+| `DeepSeek/deepseek-v4-pro` |    4 | `0`       | no        | no  |     771.1 MiB |     107.5 MiB |   33,470,249 |           4 |         22 |
+| `DeepSeek/deepseek-v4-pro` |    5 | `0`       | no        | no  |     864.5 MiB |     212.9 MiB |   43,540,313 |           4 |         19 |
+| `DeepSeek/deepseek-v4-pro` |    6 | `0`       | no        | no  |     807.6 MiB |     167.9 MiB |   53,866,515 |           4 |         17 |
+| `DeepSeek/deepseek-v4-pro` |    7 | `SIGTERM` | no        | no  |     785.1 MiB |     213.8 MiB |          n/a |         n/a |        n/a |
+| `pai/glm-5`                |    1 | `SIGTERM` | yes       | no  |     742.8 MiB |     170.5 MiB |   17,071,519 |           4 |        142 |
+| `pai/glm-5`                |    2 | `0`       | no        | no  |     874.7 MiB |     217.4 MiB |   27,438,727 |           4 |         60 |
+| `pai/glm-5`                |    3 | `0`       | no        | no  |     699.7 MiB |     102.1 MiB |   35,627,222 |           4 |         38 |
+| `pai/glm-5`                |    4 | `0`       | no        | no  |     796.0 MiB |     194.0 MiB |   44,130,101 |           4 |         23 |
+| `pai/glm-5`                |    5 | `0`       | no        | no  |     743.4 MiB |     152.1 MiB |   50,465,979 |           4 |         26 |
+| `pai/glm-5`                |    6 | `0`       | no        | no  |     714.9 MiB |     125.2 MiB |   56,357,372 |           4 |         18 |
+| `pai/glm-5`                |    7 | `0`       | no        | no  |     694.5 MiB |      96.6 MiB |   64,047,037 |           4 |         20 |
+| `pai/glm-5`                |    8 | `0`       | no        | no  |     756.0 MiB |     136.8 MiB |   71,891,505 |           4 |         15 |
+| `pai/glm-5`                |    9 | `SIGTERM` | no        | no  |     755.7 MiB |     157.3 MiB |   77,585,242 |           4 |         20 |
+| `qwen3.6-plus`             |    1 | `0`       | no        | no  |     735.1 MiB |     153.1 MiB |    3,890,508 |           4 |         83 |
+| `qwen3.6-plus`             |    2 | `0`       | no        | no  |     702.4 MiB |     142.5 MiB |    4,300,186 |           1 |          9 |
+| `qwen3.6-plus`             |    3 | `0`       | no        | no  |     862.7 MiB |     219.1 MiB |    8,635,953 |           4 |         88 |
+| `qwen3.6-plus`             |    4 | `SIGTERM` | yes       | no  |     685.8 MiB |     106.5 MiB |          n/a |         n/a |        n/a |
+| `qwen3.6-plus`             |    5 | `0`       | no        | no  |     610.5 MiB |      93.1 MiB |   40,191,337 |           4 |         87 |
+| `qwen3.6-plus`             |    6 | `0`       | no        | no  |     723.6 MiB |     121.9 MiB |   44,407,943 |           4 |         79 |
+| `qwen3.6-plus`             |    7 | `SIGTERM` | no        | no  |     810.4 MiB |     116.0 MiB |          n/a |         n/a |        n/a |
+
+### Phase 1 解读
+
+这是本轮里最能说明原始 long-session OOM 已明显改善的数据。
+
+这组测试比 5 月 18 日的小 PR review / code navigation 更重：它包含更多
+`--resume`、更多 subagent activity、更大的 reported token 量和更多 tool call。
+但 process-tree RSS 始终低于 `0.9 GiB`，也没有出现传统 V8 heap OOM。
+
+这不能证明所有用户 OOM 都不可能再发生，但至少说明当前构建在默认 heap 下，
+已经无法轻易复现 issue 中那类长会话 heap-pressure OOM。
+
+## Phase 2：Amplified Foreground Stdout 压测
+
+### 测试形态
+
+第二阶段故意放大 shell-output 路径压力。每个模型、每个 payload size 都要求
+parent session 和并发 agents 运行前台 shell 命令，输出大量 `x` 到 stdout：
+
+```bash
+node -e "const chunk='x'.repeat(1024*1024); for (let i=0; i<N; i++) process.stdout.write(chunk)"
+```
+
+Payload size：
+
+- `128 MiB`
+- `256 MiB`
+- `512 MiB`
+- `1024 MiB`
+- `1536 MiB`
+- `2048 MiB`
+
+这是 synthetic stress，不是正常代码审查工作负载。它的作用是验证极大前台
+stdout 是否还会把 Qwen Code 推入危险内存路径。
+
+### Payload 结果
+
+| Model                      |  Payload | Exit      | Timed out | OOM detector | Peak tree RSS | Peak root RSS | Largest process RSS | Agent calls | Total tokens | Fatal text |
+| -------------------------- | -------: | --------- | --------- | ------------ | ------------: | ------------: | ------------------: | ----------: | -----------: | ---------- |
+| `DeepSeek/deepseek-v4-pro` |  128 MiB | `0`       | no        | no           |    1503.9 MiB |      20.4 MiB |          1379.7 MiB |          10 |    2,142,198 | no         |
+| `DeepSeek/deepseek-v4-pro` |  256 MiB | `0`       | no        | no           |    2635.3 MiB |      41.1 MiB |          2467.0 MiB |           9 |    4,430,876 | no         |
+| `DeepSeek/deepseek-v4-pro` |  512 MiB | `0`       | no        | no           |    4103.5 MiB |      39.0 MiB |          3941.0 MiB |           7 |    6,862,342 | no         |
+| `DeepSeek/deepseek-v4-pro` | 1024 MiB | `0`       | no        | no           |    5638.8 MiB |      19.2 MiB |          5541.0 MiB |          15 |   12,771,536 | no         |
+| `DeepSeek/deepseek-v4-pro` | 1536 MiB | `0`       | no        | no           |    4281.6 MiB |     109.3 MiB |          3936.0 MiB |           5 |   14,471,839 | no         |
+| `DeepSeek/deepseek-v4-pro` | 2048 MiB | `SIGTERM` | no        | no           |    4660.4 MiB |      41.1 MiB |          4527.6 MiB |         n/a |            0 | yes        |
+| `pai/glm-5`                |  128 MiB | `0`       | no        | no           |    1160.1 MiB |      41.7 MiB |          1026.6 MiB |           4 |      443,778 | no         |
+| `pai/glm-5`                |  256 MiB | `0`       | no        | no           |    1709.4 MiB |      24.9 MiB |          1573.2 MiB |           4 |      856,902 | no         |
+| `pai/glm-5`                |  512 MiB | `0`       | no        | no           |    2528.1 MiB |      38.8 MiB |          2351.7 MiB |           4 |    1,285,019 | no         |
+| `pai/glm-5`                | 1024 MiB | `0`       | no        | no           |    4477.6 MiB |      41.1 MiB |          4343.1 MiB |           4 |    1,727,941 | no         |
+| `pai/glm-5`                | 1536 MiB | `0`       | no        | no           |    5941.4 MiB |      41.1 MiB |          5808.5 MiB |           4 |    2,185,419 | no         |
+| `pai/glm-5`                | 2048 MiB | `1`       | no        | no           |    4634.6 MiB |      49.6 MiB |          4493.1 MiB |         n/a |            0 | no         |
+| `qwen3.6-plus`             |  128 MiB | `0`       | no        | no           |     977.0 MiB |      93.0 MiB |           638.4 MiB |           4 |      796,217 | no         |
+| `qwen3.6-plus`             |  256 MiB | `0`       | no        | no           |    1508.2 MiB |      25.0 MiB |          1425.3 MiB |           4 |    1,601,828 | no         |
+| `qwen3.6-plus`             |  512 MiB | `0`       | no        | no           |    2338.2 MiB |      32.7 MiB |          2232.3 MiB |           4 |    2,571,448 | no         |
+| `qwen3.6-plus`             | 1024 MiB | `0`       | no        | no           |    4183.8 MiB |      40.8 MiB |          4001.0 MiB |           4 |    3,555,603 | no         |
+| `qwen3.6-plus`             | 1536 MiB | `0`       | no        | no           |    5964.7 MiB |      41.1 MiB |          5831.7 MiB |           4 |    4,366,032 | no         |
+| `qwen3.6-plus`             | 2048 MiB | `1`       | no        | no           |    4134.8 MiB |      41.1 MiB |          4001.4 MiB |         n/a |            0 | no         |
+
+### 为什么 RSS 会超过 Node Heap Limit
+
+本机 Node heap limit 是约 `4144 MiB`，但 process-tree RSS 最高达到约
+`5964.7 MiB`。这不矛盾：
+
+- process-tree RSS 是整棵进程树（含 child processes）的合计，覆盖各进程的 V8 heap、
+  external buffers、native allocations、loaded modules，以及其他不计入 old-space heap 的内存。
+- amplified 阶段的最高峰通常出现在 child Node process，而不是 root wrapper。
+- 所以 root RSS 可以维持较低，同时 process-tree RSS 很高。
+
+## 与历史报告的对比
+
+5 月 18 日 maintainer benchmark 里，普通任务下 Qwen process-tree RSS 峰值大多在
+`942.5 MiB` 到 `1006.6 MiB`。这些数据主要说明：当时 Qwen Code 的任务期
+footprint 明显高于 Claude Code。
+
+5 月 19 日 runtime diagnostics report 进一步拆出了两条线：
+
+- startup/config RSS：常由正常配置和 MCP child process 推高；
+- long-session heap OOM：更偏向 history / compaction / clone pressure。
+
+本轮 5 月 21 日数据进一步支持这个拆分：
+
+- 真实长任务、多 agent、长 session 场景稳定在 `0.9 GiB` 以下，没有触发 heap
+  OOM；
+- amplified stdout 可以把 process-tree RSS 推到约 `5.82 GiB`（`5964.7 MiB`），
+  并在 `2048 MiB` payload 下触发另一类 V8 string-length fatal。
+
+因此，就原 issue 里的传统 long-session heap OOM 而言，当前构建明显更稳。但这不
+等于 Qwen Code 已经没有任何 large-output memory 风险。
+
+## 新发现：极端 Foreground Stdout Fatal Path
+
+本轮新发现的问题不是原来的 heap OOM，而是 multi-GiB foreground stdout 下的
+string/decoding fatal：
+
+```text
+Fatal error in , line 0
+Check failed: i::kMaxInt >= len.
+...
+v8::String::NewFromOneByte
+node::StringBytes::Encode
+node::encoding_binding::BindingData::DecodeUTF8
+```
+
+触发条件：
+
+- Model：`DeepSeek/deepseek-v4-pro`
+- Payload：`2048 MiB`
+- Peak tree RSS：`4660.4 MiB`
+- Largest process RSS：`4527.6 MiB`
+- runner 记录 exit：`SIGTERM`，因为捕获到 fatal 输出之后，剩余子进程仍在高 CPU
+  空转，最终被手动终止。
+
+`pai/glm-5` 和 `qwen3.6-plus` 在 `2048 MiB` 也失败，表现为 stdout 为空、
+exit code `1`，但 stderr 没有捕获到 V8 fatal stack。
+
+### 严重程度
+
+这是一个真实的 robustness 问题，但触发条件是 multi-GiB foreground stdout，
+不是正常代码审查任务。它也不能证明当前 long-session heap-pressure 修复失败。
+
+### 是否是本 PR 引入？
+
+本轮没有证据表明 `2048 MiB` stdout failure 是当前 memory PR 引入的回归。
+
+原因：
+
+- 失败路径是 foreground shell stdout decode / string construction。
+- 原 issue 路径是 long-session history、compaction、clone pressure。
+- 本轮没有做同 payload 的 pre-PR baseline，因此不能归因成 regression。
+- 该 failure 只在刻意极端的 `2048 MiB` payload 出现；`128 MiB` 到
+  `1536 MiB` 都能完成。
+
+建议把它作为 dedicated large-output follow-up：更早 stream / spool / hard-cap
+foreground shell output，避免在内存里构造 multi-GiB JS string。除非当前 PR 的目标
+明确包含“任意 multi-GiB 前台 stdout 都必须可处理”，否则不建议把它作为当前 PR 的
+blocker。
+
+## 结论
+
+1. 最新本地 `0.15.11` 构建在 issue 报告的 long-session heap OOM 方向上明显更好。
+   基于当前默认 heap 压测结果，可以认为本 PR 已经基本解决此前遇到的
+   long-session heap OOM 复现路径。
+
+2. 在默认 Node heap 下，真实长任务 + 多 agent review loop 没有在
+   `pai/glm-5`、`qwen3.6-plus`、`DeepSeek/deepseek-v4-pro` 三个模型上复现传统
+   V8 heap OOM。
+
+3. synthetic foreground stdout 压测仍能把 process-tree RSS 推得很高。当前构建在
+   三模型上都撑过了 `1536 MiB` payload，最高成功 tree RSS 是 `5964.7 MiB`。
+
+4. 仍然存在一个独立的极端 large-output 问题：`2048 MiB` stdout 附近，Qwen Code
+   可能在输出 JSON 结果前失败；DeepSeek case 捕获到了 V8 string-length fatal。
+
+5. 这个新发现重要，但更像是后续 large-output robustness 问题，不应直接作为
+   long-session heap-pressure mitigation 的 blocker。
+
+## 建议发到 PR 的评论摘要
+
+建议 PR 评论里只放精简摘要，完整数据放本文档：
+
+```markdown
+I reran default-heap stress tests on the latest local build with
+`pai/glm-5`, `qwen3.6-plus`, and `DeepSeek/deepseek-v4-pro`.
+
+No `NODE_OPTIONS`, `--max-old-space-size`, or runner `ulimit` was used. The
+local Node heap limit was about 4144 MiB.
+
+Results:
+
+- Realistic long-session + multi-agent review loop: 23 worker turns,
+  ~719M reported total tokens, 77 agent calls, 856 total tool calls.
+  No traditional V8 heap OOM was reproduced. Peak process-tree RSS was
+  874.7 MiB; peak root RSS was 219.1 MiB.
+- Amplified stdout stress: 18 payload attempts across 128 MiB -> 2048 MiB.
+  All three models completed through 1536 MiB payloads without traditional
+  heap OOM. Highest successful process-tree RSS was 5964.7 MiB.
+- At 2048 MiB foreground stdout, an extreme large-output failure remains.
+  DeepSeek captured a V8 fatal `Check failed: i::kMaxInt >= len` stack in
+  `String::NewFromOneByte` / `StringBytes::Encode` / `DecodeUTF8`.
+
+Conclusion: this PR appears to have effectively addressed the previously
+observed long-session heap OOM reproduction path under default heap. The
+2048 MiB stdout failure is a separate large-output/string-limit robustness issue
+and should be tracked as a follow-up rather than treated as the same
+long-session heap OOM regression.
+```
diff --git a/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md b/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md
new file mode 100644
index 0000000000..393e3dc8dc
--- /dev/null
+++ b/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md
@@ -0,0 +1,240 @@
+# Qwen Code Runtime Memory Investigation Plan
+
+Date: 2026-05-18
+
+## Context
+
+Local benchmarks show Qwen Code using substantially more process-tree RSS than
+Claude Code for similar non-interactive CLI task shapes. The latest five-case
+matrix found Qwen Code peaking around `0.83-1.04 GiB` while Claude Code stayed
+around `0.27-0.36 GiB`.
+
+This document proposes a draft investigation and optimization direction. It is
+not intended to claim a final root cause yet. The immediate goal is to make the
+memory gap reviewable, reproducible, and explainable with internal diagnostics.
+
+## Progress So Far
+
+The investigation has reached the evidence-and-direction stage:
+
+- A repeatable local matrix has been built for small PR review, code navigation,
+  and synthetic diff workloads.
+- Qwen Code has been compared across multiple models.
+- Qwen Code and Claude Code have been compared on the same task shapes where
+  equivalent model endpoints were available.
+- The observed RSS gap is consistent enough to justify deeper runtime
+  diagnostics.
+- Related upstream work has been mapped so this effort can build on existing
+  `/doctor memory` and memory-diagnostics follow-ups.
+
+The investigation has not yet reached the final root-cause stage because
+external process RSS cannot show whether the retained memory is V8 heap, native
+memory, loaded modules, live history, tool results, or request assembly state.
+
+## Current Evidence
+
+The companion benchmark report is:
+
+- `docs/e2e-tests/2026-05-18-qwen-memory-benchmark-report.md`
+
+The main evidence is:
+
+- The Qwen-vs-Claude RSS gap reproduced across small PR review, code
+  navigation, and synthetic diff workloads.
+- The gap reproduced with both `pai/glm-5` and `qwen3.6-plus`.
+- Qwen Code used more tokens than Claude Code in every tested matrix cell.
+- Large diff size did not produce a clean linear memory increase, which suggests
+  the baseline and bounded/truncated output paths matter more than raw diff
+  bytes alone.
+
+## Related Work
+
+Relevant upstream work already exists:
+
+| Item    | Status                | Role in the memory work                                                                                         |
+| ------- | --------------------- | --------------------------------------------------------------------------------------------------------------- |
+| `#4180` | merged PR             | Adds baseline `/doctor memory` diagnostics. This is the first instrumentation slice.                            |
+| `#4181` | open issue, no PR yet | Adds interpretation and pressure classification for `/doctor memory`.                                           |
+| `#4182` | open issue, no PR yet | Adds structured `/doctor memory --json` output and safe session-scale stats.                                    |
+| `#4183` | open issue, no PR yet | Adds opt-in heap snapshots and bounded memory timeline diagnostics.                                             |
+| `#4184` | open issue, no PR yet | Adds large tool-result retention diagnostics and designs offload/preview mitigation.                            |
+| `#4127` | open PR, conflicting  | Adds heap-pressure safety nets for long-session OOM prevention. Useful mitigation, not enough for attribution.  |
+| `#4168` | open PR               | Redesigns auto-compaction thresholds. Useful for context pressure, not enough for task-time footprint analysis. |
+| `#4172` | open PR               | Decouples auto-memory recall from the main request path. Useful for latency/blocking, not direct RSS proof.     |
+| `#4188` | merged PR             | Bounds build/test caches to prevent OOM in parallel test runs. Important but separate from runtime benchmarks.  |
+
+This investigation should build on that direction rather than wait for all
+follow-up issues to land.
+
+Most of the remaining work is instrumentation-first. The open diagnostics
+issues are designed to make memory reports explainable before attempting a
+runtime fix. The open mitigation PRs may reduce specific OOM paths, but they do
+not yet explain why short non-interactive CLI tasks repeatedly peak near
+`1 GiB`.
+
+## Why This Draft Starts With Documentation
+
+This draft intentionally starts with benchmark evidence and an investigation
+plan instead of bundling a runtime code change.
+
+Reasons:
+
+1. The current goal is to make the performance problem and direction visible,
+   not to claim a same-day fix.
+2. Adding instrumentation and optimization in the same PR would make review
+   harder because it mixes measurement, diagnosis, and behavior changes.
+3. The existing benchmark already supports the need for deeper diagnostics.
+4. The next PR can be narrower and easier to validate: diagnostics-only, then
+   rerun the same matrix and compare internal metrics.
+
+The next implementation PR should add the missing counters and timeline points,
+then rerun the benchmark matrix. Only after that should a targeted optimization
+PR attempt to reduce memory.
+
+## Working Inference
+
+The current data points toward a Qwen Code runtime/path issue more than a model
+provider issue.
+
+The strongest current inference is:
+
+> Qwen Code appears to carry a high non-interactive CLI task execution
+> footprint, likely amplified by larger context/tool-result/session handling.
+> The likely problem area is the CLI runtime and agent data path, not the
+> selected model alone.
+
+More specifically, the evidence points away from "too many tool calls" as the
+primary cause. Tool-call counts were similar across CLIs, and Claude sometimes
+used more turns or tool calls while keeping lower RSS. The more plausible
+problem is that Qwen Code initializes or retains heavier state for the same
+short non-interactive CLI task, then amplifies that execution footprint with
+larger context, tool-result, saved-output, or session-history data.
+
+The most likely buckets are:
+
+1. **Process and module startup/execution cost**: Qwen Code may initialize more
+   runtime, tools, UI/session infrastructure, or provider machinery than needed
+   for non-interactive CLI tasks.
+2. **History and context assembly**: Qwen Code may retain or construct larger
+   model-facing context than Claude Code for the same task shape.
+3. **Tool-result retention**: large or repeated tool results may be retained in
+   live history, UI history, chat recording, or saved-output recovery paths.
+4. **Subagent and saved-output amplification**: previous large PR tests showed
+   saved-output recovery and subagent activity, which can add memory and token
+   pressure.
+5. **MCP child processes**: the companion diagnostics report revealed that MCP
+   servers (e.g. chrome-devtools) contribute ~350 MiB to process-tree RSS. This
+   inflates the absolute numbers but is a constant overhead unrelated to session
+   length.
+6. **Native memory versus JS heap split**: external RSS cannot tell whether the
+   pressure is V8 heap, native buffers, loaded modules, or retained data.
+
+This is deliberately phrased as an inference. The next step is to add enough
+internal measurements to confirm or rule out each bucket.
+
+## Proposed Draft PR Scope
+
+The first draft PR should be evidence and diagnostics focused:
+
+1. Commit the benchmark report and investigation plan.
+2. Add or extend local diagnostic output so Qwen Code can report:
+   - V8 heap and heap-space statistics.
+   - RSS versus heap split.
+   - session message count and approximate retained size.
+   - tool result count, total retained size, and largest retained result size.
+   - truncation and saved-output recovery counters.
+   - subagent/process-tree activity when available.
+3. Re-run the existing matrix against:
+   - current published Qwen Code,
+   - current `main`,
+   - diagnostics-only branch,
+   - candidate optimization branch.
+4. Use those measurements to choose one small optimization target.
+
+The first PR should avoid mixing several unrelated optimizations. It should
+either remain documentation-only or add diagnostics-only code. A separate PR
+should carry the first runtime memory reduction once the cause is clearer.
+
+## Candidate Optimization Directions
+
+These are candidates, not conclusions:
+
+1. **Bounded tool-output retention**: store large output out of the hot path and
+   keep only preview, metadata, and retrieval pointers in live history.
+2. **Non-interactive lazy loading**: avoid initializing TUI-only or
+   interactive-only subsystems during non-interactive CLI task execution.
+3. **Session/UI history caps**: degrade old or heavy history items into compact
+   transcript entries.
+4. **Context assembly accounting**: measure and cap large tool results before
+   model request construction.
+5. **Subagent accounting**: expose subagent lifecycle and memory impact in
+   diagnostics.
+
+Claude Code and OpenAI Codex (OpenAI's CLI coding agent) should be used as
+design references for diagnostic separation, bounded output retention, and lazy
+history loading. The implementation should still follow Qwen Code's own
+architecture and tests.
+
+## Validation Plan
+
+The investigation should keep the same benchmark matrix so before/after results
+remain comparable:
+
+- small PR review
+- code navigation
+- synthetic diff about 100 KiB
+- synthetic diff about 1 MiB
+- synthetic diff about 5 MiB
+
+For each run, record:
+
+- process-tree RSS peak
+- root process RSS peak
+- V8 heap peak
+- heap-space summary
+- duration
+- turns
+- token count
+- tool call count
+- largest retained tool result
+- total retained tool-result size
+- session/history item counts
+- subagent count
+
+The minimum success condition for a candidate fix is not just "RSS went down":
+the validation must also identify which internal metric changed and why.
+
+## Next PR Candidate
+
+The next PR should be diagnostics-only and should avoid changing runtime
+behavior. A minimal useful slice would add:
+
+- model request input-size accounting;
+- system prompt and tool schema size accounting;
+- retained message count and approximate retained character size;
+- retained tool-result count, total size, and largest item size;
+- lifecycle samples around startup, first request assembly, tool execution,
+  streaming completion, compression, and final response;
+- process memory samples that include RSS, heap used, heap total, external, and
+  heap-space stats.
+
+After that lands locally, rerun the same Qwen model matrix and compare:
+
+- published Qwen Code;
+- current `main`;
+- diagnostics-only branch;
+- candidate optimization branch.
+
+## Non-Goals
+
+This draft does not claim that:
+
+- all memory pressure is caused by tool output;
+- one existing open PR will solve the observed task-time footprint;
+- model provider differences are irrelevant in every environment;
+- single-run local measurements are sufficient for release-level performance
+  claims.
+
+The intended claim is narrower: Qwen Code shows a consistent local RSS gap in
+the tested workloads, and the project needs internal diagnostics to explain and
+reduce that gap.
diff --git a/packages/cli/src/ui/commands/doctorCommand.test.ts b/packages/cli/src/ui/commands/doctorCommand.test.ts
index f9afbd969c..315ebc8cbc 100644
--- a/packages/cli/src/ui/commands/doctorCommand.test.ts
+++ b/packages/cli/src/ui/commands/doctorCommand.test.ts
@@ -143,10 +143,13 @@ describe('doctorCommand', () => {
         },
       ],
       resourceUsage: {
-        maxRSS: 4_000,
+        maxRSS: 4 * 1024,
+        maxRSSRaw: 4,
+        maxRSSUnit: 'KiB',
         userCPUTime: 10,
         systemCPUTime: 20,
       },
+      processTree: null,
       activeHandles: 2,
       activeRequests: 0,
       openFileDescriptors: null,
@@ -839,10 +842,13 @@ describe('doctorCommand', () => {
         nativeContexts: 1,
       },
       resourceUsage: {
-        maxRSS: 8_000,
+        maxRSS: 8 * 1024,
+        maxRSSRaw: 8,
+        maxRSSUnit: 'KiB',
         userCPUTime: 10,
         systemCPUTime: 20,
       },
+      processTree: null,
       activeHandles: 2,
       activeRequests: 0,
       v8HeapSpaces: null,
@@ -946,10 +952,13 @@ describe('doctorCommand', () => {
       },
       v8HeapSpaces: null,
       resourceUsage: {
-        maxRSS: 4_000,
+        maxRSS: 4 * 1024,
+        maxRSSRaw: 4,
+        maxRSSUnit: 'KiB',
         userCPUTime: 10,
         systemCPUTime: 20,
       },
+      processTree: null,
       activeHandles: 2,
       activeRequests: 0,
       openFileDescriptors: null,
@@ -992,7 +1001,14 @@ describe('doctorCommand', () => {
           detachedContexts: 0,
           nativeContexts: 1,
         },
-        resourceUsage: { maxRSS: 0, userCPUTime: 0, systemCPUTime: 0 },
+        resourceUsage: {
+          maxRSS: 0,
+          maxRSSRaw: 0,
+          maxRSSUnit: 'KiB',
+          userCPUTime: 0,
+          systemCPUTime: 0,
+        },
+        processTree: null,
         activeHandles: 0,
         activeRequests: 0,
         v8HeapSpaces: null,
diff --git a/packages/core/src/core/anthropicContentGenerator/anthropicContentGenerator.ts b/packages/core/src/core/anthropicContentGenerator/anthropicContentGenerator.ts
index 987766b15d..2f6ba63cb5 100644
--- a/packages/core/src/core/anthropicContentGenerator/anthropicContentGenerator.ts
+++ b/packages/core/src/core/anthropicContentGenerator/anthropicContentGenerator.ts
@@ -35,6 +35,7 @@ import {
 } from '../../utils/runtimeFetchOptions.js';
 import { DEFAULT_TIMEOUT } from '../openaiContentGenerator/constants.js';
 import { createDebugLogger } from '../../utils/debugLogger.js';
+import { runtimeDiagnostics } from '../../utils/runtimeDiagnostics.js';
 import {
   tokenLimit,
   CAPPED_DEFAULT_MAX_TOKENS,
@@ -226,6 +227,7 @@ export class AnthropicContentGenerator implements ContentGenerator {
     let response: Message;
     try {
       const anthropicRequest = await this.buildRequest(request);
+      runtimeDiagnostics.recordAnthropicWireRequest(anthropicRequest);
       const headers = this.buildPerRequestHeaders(anthropicRequest);
       response = (await this.client.messages.create(anthropicRequest, {
         signal: request.config?.abortSignal,
@@ -249,6 +251,7 @@ export class AnthropicContentGenerator implements ContentGenerator {
       ...anthropicRequest,
       stream: true,
     };
+    runtimeDiagnostics.recordAnthropicWireRequest(streamingRequest);
 
     let stream: AsyncIterable<RawMessageStreamEvent>;
     try {
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index 7f2d514ce0..efd0043fc1 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -301,10 +301,58 @@ export class GeminiClient {
     return this.getChat().getHistory(curated);
   }
 
+  getHistoryShallow(curated: boolean = false): Content[] { // History via the chat's getHistoryShallow when it exists.
+    const chat = this.getChat(); // NOTE(review): presumably the shallow variant skips getHistory's copy, so treat the result as read-only — confirm.
+    return chat.getHistoryShallow?.(curated) ?? chat.getHistory(curated); // Falls back to getHistory if the accessor is missing or yields null/undefined.
+  }
+
   getHistoryTail(count: number, curated: boolean = false): Content[] {
     return this.getChat().getHistoryTail(count, curated);
   }
 
+  private getHistoryTailShallow(
+    count: number, // Number of trailing entries wanted; values <= 0 yield [] on the slice fallback.
+    curated: boolean = false,
+  ): Content[] {
+    const chat = this.getChat(); // Returns the last `count` entries, preferring the chat's own tail accessors.
+    return (
+      chat.getHistoryTailShallow?.(count, curated) ??
+      chat.getHistoryTail?.(count, curated) ??
+      (count > 0 ? chat.getHistory(curated).slice(-count) : []) // slice(-0) is slice(0), i.e. the whole history.
+    );
+  }
+
+  private peekLastHistoryEntry(): Content | undefined { // Newest history entry; the fallback yields undefined for an empty history.
+    const chat = this.getChat();
+    return chat.peekLastHistoryEntry?.() ?? chat.getHistory().at(-1); // getHistory() is also consulted when the accessor returns null/undefined.
+  }
+
+  private getHistoryLength(): number { // Number of entries in the chat history.
+    const chat = this.getChat();
+    return chat.getHistoryLength?.() ?? chat.getHistory().length; // Fallback calls getHistory() only to read its length.
+  }
+
+  private getLastModelMessageText(): string | undefined { // Last model message text, delegating to the chat when it can.
+    const chat = this.getChat();
+    if (chat.getLastModelMessageText) { // Prefer the chat's own accessor when it exists.
+      return chat.getLastModelMessageText();
+    }
+    const history = chat.getHistoryShallow?.() ?? chat.getHistory();
+    for (let i = history.length - 1; i >= 0; i--) { // Scan from newest to oldest.
+      const message = history[i];
+      if (message?.role !== 'model') continue;
+      const text =
+        message.parts
+          ?.filter(
+            (part): part is { text: string } => typeof part.text === 'string', // Keep only parts carrying string text.
+          )
+          .map((part) => part.text)
+          .join('') ?? ''; // Concatenate with no separator; missing parts count as empty.
+      return text || undefined; // Only the newest model entry is inspected; empty text becomes undefined.
+    }
+    return undefined; // No model entry in the history.
+  }
+
   /**
    * Pop orphaned trailing user entries from the in-memory chat history.
    * Used by:
@@ -921,7 +969,7 @@ export class GeminiClient {
     ) {
       const projectRoot = this.config.getProjectRoot();
       const sessionId = this.config.getSessionId();
-      const history = this.getHistory();
+      const history = this.getHistoryShallow();
       const mgr = this.config.getMemoryManager();
       const autoSkillEnabled = this.config.getAutoSkillEnabled();
 
@@ -985,7 +1033,7 @@ export class GeminiClient {
 
     const projectRoot = this.config.getProjectRoot();
     const sessionId = this.config.getSessionId();
-    const history = this.getHistory();
+    const history = this.getHistoryShallow();
     const mgr = this.config.getMemoryManager();
 
     if (!this.config.getManagedAutoMemoryEnabled()) {
@@ -1259,7 +1307,7 @@ export class GeminiClient {
         // retries/hooks) so that model latency during a tool-call loop
         // doesn't count as user idle time.
         const mcResult = microcompactHistory(
-          this.getChat().getHistory(),
+          this.getHistoryShallow(),
           this.lastApiCompletionTimestamp,
           this.config.getClearContextOnIdle(),
         );
@@ -1394,9 +1442,8 @@ export class GeminiClient {
       // part from the user immediately follows a functionCall part from the model
       // in the conversation history . The IDE context is not discarded; it will
       // be included in the next regular message sent to the model.
-      const history = this.getHistory();
-      const lastMessage =
-        history.length > 0 ? history[history.length - 1] : undefined;
+      const historyLength = this.getHistoryLength();
+      const lastMessage = this.peekLastHistoryEntry();
       const hasPendingToolCall =
         !!lastMessage &&
         lastMessage.role === 'model' &&
@@ -1407,7 +1454,7 @@ export class GeminiClient {
 
       if (this.config.getIdeMode() && !hasPendingToolCall) {
         const { contextParts, newIdeContext } = this.getIdeContextParts(
-          this.forceFullIdeContext || history.length === 0,
+          this.forceFullIdeContext || historyLength === 0,
         );
         if (contextParts.length > 0) {
           ideContextText = wrapIdeContext(contextParts.join('\n'));
@@ -1643,16 +1690,8 @@ export class GeminiClient {
         !signal.aborted &&
         this.config.hasHooksForEvent('Stop')
       ) {
-        // Get response text from the chat history
-        const history = this.getHistory();
-        const lastModelMessage = history
-          .filter((msg) => msg.role === 'model')
-          .pop();
         const responseText =
-          lastModelMessage?.parts
-            ?.filter((p): p is { text: string } => 'text' in p)
-            .map((p) => p.text)
-            .join('') || '[no response text]';
+          this.getLastModelMessageText() || '[no response text]';
 
         const response = await messageBus.request<
           HookExecutionRequest,
@@ -1817,12 +1856,11 @@ export class GeminiClient {
         // see the current turn's history regardless of which path exits below.
         try {
           const chat = this.getChat();
-          const fullHistory = chat.getHistory(true);
           const maxHistoryForCache = 40;
-          const cachedHistory =
-            fullHistory.length > maxHistoryForCache
-              ? fullHistory.slice(-maxHistoryForCache)
-              : fullHistory;
+          const cachedHistory = this.getHistoryTailShallow(
+            maxHistoryForCache,
+            true,
+          );
           saveCacheSafeParams(
             chat.getGenerationConfig(),
             cachedHistory,
@@ -2008,7 +2046,8 @@ export class GeminiClient {
       signal,
     );
     if (info.compressionStatus === CompressionStatus.COMPRESSED) {
-      const compressedHistory = this.getChat().getHistory();
+      const chat = this.getChat();
+      const compressedHistory = chat.getHistoryShallow?.() ?? chat.getHistory();
       await this.startChat(compressedHistory, SessionStartSource.Compact);
       if (
         !this.lastSessionStartContext &&
diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index fd25e8a220..5f3caa976b 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -27,18 +27,6 @@ import { CompressionStatus, type ChatCompressionInfo } from './turn.js';
 import { ChatCompressionService } from '../services/chatCompressionService.js';
 import { SessionStartSource } from '../hooks/types.js';
 
-const { mockGetHeapStatistics } = vi.hoisted(() => ({
-  mockGetHeapStatistics: vi.fn(),
-}));
-
-vi.mock('node:v8', async (importOriginal) => {
-  const actual = await importOriginal<typeof import('node:v8')>();
-  return {
-    ...actual,
-    getHeapStatistics: mockGetHeapStatistics,
-  };
-});
-
 // Mock fs module to prevent actual file system operations during tests
 const mockFileSystem = new Map<string, string>();
 
@@ -115,10 +103,6 @@ describe('GeminiChat', async () => {
 
     // Default mock implementation for tests that don't care about retry logic
     mockRetryWithBackoff.mockImplementation(async (apiCall) => apiCall());
-    mockGetHeapStatistics.mockReturnValue({
-      used_heap_size: 0,
-      heap_size_limit: Number.MAX_SAFE_INTEGER,
-    });
     mockConfig = {
       getSessionId: () => 'test-session-id',
       getTelemetryLogPromptsEnabled: () => true,
@@ -1077,6 +1061,61 @@ describe('GeminiChat', async () => {
       );
     });
 
+    it('does not deep-clone the full curated history when building request contents', async () => {
+      chat.setHistory([
+        { role: 'user', parts: [{ text: 'prior question' }] },
+        { role: 'model', parts: [{ text: 'prior answer' }] },
+      ]);
+      const response = (async function* () {
+        yield {
+          candidates: [
+            {
+              content: {
+                parts: [{ text: 'response' }],
+                role: 'model',
+              },
+              finishReason: 'STOP',
+              index: 0,
+              safetyRatings: [],
+            },
+          ],
+          text: () => 'response',
+        } as unknown as GenerateContentResponse;
+      })();
+      vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue(
+        response,
+      );
+      const structuredCloneSpy = vi
+        .spyOn(globalThis, 'structuredClone')
+        .mockImplementation(() => {
+          throw new Error('structuredClone should not build request contents');
+        });
+      // The finally block restores structuredClone even when the send throws.
+      try {
+        const stream = await chat.sendMessageStream(
+          'test-model',
+          { message: 'hello' },
+          'prompt-id-no-request-clone',
+        );
+        for await (const _ of stream) {
+          // Drain the stream; the yielded events are not inspected here.
+        }
+      } finally {
+        structuredCloneSpy.mockRestore();
+      }
+      // The generator must receive both prior turns plus the new user message.
+      expect(mockContentGenerator.generateContentStream).toHaveBeenCalledWith(
+        expect.objectContaining({
+          contents: [
+            { role: 'user', parts: [{ text: 'prior question' }] },
+            { role: 'model', parts: [{ text: 'prior answer' }] },
+            { role: 'user', parts: [{ text: 'hello' }] },
+          ],
+        }),
+        'prompt-id-no-request-clone',
+      );
+    });
+
     it('should not update global telemetry when no telemetryService is provided (subagent isolation)', async () => {
       // Simulate a subagent GeminiChat: created without a telemetryService
       const subagentChat = new GeminiChat(mockConfig, config, []);
@@ -1223,7 +1262,10 @@ describe('GeminiChat', async () => {
             compressionStatus: CompressionStatus.NOOP,
           },
         });
-      vi.spyOn(chat, 'getHistory').mockImplementationOnce(() => {
+      vi.spyOn(
+        chat as unknown as { getRequestHistory: () => Content[] },
+        'getRequestHistory',
+      ).mockImplementationOnce(() => {
         throw new Error('history setup failed');
       });
 
@@ -1928,6 +1970,65 @@ describe('GeminiChat', async () => {
     });
   });
 
+  describe('getHistoryShallow', () => {
+    it('copies containers without structured-cloning large part payloads', () => {
+      const bigOutput = { output: 'x'.repeat(128 * 1024) };
+      const toolResult: Content = {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              id: 'call-1',
+              name: 'read_file',
+              response: bigOutput,
+            },
+          },
+        ],
+      };
+      chat.addHistory(toolResult);
+      // Any structuredClone call from this point aborts the test.
+      const cloneSpy = vi.spyOn(globalThis, 'structuredClone');
+      cloneSpy.mockImplementation(() => {
+        throw new Error('unexpected deep clone');
+      });
+
+      const snapshot = chat.getHistoryShallow();
+      const copied = snapshot[0]!;
+      expect(cloneSpy).not.toHaveBeenCalled();
+      expect(snapshot).toEqual([toolResult]);
+      expect(copied).not.toBe(toolResult);
+      expect(copied.parts).not.toBe(toolResult.parts);
+      const firstPart = copied.parts![0] as {
+        functionResponse: { response: typeof bigOutput };
+      };
+      expect(firstPart.functionResponse.response).toBe(bigOutput);
+    });
+  });
+
+  describe('getHistoryTailShallow', () => {
+    it('copies only recent containers without cloning payloads', () => {
+      const oldest: Content = { role: 'user', parts: [{ text: 'old' }] };
+      const newest: Content = { role: 'model', parts: [{ text: 'recent' }] };
+      chat.addHistory(oldest);
+      chat.addHistory(newest);
+      // Any structuredClone call from this point aborts the test.
+      const cloneSpy = vi.spyOn(globalThis, 'structuredClone');
+      cloneSpy.mockImplementation(() => {
+        throw new Error('unexpected deep clone');
+      });
+
+      // Ask for a single entry: only the newest one may come back.
+      const tail = chat.getHistoryTailShallow(1);
+      const copied = tail[0]!;
+
+      expect(cloneSpy).not.toHaveBeenCalled();
+      expect(tail).toEqual([newest]);
+      // The wrapper and its parts array are fresh objects, not the stored ones.
+      expect(copied).not.toBe(newest);
+      expect(copied.parts).not.toBe(newest.parts);
+    });
+  });
+
   describe('getLastHistoryEntry', () => {
     it('returns undefined for an empty history', () => {
       expect(chat.getLastHistoryEntry()).toBeUndefined();
@@ -1948,6 +2049,42 @@ describe('GeminiChat', async () => {
     });
   });
 
+  describe('peekLastHistoryEntry', () => {
+    it('returns the last entry without structured-cloning the full history', () => {
+      const newest: Content = { role: 'model', parts: [{ text: 'b' }] };
+      chat.addHistory({ role: 'user', parts: [{ text: 'a' }] });
+      chat.addHistory(newest);
+      // Any structuredClone call from this point aborts the test.
+      const cloneSpy = vi.spyOn(globalThis, 'structuredClone');
+      cloneSpy.mockImplementation(() => {
+        throw new Error('unexpected deep clone');
+      });
+
+      const peeked = chat.peekLastHistoryEntry();
+      expect(peeked).toBe(newest);
+      expect(cloneSpy).not.toHaveBeenCalled();
+    });
+  });
+
+  describe('getLastModelMessageText', () => {
+    it('returns text from the latest model message without cloning history', () => {
+      // Older model turn, a user turn, then the two-part model turn under test.
+      const turns: Content[] = [
+        { role: 'model', parts: [{ text: 'older' }] },
+        { role: 'user', parts: [{ text: 'question' }] },
+        { role: 'model', parts: [{ text: 'new' }, { text: ' answer' }] },
+      ];
+      for (const turn of turns) chat.addHistory(turn);
+      const cloneSpy = vi.spyOn(globalThis, 'structuredClone');
+      cloneSpy.mockImplementation(() => {
+        throw new Error('unexpected deep clone');
+      });
+
+      expect(chat.getLastModelMessageText()).toBe('new answer');
+      expect(cloneSpy).not.toHaveBeenCalled();
+    });
+  });
+
   describe('sendMessageStream with retries', () => {
     it('should retry on invalid content, succeed, and report metrics', async () => {
       vi.useFakeTimers();
@@ -3620,13 +3757,6 @@ describe('GeminiChat', async () => {
       return compressSpy;
     }
 
-    function mockHeapPressure(usedHeapSize: number, heapLimit = 1000) {
-      mockGetHeapStatistics.mockReturnValue({
-        used_heap_size: usedHeapSize,
-        heap_size_limit: heapLimit,
-      });
-    }
-
     it('replaces history and updates per-chat lastPromptTokenCount on COMPRESSED', async () => {
       mockCompressionService('compressed');
       chat.setHistory([userMsg('a'), modelMsg('b'), userMsg('c')]);
@@ -3690,136 +3820,9 @@ describe('GeminiChat', async () => {
 
     it('forwards force=true to the compression service', async () => {
       const compressSpy = mockCompressionService('compressed');
-      mockHeapPressure(900);
 
       await chat.tryCompress('p1', 'm1', true);
       expect(compressSpy.mock.calls[0][1].force).toBe(true);
-      expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(false);
-      expect(mockGetHeapStatistics).not.toHaveBeenCalled();
-    });
-
-    it('uses heap pressure to bypass the token gate without manual force semantics', async () => {
-      const compressSpy = mockCompressionService('noop');
-      mockHeapPressure(750);
-      vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
-        authType: AuthType.USE_GEMINI,
-        model: 'test-model',
-        contextWindowSize: 1000,
-      });
-
-      await chat.tryCompress('p1', 'm1');
-
-      expect(compressSpy.mock.calls[0][1].force).toBe(false);
-      expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(true);
-      expect(compressSpy.mock.calls[0][1].originalTokenCount).toBe(0);
-    });
-
-    it('does not bypass the token gate below the heap-pressure threshold', async () => {
-      const compressSpy = mockCompressionService('noop');
-      mockHeapPressure(650);
-
-      await chat.tryCompress('p1', 'm1');
-
-      expect(compressSpy.mock.calls[0][1].force).toBe(false);
-      expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(false);
-    });
-
-    it('does not let a failed heap-pressure attempt latch off later auto-compaction', async () => {
-      const compressSpy = mockCompressionService('failed-inflated');
-      mockHeapPressure(701);
-
-      const first = await chat.tryCompress('p1', 'm1');
-      expect(first.compressionStatus).toBe(
-        CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT,
-      );
-      expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(true);
-
-      compressSpy.mockClear();
-      compressSpy.mockResolvedValue({
-        newHistory: null,
-        info: {
-          originalTokenCount: 0,
-          newTokenCount: 0,
-          compressionStatus: CompressionStatus.NOOP,
-        },
-      });
-      mockHeapPressure(0);
-
-      await chat.tryCompress('p2', 'm1');
-
-      expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(false);
-      expect(compressSpy.mock.calls[0][1].hasFailedCompressionAttempt).toBe(
-        false,
-      );
-    });
-
-    it('backs off repeated heap-pressure bypasses after a heap-triggered failure', async () => {
-      vi.useFakeTimers();
-      vi.setSystemTime(new Date('2026-05-16T00:00:00Z'));
-      try {
-        const compressSpy = mockCompressionService('failed-inflated');
-        mockHeapPressure(800);
-
-        await chat.tryCompress('p1', 'm1');
-        expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(true);
-
-        compressSpy.mockClear();
-        compressSpy.mockResolvedValue({
-          newHistory: null,
-          info: {
-            originalTokenCount: 0,
-            newTokenCount: 0,
-            compressionStatus: CompressionStatus.NOOP,
-          },
-        });
-
-        await chat.tryCompress('p2', 'm1');
-        expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(false);
-
-        vi.setSystemTime(new Date('2026-05-16T00:00:31Z'));
-        compressSpy.mockClear();
-
-        await chat.tryCompress('p3', 'm1');
-        expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(true);
-      } finally {
-        vi.useRealTimers();
-      }
-    });
-
-    it('backs off repeated heap-pressure bypasses after a heap-triggered NOOP', async () => {
-      vi.useFakeTimers();
-      vi.setSystemTime(new Date('2026-05-16T00:00:00Z'));
-      try {
-        const compressSpy = mockCompressionService('noop');
-        mockHeapPressure(800);
-
-        await chat.tryCompress('p1', 'm1');
-        expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(true);
-
-        compressSpy.mockClear();
-
-        await chat.tryCompress('p2', 'm1');
-        expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(false);
-
-        vi.setSystemTime(new Date('2026-05-16T00:00:31Z'));
-        compressSpy.mockClear();
-
-        await chat.tryCompress('p3', 'm1');
-        expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(true);
-      } finally {
-        vi.useRealTimers();
-      }
-    });
-
-    it('falls back to token-threshold behavior if heap statistics are unavailable', async () => {
-      const compressSpy = mockCompressionService('noop');
-      mockGetHeapStatistics.mockImplementation(() => {
-        throw new Error('heap stats unavailable');
-      });
-
-      await chat.tryCompress('p1', 'm1');
-
-      expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(false);
     });
   });
 });
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index c2fc71bbea..2655acd61c 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -17,7 +17,6 @@ import type {
   GenerateContentResponseUsageMetadata,
 } from '@google/genai';
 import { createUserContent, FinishReason } from '@google/genai';
-import { getHeapStatistics } from 'node:v8';
 import { retryWithBackoff, isUnattendedMode } from '../utils/retry.js';
 import { getErrorStatus, isAbortError } from '../utils/errors.js';
 import { createDebugLogger } from '../utils/debugLogger.js';
@@ -59,10 +58,6 @@ import { getCustomSystemPrompt } from './prompts.js';
 
 const debugLogger = createDebugLogger('QWEN_CODE_CHAT');
 
-// Leave roughly 30% V8 heap headroom for compression's transient allocations.
-const HEAP_PRESSURE_COMPRESSION_RATIO = 0.7;
-const HEAP_PRESSURE_COMPRESSION_COOLDOWN_MS = 30_000;
-
 /**
  * Replaces the args on a `structured_output` `functionCall` with the
  * same `__redacted` placeholder used by `ToolCallEvent` telemetry
@@ -353,6 +348,13 @@ function extractCuratedHistory(comprehensiveHistory: Content[]): Content[] {
   return curatedHistory;
 }
 
+function copyContentContainer(content: Content): Content {
+  // New wrapper object and new parts array; the part objects stay shared.
+  const copy: Content = { ...content };
+  if (content.parts) copy.parts = [...content.parts];
+  return copy;
+}
+
 function stripThoughtPartsFromContent(content: Content): Content | null {
   if (!content.parts) {
     return content;
@@ -441,14 +443,6 @@ export class GeminiChat {
    */
   private hasFailedCompressionAttempt = false;
 
-  /**
-   * Heap-pressure compaction is process-wide pressure applied per chat. If one
-   * heap-triggered attempt cannot reduce history, briefly back off this chat
-   * so every subsequent send does not immediately pay for another compression
-   * side query while memory is already tight.
-   */
-  private heapPressureCompressionCooldownUntil = 0;
-
   /**
    * Creates a new GeminiChat instance.
    *
@@ -482,6 +476,18 @@ export class GeminiChat {
     return this.lastPromptTokenCount;
   }
 
+  /**
+   * Assembles the `contents` for the content generator from curated history.
+   * Entry wrappers and `parts` arrays are copied; part payloads stay shared
+   * with stored history. A full `structuredClone` is avoided on this hot path
+   * because long sessions can make it exceed the remaining V8 heap headroom.
+   * {@link getHistory} still gives public readers a defensive deep copy.
+   */
+  private getRequestHistory(): Content[] {
+    const curated = extractCuratedHistory(this.history);
+    return curated.map((entry) => copyContentContainer(entry));
+  }
+
   /**
    * Seed the last-prompt-token-count for chats created with inherited
    * history (forks, subagents, speculation). Without this, the auto-compress
@@ -509,33 +515,6 @@ export class GeminiChat {
     signal?: AbortSignal,
     options?: TryCompressOptions,
   ): Promise<ChatCompressionInfo> {
-    const heapPressureRatio = force ? null : this.getHeapPressureRatio();
-    const heapPressureCooldownActive =
-      !force && Date.now() < this.heapPressureCompressionCooldownUntil;
-    const bypassTokenThreshold =
-      heapPressureRatio !== null &&
-      heapPressureRatio >= HEAP_PRESSURE_COMPRESSION_RATIO &&
-      !heapPressureCooldownActive;
-    if (bypassTokenThreshold) {
-      // Temporary safety net: token-based compaction can be too late for
-      // large-context sessions because JS heap pressure may hit first.
-      // Do not use force=true here because that carries manual /compress
-      // semantics in ChatCompressionService.
-      debugLogger.warn(
-        `Heap pressure at ${(heapPressureRatio * 100).toFixed(1)}%; ` +
-          'attempting auto-compaction before token threshold.',
-      );
-    } else if (
-      heapPressureRatio !== null &&
-      heapPressureRatio >= HEAP_PRESSURE_COMPRESSION_RATIO &&
-      heapPressureCooldownActive
-    ) {
-      debugLogger.debug(
-        `Heap pressure at ${(heapPressureRatio * 100).toFixed(1)}%; ` +
-          'skipping heap-pressure auto-compaction during cooldown.',
-      );
-    }
-
     const service = new ChatCompressionService();
     const { newHistory, info } = await service.compress(this, {
       promptId,
@@ -545,7 +524,6 @@ export class GeminiChat {
       hasFailedCompressionAttempt: this.hasFailedCompressionAttempt,
       originalTokenCount:
         options?.originalTokenCountOverride ?? this.lastPromptTokenCount,
-      bypassTokenThreshold,
       trigger: options?.trigger,
       signal,
     });
@@ -555,37 +533,13 @@ export class GeminiChat {
         info,
         compressedHistory: newHistory,
       });
-      // Auto-compaction replaces history in place — no env-context refresh
-      // here. Manual /compress goes through GeminiClient.tryCompressChat,
-      // which calls startChat() to re-prepend a fresh env snapshot. See
-      // GeminiClient.sendMessageStream for the rationale behind the split.
       this.setHistory(newHistory);
-      // Compaction summarises away prior full-Read tool results, but the
-      // FileReadCache still treats those reads as "in this conversation".
-      // A follow-up Read could then return the file_unchanged placeholder
-      // pointing at content the model can no longer retrieve from history.
       debugLogger.debug('[FILE_READ_CACHE] clear after auto tryCompress');
       this.config.getFileReadCache().clear();
       this.lastPromptTokenCount = info.newTokenCount;
-      // Mirror to the global singleton only when wired (main session).
-      // Subagents pass `telemetryService=undefined` to keep their context
-      // usage out of the main agent's UI counters.
       this.telemetryService?.setLastPromptTokenCount(info.newTokenCount);
-      // Re-enable auto-compaction so a forced /compress recovers a chat
-      // that an earlier auto-attempt latched off.
       this.hasFailedCompressionAttempt = false;
-      this.heapPressureCompressionCooldownUntil = 0;
-    } else if (bypassTokenThreshold) {
-      // If heap-pressure compaction cannot reduce history (NOOP or failure),
-      // avoid repeatedly cloning history and/or paying side-query latency while
-      // the process-wide pressure remains high.
-      this.heapPressureCompressionCooldownUntil =
-        Date.now() + HEAP_PRESSURE_COMPRESSION_COOLDOWN_MS;
     } else if (isCompressionFailureStatus(info.compressionStatus)) {
-      // Track failed attempts (only mark as failed if not forced) so we
-      // stop spending compression-API calls on a chat that can't shrink.
-      // Heap-pressure attempts are a safety net, not evidence that normal
-      // token-threshold compaction should be latched off for this chat.
       if (!force) {
         this.hasFailedCompressionAttempt = true;
       }
@@ -594,24 +548,6 @@ export class GeminiChat {
     return info;
   }
 
-  private getHeapPressureRatio(): number | null {
-    try {
-      const { used_heap_size: usedHeapSize, heap_size_limit: heapLimit } =
-        getHeapStatistics();
-      if (
-        !Number.isFinite(usedHeapSize) ||
-        usedHeapSize < 0 ||
-        !Number.isFinite(heapLimit) ||
-        heapLimit <= 0
-      ) {
-        return null;
-      }
-      return usedHeapSize / heapLimit;
-    } catch {
-      return null;
-    }
-  }
-
   setSystemInstruction(sysInstr: string) {
     this.generationConfig.systemInstruction = sysInstr;
   }
@@ -701,7 +637,7 @@ export class GeminiChat {
       // Add user content to history ONCE before any attempts.
       this.history.push(userContent);
       userContentAdded = true;
-      requestContents = this.getHistory(true);
+      requestContents = this.getRequestHistory();
     } catch (error) {
       if (userContentAdded) {
         this.history.pop();
@@ -866,7 +802,7 @@ export class GeminiChat {
                     reactiveInfo.compressionStatus ===
                     CompressionStatus.COMPRESSED
                   ) {
-                    requestContents = self.getHistory(true);
+                    requestContents = self.getRequestHistory();
                     debugLogger.info(
                       `Reactive compression succeeded: ` +
                         `${reactiveInfo.originalTokenCount} -> ` +
@@ -1070,7 +1006,7 @@ export class GeminiChat {
             // model's continuation appends to the previous partial output.
             yield { type: StreamEventType.RETRY, isContinuation: true };
             // Re-send with the updated history (includes partial + recovery)
-            const recoveryContents = self.getHistory(true);
+            const recoveryContents = self.getRequestHistory();
             escalatedFinishReason = undefined;
             try {
               const recoveryStream = await self.makeApiCallAndProcessStream(
@@ -1237,6 +1173,29 @@ export class GeminiChat {
     return structuredClone(history.slice(-count));
   }
 
+  /**
+   * Copies the history array, every entry wrapper and every `parts` array, but
+   * shares the part objects with the stored history. Pass `curated = true` to
+   * copy the curated view instead of the raw one. Callers must treat parts as
+   * read-only, or swap in replacement entries rather than mutating in place.
+   */
+  getHistoryShallow(curated: boolean = false): Content[] {
+    let source = this.history;
+    if (curated) source = extractCuratedHistory(this.history);
+    return source.map((entry) => copyContentContainer(entry));
+  }
+
+  /**
+   * Shallow copy of the last `count` entries (raw, or curated when `curated`
+   * is true); part objects are shared. A non-positive `count` yields `[]`.
+   */
+  getHistoryTailShallow(count: number, curated: boolean = false): Content[] {
+    if (count <= 0) return [];
+    let source = this.history;
+    if (curated) source = extractCuratedHistory(this.history);
+    return source.slice(-count).map((entry) => copyContentContainer(entry));
+  }
+
   /**
    * Returns a defensive copy of the last raw history entry without cloning the
    * full conversation. This avoids O(history) cloning, though cloning the last
@@ -1246,6 +1205,35 @@ export class GeminiChat {
     return this.getHistoryTail(1)[0];
   }
 
+  /**
+   * Hands back the stored last raw entry itself (no copy), or `undefined` when
+   * the history is empty. The result is live chat state; do not mutate it.
+   */
+  peekLastHistoryEntry(): Content | undefined {
+    return this.history[this.history.length - 1];
+  }
+
+  /**
+   * Joins the string `text` of every part in the most recent `model` entry,
+   * reading the raw history from the end without cloning it (used by stop
+   * hooks). Only that newest model entry is inspected: if it yields no text
+   * (no parts, no text parts, or only empty strings) the result is
+   * `undefined` rather than the text of an older model entry.
+   */
+  getLastModelMessageText(): string | undefined {
+    let index = this.history.length;
+    while (index-- > 0) {
+      const entry = this.history[index];
+      if (entry?.role !== 'model') continue;
+      let joined = '';
+      for (const part of entry.parts ?? []) {
+        if (typeof part.text === 'string') joined += part.text;
+      }
+      return joined === '' ? undefined : joined;
+    }
+    return undefined;
+  }
+
   /**
    * Returns the number of entries in the raw chat history. O(1) and
    * does not clone — use this when you only need the count and would
diff --git a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts
index 059104d5c6..cfd16be7f2 100644
--- a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts
+++ b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts
@@ -44,6 +44,7 @@ import { openaiRequestCaptureContext } from '../openaiContentGenerator/requestCa
 import type { RequestContext } from '../openaiContentGenerator/types.js';
 import { OpenAILogger } from '../../utils/openaiLogger.js';
 import { createDebugLogger } from '../../utils/debugLogger.js';
+import { runtimeDiagnostics } from '../../utils/runtimeDiagnostics.js';
 import {
   getErrorMessage,
   getErrorStatus,
@@ -226,6 +227,10 @@ export class LoggingContentGenerator implements ContentGenerator {
     const isInternal = isInternalPromptId(userPromptId);
     const session = this.startCaptureSession();
     try {
+      runtimeDiagnostics.recordGenerateContentRequest(req, {
+        stream: false,
+        source: 'generateContent',
+      });
       if (!isInternal) {
         addSystemPromptAttributes(
           this.config,
@@ -336,6 +341,10 @@ export class LoggingContentGenerator implements ContentGenerator {
 
     let stream: AsyncGenerator<GenerateContentResponse>;
     try {
+      runtimeDiagnostics.recordGenerateContentRequest(req, {
+        stream: true,
+        source: 'generateContentStream',
+      });
       if (!isInternal) {
         addSystemPromptAttributes(
           this.config,
diff --git a/packages/core/src/core/openaiContentGenerator/pipeline.ts b/packages/core/src/core/openaiContentGenerator/pipeline.ts
index c814527d61..605ab4b45d 100644
--- a/packages/core/src/core/openaiContentGenerator/pipeline.ts
+++ b/packages/core/src/core/openaiContentGenerator/pipeline.ts
@@ -18,6 +18,7 @@ import { StreamingToolCallParser } from './streamingToolCallParser.js';
 import { TaggedThinkingParser } from './taggedThinkingParser.js';
 import type { PipelineConfig, RequestContext } from './types.js';
 import { redactProxyError } from '../../utils/runtimeFetchOptions.js';
+import { runtimeDiagnostics } from '../../utils/runtimeDiagnostics.js';
 
 /**
  * The OpenAI SDK adds an abort listener for every `chat.completions.create`
@@ -515,6 +516,7 @@ export class ContentGenerationPipeline {
       // provider enhancement, post disable-reasoning) and before the SDK call
       // so the logger sees the exact bytes sent on the wire.
       openaiRequestCaptureContext.getStore()?.(openaiRequest);
+      runtimeDiagnostics.recordOpenAIWireRequest(openaiRequest);
 
       const result = await executor(openaiRequest, context);
       return result;
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 8e69bacd66..73acc7e3ce 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -183,7 +183,11 @@ export * from './memory/writeContextFile.js';
 export * from './ide/ide-client.js';
 export * from './ide/ideContext.js';
 export * from './ide/ide-installer.js';
-export { IDE_DEFINITIONS, type IdeInfo } from './ide/detect-ide.js';
+export {
+  detectIdeFromEnv,
+  IDE_DEFINITIONS,
+  type IdeInfo,
+} from './ide/detect-ide.js';
 export * from './ide/constants.js';
 export * from './ide/types.js';
 
@@ -285,6 +289,7 @@ export * from './utils/errorParsing.js';
 export * from './utils/errors.js';
 export * from './utils/fileUtils.js';
 export * from './utils/filesearch/fileSearch.js';
+export * as crawlCache from './utils/filesearch/crawlCache.js';
 export {
   Ignore,
   loadIgnoreRules,
@@ -301,6 +306,7 @@ export * from './utils/jsonl-utils.js';
 export * from './utils/memoryDiagnostics.js';
 export * from './utils/memoryDiscovery.js';
 export * from './utils/modelId.js';
+export * from './utils/runtimeDiagnostics.js';
 export { ConditionalRulesRegistry } from './utils/rulesDiscovery.js';
 export type { RuleFile } from './utils/rulesDiscovery.js';
 export {
diff --git a/packages/core/src/services/chatCompressionService.test.ts b/packages/core/src/services/chatCompressionService.test.ts
index 3aa349863e..e42d6e80d4 100644
--- a/packages/core/src/services/chatCompressionService.test.ts
+++ b/packages/core/src/services/chatCompressionService.test.ts
@@ -389,6 +389,9 @@ describe('ChatCompressionService', () => {
     service = new ChatCompressionService();
     mockChat = {
       getHistory: vi.fn(),
+      getHistoryShallow: vi.fn((curated?: boolean) =>
+        mockChat.getHistory(curated),
+      ),
       appendSystemInstruction: vi.fn(),
     } as unknown as GeminiChat;
     mockGetHookSystem = vi.fn().mockReturnValue({});
@@ -463,88 +466,6 @@ describe('ChatCompressionService', () => {
     expect(result.newHistory).toBeNull();
   });
 
-  it('should bypass the token threshold when requested without force=true', async () => {
-    const history: Content[] = [
-      { role: 'user', parts: [{ text: 'msg1' }] },
-      { role: 'model', parts: [{ text: 'msg2' }] },
-      { role: 'user', parts: [{ text: 'msg3' }] },
-      { role: 'model', parts: [{ text: 'msg4' }] },
-    ];
-    vi.mocked(mockChat.getHistory).mockReturnValue(history);
-    vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(100);
-    vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
-      model: 'gemini-pro',
-      contextWindowSize: 1000,
-    } as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);
-
-    const mockGenerateContent = vi.fn().mockResolvedValue({
-      text: 'Summary',
-      usage: {
-        promptTokenCount: 1100,
-        candidatesTokenCount: 50,
-        totalTokenCount: 1150,
-      },
-    });
-    vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
-      generateText: mockGenerateContent,
-    } as unknown as BaseLlmClient);
-
-    const result = await service.compress(mockChat, {
-      promptId: mockPromptId,
-      force: false,
-      bypassTokenThreshold: true,
-      model: mockModel,
-      config: mockConfig,
-      hasFailedCompressionAttempt: false,
-      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
-    });
-
-    expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
-    expect(result.newHistory).not.toBeNull();
-    expect(mockGenerateContent).toHaveBeenCalled();
-  });
-
-  it('should bypass the failed-attempt latch when heap pressure requests compaction', async () => {
-    const history: Content[] = [
-      { role: 'user', parts: [{ text: 'msg1' }] },
-      { role: 'model', parts: [{ text: 'msg2' }] },
-      { role: 'user', parts: [{ text: 'msg3' }] },
-      { role: 'model', parts: [{ text: 'msg4' }] },
-    ];
-    vi.mocked(mockChat.getHistory).mockReturnValue(history);
-    vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(100);
-    vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
-      model: 'gemini-pro',
-      contextWindowSize: 1000,
-    } as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);
-
-    const mockGenerateContent = vi.fn().mockResolvedValue({
-      text: 'Summary',
-      usage: {
-        promptTokenCount: 1100,
-        candidatesTokenCount: 50,
-        totalTokenCount: 1150,
-      },
-    });
-    vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
-      generateText: mockGenerateContent,
-    } as unknown as BaseLlmClient);
-
-    const result = await service.compress(mockChat, {
-      promptId: mockPromptId,
-      force: false,
-      bypassTokenThreshold: true,
-      model: mockModel,
-      config: mockConfig,
-      hasFailedCompressionAttempt: true,
-      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
-    });
-
-    expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
-    expect(result.newHistory).not.toBeNull();
-    expect(mockGenerateContent).toHaveBeenCalled();
-  });
-
   it('should return NOOP when contextPercentageThreshold is 0', async () => {
     const history: Content[] = [
       { role: 'user', parts: [{ text: 'msg1' }] },
@@ -595,41 +516,6 @@ describe('ChatCompressionService', () => {
     expect(tokenLimit).not.toHaveBeenCalled();
   });
 
-  it('should return NOOP when contextPercentageThreshold is 0 even with token threshold bypass', async () => {
-    const history: Content[] = [
-      { role: 'user', parts: [{ text: 'msg1' }] },
-      { role: 'model', parts: [{ text: 'msg2' }] },
-    ];
-    vi.mocked(mockChat.getHistory).mockReturnValue(history);
-    vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(800);
-    vi.mocked(mockConfig.getChatCompression).mockReturnValue({
-      contextPercentageThreshold: 0,
-    });
-
-    const mockGenerateContent = vi.fn();
-    vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
-      generateText: mockGenerateContent,
-    } as unknown as BaseLlmClient);
-
-    const result = await service.compress(mockChat, {
-      promptId: mockPromptId,
-      force: false,
-      bypassTokenThreshold: true,
-      model: mockModel,
-      config: mockConfig,
-      hasFailedCompressionAttempt: false,
-      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
-    });
-
-    expect(result.info).toMatchObject({
-      compressionStatus: CompressionStatus.NOOP,
-      originalTokenCount: 0,
-      newTokenCount: 0,
-    });
-    expect(mockGenerateContent).not.toHaveBeenCalled();
-    expect(tokenLimit).not.toHaveBeenCalled();
-  });
-
   it('should return NOOP when historyToCompress is below MIN_COMPRESSION_FRACTION of total', async () => {
     // Construct a history where the split point lands on the 2nd regular user
     // message (index 2), but indices 0-1 are tiny relative to the huge content
@@ -715,6 +601,72 @@ describe('ChatCompressionService', () => {
     expect(mockGetHookSystem).toHaveBeenCalled();
   });
 
+  it('does not deep-clone full history while compressing', async () => {
+    const largeToolOutput = 'x'.repeat(1024 * 1024); // 1 Mi characters standing in for a huge tool result
+    const history: Content[] = [
+      { role: 'user', parts: [{ text: 'review this PR' }] },
+      {
+        role: 'model',
+        parts: [
+          {
+            functionCall: {
+              id: 'read-1',
+              name: 'read_file',
+              args: { path: 'large.ts' },
+            },
+          },
+        ],
+      },
+      {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              id: 'read-1',
+              name: 'read_file',
+              response: { output: largeToolOutput },
+            },
+          },
+        ],
+      },
+      { role: 'model', parts: [{ text: 'analysis' }] },
+    ];
+    vi.mocked(mockChat.getHistory).mockImplementation(() => { // fail loudly if compress() falls back to getHistory()
+      throw new Error('getHistory should not be called by compression');
+    });
+    vi.mocked(mockChat.getHistoryShallow).mockReturnValue(history);
+    vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
+      model: 'gemini-pro',
+      contextWindowSize: 1000, // with the 800 prompt tokens below, the test expects the threshold gate to pass
+    } as unknown as ReturnType<typeof mockConfig.getContentGeneratorConfig>);
+    vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(800);
+
+    const mockGenerateContent = vi.fn().mockResolvedValue({
+      text: 'Summary',
+      usage: {
+        promptTokenCount: 1600,
+        candidatesTokenCount: 50,
+        totalTokenCount: 1650,
+      },
+    });
+    vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
+      generateText: mockGenerateContent,
+    } as unknown as BaseLlmClient);
+
+    const result = await service.compress(mockChat, {
+      promptId: mockPromptId,
+      force: false, // automatic path: the threshold gate is not bypassed
+      model: mockModel,
+      config: mockConfig,
+      hasFailedCompressionAttempt: false,
+      originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+    });
+
+    expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
+    expect(mockChat.getHistory).not.toHaveBeenCalled(); // compression read history only through getHistoryShallow
+    expect(mockChat.getHistoryShallow).toHaveBeenCalledWith(true);
+  });
+
   it('should force compress even if under threshold', async () => {
     const history: Content[] = [
       { role: 'user', parts: [{ text: 'msg1' }] },
diff --git a/packages/core/src/services/chatCompressionService.ts b/packages/core/src/services/chatCompressionService.ts
index f704ee10fe..97934d819e 100644
--- a/packages/core/src/services/chatCompressionService.ts
+++ b/packages/core/src/services/chatCompressionService.ts
@@ -181,14 +181,6 @@ export interface CompressOptions {
    * the service does not read or write any global telemetry.
    */
   originalTokenCount: number;
-  /**
-   * Bypass the token-count threshold gate and the failed-attempt latch while
-   * preserving automatic compaction semantics. Used for temporary heap-pressure
-   * relief where `force=true` would be too broad because it means manual
-   * `/compress`. The heap-pressure check that sets this lives in
-   * `GeminiChat.tryCompress()`.
-   */
-  bypassTokenThreshold?: boolean;
   /**
    * Hook trigger to report for this compression. `force=true` bypasses the
    * threshold gate but does not always mean the user manually requested
@@ -210,7 +202,6 @@ export class ChatCompressionService {
       config,
       hasFailedCompressionAttempt,
       originalTokenCount,
-      bypassTokenThreshold = false,
       trigger,
       signal,
     } = opts;
@@ -221,13 +212,7 @@ export class ChatCompressionService {
       COMPRESSION_TOKEN_THRESHOLD;
     const slimmingConfig = resolveSlimmingConfig(chatCompressionSettings);
 
-    // Cheap gates first — these don't need the curated history. Heap-pressure
-    // bypass must also bypass the failed-attempt latch, otherwise one failed
-    // compression would disable this safety net for the rest of the chat.
-    if (
-      threshold <= 0 ||
-      (hasFailedCompressionAttempt && !force && !bypassTokenThreshold)
-    ) {
+    if (threshold <= 0 || (hasFailedCompressionAttempt && !force)) {
       return {
         newHistory: null,
         info: {
@@ -238,10 +223,7 @@ export class ChatCompressionService {
       };
     }
 
-    // Don't compress if not forced and we are under the token limit. This is
-    // the steady-state path on every send; heap pressure may bypass it because
-    // the JS heap can become the limiting resource before token count does.
-    if (!force && !bypassTokenThreshold) {
+    if (!force) {
       const contextLimit =
         config.getContentGeneratorConfig()?.contextWindowSize ??
         DEFAULT_TOKEN_LIMIT;
@@ -257,7 +239,12 @@ export class ChatCompressionService {
       }
     }
 
-    const curatedHistory = chat.getHistory(true);
+    // Compression only reads the existing history while deciding the split and
+    // preparing the side-query payload. Avoid `getHistory(true)` here: long
+    // tool-heavy sessions can make a defensive deep clone larger than the
+    // remaining V8 heap headroom at exactly the moment compaction is trying to
+    // reduce memory pressure.
+    const curatedHistory = chat.getHistoryShallow(true);
     if (curatedHistory.length === 0) {
       return {
         newHistory: null,
diff --git a/packages/core/src/services/sessionService.test.ts b/packages/core/src/services/sessionService.test.ts
index 24e5942587..83c574bfee 100644
--- a/packages/core/src/services/sessionService.test.ts
+++ b/packages/core/src/services/sessionService.test.ts
@@ -947,6 +947,57 @@ describe('SessionService', () => {
       expect(history).toEqual([recordA1.message, assistantA1.message]);
     });
 
+    it('does not deep-clone stored messages when rebuilding resume API history', () => {
+      const largePayload = { // identity of this object is asserted at the end of the test
+        output: 'x'.repeat(128 * 1024),
+        nested: { keep: true },
+      };
+      const toolResult: ChatRecord = {
+        uuid: 'large-tool-result',
+        parentUuid: recordA1.uuid,
+        sessionId: sessionIdA,
+        timestamp: '2024-01-01T00:02:00Z',
+        type: 'tool_result',
+        message: {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'call-1',
+                name: 'read_file',
+                response: largePayload,
+              },
+            },
+          ],
+        },
+        cwd: '/test/project/root',
+        version: '1.0.0',
+      };
+      const conversation: ConversationRecord = {
+        sessionId: sessionIdA,
+        projectHash: 'test-project-hash',
+        startTime: '2024-01-01T00:00:00Z',
+        lastUpdated: '2024-01-01T00:02:00Z',
+        messages: [recordA1, toolResult],
+      };
+      const structuredCloneSpy = vi
+        .spyOn(globalThis, 'structuredClone')
+        .mockImplementation(() => {
+          throw new Error('unexpected deep clone');
+        }); // NOTE(review): this global spy is not restored inside the test; confirm an afterEach restores mocks
+
+      const history = buildApiHistoryFromConversation(conversation);
+
+      expect(structuredCloneSpy).not.toHaveBeenCalled();
+      expect(history).toEqual([recordA1.message, toolResult.message]);
+      expect(history[1]).not.toBe(toolResult.message); // the message object itself is a fresh copy
+      expect(history[1].parts).not.toBe(toolResult.message!.parts); // and so is its parts array
+      const response = history[1].parts![0] as {
+        functionResponse: { response: typeof largePayload };
+      };
+      expect(response.functionResponse.response).toBe(largePayload); // the large payload is shared by reference
+    });
+
     it('merges mid-turn user messages into the preceding tool result on resume', () => {
       const assistantWithToolCall: ChatRecord = {
         uuid: 'a2',
diff --git a/packages/core/src/services/sessionService.ts b/packages/core/src/services/sessionService.ts
index 3ccf2152fa..ffd0d0c721 100644
--- a/packages/core/src/services/sessionService.ts
+++ b/packages/core/src/services/sessionService.ts
@@ -1191,10 +1191,38 @@ function stripThoughtsFromContent(content: Content): Content | null {
   };
 }
 
+/**
+ * Copies one history entry for the resumed API history without a deep clone.
+ *
+ * Fresh objects are created for the message, its `parts` array, each part,
+ * and the `functionCall` / `functionResponse` wrappers; `functionCall.args`
+ * is copied one level deep. Anything nested below that, including
+ * `functionResponse.response`, is still shared with the input.
+ * @param content - stored message to copy; it is not modified.
+ * @returns a new Content with the copied parts.
+ */
+function copyContentForApiHistory(content: Content): Content {
+  const parts = content.parts?.map((part) => {
+    const copy = { ...part };
+    if ('functionCall' in part && part.functionCall) {
+      const { args } = part.functionCall;
+      copy.functionCall = {
+        ...part.functionCall,
+        args: args ? { ...args } : args,
+      };
+    } else if ('functionResponse' in part && part.functionResponse) {
+      copy.functionResponse = { ...part.functionResponse };
+    }
+    return copy;
+  });
+
+  return { ...content, parts };
+}
+
 function appendApiHistoryRecord(history: Content[], record: ChatRecord): void {
   if (!record.message) return;
 
-  const message = structuredClone(record.message as Content);
+  const message = copyContentForApiHistory(record.message as Content);
   if (record.subtype === 'mid_turn_user_message') {
     const previous = history.at(-1);
     if (previous?.role === 'user') {
@@ -1240,7 +1268,9 @@ export function buildApiHistoryFromConversation(
   });
 
   if (compressedHistory && lastCompressionIndex >= 0) {
-    const baseHistory: Content[] = structuredClone(compressedHistory);
+    const baseHistory: Content[] = compressedHistory.map(
+      copyContentForApiHistory,
+    );
 
     // Append everything after the compression record (newer turns)
     for (let i = lastCompressionIndex + 1; i < messages.length; i++) {
diff --git a/packages/core/src/tools/agent/agent.ts b/packages/core/src/tools/agent/agent.ts
index ba871d3c4a..05f8cc2bd3 100644
--- a/packages/core/src/tools/agent/agent.ts
+++ b/packages/core/src/tools/agent/agent.ts
@@ -960,7 +960,10 @@ class AgentToolInvocation extends BaseToolInvocation<AgentParams, ToolResult> {
     toolConfig: ToolConfig;
   }> {
     const geminiClient = this.config.getGeminiClient();
-    const rawHistory = geminiClient ? geminiClient.getHistory(true) : [];
+    const rawHistory = geminiClient
+      ? (geminiClient.getHistoryShallow?.(true) ??
+        geminiClient.getHistory(true))
+      : [];
 
     // Build the history that will seed the fork's chat. Must end with a
     // model message so agent-headless can send the task_prompt as a user
diff --git a/packages/core/src/utils/forkedAgent.ts b/packages/core/src/utils/forkedAgent.ts
index c9c56ef936..f8b13ebf43 100644
--- a/packages/core/src/utils/forkedAgent.ts
+++ b/packages/core/src/utils/forkedAgent.ts
@@ -66,7 +66,7 @@ import {
 export interface CacheSafeParams {
   /** Full generation config including systemInstruction and tools */
   generationConfig: GenerateContentConfig;
-  /** Curated conversation history (deep clone) */
+  /** Curated conversation history (shallow copy; consumers must not mutate) */
   history: Content[];
   /** Model identifier */
   model: string;
diff --git a/packages/core/src/utils/memoryDiagnostics.test.ts b/packages/core/src/utils/memoryDiagnostics.test.ts
index 0e7c3de4a0..5a1daa24e3 100644
--- a/packages/core/src/utils/memoryDiagnostics.test.ts
+++ b/packages/core/src/utils/memoryDiagnostics.test.ts
@@ -83,6 +83,9 @@ describe('collectMemoryDiagnostics', () => {
       activeRequests: () => 3,
       openFileDescriptors: async () => 501,
       smapsRollup: async () => 'Rss: 5000 kB',
+      processTree: async () => {
+        throw new Error('not available');
+      },
       platform: 'linux',
       nodeVersion: 'v20.19.0',
     });
@@ -117,10 +120,13 @@ describe('collectMemoryDiagnostics', () => {
         },
       ],
       resourceUsage: {
-        maxRSS: 6,
+        maxRSS: 6 * 1024,
+        maxRSSRaw: 6,
+        maxRSSUnit: 'KiB',
         userCPUTime: 10,
         systemCPUTime: 20,
       },
+      processTree: null,
       activeHandles: 300,
       activeRequests: 3,
       openFileDescriptors: 501,
@@ -226,7 +232,7 @@ describe('collectMemoryDiagnostics', () => {
     );
   });
 
-  it('treats maxRSS as bytes on all platforms', async () => {
+  it('normalizes resourceUsage maxRSS from KiB to bytes', async () => {
     const diagnostics = await collectMemoryDiagnostics({
       memoryUsage: () => ({
         heapUsed: 100,
@@ -273,8 +279,70 @@ describe('collectMemoryDiagnostics', () => {
       nodeVersion: 'v20.19.0',
     });
 
-    // Node.js >=14.10.0 returns maxRSS in bytes on all platforms.
-    expect(diagnostics.resourceUsage.maxRSS).toBe(4_096);
+    expect(diagnostics.resourceUsage.maxRSS).toBe(4_096 * 1024);
+    expect(diagnostics.resourceUsage.maxRSSRaw).toBe(4_096);
+    expect(diagnostics.resourceUsage.maxRSSUnit).toBe('KiB');
+  });
+
+  it('includes process tree RSS when the optional probe is available', async () => {
+    const diagnostics = await collectMemoryDiagnostics({
+      memoryUsage: () => ({
+        heapUsed: 100,
+        heapTotal: 200,
+        rss: 300,
+        external: 10,
+        arrayBuffers: 5,
+      }),
+      heapStatistics: () => ({
+        heap_size_limit: 1_000,
+        total_heap_size: 200,
+        total_heap_size_executable: 0,
+        total_physical_size: 200,
+        used_heap_size: 100,
+        malloced_memory: 0,
+        peak_malloced_memory: 0,
+        does_zap_garbage: 0,
+        number_of_native_contexts: 1,
+        number_of_detached_contexts: 0,
+        total_available_size: 900,
+        total_global_handles_size: 0,
+        used_global_handles_size: 0,
+        external_memory: 10,
+      }),
+      resourceUsage: () => ({
+        userCPUTime: 10,
+        systemCPUTime: 20,
+        maxRSS: 4_096,
+        sharedMemorySize: 0,
+        unsharedDataSize: 0,
+        unsharedStackSize: 0,
+        minorPageFault: 0,
+        majorPageFault: 0,
+        swappedOut: 0,
+        fsRead: 0,
+        fsWrite: 0,
+        ipcSent: 0,
+        ipcReceived: 0,
+        signalsCount: 0,
+        voluntaryContextSwitches: 0,
+        involuntaryContextSwitches: 0,
+      }),
+      processTree: async () => ({ // injected probe: used instead of the built-in `ps`-based collector
+        rootPid: 123,
+        processCount: 3,
+        rootRSS: 10 * 1024 * 1024,
+        treeRSS: 25 * 1024 * 1024,
+      }),
+      platform: 'darwin',
+      nodeVersion: 'v20.19.0',
+    });
+
+    expect(diagnostics.processTree).toEqual({ // the probe result is surfaced unchanged
+      rootPid: 123,
+      processCount: 3,
+      rootRSS: 10 * 1024 * 1024,
+      treeRSS: 25 * 1024 * 1024,
+    });
   });
 
   it('treats unsupported optional probes as unavailable instead of failing', async () => {
diff --git a/packages/core/src/utils/memoryDiagnostics.ts b/packages/core/src/utils/memoryDiagnostics.ts
index 2ebe3c88e6..e5f3718b61 100644
--- a/packages/core/src/utils/memoryDiagnostics.ts
+++ b/packages/core/src/utils/memoryDiagnostics.ts
@@ -5,7 +5,9 @@
  */
 
 import { readdir, readFile } from 'node:fs/promises';
+import { execFile } from 'node:child_process';
 import process from 'node:process';
+import { promisify } from 'node:util';
 import v8 from 'node:v8';
 import { createDebugLogger } from './debugLogger.js';
 import { formatMemoryUsage } from './formatters.js';
@@ -20,6 +22,7 @@ const ACTIVE_HANDLES_THRESHOLD = 256;
 const ACTIVE_REQUESTS_THRESHOLD = 100;
 const OPEN_FD_THRESHOLD = 500;
 const debugLogger = createDebugLogger('MEMORY_DIAGNOSTICS');
+const execFileAsync = promisify(execFile);
 
 export interface MemoryDiagnostics {
   timestamp: string;
@@ -30,6 +33,7 @@ export interface MemoryDiagnostics {
   v8HeapStats: V8HeapStats;
   v8HeapSpaces: V8HeapSpaceStats[] | null;
   resourceUsage: MemoryResourceUsage;
+  processTree: ProcessTreeMemoryUsage | null;
   activeHandles: number;
   activeRequests: number;
   openFileDescriptors: number | null;
@@ -57,11 +61,21 @@ export interface V8HeapSpaceStats {
 }
 
 export interface MemoryResourceUsage {
+  /** Normalized bytes. Node/resourceUsage reports maxRSS in KiB. */
   maxRSS: number;
+  maxRSSRaw: number; // untouched value read from resourceUsage().maxRSS
+  maxRSSUnit: 'KiB'; // unit of maxRSSRaw
   userCPUTime: number;
   systemCPUTime: number;
 }
 
+export interface ProcessTreeMemoryUsage {
+  rootPid: number; // pid the tree walk starts from (process.pid in the built-in probe)
+  processCount: number; // processes reached from the root that had a `ps` row
+  rootRSS: number; // RSS of the root process, in bytes
+  treeRSS: number; // RSS summed over every counted process, in bytes
+}
+
 export interface MemoryDiagnosticsAnalysis {
   risks: MemoryRisk[];
   recommendation: string;
@@ -92,6 +106,7 @@ export interface MemoryDiagnosticsOptions {
   activeRequests?: () => number;
   openFileDescriptors?: () => Promise<number>;
   smapsRollup?: () => Promise<string>;
+  processTree?: () => Promise<ProcessTreeMemoryUsage>;
   platform?: NodeJS.Platform;
   nodeVersion?: string;
 }
@@ -114,7 +129,7 @@ export async function collectMemoryDiagnostics(
   const heapStatistics = options.heapStatistics?.() ?? v8.getHeapStatistics();
   const resourceUsage = options.resourceUsage?.() ?? process.resourceUsage();
   const uptimeSeconds = options.uptimeSeconds?.() ?? process.uptime();
-  const [openFileDescriptors, smapsRollup, heapSpaceStatistics] =
+  const [openFileDescriptors, smapsRollup, heapSpaceStatistics, processTree] =
     await Promise.all([
       optionalProbe(
         'openFileDescriptors',
@@ -125,12 +140,15 @@ export async function collectMemoryDiagnostics(
         'heapSpaceStatistics',
         options.heapSpaceStatistics ?? (() => v8.getHeapSpaceStatistics()),
       ),
+      optionalProbe(
+        'processTree',
+        options.processTree ?? (() => collectProcessTreeMemoryUsage(platform)),
+      ),
     ]);
   const v8HeapSpaces = mapHeapSpaces(heapSpaceStatistics);
 
-  // Node.js >=14.10.0 returns maxRSS in bytes on all platforms.
-  // This project requires Node >=22.
-  const maxRSSBytes = resourceUsage.maxRSS;
+  const maxRSSRaw = resourceUsage.maxRSS;
+  const maxRSSBytes = normalizeMaxRSSBytes(maxRSSRaw);
 
   const diagnostics = {
     timestamp: now().toISOString(),
@@ -142,9 +160,12 @@ export async function collectMemoryDiagnostics(
     v8HeapSpaces,
     resourceUsage: {
       maxRSS: maxRSSBytes,
+      maxRSSRaw,
+      maxRSSUnit: 'KiB' as const,
       userCPUTime: resourceUsage.userCPUTime,
       systemCPUTime: resourceUsage.systemCPUTime,
     },
+    processTree,
     activeHandles: getProcessInternalCount(
       'activeHandles',
       '_getActiveHandles',
@@ -167,6 +188,10 @@ export async function collectMemoryDiagnostics(
   };
 }
 
+function normalizeMaxRSSBytes(maxRSSKiB: number): number {
+  return maxRSSKiB * 1024; // KiB -> bytes
+}
+
 function mapHeapStats(heapInfo: v8.HeapInfo): V8HeapStats {
   return {
     heapSizeLimit: heapInfo.heap_size_limit,
@@ -233,6 +258,85 @@ async function readProcSmapsRollup(): Promise<string> {
   return readFile('/proc/self/smaps_rollup', 'utf8');
 }
 
+/**
+ * Sums resident memory over this process and its descendants using one `ps`
+ * snapshot; RSS is converted from the KiB that `ps` prints to bytes.
+ * @throws on win32, or when `ps` cannot be spawned, fails, or times out.
+ */
+async function collectProcessTreeMemoryUsage(
+  platform: NodeJS.Platform,
+): Promise<ProcessTreeMemoryUsage> {
+  if (platform === 'win32') {
+    throw new Error('process tree RSS probe is unavailable on win32');
+  }
+
+  const psRun = execFileAsync('ps', ['-axo', 'pid=,ppid=,rss='], {
+    maxBuffer: 1024 * 1024,
+    timeout: 5000,
+  });
+  // `ps` runs as our own child and lists itself. Remember its pid so the
+  // probe does not inflate the tree it is measuring.
+  const probePid = psRun.child?.pid;
+  const { stdout } = await psRun;
+  const rootPid = process.pid;
+  const rowsByPid = new Map<number, PsRow>();
+  const childPidsByParent = new Map<number, number[]>();
+  for (const row of parsePsRows(stdout)) {
+    if (row.pid === probePid) {
+      continue;
+    }
+    rowsByPid.set(row.pid, row);
+    const siblings = childPidsByParent.get(row.ppid) ?? [];
+    siblings.push(row.pid);
+    childPidsByParent.set(row.ppid, siblings);
+  }
+
+  // Breadth-first walk by cursor; `seen` stops malformed parent links looping.
+  const queue = [rootPid];
+  const seen = new Set<number>();
+  let treeRSS = 0;
+  let processCount = 0;
+  for (let cursor = 0; cursor < queue.length; cursor++) {
+    const pid = queue[cursor]!;
+    if (seen.has(pid)) {
+      continue;
+    }
+    seen.add(pid);
+    const row = rowsByPid.get(pid);
+    if (row) {
+      treeRSS += row.rssKiB * 1024;
+      processCount += 1;
+    }
+    for (const childPid of childPidsByParent.get(pid) ?? []) {
+      queue.push(childPid);
+    }
+  }
+  const rootRSS = (rowsByPid.get(rootPid)?.rssKiB ?? 0) * 1024;
+  return { rootPid, processCount, rootRSS, treeRSS };
+}
+
+interface PsRow {
+  pid: number; // first column of the `ps` row (`pid=`)
+  ppid: number; // second column (`ppid=`), the parent pid
+  rssKiB: number; // third column (`rss=`); the tree walk multiplies it by 1024 to get bytes
+}
+
+/**
+ * Parses `ps` output that carries one `pid ppid rss` row per line.
+ * Columns are split on whitespace; rows where any of the three values is
+ * missing or not a finite number are dropped.
+ */
+function parsePsRows(output: string): PsRow[] {
+  const rows: PsRow[] = [];
+  for (const line of output.trim().split(/\r?\n/)) {
+    const [pid, ppid, rssKiB] = line.trim().split(/\s+/).map(Number);
+    if ([pid, ppid, rssKiB].every((value) => Number.isFinite(value))) {
+      rows.push({ pid, ppid, rssKiB });
+    }
+  }
+  return rows;
+}
+
 async function optionalProbe<T>(
   name: string,
   probe: () => Promise<T>,
diff --git a/packages/core/src/utils/nextSpeakerChecker.test.ts b/packages/core/src/utils/nextSpeakerChecker.test.ts
index 5ccb9dd434..451f38ee94 100644
--- a/packages/core/src/utils/nextSpeakerChecker.test.ts
+++ b/packages/core/src/utils/nextSpeakerChecker.test.ts
@@ -88,6 +88,7 @@ describe('checkNextSpeaker', () => {
 
     // Spy on getHistory for chatInstance
     vi.spyOn(chatInstance, 'getHistory');
+    vi.spyOn(chatInstance, 'getHistoryTail');
     vi.spyOn(chatInstance, 'getLastHistoryEntry');
   });
 
@@ -97,6 +98,9 @@ describe('checkNextSpeaker', () => {
 
   function mockChatHistory(history: Content[]): void {
     vi.mocked(chatInstance.getHistory).mockReturnValue(history);
+    vi.mocked(chatInstance.getHistoryTail).mockReturnValue(
+      history.length > 0 ? [structuredClone(history[history.length - 1]!)] : [],
+    );
     vi.mocked(chatInstance.getLastHistoryEntry).mockReturnValue(
       history.length > 0
         ? structuredClone(history[history.length - 1]!)
@@ -279,8 +283,36 @@ describe('checkNextSpeaker', () => {
     expect(generateJsonCall[0].promptId).toBe(promptId);
   });
 
+  it('should send only the last curated model message to the side query', async () => {
+    const oldHistory: Content[] = [ // bulky earlier turns that must not reach the side query
+      { role: 'user', parts: [{ text: 'old user context'.repeat(1000) }] },
+      { role: 'model', parts: [{ text: 'old model context'.repeat(1000) }] },
+    ];
+    const lastModelMessage: Content = {
+      role: 'model',
+      parts: [{ text: 'Some model output.' }],
+    };
+    mockChatHistory([...oldHistory, lastModelMessage]);
+    (mockBaseLlmClient.generateJson as Mock).mockResolvedValue({
+      reasoning: 'Model made a statement, awaiting user input.',
+      next_speaker: 'user',
+    } satisfies NextSpeakerResponse);
+
+    await checkNextSpeaker(chatInstance, mockConfig, abortSignal, promptId);
+
+    const generateJsonCall = (mockBaseLlmClient.generateJson as Mock).mock
+      .calls[0];
+    expect(generateJsonCall[0].contents).toHaveLength(2); // last model message + the check prompt
+    expect(generateJsonCall[0].contents[0]).toEqual(lastModelMessage);
+    expect(generateJsonCall[0].contents[1]).toMatchObject({
+      role: 'user',
+    });
+    expect(chatInstance.getHistory).not.toHaveBeenCalled();
+    expect(chatInstance.getHistoryTail).toHaveBeenCalledWith(1, true); // one entry, curated
+  });
+
   it('should use raw last history entry to detect function responses', async () => {
-    vi.mocked(chatInstance.getHistory).mockReturnValue([
+    vi.mocked(chatInstance.getHistoryTail).mockReturnValue([
       {
         role: 'model',
         parts: [{ functionCall: { name: 'read_file', args: {} } }],
@@ -310,7 +342,8 @@ describe('checkNextSpeaker', () => {
         'The last message was a function response, so the model should speak next.',
       next_speaker: 'model',
     });
-    expect(chatInstance.getHistory).toHaveBeenCalledWith(true);
+    expect(chatInstance.getHistory).not.toHaveBeenCalled();
+    expect(chatInstance.getHistoryTail).not.toHaveBeenCalled();
     expect(chatInstance.getLastHistoryEntry).toHaveBeenCalledTimes(1);
     expect(mockBaseLlmClient.generateJson).not.toHaveBeenCalled();
   });
@@ -327,8 +360,9 @@ describe('checkNextSpeaker', () => {
 
     await checkNextSpeaker(chatInstance, mockConfig, abortSignal, promptId);
 
-    expect(chatInstance.getHistory).toHaveBeenCalledTimes(1);
-    expect(chatInstance.getHistory).toHaveBeenCalledWith(true);
+    expect(chatInstance.getHistory).not.toHaveBeenCalled();
+    expect(chatInstance.getHistoryTail).toHaveBeenCalledTimes(1);
+    expect(chatInstance.getHistoryTail).toHaveBeenCalledWith(1, true);
     expect(chatInstance.getLastHistoryEntry).toHaveBeenCalledTimes(1);
   });
 });
diff --git a/packages/core/src/utils/nextSpeakerChecker.ts b/packages/core/src/utils/nextSpeakerChecker.ts
index c36a8eb9ac..33b4e2f06c 100644
--- a/packages/core/src/utils/nextSpeakerChecker.ts
+++ b/packages/core/src/utils/nextSpeakerChecker.ts
@@ -48,23 +48,9 @@ export async function checkNextSpeaker(
   abortSignal: AbortSignal,
   promptId: string,
 ): Promise<NextSpeakerResponse | null> {
-  // We need to capture the curated history because there are many moments when the model will return invalid turns
-  // that when passed back up to the endpoint will break subsequent calls. An example of this is when the model decides
-  // to respond with an empty part collection if you were to send that message back to the server it will respond with
-  // a 400 indicating that model part collections MUST have content.
-  const curatedHistory = chat.getHistory(/* curated */ true);
-
-  // Ensure there's a model response to analyze
-  if (curatedHistory.length === 0) {
-    // Cannot determine next speaker if history is empty.
-    return null;
-  }
-
   // Read the last raw history entry by design: functionResponse turns can be
   // stripped from curated history, but they are decisive for next-speaker flow.
   const lastComprehensiveMessage = chat.getLastHistoryEntry();
-  // Raw history can still be empty even if the curated-history guard above is
-  // the normal empty-chat path, so keep this defensive check local.
   if (!lastComprehensiveMessage) {
     return null;
   }
@@ -94,7 +80,10 @@ export async function checkNextSpeaker(
 
   // Things checked out. Let's proceed to potentially making an LLM request.
 
-  const lastMessage = curatedHistory[curatedHistory.length - 1];
+  // The next-speaker prompt only analyzes the immediately preceding response.
+  // Keep the side query and its structuredClone cost bounded to that one
+  // curated message rather than cloning and sending the entire chat history.
+  const [lastMessage] = chat.getHistoryTail(1, /* curated */ true);
   if (!lastMessage || lastMessage.role !== 'model') {
     // Cannot determine next speaker if the last turn wasn't from the model
     // or if history is empty.
@@ -102,7 +91,7 @@ export async function checkNextSpeaker(
   }
 
   const contents: Content[] = [
-    ...curatedHistory,
+    lastMessage,
     { role: 'user', parts: [{ text: CHECK_PROMPT }] },
   ];
 
diff --git a/packages/core/src/utils/runtimeDiagnostics.test.ts b/packages/core/src/utils/runtimeDiagnostics.test.ts
new file mode 100644
index 0000000000..cff3de3c33
--- /dev/null
+++ b/packages/core/src/utils/runtimeDiagnostics.test.ts
@@ -0,0 +1,237 @@
+/**
+ * @license
+ * Copyright 2026 Qwen
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect, it } from 'vitest';
+import type { GenerateContentParameters } from '@google/genai';
+import {
+  RuntimeDiagnosticsCollector,
+  summarizeAnthropicWireRequest,
+  summarizeOpenAIWireRequest,
+} from './runtimeDiagnostics.js';
+
+describe('RuntimeDiagnosticsCollector', () => {
+  it('summarizes generate-content requests without retaining prompt text or tool args', () => {
+    const collector = new RuntimeDiagnosticsCollector({
+      enabled: true,
+      now: () => '2026-05-19T00:00:00.000Z',
+    });
+    const request = {
+      model: 'diagnostic-model',
+      contents: [
+        {
+          role: 'user',
+          parts: [{ text: 'secret user prompt' }],
+        },
+        {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'tool-1',
+                name: 'read_file',
+                response: { output: 'secret tool output' },
+              },
+            },
+          ],
+        },
+      ],
+      config: {
+        systemInstruction: { parts: [{ text: 'secret system prompt' }] },
+        tools: [
+          {
+            functionDeclarations: [
+              {
+                name: 'read_file',
+                description: 'Read file',
+                parametersJsonSchema: {
+                  type: 'object',
+                  properties: { path: { type: 'string' } },
+                },
+              },
+            ],
+          },
+        ],
+      },
+    } satisfies GenerateContentParameters;
+
+    collector.recordGenerateContentRequest(request, {
+      stream: true,
+      source: 'generateContentStream',
+    });
+
+    const snapshot = collector.snapshot(); // sizes and counts only; the assertions below check no payload text leaks
+    expect(snapshot.requests).toHaveLength(1);
+    expect(snapshot.requests[0]).toMatchObject({
+      index: 1,
+      source: 'generateContentStream',
+      model: 'diagnostic-model',
+      stream: true,
+      contents: {
+        count: 2,
+        roleCounts: { user: 2 },
+        partCount: 2,
+        textBytes: Buffer.byteLength('secret user prompt'), // text parts only; the tool output is not counted here
+        functionResponseCount: 1,
+        functionResponseBytes: expect.any(Number),
+      },
+      systemInstructionBytes: Buffer.byteLength('secret system prompt'),
+      tools: {
+        count: 1,
+        functionDeclarationCount: 1,
+        schemaBytes: expect.any(Number),
+      },
+    });
+    expect(JSON.stringify(snapshot)).not.toContain('secret user prompt');
+    expect(JSON.stringify(snapshot)).not.toContain('secret tool output');
+    expect(JSON.stringify(snapshot)).not.toContain('secret system prompt');
+  });
+
+  it('summarizes OpenAI wire requests by size and role only', () => {
+    const summary = summarizeOpenAIWireRequest({
+      model: 'wire-model',
+      stream: true,
+      messages: [
+        { role: 'system', content: 'secret system' },
+        { role: 'user', content: [{ type: 'text', text: 'secret user' }] },
+      ],
+      tools: [
+        {
+          type: 'function',
+          function: {
+            name: 'run_shell_command',
+            description: 'Run shell command',
+            parameters: {
+              type: 'object',
+              properties: { command: { type: 'string' } },
+            },
+          },
+        },
+      ],
+    });
+
+    expect(summary).toMatchObject({
+      model: 'wire-model',
+      stream: true,
+      messageCount: 2,
+      messageBytesByRole: {
+        system: Buffer.byteLength('secret system'),
+        user: expect.any(Number),
+      },
+      toolsCount: 1,
+      toolSchemaBytes: expect.any(Number),
+      bodyBytes: expect.any(Number),
+      topLevelKeys: ['messages', 'model', 'stream', 'tools'], // sorted, unlike the order the body was written in
+    });
+    expect(JSON.stringify(summary)).not.toContain('secret system');
+    expect(JSON.stringify(summary)).not.toContain('secret user');
+  });
+
+  it('summarizes Anthropic wire requests by size and role only', () => {
+    const summary = summarizeAnthropicWireRequest({
+      model: 'anthropic-wire-model',
+      stream: true,
+      system: [{ type: 'text', text: 'secret system' }],
+      messages: [
+        { role: 'user', content: 'secret user' },
+        {
+          role: 'assistant',
+          content: [
+            {
+              type: 'tool_use',
+              id: 'tool-1',
+              name: 'run_shell_command',
+              input: { command: 'secret command' },
+            },
+          ],
+        },
+      ],
+      tools: [
+        {
+          name: 'run_shell_command',
+          description: 'Run shell command',
+          input_schema: {
+            type: 'object',
+            properties: { command: { type: 'string' } },
+          },
+        },
+      ],
+      max_tokens: 1024,
+    });
+
+    expect(summary).toMatchObject({
+      model: 'anthropic-wire-model',
+      stream: true,
+      messageCount: 2,
+      messageBytesByRole: {
+        user: Buffer.byteLength('secret user'),
+        assistant: expect.any(Number),
+      },
+      systemBytes: expect.any(Number),
+      toolsCount: 1,
+      toolSchemaBytes: expect.any(Number),
+      bodyBytes: expect.any(Number),
+      topLevelKeys: [
+        'max_tokens',
+        'messages',
+        'model',
+        'stream',
+        'system',
+        'tools',
+      ],
+    });
+    expect(JSON.stringify(summary)).not.toContain('secret system');
+    expect(JSON.stringify(summary)).not.toContain('secret user');
+    expect(JSON.stringify(summary)).not.toContain('secret command');
+  });
+
+  it('aggregates tool use and tool result sizes without retaining payloads', () => {
+    const collector = new RuntimeDiagnosticsCollector({ enabled: true });
+
+    collector.recordToolUse('read_file', { path: '/private/path.txt' });
+    collector.recordToolResult({
+      name: 'read_file',
+      callId: 'tool-1',
+      resultBytes: 2048,
+      isError: false,
+    });
+    collector.recordToolResult({
+      name: 'run_shell_command',
+      callId: 'tool-2',
+      resultBytes: 512,
+      isError: true,
+    });
+
+    const snapshot = collector.snapshot();
+    expect(snapshot.tools).toMatchObject({
+      toolUseCount: 1,
+      toolResultCount: 2,
+      toolResultErrorCount: 1,
+      totalToolUseArgBytes: expect.any(Number),
+      maxToolUseArgBytes: expect.any(Number),
+      totalToolResultBytes: 2560, // 2048 + 512
+      maxToolResultBytes: 2048,
+      byName: {
+        read_file: {
+          uses: 1,
+          argBytes: expect.any(Number),
+          maxArgBytes: expect.any(Number),
+          results: 1,
+          errors: 0,
+          resultBytes: 2048,
+          maxResultBytes: 2048,
+        },
+        run_shell_command: {
+          uses: 0, // a result recorded without a matching recordToolUse call still gets an entry
+          results: 1,
+          errors: 1,
+          resultBytes: 512,
+          maxResultBytes: 512,
+        },
+      },
+    });
+    expect(JSON.stringify(snapshot)).not.toContain('/private/path.txt');
+  });
+});
diff --git a/packages/core/src/utils/runtimeDiagnostics.ts b/packages/core/src/utils/runtimeDiagnostics.ts
new file mode 100644
index 0000000000..74f367bba9
--- /dev/null
+++ b/packages/core/src/utils/runtimeDiagnostics.ts
@@ -0,0 +1,557 @@
+/**
+ * @license
+ * Copyright 2026 Qwen
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import type { GenerateContentParameters } from '@google/genai';
+import type Anthropic from '@anthropic-ai/sdk';
+import type OpenAI from 'openai';
+
+export interface RuntimeDiagnosticsSnapshot {
+  enabled: boolean;
+  startedAt: string;
+  requests: GenerateContentRequestDiagnostics[];
+  openaiWireRequests: OpenAIWireRequestDiagnostics[];
+  anthropicWireRequests: AnthropicWireRequestDiagnostics[];
+  tools: RuntimeToolDiagnostics;
+}
+
+export interface GenerateContentRequestDiagnostics {
+  index: number;
+  timestamp: string;
+  source: 'generateContent' | 'generateContentStream';
+  model: string;
+  stream: boolean;
+  serializedBytes: number;
+  contents: RuntimeContentDiagnostics;
+  systemInstructionBytes: number;
+  generationConfigBytes: number;
+  tools: RuntimeToolSchemaDiagnostics;
+}
+
+export interface RuntimeContentDiagnostics {
+  count: number;
+  roleCounts: Record<string, number>;
+  partCount: number;
+  textBytes: number;
+  functionCallCount: number;
+  functionCallArgBytes: number;
+  functionResponseCount: number;
+  functionResponseBytes: number;
+  inlineDataCount: number;
+  inlineDataBytes: number;
+  fileDataCount: number;
+}
+
+export interface RuntimeToolSchemaDiagnostics {
+  count: number;
+  functionDeclarationCount: number;
+  schemaBytes: number;
+}
+
+export interface OpenAIWireRequestDiagnostics {
+  index?: number;
+  timestamp?: string;
+  model: string;
+  stream: boolean;
+  bodyBytes: number;
+  messageCount: number;
+  messageBytesByRole: Record<string, number>;
+  toolsCount: number;
+  toolSchemaBytes: number;
+  topLevelKeys: string[];
+}
+
+export interface AnthropicWireRequestDiagnostics {
+  index?: number;
+  timestamp?: string;
+  model: string;
+  stream: boolean;
+  bodyBytes: number;
+  messageCount: number;
+  messageBytesByRole: Record<string, number>;
+  systemBytes: number;
+  toolsCount: number;
+  toolSchemaBytes: number;
+  topLevelKeys: string[];
+}
+
+export interface RuntimeToolDiagnostics {
+  toolUseCount: number;
+  toolResultCount: number;
+  toolResultErrorCount: number;
+  totalToolUseArgBytes: number;
+  maxToolUseArgBytes: number;
+  totalToolResultBytes: number;
+  maxToolResultBytes: number;
+  byName: Record<string, RuntimeToolNameDiagnostics>;
+}
+
+export interface RuntimeToolNameDiagnostics {
+  uses: number;
+  argBytes: number;
+  maxArgBytes: number;
+  results: number;
+  errors: number;
+  resultBytes: number;
+  maxResultBytes: number;
+}
+
+export interface RuntimeToolResultRecord {
+  name: string;
+  callId: string;
+  resultBytes: number;
+  isError: boolean;
+}
+
+export interface RuntimeDiagnosticsCollectorOptions {
+  enabled?: boolean;
+  now?: () => string;
+}
+
+const RUNTIME_PROFILE_ENV = 'QWEN_CODE_PROFILE_RUNTIME';
+
+export function isRuntimeDiagnosticsEnabled(
+  env: NodeJS.ProcessEnv = process.env,
+): boolean {
+  return env[RUNTIME_PROFILE_ENV] === '1';
+}
+
+export class RuntimeDiagnosticsCollector {
+  private enabled: boolean;
+  private readonly now: () => string;
+  private startedAt: string;
+  private requestIndex = 0;
+  private openAIWireRequestIndex = 0;
+  private anthropicWireRequestIndex = 0;
+  private requests: GenerateContentRequestDiagnostics[] = [];
+  private openaiWireRequests: OpenAIWireRequestDiagnostics[] = [];
+  private anthropicWireRequests: AnthropicWireRequestDiagnostics[] = [];
+  private tools: RuntimeToolDiagnostics = createInitialToolDiagnostics();
+
+  constructor(options: RuntimeDiagnosticsCollectorOptions = {}) {
+    this.enabled = options.enabled ?? isRuntimeDiagnosticsEnabled();
+    this.now = options.now ?? (() => new Date().toISOString());
+    this.startedAt = this.now();
+  }
+
+  reset(options: { enabled?: boolean } = {}): void {
+    this.enabled = options.enabled ?? isRuntimeDiagnosticsEnabled();
+    this.startedAt = this.now();
+    this.requestIndex = 0;
+    this.openAIWireRequestIndex = 0;
+    this.anthropicWireRequestIndex = 0;
+    this.requests = [];
+    this.openaiWireRequests = [];
+    this.anthropicWireRequests = [];
+    this.tools = createInitialToolDiagnostics();
+  }
+
+  isEnabled(): boolean {
+    return this.enabled;
+  }
+
+  recordGenerateContentRequest(
+    request: GenerateContentParameters,
+    options: {
+      stream: boolean;
+      source: 'generateContent' | 'generateContentStream';
+    },
+  ): void {
+    if (!this.enabled) {
+      return;
+    }
+
+    this.requestIndex += 1;
+    this.requests.push({
+      index: this.requestIndex,
+      timestamp: this.now(),
+      source: options.source,
+      model: request.model,
+      stream: options.stream,
+      serializedBytes: utf8Bytes(toJsonSafeRequest(request)),
+      contents: summarizeContents(request.contents),
+      systemInstructionBytes: summarizeContentTextBytes(
+        request.config?.systemInstruction,
+      ),
+      generationConfigBytes: utf8Bytes(toJsonSafeConfig(request.config)),
+      tools: summarizeToolSchemas(request.config?.tools),
+    });
+  }
+
+  recordOpenAIWireRequest(
+    request: OpenAI.Chat.ChatCompletionCreateParams,
+  ): void {
+    if (!this.enabled) {
+      return;
+    }
+
+    this.openAIWireRequestIndex += 1;
+    this.openaiWireRequests.push({
+      index: this.openAIWireRequestIndex,
+      timestamp: this.now(),
+      ...summarizeOpenAIWireRequest(request),
+    });
+  }
+
+  recordAnthropicWireRequest(
+    request:
+      | Anthropic.MessageCreateParamsNonStreaming
+      | Anthropic.MessageCreateParamsStreaming,
+  ): void {
+    if (!this.enabled) {
+      return;
+    }
+
+    this.anthropicWireRequestIndex += 1;
+    this.anthropicWireRequests.push({
+      index: this.anthropicWireRequestIndex,
+      timestamp: this.now(),
+      ...summarizeAnthropicWireRequest(request),
+    });
+  }
+
+  recordToolUse(name: string, args: unknown): void {
+    if (!this.enabled) {
+      return;
+    }
+
+    const argBytes = utf8Bytes(args);
+    const tool = this.getToolNameDiagnostics(name);
+    tool.uses += 1;
+    tool.argBytes += argBytes;
+    tool.maxArgBytes = Math.max(tool.maxArgBytes, argBytes);
+    this.tools.toolUseCount += 1;
+    this.tools.totalToolUseArgBytes += argBytes;
+    this.tools.maxToolUseArgBytes = Math.max(
+      this.tools.maxToolUseArgBytes,
+      argBytes,
+    );
+  }
+
+  recordToolResult(record: RuntimeToolResultRecord): void {
+    if (!this.enabled) {
+      return;
+    }
+
+    const tool = this.getToolNameDiagnostics(record.name);
+    tool.results += 1;
+    tool.resultBytes += record.resultBytes;
+    tool.maxResultBytes = Math.max(tool.maxResultBytes, record.resultBytes);
+    if (record.isError) {
+      tool.errors += 1;
+      this.tools.toolResultErrorCount += 1;
+    }
+    this.tools.toolResultCount += 1;
+    this.tools.totalToolResultBytes += record.resultBytes;
+    this.tools.maxToolResultBytes = Math.max(
+      this.tools.maxToolResultBytes,
+      record.resultBytes,
+    );
+  }
+
+  snapshot(): RuntimeDiagnosticsSnapshot {
+    return {
+      enabled: this.enabled,
+      startedAt: this.startedAt,
+      requests: this.requests.map((request) => ({
+        ...request,
+        contents: {
+          ...request.contents,
+          roleCounts: { ...request.contents.roleCounts },
+        },
+        tools: { ...request.tools },
+      })),
+      openaiWireRequests: this.openaiWireRequests.map((request) => ({
+        ...request,
+        messageBytesByRole: { ...request.messageBytesByRole },
+        topLevelKeys: [...request.topLevelKeys],
+      })),
+      anthropicWireRequests: this.anthropicWireRequests.map((request) => ({
+        ...request,
+        messageBytesByRole: { ...request.messageBytesByRole },
+        topLevelKeys: [...request.topLevelKeys],
+      })),
+      tools: {
+        ...this.tools,
+        byName: Object.fromEntries(
+          Object.entries(this.tools.byName).map(([name, value]) => [
+            name,
+            { ...value },
+          ]),
+        ),
+      },
+    };
+  }
+
+  private getToolNameDiagnostics(name: string): RuntimeToolNameDiagnostics {
+    const existing = this.tools.byName[name];
+    if (existing) {
+      return existing;
+    }
+    const created = createInitialToolNameDiagnostics();
+    this.tools.byName[name] = created;
+    return created;
+  }
+}
+
+export const runtimeDiagnostics = new RuntimeDiagnosticsCollector();
+
+/**
+ * Reduces an OpenAI chat-completions request body to sizes and counts.
+ * Message content and tool schemas are measured, never stored; a message
+ * whose `role` is not a string is grouped under 'unknown'.
+ *
+ * @param request Request body to measure.
+ * @returns Byte and count figures; `index`/`timestamp` are left unset.
+ */
+export function summarizeOpenAIWireRequest(
+  request: OpenAI.Chat.ChatCompletionCreateParams,
+): OpenAIWireRequestDiagnostics {
+  const body = asRecord(request);
+  const messages = Array.isArray(body['messages']) ? body['messages'] : [];
+  const tools = Array.isArray(body['tools']) ? body['tools'] : [];
+  // Tally in a Map: on a plain object a role such as 'constructor' or
+  // '__proto__' would read an inherited member and corrupt the running sum.
+  const bytesByRole = new Map<string, number>();
+  for (const message of messages) {
+    const { role, content } = asRecord(message);
+    const key = typeof role === 'string' ? role : 'unknown';
+    bytesByRole.set(key, (bytesByRole.get(key) ?? 0) + utf8Bytes(content));
+  }
+
+  return {
+    model: typeof body['model'] === 'string' ? body['model'] : 'unknown',
+    stream: body['stream'] === true,
+    bodyBytes: utf8Bytes(request),
+    messageCount: messages.length,
+    // fromEntries defines own properties, so the result stays a plain object.
+    messageBytesByRole: Object.fromEntries(bytesByRole),
+    toolsCount: tools.length,
+    toolSchemaBytes: utf8Bytes(tools),
+    topLevelKeys: Object.keys(body).sort(),
+  };
+}
+
+/**
+ * Reduces an Anthropic Messages request body to sizes and counts.
+ * Message content, the system prompt and tool schemas are measured, never
+ * stored; a message whose `role` is not a string is grouped under 'unknown'.
+ *
+ * @param request Streaming or non-streaming request body to measure.
+ * @returns Byte and count figures; `index`/`timestamp` are left unset.
+ */
+export function summarizeAnthropicWireRequest(
+  request:
+    | Anthropic.MessageCreateParamsNonStreaming
+    | Anthropic.MessageCreateParamsStreaming,
+): AnthropicWireRequestDiagnostics {
+  const body = asRecord(request);
+  const messages = Array.isArray(body['messages']) ? body['messages'] : [];
+  const tools = Array.isArray(body['tools']) ? body['tools'] : [];
+  // Tally in a Map: on a plain object a role such as 'constructor' or
+  // '__proto__' would read an inherited member and corrupt the running sum.
+  const bytesByRole = new Map<string, number>();
+  for (const message of messages) {
+    const { role, content } = asRecord(message);
+    const key = typeof role === 'string' ? role : 'unknown';
+    bytesByRole.set(key, (bytesByRole.get(key) ?? 0) + utf8Bytes(content));
+  }
+
+  return {
+    model: typeof body['model'] === 'string' ? body['model'] : 'unknown',
+    stream: body['stream'] === true,
+    bodyBytes: utf8Bytes(request),
+    messageCount: messages.length,
+    // fromEntries defines own properties, so the result stays a plain object.
+    messageBytesByRole: Object.fromEntries(bytesByRole),
+    systemBytes: utf8Bytes(body['system']),
+    toolsCount: tools.length,
+    toolSchemaBytes: utf8Bytes(tools),
+    topLevelKeys: Object.keys(body).sort(),
+  };
+}
+
+function createInitialToolDiagnostics(): RuntimeToolDiagnostics {
+  return {
+    toolUseCount: 0,
+    toolResultCount: 0,
+    toolResultErrorCount: 0,
+    totalToolUseArgBytes: 0,
+    maxToolUseArgBytes: 0,
+    totalToolResultBytes: 0,
+    maxToolResultBytes: 0,
+    byName: Object.create(null) as Record<string, RuntimeToolNameDiagnostics>,
+  };
+}
+
+function createInitialToolNameDiagnostics(): RuntimeToolNameDiagnostics {
+  return {
+    uses: 0,
+    argBytes: 0,
+    maxArgBytes: 0,
+    results: 0,
+    errors: 0,
+    resultBytes: 0,
+    maxResultBytes: 0,
+  };
+}
+
+/**
+ * Tallies roles, parts and payload byte sizes for a `contents` value.
+ * Accepts an array, a single item, or null/undefined (treated as empty).
+ */
+function summarizeContents(contents: unknown): RuntimeContentDiagnostics {
+  let items: unknown[] = [];
+  if (Array.isArray(contents)) {
+    items = contents;
+  } else if (contents !== undefined && contents !== null) {
+    items = [contents];
+  }
+  const summary: RuntimeContentDiagnostics = {
+    count: items.length,
+    roleCounts: {},
+    partCount: 0,
+    textBytes: 0,
+    functionCallCount: 0,
+    functionCallArgBytes: 0,
+    functionResponseCount: 0,
+    functionResponseBytes: 0,
+    inlineDataCount: 0,
+    inlineDataBytes: 0,
+    fileDataCount: 0,
+  };
+  // Tally in a Map: on a plain object a role such as 'constructor' would
+  // read an inherited member instead of a count.
+  const roles = new Map<string, number>();
+  for (const item of items) {
+    if (typeof item === 'string') {
+      // A bare string is one 'user' entry holding a single text part.
+      roles.set('user', (roles.get('user') ?? 0) + 1);
+      summary.partCount += 1;
+      summary.textBytes += utf8Bytes(item);
+      continue;
+    }
+    const { role, parts } = asRecord(item);
+    const key = typeof role === 'string' ? role : 'unknown';
+    roles.set(key, (roles.get(key) ?? 0) + 1);
+    summarizeParts(Array.isArray(parts) ? parts : [], summary);
+  }
+  summary.roleCounts = Object.fromEntries(roles);
+  return summary;
+}
+
+/** Text bytes only: runs the full content summary and keeps `textBytes`. */
+function summarizeContentTextBytes(content: unknown): number {
+  return summarizeContents(content).textBytes;
+}
+
+/**
+ * Adds each part's figures to `summary` in place; every payload kind found
+ * on a part is counted, and a bare string part counts as text.
+ */
+function summarizeParts(
+  parts: unknown[],
+  summary: RuntimeContentDiagnostics,
+): void {
+  summary.partCount += parts.length;
+  for (const part of parts) {
+    if (typeof part === 'string') {
+      summary.textBytes += utf8Bytes(part);
+      continue;
+    }
+    const fields = asRecord(part);
+    const call = asOptionalRecord(fields['functionCall']);
+    const response = asOptionalRecord(fields['functionResponse']);
+    const inline = asOptionalRecord(fields['inlineData']);
+    if (typeof fields['text'] === 'string') {
+      summary.textBytes += utf8Bytes(fields['text']);
+    }
+    if (call) {
+      summary.functionCallCount += 1;
+      summary.functionCallArgBytes += utf8Bytes(call['args']);
+    }
+    if (response) {
+      summary.functionResponseCount += 1;
+      summary.functionResponseBytes +=
+        utf8Bytes(response['response']) + utf8Bytes(response['parts']);
+    }
+    if (inline) {
+      summary.inlineDataCount += 1;
+      summary.inlineDataBytes += utf8Bytes(inline['data']);
+    }
+    summary.fileDataCount += fields['fileData'] ? 1 : 0;
+  }
+}
+
+/**
+ * Counts tool entries and their function declarations, plus the JSON size.
+ */
+function summarizeToolSchemas(tools: unknown): RuntimeToolSchemaDiagnostics {
+  const entries: unknown[] = Array.isArray(tools) ? tools : [];
+  // Entries without a `functionDeclarations` array contribute zero.
+  const functionDeclarationCount = entries.reduce<number>((total, entry) => {
+    const declarations = asRecord(entry)['functionDeclarations'];
+    return total + (Array.isArray(declarations) ? declarations.length : 0);
+  }, 0);
+  return {
+    count: entries.length,
+    functionDeclarationCount,
+    schemaBytes: utf8Bytes(entries),
+  };
+}
+
+function toJsonSafeRequest(request: GenerateContentParameters): unknown {
+  return {
+    model: request.model,
+    contents: request.contents,
+    config: toJsonSafeConfig(request.config),
+  };
+}
+
+/** Shallow copy of `config` minus `abortSignal`; undefined if absent. */
+function toJsonSafeConfig(
+  config: GenerateContentParameters['config'],
+): unknown {
+  if (!config) {
+    return undefined;
+  }
+  const source = asRecord(config);
+  const copy: Record<string, unknown> = {};
+  for (const key of Object.keys(source)) {
+    if (key !== 'abortSignal') {
+      copy[key] = source[key];
+    }
+  }
+  return copy;
+}
+
+/** UTF-8 size of `value`: 0 for null/undefined, strings as-is, else JSON. */
+function utf8Bytes(value: unknown): number {
+  if (value === undefined || value === null) {
+    return 0;
+  }
+  const text = typeof value === 'string' ? value : safeStringify(value);
+  return Buffer.byteLength(text, 'utf8');
+}
+
+/** JSON text of `value`: '' if it has none, a marker if stringify throws. */
+function safeStringify(value: unknown): string {
+  try {
+    return JSON.stringify(value) ?? '';
+  } catch {
+    return '[unserializable]';
+  }
+}
+
+/** Views any non-null object as a string-keyed record; otherwise `{}`. */
+function asRecord(value: unknown): Record<string, unknown> {
+  const isObject = value !== null && typeof value === 'object';
+  return isObject ? (value as Record<string, unknown>) : {};
+}
+
+/** Views any non-null object as a string-keyed record; otherwise null. */
+function asOptionalRecord(value: unknown): Record<string, unknown> | null {
+  const isObject = value !== null && typeof value === 'object';
+  return isObject ? (value as Record<string, unknown>) : null;
+}
diff --git a/packages/vscode-ide-companion/src/diff-manager.ts b/packages/vscode-ide-companion/src/diff-manager.ts
index 755143a4e4..ccabe3657e 100644
--- a/packages/vscode-ide-companion/src/diff-manager.ts
+++ b/packages/vscode-ide-companion/src/diff-manager.ts
@@ -7,7 +7,7 @@
 import {
   IdeDiffAcceptedNotificationSchema,
   IdeDiffClosedNotificationSchema,
-} from '@qwen-code/qwen-code-core/src/ide/types.js';
+} from '@qwen-code/qwen-code-core';
 import { type JSONRPCNotification } from '@modelcontextprotocol/sdk/types.js';
 import * as path from 'node:path';
 import * as vscode from 'vscode';
diff --git a/packages/vscode-ide-companion/src/extension.test.ts b/packages/vscode-ide-companion/src/extension.test.ts
index 72c3d476ef..d22062515d 100644
--- a/packages/vscode-ide-companion/src/extension.test.ts
+++ b/packages/vscode-ide-companion/src/extension.test.ts
@@ -7,18 +7,14 @@
 import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
 import * as vscode from 'vscode';
 import { activate } from './extension.js';
-import {
-  IDE_DEFINITIONS,
-  detectIdeFromEnv,
-} from '@qwen-code/qwen-code-core/src/ide/detect-ide.js';
-
-vi.mock('@qwen-code/qwen-code-core/src/ide/detect-ide.js', async () => {
-  const actual = await vi.importActual(
-    '@qwen-code/qwen-code-core/src/ide/detect-ide.js',
-  );
+import { IDE_DEFINITIONS, detectIdeFromEnv } from '@qwen-code/qwen-code-core';
+
+vi.mock('@qwen-code/qwen-code-core', async (importOriginal) => {
+  const actual =
+    await importOriginal<typeof import('@qwen-code/qwen-code-core')>();
   return {
     ...actual,
-    detectIdeFromEnv: vi.fn(() => IDE_DEFINITIONS.vscode),
+    detectIdeFromEnv: vi.fn(() => actual.IDE_DEFINITIONS.vscode),
   };
 });
 
diff --git a/packages/vscode-ide-companion/src/extension.ts b/packages/vscode-ide-companion/src/extension.ts
index 3f83a67942..56c441af61 100644
--- a/packages/vscode-ide-companion/src/extension.ts
+++ b/packages/vscode-ide-companion/src/extension.ts
@@ -13,7 +13,7 @@ import {
   detectIdeFromEnv,
   IDE_DEFINITIONS,
   type IdeInfo,
-} from '@qwen-code/qwen-code-core/src/ide/detect-ide.js';
+} from '@qwen-code/qwen-code-core';
 import { WebViewProvider } from './webview/providers/WebViewProvider.js';
 import { ChatProviderRegistry } from './webview/providers/ChatProviderRegistry.js';
 import { registerChatViewProviders } from './webview/providers/chatViewRegistration.js';
diff --git a/packages/vscode-ide-companion/src/ide-server.test.ts b/packages/vscode-ide-companion/src/ide-server.test.ts
index 9c51d50215..ee99ce105e 100644
--- a/packages/vscode-ide-companion/src/ide-server.test.ts
+++ b/packages/vscode-ide-companion/src/ide-server.test.ts
@@ -38,9 +38,17 @@ vi.mock('node:os', async (importOriginal) => {
   };
 });
 
-vi.mock('@qwen-code/qwen-code-core/src/ide/detect-ide.js', () => ({
-  detectIdeFromEnv: vi.fn(() => ({ name: 'vscode', displayName: 'VS Code' })),
-}));
+vi.mock('@qwen-code/qwen-code-core', async (importOriginal) => {
+  const actual =
+    await importOriginal<typeof import('@qwen-code/qwen-code-core')>();
+  return {
+    ...actual,
+    detectIdeFromEnv: vi.fn(() => ({
+      name: 'vscode',
+      displayName: 'VS Code',
+    })),
+  };
+});
 
 const vscodeMock = vi.hoisted(() => ({
   workspace: {
@@ -62,13 +70,6 @@ const vscodeMock = vi.hoisted(() => ({
 
 vi.mock('vscode', () => vscodeMock);
 
-vi.mock('@qwen-code/qwen-code-core/src/ide/detect-ide.js', () => ({
-  detectIdeFromEnv: vi.fn(() => ({
-    name: 'vscode',
-    displayName: 'VS Code',
-  })),
-}));
-
 vi.mock('./open-files-manager', () => {
   const OpenFilesManager = vi.fn();
   OpenFilesManager.prototype.onDidChange = vi.fn(() => ({ dispose: vi.fn() }));
diff --git a/packages/vscode-ide-companion/src/ide-server.ts b/packages/vscode-ide-companion/src/ide-server.ts
index 1122677b76..2f19fbbc92 100644
--- a/packages/vscode-ide-companion/src/ide-server.ts
+++ b/packages/vscode-ide-companion/src/ide-server.ts
@@ -7,10 +7,10 @@
 import * as vscode from 'vscode';
 import {
   CloseDiffRequestSchema,
+  detectIdeFromEnv,
   IdeContextNotificationSchema,
   OpenDiffRequestSchema,
-} from '@qwen-code/qwen-code-core/src/ide/types.js';
-import { detectIdeFromEnv } from '@qwen-code/qwen-code-core/src/ide/detect-ide.js';
+} from '@qwen-code/qwen-code-core';
 import { isInitializeRequest } from '@modelcontextprotocol/sdk/types.js';
 import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
 import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
diff --git a/packages/vscode-ide-companion/src/open-files-manager.ts b/packages/vscode-ide-companion/src/open-files-manager.ts
index ee7f595e18..30c9029ac8 100644
--- a/packages/vscode-ide-companion/src/open-files-manager.ts
+++ b/packages/vscode-ide-companion/src/open-files-manager.ts
@@ -5,10 +5,7 @@
  */
 
 import * as vscode from 'vscode';
-import type {
-  File,
-  IdeContext,
-} from '@qwen-code/qwen-code-core/src/ide/types.js';
+import type { File, IdeContext } from '@qwen-code/qwen-code-core';
 import {
   isFileUri,
   isNotebookFileUri,
diff --git a/packages/vscode-ide-companion/src/services/open-files-manager/notebook-handler.ts b/packages/vscode-ide-companion/src/services/open-files-manager/notebook-handler.ts
index 40e6637446..64907fe315 100644
--- a/packages/vscode-ide-companion/src/services/open-files-manager/notebook-handler.ts
+++ b/packages/vscode-ide-companion/src/services/open-files-manager/notebook-handler.ts
@@ -5,7 +5,7 @@
  */
 
 import * as vscode from 'vscode';
-import type { File } from '@qwen-code/qwen-code-core/src/ide/types.js';
+import type { File } from '@qwen-code/qwen-code-core';
 import { MAX_FILES, MAX_SELECTED_TEXT_LENGTH } from './constants.js';
 import {
   deactivateCurrentActiveFile,
diff --git a/packages/vscode-ide-companion/src/services/open-files-manager/text-handler.ts b/packages/vscode-ide-companion/src/services/open-files-manager/text-handler.ts
index 88853f31bf..a1e7dda5b4 100644
--- a/packages/vscode-ide-companion/src/services/open-files-manager/text-handler.ts
+++ b/packages/vscode-ide-companion/src/services/open-files-manager/text-handler.ts
@@ -5,7 +5,7 @@
  */
 
 import type * as vscode from 'vscode';
-import type { File } from '@qwen-code/qwen-code-core/src/ide/types.js';
+import type { File } from '@qwen-code/qwen-code-core';
 import { MAX_FILES, MAX_SELECTED_TEXT_LENGTH } from './constants.js';
 import {
   deactivateCurrentActiveFile,
diff --git a/packages/vscode-ide-companion/src/services/open-files-manager/utils.ts b/packages/vscode-ide-companion/src/services/open-files-manager/utils.ts
index dd4b46126a..ea59ccdbd7 100644
--- a/packages/vscode-ide-companion/src/services/open-files-manager/utils.ts
+++ b/packages/vscode-ide-companion/src/services/open-files-manager/utils.ts
@@ -5,7 +5,7 @@
  */
 
 import * as vscode from 'vscode';
-import type { File } from '@qwen-code/qwen-code-core/src/ide/types.js';
+import type { File } from '@qwen-code/qwen-code-core';
 
 export function isFileUri(uri: vscode.Uri): boolean {
   return uri.scheme === 'file';
diff --git a/packages/vscode-ide-companion/src/services/qwenSessionManager.ts b/packages/vscode-ide-companion/src/services/qwenSessionManager.ts
index a39a37ebed..34a2f1349a 100644
--- a/packages/vscode-ide-companion/src/services/qwenSessionManager.ts
+++ b/packages/vscode-ide-companion/src/services/qwenSessionManager.ts
@@ -7,7 +7,7 @@
 import * as fs from 'fs';
 import * as path from 'path';
 import * as crypto from 'crypto';
-import { getProjectHash } from '@qwen-code/qwen-code-core/src/utils/paths.js';
+import { getProjectHash } from '@qwen-code/qwen-code-core';
 import { getRuntimeBaseDir } from '../utils/paths.js';
 import type { QwenSession } from './qwenSessionReader.js';
 
diff --git a/packages/vscode-ide-companion/src/services/qwenSessionReader.ts b/packages/vscode-ide-companion/src/services/qwenSessionReader.ts
index 1b15598f97..abfdb126e0 100644
--- a/packages/vscode-ide-companion/src/services/qwenSessionReader.ts
+++ b/packages/vscode-ide-companion/src/services/qwenSessionReader.ts
@@ -8,8 +8,7 @@ import * as fs from 'fs';
 import * as path from 'path';
 import * as readline from 'readline';
 import * as crypto from 'crypto';
-import { getProjectHash } from '@qwen-code/qwen-code-core/src/utils/paths.js';
-import { getGitBranch } from '@qwen-code/qwen-code-core/src/utils/gitUtils.js';
+import { getGitBranch, getProjectHash } from '@qwen-code/qwen-code-core';
 import { getRuntimeBaseDir } from '../utils/paths.js';
 import { truncatePanelTitle } from '../webview/utils/panelTitleUtils.js';
 
diff --git a/packages/vscode-ide-companion/src/utils/acpModelInfo.ts b/packages/vscode-ide-companion/src/utils/acpModelInfo.ts
index 53d14c5bcf..120873f705 100644
--- a/packages/vscode-ide-companion/src/utils/acpModelInfo.ts
+++ b/packages/vscode-ide-companion/src/utils/acpModelInfo.ts
@@ -5,7 +5,7 @@
  */
 
 import type { ModelInfo } from '@agentclientprotocol/sdk';
-import { knownTokenLimit } from '@qwen-code/qwen-code-core/src/core/tokenLimits.js';
+import { knownTokenLimit } from '@qwen-code/qwen-code-core';
 import type { ApprovalModeValue } from '../types/approvalModeValueTypes.js';
 
 type AcpMeta = Record<string, unknown>;
diff --git a/packages/vscode-ide-companion/src/utils/editorGroupUtils.ts b/packages/vscode-ide-companion/src/utils/editorGroupUtils.ts
index 3326cd3368..53575d4884 100644
--- a/packages/vscode-ide-companion/src/utils/editorGroupUtils.ts
+++ b/packages/vscode-ide-companion/src/utils/editorGroupUtils.ts
@@ -29,7 +29,9 @@ function findNeighborGroup(
 ): vscode.ViewColumn | undefined {
   let candidate: vscode.ViewColumn | undefined;
   for (const g of vscode.window.tabGroups.all) {
-    if (!isOnSide(g.viewColumn)) continue;
+    if (!isOnSide(g.viewColumn)) {
+      continue;
+    }
     if (candidate === undefined || isCloser(candidate, g.viewColumn)) {
       candidate = g.viewColumn;
     }
diff --git a/packages/vscode-ide-companion/src/utils/imageSupport.test.ts b/packages/vscode-ide-companion/src/utils/imageSupport.test.ts
index b2b78d0ce5..b7948655d9 100644
--- a/packages/vscode-ide-companion/src/utils/imageSupport.test.ts
+++ b/packages/vscode-ide-companion/src/utils/imageSupport.test.ts
@@ -5,7 +5,7 @@
  */
 
 import { describe, expect, it } from 'vitest';
-import { SUPPORTED_IMAGE_MIME_TYPES } from '@qwen-code/qwen-code-core/src/utils/request-tokenizer/supportedImageFormats.js';
+import { SUPPORTED_IMAGE_MIME_TYPES } from '@qwen-code/qwen-code-core';
 import { SUPPORTED_PASTED_IMAGE_MIME_TYPES } from './imageSupport.js';
 
 describe('imageSupport constants', () => {
diff --git a/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.test.ts b/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.test.ts
index faeaa8f19f..3d16f841a7 100644
--- a/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.test.ts
+++ b/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.test.ts
@@ -61,24 +61,25 @@ const vscodeMock = vi.hoisted(() => {
 });
 
 vi.mock('vscode', () => vscodeMock);
-vi.mock(
-  '@qwen-code/qwen-code-core/src/services/fileDiscoveryService.js',
-  () => ({
+vi.mock('@qwen-code/qwen-code-core', async (importOriginal) => {
+  const actual =
+    await importOriginal<typeof import('@qwen-code/qwen-code-core')>();
+  return {
+    ...actual,
     FileDiscoveryService: class {
       shouldIgnoreFile(filePath: string, options?: unknown) {
         return shouldIgnoreFileMock(filePath, options);
       }
     },
-  }),
-);
-vi.mock('@qwen-code/qwen-code-core/src/utils/filesearch/fileSearch.js', () => ({
-  FileSearchFactory: {
-    create: () => fileSearchMock,
-  },
-}));
-vi.mock('@qwen-code/qwen-code-core/src/utils/filesearch/crawlCache.js', () => ({
-  clear: vi.fn(),
-}));
+    FileSearchFactory: {
+      create: () => fileSearchMock,
+    },
+    crawlCache: {
+      ...actual.crawlCache,
+      clear: vi.fn(),
+    },
+  };
+});
 
 const readonlyProviderMock = vi.hoisted(() => ({
   createUri: vi.fn(),
diff --git a/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.ts b/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.ts
index 547cd6108a..eaf527a147 100644
--- a/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.ts
+++ b/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.ts
@@ -13,12 +13,12 @@ import {
   findRightGroupOfChatWebview,
 } from '../../utils/editorGroupUtils.js';
 import { ReadonlyFileSystemProvider } from '../../services/readonlyFileSystemProvider.js';
-import { FileDiscoveryService } from '@qwen-code/qwen-code-core/src/services/fileDiscoveryService.js';
 import {
+  crawlCache,
+  FileDiscoveryService,
   FileSearchFactory,
   type FileSearch,
-} from '@qwen-code/qwen-code-core/src/utils/filesearch/fileSearch.js';
-import * as crawlCache from '@qwen-code/qwen-code-core/src/utils/filesearch/crawlCache.js';
+} from '@qwen-code/qwen-code-core';
 import { getErrorMessage } from '../../utils/errorMessage.js';
 
 /**