From 4395af444cf36f20bb0214cab6a5e66ec3c1b0e9 Mon Sep 17 00:00:00 2001
From: yiliang114 <1204183885@qq.com>
Date: Wed, 20 May 2026 21:17:38 +0800
Subject: [PATCH 01/11] docs: add OOM investigation reports and auto-compaction
redesign proposal
- Runtime memory investigation plan
- Non-interactive memory benchmark report
- OOM reproduction report with 2GiB/4GiB synthetic tests
- Runtime diagnostics benchmark report
- Auto-compaction threshold redesign proposal
---
.../auto-compaction-threshold-redesign.md | 418 ++++++++
...2026-05-18-qwen-memory-benchmark-report.md | 280 ++++++
.../2026-05-19-oom-reproduction-report.md | 435 +++++++++
...en-runtime-diagnostics-benchmark-report.md | 902 ++++++++++++++++++
...05-18-qwen-runtime-memory-investigation.md | 235 +++++
5 files changed, 2270 insertions(+)
create mode 100644 docs/design/auto-compaction-threshold-redesign.md
create mode 100644 docs/e2e-tests/2026-05-18-qwen-memory-benchmark-report.md
create mode 100644 docs/e2e-tests/2026-05-19-oom-reproduction-report.md
create mode 100644 docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md
create mode 100644 docs/plans/2026-05-18-qwen-runtime-memory-investigation.md
diff --git a/docs/design/auto-compaction-threshold-redesign.md b/docs/design/auto-compaction-threshold-redesign.md
new file mode 100644
index 0000000000..81e9d74128
--- /dev/null
+++ b/docs/design/auto-compaction-threshold-redesign.md
@@ -0,0 +1,418 @@
+# Auto-Compaction Threshold Redesign
+
+**Status:** Draft · 2026-05-14
+
+## 背景
+
+当前 qwen-code 的自动压缩仅使用单一比例阈值 `COMPRESSION_TOKEN_THRESHOLD = 0.7`(`chatCompressionService.ts:33`),所有窗口大小共用同一比例。对比 claude-code 的「绝对 token 梯子」(autoCompact.ts:62-65),qwen-code 存在五个具体问题:
+
+1. **大窗口下预留过多**:1M 模型 70% 阈值在 700K 触发,剩余 300K 远超摘要 + 输出实际所需的 ~33K
+2. **失败 1 次永久锁**:`hasFailedCompressionAttempt = true` 之后整个 session 不再尝试 auto-compact(geminiChat.ts:504),比 claude-code 的「连续 3 次熔断」更严苛
+3. **tip 系统与 auto 阈值脱钩**:`tipRegistry.ts` 里的三条 `context-*` tip 使用固定的 50/80/95 百分比,与 auto-compact 阈值(70%)完全独立。这意味着在「auto 正常工作」的主路径上 80% / 95% tip 极少触发,而在「auto 失败 / 反应式兜底」的边缘路径上又缺乏与阈值对齐的语义
+4. **压缩调用本身没有输出预算控制**:[chatCompressionService.ts:374-376](packages/core/src/services/chatCompressionService.ts:374) 显式开启 `thinkingConfig.includeThoughts = true`(注释:「Compression quality drives every subsequent main turn」),同时 sideQuery 调用未设 `maxOutputTokens` 上限。代码注释([:436-437](packages/core/src/services/chatCompressionService.ts:436))也承认 `compressionOutputTokenCount may include non-persisted tokens (thoughts)`。在压缩接近窗口顶时,总输出可能膨胀,使 buffer 预留缺乏可预测上限。
更糟糕的是跨 provider 行为不一致:Anthropic 的 thinking budget 与 max_tokens 完全独立;OpenAI 的 reasoning tokens 不受 max_completion_tokens 限制;Gemini 的行为又因模型版本而异。这意味着「单靠加 maxOutputTokens 就能控制总输出」在 qwen-code 这种多 provider 项目里不成立
+
+5. **阈值判断使用的 `lastPromptTokenCount` 系统性下偏。** [geminiChat.ts:1217-1232](packages/core/src/core/geminiChat.ts:1217) 表明这个数来自上一轮 API response 的 `usageMetadata.totalTokenCount`。两个 gap:(a) 不包含本轮即将加入的 user message,每次 cheap-gate 判断都比真实 prompt 小一段;(b) 首轮初始值是 0,`--continue` 恢复巨大 session / sub-agent 继承大量历史时第一次 send 永远绕过所有阈值。对比 claude-code 的 `tokenCountWithEstimation`([query.ts:638](src/query.ts:638))走「最后一条 assistant API usage + 之后新增 message 估算」的双轨制能闭合这两个 gap
+
+## 设计目标
+
+- 引入「比例 + 绝对」混合阈值,让大窗口模型由绝对值接管,小窗口仍走比例兜底
+- 新增 warn / hard 两层(auto 保留为主触发点),形成三层梯子
+- 把 tip 系统重写为跟随新阈值的触发条件
+- 失败处理从「1 次永久锁」升级为「3 次熔断 + 自动恢复」
+- **压缩调用关闭 thinking 并加 `maxOutputTokens` 上限**:与 claude-code 对齐,让总输出受单一参数约束、buffer 预算可预测;接受压缩质量可能下降的代价
+- **加 token 估算补偿**:消除 `lastPromptTokenCount` 的「滞后一轮」和「首轮为 0」两个系统性下偏,让阈值判断更贴近真实 prompt 大小
+- 删除 settings 里的 `contextPercentageThreshold` 配置入口(内部 PCT 常量保留)
+- **不引入** env 覆盖通道、**不**新增显式 enabled 开关
+
+## 三层阈值梯子
+
+```
+ window (raw context window)
+ │
+ │ ← SUMMARY_RESERVE = 20K
+ ▼
+ effectiveWindow
+ │
+ │ ← HARD_BUFFER = 3K
+ ▼
+ hard_threshold = effectiveWindow - 3K
+ │
+ │ ← (AUTOCOMPACT_BUFFER - HARD_BUFFER) = 10K
+ ▼
+auto_threshold = max(PCT * window, effectiveWindow - AUTOCOMPACT_BUFFER)
+ │
+ │ ← WARN_BUFFER = 20K
+ ▼
+warn_threshold = max((PCT - WARN_PCT_OFFSET) * window, auto_threshold - WARN_BUFFER)
+ │
+ ▼
+ 0
+```
+
+### 三层语义
+
+| 层 | 触发条件 | 行为 |
+| -------- | ------------------------------ | -------------------------------------------------------- |
+| **warn** | `tokenCount >= warn_threshold` | UI 提示「距自动压缩还剩 X tokens」,不改变 send 行为 |
+| **auto** | `tokenCount >= auto_threshold` | 在 send 前 `tryCompress(force=false)`,正常压缩流程 |
+| **hard** | `tokenCount >= hard_threshold` | 在 send 前 `tryCompress(force=true)`,重置失败锁强制压缩 |
+
+`hard` 层等同于把现有 reactive overflow(geminiChat.ts:711)的兜底逻辑提前到 send 前,避免一次失败的 oversized request round-trip。
+
+## 内部常量
+
+```ts
+// chatCompressionService.ts
+const DEFAULT_PCT = 0.7; // auto 比例兜底
+const WARN_PCT_OFFSET = 0.1; // warn 比例 = DEFAULT_PCT - WARN_PCT_OFFSET = 0.6
+const COMPACT_MAX_OUTPUT_TOKENS = 20_000; // 压缩 sideQuery 输出硬上限(thinking + summary 合计)
+const SUMMARY_RESERVE = 20_000; // 阈值梯子从窗口顶减去的输出预留 = maxOutput
+const AUTOCOMPACT_BUFFER = 13_000; // auto 与 effectiveWindow 间距
+const WARN_BUFFER = 20_000; // warn 与 auto 间距
+const HARD_BUFFER = 3_000; // hard 与 effectiveWindow 间距
+const MAX_CONSECUTIVE_FAILURES = 3; // 失败熔断阈值
+```
+
+数值来源:全部沿用 claude-code 的实测值([autoCompact.ts:30,62-65](src/services/compact/autoCompact.ts:30))。
+
+`SUMMARY_RESERVE = COMPACT_MAX_OUTPUT_TOKENS` 是关键关系:模型受 `maxOutputTokens` 硬限制约束,输出不可能超出 20K,因此 reserve 不需要额外 safety margin。`thinking + summary` 是合并预算(Gemini SDK / 多数 provider 的 `maxOutputTokens` 语义),模型自行在两者间分配。
+
+## 计算函数
+
+```ts
+export interface CompactionThresholds {
+ warn: number;
+ auto: number;
+ hard: number; // 当 hard < auto 时等于 auto(小窗口退化)
+ effectiveWindow: number;
+}
+
+export function computeThresholds(window: number): CompactionThresholds {
+ const effectiveWindow = window - SUMMARY_RESERVE;
+
+ const absAuto = effectiveWindow - AUTOCOMPACT_BUFFER;
+ const auto = Math.max(DEFAULT_PCT * window, absAuto);
+
+ const absWarn = auto - WARN_BUFFER;
+ const warn = Math.max((DEFAULT_PCT - WARN_PCT_OFFSET) * window, absWarn);
+
+ const rawHard = effectiveWindow - HARD_BUFFER;
+ const hard = Math.max(rawHard, auto); // 小窗口下退化为 auto
+
+ return { warn, auto, hard, effectiveWindow };
+}
+```
+
+### 实测数据
+
+| 窗口 | warn | auto | hard | 备注 |
+| ---- | ----------- | ----------- | ------------ | ------------------------------- |
+| 32K | 19.2K (pct) | 22.4K (pct) | 22.4K (退化) | 比例兜底 |
+| 64K | 38.4K (pct) | 44.8K (pct) | 44.8K (退化) | 比例兜底 |
+| 128K | 76.8K (pct) | 95K (abs) | 105K (abs) | 混合(warn=pct, auto/hard=abs) |
+| 200K | 147K (abs) | 167K (abs) | 177K (abs) | 绝对接管 |
+| 256K | 203K (abs) | 223K (abs) | 233K (abs) | 绝对接管 |
+| 1M | 947K (abs) | 967K (abs) | 977K (abs) | 全绝对 |
+
+`(pct)` 表示该层由比例公式决定,`(abs)` 表示由绝对值公式决定。
+
+## 用户配置
+
+### ChatCompressionSettings 变更
+
+```ts
+// packages/core/src/config/config.ts:217
+export interface ChatCompressionSettings {
+ /** 保留(与本设计无关,由 compactionInputSlimming 使用) */
+ imageTokenEstimate?: number;
+}
+```
+
+**删除:** `contextPercentageThreshold` 字段。理由:
+
+1. 新公式下,对主流窗口(>= 128K)该字段几乎无影响——绝对值接管
+2. 小窗口下用户配置反而可能让阈值"更早"压缩,与节省 token 直觉相反
+3. claude-code 没有暴露此字段,无类似的用户面配置先例
+
+### Breaking change 处理
+
+启动时 `Config` 加载发现 `chatCompression.contextPercentageThreshold` 存在:
+
+- 写入 stderr 一行警告:`"chatCompression.contextPercentageThreshold has been removed and is now controlled by built-in thresholds."`
+- **不**报错、**不**阻塞启动
+- 字段值被忽略
+
+## Token 估算补偿
+
+qwen-code 的 `lastPromptTokenCount` 来自上一轮 API response 的 `usageMetadata.totalTokenCount`([geminiChat.ts:1217-1232](packages/core/src/core/geminiChat.ts:1217))。这导致:
+
+1. **滞后一轮**:cheap-gate 用 `lastPromptTokenCount` 判断,但本次 send 实际 prompt = 它 + 本轮 user message。少算的部分可能让阈值判断 false-negative
+2. **首轮为 0**:初始值是 0,第一次 send 时无论历史多大都不会触发任何阈值(含 `--continue` 恢复 / sub-agent 继承场景)
+
+引入轻量本地估算函数 `estimatePromptTokens`,在 send 前 cheap-gate / hard 判断时补足这两段缺失:
+
+```ts
+// chatCompressionService.ts(或新文件 packages/core/src/services/tokenEstimation.ts)
+
+const BYTES_PER_TOKEN = 4; // 通用 char/4 估算(claude-code 同此)
+const BYTES_PER_TOKEN_JSON = 2; // JSON / tool_call input 更密集
+
+/**
+ * 估算一组 Content 的 token 数,用于补偿 API usage metadata 的滞后。
+ * 对 image / document 复用现有 imageTokenEstimate(默认 1600)。
+ */
+export function estimateContentTokens(
+ contents: Content[],
+ imageTokenEstimate = DEFAULT_IMAGE_TOKEN_ESTIMATE,
+): number {
+ // 复用 estimateContentChars(compactionInputSlimming.ts),再除以 bytesPerToken
+ // 内部对 functionCall / functionResponse 用 BYTES_PER_TOKEN_JSON
+ // ...
+}
+
+/**
+ * cheap-gate 与 hard 判断的统一入口。
+ * 主路径:lastPromptTokenCount(API 返回的准确值)+ 本轮 user message 估算
+ * 首轮路径:full history 估算
+ */
+export function estimatePromptTokens(
+ history: Content[],
+ userMessage: Content,
+ lastPromptTokenCount: number,
+): number {
+ if (lastPromptTokenCount > 0) {
+ return lastPromptTokenCount + estimateContentTokens([userMessage]);
+ }
+ return estimateContentTokens([...history, userMessage]);
+}
+```
+
+应用位置:
+
+- `chatCompressionService.compress()` 的 cheap-gate:把 `originalTokenCount` 来源换成 `estimatePromptTokens(history, userMessage, lastPromptTokenCount)`
+- `geminiChat.sendMessageStream` 入口的 hard 判断(见下一节)
+
+**估算只用于提前触发,不用于「跳过触发」。** 因为 char/4 是粗略下界估计,作为 false-positive 一侧是安全的(宁可早一点压),作为 false-negative 则不可靠。
+
+## 触发链路改动
+
+### chatCompressionService.ts
+
+1. **导出 `computeThresholds`**,供 cheap-gate / UI / 命令复用
+2. **`compress()` cheap-gate** (line 221-249):
+ ```ts
+ if (consecutiveFailures >= MAX_CONSECUTIVE_FAILURES && !force) {
+ return NOOP;
+ }
+ const { auto } = computeThresholds(contextLimit);
+ const effectiveTokens = estimatePromptTokens(
+ curatedHistory,
+ userMessage,
+ originalTokenCount,
+ );
+ if (!force && effectiveTokens < auto) return NOOP;
+ ```
+3. **`compress()` 的 runSideQuery 调用** (line 356-380):关闭 thinking + 加 `maxOutputTokens`:
+
+ ```ts
+ const summaryResult = await runSideQuery(config, {
+ // ...
+ config: {
+ thinkingConfig: { includeThoughts: false }, // 关闭 thinking(与 claude-code 一致)
+ maxOutputTokens: COMPACT_MAX_OUTPUT_TOKENS, // 硬上限 20K
+ },
+ // ...
+ });
+ ```
+
+ 或者直接删掉 `thinkingConfig` 让 `runSideQuery` 默认值([sideQuery.ts:118](packages/core/src/utils/sideQuery.ts:118) 默认 `includeThoughts: false`)接管。
+
+ 关 thinking 后,`maxOutputTokens` 直接约束总输出(不存在 thinking 单独 budget 的问题),`SUMMARY_RESERVE = maxOutput = 20K` 是干净的硬关系。
+
+ 同时更新 [chatCompressionService.ts:374-376](packages/core/src/services/chatCompressionService.ts:374) 的注释,从「Compression quality drives every subsequent main turn — keep reasoning on」改为说明「为保证跨 provider 可预测的输出上限,与 claude-code 设计对齐」。
+
+ token math 一段([:436-437](packages/core/src/services/chatCompressionService.ts:436))的 "may include non-persisted tokens (thoughts)" 注释也可以同步清理
+
+### geminiChat.ts: `sendMessageStream` 入口(line 562)
+
+```ts
+// 替换前:tryCompress(force=false)
+// 替换后:用估算 token 判断是否触发 hard,决定 force 标志
+
+const { hard } = computeThresholds(contextLimit);
+const effectiveTokens = estimatePromptTokens(
+ this.getHistory(true),
+ createUserContent(params.message),
+ this.lastPromptTokenCount,
+);
+const shouldForceFromHard = effectiveTokens >= hard;
+
+if (shouldForceFromHard) {
+ // 重置熔断器,等同 force compress
+ this.consecutiveFailures = 0;
+}
+
+compressionInfo = await this.tryCompress(
+ prompt_id,
+ model,
+ shouldForceFromHard,
+ params.config?.abortSignal,
+);
+```
+
+### 失败处理升级 (`geminiChat.ts:504-510`)
+
+```ts
+// 替换前
+hasFailedCompressionAttempt: boolean;
+
+// 替换后
+consecutiveFailures: number; // 默认 0
+
+// 失败分支
+} else if (isCompressionFailureStatus(info.compressionStatus)) {
+ if (!force) {
+ this.consecutiveFailures += 1;
+ }
+}
+
+// 成功分支
+this.consecutiveFailures = 0;
+```
+
+`force=true` 调用失败不计入计数(保持现有 reactive / manual 不"占额"的语义)。
+
+## UI 改动
+
+### tipRegistry.ts 重写三条 context-\* tip
+
+三层阈值正好与三条 tip 一一对应。映射关系(按 token 数从低到高):
+
+| Tip ID | 当前条件 | 新条件 | 文案变化 |
+| ------------------ | --------------------------------------------- | ------------------------------------------------------------------- | ----------------------------------------------------------------- |
+| `compress-intro` | `pct >= 50 && < 80 && sessionPromptCount > 5` | `tokenCount >= warn && tokenCount < auto && sessionPromptCount > 5` | 保持不变 |
+| `context-high` | `pct >= 80 && < 95` | `tokenCount >= auto && tokenCount < hard` | 保持不变 |
+| `context-critical` | `pct >= 95` | `tokenCount >= hard` | 加一句「Auto-compact will force on next send.」反映新 hard 层行为 |
+
+**对触发频率的影响:**
+
+- 主路径(auto 正常工作):`tokenCount` 跨越 auto 后立即触发压缩,下一轮 tokenCount 回落,所以 `context-high` 仅在「触发到压缩生效之间」短暂可见
+- 边缘路径(auto 失败 / 熔断 / reactive 来不及):`tokenCount` 持续上涨,会依次穿过 warn → auto → hard 触发三条 tip,跟用户视角的"上下文越来越紧"一致
+- `context-critical` 触发时 hard 层已经在 send 前 force compress(spec 触发链路改动一节),所以这条 tip 实际上是「post-rescue 告知」而非「pre-rescue 警告」,文案补一句说明
+
+`TipContext` 接口增加:
+
+```ts
+export interface TipContext {
+ lastPromptTokenCount: number;
+ contextWindowSize: number;
+ sessionPromptCount: number;
+ sessionCount: number;
+ platform: string;
+ // 新增:让 isRelevant 函数能拿到阈值。
+ // computeThresholds 在调用方算好后注入,避免 tipRegistry 直接依赖 core。
+ thresholds?: CompactionThresholds;
+}
+```
+
+`AppContainer.tsx:1150` 构造 `TipContext` 时同步注入。
+
+### /context 命令同步 (`contextCommand.ts:177-183`)
+
+```ts
+// 替换硬编码 (1 - threshold) * contextWindowSize
+const { warn, auto, hard, effectiveWindow } =
+ computeThresholds(contextWindowSize);
+
+// 显示四行:
+// Effective window: 180K (window − 20K reserve)
+// Warn threshold: 147K (...)
+// Auto threshold: 167K ← 当前位置
+// Hard threshold: 177K
+// 标记当前 token count 落在哪个 tier
+```
+
+### Footer 持续提示(可选 follow-up)
+
+本 spec 不强制实现 footer 持续提示,理由:
+
+- 现有 tip 系统已经能在 history 里给出提示
+- Footer 持续提示需要改 ink 渲染、增加重绘频率
+- 可作为本 spec 后置 follow-up(独立 PR)
+
+如果后续要做,建议触发条件 `tokenCount >= warn && tokenCount < auto`,超过 auto 后隐藏(压缩已开始)。
+
+## 测试覆盖
+
+### 单元测试(chatCompressionService.test.ts)
+
+- `computeThresholds(32K)` → 比例兜底分支(warn/auto 均 pct,hard 退化)
+- `computeThresholds(128K)` → 混合分支(warn=pct,auto=abs,hard=abs)
+- `computeThresholds(200K)` → 绝对接管分支(warn/auto/hard 均 abs)
+- `computeThresholds(1M)` → 全绝对分支
+- `computeThresholds(window=10K)` → 极小窗口(绝对值全负),公式不崩
+- 三层阈值始终满足 `warn <= auto <= hard`
+- max() 公式在边界点(pct \* window == abs)稳定
+
+### 单元测试(tokenEstimation.test.ts)
+
+- `estimateContentTokens` 对纯文本 / json / functionCall / functionResponse 分别走对应 bytesPerToken,对 image / document 走 `imageTokenEstimate`
+- `estimatePromptTokens` 在 `lastPromptTokenCount > 0` 时走「主路径」,等于 0 时走「首轮路径」
+- 大 user message 在 cheap-gate 阶段被加上去后能跨越 auto 阈值
+- 估算与真实 API usage 的偏差在 ±30% 以内(用真实历史样本回归)
+
+### 集成测试(geminiChat.test.ts / chatCompressionService.test.ts)
+
+- 3 次连续失败后 cheap-gate NOOP;下一次 force 后恢复
+- 单次失败不再永久锁
+- 估算 token 跨越 hard 后 send 自动 force compress
+- 压缩 sideQuery 调用 `maxOutputTokens = COMPACT_MAX_OUTPUT_TOKENS` 正确透传到 `runSideQuery`,`thinkingConfig.includeThoughts` 为 `false`(或被 sideQuery 默认值接管)
+- **首轮覆盖**:构造一个 `lastPromptTokenCount = 0` 但 history 巨大的 chat(模拟 `--continue` 恢复),首次 send 时 auto 阈值能被估算路径触发
+
+### 兼容性测试
+
+- 设置 `contextPercentageThreshold = 0.5` 启动 → stderr 警告 + 字段被忽略,行为以内部 PCT 常量为准
+
+### Tip 系统测试(tipRegistry.test.ts)
+
+- 三条 context-\* tip 在跨越 warn/auto/hard 时正确触发,且区间不重叠
+- 主路径下 auto 阈值触发压缩后 `context-high` 不持续可见
+- 边缘路径(熔断 + token 继续涨)下三条 tip 依次触发
+- TipContext 缺 `thresholds` 时(fallback)行为合理
+
+## 实施分阶段
+
+| Phase | 内容 | 独立性 |
+| ----- | -------------------------------------------------------------------------------------------- | ------------------ |
+| 1 | 内部常量 + `computeThresholds` + cheap-gate 改动(不含估算补偿) | 可独立合并 |
+| 2 | 失败处理升级(1 → 3 熔断) | 可独立合并 |
+| 3 | hard 层 force compress 提前 | 依赖 P1 + P7 |
+| 4 | 配置面变更 + breaking change 警告 | 依赖 P1 |
+| 5 | UI(tip 重写 + /context) | 依赖 P1 |
+| 6 | 压缩 sideQuery 关 thinking + 加 `maxOutputTokens` 上限 | 独立可先于 P1 落地 |
+| 7 | Token 估算补偿(`estimateContentTokens` + `estimatePromptTokens`,应用到 cheap-gate / hard) | 独立可与 P1 并行 |
+
+每个 Phase 可独立 PR。建议合并顺序 **P6 → P7 → P1 → P2 → P4 → P3 → P5**:先给压缩调用打上 `maxOutputTokens` 上限(让 buffer 假设可信);再加估算补偿(让 token 数判断更可靠);再把阈值基础设施落地;再做失败熔断、配置面变更;然后才打开 hard 层主动救场(这时已有可靠的 token 数 + 熔断器);最后落地 UI 改动(tip 重写 + /context)。每个 PR 都能独立验证、独立回滚。
+
+## 风险与注意事项
+
+1. **关 thinking 可能影响摘要质量。** 原作者注释 "Compression quality drives every subsequent main turn — keep reasoning on" 表达过对此的担忧。本 spec 的判断是「可预测的 token 上限」优先于「最大化质量」,但落地后需要观察 telemetry 里 `compression_input_token_count` / `compression_output_token_count` 的分布,以及主对话在压缩后的质量变化(用户反馈、`COMPRESSION_FAILED_*` 状态率)。如果质量下降明显,再考虑回退到 thinking 开启 + provider-specific thinkingBudget 控制。
+
+2. **`maxOutputTokens` 触顶可能导致 summary 被截断。** 关 thinking 后,20K 直接限制 summary 主体;claude-code 实测 p99.99 ≈ 17K,留 ~3K 安全冗余。但 qwen-code 的压缩 prompt 与 claude-code 不同,分布需要观测。建议在压缩失败分支([chatCompressionService.ts:464-491](packages/core/src/services/chatCompressionService.ts:464))追加「检测到 finish_reason = MAX_TOKENS」的 NOOP 路径,避免持久化半截 summary。
+
+3. **跨 provider 的 maxOutputTokens 映射差异。** OpenAI compat (dashscope) → `max_tokens`、Anthropic → `max_tokens`、Gemini SDK → `maxOutputTokens`。当前 qwen-code 已有这层映射([contentGenerator.ts:94](packages/core/src/core/contentGenerator.ts:94) 等),需要在 P6 实现时验证 sideQuery 路径上 `maxOutputTokens` 字段确实贯穿到所有 provider 的请求体。
+
+4. **Token 估算是粗略下界,不应反向用作"跳过触发"的依据。** `char/4` 与各 provider 真实 tokenizer 偏差可能 ±30%。本 spec 只用估算来「让阈值更早触发」(false-positive 方向,宁可早压不可晚压)。所有「降低 token 计数 / 跳过压缩」的代码路径仍应使用 `lastPromptTokenCount`(API 权威值)。
+
+5. **估算函数与现有 `estimateContentChars` 的关系。** [compactionInputSlimming.ts](packages/core/src/services/compactionInputSlimming.ts) 已经有 `estimateContentChars`(用于压缩 split point 计算),新增的 `estimateContentTokens` 应复用它(除以 bytesPerToken)而非新写一套,避免两套估算口径出现分歧。
+
+## 不在本 spec 范围
+
+- Env 变量覆盖通道(D 方案):维持「配置面最小」原则
+- Footer 常驻可视化:留作 follow-up
+- 摘要 prompt 改进、`MIN_COMPRESSION_FRACTION` 调整:与阈值设计正交
+
+## 开放问题(等 review)
+
+1. **breaking change 强度**:警告 + 忽略字段 vs 启动报错。当前选警告,需要确认对企业部署/团队配置是否够友好
+2. **小窗口(32K)下 hard 与 auto 退化为同一值**:用户视角是否需要在 `/context` 明示「该窗口下 hard 已退化」
diff --git a/docs/e2e-tests/2026-05-18-qwen-memory-benchmark-report.md b/docs/e2e-tests/2026-05-18-qwen-memory-benchmark-report.md
new file mode 100644
index 0000000000..27cf60c278
--- /dev/null
+++ b/docs/e2e-tests/2026-05-18-qwen-memory-benchmark-report.md
@@ -0,0 +1,280 @@
+# Qwen Code Runtime Memory Benchmark Report
+
+Date: 2026-05-18
+
+## Summary
+
+This report records local memory benchmarks for Qwen Code runtime behavior. It
+compares Qwen Code across models and compares Qwen Code with Claude Code on the
+same task shapes where equivalent model endpoints were available.
+
+The headline result is consistent across the latest matrix:
+
+- Qwen Code process-tree RSS peak: about `0.83-1.04 GiB`.
+- Claude Code process-tree RSS peak: about `0.27-0.36 GiB`.
+- Qwen Code was about `2.3x-3.6x` higher in the tested
+ non-interactive CLI task benchmarks.
+
+The difference reproduced in small PR review, code navigation, and synthetic
+diff workloads. It is therefore unlikely to be explained only by one large PR
+or by one model provider.
+
+This report is intended to make the current performance investigation visible:
+what has been measured, what conclusion is already supported, what remains
+unknown, and what diagnostics should be added next.
+
+## Test Environment
+
+| Item | Value |
+| --------------------------------------------- | ------------------------------------------ |
+| Date | 2026-05-18 |
+| Platform | macOS local development machine |
+| Qwen Code version | `0.15.11` |
+| Qwen Code binary | PATH-resolved `qwen` binary |
+| Claude Code version used in the latest matrix | `2.1.129` |
+| Claude Code binary used in the latest matrix | PATH-resolved `claude` binary |
+| Sampling method | External `ps` RSS sampling once per second |
+| Headline metric | Process-tree RSS peak |
+
+Process-tree RSS is used as the headline metric because Qwen Code launches a
+root wrapper and a child Node/Qwen worker. Looking only at the root process can
+understate the memory footprint seen by users.
+
+Temporary CLI config directories were used for matrix runs so the benchmarks
+did not depend on global CLI state.
+
+## Benchmark Artifacts
+
+Five local reports were produced before this consolidated report:
+
+1. Qwen Code PR review memory run.
+2. Qwen Code model comparison run.
+3. Strict Qwen Code vs Claude Code comparison with `pai/glm-5`.
+4. Qwen Code vs Claude Code, two CLIs by two models.
+5. Qwen Code vs Claude Code, five-case matrix.
+
+This consolidated report covers the conclusions and headline metrics from all
+five reports. It does not embed every raw sample row, terminal transcript, or
+temporary runner artifact. Those raw artifacts stayed in local `tmp/`
+directories because they are experiment outputs rather than stable repository
+fixtures.
+
+The latest matrix is the strongest evidence because it covers multiple task
+shapes rather than only one PR review workload.
+
+## Preliminary Conclusion
+
+The current data is strong enough to say that Qwen Code has a higher runtime
+memory footprint than Claude Code in these local non-interactive CLI task
+benchmarks. It is not strong enough to name one final root cause yet.
+
+The leading explanation is a Qwen Code runtime/path difference rather than a
+model provider difference:
+
+- the gap reproduces with both `pai/glm-5` and `qwen3.6-plus`;
+- the gap reproduces in small PR and code-navigation tasks, not only in large
+ diff tasks;
+- Qwen Code repeatedly sends or accounts for more tokens than Claude Code for
+ similar work;
+- Qwen Code's largest observed component is the child Node/Qwen worker process,
+ which points toward task-time process footprint, module loading, context
+ assembly, live history, tool-result retention, or subagent/saved-output
+ paths.
+
+The most useful next measurement is therefore not another external RSS-only
+run. The next measurement should split RSS into V8 heap, native memory,
+session/history size, retained tool-result size, and subagent/process-tree
+activity.
+
+## Initial Cause Analysis
+
+The benchmark does not yet prove one root cause, but it does narrow the likely
+problem area.
+
+| Signal | What it suggests | What it does not prove |
+| -------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------- |
+| Qwen remains near `1 GiB` in small PR and code-navigation cases | A high non-interactive task-time runtime cost is likely involved | It does not identify whether the footprint is V8 heap, native memory, module loading, or retained state |
+| Diff size from 100 KiB to 5 MiB does not scale linearly with RSS | Raw diff bytes alone are probably not the primary driver | Large outputs can still amplify memory in real PR review flows |
+| Qwen uses more tokens than Claude in every matrix cell | Qwen likely constructs or retains larger prompt/context/tool-result state for similar work | Token count is not the same as process memory and may be an effect rather than the cause |
+| Tool call counts are similar, and Claude sometimes uses more turns/tool calls with lower RSS | A longer tool-call chain is unlikely to be the main explanation by itself | Tool output size and retention still need to be measured |
+| Earlier large PR runs showed saved-output recovery and subagent amplification | Tool-output truncation and saved-output paths are likely heavy-workload amplifiers | They do not explain the entire small-task execution footprint |
+
+The current best explanation is therefore:
+
+1. **Task-time runtime cost first**: Qwen Code likely initializes or retains
+ more runtime state during non-interactive CLI task execution than Claude
+ Code. This may include agent runtime, tool registry, provider adapters,
+ session services, or UI/history structures that are not strictly needed for
+ a short non-interactive task.
+2. **Context/tool-result volume second**: Qwen Code appears to carry larger
+ model-facing or session-facing context for similar work. The token gap makes
+ context assembly, tool result normalization, and history retention important
+ suspects.
+3. **Large-output amplification third**: Large PR review can trigger additional
+ saved-output and subagent paths. These are probably not the only cause, but
+ they can make memory and token pressure worse in realistic review tasks.
+
+The next diagnostic run should answer where the `~1 GiB` sits:
+
+- high immediately after startup: module/runtime startup cost;
+- jumps after tool execution: tool-output retention or result normalization;
+- jumps during request assembly: context construction or duplicated histories;
+- grows after streaming/compression: response retention or compression state;
+- mostly RSS outside V8 heap: native buffers, loaded modules, or external
+ memory.
+
+## Latest Matrix
+
+The latest benchmark ran:
+
+- 2 CLIs: Qwen Code and Claude Code.
+- 2 model labels: `pai/glm-5` and `qwen3.6-plus`.
+- 5 cases:
+ - small PR review: PR `#4268`, one-line change
+ - code navigation: `rg` plus `sed` on compression-related files
+ - synthetic local diff, about 100 KiB
+ - synthetic local diff, about 1 MiB
+ - synthetic local diff, about 5 MiB
+
+All 20 runs exited `0` with no timeout.
+
+## Matrix Results
+
+| Case | Model | Qwen tree peak | Claude tree peak | Qwen / Claude |
+| ---------------- | -------------- | -------------: | ---------------: | ------------: |
+| small PR `#4268` | `pai/glm-5` | 1032.7 MiB | 357.8 MiB | 2.89x |
+| small PR `#4268` | `qwen3.6-plus` | 852.2 MiB | 365.5 MiB | 2.33x |
+| code navigation | `pai/glm-5` | 993.1 MiB | 359.6 MiB | 2.76x |
+| code navigation | `qwen3.6-plus` | 996.9 MiB | 349.0 MiB | 2.86x |
+| diff 100 KiB | `pai/glm-5` | 1012.1 MiB | 350.8 MiB | 2.89x |
+| diff 100 KiB | `qwen3.6-plus` | 1001.1 MiB | 336.2 MiB | 2.98x |
+| diff 1 MiB | `pai/glm-5` | 1008.3 MiB | 278.8 MiB | 3.62x |
+| diff 1 MiB | `qwen3.6-plus` | 1003.3 MiB | 340.5 MiB | 2.95x |
+| diff 5 MiB | `pai/glm-5` | 858.8 MiB | 323.2 MiB | 2.66x |
+| diff 5 MiB | `qwen3.6-plus` | 1062.0 MiB | 331.2 MiB | 3.21x |
+
+Average process-tree RSS peak by case:
+
+| Case | Avg Qwen tree peak | Avg Claude tree peak |
+| ---------------- | -----------------: | -------------------: |
+| small PR `#4268` | 942.5 MiB | 361.6 MiB |
+| code navigation | 995.0 MiB | 354.3 MiB |
+| diff 100 KiB | 1006.6 MiB | 343.5 MiB |
+| diff 1 MiB | 1005.8 MiB | 309.6 MiB |
+| diff 5 MiB | 960.4 MiB | 327.2 MiB |
+
+## Runtime And Token Signals
+
+The same matrix also showed Qwen Code using more model-side tokens in every
+tested case.
+
+Selected examples:
+
+| Case | Model | CLI | Duration | Turns | Total tokens | Tool calls |
+| --------------- | -------------- | ------ | -------: | ----: | -----------: | ---------: |
+| small PR | `pai/glm-5` | Qwen | 25.2s | 2 | 32,567 | 3 |
+| small PR | `pai/glm-5` | Claude | 21.1s | 4 | 7,899 | 3 |
+| code navigation | `qwen3.6-plus` | Qwen | 25.2s | 2 | 38,151 | 3 |
+| code navigation | `qwen3.6-plus` | Claude | 46.9s | 6 | 25,861 | 5 |
+| diff 100 KiB | `qwen3.6-plus` | Qwen | 16.5s | 3 | 57,185 | 2 |
+| diff 100 KiB | `qwen3.6-plus` | Claude | 17.2s | 3 | 6,377 | 2 |
+| diff 5 MiB | `pai/glm-5` | Qwen | 23.2s | 2 | 38,574 | 2 |
+| diff 5 MiB | `pai/glm-5` | Claude | 9.8s | 3 | 5,285 | 2 |
+
+This token gap does not prove that token volume is the memory root cause, but it
+does suggest that context assembly, tool result retention, or response
+normalization should be measured alongside RSS and V8 heap statistics.
+
+## Token Usage Analysis
+
+The token gap is one of the strongest clues, but it needs internal request
+metrics before it can be treated as a root cause.
+
+What the data supports today:
+
+- Qwen Code used more total tokens than Claude Code in every matrix cell.
+- The gap appears even when tool-call counts are similar.
+- Claude sometimes used more turns or tool calls while still using less memory.
+
+What this suggests:
+
+- The token delta is unlikely to come only from a longer tool-call chain.
+- Qwen may be carrying larger static prompt/context state, larger tool schemas,
+ larger serialized tool results, or more retained conversation/session content.
+- Large-output flows may add another layer through truncation, saved-output
+ recovery, or subagent paths.
+
+What is still missing:
+
+- per-request input token breakdown;
+- system prompt and tool schema token sizes;
+- retained message and tool-result sizes before each model request;
+- whether large outputs are retained in multiple places, such as model history,
+ UI history, session recording, or saved-output storage.
+
+Those missing metrics are why the next step should add internal diagnostics
+rather than only repeat the external RSS benchmark.
+
+## Earlier Large PR Review Signal
+
+An earlier strict PR review benchmark used PR `#4186` and showed the same broad
+shape:
+
+| Model | CLI | Process-tree RSS peak |
+| -------------- | ----------- | --------------------: |
+| `pai/glm-5` | Qwen Code | 1000.7 MiB |
+| `pai/glm-5` | Claude Code | 349.0 MiB |
+| `qwen3.6-plus` | Qwen Code | 1095.8 MiB |
+| `qwen3.6-plus` | Claude Code | 341.1 MiB |
+
+That earlier run was not enough by itself because a large PR can trigger unusual
+tool-output and saved-output paths. The latest five-case matrix makes the
+finding stronger because small PR and code-navigation tasks also reproduce the
+gap.
+
+## Working Hypothesis
+
+The current evidence supports these hypotheses, in priority order:
+
+1. Qwen Code has a higher non-interactive task-time process footprint than
+ Claude Code. The Qwen child Node worker was typically the largest process in
+ local sampling, often around `0.7-0.8 GiB`.
+2. Model choice is not the main explanation. Both `pai/glm-5` and
+ `qwen3.6-plus` showed the same broad Qwen-vs-Claude gap.
+3. Large diff size alone is not the main explanation. RSS did not scale with
+ synthetic diff size from 100 KiB to 5 MiB, likely because tool-output
+ truncation caps how much output reaches the model.
+4. Context/tool-result handling is still a likely contributor. Qwen Code used
+ more tokens than Claude Code in every matrix cell, and earlier large-PR runs
+ showed saved tool-output recovery and subagent amplification paths.
+5. The next diagnostic layer should separate V8 heap, native RSS, loaded
+ module/runtime startup cost, session history, UI history, tool-result
+ retention, and subagent activity. External RSS alone cannot distinguish
+ those causes.
+
+## Caveats
+
+- These are single runs per matrix cell, not repeated statistical samples.
+- RSS is external process RSS. It cannot distinguish V8 heap, native buffers,
+ module loading, retained tool output, UI state, or session history.
+- Claude Code and Qwen Code use different runtime implementations and protocol
+ adapters, even when the model labels are the same.
+- The benchmark was run locally on macOS. Linux servers should be tested before
+ drawing deployment-specific conclusions.
+
+## Recommended Follow-Up Measurements
+
+The next local investigation branch should add or use diagnostics for:
+
+- `process.memoryUsage()` before and after startup, tool execution, streaming,
+ compression, and session finalization.
+- V8 heap statistics and heap spaces.
+- Active handles and requests.
+- Session message count and approximate retained character/token volume.
+- Tool result count, total retained tool-result size, largest tool-result size,
+ and whether large outputs are retained by UI history or model history.
+- Subagent count and child process/process-tree RSS.
+- Tool-output truncation and saved-output recovery events.
+
+These measurements should be collected with the same benchmark matrix so the
+current RSS comparison can be connected to internal Qwen Code state.
diff --git a/docs/e2e-tests/2026-05-19-oom-reproduction-report.md b/docs/e2e-tests/2026-05-19-oom-reproduction-report.md
new file mode 100644
index 0000000000..4e07233ae7
--- /dev/null
+++ b/docs/e2e-tests/2026-05-19-oom-reproduction-report.md
@@ -0,0 +1,435 @@
+# OOM 压力测试与长任务 Replay 报告
+
+**日期**: 2026-05-19
+**分支**: `codex/memory-diagnostics-local-run`
+**测试人**: yiliang114
+**结论**: 已在低 heap 压力测试与 synthetic 测试中复现 OOM 并定位主要诱因。v0.15.7 (#3735) 引入的 auto-compaction 使 `structuredClone`
+调用频率倍增,在高 heap 压力时形成正反馈死循环导致 OOM。真实 debug 日志佐证了该机制;但默认 heap 下的 4G/8G OOM 尚未单独复现(见第二、五节)。
+
+---
+
+## 一、背景
+
+多个 issue(#4309, #4276, #4185, #4315, #4322, #2868)报告 qwen-code 在长会话中出现 V8 heap OOM crash:
+
+```
+FATAL ERROR: Ineffective mark-compacts near heap limit Allocation failed - JavaScript heap out of memory
+```
+
+用户报告的崩溃特征:
+| Issue | 崩溃时 Heap | 运行时长 | 平台 |
+|-------|------------|---------|------|
+| #4276 | 4014 MB | ~110 分钟 | Linux x64 |
+| #4315 | 2027 MB | ~19.6 小时 | macOS (默认 2GB limit) |
+| #4322 | 4023 MB | ~7 小时 | Windows |
+| #2868 | 2035 MB | ~1.7 分钟 | Linux |
+| #4309 | 7020 MB | 未知 | Windows (设了 8GB limit 仍崩) |
+
+---
+
+## 二、方法论修正
+
+本报告区分两类测试:
+
+1. **低 heap 压力测试**:通过降低 `--max-old-space-size` 放大问题,用于快速定位
+ “history 很大时整段复制导致瞬时峰值”的代码路径。它是诊断工具,不等价于用户真实
+ 4G/8G OOM 复现。
+2. **默认 heap 长任务 replay**:不设置 `NODE_OPTIONS`,使用真实 JSONL 历史恢复并
+ 继续执行 review 任务,同时从进程外采样 process-tree RSS。这类结果才用于判断
+ 用户侧实际内存量级。
+
+因此,低 heap 结果不能单独作为“真实 OOM 已修复”的证明。它只能说明某条路径在
+history 足够大时会产生峰值放大,需要再用默认 heap 长任务验证。
+
+## 三、低 heap 压力测试条件
+
+| 参数 | 值 |
+| ------------------------ | ------------------------------------------------------------ |
+| CLI 版本 | 0.15.11 (从 `codex/memory-diagnostics-local-run` 分支 build) |
+| Model | `qwen3.6-plus` (128K context window) |
+| Heap limit | `--max-old-space-size=512` |
+| Heap-pressure safety net | **禁用** (HEAP_PRESSURE_COMPRESSION_RATIO 设为 99.0) |
+| 操作模式 | YOLO + 自动化多轮 Read 文件任务 |
+| 工作目录 | qwen-code monorepo (3538 .ts files, 1.26M lines) |
+
+### 关键配置修改
+
+`packages/core/src/core/geminiChat.ts` 中将 heap-pressure compaction 阈值从 0.7 改为 99.0(使其永远不触发),模拟 #4186 修复前的状态。
+
+---
+
+## 四、低 heap 压力测试结果
+
+### 崩溃时间线
+
+```
+[21:26:59] #1 RSS:193.6MB Ctx:0% → Read geminiChat.ts (1500 行)
+[21:27:46] #2 RSS:270.4MB Ctx:4.2% → Read agent.ts
+[21:28:32] #3 RSS:397.5MB Ctx:4.3% → grep + Read 3 个文件
+[21:29:18] #4 RSS:452.7MB Ctx:5.7% → Read slashCommandProcessor.ts
+[21:30:04] #5 RSS:515.0MB Ctx:5.9% → Read chatCompressionService.ts
+[21:30:50] #6 RSS:649.1MB Ctx:4.0% ← TOKEN COMPACTION 触发 (5.9%→4.0%)
+ RSS 反增 134MB (structuredClone 峰值)
+[21:31:36] #7 RSS:666.7MB Ctx:3.2% ← 再次 compaction, RSS 继续涨
+[21:32:22] CRASH — FATAL ERROR: Ineffective mark-compacts near heap limit
+```
+
+**总耗时**: ~5.5 分钟,7 轮任务后崩溃。
+
+这证明在受限 heap 下,长 history + compaction/history clone 可以触发 V8 heap OOM。
+但该结果不代表默认 heap 下的真实用户 OOM 已经被完整复现。
+
+### 更大 heap 的 synthetic 复现
+
+为避免只依赖 512 MiB 低 heap 结论,补充了更大 heap 的 synthetic runtime
+pressure 测试。该测试不调用模型,而是构造类似长 review/subagent 任务的历史:
+
+- root review turns: 10
+- subagent calls: 30
+- subagent transcript records: 780
+- retained tool result bytes: 193,986,560
+- serialized history bytes: 195,620,061
+- pressure mode: retained `structuredClone(history)` copies
+
+| Heap limit | Clone pressure | 结果 | 关键 GC / stack |
+| ---------- | -----------------: | ---------------------------------------- | ------------------------------------------------------------ |
+| 2 GiB | 8 retained clones | 未崩溃,RSS 2.42 GiB,heap used 1.87 GiB | 接近 heap limit |
+| 2 GiB | 10 retained clones | OOM | `Reached heap limit`, `ValueDeserializer`, `StructuredClone` |
+| 4 GiB | 20 retained clones | OOM | `Reached heap limit`, `ValueDeserializer`, `StructuredClone` |
+
+2 GiB 复现的 GC 摘要:
+
+```
+Mark-Compact 2042.9 (2081.9) -> 2042.9 (2081.1) MB
+Mark-Compact 2048.9 (2087.2) -> 2048.9 (2087.2) MB
+FATAL ERROR: Reached heap limit Allocation failed - JavaScript heap out of memory
+...
+node::worker::(anonymous namespace)::StructuredClone
+```
+
+4 GiB 复现的 GC 摘要:
+
+```
+Mark-Compact 4082.5 (4126.8) -> 4082.5 (4126.3) MB
+Mark-Compact 4095.1 (4139.0) -> 4095.1 (4139.0) MB
+FATAL ERROR: Reached heap limit Allocation failed - JavaScript heap out of memory
+...
+node::worker::(anonymous namespace)::StructuredClone
+```
+
+这组结果比 512 MiB 压力测试更接近用户报告的 2 GiB / 4 GiB heap OOM:
+只要 history 中保留足够多的大 tool result / subagent transcript,对整段 history
+做 retained 或瞬时 clone 都可以在 2-4 GiB heap 下触发 V8 OOM。它仍然是 synthetic
+复现,不等价于完整业务长任务 replay,但能直接证明问题不是“小 heap 人为制造”的。
+
+### 崩溃时 GC 状态
+
+```
+[41381:0x130008000] 342468 ms: Mark-Compact 508.6 (526.7) -> 507.0 (526.9) MB,
+ pooled: 1 MB, 86.42 / 0.00 ms (average mu = 0.175, current mu = 0.150)
+ task; scavenge might not succeed
+
+[41381:0x130008000] 342568 ms: Mark-Compact 509.1 (526.9) -> 507.1 (528.2) MB,
+ pooled: 0 MB, 93.79 / 0.12 ms (average mu = 0.121, current mu = 0.068)
+ allocation failure; scavenge might not succeed
+
+FATAL ERROR: Ineffective mark-compacts near heap limit
+Allocation failed - JavaScript heap out of memory
+```
+
+Mark-Compact 只能回收 1-2 MB(几乎所有对象都是 reachable),证明内存确实被合法持有的对象占满。
+
+---
+
+## 五、默认 heap 长任务 replay
+
+为了避免低 heap 结论过度外推,补充了默认 heap 的真实 JSONL replay:
+
+- 不设置 `NODE_OPTIONS`
+- 不启用内部 runtime profiler,避免采样器自身影响 heap
+- 每个 CLI 从同一份 rewound JSONL 复制出 fresh session
+- 使用临时 `QWEN_HOME`,禁用 MCP 和 hooks,避免本地全局配置污染
+- 只用进程外采样统计 process-tree RSS
+
+| CLI | 结果 | 时长 | Tree RSS 峰值 | Root RSS 峰值 | Worker RSS 峰值 | 备注 |
+| -------------------- | ---- | -----: | ------------: | ------------: | --------------: | ----------------------------------------------------------- |
+| installed `qwen` | 成功 | 167.3s | 838.0 MiB | 230.2 MiB | 566.3 MiB | 第一次 fresh run 遇到模型服务端错误,未纳入结论;retry 成功 |
+| local rebuilt bundle | 成功 | 106.3s | 527.5 MiB | 182.1 MiB | 345.4 MiB | 包含本地 clone 热路径修复 |
+
+默认 heap replay 的结论:
+
+1. 当前这份 review JSONL 可以稳定跑出数百 MiB 到约 0.8 GiB 的 process-tree RSS,
+ 但没有复现 4G/8G OOM。
+2. 本地 rebuilt bundle 在同起点 replay 上的峰值低于 installed CLI,说明减少
+ history clone 热路径有实际收益。
+3. 这还不能证明所有用户 OOM 都已解决。真实 4G/8G OOM 仍需要更长任务、更大
+ tool-result 累积,或保留 MCP/tool schema 压力的 replay 继续验证。
+
+## 六、根因分析
+
+### OOM 的三层机制
+
+```
+┌─────────────────────────────────────────────────────────┐
+│ Layer 3: V8 Heap Limit (512MB/2GB/4GB) │ ← 用户最终撞到这里
+├─────────────────────────────────────────────────────────┤
+│ Layer 2: structuredClone() 峰值放大 (瞬时 ~2x) │ ← 直接诱因
+├─────────────────────────────────────────────────────────┤
+│ Layer 1: History 中 tool result 累积 (线性增长) │ ← 基础增长
+├─────────────────────────────────────────────────────────┤
+│ Layer 0: Token compaction 触发时机 │ ← 控制点
+└─────────────────────────────────────────────────────────┘
+```
+
+### 精确崩溃路径
+
+```
+sendMessage()
+ → tryCompress()
+ → heapPressureRatio < threshold (safety net disabled)
+ → ChatCompressionService.compress()
+ → chat.getHistory(true)
+ → structuredClone(this._history) ← 峰值分配!
+ → V8 需要额外 ~N MB 来容纳 clone
+ → 如果 existing heap + N > limit → OOM
+```
+
+### 关键证据
+
+| 观察 | 含义 |
+| --------------------------------------- | ---------------------------------------------- |
+| Task #5→#6: Context 5.9%→4.0% (降了) | Token compaction **成功执行**了 |
+| Task #5→#6: RSS 515→649 MB (涨了 134MB) | Compaction 过程的 `structuredClone` 制造了峰值 |
+| GC 只能回收 1-2 MB | 所有对象都是 live(history + clone 都在) |
+| #4309 设 8GB limit 仍崩 | history 足够大时,clone 峰值可超任何 limit |
+
+需要注意:以上证据来自低 heap 压力测试和 issue 现象的组合推断。默认 heap replay
+目前支持“clone 热路径会显著影响峰值 RSS”,但尚未单独复现 4G/8G OOM。
+
+### 为什么 128K context window 更容易触发
+
+- 128K × 70% = ~90K tokens 触发 compaction
+- 大 context window (1M) 的 70% = 700K tokens,几乎不会触发
+- **compaction 越频繁 → structuredClone 越频繁 → OOM 风险越高**
+- DeepSeek 等未配置 contextWindowSize 的模型默认 128K,更易触发
+
+---
+
+## 六.5、真实运行日志佐证
+
+以下日志提取自本地 crash session 的 debug 输出。为避免泄露本地路径和 session id,
+报告只保留时间线和关键日志内容。
+
+该 session 启动于 `2026-05-19T13:26:35Z` (本地 21:26:35),crash 于
+`2026-05-19T13:32:10Z` (本地 21:32:10)。
+
+### Heap Pressure 与 Auto-Compaction 事件时间线
+
+```
+13:29:43 [WARN] Heap pressure at 74.9%; attempting auto-compaction before token threshold.
+13:30:06 [DEBUG] [FILE_READ_CACHE] clear after auto tryCompress ← compaction #1 执行成功
+13:30:13 [WARN] Heap pressure at 70.7%; attempting auto-compaction before token threshold.
+ ← 刚压完 heap 从 74.9% 仅降到 70.7%,仍超阈值,立即再次尝试
+13:30:52 [DEBUG] Heap pressure at 86.0%; skipping heap-pressure auto-compaction during cooldown.
+ ← 30s cooldown 期间拒绝执行
+13:30:56 [WARN] Heap pressure at 85.3%; attempting auto-compaction before token threshold.
+ ← cooldown 过期,heap 已升至 85.3%
+13:31:21 [DEBUG] [FILE_READ_CACHE] clear after auto tryCompress ← compaction #2 执行成功
+13:31:37 [WARN] Heap pressure at 88.8%; attempting auto-compaction before token threshold.
+ ← 压完后 heap 反弹至 88.8%
+13:32:09 [DEBUG] Heap pressure at 90.2%; skipping heap-pressure auto-compaction during cooldown.
+ ← heap 已达 90.2%,cooldown 中无法执行
+13:32:10 ← 日志终止(进程 OOM crash)
+```
+
+### 日志证据解读
+
+| 日志观察 | 含义 |
+| ---------------------------------------------------------- | --------------------------------------------------------- |
+| 上述时间线中 5.5 分钟内有 **4 次** heap-pressure auto-compaction 尝试(另有 2 次因 cooldown 被跳过) | #3735 引入的 `tryCompress` 在高压时频繁触发               |
+| 每次 compaction 执行后 heap 占比仍 >70% | `structuredClone()` 制造的临时峰值抵消了压缩收益 |
+| 74.9% → 70.7% → 86% → 85.3% → 88.8% → 90.2% → crash | 正反馈循环:压缩→clone 峰值→heap 更高→再压缩→更高 |
+| 日志在 90.2% 后 1 秒内断裂 | 下一次 `getHistory(true)` 的 `structuredClone()` 瞬间超限 |
+| `[FILE_READ_CACHE] clear after auto tryCompress` 出现 2 次 | 证实 compaction 确实走了完整的 compress → setHistory 路径 |
+
+### 正反馈死循环机制
+
+```
+heap 占比高 (>70%)
+ → 触发 heap-pressure auto-compaction
+ → tryCompress() 内部调用 getHistory(true)
+ → structuredClone(this._history) ← 瞬时 heap 峰值 +30~40%
+ → compaction 成功,释放旧 history
+ → 但 clone 峰值已经把 heap 推高到更危险的水位
+ → 下一轮 send 继续累积
+ → heap 占比更高 → 更频繁触发 → crash
+```
+
+---
+
+## 六.6、版本归因:为什么 0.15.7 ~ 0.15.11 期间 OOM 报告增多
+
+### 关键 commit 时间线
+
+| 版本 | PR | 改动 | 对 `structuredClone` 调用频率的影响 |
+| ------------ | ---------------------------------------------------- | ----------------------------------------------------------------------------------- | ----------------------------------- |
+| **v0.15.6** | — | `getHistory(true)` 仅在 `sendMessage` 入口调用 1 次 | 基线:每次 send 1 次 clone |
+| **v0.15.7** | **#3735** `auto-compact subagent context` | 将 `tryCompress()` 下沉到 `GeminiChat`,**每次 send 前**先执行一次 compaction 检查 | **+1 次**:send 前 compress 检查 |
+| **v0.15.10** | **#3879** `reactive compression on context overflow` | 当 provider 返回 context overflow 时,再次触发 `tryCompress()` + `getHistory(true)` | **+1~2 次**:overflow retry 路径 |
+| **v0.15.10** | **#3985** `harden reactive compression` | 强化 reactive compression 重试逻辑 | 同上 |
+
+### v0.15.6 vs v0.15.11 的 `getHistory(true)` 调用点对比
+
+**v0.15.6** (2 处):
+
+```
+L367: const requestContents = this.getHistory(true); ← send 构造 request
+L618: const recoveryContents = self.getHistory(true); ← MAX_TOKENS escalation (极少触发)
+```
+
+**v0.15.11** (5 处):
+
+```
+L467: ChatCompressionService.compress() 内部调用 ← #3735: 每次 send 前的 auto-compact
+L574: requestContents = this.getHistory(true); ← send 构造 request
+L724: reactive tryCompress() 内部调用 ← #3879: context overflow 后 retry
+L739: requestContents = self.getHistory(true); ← #3879: retry 构造新 request
+L943: const recoveryContents = self.getHistory(true); ← MAX_TOKENS escalation
+```
+
+### 最坏路径:一次 send 可触发 4 次 `structuredClone`
+
+```
+sendMessage()
+ → tryCompress() ← #3735: getHistory(true) [clone #1]
+ → getHistory(true) ← 构造 request [clone #2]
+ → API 返回 context overflow
+ → reactive tryCompress() ← #3879: getHistory(true) [clone #3]
+ → getHistory(true) ← retry request [clone #4]
+```
+
+### 结论
+
+**#3735 (v0.15.7)** 是 OOM 频率显著上升的根本原因——它使每次 `sendMessage` 都会先跑一次
+`tryCompress()`,而 `tryCompress` 内部通过 `ChatCompressionService.compress()` →
+`chat.getHistory(true)` 做全量 `structuredClone`。在 history 较大时,这个 “先 clone 再判断
+是否需要压缩” 的设计让内存峰值从 ~1.3x 升至 ~2x+。
+
+**#3879 (v0.15.10)** 进一步恶化了问题——在已经处于 heap 边界时 (provider 返回 context overflow)
+再触发一次全量 clone,使原本就危险的 session 更容易 crash。
+
+---
+
+## 七、#4186 修复效果验证(对比测试)
+
+启用 heap-pressure safety net (HEAP_PRESSURE_COMPRESSION_RATIO = 0.7) 后的对比测试:
+
+| 指标 | 禁用 safety net | 启用 safety net |
+| --------------- | ------------------ | ------------------------- |
+| OOM 发生 | 是(7 轮后 crash) | 否(持续运行 >10 分钟) |
+| RSS 峰值 | 666 MB → crash | 555 MB → GC 回收到 280 MB |
+| Compaction 触发 | 仅 token threshold | heap 70% 时提前触发 |
+| Context 行为 | 5.9%→4.0%→crash | 22.7%→17.0%(安全回落) |
+
+**结论**: #4186 的 heap-pressure safety net 有效防止了 OOM,但它是一个**缓解**而非根治:
+
+- 如果 history 本身已经占了 heap 的 60%+,即使提前 compact,clone 的峰值仍然可能超限
+- 这解释了为什么 #4309 用户设了 8GB limit 后仍然 crash
+
+---
+
+## 八、内存占用分布
+
+基于测试中的 RSS 增长模式估算:
+
+| 内存位置 | 占比 | 增长特征 |
+| -------------------------------- | ------ | --------------------------- |
+| `this._history[]` (tool results) | 40-50% | 线性累积,每轮 +30-100MB |
+| `structuredClone()` 临时拷贝 | 30-40% | 瞬时峰值,compaction 时出现 |
+| V8 runtime (GC metadata, code) | ~15% | 基本恒定 |
+| UI/logging/stream buffers | ~5% | 缓慢增长 |
+
+---
+
+## 九、复现脚本与环境
+
+### 自动化驱动脚本
+
+```bash
+#!/bin/bash
+# /tmp/oom-simple-driver.sh
+SESSION="$1"
+
+TASKS=(
+ "用 Read 工具完整读取 packages/core/src/core/geminiChat.ts"
+ "用 Read 工具完整读取 packages/core/src/tools/agent/agent.ts"
+ "用 grep -rn structuredClone packages/core/src 然后 Read 前 3 个文件"
+ "用 Read 完整读取 packages/cli/src/ui/hooks/slashCommandProcessor.ts"
+ "用 Read 完整读取 packages/core/src/services/chatCompressionService.ts"
+ "用 find packages/cli/src/ui/commands -name '*.ts' 然后逐一 Read"
+ "用 Read 完整读取 packages/core/src/core/turn.ts"
+ # ... 更多任务
+)
+
+i=0
+while true; do
+ TASK="${TASKS[$((i % ${#TASKS[@]}))]}"
+ i=$((i + 1))
+
+ QWEN_PID=$(ps aux | grep "dist/index.js" | grep -v grep | awk '{print $2}' | sort -rn | head -1)
+ RSS=$(ps -o rss= -p $QWEN_PID 2>/dev/null)
+ [ -z "$RSS" ] && { echo "CRASH after $((i-1)) tasks!"; exit 0; }
+
+ RSS_MB=$(echo "scale=1; $RSS/1024" | bc)
+ CTX=$(tmux capture-pane -t "$SESSION:1" -p 2>/dev/null | grep -oE "[0-9]+\.[0-9]+% 已用" | tail -1)
+ echo "[$(date +%H:%M:%S)] #$i RSS:${RSS_MB}MB Ctx:$CTX | ${TASK:0:55}"
+
+ tmux send-keys -t "$SESSION:1" C-u
+ sleep 0.2
+ tmux send-keys -t "$SESSION:1" "$TASK" Enter
+ sleep 0.5
+ tmux send-keys -t "$SESSION:1" Enter
+ sleep 45
+done
+```
+
+### 启动命令
+
+```bash
+# 1. 禁用 heap-pressure safety net
+# geminiChat.ts: HEAP_PRESSURE_COMPRESSION_RATIO = 99.0
+
+# 2. Build
+npm run build --workspace=packages/core && npm run build --workspace=packages/cli
+
+# 3. 启动 qwen (128K context model, 512MB heap)
+SESSION="oom-test"
+tmux new-session -d -s "$SESSION" -c "$REPO_DIR"
+tmux send-keys -t "$SESSION" \
+ "NODE_OPTIONS='--max-old-space-size=512' node packages/cli/dist/index.js --model 'qwen3.6-plus'" Enter
+
+# 4. 等待启动后运行驱动
+sleep 10
+bash /tmp/oom-simple-driver.sh "$SESSION"
+```
+
+---
+
+## 十、后续建议
+
+### 短期缓解(已有)
+
+- [x] #4186: heap-pressure auto-compaction safety net (0.7 threshold)
+- [x] #4188: fileReadCache / crawlCache 上限
+
+### 中期修复(建议)
+
+- [ ] 减少 `structuredClone()` 调用 — `nextSpeakerChecker` 只需最后一条消息,不需 clone 全量
+- [ ] Compaction 使用 slice + 引用替代全量 deep clone
+- [ ] 大 tool result (>100KB) 写入临时文件,history 中只保留摘要引用
+
+### 长期方向
+
+- [ ] Tool result offload 到磁盘 + lazy load (#4184)
+- [ ] 基于 RSS 的分级压缩策略(不仅是 token count)
+- [ ] History 分段存储,避免单次全量操作
diff --git a/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md b/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md
new file mode 100644
index 0000000000..ac92ec1e43
--- /dev/null
+++ b/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md
@@ -0,0 +1,902 @@
+# Qwen Code Runtime Diagnostics Benchmark Report
+
+Date: 2026-05-19
+
+## Scope
+
+This run repeats the previous Qwen Code benchmark shapes with the new opt-in
+runtime diagnostics enabled. It only tests Qwen Code, not Claude Code.
+
+Initial model matrix:
+
+- `pai/glm-5`
+- `qwen3.6-plus`
+
+Additional PR-size follow-up:
+
+- `DeepSeek/deepseek-v4-pro` through Anthropic-compatible protocol
+
+Cases:
+
+- small GitHub PR review: PR `#4268`
+- code navigation: compression / compaction related code search and reads
+- synthetic local diff: about 94.6 KiB
+- synthetic local diff: about 968.5 KiB
+- synthetic local diff: about 4.84 MiB
+
+The run used the local bundled CLI from the diagnostics branch, with
+`QWEN_CODE_PROFILE_RUNTIME=1` and a temporary CLI home. Global MCP servers and
+hooks were not loaded for this benchmark.
+
+Important caveat: these absolute RSS numbers are lower than the previous
+PATH-resolved `qwen` runs because this run used `node dist/cli.js` from the
+local branch plus a stripped temporary config. Treat this report as an internal
+diagnostics distribution run, not a direct replacement for the earlier installed
+CLI RSS comparison.
+
+## Installed CLI vs Local Bundle Sanity Check
+
+A follow-up sanity check used the same minimal prompt, model, and non-interactive
+mode across the installed CLI and the local diagnostics bundle. The only
+intentional variable was whether Qwen Code loaded a stripped temporary CLI home
+or the normal user config.
+
+| CLI | Config mode | Total tokens | Tree RSS peak | Root RSS peak | Process count peak | Runtime diagnostics |
+| ------------------- | --------------- | -----------: | ------------: | ------------: | -----------------: | ------------------- |
+| PATH `qwen` | stripped config | 33,965 | 542.4 MiB | 249.9 MiB | 3 | no |
+| local `dist/cli.js` | stripped config | 47,281 | 455.2 MiB | 214.2 MiB | 4 | yes |
+| PATH `qwen` | normal config | 97,615 | 1,099.9 MiB | 250.1 MiB | 6 | no |
+| local `dist/cli.js` | normal config | 97,954 | 1,105.4 MiB | 212.7 MiB | 8 | yes |
+
+This check changes the attribution: the earlier 1 GiB user-visible peak is
+reproducible with the normal config even on the local diagnostics bundle. The
+lower stripped-config RSS is therefore not primarily explained by the local branch including PR `#4186`.
+
+At the normal-config peak, the local process-tree sample was dominated by
+multiple Node/MCP processes rather than the Qwen root process alone:
+
+| Role | Command shape | RSS at tree peak |
+| ----- | ------------------------- | ---------------: |
+| child | Node process | 252.9 MiB |
+| child | Chrome DevTools MCP | 219.7 MiB |
+| child | Node process | 219.2 MiB |
+| root | Qwen Node process | 215.1 MiB |
+| child | Chrome DevTools MCP setup | 175.2 MiB |
+
+PR `#4186` is present in the local diagnostics branch, but it is a V8 heap
+pressure auto-compaction safety net. It triggers at about 70% V8 heap pressure;
+on this environment the Node heap limit is about 4.1 GiB, while the stripped
+benchmark end heap was about 99-143 MiB. Based on these numbers, the lower
+stripped-config RSS is not caused by `#4186` actively compressing context during
+these benchmark runs.
+
+### Bare Mode Config Attribution Check
+
+A second follow-up used `qwen3.6-plus` with the same PR-review prompt shape on
+both the installed CLI and the local bundle. This is not a normal end-to-end
+business benchmark. It is a controlled attribution check for startup/config
+memory only.
+
+`--bare` changes the runtime inputs: it skips normal global settings discovery,
+MCP startup, hooks, implicit context, skills, and other startup integrations. It
+can therefore fail or behave differently when a model provider is configured
+only in global settings. For this run, model credentials were supplied only
+through the child-process environment because bare mode intentionally does not
+load the normal provider settings. Nothing was written back to the user's global
+config.
+
+This run did not produce useful token/tool-call statistics: the model completed
+in one turn and did not call the requested shell command. Do not use these rows
+as normal task benchmark results, and do not compare their token/tool-call
+behavior with the matrix above. They are only useful for estimating how much
+process-tree RSS comes from normal config and configured child processes.
+
+| CLI | Mode | Wall | Turns | Tool uses | Tree RSS peak | Root RSS peak | Process count peak |
+| ------------------- | -------- | ---: | ----: | --------: | ------------: | ------------: | -----------------: |
+| PATH `qwen` | normal | 5.5s | 1 | 0 | 1,021.3 MiB | 251.5 MiB | 5 |
+| PATH `qwen` | `--bare` | 2.4s | 1 | 0 | 525.7 MiB | 246.4 MiB | 2 |
+| local `dist/cli.js` | normal | 4.9s | 1 | 0 | 1,046.2 MiB | 213.3 MiB | 5 |
+| local `dist/cli.js` | `--bare` | 2.3s | 1 | 0 | 454.3 MiB | 216.5 MiB | 3 |
+
+The result confirms the process-tree hypothesis for startup/config attribution.
+On this machine, normal config adds roughly 0.50-0.59 GiB of user-visible
+process-tree RSS over `--bare`, while root RSS stays in the same 0.21-0.25 GiB
+band. At the normal-config peak, the extra RSS again came from additional
+Node/MCP child processes, including a Chrome DevTools MCP process and its setup
+wrapper. `--bare` removes those startup/config children and brings
+installed/local runs back into the 0.45-0.53 GiB tree-RSS range.
+
+### Temporary Settings MCP / Hooks Isolation
+
+Because `--bare` changes too many runtime inputs to be treated as a normal
+benchmark, a follow-up used temporary `QWEN_HOME` directories with generated
+settings files derived from the normal settings. The run stayed on the normal
+settings-loading path, but toggled only two config dimensions:
+
+- MCP disabled: `mcpServers` cleared and MCP allow/exclude lists emptied.
+- Hooks disabled: `disableAllHooks` set to true.
+
+No global settings were modified. The case used `qwen3.6-plus` and a minimal
+startup prompt, so it measures startup/config process-tree cost, not task
+reasoning quality.
+
+| CLI | Temporary config | MCP servers | Tools | Tree RSS peak | Root RSS peak | Process count peak |
+| ------------------- | -------------------- | ----------: | ----: | ------------: | ------------: | -----------------: |
+| PATH `qwen` | full | 4 | 46 | 1,017.4 MiB | 249.8 MiB | 5 |
+| PATH `qwen` | MCP disabled | 0 | 17 | 548.7 MiB | 252.4 MiB | 2 |
+| PATH `qwen` | hooks disabled | 4 | 46 | 1,003.8 MiB | 246.4 MiB | 5 |
+| PATH `qwen` | MCP + hooks disabled | 0 | 17 | 542.5 MiB | 248.0 MiB | 2 |
+| local `dist/cli.js` | full | 4 | 48 | 865.9 MiB | 220.4 MiB | 6 |
+| local `dist/cli.js` | MCP disabled | 0 | 19 | 442.9 MiB | 209.6 MiB | 2 |
+| local `dist/cli.js` | hooks disabled | 4 | 48 | 848.3 MiB | 212.6 MiB | 5 |
+| local `dist/cli.js` | MCP + hooks disabled | 0 | 19 | 447.2 MiB | 217.8 MiB | 2 |
+
+Interpretation:
+
+1. Disabling MCP is the dominant change. It removes 4 MCP servers, reduces the
+ advertised tool count by about 29 tools, and lowers process-tree RSS by about
+ 0.42-0.47 GiB in this startup/config case.
+2. Disabling hooks alone barely changes RSS in this case. That is expected
+ because the prompt did not produce tool calls, so `PreToolUse` /
+ `PostToolUse` hooks were not executed.
+3. The root process stays around 0.21-0.25 GiB across all rows. The large
+ difference is again process-tree composition, not root Qwen RSS.
+
+Two attempted code-navigation follow-ups with `qwen3.6-plus` and `pai/glm-5`
+also reproduced the same MCP-vs-no-MCP memory split, but neither model produced
+tool calls in those runs. Those rows are therefore not used as hooks execution
+evidence. A valid hooks benchmark still needs a task/model combination that
+reliably emits tool calls.
+
+### Per-MCP Isolation
+
+The previous section showed that MCP as a group is the dominant startup/config
+memory factor. A follow-up isolated each configured MCP server while keeping
+hooks disabled for all rows. This keeps the test on the normal settings-loading
+path but changes only the MCP server subset.
+
+Configured MCP server names:
+
+- `approval-bridge`
+- `env-center`
+- `chrome-devtools`
+- `code`
+
+Single-pass isolation:
+
+| Variant | Enabled MCPs | Tools | MCP servers | Tree RSS peak | Root RSS peak | Interpretation |
+| ------------------------- | -------------------------------------------- | ----: | ----------: | ------------: | ------------: | ------------------------------------ |
+| none | none | 19 | 0 | 444.4 MiB | 211.7 MiB | baseline without MCP |
+| full | all 4 | 48 | 4 | 857.3 MiB | 215.9 MiB | full MCP startup shape |
+| only `approval-bridge` | `approval-bridge` | 19 | 1 | 455.5 MiB | 214.0 MiB | near baseline |
+| only `env-center` | `env-center` | 19 | 1 | 452.3 MiB | 214.4 MiB | near baseline |
+| only `chrome-devtools` | `chrome-devtools` | 48 | 1 | 824.4 MiB | 209.5 MiB | large RSS increase and tool increase |
+| only `code` | `code` | 19 | 1 | 452.1 MiB | 216.6 MiB | near baseline |
+| without `approval-bridge` | `env-center`, `chrome-devtools`, `code` | 48 | 3 | 997.1 MiB | 215.4 MiB | still high; run showed variance |
+| without `env-center` | `approval-bridge`, `chrome-devtools`, `code` | 48 | 3 | 863.8 MiB | 220.9 MiB | still high |
+| without `chrome-devtools` | `approval-bridge`, `env-center`, `code` | 19 | 3 | 463.4 MiB | 221.6 MiB | returns near baseline |
+| without `code`            | `approval-bridge`, `env-center`, `chrome-devtools` |    48 |           3 |     858.1 MiB |     219.5 MiB | still high                           |
+
+Because startup RSS has some variance, the key variants were repeated twice:
+
+| Variant | Samples | Tree RSS range | Avg tree RSS | Result |
+| ------------------------- | ------: | ------------------- | -----------: | ------------------------------ |
+| none | 2 | 443.3-451.9 MiB | 447.6 MiB | stable no-MCP baseline |
+| full | 2 | 856.1-922.8 MiB | 889.5 MiB | stable high-MCP range |
+| only `chrome-devtools` | 2 | 1,007.1-1,021.2 MiB | 1,014.2 MiB | enough alone to reproduce high |
+| without `chrome-devtools` | 2 | 461.1-461.6 MiB | 461.4 MiB | removes the high RSS |
+| only `approval-bridge` | 2 | 449.1-449.9 MiB | 449.5 MiB | near baseline |
+| only `env-center` | 2 | 438.7-449.5 MiB | 444.1 MiB | near baseline |
+| only `code` | 2 | 450.6-451.3 MiB | 451.0 MiB | near baseline |
+
+Interpretation:
+
+1. `chrome-devtools` is the dominant MCP contributor in this environment. It is
+ sufficient by itself to reproduce the high process-tree RSS.
+2. Removing `chrome-devtools` from the full MCP set returns RSS to the no-MCP
+ band. Removing other MCPs while keeping `chrome-devtools` does not.
+3. The advertised tool count follows the same pattern: baseline is 19 tools,
+ while `chrome-devtools` raises the tool count to 48. That means this MCP is
+ also likely to increase request tool schema size and token pressure, not just
+ process-tree RSS.
+4. `approval-bridge`, `env-center`, and `code` individually stay near the
+ no-MCP baseline in these startup/config runs. They emitted startup warnings
+ in this environment, so this result should be interpreted as "no persistent
+ startup RSS owner observed" rather than proof that they have zero cost in all
+ workflows.
+
+## Runtime Summary
+
+| Case | Model | Wall | Turns | Total tokens | Tree RSS peak | Root RSS peak | End heap | End RSS |
+| ---------------- | -------------- | ----: | ----: | -----------: | ------------: | ------------: | --------: | --------: |
+| small PR `#4268` | `pai/glm-5` | 20.1s | 7 | 173,216 | 362.1 MiB | 359.8 MiB | 103.1 MiB | 216.5 MiB |
+| code navigation | `pai/glm-5` | 18.4s | 2 | 49,127 | 378.0 MiB | 376.0 MiB | 102.4 MiB | 313.4 MiB |
+| diff 94.6 KiB | `pai/glm-5` | 16.6s | 6 | 135,716 | 367.9 MiB | 366.0 MiB | 99.1 MiB | 295.0 MiB |
+| diff 968.5 KiB | `pai/glm-5` | 11.4s | 2 | 42,590 | 373.2 MiB | 362.5 MiB | 106.4 MiB | 345.6 MiB |
+| diff 4.84 MiB | `pai/glm-5` | 12.0s | 4 | 95,119 | 414.2 MiB | 412.0 MiB | 123.6 MiB | 410.7 MiB |
+| small PR `#4268` | `qwen3.6-plus` | 35.0s | 6 | 156,556 | 358.9 MiB | 356.9 MiB | 102.6 MiB | 293.1 MiB |
+| code navigation | `qwen3.6-plus` | 28.9s | 4 | 99,800 | 370.3 MiB | 368.3 MiB | 105.8 MiB | 298.2 MiB |
+| diff 94.6 KiB | `qwen3.6-plus` | 28.3s | 4 | 90,808 | 358.8 MiB | 356.9 MiB | 105.9 MiB | 307.0 MiB |
+| diff 968.5 KiB | `qwen3.6-plus` | 30.9s | 6 | 151,782 | 366.1 MiB | 364.1 MiB | 101.0 MiB | 316.9 MiB |
+| diff 4.84 MiB | `qwen3.6-plus` | 24.1s | 4 | 93,271 | 372.8 MiB | 366.0 MiB | 142.8 MiB | 366.0 MiB |
+
+Average by model:
+
+| Model | Avg tree RSS peak | Avg root RSS peak | Avg turns | Avg total tokens | Avg max wire body | Avg total tool result |
+| -------------- | ----------------: | ----------------: | --------: | ---------------: | ----------------: | --------------------: |
+| `pai/glm-5` | 379.1 MiB | 375.3 MiB | 4.2 | 99,154 | 111.8 KiB | 335.1 KiB |
+| `qwen3.6-plus` | 365.4 MiB | 362.4 MiB | 4.8 | 118,443 | 119.3 KiB | 344.3 KiB |
+
+Overlapping small PR `#4268` model snapshot:
+
+| Model | Protocol | Wall | Turns | Total tokens | Tree RSS peak | Root RSS peak | Max wire body |
+| -------------------------- | --------- | ----: | ----: | -----------: | ------------: | ------------: | ------------: |
+| `pai/glm-5` | OpenAI | 20.1s | 7 | 173,216 | 362.1 MiB | 359.8 MiB | 113.8 KiB |
+| `qwen3.6-plus` | OpenAI | 35.0s | 6 | 156,556 | 358.9 MiB | 356.9 MiB | 134.1 KiB |
+| `DeepSeek/deepseek-v4-pro` | Anthropic | 39.7s | 2 | 43,362 | 346.9 MiB | 344.8 MiB | 103.0 KiB |
+
+## Request And Tool Diagnostics
+
+| Case | Model | Requests | Max wire body | Max system prompt | Max tool schema | Tool calls | Total tool result | Max tool result | Max function response in request |
+| ---------------- | -------------- | -------: | ------------: | ----------------: | --------------: | ---------: | ----------------: | --------------: | -------------------------------: |
+| small PR `#4268` | `pai/glm-5` | 7 | 113.8 KiB | 51.4 KiB | 40.2 KiB | 9 | 4.7 KiB | 3.9 KiB | 15.3 KiB |
+| code navigation | `pai/glm-5` | 2 | 114.6 KiB | 51.5 KiB | 40.2 KiB | 3 | 17.5 KiB | 6.2 KiB | 18.4 KiB |
+| diff 94.6 KiB | `pai/glm-5` | 6 | 111.2 KiB | 39.1 KiB | 37.2 KiB | 9 | 94.9 KiB | 92.6 KiB | 29.2 KiB |
+| diff 968.5 KiB | `pai/glm-5` | 2 | 104.8 KiB | 39.1 KiB | 37.2 KiB | 2 | 772.1 KiB | 771.9 KiB | 25.6 KiB |
+| diff 4.84 MiB | `pai/glm-5` | 4 | 114.7 KiB | 39.1 KiB | 37.2 KiB | 4 | 786.3 KiB | 783.2 KiB | 34.7 KiB |
+| small PR `#4268` | `qwen3.6-plus` | 6 | 134.1 KiB | 51.4 KiB | 40.2 KiB | 5 | 34.6 KiB | 15.6 KiB | 36.6 KiB |
+| code navigation | `qwen3.6-plus` | 4 | 114.9 KiB | 51.5 KiB | 40.2 KiB | 3 | 17.5 KiB | 6.2 KiB | 18.4 KiB |
+| diff 94.6 KiB | `qwen3.6-plus` | 4 | 112.8 KiB | 39.1 KiB | 37.2 KiB | 3 | 92.9 KiB | 92.6 KiB | 33.0 KiB |
+| diff 968.5 KiB | `qwen3.6-plus` | 6 | 113.1 KiB | 39.1 KiB | 37.2 KiB | 5 | 778.0 KiB | 771.9 KiB | 32.1 KiB |
+| diff 4.84 MiB | `qwen3.6-plus` | 4 | 121.5 KiB | 39.1 KiB | 37.2 KiB | 4 | 798.5 KiB | 783.2 KiB | 41.3 KiB |
+
+## Observations
+
+1. Process-tree RSS is almost the same as root RSS in this local bundle run.
+ The root/tree gap is usually below 10 MiB. That means these runs did not
+ show a persistent child-process memory owner. The dominant process is the
+ main Node process.
+2. The local bundle run peaks around 0.36-0.41 GiB, not the earlier
+ 0.85-1.06 GiB, because the matrix used a stripped temporary config. A
+ follow-up normal-config sanity check reproduced about 1.1 GiB tree RSS on
+ both PATH `qwen` and local `dist/cli.js`, with the extra memory coming from
+ child MCP/Node processes in the process tree.
+3. V8 heap is much smaller than RSS. End heap is about 99-143 MiB while end RSS
+ is about 216-411 MiB. The remaining footprint is likely loaded modules,
+ native allocations, external buffers, or runtime overhead outside live JS
+ heap.
+4. Static request overhead is large and repeated. The system prompt is about
+ 39-51 KiB per request, and tool schema is about 37-40 KiB per request. This
+ explains why even small tasks can produce high accumulated token counts when
+ the model takes several turns.
+5. Large diff output is capped before it reaches the model request. The 968 KiB
+ and 4.84 MiB diff cases produced around 772-799 KiB of captured tool result,
+ but the largest model-facing function response in a request stayed around
+ 25-41 KiB, and max wire body stayed around 105-122 KiB. This points to
+ truncation / saved-output handling working on the model-facing path.
+6. Memory still increases on large-output cases even though wire body remains
+ bounded. For example, the 4.84 MiB GLM run reached 414.2 MiB tree RSS and
+ 410.7 MiB end RSS, and the 4.84 MiB qwen3.6-plus run ended with 142.8 MiB
+ heap. That suggests large tool output can still affect local capture,
+ normalization, or retained runtime state even when the final request payload
+ is capped.
+7. Model choice changed turns and token totals more than RSS in this run.
+ `qwen3.6-plus` averaged more tokens and turns than `pai/glm-5`, but its
+ average tree RSS peak was slightly lower. This supports the earlier
+ conclusion that model choice is not the main explanation for process memory.
+
+## Updated Working Inference
+
+The new diagnostics make the earlier hypothesis more precise:
+
+- The installed-CLI user-visible 1 GiB peak is now reproducible with the normal
+ config on the local diagnostics bundle. The stripped run should be used for
+ internal Qwen runtime attribution; the normal-config run should be used for
+ user-visible process-tree attribution.
+- The largest observed difference between stripped and normal config is
+ process-tree shape: normal config starts additional MCP/Node child processes.
+ Those children explain most of the absolute jump from about 0.35-0.55 GiB to
+ about 1.1 GiB in the minimal prompt sanity check.
+- The `--bare` follow-up confirms the same direction on `qwen3.6-plus`: normal
+ config costs about 0.50-0.59 GiB more process-tree RSS than bare mode for the
+ same prompt shape, while root RSS changes only slightly.
+- The temporary-settings isolation is a better attribution test than `--bare`:
+ disabling MCP alone reduces process-tree RSS by about 0.42-0.47 GiB while
+ keeping the normal settings-loading path. Disabling hooks alone does not show
+ a meaningful RSS change in no-tool-call cases.
+- Per-MCP isolation points to `chrome-devtools` as the dominant MCP contributor:
+ it is enough by itself to reproduce the high RSS band, and removing it returns
+ the run near the no-MCP baseline.
+- Within the local Qwen runtime, the most suspicious areas are no longer "raw
+ diff bytes sent to the model". The model-facing request body is bounded.
+- The stronger suspects are static per-request context cost, repeated request
+ rounds, tool schema size, and local retention/capture of large tool outputs
+ before or outside model-facing truncation.
+- Because RSS remains much higher than V8 heap, the next profiling layer should
+ include module/startup accounting, external memory, and heap snapshots around
+ tool execution and final response emission.
+
+## RSS Attribution From Current Diagnostics
+
+The current counters do not identify an exact retained object or source file,
+but they do narrow what is and is not driving RSS in these local runs:
+
+| Signal | Current evidence | RSS implication |
+| ---------------------------- | --------------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------- |
+| Root RSS vs process-tree RSS | Root and tree peaks are usually within about 2-10 MiB; DeepSeek large PR is the widest gap at about 23.6 MiB | No persistent child process explains the RSS in this local bundle run; the main Node process dominates |
+| Normal config process tree | Minimal-prompt normal-config runs reach about 1.1 GiB tree RSS while root RSS stays about 213-250 MiB | User-visible 1 GiB peaks can be dominated by MCP/Node child processes rather than Qwen root RSS alone |
+| `--bare` comparison | `qwen3.6-plus` normal runs peak around 1.02-1.05 GiB tree RSS; bare runs peak around 0.45-0.53 GiB | Loading normal config adds about 0.50-0.59 GiB process-tree RSS in this environment |
+| Temporary MCP isolation | Clearing MCP servers drops startup/config tree RSS from 865-1,017 MiB to 443-549 MiB | MCP startup and MCP child processes explain about 0.42-0.47 GiB of process-tree RSS in the controlled config check |
+| Per-MCP isolation | `chrome-devtools` alone reaches about 1.0 GiB in repeated samples; without it the run stays around 461 MiB | `chrome-devtools` is the dominant MCP process-tree RSS contributor in this environment |
+| Temporary hooks isolation | `disableAllHooks=true` with MCP still enabled changes tree RSS by only about 13-18 MiB in no-tool-call cases | Hook config alone is not a visible startup RSS driver here; hook execution still needs a tool-call benchmark |
+| V8 heap vs RSS | End heap is about 99-143 MiB while end RSS is about 216-411 MiB | Live JS heap is not the whole footprint; loaded modules, native allocations, external buffers, or runtime overhead are likely significant |
+| PR/diff size vs RSS | DeepSeek small/medium/large PRs scale from 1 to 4,750 changed lines, but tree RSS stays in a narrow 340.7-360.0 MiB band | Raw PR size is not linearly driving RSS once tool output is bounded |
+| Tool output size | Large diff runs capture about 772-799 KiB tool results and show some higher end RSS / heap, but RSS does not scale linearly | Tool result capture/normalization contributes pressure, especially large-output cases, but is unlikely to be the only RSS driver |
+| Request body size | Max model-facing body ranges from about 103-289 KiB while RSS stays near the same band | Request serialization size affects tokens and latency more clearly than RSS peak |
+| Static per-request context | System prompt is about 39-51 KiB and tool schema about 37-48 KiB per request | Repeated rounds are a token/cost amplifier; this alone does not explain RSS but is a likely optimization target for token pressure |
+
+Working attribution: in the stripped local bundle benchmark, the RSS floor looks
+mostly like task-time runtime/module/native footprint, with large tool output
+adding incremental pressure. In the normal-config run, the user-visible 1 GiB
+tree peak is mostly process-tree composition: Qwen root plus MCP/Node child
+processes. The next targeted measurement should split Qwen root diagnostics
+from configured MCP server diagnostics, then add startup/module/external-memory
+checkpoints inside the Qwen root process.
+
+## Progress Snapshot
+
+Current confirmed signals:
+
+1. The user-visible 1 GiB startup/config peak is reproducible with both the
+ installed CLI and the local diagnostics bundle when the normal config is
+ loaded. It is not primarily explained by the diagnostics branch or PR `#4186`.
+2. In this environment, that 1 GiB peak is mostly process-tree composition:
+ Qwen root process plus relaunch child process plus MCP child processes.
+3. `chrome-devtools` is the dominant configured MCP contributor in the current
+ config. It is enough by itself to reproduce the high process-tree RSS band,
+ even when the prompt does not explicitly use that MCP.
+4. The no-MCP normal relaunch shape still sits around 0.45 GiB process-tree RSS.
+ A single Qwen runtime process without the relaunch parent is closer to
+ 0.22-0.24 GiB in the startup attribution check. This means the 0.45 GiB
+ baseline is not a single-process root RSS number.
+5. In stripped non-interactive task runs, model choice changes turns, token
+ totals, latency, and request sizes more clearly than RSS. RSS stayed in a
+ relatively narrow range across `pai/glm-5`, `qwen3.6-plus`, and
+ `DeepSeek/deepseek-v4-pro`.
+6. Current short-task diagnostics show model-facing tool/function responses are
+ bounded, but local tool-result capture and runtime state can still increase
+ heap/RSS on large-output cases. This keeps large-output retention on the
+ investigation path.
+
+Current gaps:
+
+1. The short-task benchmark matrix still covers only short-lived runs. A later
+ interactive long-review run did reproduce a 41.9 min failure, but it is
+ still one sample and needs repeat runs plus heap/object attribution.
+2. The current counters are enough to attribute process-tree RSS and request
+ size, but not enough to name the retained JS object graph during long
+ sessions.
+3. Startup/config RSS and long-session OOM must remain separate tracks. MCP and
+ relaunch explain a large idle/startup RSS band; they do not by themselves
+ explain V8 heap OOM after long tasks.
+4. Interactive TUI memory still needs a separate run from non-interactive mode,
+ because UI history and Ink static output are not exercised the same way.
+
+## Long-Task OOM Evidence From Issues And PRs
+
+Issue/PR evidence points to several different OOM shapes, not one single
+failure mode:
+
+| Source | Evidence summary | Hypothesis to test |
+| ---------------------------------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------- |
+| [`#4309`](https://github.com/QwenLM/qwen-code/issues/4309) | User reports 5.84 GiB memory usage / 7.02 GiB warning with YOLO mode and DeepSeek backend; increasing Node memory to 8 GiB did not remove the symptom | Long autonomous tool loops can retain enough state that simply raising old-space limit is not a root fix |
+| [`#4149`](https://github.com/QwenLM/qwen-code/issues/4149) | Multiple reports show `Ineffective mark-compacts near heap limit`, including 4 GiB and much larger heap-limit cases | A large fraction of heap is reachable application state, not immediately collectible garbage |
+| [`#4116`](https://github.com/QwenLM/qwen-code/issues/4116) | OOM occurred while context display was around 9.5%; analysis points to `structuredClone`, UI history, Ink static tree, and large context windows | Token usage can be low while JS heap pressure is high; token threshold alone is not a reliable memory guard |
+| [`#4167`](https://github.com/QwenLM/qwen-code/issues/4167) | User says the crash happened while compressing; analysis identifies compression peak memory as a distinct shape | Compression can itself create a peak when heap is already high, especially if history is cloned/stringified around the same time |
+| [`#2128`](https://github.com/QwenLM/qwen-code/issues/2128) | Report identifies unbounded UI history, retained file diffs / terminal output, string-width caches, and checkpoint serialization | Interactive TUI long sessions may retain memory outside model history and outside non-interactive benchmarks |
+| [`#2562`](https://github.com/QwenLM/qwen-code/issues/2562) | Report focuses on `GeminiChat.getHistory()` deep-cloning full history in long sessions | Full-history cloning can amplify memory peaks and should be measured separately from retained steady-state size |
+| [`#4185`](https://github.com/QwenLM/qwen-code/issues/4185) | Tracks V8 heap pressure exceeding limit before token-based compaction runs | Heap-pressure guard is necessary, but it only mitigates symptoms if retained data remains large |
+| [`#4184`](https://github.com/QwenLM/qwen-code/issues/4184) | Proposes diagnostics and offload/preview for large retained tool results | Large tool output may be bounded for model requests while still retained in local hot memory |
+| [`#4186`](https://github.com/QwenLM/qwen-code/pull/4186) | Merged heap-pressure auto-compaction safety net and O(1) last-history access for `nextSpeakerChecker` | Covers part of heap-pressure and clone amplification, but does not claim to solve all OOM classes |
+| [`#4127`](https://github.com/QwenLM/qwen-code/pull/4127), [`#4168`](https://github.com/QwenLM/qwen-code/pull/4168) | Open compaction-threshold PRs; one uses fixed heap thresholds, the other redesigns token thresholds and compression behavior | Useful related work, but long-task testing must verify whether heap, token, and compression signals line up in real runs |
+| [`#3000`](https://github.com/QwenLM/qwen-code/issues/3000), [`#4183`](https://github.com/QwenLM/qwen-code/issues/4183) | Diagnostic roadmap calls out `/doctor memory`, heap snapshot, and bounded memory timeline | Snapshot/timeline support is needed to move from RSS attribution to retained-object attribution |
+
+Initial interpretation:
+
+- Unused configured MCP can consume memory because normal startup connects to
+ configured MCP servers and advertises their tools before the task needs them.
+ In the measured config, `chrome-devtools` starts extra Node/npm MCP processes
+ and also increases the tool schema count from 19 to 48. This explains a large
+ startup/config RSS band and can also increase repeated request overhead.
+- The long-session OOM reports are a different layer. GC logs where
+ Mark-Compact frees very little memory suggest the heap is full of reachable
+ state. The strongest candidates are retained history/tool/UI objects,
+ full-history clones, compression intermediates, and streaming/logging
+ accumulators.
+- PR `#4186` is a useful mitigation because it can compact based on heap
+ pressure before token thresholds trigger, and it removes one unnecessary
+ full-history clone. It should not be treated as proof that large tool-output
+ retention, UI history retention, or compression peak memory is already solved.
+
+## Long-Task Validation Plan
+
+The next benchmark should keep two tracks separate:
+
+1. Startup/config attribution: normal config vs MCP-disabled vs
+ `chrome-devtools`-only vs no-relaunch attribution. This explains what users
+ see before meaningful work begins.
+2. Long-task runtime growth: repeated tool calls, large outputs, compression,
+ resume, and interactive UI history. This explains OOM after real work.
+
+Recommended long-task cases:
+
+| Case | Shape | Why it matters |
+| ----------------------------- | ---------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------- |
+| Long PR review loop | Repeat medium/large PR review prompts for 30, 60, and 120 minutes, with fixed model and fixed config | Closest to reported agent workflows; captures turns, tool calls, token growth, and RSS/heap trend |
+| Large tool-output retention | Repeatedly produce bounded 1 MiB / 5 MiB / 20 MiB command outputs, then ask follow-up questions | Tests whether raw output is retained locally after model-facing truncation |
+| Compression pressure | Use a lower controlled old-space limit and large-context prompts to trigger heap-pressure compaction | Verifies PR `#4186` triggers before OOM and whether compression itself creates a new peak |
+| Interactive TUI history | Run the same long loop in tmux TUI mode and compare with non-interactive mode | Isolates UI history, Ink static output, rendered diffs, and terminal-output display retention |
+| Resume stress | Resume a large saved session and immediately continue work | Targets `/resume` OOM reports and session reconstruction cost |
+| Streaming/logging accumulator | Force long streamed responses with telemetry/logging enabled vs disabled | Tests the suspected `collected responses` / logging-retention path from issue analysis |
+| MCP idle vs MCP active | Run no-MCP, `chrome-devtools` configured-but-unused, and `chrome-devtools` actively used variants | Separates idle MCP child RSS from actual MCP tool execution and tool schema/token overhead |
+
+Metrics that should be recorded per turn or per sampling interval:
+
+- Root RSS current/peak and process-tree RSS current/peak.
+- Child process count and top child command shapes.
+- V8 `heapUsed`, `heapTotal`, `heap_size_limit`, `external`, and
+ `arrayBuffers`.
+- Turn count, request count, tool-call count, and tool-call rounds.
+- Input/output/cache/total tokens by request and by whole task.
+- Request body bytes, system prompt bytes, tool schema bytes, and function
+ response bytes.
+- Tool-result count, total captured tool-result bytes, max tool-result bytes,
+ and retained tool-result bytes if available.
+- Conversation history message count and approximate history byte size.
+- Interactive-only UI history item count and approximate retained display size.
+- Compression attempts, compression trigger reason, tokens before/after, heap
+ pressure before/after, and compression failure status.
+- Heap snapshot or bounded memory timeline artifacts when heap pressure crosses
+ a configured threshold.
+
+Validation criteria:
+
+1. Repeat at least the key long-task cases twice. Startup RSS has visible
+ variance, so single-run conclusions should be avoided.
+2. Report root RSS and process-tree RSS separately. User-facing memory pressure
+ can come from child processes, while V8 OOM comes from the Qwen root heap.
+3. Treat a flat RSS line as important evidence. If tokens and tool calls grow
+ but heap/RSS stays flat, retained per-turn state is unlikely to be the
+ memory driver and the cause should be sought elsewhere.
+4. When RSS or heap grows, correlate the growth with a specific signal:
+ tool-result bytes, history bytes, UI history count, compression event,
+ streaming accumulator size, or MCP process start.
+5. If a heap snapshot is taken, write a structured diagnostics JSON first, then
+ the snapshot. Heap snapshots may be large and can contain sensitive strings,
+ so they should remain opt-in and local.
+
+## Interactive Long-Review Reproduction
+
+After the short non-interactive prompts kept finishing before the target window,
+an interactive TUI benchmark was run with remote input. The CLI process stayed
+alive in one session while a controller submitted one real PR-review turn at a
+time. The next turn was only submitted after the assistant emitted that turn's
+completion marker. This avoids treating a short one-shot prompt as a long-task
+reproduction.
+
+Setup:
+
+- Installed Qwen Code `0.15.11`, model `qwen-latest-series-invite-beta-v28`.
+- Temporary CLI home derived from the normal settings, with MCP and hook config
+ removed. No global config was modified.
+- Interactive TUI mode with dual JSON event output and remote JSONL input.
+- Static PR review only. The prompt disallowed dependency install, build, test,
+ Playwright, Docker, and other long external build commands.
+- External RSS samplers recorded both process-tree RSS and the Qwen Node root
+ RSS every 5 seconds.
+
+Outcome:
+
+| Signal | Value |
+| ----------------------------- | ----------: |
+| Wall time before exit | 41.9 min |
+| Exit status | 1 |
+| Completed PR-review turns | 6 |
+| Main chat records | 1,076 |
+| API response telemetry | 335 |
+| Tool-call telemetry | 607 |
+| MCP tool-call telemetry | 0 |
+| Main/root API responses | 36 |
+| Subagent API responses | 299 |
+| Root total tokens | 2.08M |
+| Subagent total tokens | 17.24M |
+| Total API telemetry tokens | 19.32M |
+| Max root input tokens | 85,655 |
+| Max subagent input tokens | 215,207 |
+| `/usr/bin/time -l` max RSS | 1,072.4 MiB |
+| Sampled Qwen root RSS peak | 1,028.2 MiB |
+| Sampled process-tree RSS peak | 1,038.1 MiB |
+
+The process exited with:
+
+```text
+libc++abi: terminating due to uncaught exception of type std::__1::system_error: thread constructor failed: Resource temporarily unavailable
+```
+
+This is not the same failure string as the V8 heap OOM logs. It is still
+important because it occurred in a disabled-MCP, no-build/test, interactive
+long-session review where the Qwen Node process itself crossed about 1 GiB RSS.
+The failure happened during the final summary phase, after the controller had
+already completed six review turns.
+
+Turn timeline and sampled Qwen root RSS:
+
+| Window | Turn state | Qwen root RSS max | Qwen root RSS at window end |
+| ------------- | -------------------- | ----------------: | --------------------------: |
+| 0.0-9.0 min | turn 1 completed | 701.2 MiB | 255.3 MiB |
+| 9.0-15.1 min | turn 2 completed | 503.2 MiB | 494.4 MiB |
+| 15.1-24.1 min | turn 3 completed | 468.7 MiB | 457.5 MiB |
+| 24.1-31.9 min | turn 4 completed | 619.3 MiB | 602.3 MiB |
+| 31.9-40.3 min | turn 5 completed | 955.5 MiB | 955.5 MiB |
+| 40.3-40.4 min | turn 6 completed | 988.6 MiB | 988.6 MiB |
+| 40.4-41.9 min | final summary / exit | 1,028.2 MiB | 1,028.2 MiB |
+
+Token and tool distribution:
+
+| Owner | API responses | Input tokens | Output tokens | Total tokens | Max input |
+| ------------ | ------------: | -----------: | ------------: | -----------: | --------: |
+| Root session | 36 | 2.06M | 22.2K | 2.08M | 85,655 |
+| Subagents | 299 | 17.08M | 154.6K | 17.24M | 215,207 |
+
+Tool-call telemetry by function:
+
+| Tool | Calls | Captured content length |
+| ------------------- | ----: | ----------------------: |
+| `read_file` | 271 | 1.46 MB |
+| `run_shell_command` | 181 | 164.4 KB |
+| `web_fetch` | 80 | 846.3 KB |
+| `grep_search` | 25 | 15.0 KB |
+| `glob` | 15 | 27.8 KB |
+| `todo_write` | 16 | 16.1 KB |
+| `list_directory` | 8 | 6.2 KB |
+| `agent` | 10 | 0 |
+| `tool_search` | 1 | 2.1 KB |
+
+The top visible TUI token counter for a single agent reached about 3.83M
+tokens. Telemetry also shows the heaviest subagent at about 4.05M total tokens
+with a 215K-token max input request. That makes subagent amplification the
+dominant signal in this reproduction.
+
+Interpretation:
+
+1. This run separates long-session growth from MCP startup/config memory. MCP
+ was disabled and there were no MCP tool calls, yet the Qwen root process
+ still reached about 1 GiB RSS.
+2. The late memory peak aligns with subagent-heavy review turns and final
+ summary/merge-back, not with external build/test child processes.
+3. The RSS curve is not a simple linear leak. It falls after early turns, then
+ rises sharply after later subagent turns and remains high near exit.
+4. The failure mode is native resource exhaustion rather than a V8 heap-limit
+ stack, so the next run should add heap/external/arrayBuffer/thread-count
+ sampling. RSS alone cannot distinguish JS heap from native allocations or
+ thread-resource pressure.
+5. The strongest code paths to inspect remain subagent transcript retention,
+ agent-result merge-back, full-history cloning, checkpoint/session recording,
+ and final summary/history assembly.
+
+## Deterministic Huge-Task Clone-Pressure Reproduction
+
+A deterministic stress harness was added as
+`scripts/memory-pressure-repro.mjs`. It does not call a model. Instead, it
+constructs a Qwen-like long-session object graph with root review turns,
+subagent transcripts, large tool results, checkpoint JSON, and retained
+`structuredClone()` copies. This gives a repeatable reproduction for the clone
+and checkpoint peak suspected from the user-provided OOM stack.
+
+The harness has a lightweight script test:
+
+```bash
+npx vitest run --config ./scripts/tests/vitest.config.ts \
+ scripts/tests/memory-pressure-repro.test.js
+```
+
+Result: passed, 1 test.
+
+Controlled runs used `node --max-old-space-size=256` unless otherwise noted.
+
+| Case | History shape | Clone/checkpoint pressure | Result | Max RSS |
+| ------------------------------------------------- | ----------------------------------------------------------------------- | -------------------------------------------------- | --------------------------------- | --------: |
+| Small sanity | 2 turns, 2 KiB tool result, 1 subagent | 1 clone + 1 checkpoint | passed; 2.6 MiB history JSON | 89.7 MiB |
+| Huge build only | 12 turns, 256 KiB tool result, 2 subagents x 12 subagent turns | no retained clone/checkpoint | passed; 76.2 MiB history JSON | 491.5 MiB |
+| Huge + 1 clone | same as above | 1 retained `structuredClone()` | passed | 569.6 MiB |
+| Huge + 2 clones | same as above | 2 retained `structuredClone()` copies | OOM, exit 134 | 496.5 MiB |
+| Huge + 1 checkpoint | same as above | one checkpoint with original + cloned history JSON | passed; 152.5 MiB checkpoint JSON | 926.9 MiB |
+| Huge + 2 checkpoints | same as above | two checkpoint copies | OOM, exit 134 | 920.1 MiB |
+| Huge + 2 clones, no retained subagent transcripts | same generated subagent output, but parent history keeps only summaries | 2 retained `structuredClone()` copies              | passed; 3.8 MiB history JSON      | 136.8 MiB |
+
+The failing huge-clone run produced:
+
+```text
+FATAL ERROR: Reached heap limit Allocation failed - JavaScript heap out of memory
+```
+
+The native stack included:
+
+- `v8::internal::ValueDeserializer::ReadObjectInternal`
+- `v8::internal::ValueDeserializer::ReadDenseJSArray`
+- `node::worker::Message::Deserialize`
+- `node::worker::StructuredClone`
+
+This is the same stack family as the user-provided OOM log. The controlled
+reproduction also shows why 4 GiB / 8 GiB user reports are plausible: the
+failure is not caused by a single large object, but by large retained
+history/tool-result/subagent state plus one or more full-history clone or
+checkpoint copies. Raising `--max-old-space-size` can delay the crash while
+preserving the same amplification pattern.
+
+Important attribution from this deterministic run:
+
+1. Building a 76.2 MiB parent history JSON can succeed under the reduced heap.
+ The OOM appears when additional full-history clone/checkpoint copies are
+ retained.
+2. A single checkpoint copy can push RSS close to 1 GiB even before OOM.
+3. Removing retained subagent transcripts from the parent hot history changes
+ the same generated workload from OOM to a small 136.8 MiB RSS run. That is
+ the clearest mitigation signal so far.
+4. This reproducer is synthetic and intentionally adversarial, but it exercises
+ the same object-graph shape as the long interactive review: parent session,
+ subagents, large tool outputs, transcript merge-back, and full-history clone
+ pressure.
+
+## DeepSeek PR-Size Follow-Up
+
+After the initial model matrix, an additional Qwen Code-only run tested
+`DeepSeek/deepseek-v4-pro` across three real PR sizes. This model is configured
+through the Anthropic-compatible protocol; OpenAI-compatible execution returned
+404 in a smoke check, so the successful benchmark uses `--auth-type anthropic`.
+
+The diagnostics branch was extended to record Anthropic wire request summaries
+with the same privacy rule as the OpenAI path: aggregate counts and byte sizes
+only, no prompt text, diff content, tool arguments, headers, base URL, or API
+key.
+
+PR sizes:
+
+| Size | PR | State | Files | Changed lines | Title |
+| ------ | ------- | ------ | ----: | ------------: | ----------------------------------------------------------------------- |
+| small | `#4268` | merged | 1 | 1 | fix(serve): add mcp_guardrails to E2E capabilities expectation |
+| medium | `#4186` | merged | 6 | 494 | fix(core): add heap-pressure auto-compaction safety net |
+| large | `#4168` | open | 25 | 4,750 | feat(core)!: redesign auto-compaction thresholds with three-tier ladder |
+
+Runtime:
+
+| Size | PR | Wall | Turns | Total tokens | Cache-read tokens | Tree RSS peak | Root RSS peak | End heap | End RSS |
+| ------ | ------- | -----: | ----: | -----------: | ----------------: | ------------: | ------------: | --------: | --------: |
+| small | `#4268` | 39.7s | 2 | 43,362 | 28,672 | 346.9 MiB | 344.8 MiB | 115.2 MiB | 304.3 MiB |
+| medium | `#4186` | 142.6s | 4 | 135,120 | 115,840 | 340.7 MiB | 337.3 MiB | 103.5 MiB | 285.6 MiB |
+| large | `#4168` | 191.1s | 8 | 386,891 | 332,928 | 360.0 MiB | 336.3 MiB | 119.3 MiB | 237.9 MiB |
+
+Request and tool diagnostics:
+
+| Size | PR | Requests | Anthropic wire requests | Max Anthropic body | Max system | Max tool schema | Tool calls | Total tool result | Max tool result | Max function response in request |
+| ------ | ------- | -------: | ----------------------: | -----------------: | ---------: | --------------: | ---------: | ----------------: | --------------: | -------------------------------: |
+| small | `#4268` | 2 | 2 | 103.0 KiB | 50.8 KiB | 47.6 KiB | 3 | 0.6 KiB | 0.5 KiB | 1.1 KiB |
+| medium | `#4186` | 4 | 4 | 159.8 KiB | 50.8 KiB | 47.6 KiB | 5 | 30.2 KiB | 29.3 KiB | 56.7 KiB |
+| large | `#4168` | 8 | 8 | 289.5 KiB | 50.8 KiB | 47.6 KiB | 11 | 235.0 KiB | 232.1 KiB | 182.4 KiB |
+
+DeepSeek observations:
+
+1. PR size scaled turns, tokens, Anthropic wire body size, and tool result size
+ clearly, but did not scale RSS proportionally. The small/medium/large tree
+ RSS peaks stayed in a narrow `340.7-360.0 MiB` band.
+2. The large PR was expensive mostly in model rounds and token volume:
+ 8 requests and 386,891 total tokens. Its max Anthropic body was 289.5 KiB,
+ much larger than the OpenAI-compatible runs, but RSS still stayed near the
+ same local-bundle band.
+3. The static Anthropic request cost is also visible: system prompt is about
+ 50.8 KiB and tool schema about 47.6 KiB per request. Repeated rounds are
+ therefore a major token amplifier.
+4. The large PR produced 235.0 KiB of captured tool results and 182.4 KiB max
+ function response in a request. This is higher than the earlier small PR /
+ code-navigation cases and shows large PRs still put pressure on local
+ tool-result handling and request assembly, even when RSS does not spike.
+5. The DeepSeek run reinforces the model-choice conclusion: provider/model
+ choice strongly changes turns, latency, token volume, and wire payload shape,
+ but the local bundle RSS peak remains dominated by Qwen Code runtime shape
+ rather than scaling linearly with PR size.
+
+## Long-Review JSONL Replay: History Clone Pressure
+
+A recent long PR-review chat record was analyzed as a post-mortem shape for
+the reported OOM class. The raw JSONL is not included here because it contains
+prompt and tool output text. The aggregate shape is:
+
+| Signal | Value |
+| ----------------------- | ----------------------------- |
+| Duration | 87.0 min |
+| Qwen Code version | 0.15.10 |
+| Model | qwen-latest-series beta model |
+| API responses | 380 |
+| Tool-call telemetry | 507 events |
+| MCP tool-call telemetry | 4 events |
+| Subagent API responses | 313 |
+| Root API responses | 67 |
+| Root prompt growth | 38,622 -> 168,555 tokens |
+| Max prompt tokens | 168,555 |
+| Total response tokens | 31.28M |
+
+This shape does not support MCP as the primary OOM cause for this case. Only
+4 of 507 tool-call telemetry events were MCP, and all four recorded
+`content_length=0`. The dominant shape is long-session/subagent amplification:
+15 `agent` calls produced 313 subagent API responses and 403 subagent tool-call
+events.
+
+The replay then rebuilt the chat `Content[]` message shape from the JSONL and
+ran controlled clone/stringify pressure tests. The base retained message payload
+is small, so it is not itself enough to OOM:
+
+| Replay scale | Retained clones | History JSON | Checkpoint JSON | End heap | End RSS |
+| ------------ | --------------: | -----------: | --------------: | -------: | -------: |
+| 1x | 8 | 0.54 MB | 1.08 MB | 18.0 MB | 88.8 MB |
+| 30x | 8 | 14.46 MB | 28.92 MB | 260.0 MB | 577.8 MB |
+| 60x | 8 | 28.86 MB | 57.71 MB | 510.3 MB | 960.8 MB |
+
+The scaled replay is not a user-data claim; it is a controlled amplification of
+the observed JSONL shape to test whether full-history clone and checkpoint
+serialization can create the same failure mode as the reports.
+
+A low-heap reproduction with `--max-old-space-size=256` confirms the mechanism:
+
+| Case | History JSON | Result |
+| ------------------------- | -----------: | ----------------------------------------------------- |
+| Build history only | 38.4 MB | Succeeded; heap 131.6 MB, RSS 378.2 MB |
+| Build + one clone | 38.4 MB | Succeeded; heap 183.3 MB, RSS 463.4 MB |
+| Build + repeated clones | 38.4 MB | OOM after several retained `structuredClone()` copies |
+| Checkpoint double-history | 38.4 MB | OOM while holding history plus cloned client history |
+
+The repeated-clone OOM stack contains `ValueDeserializer::ReadObjectInternal`,
+`ValueDeserializer::ReadDenseJSArray`,
+`node::worker::Message::Deserialize`, and
+`node::worker::StructuredClone`, matching the same stack family seen in the
+user-provided OOM log. This proves that full-history `structuredClone()` can be
+the immediate OOM trigger without any MCP server involvement.
+
+Current working hypothesis for this JSONL class:
+
+1. MCP can explain normal-config startup RSS in separate benchmarks, but it is
+ not the likely trigger for this long-review OOM shape.
+2. Long task growth comes from retained chat history, large tool outputs,
+ subagent histories, observable agent messages, and UI/tool-result state.
+3. The immediate OOM trigger can be a full-history clone or checkpoint-style
+ double serialization after the heap is already high.
+4. Compression can mitigate retained history, but compression itself may create
+ a temporary peak if it first clones or serializes large history.
+
+### Local Mitigation Validation: Disabled-MCP PR Review Case
+
+Three targeted mitigations were applied locally and validated before rerunning a
+disabled-MCP PR review case:
+
+1. `checkNextSpeaker()` now reads only the last curated message with
+ `getHistoryTail(1, true)` and sends only that message to the next-speaker
+ side query. The next-speaker prompt only asks about the immediately previous
+ model response, so sending full history was unnecessary clone and token
+ pressure.
+2. `AgentToolInvocation` no longer retains full `responseParts` arrays inside
+ the live `task_execution.toolCalls` display. The real response parts still
+ flow through transcript/history paths, but the parent UI display now keeps
+ only a bounded text summary for nested tool-result streaming instead of
+ holding another full copy of large subagent tool outputs during long runs.
+3. `GeminiChat.sendMessageStream()` now builds model request contents through
+ an internal curated-history view instead of calling public
+ `getHistory(true)`. Public `getHistory()` still returns a defensive
+ `structuredClone()` for external callers, but the request hot path no longer
+ deep-clones the whole retained chat history before every model call.
+
+TDD checks added for these mitigations:
+
+| Test | Expected protection |
+| -------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------- |
+| `checkNextSpeaker > should send only the last curated model message to the side query` | Prevents full-history clone/send in next-speaker checks |
+| `AgentTool > should not retain responseParts in live tool call display after TOOL_RESULT` | Prevents live subagent display from retaining large tool responses |
+| `AgentTool > should keep only a bounded result summary in live tool call display` | Preserves nested result readability without retaining the full response body |
+| `GeminiChat > sendMessageStream > does not deep-clone the full curated history when building request contents` | Prevents request setup from hitting the `ValueDeserializer` / `StructuredClone` OOM path |
+
+Additional reproduction and fix validation:
+
+| Step | Command shape | Result |
+| ------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------- |
+| Pre-fix deterministic clone pressure | `node --max-old-space-size=256 scripts/memory-pressure-repro.mjs ... --clone-count=2 --mode=clone` | OOM, exit 134; stderr contained `Reached heap limit` and `ValueDeserializer` / `StructuredClone`; max RSS 528.1 MiB in the repeat run |
+| Red test | targeted `GeminiChat` test with `structuredClone` forced to throw during request setup | failed at `GeminiChat.getHistory()` before the mitigation |
+| Green test | same targeted `GeminiChat` test after the mitigation | passed |
+| Built-code smoke | `node --max-old-space-size=256` against the built core package, with a 96-entry / about 48 MiB history and `structuredClone` forced to throw | passed; request had 97 contents; process RSS 161.4 MiB, `/usr/bin/time -l` max RSS 161.6 MiB |
+
+This narrows the earlier "same stack family" statement: the deterministic
+synthetic OOM still proves retained full-history clones can fail in the same V8
+stack family as the user log, while the new `GeminiChat` red/green test proves
+one real production request-setup path no longer reaches that clone point.
+Checkpoint/resume and compression internals still need separate long-run
+validation because they can legitimately need durable copied history.
+
+Verification commands:
+
+| Command | Result |
+| ------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------- |
+| `npx vitest run src/core/geminiChat.test.ts` | passed, 89 tests |
+| `npx vitest run src/utils/nextSpeakerChecker.test.ts --coverage=false` | passed, 13 tests |
+| `npx vitest run src/tools/agent/agent.test.ts --coverage=false` | passed, 77 tests |
+| `npx vitest run --config ./scripts/tests/vitest.config.ts scripts/tests/memory-pressure-repro.test.js` | passed, 1 test |
+| `npm run build --workspace=packages/core` | passed |
+| `npm run build --workspace=packages/cli` | passed |
+| `npm run typecheck --workspace=packages/core` | passed |
+| `npm run typecheck --workspace=packages/cli` | passed |
+| `npm run bundle` | passed |
+| `npm run build` | failed in `packages/vscode-ide-companion` lint on existing internal-module import rules; core, CLI, bundle, and targeted tests above passed |
+| `npm run bundle` | passed |
+| `npm run typecheck --workspace=packages/core` | passed |
+| `npm run typecheck --workspace=packages/cli` | passed |
+
+The full root `npm run build` was not clean in this worktree because the
+`vscode-ide-companion` package hit pre-existing `import/no-internal-modules`
+lint errors. The core/CLI build and bundle needed for the local runtime test
+completed successfully.
+
+The same PR review prompt was then run with a temporary config where MCP and
+hooks were disabled. Both runs were interrupted after a bounded long-run window
+instead of waiting for a full review to finish.
+
+| Variant | Runtime | MCP servers | Tools | Assistant messages | Tool use/result blocks | Parent tool ids | Total tokens | Max input tokens | Root max RSS |
+| ----------------- | ------: | ----------: | ----: | -----------------: | ---------------------: | --------------: | -----------: | ---------------: | -----------: |
+| before mitigation | 365.08s | 0 | 19 | 42 | 42 / 42 | 3 | 79,439 | 26,807 | 357.7 MiB |
+| after mitigation | 404.52s | 0 | 19 | 58 | 52 / 42 | 2 | 390,339 | 54,000 | 310.5 MiB |
+
+This is not a deterministic apples-to-apples model benchmark: the patched run
+did more work and consumed substantially more total tokens before the manual
+cutoff. The useful signal is narrower: under a disabled-MCP review case with
+more observed work, root max RSS did not increase and was about 47.2 MiB lower.
+That supports the mitigation direction, but it does not prove the whole
+long-task OOM class is fixed.
+
+Remaining high-risk clone/retention paths to inspect next:
+
+1. Compression still calls full `getHistory(true)` before summarization. If the
+ heap is already high, the compression attempt can create the peak that trips
+ OOM.
+2. Checkpoint creation can hold original history, cloned client history, and a
+ serialized checkpoint payload at the same time.
+3. Fork subagents still seed from parent history with `getHistory(true)`.
+4. ACP/history export/summary/copy paths still call full `getHistory()` and
+ should be audited separately from the normal review loop.
+
+Version timing:
+
+| Issue | Created | Reported version | Signal |
+| ----- | ---------- | ------------------------ | ---------------------------------------- |
+| #2128 | 2026-03-05 | not specified | Long-session UI memory growth |
+| #2562 | 2026-03-21 | not specified | `structuredClone` OOM in long sessions |
+| #2868 | 2026-04-03 | 0.13.2 | Heap OOM |
+| #2945 | 2026-04-07 | 0.14.0 | V8 heap OOM |
+| #4116 | 2026-05-13 | 0.15.11 | OOM with structured-clone-style analysis |
+| #4134 | 2026-05-14 | 0.15.11 | OOM |
+| #4149 | 2026-05-14 | 0.15.10-nightly.20260513 | V8 heap OOM |
+| #4167 | 2026-05-15 | 0.15.11 | Crash near compression |
+| #4185 | 2026-05-15 | 0.15.11 | Heap pressure before token compaction |
+| #4254 | 2026-05-17 | not specified | Memory keeps rising |
+| #4276 | 2026-05-18 | 0.15.11 | V8 heap OOM |
+| #4309 | 2026-05-19 | 0.15.11 | High memory warning around 7 GiB |
+
+The issue history does not prove that 0.15.10 introduced the OOM class; similar
+reports existed in March and April. It does support a recent cluster beginning
+around 2026-05-13, overlapping `v0.15.10`/`v0.15.11` releases. The relevant
+diff between `v0.15.9` and `v0.15.10` touched subagent runtime,
+non-interactive execution, `GeminiChat`, and compression code heavily, so this
+range is a reasonable first bisect window.
+
+## Notes
+
+- The first code-navigation prompt allowed open-ended exploration and hit
+ `maxSessionTurns`; the successful rows above use a constrained command list.
+- The first synthetic-diff attempt used a relative bundle path from inside the
+ temporary repositories; those failed immediately and are excluded from the
+ tables. The successful rows use the absolute local bundle path.
+- Raw JSONL streams are not committed because they contain prompts, tool
+ commands, and tool output. The report only includes aggregate diagnostics.
diff --git a/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md b/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md
new file mode 100644
index 0000000000..ed044b18de
--- /dev/null
+++ b/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md
@@ -0,0 +1,235 @@
+# Qwen Code Runtime Memory Investigation Plan
+
+Date: 2026-05-18
+
+## Context
+
+Local benchmarks show Qwen Code using substantially more process-tree RSS than
+Claude Code for similar non-interactive CLI task shapes. The latest five-case
+matrix found Qwen Code peaking around `0.85-1.06 GiB` while Claude Code stayed
+around `0.28-0.37 GiB`.
+
+This document proposes a draft investigation and optimization direction. It is
+not intended to claim a final root cause yet. The immediate goal is to make the
+memory gap reviewable, reproducible, and explainable with internal diagnostics.
+
+## Progress So Far
+
+The investigation has reached the evidence-and-direction stage:
+
+- A repeatable local matrix has been built for small PR review, code navigation,
+ and synthetic diff workloads.
+- Qwen Code has been compared across multiple models.
+- Qwen Code and Claude Code have been compared on the same task shapes where
+ equivalent model endpoints were available.
+- The observed RSS gap is consistent enough to justify deeper runtime
+ diagnostics.
+- Related upstream work has been mapped so this effort can build on existing
+ `/doctor memory` and memory-diagnostics follow-ups.
+
+The investigation has not yet reached the final root-cause stage because
+external process RSS cannot show whether the retained memory is V8 heap, native
+memory, loaded modules, live history, tool results, or request assembly state.
+
+## Current Evidence
+
+The companion benchmark report is:
+
+- `docs/e2e-tests/2026-05-18-qwen-memory-benchmark-report.md`
+
+The main evidence is:
+
+- The Qwen-vs-Claude RSS gap reproduced across small PR review, code
+ navigation, and synthetic diff workloads.
+- The gap reproduced with both `pai/glm-5` and `qwen3.6-plus`.
+- Qwen Code used more tokens than Claude Code in every tested matrix cell.
+- Large diff size did not produce a clean linear memory increase, which suggests
+ the baseline and bounded/truncated output paths matter more than raw diff
+ bytes alone.
+
+## Related Work
+
+Relevant upstream work already exists:
+
+| Item | Status | Role in the memory work |
+| ------- | --------------------- | --------------------------------------------------------------------------------------------------------------- |
+| `#4180` | merged PR | Adds baseline `/doctor memory` diagnostics. This is the first instrumentation slice. |
+| `#4181` | open issue, no PR yet | Adds interpretation and pressure classification for `/doctor memory`. |
+| `#4182` | open issue, no PR yet | Adds structured `/doctor memory --json` output and safe session-scale stats. |
+| `#4183` | open issue, no PR yet | Adds opt-in heap snapshots and bounded memory timeline diagnostics. |
+| `#4184` | open issue, no PR yet | Adds large tool-result retention diagnostics and designs offload/preview mitigation. |
+| `#4127` | open PR, conflicting | Adds heap-pressure safety nets for long-session OOM prevention. Useful mitigation, not enough for attribution. |
+| `#4168` | open PR | Redesigns auto-compaction thresholds. Useful for context pressure, not enough for task-time footprint analysis. |
+| `#4172` | open PR | Decouples auto-memory recall from the main request path. Useful for latency/blocking, not direct RSS proof. |
+| `#4188` | merged PR | Bounds build/test caches to prevent OOM in parallel test runs. Important but separate from runtime benchmarks. |
+
+This investigation should build on that direction rather than wait for all
+follow-up issues to land.
+
+Most of the remaining work is instrumentation-first. The open diagnostics
+issues are designed to make memory reports explainable before attempting a
+runtime fix. The open mitigation PRs may reduce specific OOM paths, but they do
+not yet explain why short non-interactive CLI tasks repeatedly peak near
+`1 GiB`.
+
+## Why This Draft Starts With Documentation
+
+This draft intentionally starts with benchmark evidence and an investigation
+plan instead of bundling a runtime code change.
+
+Reasons:
+
+1. The current goal is to make the performance problem and direction visible,
+ not to claim a same-day fix.
+2. Adding instrumentation and optimization in the same PR would make review
+ harder because it mixes measurement, diagnosis, and behavior changes.
+3. The existing benchmark already supports the need for deeper diagnostics.
+4. The next PR can be narrower and easier to validate: diagnostics-only, then
+ rerun the same matrix and compare internal metrics.
+
+The next implementation PR should add the missing counters and timeline points,
+then rerun the benchmark matrix. Only after that should a targeted optimization
+PR attempt to reduce memory.
+
+## Working Inference
+
+The current data points toward a Qwen Code runtime/path issue more than a model
+provider issue.
+
+The strongest current inference is:
+
+> Qwen Code appears to carry a high non-interactive CLI task execution
+> footprint, likely amplified by larger context/tool-result/session handling.
+> The likely problem area is the CLI runtime and agent data path, not the
+> selected model alone.
+
+More specifically, the evidence points away from "too many tool calls" as the
+primary cause. Tool-call counts were similar across CLIs, and Claude sometimes
+used more turns or tool calls while keeping lower RSS. The more plausible
+problem is that Qwen Code initializes or retains heavier state for the same
+short non-interactive CLI task, then amplifies that execution footprint with
+larger context, tool-result, saved-output, or session-history data.
+
+The most likely buckets are:
+
+1. **Process and module startup/execution cost**: Qwen Code may initialize more
+ runtime, tools, UI/session infrastructure, or provider machinery than needed
+ for non-interactive CLI tasks.
+2. **History and context assembly**: Qwen Code may retain or construct larger
+ model-facing context than Claude Code for the same task shape.
+3. **Tool-result retention**: large or repeated tool results may be retained in
+ live history, UI history, chat recording, or saved-output recovery paths.
+4. **Subagent and saved-output amplification**: previous large PR tests showed
+ saved-output recovery and subagent activity, which can add memory and token
+ pressure.
+5. **Native memory versus JS heap split**: external RSS cannot tell whether the
+ pressure is V8 heap, native buffers, loaded modules, or retained data.
+
+This is deliberately phrased as an inference. The next step is to add enough
+internal measurements to confirm or rule out each bucket.
+
+## Proposed Draft PR Scope
+
+The first draft PR should be evidence and diagnostics focused:
+
+1. Commit the benchmark report and investigation plan.
+2. Add or extend local diagnostic output so Qwen Code can report:
+ - V8 heap and heap-space statistics.
+ - RSS versus heap split.
+ - session message count and approximate retained size.
+ - tool result count, total retained size, and largest retained result size.
+ - truncation and saved-output recovery counters.
+ - subagent/process-tree activity when available.
+3. Re-run the existing matrix against:
+ - current published Qwen Code,
+ - current `main`,
+ - diagnostics-only branch,
+ - candidate optimization branch.
+4. Use those measurements to choose one small optimization target.
+
+The first PR should avoid mixing several unrelated optimizations. It should
+either remain documentation-only or add diagnostics-only code. A separate PR
+should carry the first runtime memory reduction once the cause is clearer.
+
+## Candidate Optimization Directions
+
+These are candidates, not conclusions:
+
+1. **Bounded tool-output retention**: store large output out of the hot path and
+ keep only preview, metadata, and retrieval pointers in live history.
+2. **Non-interactive lazy loading**: avoid initializing TUI-only or
+ interactive-only subsystems during non-interactive CLI task execution.
+3. **Session/UI history caps**: degrade old or heavy history items into compact
+ transcript entries.
+4. **Context assembly accounting**: measure and cap large tool results before
+ model request construction.
+5. **Subagent accounting**: expose subagent lifecycle and memory impact in
+ diagnostics.
+
+Claude Code and Codex should be used as design references for diagnostic
+separation, bounded output retention, and lazy history loading. The implementation
+should still follow Qwen Code's own architecture and tests.
+
+## Validation Plan
+
+The investigation should keep the same benchmark matrix so before/after results
+remain comparable:
+
+- small PR review
+- code navigation
+- synthetic diff about 100 KiB
+- synthetic diff about 1 MiB
+- synthetic diff about 5 MiB
+
+For each run, record:
+
+- process-tree RSS peak
+- root process RSS peak
+- V8 heap peak
+- heap-space summary
+- duration
+- turns
+- token count
+- tool call count
+- largest retained tool result
+- total retained tool-result size
+- session/history item counts
+- subagent count
+
+The minimum success condition for a candidate fix is not just "RSS went down".
+It should also identify which internal metric changed and why.
+
+## Next PR Candidate
+
+The next PR should be diagnostics-only and should avoid changing runtime
+behavior. A minimal useful slice would add:
+
+- model request input-size accounting;
+- system prompt and tool schema size accounting;
+- retained message count and approximate retained character size;
+- retained tool-result count, total size, and largest item size;
+- lifecycle samples around startup, first request assembly, tool execution,
+ streaming completion, compression, and final response;
+- process memory samples that include RSS, heap used, heap total, external, and
+ heap-space stats.
+
+After that lands locally, rerun the same Qwen model matrix and compare:
+
+- published Qwen Code;
+- current `main`;
+- diagnostics-only branch;
+- candidate optimization branch.
+
+## Non-Goals
+
+This draft does not claim that:
+
+- all memory pressure is caused by tool output;
+- one existing open PR will solve the observed task-time footprint;
+- model provider differences are irrelevant in every environment;
+- single-run local measurements are sufficient for release-level performance
+ claims.
+
+The intended claim is narrower: Qwen Code shows a consistent local RSS gap in
+the tested workloads, and the project needs internal diagnostics to explain and
+reduce that gap.
From 9957d4c0221d56eba4510ace61d3efbe0fd27d65 Mon Sep 17 00:00:00 2001
From: yiliang114 <1204183885@qq.com>
Date: Wed, 20 May 2026 21:21:19 +0800
Subject: [PATCH 02/11] fix(core): replace structuredClone with shallow copy to
prevent OOM
Replace `structuredClone(this.history)` (called up to 4x per turn on the
send path) with a lightweight shallow copy via `copyContentContainer()`.
This eliminates the OOM root cause in long tool-heavy sessions where the
full deep clone exceeded remaining V8 heap headroom.
Key changes:
- Add `copyContentContainer()` helper ({...content, parts: [...parts]})
- Add `getRequestHistory()` private method for the send path
- Add `getHistoryShallow()`, `getHistoryTailShallow()`,
`peekLastHistoryEntry()`, `getLastModelMessageText()`,
`getHistoryLength()` for read-only callers
- Remove HEAP_PRESSURE_COMPRESSION_RATIO safety net (no longer needed
now that the underlying OOM cause is fixed)
- Update chatCompressionService to use getHistoryShallow(true)
- Update nextSpeakerChecker to send only lastMessage (not full history)
- Update memoryDiagnostics with process-tree RSS measurement
---
.../anthropicContentGenerator.ts | 3 +
packages/core/src/core/client.ts | 91 ++++--
packages/core/src/core/geminiChat.test.ts | 305 +++++++++---------
packages/core/src/core/geminiChat.ts | 158 +++++----
.../loggingContentGenerator.ts | 9 +
.../core/openaiContentGenerator/pipeline.ts | 2 +
packages/core/src/index.ts | 1 +
.../services/chatCompressionService.test.ts | 186 ++++-------
.../src/services/chatCompressionService.ts | 29 +-
.../core/src/services/sessionService.test.ts | 51 +++
packages/core/src/services/sessionService.ts | 34 +-
packages/core/src/tools/agent/agent.ts | 5 +-
.../core/src/utils/memoryDiagnostics.test.ts | 76 ++++-
packages/core/src/utils/memoryDiagnostics.ts | 111 ++++++-
.../core/src/utils/nextSpeakerChecker.test.ts | 42 ++-
packages/core/src/utils/nextSpeakerChecker.ts | 21 +-
16 files changed, 696 insertions(+), 428 deletions(-)
diff --git a/packages/core/src/core/anthropicContentGenerator/anthropicContentGenerator.ts b/packages/core/src/core/anthropicContentGenerator/anthropicContentGenerator.ts
index 987766b15d..2f6ba63cb5 100644
--- a/packages/core/src/core/anthropicContentGenerator/anthropicContentGenerator.ts
+++ b/packages/core/src/core/anthropicContentGenerator/anthropicContentGenerator.ts
@@ -35,6 +35,7 @@ import {
} from '../../utils/runtimeFetchOptions.js';
import { DEFAULT_TIMEOUT } from '../openaiContentGenerator/constants.js';
import { createDebugLogger } from '../../utils/debugLogger.js';
+import { runtimeDiagnostics } from '../../utils/runtimeDiagnostics.js';
import {
tokenLimit,
CAPPED_DEFAULT_MAX_TOKENS,
@@ -226,6 +227,7 @@ export class AnthropicContentGenerator implements ContentGenerator {
let response: Message;
try {
const anthropicRequest = await this.buildRequest(request);
+ runtimeDiagnostics.recordAnthropicWireRequest(anthropicRequest);
const headers = this.buildPerRequestHeaders(anthropicRequest);
response = (await this.client.messages.create(anthropicRequest, {
signal: request.config?.abortSignal,
@@ -249,6 +251,7 @@ export class AnthropicContentGenerator implements ContentGenerator {
...anthropicRequest,
stream: true,
};
+ runtimeDiagnostics.recordAnthropicWireRequest(streamingRequest);
let stream: AsyncIterable;
try {
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index 7f2d514ce0..59d8ae9be0 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -301,10 +301,66 @@ export class GeminiClient {
return this.getChat().getHistory(curated);
}
+ getHistoryShallow(curated: boolean = false): Content[] {
+ const chat = this.getChat();
+ return chat.getHistoryShallow?.(curated) ?? chat.getHistory(curated);
+ }
+
getHistoryTail(count: number, curated: boolean = false): Content[] {
return this.getChat().getHistoryTail(count, curated);
}
+ private getHistoryTailShallow(
+ count: number,
+ curated: boolean = false,
+ ): Content[] {
+ const chat = this.getChat();
+ if (chat.getHistoryTailShallow) {
+ return chat.getHistoryTailShallow(count, curated);
+ }
+ if (chat.getHistoryTail) {
+ return chat.getHistoryTail(count, curated);
+ }
+ return chat.getHistory(curated).slice(-count);
+ }
+
+ private peekLastHistoryEntry(): Content | undefined {
+ const chat = this.getChat();
+ return (
+ chat.peekLastHistoryEntry?.() ??
+ chat.getHistoryTail?.(1)?.[0] ??
+ chat.getHistory().at(-1)
+ );
+ }
+
+ private getHistoryLength(): number {
+ const chat = this.getChat();
+ return (
+ chat.getHistoryLength?.() ??
+ chat.getHistoryShallow?.().length ??
+ chat.getHistory().length
+ );
+ }
+
+ private getLastModelMessageText(): string | undefined {
+ const chat = this.getChat();
+ if (chat.getLastModelMessageText) {
+ return chat.getLastModelMessageText();
+ }
+ const history = chat.getHistoryShallow?.() ?? chat.getHistory();
+ for (let i = history.length - 1; i >= 0; i--) {
+ const message = history[i];
+ if (message?.role !== 'model') continue;
+ const text =
+ message.parts
+ ?.filter((part): part is { text: string } => 'text' in part)
+ .map((part) => part.text)
+ .join('') ?? '';
+ return text || undefined;
+ }
+ return undefined;
+ }
+
/**
* Pop orphaned trailing user entries from the in-memory chat history.
* Used by:
@@ -921,7 +977,7 @@ export class GeminiClient {
) {
const projectRoot = this.config.getProjectRoot();
const sessionId = this.config.getSessionId();
- const history = this.getHistory();
+ const history = this.getHistoryShallow();
const mgr = this.config.getMemoryManager();
const autoSkillEnabled = this.config.getAutoSkillEnabled();
@@ -985,7 +1041,7 @@ export class GeminiClient {
const projectRoot = this.config.getProjectRoot();
const sessionId = this.config.getSessionId();
- const history = this.getHistory();
+ const history = this.getHistoryShallow();
const mgr = this.config.getMemoryManager();
if (!this.config.getManagedAutoMemoryEnabled()) {
@@ -1259,7 +1315,7 @@ export class GeminiClient {
// retries/hooks) so that model latency during a tool-call loop
// doesn't count as user idle time.
const mcResult = microcompactHistory(
- this.getChat().getHistory(),
+ this.getHistoryShallow(),
this.lastApiCompletionTimestamp,
this.config.getClearContextOnIdle(),
);
@@ -1394,9 +1450,8 @@ export class GeminiClient {
// part from the user immediately follows a functionCall part from the model
// in the conversation history . The IDE context is not discarded; it will
// be included in the next regular message sent to the model.
- const history = this.getHistory();
- const lastMessage =
- history.length > 0 ? history[history.length - 1] : undefined;
+ const historyLength = this.getHistoryLength();
+ const lastMessage = this.peekLastHistoryEntry();
const hasPendingToolCall =
!!lastMessage &&
lastMessage.role === 'model' &&
@@ -1407,7 +1462,7 @@ export class GeminiClient {
if (this.config.getIdeMode() && !hasPendingToolCall) {
const { contextParts, newIdeContext } = this.getIdeContextParts(
- this.forceFullIdeContext || history.length === 0,
+ this.forceFullIdeContext || historyLength === 0,
);
if (contextParts.length > 0) {
ideContextText = wrapIdeContext(contextParts.join('\n'));
@@ -1643,16 +1698,8 @@ export class GeminiClient {
!signal.aborted &&
this.config.hasHooksForEvent('Stop')
) {
- // Get response text from the chat history
- const history = this.getHistory();
- const lastModelMessage = history
- .filter((msg) => msg.role === 'model')
- .pop();
const responseText =
- lastModelMessage?.parts
- ?.filter((p): p is { text: string } => 'text' in p)
- .map((p) => p.text)
- .join('') || '[no response text]';
+ this.getLastModelMessageText() || '[no response text]';
const response = await messageBus.request<
HookExecutionRequest,
@@ -1817,12 +1864,11 @@ export class GeminiClient {
// see the current turn's history regardless of which path exits below.
try {
const chat = this.getChat();
- const fullHistory = chat.getHistory(true);
const maxHistoryForCache = 40;
- const cachedHistory =
- fullHistory.length > maxHistoryForCache
- ? fullHistory.slice(-maxHistoryForCache)
- : fullHistory;
+ const cachedHistory = this.getHistoryTailShallow(
+ maxHistoryForCache,
+ true,
+ );
saveCacheSafeParams(
chat.getGenerationConfig(),
cachedHistory,
@@ -2008,7 +2054,8 @@ export class GeminiClient {
signal,
);
if (info.compressionStatus === CompressionStatus.COMPRESSED) {
- const compressedHistory = this.getChat().getHistory();
+ const chat = this.getChat();
+ const compressedHistory = chat.getHistoryShallow?.() ?? chat.getHistory();
await this.startChat(compressedHistory, SessionStartSource.Compact);
if (
!this.lastSessionStartContext &&
diff --git a/packages/core/src/core/geminiChat.test.ts b/packages/core/src/core/geminiChat.test.ts
index fd25e8a220..5f3caa976b 100644
--- a/packages/core/src/core/geminiChat.test.ts
+++ b/packages/core/src/core/geminiChat.test.ts
@@ -27,18 +27,6 @@ import { CompressionStatus, type ChatCompressionInfo } from './turn.js';
import { ChatCompressionService } from '../services/chatCompressionService.js';
import { SessionStartSource } from '../hooks/types.js';
-const { mockGetHeapStatistics } = vi.hoisted(() => ({
- mockGetHeapStatistics: vi.fn(),
-}));
-
-vi.mock('node:v8', async (importOriginal) => {
- const actual = await importOriginal();
- return {
- ...actual,
- getHeapStatistics: mockGetHeapStatistics,
- };
-});
-
// Mock fs module to prevent actual file system operations during tests
const mockFileSystem = new Map();
@@ -115,10 +103,6 @@ describe('GeminiChat', async () => {
// Default mock implementation for tests that don't care about retry logic
mockRetryWithBackoff.mockImplementation(async (apiCall) => apiCall());
- mockGetHeapStatistics.mockReturnValue({
- used_heap_size: 0,
- heap_size_limit: Number.MAX_SAFE_INTEGER,
- });
mockConfig = {
getSessionId: () => 'test-session-id',
getTelemetryLogPromptsEnabled: () => true,
@@ -1077,6 +1061,61 @@ describe('GeminiChat', async () => {
);
});
+ it('does not deep-clone the full curated history when building request contents', async () => {
+ chat.setHistory([
+ { role: 'user', parts: [{ text: 'prior question' }] },
+ { role: 'model', parts: [{ text: 'prior answer' }] },
+ ]);
+ const response = (async function* () {
+ yield {
+ candidates: [
+ {
+ content: {
+ parts: [{ text: 'response' }],
+ role: 'model',
+ },
+ finishReason: 'STOP',
+ index: 0,
+ safetyRatings: [],
+ },
+ ],
+ text: () => 'response',
+ } as unknown as GenerateContentResponse;
+ })();
+ vi.mocked(mockContentGenerator.generateContentStream).mockResolvedValue(
+ response,
+ );
+ const structuredCloneSpy = vi
+ .spyOn(globalThis, 'structuredClone')
+ .mockImplementation(() => {
+ throw new Error('structuredClone should not build request contents');
+ });
+
+ try {
+ const stream = await chat.sendMessageStream(
+ 'test-model',
+ { message: 'hello' },
+ 'prompt-id-no-request-clone',
+ );
+ for await (const _ of stream) {
+ // consume stream
+ }
+ } finally {
+ structuredCloneSpy.mockRestore();
+ }
+
+ expect(mockContentGenerator.generateContentStream).toHaveBeenCalledWith(
+ expect.objectContaining({
+ contents: [
+ { role: 'user', parts: [{ text: 'prior question' }] },
+ { role: 'model', parts: [{ text: 'prior answer' }] },
+ { role: 'user', parts: [{ text: 'hello' }] },
+ ],
+ }),
+ 'prompt-id-no-request-clone',
+ );
+ });
+
it('should not update global telemetry when no telemetryService is provided (subagent isolation)', async () => {
// Simulate a subagent GeminiChat: created without a telemetryService
const subagentChat = new GeminiChat(mockConfig, config, []);
@@ -1223,7 +1262,10 @@ describe('GeminiChat', async () => {
compressionStatus: CompressionStatus.NOOP,
},
});
- vi.spyOn(chat, 'getHistory').mockImplementationOnce(() => {
+ vi.spyOn(
+ chat as unknown as { getRequestHistory: () => Content[] },
+ 'getRequestHistory',
+ ).mockImplementationOnce(() => {
throw new Error('history setup failed');
});
@@ -1928,6 +1970,65 @@ describe('GeminiChat', async () => {
});
});
+  describe('getHistoryShallow', () => {
+    // Verifies the copy depth: new entry object, new `parts` array, shared
+    // part payloads, and no `structuredClone` call along the way.
+    it('copies containers without structured-cloning large part payloads', () => {
+      const payload = { output: 'x'.repeat(128 * 1024) };
+      const content: Content = {
+        role: 'user',
+        parts: [
+          {
+            functionResponse: {
+              id: 'call-1',
+              name: 'read_file',
+              response: payload,
+            },
+          },
+        ],
+      };
+      chat.addHistory(content);
+      // Any deep clone from here on throws, failing the test immediately.
+      const structuredCloneSpy = vi
+        .spyOn(globalThis, 'structuredClone')
+        .mockImplementation(() => {
+          throw new Error('unexpected deep clone');
+        });
+
+      try {
+        const history = chat.getHistoryShallow();
+
+        expect(structuredCloneSpy).not.toHaveBeenCalled();
+        expect(history).toEqual([content]);
+        // The entry and its parts array are fresh containers...
+        expect(history[0]).not.toBe(content);
+        expect(history[0]!.parts).not.toBe(content.parts);
+        // ...while the large payload is the very same object.
+        const response = history[0]!.parts![0] as {
+          functionResponse: { response: typeof payload };
+        };
+        expect(response.functionResponse.response).toBe(payload);
+      } finally {
+        // The stub replaces a global: restore it even when an assertion fails
+        // so it cannot leak into later tests. Assertions on the spy run before
+        // this point because restoring also clears its recorded calls.
+        structuredCloneSpy.mockRestore();
+      }
+    });
+  });
+
+  describe('getHistoryTailShallow', () => {
+    // Verifies that only the requested tail is returned and that it is copied
+    // at container level without calling `structuredClone`.
+    it('copies only recent containers without cloning payloads', () => {
+      const oldContent: Content = { role: 'user', parts: [{ text: 'old' }] };
+      const recentContent: Content = {
+        role: 'model',
+        parts: [{ text: 'recent' }],
+      };
+      chat.addHistory(oldContent);
+      chat.addHistory(recentContent);
+      // Any deep clone from here on throws, failing the test immediately.
+      const structuredCloneSpy = vi
+        .spyOn(globalThis, 'structuredClone')
+        .mockImplementation(() => {
+          throw new Error('unexpected deep clone');
+        });
+
+      try {
+        const tail = chat.getHistoryTailShallow(1);
+
+        expect(structuredCloneSpy).not.toHaveBeenCalled();
+        expect(tail).toEqual([recentContent]);
+        expect(tail[0]).not.toBe(recentContent);
+        expect(tail[0]!.parts).not.toBe(recentContent.parts);
+      } finally {
+        // The stub replaces a global: restore it even when an assertion fails
+        // so it cannot leak into later tests.
+        structuredCloneSpy.mockRestore();
+      }
+    });
+  });
+
describe('getLastHistoryEntry', () => {
it('returns undefined for an empty history', () => {
expect(chat.getLastHistoryEntry()).toBeUndefined();
@@ -1948,6 +2049,42 @@ describe('GeminiChat', async () => {
});
});
+  describe('peekLastHistoryEntry', () => {
+    // Verifies that the stored entry itself is returned (identity, not a
+    // copy) and that no `structuredClone` call is made.
+    it('returns the last entry without structured-cloning the full history', () => {
+      const first: Content = { role: 'user', parts: [{ text: 'a' }] };
+      const last: Content = { role: 'model', parts: [{ text: 'b' }] };
+      chat.addHistory(first);
+      chat.addHistory(last);
+      // Any deep clone from here on throws, failing the test immediately.
+      const structuredCloneSpy = vi
+        .spyOn(globalThis, 'structuredClone')
+        .mockImplementation(() => {
+          throw new Error('unexpected deep clone');
+        });
+
+      try {
+        expect(chat.peekLastHistoryEntry()).toBe(last);
+        expect(structuredCloneSpy).not.toHaveBeenCalled();
+      } finally {
+        // The stub replaces a global: restore it even when an assertion fails
+        // so it cannot leak into later tests.
+        structuredCloneSpy.mockRestore();
+      }
+    });
+  });
+
+  describe('getLastModelMessageText', () => {
+    // Verifies that the newest model entry wins over an older one, that its
+    // text parts are joined in order, and that nothing is deep-cloned.
+    it('returns text from the latest model message without cloning history', () => {
+      chat.addHistory({ role: 'model', parts: [{ text: 'older' }] });
+      chat.addHistory({ role: 'user', parts: [{ text: 'question' }] });
+      chat.addHistory({
+        role: 'model',
+        parts: [{ text: 'new' }, { text: ' answer' }],
+      });
+      // Any deep clone from here on throws, failing the test immediately.
+      const structuredCloneSpy = vi
+        .spyOn(globalThis, 'structuredClone')
+        .mockImplementation(() => {
+          throw new Error('unexpected deep clone');
+        });
+
+      try {
+        expect(chat.getLastModelMessageText()).toBe('new answer');
+        expect(structuredCloneSpy).not.toHaveBeenCalled();
+      } finally {
+        // The stub replaces a global: restore it even when an assertion fails
+        // so it cannot leak into later tests.
+        structuredCloneSpy.mockRestore();
+      }
+    });
+  });
+
describe('sendMessageStream with retries', () => {
it('should retry on invalid content, succeed, and report metrics', async () => {
vi.useFakeTimers();
@@ -3620,13 +3757,6 @@ describe('GeminiChat', async () => {
return compressSpy;
}
- function mockHeapPressure(usedHeapSize: number, heapLimit = 1000) {
- mockGetHeapStatistics.mockReturnValue({
- used_heap_size: usedHeapSize,
- heap_size_limit: heapLimit,
- });
- }
-
it('replaces history and updates per-chat lastPromptTokenCount on COMPRESSED', async () => {
mockCompressionService('compressed');
chat.setHistory([userMsg('a'), modelMsg('b'), userMsg('c')]);
@@ -3690,136 +3820,9 @@ describe('GeminiChat', async () => {
it('forwards force=true to the compression service', async () => {
const compressSpy = mockCompressionService('compressed');
- mockHeapPressure(900);
await chat.tryCompress('p1', 'm1', true);
expect(compressSpy.mock.calls[0][1].force).toBe(true);
- expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(false);
- expect(mockGetHeapStatistics).not.toHaveBeenCalled();
- });
-
- it('uses heap pressure to bypass the token gate without manual force semantics', async () => {
- const compressSpy = mockCompressionService('noop');
- mockHeapPressure(750);
- vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
- authType: AuthType.USE_GEMINI,
- model: 'test-model',
- contextWindowSize: 1000,
- });
-
- await chat.tryCompress('p1', 'm1');
-
- expect(compressSpy.mock.calls[0][1].force).toBe(false);
- expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(true);
- expect(compressSpy.mock.calls[0][1].originalTokenCount).toBe(0);
- });
-
- it('does not bypass the token gate below the heap-pressure threshold', async () => {
- const compressSpy = mockCompressionService('noop');
- mockHeapPressure(650);
-
- await chat.tryCompress('p1', 'm1');
-
- expect(compressSpy.mock.calls[0][1].force).toBe(false);
- expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(false);
- });
-
- it('does not let a failed heap-pressure attempt latch off later auto-compaction', async () => {
- const compressSpy = mockCompressionService('failed-inflated');
- mockHeapPressure(701);
-
- const first = await chat.tryCompress('p1', 'm1');
- expect(first.compressionStatus).toBe(
- CompressionStatus.COMPRESSION_FAILED_INFLATED_TOKEN_COUNT,
- );
- expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(true);
-
- compressSpy.mockClear();
- compressSpy.mockResolvedValue({
- newHistory: null,
- info: {
- originalTokenCount: 0,
- newTokenCount: 0,
- compressionStatus: CompressionStatus.NOOP,
- },
- });
- mockHeapPressure(0);
-
- await chat.tryCompress('p2', 'm1');
-
- expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(false);
- expect(compressSpy.mock.calls[0][1].hasFailedCompressionAttempt).toBe(
- false,
- );
- });
-
- it('backs off repeated heap-pressure bypasses after a heap-triggered failure', async () => {
- vi.useFakeTimers();
- vi.setSystemTime(new Date('2026-05-16T00:00:00Z'));
- try {
- const compressSpy = mockCompressionService('failed-inflated');
- mockHeapPressure(800);
-
- await chat.tryCompress('p1', 'm1');
- expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(true);
-
- compressSpy.mockClear();
- compressSpy.mockResolvedValue({
- newHistory: null,
- info: {
- originalTokenCount: 0,
- newTokenCount: 0,
- compressionStatus: CompressionStatus.NOOP,
- },
- });
-
- await chat.tryCompress('p2', 'm1');
- expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(false);
-
- vi.setSystemTime(new Date('2026-05-16T00:00:31Z'));
- compressSpy.mockClear();
-
- await chat.tryCompress('p3', 'm1');
- expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(true);
- } finally {
- vi.useRealTimers();
- }
- });
-
- it('backs off repeated heap-pressure bypasses after a heap-triggered NOOP', async () => {
- vi.useFakeTimers();
- vi.setSystemTime(new Date('2026-05-16T00:00:00Z'));
- try {
- const compressSpy = mockCompressionService('noop');
- mockHeapPressure(800);
-
- await chat.tryCompress('p1', 'm1');
- expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(true);
-
- compressSpy.mockClear();
-
- await chat.tryCompress('p2', 'm1');
- expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(false);
-
- vi.setSystemTime(new Date('2026-05-16T00:00:31Z'));
- compressSpy.mockClear();
-
- await chat.tryCompress('p3', 'm1');
- expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(true);
- } finally {
- vi.useRealTimers();
- }
- });
-
- it('falls back to token-threshold behavior if heap statistics are unavailable', async () => {
- const compressSpy = mockCompressionService('noop');
- mockGetHeapStatistics.mockImplementation(() => {
- throw new Error('heap stats unavailable');
- });
-
- await chat.tryCompress('p1', 'm1');
-
- expect(compressSpy.mock.calls[0][1].bypassTokenThreshold).toBe(false);
});
});
});
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index c2fc71bbea..d20e63065f 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -17,7 +17,6 @@ import type {
GenerateContentResponseUsageMetadata,
} from '@google/genai';
import { createUserContent, FinishReason } from '@google/genai';
-import { getHeapStatistics } from 'node:v8';
import { retryWithBackoff, isUnattendedMode } from '../utils/retry.js';
import { getErrorStatus, isAbortError } from '../utils/errors.js';
import { createDebugLogger } from '../utils/debugLogger.js';
@@ -59,10 +58,6 @@ import { getCustomSystemPrompt } from './prompts.js';
const debugLogger = createDebugLogger('QWEN_CODE_CHAT');
-// Leave roughly 30% V8 heap headroom for compression's transient allocations.
-const HEAP_PRESSURE_COMPRESSION_RATIO = 0.7;
-const HEAP_PRESSURE_COMPRESSION_COOLDOWN_MS = 30_000;
-
/**
* Replaces the args on a `structured_output` `functionCall` with the
* same `__redacted` placeholder used by `ToolCallEvent` telemetry
@@ -353,6 +348,13 @@ function extractCuratedHistory(comprehensiveHistory: Content[]): Content[] {
return curatedHistory;
}
+/**
+ * Returns a new `Content` object with a new `parts` array, leaving the
+ * individual part objects (and everything nested inside them) shared with
+ * the input.
+ *
+ * @param content Entry to copy; it is not modified.
+ * @returns The container-level copy. Callers may reorder, add or remove parts
+ * on the result, but mutating a part object also mutates the source entry.
+ */
+function copyContentContainer(content: Content): Content {
+ return {
+ ...content,
+ // Only override `parts` when the source has one, so the copy never gains
+ // a `parts` key the source entry did not have.
+ ...(content.parts ? { parts: [...content.parts] } : {}),
+ };
+}
+
function stripThoughtPartsFromContent(content: Content): Content | null {
if (!content.parts) {
return content;
@@ -441,14 +443,6 @@ export class GeminiChat {
*/
private hasFailedCompressionAttempt = false;
- /**
- * Heap-pressure compaction is process-wide pressure applied per chat. If one
- * heap-triggered attempt cannot reduce history, briefly back off this chat
- * so every subsequent send does not immediately pay for another compression
- * side query while memory is already tight.
- */
- private heapPressureCompressionCooldownUntil = 0;
-
/**
* Creates a new GeminiChat instance.
*
@@ -482,6 +476,18 @@ export class GeminiChat {
return this.lastPromptTokenCount;
}
+ /**
+ * Builds request contents for the content generator without deep-cloning the
+ * whole chat history. This is an internal hot path: long sessions can make a
+ * full `structuredClone` larger than the remaining V8 heap headroom.
+ *
+ * Public history readers still use {@link getHistory}, which returns a
+ * defensive deep copy for caller mutation safety.
+ *
+ * @returns The curated history, where every entry is a new object with a new
+ * `parts` array. The part objects themselves are shared with the stored
+ * history, so they must be treated as read-only.
+ */
+ private getRequestHistory(): Content[] {
+ return extractCuratedHistory(this.history).map(copyContentContainer);
+ }
+
/**
* Seed the last-prompt-token-count for chats created with inherited
* history (forks, subagents, speculation). Without this, the auto-compress
@@ -509,33 +515,6 @@ export class GeminiChat {
signal?: AbortSignal,
options?: TryCompressOptions,
): Promise {
- const heapPressureRatio = force ? null : this.getHeapPressureRatio();
- const heapPressureCooldownActive =
- !force && Date.now() < this.heapPressureCompressionCooldownUntil;
- const bypassTokenThreshold =
- heapPressureRatio !== null &&
- heapPressureRatio >= HEAP_PRESSURE_COMPRESSION_RATIO &&
- !heapPressureCooldownActive;
- if (bypassTokenThreshold) {
- // Temporary safety net: token-based compaction can be too late for
- // large-context sessions because JS heap pressure may hit first.
- // Do not use force=true here because that carries manual /compress
- // semantics in ChatCompressionService.
- debugLogger.warn(
- `Heap pressure at ${(heapPressureRatio * 100).toFixed(1)}%; ` +
- 'attempting auto-compaction before token threshold.',
- );
- } else if (
- heapPressureRatio !== null &&
- heapPressureRatio >= HEAP_PRESSURE_COMPRESSION_RATIO &&
- heapPressureCooldownActive
- ) {
- debugLogger.debug(
- `Heap pressure at ${(heapPressureRatio * 100).toFixed(1)}%; ` +
- 'skipping heap-pressure auto-compaction during cooldown.',
- );
- }
-
const service = new ChatCompressionService();
const { newHistory, info } = await service.compress(this, {
promptId,
@@ -545,7 +524,6 @@ export class GeminiChat {
hasFailedCompressionAttempt: this.hasFailedCompressionAttempt,
originalTokenCount:
options?.originalTokenCountOverride ?? this.lastPromptTokenCount,
- bypassTokenThreshold,
trigger: options?.trigger,
signal,
});
@@ -555,37 +533,13 @@ export class GeminiChat {
info,
compressedHistory: newHistory,
});
- // Auto-compaction replaces history in place — no env-context refresh
- // here. Manual /compress goes through GeminiClient.tryCompressChat,
- // which calls startChat() to re-prepend a fresh env snapshot. See
- // GeminiClient.sendMessageStream for the rationale behind the split.
this.setHistory(newHistory);
- // Compaction summarises away prior full-Read tool results, but the
- // FileReadCache still treats those reads as "in this conversation".
- // A follow-up Read could then return the file_unchanged placeholder
- // pointing at content the model can no longer retrieve from history.
debugLogger.debug('[FILE_READ_CACHE] clear after auto tryCompress');
this.config.getFileReadCache().clear();
this.lastPromptTokenCount = info.newTokenCount;
- // Mirror to the global singleton only when wired (main session).
- // Subagents pass `telemetryService=undefined` to keep their context
- // usage out of the main agent's UI counters.
this.telemetryService?.setLastPromptTokenCount(info.newTokenCount);
- // Re-enable auto-compaction so a forced /compress recovers a chat
- // that an earlier auto-attempt latched off.
this.hasFailedCompressionAttempt = false;
- this.heapPressureCompressionCooldownUntil = 0;
- } else if (bypassTokenThreshold) {
- // If heap-pressure compaction cannot reduce history (NOOP or failure),
- // avoid repeatedly cloning history and/or paying side-query latency while
- // the process-wide pressure remains high.
- this.heapPressureCompressionCooldownUntil =
- Date.now() + HEAP_PRESSURE_COMPRESSION_COOLDOWN_MS;
} else if (isCompressionFailureStatus(info.compressionStatus)) {
- // Track failed attempts (only mark as failed if not forced) so we
- // stop spending compression-API calls on a chat that can't shrink.
- // Heap-pressure attempts are a safety net, not evidence that normal
- // token-threshold compaction should be latched off for this chat.
if (!force) {
this.hasFailedCompressionAttempt = true;
}
@@ -594,24 +548,6 @@ export class GeminiChat {
return info;
}
- private getHeapPressureRatio(): number | null {
- try {
- const { used_heap_size: usedHeapSize, heap_size_limit: heapLimit } =
- getHeapStatistics();
- if (
- !Number.isFinite(usedHeapSize) ||
- usedHeapSize < 0 ||
- !Number.isFinite(heapLimit) ||
- heapLimit <= 0
- ) {
- return null;
- }
- return usedHeapSize / heapLimit;
- } catch {
- return null;
- }
- }
-
setSystemInstruction(sysInstr: string) {
this.generationConfig.systemInstruction = sysInstr;
}
@@ -701,7 +637,7 @@ export class GeminiChat {
// Add user content to history ONCE before any attempts.
this.history.push(userContent);
userContentAdded = true;
- requestContents = this.getHistory(true);
+ requestContents = this.getRequestHistory();
} catch (error) {
if (userContentAdded) {
this.history.pop();
@@ -866,7 +802,7 @@ export class GeminiChat {
reactiveInfo.compressionStatus ===
CompressionStatus.COMPRESSED
) {
- requestContents = self.getHistory(true);
+ requestContents = self.getRequestHistory();
debugLogger.info(
`Reactive compression succeeded: ` +
`${reactiveInfo.originalTokenCount} -> ` +
@@ -1070,7 +1006,7 @@ export class GeminiChat {
// model's continuation appends to the previous partial output.
yield { type: StreamEventType.RETRY, isContinuation: true };
// Re-send with the updated history (includes partial + recovery)
- const recoveryContents = self.getHistory(true);
+ const recoveryContents = self.getRequestHistory();
escalatedFinishReason = undefined;
try {
const recoveryStream = await self.makeApiCallAndProcessStream(
@@ -1237,6 +1173,29 @@ export class GeminiChat {
return structuredClone(history.slice(-count));
}
+ /**
+ * Returns a shallow copy of the history and each entry's parts array without
+ * cloning large part payloads. Use only for read-only consumers or consumers
+ * that replace touched entries before mutating them.
+ *
+ * @param curated When true, copy the curated history produced by
+ * `extractCuratedHistory` instead of the raw history. Defaults to false.
+ * @returns New entry objects with new `parts` arrays; the part objects are
+ * shared with the stored history.
+ */
+ getHistoryShallow(curated: boolean = false): Content[] {
+ const history = curated
+ ? extractCuratedHistory(this.history)
+ : this.history;
+ return history.map(copyContentContainer);
+ }
+
+ /**
+ * Shallow tail variant for hot paths that only need recent history.
+ *
+ * @param count Maximum number of trailing entries to return. Zero or a
+ * negative value yields an empty array.
+ * @param curated When true, take the tail of the curated history instead of
+ * the raw history. Defaults to false.
+ * @returns Up to `count` trailing entries, each a new object with a new
+ * `parts` array; the part objects are shared with the stored history.
+ */
+ getHistoryTailShallow(count: number, curated: boolean = false): Content[] {
+ // Guard before slicing: `slice(-0)` is `slice(0)`, which would return the
+ // whole history rather than nothing.
+ if (count <= 0) return [];
+ const history = curated
+ ? extractCuratedHistory(this.history)
+ : this.history;
+ return history.slice(-count).map(copyContentContainer);
+ }
+
/**
* Returns a defensive copy of the last raw history entry without cloning the
* full conversation. This avoids O(history) cloning, though cloning the last
@@ -1246,6 +1205,33 @@ export class GeminiChat {
return this.getHistoryTail(1)[0];
}
+ /**
+ * Returns the last raw history entry for read-only checks. Callers must not
+ * mutate the returned object.
+ *
+ * @returns The stored entry itself (not a copy), or `undefined` when the
+ * history is empty.
+ */
+ peekLastHistoryEntry(): Content | undefined {
+ return this.history.at(-1);
+ }
+
+ /**
+ * Returns concatenated text from the last model entry without cloning the
+ * full history. Used by stop hooks, where only the latest assistant text is
+ * needed.
+ *
+ * Only the most recent `model` entry is inspected: if it carries no text the
+ * result is `undefined`, and earlier model entries are not consulted.
+ *
+ * @returns The text parts joined with no separator, or `undefined` when there
+ * is no model entry or the joined text is empty.
+ */
+ getLastModelMessageText(): string | undefined {
+ // Walk backwards so the scan stops at the newest model entry.
+ for (let i = this.history.length - 1; i >= 0; i--) {
+ const message = this.history[i];
+ if (message?.role !== 'model') continue;
+ // Every part that has a `text` key contributes, whatever else it carries.
+ // NOTE(review): parts flagged as thoughts presumably also have `text` and
+ // would be included here — confirm that is intended for stop hooks.
+ const text =
+ message.parts
+ ?.filter((part): part is { text: string } => 'text' in part)
+ .map((part) => part.text)
+ .join('') ?? '';
+ // Returns from inside the loop: an empty result ends the search rather
+ // than falling back to an older model entry.
+ return text || undefined;
+ }
+ return undefined;
+ }
+
/**
* Returns the number of entries in the raw chat history. O(1) and
* does not clone — use this when you only need the count and would
diff --git a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts
index 059104d5c6..cfd16be7f2 100644
--- a/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts
+++ b/packages/core/src/core/loggingContentGenerator/loggingContentGenerator.ts
@@ -44,6 +44,7 @@ import { openaiRequestCaptureContext } from '../openaiContentGenerator/requestCa
import type { RequestContext } from '../openaiContentGenerator/types.js';
import { OpenAILogger } from '../../utils/openaiLogger.js';
import { createDebugLogger } from '../../utils/debugLogger.js';
+import { runtimeDiagnostics } from '../../utils/runtimeDiagnostics.js';
import {
getErrorMessage,
getErrorStatus,
@@ -226,6 +227,10 @@ export class LoggingContentGenerator implements ContentGenerator {
const isInternal = isInternalPromptId(userPromptId);
const session = this.startCaptureSession();
try {
+ runtimeDiagnostics.recordGenerateContentRequest(req, {
+ stream: false,
+ source: 'generateContent',
+ });
if (!isInternal) {
addSystemPromptAttributes(
this.config,
@@ -336,6 +341,10 @@ export class LoggingContentGenerator implements ContentGenerator {
let stream: AsyncGenerator;
try {
+ runtimeDiagnostics.recordGenerateContentRequest(req, {
+ stream: true,
+ source: 'generateContentStream',
+ });
if (!isInternal) {
addSystemPromptAttributes(
this.config,
diff --git a/packages/core/src/core/openaiContentGenerator/pipeline.ts b/packages/core/src/core/openaiContentGenerator/pipeline.ts
index c814527d61..605ab4b45d 100644
--- a/packages/core/src/core/openaiContentGenerator/pipeline.ts
+++ b/packages/core/src/core/openaiContentGenerator/pipeline.ts
@@ -18,6 +18,7 @@ import { StreamingToolCallParser } from './streamingToolCallParser.js';
import { TaggedThinkingParser } from './taggedThinkingParser.js';
import type { PipelineConfig, RequestContext } from './types.js';
import { redactProxyError } from '../../utils/runtimeFetchOptions.js';
+import { runtimeDiagnostics } from '../../utils/runtimeDiagnostics.js';
/**
* The OpenAI SDK adds an abort listener for every `chat.completions.create`
@@ -515,6 +516,7 @@ export class ContentGenerationPipeline {
// provider enhancement, post disable-reasoning) and before the SDK call
// so the logger sees the exact bytes sent on the wire.
openaiRequestCaptureContext.getStore()?.(openaiRequest);
+ runtimeDiagnostics.recordOpenAIWireRequest(openaiRequest);
const result = await executor(openaiRequest, context);
return result;
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 8e69bacd66..95b516264e 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -301,6 +301,7 @@ export * from './utils/jsonl-utils.js';
export * from './utils/memoryDiagnostics.js';
export * from './utils/memoryDiscovery.js';
export * from './utils/modelId.js';
+export * from './utils/runtimeDiagnostics.js';
export { ConditionalRulesRegistry } from './utils/rulesDiscovery.js';
export type { RuleFile } from './utils/rulesDiscovery.js';
export {
diff --git a/packages/core/src/services/chatCompressionService.test.ts b/packages/core/src/services/chatCompressionService.test.ts
index 3aa349863e..e42d6e80d4 100644
--- a/packages/core/src/services/chatCompressionService.test.ts
+++ b/packages/core/src/services/chatCompressionService.test.ts
@@ -389,6 +389,9 @@ describe('ChatCompressionService', () => {
service = new ChatCompressionService();
mockChat = {
getHistory: vi.fn(),
+ getHistoryShallow: vi.fn((curated?: boolean) =>
+ mockChat.getHistory(curated),
+ ),
appendSystemInstruction: vi.fn(),
} as unknown as GeminiChat;
mockGetHookSystem = vi.fn().mockReturnValue({});
@@ -463,88 +466,6 @@ describe('ChatCompressionService', () => {
expect(result.newHistory).toBeNull();
});
- it('should bypass the token threshold when requested without force=true', async () => {
- const history: Content[] = [
- { role: 'user', parts: [{ text: 'msg1' }] },
- { role: 'model', parts: [{ text: 'msg2' }] },
- { role: 'user', parts: [{ text: 'msg3' }] },
- { role: 'model', parts: [{ text: 'msg4' }] },
- ];
- vi.mocked(mockChat.getHistory).mockReturnValue(history);
- vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(100);
- vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
- model: 'gemini-pro',
- contextWindowSize: 1000,
- } as unknown as ReturnType);
-
- const mockGenerateContent = vi.fn().mockResolvedValue({
- text: 'Summary',
- usage: {
- promptTokenCount: 1100,
- candidatesTokenCount: 50,
- totalTokenCount: 1150,
- },
- });
- vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
- generateText: mockGenerateContent,
- } as unknown as BaseLlmClient);
-
- const result = await service.compress(mockChat, {
- promptId: mockPromptId,
- force: false,
- bypassTokenThreshold: true,
- model: mockModel,
- config: mockConfig,
- hasFailedCompressionAttempt: false,
- originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
- });
-
- expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
- expect(result.newHistory).not.toBeNull();
- expect(mockGenerateContent).toHaveBeenCalled();
- });
-
- it('should bypass the failed-attempt latch when heap pressure requests compaction', async () => {
- const history: Content[] = [
- { role: 'user', parts: [{ text: 'msg1' }] },
- { role: 'model', parts: [{ text: 'msg2' }] },
- { role: 'user', parts: [{ text: 'msg3' }] },
- { role: 'model', parts: [{ text: 'msg4' }] },
- ];
- vi.mocked(mockChat.getHistory).mockReturnValue(history);
- vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(100);
- vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
- model: 'gemini-pro',
- contextWindowSize: 1000,
- } as unknown as ReturnType);
-
- const mockGenerateContent = vi.fn().mockResolvedValue({
- text: 'Summary',
- usage: {
- promptTokenCount: 1100,
- candidatesTokenCount: 50,
- totalTokenCount: 1150,
- },
- });
- vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
- generateText: mockGenerateContent,
- } as unknown as BaseLlmClient);
-
- const result = await service.compress(mockChat, {
- promptId: mockPromptId,
- force: false,
- bypassTokenThreshold: true,
- model: mockModel,
- config: mockConfig,
- hasFailedCompressionAttempt: true,
- originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
- });
-
- expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
- expect(result.newHistory).not.toBeNull();
- expect(mockGenerateContent).toHaveBeenCalled();
- });
-
it('should return NOOP when contextPercentageThreshold is 0', async () => {
const history: Content[] = [
{ role: 'user', parts: [{ text: 'msg1' }] },
@@ -595,41 +516,6 @@ describe('ChatCompressionService', () => {
expect(tokenLimit).not.toHaveBeenCalled();
});
- it('should return NOOP when contextPercentageThreshold is 0 even with token threshold bypass', async () => {
- const history: Content[] = [
- { role: 'user', parts: [{ text: 'msg1' }] },
- { role: 'model', parts: [{ text: 'msg2' }] },
- ];
- vi.mocked(mockChat.getHistory).mockReturnValue(history);
- vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(800);
- vi.mocked(mockConfig.getChatCompression).mockReturnValue({
- contextPercentageThreshold: 0,
- });
-
- const mockGenerateContent = vi.fn();
- vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
- generateText: mockGenerateContent,
- } as unknown as BaseLlmClient);
-
- const result = await service.compress(mockChat, {
- promptId: mockPromptId,
- force: false,
- bypassTokenThreshold: true,
- model: mockModel,
- config: mockConfig,
- hasFailedCompressionAttempt: false,
- originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
- });
-
- expect(result.info).toMatchObject({
- compressionStatus: CompressionStatus.NOOP,
- originalTokenCount: 0,
- newTokenCount: 0,
- });
- expect(mockGenerateContent).not.toHaveBeenCalled();
- expect(tokenLimit).not.toHaveBeenCalled();
- });
-
it('should return NOOP when historyToCompress is below MIN_COMPRESSION_FRACTION of total', async () => {
// Construct a history where the split point lands on the 2nd regular user
// message (index 2), but indices 0-1 are tiny relative to the huge content
@@ -715,6 +601,72 @@ describe('ChatCompressionService', () => {
expect(mockGetHookSystem).toHaveBeenCalled();
});
+ // Regression test: compression must read history through
+ // `getHistoryShallow(true)` and never through the deep-cloning `getHistory`.
+ it('does not deep-clone full history while compressing', async () => {
+ const largeToolOutput = 'x'.repeat(1024 * 1024);
+ // A tool-heavy conversation: one read_file call whose response carries the
+ // 1 MiB string above.
+ const history: Content[] = [
+ { role: 'user', parts: [{ text: 'review this PR' }] },
+ {
+ role: 'model',
+ parts: [
+ {
+ functionCall: {
+ id: 'read-1',
+ name: 'read_file',
+ args: { path: 'large.ts' },
+ },
+ },
+ ],
+ },
+ {
+ role: 'user',
+ parts: [
+ {
+ functionResponse: {
+ id: 'read-1',
+ name: 'read_file',
+ response: { output: largeToolOutput },
+ },
+ },
+ ],
+ },
+ { role: 'model', parts: [{ text: 'analysis' }] },
+ ];
+ // Make the deep-cloning reader throw and serve the fixture only through the
+ // shallow reader, so any use of `getHistory` fails the test.
+ vi.mocked(mockChat.getHistory).mockImplementation(() => {
+ throw new Error('getHistory should not be called by compression');
+ });
+ vi.mocked(mockChat.getHistoryShallow).mockReturnValue(history);
+ vi.mocked(mockConfig.getContentGeneratorConfig).mockReturnValue({
+ model: 'gemini-pro',
+ contextWindowSize: 1000,
+ } as unknown as ReturnType);
+ // 800 reported prompt tokens against a 1000-token window; presumably above
+ // the configured threshold so the token gate lets compression proceed.
+ vi.mocked(uiTelemetryService.getLastPromptTokenCount).mockReturnValue(800);
+
+ const mockGenerateContent = vi.fn().mockResolvedValue({
+ text: 'Summary',
+ usage: {
+ promptTokenCount: 1600,
+ candidatesTokenCount: 50,
+ totalTokenCount: 1650,
+ },
+ });
+ vi.mocked(mockConfig.getBaseLlmClient).mockReturnValue({
+ generateText: mockGenerateContent,
+ } as unknown as BaseLlmClient);
+
+ const result = await service.compress(mockChat, {
+ promptId: mockPromptId,
+ force: false,
+ model: mockModel,
+ config: mockConfig,
+ hasFailedCompressionAttempt: false,
+ originalTokenCount: uiTelemetryService.getLastPromptTokenCount(),
+ });
+
+ // Compression succeeded using only the curated shallow reader.
+ expect(result.info.compressionStatus).toBe(CompressionStatus.COMPRESSED);
+ expect(mockChat.getHistory).not.toHaveBeenCalled();
+ expect(mockChat.getHistoryShallow).toHaveBeenCalledWith(true);
+ });
+
it('should force compress even if under threshold', async () => {
const history: Content[] = [
{ role: 'user', parts: [{ text: 'msg1' }] },
diff --git a/packages/core/src/services/chatCompressionService.ts b/packages/core/src/services/chatCompressionService.ts
index f704ee10fe..97934d819e 100644
--- a/packages/core/src/services/chatCompressionService.ts
+++ b/packages/core/src/services/chatCompressionService.ts
@@ -181,14 +181,6 @@ export interface CompressOptions {
* the service does not read or write any global telemetry.
*/
originalTokenCount: number;
- /**
- * Bypass the token-count threshold gate and the failed-attempt latch while
- * preserving automatic compaction semantics. Used for temporary heap-pressure
- * relief where `force=true` would be too broad because it means manual
- * `/compress`. The heap-pressure check that sets this lives in
- * `GeminiChat.tryCompress()`.
- */
- bypassTokenThreshold?: boolean;
/**
* Hook trigger to report for this compression. `force=true` bypasses the
* threshold gate but does not always mean the user manually requested
@@ -210,7 +202,6 @@ export class ChatCompressionService {
config,
hasFailedCompressionAttempt,
originalTokenCount,
- bypassTokenThreshold = false,
trigger,
signal,
} = opts;
@@ -221,13 +212,7 @@ export class ChatCompressionService {
COMPRESSION_TOKEN_THRESHOLD;
const slimmingConfig = resolveSlimmingConfig(chatCompressionSettings);
- // Cheap gates first — these don't need the curated history. Heap-pressure
- // bypass must also bypass the failed-attempt latch, otherwise one failed
- // compression would disable this safety net for the rest of the chat.
- if (
- threshold <= 0 ||
- (hasFailedCompressionAttempt && !force && !bypassTokenThreshold)
- ) {
+ if (threshold <= 0 || (hasFailedCompressionAttempt && !force)) {
return {
newHistory: null,
info: {
@@ -238,10 +223,7 @@ export class ChatCompressionService {
};
}
- // Don't compress if not forced and we are under the token limit. This is
- // the steady-state path on every send; heap pressure may bypass it because
- // the JS heap can become the limiting resource before token count does.
- if (!force && !bypassTokenThreshold) {
+ if (!force) {
const contextLimit =
config.getContentGeneratorConfig()?.contextWindowSize ??
DEFAULT_TOKEN_LIMIT;
@@ -257,7 +239,12 @@ export class ChatCompressionService {
}
}
- const curatedHistory = chat.getHistory(true);
+ // Compression only reads the existing history while deciding the split and
+ // preparing the side-query payload. Avoid `getHistory(true)` here: long
+ // tool-heavy sessions can make a defensive deep clone larger than the
+ // remaining V8 heap headroom at exactly the moment compaction is trying to
+ // reduce memory pressure.
+ const curatedHistory = chat.getHistoryShallow(true);
if (curatedHistory.length === 0) {
return {
newHistory: null,
diff --git a/packages/core/src/services/sessionService.test.ts b/packages/core/src/services/sessionService.test.ts
index 24e5942587..83c574bfee 100644
--- a/packages/core/src/services/sessionService.test.ts
+++ b/packages/core/src/services/sessionService.test.ts
@@ -947,6 +947,57 @@ describe('SessionService', () => {
expect(history).toEqual([recordA1.message, assistantA1.message]);
});
+    // Regression test: rebuilding API history for a resumed session must copy
+    // containers only, leaving large tool payloads shared and never calling
+    // `structuredClone`.
+    it('does not deep-clone stored messages when rebuilding resume API history', () => {
+      const largePayload = {
+        output: 'x'.repeat(128 * 1024),
+        nested: { keep: true },
+      };
+      const toolResult: ChatRecord = {
+        uuid: 'large-tool-result',
+        parentUuid: recordA1.uuid,
+        sessionId: sessionIdA,
+        timestamp: '2024-01-01T00:02:00Z',
+        type: 'tool_result',
+        message: {
+          role: 'user',
+          parts: [
+            {
+              functionResponse: {
+                id: 'call-1',
+                name: 'read_file',
+                response: largePayload,
+              },
+            },
+          ],
+        },
+        cwd: '/test/project/root',
+        version: '1.0.0',
+      };
+      const conversation: ConversationRecord = {
+        sessionId: sessionIdA,
+        projectHash: 'test-project-hash',
+        startTime: '2024-01-01T00:00:00Z',
+        lastUpdated: '2024-01-01T00:02:00Z',
+        messages: [recordA1, toolResult],
+      };
+      // Any deep clone from here on throws, failing the test immediately.
+      const structuredCloneSpy = vi
+        .spyOn(globalThis, 'structuredClone')
+        .mockImplementation(() => {
+          throw new Error('unexpected deep clone');
+        });
+
+      try {
+        const history = buildApiHistoryFromConversation(conversation);
+
+        expect(structuredCloneSpy).not.toHaveBeenCalled();
+        expect(history).toEqual([recordA1.message, toolResult.message]);
+        // The message and its parts array are fresh containers...
+        expect(history[1]).not.toBe(toolResult.message);
+        expect(history[1].parts).not.toBe(toolResult.message!.parts);
+        // ...while the large response payload is the very same object.
+        const response = history[1].parts![0] as {
+          functionResponse: { response: typeof largePayload };
+        };
+        expect(response.functionResponse.response).toBe(largePayload);
+      } finally {
+        // The stub replaces a global: restore it even when an assertion fails
+        // so it cannot leak into later tests. Assertions on the spy run before
+        // this point because restoring also clears its recorded calls.
+        structuredCloneSpy.mockRestore();
+      }
+    });
+
it('merges mid-turn user messages into the preceding tool result on resume', () => {
const assistantWithToolCall: ChatRecord = {
uuid: 'a2',
diff --git a/packages/core/src/services/sessionService.ts b/packages/core/src/services/sessionService.ts
index 3ccf2152fa..ffd0d0c721 100644
--- a/packages/core/src/services/sessionService.ts
+++ b/packages/core/src/services/sessionService.ts
@@ -1191,10 +1191,38 @@ function stripThoughtsFromContent(content: Content): Content | null {
};
}
+/**
+ * Produces a lightweight copy of a stored message for the API history.
+ * Fresh objects are created for the content, its `parts` array, every part,
+ * each `functionCall` / `functionResponse` wrapper, and `functionCall.args`.
+ * Anything nested deeper (for example `functionResponse.response`) is shared
+ * by reference with the stored record rather than cloned.
+ */
+function copyContentForApiHistory(content: Content): Content {
+  const parts = content.parts?.map((part) => {
+    if ('functionCall' in part && part.functionCall) {
+      const call = part.functionCall;
+      return {
+        ...part,
+        functionCall: {
+          ...call,
+          args: call.args ? { ...call.args } : call.args,
+        },
+      };
+    }
+    if ('functionResponse' in part && part.functionResponse) {
+      return { ...part, functionResponse: { ...part.functionResponse } };
+    }
+    // Every other kind of part only needs a top-level copy.
+    return { ...part };
+  });
+  return { ...content, parts };
+}
+
function appendApiHistoryRecord(history: Content[], record: ChatRecord): void {
if (!record.message) return;
- const message = structuredClone(record.message as Content);
+ const message = copyContentForApiHistory(record.message as Content);
if (record.subtype === 'mid_turn_user_message') {
const previous = history.at(-1);
if (previous?.role === 'user') {
@@ -1240,7 +1268,9 @@ export function buildApiHistoryFromConversation(
});
if (compressedHistory && lastCompressionIndex >= 0) {
- const baseHistory: Content[] = structuredClone(compressedHistory);
+ const baseHistory: Content[] = compressedHistory.map(
+ copyContentForApiHistory,
+ );
// Append everything after the compression record (newer turns)
for (let i = lastCompressionIndex + 1; i < messages.length; i++) {
diff --git a/packages/core/src/tools/agent/agent.ts b/packages/core/src/tools/agent/agent.ts
index ba871d3c4a..05f8cc2bd3 100644
--- a/packages/core/src/tools/agent/agent.ts
+++ b/packages/core/src/tools/agent/agent.ts
@@ -960,7 +960,10 @@ class AgentToolInvocation extends BaseToolInvocation {
toolConfig: ToolConfig;
}> {
const geminiClient = this.config.getGeminiClient();
- const rawHistory = geminiClient ? geminiClient.getHistory(true) : [];
+ const rawHistory = geminiClient
+ ? (geminiClient.getHistoryShallow?.(true) ??
+ geminiClient.getHistory(true))
+ : [];
// Build the history that will seed the fork's chat. Must end with a
// model message so agent-headless can send the task_prompt as a user
diff --git a/packages/core/src/utils/memoryDiagnostics.test.ts b/packages/core/src/utils/memoryDiagnostics.test.ts
index 0e7c3de4a0..5a1daa24e3 100644
--- a/packages/core/src/utils/memoryDiagnostics.test.ts
+++ b/packages/core/src/utils/memoryDiagnostics.test.ts
@@ -83,6 +83,9 @@ describe('collectMemoryDiagnostics', () => {
activeRequests: () => 3,
openFileDescriptors: async () => 501,
smapsRollup: async () => 'Rss: 5000 kB',
+ processTree: async () => {
+ throw new Error('not available');
+ },
platform: 'linux',
nodeVersion: 'v20.19.0',
});
@@ -117,10 +120,13 @@ describe('collectMemoryDiagnostics', () => {
},
],
resourceUsage: {
- maxRSS: 6,
+ maxRSS: 6 * 1024,
+ maxRSSRaw: 6,
+ maxRSSUnit: 'KiB',
userCPUTime: 10,
systemCPUTime: 20,
},
+ processTree: null,
activeHandles: 300,
activeRequests: 3,
openFileDescriptors: 501,
@@ -226,7 +232,7 @@ describe('collectMemoryDiagnostics', () => {
);
});
- it('treats maxRSS as bytes on all platforms', async () => {
+ it('normalizes resourceUsage maxRSS from KiB to bytes', async () => {
const diagnostics = await collectMemoryDiagnostics({
memoryUsage: () => ({
heapUsed: 100,
@@ -273,8 +279,70 @@ describe('collectMemoryDiagnostics', () => {
nodeVersion: 'v20.19.0',
});
- // Node.js >=14.10.0 returns maxRSS in bytes on all platforms.
- expect(diagnostics.resourceUsage.maxRSS).toBe(4_096);
+ expect(diagnostics.resourceUsage.maxRSS).toBe(4_096 * 1024);
+ expect(diagnostics.resourceUsage.maxRSSRaw).toBe(4_096);
+ expect(diagnostics.resourceUsage.maxRSSUnit).toBe('KiB');
+ });
+
+ it('includes process tree RSS when the optional probe is available', async () => {
+ const diagnostics = await collectMemoryDiagnostics({
+ memoryUsage: () => ({
+ heapUsed: 100,
+ heapTotal: 200,
+ rss: 300,
+ external: 10,
+ arrayBuffers: 5,
+ }),
+ heapStatistics: () => ({
+ heap_size_limit: 1_000,
+ total_heap_size: 200,
+ total_heap_size_executable: 0,
+ total_physical_size: 200,
+ used_heap_size: 100,
+ malloced_memory: 0,
+ peak_malloced_memory: 0,
+ does_zap_garbage: 0,
+ number_of_native_contexts: 1,
+ number_of_detached_contexts: 0,
+ total_available_size: 900,
+ total_global_handles_size: 0,
+ used_global_handles_size: 0,
+ external_memory: 10,
+ }),
+ resourceUsage: () => ({
+ userCPUTime: 10,
+ systemCPUTime: 20,
+ maxRSS: 4_096,
+ sharedMemorySize: 0,
+ unsharedDataSize: 0,
+ unsharedStackSize: 0,
+ minorPageFault: 0,
+ majorPageFault: 0,
+ swappedOut: 0,
+ fsRead: 0,
+ fsWrite: 0,
+ ipcSent: 0,
+ ipcReceived: 0,
+ signalsCount: 0,
+ voluntaryContextSwitches: 0,
+ involuntaryContextSwitches: 0,
+ }),
+ processTree: async () => ({
+ rootPid: 123,
+ processCount: 3,
+ rootRSS: 10 * 1024 * 1024,
+ treeRSS: 25 * 1024 * 1024,
+ }),
+ platform: 'darwin',
+ nodeVersion: 'v20.19.0',
+ });
+
+ expect(diagnostics.processTree).toEqual({
+ rootPid: 123,
+ processCount: 3,
+ rootRSS: 10 * 1024 * 1024,
+ treeRSS: 25 * 1024 * 1024,
+ });
});
it('treats unsupported optional probes as unavailable instead of failing', async () => {
diff --git a/packages/core/src/utils/memoryDiagnostics.ts b/packages/core/src/utils/memoryDiagnostics.ts
index 2ebe3c88e6..b142d4b361 100644
--- a/packages/core/src/utils/memoryDiagnostics.ts
+++ b/packages/core/src/utils/memoryDiagnostics.ts
@@ -5,7 +5,9 @@
*/
import { readdir, readFile } from 'node:fs/promises';
+import { execFile } from 'node:child_process';
import process from 'node:process';
+import { promisify } from 'node:util';
import v8 from 'node:v8';
import { createDebugLogger } from './debugLogger.js';
import { formatMemoryUsage } from './formatters.js';
@@ -20,6 +22,7 @@ const ACTIVE_HANDLES_THRESHOLD = 256;
const ACTIVE_REQUESTS_THRESHOLD = 100;
const OPEN_FD_THRESHOLD = 500;
const debugLogger = createDebugLogger('MEMORY_DIAGNOSTICS');
+const execFileAsync = promisify(execFile);
export interface MemoryDiagnostics {
timestamp: string;
@@ -30,6 +33,7 @@ export interface MemoryDiagnostics {
v8HeapStats: V8HeapStats;
v8HeapSpaces: V8HeapSpaceStats[] | null;
resourceUsage: MemoryResourceUsage;
+ processTree: ProcessTreeMemoryUsage | null;
activeHandles: number;
activeRequests: number;
openFileDescriptors: number | null;
@@ -57,11 +61,21 @@ export interface V8HeapSpaceStats {
}
export interface MemoryResourceUsage {
+ /** Normalized bytes. Node/resourceUsage reports maxRSS in KiB. */
maxRSS: number;
+ maxRSSRaw: number;
+ maxRSSUnit: 'KiB';
userCPUTime: number;
systemCPUTime: number;
}
+export interface ProcessTreeMemoryUsage { // RSS totals for this process and its descendants
+ rootPid: number; // pid of the process at the root of the tree
+ processCount: number; // number of tree members that were found in the process listing
+ rootRSS: number; // resident set size of the root process, in bytes
+ treeRSS: number; // summed resident set size of the root and all descendants, in bytes
+}
+
export interface MemoryDiagnosticsAnalysis {
risks: MemoryRisk[];
recommendation: string;
@@ -92,6 +106,7 @@ export interface MemoryDiagnosticsOptions {
activeRequests?: () => number;
openFileDescriptors?: () => Promise;
smapsRollup?: () => Promise;
+ processTree?: () => Promise;
platform?: NodeJS.Platform;
nodeVersion?: string;
}
@@ -114,7 +129,7 @@ export async function collectMemoryDiagnostics(
const heapStatistics = options.heapStatistics?.() ?? v8.getHeapStatistics();
const resourceUsage = options.resourceUsage?.() ?? process.resourceUsage();
const uptimeSeconds = options.uptimeSeconds?.() ?? process.uptime();
- const [openFileDescriptors, smapsRollup, heapSpaceStatistics] =
+ const [openFileDescriptors, smapsRollup, heapSpaceStatistics, processTree] =
await Promise.all([
optionalProbe(
'openFileDescriptors',
@@ -125,12 +140,15 @@ export async function collectMemoryDiagnostics(
'heapSpaceStatistics',
options.heapSpaceStatistics ?? (() => v8.getHeapSpaceStatistics()),
),
+ optionalProbe(
+ 'processTree',
+ options.processTree ?? (() => collectProcessTreeMemoryUsage(platform)),
+ ),
]);
const v8HeapSpaces = mapHeapSpaces(heapSpaceStatistics);
- // Node.js >=14.10.0 returns maxRSS in bytes on all platforms.
- // This project requires Node >=22.
- const maxRSSBytes = resourceUsage.maxRSS;
+ const maxRSSRaw = resourceUsage.maxRSS;
+ const maxRSSBytes = normalizeMaxRSSBytes(maxRSSRaw);
const diagnostics = {
timestamp: now().toISOString(),
@@ -142,9 +160,12 @@ export async function collectMemoryDiagnostics(
v8HeapSpaces,
resourceUsage: {
maxRSS: maxRSSBytes,
+ maxRSSRaw,
+ maxRSSUnit: 'KiB' as const,
userCPUTime: resourceUsage.userCPUTime,
systemCPUTime: resourceUsage.systemCPUTime,
},
+ processTree,
activeHandles: getProcessInternalCount(
'activeHandles',
'_getActiveHandles',
@@ -167,6 +188,10 @@ export async function collectMemoryDiagnostics(
};
}
+function normalizeMaxRSSBytes(maxRSSKiB: number): number {
+ return maxRSSKiB * 1024; // KiB -> bytes
+}
+
function mapHeapStats(heapInfo: v8.HeapInfo): V8HeapStats {
return {
heapSizeLimit: heapInfo.heap_size_limit,
@@ -233,6 +258,84 @@ async function readProcSmapsRollup(): Promise {
return readFile('/proc/self/smaps_rollup', 'utf8');
}
+/**
+ * Sums resident memory for this process and all of its descendants from a
+ * single `ps` snapshot, converting the KiB values `ps` prints into bytes.
+ * Rejects on win32, where this probe is not implemented.
+ */
+async function collectProcessTreeMemoryUsage(
+  platform: NodeJS.Platform,
+): Promise<ProcessTreeMemoryUsage> {
+  if (platform === 'win32') {
+    throw new Error('process tree RSS probe is unavailable on win32');
+  }
+  const psRun = execFileAsync('ps', ['-axo', 'pid=,ppid=,rss='], {
+    maxBuffer: 1024 * 1024,
+  });
+  const probePid = psRun.child.pid;
+  const rows = parsePsRows((await psRun).stdout);
+  const rootPid = process.pid;
+  const rowsByPid = new Map(rows.map((row) => [row.pid, row]));
+  const childrenByParent = new Map<number, PsRow[]>();
+  for (const row of rows) {
+    const children = childrenByParent.get(row.ppid);
+    if (children) {
+      children.push(row);
+    } else {
+      childrenByParent.set(row.ppid, [row]);
+    }
+  }
+  // `ps` runs as a child of this process and lists itself; pre-mark its pid
+  // (exposed via the promisified execFile's `child`) so the probe is skipped.
+  const seen = new Set<number>(probePid === undefined ? [] : [probePid]);
+  const pending = [rootPid];
+  let rootRSS = 0;
+  let treeRSS = 0;
+  let processCount = 0;
+  // Visit order does not change the totals, so an O(1) pop() is sufficient.
+  for (let pid = pending.pop(); pid !== undefined; pid = pending.pop()) {
+    if (seen.has(pid)) {
+      continue;
+    }
+    seen.add(pid);
+    const row = rowsByPid.get(pid);
+    if (row) {
+      const rssBytes = row.rssKiB * 1024;
+      if (pid === rootPid) {
+        rootRSS = rssBytes;
+      }
+      treeRSS += rssBytes;
+      processCount += 1;
+    }
+    for (const child of childrenByParent.get(pid) ?? []) {
+      pending.push(child.pid);
+    }
+  }
+  return { rootPid, processCount, rootRSS, treeRSS };
+}
+
+interface PsRow { // one parsed line of `ps` output
+ pid: number; // process id
+ ppid: number; // parent process id
+ rssKiB: number; // resident set size as printed by ps; callers multiply by 1024 to get bytes
+}
+
+/**
+ * Parses header-less `ps` output made of three whitespace-separated numeric
+ * columns (pid, ppid, rss). Lines where any of the three values is missing or
+ * is not a finite number are dropped.
+ */
+function parsePsRows(output: string): PsRow[] {
+  const rows: PsRow[] = [];
+  for (const line of output.trim().split(/\r?\n/)) {
+    const [pid, ppid, rssKiB] = line.trim().split(/\s+/).map(Number);
+    if ([pid, ppid, rssKiB].every((value) => Number.isFinite(value))) {
+      rows.push({ pid, ppid, rssKiB });
+    }
+  }
+  return rows;
+}
+
async function optionalProbe(
name: string,
probe: () => Promise,
diff --git a/packages/core/src/utils/nextSpeakerChecker.test.ts b/packages/core/src/utils/nextSpeakerChecker.test.ts
index 5ccb9dd434..451f38ee94 100644
--- a/packages/core/src/utils/nextSpeakerChecker.test.ts
+++ b/packages/core/src/utils/nextSpeakerChecker.test.ts
@@ -88,6 +88,7 @@ describe('checkNextSpeaker', () => {
// Spy on getHistory for chatInstance
vi.spyOn(chatInstance, 'getHistory');
+ vi.spyOn(chatInstance, 'getHistoryTail');
vi.spyOn(chatInstance, 'getLastHistoryEntry');
});
@@ -97,6 +98,9 @@ describe('checkNextSpeaker', () => {
function mockChatHistory(history: Content[]): void {
vi.mocked(chatInstance.getHistory).mockReturnValue(history);
+ vi.mocked(chatInstance.getHistoryTail).mockReturnValue(
+ history.length > 0 ? [structuredClone(history[history.length - 1]!)] : [],
+ );
vi.mocked(chatInstance.getLastHistoryEntry).mockReturnValue(
history.length > 0
? structuredClone(history[history.length - 1]!)
@@ -279,8 +283,36 @@ describe('checkNextSpeaker', () => {
expect(generateJsonCall[0].promptId).toBe(promptId);
});
+ it('should send only the last curated model message to the side query', async () => {
+ const oldHistory: Content[] = [
+ { role: 'user', parts: [{ text: 'old user context'.repeat(1000) }] },
+ { role: 'model', parts: [{ text: 'old model context'.repeat(1000) }] },
+ ];
+ const lastModelMessage: Content = {
+ role: 'model',
+ parts: [{ text: 'Some model output.' }],
+ };
+ mockChatHistory([...oldHistory, lastModelMessage]);
+ (mockBaseLlmClient.generateJson as Mock).mockResolvedValue({
+ reasoning: 'Model made a statement, awaiting user input.',
+ next_speaker: 'user',
+ } satisfies NextSpeakerResponse);
+
+ await checkNextSpeaker(chatInstance, mockConfig, abortSignal, promptId);
+
+ const generateJsonCall = (mockBaseLlmClient.generateJson as Mock).mock
+ .calls[0];
+ expect(generateJsonCall[0].contents).toHaveLength(2);
+ expect(generateJsonCall[0].contents[0]).toEqual(lastModelMessage);
+ expect(generateJsonCall[0].contents[1]).toMatchObject({
+ role: 'user',
+ });
+ expect(chatInstance.getHistory).not.toHaveBeenCalled();
+ expect(chatInstance.getHistoryTail).toHaveBeenCalledWith(1, true);
+ });
+
it('should use raw last history entry to detect function responses', async () => {
- vi.mocked(chatInstance.getHistory).mockReturnValue([
+ vi.mocked(chatInstance.getHistoryTail).mockReturnValue([
{
role: 'model',
parts: [{ functionCall: { name: 'read_file', args: {} } }],
@@ -310,7 +342,8 @@ describe('checkNextSpeaker', () => {
'The last message was a function response, so the model should speak next.',
next_speaker: 'model',
});
- expect(chatInstance.getHistory).toHaveBeenCalledWith(true);
+ expect(chatInstance.getHistory).not.toHaveBeenCalled();
+ expect(chatInstance.getHistoryTail).not.toHaveBeenCalled();
expect(chatInstance.getLastHistoryEntry).toHaveBeenCalledTimes(1);
expect(mockBaseLlmClient.generateJson).not.toHaveBeenCalled();
});
@@ -327,8 +360,9 @@ describe('checkNextSpeaker', () => {
await checkNextSpeaker(chatInstance, mockConfig, abortSignal, promptId);
- expect(chatInstance.getHistory).toHaveBeenCalledTimes(1);
- expect(chatInstance.getHistory).toHaveBeenCalledWith(true);
+ expect(chatInstance.getHistory).not.toHaveBeenCalled();
+ expect(chatInstance.getHistoryTail).toHaveBeenCalledTimes(1);
+ expect(chatInstance.getHistoryTail).toHaveBeenCalledWith(1, true);
expect(chatInstance.getLastHistoryEntry).toHaveBeenCalledTimes(1);
});
});
diff --git a/packages/core/src/utils/nextSpeakerChecker.ts b/packages/core/src/utils/nextSpeakerChecker.ts
index c36a8eb9ac..33b4e2f06c 100644
--- a/packages/core/src/utils/nextSpeakerChecker.ts
+++ b/packages/core/src/utils/nextSpeakerChecker.ts
@@ -48,23 +48,9 @@ export async function checkNextSpeaker(
abortSignal: AbortSignal,
promptId: string,
): Promise {
- // We need to capture the curated history because there are many moments when the model will return invalid turns
- // that when passed back up to the endpoint will break subsequent calls. An example of this is when the model decides
- // to respond with an empty part collection if you were to send that message back to the server it will respond with
- // a 400 indicating that model part collections MUST have content.
- const curatedHistory = chat.getHistory(/* curated */ true);
-
- // Ensure there's a model response to analyze
- if (curatedHistory.length === 0) {
- // Cannot determine next speaker if history is empty.
- return null;
- }
-
// Read the last raw history entry by design: functionResponse turns can be
// stripped from curated history, but they are decisive for next-speaker flow.
const lastComprehensiveMessage = chat.getLastHistoryEntry();
- // Raw history can still be empty even if the curated-history guard above is
- // the normal empty-chat path, so keep this defensive check local.
if (!lastComprehensiveMessage) {
return null;
}
@@ -94,7 +80,10 @@ export async function checkNextSpeaker(
// Things checked out. Let's proceed to potentially making an LLM request.
- const lastMessage = curatedHistory[curatedHistory.length - 1];
+ // The next-speaker prompt only analyzes the immediately preceding response.
+ // Keep the side query and its structuredClone cost bounded to that one
+ // curated message rather than cloning and sending the entire chat history.
+ const [lastMessage] = chat.getHistoryTail(1, /* curated */ true);
if (!lastMessage || lastMessage.role !== 'model') {
// Cannot determine next speaker if the last turn wasn't from the model
// or if history is empty.
@@ -102,7 +91,7 @@ export async function checkNextSpeaker(
}
const contents: Content[] = [
- ...curatedHistory,
+ lastMessage,
{ role: 'user', parts: [{ text: CHECK_PROMPT }] },
];
From 38e789c1c85daa0a95a25367b45a22c893c4ab19 Mon Sep 17 00:00:00 2001
From: yiliang114 <1204183885@qq.com>
Date: Wed, 20 May 2026 21:21:51 +0800
Subject: [PATCH 03/11] feat(core): add runtimeDiagnostics utility for
heap/memory instrumentation
Required by content generators (anthropic, openai, logging) which import
runtimeDiagnostics for optional heap-pressure telemetry during streaming.
Gated by QWEN_CODE_PROFILE_RUNTIME=1 environment variable.
---
.../core/src/utils/runtimeDiagnostics.test.ts | 237 ++++++++
packages/core/src/utils/runtimeDiagnostics.ts | 557 ++++++++++++++++++
2 files changed, 794 insertions(+)
create mode 100644 packages/core/src/utils/runtimeDiagnostics.test.ts
create mode 100644 packages/core/src/utils/runtimeDiagnostics.ts
diff --git a/packages/core/src/utils/runtimeDiagnostics.test.ts b/packages/core/src/utils/runtimeDiagnostics.test.ts
new file mode 100644
index 0000000000..8fcd81d5b3
--- /dev/null
+++ b/packages/core/src/utils/runtimeDiagnostics.test.ts
@@ -0,0 +1,237 @@
+/**
+ * @license
+ * Copyright 2025 Qwen
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import { describe, expect, it } from 'vitest';
+import type { GenerateContentParameters } from '@google/genai';
+import {
+ RuntimeDiagnosticsCollector,
+ summarizeAnthropicWireRequest,
+ summarizeOpenAIWireRequest,
+} from './runtimeDiagnostics.js';
+
+describe('RuntimeDiagnosticsCollector', () => {
+ it('summarizes generate-content requests without retaining prompt text or tool args', () => {
+ const collector = new RuntimeDiagnosticsCollector({
+ enabled: true,
+ now: () => '2026-05-19T00:00:00.000Z',
+ });
+ const request = {
+ model: 'diagnostic-model',
+ contents: [
+ {
+ role: 'user',
+ parts: [{ text: 'secret user prompt' }],
+ },
+ {
+ role: 'user',
+ parts: [
+ {
+ functionResponse: {
+ id: 'tool-1',
+ name: 'read_file',
+ response: { output: 'secret tool output' },
+ },
+ },
+ ],
+ },
+ ],
+ config: {
+ systemInstruction: { parts: [{ text: 'secret system prompt' }] },
+ tools: [
+ {
+ functionDeclarations: [
+ {
+ name: 'read_file',
+ description: 'Read file',
+ parametersJsonSchema: {
+ type: 'object',
+ properties: { path: { type: 'string' } },
+ },
+ },
+ ],
+ },
+ ],
+ },
+ } satisfies GenerateContentParameters;
+
+ collector.recordGenerateContentRequest(request, {
+ stream: true,
+ source: 'generateContentStream',
+ });
+
+ const snapshot = collector.snapshot();
+ expect(snapshot.requests).toHaveLength(1);
+ expect(snapshot.requests[0]).toMatchObject({
+ index: 1,
+ source: 'generateContentStream',
+ model: 'diagnostic-model',
+ stream: true,
+ contents: {
+ count: 2,
+ roleCounts: { user: 2 },
+ partCount: 2,
+ textBytes: Buffer.byteLength('secret user prompt'),
+ functionResponseCount: 1,
+ functionResponseBytes: expect.any(Number),
+ },
+ systemInstructionBytes: Buffer.byteLength('secret system prompt'),
+ tools: {
+ count: 1,
+ functionDeclarationCount: 1,
+ schemaBytes: expect.any(Number),
+ },
+ });
+ expect(JSON.stringify(snapshot)).not.toContain('secret user prompt');
+ expect(JSON.stringify(snapshot)).not.toContain('secret tool output');
+ expect(JSON.stringify(snapshot)).not.toContain('secret system prompt');
+ });
+
+ it('summarizes OpenAI wire requests by size and role only', () => {
+ const summary = summarizeOpenAIWireRequest({
+ model: 'wire-model',
+ stream: true,
+ messages: [
+ { role: 'system', content: 'secret system' },
+ { role: 'user', content: [{ type: 'text', text: 'secret user' }] },
+ ],
+ tools: [
+ {
+ type: 'function',
+ function: {
+ name: 'run_shell_command',
+ description: 'Run shell command',
+ parameters: {
+ type: 'object',
+ properties: { command: { type: 'string' } },
+ },
+ },
+ },
+ ],
+ });
+
+ expect(summary).toMatchObject({
+ model: 'wire-model',
+ stream: true,
+ messageCount: 2,
+ messageBytesByRole: {
+ system: Buffer.byteLength('secret system'),
+ user: expect.any(Number),
+ },
+ toolsCount: 1,
+ toolSchemaBytes: expect.any(Number),
+ bodyBytes: expect.any(Number),
+ topLevelKeys: ['messages', 'model', 'stream', 'tools'],
+ });
+ expect(JSON.stringify(summary)).not.toContain('secret system');
+ expect(JSON.stringify(summary)).not.toContain('secret user');
+ });
+
+ it('summarizes Anthropic wire requests by size and role only', () => {
+ const summary = summarizeAnthropicWireRequest({
+ model: 'anthropic-wire-model',
+ stream: true,
+ system: [{ type: 'text', text: 'secret system' }],
+ messages: [
+ { role: 'user', content: 'secret user' },
+ {
+ role: 'assistant',
+ content: [
+ {
+ type: 'tool_use',
+ id: 'tool-1',
+ name: 'run_shell_command',
+ input: { command: 'secret command' },
+ },
+ ],
+ },
+ ],
+ tools: [
+ {
+ name: 'run_shell_command',
+ description: 'Run shell command',
+ input_schema: {
+ type: 'object',
+ properties: { command: { type: 'string' } },
+ },
+ },
+ ],
+ max_tokens: 1024,
+ });
+
+ expect(summary).toMatchObject({
+ model: 'anthropic-wire-model',
+ stream: true,
+ messageCount: 2,
+ messageBytesByRole: {
+ user: Buffer.byteLength('secret user'),
+ assistant: expect.any(Number),
+ },
+ systemBytes: expect.any(Number),
+ toolsCount: 1,
+ toolSchemaBytes: expect.any(Number),
+ bodyBytes: expect.any(Number),
+ topLevelKeys: [
+ 'max_tokens',
+ 'messages',
+ 'model',
+ 'stream',
+ 'system',
+ 'tools',
+ ],
+ });
+ expect(JSON.stringify(summary)).not.toContain('secret system');
+ expect(JSON.stringify(summary)).not.toContain('secret user');
+ expect(JSON.stringify(summary)).not.toContain('secret command');
+ });
+
+ it('aggregates tool use and tool result sizes without retaining payloads', () => {
+ const collector = new RuntimeDiagnosticsCollector({ enabled: true });
+
+ collector.recordToolUse('read_file', { path: '/private/path.txt' });
+ collector.recordToolResult({
+ name: 'read_file',
+ callId: 'tool-1',
+ resultBytes: 2048,
+ isError: false,
+ });
+ collector.recordToolResult({
+ name: 'run_shell_command',
+ callId: 'tool-2',
+ resultBytes: 512,
+ isError: true,
+ });
+
+ const snapshot = collector.snapshot();
+ expect(snapshot.tools).toMatchObject({
+ toolUseCount: 1,
+ toolResultCount: 2,
+ toolResultErrorCount: 1,
+ totalToolUseArgBytes: expect.any(Number),
+ maxToolUseArgBytes: expect.any(Number),
+ totalToolResultBytes: 2560,
+ maxToolResultBytes: 2048,
+ byName: {
+ read_file: {
+ uses: 1,
+ argBytes: expect.any(Number),
+ maxArgBytes: expect.any(Number),
+ results: 1,
+ errors: 0,
+ resultBytes: 2048,
+ maxResultBytes: 2048,
+ },
+ run_shell_command: {
+ uses: 0,
+ results: 1,
+ errors: 1,
+ resultBytes: 512,
+ maxResultBytes: 512,
+ },
+ },
+ });
+ expect(JSON.stringify(snapshot)).not.toContain('/private/path.txt');
+ });
+});
diff --git a/packages/core/src/utils/runtimeDiagnostics.ts b/packages/core/src/utils/runtimeDiagnostics.ts
new file mode 100644
index 0000000000..01a4ec4d6b
--- /dev/null
+++ b/packages/core/src/utils/runtimeDiagnostics.ts
@@ -0,0 +1,557 @@
+/**
+ * @license
+ * Copyright 2025 Qwen
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+import type { GenerateContentParameters } from '@google/genai';
+import type Anthropic from '@anthropic-ai/sdk';
+import type OpenAI from 'openai';
+
+export interface RuntimeDiagnosticsSnapshot { // shape returned by RuntimeDiagnosticsCollector.snapshot()
+ enabled: boolean; // whether the collector was recording when the snapshot was taken
+ startedAt: string; // timestamp captured at construction or at the last reset()
+ requests: GenerateContentRequestDiagnostics[]; // one summary per recorded generate-content request
+ openaiWireRequests: OpenAIWireRequestDiagnostics[]; // one summary per recorded OpenAI wire request
+ anthropicWireRequests: AnthropicWireRequestDiagnostics[]; // one summary per recorded Anthropic wire request
+ tools: RuntimeToolDiagnostics; // aggregated tool-use and tool-result counters
+}
+
+export interface GenerateContentRequestDiagnostics { // summary of one generate-content request
+ index: number; // 1-based sequence number assigned by the collector
+ timestamp: string; // value of the collector's clock when the request was recorded
+ source: 'generateContent' | 'generateContentStream';
+ model: string;
+ stream: boolean;
+ serializedBytes: number; // size of the JSON-safe form of the whole request (presumably UTF-8 bytes — confirm in utf8Bytes)
+ contents: RuntimeContentDiagnostics; // summary of request.contents
+ systemInstructionBytes: number; // text size of config.systemInstruction
+ generationConfigBytes: number; // size of the JSON-safe form of request.config
+ tools: RuntimeToolSchemaDiagnostics; // summary of config.tools
+}
+
+export interface RuntimeContentDiagnostics {
+  count: number; // number of content entries in the request
+  roleCounts: Record<string, number>; // content entries per role name
+  partCount: number;
+  textBytes: number;
+  functionCallCount: number;
+  functionCallArgBytes: number;
+  functionResponseCount: number;
+  functionResponseBytes: number;
+  inlineDataCount: number;
+  inlineDataBytes: number;
+  fileDataCount: number;
+}
+
+export interface RuntimeToolSchemaDiagnostics { // summary of the tools attached to a request's config
+ count: number; // presumably the number of tool entries — confirm in summarizeToolSchemas
+ functionDeclarationCount: number; // presumably function declarations across those tools — confirm
+ schemaBytes: number; // presumably the serialized size of the declarations — confirm
+}
+
+export interface OpenAIWireRequestDiagnostics {
+  index?: number; // added by the collector; absent from a bare summary
+  timestamp?: string; // added by the collector; absent from a bare summary
+  model: string; // 'unknown' when the request has no string model
+  stream: boolean; // true only when request.stream === true
+  bodyBytes: number; // size of the whole request body
+  messageCount: number;
+  messageBytesByRole: Record<string, number>; // content size summed per role
+  toolsCount: number;
+  toolSchemaBytes: number; // size of the tools array
+  topLevelKeys: string[]; // sorted top-level keys of the request
+}
+
+export interface AnthropicWireRequestDiagnostics {
+  index?: number; // added by the collector; absent from a bare summary
+  timestamp?: string; // added by the collector; absent from a bare summary
+  model: string; // 'unknown' when the request has no string model
+  stream: boolean; // true only when request.stream === true
+  bodyBytes: number; // size of the whole request body
+  messageCount: number;
+  messageBytesByRole: Record<string, number>; // content size summed per role
+  systemBytes: number; // size of the request's `system` field
+  toolsCount: number;
+  toolSchemaBytes: number; // size of the tools array
+  topLevelKeys: string[]; // sorted top-level keys of the request
+}
+
+export interface RuntimeToolDiagnostics {
+  toolUseCount: number;
+  toolResultCount: number;
+  toolResultErrorCount: number; // results recorded with isError set
+  totalToolUseArgBytes: number;
+  maxToolUseArgBytes: number; // largest single tool-use argument size
+  totalToolResultBytes: number;
+  maxToolResultBytes: number; // largest single tool-result size
+  byName: Record<string, RuntimeToolNameDiagnostics>; // per-tool breakdown
+}
+
+export interface RuntimeToolNameDiagnostics { // counters kept for a single tool name
+ uses: number; // recorded tool uses
+ argBytes: number; // summed argument size across uses
+ maxArgBytes: number; // largest argument size seen in one use
+ results: number; // recorded tool results
+ errors: number; // results recorded with isError set
+ resultBytes: number; // summed result size across results
+ maxResultBytes: number; // largest result size seen in one result
+}
+
+export interface RuntimeToolResultRecord { // input accepted by RuntimeDiagnosticsCollector.recordToolResult
+ name: string; // tool name used as the per-tool aggregation key
+ callId: string; // NOTE(review): not read by recordToolResult — confirm whether it is needed
+ resultBytes: number; // result size, computed by the caller
+ isError: boolean; // counts toward the error totals when true
+}
+
+export interface RuntimeDiagnosticsCollectorOptions { // constructor options for RuntimeDiagnosticsCollector
+ enabled?: boolean; // when omitted, falls back to isRuntimeDiagnosticsEnabled()
+ now?: () => string; // timestamp source; when omitted, new Date().toISOString() is used
+}
+
+const RUNTIME_PROFILE_ENV = 'QWEN_CODE_PROFILE_RUNTIME';
+
+export function isRuntimeDiagnosticsEnabled( // reports whether runtime profiling is switched on via the environment
+ env: NodeJS.ProcessEnv = process.env,
+): boolean {
+ return env[RUNTIME_PROFILE_ENV] === '1'; // only the exact string '1' enables it
+}
+
+export class RuntimeDiagnosticsCollector { // in-memory recorder of request and tool size metrics; every record* method is a no-op while disabled
+ private enabled: boolean;
+ private readonly now: () => string;
+ private startedAt: string;
+ private requestIndex = 0;
+ private openAIWireRequestIndex = 0;
+ private anthropicWireRequestIndex = 0;
+ private requests: GenerateContentRequestDiagnostics[] = []; // NOTE(review): grows by one entry per recorded request; only reset() clears it
+ private openaiWireRequests: OpenAIWireRequestDiagnostics[] = []; // same growth pattern as `requests`
+ private anthropicWireRequests: AnthropicWireRequestDiagnostics[] = []; // same growth pattern as `requests`
+ private tools: RuntimeToolDiagnostics = createInitialToolDiagnostics();
+ /** `enabled` defaults to isRuntimeDiagnosticsEnabled(); `now` defaults to `new Date().toISOString()`. */
+ constructor(options: RuntimeDiagnosticsCollectorOptions = {}) {
+ this.enabled = options.enabled ?? isRuntimeDiagnosticsEnabled();
+ this.now = options.now ?? (() => new Date().toISOString());
+ this.startedAt = this.now();
+ }
+ /** Discards everything recorded so far, restarts the indices and start time, and re-evaluates `enabled`. */
+ reset(options: { enabled?: boolean } = {}): void {
+ this.enabled = options.enabled ?? isRuntimeDiagnosticsEnabled();
+ this.startedAt = this.now();
+ this.requestIndex = 0;
+ this.openAIWireRequestIndex = 0;
+ this.anthropicWireRequestIndex = 0;
+ this.requests = [];
+ this.openaiWireRequests = [];
+ this.anthropicWireRequests = [];
+ this.tools = createInitialToolDiagnostics();
+ }
+ /** Returns whether the collector is currently recording. */
+ isEnabled(): boolean {
+ return this.enabled;
+ }
+ /** Appends a summary of one generate-content request; the index is 1-based. */
+ recordGenerateContentRequest(
+ request: GenerateContentParameters,
+ options: {
+ stream: boolean;
+ source: 'generateContent' | 'generateContentStream';
+ },
+ ): void {
+ if (!this.enabled) {
+ return;
+ }
+
+ this.requestIndex += 1;
+ this.requests.push({
+ index: this.requestIndex,
+ timestamp: this.now(),
+ source: options.source,
+ model: request.model,
+ stream: options.stream,
+ serializedBytes: utf8Bytes(toJsonSafeRequest(request)),
+ contents: summarizeContents(request.contents),
+ systemInstructionBytes: summarizeContentTextBytes(
+ request.config?.systemInstruction,
+ ),
+ generationConfigBytes: utf8Bytes(toJsonSafeConfig(request.config)),
+ tools: summarizeToolSchemas(request.config?.tools),
+ });
+ }
+ /** Appends the summarizeOpenAIWireRequest() result, tagged with a 1-based index and a timestamp. */
+ recordOpenAIWireRequest(
+ request: OpenAI.Chat.ChatCompletionCreateParams,
+ ): void {
+ if (!this.enabled) {
+ return;
+ }
+
+ this.openAIWireRequestIndex += 1;
+ this.openaiWireRequests.push({
+ index: this.openAIWireRequestIndex,
+ timestamp: this.now(),
+ ...summarizeOpenAIWireRequest(request),
+ });
+ }
+ /** Appends the summarizeAnthropicWireRequest() result, tagged with a 1-based index and a timestamp. */
+ recordAnthropicWireRequest(
+ request:
+ | Anthropic.MessageCreateParamsNonStreaming
+ | Anthropic.MessageCreateParamsStreaming,
+ ): void {
+ if (!this.enabled) {
+ return;
+ }
+
+ this.anthropicWireRequestIndex += 1;
+ this.anthropicWireRequests.push({
+ index: this.anthropicWireRequestIndex,
+ timestamp: this.now(),
+ ...summarizeAnthropicWireRequest(request),
+ });
+ }
+ /** Measures `args` with utf8Bytes() and folds that size into the per-tool and overall use counters. */
+ recordToolUse(name: string, args: unknown): void {
+ if (!this.enabled) {
+ return;
+ }
+
+ const argBytes = utf8Bytes(args);
+ const tool = this.getToolNameDiagnostics(name);
+ tool.uses += 1;
+ tool.argBytes += argBytes;
+ tool.maxArgBytes = Math.max(tool.maxArgBytes, argBytes);
+ this.tools.toolUseCount += 1;
+ this.tools.totalToolUseArgBytes += argBytes;
+ this.tools.maxToolUseArgBytes = Math.max(
+ this.tools.maxToolUseArgBytes,
+ argBytes,
+ );
+ }
+ /** Folds a caller-measured result size (and its error flag) into the per-tool and overall result counters. */
+ recordToolResult(record: RuntimeToolResultRecord): void {
+ if (!this.enabled) {
+ return;
+ }
+
+ const tool = this.getToolNameDiagnostics(record.name);
+ tool.results += 1;
+ tool.resultBytes += record.resultBytes;
+ tool.maxResultBytes = Math.max(tool.maxResultBytes, record.resultBytes);
+ if (record.isError) {
+ tool.errors += 1;
+ this.tools.toolResultErrorCount += 1;
+ }
+ this.tools.toolResultCount += 1;
+ this.tools.totalToolResultBytes += record.resultBytes;
+ this.tools.maxToolResultBytes = Math.max(
+ this.tools.maxToolResultBytes,
+ record.resultBytes,
+ );
+ }
+ /** Returns a copy of the recorded data; nested records and arrays are copied so the result does not alias internal state. */
+ snapshot(): RuntimeDiagnosticsSnapshot {
+ return {
+ enabled: this.enabled,
+ startedAt: this.startedAt,
+ requests: this.requests.map((request) => ({
+ ...request,
+ contents: {
+ ...request.contents,
+ roleCounts: { ...request.contents.roleCounts },
+ },
+ tools: { ...request.tools },
+ })),
+ openaiWireRequests: this.openaiWireRequests.map((request) => ({
+ ...request,
+ messageBytesByRole: { ...request.messageBytesByRole },
+ topLevelKeys: [...request.topLevelKeys],
+ })),
+ anthropicWireRequests: this.anthropicWireRequests.map((request) => ({
+ ...request,
+ messageBytesByRole: { ...request.messageBytesByRole },
+ topLevelKeys: [...request.topLevelKeys],
+ })),
+ tools: {
+ ...this.tools,
+ byName: Object.fromEntries(
+ Object.entries(this.tools.byName).map(([name, value]) => [
+ name,
+ { ...value },
+ ]),
+ ),
+ },
+ };
+ }
+ /** Returns the counters stored for `name`, creating and storing a zeroed entry on first use. */
+ private getToolNameDiagnostics(name: string): RuntimeToolNameDiagnostics {
+ const existing = this.tools.byName[name];
+ if (existing) {
+ return existing;
+ }
+ const created = createInitialToolNameDiagnostics();
+ this.tools.byName[name] = created;
+ return created;
+ }
+}
+
+export const runtimeDiagnostics = new RuntimeDiagnosticsCollector();
+
+/**
+ * Summarizes an OpenAI chat-completions request body by size and shape only:
+ * byte counts, element counts, per-role content bytes and sorted key names.
+ * A non-string model or role is reported as 'unknown'.
+ */
+export function summarizeOpenAIWireRequest(
+  request: OpenAI.Chat.ChatCompletionCreateParams,
+): OpenAIWireRequestDiagnostics {
+  const requestRecord = asRecord(request);
+  const model = requestRecord['model'];
+  const messages = Array.isArray(requestRecord['messages'])
+    ? requestRecord['messages']
+    : [];
+  const tools = Array.isArray(requestRecord['tools'])
+    ? requestRecord['tools']
+    : [];
+  const messageBytesByRole: Record<string, number> = {};
+  for (const message of messages) {
+    const messageRecord = asRecord(message);
+    const rawRole = messageRecord['role'];
+    const role = typeof rawRole === 'string' ? rawRole : 'unknown';
+    messageBytesByRole[role] =
+      (messageBytesByRole[role] ?? 0) + utf8Bytes(messageRecord['content']);
+  }
+  return {
+    model: typeof model === 'string' ? model : 'unknown',
+    stream: requestRecord['stream'] === true,
+    bodyBytes: utf8Bytes(request),
+    messageCount: messages.length,
+    messageBytesByRole,
+    toolsCount: tools.length,
+    toolSchemaBytes: utf8Bytes(tools),
+    topLevelKeys: Object.keys(requestRecord).sort(),
+  };
+}
+
+ /**
+  * Reduces an Anthropic messages request (streaming or non-streaming) to
+  * counts and byte sizes.
+  *
+  * Fields are read defensively through `asRecord`, so malformed values fall
+  * back to defaults: a non-string `model` becomes `'unknown'`, non-array
+  * `messages`/`tools` are treated as empty, and messages without a string
+  * `role` are grouped under `'unknown'`.
+  *
+  * @param request - The request parameters to measure.
+  * @returns Model name, stream flag, serialized body size, message count,
+  *   per-role `content` bytes, `system` bytes, tool count, tool schema bytes
+  *   and the sorted top-level keys of the request.
+  */
+ export function summarizeAnthropicWireRequest(
+   request:
+     | Anthropic.MessageCreateParamsNonStreaming
+     | Anthropic.MessageCreateParamsStreaming,
+ ): AnthropicWireRequestDiagnostics {
+   const requestRecord = asRecord(request);
+   const messages = Array.isArray(requestRecord['messages'])
+     ? requestRecord['messages']
+     : [];
+   const tools = Array.isArray(requestRecord['tools'])
+     ? requestRecord['tools']
+     : [];
+   // Role name -> accumulated UTF-8 bytes of that role's message `content`.
+   const messageBytesByRole: Record<string, number> = {};
+   for (const message of messages) {
+     const messageRecord = asRecord(message);
+     const role =
+       typeof messageRecord['role'] === 'string'
+         ? messageRecord['role']
+         : 'unknown';
+     messageBytesByRole[role] =
+       (messageBytesByRole[role] ?? 0) + utf8Bytes(messageRecord['content']);
+   }
+
+   return {
+     model:
+       typeof requestRecord['model'] === 'string'
+         ? requestRecord['model']
+         : 'unknown',
+     // Only a literal `true` counts as streaming.
+     stream: requestRecord['stream'] === true,
+     // Size of the whole request once JSON-serialized.
+     bodyBytes: utf8Bytes(request),
+     messageCount: messages.length,
+     messageBytesByRole,
+     // A missing `system` field measures as 0 bytes.
+     systemBytes: utf8Bytes(requestRecord['system']),
+     toolsCount: tools.length,
+     toolSchemaBytes: utf8Bytes(tools),
+     topLevelKeys: Object.keys(requestRecord).sort(),
+   };
+ }
+
+ /**
+  * Creates the zeroed aggregate tool diagnostics record.
+  *
+  * @returns All counters set to 0 and an empty `byName` map.
+  */
+ function createInitialToolDiagnostics(): RuntimeToolDiagnostics {
+   return {
+     toolUseCount: 0,
+     toolResultCount: 0,
+     toolResultErrorCount: 0,
+     totalToolUseArgBytes: 0,
+     maxToolUseArgBytes: 0,
+     totalToolResultBytes: 0,
+     maxToolResultBytes: 0,
+     // Prototype-less map: a lookup by tool name can never resolve to an
+     // inherited `Object.prototype` member such as `constructor`.
+     byName: Object.create(null) as Record<string, RuntimeToolNameDiagnostics>,
+   };
+ }
+
+ /**
+  * Creates the zeroed per-tool diagnostics entry.
+  *
+  * @returns A fresh object on every call, with every counter set to 0.
+  */
+ function createInitialToolNameDiagnostics(): RuntimeToolNameDiagnostics {
+   const zeroed: RuntimeToolNameDiagnostics = {
+     uses: 0,
+     argBytes: 0,
+     maxArgBytes: 0,
+     results: 0,
+     errors: 0,
+     resultBytes: 0,
+     maxResultBytes: 0,
+   };
+   return zeroed;
+ }
+
+ /**
+  * Tallies entries, roles, parts and byte sizes for a `contents` value.
+  *
+  * Accepts an array, a single value (treated as a one-element list) or
+  * `null`/`undefined` (treated as empty). A string entry is counted as one
+  * `user` entry with a single text part. Any other entry is read as a record:
+  * a non-string `role` is counted as `'unknown'` and a non-array `parts` is
+  * treated as empty.
+  *
+  * @param contents - Raw contents value to measure.
+  * @returns The accumulated totals.
+  */
+ function summarizeContents(contents: unknown): RuntimeContentDiagnostics {
+   const totals: RuntimeContentDiagnostics = {
+     count: 0,
+     roleCounts: {},
+     partCount: 0,
+     textBytes: 0,
+     functionCallCount: 0,
+     functionCallArgBytes: 0,
+     functionResponseCount: 0,
+     functionResponseBytes: 0,
+     inlineDataCount: 0,
+     inlineDataBytes: 0,
+     fileDataCount: 0,
+   };
+
+   // Normalize the input to a list of entries.
+   let entries: unknown[];
+   if (Array.isArray(contents)) {
+     entries = contents;
+   } else if (contents === undefined || contents === null) {
+     entries = [];
+   } else {
+     entries = [contents];
+   }
+
+   const countRole = (role: string): void => {
+     totals.roleCounts[role] = (totals.roleCounts[role] ?? 0) + 1;
+   };
+
+   for (const entry of entries) {
+     totals.count += 1;
+
+     if (typeof entry === 'string') {
+       countRole('user');
+       totals.partCount += 1;
+       totals.textBytes += utf8Bytes(entry);
+     } else {
+       const entryRecord = asRecord(entry);
+       const rawRole = entryRecord['role'];
+       countRole(typeof rawRole === 'string' ? rawRole : 'unknown');
+       const rawParts = entryRecord['parts'];
+       summarizeParts(Array.isArray(rawParts) ? rawParts : [], totals);
+     }
+   }
+
+   return totals;
+ }
+
+ /**
+  * Returns only the text byte total that `summarizeContents` computes for
+  * `content`.
+  *
+  * @param content - Raw contents value to measure.
+  * @returns UTF-8 bytes of the text found in `content`.
+  */
+ function summarizeContentTextBytes(content: unknown): number {
+   return summarizeContents(content).textBytes;
+ }
+
+ /**
+  * Adds the parts of one content entry to `summary`, which is mutated in
+  * place.
+  *
+  * Every element counts as one part. A string element contributes text bytes
+  * only. Any other element is checked for `text`, `functionCall`,
+  * `functionResponse`, `inlineData` and `fileData`; one part may contribute to
+  * several of those counters.
+  *
+  * @param parts - Raw part values.
+  * @param summary - Running totals to update.
+  */
+ function summarizeParts(
+   parts: unknown[],
+   summary: RuntimeContentDiagnostics,
+ ): void {
+   for (const rawPart of parts) {
+     summary.partCount += 1;
+
+     if (typeof rawPart === 'string') {
+       summary.textBytes += utf8Bytes(rawPart);
+       continue;
+     }
+
+     const fields = asRecord(rawPart);
+
+     const text = fields['text'];
+     if (typeof text === 'string') {
+       summary.textBytes += utf8Bytes(text);
+     }
+
+     const call = asOptionalRecord(fields['functionCall']);
+     if (call !== null) {
+       summary.functionCallCount += 1;
+       summary.functionCallArgBytes += utf8Bytes(call['args']);
+     }
+
+     const response = asOptionalRecord(fields['functionResponse']);
+     if (response !== null) {
+       summary.functionResponseCount += 1;
+       // Both the `response` payload and any `parts` carried by the function
+       // response count toward its size.
+       const payloadBytes = utf8Bytes(response['response']);
+       const nestedPartBytes = utf8Bytes(response['parts']);
+       summary.functionResponseBytes += payloadBytes + nestedPartBytes;
+     }
+
+     const inline = asOptionalRecord(fields['inlineData']);
+     if (inline !== null) {
+       summary.inlineDataCount += 1;
+       summary.inlineDataBytes += utf8Bytes(inline['data']);
+     }
+
+     // File references are only counted; no size is recorded for them.
+     if (fields['fileData']) {
+       summary.fileDataCount += 1;
+     }
+   }
+ }
+
+ /**
+  * Measures a tool list: number of tools, total number of
+  * `functionDeclarations` across them, and serialized size.
+  *
+  * @param tools - Raw tools value; anything other than an array is treated
+  *   as an empty list.
+  * @returns Tool count, function declaration count and schema bytes.
+  */
+ function summarizeToolSchemas(tools: unknown): RuntimeToolSchemaDiagnostics {
+   const toolList: unknown[] = Array.isArray(tools) ? tools : [];
+   const functionDeclarationCount = toolList.reduce<number>((total, tool) => {
+     const declarations = asRecord(tool)['functionDeclarations'];
+     return Array.isArray(declarations) ? total + declarations.length : total;
+   }, 0);
+   return {
+     count: toolList.length,
+     functionDeclarationCount,
+     schemaBytes: utf8Bytes(toolList),
+   };
+ }
+
+ /**
+  * Builds a reduced copy of a request containing only `model`, `contents`
+  * and the config as filtered by `toJsonSafeConfig`.
+  *
+  * @param request - The request to copy.
+  * @returns A new object; `contents` is passed through by reference.
+  */
+ function toJsonSafeRequest(request: GenerateContentParameters): unknown {
+   const { model, contents, config } = request;
+   return {
+     model,
+     contents,
+     config: toJsonSafeConfig(config),
+   };
+ }
+
+ /**
+  * Returns a shallow copy of `config` without its `abortSignal` entry.
+  *
+  * NOTE(review): presumably the signal is dropped because it is a live
+  * object rather than serializable request data — confirm.
+  *
+  * @param config - Request config; may be absent.
+  * @returns `undefined` when `config` is falsy, otherwise a new object with
+  *   every own enumerable entry except `abortSignal`.
+  */
+ function toJsonSafeConfig(
+   config: GenerateContentParameters['config'],
+ ): unknown {
+   if (!config) {
+     return undefined;
+   }
+   const safeConfig: Record<string, unknown> = Object.fromEntries(
+     Object.entries(asRecord(config)).filter(([key]) => key !== 'abortSignal'),
+   );
+   return safeConfig;
+ }
+
+ /**
+  * Measures a value in UTF-8 bytes.
+  *
+  * @param value - Any value.
+  * @returns 0 for `null`/`undefined`; the encoded length for a string;
+  *   otherwise the encoded length of the text produced by `safeStringify`.
+  */
+ function utf8Bytes(value: unknown): number {
+   if (value === null || value === undefined) {
+     return 0;
+   }
+   // Strings are measured as-is so they are not inflated by JSON quoting.
+   const text = typeof value === 'string' ? value : safeStringify(value);
+   return Buffer.byteLength(text, 'utf8');
+ }
+
+ /**
+  * JSON-serializes a value without ever throwing.
+  *
+  * @param value - Any value.
+  * @returns The JSON text; `''` when `JSON.stringify` yields `undefined`;
+  *   `'[unserializable]'` when `JSON.stringify` throws.
+  */
+ function safeStringify(value: unknown): string {
+   let serialized: string | undefined;
+   try {
+     serialized = JSON.stringify(value);
+   } catch {
+     return '[unserializable]';
+   }
+   return serialized ?? '';
+ }
+
+ /**
+  * Views a value as a string-keyed record so its properties can be read
+  * defensively.
+  *
+  * @param value - Any value.
+  * @returns `value` itself when it is a non-null object (arrays included);
+  *   otherwise a new empty object.
+  */
+ function asRecord(value: unknown): Record<string, unknown> {
+   return typeof value === 'object' && value !== null
+     ? (value as Record<string, unknown>)
+     : {};
+ }
+
+ /**
+  * Views a value as a string-keyed record, reporting absence explicitly.
+  *
+  * @param value - Any value.
+  * @returns `value` itself when it is a non-null object (arrays included);
+  *   otherwise `null`.
+  */
+ function asOptionalRecord(value: unknown): Record<string, unknown> | null {
+   return typeof value === 'object' && value !== null
+     ? (value as Record<string, unknown>)
+     : null;
+ }
From 25712fdacb17ae4f332031972b819ba41470bc31 Mon Sep 17 00:00:00 2001
From: yiliang114 <1204183885@qq.com>
Date: Wed, 20 May 2026 21:22:15 +0800
Subject: [PATCH 04/11] fix(cli): update doctorCommand test mocks for new
MemoryDiagnostics interface
Add missing maxRSSRaw, maxRSSUnit, and processTree fields to test fixtures
to match the updated MemoryResourceUsage and MemoryDiagnostics interfaces.
---
.../cli/src/ui/commands/doctorCommand.test.ts | 24 +++++++++++++++----
1 file changed, 20 insertions(+), 4 deletions(-)
diff --git a/packages/cli/src/ui/commands/doctorCommand.test.ts b/packages/cli/src/ui/commands/doctorCommand.test.ts
index f9afbd969c..315ebc8cbc 100644
--- a/packages/cli/src/ui/commands/doctorCommand.test.ts
+++ b/packages/cli/src/ui/commands/doctorCommand.test.ts
@@ -143,10 +143,13 @@ describe('doctorCommand', () => {
},
],
resourceUsage: {
- maxRSS: 4_000,
+ maxRSS: 4 * 1024,
+ maxRSSRaw: 4,
+ maxRSSUnit: 'KiB',
userCPUTime: 10,
systemCPUTime: 20,
},
+ processTree: null,
activeHandles: 2,
activeRequests: 0,
openFileDescriptors: null,
@@ -839,10 +842,13 @@ describe('doctorCommand', () => {
nativeContexts: 1,
},
resourceUsage: {
- maxRSS: 8_000,
+ maxRSS: 8 * 1024,
+ maxRSSRaw: 8,
+ maxRSSUnit: 'KiB',
userCPUTime: 10,
systemCPUTime: 20,
},
+ processTree: null,
activeHandles: 2,
activeRequests: 0,
v8HeapSpaces: null,
@@ -946,10 +952,13 @@ describe('doctorCommand', () => {
},
v8HeapSpaces: null,
resourceUsage: {
- maxRSS: 4_000,
+ maxRSS: 4 * 1024,
+ maxRSSRaw: 4,
+ maxRSSUnit: 'KiB',
userCPUTime: 10,
systemCPUTime: 20,
},
+ processTree: null,
activeHandles: 2,
activeRequests: 0,
openFileDescriptors: null,
@@ -992,7 +1001,14 @@ describe('doctorCommand', () => {
detachedContexts: 0,
nativeContexts: 1,
},
- resourceUsage: { maxRSS: 0, userCPUTime: 0, systemCPUTime: 0 },
+ resourceUsage: {
+ maxRSS: 0,
+ maxRSSRaw: 0,
+ maxRSSUnit: 'KiB',
+ userCPUTime: 0,
+ systemCPUTime: 0,
+ },
+ processTree: null,
activeHandles: 0,
activeRequests: 0,
v8HeapSpaces: null,
From 81cf0ef68d378c3fe0b2791c94c98c5a45248f3e Mon Sep 17 00:00:00 2001
From: yiliang114 <1204183885@qq.com>
Date: Wed, 20 May 2026 21:43:04 +0800
Subject: [PATCH 05/11] fix(vscode-ide-companion): use public core imports
---
packages/core/src/index.ts | 7 ++++-
.../vscode-ide-companion/src/diff-manager.ts | 2 +-
.../src/extension.test.ts | 16 +++++------
.../vscode-ide-companion/src/extension.ts | 2 +-
.../src/ide-server.test.ts | 21 ++++++++-------
.../vscode-ide-companion/src/ide-server.ts | 4 +--
.../src/open-files-manager.ts | 5 +---
.../open-files-manager/notebook-handler.ts | 2 +-
.../open-files-manager/text-handler.ts | 2 +-
.../src/services/open-files-manager/utils.ts | 2 +-
.../src/services/qwenSessionManager.ts | 2 +-
.../src/services/qwenSessionReader.ts | 3 +--
.../src/utils/acpModelInfo.ts | 2 +-
.../src/utils/editorGroupUtils.ts | 4 ++-
.../src/utils/imageSupport.test.ts | 2 +-
.../handlers/FileMessageHandler.test.ts | 27 ++++++++++---------
.../webview/handlers/FileMessageHandler.ts | 6 ++---
17 files changed, 55 insertions(+), 54 deletions(-)
diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts
index 95b516264e..73acc7e3ce 100644
--- a/packages/core/src/index.ts
+++ b/packages/core/src/index.ts
@@ -183,7 +183,11 @@ export * from './memory/writeContextFile.js';
export * from './ide/ide-client.js';
export * from './ide/ideContext.js';
export * from './ide/ide-installer.js';
-export { IDE_DEFINITIONS, type IdeInfo } from './ide/detect-ide.js';
+export {
+ detectIdeFromEnv,
+ IDE_DEFINITIONS,
+ type IdeInfo,
+} from './ide/detect-ide.js';
export * from './ide/constants.js';
export * from './ide/types.js';
@@ -285,6 +289,7 @@ export * from './utils/errorParsing.js';
export * from './utils/errors.js';
export * from './utils/fileUtils.js';
export * from './utils/filesearch/fileSearch.js';
+export * as crawlCache from './utils/filesearch/crawlCache.js';
export {
Ignore,
loadIgnoreRules,
diff --git a/packages/vscode-ide-companion/src/diff-manager.ts b/packages/vscode-ide-companion/src/diff-manager.ts
index 755143a4e4..ccabe3657e 100644
--- a/packages/vscode-ide-companion/src/diff-manager.ts
+++ b/packages/vscode-ide-companion/src/diff-manager.ts
@@ -7,7 +7,7 @@
import {
IdeDiffAcceptedNotificationSchema,
IdeDiffClosedNotificationSchema,
-} from '@qwen-code/qwen-code-core/src/ide/types.js';
+} from '@qwen-code/qwen-code-core';
import { type JSONRPCNotification } from '@modelcontextprotocol/sdk/types.js';
import * as path from 'node:path';
import * as vscode from 'vscode';
diff --git a/packages/vscode-ide-companion/src/extension.test.ts b/packages/vscode-ide-companion/src/extension.test.ts
index 72c3d476ef..d22062515d 100644
--- a/packages/vscode-ide-companion/src/extension.test.ts
+++ b/packages/vscode-ide-companion/src/extension.test.ts
@@ -7,18 +7,14 @@
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';
import * as vscode from 'vscode';
import { activate } from './extension.js';
-import {
- IDE_DEFINITIONS,
- detectIdeFromEnv,
-} from '@qwen-code/qwen-code-core/src/ide/detect-ide.js';
-
-vi.mock('@qwen-code/qwen-code-core/src/ide/detect-ide.js', async () => {
- const actual = await vi.importActual(
- '@qwen-code/qwen-code-core/src/ide/detect-ide.js',
- );
+import { IDE_DEFINITIONS, detectIdeFromEnv } from '@qwen-code/qwen-code-core';
+
+vi.mock('@qwen-code/qwen-code-core', async (importOriginal) => {
+ const actual =
+ await importOriginal();
return {
...actual,
- detectIdeFromEnv: vi.fn(() => IDE_DEFINITIONS.vscode),
+ detectIdeFromEnv: vi.fn(() => actual.IDE_DEFINITIONS.vscode),
};
});
diff --git a/packages/vscode-ide-companion/src/extension.ts b/packages/vscode-ide-companion/src/extension.ts
index 3f83a67942..56c441af61 100644
--- a/packages/vscode-ide-companion/src/extension.ts
+++ b/packages/vscode-ide-companion/src/extension.ts
@@ -13,7 +13,7 @@ import {
detectIdeFromEnv,
IDE_DEFINITIONS,
type IdeInfo,
-} from '@qwen-code/qwen-code-core/src/ide/detect-ide.js';
+} from '@qwen-code/qwen-code-core';
import { WebViewProvider } from './webview/providers/WebViewProvider.js';
import { ChatProviderRegistry } from './webview/providers/ChatProviderRegistry.js';
import { registerChatViewProviders } from './webview/providers/chatViewRegistration.js';
diff --git a/packages/vscode-ide-companion/src/ide-server.test.ts b/packages/vscode-ide-companion/src/ide-server.test.ts
index 9c51d50215..ee99ce105e 100644
--- a/packages/vscode-ide-companion/src/ide-server.test.ts
+++ b/packages/vscode-ide-companion/src/ide-server.test.ts
@@ -38,9 +38,17 @@ vi.mock('node:os', async (importOriginal) => {
};
});
-vi.mock('@qwen-code/qwen-code-core/src/ide/detect-ide.js', () => ({
- detectIdeFromEnv: vi.fn(() => ({ name: 'vscode', displayName: 'VS Code' })),
-}));
+vi.mock('@qwen-code/qwen-code-core', async (importOriginal) => {
+ const actual =
+ await importOriginal();
+ return {
+ ...actual,
+ detectIdeFromEnv: vi.fn(() => ({
+ name: 'vscode',
+ displayName: 'VS Code',
+ })),
+ };
+});
const vscodeMock = vi.hoisted(() => ({
workspace: {
@@ -62,13 +70,6 @@ const vscodeMock = vi.hoisted(() => ({
vi.mock('vscode', () => vscodeMock);
-vi.mock('@qwen-code/qwen-code-core/src/ide/detect-ide.js', () => ({
- detectIdeFromEnv: vi.fn(() => ({
- name: 'vscode',
- displayName: 'VS Code',
- })),
-}));
-
vi.mock('./open-files-manager', () => {
const OpenFilesManager = vi.fn();
OpenFilesManager.prototype.onDidChange = vi.fn(() => ({ dispose: vi.fn() }));
diff --git a/packages/vscode-ide-companion/src/ide-server.ts b/packages/vscode-ide-companion/src/ide-server.ts
index 1122677b76..2f19fbbc92 100644
--- a/packages/vscode-ide-companion/src/ide-server.ts
+++ b/packages/vscode-ide-companion/src/ide-server.ts
@@ -7,10 +7,10 @@
import * as vscode from 'vscode';
import {
CloseDiffRequestSchema,
+ detectIdeFromEnv,
IdeContextNotificationSchema,
OpenDiffRequestSchema,
-} from '@qwen-code/qwen-code-core/src/ide/types.js';
-import { detectIdeFromEnv } from '@qwen-code/qwen-code-core/src/ide/detect-ide.js';
+} from '@qwen-code/qwen-code-core';
import { isInitializeRequest } from '@modelcontextprotocol/sdk/types.js';
import { McpServer } from '@modelcontextprotocol/sdk/server/mcp.js';
import { StreamableHTTPServerTransport } from '@modelcontextprotocol/sdk/server/streamableHttp.js';
diff --git a/packages/vscode-ide-companion/src/open-files-manager.ts b/packages/vscode-ide-companion/src/open-files-manager.ts
index ee7f595e18..30c9029ac8 100644
--- a/packages/vscode-ide-companion/src/open-files-manager.ts
+++ b/packages/vscode-ide-companion/src/open-files-manager.ts
@@ -5,10 +5,7 @@
*/
import * as vscode from 'vscode';
-import type {
- File,
- IdeContext,
-} from '@qwen-code/qwen-code-core/src/ide/types.js';
+import type { File, IdeContext } from '@qwen-code/qwen-code-core';
import {
isFileUri,
isNotebookFileUri,
diff --git a/packages/vscode-ide-companion/src/services/open-files-manager/notebook-handler.ts b/packages/vscode-ide-companion/src/services/open-files-manager/notebook-handler.ts
index 40e6637446..64907fe315 100644
--- a/packages/vscode-ide-companion/src/services/open-files-manager/notebook-handler.ts
+++ b/packages/vscode-ide-companion/src/services/open-files-manager/notebook-handler.ts
@@ -5,7 +5,7 @@
*/
import * as vscode from 'vscode';
-import type { File } from '@qwen-code/qwen-code-core/src/ide/types.js';
+import type { File } from '@qwen-code/qwen-code-core';
import { MAX_FILES, MAX_SELECTED_TEXT_LENGTH } from './constants.js';
import {
deactivateCurrentActiveFile,
diff --git a/packages/vscode-ide-companion/src/services/open-files-manager/text-handler.ts b/packages/vscode-ide-companion/src/services/open-files-manager/text-handler.ts
index 88853f31bf..a1e7dda5b4 100644
--- a/packages/vscode-ide-companion/src/services/open-files-manager/text-handler.ts
+++ b/packages/vscode-ide-companion/src/services/open-files-manager/text-handler.ts
@@ -5,7 +5,7 @@
*/
import type * as vscode from 'vscode';
-import type { File } from '@qwen-code/qwen-code-core/src/ide/types.js';
+import type { File } from '@qwen-code/qwen-code-core';
import { MAX_FILES, MAX_SELECTED_TEXT_LENGTH } from './constants.js';
import {
deactivateCurrentActiveFile,
diff --git a/packages/vscode-ide-companion/src/services/open-files-manager/utils.ts b/packages/vscode-ide-companion/src/services/open-files-manager/utils.ts
index dd4b46126a..ea59ccdbd7 100644
--- a/packages/vscode-ide-companion/src/services/open-files-manager/utils.ts
+++ b/packages/vscode-ide-companion/src/services/open-files-manager/utils.ts
@@ -5,7 +5,7 @@
*/
import * as vscode from 'vscode';
-import type { File } from '@qwen-code/qwen-code-core/src/ide/types.js';
+import type { File } from '@qwen-code/qwen-code-core';
export function isFileUri(uri: vscode.Uri): boolean {
return uri.scheme === 'file';
diff --git a/packages/vscode-ide-companion/src/services/qwenSessionManager.ts b/packages/vscode-ide-companion/src/services/qwenSessionManager.ts
index a39a37ebed..34a2f1349a 100644
--- a/packages/vscode-ide-companion/src/services/qwenSessionManager.ts
+++ b/packages/vscode-ide-companion/src/services/qwenSessionManager.ts
@@ -7,7 +7,7 @@
import * as fs from 'fs';
import * as path from 'path';
import * as crypto from 'crypto';
-import { getProjectHash } from '@qwen-code/qwen-code-core/src/utils/paths.js';
+import { getProjectHash } from '@qwen-code/qwen-code-core';
import { getRuntimeBaseDir } from '../utils/paths.js';
import type { QwenSession } from './qwenSessionReader.js';
diff --git a/packages/vscode-ide-companion/src/services/qwenSessionReader.ts b/packages/vscode-ide-companion/src/services/qwenSessionReader.ts
index 1b15598f97..abfdb126e0 100644
--- a/packages/vscode-ide-companion/src/services/qwenSessionReader.ts
+++ b/packages/vscode-ide-companion/src/services/qwenSessionReader.ts
@@ -8,8 +8,7 @@ import * as fs from 'fs';
import * as path from 'path';
import * as readline from 'readline';
import * as crypto from 'crypto';
-import { getProjectHash } from '@qwen-code/qwen-code-core/src/utils/paths.js';
-import { getGitBranch } from '@qwen-code/qwen-code-core/src/utils/gitUtils.js';
+import { getGitBranch, getProjectHash } from '@qwen-code/qwen-code-core';
import { getRuntimeBaseDir } from '../utils/paths.js';
import { truncatePanelTitle } from '../webview/utils/panelTitleUtils.js';
diff --git a/packages/vscode-ide-companion/src/utils/acpModelInfo.ts b/packages/vscode-ide-companion/src/utils/acpModelInfo.ts
index 53d14c5bcf..120873f705 100644
--- a/packages/vscode-ide-companion/src/utils/acpModelInfo.ts
+++ b/packages/vscode-ide-companion/src/utils/acpModelInfo.ts
@@ -5,7 +5,7 @@
*/
import type { ModelInfo } from '@agentclientprotocol/sdk';
-import { knownTokenLimit } from '@qwen-code/qwen-code-core/src/core/tokenLimits.js';
+import { knownTokenLimit } from '@qwen-code/qwen-code-core';
import type { ApprovalModeValue } from '../types/approvalModeValueTypes.js';
type AcpMeta = Record;
diff --git a/packages/vscode-ide-companion/src/utils/editorGroupUtils.ts b/packages/vscode-ide-companion/src/utils/editorGroupUtils.ts
index 3326cd3368..53575d4884 100644
--- a/packages/vscode-ide-companion/src/utils/editorGroupUtils.ts
+++ b/packages/vscode-ide-companion/src/utils/editorGroupUtils.ts
@@ -29,7 +29,9 @@ function findNeighborGroup(
): vscode.ViewColumn | undefined {
let candidate: vscode.ViewColumn | undefined;
for (const g of vscode.window.tabGroups.all) {
- if (!isOnSide(g.viewColumn)) continue;
+ if (!isOnSide(g.viewColumn)) {
+ continue;
+ }
if (candidate === undefined || isCloser(candidate, g.viewColumn)) {
candidate = g.viewColumn;
}
diff --git a/packages/vscode-ide-companion/src/utils/imageSupport.test.ts b/packages/vscode-ide-companion/src/utils/imageSupport.test.ts
index b2b78d0ce5..b7948655d9 100644
--- a/packages/vscode-ide-companion/src/utils/imageSupport.test.ts
+++ b/packages/vscode-ide-companion/src/utils/imageSupport.test.ts
@@ -5,7 +5,7 @@
*/
import { describe, expect, it } from 'vitest';
-import { SUPPORTED_IMAGE_MIME_TYPES } from '@qwen-code/qwen-code-core/src/utils/request-tokenizer/supportedImageFormats.js';
+import { SUPPORTED_IMAGE_MIME_TYPES } from '@qwen-code/qwen-code-core';
import { SUPPORTED_PASTED_IMAGE_MIME_TYPES } from './imageSupport.js';
describe('imageSupport constants', () => {
diff --git a/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.test.ts b/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.test.ts
index faeaa8f19f..3d16f841a7 100644
--- a/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.test.ts
+++ b/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.test.ts
@@ -61,24 +61,25 @@ const vscodeMock = vi.hoisted(() => {
});
vi.mock('vscode', () => vscodeMock);
-vi.mock(
- '@qwen-code/qwen-code-core/src/services/fileDiscoveryService.js',
- () => ({
+vi.mock('@qwen-code/qwen-code-core', async (importOriginal) => {
+ const actual =
+ await importOriginal();
+ return {
+ ...actual,
FileDiscoveryService: class {
shouldIgnoreFile(filePath: string, options?: unknown) {
return shouldIgnoreFileMock(filePath, options);
}
},
- }),
-);
-vi.mock('@qwen-code/qwen-code-core/src/utils/filesearch/fileSearch.js', () => ({
- FileSearchFactory: {
- create: () => fileSearchMock,
- },
-}));
-vi.mock('@qwen-code/qwen-code-core/src/utils/filesearch/crawlCache.js', () => ({
- clear: vi.fn(),
-}));
+ FileSearchFactory: {
+ create: () => fileSearchMock,
+ },
+ crawlCache: {
+ ...actual.crawlCache,
+ clear: vi.fn(),
+ },
+ };
+});
const readonlyProviderMock = vi.hoisted(() => ({
createUri: vi.fn(),
diff --git a/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.ts b/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.ts
index 547cd6108a..eaf527a147 100644
--- a/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.ts
+++ b/packages/vscode-ide-companion/src/webview/handlers/FileMessageHandler.ts
@@ -13,12 +13,12 @@ import {
findRightGroupOfChatWebview,
} from '../../utils/editorGroupUtils.js';
import { ReadonlyFileSystemProvider } from '../../services/readonlyFileSystemProvider.js';
-import { FileDiscoveryService } from '@qwen-code/qwen-code-core/src/services/fileDiscoveryService.js';
import {
+ crawlCache,
+ FileDiscoveryService,
FileSearchFactory,
type FileSearch,
-} from '@qwen-code/qwen-code-core/src/utils/filesearch/fileSearch.js';
-import * as crawlCache from '@qwen-code/qwen-code-core/src/utils/filesearch/crawlCache.js';
+} from '@qwen-code/qwen-code-core';
import { getErrorMessage } from '../../utils/errorMessage.js';
/**
From b55c7efb549fd1b2c099aa9bc93eb21480aba6e4 Mon Sep 17 00:00:00 2001
From: yiliang114 <1204183885@qq.com>
Date: Wed, 20 May 2026 22:10:33 +0800
Subject: [PATCH 06/11] =?UTF-8?q?fix:=20address=20review=20comments=20?=
=?UTF-8?q?=E2=80=94=20type=20guards,=20dead=20fallbacks,=20and=20doc=20ac?=
=?UTF-8?q?curacy?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Code:
- Fix unsound type guard: `'text' in part` → `typeof part.text === 'string'`
in geminiChat.ts and client.ts (Copilot + wenshao feedback)
- Remove unnecessary optional chaining and dead fallback chains in client.ts
(getHistoryShallow, peekLastHistoryEntry, getHistoryLength, etc. now call
GeminiChat methods directly)
- Add 5s timeout to `execFileAsync('ps', ...)` in memoryDiagnostics.ts
Docs:
- Fix GiB conversion accuracy and add single-run caveat to summary
- Add Node.js version to test environment table
- Fix auto-compaction attempt count (5→4) in OOM report
- Soften root-cause attribution certainty
- Add MCP child process context to investigation plan
- Clarify "Codex" reference (→ OpenAI Codex)
- Fix truncated MCP server name (chrome → chrome-devtools)
- Remove duplicate verification commands in benchmark table
- Clarify thread exhaustion vs V8 heap OOM distinction
- Add workload confound caveat to before/after comparison
- Fix SUMMARY_RESERVE "hard relationship" vs thinking budget contradiction
---
.../auto-compaction-threshold-redesign.md | 2 +-
...2026-05-18-qwen-memory-benchmark-report.md | 12 +++--
.../2026-05-19-oom-reproduction-report.md | 24 +++++-----
...en-runtime-diagnostics-benchmark-report.md | 40 +++++++++--------
...05-18-qwen-runtime-memory-investigation.md | 13 ++++--
packages/core/src/core/client.ts | 44 +++----------------
packages/core/src/core/geminiChat.ts | 4 +-
packages/core/src/utils/memoryDiagnostics.ts | 1 +
8 files changed, 63 insertions(+), 77 deletions(-)
diff --git a/docs/design/auto-compaction-threshold-redesign.md b/docs/design/auto-compaction-threshold-redesign.md
index 81e9d74128..79bd6a8afc 100644
--- a/docs/design/auto-compaction-threshold-redesign.md
+++ b/docs/design/auto-compaction-threshold-redesign.md
@@ -75,7 +75,7 @@ const MAX_CONSECUTIVE_FAILURES = 3; // 失败熔断阈值
数值来源:全部沿用 claude-code 的实测值([autoCompact.ts:30,62-65](src/services/compact/autoCompact.ts:30))。
-`SUMMARY_RESERVE = COMPACT_MAX_OUTPUT_TOKENS` 是关键关系:模型受 `maxOutputTokens` 硬限制约束,输出不可能超出 20K,因此 reserve 不需要额外 safety margin。`thinking + summary` 是合并预算(Gemini SDK / 多数 provider 的 `maxOutputTokens` 语义),模型自行在两者间分配。
+`SUMMARY_RESERVE = COMPACT_MAX_OUTPUT_TOKENS` 是关键关系:模型受 `maxOutputTokens` 硬限制约束,输出不可能超出 20K,因此 reserve 不需要额外 safety margin。注意:本设计关闭 thinking 后该等式成立(output budget 全部给 summary);若保留 thinking,`thinking + summary` 共享预算(Gemini SDK / 多数 provider 的 `maxOutputTokens` 语义),模型自行在两者间分配,此时 summary 的实际可用空间小于 20K(见「风险与注意事项」第 1、2 条)。
## 计算函数
diff --git a/docs/e2e-tests/2026-05-18-qwen-memory-benchmark-report.md b/docs/e2e-tests/2026-05-18-qwen-memory-benchmark-report.md
index 27cf60c278..1a7aaf3253 100644
--- a/docs/e2e-tests/2026-05-18-qwen-memory-benchmark-report.md
+++ b/docs/e2e-tests/2026-05-18-qwen-memory-benchmark-report.md
@@ -8,13 +8,18 @@ This report records local memory benchmarks for Qwen Code runtime behavior. It
compares Qwen Code across models and compares Qwen Code with Claude Code on the
same task shapes where equivalent model endpoints were available.
-The headline result is consistent across the latest matrix:
+The headline result is consistent across the latest matrix (single run per cell,
+not statistically repeated):
-- Qwen Code process-tree RSS peak: about `0.85-1.06 GiB`.
-- Claude Code process-tree RSS peak: about `0.28-0.37 GiB`.
+- Qwen Code process-tree RSS peak: about `852-1062 MiB` (`0.83-1.04 GiB`).
+- Claude Code process-tree RSS peak: about `279-366 MiB` (`0.27-0.36 GiB`).
- Qwen Code was about `2.3x-3.6x` higher in the tested
non-interactive CLI task benchmarks.
+Note: process-tree RSS includes MCP child processes (~350 MiB overhead on the
+Qwen side). This inflates the absolute numbers but the relative comparison
+remains informative since both CLIs were measured the same way.
+
The difference reproduced in small PR review, code navigation, and synthetic
diff workloads. It is therefore unlikely to be explained only by one large PR
or by one model provider.
@@ -33,6 +38,7 @@ unknown, and what diagnostics should be added next.
| Qwen Code binary | PATH-resolved `qwen` binary |
| Claude Code version used in the latest matrix | `2.1.129` |
| Claude Code binary used in the latest matrix | PATH-resolved `claude` binary |
+| Node.js version | v22.x (default system install) |
| Sampling method | External `ps` RSS sampling once per second |
| Headline metric | Process-tree RSS peak |
diff --git a/docs/e2e-tests/2026-05-19-oom-reproduction-report.md b/docs/e2e-tests/2026-05-19-oom-reproduction-report.md
index 4e07233ae7..8716e208f5 100644
--- a/docs/e2e-tests/2026-05-19-oom-reproduction-report.md
+++ b/docs/e2e-tests/2026-05-19-oom-reproduction-report.md
@@ -244,13 +244,13 @@ sendMessage()
### 日志证据解读
-| 日志观察 | 含义 |
-| ---------------------------------------------------------- | --------------------------------------------------------- |
-| 5.5 分钟内触发 **5 次** heap-pressure auto-compaction 尝试 | #3735 引入的 `tryCompress` 在高压时频繁触发 |
-| 每次 compaction 执行后 heap 占比仍 >70% | `structuredClone()` 制造的临时峰值抵消了压缩收益 |
-| 74.9% → 70.7% → 86% → 85.3% → 88.8% → 90.2% → crash | 正反馈循环:压缩→clone 峰值→heap 更高→再压缩→更高 |
-| 日志在 90.2% 后 1 秒内断裂 | 下一次 `getHistory(true)` 的 `structuredClone()` 瞬间超限 |
-| `[FILE_READ_CACHE] clear after auto tryCompress` 出现 2 次 | 证实 compaction 确实走了完整的 compress → setHistory 路径 |
+| 日志观察 | 含义 |
+| ------------------------------------------------------------------------------------- | --------------------------------------------------------- |
+| 2.5 分钟内触发 **4 次** heap-pressure auto-compaction 尝试(另有 2 次 cooldown 拒绝) | #3735 引入的 `tryCompress` 在高压时频繁触发 |
+| 每次 compaction 执行后 heap 占比仍 >70% | `structuredClone()` 制造的临时峰值抵消了压缩收益 |
+| 74.9% → 70.7% → 86% → 85.3% → 88.8% → 90.2% → crash | 正反馈循环:压缩→clone 峰值→heap 更高→再压缩→更高 |
+| 日志在 90.2% 后 1 秒内断裂 | 下一次 `getHistory(true)` 的 `structuredClone()` 瞬间超限 |
+| `[FILE_READ_CACHE] clear after auto tryCompress` 出现 2 次 | 证实 compaction 确实走了完整的 compress → setHistory 路径 |
### 正反馈死循环机制
@@ -310,10 +310,12 @@ sendMessage()
### 结论
-**#3735 (v0.15.7)** 是 OOM 频率显著上升的根本原因——它使每次 `sendMessage` 都会先跑一次
-`tryCompress()`,而 `tryCompress` 内部通过 `ChatCompressionService.compress()` →
-`chat.getHistory(true)` 做全量 `structuredClone`。在 history 较大时,这个 “先 clone 再判断
-是否需要压缩” 的设计让内存峰值从 ~1.3x 升至 ~2x+。
+**#3735 (v0.15.7)** 是 OOM 频率显著上升的最可能触发因素(非唯一根因)——它使每次
+`sendMessage` 都会先跑一次 `tryCompress()`,而 `tryCompress` 内部通过
+`ChatCompressionService.compress()` → `chat.getHistory(true)` 做全量 `structuredClone`。
+在 history 较大时,这个 “先 clone 再判断是否需要压缩” 的设计让内存峰值从 ~1.3x 升至 ~2x+。
+注:issue history 显示 OOM 报告在 #3735 之前就已存在,但 #3735 大幅增加了 structuredClone
+的调用频率,从而显著提高了 OOM 的触发概率。
**#3879 (v0.15.10)** 进一步恶化了问题——在已经处于 heap 边界时 (provider 返回 context overflow)
再触发一次全量 clone,使原本就危险的 session 更容易 crash。
diff --git a/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md b/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md
index ac92ec1e43..b0848761e4 100644
--- a/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md
+++ b/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md
@@ -164,18 +164,18 @@ Configured MCP server names:
Single-pass isolation:
-| Variant | Enabled MCPs | Tools | MCP servers | Tree RSS peak | Root RSS peak | Interpretation |
-| ------------------------- | -------------------------------------------- | ----: | ----------: | ------------: | ------------: | ------------------------------------ |
-| none | none | 19 | 0 | 444.4 MiB | 211.7 MiB | baseline without MCP |
-| full | all 4 | 48 | 4 | 857.3 MiB | 215.9 MiB | full MCP startup shape |
-| only `approval-bridge` | `approval-bridge` | 19 | 1 | 455.5 MiB | 214.0 MiB | near baseline |
-| only `env-center` | `env-center` | 19 | 1 | 452.3 MiB | 214.4 MiB | near baseline |
-| only `chrome-devtools` | `chrome-devtools` | 48 | 1 | 824.4 MiB | 209.5 MiB | large RSS increase and tool increase |
-| only `code` | `code` | 19 | 1 | 452.1 MiB | 216.6 MiB | near baseline |
-| without `approval-bridge` | `env-center`, `chrome-devtools`, `code` | 48 | 3 | 997.1 MiB | 215.4 MiB | still high; run showed variance |
-| without `env-center` | `approval-bridge`, `chrome-devtools`, `code` | 48 | 3 | 863.8 MiB | 220.9 MiB | still high |
-| without `chrome-devtools` | `approval-bridge`, `env-center`, `code` | 19 | 3 | 463.4 MiB | 221.6 MiB | returns near baseline |
-| without `code` | `approval-bridge`, `env-center`, `chrome` | 48 | 3 | 858.1 MiB | 219.5 MiB | still high |
+| Variant | Enabled MCPs | Tools | MCP servers | Tree RSS peak | Root RSS peak | Interpretation |
+| ------------------------- | -------------------------------------------------- | ----: | ----------: | ------------: | ------------: | ------------------------------------ |
+| none | none | 19 | 0 | 444.4 MiB | 211.7 MiB | baseline without MCP |
+| full | all 4 | 48 | 4 | 857.3 MiB | 215.9 MiB | full MCP startup shape |
+| only `approval-bridge` | `approval-bridge` | 19 | 1 | 455.5 MiB | 214.0 MiB | near baseline |
+| only `env-center` | `env-center` | 19 | 1 | 452.3 MiB | 214.4 MiB | near baseline |
+| only `chrome-devtools` | `chrome-devtools` | 48 | 1 | 824.4 MiB | 209.5 MiB | large RSS increase and tool increase |
+| only `code` | `code` | 19 | 1 | 452.1 MiB | 216.6 MiB | near baseline |
+| without `approval-bridge` | `env-center`, `chrome-devtools`, `code` | 48 | 3 | 997.1 MiB | 215.4 MiB | still high; run showed variance |
+| without `env-center` | `approval-bridge`, `chrome-devtools`, `code` | 48 | 3 | 863.8 MiB | 220.9 MiB | still high |
+| without `chrome-devtools` | `approval-bridge`, `env-center`, `code` | 19 | 3 | 463.4 MiB | 221.6 MiB | returns near baseline |
+| without `code` | `approval-bridge`, `env-center`, `chrome-devtools` | 48 | 3 | 858.1 MiB | 219.5 MiB | still high |
Because startup RSS has some variance, the key variants were repeated twice:
@@ -522,9 +522,12 @@ The process exited with:
libc++abi: terminating due to uncaught exception of type std::__1::system_error: thread constructor failed: Resource temporarily unavailable
```
-This is not the same failure string as the V8 heap OOM logs. It is still
-important because it occurred in a disabled-MCP, no-build/test, interactive
-long-session review where the Qwen Node process itself crossed about 1 GiB RSS.
+This is a **thread exhaustion** error, not a V8 heap OOM. The failure mechanism
+is distinct: the OS refused to create a new thread (`EAGAIN`), most likely
+because a thread-count limit was reached (for example the per-user
+`RLIMIT_NPROC`) or because memory was unavailable for the new thread's stack
+no-build/test, interactive long-session review where the Qwen Node process
+itself crossed about 1 GiB RSS.
The failure happened during the final summary phase, after the controller had
already completed six review turns.
@@ -831,9 +834,6 @@ Verification commands:
| `npm run typecheck --workspace=packages/cli` | passed |
| `npm run bundle` | passed |
| `npm run build` | failed in `packages/vscode-ide-companion` lint on existing internal-module import rules; core, CLI, bundle, and targeted tests above passed |
-| `npm run bundle` | passed |
-| `npm run typecheck --workspace=packages/core` | passed |
-| `npm run typecheck --workspace=packages/cli` | passed |
The full root `npm run build` was not clean in this worktree because the
`vscode-ide-companion` package hit pre-existing `import/no-internal-modules`
@@ -842,7 +842,9 @@ completed successfully.
The same PR review prompt was then run with a temporary config where MCP and
hooks were disabled. Both rows were interrupted after a bounded long-run window
-instead of waiting for a full review to finish.
+instead of waiting for a full review to finish. **Caveat**: the two runs are
+confounded by workload size (79K vs 390K tokens) and cannot be compared as a
+controlled experiment. The comparison only shows directional evidence.
| Variant | Runtime | MCP servers | Tools | Assistant messages | Tool use/result blocks | Parent tool ids | Total tokens | Max input tokens | Root max RSS |
| ----------------- | ------: | ----------: | ----: | -----------------: | ---------------------: | --------------: | -----------: | ---------------: | -----------: |
diff --git a/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md b/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md
index ed044b18de..c6c7e3834a 100644
--- a/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md
+++ b/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md
@@ -122,7 +122,11 @@ The most likely buckets are:
4. **Subagent and saved-output amplification**: previous large PR tests showed
saved-output recovery and subagent activity, which can add memory and token
pressure.
-5. **Native memory versus JS heap split**: external RSS cannot tell whether the
+5. **MCP child processes**: the companion diagnostics report revealed that MCP
+ servers (e.g. chrome-devtools) contribute ~350 MiB to process-tree RSS. This
+ inflates the absolute numbers but is a constant overhead unrelated to session
+ length.
+6. **Native memory versus JS heap split**: external RSS cannot tell whether the
pressure is V8 heap, native buffers, loaded modules, or retained data.
This is deliberately phrased as an inference. The next step is to add enough
@@ -166,9 +170,10 @@ These are candidates, not conclusions:
5. **Subagent accounting**: expose subagent lifecycle and memory impact in
diagnostics.
-Claude Code and Codex should be used as design references for diagnostic
-separation, bounded output retention, and lazy history loading. The implementation
-should still follow Qwen Code's own architecture and tests.
+Claude Code and OpenAI Codex (OpenAI's CLI coding agent) should be used as
+design references for diagnostic separation, bounded output retention, and lazy
+history loading. The implementation should still follow Qwen Code's own
+architecture and tests.
## Validation Plan
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index 59d8ae9be0..d89a41a455 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -303,7 +303,7 @@ export class GeminiClient {
getHistoryShallow(curated: boolean = false): Content[] {
const chat = this.getChat();
- return chat.getHistoryShallow?.(curated) ?? chat.getHistory(curated);
+ return chat.getHistoryShallow(curated);
}
getHistoryTail(count: number, curated: boolean = false): Content[] {
@@ -314,51 +314,19 @@ export class GeminiClient {
count: number,
curated: boolean = false,
): Content[] {
- const chat = this.getChat();
- if (chat.getHistoryTailShallow) {
- return chat.getHistoryTailShallow(count, curated);
- }
- if (chat.getHistoryTail) {
- return chat.getHistoryTail(count, curated);
- }
- return chat.getHistory(curated).slice(-count);
+ return this.getChat().getHistoryTailShallow(count, curated);
}
private peekLastHistoryEntry(): Content | undefined {
- const chat = this.getChat();
- return (
- chat.peekLastHistoryEntry?.() ??
- chat.getHistoryTail?.(1)?.[0] ??
- chat.getHistory().at(-1)
- );
+ return this.getChat().peekLastHistoryEntry();
}
private getHistoryLength(): number {
- const chat = this.getChat();
- return (
- chat.getHistoryLength?.() ??
- chat.getHistoryShallow?.().length ??
- chat.getHistory().length
- );
+ return this.getChat().getHistoryLength();
}
private getLastModelMessageText(): string | undefined {
- const chat = this.getChat();
- if (chat.getLastModelMessageText) {
- return chat.getLastModelMessageText();
- }
- const history = chat.getHistoryShallow?.() ?? chat.getHistory();
- for (let i = history.length - 1; i >= 0; i--) {
- const message = history[i];
- if (message?.role !== 'model') continue;
- const text =
- message.parts
- ?.filter((part): part is { text: string } => 'text' in part)
- .map((part) => part.text)
- .join('') ?? '';
- return text || undefined;
- }
- return undefined;
+ return this.getChat().getLastModelMessageText();
}
/**
@@ -2055,7 +2023,7 @@ export class GeminiClient {
);
if (info.compressionStatus === CompressionStatus.COMPRESSED) {
const chat = this.getChat();
- const compressedHistory = chat.getHistoryShallow?.() ?? chat.getHistory();
+ const compressedHistory = chat.getHistoryShallow();
await this.startChat(compressedHistory, SessionStartSource.Compact);
if (
!this.lastSessionStartContext &&
diff --git a/packages/core/src/core/geminiChat.ts b/packages/core/src/core/geminiChat.ts
index d20e63065f..2655acd61c 100644
--- a/packages/core/src/core/geminiChat.ts
+++ b/packages/core/src/core/geminiChat.ts
@@ -1224,7 +1224,9 @@ export class GeminiChat {
if (message?.role !== 'model') continue;
const text =
message.parts
- ?.filter((part): part is { text: string } => 'text' in part)
+ ?.filter(
+ (part): part is { text: string } => typeof part.text === 'string',
+ )
.map((part) => part.text)
.join('') ?? '';
return text || undefined;
diff --git a/packages/core/src/utils/memoryDiagnostics.ts b/packages/core/src/utils/memoryDiagnostics.ts
index b142d4b361..e5f3718b61 100644
--- a/packages/core/src/utils/memoryDiagnostics.ts
+++ b/packages/core/src/utils/memoryDiagnostics.ts
@@ -267,6 +267,7 @@ async function collectProcessTreeMemoryUsage(
const { stdout } = await execFileAsync('ps', ['-axo', 'pid=,ppid=,rss='], {
maxBuffer: 1024 * 1024,
+ timeout: 5000,
});
const rows = parsePsRows(stdout);
const rootPid = process.pid;
From cf72a00c7384d408d462f8477808ba17f5c53d88 Mon Sep 17 00:00:00 2001
From: yiliang114 <1204183885@qq.com>
Date: Wed, 20 May 2026 23:07:11 +0800
Subject: [PATCH 07/11] fix(core): restore fallback chains in client.ts for
mock compatibility
The previous commit removed optional chaining from client.ts wrapper
methods, but client.test.ts mocks getChat() with partial objects that
lack the new shallow methods. Restore ?. fallback chains so both
production (GeminiChat) and test (mock) paths work correctly.
---
packages/core/src/core/client.ts | 36 ++++++++++++++++++++++++++------
1 file changed, 30 insertions(+), 6 deletions(-)
diff --git a/packages/core/src/core/client.ts b/packages/core/src/core/client.ts
index d89a41a455..efd0043fc1 100644
--- a/packages/core/src/core/client.ts
+++ b/packages/core/src/core/client.ts
@@ -303,7 +303,7 @@ export class GeminiClient {
getHistoryShallow(curated: boolean = false): Content[] {
const chat = this.getChat();
- return chat.getHistoryShallow(curated);
+ return chat.getHistoryShallow?.(curated) ?? chat.getHistory(curated);
}
getHistoryTail(count: number, curated: boolean = false): Content[] {
@@ -314,19 +314,43 @@ export class GeminiClient {
count: number,
curated: boolean = false,
): Content[] {
- return this.getChat().getHistoryTailShallow(count, curated);
+ const chat = this.getChat();
+ return (
+ chat.getHistoryTailShallow?.(count, curated) ??
+ chat.getHistoryTail?.(count, curated) ??
+ chat.getHistory(curated).slice(-count)
+ );
}
private peekLastHistoryEntry(): Content | undefined {
- return this.getChat().peekLastHistoryEntry();
+ const chat = this.getChat();
+ return chat.peekLastHistoryEntry?.() ?? chat.getHistory().at(-1);
}
private getHistoryLength(): number {
- return this.getChat().getHistoryLength();
+ const chat = this.getChat();
+ return chat.getHistoryLength?.() ?? chat.getHistory().length;
}
private getLastModelMessageText(): string | undefined {
- return this.getChat().getLastModelMessageText();
+ const chat = this.getChat();
+ if (chat.getLastModelMessageText) {
+ return chat.getLastModelMessageText();
+ }
+ const history = chat.getHistoryShallow?.() ?? chat.getHistory();
+ for (let i = history.length - 1; i >= 0; i--) {
+ const message = history[i];
+ if (message?.role !== 'model') continue;
+ const text =
+ message.parts
+ ?.filter(
+ (part): part is { text: string } => typeof part.text === 'string',
+ )
+ .map((part) => part.text)
+ .join('') ?? '';
+ return text || undefined;
+ }
+ return undefined;
}
/**
@@ -2023,7 +2047,7 @@ export class GeminiClient {
);
if (info.compressionStatus === CompressionStatus.COMPRESSED) {
const chat = this.getChat();
- const compressedHistory = chat.getHistoryShallow();
+ const compressedHistory = chat.getHistoryShallow?.() ?? chat.getHistory();
await this.startChat(compressedHistory, SessionStartSource.Compact);
if (
!this.lastSessionStartContext &&
From 1ee6fee4c6be56fcdb78577e97ae51fba74e6b92 Mon Sep 17 00:00:00 2001
From: yiliang114 <1204183885@qq.com>
Date: Wed, 20 May 2026 23:48:15 +0800
Subject: [PATCH 08/11] docs: clarify memory review follow-ups
---
docs/plans/2026-05-18-qwen-runtime-memory-investigation.md | 4 ++--
packages/core/src/utils/forkedAgent.ts | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md b/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md
index c6c7e3834a..393e3dc8dc 100644
--- a/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md
+++ b/docs/plans/2026-05-18-qwen-runtime-memory-investigation.md
@@ -6,8 +6,8 @@ Date: 2026-05-18
Local benchmarks show Qwen Code using substantially more process-tree RSS than
Claude Code for similar non-interactive CLI task shapes. The latest five-case
-matrix found Qwen Code peaking around `0.85-1.06 GiB` while Claude Code stayed
-around `0.28-0.37 GiB`.
+matrix found Qwen Code peaking around `0.83-1.04 GiB` while Claude Code stayed
+around `0.27-0.36 GiB`.
This document proposes a draft investigation and optimization direction. It is
not intended to claim a final root cause yet. The immediate goal is to make the
diff --git a/packages/core/src/utils/forkedAgent.ts b/packages/core/src/utils/forkedAgent.ts
index c9c56ef936..f8b13ebf43 100644
--- a/packages/core/src/utils/forkedAgent.ts
+++ b/packages/core/src/utils/forkedAgent.ts
@@ -66,7 +66,7 @@ import {
export interface CacheSafeParams {
/** Full generation config including systemInstruction and tools */
generationConfig: GenerateContentConfig;
- /** Curated conversation history (deep clone) */
+ /** Curated conversation history (shallow copy; consumers must not mutate) */
history: Content[];
/** Model identifier */
model: string;
From c161e0aa4c4829feb79f9d5d5984156572fd339e Mon Sep 17 00:00:00 2001
From: yiliang114 <1204183885@qq.com>
Date: Thu, 21 May 2026 00:21:36 +0800
Subject: [PATCH 09/11] docs: fix runtime benchmark unit conversion
---
.../2026-05-19-qwen-runtime-diagnostics-benchmark-report.md | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md b/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md
index b0848761e4..e482f0f94c 100644
--- a/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md
+++ b/docs/e2e-tests/2026-05-19-qwen-runtime-diagnostics-benchmark-report.md
@@ -257,7 +257,7 @@ Overlapping small PR `#4268` model snapshot:
show a persistent child-process memory owner. The dominant process is the
main Node process.
2. The local bundle run peaks around 0.36-0.41 GiB, not the earlier
- 0.85-1.06 GiB, because the matrix used a stripped temporary config. A
+ 0.83-1.04 GiB, because the matrix used a stripped temporary config. A
follow-up normal-config sanity check reproduced about 1.1 GiB tree RSS on
both PATH `qwen` and local `dist/cli.js`, with the extra memory coming from
child MCP/Node processes in the process tree.
From 517175843c6991ca3003d2729cc4bfa86c9ecda2 Mon Sep 17 00:00:00 2001
From: yiliang114 <1204183885@qq.com>
Date: Thu, 21 May 2026 00:43:13 +0800
Subject: [PATCH 10/11] docs: add default-heap OOM stress report
---
...-0.15.11-default-heap-oom-stress-report.md | 338 ++++++++++++++++++
1 file changed, 338 insertions(+)
create mode 100644 docs/e2e-tests/2026-05-21-qwen-0.15.11-default-heap-oom-stress-report.md
diff --git a/docs/e2e-tests/2026-05-21-qwen-0.15.11-default-heap-oom-stress-report.md b/docs/e2e-tests/2026-05-21-qwen-0.15.11-default-heap-oom-stress-report.md
new file mode 100644
index 0000000000..e9579dee1b
--- /dev/null
+++ b/docs/e2e-tests/2026-05-21-qwen-0.15.11-default-heap-oom-stress-report.md
@@ -0,0 +1,338 @@
+# Qwen Code 0.15.11 默认 Heap OOM 压测报告
+
+日期:2026-05-21
+
+## 测试范围
+
+本报告记录了针对 Qwen Code `0.15.11` 最新本地构建的一轮默认 heap 压测。
+这轮测试的目标是验证:在不人为降低内存上限的情况下,当前代码是否还能复现
+issue 中提到的长会话 OOM,以及在更极端的大输出场景下还有没有新的风险。
+
+本轮覆盖三个模型:
+
+- `pai/glm-5`
+- `qwen3.6-plus`
+- `DeepSeek/deepseek-v4-pro`
+
+测试分为两部分:
+
+1. 真实长任务、多 agent 并发 review 循环。
+2. amplified foreground stdout 压测,即用大规模前台 shell stdout 放大
+ tool-output 路径压力。
+
+## 测试环境
+
+| 项目 | 值 |
+| --------------------------- | --------------------------------------------- |
+| 分支 | `codex/memory-investigation-draft-pr` |
+| Commit | `c161e0aa4` |
+| CLI | 本地 `dist/cli.js` |
+| CLI 版本 | `0.15.11` |
+| Node 默认 heap limit | `4144 MiB` |
+| `NODE_OPTIONS` | 未设置 |
+| 显式 `--max-old-space-size` | 未设置 |
+| runner `ulimit` | runner 未设置 |
+| 配置模式 | 临时复制 `~/.qwen`,并隔离 `QWEN_RUNTIME_DIR` |
+| MCP / 正常配置 | 尽量按复制后的正常配置加载 |
+
+注意:这里的 CLI 版本显示为 `0.15.11`,是因为 package version 尚未 bump。
+实际测试对象是 commit `c161e0aa4` 下本地编译出的 `dist/cli.js`,不是 PATH
+里的全局 `qwen` 可执行文件。
+
+本轮没有修改全局 Qwen 配置。原始 runtime artifacts 在:
+
+- `.qwen/runtime-bench/2026-05-20T13-51-58-731Z-oom-stress`
+- `.qwen/runtime-bench/2026-05-20T15-20-37-790Z-oom-amplified`
+
+注意:本轮里 `env-center` MCP server 启动失败,但其他内置工具和部分
+MCP/child process 仍然加载。因此这些结果代表当前本地环境,不是完全 stripped
+的 `--bare` 环境。
+
+## 核心结论
+
+最新本地构建在 issue 最关心的“长会话 V8 heap OOM”路径上表现明显更好。
+基于这轮默认 heap、多模型、多 agent、长任务压测,可以认为本 PR 对此前遇到的
+long-session heap OOM 问题已经基本解决,至少在当前复现维度下已经不能再复现
+原始 heap OOM。
+
+真实长任务、多 agent 并发测试一共执行了:
+
+- 23 个 worker turn
+- 约 `719,094,118` reported total tokens
+- 77 次 agent tool call
+- 856 次总 tool call
+
+这部分没有复现任何传统 V8 heap OOM 特征:
+
+- `JavaScript heap out of memory`
+- `Reached heap limit`
+- `Ineffective mark-compacts near heap limit`
+- `Allocation failed`
+
+真实长任务阶段最高 process-tree RSS 为 `874.7 MiB`,最高 root-process RSS 为
+`219.1 MiB`。这说明在默认 heap 下,当前代码没有轻易复现原 issue 中那种长任务
+跑挂的 heap OOM。
+
+第二阶段 amplified stdout 压测更激进。它一共执行了 18 个 payload attempt,
+覆盖三个模型和 `128 MiB` 到 `2048 MiB` 的 foreground stdout payload。
+
+结果是:
+
+- 三个模型都成功跑过 `1536 MiB` payload。
+- 最高成功 process-tree RSS 是 `5964.7 MiB`,出现在 `qwen3.6-plus`
+ 的 `1536 MiB` payload。
+- 到 `2048 MiB` payload 时,出现了一个新的 extreme large-output failure。
+
+`2048 MiB` 的结果:
+
+- `pai/glm-5`:`exit=1`,stdout 为空,没有标准 OOM 文本。
+- `qwen3.6-plus`:`exit=1`,stdout 为空,没有标准 OOM 文本。
+- `DeepSeek/deepseek-v4-pro`:出现 V8 fatal:
+ `Check failed: i::kMaxInt >= len`,栈在
+ `v8::String::NewFromOneByte` / `node::StringBytes::Encode` /
+ `DecodeUTF8`。
+
+这个新问题不是原 issue 中的传统 long-session heap OOM。它更像是
+multi-GiB foreground stdout 被解码/构造成 JS string 时触发的 V8 字符串长度
+限制或大输出处理问题。建议作为 large-output follow-up 跟踪,而不是把它当作
+当前长会话 heap-pressure 修复失败。
+
+## Phase 1:真实长任务、多 Agent 并发压测
+
+### 测试形态
+
+每个模型 worker 都复用同一个 session,不断 `--resume`。每一轮要求 Qwen Code:
+
+- 进行只读代码审查和代码搜索;
+- 在同一轮中并发启动至少 4 个 `agent` tool call;
+- 重点检查 chat history、compaction、subagent runtime、non-interactive
+ streaming、provider adapters 等 memory 相关区域;
+- 保留足够详细的最终回答,让 session history 自然增长。
+
+runner 每秒采样 process-tree RSS,没有设置任何额外 heap cap。
+
+这部分在观察到内存比较稳定后用 `SIGTERM` 主动停止,以便切换到第二阶段的
+amplified stdout 压测。因此表里每个模型最后一轮的 `SIGTERM` 是主动停止;
+`Timed out` 为 `yes` 的行是单轮超时后被终止。两者都不是 OOM。
+
+### 汇总结果
+
+| Model | Worker turns | Total tokens | Agent calls | Tool calls | Peak tree RSS | Peak root RSS | Last exit | OOM |
+| -------------------------- | -----------: | --------------: | ----------: | ---------: | ------------: | ------------: | --------- | ------ |
+| `pai/glm-5` | 9 | 444,614,704 | 36 | 362 | 874.7 MiB | 217.4 MiB | `SIGTERM` | no |
+| `qwen3.6-plus` | 7 | 101,425,927 | 17 | 346 | 862.7 MiB | 219.1 MiB | `SIGTERM` | no |
+| `DeepSeek/deepseek-v4-pro` | 7 | 173,053,487 | 24 | 148 | 864.5 MiB | 213.8 MiB | `SIGTERM` | no |
+| **Total / max** | **23** | **719,094,118** | **77** | **856** | **874.7 MiB** | **219.1 MiB** | - | **no** |
+
+### 分轮结果
+
+| Model | Turn | Exit | Timed out | OOM | Peak tree RSS | Peak root RSS | Total tokens | Agent calls | Tool calls |
+| -------------------------- | ---: | --------- | --------- | --- | ------------: | ------------: | -----------: | ----------: | ---------: |
+| `DeepSeek/deepseek-v4-pro` | 1 | `0` | no | no | 709.1 MiB | 167.3 MiB | 5,565,147 | 4 | 37 |
+| `DeepSeek/deepseek-v4-pro` | 2 | `0` | no | no | 674.5 MiB | 118.8 MiB | 13,989,721 | 4 | 29 |
+| `DeepSeek/deepseek-v4-pro` | 3 | `0` | no | no | 734.1 MiB | 148.0 MiB | 22,621,542 | 4 | 24 |
+| `DeepSeek/deepseek-v4-pro` | 4 | `0` | no | no | 771.1 MiB | 107.5 MiB | 33,470,249 | 4 | 22 |
+| `DeepSeek/deepseek-v4-pro` | 5 | `0` | no | no | 864.5 MiB | 212.9 MiB | 43,540,313 | 4 | 19 |
+| `DeepSeek/deepseek-v4-pro` | 6 | `0` | no | no | 807.6 MiB | 167.9 MiB | 53,866,515 | 4 | 17 |
+| `DeepSeek/deepseek-v4-pro` | 7 | `SIGTERM` | no | no | 785.1 MiB | 213.8 MiB | n/a | n/a | n/a |
+| `pai/glm-5` | 1 | `SIGTERM` | yes | no | 742.8 MiB | 170.5 MiB | 17,071,519 | 4 | 142 |
+| `pai/glm-5` | 2 | `0` | no | no | 874.7 MiB | 217.4 MiB | 27,438,727 | 4 | 60 |
+| `pai/glm-5` | 3 | `0` | no | no | 699.7 MiB | 102.1 MiB | 35,627,222 | 4 | 38 |
+| `pai/glm-5` | 4 | `0` | no | no | 796.0 MiB | 194.0 MiB | 44,130,101 | 4 | 23 |
+| `pai/glm-5` | 5 | `0` | no | no | 743.4 MiB | 152.1 MiB | 50,465,979 | 4 | 26 |
+| `pai/glm-5` | 6 | `0` | no | no | 714.9 MiB | 125.2 MiB | 56,357,372 | 4 | 18 |
+| `pai/glm-5` | 7 | `0` | no | no | 694.5 MiB | 96.6 MiB | 64,047,037 | 4 | 20 |
+| `pai/glm-5` | 8 | `0` | no | no | 756.0 MiB | 136.8 MiB | 71,891,505 | 4 | 15 |
+| `pai/glm-5` | 9 | `SIGTERM` | no | no | 755.7 MiB | 157.3 MiB | 77,585,242 | 4 | 20 |
+| `qwen3.6-plus` | 1 | `0` | no | no | 735.1 MiB | 153.1 MiB | 3,890,508 | 4 | 83 |
+| `qwen3.6-plus` | 2 | `0` | no | no | 702.4 MiB | 142.5 MiB | 4,300,186 | 1 | 9 |
+| `qwen3.6-plus` | 3 | `0` | no | no | 862.7 MiB | 219.1 MiB | 8,635,953 | 4 | 88 |
+| `qwen3.6-plus` | 4 | `SIGTERM` | yes | no | 685.8 MiB | 106.5 MiB | n/a | n/a | n/a |
+| `qwen3.6-plus` | 5 | `0` | no | no | 610.5 MiB | 93.1 MiB | 40,191,337 | 4 | 87 |
+| `qwen3.6-plus` | 6 | `0` | no | no | 723.6 MiB | 121.9 MiB | 44,407,943 | 4 | 79 |
+| `qwen3.6-plus` | 7 | `SIGTERM` | no | no | 810.4 MiB | 116.0 MiB | n/a | n/a | n/a |
+
+### Phase 1 解读
+
+这是本轮里最能说明原始 long-session OOM 已明显改善的数据。
+
+这组测试比 5 月 18 日的小 PR review / code navigation 更重:它包含更多
+`--resume`、更多 subagent activity、更大的 reported token 量和更多 tool call。
+但 process-tree RSS 始终低于 `0.9 GiB`,也没有出现传统 V8 heap OOM。
+
+这不能证明所有用户 OOM 都不可能再发生,但至少说明当前构建在默认 heap 下,
+已经无法轻易复现 issue 中那类长会话 heap-pressure OOM。
+
+## Phase 2:Amplified Foreground Stdout 压测
+
+### 测试形态
+
+第二阶段故意放大 shell-output 路径压力。每个模型、每个 payload size 都要求
+parent session 和并发 agents 运行前台 shell 命令,输出大量 `x` 到 stdout:
+
+```bash
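+node -e "const chunk='x'.repeat(1024*1024); for (let i=0; i<PAYLOAD_MIB; i++) process.stdout.write(chunk)"
+```
+
+其中 `PAYLOAD_MIB` 为该次 payload 的大小(`128` 到 `2048`)。
+
+在 `2048 MiB` payload 下,`DeepSeek/deepseek-v4-pro` 的 stderr 捕获到 V8 fatal:
+
+```text
+Check failed: i::kMaxInt >= len.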
+...
+v8::String::NewFromOneByte
+node::StringBytes::Encode
+node::encoding_binding::BindingData::DecodeUTF8
+```
+
+触发条件:
+
+- Model:`DeepSeek/deepseek-v4-pro`
+- Payload:`2048 MiB`
+- Peak tree RSS:`4660.4 MiB`
+- Largest process RSS:`4527.6 MiB`
+- runner 记录 exit:`SIGTERM`。fatal 输出被捕获之后,剩余子进程仍在高 CPU
+ 空转,因此被手动终止。
+
+`pai/glm-5` 和 `qwen3.6-plus` 在 `2048 MiB` 也失败,表现为 stdout 为空、
+exit code `1`,但 stderr 没有捕获到 V8 fatal stack。
+
+### 严重程度
+
+这是一个真实的 robustness 问题,但触发条件是 multi-GiB foreground stdout,
+不是正常代码审查任务。它也不能证明当前 long-session heap-pressure 修复失败。
+
+### 是否是本 PR 引入?
+
+本轮没有证据表明 `2048 MiB` stdout failure 是当前 memory PR 引入的回归。
+
+原因:
+
+- 失败路径是 foreground shell stdout decode / string construction。
+- 原 issue 路径是 long-session history、compaction、clone pressure。
+- 本轮没有做同 payload 的 pre-PR baseline,因此不能归因成 regression。
+- 该 failure 只在刻意极端的 `2048 MiB` payload 出现;`128 MiB` 到
+ `1536 MiB` 都能完成。
+
+建议把它作为 dedicated large-output follow-up:更早 stream / spool / hard-cap
+foreground shell output,避免在内存里构造 multi-GiB JS string。除非当前 PR 的目标
+明确包含“任意 multi-GiB 前台 stdout 都必须可处理”,否则不建议把它作为当前 PR 的
+blocker。
+
+## 结论
+
+1. 最新本地 `0.15.11` 构建在 issue 报告的 long-session heap OOM 方向上明显更好。
+ 基于当前默认 heap 压测结果,可以认为本 PR 已经基本解决此前遇到的
+ long-session heap OOM 复现路径。
+
+2. 在默认 Node heap 下,真实长任务 + 多 agent review loop 没有在
+ `pai/glm-5`、`qwen3.6-plus`、`DeepSeek/deepseek-v4-pro` 三个模型上复现传统
+ V8 heap OOM。
+
+3. synthetic foreground stdout 压测仍能把 process-tree RSS 推得很高。当前构建在
+ 三模型上都撑过了 `1536 MiB` payload,最高成功 tree RSS 是 `5964.7 MiB`。
+
+4. 仍然存在一个独立的极端 large-output 问题:`2048 MiB` stdout 附近,Qwen Code
+ 可能在输出 JSON 结果前失败;DeepSeek case 捕获到了 V8 string-length fatal。
+
+5. 这个新发现重要,但更像是后续 large-output robustness 问题,不应直接作为
+ long-session heap-pressure mitigation 的 blocker。
+
+## 建议发到 PR 的评论摘要
+
+建议 PR 评论里只放精简摘要,完整数据放本文档:
+
+```markdown
+I reran default-heap stress tests on the latest local build with
+`pai/glm-5`, `qwen3.6-plus`, and `DeepSeek/deepseek-v4-pro`.
+
+No `NODE_OPTIONS`, `--max-old-space-size`, or runner `ulimit` was used. The
+local Node heap limit was about 4144 MiB.
+
+Results:
+
+- Realistic long-session + multi-agent review loop: 23 worker turns,
+ ~719M reported total tokens, 77 agent calls, 856 total tool calls.
+ No traditional V8 heap OOM was reproduced. Peak process-tree RSS was
+ 874.7 MiB; peak root RSS was 219.1 MiB.
+- Amplified stdout stress: 18 payload attempts across 128 MiB -> 2048 MiB.
+ All three models completed through 1536 MiB payloads without traditional
+ heap OOM. Highest successful process-tree RSS was 5964.7 MiB.
+- At 2048 MiB foreground stdout, an extreme large-output failure remains.
+ DeepSeek captured a V8 fatal `Check failed: i::kMaxInt >= len` stack in
+ `String::NewFromOneByte` / `StringBytes::Encode` / `DecodeUTF8`.
+
+Conclusion: this PR appears to have effectively addressed the previously
+observed long-session heap OOM reproduction path under default heap. The
+2048 MiB stdout failure is a separate large-output/string-limit robustness issue
+and should be tracked as a follow-up rather than treated as the same
+long-session heap OOM regression.
+```
From 7acf542a019ea2dab18620073dd4e48f2afe5bb5 Mon Sep 17 00:00:00 2001
From: yiliang114 <1204183885@qq.com>
Date: Thu, 21 May 2026 02:01:57 +0800
Subject: [PATCH 11/11] fix: update copyright year to 2026 in new files [skip
ci]
New files added in this PR had 2025 copyright headers. Updated to 2026
to reflect the current year.
---
packages/core/src/utils/runtimeDiagnostics.test.ts | 2 +-
packages/core/src/utils/runtimeDiagnostics.ts | 2 +-
2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/packages/core/src/utils/runtimeDiagnostics.test.ts b/packages/core/src/utils/runtimeDiagnostics.test.ts
index 8fcd81d5b3..cff3de3c33 100644
--- a/packages/core/src/utils/runtimeDiagnostics.test.ts
+++ b/packages/core/src/utils/runtimeDiagnostics.test.ts
@@ -1,6 +1,6 @@
/**
* @license
- * Copyright 2025 Qwen
+ * Copyright 2026 Qwen
* SPDX-License-Identifier: Apache-2.0
*/
diff --git a/packages/core/src/utils/runtimeDiagnostics.ts b/packages/core/src/utils/runtimeDiagnostics.ts
index 01a4ec4d6b..74f367bba9 100644
--- a/packages/core/src/utils/runtimeDiagnostics.ts
+++ b/packages/core/src/utils/runtimeDiagnostics.ts
@@ -1,6 +1,6 @@
/**
* @license
- * Copyright 2025 Qwen
+ * Copyright 2026 Qwen
* SPDX-License-Identifier: Apache-2.0
*/