Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/image-compression.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"@moonshot-ai/kimi-code-sdk": minor
"@moonshot-ai/kimi-code": minor
---

Automatically compress oversized images before they reach the model. Whatever the source — pasted into the CLI, uploaded from the web/desktop client, sent over ACP, read via `ReadMediaFile`, or returned by an MCP tool — images are downsampled (longest edge ≤ 2000px) and re-encoded to fit a per-image byte budget, cutting vision-token cost and avoiding provider image-size errors. Screenshots stay lossless PNG and only degrade to JPEG when the byte budget cannot otherwise be met. Compression runs as an input-stage step at each ingestion point (while the content part is built), and guards against decompression bombs by skipping absurdly large pixel/byte payloads before decoding. Best-effort: if it fails for any reason the original image is sent unchanged.
1 change: 1 addition & 0 deletions apps/kimi-code/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
"chalk": "^5.4.1",
"cli-highlight": "^2.1.11",
"commander": "^13.1.0",
"jimp": "^1.6.1",
"pathe": "^2.0.3",
"postject": "1.0.0-alpha.6",
"semver": "^7.7.4",
Expand Down
15 changes: 14 additions & 1 deletion apps/kimi-code/src/tui/controllers/editor-keyboard.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import type { Session } from '@moonshot-ai/kimi-code-sdk';
import { compressImageForModel } from '@moonshot-ai/kimi-code-sdk';

import { ClipboardMediaError, readClipboardMedia } from '#/utils/clipboard/clipboard-image';
import { parseImageMeta } from '#/utils/image/image-mime';
Expand Down Expand Up @@ -360,7 +361,19 @@ export class EditorKeyboardController {

const meta = parseImageMeta(media.bytes);
if (meta === null) return false;
const attachment = this.imageStore.addImage(media.bytes, meta.mime, meta.width, meta.height);
// Compress at ingestion — a pure data step while building the attachment, so
// the stored bytes, the inline thumbnail, the `[image #N (W×H)]` placeholder,
// and the submitted image all agree, and the agent core only ever sees an
// already-compressed image. Best effort: originals pass through on failure.
const compressed = await compressImageForModel(media.bytes, meta.mime);
const attachment = compressed.changed
? this.imageStore.addImage(
compressed.data,
compressed.mimeType,
compressed.width,
compressed.height,
)
: this.imageStore.addImage(media.bytes, meta.mime, meta.width, meta.height);
this.host.state.editor.insertTextAtCursor?.(`${attachment.placeholder} `);
this.host.state.ui.requestRender();
this.host.track('shortcut_paste', { kind: 'image' });
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/**
* Clipboard image paste → attachment store, with ingestion-time compression.
*
* Tests pin:
* - an oversized pasted image is downsampled while building the attachment,
* so the stored bytes, the `[image #N (W×H)]` placeholder, and the eventual
* submitted image all agree on the compressed size
* - a within-budget paste is stored byte-for-byte (fast path)
*/

import { Jimp } from 'jimp';
import { beforeEach, describe, expect, it, vi } from 'vitest';

import {
EditorKeyboardController,
type EditorKeyboardHost,
} from '#/tui/controllers/editor-keyboard';
import { ImageAttachmentStore } from '#/tui/utils/image-attachment-store';
import { parseImageMeta } from '#/utils/image/image-mime';

// Vitest hoists `vi.hoisted` and `vi.mock` calls to the top of the module, ahead
// of the static imports written above, so the mock is already registered when
// the editor-keyboard module (which pulls in readClipboardMedia) is loaded.
// `vi.hoisted` is what makes the shared `vi.fn()` available to the mock factory.
const { readClipboardMedia } = vi.hoisted(() => ({ readClipboardMedia: vi.fn() }));

// Partial mock: keep every real export of the clipboard module and override
// only `readClipboardMedia` with the hoisted mock function.
vi.mock('#/utils/clipboard/clipboard-image', async (importActual) => {
const actual = await importActual<typeof import('#/utils/clipboard/clipboard-image')>();
return { ...actual, readClipboardMedia };
});

/** Handle returned by `createPasteHarness` for driving a paste and inspecting the result. */
interface PasteHarness {
// The attachment store that was handed to the controller under test.
readonly store: ImageAttachmentStore;
// Runs the editor's installed `onPasteImage` handler and waits for it to settle.
pasteImage(): Promise<void>;
}

/**
 * Builds an `EditorKeyboardController` around a stubbed host and a real
 * `ImageAttachmentStore`, and calls `install()` on it.
 *
 * @returns The store plus a `pasteImage()` trigger that invokes the
 * `onPasteImage` handler found on the stub editor (throws if none is present).
 */
function createPasteHarness(): PasteHarness {
  const attachments = new ImageAttachmentStore();
  // Handler bag the harness later reads `onPasteImage` from; presumably
  // populated by the controller during `install()`.
  const editorStub: Record<string, ((...args: never[]) => unknown) | undefined> = {};

  const stubState = {
    editor: editorStub,
    activeDialog: null,
    appState: { streamingPhase: 'idle', isCompacting: false },
    footer: { setTransientHint: vi.fn() },
    ui: { requestRender: vi.fn() },
  };

  // Only the members listed here are stubbed; the double cast lets the partial
  // object stand in for the full host type.
  const stubHost = {
    state: stubState,
    session: undefined,
    btwPanelController: { closeOrCancel: vi.fn(() => false) },
    track: vi.fn(),
    showError: vi.fn(),
    openUndoSelector: vi.fn(),
    cancelRunningShellCommand: vi.fn(),
  } as unknown as EditorKeyboardHost;

  new EditorKeyboardController(stubHost, attachments).install();

  const pasteImage = async (): Promise<void> => {
    const onPaste = editorStub['onPasteImage'];
    if (onPaste === undefined) throw new Error('onPasteImage handler not installed');
    await (onPaste as () => Promise<boolean>)();
  };

  return { store: attachments, pasteImage };
}

/**
 * Encodes a solid-colour (`0x3366ccff`) image of the requested size as PNG.
 *
 * @param width - Image width in pixels.
 * @param height - Image height in pixels.
 * @returns The PNG file bytes.
 */
async function solidPng(width: number, height: number): Promise<Uint8Array> {
  const canvas = new Jimp({ width, height, color: 0x3366ccff });
  const encoded = await canvas.getBuffer('image/png');
  return new Uint8Array(encoded);
}

describe('clipboard image paste compression', () => {
  // Every case supplies its own clipboard payload, so drop the previous one.
  beforeEach(() => {
    readClipboardMedia.mockReset();
  });

  it('downsamples an oversized pasted image before storing it', async () => {
    const oversized = await solidPng(2600, 2600);
    readClipboardMedia.mockResolvedValue({ kind: 'image', bytes: oversized, mimeType: 'image/png' });

    const harness = createPasteHarness();
    await harness.pasteImage();

    expect(harness.store.size()).toBe(1);
    const stored = harness.store.get(1);
    expect(stored?.kind).toBe('image');
    if (stored?.kind !== 'image') throw new Error('expected image attachment');

    // The recorded dimensions and the placeholder describe the compressed image.
    const longestRecordedEdge = Math.max(stored.width, stored.height);
    expect(longestRecordedEdge).toBeLessThanOrEqual(2000);
    expect(stored.placeholder).toContain('2000×2000');

    // Re-parse the stored bytes: they must themselves be the downsampled image,
    // not the original with merely rewritten metadata.
    const reparsed = parseImageMeta(stored.bytes);
    expect(reparsed).not.toBeNull();
    expect(Math.max(reparsed!.width, reparsed!.height)).toBeLessThanOrEqual(2000);
  });

  it('stores a within-budget paste byte-for-byte', async () => {
    const tiny = await solidPng(80, 80);
    readClipboardMedia.mockResolvedValue({ kind: 'image', bytes: tiny, mimeType: 'image/png' });

    const harness = createPasteHarness();
    await harness.pasteImage();

    const stored = harness.store.get(1);
    if (stored?.kind !== 'image') throw new Error('expected image attachment');
    expect(stored.width).toBe(80);
    expect(stored.height).toBe(80);
    // Reference identity, not just equal content: the fast path must not re-encode.
    expect(stored.bytes).toBe(tiny);
  });
});
2 changes: 1 addition & 1 deletion flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@
inherit (finalAttrs) pname version src pnpmWorkspaces;
inherit pnpm;
fetcherVersion = 3;
hash = "sha256-oratz8x67ZEJGTiNy+s4XaKe0TtpRKh63aIqkV79vvM=";
hash = "sha256-mqyi0VuPZwESZcdU5E8F3XUG99OH636knBfb8y6TQpw=";
};

nativeBuildInputs = [
Expand Down
3 changes: 3 additions & 0 deletions packages/acp-adapter/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,8 @@
"@moonshot-ai/agent-core": "workspace:^",
"@moonshot-ai/kaos": "workspace:^",
"@moonshot-ai/kimi-code-sdk": "workspace:^"
},
"devDependencies": {
"jimp": "^1.6.1"
}
}
36 changes: 36 additions & 0 deletions packages/acp-adapter/src/convert.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import type { ContentBlock, ToolCallContent } from '@agentclientprotocol/sdk';
import {
log,
compressBase64ForModel,
type PromptPart,
type ToolInputDisplay,
type ToolResultEvent,
Expand Down Expand Up @@ -71,6 +72,41 @@ export function acpBlocksToPromptParts(
return out;
}

/**
 * Input-stage image compression for the ACP ingestion point: returns a new
 * list in which every inline (base64 data-URL) `image_url` part that the
 * compressor reports as changed is replaced by a part carrying the compressed
 * data URL.
 *
 * All other parts — non-image parts, images whose URL is not a base64 data
 * URL, and images the compressor leaves unchanged — are forwarded as the very
 * same objects, in their original order. Images are processed one after
 * another, not concurrently.
 *
 * @param parts - Prompt parts to scan; the input array is not mutated.
 * @returns A fresh array with the same length and ordering as `parts`.
 */
export async function compressPromptImageParts(
  parts: readonly PromptPart[],
): Promise<PromptPart[]> {
  const rewritten: PromptPart[] = [];

  for (const original of parts) {
    if (original.type !== 'image_url') {
      rewritten.push(original);
      continue;
    }

    // Remote (http/https) or otherwise non-inline image URLs carry no bytes to shrink.
    const inline = parseImageDataUrl(original.imageUrl.url);
    if (inline === null) {
      rewritten.push(original);
      continue;
    }

    const shrunk = await compressBase64ForModel(inline.base64, inline.mimeType);
    if (!shrunk.changed) {
      rewritten.push(original);
      continue;
    }

    // Preserve any sibling fields on `imageUrl`; only the URL is swapped. The
    // media type comes from the compressor because re-encoding may change it.
    const compressedUrl = `data:${shrunk.mimeType};base64,${shrunk.base64}`;
    rewritten.push({
      type: 'image_url',
      imageUrl: { ...original.imageUrl, url: compressedUrl },
    });
  }

  return rewritten;
}

/**
 * Splits a base64 `data:` URL into its media type and payload.
 *
 * Besides the plain `data:<type>;base64,<payload>` form, URLs that carry
 * media-type parameters ahead of the base64 marker are accepted
 * (e.g. `data:image/png;charset=utf-8;base64,...`); the parameters are not
 * included in the result. The `data:` scheme and the `base64` marker are
 * matched case-insensitively.
 *
 * The payload is returned verbatim — it may be empty or span several lines —
 * and is not validated as base64.
 *
 * @param url - Candidate URL, typically taken from an `image_url` prompt part.
 * @returns The media type and raw payload, or `null` when `url` is not a
 * base64 data URL (remote URLs, non-base64 data URLs, missing media type).
 */
function parseImageDataUrl(url: string): { mimeType: string; base64: string } | null {
  // Group 1: media type (no `;` or `,`). Then zero or more `;attr=value`
  // parameters, the `;base64,` marker, and group 2: everything after it.
  // `s` lets the payload contain line breaks; `i` covers `DATA:` / `;BASE64`.
  const match = /^data:([^;,]+)(?:;[^;,=]+=[^;,]*)*;base64,(.*)$/is.exec(url);
  if (match === null) return null;

  const [, mimeType, base64] = match;
  if (mimeType === undefined || base64 === undefined) return null;
  return { mimeType, base64 };
}

/**
* Minimum-viable XML-attribute escaping for prompt-embedded resource
* wrappers. The output is consumed by an LLM, not parsed by a canonical
Expand Down
4 changes: 2 additions & 2 deletions packages/acp-adapter/src/session.ts
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ import {
} from './builtin-commands';
import { buildSessionConfigOptions } from './config-options';
import { listModelsFromHarness } from './model-catalog';
import { acpBlocksToPromptParts } from './convert';
import { acpBlocksToPromptParts, compressPromptImageParts } from './convert';
import {
acpToolCallId,
assistantDeltaToSessionUpdate,
Expand Down Expand Up @@ -715,7 +715,7 @@ export class AcpSession {
* sees a JSON-RPC error rather than a hung request.
*/
async prompt(blocks: readonly ContentBlock[]): Promise<PromptResponse> {
const parts = acpBlocksToPromptParts(blocks);
const parts = await compressPromptImageParts(acpBlocksToPromptParts(blocks));
Comment thread
RealKai42 marked this conversation as resolved.
Outdated
const sessionId = this.id;
const conn = this.conn;

Expand Down
35 changes: 34 additions & 1 deletion packages/acp-adapter/test/convert.test.ts
Original file line number Diff line number Diff line change
@@ -1,10 +1,15 @@
import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';

import type { ContentBlock } from '@agentclientprotocol/sdk';
import { Jimp } from 'jimp';

import { log, type ToolInputDisplay } from '@moonshot-ai/kimi-code-sdk';

import { acpBlocksToPromptParts, displayBlockToAcpContent } from '../src/convert';
import {
acpBlocksToPromptParts,
compressPromptImageParts,
displayBlockToAcpContent,
} from '../src/convert';

/** Builds an ACP text content block carrying `text`. */
const textBlock = (text: string): ContentBlock => {
  return { type: 'text', text };
};
const imageBlock = (data: string, mimeType: string): ContentBlock => ({
Expand Down Expand Up @@ -320,3 +325,31 @@ describe('displayBlockToAcpContent — plan_review branch (Phase 13.2)', () => {
expect(displayBlockToAcpContent(cmd)).toBeNull();
});
});

describe('compressPromptImageParts', () => {
  /** Base64 payload of a solid-colour PNG with the given dimensions. */
  async function pngBase64(width: number, height: number): Promise<string> {
    const canvas = new Jimp({ width, height, color: 0x3366ccff });
    const encoded = await canvas.getBuffer('image/png');
    return Buffer.from(encoded).toString('base64');
  }

  it('downsamples an oversized inline image part', async () => {
    const oversized = imageBlock(await pngBase64(2600, 2600), 'image/png');
    const parts = acpBlocksToPromptParts([oversized]);
    const [first] = await compressPromptImageParts(parts);

    if (first?.type !== 'image_url') throw new Error('expected an image_url part');
    // The result must still be an inline image data URL; decode it to check
    // the real pixel dimensions rather than trusting any metadata.
    const dataUrl = /^data:(image\/[a-z]+);base64,(.+)$/.exec(first.imageUrl.url);
    expect(dataUrl).not.toBeNull();
    const pixels = await Jimp.fromBuffer(Buffer.from(dataUrl![2]!, 'base64'));
    expect(Math.max(pixels.width, pixels.height)).toBeLessThanOrEqual(2000);
  });

  it('passes a within-budget image and text through unchanged', async () => {
    const small = imageBlock(await pngBase64(32, 32), 'image/png');
    const parts = acpBlocksToPromptParts([small, textBlock('hi')]);

    const result = await compressPromptImageParts(parts);
    expect(result).toEqual(parts);
  });
});
1 change: 1 addition & 0 deletions packages/agent-core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@
"ajv-formats": "^3.0.1",
"chokidar": "^4.0.3",
"ignore": "^5.3.2",
"jimp": "^1.6.1",
"js-yaml": "^4.1.1",
"linkedom": "^0.18.12",
"node-pty": "^1.1.0",
Expand Down
17 changes: 17 additions & 0 deletions packages/agent-core/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,23 @@ export type {
QuestionBackgroundTaskInfo,
} from './agent/background';
export type { ToolServices } from './tools/support/services';

// Image compression — the input-stage helper each ingestion site (CLI paste,
// server upload resolution, ACP, ReadMediaFile, MCP) calls to shrink oversized
// images while constructing the content part. Re-exported from the package root
// so consumers (node-sdk, server) import it without a deep subpath.
export {
compressImageForModel,
compressBase64ForModel,
compressImageContentParts,
IMAGE_BYTE_BUDGET,
MAX_IMAGE_EDGE_PX,
} from './tools/support/image-compress';
export type {
CompressImageOptions,
CompressImageResult,
CompressBase64Result,
} from './tools/support/image-compress';
export { SingleModelProvider } from './session/provider-manager';
export type {
BearerTokenProvider,
Expand Down
11 changes: 8 additions & 3 deletions packages/agent-core/src/mcp/output.ts
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@

import type { ContentPart } from '@moonshot-ai/kosong';

import { compressImageContentParts } from '../tools/support/image-compress';
import type { MCPContentBlock, MCPToolResult } from './types';

// MCP servers can produce arbitrarily large outputs; cap what we feed back to
Expand Down Expand Up @@ -130,10 +131,10 @@ export function convertMCPContentBlock(block: MCPContentBlock): ContentPart | nu
* `mcp__github__create_pr`) — embedded into the `<mcp_tool_result name="…">`
* wrap when the result is media-only, so the model can attribute binary parts.
*/
export function mcpResultToExecutableOutput(
export async function mcpResultToExecutableOutput(
result: MCPToolResult,
qualifiedToolName: string,
): { output: string | ContentPart[]; isError: boolean; truncated?: true } {
): Promise<{ output: string | ContentPart[]; isError: boolean; truncated?: true }> {
const converted: ContentPart[] = [];
for (const block of result.content) {
const part = convertMCPContentBlock(block);
Expand All @@ -143,7 +144,11 @@ export function mcpResultToExecutableOutput(
}

const wrapped = wrapMediaOnly(converted, qualifiedToolName);
const limited = applyOutputLimits(wrapped);
// Shrink oversized images BEFORE the per-part byte cap, so a large but
// compressible screenshot is downsampled and kept rather than dropped to a
// text notice. Best effort: parts that cannot be compressed pass through.
const compressed = await compressImageContentParts(wrapped);
Comment thread
RealKai42 marked this conversation as resolved.
const limited = applyOutputLimits(compressed);
const output = collapseSingleText(limited.parts);
return limited.truncated
? { output, isError: result.isError, truncated: true }
Expand Down
Loading
Loading