Skip to content
Merged
Show file tree
Hide file tree
Changes from 8 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .changeset/image-compression.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
"@moonshot-ai/kimi-code-sdk": minor
"@moonshot-ai/kimi-code": minor
---

Automatically compress oversized images before they reach the model. Whatever the source — pasted into the CLI, uploaded from the web/desktop client, sent over ACP, read via `ReadMediaFile`, or returned by an MCP tool — images are downsampled (longest edge ≤ 2000px) and re-encoded to fit a per-image byte budget, cutting vision-token cost and avoiding provider image-size errors. Screenshots stay lossless PNG and only degrade to JPEG when the byte budget cannot otherwise be met. Compression runs as an input-stage step at each ingestion point (while the content part is built), and guards against decompression bombs by skipping absurdly large pixel/byte payloads before decoding. Best-effort: if it fails for any reason the original image is sent unchanged.
1 change: 1 addition & 0 deletions apps/kimi-code/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@
"chalk": "^5.4.1",
"cli-highlight": "^2.1.11",
"commander": "^13.1.0",
"jimp": "^1.6.1",
"pathe": "^2.0.3",
"postject": "1.0.0-alpha.6",
"semver": "^7.7.4",
Expand Down
15 changes: 14 additions & 1 deletion apps/kimi-code/src/tui/controllers/editor-keyboard.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import type { Session } from '@moonshot-ai/kimi-code-sdk';
import { compressImageForModel } from '@moonshot-ai/kimi-code-sdk';

import { ClipboardMediaError, readClipboardMedia } from '#/utils/clipboard/clipboard-image';
import { parseImageMeta } from '#/utils/image/image-mime';
Expand Down Expand Up @@ -360,7 +361,19 @@ export class EditorKeyboardController {

const meta = parseImageMeta(media.bytes);
if (meta === null) return false;
const attachment = this.imageStore.addImage(media.bytes, meta.mime, meta.width, meta.height);
// Compress at ingestion — a pure data step while building the attachment, so
// the stored bytes, the inline thumbnail, the `[image #N (W×H)]` placeholder,
// and the submitted image all agree, and the agent core only ever sees an
// already-compressed image. Best effort: originals pass through on failure.
const compressed = await compressImageForModel(media.bytes, meta.mime);
const attachment = compressed.changed
? this.imageStore.addImage(
compressed.data,
compressed.mimeType,
compressed.width,
compressed.height,
)
: this.imageStore.addImage(media.bytes, meta.mime, meta.width, meta.height);
this.host.state.editor.insertTextAtCursor?.(`${attachment.placeholder} `);
this.host.state.ui.requestRender();
this.host.track('shortcut_paste', { kind: 'image' });
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
/**
* Clipboard image paste → attachment store, with ingestion-time compression.
*
* Tests pin:
* - an oversized pasted image is downsampled while building the attachment,
* so the stored bytes, the `[image #N (W×H)]` placeholder, and the eventual
* submitted image all agree on the compressed size
* - a within-budget paste is stored byte-for-byte (fast path)
*/

import { Jimp } from 'jimp';
import { beforeEach, describe, expect, it, vi } from 'vitest';

import {
EditorKeyboardController,
type EditorKeyboardHost,
} from '#/tui/controllers/editor-keyboard';
import { ImageAttachmentStore } from '#/tui/utils/image-attachment-store';
import { parseImageMeta } from '#/utils/image/image-mime';

// vitest hoists vi.mock/vi.hoisted above the import statements at the top of
// this file, so the mock is already in place when the editor-keyboard module
// pulls in readClipboardMedia. The mock function is created inside vi.hoisted
// so the vi.mock factory below can reference it.
const { readClipboardMedia } = vi.hoisted(() => ({ readClipboardMedia: vi.fn() }));

// Keep every real export of the clipboard module and swap out only
// readClipboardMedia for the hoisted mock.
vi.mock('#/utils/clipboard/clipboard-image', async (importActual) => {
const actual = await importActual<typeof import('#/utils/clipboard/clipboard-image')>();
return { ...actual, readClipboardMedia };
});

/**
 * Handle returned by `createPasteHarness`: the attachment store the controller
 * writes into, plus a trigger that invokes the installed image-paste handler.
 */
interface PasteHarness {
/** Store passed to the controller under test; inspect it after a paste. */
readonly store: ImageAttachmentStore;
/** Invokes the editor's `onPasteImage` handler and waits for it to settle. */
pasteImage(): Promise<void>;
}

/**
 * Builds an `EditorKeyboardController` wired to a stub host and a fresh
 * attachment store, installs it, and returns a handle for driving a paste.
 *
 * The editor is a plain record: the controller's `install()` is expected to
 * register `onPasteImage` on it, which `pasteImage()` then looks up and calls.
 */
function createPasteHarness(): PasteHarness {
  const editorHandlers: Record<string, ((...args: never[]) => unknown) | undefined> = {};
  const attachmentStore = new ImageAttachmentStore();

  // Only the members the controller touches are stubbed; the cast hides the rest.
  const stubHost = {
    state: {
      editor: editorHandlers,
      activeDialog: null,
      appState: { streamingPhase: 'idle', isCompacting: false },
      footer: { setTransientHint: vi.fn() },
      ui: { requestRender: vi.fn() },
    },
    session: undefined,
    btwPanelController: { closeOrCancel: vi.fn(() => false) },
    track: vi.fn(),
    showError: vi.fn(),
    openUndoSelector: vi.fn(),
    cancelRunningShellCommand: vi.fn(),
  } as unknown as EditorKeyboardHost;

  new EditorKeyboardController(stubHost, attachmentStore).install();

  // Look the handler up lazily so a missing registration fails with a clear message.
  const pasteImage = async (): Promise<void> => {
    const onPaste = editorHandlers['onPasteImage'];
    if (onPaste === undefined) throw new Error('onPasteImage handler not installed');
    await (onPaste as () => Promise<boolean>)();
  };

  return { store: attachmentStore, pasteImage };
}

/**
 * Encodes a single-colour PNG of the requested dimensions.
 *
 * @param width  Image width in pixels.
 * @param height Image height in pixels.
 * @returns The PNG file bytes as a `Uint8Array`.
 */
async function solidPng(width: number, height: number): Promise<Uint8Array> {
  const canvas = new Jimp({ width, height, color: 0x3366ccff });
  const encoded = await canvas.getBuffer('image/png');
  return new Uint8Array(encoded);
}

describe('clipboard image paste compression', () => {
  /** Queues `bytes` as the clipboard content, then runs one paste on a fresh harness. */
  async function pastePng(bytes: Uint8Array): Promise<ImageAttachmentStore> {
    readClipboardMedia.mockResolvedValue({ kind: 'image', bytes, mimeType: 'image/png' });
    const harness = createPasteHarness();
    await harness.pasteImage();
    return harness.store;
  }

  beforeEach(() => {
    readClipboardMedia.mockReset();
  });

  it('downsamples an oversized pasted image before storing it', async () => {
    const oversized = await solidPng(2600, 2600);
    const store = await pastePng(oversized);

    expect(store.size()).toBe(1);
    const stored = store.get(1);
    expect(stored?.kind).toBe('image');
    if (stored?.kind !== 'image') throw new Error('expected image attachment');

    // The recorded dimensions and the placeholder text describe the compressed image.
    const longestRecordedEdge = Math.max(stored.width, stored.height);
    expect(longestRecordedEdge).toBeLessThanOrEqual(2000);
    expect(stored.placeholder).toContain('2000×2000');

    // The bytes themselves decode to the compressed size — the thumbnail and the
    // submitted image both come from these bytes, so they cannot disagree.
    const decoded = parseImageMeta(stored.bytes);
    expect(decoded).not.toBeNull();
    if (decoded === null) throw new Error('expected decodable image bytes');
    const longestDecodedEdge = Math.max(decoded.width, decoded.height);
    expect(longestDecodedEdge).toBeLessThanOrEqual(2000);
  });

  it('stores a within-budget paste byte-for-byte', async () => {
    const tiny = await solidPng(80, 80);
    const store = await pastePng(tiny);

    const stored = store.get(1);
    if (stored?.kind !== 'image') throw new Error('expected image attachment');
    expect(stored.width).toBe(80);
    expect(stored.height).toBe(80);
    // Reference identity: the fast path must not re-encode or copy the buffer.
    expect(stored.bytes).toBe(tiny);
  });
});
2 changes: 1 addition & 1 deletion flake.nix
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@
inherit (finalAttrs) pname version src pnpmWorkspaces;
inherit pnpm;
fetcherVersion = 3;
hash = "sha256-oratz8x67ZEJGTiNy+s4XaKe0TtpRKh63aIqkV79vvM=";
hash = "sha256-mqyi0VuPZwESZcdU5E8F3XUG99OH636knBfb8y6TQpw=";
};

nativeBuildInputs = [
Expand Down
3 changes: 3 additions & 0 deletions packages/acp-adapter/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -44,5 +44,8 @@
"@moonshot-ai/agent-core": "workspace:^",
"@moonshot-ai/kaos": "workspace:^",
"@moonshot-ai/kimi-code-sdk": "workspace:^"
},
"devDependencies": {
"jimp": "^1.6.1"
}
}
36 changes: 36 additions & 0 deletions packages/acp-adapter/src/convert.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import type { ContentBlock, ToolCallContent } from '@agentclientprotocol/sdk';
import {
log,
compressBase64ForModel,
type PromptPart,
type ToolInputDisplay,
type ToolResultEvent,
Expand Down Expand Up @@ -71,6 +72,41 @@ export function acpBlocksToPromptParts(
return out;
}

/**
 * Input-stage image compression for the ACP ingestion point (the counterpart
 * of the CLI's paste-time and the server's upload-time step).
 *
 * Walks the prompt parts in order, one at a time. An `image_url` part whose URL
 * is a base64 data URL is handed to `compressBase64ForModel`; when that reports
 * a change, the part is replaced by a copy carrying the re-encoded data URL.
 * Every other part — non-image parts, non-data URLs, and images the compressor
 * left unchanged — is forwarded as the same object.
 *
 * @param parts Prompt parts in submission order; never mutated.
 * @returns A new array with the same length and ordering as `parts`.
 */
export async function compressPromptImageParts(
  parts: readonly PromptPart[],
): Promise<PromptPart[]> {
  const result: PromptPart[] = [];
  for (const part of parts) {
    if (part.type !== 'image_url') {
      result.push(part);
      continue;
    }

    const inline = parseImageDataUrl(part.imageUrl.url);
    if (inline === null) {
      // Remote or otherwise non-inline image: nothing to re-encode locally.
      result.push(part);
      continue;
    }

    const shrunk = await compressBase64ForModel(inline.base64, inline.mimeType);
    if (!shrunk.changed) {
      result.push(part);
      continue;
    }

    // Keep any sibling fields on imageUrl and swap only the URL itself.
    const url = `data:${shrunk.mimeType};base64,${shrunk.base64}`;
    result.push({ type: 'image_url', imageUrl: { ...part.imageUrl, url } });
  }
  return result;
}

/**
 * Splits a base64 `data:` URL into its MIME type and payload.
 *
 * Accepts optional media-type parameters between the MIME type and the
 * `;base64` marker (e.g. `data:image/png;charset=utf-8;base64,...`), as allowed
 * by RFC 2397; the parameters are discarded. Parameters cannot contain `,`, so
 * the match never reaches past the header into the payload.
 *
 * @param url Candidate URL.
 * @returns The MIME type and the raw base64 payload (possibly empty), or
 *   `null` when `url` is not a base64-encoded data URL.
 */
function parseImageDataUrl(url: string): { mimeType: string; base64: string } | null {
  // `s` flag: the payload may contain line breaks and must still be captured whole.
  const match = /^data:([^;,]+)(?:;[^;,]+)*?;base64,(.*)$/s.exec(url);
  if (match === null) return null;
  const [, mimeType, base64] = match;
  // Both groups always participate in a successful match; this only narrows the types.
  if (mimeType === undefined || base64 === undefined) return null;
  return { mimeType, base64 };
}

/**
* Minimum-viable XML-attribute escaping for prompt-embedded resource
* wrappers. The output is consumed by an LLM, not parsed by a canonical
Expand Down
30 changes: 28 additions & 2 deletions packages/acp-adapter/src/session.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {
type KimiErrorPayload,
type KimiHarness,
type McpServerInfo,
type PromptPart,
type QuestionAnswers,
type QuestionRequest,
type Session,
Expand All @@ -38,7 +39,7 @@ import {
} from './builtin-commands';
import { buildSessionConfigOptions } from './config-options';
import { listModelsFromHarness } from './model-catalog';
import { acpBlocksToPromptParts } from './convert';
import { acpBlocksToPromptParts, compressPromptImageParts } from './convert';
import {
acpToolCallId,
assistantDeltaToSessionUpdate,
Expand Down Expand Up @@ -147,6 +148,13 @@ export class AcpSession {
*/
private skillCommandMap: ReadonlyMap<string, string> = new Map();

// One token per in-flight `prompt()` that is still awaiting image compression
// (before any turn exists). A `session/cancel` in that window has no turn to
// abort, so it flips every token and each affected `prompt()` returns
// `cancelled` instead of launching. A set (not a single field) so concurrent
// prompts are all covered rather than only the most recent.
private readonly pendingPromptAborts = new Set<{ aborted: boolean }>();

/**
* The most recent command palette advertised to the ACP client. Used by
* `/help` so the response matches the client's `available_commands_update`
Expand Down Expand Up @@ -268,6 +276,11 @@ export class AcpSession {
* acceptable.
*/
async cancel(): Promise<void> {
  // Prompts still compressing images have no turn to abort yet. Flag each of
  // their tokens so they return instead of launching once compression ends.
  this.pendingPromptAborts.forEach((token) => {
    token.aborted = true;
  });
  await this.session.cancel();
}

Expand Down Expand Up @@ -715,7 +728,20 @@ export class AcpSession {
* sees a JSON-RPC error rather than a hung request.
*/
async prompt(blocks: readonly ContentBlock[]): Promise<PromptResponse> {
const parts = acpBlocksToPromptParts(blocks);
// Compression happens before any turn exists, so honor a `session/cancel`
// that arrives during it: flip the flag from cancel() and bail out here
// rather than launching a turn the client already asked to stop.
const pending = { aborted: false };
this.pendingPromptAborts.add(pending);
let parts: readonly PromptPart[];
try {
parts = await compressPromptImageParts(acpBlocksToPromptParts(blocks));
} finally {
this.pendingPromptAborts.delete(pending);
}
if (pending.aborted) {
return { stopReason: 'cancelled' };
}
const sessionId = this.id;
const conn = this.conn;

Expand Down
78 changes: 78 additions & 0 deletions packages/acp-adapter/test/cancel.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ import {
type WriteTextFileResponse,
} from '@agentclientprotocol/sdk';
import { log, type KimiHarness, type Session } from '@moonshot-ai/kimi-code-sdk';
import { Jimp } from 'jimp';

import { AcpServer } from '../src/server';
import { AUTHED_STATUS } from './_helpers/harness-stubs';
Expand Down Expand Up @@ -139,4 +140,81 @@ describe('AcpServer cancel', () => {
expect.objectContaining({ sessionId: 'sess-erroring' }),
);
});

// Pins the pre-turn cancel path: a cancel issued while the prompt's image is
// still being processed must resolve the prompt as `cancelled` and must never
// reach the underlying session's prompt().
it('returns cancelled without launching when cancel arrives during image compression', async () => {
// Counts how many times the adapter actually starts a turn on the session.
let promptCalls = 0;
const fakeSession = {
id: 'sess-cancel-compress',
prompt: async () => {
promptCalls += 1;
return undefined;
},
cancel: async () => undefined,
onEvent: () => () => undefined,
} as unknown as Session;
const harness = {
auth: { status: async () => AUTHED_STATUS },
createSession: async () => fakeSession,
} as unknown as KimiHarness;

// Wire an agent-side server and a client-side stub over an in-memory stream pair.
const { agentStream, clientStream } = makeInMemoryStreamPair();
new AgentSideConnection((c) => new AcpServer(harness, c), agentStream);
const client = new ClientSideConnection((_a) => new StubClient(), clientStream);

const { sessionId } = await client.newSession({ cwd: '/tmp/x', mcpServers: [] });

// A solid 2600×2600 image is small in bytes but slow enough to compress
// that the cancel below reliably lands mid-compression, before any turn.
const data = Buffer.from(
await new Jimp({ width: 2600, height: 2600, color: 0x3366ccff }).getBuffer('image/png'),
).toString('base64');

// Start the prompt without awaiting it, so the cancel is sent while it is in flight.
const promptP = client.prompt({
sessionId,
prompt: [{ type: 'image', data, mimeType: 'image/png' }],
});
await client.cancel({ sessionId });
const res = await promptP;

expect(res.stopReason).toBe('cancelled');
expect(promptCalls).toBe(0); // the turn was never launched
});

// Pins that one cancel covers all prompts that are in flight at once: both
// must resolve as `cancelled` and neither may reach the session's prompt().
it('cancels every prompt compressing concurrently, not just the most recent', async () => {
// Counts how many times the adapter actually starts a turn on the session.
let promptCalls = 0;
const fakeSession = {
id: 'sess-cancel-concurrent',
prompt: async () => {
promptCalls += 1;
return undefined;
},
cancel: async () => undefined,
onEvent: () => () => undefined,
} as unknown as Session;
const harness = {
auth: { status: async () => AUTHED_STATUS },
createSession: async () => fakeSession,
} as unknown as KimiHarness;

// Wire an agent-side server and a client-side stub over an in-memory stream pair.
const { agentStream, clientStream } = makeInMemoryStreamPair();
new AgentSideConnection((c) => new AcpServer(harness, c), agentStream);
const client = new ClientSideConnection((_a) => new StubClient(), clientStream);

const { sessionId } = await client.newSession({ cwd: '/tmp/x', mcpServers: [] });

// Same oversized solid-colour image for both prompts.
const data = Buffer.from(
await new Jimp({ width: 2600, height: 2600, color: 0x3366ccff }).getBuffer('image/png'),
).toString('base64');
const imageBlock = { type: 'image' as const, data, mimeType: 'image/png' };

// Two prompts compressing at once; a single cancel must cover both.
const p1 = client.prompt({ sessionId, prompt: [imageBlock] });
const p2 = client.prompt({ sessionId, prompt: [imageBlock] });
await client.cancel({ sessionId });
const [r1, r2] = await Promise.all([p1, p2]);

expect(r1.stopReason).toBe('cancelled');
expect(r2.stopReason).toBe('cancelled');
expect(promptCalls).toBe(0);
});
});
Loading
Loading