/**
 * Keep only the first entry seen for each provider.
 *
 * The provider key is the part of `model` before the first "/"; a model id
 * that contains no "/" is used whole as its own key. Input order is
 * preserved and the input array is not modified.
 *
 * Used to pick one model per provider so CI cost stays manageable while the
 * Responses API conversion layer is still exercised across every provider.
 *
 * @param list Entries carrying a `model` id.
 * @returns A new array holding the first entry of each provider.
 */
function oneModelPerProvider<T extends { model: string }>(
  list: readonly T[],
): T[] {
  const seen = new Set<string>();
  const out: T[] = [];
  for (const item of list) {
    // Same result as `split("/")[0]`, without an index access the compiler
    // has to treat as possibly undefined.
    const slash = item.model.indexOf("/");
    const provider = slash === -1 ? item.model : item.model.slice(0, slash);
    if (seen.has(provider)) {
      continue;
    }
    seen.add(provider);
    out.push(item);
  }
  return out;
}
// Models excluded from the tool-call round-trip test because the underlying
// provider adapter does not emit stable tool_call ids — the id returned in the
// first turn is not recognized when sent back as tool_call_id, so the second
// turn fails. This is a provider/adapter-level issue, unrelated to the
// Responses API conversion layer.
const TOOL_CALL_DENYLIST = new Set(["bytedance/gpt-oss-120b"]);

// One model per provider for the plain-text tests.
const responsesTestModels = oneModelPerProvider(testModels);
// One model per provider for the tool-call test, minus the deny-listed ids.
const responsesToolCallModels = oneModelPerProvider(toolCallModels).filter(
  (m) => !TOOL_CALL_DENYLIST.has(m.model),
);

/** The subset of a Responses API output item that these tests read. */
interface ResponsesOutputItem {
  type: string;
  role?: string;
  content?: { type: string; text?: string }[];
  call_id?: string;
  name?: string;
  arguments?: string;
}

/**
 * Concatenate, in order and with no separator, the `text` of every
 * `output_text` part found in the `message` items of `json.output`.
 *
 * @returns The joined text, or "" when `output` is missing or holds no such
 * part.
 */
function getOutputText(json: { output?: ResponsesOutputItem[] }): string {
  const items = json.output ?? [];
  const parts: string[] = [];
  for (const item of items) {
    if (item.type === "message" && Array.isArray(item.content)) {
      for (const c of item.content) {
        if (c.type === "output_text" && typeof c.text === "string") {
          parts.push(c.text);
        }
      }
    }
  }
  return parts.join("");
}

/**
 * Return the first `function_call` item of `json.output`, or `undefined`
 * when `output` is missing or contains none.
 */
function getFunctionCall(json: {
  output?: ResponsesOutputItem[];
}): ResponsesOutputItem | undefined {
  return (json.output ?? []).find((i) => i.type === "function_call");
}

/**
 * POST `body`, JSON-encoded, to `/v1/responses` through `app.request`.
 *
 * The request carries `requestId` in `x-request-id`, sets
 * `x-no-fallback: true`, and authenticates with a fixed bearer token.
 */
async function postResponses(body: unknown, requestId: string) {
  return await app.request("/v1/responses", {
    method: "POST",
    headers: {
      "Content-Type": "application/json",
      "x-request-id": requestId,
      "x-no-fallback": "true",
      Authorization: "Bearer real-token",
    },
    body: JSON.stringify(body),
  });
}

describe("e2e", getConcurrentTestOptions(), () => {
  beforeAll(beforeAllHook);

  beforeEach(beforeEachHook);

  test("empty", () => {
    expect(true).toBe(true);
  });

  // A plain string input must produce a `resp_`-prefixed id, non-empty output
  // text, positive token usage, and a log entry for the request id.
  test.each(responsesTestModels)(
    "responses single-turn $model",
    getTestOptions(),
    async ({ model }) => {
      const requestId = generateTestRequestId();
      const res = await postResponses(
        {
          model,
          input: "Say hello in one short sentence.",
        },
        requestId,
      );

      const json = await res.json();
      if (logMode) {
        console.log(
          "responses single-turn response:",
          JSON.stringify(json, null, 2),
        );
      }

      expect(res.status).toBe(200);
      expect(json).toHaveProperty("id");
      expect(typeof json.id).toBe("string");
      expect(json.id.startsWith("resp_")).toBe(true);
      expect(Array.isArray(json.output)).toBe(true);

      const text = getOutputText(json);
      expect(text.length).toBeGreaterThan(0);

      expect(json).toHaveProperty("usage");
      expect(typeof json.usage.input_tokens).toBe("number");
      expect(typeof json.usage.output_tokens).toBe("number");
      expect(json.usage.input_tokens).toBeGreaterThan(0);
      expect(json.usage.output_tokens).toBeGreaterThan(0);

      await validateLogByRequestId(requestId);
    },
  );

  // The second request sends only the follow-up question plus
  // `previous_response_id`; the answer must still contain the name given in
  // the first turn.
  test.each(responsesTestModels)(
    "responses multi-turn $model",
    getTestOptions(),
    async ({ model }) => {
      const firstRequestId = generateTestRequestId();
      const firstRes = await postResponses(
        {
          model,
          input:
            "My name is Ada. Please remember it. Reply with a brief acknowledgement.",
        },
        firstRequestId,
      );
      const firstJson = await firstRes.json();
      if (logMode) {
        console.log(
          "responses multi-turn first:",
          JSON.stringify(firstJson, null, 2),
        );
      }
      expect(firstRes.status).toBe(200);
      expect(typeof firstJson.id).toBe("string");

      const secondRequestId = generateTestRequestId();
      const secondRes = await postResponses(
        {
          model,
          input: "What is my name? Reply with just the name.",
          previous_response_id: firstJson.id,
        },
        secondRequestId,
      );
      const secondJson = await secondRes.json();
      if (logMode) {
        console.log(
          "responses multi-turn second:",
          JSON.stringify(secondJson, null, 2),
        );
      }
      expect(secondRes.status).toBe(200);
      const text = getOutputText(secondJson);
      expect(text.toLowerCase()).toContain("ada");
    },
  );

  // Turn one forces a tool call; turn two returns the tool output for that
  // call id on top of `previous_response_id` and expects a text answer.
  test.each(responsesToolCallModels)(
    "responses tool calls $model",
    getTestOptions(),
    async ({ model }) => {
      const tools = [
        {
          type: "function",
          name: "get_weather",
          description: "Get the current weather for a given city",
          parameters: {
            type: "object",
            properties: {
              city: {
                type: "string",
                description: "The city name to get weather for",
              },
            },
            required: ["city"],
          },
        },
      ];

      const firstRequestId = generateTestRequestId();
      const firstRes = await postResponses(
        {
          model,
          input: [
            {
              role: "user",
              content: "What's the weather like in San Francisco?",
            },
          ],
          tools,
          tool_choice: "required",
        },
        firstRequestId,
      );
      const firstJson = await firstRes.json();
      if (logMode) {
        console.log(
          "responses tool calls first:",
          JSON.stringify(firstJson, null, 2),
        );
      }

      expect(firstRes.status).toBe(200);
      const fnCall = getFunctionCall(firstJson);
      expect(fnCall).toBeDefined();
      if (fnCall === undefined) {
        // Not reachable once the assertion above has passed; it exists so the
        // compiler narrows `fnCall` for the rest of the test.
        throw new Error("expected a function_call output item");
      }
      expect(fnCall.name).toBe("get_weather");
      expect(typeof fnCall.call_id).toBe("string");
      expect(typeof fnCall.arguments).toBe("string");
      const parsedArgs = JSON.parse(fnCall.arguments ?? "{}");
      expect(typeof parsedArgs.city).toBe("string");
      expect(parsedArgs.city.toLowerCase()).toContain("san francisco");

      const secondRequestId = generateTestRequestId();
      const secondRes = await postResponses(
        {
          model,
          previous_response_id: firstJson.id,
          input: [
            {
              type: "function_call_output",
              call_id: fnCall.call_id,
              output: "72F and sunny",
            },
          ],
          tools,
        },
        secondRequestId,
      );
      const secondJson = await secondRes.json();
      if (logMode) {
        console.log(
          "responses tool calls second:",
          JSON.stringify(secondJson, null, 2),
        );
      }

      expect(secondRes.status).toBe(200);
      const finalText = getOutputText(secondJson).toLowerCase();
      expect(finalText.length).toBeGreaterThan(0);
      // Accept any answer that visibly uses the tool result.
      expect(
        finalText.includes("sunny") ||
          finalText.includes("72") ||
          finalText.includes("weather"),
      ).toBe(true);
    },
  );
});
Skip if content is empty/whitespace-only — many + // providers return content: "" alongside tool_calls, and emitting an empty + // message item pollutes stored conversations: on replay via + // previous_response_id it becomes a stray assistant message that separates + // the tool_calls assistant from its tool result, causing strict providers + // (deepseek, bytedance, aws-bedrock, kimi, etc.) to reject the request. + if ( + message?.content !== null && + message?.content !== undefined && + message.content.trim() !== "" + ) { const contentParts: Array> = [ { type: "output_text", diff --git a/apps/gateway/src/responses/tools/convert-responses-to-chat.ts b/apps/gateway/src/responses/tools/convert-responses-to-chat.ts index 738ead0da7..9cd2be6ba2 100644 --- a/apps/gateway/src/responses/tools/convert-responses-to-chat.ts +++ b/apps/gateway/src/responses/tools/convert-responses-to-chat.ts @@ -38,7 +38,15 @@ export function convertResponsesInputToMessages( while (i < input.length) { const item = input[i]!; - // function_call items -> collect consecutive ones into assistant tool_calls + // function_call items -> collect consecutive ones into assistant tool_calls. + // Also fold any immediately-following assistant `message` items into the + // same assistant message: in the Responses API the tool_calls and the + // assistant text are emitted as separate output items, but in chat + // completions they belong on a single assistant message. Splitting them + // inserts a stray assistant message between the tool_calls and the tool + // result, which strict providers (deepseek family, bytedance, etc.) + // reject with "assistant message with tool_calls must be followed by + // tool messages". 
/**
 * Extract concatenated plain text from a Responses API message content field
 * (which can be a string, an array of content parts, null, or undefined).
 * Used when folding a trailing assistant text message into a tool_calls
 * assistant message.
 *
 * - A string is returned unchanged.
 * - An array contributes the `text` of every element that is a non-null
 *   object with a string `text` property, in order and with no separator;
 *   every other element is ignored.
 * - Any other value (null, undefined, a number, a plain object, ...) yields
 *   "".
 *
 * @param content The raw `content` value of a message item.
 * @returns The collected text, possibly empty.
 */
function extractTextFromContent(content: unknown): string {
  if (typeof content === "string") {
    return content;
  }
  if (!Array.isArray(content)) {
    return "";
  }
  // Re-type the elements as `unknown` so each one has to be narrowed before
  // its `text` is read.
  const parts: readonly unknown[] = content;
  let text = "";
  for (const part of parts) {
    if (
      typeof part === "object" &&
      part !== null &&
      "text" in part &&
      typeof part.text === "string"
    ) {
      text += part.text;
    }
  }
  return text;
}