diff --git a/apps/gateway/src/responses/openresponses-compliance.spec.ts b/apps/gateway/src/responses/openresponses-compliance.spec.ts
new file mode 100644
index 0000000000..9a385d2c11
--- /dev/null
+++ b/apps/gateway/src/responses/openresponses-compliance.spec.ts
@@ -0,0 +1,434 @@
+import { describe, it, expect } from "vitest";
+import { z } from "zod";
+
+import { responsesRequestSchema } from "./schemas.js";
+import { convertChatResponseToResponses } from "./tools/convert-chat-to-responses.js";
+import {
+  createCompletionEvents,
+  createResponseCreatedEvent,
+  createStreamingState,
+  processStreamChunk,
+} from "./tools/convert-streaming-to-responses.js";
+
+/**
+ * Vendored subset of the Open Responses spec's `ResponseResource` schema.
+ * Mirrors the required-fields list at
+ * https://github.com/openresponses/openresponses (public/openapi/openapi.json
+ * lines 2424-2717). A response that conforms to this schema also conforms to
+ * the spec's ResponseResource (modulo extension fields, which the spec allows).
+ *
+ * Output items only validate the basic discriminator + the fields the
+ * compliance suite asserts on (message.role, function_call.call_id, etc.).
+ */
+const usageSchema = z
+  .object({
+    input_tokens: z.number(),
+    output_tokens: z.number(),
+    total_tokens: z.number(),
+    input_tokens_details: z.object({
+      cached_tokens: z.number(),
+    }),
+    output_tokens_details: z.object({
+      reasoning_tokens: z.number(),
+    }),
+  })
+  .passthrough();
+
+const messageOutputItemSchema = z
+  .object({
+    type: z.literal("message"),
+    id: z.string(),
+    status: z.string(),
+    role: z.enum(["assistant", "user", "system", "developer"]),
+    content: z.array(z.unknown()),
+    phase: z.enum(["commentary", "final_answer"]).optional(),
+  })
+  .passthrough();
+
+const functionCallOutputItemSchema = z
+  .object({
+    type: z.literal("function_call"),
+    id: z.string(),
+    call_id: z.string(),
+    name: z.string(),
+    arguments: z.string(),
+    status: z.enum(["in_progress", "completed", "incomplete"]),
+  })
+  .passthrough();
+
+const reasoningOutputItemSchema = z
+  .object({
+    type: z.literal("reasoning"),
+    id: z.string(),
+  })
+  .passthrough();
+
+const outputItemSchema = z.union([
+  messageOutputItemSchema,
+  functionCallOutputItemSchema,
+  reasoningOutputItemSchema,
+  z.object({ type: z.string() }).passthrough(),
+]);
+
+export const responseResourceSchema = z
+  .object({
+    id: z.string(),
+    object: z.literal("response"),
+    created_at: z.number(),
+    completed_at: z.number().nullable(),
+    status: z.string(),
+    incomplete_details: z
+      .object({ reason: z.string() })
+      .passthrough()
+      .nullable(),
+    model: z.string(),
+    previous_response_id: z.string().nullable(),
+    instructions: z.string().nullable(),
+    output: z.array(outputItemSchema),
+    error: z
+      .object({ code: z.string(), message: z.string() })
+      .passthrough()
+      .nullable(),
+    tools: z.array(z.unknown()),
+    tool_choice: z.unknown(),
+    truncation: z.enum(["auto", "disabled"]),
+    parallel_tool_calls: z.boolean(),
+    text: z
+      .object({
+        format: z.object({ type: z.string() }).passthrough(),
+      })
+      .passthrough(),
+    top_p: z.number(),
+    presence_penalty: z.number(),
+    frequency_penalty: z.number(),
+    top_logprobs: z.number(),
+    temperature: z.number(),
+    reasoning: z
+      .object({
+        effort: z.string().nullable(),
+        summary: z.string().nullable(),
+      })
+      .passthrough()
+      .nullable(),
+    usage: usageSchema.nullable(),
+    max_output_tokens: z.number().nullable(),
+    max_tool_calls: z.number().nullable(),
+    store: z.boolean(),
+    background: z.boolean(),
+    service_tier: z.string(),
+    metadata: z.record(z.unknown()),
+    safety_identifier: z.string().nullable(),
+    prompt_cache_key: z.string().nullable(),
+  })
+  .passthrough();
+
+function expectValid(data: unknown, label: string) {
+  const result = responseResourceSchema.safeParse(data);
+  if (!result.success) {
+    const issues = result.error.issues
+      .map((i) => ` - ${i.path.join(".") || "(root)"}: ${i.message}`)
+      .join("\n");
+    throw new Error(`${label} failed Open Responses schema:\n${issues}`);
+  }
+  expect(result.success).toBe(true);
+}
+
+describe("Open Responses compliance: non-streaming response shape", () => {
+  const baseChat = {
+    id: "chatcmpl-1",
+    object: "chat.completion" as const,
+    created: 1_700_000_000,
+    model: "gpt-4o-mini",
+    choices: [
+      {
+        index: 0,
+        message: { role: "assistant", content: "Hello!" },
+        finish_reason: "stop",
+      },
+    ],
+    usage: { prompt_tokens: 10, completion_tokens: 5, total_tokens: 15 },
+  };
+
+  it("emits all required ResponseResource fields with no echo request", () => {
+    const out = convertChatResponseToResponses(baseChat, "gpt-4o-mini");
+    expectValid(out, "basic response");
+  });
+
+  it("echoes request fields (tools, tool_choice, instructions, temperature, etc.)", () => {
+    const out = convertChatResponseToResponses(
+      baseChat,
+      "gpt-4o-mini",
+      "resp_test_123",
+      {
+        instructions: "You are helpful",
+        tools: [{ type: "function", name: "get_weather" }],
+        tool_choice: "auto",
+        temperature: 0.7,
+        top_p: 0.9,
+        max_output_tokens: 100,
+        metadata: { trace: "abc" },
+        prompt_cache_key: "user-123",
+        previous_response_id: "resp_prev",
+      },
+    );
+    expectValid(out, "echo response");
+    expect(out.id).toBe("resp_test_123");
+    expect(out.instructions).toBe("You are helpful");
+    expect(out.temperature).toBe(0.7);
+    expect(out.top_p).toBe(0.9);
+    expect(out.max_output_tokens).toBe(100);
+    expect(out.previous_response_id).toBe("resp_prev");
+    expect(out.prompt_cache_key).toBe("user-123");
+    expect(out.metadata).toMatchObject({ trace: "abc" });
+    expect(out.tools).toHaveLength(1);
+  });
+
+  it("usage always includes input_tokens_details and output_tokens_details", () => {
+    const out = convertChatResponseToResponses(baseChat, "gpt-4o-mini");
+    expect(out.usage?.input_tokens_details.cached_tokens).toBe(0);
+    expect(out.usage?.output_tokens_details.reasoning_tokens).toBe(0);
+  });
+
+  it("function_call outputs have required call_id, name, arguments, status", () => {
+    const chat = {
+      ...baseChat,
+      choices: [
+        {
+          message: {
+            role: "assistant",
+            content: null,
+            tool_calls: [
+              {
+                id: "call_abc",
+                type: "function",
+                function: {
+                  name: "get_weather",
+                  arguments: '{"location":"SF"}',
+                },
+              },
+            ],
+          },
+          finish_reason: "tool_calls",
+        },
+      ],
+    };
+    const out = convertChatResponseToResponses(chat, "gpt-4o-mini");
+    expectValid(out, "function_call response");
+    const fc = out.output.find((o) => o.type === "function_call") as
+      | Record<string, unknown>
+      | undefined;
+    expect(fc).toBeDefined();
+    expect(fc!.call_id).toBe("call_abc");
+    expect(fc!.name).toBe("get_weather");
+    expect(fc!.arguments).toBe('{"location":"SF"}');
+    expect(fc!.status).toBe("completed");
+  });
+
+  it("incomplete status sets completed_at to null and provides incomplete_details", () => {
+    const chat = {
+      ...baseChat,
+      choices: [
+        {
+          message: { role: "assistant", content: "Truncated..." },
+          finish_reason: "length",
+        },
+      ],
+    };
+    const out = convertChatResponseToResponses(chat, "gpt-4o-mini");
+    expectValid(out, "incomplete response");
+    expect(out.status).toBe("incomplete");
+    expect(out.completed_at).toBeNull();
+    expect(out.incomplete_details).not.toBeNull();
+    expect(out.incomplete_details!.reason).toBe("max_output_tokens");
+  });
+});
+
+describe("Open Responses compliance: streaming response shape", () => {
+  it("response.created payload validates against ResponseResource", () => {
+    const state = createStreamingState("gpt-4o-mini", "resp_stream_1", {
+      instructions: "be terse",
+      temperature: 0.5,
+    });
+    const event = createResponseCreatedEvent(state);
+    const data = JSON.parse(event.data);
+    expectValid(data.response, "response.created");
+    expect(data.response.status).toBe("in_progress");
+  });
+
+  it("response.completed payload validates against ResponseResource", () => {
+    const state = createStreamingState("gpt-4o-mini", "resp_stream_2");
+    processStreamChunk({ choices: [{ delta: { content: "Hi" } }] }, state);
+    processStreamChunk(
+      {
+        choices: [{ delta: {}, finish_reason: "stop" }],
+        usage: { prompt_tokens: 1, completion_tokens: 1, total_tokens: 2 },
+      },
+      state,
+    );
+    const events = createCompletionEvents(state);
+    const completed = events.find((e) => e.event === "response.completed")!;
+    const data = JSON.parse(completed.data);
+    expectValid(data.response, "response.completed");
+    expect(data.response.status).toBe("completed");
+    expect(data.response.completed_at).not.toBeNull();
+    expect(data.response.usage.input_tokens_details.cached_tokens).toBe(0);
+    expect(data.response.usage.output_tokens_details.reasoning_tokens).toBe(0);
+  });
+
+  it("response.completed echoes request fields when streaming state has them", () => {
+    const state = createStreamingState("gpt-4o-mini", "resp_stream_3", {
+      tools: [{ type: "function", name: "lookup" }],
+      tool_choice: "required",
+      temperature: 0.3,
+      top_p: 0.95,
+      parallel_tool_calls: false,
+    });
+    processStreamChunk({ choices: [{ delta: { content: "ok" } }] }, state);
+    const events = createCompletionEvents(state);
+    const data = JSON.parse(
+      events.find((e) => e.event === "response.completed")!.data,
+    );
+    expectValid(data.response, "response.completed (with echo)");
+    expect(data.response.temperature).toBe(0.3);
+    expect(data.response.top_p).toBe(0.95);
+    expect(data.response.parallel_tool_calls).toBe(false);
+    expect(data.response.tool_choice).toBe("required");
+    expect(data.response.tools).toHaveLength(1);
+  });
+
+  it("captures reasoning_tokens from upstream completion_tokens_details", () => {
+    const state = createStreamingState("gpt-4o-mini");
+    processStreamChunk({ choices: [{ delta: { content: "x" } }] }, state);
+    processStreamChunk(
+      {
+        choices: [{ delta: {}, finish_reason: "stop" }],
+        usage: {
+          prompt_tokens: 1,
+          completion_tokens: 10,
+          total_tokens: 11,
+          completion_tokens_details: { reasoning_tokens: 7 },
+        },
+      },
+      state,
+    );
+    const events = createCompletionEvents(state);
+    const data = JSON.parse(
+      events.find((e) => e.event === "response.completed")!.data,
+    );
+    expect(data.response.usage.output_tokens_details.reasoning_tokens).toBe(7);
+  });
+});
+
+describe("Open Responses compliance: input request shape", () => {
+  it("accepts assistant messages with phase: commentary", () => {
+    const result = responsesRequestSchema.safeParse({
+      model: "gpt-4o-mini",
+      input: [
+        {
+          type: "message",
+          role: "assistant",
+          phase: "commentary",
+          content: "thinking out loud",
+        },
+        {
+          type: "message",
+          role: "assistant",
+          phase: "final_answer",
+          content: "the answer",
+        },
+        { type: "message", role: "user", content: "ok" },
+      ],
+    });
+    expect(result.success).toBe(true);
+  });
+
+  it("rejects unknown phase values", () => {
+    const result = responsesRequestSchema.safeParse({
+      model: "gpt-4o-mini",
+      input: [
+        {
+          type: "message",
+          role: "assistant",
+          phase: "midthought",
+          content: "x",
+        },
+      ],
+    });
+    expect(result.success).toBe(false);
+  });
+
+  it("accepts the exact ResponseResource mock fixture from the upstream compliance suite", () => {
+    // Mirrors compliance-tests.ts response-output-phase-schema fixture.
+    // If our vendored schema starts rejecting this, the schema has drifted
+    // from the spec.
+    const fixture = {
+      id: "resp_phase_schema",
+      object: "response",
+      created_at: 1_764_967_971,
+      completed_at: 1_764_967_972,
+      status: "completed",
+      incomplete_details: null,
+      model: "gpt-4o-mini",
+      previous_response_id: null,
+      instructions: null,
+      output: [
+        {
+          id: "msg_phase_commentary",
+          type: "message",
+          status: "completed",
+          role: "assistant",
+          phase: "commentary",
+          content: [
+            {
+              type: "output_text",
+              text: "I am checking the answer.",
+              annotations: [],
+            },
+          ],
+        },
+        {
+          id: "msg_phase_final",
+          type: "message",
+          status: "completed",
+          role: "assistant",
+          phase: "final_answer",
+          content: [
+            {
+              type: "output_text",
+              text: "The answer is four.",
+              annotations: [],
+            },
+          ],
+        },
+      ],
+      error: null,
+      tools: [],
+      tool_choice: "auto",
+      truncation: "disabled",
+      parallel_tool_calls: true,
+      text: { format: { type: "text" } },
+      top_p: 1,
+      presence_penalty: 0,
+      frequency_penalty: 0,
+      top_logprobs: 0,
+      temperature: 1,
+      reasoning: { effort: null, summary: null },
+      usage: {
+        input_tokens: 1,
+        output_tokens: 2,
+        total_tokens: 3,
+        input_tokens_details: { cached_tokens: 0 },
+        output_tokens_details: { reasoning_tokens: 0 },
+      },
+      max_output_tokens: null,
+      max_tool_calls: null,
+      store: true,
+      background: false,
+      service_tier: "default",
+      metadata: {},
+      safety_identifier: null,
+      prompt_cache_key: null,
+    };
+    expectValid(fixture, "upstream mock fixture");
+  });
+});
diff --git a/apps/gateway/src/responses/responses.spec.ts b/apps/gateway/src/responses/responses.spec.ts
index 5a30adc7a5..a4917a5375 100644
--- a/apps/gateway/src/responses/responses.spec.ts
+++ b/apps/gateway/src/responses/responses.spec.ts
@@ -282,8 +282,8 @@ describe("convertChatResponseToResponses", () => {
     expect(result.id).toMatch(/^resp_/);
     expect(result.status).toBe("completed");
     expect(result.model).toBe("gpt-4o-mini");
-    expect(result.usage.input_tokens).toBe(10);
-    expect(result.usage.output_tokens).toBe(5);
+    expect(result.usage!.input_tokens).toBe(10);
+    expect(result.usage!.output_tokens).toBe(5);
 
     const messageOutput = result.output.find((o) => o.type === "message");
     expect(messageOutput).toBeDefined();
@@ -443,8 +443,8 @@ describe("convertChatResponseToResponses", () => {
 
     const result = convertChatResponseToResponses(chatResponse, "gpt-4o-mini");
 
-    expect(result.usage.cost).toBe(0.001);
-    expect(result.usage.cost_details).toEqual({
+    expect(result.usage!.cost).toBe(0.001);
+    expect(result.usage!.cost_details).toEqual({
       upstream_inference_cost: 0.001,
       upstream_inference_prompt_cost: 0.0005,
       upstream_inference_completions_cost: 0.0005,
diff --git a/apps/gateway/src/responses/responses.ts b/apps/gateway/src/responses/responses.ts
index 6dce680d3d..6910963efa 100644
--- a/apps/gateway/src/responses/responses.ts
+++ b/apps/gateway/src/responses/responses.ts
@@ -19,7 +19,11 @@ import { shortid } from "@llmgateway/db";
 import { logger } from "@llmgateway/logger";
 
 import { responsesRequestSchema } from "./schemas.js";
-import { convertChatResponseToResponses } from "./tools/convert-chat-to-responses.js";
+import {
+  convertChatResponseToResponses,
+  type ResponsesApiOutput,
+  type ResponsesApiResponse,
+} from "./tools/convert-chat-to-responses.js";
 import { convertResponsesInputToMessages } from "./tools/convert-responses-to-chat.js";
 import {
   createStreamingState,
@@ -291,7 +295,7 @@ responses.post("/", async (c) => {
     // Generate log ID with resp_ prefix — this is both the log entry's primary key
     // and the Responses API response ID
     const logId = `resp_${shortid(24)}`;
-    const state = createStreamingState(req.model, logId);
+    const state = createStreamingState(req.model, logId, req);
 
     // Build Responses API data for storage in the log entry.
     // Output starts empty and is updated after completion via storeResponse().
@@ -490,6 +494,7 @@ responses.post("/", async (c) => {
     chatJson,
     req.model,
     logId,
+    req,
   );
 
   // Store for previous_response_id (unless store: false)
@@ -500,8 +505,10 @@ responses.post("/", async (c) => {
       output: responsesResponse.output,
       instructions: req.instructions,
       model: req.model,
-      status: responsesResponse.status,
-      usage: responsesResponse.usage,
+      status: responsesResponse.status as "completed" | "incomplete" | "failed",
+      usage: (responsesResponse.usage ?? undefined) as
+        | Record<string, unknown>
+        | undefined,
       created_at: responsesResponse.created_at,
     });
   }
@@ -545,17 +552,60 @@ responses.get("/:response_id", async (c) => {
     );
   }
 
-  return c.json({
+  const createdAt = stored.created_at ?? Math.floor(Date.now() / 1000);
+  const status = stored.status as ResponsesApiResponse["status"];
+  const storedUsage = stored.usage as Record<string, unknown> | undefined;
+
+  const usage: ResponsesApiResponse["usage"] = {
+    input_tokens: (storedUsage?.input_tokens as number) ?? 0,
+    output_tokens: (storedUsage?.output_tokens as number) ?? 0,
+    total_tokens: (storedUsage?.total_tokens as number) ?? 0,
+    input_tokens_details: {
+      cached_tokens:
+        ((storedUsage?.input_tokens_details as Record<string, unknown>)
+          ?.cached_tokens as number) ?? 0,
+    },
+    output_tokens_details: {
+      reasoning_tokens:
+        ((storedUsage?.output_tokens_details as Record<string, unknown>)
+          ?.reasoning_tokens as number) ?? 0,
+    },
+  };
+
+  const responsePayload: ResponsesApiResponse = {
     id: stored.id,
     object: "response",
-    created_at: stored.created_at ?? Math.floor(Date.now() / 1000),
+    created_at: createdAt,
+    completed_at: status === "completed" ? createdAt : null,
+    status,
+    incomplete_details:
+      status === "incomplete" ? { reason: "max_output_tokens" } : null,
     model: stored.model,
-    output: stored.output,
-    usage: stored.usage ?? {
-      input_tokens: 0,
-      output_tokens: 0,
-      total_tokens: 0,
-    },
-    status: stored.status,
-  });
+    previous_response_id: null,
+    instructions: stored.instructions ?? null,
+    output: stored.output as ResponsesApiOutput[],
+    error: null,
+    tools: [],
+    tool_choice: "auto",
+    truncation: "disabled",
+    parallel_tool_calls: true,
+    text: { format: { type: "text" } },
+    top_p: 1,
+    presence_penalty: 0,
+    frequency_penalty: 0,
+    top_logprobs: 0,
+    temperature: 1,
+    reasoning: { effort: null, summary: null },
+    usage,
+    max_output_tokens: null,
+    max_tool_calls: null,
+    store: true,
+    background: false,
+    service_tier: "default",
+    metadata: {},
+    safety_identifier: null,
+    prompt_cache_key: null,
+  };
+
+  return c.json(responsePayload);
 });
diff --git a/apps/gateway/src/responses/schemas.ts b/apps/gateway/src/responses/schemas.ts
index 4dec08d61d..60a28fded2 100644
--- a/apps/gateway/src/responses/schemas.ts
+++ b/apps/gateway/src/responses/schemas.ts
@@ -22,7 +22,9 @@ const responseInputContentSchema = z.union([
 ]);
 
 const messageItemSchema = z.object({
+  type: z.literal("message").optional(),
   role: z.enum(["user", "assistant", "system", "developer"]),
+  phase: z.enum(["commentary", "final_answer"]).optional(),
   content: z
     .union([
       z.string(),
diff --git a/apps/gateway/src/responses/tools/convert-chat-to-responses.ts b/apps/gateway/src/responses/tools/convert-chat-to-responses.ts
index bf3e0379b8..b1d5d5b945 100644
--- a/apps/gateway/src/responses/tools/convert-chat-to-responses.ts
+++ b/apps/gateway/src/responses/tools/convert-chat-to-responses.ts
@@ -65,40 +65,94 @@ export interface ResponsesApiOutput {
   [key: string]: unknown;
 }
 
+export interface ResponsesApiUsage {
+  input_tokens: number;
+  output_tokens: number;
+  total_tokens: number;
+  output_tokens_details: {
+    reasoning_tokens: number;
+  };
+  input_tokens_details: {
+    cached_tokens: number;
+  };
+  cost?: number;
+  cost_details?: {
+    upstream_inference_cost: number;
+    upstream_inference_prompt_cost: number;
+    upstream_inference_completions_cost: number;
+    total_cost?: number | null;
+    input_cost?: number | null;
+    output_cost?: number | null;
+    cached_input_cost?: number | null;
+    request_cost?: number | null;
+    web_search_cost?: number | null;
+    image_input_cost?: number | null;
+    image_output_cost?: number | null;
+    data_storage_cost?: number | null;
+  };
+}
+
 export interface ResponsesApiResponse {
   id: string;
   object: "response";
   created_at: number;
+  completed_at: number | null;
+  status: "completed" | "incomplete" | "failed" | "in_progress";
+  incomplete_details: { reason: string } | null;
   model: string;
+  previous_response_id: string | null;
+  instructions: string | null;
   output: ResponsesApiOutput[];
-  usage: {
-    input_tokens: number;
-    output_tokens: number;
-    total_tokens: number;
-    output_tokens_details?: {
-      reasoning_tokens: number;
-    };
-    input_tokens_details?: {
-      cached_tokens: number;
-    };
-    cost?: number;
-    cost_details?: {
-      upstream_inference_cost: number;
-      upstream_inference_prompt_cost: number;
-      upstream_inference_completions_cost: number;
-      total_cost?: number | null;
-      input_cost?: number | null;
-      output_cost?: number | null;
-      cached_input_cost?: number | null;
-      request_cost?: number | null;
-      web_search_cost?: number | null;
-      image_input_cost?: number | null;
-      image_output_cost?: number | null;
-      data_storage_cost?: number | null;
-    };
-  };
-  status: "completed" | "incomplete" | "failed";
+  error: { code: string; message: string } | null;
+  tools: unknown[];
+  tool_choice: unknown;
+  truncation: "auto" | "disabled";
+  parallel_tool_calls: boolean;
+  text: { format: Record<string, unknown> };
+  top_p: number;
+  presence_penalty: number;
+  frequency_penalty: number;
+  top_logprobs: number;
+  temperature: number;
+  reasoning: { effort: string | null; summary: string | null } | null;
+  usage: ResponsesApiUsage | null;
+  max_output_tokens: number | null;
+  max_tool_calls: number | null;
+  store: boolean;
+  background: boolean;
+  service_tier: string;
+  metadata: Record<string, unknown>;
+  safety_identifier: string | null;
+  prompt_cache_key: string | null;
+}
+
+/**
+ * Subset of the original /v1/responses request needed to echo fields
+ * back on the response (per the Open Responses spec, which requires
+ * many fields to be present even when they were not user-supplied).
+ */
+export interface ResponsesEchoRequest {
+  previous_response_id?: string;
+  instructions?: string;
+  tools?: unknown[];
+  tool_choice?: unknown;
+  truncation?: "auto" | "disabled";
+  parallel_tool_calls?: boolean;
+  text?: { format?: Record<string, unknown> } & Record<string, unknown>;
+  top_p?: number;
+  presence_penalty?: number;
+  frequency_penalty?: number;
+  top_logprobs?: number;
+  temperature?: number;
+  reasoning?: { effort?: string | null; summary?: string | null } | null;
+  max_output_tokens?: number;
+  max_tool_calls?: number;
+  store?: boolean;
+  background?: boolean;
+  service_tier?: string;
   metadata?: Record<string, unknown>;
+  safety_identifier?: string;
+  prompt_cache_key?: string;
 }
 
 /**
@@ -108,6 +162,7 @@ export function convertChatResponseToResponses(
   chatResponse: ChatCompletionsResponse,
   requestedModel: string,
   responseId?: string,
+  request?: ResponsesEchoRequest,
 ): ResponsesApiResponse {
   const choice = chatResponse.choices?.[0];
   const message = choice?.message;
@@ -170,24 +225,19 @@ export function convertChatResponseToResponses(
     status = "incomplete";
   }
 
-  const usage: ResponsesApiResponse["usage"] = {
+  const usage: ResponsesApiUsage = {
     input_tokens: chatResponse.usage?.prompt_tokens ?? 0,
     output_tokens: chatResponse.usage?.completion_tokens ?? 0,
     total_tokens: chatResponse.usage?.total_tokens ?? 0,
-  };
-
-  if (chatResponse.usage?.completion_tokens_details?.reasoning_tokens) {
-    usage.output_tokens_details = {
+    input_tokens_details: {
+      cached_tokens:
+        chatResponse.usage?.prompt_tokens_details?.cached_tokens ?? 0,
+    },
+    output_tokens_details: {
       reasoning_tokens:
-        chatResponse.usage.completion_tokens_details.reasoning_tokens,
-    };
-  }
-
-  if (chatResponse.usage?.prompt_tokens_details?.cached_tokens) {
-    usage.input_tokens_details = {
-      cached_tokens: chatResponse.usage.prompt_tokens_details.cached_tokens,
-    };
-  }
+        chatResponse.usage?.completion_tokens_details?.reasoning_tokens ?? 0,
+    },
+  };
 
   if (chatResponse.usage?.cost !== undefined) {
     usage.cost = chatResponse.usage.cost;
@@ -196,14 +246,48 @@ export function convertChatResponseToResponses(
     usage.cost_details = chatResponse.usage.cost_details;
   }
 
+  const created = chatResponse.created ?? Math.floor(Date.now() / 1000);
+
   return {
     id: responseId ?? `resp_${shortid(24)}`,
     object: "response",
-    created_at: chatResponse.created ?? Math.floor(Date.now() / 1000),
+    created_at: created,
+    completed_at: status === "completed" ? created : null,
+    status,
+    incomplete_details:
+      status === "incomplete" ? { reason: "max_output_tokens" } : null,
     model: chatResponse.model ?? requestedModel,
+    previous_response_id: request?.previous_response_id ?? null,
+    instructions: request?.instructions ?? null,
     output,
+    error: null,
+    tools: request?.tools ?? [],
+    tool_choice: request?.tool_choice ?? "auto",
+    truncation: request?.truncation ?? "disabled",
+    parallel_tool_calls: request?.parallel_tool_calls ?? true,
+    text: {
+      format: request?.text?.format ?? { type: "text" },
+    },
+    top_p: request?.top_p ?? 1,
+    presence_penalty: request?.presence_penalty ?? 0,
+    frequency_penalty: request?.frequency_penalty ?? 0,
+    top_logprobs: request?.top_logprobs ?? 0,
+    temperature: request?.temperature ?? 1,
+    reasoning: {
+      effort: request?.reasoning?.effort ?? null,
+      summary: request?.reasoning?.summary ?? null,
+    },
     usage,
-    status,
-    ...(chatResponse.metadata ? { metadata: chatResponse.metadata } : {}),
+    max_output_tokens: request?.max_output_tokens ?? null,
+    max_tool_calls: request?.max_tool_calls ?? null,
+    store: request?.store ?? true,
+    background: request?.background ?? false,
+    service_tier: request?.service_tier ?? "default",
+    metadata: {
+      ...(request?.metadata ?? {}),
+      ...(chatResponse.metadata ?? {}),
+    },
+    safety_identifier: request?.safety_identifier ?? null,
+    prompt_cache_key: request?.prompt_cache_key ?? null,
   };
 }
diff --git a/apps/gateway/src/responses/tools/convert-streaming-to-responses.ts b/apps/gateway/src/responses/tools/convert-streaming-to-responses.ts
index 42defdfd6c..752ac4db3e 100644
--- a/apps/gateway/src/responses/tools/convert-streaming-to-responses.ts
+++ b/apps/gateway/src/responses/tools/convert-streaming-to-responses.ts
@@ -1,5 +1,7 @@
 import { shortid } from "@llmgateway/db";
 
+import type { ResponsesEchoRequest } from "./convert-chat-to-responses.js";
+
 interface StreamingState {
   responseId: string;
   model: string;
@@ -23,11 +25,13 @@ interface StreamingState {
       outputIndex: number;
     }
   >;
+  request?: ResponsesEchoRequest;
   usage: {
     input_tokens: number;
     output_tokens: number;
     total_tokens: number;
     input_tokens_details?: { cached_tokens: number };
+    output_tokens_details?: { reasoning_tokens: number };
     cost?: number;
     cost_details?: {
       upstream_inference_cost: number;
@@ -49,6 +53,7 @@ interface StreamingState {
 export function createStreamingState(
   model: string,
   responseId?: string,
+  request?: ResponsesEchoRequest,
 ): StreamingState {
   return {
     responseId: responseId ?? `resp_${shortid(24)}`,
@@ -64,10 +69,83 @@ export function createStreamingState(
     reasoningStarted: false,
     finishReason: null,
     toolCalls: new Map(),
+    request,
     usage: { input_tokens: 0, output_tokens: 0, total_tokens: 0 },
   };
 }
 
+/**
+ * Build a fully-padded ResponseResource payload from streaming state. Used
+ * by the response.created, response.completed and response.failed events so the
+ * streaming shape matches the non-streaming shape and the Open Responses spec.
+ */
+function buildResponsePayload(
+  state: StreamingState,
+  overrides: {
+    status: "in_progress" | "completed" | "incomplete" | "failed";
+    output?: Record<string, unknown>[];
+  },
+): Record<string, unknown> {
+  const req = state.request;
+  const status = overrides.status;
+  const output = overrides.output ?? [];
+
+  const usage = {
+    input_tokens: state.usage.input_tokens,
+    output_tokens: state.usage.output_tokens,
+    total_tokens: state.usage.total_tokens,
+    input_tokens_details: {
+      cached_tokens: state.usage.input_tokens_details?.cached_tokens ?? 0,
+    },
+    output_tokens_details: {
+      reasoning_tokens:
+        state.usage.output_tokens_details?.reasoning_tokens ?? 0,
+    },
+    ...(state.usage.cost !== undefined ? { cost: state.usage.cost } : {}),
+    ...(state.usage.cost_details !== undefined
+      ? { cost_details: state.usage.cost_details }
+      : {}),
+  };
+
+  return {
+    id: state.responseId,
+    object: "response",
+    created_at: state.createdAt,
+    completed_at: status === "completed" ? state.createdAt : null,
+    status,
+    incomplete_details:
+      status === "incomplete" ? { reason: "max_output_tokens" } : null,
+    model: state.model,
+    previous_response_id: req?.previous_response_id ?? null,
+    instructions: req?.instructions ?? null,
+    output,
+    error: null,
+    tools: req?.tools ?? [],
+    tool_choice: req?.tool_choice ?? "auto",
+    truncation: req?.truncation ?? "disabled",
+    parallel_tool_calls: req?.parallel_tool_calls ?? true,
+    text: { format: req?.text?.format ?? { type: "text" } },
+    top_p: req?.top_p ?? 1,
+    presence_penalty: req?.presence_penalty ?? 0,
+    frequency_penalty: req?.frequency_penalty ?? 0,
+    top_logprobs: req?.top_logprobs ?? 0,
+    temperature: req?.temperature ?? 1,
+    reasoning: {
+      effort: req?.reasoning?.effort ?? null,
+      summary: req?.reasoning?.summary ?? null,
+    },
+    usage,
+    max_output_tokens: req?.max_output_tokens ?? null,
+    max_tool_calls: req?.max_tool_calls ?? null,
+    store: req?.store ?? true,
+    background: req?.background ?? false,
+    service_tier: req?.service_tier ?? "default",
+    metadata: req?.metadata ?? {},
+    safety_identifier: req?.safety_identifier ?? null,
+    prompt_cache_key: req?.prompt_cache_key ?? null,
+  };
+}
+
 interface SSEEvent {
   event: string;
   data: string;
@@ -81,14 +159,7 @@ export function createResponseCreatedEvent(state: StreamingState): SSEEvent {
     event: "response.created",
     data: JSON.stringify({
       type: "response.created",
-      response: {
-        id: state.responseId,
-        object: "response",
-        created_at: state.createdAt,
-        model: state.model,
-        status: "in_progress",
-        output: [],
-      },
+      response: buildResponsePayload(state, { status: "in_progress" }),
     }),
   };
 }
@@ -144,6 +215,14 @@ export function processStreamChunk(
         cached_tokens: ptd.cached_tokens as number,
       };
     }
+    const ctd = usage.completion_tokens_details as
+      | Record<string, unknown>
+      | undefined;
+    if (ctd?.reasoning_tokens !== undefined) {
+      state.usage.output_tokens_details = {
+        reasoning_tokens: ctd.reasoning_tokens as number,
+      };
+    }
     if (usage.cost !== undefined) {
       state.usage.cost = usage.cost as number;
     }
@@ -296,6 +375,14 @@ export function processStreamChunk(
         cached_tokens: ptd.cached_tokens as number,
       };
     }
+    const ctd = usage.completion_tokens_details as
+      | Record<string, unknown>
+      | undefined;
+    if (ctd?.reasoning_tokens !== undefined) {
+      state.usage.output_tokens_details = {
+        reasoning_tokens: ctd.reasoning_tokens as number,
+      };
+    }
     if (usage.cost !== undefined) {
       state.usage.cost = usage.cost as number;
     }
@@ -433,15 +520,7 @@ export function createCompletionEvents(state: StreamingState): SSEEvent[] {
     event: "response.completed",
     data: JSON.stringify({
       type: "response.completed",
-      response: {
-        id: state.responseId,
-        object: "response",
-        created_at: state.createdAt,
-        model: state.model,
-        output,
-        usage: state.usage,
-        status,
-      },
+      response: buildResponsePayload(state, { status, output }),
     }),
   });
 
@@ -456,15 +535,7 @@ export function createFailedEvent(state: StreamingState): SSEEvent {
     event: "response.failed",
     data: JSON.stringify({
       type: "response.failed",
-      response: {
-        id: state.responseId,
-        object: "response",
-        created_at: state.createdAt,
-        model: state.model,
-        output: [],
-        usage: state.usage,
-        status: "failed",
-      },
+      response: buildResponsePayload(state, { status: "failed" }),
     }),
   };
 }