diff --git a/apps/gateway/src/chat/chat.ts b/apps/gateway/src/chat/chat.ts index c83c618463..9a04c93f7e 100644 --- a/apps/gateway/src/chat/chat.ts +++ b/apps/gateway/src/chat/chat.ts @@ -1032,6 +1032,8 @@ chat.openapi(completions, async (c) => { model: modelInput, response_format, stream, + prompt_cache_key, + prompt_cache_retention, tool_choice, free_models_only, onboarding, @@ -3502,6 +3504,8 @@ chat.openapi(completions, async (c) => { response_format, reasoning_effort, reasoning_max_tokens, + prompt_cache_key, + prompt_cache_retention, }; if (stream) { @@ -4088,6 +4092,8 @@ chat.openapi(completions, async (c) => { webSearchTool, reasoning_max_tokens, useResponsesApi, + prompt_cache_key, + prompt_cache_retention, ); if (forceImageStreamUpstream) { @@ -4215,6 +4221,8 @@ chat.openapi(completions, async (c) => { tool_choice, reasoning_effort, reasoning_max_tokens, + prompt_cache_key, + prompt_cache_retention, effort, webSearchTool, image_config, diff --git a/apps/gateway/src/chat/schemas/completions.ts b/apps/gateway/src/chat/schemas/completions.ts index 3056b499bd..ad40e7371e 100644 --- a/apps/gateway/src/chat/schemas/completions.ts +++ b/apps/gateway/src/chat/schemas/completions.ts @@ -139,6 +139,26 @@ export const completionsRequestSchema = z.object({ ]) .optional(), stream: z.boolean().optional().default(false), + prompt_cache_key: z + .string() + .nullable() + .optional() + .transform((val) => (val === null ? undefined : val)) + .openapi({ + description: + "OpenAI prompt caching key used to improve cache routing for requests with shared prompt prefixes.", + example: "tenant-123", + }), + prompt_cache_retention: z + .enum(["in_memory", "24h"]) + .nullable() + .optional() + .transform((val) => (val === null ? undefined : val)) + .openapi({ + description: + "OpenAI prompt cache retention policy. OpenAI supports in_memory and 24h for eligible models.", + example: "24h", + }), tools: z .array( z.union([ diff --git a/apps/gateway/src/chat/tools/resolve-provider-context.ts b/apps/gateway/src/chat/tools/resolve-provider-context.ts index d9c50edd58..5c3debdc12 100644 --- a/apps/gateway/src/chat/tools/resolve-provider-context.ts +++ b/apps/gateway/src/chat/tools/resolve-provider-context.ts @@ -19,6 +19,7 @@ import { type ModelDefinition, type OpenAIRequestBody, type OpenAIToolInput, + type PromptCacheRetention, type Provider, type ProviderRequestBody, providers, @@ -83,6 +84,8 @@ export interface ProviderContextOptions { tool_choice: ToolChoiceType | undefined; reasoning_effort: "minimal" | "low" | "medium" | "high" | "xhigh" | undefined; reasoning_max_tokens: number | undefined; + prompt_cache_key: string | undefined; + prompt_cache_retention: PromptCacheRetention | undefined; effort: "low" | "medium" | "high" | undefined; webSearchTool: WebSearchTool | undefined; image_config: @@ -447,6 +450,8 @@ export async function resolveProviderContext( options.webSearchTool, options.reasoning_max_tokens, useResponsesApi, + options.prompt_cache_key, + options.prompt_cache_retention, ); // Post-validation of max_tokens in request body diff --git a/apps/gateway/src/lib/costs.spec.ts b/apps/gateway/src/lib/costs.spec.ts index ff80a8259a..80d87dac27 100644 --- a/apps/gateway/src/lib/costs.spec.ts +++ b/apps/gateway/src/lib/costs.spec.ts @@ -112,6 +112,44 @@ describe("calculateCosts", () => { expect(result.estimatedCost).toBe(false); // Not estimated }); + it("does not add a separate cache write fee for OpenAI", async () => { + const withoutCacheWrite = await calculateCosts( + "gpt-4o", + "openai", + 100, + 50, + 20, + ); + const withCacheWrite = await calculateCosts( + "gpt-4o", + "openai", + 100, + 50, + 20, + undefined, + null, + 0, + undefined, + 0, + null, + null, + undefined, + null, + null, + { + cacheWriteTokens: 30, + }, + ); + + expect(withCacheWrite.inputCost).toBe(withoutCacheWrite.inputCost); + expect(withCacheWrite.cachedInputCost).toBe( + withoutCacheWrite.cachedInputCost, + ); + expect(withCacheWrite.cacheWriteInputCost).toBe(0); + expect(withCacheWrite.totalCost).toBe(withoutCacheWrite.totalCost); + expect(withCacheWrite.cacheWriteTokens).toBe(30); + }); + it("should calculate costs with cached tokens for Anthropic (first request - cache creation)", async () => { // For Anthropic first request: 4 non-cached + 1659 cache creation = 1663 total tokens, 0 cache reads const result = await calculateCosts( diff --git a/apps/gateway/src/responses/responses.ts b/apps/gateway/src/responses/responses.ts index 6910963efa..954b4efbf9 100644 --- a/apps/gateway/src/responses/responses.ts +++ b/apps/gateway/src/responses/responses.ts @@ -283,6 +283,12 @@ responses.post("/", async (c) => { if (req.reasoning?.effort) { chatRequest.reasoning_effort = req.reasoning.effort; } + if (req.prompt_cache_key !== undefined) { + chatRequest.prompt_cache_key = req.prompt_cache_key; + } + if (req.prompt_cache_retention !== undefined) { + chatRequest.prompt_cache_retention = req.prompt_cache_retention; + } if (response_format) { chatRequest.response_format = response_format; } diff --git a/apps/gateway/src/responses/schemas.ts b/apps/gateway/src/responses/schemas.ts index 60a28fded2..e33c7e2833 100644 --- a/apps/gateway/src/responses/schemas.ts +++ b/apps/gateway/src/responses/schemas.ts @@ -106,6 +106,16 @@ export const responsesRequestSchema = z.object({ instructions: z.string().optional(), previous_response_id: z.string().optional(), stream: z.boolean().optional().default(false), + prompt_cache_key: z + .string() + .nullable() + .optional() + .transform((val) => (val === null ? undefined : val)), + prompt_cache_retention: z + .enum(["in_memory", "24h"]) + .nullable() + .optional() + .transform((val) => (val === null ? undefined : val)), temperature: z .number() .nullable() diff --git a/apps/gateway/src/responses/tools/convert-chat-to-responses.ts b/apps/gateway/src/responses/tools/convert-chat-to-responses.ts index c734e11c03..895faaba23 100644 --- a/apps/gateway/src/responses/tools/convert-chat-to-responses.ts +++ b/apps/gateway/src/responses/tools/convert-chat-to-responses.ts @@ -155,6 +155,7 @@ export interface ResponsesEchoRequest { metadata?: Record; safety_identifier?: string; prompt_cache_key?: string; + prompt_cache_retention?: "in_memory" | "24h"; } /** diff --git a/packages/actions/src/prepare-request-body.spec.ts b/packages/actions/src/prepare-request-body.spec.ts index c5027e288e..38e23b0f7a 100644 --- a/packages/actions/src/prepare-request-body.spec.ts +++ b/packages/actions/src/prepare-request-body.spec.ts @@ -35,6 +35,43 @@ async function prepareOpenAIImageRequest(imageConfig: { ); } +async function prepareOpenAITextRequest(options: { + provider?: "openai" | "azure"; + model?: string; + useResponsesApi?: boolean; + promptCacheKey?: string; + promptCacheRetention?: "in_memory" | "24h"; +}) { + return await prepareRequestBody( + options.provider ?? "openai", + options.model ?? "gpt-5.5", + [{ role: "user", content: "Hello!" }], + false, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + false, + false, + 20, + null, + undefined, + undefined, + undefined, + undefined, + undefined, + undefined, + options.useResponsesApi ?? false, + options.promptCacheKey, + options.promptCacheRetention, + ); +} + describe("prepareRequestBody - Anthropic", () => { test("should extract system messages to system field for caching", async () => { const requestBody = (await prepareRequestBody( @@ -238,6 +275,69 @@ describe("prepareRequestBody - OpenAI image generation", () => { }); }); +describe("prepareRequestBody - OpenAI prompt caching", () => { + test("should forward prompt cache controls to OpenAI chat completions", async () => { + const requestBody = (await prepareOpenAITextRequest({ + promptCacheKey: "tenant-a", + promptCacheRetention: "24h", + })) as any; + + expect(requestBody.prompt_cache_key).toBe("tenant-a"); + expect(requestBody.prompt_cache_retention).toBe("24h"); + }); + + test("should forward prompt cache controls to OpenAI Responses API", async () => { + const requestBody = (await prepareOpenAITextRequest({ + useResponsesApi: true, + promptCacheKey: "tenant-a", + promptCacheRetention: "in_memory", + })) as any; + + expect(requestBody.prompt_cache_key).toBe("tenant-a"); + expect(requestBody.prompt_cache_retention).toBe("in_memory"); + }); + + test("should not forward OpenAI prompt cache controls to Azure", async () => { + const requestBody = (await prepareOpenAITextRequest({ + provider: "azure", + promptCacheKey: "tenant-a", + promptCacheRetention: "24h", + })) as any; + + expect(requestBody.prompt_cache_key).toBeUndefined(); + expect(requestBody.prompt_cache_retention).toBeUndefined(); + }); + + test("should strip prompt_cache_retention=24h on models that don't support extended retention", async () => { + const requestBody = (await prepareOpenAITextRequest({ + model: "gpt-4o", + promptCacheKey: "tenant-a", + promptCacheRetention: "24h", + })) as any; + + expect(requestBody.prompt_cache_key).toBe("tenant-a"); + expect(requestBody.prompt_cache_retention).toBeUndefined(); + }); + + test("should still forward prompt_cache_retention=in_memory on models without 24h support", async () => { + const requestBody = (await prepareOpenAITextRequest({ + model: "gpt-4o", + promptCacheRetention: "in_memory", + })) as any; + + expect(requestBody.prompt_cache_retention).toBe("in_memory"); + }); + + test("should forward prompt_cache_retention=24h on models that do support extended retention", async () => { + const requestBody = (await prepareOpenAITextRequest({ + model: "gpt-4.1", + promptCacheRetention: "24h", + })) as any; + + expect(requestBody.prompt_cache_retention).toBe("24h"); + }); +}); + describe("prepareRequestBody - Google AI Studio", () => { test("should map gateway 0.5K image size to Google 512", async () => { const requestBody = (await prepareRequestBody( diff --git a/packages/actions/src/prepare-request-body.ts b/packages/actions/src/prepare-request-body.ts index 169c2013cb..2ad99fb724 100644 --- a/packages/actions/src/prepare-request-body.ts +++ b/packages/actions/src/prepare-request-body.ts @@ -10,7 +10,9 @@ import { type OpenAIRequestBody, type OpenAIResponsesRequestBody, type OpenAIToolInput, + type PromptCacheRetention, type ProviderRequestBody, + supportsOpenAIExtendedPromptCache, type ToolChoiceType, type WebSearchTool, } from "@llmgateway/models"; @@ -653,6 +655,8 @@ export async function prepareRequestBody( webSearchTool?: WebSearchTool, reasoning_max_tokens?: number, useResponsesApi?: boolean, + prompt_cache_key?: string, + prompt_cache_retention?: PromptCacheRetention, ): Promise { // Handle OpenAI / Azure image generation models (e.g. gpt-image-2) if ( @@ -1123,6 +1127,19 @@ export async function prepareRequestBody( }, }; + if (usedProvider === "openai") { + if (prompt_cache_key !== undefined) { + responsesBody.prompt_cache_key = prompt_cache_key; + } + if ( + prompt_cache_retention !== undefined && + (prompt_cache_retention !== "24h" || + supportsOpenAIExtendedPromptCache(usedModel)) + ) { + responsesBody.prompt_cache_retention = prompt_cache_retention; + } + } + // Add streaming support if (stream) { responsesBody.stream = true; @@ -1194,6 +1211,19 @@ export async function prepareRequestBody( return responsesBody; } else { // Use regular chat completions format + if (usedProvider === "openai") { + if (prompt_cache_key !== undefined) { + requestBody.prompt_cache_key = prompt_cache_key; + } + if ( + prompt_cache_retention !== undefined && + (prompt_cache_retention !== "24h" || + supportsOpenAIExtendedPromptCache(usedModel)) + ) { + requestBody.prompt_cache_retention = prompt_cache_retention; + } + } + if (stream) { requestBody.stream_options = { include_usage: true, diff --git a/packages/models/src/helpers.ts b/packages/models/src/helpers.ts index cb4d40bc1e..87cd39d8a0 100644 --- a/packages/models/src/helpers.ts +++ b/packages/models/src/helpers.ts @@ -68,3 +68,26 @@ export function getModelStreamingSupport( const providerInfo = providers.find((p) => p.id === providerId); return providerInfo?.streaming === true; } + +// OpenAI prompt_cache_retention="24h" eligibility per +// https://developers.openai.com/api/docs/guides/prompt-caching. +// gpt-5.5 and gpt-5.5-pro default to 24h and reject "in_memory"; the rest +// accept either. Models not on this list only support "in_memory" caching. +const OPENAI_EXTENDED_PROMPT_CACHE_MODELS = new Set([ + "gpt-5.5", + "gpt-5.5-pro", + "gpt-5.4", + "gpt-5.2", + "gpt-5.1-codex-max", + "gpt-5.1", + "gpt-5.1-codex", + "gpt-5.1-codex-mini", + "gpt-5.1-chat-latest", + "gpt-5", + "gpt-5-codex", + "gpt-4.1", +]); + +export function supportsOpenAIExtendedPromptCache(modelName: string): boolean { + return OPENAI_EXTENDED_PROMPT_CACHE_MODELS.has(modelName); +} diff --git a/packages/models/src/types.ts b/packages/models/src/types.ts index 5996f2e28e..db596fbd5c 100644 --- a/packages/models/src/types.ts +++ b/packages/models/src/types.ts @@ -175,6 +175,8 @@ export type ToolChoiceType = }; }; +export type PromptCacheRetention = "in_memory" | "24h"; + export type AnthropicToolChoice = | "auto" | "any" @@ -199,6 +201,8 @@ export interface OpenAIRequestBody extends BaseRequestBody { messages: OpenAIMessage[]; tools?: OpenAITool[]; tool_choice?: ToolChoiceType; + prompt_cache_key?: string; + prompt_cache_retention?: PromptCacheRetention; response_format?: { type: "text" | "json_object" | "json_schema"; json_schema?: { @@ -236,6 +240,8 @@ export type OpenAIResponsesInputItem = export interface OpenAIResponsesRequestBody { model: string; input: OpenAIResponsesInputItem[]; + prompt_cache_key?: string; + prompt_cache_retention?: PromptCacheRetention; reasoning: { effort: "minimal" | "low" | "medium" | "high" | "xhigh"; summary: "detailed"; @@ -368,20 +374,29 @@ export type RequestBodyPreparer = ( frequency_penalty?: number, presence_penalty?: number, response_format?: OpenAIRequestBody["response_format"], - tools?: OpenAITool[], + tools?: OpenAIToolInput[], tool_choice?: ToolChoiceType, reasoning_effort?: "minimal" | "low" | "medium" | "high" | "xhigh", supportsReasoning?: boolean, isProd?: boolean, maxImageSizeMB?: number, - userPlan?: "free" | "pro" | null, + userPlan?: "free" | "pro" | "enterprise" | null, sensitive_word_check?: { status: "DISABLE" | "ENABLE" }, image_config?: { aspect_ratio?: string; image_size?: string; image_quality?: string; + n?: number; + seed?: number; }, -) => Promise; + effort?: "low" | "medium" | "high", + imageGenerations?: boolean, + webSearchTool?: WebSearchTool, + reasoning_max_tokens?: number, + useResponsesApi?: boolean, + prompt_cache_key?: string, + prompt_cache_retention?: PromptCacheRetention, +) => Promise; // Type guards export function isTextContent(content: MessageContent): content is TextContent {