8 changes: 8 additions & 0 deletions apps/gateway/src/chat/chat.ts
@@ -1032,6 +1032,8 @@ chat.openapi(completions, async (c) => {
model: modelInput,
response_format,
stream,
prompt_cache_key,
prompt_cache_retention,
tool_choice,
free_models_only,
onboarding,
@@ -3502,6 +3504,8 @@ chat.openapi(completions, async (c) => {
response_format,
reasoning_effort,
reasoning_max_tokens,
prompt_cache_key,
prompt_cache_retention,
};

if (stream) {
@@ -4088,6 +4092,8 @@ chat.openapi(completions, async (c) => {
webSearchTool,
reasoning_max_tokens,
useResponsesApi,
prompt_cache_key,
prompt_cache_retention,
);

if (forceImageStreamUpstream) {
@@ -4215,6 +4221,8 @@ chat.openapi(completions, async (c) => {
tool_choice,
reasoning_effort,
reasoning_max_tokens,
prompt_cache_key,
prompt_cache_retention,
effort,
webSearchTool,
image_config,
20 changes: 20 additions & 0 deletions apps/gateway/src/chat/schemas/completions.ts
@@ -139,6 +139,26 @@ export const completionsRequestSchema = z.object({
])
.optional(),
stream: z.boolean().optional().default(false),
prompt_cache_key: z
.string()
.nullable()
.optional()
.transform((val) => (val === null ? undefined : val))
.openapi({
description:
"OpenAI prompt caching key used to improve cache routing for requests with shared prompt prefixes.",
example: "tenant-123",
}),
prompt_cache_retention: z
.enum(["in_memory", "24h"])
.nullable()
.optional()
.transform((val) => (val === null ? undefined : val))
.openapi({
description:
"OpenAI prompt cache retention policy. OpenAI supports in_memory and 24h for eligible models.",
example: "24h",
}),
tools: z
.array(
z.union([
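For context, a request that exercises the two new schema fields might look like the sketch below; the gateway base URL, the API key environment variable, and the model name are illustrative assumptions, not values taken from this PR.

```ts
// Illustrative only: base URL, env var, and model are placeholders.
const res = await fetch("https://gateway.example.com/v1/chat/completions", {
	method: "POST",
	headers: {
		"Content-Type": "application/json",
		Authorization: `Bearer ${process.env.LLMGATEWAY_API_KEY}`,
	},
	body: JSON.stringify({
		model: "gpt-4.1",
		messages: [{ role: "user", content: "Hello!" }],
		// New optional fields; the schema normalizes null to undefined.
		prompt_cache_key: "tenant-123",
		prompt_cache_retention: "24h",
	}),
});
console.log((await res.json()).usage);
```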
5 changes: 5 additions & 0 deletions apps/gateway/src/chat/tools/resolve-provider-context.ts
@@ -19,6 +19,7 @@ import {
type ModelDefinition,
type OpenAIRequestBody,
type OpenAIToolInput,
type PromptCacheRetention,
type Provider,
type ProviderRequestBody,
providers,
@@ -83,6 +84,8 @@ export interface ProviderContextOptions {
tool_choice: ToolChoiceType | undefined;
reasoning_effort: "minimal" | "low" | "medium" | "high" | "xhigh" | undefined;
reasoning_max_tokens: number | undefined;
prompt_cache_key: string | undefined;
prompt_cache_retention: PromptCacheRetention | undefined;
effort: "low" | "medium" | "high" | undefined;
webSearchTool: WebSearchTool | undefined;
image_config:
@@ -447,6 +450,8 @@ export async function resolveProviderContext(
options.webSearchTool,
options.reasoning_max_tokens,
useResponsesApi,
options.prompt_cache_key,
options.prompt_cache_retention,
);

// Post-validation of max_tokens in request body
38 changes: 38 additions & 0 deletions apps/gateway/src/lib/costs.spec.ts
@@ -112,6 +112,44 @@ describe("calculateCosts", () => {
expect(result.estimatedCost).toBe(false); // Not estimated
});

it("does not add a separate cache write fee for OpenAI", async () => {
const withoutCacheWrite = await calculateCosts(
"gpt-4o",
"openai",
100,
50,
20,
);
const withCacheWrite = await calculateCosts(
"gpt-4o",
"openai",
100,
50,
20,
undefined,
null,
0,
undefined,
0,
null,
null,
undefined,
null,
null,
{
cacheWriteTokens: 30,
},
);

expect(withCacheWrite.inputCost).toBe(withoutCacheWrite.inputCost);
expect(withCacheWrite.cachedInputCost).toBe(
withoutCacheWrite.cachedInputCost,
);
expect(withCacheWrite.cacheWriteInputCost).toBe(0);
expect(withCacheWrite.totalCost).toBe(withoutCacheWrite.totalCost);
expect(withCacheWrite.cacheWriteTokens).toBe(30);
});

it("should calculate costs with cached tokens for Anthropic (first request - cache creation)", async () => {
// For Anthropic first request: 4 non-cached + 1659 cache creation = 1663 total tokens, 0 cache reads
const result = await calculateCosts(
6 changes: 6 additions & 0 deletions apps/gateway/src/responses/responses.ts
@@ -283,6 +283,12 @@ responses.post("/", async (c) => {
if (req.reasoning?.effort) {
chatRequest.reasoning_effort = req.reasoning.effort;
}
if (req.prompt_cache_key !== undefined) {
chatRequest.prompt_cache_key = req.prompt_cache_key;
}
if (req.prompt_cache_retention !== undefined) {
chatRequest.prompt_cache_retention = req.prompt_cache_retention;
}
if (response_format) {
chatRequest.response_format = response_format;
}
10 changes: 10 additions & 0 deletions apps/gateway/src/responses/schemas.ts
@@ -106,6 +106,16 @@ export const responsesRequestSchema = z.object({
instructions: z.string().optional(),
previous_response_id: z.string().optional(),
stream: z.boolean().optional().default(false),
prompt_cache_key: z
.string()
.nullable()
.optional()
.transform((val) => (val === null ? undefined : val)),
prompt_cache_retention: z
.enum(["in_memory", "24h"])
.nullable()
.optional()
.transform((val) => (val === null ? undefined : val)),
temperature: z
.number()
.nullable()
@@ -155,6 +155,7 @@ export interface ResponsesEchoRequest {
metadata?: Record<string, unknown>;
safety_identifier?: string;
prompt_cache_key?: string;
prompt_cache_retention?: "in_memory" | "24h";
}

/**
100 changes: 100 additions & 0 deletions packages/actions/src/prepare-request-body.spec.ts
@@ -35,6 +35,43 @@ async function prepareOpenAIImageRequest(imageConfig: {
);
}

async function prepareOpenAITextRequest(options: {
provider?: "openai" | "azure";
model?: string;
useResponsesApi?: boolean;
promptCacheKey?: string;
promptCacheRetention?: "in_memory" | "24h";
}) {
return await prepareRequestBody(
options.provider ?? "openai",
options.model ?? "gpt-5.5",
[{ role: "user", content: "Hello!" }],
false,
undefined,
undefined,
undefined,
undefined,
undefined,
undefined,
undefined,
undefined,
undefined,
false,
false,
20,
null,
undefined,
undefined,
undefined,
undefined,
undefined,
undefined,
options.useResponsesApi ?? false,
options.promptCacheKey,
options.promptCacheRetention,
);
}

describe("prepareRequestBody - Anthropic", () => {
test("should extract system messages to system field for caching", async () => {
const requestBody = (await prepareRequestBody(
@@ -238,6 +275,69 @@ describe("prepareRequestBody - OpenAI image generation", () => {
});
});

describe("prepareRequestBody - OpenAI prompt caching", () => {
test("should forward prompt cache controls to OpenAI chat completions", async () => {
const requestBody = (await prepareOpenAITextRequest({
promptCacheKey: "tenant-a",
promptCacheRetention: "24h",
})) as any;

expect(requestBody.prompt_cache_key).toBe("tenant-a");
expect(requestBody.prompt_cache_retention).toBe("24h");
});

test("should forward prompt cache controls to OpenAI Responses API", async () => {
const requestBody = (await prepareOpenAITextRequest({
useResponsesApi: true,
promptCacheKey: "tenant-a",
promptCacheRetention: "in_memory",
})) as any;

expect(requestBody.prompt_cache_key).toBe("tenant-a");
expect(requestBody.prompt_cache_retention).toBe("in_memory");
});

test("should not forward OpenAI prompt cache controls to Azure", async () => {
const requestBody = (await prepareOpenAITextRequest({
provider: "azure",
promptCacheKey: "tenant-a",
promptCacheRetention: "24h",
})) as any;

expect(requestBody.prompt_cache_key).toBeUndefined();
expect(requestBody.prompt_cache_retention).toBeUndefined();
});

test("should strip prompt_cache_retention=24h on models that don't support extended retention", async () => {
const requestBody = (await prepareOpenAITextRequest({
model: "gpt-4o",
promptCacheKey: "tenant-a",
promptCacheRetention: "24h",
})) as any;

expect(requestBody.prompt_cache_key).toBe("tenant-a");
expect(requestBody.prompt_cache_retention).toBeUndefined();
});

test("should still forward prompt_cache_retention=in_memory on models without 24h support", async () => {
const requestBody = (await prepareOpenAITextRequest({
model: "gpt-4o",
promptCacheRetention: "in_memory",
})) as any;

expect(requestBody.prompt_cache_retention).toBe("in_memory");
});

test("should forward prompt_cache_retention=24h on models that do support extended retention", async () => {
const requestBody = (await prepareOpenAITextRequest({
model: "gpt-4.1",
promptCacheRetention: "24h",
})) as any;

expect(requestBody.prompt_cache_retention).toBe("24h");
Comment on lines +280 to +337

🛠️ Refactor suggestion | 🟠 Major | ⚡ Quick win

Avoid `as any` in the new prompt-caching tests.

The new assertions use `as any` for fields that can be checked with proper unions/type guards, which weakens test type-safety unnecessarily.

As per coding guidelines for `**/*.{ts,tsx}`: "Never use any type or force cast with as any unless absolutely necessary."

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@packages/actions/src/prepare-request-body.spec.ts` (around lines 280-337), the tests weaken type-safety by casting the result of `prepareOpenAITextRequest` to `any`. Remove the `as any` casts and assert the concrete shape with proper typing or runtime type guards (for example, define or reuse an interface for the expected OpenAI request body, or use `ReturnType<typeof prepareOpenAITextRequest>` and narrow it with a type guard), then check `requestBody.prompt_cache_key` and `requestBody.prompt_cache_retention` directly. Update the spec to import or declare the expected request-body type, or add minimal type guards around `requestBody`, so the assertions remain strongly typed without `as any`.
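A minimal sketch of one way the suggestion could be applied is below. The `PromptCacheFields` interface and `asJsonBody` helper are names introduced here for illustration (not part of the PR), and the `test`/`expect` globals are assumed to be the same ones the rest of the spec uses.

```ts
// Sketch: narrow the helper's result with a runtime guard instead of `as any`.
interface PromptCacheFields {
	prompt_cache_key?: string;
	prompt_cache_retention?: "in_memory" | "24h";
}

function asJsonBody(body: unknown): PromptCacheFields {
	// prepareRequestBody may also return FormData (image requests), so guard first.
	if (body instanceof FormData || typeof body !== "object" || body === null) {
		throw new Error("expected a JSON request body");
	}
	return body as PromptCacheFields;
}

test("forwards prompt cache controls without any-casts", async () => {
	const body = asJsonBody(
		await prepareOpenAITextRequest({
			promptCacheKey: "tenant-a",
			promptCacheRetention: "24h",
		}),
	);
	expect(body.prompt_cache_key).toBe("tenant-a");
	expect(body.prompt_cache_retention).toBe("24h");
});
```

A targeted cast to the existing `OpenAIRequestBody` type exported by `@llmgateway/models` would be another option and avoids the ad-hoc interface.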

});
});

describe("prepareRequestBody - Google AI Studio", () => {
test("should map gateway 0.5K image size to Google 512", async () => {
const requestBody = (await prepareRequestBody(
30 changes: 30 additions & 0 deletions packages/actions/src/prepare-request-body.ts
@@ -10,7 +10,9 @@ import {
type OpenAIRequestBody,
type OpenAIResponsesRequestBody,
type OpenAIToolInput,
type PromptCacheRetention,
type ProviderRequestBody,
supportsOpenAIExtendedPromptCache,
type ToolChoiceType,
type WebSearchTool,
} from "@llmgateway/models";
@@ -653,6 +655,8 @@ export async function prepareRequestBody(
webSearchTool?: WebSearchTool,
reasoning_max_tokens?: number,
useResponsesApi?: boolean,
prompt_cache_key?: string,
prompt_cache_retention?: PromptCacheRetention,
): Promise<ProviderRequestBody | FormData> {
// Handle OpenAI / Azure image generation models (e.g. gpt-image-2)
if (
@@ -1123,6 +1127,19 @@
},
};

if (usedProvider === "openai") {
if (prompt_cache_key !== undefined) {
responsesBody.prompt_cache_key = prompt_cache_key;
}
if (
prompt_cache_retention !== undefined &&
(prompt_cache_retention !== "24h" ||
supportsOpenAIExtendedPromptCache(usedModel))
) {
responsesBody.prompt_cache_retention = prompt_cache_retention;
}
}

// Add streaming support
if (stream) {
responsesBody.stream = true;
@@ -1194,6 +1211,19 @@
return responsesBody;
} else {
// Use regular chat completions format
if (usedProvider === "openai") {
if (prompt_cache_key !== undefined) {
requestBody.prompt_cache_key = prompt_cache_key;
}
if (
prompt_cache_retention !== undefined &&
(prompt_cache_retention !== "24h" ||
supportsOpenAIExtendedPromptCache(usedModel))
) {
requestBody.prompt_cache_retention = prompt_cache_retention;
}
}

if (stream) {
requestBody.stream_options = {
include_usage: true,
23 changes: 23 additions & 0 deletions packages/models/src/helpers.ts
@@ -68,3 +68,26 @@ export function getModelStreamingSupport(
const providerInfo = providers.find((p) => p.id === providerId);
return providerInfo?.streaming === true;
}

// OpenAI prompt_cache_retention="24h" eligibility per
// https://developers.openai.com/api/docs/guides/prompt-caching.
// gpt-5.5 and gpt-5.5-pro default to 24h and reject "in_memory"; the rest
// accept either. Models not on this list only support "in_memory" caching.
const OPENAI_EXTENDED_PROMPT_CACHE_MODELS = new Set<string>([
"gpt-5.5",
"gpt-5.5-pro",
"gpt-5.4",
"gpt-5.2",
"gpt-5.1-codex-max",
"gpt-5.1",
"gpt-5.1-codex",
"gpt-5.1-codex-mini",
"gpt-5.1-chat-latest",
"gpt-5",
"gpt-5-codex",
"gpt-4.1",
]);

export function supportsOpenAIExtendedPromptCache(modelName: string): boolean {
return OPENAI_EXTENDED_PROMPT_CACHE_MODELS.has(modelName);
}
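To show how the new helper is intended to be consumed, here is a small caller-side sketch mirroring the gating that `prepare-request-body.ts` performs in this PR; the `resolveRetention` wrapper itself is illustrative and not part of the change.

```ts
import {
	type PromptCacheRetention,
	supportsOpenAIExtendedPromptCache,
} from "@llmgateway/models";

// Forward "24h" only when the model is on the extended-retention list;
// otherwise drop it so the upstream request stays valid.
function resolveRetention(
	model: string,
	requested: PromptCacheRetention | undefined,
): PromptCacheRetention | undefined {
	if (requested === "24h" && !supportsOpenAIExtendedPromptCache(model)) {
		return undefined;
	}
	return requested;
}

resolveRetention("gpt-4o", "24h"); // undefined: gpt-4o is not on the 24h list
resolveRetention("gpt-4.1", "24h"); // "24h"
```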