8 changes: 8 additions & 0 deletions apps/gateway/src/chat/chat.ts
@@ -1032,6 +1032,8 @@ chat.openapi(completions, async (c) => {
model: modelInput,
response_format,
stream,
prompt_cache_key,
prompt_cache_retention,
tool_choice,
free_models_only,
onboarding,
@@ -3502,6 +3504,8 @@ chat.openapi(completions, async (c) => {
response_format,
reasoning_effort,
reasoning_max_tokens,
prompt_cache_key,
prompt_cache_retention,
};

if (stream) {
@@ -4088,6 +4092,8 @@ chat.openapi(completions, async (c) => {
webSearchTool,
reasoning_max_tokens,
useResponsesApi,
prompt_cache_key,
prompt_cache_retention,
);

if (forceImageStreamUpstream) {
@@ -4215,6 +4221,8 @@ chat.openapi(completions, async (c) => {
tool_choice,
reasoning_effort,
reasoning_max_tokens,
prompt_cache_key,
prompt_cache_retention,
effort,
webSearchTool,
image_config,
20 changes: 20 additions & 0 deletions apps/gateway/src/chat/schemas/completions.ts
@@ -139,6 +139,26 @@ export const completionsRequestSchema = z.object({
])
.optional(),
stream: z.boolean().optional().default(false),
prompt_cache_key: z
.string()
.nullable()
.optional()
.transform((val) => (val === null ? undefined : val))
.openapi({
description:
"OpenAI prompt caching key used to improve cache routing for requests with shared prompt prefixes.",
example: "tenant-123",
}),
prompt_cache_retention: z
.enum(["in_memory", "24h"])
.nullable()
.optional()
.transform((val) => (val === null ? undefined : val))
.openapi({
description:
"OpenAI prompt cache retention policy. OpenAI supports in_memory and 24h for eligible models.",
example: "24h",
}),
tools: z
.array(
z.union([
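For context, a request that exercises the two new schema fields might look like the sketch below; the gateway base URL, the API key environment variable, and the model name are illustrative assumptions, not values taken from this PR.

```ts
// Illustrative only: base URL, env var, and model are placeholders.
const res = await fetch("https://gateway.example.com/v1/chat/completions", {
	method: "POST",
	headers: {
		"Content-Type": "application/json",
		Authorization: `Bearer ${process.env.LLMGATEWAY_API_KEY}`,
	},
	body: JSON.stringify({
		model: "gpt-4.1",
		messages: [{ role: "user", content: "Hello!" }],
		// New optional fields; the schema normalizes null to undefined.
		prompt_cache_key: "tenant-123",
		prompt_cache_retention: "24h",
	}),
});
console.log((await res.json()).usage);
```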
5 changes: 5 additions & 0 deletions apps/gateway/src/chat/tools/resolve-provider-context.ts
@@ -19,6 +19,7 @@ import {
type ModelDefinition,
type OpenAIRequestBody,
type OpenAIToolInput,
type PromptCacheRetention,
type Provider,
type ProviderRequestBody,
providers,
@@ -83,6 +84,8 @@ export interface ProviderContextOptions {
tool_choice: ToolChoiceType | undefined;
reasoning_effort: "minimal" | "low" | "medium" | "high" | "xhigh" | undefined;
reasoning_max_tokens: number | undefined;
prompt_cache_key: string | undefined;
prompt_cache_retention: PromptCacheRetention | undefined;
effort: "low" | "medium" | "high" | undefined;
webSearchTool: WebSearchTool | undefined;
image_config:
@@ -447,6 +450,8 @@ export async function resolveProviderContext(
options.webSearchTool,
options.reasoning_max_tokens,
useResponsesApi,
options.prompt_cache_key,
options.prompt_cache_retention,
);

// Post-validation of max_tokens in request body
38 changes: 38 additions & 0 deletions apps/gateway/src/lib/costs.spec.ts
@@ -112,6 +112,44 @@ describe("calculateCosts", () => {
expect(result.estimatedCost).toBe(false); // Not estimated
});

it("does not add a separate cache write fee for OpenAI", async () => {
const withoutCacheWrite = await calculateCosts(
"gpt-4o",
"openai",
100,
50,
20,
);
const withCacheWrite = await calculateCosts(
"gpt-4o",
"openai",
100,
50,
20,
undefined,
null,
0,
undefined,
0,
null,
null,
undefined,
null,
null,
{
cacheWriteTokens: 30,
},
);

expect(withCacheWrite.inputCost).toBe(withoutCacheWrite.inputCost);
expect(withCacheWrite.cachedInputCost).toBe(
withoutCacheWrite.cachedInputCost,
);
expect(withCacheWrite.cacheWriteInputCost).toBe(0);
expect(withCacheWrite.totalCost).toBe(withoutCacheWrite.totalCost);
expect(withCacheWrite.cacheWriteTokens).toBe(30);
});

it("should calculate costs with cached tokens for Anthropic (first request - cache creation)", async () => {
// For Anthropic first request: 4 non-cached + 1659 cache creation = 1663 total tokens, 0 cache reads
const result = await calculateCosts(
6 changes: 6 additions & 0 deletions apps/gateway/src/responses/responses.ts
@@ -283,6 +283,12 @@ responses.post("/", async (c) => {
if (req.reasoning?.effort) {
chatRequest.reasoning_effort = req.reasoning.effort;
}
if (req.prompt_cache_key !== undefined) {
chatRequest.prompt_cache_key = req.prompt_cache_key;
}
if (req.prompt_cache_retention !== undefined) {
chatRequest.prompt_cache_retention = req.prompt_cache_retention;
}
if (response_format) {
chatRequest.response_format = response_format;
}
10 changes: 10 additions & 0 deletions apps/gateway/src/responses/schemas.ts
@@ -106,6 +106,16 @@ export const responsesRequestSchema = z.object({
instructions: z.string().optional(),
previous_response_id: z.string().optional(),
stream: z.boolean().optional().default(false),
prompt_cache_key: z
.string()
.nullable()
.optional()
.transform((val) => (val === null ? undefined : val)),
prompt_cache_retention: z
.enum(["in_memory", "24h"])
.nullable()
.optional()
.transform((val) => (val === null ? undefined : val)),
temperature: z
.number()
.nullable()
@@ -155,6 +155,7 @@ export interface ResponsesEchoRequest {
metadata?: Record<string, unknown>;
safety_identifier?: string;
prompt_cache_key?: string;
prompt_cache_retention?: "in_memory" | "24h";
}

/**
100 changes: 100 additions & 0 deletions packages/actions/src/prepare-request-body.spec.ts
@@ -35,6 +35,43 @@ async function prepareOpenAIImageRequest(imageConfig: {
);
}

async function prepareOpenAITextRequest(options: {
provider?: "openai" | "azure";
model?: string;
useResponsesApi?: boolean;
promptCacheKey?: string;
promptCacheRetention?: "in_memory" | "24h";
}) {
return await prepareRequestBody(
options.provider ?? "openai",
options.model ?? "gpt-5.5",
[{ role: "user", content: "Hello!" }],
false,
undefined,
undefined,
undefined,
undefined,
undefined,
undefined,
undefined,
undefined,
undefined,
false,
false,
20,
null,
undefined,
undefined,
undefined,
undefined,
undefined,
undefined,
options.useResponsesApi ?? false,
options.promptCacheKey,
options.promptCacheRetention,
);
}

describe("prepareRequestBody - Anthropic", () => {
test("should extract system messages to system field for caching", async () => {
const requestBody = (await prepareRequestBody(
@@ -238,6 +275,69 @@ describe("prepareRequestBody - OpenAI image generation", () => {
});
});

describe("prepareRequestBody - OpenAI prompt caching", () => {
test("should forward prompt cache controls to OpenAI chat completions", async () => {
const requestBody = (await prepareOpenAITextRequest({
promptCacheKey: "tenant-a",
promptCacheRetention: "24h",
})) as any;

expect(requestBody.prompt_cache_key).toBe("tenant-a");
expect(requestBody.prompt_cache_retention).toBe("24h");
});

test("should forward prompt cache controls to OpenAI Responses API", async () => {
const requestBody = (await prepareOpenAITextRequest({
useResponsesApi: true,
promptCacheKey: "tenant-a",
promptCacheRetention: "in_memory",
})) as any;

expect(requestBody.prompt_cache_key).toBe("tenant-a");
expect(requestBody.prompt_cache_retention).toBe("in_memory");
});

test("should not forward OpenAI prompt cache controls to Azure", async () => {
const requestBody = (await prepareOpenAITextRequest({
provider: "azure",
promptCacheKey: "tenant-a",
promptCacheRetention: "24h",
})) as any;

expect(requestBody.prompt_cache_key).toBeUndefined();
expect(requestBody.prompt_cache_retention).toBeUndefined();
});

test("should strip prompt_cache_retention=24h on models that don't support extended retention", async () => {
const requestBody = (await prepareOpenAITextRequest({
model: "gpt-4o",
promptCacheKey: "tenant-a",
promptCacheRetention: "24h",
})) as any;

expect(requestBody.prompt_cache_key).toBe("tenant-a");
expect(requestBody.prompt_cache_retention).toBeUndefined();
});

test("should still forward prompt_cache_retention=in_memory on models without 24h support", async () => {
const requestBody = (await prepareOpenAITextRequest({
model: "gpt-4o",
promptCacheRetention: "in_memory",
})) as any;

expect(requestBody.prompt_cache_retention).toBe("in_memory");
});

test("should forward prompt_cache_retention=24h on models that do support extended retention", async () => {
const requestBody = (await prepareOpenAITextRequest({
model: "gpt-4.1",
promptCacheRetention: "24h",
})) as any;

expect(requestBody.prompt_cache_retention).toBe("24h");
Comment on lines +280 to +337

🛠️ Refactor suggestion | 🟠 Major | ⚡ Quick win

Avoid `as any` in the new prompt-caching tests.

The new assertions use `as any` for fields that can be checked with proper unions/type guards, which weakens test type-safety unnecessarily.

As per coding guidelines for `**/*.{ts,tsx}`: "Never use any type or force cast with as any unless absolutely necessary."

🤖 Prompt for AI Agents
Verify each finding against current code. Fix only still-valid issues, skip the
rest with a brief reason, keep changes minimal, and validate.

In `@packages/actions/src/prepare-request-body.spec.ts` (around lines 280-337), the tests weaken type-safety by casting the result of `prepareOpenAITextRequest` to `any`. Remove the `as any` casts and assert the concrete shape with proper typing or runtime type guards (for example, define or reuse an interface for the expected OpenAI request body, or use `ReturnType<typeof prepareOpenAITextRequest>` and narrow it with a type guard), then check `requestBody.prompt_cache_key` and `requestBody.prompt_cache_retention` directly. Update the spec to import or declare the expected request-body type, or add minimal type guards around `requestBody`, so the assertions remain strongly typed without `as any`.
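A minimal sketch of one way the suggestion could be applied is below. The `PromptCacheFields` interface and `asJsonBody` helper are names introduced here for illustration (not part of the PR), and the `test`/`expect` globals are assumed to be the same ones the rest of the spec uses.

```ts
// Sketch: narrow the helper's result with a runtime guard instead of `as any`.
interface PromptCacheFields {
	prompt_cache_key?: string;
	prompt_cache_retention?: "in_memory" | "24h";
}

function asJsonBody(body: unknown): PromptCacheFields {
	// prepareRequestBody may also return FormData (image requests), so guard first.
	if (body instanceof FormData || typeof body !== "object" || body === null) {
		throw new Error("expected a JSON request body");
	}
	return body as PromptCacheFields;
}

test("forwards prompt cache controls without any-casts", async () => {
	const body = asJsonBody(
		await prepareOpenAITextRequest({
			promptCacheKey: "tenant-a",
			promptCacheRetention: "24h",
		}),
	);
	expect(body.prompt_cache_key).toBe("tenant-a");
	expect(body.prompt_cache_retention).toBe("24h");
});
```

A targeted cast to the existing `OpenAIRequestBody` type exported by `@llmgateway/models` would be another option and avoids the ad-hoc interface.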

});
});

describe("prepareRequestBody - Google AI Studio", () => {
test("should map gateway 0.5K image size to Google 512", async () => {
const requestBody = (await prepareRequestBody(
30 changes: 30 additions & 0 deletions packages/actions/src/prepare-request-body.ts
@@ -10,7 +10,9 @@ import {
type OpenAIRequestBody,
type OpenAIResponsesRequestBody,
type OpenAIToolInput,
type PromptCacheRetention,
type ProviderRequestBody,
supportsOpenAIExtendedPromptCache,
type ToolChoiceType,
type WebSearchTool,
} from "@llmgateway/models";
@@ -653,6 +655,8 @@ export async function prepareRequestBody(
webSearchTool?: WebSearchTool,
reasoning_max_tokens?: number,
useResponsesApi?: boolean,
prompt_cache_key?: string,
prompt_cache_retention?: PromptCacheRetention,
): Promise<ProviderRequestBody | FormData> {
// Handle OpenAI / Azure image generation models (e.g. gpt-image-2)
if (
@@ -1123,6 +1127,19 @@
},
};

if (usedProvider === "openai") {
if (prompt_cache_key !== undefined) {
responsesBody.prompt_cache_key = prompt_cache_key;
}
if (
prompt_cache_retention !== undefined &&
(prompt_cache_retention !== "24h" ||
supportsOpenAIExtendedPromptCache(usedModel))
) {
responsesBody.prompt_cache_retention = prompt_cache_retention;
}
}

// Add streaming support
if (stream) {
responsesBody.stream = true;
@@ -1194,6 +1211,19 @@
return responsesBody;
} else {
// Use regular chat completions format
if (usedProvider === "openai") {
if (prompt_cache_key !== undefined) {
requestBody.prompt_cache_key = prompt_cache_key;
}
if (
prompt_cache_retention !== undefined &&
(prompt_cache_retention !== "24h" ||
supportsOpenAIExtendedPromptCache(usedModel))
) {
requestBody.prompt_cache_retention = prompt_cache_retention;
}
}

if (stream) {
requestBody.stream_options = {
include_usage: true,
23 changes: 23 additions & 0 deletions packages/models/src/helpers.ts
@@ -68,3 +68,26 @@ export function getModelStreamingSupport(
const providerInfo = providers.find((p) => p.id === providerId);
return providerInfo?.streaming === true;
}

// OpenAI prompt_cache_retention="24h" eligibility per
// https://developers.openai.com/api/docs/guides/prompt-caching.
// gpt-5.5 and gpt-5.5-pro default to 24h and reject "in_memory"; the rest
// accept either. Models not on this list only support "in_memory" caching.
const OPENAI_EXTENDED_PROMPT_CACHE_MODELS = new Set<string>([
"gpt-5.5",
"gpt-5.5-pro",
"gpt-5.4",
"gpt-5.2",
"gpt-5.1-codex-max",
"gpt-5.1",
"gpt-5.1-codex",
"gpt-5.1-codex-mini",
"gpt-5.1-chat-latest",
"gpt-5",
"gpt-5-codex",
"gpt-4.1",
]);

export function supportsOpenAIExtendedPromptCache(modelName: string): boolean {
return OPENAI_EXTENDED_PROMPT_CACHE_MODELS.has(modelName);
}
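To show how the new helper is intended to be consumed, here is a small caller-side sketch mirroring the gating that `prepare-request-body.ts` performs in this PR; the `resolveRetention` wrapper itself is illustrative and not part of the change.

```ts
import {
	type PromptCacheRetention,
	supportsOpenAIExtendedPromptCache,
} from "@llmgateway/models";

// Forward "24h" only when the model is on the extended-retention list;
// otherwise drop it so the upstream request stays valid.
function resolveRetention(
	model: string,
	requested: PromptCacheRetention | undefined,
): PromptCacheRetention | undefined {
	if (requested === "24h" && !supportsOpenAIExtendedPromptCache(model)) {
		return undefined;
	}
	return requested;
}

resolveRetention("gpt-4o", "24h"); // undefined: gpt-4o is not on the 24h list
resolveRetention("gpt-4.1", "24h"); // "24h"
```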