Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions apps/gateway/src/chat/chat.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1032,6 +1032,8 @@ chat.openapi(completions, async (c) => {
model: modelInput,
response_format,
stream,
prompt_cache_key,
prompt_cache_retention,
tool_choice,
free_models_only,
onboarding,
Expand Down Expand Up @@ -3502,6 +3504,8 @@ chat.openapi(completions, async (c) => {
response_format,
reasoning_effort,
reasoning_max_tokens,
prompt_cache_key,
prompt_cache_retention,
};

if (stream) {
Expand Down Expand Up @@ -4088,6 +4092,8 @@ chat.openapi(completions, async (c) => {
webSearchTool,
reasoning_max_tokens,
useResponsesApi,
prompt_cache_key,
prompt_cache_retention,
);

if (forceImageStreamUpstream) {
Expand Down Expand Up @@ -4215,6 +4221,8 @@ chat.openapi(completions, async (c) => {
tool_choice,
reasoning_effort,
reasoning_max_tokens,
prompt_cache_key,
prompt_cache_retention,
effort,
webSearchTool,
image_config,
Expand Down
20 changes: 20 additions & 0 deletions apps/gateway/src/chat/schemas/completions.ts
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,26 @@ export const completionsRequestSchema = z.object({
])
.optional(),
stream: z.boolean().optional().default(false),
// Optional prompt cache key supplied by the caller. The field may be a
// string, an explicit null, or omitted entirely.
prompt_cache_key: z
.string()
.nullable()
.optional()
// Collapse an explicit null into undefined so the parsed value is only ever
// a string or undefined.
.transform((val) => (val === null ? undefined : val))
.openapi({
description:
"OpenAI prompt caching key used to improve cache routing for requests with shared prompt prefixes.",
example: "tenant-123",
}),
// Optional prompt cache retention policy. Only the literals "in_memory" and
// "24h" pass validation; the field may also be null or omitted.
prompt_cache_retention: z
.enum(["in_memory", "24h"])
.nullable()
.optional()
// Collapse an explicit null into undefined so the parsed value is only ever
// one of the enum literals or undefined.
.transform((val) => (val === null ? undefined : val))
.openapi({
description:
"OpenAI prompt cache retention policy. OpenAI supports in_memory and 24h for eligible models.",
example: "24h",
}),
tools: z
.array(
z.union([
Expand Down
5 changes: 5 additions & 0 deletions apps/gateway/src/chat/tools/resolve-provider-context.ts
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ import {
type ModelDefinition,
type OpenAIRequestBody,
type OpenAIToolInput,
type PromptCacheRetention,
type Provider,
type ProviderRequestBody,
providers,
Expand Down Expand Up @@ -83,6 +84,8 @@ export interface ProviderContextOptions {
tool_choice: ToolChoiceType | undefined;
reasoning_effort: "minimal" | "low" | "medium" | "high" | "xhigh" | undefined;
reasoning_max_tokens: number | undefined;
prompt_cache_key: string | undefined;
prompt_cache_retention: PromptCacheRetention | undefined;
effort: "low" | "medium" | "high" | undefined;
webSearchTool: WebSearchTool | undefined;
image_config:
Expand Down Expand Up @@ -447,6 +450,8 @@ export async function resolveProviderContext(
options.webSearchTool,
options.reasoning_max_tokens,
useResponsesApi,
options.prompt_cache_key,
options.prompt_cache_retention,
);

// Post-validation of max_tokens in request body
Expand Down
38 changes: 38 additions & 0 deletions apps/gateway/src/lib/costs.spec.ts
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,44 @@ describe("calculateCosts", () => {
expect(result.estimatedCost).toBe(false); // Not estimated
});

it("does not add a separate cache write fee for OpenAI", async () => {
  // Baseline: identical model, provider and token counts, no cache-write info.
  const baseline = await calculateCosts("gpt-4o", "openai", 100, 50, 20);

  // Same request again, with the intermediate positional arguments spelled out
  // so the trailing options object can report 30 cache-write tokens.
  const cacheWriteOptions = { cacheWriteTokens: 30 };
  const withWrites = await calculateCosts(
    "gpt-4o",
    "openai",
    100,
    50,
    20,
    undefined,
    null,
    0,
    undefined,
    0,
    null,
    null,
    undefined,
    null,
    null,
    cacheWriteOptions,
  );

  // Reporting cache writes must leave every OpenAI cost figure untouched...
  expect(withWrites.inputCost).toBe(baseline.inputCost);
  expect(withWrites.cachedInputCost).toBe(baseline.cachedInputCost);
  expect(withWrites.cacheWriteInputCost).toBe(0);
  expect(withWrites.totalCost).toBe(baseline.totalCost);
  // ...while the token count itself is still passed through in the result.
  expect(withWrites.cacheWriteTokens).toBe(30);
});

it("should calculate costs with cached tokens for Anthropic (first request - cache creation)", async () => {
// For Anthropic first request: 4 non-cached + 1659 cache creation = 1663 total tokens, 0 cache reads
const result = await calculateCosts(
Expand Down
257 changes: 257 additions & 0 deletions apps/gateway/src/native-openai-cache.e2e.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
import "dotenv/config";
import { beforeAll, beforeEach, describe, expect, test } from "vitest";

import {
beforeAllHook,
beforeEachHook,
generateTestRequestId,
getConcurrentTestOptions,
getTestOptions,
logMode,
} from "@/chat-helpers.e2e.js";

import { app } from "./app.js";

/**
 * Builds a system prompt long enough to be eligible for OpenAI prompt caching.
 *
 * OpenAI prompt caching kicks in at >= 1024 prompt tokens. A single filler
 * sentence is repeated 50 times (roughly 6.5k characters in total), which
 * easily clears that bar regardless of tokenizer.
 *
 * @returns The same deterministic prompt text on every call.
 */
function buildLongSystemPrompt(): string {
  const opening = "You are a helpful AI assistant. ";
  const fillerSentence =
    "This is detailed background context that should be cached for optimal request performance and consistent latency across many calls. ";
  const closing = "Please answer succinctly.";

  return [opening, fillerSentence.repeat(50), closing].join("");
}

/**
 * Calls `send` repeatedly until a response reports a prompt-cache read.
 *
 * A response counts as a cache read when
 * `usage.prompt_tokens_details.cached_tokens` (or, failing that,
 * `usage.input_tokens_details.cached_tokens`) is greater than zero.
 *
 * @param send - Issues one request and resolves with its HTTP status and
 *   parsed JSON body.
 * @param maxAttempts - Upper bound on the number of `send` calls. If it is
 *   less than 1, `send` is never called and the `{ status: 0, json: null }`
 *   placeholder is returned.
 * @returns The last response received plus the number of attempts made. The
 *   function returns early on the first non-200 response or the first response
 *   with cached tokens; otherwise it returns the final response after
 *   `maxAttempts` tries.
 */
async function sendUntilCacheRead(
  send: () => Promise<{ status: number; json: any }>,
  maxAttempts = 4,
): Promise<{ status: number; json: any; attempts: number }> {
  let last: { status: number; json: any } = { status: 0, json: null };

  for (let attempt = 1; attempt <= maxAttempts; attempt++) {
    last = await send();

    // A failed request is handed straight back to the caller; retrying here
    // would hide the real status behind a later response.
    if (last.status !== 200) {
      return { ...last, attempts: attempt };
    }

    const usage = last.json?.usage;
    const cached =
      usage?.prompt_tokens_details?.cached_tokens ??
      usage?.input_tokens_details?.cached_tokens ??
      0;
    if (cached > 0) {
      return { ...last, attempts: attempt };
    }

    // Linearly growing pause (500ms, 1000ms, ...) between attempts; there is
    // no pause after the final attempt.
    if (attempt < maxAttempts) {
      await new Promise((resolve) => setTimeout(resolve, 500 * attempt));
    }
  }

  return { ...last, attempts: maxAttempts };
}

// True when an OpenAI API key is present in the environment; the live tests
// below are registered with test.skip when it is not.
const hasOpenAIKey = Boolean(process.env.LLM_OPENAI_API_KEY);
Comment thread
coderabbitai[bot] marked this conversation as resolved.
Outdated

// This suite uses getTestOptions() rather than getConcurrentTestOptions():
// the cases below hit the live OpenAI API and assert on prompt-cache state, and
// review feedback flagged concurrent execution of this e2e suite as a source of
// flaky cache/rate-limit behavior (and as contrary to the repo's e2e policy).
// NOTE(review): `getConcurrentTestOptions` is no longer referenced by this
// suite — drop it from the import list if nothing else in the file uses it.
describe("e2e openai prompt cache", getTestOptions(), () => {
  beforeAll(beforeAllHook);
  beforeEach(beforeEachHook);

  // Live cases are registered as skipped when no OpenAI key is configured.
  const liveTest = hasOpenAIKey ? test : test.skip;

  /**
   * POSTs `body` as JSON to a gateway route through the in-process app.
   *
   * @param path - Route to call, e.g. "/v1/chat/completions".
   * @param body - Request payload; serialized with JSON.stringify.
   * @returns The generated request id, the HTTP status, and the parsed JSON body.
   */
  async function postJson(path: string, body: unknown) {
    const requestId = generateTestRequestId();
    const res = await app.request(path, {
      method: "POST",
      headers: {
        "Content-Type": "application/json",
        "x-request-id": requestId,
        Authorization: `Bearer real-token`,
      },
      body: JSON.stringify(body),
    });
    const json = await res.json();
    return { requestId, status: res.status, json };
  }

  // Trivial always-passing case; presumably keeps the suite non-empty when
  // every live test is skipped.
  test("empty", () => {
    expect(true).toBe(true);
  });

  // /v1/chat/completions: priming + warm call should report cached_tokens
  // under prompt_tokens_details. Also confirms prompt_cache_key is accepted
  // (forwarded as a routing hint, not echoed in the chat response).
  liveTest(
    "chat-completions surfaces cached_tokens for openai",
    getTestOptions(),
    async () => {
      const longText = buildLongSystemPrompt();
      const body = {
        model: "openai/gpt-4.1",
        prompt_cache_key: "e2e-openai-cache-cc",
        messages: [
          { role: "system", content: longText },
          { role: "user", content: "Just reply OK." },
        ],
      };

      const send = async () => {
        const { requestId, status, json } = await postJson(
          "/v1/chat/completions",
          body,
        );
        if (logMode) {
          console.log(
            "openai chat cache",
            requestId,
            "status",
            status,
            "usage",
            JSON.stringify(json.usage),
          );
        }
        return { status, json };
      };

      // First call primes the cache; only basic usage reporting is asserted.
      const first = await send();
      expect(first.status).toBe(200);
      expect(first.json.usage).toBeDefined();
      expect(typeof first.json.usage.prompt_tokens).toBe("number");

      // Follow-up calls are retried until one reports a cache read.
      const second = await sendUntilCacheRead(send);
      expect(second.status).toBe(200);
      expect(second.json.usage.prompt_tokens_details).toBeDefined();
      expect(
        second.json.usage.prompt_tokens_details.cached_tokens,
        `expected cached_tokens > 0 after ${second.attempts} attempts`,
      ).toBeGreaterThan(0);
    },
  );

  // /v1/responses: same surface but on the Responses API. Verifies the
  // schema accepts prompt_cache_key/prompt_cache_retention and that
  // input_tokens_details.cached_tokens is surfaced.
  liveTest(
    "responses-api surfaces cached_tokens for openai",
    getTestOptions(),
    async () => {
      const longText = buildLongSystemPrompt();
      const body = {
        model: "openai/gpt-4.1",
        prompt_cache_key: "e2e-openai-cache-resp",
        prompt_cache_retention: "in_memory" as const,
        input: [
          { role: "system" as const, content: longText },
          { role: "user" as const, content: "Just reply OK." },
        ],
      };

      const send = async () => {
        const { requestId, status, json } = await postJson(
          "/v1/responses",
          body,
        );
        if (logMode) {
          console.log(
            "openai responses cache",
            requestId,
            "status",
            status,
            "usage",
            JSON.stringify(json.usage),
          );
        }
        return { status, json };
      };

      const first = await send();
      expect(first.status).toBe(200);

      const second = await sendUntilCacheRead(send);
      expect(second.status).toBe(200);
      // Accept either usage shape: Responses-style first, chat-style second.
      const cached =
        second.json.usage?.input_tokens_details?.cached_tokens ??
        second.json.usage?.prompt_tokens_details?.cached_tokens ??
        0;
      expect(
        cached,
        `expected cached_tokens > 0 after ${second.attempts} attempts`,
      ).toBeGreaterThan(0);
    },
  );

  // Extended retention is only valid for the docs-listed models. Sending
  // "24h" on gpt-4o (which is in_memory-only) should NOT cause OpenAI to
  // 400 — the gateway strips it. Verifies the request still succeeds.
  liveTest(
    "chat-completions strips prompt_cache_retention=24h on unsupported model",
    getTestOptions(),
    async () => {
      const body = {
        model: "openai/gpt-4o",
        prompt_cache_retention: "24h" as const,
        messages: [{ role: "user", content: "Reply OK." }],
      };

      const { requestId, status, json } = await postJson(
        "/v1/chat/completions",
        body,
      );
      if (logMode) {
        console.log(
          "openai 24h-strip",
          requestId,
          "status",
          status,
          "body",
          JSON.stringify(json).slice(0, 200),
        );
      }
      expect(status).toBe(200);
    },
  );

  // 24h retention should round-trip on an eligible model (gpt-4.1).
  // Successful 200 confirms upstream acceptance and gateway forwarding.
  liveTest(
    "chat-completions forwards prompt_cache_retention=24h on supported model",
    getTestOptions(),
    async () => {
      const longText = buildLongSystemPrompt();
      const body = {
        model: "openai/gpt-4.1",
        prompt_cache_retention: "24h" as const,
        prompt_cache_key: "e2e-openai-cache-24h",
        messages: [
          { role: "system", content: longText },
          { role: "user", content: "Just reply OK." },
        ],
      };

      const { requestId, status, json } = await postJson(
        "/v1/chat/completions",
        body,
      );
      if (logMode) {
        console.log(
          "openai 24h-forward",
          requestId,
          "status",
          status,
          "usage",
          JSON.stringify(json.usage),
        );
      }
      expect(status).toBe(200);
      expect(json.usage).toBeDefined();
    },
  );
});
Loading
Loading