diff --git a/apps/gateway/src/chat/tools/extract-token-usage.spec.ts b/apps/gateway/src/chat/tools/extract-token-usage.spec.ts
index 7883c6009..dd7d81cad 100644
--- a/apps/gateway/src/chat/tools/extract-token-usage.spec.ts
+++ b/apps/gateway/src/chat/tools/extract-token-usage.spec.ts
@@ -26,6 +26,28 @@ describe("extractTokenUsage", () => {
 		expect(result.totalTokens).toBe(350);
 	});
 
+	it("extracts cache creation tokens from cacheDetails by TTL", () => {
+		const data = {
+			usage: {
+				inputTokens: 100,
+				cacheReadInputTokens: 0,
+				cacheWriteInputTokens: 1000,
+				cacheDetails: [
+					{ ttl: "1h", inputTokens: 700 },
+					{ ttl: "5m", inputTokens: 300 },
+				],
+				outputTokens: 200,
+				totalTokens: 1300,
+			},
+		};
+
+		const result = extractTokenUsage(data, "aws-bedrock");
+
+		expect(result.cacheCreationTokens).toBe(1000);
+		expect(result.cacheCreation5mTokens).toBe(300);
+		expect(result.cacheCreation1hTokens).toBe(700);
+	});
+
 	it("returns cachedTokens with correct value when cacheReadInputTokens > 0", () => {
 		const data = {
 			usage: {
diff --git a/apps/gateway/src/chat/tools/extract-token-usage.ts b/apps/gateway/src/chat/tools/extract-token-usage.ts
index 4b5ec12d8..ffd79875b 100644
--- a/apps/gateway/src/chat/tools/extract-token-usage.ts
+++ b/apps/gateway/src/chat/tools/extract-token-usage.ts
@@ -31,6 +31,31 @@ export function adjustGoogleCandidateTokens(
 	return candidatesTokenCount;
 }
 
+export function extractBedrockCacheCreationDetails(usage: any): {
+	cacheCreation5mTokens: number | null;
+	cacheCreation1hTokens: number | null;
+} {
+	let fiveMinuteTokens = 0;
+	let oneHourTokens = 0;
+
+	const cacheDetails = Array.isArray(usage?.cacheDetails)
+		? usage.cacheDetails
+		: [];
+	for (const detail of cacheDetails) {
+		const inputTokens = detail?.inputTokens ?? 0;
+		if (detail?.ttl === "1h") {
+			oneHourTokens += inputTokens;
+		} else if (detail?.ttl === "5m") {
+			fiveMinuteTokens += inputTokens;
+		}
+	}
+
+	return {
+		cacheCreation5mTokens: fiveMinuteTokens > 0 ? fiveMinuteTokens : null,
+		cacheCreation1hTokens: oneHourTokens > 0 ? oneHourTokens : null,
+	};
+}
+
 /**
  * Extracts token usage information from streaming data based on provider format
  */
@@ -106,6 +131,7 @@ export function extractTokenUsage(
 			const inputTokens = data.usage.inputTokens ?? 0;
 			const cacheReadTokens = data.usage.cacheReadInputTokens ?? 0;
 			const cacheWriteTokens = data.usage.cacheWriteInputTokens ?? 0;
+			const cacheDetails = extractBedrockCacheCreationDetails(data.usage);
 
 			// Total prompt tokens = regular input + cache read + cache write
 			promptTokens = inputTokens + cacheReadTokens + cacheWriteTokens;
@@ -113,6 +139,8 @@
 			// Cached tokens are the tokens read from cache (discount applies to these)
 			cachedTokens = cacheReadTokens;
 			cacheCreationTokens = cacheWriteTokens;
+			cacheCreation5mTokens = cacheDetails.cacheCreation5mTokens;
+			cacheCreation1hTokens = cacheDetails.cacheCreation1hTokens;
 			totalTokens = data.usage.totalTokens ?? null;
 		}
 		break;
diff --git a/apps/gateway/src/chat/tools/parse-provider-response.spec.ts b/apps/gateway/src/chat/tools/parse-provider-response.spec.ts
index 80cdbef83..68edda9e6 100644
--- a/apps/gateway/src/chat/tools/parse-provider-response.spec.ts
+++ b/apps/gateway/src/chat/tools/parse-provider-response.spec.ts
@@ -84,6 +84,39 @@ describe("parseProviderResponse", () => {
 		expect(result.promptTokens).toBe(150); // 100 + 0 + 50
 	});
 
+	it("extracts cache creation tokens from cacheDetails by TTL", () => {
+		const json = {
+			output: {
+				message: {
+					content: [{ text: "Hello" }],
+					role: "assistant",
+				},
+			},
+			stopReason: "end_turn",
+			usage: {
+				inputTokens: 100,
+				cacheReadInputTokens: 0,
+				cacheWriteInputTokens: 1000,
+				cacheDetails: [
+					{ ttl: "1h", inputTokens: 700 },
+					{ ttl: "5m", inputTokens: 300 },
+				],
+				outputTokens: 200,
+				totalTokens: 1300,
+			},
+		};
+
+		const result = parseProviderResponse(
+			"aws-bedrock",
+			"anthropic.claude-sonnet-4-5-20250929-v1:0",
+			json,
+		);
+
+		expect(result.cacheCreationTokens).toBe(1000);
+		expect(result.cacheCreation5mTokens).toBe(300);
+		expect(result.cacheCreation1hTokens).toBe(700);
+	});
+
 	it("returns cachedTokens with correct value when cacheReadInputTokens > 0", () => {
 		const json = {
 			output: {
diff --git a/apps/gateway/src/chat/tools/parse-provider-response.ts b/apps/gateway/src/chat/tools/parse-provider-response.ts
index 5b7793f05..2303d6f9a 100644
--- a/apps/gateway/src/chat/tools/parse-provider-response.ts
+++ b/apps/gateway/src/chat/tools/parse-provider-response.ts
@@ -2,7 +2,10 @@ import { redisClient } from "@llmgateway/cache";
 import { logger } from "@llmgateway/logger";
 
 import { estimateTokens } from "./estimate-tokens.js";
-import { adjustGoogleCandidateTokens } from "./extract-token-usage.js";
+import {
+	adjustGoogleCandidateTokens,
+	extractBedrockCacheCreationDetails,
+} from "./extract-token-usage.js";
 import {
 	extractReasoningDetailsText,
 	splitReasoningFromTaggedContent,
@@ -104,6 +107,7 @@ export function parseProviderResponse(
 		const inputTokens = json.usage.inputTokens ?? 0;
 		const cacheReadTokens = json.usage.cacheReadInputTokens ?? 0;
 		const cacheWriteTokens = json.usage.cacheWriteInputTokens ?? 0;
+		const cacheDetails = extractBedrockCacheCreationDetails(json.usage);
 
 		// Total prompt tokens = regular input + cache read + cache write
 		promptTokens = inputTokens + cacheReadTokens + cacheWriteTokens;
@@ -112,6 +116,8 @@ export function parseProviderResponse(
 		// Cached tokens are the tokens read from cache (discount applies to these)
 		cachedTokens = cacheReadTokens;
 		cacheCreationTokens = cacheWriteTokens;
+		cacheCreation5mTokens = cacheDetails.cacheCreation5mTokens;
+		cacheCreation1hTokens = cacheDetails.cacheCreation1hTokens;
 	}
 
 	// Extract tool calls if present
diff --git a/apps/gateway/src/chat/tools/transform-streaming-to-openai.spec.ts b/apps/gateway/src/chat/tools/transform-streaming-to-openai.spec.ts
index 9805ab18a..d65da50ab 100644
--- a/apps/gateway/src/chat/tools/transform-streaming-to-openai.spec.ts
+++ b/apps/gateway/src/chat/tools/transform-streaming-to-openai.spec.ts
@@ -158,6 +158,50 @@ describe("transformStreamingToOpenai", () => {
 		expect(warn).not.toHaveBeenCalled();
 	});
 
+	it("maps AWS Bedrock metadata cache creation details", () => {
+		warn.mockClear();
+
+		const result = transformStreamingToOpenai(
+			"aws-bedrock",
+			"anthropic.claude-sonnet-4-5-20250929-v1:0",
+			{
+				__aws_event_type: "metadata",
+				usage: {
+					inputTokens: 10,
+					cacheReadInputTokens: 0,
+					cacheWriteInputTokens: 1000,
+					cacheDetails: [
+						{ ttl: "1h", inputTokens: 700 },
+						{ ttl: "5m", inputTokens: 300 },
+					],
+					outputTokens: 1,
+					totalTokens: 1011,
+				},
+			},
+			[],
+		);
+
+		expect(result).toMatchObject({
+			object: "chat.completion.chunk",
+			model: "anthropic.claude-sonnet-4-5-20250929-v1:0",
+			usage: {
+				prompt_tokens: 1010,
+				completion_tokens: 1,
+				total_tokens: 1011,
+				prompt_tokens_details: {
+					cached_tokens: 0,
+					cache_write_tokens: 1000,
+					cache_creation_tokens: 1000,
+					cache_creation: {
+						ephemeral_5m_input_tokens: 300,
+						ephemeral_1h_input_tokens: 700,
+					},
+				},
+			},
+		});
+		expect(warn).not.toHaveBeenCalled();
+	});
+
 	it("treats non-text AWS Bedrock contentBlockDelta members as handled", () => {
 		warn.mockClear();
 
diff --git a/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts b/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts
index 231b2b2a6..13ccc5d21 100644
--- a/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts
+++ b/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts
@@ -3,7 +3,10 @@ import { logger } from "@llmgateway/logger";
 
 import { calculatePromptTokensFromMessages } from "./calculate-prompt-tokens.js";
 import { extractImages } from "./extract-images.js";
-import { adjustGoogleCandidateTokens } from "./extract-token-usage.js";
+import {
+	adjustGoogleCandidateTokens,
+	extractBedrockCacheCreationDetails,
+} from "./extract-token-usage.js";
 import { mapFinishReasonToOpenai } from "./map-finish-reason-to-openai.js";
 import { transformOpenaiStreaming } from "./transform-openai-streaming.js";
 
@@ -1224,7 +1227,11 @@ export function transformStreamingToOpenai(
 			const inputTokens = data.usage.inputTokens ?? 0;
 			const cacheReadTokens = data.usage.cacheReadInputTokens ?? 0;
 			const cacheWriteTokens = data.usage.cacheWriteInputTokens ?? 0;
+			const cacheDetails = extractBedrockCacheCreationDetails(data.usage);
 			const promptTokens = inputTokens + cacheReadTokens + cacheWriteTokens;
+			const hasCacheCreationDetails =
+				cacheDetails.cacheCreation5mTokens !== null ||
+				cacheDetails.cacheCreation1hTokens !== null;
 
 			transformedData = {
 				id: `chatcmpl-${Date.now()}`,
@@ -1242,9 +1249,27 @@
 					prompt_tokens: promptTokens,
 					completion_tokens: data.usage.outputTokens ?? 0,
 					total_tokens: data.usage.totalTokens ?? 0,
-					...(cacheReadTokens > 0 && {
+					...((cacheReadTokens > 0 || cacheWriteTokens > 0) && {
 						prompt_tokens_details: {
 							cached_tokens: cacheReadTokens,
+							...(cacheWriteTokens > 0 && {
+								cache_write_tokens: cacheWriteTokens,
+								cache_creation_tokens: cacheWriteTokens,
+							}),
+							...(cacheWriteTokens > 0 &&
+								hasCacheCreationDetails && {
+									cache_creation: {
+										ephemeral_5m_input_tokens:
+											cacheDetails.cacheCreation5mTokens ??
+											Math.max(
+												0,
+												cacheWriteTokens -
+													(cacheDetails.cacheCreation1hTokens ?? 0),
+											),
+										ephemeral_1h_input_tokens:
+											cacheDetails.cacheCreation1hTokens ?? 0,
+									},
+								}),
 						},
 					}),
 				},
diff --git a/apps/gateway/src/lib/anthropic-pricing.spec.ts b/apps/gateway/src/lib/anthropic-pricing.spec.ts
index 5284f9ad6..330d6369b 100644
--- a/apps/gateway/src/lib/anthropic-pricing.spec.ts
+++ b/apps/gateway/src/lib/anthropic-pricing.spec.ts
@@ -137,3 +137,91 @@ describe("Anthropic model pricing", () => {
 		},
 	);
 });
+
+describe("AWS Bedrock Anthropic model pricing", () => {
+	const bedrockProviderEntries = models.flatMap((model) =>
+		model.family === "anthropic"
+			? model.providers
+					.filter((provider) => provider.providerId === "aws-bedrock")
+					.map((provider) => ({
+						modelId: model.id,
+						provider: provider as ProviderModelMapping,
+					}))
+			: [],
+	);
+
+	it("has at least one AWS Bedrock Anthropic provider mapping to validate", () => {
+		expect(bedrockProviderEntries.length).toBeGreaterThan(0);
+	});
+
+	it.each(bedrockProviderEntries)(
+		"$modelId defines cacheWriteInputPrice whenever cachedInputPrice is set",
+		({ provider }) => {
+			if (provider.cachedInputPrice === undefined) {
+				return;
+			}
+			expect(
+				provider.cacheWriteInputPrice,
+				`${provider.modelName}: cachedInputPrice is set but cacheWriteInputPrice is missing`,
+			).toBeDefined();
+		},
+	);
+
+	const ONE_HOUR_BEDROCK_PREFIXES = [
+		"anthropic.claude-opus-4-5",
+		"anthropic.claude-opus-4-6",
+		"anthropic.claude-opus-4-7",
+		"anthropic.claude-haiku-4-5",
+		"anthropic.claude-sonnet-4-5",
+		"anthropic.claude-sonnet-4-6",
+	];
+	const supportsBedrock1h = (modelName: string) =>
+		ONE_HOUR_BEDROCK_PREFIXES.some((prefix) => modelName.startsWith(prefix));
+
+	it.each(bedrockProviderEntries)(
+		"$modelId only sets cacheWriteInputPrice1h on bedrock models that support 1h TTL",
+		({ provider }) => {
+			if (provider.cacheWriteInputPrice1h === undefined) {
+				return;
+			}
+			expect(
+				supportsBedrock1h(provider.modelName),
+				`${provider.modelName}: cacheWriteInputPrice1h is set but bedrock does not document 1h TTL support for this model`,
+			).toBe(true);
+		},
+	);
+
+	it.each(bedrockProviderEntries)(
+		"$modelId cache prices follow the standard 1.25x/2x/0.1x ratios",
+		({ provider }) => {
+			if (provider.inputPrice === undefined) {
+				return;
+			}
+			const base = provider.inputPrice;
+			if (provider.cacheWriteInputPrice !== undefined) {
+				assertRatio(
+					provider.modelName,
+					"cacheWriteInputPrice (5m)",
+					provider.cacheWriteInputPrice,
+					base * FIVE_MIN_WRITE_MULTIPLIER,
+				);
+			}
+			if (provider.cacheWriteInputPrice1h !== undefined) {
+				assertRatio(
+					provider.modelName,
+					"cacheWriteInputPrice1h",
+					provider.cacheWriteInputPrice1h,
+					base * ONE_HOUR_WRITE_MULTIPLIER,
+				);
+			}
+			if (provider.cachedInputPrice !== undefined) {
+				assertRatio(
+					provider.modelName,
+					"cachedInputPrice",
+					provider.cachedInputPrice,
+					base * CACHE_READ_MULTIPLIER,
+				);
+			}
+		},
+	);
+});
diff --git a/apps/gateway/src/lib/costs.spec.ts b/apps/gateway/src/lib/costs.spec.ts
index bfab453a3..ada191320 100644
--- a/apps/gateway/src/lib/costs.spec.ts
+++ b/apps/gateway/src/lib/costs.spec.ts
@@ -200,6 +200,42 @@ describe("calculateCosts", () => {
 		expect(result.cacheWriteInputCost).toBeCloseTo(1000 * (3.75 / 1e6));
 	});
 
+	it("should calculate AWS Bedrock Claude cache write costs", async () => {
+		// Bedrock Claude Haiku 4.5 input is 1.0/1M; 5m write 1.25/1M; 1h write 2.0/1M.
+		const result = await calculateCosts(
+			"claude-haiku-4-5",
+			"aws-bedrock",
+			1004,
+			50,
+			0,
+			undefined,
+			null,
+			0,
+			undefined,
+			0,
+			null,
+			null,
+			undefined,
+			{
+				cacheWriteTokens: 1000,
+				cacheWrite1hTokens: 700,
+			},
+		);
+
+		const discountMultiplier = 0.8;
+		expect(result.inputCost).toBeCloseTo(4 * (1.0 / 1e6) * discountMultiplier);
+		expect(result.outputCost).toBeCloseTo(
+			50 * (5.0 / 1e6) * discountMultiplier,
+		);
+		const fiveMinuteCacheWriteCost = 300 * (1.25 / 1e6);
+		const oneHourCacheWriteCost = 700 * (2.0 / 1e6);
+		expect(result.cacheWriteInputCost).toBeCloseTo(
+			(fiveMinuteCacheWriteCost + oneHourCacheWriteCost) * discountMultiplier,
+		);
+		expect(result.discount).toBeCloseTo(0.2);
+		expect(result.cacheWriteTokens).toBe(1000);
+	});
+
 	it("should calculate costs with cached tokens for Anthropic (subsequent request - cache read)", async () => {
 		// For Anthropic subsequent request: 4 non-cached + 1659 cache read = 1663 total tokens, 1659 cache reads
 		const result = await calculateCosts(
diff --git a/apps/gateway/src/native-anthropic-cache.e2e.ts b/apps/gateway/src/native-anthropic-cache.e2e.ts
index 8e24cc016..9efb3c1ee 100644
--- a/apps/gateway/src/native-anthropic-cache.e2e.ts
+++ b/apps/gateway/src/native-anthropic-cache.e2e.ts
@@ -588,4 +588,83 @@ describe("e2e native /v1/messages cache", getConcurrentTestOptions(), () => {
 			}
 		},
 	);
+
+	// 1h cache TTL via Bedrock /v1/chat/completions: opts into Bedrock's 1h
+	// cache write rate (2x base) on a model that supports it (Sonnet 4.6) and
+	// asserts the gateway forwards ttl:"1h" to the Converse API cachePoint and
+	// surfaces the response breakdown
+	// (prompt_tokens_details.cache_creation.ephemeral_1h_input_tokens) so SDK
+	// clients can attribute spend across rates.
+	(hasBedrockKey ? test : test.skip)(
+		"openai-compat /v1/chat/completions forwards 1h ttl and surfaces cache_creation breakdown for bedrock",
+		getTestOptions(),
+		async () => {
+			const longText = buildLongSystemPrompt();
+			const body = {
+				model: "aws-bedrock/claude-sonnet-4-6",
+				messages: [
+					{
+						role: "system",
+						content: [
+							{
+								type: "text" as const,
+								text: longText,
+								cache_control: {
+									type: "ephemeral" as const,
+									ttl: "1h" as const,
+								},
+							},
+						],
+					},
+					{ role: "user", content: "Just reply OK." },
+				],
+			};
+
+			const send = async () => {
+				const requestId = generateTestRequestId();
+				const res = await app.request("/v1/chat/completions", {
+					method: "POST",
+					headers: {
+						"Content-Type": "application/json",
+						"x-request-id": requestId,
+						Authorization: `Bearer real-token`,
+					},
+					body: JSON.stringify(body),
+				});
+				const json = await res.json();
+				if (logMode) {
+					console.log(
+						"openai-compat bedrock 1h",
+						requestId,
+						"status",
+						res.status,
+						"usage",
+						JSON.stringify(json.usage),
+					);
+				}
+				return { status: res.status, json };
+			};
+
+			const first = await send();
+			expect(first.status).toBe(200);
+			// On the priming call Bedrock should write to the 1h cache. If
+			// prepare-request-body strips ttl, the write falls back to 5m and
+			// ephemeral_1h_input_tokens will be 0 / absent.
+			const cacheWriteTokens =
+				first.json.usage?.prompt_tokens_details?.cache_creation_tokens ??
+				first.json.usage?.prompt_tokens_details?.cache_write_tokens ??
+				0;
+			if (cacheWriteTokens > 0) {
+				const breakdown =
+					first.json.usage?.prompt_tokens_details?.cache_creation;
+				expect(breakdown).toBeDefined();
+				expect(breakdown.ephemeral_1h_input_tokens).toBeGreaterThan(0);
+				expect(breakdown.ephemeral_5m_input_tokens).toBe(0);
+				expect(
+					breakdown.ephemeral_5m_input_tokens +
+						breakdown.ephemeral_1h_input_tokens,
+				).toBe(cacheWriteTokens);
+			}
+		},
+	);
 });
diff --git a/packages/actions/src/prepare-request-body.spec.ts b/packages/actions/src/prepare-request-body.spec.ts
index cd7e4585d..a9f1699c5 100644
--- a/packages/actions/src/prepare-request-body.spec.ts
+++ b/packages/actions/src/prepare-request-body.spec.ts
@@ -787,6 +787,106 @@ describe("prepareRequestBody - MiniMax", () => {
 });
 
 describe("prepareRequestBody - AWS Bedrock", () => {
+	test("should preserve explicit cache_control ttl as Bedrock cachePoint ttl", async () => {
+		const requestBody = (await prepareRequestBody(
+			"aws-bedrock",
+			"anthropic.claude-sonnet-4-5-20250929-v1:0",
+			[
+				{
+					role: "system",
+					content: [
+						{
+							type: "text",
+							text: "Cache this system prompt.",
+							cache_control: { type: "ephemeral", ttl: "1h" },
+						},
+					],
+				},
+				{
+					role: "user",
+					content: [
+						{
+							type: "text",
+							text: "What should I do next?",
+							cache_control: { type: "ephemeral", ttl: "5m" },
+						},
+					],
+				},
+			],
+			false,
+			undefined,
+			undefined,
+			undefined,
+			undefined,
+			undefined,
+			undefined,
+			undefined,
+			undefined,
+			undefined,
+			false,
+			false,
+		)) as any;
+
+		expect(requestBody.system).toEqual([
+			{ text: "Cache this system prompt." },
+			{ cachePoint: { type: "default", ttl: "1h" } },
+		]);
+		expect(requestBody.messages[0].content).toEqual([
+			{ text: "What should I do next?" },
+			{ cachePoint: { type: "default", ttl: "5m" } },
+		]);
+	});
+
+	test("should drop ttl:1h on bedrock models that do not support 1h TTL", async () => {
+		const requestBody = (await prepareRequestBody(
+			"aws-bedrock",
+			"anthropic.claude-3-7-sonnet-20250219-v1:0",
+			[
+				{
+					role: "system",
+					content: [
+						{
+							type: "text",
+							text: "Cache this system prompt.",
+							cache_control: { type: "ephemeral", ttl: "1h" },
+						},
+					],
+				},
+				{
+					role: "user",
+					content: [
+						{
+							type: "text",
+							text: "What should I do next?",
+							cache_control: { type: "ephemeral", ttl: "1h" },
+						},
+					],
+				},
+			],
+			false,
+			undefined,
+			undefined,
+			undefined,
+			undefined,
+			undefined,
+			undefined,
+			undefined,
+			undefined,
+			undefined,
+			false,
+			false,
+		)) as any;
+
+		expect(requestBody.system).toEqual([
+			{ text: "Cache this system prompt." },
+			{ cachePoint: { type: "default" } },
+		]);
+		expect(requestBody.messages[0].content).toEqual([
+			{ text: "What should I do next?" },
+			{ cachePoint: { type: "default" } },
+		]);
+	});
+
 	test("should sanitize complex tool schemas for Bedrock Converse", async () => {
 		const requestBody = (await prepareRequestBody(
 			"aws-bedrock",
diff --git a/packages/actions/src/prepare-request-body.ts b/packages/actions/src/prepare-request-body.ts
index 6276526c9..169c2013c 100644
--- a/packages/actions/src/prepare-request-body.ts
+++ b/packages/actions/src/prepare-request-body.ts
@@ -1624,6 +1624,9 @@ export async function prepareRequestBody(
 		// Track cache control usage (max 4 blocks per Anthropic/Bedrock limit)
 		let bedrockCacheControlCount = 0;
 		const bedrockMaxCacheControlBlocks = 4;
+		interface BedrockCachePoint {
+			cachePoint: { type: "default"; ttl?: "5m" | "1h" };
+		}
 
 		// Get the minCacheableTokens from the model definition (default to 1024 if not specified)
 		const bedrockProviderMapping = modelDef?.providers.find(
@@ -1634,6 +1637,25 @@ export async function prepareRequestBody(
 		// Approximate 4 characters per token
 		const bedrockMinCacheableChars = bedrockMinCacheableTokens * 4;
 
+		// AWS Bedrock supports 1h TTL only on Claude Opus/Haiku/Sonnet 4.5+. For
+		// other models, forwarding ttl:"1h" causes Bedrock to reject the request.
+		// Use cacheWriteInputPrice1h on the model definition as the source of
+		// truth and silently downgrade unsupported 1h hints to the default 5m.
+		const bedrockSupports1hTtl =
+			bedrockProviderMapping?.cacheWriteInputPrice1h !== undefined;
+		const createBedrockCachePoint = (
+			ttl?: "5m" | "1h",
+		): BedrockCachePoint => {
+			const effectiveTtl =
+				ttl === "1h" && !bedrockSupports1hTtl ? undefined : ttl;
+			return {
+				cachePoint: {
+					type: "default",
+					...(effectiveTtl && { ttl: effectiveTtl }),
+				},
+			};
+		};
+
 		// Extract system messages for Bedrock's system field (required for prompt caching)
 		const bedrockSystemMessages = processedMessages.filter(
 			(m) => m.role === "system",
@@ -1649,13 +1671,12 @@ export async function prepareRequestBody(
 		// cachePoint, and fall back to a length heuristic when nothing was
 		// explicitly opted in.
 		if (bedrockSystemMessages.length > 0) {
-			const systemContent: Array<
-				{ text: string } | { cachePoint: { type: "default" } }
-			> = [];
+			const systemContent: Array<{ text: string } | BedrockCachePoint> = [];
 
 			const collectedBedrockBlocks: Array<{
 				text: string;
 				hasExplicitCacheControl: boolean;
+				ttl?: "5m" | "1h";
 			}> = [];
 			for (const sysMsg of bedrockSystemMessages) {
 				if (typeof sysMsg.content === "string") {
@@ -1671,6 +1692,7 @@
 					collectedBedrockBlocks.push({
 						text: part.text,
 						hasExplicitCacheControl: !!part.cache_control,
+						ttl: part.cache_control?.ttl,
 					});
 				}
 			}
@@ -1687,7 +1709,7 @@
 				if (block.hasExplicitCacheControl) {
 					if (bedrockCacheControlCount < bedrockMaxCacheControlBlocks) {
 						bedrockCacheControlCount++;
-						systemContent.push({ cachePoint: { type: "default" } });
+						systemContent.push(createBedrockCachePoint(block.ttl));
 					}
 					continue;
 				}
@@ -1699,7 +1721,7 @@
 
 				if (shouldHeuristicCache) {
 					bedrockCacheControlCount++;
-					systemContent.push({ cachePoint: { type: "default" } });
+					systemContent.push(createBedrockCachePoint());
 				}
 			}
 
@@ -1800,9 +1822,7 @@
 
 					if (shouldCache) {
 						bedrockCacheControlCount++;
-						bedrockMessage.content.push({
-							cachePoint: { type: "default" },
-						});
+						bedrockMessage.content.push(createBedrockCachePoint());
 					}
 				}
 			} else if (Array.isArray(msg.content)) {
@@ -1818,9 +1838,9 @@
 					if (part.cache_control) {
 						if (bedrockCacheControlCount < bedrockMaxCacheControlBlocks) {
 							bedrockCacheControlCount++;
-							bedrockMessage.content.push({
-								cachePoint: { type: "default" },
-							});
+							bedrockMessage.content.push(
+								createBedrockCachePoint(part.cache_control.ttl),
+							);
 						}
 					} else {
 						// Add cachePoint as separate block for long text parts
@@ -1831,9 +1851,7 @@
 
 						if (shouldCache) {
 							bedrockCacheControlCount++;
-							bedrockMessage.content.push({
-								cachePoint: { type: "default" },
-							});
+							bedrockMessage.content.push(createBedrockCachePoint());
 						}
 					}
 				}
@@ -1878,9 +1896,7 @@
 					boundaryMsg.content[boundaryMsg.content.length - 1];
 				// Only add if the last block isn't already a cachePoint.
 				if (!lastBlock.cachePoint) {
-					boundaryMsg.content.push({
-						cachePoint: { type: "default" },
-					});
+					boundaryMsg.content.push(createBedrockCachePoint());
 					bedrockCacheControlCount++;
 				}
 			}
diff --git a/packages/models/src/models/anthropic.ts b/packages/models/src/models/anthropic.ts
index a39d94358..324c8e398 100644
--- a/packages/models/src/models/anthropic.ts
+++ b/packages/models/src/models/anthropic.ts
@@ -40,6 +40,7 @@ export const anthropicModels = [
 			inputPrice: 3.0 / 1e6,
 			outputPrice: 15.0 / 1e6,
 			cachedInputPrice: 0.3 / 1e6,
+			cacheWriteInputPrice: 3.75 / 1e6,
 			minCacheableTokens: 1024,
 			requestPrice: 0,
 			discount: 0.2,
@@ -154,6 +155,7 @@ export const anthropicModels = [
 			inputPrice: 3.0 / 1e6,
 			outputPrice: 15.0 / 1e6,
 			cachedInputPrice: 0.3 / 1e6,
+			cacheWriteInputPrice: 3.75 / 1e6,
 			minCacheableTokens: 1024,
 			requestPrice: 0,
 			discount: 0.2,
@@ -225,6 +227,7 @@ export const anthropicModels = [
 			inputPrice: 3.0 / 1e6,
 			outputPrice: 15.0 / 1e6,
 			cachedInputPrice: 0.3 / 1e6,
+			cacheWriteInputPrice: 3.75 / 1e6,
 			minCacheableTokens: 1024,
 			requestPrice: 0,
 			discount: 0.2,
@@ -276,6 +279,8 @@ export const anthropicModels = [
 			inputPrice: 3.0 / 1e6,
 			outputPrice: 15.0 / 1e6,
 			cachedInputPrice: 0.3 / 1e6,
+			cacheWriteInputPrice: 3.75 / 1e6,
+			cacheWriteInputPrice1h: 6.0 / 1e6,
 			minCacheableTokens: 1024,
 			requestPrice: 0,
 			discount: 0.2,
@@ -327,6 +332,8 @@ export const anthropicModels = [
 			inputPrice: 3.0 / 1e6,
 			outputPrice: 15.0 / 1e6,
 			cachedInputPrice: 0.3 / 1e6,
+			cacheWriteInputPrice: 3.75 / 1e6,
+			cacheWriteInputPrice1h: 6.0 / 1e6,
 			minCacheableTokens: 1024,
 			requestPrice: 0,
 			discount: 0.2,
@@ -376,6 +383,8 @@ export const anthropicModels = [
 			inputPrice: 3.0 / 1e6,
 			outputPrice: 15.0 / 1e6,
 			cachedInputPrice: 0.3 / 1e6,
+			cacheWriteInputPrice: 3.75 / 1e6,
+			cacheWriteInputPrice1h: 6.0 / 1e6,
 			minCacheableTokens: 2048,
 			requestPrice: 0,
 			discount: 0.2,
@@ -443,6 +452,8 @@ export const anthropicModels = [
 			inputPrice: 1.0 / 1e6,
 			outputPrice: 5.0 / 1e6,
 			cachedInputPrice: 0.1 / 1e6,
+			cacheWriteInputPrice: 1.25 / 1e6,
+			cacheWriteInputPrice1h: 2.0 / 1e6,
 			minCacheableTokens: 4096,
 			requestPrice: 0,
 			discount: 0.2,
@@ -490,6 +501,8 @@ export const anthropicModels = [
 			inputPrice: 1.0 / 1e6,
 			outputPrice: 5.0 / 1e6,
 			cachedInputPrice: 0.1 / 1e6,
+			cacheWriteInputPrice: 1.25 / 1e6,
+			cacheWriteInputPrice1h: 2.0 / 1e6,
 			minCacheableTokens: 4096,
 			requestPrice: 0,
 			discount: 0.2,
@@ -540,6 +553,7 @@ export const anthropicModels = [
 			inputPrice: 15.0 / 1e6,
 			outputPrice: 75.0 / 1e6,
 			cachedInputPrice: 1.5 / 1e6,
+			cacheWriteInputPrice: 18.75 / 1e6,
 			minCacheableTokens: 1024,
 			requestPrice: 0,
 			discount: 0.2,
@@ -590,6 +604,7 @@ export const anthropicModels = [
 			inputPrice: 15.0 / 1e6,
 			outputPrice: 75.0 / 1e6,
 			cachedInputPrice: 1.5 / 1e6,
+			cacheWriteInputPrice: 18.75 / 1e6,
 			minCacheableTokens: 1024,
 			requestPrice: 0,
 			discount: 0.2,
@@ -698,6 +713,7 @@ export const anthropicModels = [
 			inputPrice: 0.8 / 1e6,
 			outputPrice: 4.0 / 1e6,
 			cachedInputPrice: 0.08 / 1e6,
+			cacheWriteInputPrice: 1.0 / 1e6,
 			minCacheableTokens: 2048,
 			requestPrice: 0,
 			discount: 0.2,
@@ -830,6 +846,8 @@ export const anthropicModels = [
 			inputPrice: 5.0 / 1e6,
 			outputPrice: 25.0 / 1e6,
 			cachedInputPrice: 0.5 / 1e6,
+			cacheWriteInputPrice: 6.25 / 1e6,
+			cacheWriteInputPrice1h: 10.0 / 1e6,
 			minCacheableTokens: 4096,
 			requestPrice: 0,
 			discount: 0.2,
@@ -921,6 +939,8 @@ export const anthropicModels = [
 			inputPrice: 5.0 / 1e6,
 			outputPrice: 25.0 / 1e6,
 			cachedInputPrice: 0.5 / 1e6,
+			cacheWriteInputPrice: 6.25 / 1e6,
+			cacheWriteInputPrice1h: 10.0 / 1e6,
 			minCacheableTokens: 4096,
 			requestPrice: 0,
 			discount: 0.2,
@@ -994,6 +1014,8 @@ export const anthropicModels = [
 			inputPrice: 5.0 / 1e6,
 			outputPrice: 25.0 / 1e6,
 			cachedInputPrice: 0.5 / 1e6,
+			cacheWriteInputPrice: 6.25 / 1e6,
+			cacheWriteInputPrice1h: 10.0 / 1e6,
 			minCacheableTokens: 4096,
 			requestPrice: 0,
 			discount: 0.2,