From 5738a540ab630d65853fbabbbabfd056b62be6ef Mon Sep 17 00:00:00 2001 From: Luca Steeb Date: Sun, 29 Mar 2026 19:13:09 +0700 Subject: [PATCH 01/14] refactor: queue chat logs in middleware --- apps/gateway/src/api-individual.e2e.ts | 6 + apps/gateway/src/api.spec.ts | 30 +- apps/gateway/src/chat/chat.ts | 7155 ++++++++--------- .../chat/middleware/chat-completion-log.ts | 144 + .../src/chat/tools/chat-log-context.ts | 126 + apps/gateway/src/test-utils/test-helpers.ts | 4 + apps/gateway/src/vars.ts | 2 + 7 files changed, 3868 insertions(+), 3599 deletions(-) create mode 100644 apps/gateway/src/chat/middleware/chat-completion-log.ts create mode 100644 apps/gateway/src/chat/tools/chat-log-context.ts diff --git a/apps/gateway/src/api-individual.e2e.ts b/apps/gateway/src/api-individual.e2e.ts index a96234bd38..89de64dd5e 100644 --- a/apps/gateway/src/api-individual.e2e.ts +++ b/apps/gateway/src/api-individual.e2e.ts @@ -293,6 +293,12 @@ describe("e2e individual tests", () => { expect((log.errorDetails as { message?: string })?.message).toContain( "the word 'json'", ); + + const matchingLogs = await db + .select() + .from(tables.log) + .where(eq(tables.log.requestId, requestId)); + expect(matchingLogs).toHaveLength(1); }, ); diff --git a/apps/gateway/src/api.spec.ts b/apps/gateway/src/api.spec.ts index fb61959d0d..2bf6fe4d3c 100644 --- a/apps/gateway/src/api.spec.ts +++ b/apps/gateway/src/api.spec.ts @@ -1,11 +1,16 @@ import { afterAll, beforeAll, describe, expect, test, vi } from "vitest"; -import { db, tables } from "@llmgateway/db"; +import { db, eq, tables } from "@llmgateway/db"; import { logger } from "@llmgateway/logger"; import { app } from "./app.js"; import { createGatewayApiTestHarness } from "./test-utils/gateway-api-test-harness.js"; -import { readAll, waitForLogs } from "./test-utils/test-helpers.js"; +import { + readAll, + processPendingLogs, + waitForLogByRequestId, + waitForLogs, +} from "./test-utils/test-helpers.js"; describe("api", () => { const harness = createGatewayApiTestHarness({ @@ -1343,10 +1348,12 @@ describe("api", () => { // test for missing Authorization header test("/v1/chat/completions missing Authorization header", async () => { + const requestId = "missing-auth-request-id"; const res = await app.request("/v1/chat/completions", { method: "POST", headers: { "Content-Type": "application/json", + "x-request-id": requestId, // Intentionally not setting Authorization header }, body: JSON.stringify({ @@ -1360,6 +1367,13 @@ describe("api", () => { }), }); expect(res.status).toBe(401); + + await processPendingLogs(); + const logs = await db + .select() + .from(tables.log) + .where(eq(tables.log.requestId, requestId)); + expect(logs).toHaveLength(0); }); // test for explicitly specifying a provider in the format "provider/model" @@ -1483,6 +1497,7 @@ describe("api", () => { // test for missing provider API key test("/v1/chat/completions with missing provider API key", async () => { + const requestId = "missing-provider-key-request-id"; await db.insert(tables.apiKey).values({ id: "token-id", token: "real-token", @@ -1495,6 +1510,7 @@ describe("api", () => { method: "POST", headers: { "Content-Type": "application/json", + "x-request-id": requestId, Authorization: `Bearer real-token`, }, body: JSON.stringify({ @@ -1512,6 +1528,16 @@ describe("api", () => { expect(errorMessage).toMatchInlineSnapshot( `"{"error":true,"status":400,"message":"No provider key set for any of the providers that support model gpt-4o-mini. 
Please add the provider key in the settings or switch the project mode to credits or hybrid."}"`, ); + + const log = await waitForLogByRequestId(requestId); + expect(log.finishReason).toBe("client_error"); + expect(log.unifiedFinishReason).toBe("client_error"); + + const matchingLogs = await db + .select() + .from(tables.log) + .where(eq(tables.log.requestId, requestId)); + expect(matchingLogs).toHaveLength(1); }); // test for provider error response and error logging diff --git a/apps/gateway/src/chat/chat.ts b/apps/gateway/src/chat/chat.ts index e93aece19a..79ab0dc17a 100644 --- a/apps/gateway/src/chat/chat.ts +++ b/apps/gateway/src/chat/chat.ts @@ -21,7 +21,6 @@ import { throwIamException, validateModelAccess } from "@/lib/iam.js"; import { calculateDataStorageCost, getUnifiedFinishReason, - insertLog as _insertLog, } from "@/lib/logs.js"; import { checkProviderRateLimit, @@ -86,7 +85,14 @@ import { stripRegionFromModelName, } from "@llmgateway/models"; +import { chatCompletionLogMiddleware } from "./middleware/chat-completion-log.js"; import { completionsRequestSchema } from "./schemas/completions.js"; +import { + enqueueChatLog, + finishStreamCompletion, + registerStreamCompletion, + updateBaseLogOptions, +} from "./tools/chat-log-context.js"; import { checkContentFilter, getContentFilterMethod, @@ -95,7 +101,6 @@ import { } from "./tools/check-content-filter.js"; import { convertImagesToBase64 } from "./tools/convert-images-to-base64.js"; import { countInputImages } from "./tools/count-input-images.js"; -import { createLogEntry } from "./tools/create-log-entry.js"; import { estimateTokensFromContent } from "./tools/estimate-tokens-from-content.js"; import { estimateTokens } from "./tools/estimate-tokens.js"; import { @@ -350,6 +355,8 @@ const sharedTextDecoder = new TextDecoder(); export const chat = new OpenAPIHono(); +chat.use("/completions", chatCompletionLogMiddleware); + const completions = createRoute({ operationId: "v1_chat_completions", summary: "Chat Completions", @@ -654,6 +661,7 @@ chat.openapi(completions, async (c) => { // Extract custom X-LLMGateway-* headers const customHeaders = extractCustomHeaders(c); + const requestPluginIds = plugins?.map((plugin) => plugin.id) ?? []; // Check for X-No-Fallback header to disable provider fallback on low uptime const noFallback = @@ -848,6 +856,36 @@ chat.openapi(completions, async (c) => { }); } + updateBaseLogOptions(c, { + requestId, + project, + apiKey, + usedModel: initialRequestedModel, + usedModelMapping: requestedModel, + usedProvider: requestedProvider ?? 
"llmgateway", + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + rawRequest: rawBody, + plugins: requestPluginIds, + }); + // Run guardrails check for enterprise organizations let guardrailResult: Awaited> | undefined; if (organization.plan === "enterprise") { @@ -888,6 +926,9 @@ chat.openapi(completions, async (c) => { messages as Parameters[0], guardrailResult.redactions, ) as typeof messages; + updateBaseLogOptions(c, { + messages, + }); } // Log non-blocking violations (redact/warn) @@ -1121,7 +1162,7 @@ chat.openapi(completions, async (c) => { // Filter by context size requirement, reasoning capability, and deprecation status const suitableProviders = availableModelProviders.filter((provider) => { // Skip deprecated provider mappings - if (provider.deprecatedAt && now > provider.deprecatedAt!) { + if (provider.deprecatedAt && now > provider.deprecatedAt) { return false; } @@ -2157,6 +2198,10 @@ chat.openapi(completions, async (c) => { } } + updateBaseLogOptions(c, { + reasoningEffort: reasoning_effort, + }); + let url: string | undefined; // Get the provider key for the selected provider based on project mode @@ -2462,6 +2507,35 @@ chat.openapi(completions, async (c) => { }); } + updateBaseLogOptions(c, { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + plugins: requestPluginIds, + }); + // Check gateway-level content filter before routing the request upstream. const contentFilterMode = getContentFilterMode(); const contentFilterMethod = getContentFilterMethod(); @@ -2499,50 +2573,27 @@ chat.openapi(completions, async (c) => { .length ? openAIContentFilterResult.responses : null; - const insertLog = (logData: Parameters[0]) => - _insertLog({ - ...logData, - internalContentFilter: shouldTagContentFilter - ? true - : logData.internalContentFilter, - gatewayContentFilterResponse: - logData.gatewayContentFilterResponse ?? 
gatewayContentFilterResponse, - }); + updateBaseLogOptions(c, { + gatewayContentFilterResponse, + }); + const chatCompletionLogState = c.get("chatCompletionLogState"); + if (chatCompletionLogState) { + chatCompletionLogState.internalContentFilter = shouldTagContentFilter; + } if (contentFilterBlocked) { const contentFilterResponseId = `chatcmpl-${Date.now()}`; const contentFilterCreated = Math.floor(Date.now() / 1000); - // Log the filtered request - try { - await insertLog({ - ...createLogEntry( - requestId, - project, - apiKey, - undefined, - "", - undefined, - "llmgateway", - requestedModel, - requestedProvider, - messages as any[], - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - undefined, - undefined, - effort as "low" | "medium" | "high" | undefined, - response_format, - tools, - tool_choice, - source, - customHeaders, - c.req.header("x-debug") === "true", - c.req.header("user-agent"), - ), + enqueueChatLog( + c, + { + providerKeyId: undefined, + usedModel: "", + usedModelMapping: undefined, + usedProvider: "llmgateway", + }, + { content: null, responseSize: 0, finishReason: "llmgateway_content_filter", @@ -2558,6 +2609,7 @@ chat.openapi(completions, async (c) => { errorDetails: null, duration: 0, timeToFirstToken: null, + timeToFirstReasoningToken: null, inputCost: 0, outputCost: 0, cachedInputCost: 0, @@ -2572,31 +2624,36 @@ chat.openapi(completions, async (c) => { discount: null, pricingTier: null, dataStorageCost: "0", - }); - } catch { - // Silently ignore logging failures - } + cached: false, + toolResults: null, + }, + ); if (stream) { + void registerStreamCompletion(c); return streamSSE(c, async (sseStream) => { - const chunk = { - id: contentFilterResponseId, - object: "chat.completion.chunk", - created: contentFilterCreated, - model: requestedModel, - choices: [ - { - index: 0, - delta: {}, - finish_reason: "content_filter", - }, - ], - }; - await sseStream.writeSSE({ - data: JSON.stringify(chunk), - id: "0", - }); - await sseStream.writeSSE({ data: "[DONE]" }); + try { + const chunk = { + id: contentFilterResponseId, + object: "chat.completion.chunk", + created: contentFilterCreated, + model: requestedModel, + choices: [ + { + index: 0, + delta: {}, + finish_reason: "content_filter", + }, + ], + }; + await sseStream.writeSSE({ + data: JSON.stringify(chunk), + id: "0", + }); + await sseStream.writeSSE({ data: "[DONE]" }); + } finally { + finishStreamCompletion(c); + } }); } @@ -2793,46 +2850,6 @@ chat.openapi(completions, async (c) => { } } - // Log the cached streaming request with reconstructed content - // Extract plugin IDs for logging (cached streaming) - const cachedStreamingPluginIds = plugins?.map((p) => p.id) ?? 
[]; - - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - rawCachedResponseData, // Raw SSE data from cached response - null, // No upstream request for cached response - rawCachedResponseData, // Raw SSE data from cached response (same for both) - cachedStreamingPluginIds, - undefined, // No plugin results for cached response - ); - // Calculate costs for cached response const costs = await calculateCosts( usedModel, @@ -2849,82 +2866,121 @@ chat.openapi(completions, async (c) => { project.organizationId, ); - await insertLog({ - ...baseLogEntry, - duration: 0, // No processing time for cached response - timeToFirstToken: null, // Not applicable for cached response - timeToFirstReasoningToken: null, // Not applicable for cached response - responseSize: cachedResponseSize, - content: fullContent || null, - reasoningContent: fullReasoningContent || null, - finishReason: cachedStreamingResponse.metadata.finishReason, - promptTokens: - (costs.promptTokens ?? promptTokens)?.toString() ?? null, - completionTokens: completionTokens?.toString() ?? null, - totalTokens: costs.imageInputTokens - ? ( - (costs.promptTokens ?? promptTokens ?? 0) + - (completionTokens ?? 0) + - (reasoningTokens ?? 0) - ).toString() - : (totalTokens?.toString() ?? null), - reasoningTokens: reasoningTokens?.toString() ?? null, - cachedTokens: cachedTokens?.toString() ?? null, - hasError: false, - streamed: true, - canceled: false, - errorDetails: null, - inputCost: costs.inputCost ?? 0, - outputCost: costs.outputCost ?? 0, - cachedInputCost: costs.cachedInputCost ?? 0, - requestCost: costs.requestCost ?? 0, - webSearchCost: costs.webSearchCost ?? 0, - imageInputTokens: costs.imageInputTokens?.toString() ?? null, - imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, - imageInputCost: costs.imageInputCost ?? null, - imageOutputCost: costs.imageOutputCost ?? null, - cost: costs.totalCost ?? 0, - estimatedCost: costs.estimatedCost, - discount: costs.discount ?? null, - pricingTier: costs.pricingTier ?? null, - dataStorageCost: calculateDataStorageCost( - costs.promptTokens ?? promptTokens, - cachedTokens, - completionTokens, - reasoningTokens, - retentionLevel, - ), - cached: true, - toolResults: - (cachedStreamingResponse.metadata as { toolResults?: any }) - ?.toolResults ?? 
null, - }); + enqueueChatLog( + c, + { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + rawResponse: rawCachedResponseData, + upstreamRequest: null, + upstreamResponse: rawCachedResponseData, + plugins: requestPluginIds, + pluginResults: undefined, + }, + { + duration: 0, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: cachedResponseSize, + content: fullContent || null, + reasoningContent: fullReasoningContent || null, + finishReason: cachedStreamingResponse.metadata.finishReason, + promptTokens: + (costs.promptTokens ?? promptTokens)?.toString() ?? null, + completionTokens: completionTokens?.toString() ?? null, + totalTokens: costs.imageInputTokens + ? ( + (costs.promptTokens ?? promptTokens ?? 0) + + (completionTokens ?? 0) + + (reasoningTokens ?? 0) + ).toString() + : (totalTokens?.toString() ?? null), + reasoningTokens: reasoningTokens?.toString() ?? null, + cachedTokens: cachedTokens?.toString() ?? null, + hasError: false, + streamed: true, + canceled: false, + errorDetails: null, + inputCost: costs.inputCost ?? 0, + outputCost: costs.outputCost ?? 0, + cachedInputCost: costs.cachedInputCost ?? 0, + requestCost: costs.requestCost ?? 0, + webSearchCost: costs.webSearchCost ?? 0, + imageInputTokens: costs.imageInputTokens?.toString() ?? null, + imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, + imageInputCost: costs.imageInputCost ?? null, + imageOutputCost: costs.imageOutputCost ?? null, + cost: costs.totalCost ?? 0, + estimatedCost: costs.estimatedCost, + discount: costs.discount ?? null, + pricingTier: costs.pricingTier ?? null, + dataStorageCost: calculateDataStorageCost( + costs.promptTokens ?? promptTokens, + cachedTokens, + completionTokens, + reasoningTokens, + retentionLevel, + ), + cached: true, + toolResults: + (cachedStreamingResponse.metadata as { toolResults?: any }) + ?.toolResults ?? 
null, + }, + ); // Return cached streaming response by replaying chunks with original timing + void registerStreamCompletion(c); return streamSSE( c, async (stream) => { - let previousTimestamp = 0; + try { + let previousTimestamp = 0; - for (const chunk of cachedStreamingResponse.chunks) { - // Calculate delay based on original chunk timing - const delay = Math.max(0, chunk.timestamp - previousTimestamp); - // Cap the delay to prevent excessively long waits (max 1 second) - const cappedDelay = Math.min(delay, 1000); + for (const chunk of cachedStreamingResponse.chunks) { + // Calculate delay based on original chunk timing + const delay = Math.max(0, chunk.timestamp - previousTimestamp); + // Cap the delay to prevent excessively long waits (max 1 second) + const cappedDelay = Math.min(delay, 1000); - if (cappedDelay > 0) { - await new Promise((resolve) => { - setTimeout(() => resolve(), cappedDelay); - }); - } + if (cappedDelay > 0) { + await new Promise((resolve) => { + setTimeout(() => resolve(), cappedDelay); + }); + } - await stream.writeSSE({ - data: chunk.data, - id: String(chunk.eventId), - event: chunk.event, - }); + await stream.writeSSE({ + data: chunk.data, + id: String(chunk.eventId), + event: chunk.event, + }); - previousTimestamp = chunk.timestamp; + previousTimestamp = chunk.timestamp; + } + } finally { + finishStreamCompletion(c); } }, async (error) => { @@ -2944,44 +3000,6 @@ chat.openapi(completions, async (c) => { if (cachedResponse) { // Log the cached request const duration = 0; // No processing time needed - // Extract plugin IDs for logging (cached non-streaming) - const cachedPluginIds = plugins?.map((p) => p.id) ?? []; - - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - cachedResponse, - null, // No upstream request for cached response - cachedResponse, // upstream response is same as cached response - cachedPluginIds, - undefined, // No plugin results for cached response - ); // Calculate costs for cached response const cachedCosts = await calculateCosts( @@ -3008,59 +3026,96 @@ chat.openapi(completions, async (c) => { (cachedReasoningContent?.length ?? 0) + 500; // overhead for metadata - await insertLog({ - ...baseLogEntry, - duration, - timeToFirstToken: null, // Not applicable for cached response - timeToFirstReasoningToken: null, // Not applicable for cached response - responseSize: estimatedCachedSize, - content: cachedContent ?? null, - reasoningContent: cachedReasoningContent ?? null, - finishReason: cachedResponse.choices?.[0]?.finish_reason ?? null, - promptTokens: - ( - cachedCosts.promptTokens ?? cachedResponse.usage?.prompt_tokens - )?.toString() ?? null, - completionTokens: cachedResponse.usage?.completion_tokens ?? null, - totalTokens: cachedCosts.imageInputTokens - ? ( - (cachedCosts.promptTokens ?? - cachedResponse.usage?.prompt_tokens ?? - 0) + - (cachedResponse.usage?.completion_tokens ?? 0) + - (cachedResponse.usage?.reasoning_tokens ?? 0) - ).toString() - : (cachedResponse.usage?.total_tokens ?? null), - reasoningTokens: cachedResponse.usage?.reasoning_tokens ?? 
null, - cachedTokens: - cachedResponse.usage?.prompt_tokens_details?.cached_tokens ?? null, - hasError: false, - streamed: false, - canceled: false, - errorDetails: null, - inputCost: cachedCosts.inputCost ?? 0, - outputCost: cachedCosts.outputCost ?? 0, - cachedInputCost: cachedCosts.cachedInputCost ?? 0, - requestCost: cachedCosts.requestCost ?? 0, - webSearchCost: cachedCosts.webSearchCost ?? 0, - imageInputTokens: cachedCosts.imageInputTokens?.toString() ?? null, - imageOutputTokens: cachedCosts.imageOutputTokens?.toString() ?? null, - imageInputCost: cachedCosts.imageInputCost ?? null, - imageOutputCost: cachedCosts.imageOutputCost ?? null, - cost: cachedCosts.totalCost ?? 0, - estimatedCost: cachedCosts.estimatedCost, - discount: cachedCosts.discount ?? null, - pricingTier: cachedCosts.pricingTier ?? null, - dataStorageCost: calculateDataStorageCost( - cachedCosts.promptTokens ?? cachedResponse.usage?.prompt_tokens, - cachedResponse.usage?.prompt_tokens_details?.cached_tokens, - cachedResponse.usage?.completion_tokens, - cachedResponse.usage?.reasoning_tokens, - retentionLevel, - ), - cached: true, - toolResults: cachedResponse.choices?.[0]?.message?.tool_calls ?? null, - }); + enqueueChatLog( + c, + { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + rawResponse: cachedResponse, + upstreamRequest: null, + upstreamResponse: cachedResponse, + plugins: requestPluginIds, + pluginResults: undefined, + }, + { + duration, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: estimatedCachedSize, + content: cachedContent ?? null, + reasoningContent: cachedReasoningContent ?? null, + finishReason: cachedResponse.choices?.[0]?.finish_reason ?? null, + promptTokens: + ( + cachedCosts.promptTokens ?? cachedResponse.usage?.prompt_tokens + )?.toString() ?? null, + completionTokens: cachedResponse.usage?.completion_tokens ?? null, + totalTokens: cachedCosts.imageInputTokens + ? ( + (cachedCosts.promptTokens ?? + cachedResponse.usage?.prompt_tokens ?? + 0) + + (cachedResponse.usage?.completion_tokens ?? 0) + + (cachedResponse.usage?.reasoning_tokens ?? 0) + ).toString() + : (cachedResponse.usage?.total_tokens ?? null), + reasoningTokens: cachedResponse.usage?.reasoning_tokens ?? null, + cachedTokens: + cachedResponse.usage?.prompt_tokens_details?.cached_tokens ?? + null, + hasError: false, + streamed: false, + canceled: false, + errorDetails: null, + inputCost: cachedCosts.inputCost ?? 0, + outputCost: cachedCosts.outputCost ?? 0, + cachedInputCost: cachedCosts.cachedInputCost ?? 0, + requestCost: cachedCosts.requestCost ?? 0, + webSearchCost: cachedCosts.webSearchCost ?? 0, + imageInputTokens: cachedCosts.imageInputTokens?.toString() ?? null, + imageOutputTokens: + cachedCosts.imageOutputTokens?.toString() ?? null, + imageInputCost: cachedCosts.imageInputCost ?? null, + imageOutputCost: cachedCosts.imageOutputCost ?? null, + cost: cachedCosts.totalCost ?? 0, + estimatedCost: cachedCosts.estimatedCost, + discount: cachedCosts.discount ?? null, + pricingTier: cachedCosts.pricingTier ?? 
null, + dataStorageCost: calculateDataStorageCost( + cachedCosts.promptTokens ?? cachedResponse.usage?.prompt_tokens, + cachedResponse.usage?.prompt_tokens_details?.cached_tokens, + cachedResponse.usage?.completion_tokens, + cachedResponse.usage?.reasoning_tokens, + retentionLevel, + ), + cached: true, + toolResults: + cachedResponse.choices?.[0]?.message?.tool_calls ?? null, + }, + ); return c.json(cachedResponse); } @@ -3316,1934 +3371,1907 @@ chat.openapi(completions, async (c) => { // Handle streaming response if requested // For image generation models, we skip real streaming and use fake streaming later if (effectiveStream) { + void registerStreamCompletion(c); return streamSSE( c, async (stream) => { - let eventId = 0; - let canceled = false; - let streamingError: unknown = null; - let doneSent = false; // Track if [DONE] has been sent downstream - - // Raw logging variables - let streamingRawResponseData = ""; // Raw SSE data sent back to the client - - // Streaming cache variables - const streamingChunks: Array<{ - data: string; - eventId: number; - event?: string; - timestamp: number; - }> = []; - const streamStartTime = Date.now(); - - // SSE keepalive to prevent proxy/load balancer timeouts - // Sends SSE comments (ignored by clients) every 15 seconds to keep connection alive - const KEEPALIVE_INTERVAL_MS = 15000; - const keepaliveInterval = setInterval(() => { - stream.write(": ping\n\n").catch(() => { - // Stream likely closed, cleanup will happen via abort handler or finally - }); - }, KEEPALIVE_INTERVAL_MS); - const clearKeepalive = () => clearInterval(keepaliveInterval); - - // Timing tracking variables - let timeToFirstToken: number | null = null; - let timeToFirstReasoningToken: number | null = null; - let firstTokenReceived = false; - let firstReasoningTokenReceived = false; - - // Helper function to write SSE and capture for cache - const writeSSEAndCache = async (sseData: { - data: string; - event?: string; - id?: string; - }) => { - await stream.writeSSE(sseData); - - // Collect raw response data for logging only in debug mode and within size limit - if ( - debugMode && - streamingRawResponseData.length < MAX_RAW_DATA_SIZE - ) { - const sseString = `${sseData.event ? `event: ${sseData.event}\n` : ""}data: ${sseData.data}${sseData.id ? `\nid: ${sseData.id}` : ""}\n\n`; - streamingRawResponseData += sseString; - } - - // Capture for streaming cache if enabled - if (cachingEnabled && streamingCacheKey) { - streamingChunks.push({ - data: sseData.data, - eventId: sseData.id ? 
parseInt(sseData.id, 10) : eventId, - event: sseData.event, - timestamp: Date.now() - streamStartTime, + return await (async () => { + let eventId = 0; + let canceled = false; + let streamingError: unknown = null; + let doneSent = false; // Track if [DONE] has been sent downstream + + // Raw logging variables + let streamingRawResponseData = ""; // Raw SSE data sent back to the client + + // Streaming cache variables + const streamingChunks: Array<{ + data: string; + eventId: number; + event?: string; + timestamp: number; + }> = []; + const streamStartTime = Date.now(); + + // SSE keepalive to prevent proxy/load balancer timeouts + // Sends SSE comments (ignored by clients) every 15 seconds to keep connection alive + const KEEPALIVE_INTERVAL_MS = 15000; + const keepaliveInterval = setInterval(() => { + stream.write(": ping\n\n").catch(() => { + // Stream likely closed, cleanup will happen via abort handler or finally }); - } - }; + }, KEEPALIVE_INTERVAL_MS); + const clearKeepalive = () => clearInterval(keepaliveInterval); + + // Timing tracking variables + let timeToFirstToken: number | null = null; + let timeToFirstReasoningToken: number | null = null; + let firstTokenReceived = false; + let firstReasoningTokenReceived = false; + + // Helper function to write SSE and capture for cache + const writeSSEAndCache = async (sseData: { + data: string; + event?: string; + id?: string; + }) => { + await stream.writeSSE(sseData); + + // Collect raw response data for logging only in debug mode and within size limit + if ( + debugMode && + streamingRawResponseData.length < MAX_RAW_DATA_SIZE + ) { + const sseString = `${sseData.event ? `event: ${sseData.event}\n` : ""}data: ${sseData.data}${sseData.id ? `\nid: ${sseData.id}` : ""}\n\n`; + streamingRawResponseData += sseString; + } - const writeStreamingContentFilterResponse = async ({ - billingModel, - billingProvider, - responseModel, - metadata, - }: { - billingModel: string; - billingProvider: Provider; - responseModel: string; - metadata?: Record; - }) => { - const { calculatedPromptTokens } = estimateTokens( - billingProvider, - messages, - null, - null, - 0, - ); - const promptTokenCount = Math.max( - 1, - Math.round(calculatedPromptTokens ?? 1), - ); - const streamingCosts = await calculateCosts( + // Capture for streaming cache if enabled + if (cachingEnabled && streamingCacheKey) { + streamingChunks.push({ + data: sseData.data, + eventId: sseData.id ? 
parseInt(sseData.id, 10) : eventId, + event: sseData.event, + timestamp: Date.now() - streamStartTime, + }); + } + }; + + const writeStreamingContentFilterResponse = async ({ billingModel, billingProvider, - promptTokenCount, - 0, - null, - { - prompt: messages - .map((m) => messageContentToString(m.content)) - .join("\n"), - completion: "", - }, - null, - 0, - image_config?.image_size, - inputImageCount, - 0, - project.organizationId, - ); - - await writeSSEAndCache({ - data: JSON.stringify({ - id: `chatcmpl-${Date.now()}`, - object: "chat.completion.chunk", - created: Math.floor(Date.now() / 1000), - model: responseModel, - choices: [ - { - index: 0, - delta: {}, - finish_reason: "content_filter", - }, - ], - ...(metadata && { metadata }), - }), - id: String(eventId++), - }); - - await writeSSEAndCache({ - data: JSON.stringify({ - id: `chatcmpl-${Date.now()}`, - object: "chat.completion.chunk", - created: Math.floor(Date.now() / 1000), - model: responseModel, - choices: [ - { - index: 0, - delta: {}, - finish_reason: null, - }, - ], - usage: { - prompt_tokens: promptTokenCount, - completion_tokens: 0, - total_tokens: promptTokenCount, - cost_usd_total: streamingCosts.totalCost, - cost_usd_input: streamingCosts.inputCost, - cost_usd_output: streamingCosts.outputCost, - cost_usd_cached_input: streamingCosts.cachedInputCost, - cost_usd_request: streamingCosts.requestCost, - cost_usd_image_input: streamingCosts.imageInputCost, - cost_usd_image_output: streamingCosts.imageOutputCost, - }, - }), - id: String(eventId++), - }); - - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - doneSent = true; - }; - - // Set up cancellation handling - const controller = new AbortController(); - // Set up a listener for the request being aborted - const onAbort = () => { - clearKeepalive(); - if (requestCanBeCanceled) { - canceled = true; - controller.abort(); - } - }; - - // Add event listener for the abort event on the connection - c.req.raw.signal.addEventListener("abort", onAbort); - - // --- Retry loop for provider fallback --- - const routingAttempts: RoutingAttempt[] = []; - const failedProviderIds = new Set(); - let res: Response | undefined; - const finalLogId = shortid(); - for ( - let retryAttempt = 0; - retryAttempt <= MAX_RETRIES; - retryAttempt++ - ) { - const perAttemptStartTime = Date.now(); - - // Type guard: narrow variables that TypeScript widens due to loop reassignment - if ( - !usedProvider || - !usedToken || - !url || - !usedModelFormatted || - !usedModelMapping - ) { - throw new Error("Provider context not initialized"); - } - - if (retryAttempt > 0) { - // Re-add abort listener (catch block removes it on error) - c.req.raw.signal.addEventListener("abort", onAbort); - - const nextProvider = selectNextProvider( - routingMetadata?.providerScores ?? [], - failedProviderIds, - iamFilteredModelProviders, + responseModel, + metadata, + }: { + billingModel: string; + billingProvider: Provider; + responseModel: string; + metadata?: Record; + }) => { + const { calculatedPromptTokens } = estimateTokens( + billingProvider, + messages, + null, + null, + 0, ); - if (!nextProvider) { - break; - } - - // Check if the fallback candidate is rate-limited - const retryRateLimitPeek = await peekProviderRateLimit( + const promptTokenCount = Math.max( + 1, + Math.round(calculatedPromptTokens ?? 
1), + ); + const streamingCosts = await calculateCosts( + billingModel, + billingProvider, + promptTokenCount, + 0, + null, + { + prompt: messages + .map((m) => messageContentToString(m.content)) + .join("\n"), + completion: "", + }, + null, + 0, + image_config?.image_size, + inputImageCount, + 0, project.organizationId, - nextProvider.providerId, - modelInfo.id, - nextProvider.modelName, ); - if (retryRateLimitPeek.rateLimited) { - failedProviderIds.add( - providerRetryKey(nextProvider.providerId, nextProvider.region), - ); - // Mark as rate-limited in routing metadata - const scoreEntry = routingMetadata?.providerScores.find( - (s) => s.providerId === nextProvider.providerId, - ); - if (scoreEntry) { - scoreEntry.rate_limited = true; - } - // Don't consume a retry slot for rate-limit skips - retryAttempt--; - continue; - } - try { - const ctx = await resolveProviderContext( - nextProvider, - { - mode: project.mode, - organizationId: project.organizationId, - }, - { - id: organization.id, - credits: organization.credits, - devPlan: organization.devPlan, - devPlanCreditsLimit: organization.devPlanCreditsLimit, - devPlanCreditsUsed: organization.devPlanCreditsUsed, - devPlanExpiresAt: organization.devPlanExpiresAt, - }, - modelInfo, - originalRequestParams, - { - requestId, - stream: true, - effectiveStream, - messages: messages as BaseMessage[], - response_format, - tools, - tool_choice, - reasoning_effort, - reasoning_max_tokens, - effort, - webSearchTool, - image_config, - sensitive_word_check, - maxImageSizeMB, - userPlan, - hasExistingToolCalls, - customProviderName, - webSearchEnabled: !!webSearchTool, + await writeSSEAndCache({ + data: JSON.stringify({ + id: `chatcmpl-${Date.now()}`, + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model: responseModel, + choices: [ + { + index: 0, + delta: {}, + finish_reason: "content_filter", + }, + ], + ...(metadata && { metadata }), + }), + id: String(eventId++), + }); + + await writeSSEAndCache({ + data: JSON.stringify({ + id: `chatcmpl-${Date.now()}`, + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model: responseModel, + choices: [ + { + index: 0, + delta: {}, + finish_reason: null, + }, + ], + usage: { + prompt_tokens: promptTokenCount, + completion_tokens: 0, + total_tokens: promptTokenCount, + cost_usd_total: streamingCosts.totalCost, + cost_usd_input: streamingCosts.inputCost, + cost_usd_output: streamingCosts.outputCost, + cost_usd_cached_input: streamingCosts.cachedInputCost, + cost_usd_request: streamingCosts.requestCost, + cost_usd_image_input: streamingCosts.imageInputCost, + cost_usd_image_output: streamingCosts.imageOutputCost, }, - ); - usedProvider = ctx.usedProvider; - usedModel = ctx.usedModel; - usedModelFormatted = ctx.usedModelFormatted; - usedModelMapping = ctx.usedModelMapping; - baseModelName = ctx.baseModelName; - usedToken = ctx.usedToken; - providerKey = ctx.providerKey; - configIndex = ctx.configIndex; - envVarName = ctx.envVarName; - url = ctx.url; - requestBody = ctx.requestBody; - useResponsesApi = ctx.useResponsesApi; - requestCanBeCanceled = ctx.requestCanBeCanceled; - isImageGeneration = ctx.isImageGeneration; - supportsReasoning = ctx.supportsReasoning; - temperature = ctx.temperature; - max_tokens = ctx.max_tokens; - top_p = ctx.top_p; - frequency_penalty = ctx.frequency_penalty; - presence_penalty = ctx.presence_penalty; - usedRegion = ctx.usedRegion; - } catch { - failedProviderIds.add( - providerRetryKey(nextProvider.providerId, 
nextProvider.region), - ); - // Don't consume a retry slot for context-resolution failures - retryAttempt--; - continue; - } - } + }), + id: String(eventId++), + }); - try { - const headers = getProviderHeaders(usedProvider, usedToken, { - webSearchEnabled: !!webSearchTool, + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), }); - headers["Content-Type"] = "application/json"; - - // Add effort beta header for Anthropic if effort parameter is specified - if (usedProvider === "anthropic" && effort !== undefined) { - const currentBeta = headers["anthropic-beta"]; - headers["anthropic-beta"] = currentBeta - ? `${currentBeta},effort-2025-11-24` - : "effort-2025-11-24"; + doneSent = true; + }; + + // Set up cancellation handling + const controller = new AbortController(); + // Set up a listener for the request being aborted + const onAbort = () => { + clearKeepalive(); + if (requestCanBeCanceled) { + canceled = true; + controller.abort(); } + }; + + // Add event listener for the abort event on the connection + c.req.raw.signal.addEventListener("abort", onAbort); + + // --- Retry loop for provider fallback --- + const routingAttempts: RoutingAttempt[] = []; + const failedProviderIds = new Set(); + let res: Response | undefined; + const finalLogId = shortid(); + for ( + let retryAttempt = 0; + retryAttempt <= MAX_RETRIES; + retryAttempt++ + ) { + const perAttemptStartTime = Date.now(); - // Add structured outputs beta header for Anthropic if json_schema response_format is specified + // Type guard: narrow variables that TypeScript widens due to loop reassignment if ( - usedProvider === "anthropic" && - response_format?.type === "json_schema" + !usedProvider || + !usedToken || + !url || + !usedModelFormatted || + !usedModelMapping ) { - const currentBeta = headers["anthropic-beta"]; - headers["anthropic-beta"] = currentBeta - ? `${currentBeta},structured-outputs-2025-11-13` - : "structured-outputs-2025-11-13"; + throw new Error("Provider context not initialized"); } - // Create a combined signal for both timeout and cancellation - const fetchSignal = createStreamingCombinedSignal( - requestCanBeCanceled ? controller : undefined, - ); - - res = await fetch(url, { - method: "POST", - headers, - body: JSON.stringify(requestBody), - signal: fetchSignal, - }); - } catch (error) { - // Clean up the event listeners - c.req.raw.signal.removeEventListener("abort", onAbort); - - // Check for timeout error first (AbortSignal.timeout throws TimeoutError) - if (isTimeoutError(error)) { - // Handle timeout error - const errorMessage = - error instanceof Error ? error.message : "Request timeout"; - const timeoutCause = extractErrorCause(error); - logger.warn("Upstream request timeout", { - error: errorMessage, - cause: timeoutCause, - usedProvider, - requestedProvider, - usedModel, - initialRequestedModel, - unifiedFinishReason: getUnifiedFinishReason( - "upstream_error", - usedProvider, - ), - }); - - // Log the timeout error in the database - const timeoutPluginIds = plugins?.map((p) => p.id) ?? []; - - // Check if we should retry before logging so we can mark the log as retried - const willRetryTimeout = shouldRetryRequest({ - requestedProvider, - noFallback, - statusCode: 0, - retryCount: retryAttempt, - remainingProviders: - (routingMetadata?.providerScores.length ?? 
0) - - failedProviderIds.size - - 1, - usedProvider, - }); + if (retryAttempt > 0) { + // Re-add abort listener (catch block removes it on error) + c.req.raw.signal.addEventListener("abort", onAbort); - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - null, // No response for timeout error - requestBody, - null, // No upstream response for timeout error - timeoutPluginIds, - undefined, // No plugin results for error case + const nextProvider = selectNextProvider( + routingMetadata?.providerScores ?? [], + failedProviderIds, + iamFilteredModelProviders, ); + if (!nextProvider) { + break; + } - await insertLog({ - ...baseLogEntry, - duration: Date.now() - perAttemptStartTime, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: 0, - content: null, - reasoningContent: null, - finishReason: "upstream_error", - promptTokens: null, - completionTokens: null, - totalTokens: null, - reasoningTokens: null, - cachedTokens: null, - hasError: true, - streamed: true, - canceled: false, - errorDetails: { - statusCode: 0, - statusText: "TimeoutError", - responseText: errorMessage, - cause: timeoutCause, - }, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - retried: willRetryTimeout, - retriedByLogId: willRetryTimeout ? finalLogId : null, - }); - - if (willRetryTimeout) { - routingAttempts.push({ - provider: usedProvider, - model: baseModelName, - ...(usedRegion && { region: usedRegion }), - status_code: 0, - error_type: getErrorType(0), - succeeded: false, - }); + // Check if the fallback candidate is rate-limited + const retryRateLimitPeek = await peekProviderRateLimit( + project.organizationId, + nextProvider.providerId, + modelInfo.id, + nextProvider.modelName, + ); + if (retryRateLimitPeek.rateLimited) { failedProviderIds.add( - providerRetryKey(usedProvider, usedRegion), + providerRetryKey( + nextProvider.providerId, + nextProvider.region, + ), ); + // Mark as rate-limited in routing metadata + const scoreEntry = routingMetadata?.providerScores.find( + (s) => s.providerId === nextProvider.providerId, + ); + if (scoreEntry) { + scoreEntry.rate_limited = true; + } + // Don't consume a retry slot for rate-limit skips + retryAttempt--; continue; } - await stream.writeSSE({ - event: "error", - data: JSON.stringify({ - error: { - message: `Upstream provider timeout: ${errorMessage}`, - type: "upstream_timeout", - code: "timeout", + try { + const ctx = await resolveProviderContext( + nextProvider, + { + mode: project.mode, + organizationId: project.organizationId, }, - }), - id: String(eventId++), - }); - return; - } else if (error instanceof Error && error.name === "AbortError") { - // Log the canceled request - // Extract plugin IDs for logging (canceled request) - const canceledPluginIds = plugins?.map((p) => p.id) ?? 
[]; - - // Calculate costs for cancelled request if billing is enabled - const billCancelled = shouldBillCancelledRequests(); - let cancelledCosts: Awaited< - ReturnType - > | null = null; - let estimatedPromptTokens: number | null = null; - - if (billCancelled) { - // Estimate prompt tokens from messages - const tokenEstimation = estimateTokens( - usedProvider, - messages, - null, - null, - null, - ); - estimatedPromptTokens = tokenEstimation.calculatedPromptTokens; - - // Calculate costs based on prompt tokens only (no completion yet) - // If web search tool was enabled, count it as 1 search for billing - cancelledCosts = await calculateCosts( - usedModel, - usedProvider, - estimatedPromptTokens, - 0, // No completion tokens yet - null, // No cached tokens { - prompt: messages - .map((m) => messageContentToString(m.content)) - .join("\n"), - completion: "", + id: organization.id, + credits: organization.credits, + devPlan: organization.devPlan, + devPlanCreditsLimit: organization.devPlanCreditsLimit, + devPlanCreditsUsed: organization.devPlanCreditsUsed, + devPlanExpiresAt: organization.devPlanExpiresAt, + }, + modelInfo, + originalRequestParams, + { + requestId, + stream: true, + effectiveStream, + messages: messages as BaseMessage[], + response_format, + tools, + tool_choice, + reasoning_effort, + reasoning_max_tokens, + effort, + webSearchTool, + image_config, + sensitive_word_check, + maxImageSizeMB, + userPlan, + hasExistingToolCalls, + customProviderName, + webSearchEnabled: !!webSearchTool, }, - null, // No reasoning tokens - 0, // No output images - undefined, - inputImageCount, - webSearchTool ? 1 : null, // Bill for web search if it was enabled - project.organizationId, ); + usedProvider = ctx.usedProvider; + usedModel = ctx.usedModel; + usedModelFormatted = ctx.usedModelFormatted; + usedModelMapping = ctx.usedModelMapping; + baseModelName = ctx.baseModelName; + usedToken = ctx.usedToken; + providerKey = ctx.providerKey; + configIndex = ctx.configIndex; + envVarName = ctx.envVarName; + url = ctx.url; + requestBody = ctx.requestBody; + useResponsesApi = ctx.useResponsesApi; + requestCanBeCanceled = ctx.requestCanBeCanceled; + isImageGeneration = ctx.isImageGeneration; + supportsReasoning = ctx.supportsReasoning; + temperature = ctx.temperature; + max_tokens = ctx.max_tokens; + top_p = ctx.top_p; + frequency_penalty = ctx.frequency_penalty; + presence_penalty = ctx.presence_penalty; + usedRegion = ctx.usedRegion; + } catch { + failedProviderIds.add( + providerRetryKey( + nextProvider.providerId, + nextProvider.region, + ), + ); + // Don't consume a retry slot for context-resolution failures + retryAttempt--; + continue; } + } - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - null, // No response for canceled request - requestBody, // The request that was sent before cancellation - null, // No upstream response for canceled request - canceledPluginIds, - undefined, // No plugin results for canceled request + try { + const headers = getProviderHeaders(usedProvider, usedToken, { + webSearchEnabled: !!webSearchTool, + }); + headers["Content-Type"] = 
"application/json"; + + // Add effort beta header for Anthropic if effort parameter is specified + if (usedProvider === "anthropic" && effort !== undefined) { + const currentBeta = headers["anthropic-beta"]; + headers["anthropic-beta"] = currentBeta + ? `${currentBeta},effort-2025-11-24` + : "effort-2025-11-24"; + } + + // Add structured outputs beta header for Anthropic if json_schema response_format is specified + if ( + usedProvider === "anthropic" && + response_format?.type === "json_schema" + ) { + const currentBeta = headers["anthropic-beta"]; + headers["anthropic-beta"] = currentBeta + ? `${currentBeta},structured-outputs-2025-11-13` + : "structured-outputs-2025-11-13"; + } + + // Create a combined signal for both timeout and cancellation + const fetchSignal = createStreamingCombinedSignal( + requestCanBeCanceled ? controller : undefined, ); - await insertLog({ - ...baseLogEntry, - duration: Date.now() - perAttemptStartTime, - timeToFirstToken: null, // Not applicable for canceled request - timeToFirstReasoningToken: null, // Not applicable for canceled request - responseSize: 0, - content: null, - reasoningContent: null, - finishReason: "canceled", - promptTokens: billCancelled - ? ( - cancelledCosts?.promptTokens ?? estimatedPromptTokens - )?.toString() - : null, - completionTokens: billCancelled ? "0" : null, - totalTokens: billCancelled - ? ( - cancelledCosts?.promptTokens ?? estimatedPromptTokens - )?.toString() - : null, - reasoningTokens: null, - cachedTokens: null, - hasError: false, - streamed: true, - canceled: true, - errorDetails: null, - inputCost: cancelledCosts?.inputCost ?? null, - outputCost: cancelledCosts?.outputCost ?? null, - cachedInputCost: cancelledCosts?.cachedInputCost ?? null, - requestCost: cancelledCosts?.requestCost ?? null, - webSearchCost: cancelledCosts?.webSearchCost ?? null, - imageInputTokens: - cancelledCosts?.imageInputTokens?.toString() ?? null, - imageOutputTokens: - cancelledCosts?.imageOutputTokens?.toString() ?? null, - imageInputCost: cancelledCosts?.imageInputCost ?? null, - imageOutputCost: cancelledCosts?.imageOutputCost ?? null, - cost: cancelledCosts?.totalCost ?? null, - estimatedCost: cancelledCosts?.estimatedCost ?? false, - discount: cancelledCosts?.discount ?? null, - dataStorageCost: billCancelled - ? calculateDataStorageCost( - cancelledCosts?.promptTokens ?? estimatedPromptTokens, - null, - 0, - null, - retentionLevel, - ) - : "0", - cached: false, - toolResults: null, + res = await fetch(url, { + method: "POST", + headers, + body: JSON.stringify(requestBody), + signal: fetchSignal, }); + } catch (error) { + // Clean up the event listeners + c.req.raw.signal.removeEventListener("abort", onAbort); + + // Check for timeout error first (AbortSignal.timeout throws TimeoutError) + if (isTimeoutError(error)) { + // Handle timeout error + const errorMessage = + error instanceof Error ? 
error.message : "Request timeout"; + const timeoutCause = extractErrorCause(error); + logger.warn("Upstream request timeout", { + error: errorMessage, + cause: timeoutCause, + usedProvider, + requestedProvider, + usedModel, + initialRequestedModel, + unifiedFinishReason: getUnifiedFinishReason( + "upstream_error", + usedProvider, + ), + }); - // Send a cancellation event to the client - await writeSSEAndCache({ - event: "canceled", - data: JSON.stringify({ - message: "Request canceled by client", - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - clearKeepalive(); - return; - } else if (error instanceof Error) { - // Handle fetch errors (timeout, connection failures, etc.) - const errorMessage = error.message; - const fetchCause = extractErrorCause(error); - logger.warn("Fetch error", { - error: errorMessage, - cause: fetchCause, - usedProvider, - requestedProvider, - usedModel, - initialRequestedModel, - unifiedFinishReason: getUnifiedFinishReason( - "upstream_error", + // Check if we should retry before logging so we can mark the log as retried + const willRetryTimeout = shouldRetryRequest({ + requestedProvider, + noFallback, + statusCode: 0, + retryCount: retryAttempt, + remainingProviders: + (routingMetadata?.providerScores.length ?? 0) - + failedProviderIds.size - + 1, usedProvider, - ), - }); + }); - // Log the error in the database - // Extract plugin IDs for logging (fetch error) - const fetchErrorPluginIds = plugins?.map((p) => p.id) ?? []; + enqueueChatLog( + c, + { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + rawResponse: null, + upstreamRequest: requestBody, + upstreamResponse: null, + plugins: requestPluginIds, + pluginResults: undefined, + }, + { + duration: Date.now() - perAttemptStartTime, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "upstream_error", + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: true, + streamed: true, + canceled: false, + errorDetails: { + statusCode: 0, + statusText: "TimeoutError", + responseText: errorMessage, + cause: timeoutCause, + }, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + retried: willRetryTimeout, + retriedByLogId: willRetryTimeout ? finalLogId : null, + }, + ); - // Check if we should retry before logging so we can mark the log as retried - const willRetryFetch = shouldRetryRequest({ - requestedProvider, - noFallback, - statusCode: 0, - retryCount: retryAttempt, - remainingProviders: - (routingMetadata?.providerScores.length ?? 
0) - - failedProviderIds.size - - 1, - usedProvider, - }); - - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - null, // No response for fetch error - requestBody, // The request that resulted in error - null, // No upstream response for fetch error - fetchErrorPluginIds, - undefined, // No plugin results for error case - ); - - await insertLog({ - ...baseLogEntry, - duration: Date.now() - perAttemptStartTime, - timeToFirstToken: null, // Not applicable for error case - timeToFirstReasoningToken: null, // Not applicable for error case - responseSize: 0, - content: null, - reasoningContent: null, - finishReason: "upstream_error", - promptTokens: null, - completionTokens: null, - totalTokens: null, - reasoningTokens: null, - cachedTokens: null, - hasError: true, - streamed: true, - canceled: false, - errorDetails: { - statusCode: 0, - statusText: error.name, - responseText: errorMessage, - cause: fetchCause, - }, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - retried: willRetryFetch, - retriedByLogId: willRetryFetch ? finalLogId : null, - }); - - // Report key health for environment-based tokens - if (envVarName !== undefined) { - reportKeyError(envVarName, configIndex, 0); - } + if (willRetryTimeout) { + routingAttempts.push({ + provider: usedProvider, + model: baseModelName, + ...(usedRegion && { region: usedRegion }), + status_code: 0, + error_type: getErrorType(0), + succeeded: false, + }); + failedProviderIds.add( + providerRetryKey(usedProvider, usedRegion), + ); + continue; + } - if (willRetryFetch) { - routingAttempts.push({ - provider: usedProvider, - model: baseModelName, - ...(usedRegion && { region: usedRegion }), - status_code: 0, - error_type: getErrorType(0), - succeeded: false, + await stream.writeSSE({ + event: "error", + data: JSON.stringify({ + error: { + message: `Upstream provider timeout: ${errorMessage}`, + type: "upstream_timeout", + code: "timeout", + }, + }), + id: String(eventId++), }); - failedProviderIds.add( - providerRetryKey(usedProvider, usedRegion), - ); - continue; - } + return; + } else if ( + error instanceof Error && + error.name === "AbortError" + ) { + // Calculate costs for cancelled request if billing is enabled + const billCancelled = shouldBillCancelledRequests(); + let cancelledCosts: Awaited< + ReturnType + > | null = null; + let estimatedPromptTokens: number | null = null; + + if (billCancelled) { + // Estimate prompt tokens from messages + const tokenEstimation = estimateTokens( + usedProvider, + messages, + null, + null, + null, + ); + estimatedPromptTokens = + tokenEstimation.calculatedPromptTokens; - // Send error event to the client - await writeSSEAndCache({ - event: "error", - data: JSON.stringify({ - error: { - message: `Failed to connect to provider: ${errorMessage}`, - type: "upstream_error", - code: "fetch_failed", - }, - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: 
"[DONE]", - id: String(eventId++), - }); - clearKeepalive(); - return; - } else { - throw error; - } - } + // Calculate costs based on prompt tokens only (no completion yet) + // If web search tool was enabled, count it as 1 search for billing + cancelledCosts = await calculateCosts( + usedModel, + usedProvider, + estimatedPromptTokens, + 0, // No completion tokens yet + null, // No cached tokens + { + prompt: messages + .map((m) => messageContentToString(m.content)) + .join("\n"), + completion: "", + }, + null, // No reasoning tokens + 0, // No output images + undefined, + inputImageCount, + webSearchTool ? 1 : null, // Bill for web search if it was enabled + project.organizationId, + ); + } - if (!res.ok) { - const rawErrorResponseText = await res.text(); - const errorResponseText = - usedProvider === "aws-bedrock" - ? extractAwsBedrockHttpError(res, rawErrorResponseText) - : rawErrorResponseText; - - // Determine the finish reason for error handling - const finishReason = getFinishReasonFromError( - res.status, - errorResponseText, - ); + enqueueChatLog( + c, + { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + rawResponse: null, + upstreamRequest: requestBody, + upstreamResponse: null, + plugins: requestPluginIds, + pluginResults: undefined, + }, + { + duration: Date.now() - perAttemptStartTime, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "canceled", + promptTokens: billCancelled + ? ( + cancelledCosts?.promptTokens ?? estimatedPromptTokens + )?.toString() + : null, + completionTokens: billCancelled ? "0" : null, + totalTokens: billCancelled + ? ( + cancelledCosts?.promptTokens ?? estimatedPromptTokens + )?.toString() + : null, + reasoningTokens: null, + cachedTokens: null, + hasError: false, + streamed: true, + canceled: true, + errorDetails: null, + inputCost: cancelledCosts?.inputCost ?? null, + outputCost: cancelledCosts?.outputCost ?? null, + cachedInputCost: cancelledCosts?.cachedInputCost ?? null, + requestCost: cancelledCosts?.requestCost ?? null, + webSearchCost: cancelledCosts?.webSearchCost ?? null, + imageInputTokens: + cancelledCosts?.imageInputTokens?.toString() ?? null, + imageOutputTokens: + cancelledCosts?.imageOutputTokens?.toString() ?? null, + imageInputCost: cancelledCosts?.imageInputCost ?? null, + imageOutputCost: cancelledCosts?.imageOutputCost ?? null, + cost: cancelledCosts?.totalCost ?? null, + estimatedCost: cancelledCosts?.estimatedCost ?? false, + discount: cancelledCosts?.discount ?? null, + dataStorageCost: billCancelled + ? calculateDataStorageCost( + cancelledCosts?.promptTokens ?? 
estimatedPromptTokens, + null, + 0, + null, + retentionLevel, + ) + : "0", + cached: false, + toolResults: null, + }, + ); - if ( - finishReason !== "client_error" && - finishReason !== "content_filter" - ) { - logger.warn("Provider error", { - status: res.status, - errorText: errorResponseText, - usedProvider, - requestedProvider, - usedModel, - initialRequestedModel, - organizationId: project.organizationId, - projectId: apiKey.projectId, - apiKeyId: apiKey.id, - unifiedFinishReason: getUnifiedFinishReason( - finishReason, + // Send a cancellation event to the client + await writeSSEAndCache({ + event: "canceled", + data: JSON.stringify({ + message: "Request canceled by client", + }), + id: String(eventId++), + }); + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + clearKeepalive(); + return; + } else if (error instanceof Error) { + // Handle fetch errors (timeout, connection failures, etc.) + const errorMessage = error.message; + const fetchCause = extractErrorCause(error); + logger.warn("Fetch error", { + error: errorMessage, + cause: fetchCause, usedProvider, - ), - }); - } - - // Log the request in the database - // Extract plugin IDs for logging - const streamingErrorPluginIds = plugins?.map((p) => p.id) ?? []; - - // Check if we should retry before logging so we can mark the log as retried - const willRetryHttpError = shouldRetryRequest({ - requestedProvider, - noFallback, - statusCode: res.status, - retryCount: retryAttempt, - remainingProviders: - (routingMetadata?.providerScores.length ?? 0) - - failedProviderIds.size - - 1, - usedProvider, - }); + requestedProvider, + usedModel, + initialRequestedModel, + unifiedFinishReason: getUnifiedFinishReason( + "upstream_error", + usedProvider, + ), + }); - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - null, // No response for error case - requestBody, // The request that was sent and resulted in error - null, // No upstream response for error case - streamingErrorPluginIds, - undefined, // No plugin results for error case - ); + // Check if we should retry before logging so we can mark the log as retried + const willRetryFetch = shouldRetryRequest({ + requestedProvider, + noFallback, + statusCode: 0, + retryCount: retryAttempt, + remainingProviders: + (routingMetadata?.providerScores.length ?? 0) - + failedProviderIds.size - + 1, + usedProvider, + }); - await insertLog({ - ...baseLogEntry, - duration: Date.now() - perAttemptStartTime, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: errorResponseText.length, - content: null, - reasoningContent: null, - finishReason, - promptTokens: - finishReason === "content_filter" - ? ( - estimateTokens(usedProvider, messages, null, null, 0) - .calculatedPromptTokens ?? null - )?.toString() - : null, - completionTokens: null, - totalTokens: - finishReason === "content_filter" - ? ( - estimateTokens(usedProvider, messages, null, null, 0) - .calculatedPromptTokens ?? 
null - )?.toString() - : null, - reasoningTokens: null, - cachedTokens: null, - hasError: finishReason !== "content_filter", // content_filter is not an error - streamed: true, - canceled: false, - errorDetails: - finishReason === "content_filter" - ? null - : { - statusCode: res.status, - statusText: res.statusText, - responseText: errorResponseText, + enqueueChatLog( + c, + { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + rawResponse: null, + upstreamRequest: requestBody, + upstreamResponse: null, + plugins: requestPluginIds, + pluginResults: undefined, + }, + { + duration: Date.now() - perAttemptStartTime, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "upstream_error", + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: true, + streamed: true, + canceled: false, + errorDetails: { + statusCode: 0, + statusText: error.name, + responseText: errorMessage, + cause: fetchCause, }, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - retried: willRetryHttpError, - retriedByLogId: willRetryHttpError ? finalLogId : null, - }); + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + retried: willRetryFetch, + retriedByLogId: willRetryFetch ? 
finalLogId : null, + }, + ); - // Report key health for environment-based tokens - // Don't report content_filter as a key error - it's intentional provider behavior - if (envVarName !== undefined && finishReason !== "content_filter") { - reportKeyError( - envVarName, - configIndex, - res.status, - errorResponseText, - ); - } + // Report key health for environment-based tokens + if (envVarName !== undefined) { + reportKeyError(envVarName, configIndex, 0); + } - if (willRetryHttpError) { - routingAttempts.push({ - provider: usedProvider, - model: baseModelName, - ...(usedRegion && { region: usedRegion }), - status_code: res.status, - error_type: getErrorType(res.status), - succeeded: false, - }); - failedProviderIds.add(providerRetryKey(usedProvider, usedRegion)); - continue; - } + if (willRetryFetch) { + routingAttempts.push({ + provider: usedProvider, + model: baseModelName, + ...(usedRegion && { region: usedRegion }), + status_code: 0, + error_type: getErrorType(0), + succeeded: false, + }); + failedProviderIds.add( + providerRetryKey(usedProvider, usedRegion), + ); + continue; + } - // For content_filter, return a proper completion chunk (not an error) - // This handles Azure ResponsibleAIPolicyViolation and similar content filtering errors - if (finishReason === "content_filter") { - await writeStreamingContentFilterResponse({ - billingModel: usedModel, - billingProvider: usedProvider, - responseModel: `${usedProvider}/${baseModelName}`, - metadata: { - requested_model: initialRequestedModel, - requested_provider: requestedProvider, - used_model: baseModelName, - used_provider: usedProvider, - ...(usedRegion && { used_region: usedRegion }), - underlying_used_model: usedModel, - }, - }); - } else { - // For client errors, return the original provider error response - let errorData; - if (finishReason === "client_error") { - try { - errorData = JSON.parse(errorResponseText); - } catch { - // If we can't parse the original error, fall back to our format - errorData = { + // Send error event to the client + await writeSSEAndCache({ + event: "error", + data: JSON.stringify({ error: { - message: `Error from provider: ${res.status} ${res.statusText} ${errorResponseText}`, - type: finishReason, - param: null, - code: finishReason, - responseText: errorResponseText, + message: `Failed to connect to provider: ${errorMessage}`, + type: "upstream_error", + code: "fetch_failed", }, - }; - } + }), + id: String(eventId++), + }); + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + clearKeepalive(); + return; } else { - errorData = { - error: { - message: `Error from provider: ${res.status} ${res.statusText} ${errorResponseText}`, - type: finishReason, - param: null, - code: finishReason, - responseText: errorResponseText, - }, - }; + throw error; } - - await writeSSEAndCache({ - event: "error", - data: JSON.stringify(errorData), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); } - clearKeepalive(); - return; - } + if (!res.ok) { + const rawErrorResponseText = await res.text(); + const errorResponseText = + usedProvider === "aws-bedrock" + ? 
extractAwsBedrockHttpError(res, rawErrorResponseText) + : rawErrorResponseText; - break; // Fetch succeeded, exit retry loop - } // End of retry for loop - - // Add the final attempt (successful or last failed) to routing - if (res && res.ok && usedProvider) { - routingAttempts.push({ - provider: usedProvider, - model: baseModelName, - ...(usedRegion && { region: usedRegion }), - status_code: res.status, - error_type: "none", - succeeded: true, - }); - } + // Determine the finish reason for error handling + const finishReason = getFinishReasonFromError( + res.status, + errorResponseText, + ); - // Update routingMetadata with all routing attempts for DB logging - if (routingMetadata) { - // Enrich providerScores with failure info from routing attempts - const failedMap = new Map( - routingAttempts - .filter((a) => !a.succeeded) - .map((f) => [f.provider, f]), - ); - routingMetadata = { - ...routingMetadata, - routing: routingAttempts, - providerScores: routingMetadata.providerScores.map((score) => { - const failure = failedMap.get(score.providerId); - if (failure) { - return { - ...score, - failed: true, - status_code: failure.status_code, - error_type: failure.error_type, - }; + if ( + finishReason !== "client_error" && + finishReason !== "content_filter" + ) { + logger.warn("Provider error", { + status: res.status, + errorText: errorResponseText, + usedProvider, + requestedProvider, + usedModel, + initialRequestedModel, + organizationId: project.organizationId, + projectId: apiKey.projectId, + apiKeyId: apiKey.id, + unifiedFinishReason: getUnifiedFinishReason( + finishReason, + usedProvider, + ), + }); } - return score; - }), - }; - } - - // If all retries exhausted without a successful response - if (!res || !res.ok) { - await writeSSEAndCache({ - event: "error", - data: JSON.stringify({ - error: { - message: "All provider attempts failed", - type: "upstream_error", - code: "all_providers_failed", - }, - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - clearKeepalive(); - return; - } - // After retry loop: narrow provider variables for the rest of the streaming body - if ( - !usedProvider || - !usedToken || - !url || - !usedModelFormatted || - !usedModelMapping - ) { - throw new Error("Provider context not initialized"); - } - - if (!res.body) { - await writeSSEAndCache({ - event: "error", - data: JSON.stringify({ - error: { - message: "No response body from provider", - type: "gateway_error", - param: null, - code: "gateway_error", - }, - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - clearKeepalive(); - return; - } + // Check if we should retry before logging so we can mark the log as retried + const willRetryHttpError = shouldRetryRequest({ + requestedProvider, + noFallback, + statusCode: res.status, + retryCount: retryAttempt, + remainingProviders: + (routingMetadata?.providerScores.length ?? 
0) - + failedProviderIds.size - + 1, + usedProvider, + }); - const reader = res.body.getReader(); - let fullContent = ""; - let fullReasoningContent = ""; - let finishReason = null; - let promptTokens = null; - let completionTokens = null; - let totalTokens = null; - let reasoningTokens = null; - let cachedTokens = null; - let streamingToolCalls = null; - let imageByteSize = 0; // Track total image data size for token estimation - let outputImageCount = 0; // Track number of output images for cost calculation - let webSearchCount = 0; // Track web search calls for cost calculation - const serverToolUseIndices = new Set(); // Track Anthropic server_tool_use block indices - let sawUpstreamDoneSentinel = false; - let sawProviderTerminalEvent = false; - let sawOpenAiResponsesDoneEvent = false; - let sawOpenAiResponsesCompletedStatus = false; - let sentDownstreamFinishReasonChunk = false; - let handledTerminalProviderEvent = false; - let buffer = ""; // Buffer for accumulating partial data across chunks (string for SSE) - let binaryBuffer = new Uint8Array(0); // Buffer for binary event streams (AWS Bedrock) - let rawUpstreamData = ""; // Raw data received from upstream provider - const isAwsBedrock = usedProvider === "aws-bedrock"; - let shouldTerminateStream = false; - - // Response healing for streaming mode - const streamingResponseHealingEnabled = plugins?.some( - (p) => p.id === "response-healing", - ); - const streamingIsJsonResponseFormat = - response_format?.type === "json_object" || - response_format?.type === "json_schema"; - const shouldBufferForHealing = - streamingIsJsonResponseFormat && - (streamingResponseHealingEnabled === true || - usedProvider === "novita" || - usedProvider === "minimax"); - - // Buffer for storing chunks when healing is enabled - // We need to buffer content, track last chunk info, and replay healed content at the end - const bufferedContentChunks: string[] = []; - let lastChunkId: string | null = null; - let lastChunkModel: string | null = null; - let lastChunkCreated: number | null = null; - const streamingPluginResults: { - responseHealing?: { - healed: boolean; - healingMethod?: string; - }; - } = {}; + enqueueChatLog( + c, + { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + rawResponse: null, + upstreamRequest: requestBody, + upstreamResponse: null, + plugins: requestPluginIds, + pluginResults: undefined, + }, + { + duration: Date.now() - perAttemptStartTime, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: errorResponseText.length, + content: null, + reasoningContent: null, + finishReason, + promptTokens: + finishReason === "content_filter" + ? ( + estimateTokens(usedProvider, messages, null, null, 0) + .calculatedPromptTokens ?? null + )?.toString() + : null, + completionTokens: null, + totalTokens: + finishReason === "content_filter" + ? ( + estimateTokens(usedProvider, messages, null, null, 0) + .calculatedPromptTokens ?? 
null + )?.toString() + : null, + reasoningTokens: null, + cachedTokens: null, + hasError: finishReason !== "content_filter", + streamed: true, + canceled: false, + errorDetails: + finishReason === "content_filter" + ? null + : { + statusCode: res.status, + statusText: res.statusText, + responseText: errorResponseText, + }, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + retried: willRetryHttpError, + retriedByLogId: willRetryHttpError ? finalLogId : null, + }, + ); - try { - while (true) { - const { done, value } = await reader.read(); - if (done) { - break; - } + // Report key health for environment-based tokens + // Don't report content_filter as a key error - it's intentional provider behavior + if ( + envVarName !== undefined && + finishReason !== "content_filter" + ) { + reportKeyError( + envVarName, + configIndex, + res.status, + errorResponseText, + ); + } - // For AWS Bedrock, convert binary event stream to SSE format - let chunk: string; - if (isAwsBedrock) { - // Append binary data to buffer - const newBuffer = new Uint8Array( - binaryBuffer.length + value.length, - ); - newBuffer.set(binaryBuffer); - newBuffer.set(value, binaryBuffer.length); - binaryBuffer = newBuffer; - - // Parse and convert available events - const { sse, bytesConsumed } = - convertAwsEventStreamToSSE(binaryBuffer); - chunk = sse; - - // Remove consumed bytes from binary buffer - if (bytesConsumed > 0) { - binaryBuffer = binaryBuffer.slice(bytesConsumed); + if (willRetryHttpError) { + routingAttempts.push({ + provider: usedProvider, + model: baseModelName, + ...(usedRegion && { region: usedRegion }), + status_code: res.status, + error_type: getErrorType(res.status), + succeeded: false, + }); + failedProviderIds.add( + providerRetryKey(usedProvider, usedRegion), + ); + continue; } - } else { - // Convert the Uint8Array to a string for SSE - chunk = sharedTextDecoder.decode(value, { stream: true }); - } - - // Log error on large chunks (1MB+) - should almost never happen - if (chunk.length > 1024 * 1024) { - logger.error( - `Large chunk received: ${(chunk.length / 1024 / 1024).toFixed(2)}MB`, - ); - } - - buffer += chunk; - // Collect raw upstream data for logging only in debug mode and within size limit - if (debugMode && rawUpstreamData.length < MAX_RAW_DATA_SIZE) { - rawUpstreamData += chunk; - } - // Check buffer size to prevent memory exhaustion - if (buffer.length > MAX_BUFFER_SIZE) { - const bufferSizeMB = MAX_BUFFER_SIZE / 1024 / 1024; - logger.error( - `Buffer size exceeded ${bufferSizeMB}MB limit, aborting stream`, - ); - - // Send error to client - try { - await stream.writeSSE({ - event: "error", - data: JSON.stringify({ + // For content_filter, return a proper completion chunk (not an error) + // This handles Azure ResponsibleAIPolicyViolation and similar content filtering errors + if (finishReason === "content_filter") { + await writeStreamingContentFilterResponse({ + billingModel: usedModel, + billingProvider: usedProvider, + responseModel: `${usedProvider}/${baseModelName}`, + metadata: { + requested_model: initialRequestedModel, + requested_provider: requestedProvider, + used_model: baseModelName, + used_provider: usedProvider, + ...(usedRegion && { used_region: usedRegion }), + underlying_used_model: usedModel, + }, + }); + } else { + // For client errors, return the original provider error 
response + let errorData; + if (finishReason === "client_error") { + try { + errorData = JSON.parse(errorResponseText); + } catch { + // If we can't parse the original error, fall back to our format + errorData = { + error: { + message: `Error from provider: ${res.status} ${res.statusText} ${errorResponseText}`, + type: finishReason, + param: null, + code: finishReason, + responseText: errorResponseText, + }, + }; + } + } else { + errorData = { error: { - message: `Streaming buffer exceeded ${bufferSizeMB}MB limit`, - type: "gateway_error", + message: `Error from provider: ${res.status} ${res.statusText} ${errorResponseText}`, + type: finishReason, param: null, - code: "buffer_overflow", + code: finishReason, + responseText: errorResponseText, }, - }), + }; + } + + await writeSSEAndCache({ + event: "error", + data: JSON.stringify(errorData), id: String(eventId++), }); - await stream.writeSSE({ + await writeSSEAndCache({ event: "done", data: "[DONE]", id: String(eventId++), }); - doneSent = true; - } catch (sseError) { - logger.error( - "Failed to send buffer overflow error SSE", - sseError instanceof Error - ? sseError - : new Error(String(sseError)), - ); } - // Set error for logging - streamingError = { - message: `Streaming buffer exceeded ${bufferSizeMB}MB limit`, - type: "buffer_overflow", - code: "buffer_overflow", - details: { - bufferSize: buffer.length, - maxBufferSize: MAX_BUFFER_SIZE, - provider: usedProvider, - model: usedModel, + clearKeepalive(); + return; + } + + break; // Fetch succeeded, exit retry loop + } // End of retry for loop + + // Add the final attempt (successful or last failed) to routing + if (res && res.ok && usedProvider) { + routingAttempts.push({ + provider: usedProvider, + model: baseModelName, + ...(usedRegion && { region: usedRegion }), + status_code: res.status, + error_type: "none", + succeeded: true, + }); + } + + // Update routingMetadata with all routing attempts for DB logging + if (routingMetadata) { + // Enrich providerScores with failure info from routing attempts + const failedMap = new Map( + routingAttempts + .filter((a) => !a.succeeded) + .map((f) => [f.provider, f]), + ); + routingMetadata = { + ...routingMetadata, + routing: routingAttempts, + providerScores: routingMetadata.providerScores.map((score) => { + const failure = failedMap.get(score.providerId); + if (failure) { + return { + ...score, + failed: true, + status_code: failure.status_code, + error_type: failure.error_type, + }; + } + return score; + }), + }; + } + + // If all retries exhausted without a successful response + if (!res || !res.ok) { + await writeSSEAndCache({ + event: "error", + data: JSON.stringify({ + error: { + message: "All provider attempts failed", + type: "upstream_error", + code: "all_providers_failed", }, - }; + }), + id: String(eventId++), + }); + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + clearKeepalive(); + return; + } - break; - } + // After retry loop: narrow provider variables for the rest of the streaming body + if ( + !usedProvider || + !usedToken || + !url || + !usedModelFormatted || + !usedModelMapping + ) { + throw new Error("Provider context not initialized"); + } + + if (!res.body) { + await writeSSEAndCache({ + event: "error", + data: JSON.stringify({ + error: { + message: "No response body from provider", + type: "gateway_error", + param: null, + code: "gateway_error", + }, + }), + id: String(eventId++), + }); + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), 
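            // Illustrative note (not part of this patch): on this error path the
            // client ultimately observes two terminal SSE frames. Assuming the
            // standard EventSource-style framing produced by writeSSEAndCache,
            // the wire format is roughly:
            //
            //   event: error
            //   data: {"error":{"message":"Error from provider: ...","type":"client_error","code":"client_error","responseText":"..."}}
            //
            //   event: done
            //   data: [DONE]
            //
            // Clients should treat the "[DONE]" sentinel, not connection close,
            // as the end-of-stream signal, since the connection stays open until
            // clearKeepalive() runs below.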
+ }); + clearKeepalive(); + return; + } - // Process SSE events from buffer - let processedLength = 0; - const bufferCopy = buffer; + const reader = res.body.getReader(); + let fullContent = ""; + let fullReasoningContent = ""; + let finishReason = null; + let promptTokens = null; + let completionTokens = null; + let totalTokens = null; + let reasoningTokens = null; + let cachedTokens = null; + let streamingToolCalls = null; + let imageByteSize = 0; // Track total image data size for token estimation + let outputImageCount = 0; // Track number of output images for cost calculation + let webSearchCount = 0; // Track web search calls for cost calculation + const serverToolUseIndices = new Set(); // Track Anthropic server_tool_use block indices + let sawUpstreamDoneSentinel = false; + let sawProviderTerminalEvent = false; + let handledTerminalProviderEvent = false; + let buffer = ""; // Buffer for accumulating partial data across chunks (string for SSE) + let binaryBuffer = new Uint8Array(0); // Buffer for binary event streams (AWS Bedrock) + let rawUpstreamData = ""; // Raw data received from upstream provider + const isAwsBedrock = usedProvider === "aws-bedrock"; + let shouldTerminateStream = false; + + // Response healing for streaming mode + const streamingResponseHealingEnabled = plugins?.some( + (p) => p.id === "response-healing", + ); + const streamingIsJsonResponseFormat = + response_format?.type === "json_object" || + response_format?.type === "json_schema"; + const shouldBufferForHealing = + streamingIsJsonResponseFormat && + (streamingResponseHealingEnabled === true || + usedProvider === "novita" || + usedProvider === "minimax"); + + // Buffer for storing chunks when healing is enabled + // We need to buffer content, track last chunk info, and replay healed content at the end + const bufferedContentChunks: string[] = []; + let lastChunkId: string | null = null; + let lastChunkModel: string | null = null; + let lastChunkCreated: number | null = null; + const streamingPluginResults: { + responseHealing?: { + healed: boolean; + healingMethod?: string; + }; + } = {}; - // Look for complete SSE events, handling events at buffer start - let searchStart = 0; - while (searchStart < bufferCopy.length) { - // Find "data: " - could be at start of buffer or after newline - let dataIndex = -1; + try { + while (true) { + const { done, value } = await reader.read(); + if (done) { + break; + } - if (searchStart === 0 && bufferCopy.startsWith("data: ")) { - // Event at buffer start - dataIndex = 0; - } else { - // Look for "\ndata: " pattern - const newlineDataIndex = bufferCopy.indexOf( - "\ndata: ", - searchStart, + // For AWS Bedrock, convert binary event stream to SSE format + let chunk: string; + if (isAwsBedrock) { + // Append binary data to buffer + const newBuffer = new Uint8Array( + binaryBuffer.length + value.length, ); - if (newlineDataIndex !== -1) { - dataIndex = newlineDataIndex + 1; // Skip the newline + newBuffer.set(binaryBuffer); + newBuffer.set(value, binaryBuffer.length); + binaryBuffer = newBuffer; + + // Parse and convert available events + const { sse, bytesConsumed } = + convertAwsEventStreamToSSE(binaryBuffer); + chunk = sse; + + // Remove consumed bytes from binary buffer + if (bytesConsumed > 0) { + binaryBuffer = binaryBuffer.slice(bytesConsumed); } + } else { + // Convert the Uint8Array to a string for SSE + chunk = sharedTextDecoder.decode(value, { stream: true }); + } + + // Log error on large chunks (1MB+) - should almost never happen + if (chunk.length > 1024 * 
1024) { + logger.error( + `Large chunk received: ${(chunk.length / 1024 / 1024).toFixed(2)}MB`, + ); + } + + buffer += chunk; + // Collect raw upstream data for logging only in debug mode and within size limit + if (debugMode && rawUpstreamData.length < MAX_RAW_DATA_SIZE) { + rawUpstreamData += chunk; } - if (dataIndex === -1) { + // Check buffer size to prevent memory exhaustion + if (buffer.length > MAX_BUFFER_SIZE) { + const bufferSizeMB = MAX_BUFFER_SIZE / 1024 / 1024; + logger.error( + `Buffer size exceeded ${bufferSizeMB}MB limit, aborting stream`, + ); + + // Send error to client + try { + await stream.writeSSE({ + event: "error", + data: JSON.stringify({ + error: { + message: `Streaming buffer exceeded ${bufferSizeMB}MB limit`, + type: "gateway_error", + param: null, + code: "buffer_overflow", + }, + }), + id: String(eventId++), + }); + await stream.writeSSE({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } catch (sseError) { + logger.error( + "Failed to send buffer overflow error SSE", + sseError instanceof Error + ? sseError + : new Error(String(sseError)), + ); + } + + // Set error for logging + streamingError = { + message: `Streaming buffer exceeded ${bufferSizeMB}MB limit`, + type: "buffer_overflow", + code: "buffer_overflow", + details: { + bufferSize: buffer.length, + maxBufferSize: MAX_BUFFER_SIZE, + provider: usedProvider, + model: usedModel, + }, + }; + break; } - // Find the end of this SSE event - // Look for next event or proper event termination - let eventEnd = -1; + // Process SSE events from buffer + let processedLength = 0; + const bufferCopy = buffer; - // First, look for the next "data: " event (after a newline) - const nextEventIndex = bufferCopy.indexOf( - "\ndata: ", - dataIndex + 6, - ); - if (nextEventIndex !== -1) { - // Found next data event, but we still need to check if there are SSE fields in between - // For Anthropic, we might have: data: {...}\n\nevent: something\n\ndata: {...} - const betweenEvents = bufferCopy.slice( + // Look for complete SSE events, handling events at buffer start + let searchStart = 0; + while (searchStart < bufferCopy.length) { + // Find "data: " - could be at start of buffer or after newline + let dataIndex = -1; + + if (searchStart === 0 && bufferCopy.startsWith("data: ")) { + // Event at buffer start + dataIndex = 0; + } else { + // Look for "\ndata: " pattern + const newlineDataIndex = bufferCopy.indexOf( + "\ndata: ", + searchStart, + ); + if (newlineDataIndex !== -1) { + dataIndex = newlineDataIndex + 1; // Skip the newline + } + } + + if (dataIndex === -1) { + break; + } + + // Find the end of this SSE event + // Look for next event or proper event termination + let eventEnd = -1; + + // First, look for the next "data: " event (after a newline) + const nextEventIndex = bufferCopy.indexOf( + "\ndata: ", dataIndex + 6, - nextEventIndex, ); - const firstNewline = betweenEvents.indexOf("\n"); - - if (firstNewline !== -1) { - // Check if JSON up to first newline is valid - const jsonCandidate = betweenEvents - .slice(0, firstNewline) - .trim(); - // Quick heuristic check before expensive JSON.parse - let isValidJson = false; - if (mightBeCompleteJson(jsonCandidate)) { - try { - JSON.parse(jsonCandidate); - isValidJson = true; - } catch { - // JSON is not complete + if (nextEventIndex !== -1) { + // Found next data event, but we still need to check if there are SSE fields in between + // For Anthropic, we might have: data: {...}\n\nevent: something\n\ndata: {...} + const betweenEvents 
= bufferCopy.slice( + dataIndex + 6, + nextEventIndex, + ); + const firstNewline = betweenEvents.indexOf("\n"); + + if (firstNewline !== -1) { + // Check if JSON up to first newline is valid + const jsonCandidate = betweenEvents + .slice(0, firstNewline) + .trim(); + // Quick heuristic check before expensive JSON.parse + let isValidJson = false; + if (mightBeCompleteJson(jsonCandidate)) { + try { + JSON.parse(jsonCandidate); + isValidJson = true; + } catch { + // JSON is not complete + } + } + if (isValidJson) { + // JSON is valid - end at first newline to exclude SSE fields + eventEnd = dataIndex + 6 + firstNewline; + } else { + // JSON is not complete, use the full segment to next data event + eventEnd = nextEventIndex; } - } - if (isValidJson) { - // JSON is valid - end at first newline to exclude SSE fields - eventEnd = dataIndex + 6 + firstNewline; } else { - // JSON is not complete, use the full segment to next data event + // No newline found, use full segment eventEnd = nextEventIndex; } } else { - // No newline found, use full segment - eventEnd = nextEventIndex; - } - } else { - // No next event found - check for proper event termination - // SSE events should end with at least one newline - const eventStartPos = dataIndex + 6; // Start of event data - - // For Anthropic SSE format, we need to be more careful about event boundaries - // Try to find the end of the JSON data by looking for the closing brace - const newlinePos = bufferCopy.indexOf("\n", eventStartPos); - if (newlinePos !== -1) { - // We found a newline - check if the JSON before it is valid - const jsonCandidate = bufferCopy - .slice(eventStartPos, newlinePos) - .trim(); - // Quick heuristic check before expensive JSON.parse - let isValidJson = false; - if (mightBeCompleteJson(jsonCandidate)) { - try { - JSON.parse(jsonCandidate); - isValidJson = true; - } catch { - // JSON is not complete + // No next event found - check for proper event termination + // SSE events should end with at least one newline + const eventStartPos = dataIndex + 6; // Start of event data + + // For Anthropic SSE format, we need to be more careful about event boundaries + // Try to find the end of the JSON data by looking for the closing brace + const newlinePos = bufferCopy.indexOf("\n", eventStartPos); + if (newlinePos !== -1) { + // We found a newline - check if the JSON before it is valid + const jsonCandidate = bufferCopy + .slice(eventStartPos, newlinePos) + .trim(); + // Quick heuristic check before expensive JSON.parse + let isValidJson = false; + if (mightBeCompleteJson(jsonCandidate)) { + try { + JSON.parse(jsonCandidate); + isValidJson = true; + } catch { + // JSON is not complete + } } - } - if (isValidJson) { - // JSON is valid - this newline marks the end of our data - eventEnd = newlinePos; - } else { - // JSON is not valid, check if there's more content after the newline - if (newlinePos + 1 >= bufferCopy.length) { - // Newline is at the end of buffer - event is incomplete - break; + if (isValidJson) { + // JSON is valid - this newline marks the end of our data + eventEnd = newlinePos; } else { - // There's content after the newline - // Check if it's another SSE field (like event:, id:, retry:, etc.) or if the event continues - const restOfBuffer = bufferCopy.slice(newlinePos + 1); - - // Check for SSE field patterns (event:, id:, retry:, etc.) 
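                  // Illustrative sketch (not part of this patch): the boundary
                  // scan above calls mightBeCompleteJson before every JSON.parse
                  // attempt so the gateway avoids repeatedly parsing partial
                  // events on each read. The real helper is defined elsewhere in
                  // this repo and may differ; a minimal stand-in with the same
                  // intent (mightBeCompleteJsonSketch is a hypothetical name)
                  // could look like this:
                  const mightBeCompleteJsonSketch = (
                    candidate: string,
                  ): boolean => {
                    if (candidate.length < 2) {
                      return false;
                    }
                    const first = candidate[0];
                    const last = candidate[candidate.length - 1];
                    // A complete JSON payload must open and close with matching
                    // brackets; truncated streaming events almost always fail
                    // this cheap check, so JSON.parse is skipped for them.
                    return (
                      (first === "{" && last === "}") ||
                      (first === "[" && last === "]")
                    );
                  };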
- // Skip leading newlines efficiently without creating new strings - let trimStart = 0; - while ( - trimStart < restOfBuffer.length && - restOfBuffer[trimStart] === "\n" - ) { - trimStart++; - } + // JSON is not valid, check if there's more content after the newline + if (newlinePos + 1 >= bufferCopy.length) { + // Newline is at the end of buffer - event is incomplete + break; + } else { + // There's content after the newline + // Check if it's another SSE field (like event:, id:, retry:, etc.) or if the event continues + const restOfBuffer = bufferCopy.slice(newlinePos + 1); + + // Check for SSE field patterns (event:, id:, retry:, etc.) + // Skip leading newlines efficiently without creating new strings + let trimStart = 0; + while ( + trimStart < restOfBuffer.length && + restOfBuffer[trimStart] === "\n" + ) { + trimStart++; + } - if ( - restOfBuffer.startsWith("\n") || // Empty line - end of event - restOfBuffer.startsWith("data: ") // Next data field - ) { - // This is the end of our data event - eventEnd = newlinePos; - } else if (trimStart > 0) { - // Had leading newlines - check for SSE fields after them - const afterNewlines = restOfBuffer.substring(trimStart); if ( - afterNewlines.startsWith("event:") || - afterNewlines.startsWith("id:") || - afterNewlines.startsWith("retry:") || - SSE_FIELD_PATTERN.test(afterNewlines) + restOfBuffer.startsWith("\n") || // Empty line - end of event + restOfBuffer.startsWith("data: ") // Next data field ) { + // This is the end of our data event eventEnd = newlinePos; + } else if (trimStart > 0) { + // Had leading newlines - check for SSE fields after them + const afterNewlines = + restOfBuffer.substring(trimStart); + if ( + afterNewlines.startsWith("event:") || + afterNewlines.startsWith("id:") || + afterNewlines.startsWith("retry:") || + SSE_FIELD_PATTERN.test(afterNewlines) + ) { + eventEnd = newlinePos; + } else { + // Content continues on next line - use full buffer + eventEnd = bufferCopy.length; + } } else { - // Content continues on next line - use full buffer - eventEnd = bufferCopy.length; - } - } else { - // No leading newlines - check SSE field directly - if (SSE_FIELD_PATTERN.test(restOfBuffer)) { - eventEnd = newlinePos; - } else { - // Content continues on next line - use full buffer - eventEnd = bufferCopy.length; + // No leading newlines - check SSE field directly + if (SSE_FIELD_PATTERN.test(restOfBuffer)) { + eventEnd = newlinePos; + } else { + // Content continues on next line - use full buffer + eventEnd = bufferCopy.length; + } } } } - } - } else { - // No newline found after event data - event is incomplete - // Try to detect if we have a complete JSON object - const eventDataCandidate = bufferCopy.slice(eventStartPos); - if (eventDataCandidate.length > 0) { - // Quick heuristic check before expensive JSON.parse - const trimmedCandidate = eventDataCandidate.trim(); - if (mightBeCompleteJson(trimmedCandidate)) { - try { - JSON.parse(trimmedCandidate); - // If we can parse it, it's complete - eventEnd = bufferCopy.length; - } catch { - // JSON parsing failed - event is incomplete + } else { + // No newline found after event data - event is incomplete + // Try to detect if we have a complete JSON object + const eventDataCandidate = bufferCopy.slice(eventStartPos); + if (eventDataCandidate.length > 0) { + // Quick heuristic check before expensive JSON.parse + const trimmedCandidate = eventDataCandidate.trim(); + if (mightBeCompleteJson(trimmedCandidate)) { + try { + JSON.parse(trimmedCandidate); + // If we can parse it, it's 
complete + eventEnd = bufferCopy.length; + } catch { + // JSON parsing failed - event is incomplete + break; + } + } else { + // Heuristic says incomplete - don't bother parsing break; } } else { - // Heuristic says incomplete - don't bother parsing + // No event data yet break; } - } else { - // No event data yet - break; } } - } - const eventData = bufferCopy - .slice(dataIndex + 6, eventEnd) - .trim(); - - // Debug logging for troublesome events - // Only scan for SSE field contamination on small events to avoid - // O(n) scans on multi-MB payloads (e.g. base64 image data). - // Large events (>64KB) are almost always valid image/binary data. - if ( - eventData.length < 65536 && - (eventData.includes("event:") || eventData.includes("id:")) - ) { - logger.warn("Event data contains SSE field", { - eventData: - eventData.substring(0, 200) + - (eventData.length > 200 ? "..." : ""), - dataIndex, - eventEnd, - bufferLength: bufferCopy.length, - provider: usedProvider, - }); - } + const eventData = bufferCopy + .slice(dataIndex + 6, eventEnd) + .trim(); - if (eventData === "[DONE]") { - sawUpstreamDoneSentinel = true; - // Set default finish_reason if not provided by the stream - // Some providers (like Novita) don't send finish_reason in streaming chunks - if (finishReason === null) { - // Default to "stop" unless we have tool calls - finishReason = - streamingToolCalls && streamingToolCalls.length > 0 - ? "tool_calls" - : "stop"; + // Debug logging for troublesome events + // Only scan for SSE field contamination on small events to avoid + // O(n) scans on multi-MB payloads (e.g. base64 image data). + // Large events (>64KB) are almost always valid image/binary data. + if ( + eventData.length < 65536 && + (eventData.includes("event:") || eventData.includes("id:")) + ) { + logger.warn("Event data contains SSE field", { + eventData: + eventData.substring(0, 200) + + (eventData.length > 200 ? "..." : ""), + dataIndex, + eventEnd, + bufferLength: bufferCopy.length, + provider: usedProvider, + }); } - // Calculate final usage if we don't have complete data - let finalPromptTokens = promptTokens; - let finalCompletionTokens = completionTokens; - let finalTotalTokens = totalTokens; + if (eventData === "[DONE]") { + sawUpstreamDoneSentinel = true; + // Set default finish_reason if not provided by the stream + // Some providers (like Novita) don't send finish_reason in streaming chunks + if (finishReason === null) { + // Default to "stop" unless we have tool calls + finishReason = + streamingToolCalls && streamingToolCalls.length > 0 + ? 
"tool_calls" + : "stop"; + } - // Estimate missing tokens if needed using helper function - if (finalPromptTokens === null || finalPromptTokens === 0) { - const estimation = estimateTokens( - usedProvider, - messages, - null, - null, - null, - ); - finalPromptTokens = estimation.calculatedPromptTokens; - } + // Calculate final usage if we don't have complete data + let finalPromptTokens = promptTokens; + let finalCompletionTokens = completionTokens; + let finalTotalTokens = totalTokens; - if (finalCompletionTokens === null) { - const textTokens = estimateTokensFromContent(fullContent); - // For images, estimate ~258 tokens per image + 1 token per 750 bytes - // This is based on Google's image token calculation - let imageTokens = 0; - if (imageByteSize > 0) { - // Base tokens per image (258) + additional tokens based on size - imageTokens = 258 + Math.ceil(imageByteSize / 750); + // Estimate missing tokens if needed using helper function + if (finalPromptTokens === null || finalPromptTokens === 0) { + const estimation = estimateTokens( + usedProvider, + messages, + null, + null, + null, + ); + finalPromptTokens = estimation.calculatedPromptTokens; } - finalCompletionTokens = textTokens + imageTokens; - } - - if (finalTotalTokens === null) { - finalTotalTokens = - (finalPromptTokens ?? 0) + - (finalCompletionTokens ?? 0) + - (reasoningTokens ?? 0); - } - // Send final usage chunk before [DONE] if we have any usage data - if ( - finalPromptTokens !== null || - finalCompletionTokens !== null || - finalTotalTokens !== null - ) { - // Calculate costs for streaming response - const streamingCosts = await calculateCosts( - usedModel, - usedProvider, - finalPromptTokens, - finalCompletionTokens, - cachedTokens, - { - prompt: messages - .map((m) => messageContentToString(m.content)) - .join("\n"), - completion: fullContent, - toolResults: streamingToolCalls ?? undefined, - }, - reasoningTokens, - outputImageCount, - image_config?.image_size, - inputImageCount, - webSearchCount, - project.organizationId, - ); + if (finalCompletionTokens === null) { + const textTokens = estimateTokensFromContent(fullContent); + // For images, estimate ~258 tokens per image + 1 token per 750 bytes + // This is based on Google's image token calculation + let imageTokens = 0; + if (imageByteSize > 0) { + // Base tokens per image (258) + additional tokens based on size + imageTokens = 258 + Math.ceil(imageByteSize / 750); + } + finalCompletionTokens = textTokens + imageTokens; + } - // Include costs in response for all users - const shouldIncludeCosts = true; + if (finalTotalTokens === null) { + finalTotalTokens = + (finalPromptTokens ?? 0) + + (finalCompletionTokens ?? 0) + + (reasoningTokens ?? 0); + } - const finalUsageChunk = { - id: `chatcmpl-${Date.now()}`, - object: "chat.completion.chunk", - created: Math.floor(Date.now() / 1000), - model: usedModel, - choices: [ + // Send final usage chunk before [DONE] if we have any usage data + if ( + finalPromptTokens !== null || + finalCompletionTokens !== null || + finalTotalTokens !== null + ) { + // Calculate costs for streaming response + const streamingCosts = await calculateCosts( + usedModel, + usedProvider, + finalPromptTokens, + finalCompletionTokens, + cachedTokens, { - index: 0, - delta: {}, - finish_reason: null, + prompt: messages + .map((m) => messageContentToString(m.content)) + .join("\n"), + completion: fullContent, + toolResults: streamingToolCalls ?? undefined, }, - ], - usage: { - prompt_tokens: Math.max( - 1, - streamingCosts.promptTokens ?? 
finalPromptTokens ?? 1, - ), - completion_tokens: - streamingCosts.completionTokens ?? - finalCompletionTokens ?? - 0, - total_tokens: Math.max( - 1, - (streamingCosts.promptTokens ?? - finalPromptTokens ?? - 0) + - (streamingCosts.completionTokens ?? - finalCompletionTokens ?? + reasoningTokens, + outputImageCount, + image_config?.image_size, + inputImageCount, + webSearchCount, + project.organizationId, + ); + + // Include costs in response for all users + const shouldIncludeCosts = true; + + const finalUsageChunk = { + id: `chatcmpl-${Date.now()}`, + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model: usedModel, + choices: [ + { + index: 0, + delta: {}, + finish_reason: null, + }, + ], + usage: { + prompt_tokens: Math.max( + 1, + streamingCosts.promptTokens ?? finalPromptTokens ?? 1, + ), + completion_tokens: + streamingCosts.completionTokens ?? + finalCompletionTokens ?? + 0, + total_tokens: Math.max( + 1, + (streamingCosts.promptTokens ?? + finalPromptTokens ?? 0) + - (reasoningTokens ?? 0), - ), - ...(shouldIncludeCosts && { - cost_usd_total: streamingCosts.totalCost, - cost_usd_input: streamingCosts.inputCost, - cost_usd_output: streamingCosts.outputCost, - cost_usd_cached_input: streamingCosts.cachedInputCost, - cost_usd_request: streamingCosts.requestCost, - cost_usd_image_input: streamingCosts.imageInputCost, - cost_usd_image_output: streamingCosts.imageOutputCost, - }), - }, - }; + (streamingCosts.completionTokens ?? + finalCompletionTokens ?? + 0) + + (reasoningTokens ?? 0), + ), + ...(shouldIncludeCosts && { + cost_usd_total: streamingCosts.totalCost, + cost_usd_input: streamingCosts.inputCost, + cost_usd_output: streamingCosts.outputCost, + cost_usd_cached_input: streamingCosts.cachedInputCost, + cost_usd_request: streamingCosts.requestCost, + cost_usd_image_input: streamingCosts.imageInputCost, + cost_usd_image_output: streamingCosts.imageOutputCost, + }), + }, + }; - await writeSSEAndCache({ - data: JSON.stringify(finalUsageChunk), - id: String(eventId++), - }); - } + await writeSSEAndCache({ + data: JSON.stringify(finalUsageChunk), + id: String(eventId++), + }); + } - if (!shouldBufferForHealing) { - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - doneSent = true; - } + if (!shouldBufferForHealing) { + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } - processedLength = eventEnd; - } else { - // Try to parse JSON data - it might span multiple lines - let data; - try { - data = JSON.parse(eventData); - } catch (e) { - // If JSON parsing fails, this might be an incomplete event - // Since we already validated JSON completeness above, this is likely a format issue - // Create structured error for logging - streamingError = { - message: e instanceof Error ? e.message : String(e), - type: "json_parse_error", - code: "json_parse_error", - details: { - name: e instanceof Error ? e.name : "ParseError", - eventData: eventData.substring(0, 5000), + processedLength = eventEnd; + } else { + // Try to parse JSON data - it might span multiple lines + let data; + try { + data = JSON.parse(eventData); + } catch (e) { + // If JSON parsing fails, this might be an incomplete event + // Since we already validated JSON completeness above, this is likely a format issue + // Create structured error for logging + streamingError = { + message: e instanceof Error ? 
e.message : String(e), + type: "json_parse_error", + code: "json_parse_error", + details: { + name: e instanceof Error ? e.name : "ParseError", + eventData: eventData.substring(0, 5000), + provider: usedProvider, + model: usedModel, + eventLength: eventData.length, + bufferEnd: eventEnd, + bufferLength: bufferCopy.length, + timestamp: new Date().toISOString(), + }, + }; + logger.warn("Failed to parse streaming JSON", { + error: e instanceof Error ? e.message : String(e), + eventData: + eventData.substring(0, 200) + + (eventData.length > 200 ? "..." : ""), provider: usedProvider, - model: usedModel, eventLength: eventData.length, bufferEnd: eventEnd, bufferLength: bufferCopy.length, - timestamp: new Date().toISOString(), - }, - }; - logger.warn("Failed to parse streaming JSON", { - error: e instanceof Error ? e.message : String(e), - eventData: - eventData.substring(0, 200) + - (eventData.length > 200 ? "..." : ""), - provider: usedProvider, - eventLength: eventData.length, - bufferEnd: eventEnd, - bufferLength: bufferCopy.length, - }); + }); - processedLength = eventEnd; - searchStart = eventEnd; - continue; - } + processedLength = eventEnd; + searchStart = eventEnd; + continue; + } - const awsBedrockStreamError = - usedProvider === "aws-bedrock" - ? extractAwsBedrockStreamError(data) - : null; - if ( - data && - typeof data === "object" && - "response" in data && - data.response && - typeof data.response === "object" && - "status" in data.response && - data.response.status === "completed" - ) { - sawOpenAiResponsesCompletedStatus = true; - } - if ( - data && - typeof data === "object" && - "type" in data && - typeof data.type === "string" && - (data.type === "response.content_part.done" || - data.type === "response.output_item.done" || - data.type === "response.output_text.done") - ) { - sawOpenAiResponsesDoneEvent = true; - } - const openAiCompatibleStreamError = - !awsBedrockStreamError && - data && - typeof data === "object" && - "error" in data && - data.error && - typeof data.error === "object" - ? (data.error as Record) - : null; - if (openAiCompatibleStreamError) { - const errorResponseText = JSON.stringify(data); - if ( - debugMode && - streamingRawResponseData.length < MAX_RAW_DATA_SIZE - ) { - const rawProviderSseEvent = `data: ${errorResponseText}\n\n`; - streamingRawResponseData += rawProviderSseEvent.substring( - 0, - Math.max( + const awsBedrockStreamError = + usedProvider === "aws-bedrock" + ? extractAwsBedrockStreamError(data) + : null; + const openAiCompatibleStreamError = + !awsBedrockStreamError && + data && + typeof data === "object" && + "error" in data && + data.error && + typeof data.error === "object" + ? (data.error as Record) + : null; + if (openAiCompatibleStreamError) { + const errorResponseText = JSON.stringify(data); + if ( + debugMode && + streamingRawResponseData.length < MAX_RAW_DATA_SIZE + ) { + const rawProviderSseEvent = `data: ${errorResponseText}\n\n`; + streamingRawResponseData += rawProviderSseEvent.substring( 0, - MAX_RAW_DATA_SIZE - streamingRawResponseData.length, - ), + Math.max( + 0, + MAX_RAW_DATA_SIZE - streamingRawResponseData.length, + ), + ); + } + const inferredStatusCode = + typeof openAiCompatibleStreamError.status_code === + "number" + ? openAiCompatibleStreamError.status_code + : typeof openAiCompatibleStreamError.status === "number" + ? 
openAiCompatibleStreamError.status + : 400; + const errorType = getFinishReasonFromError( + inferredStatusCode, + errorResponseText, ); - } - const inferredStatusCode = - typeof openAiCompatibleStreamError.status_code === "number" - ? openAiCompatibleStreamError.status_code - : typeof openAiCompatibleStreamError.status === "number" - ? openAiCompatibleStreamError.status - : 400; - const errorType = getFinishReasonFromError( - inferredStatusCode, - errorResponseText, - ); - const errorMessage = - typeof openAiCompatibleStreamError.message === "string" - ? openAiCompatibleStreamError.message - : "Upstream provider returned a streaming error"; - const errorCode = - typeof openAiCompatibleStreamError.code === "string" - ? openAiCompatibleStreamError.code - : typeof openAiCompatibleStreamError.type === "string" - ? openAiCompatibleStreamError.type - : errorType; - - logger.info("[streaming] Provider SSE error received", { - requestId, - provider: usedProvider, - model: usedModel, - errorType, - errorCode, - inferredStatusCode, - errorMessage, - errorPayload: errorResponseText.substring(0, 5000), - }); + const errorMessage = + typeof openAiCompatibleStreamError.message === "string" + ? openAiCompatibleStreamError.message + : "Upstream provider returned a streaming error"; + const errorCode = + typeof openAiCompatibleStreamError.code === "string" + ? openAiCompatibleStreamError.code + : typeof openAiCompatibleStreamError.type === "string" + ? openAiCompatibleStreamError.type + : errorType; + + logger.info("[streaming] Provider SSE error received", { + requestId, + provider: usedProvider, + model: usedModel, + errorType, + errorCode, + inferredStatusCode, + errorMessage, + errorPayload: errorResponseText.substring(0, 5000), + }); - finishReason = errorType; + finishReason = errorType; + + if (errorType === "content_filter") { + await writeStreamingContentFilterResponse({ + billingModel: usedModel, + billingProvider: usedProvider, + responseModel: data.model ?? usedModel, + }); + handledTerminalProviderEvent = true; + } else { + streamingError = { + message: errorMessage, + type: errorType, + code: errorCode, + details: { + statusCode: inferredStatusCode, + statusText: + typeof openAiCompatibleStreamError.type === "string" + ? openAiCompatibleStreamError.type + : "stream_error", + responseText: errorResponseText, + }, + }; + + await writeSSEAndCache({ + event: "error", + data: JSON.stringify({ + error: { + message: errorMessage, + type: errorType, + code: errorCode, + param: + "param" in openAiCompatibleStreamError + ? (openAiCompatibleStreamError.param ?? null) + : null, + responseText: errorResponseText, + }, + }), + id: String(eventId++), + }); + } + + if (!doneSent) { + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } + shouldTerminateStream = true; + processedLength = eventEnd; + searchStart = eventEnd; + break; + } + if (awsBedrockStreamError) { + const errorType = getFinishReasonFromError( + awsBedrockStreamError.statusCode, + awsBedrockStreamError.responseText, + ); - if (errorType === "content_filter") { - await writeStreamingContentFilterResponse({ - billingModel: usedModel, - billingProvider: usedProvider, - responseModel: data.model ?? 
usedModel, - }); - handledTerminalProviderEvent = true; - } else { streamingError = { - message: errorMessage, + message: awsBedrockStreamError.message, type: errorType, - code: errorCode, + code: awsBedrockStreamError.eventType, details: { - statusCode: inferredStatusCode, - statusText: - typeof openAiCompatibleStreamError.type === "string" - ? openAiCompatibleStreamError.type - : "stream_error", - responseText: errorResponseText, + statusCode: awsBedrockStreamError.statusCode, + statusText: awsBedrockStreamError.eventType, + responseText: awsBedrockStreamError.responseText, }, }; + finishReason = errorType; await writeSSEAndCache({ event: "error", data: JSON.stringify({ error: { - message: errorMessage, + message: awsBedrockStreamError.message, type: errorType, - code: errorCode, - param: - "param" in openAiCompatibleStreamError - ? (openAiCompatibleStreamError.param ?? null) - : null, - responseText: errorResponseText, + code: awsBedrockStreamError.eventType, + param: null, + responseText: awsBedrockStreamError.responseText, }, }), id: String(eventId++), }); - } - - if (!doneSent) { await writeSSEAndCache({ event: "done", data: "[DONE]", id: String(eventId++), }); doneSent = true; + shouldTerminateStream = true; + processedLength = eventEnd; + searchStart = eventEnd; + break; } - shouldTerminateStream = true; - processedLength = eventEnd; - searchStart = eventEnd; - break; - } - if (awsBedrockStreamError) { - const errorType = getFinishReasonFromError( - awsBedrockStreamError.statusCode, - awsBedrockStreamError.responseText, - ); - streamingError = { - message: awsBedrockStreamError.message, - type: errorType, - code: awsBedrockStreamError.eventType, - details: { - statusCode: awsBedrockStreamError.statusCode, - statusText: awsBedrockStreamError.eventType, - responseText: awsBedrockStreamError.responseText, - }, - }; - finishReason = errorType; - - await writeSSEAndCache({ - event: "error", - data: JSON.stringify({ - error: { - message: awsBedrockStreamError.message, - type: errorType, - code: awsBedrockStreamError.eventType, - param: null, - responseText: awsBedrockStreamError.responseText, - }, - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - doneSent = true; - shouldTerminateStream = true; - processedLength = eventEnd; - searchStart = eventEnd; - break; - } + // Transform streaming responses to OpenAI format for all providers + const transformedData = transformStreamingToOpenai( + usedProvider, + usedModel, + data, + messages, + serverToolUseIndices, + ); - // Transform streaming responses to OpenAI format for all providers - const transformedData = transformStreamingToOpenai( - usedProvider, - usedModel, - data, - messages, - serverToolUseIndices, - ); + // Skip null events (some providers have non-data events) + if (!transformedData) { + processedLength = eventEnd; + searchStart = eventEnd; + continue; + } - // Skip null events (some providers have non-data events) - if (!transformedData) { - processedLength = eventEnd; - searchStart = eventEnd; - continue; - } + // For Anthropic, if we have partial usage data, complete it + if (usedProvider === "anthropic" && transformedData.usage) { + const usage = transformedData.usage; + if ( + usage.output_tokens !== undefined && + usage.prompt_tokens === undefined + ) { + // Estimate prompt tokens if not provided + const estimation = estimateTokens( + usedProvider, + messages, + null, + null, + null, + ); + const estimatedPromptTokens = + 
estimation.calculatedPromptTokens; + transformedData.usage = { + prompt_tokens: estimatedPromptTokens, + completion_tokens: usage.output_tokens, + total_tokens: + estimatedPromptTokens + usage.output_tokens, + }; + } + } - // For Anthropic, if we have partial usage data, complete it - if (usedProvider === "anthropic" && transformedData.usage) { - const usage = transformedData.usage; - if ( - usage.output_tokens !== undefined && - usage.prompt_tokens === undefined - ) { - // Estimate prompt tokens if not provided - const estimation = estimateTokens( + // For Google providers, add usage information when available + if (isGoogleCompatibleProvider(usedProvider)) { + const usage = extractTokenUsage( + data, usedProvider, - messages, - null, - null, - null, + fullContent, + imageByteSize, ); - const estimatedPromptTokens = - estimation.calculatedPromptTokens; - transformedData.usage = { - prompt_tokens: estimatedPromptTokens, - completion_tokens: usage.output_tokens, - total_tokens: estimatedPromptTokens + usage.output_tokens, - }; - } - } - - // For Google providers, add usage information when available - if (isGoogleCompatibleProvider(usedProvider)) { - const usage = extractTokenUsage( - data, - usedProvider, - fullContent, - imageByteSize, - ); - // If we have usage data from Google, add it to the streaming chunk - if ( - usage.promptTokens !== null || - usage.completionTokens !== null || - usage.totalTokens !== null - ) { - transformedData.usage = { - prompt_tokens: usage.promptTokens ?? 0, - completion_tokens: usage.completionTokens ?? 0, - total_tokens: usage.totalTokens ?? 0, - ...(usage.reasoningTokens !== null && { - reasoning_tokens: usage.reasoningTokens, - }), - }; + // If we have usage data from Google, add it to the streaming chunk + if ( + usage.promptTokens !== null || + usage.completionTokens !== null || + usage.totalTokens !== null + ) { + transformedData.usage = { + prompt_tokens: usage.promptTokens ?? 0, + completion_tokens: usage.completionTokens ?? 0, + total_tokens: usage.totalTokens ?? 0, + ...(usage.reasoningTokens !== null && { + reasoning_tokens: usage.reasoningTokens, + }), + }; + } } - } - // Normalize usage.prompt_tokens_details to always include cached_tokens - if (transformedData.usage) { - if (transformedData.usage.prompt_tokens_details) { - // Preserve all existing keys and only default cached_tokens - transformedData.usage.prompt_tokens_details = { - ...transformedData.usage.prompt_tokens_details, - cached_tokens: - transformedData.usage.prompt_tokens_details - .cached_tokens ?? 0, - }; - } else { - // Create prompt_tokens_details with cached_tokens set to 0 - transformedData.usage.prompt_tokens_details = { - cached_tokens: 0, - }; + // Normalize usage.prompt_tokens_details to always include cached_tokens + if (transformedData.usage) { + if (transformedData.usage.prompt_tokens_details) { + // Preserve all existing keys and only default cached_tokens + transformedData.usage.prompt_tokens_details = { + ...transformedData.usage.prompt_tokens_details, + cached_tokens: + transformedData.usage.prompt_tokens_details + .cached_tokens ?? 0, + }; + } else { + // Create prompt_tokens_details with cached_tokens set to 0 + transformedData.usage.prompt_tokens_details = { + cached_tokens: 0, + }; + } } - } - // For Anthropic streaming tool calls, enrich delta chunks with id/type/name - // from the initial content_block_start event. This ensures OpenAI SDK compatibility. 
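          // Illustrative sketch (not part of this patch): the enrichment below
          // matters because Anthropic splits a tool call across events.
          // content_block_start carries id and function.name once; later
          // content_block_delta events carry only argument fragments keyed by
          // block index. Once the gateway re-attaches id/type/name, an
          // OpenAI-style consumer can accumulate the deltas like this (the
          // types and names here are hypothetical):
          interface ToolCallDeltaSketch {
            index: number;
            id?: string;
            type?: "function";
            function?: { name?: string; arguments?: string };
          }

          function accumulateToolCallsSketch(
            acc: Map<number, { id: string; name: string; args: string }>,
            deltas: ToolCallDeltaSketch[],
          ): void {
            for (const d of deltas) {
              const entry = acc.get(d.index) ?? { id: "", name: "", args: "" };
              // id and name arrive once per tool call; the argument JSON
              // arrives as string fragments across many deltas and must be
              // concatenated before parsing.
              if (d.id) {
                entry.id = d.id;
              }
              if (d.function?.name) {
                entry.name = d.function.name;
              }
              if (d.function?.arguments) {
                entry.args += d.function.arguments;
              }
              acc.set(d.index, entry);
            }
          }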
- if (usedProvider === "anthropic") { - const toolCalls = - transformedData.choices?.[0]?.delta?.tool_calls; - if (toolCalls && toolCalls.length > 0) { - // First, extract tool calls to update our tracking - const rawToolCalls = extractToolCalls(data, usedProvider); - if (rawToolCalls && rawToolCalls.length > 0) { - streamingToolCalls ??= []; - for (const newCall of rawToolCalls) { - // For content_block_start events (have id), add to tracking - if (newCall.id) { - const contentBlockIndex: number = - typeof data.index === "number" - ? data.index - : streamingToolCalls.length; - // Store at the content block index position - streamingToolCalls[contentBlockIndex] = { - ...newCall, - _contentBlockIndex: contentBlockIndex, - }; - } - // For content_block_delta events, enrich with stored id/type/name - else if (newCall._contentBlockIndex !== undefined) { - const existingCall = - streamingToolCalls[newCall._contentBlockIndex]; - if (existingCall) { - // Enrich the transformed data with id, type, and function.name - for (const tc of toolCalls) { - if (tc.index === newCall._contentBlockIndex) { - tc.id = existingCall.id; - tc.type = "function"; - tc.function ??= {}; - tc.function.name = existingCall.function.name; + // For Anthropic streaming tool calls, enrich delta chunks with id/type/name + // from the initial content_block_start event. This ensures OpenAI SDK compatibility. + if (usedProvider === "anthropic") { + const toolCalls = + transformedData.choices?.[0]?.delta?.tool_calls; + if (toolCalls && toolCalls.length > 0) { + // First, extract tool calls to update our tracking + const rawToolCalls = extractToolCalls(data, usedProvider); + if (rawToolCalls && rawToolCalls.length > 0) { + streamingToolCalls ??= []; + for (const newCall of rawToolCalls) { + // For content_block_start events (have id), add to tracking + if (newCall.id) { + const contentBlockIndex: number = + typeof data.index === "number" + ? data.index + : streamingToolCalls.length; + // Store at the content block index position + streamingToolCalls[contentBlockIndex] = { + ...newCall, + _contentBlockIndex: contentBlockIndex, + }; + } + // For content_block_delta events, enrich with stored id/type/name + else if (newCall._contentBlockIndex !== undefined) { + const existingCall = + streamingToolCalls[newCall._contentBlockIndex]; + if (existingCall) { + // Enrich the transformed data with id, type, and function.name + for (const tc of toolCalls) { + if (tc.index === newCall._contentBlockIndex) { + tc.id = existingCall.id; + tc.type = "function"; + tc.function ??= {}; + tc.function.name = existingCall.function.name; + } } } } @@ -5251,707 +5279,738 @@ chat.openapi(completions, async (c) => { } } } - } - // When buffering for healing, strip content from chunks and buffer it - // We still send metadata (usage, finish_reason, tool_calls) but buffer text content - if (shouldBufferForHealing) { - const deltaContent = - transformedData.choices?.[0]?.delta?.content; - if (deltaContent) { - bufferedContentChunks.push(deltaContent); - // Store chunk metadata for later use when sending healed content - lastChunkId = transformedData.id ?? lastChunkId; - lastChunkModel = transformedData.model ?? lastChunkModel; - lastChunkCreated = - transformedData.created ?? 
lastChunkCreated; - } + // When buffering for healing, strip content from chunks and buffer it + // We still send metadata (usage, finish_reason, tool_calls) but buffer text content + if (shouldBufferForHealing) { + const deltaContent = + transformedData.choices?.[0]?.delta?.content; + if (deltaContent) { + bufferedContentChunks.push(deltaContent); + // Store chunk metadata for later use when sending healed content + lastChunkId = transformedData.id ?? lastChunkId; + lastChunkModel = transformedData.model ?? lastChunkModel; + lastChunkCreated = + transformedData.created ?? lastChunkCreated; + } - // Create a copy without content in delta for streaming - const chunkWithoutContent = JSON.parse( - JSON.stringify(transformedData), - ); - if (chunkWithoutContent.choices?.[0]?.delta?.content) { - delete chunkWithoutContent.choices[0].delta.content; - } + // Create a copy without content in delta for streaming + const chunkWithoutContent = JSON.parse( + JSON.stringify(transformedData), + ); + if (chunkWithoutContent.choices?.[0]?.delta?.content) { + delete chunkWithoutContent.choices[0].delta.content; + } - // Only send chunk if it has meaningful data (not just empty delta) - const hasUsage = !!chunkWithoutContent.usage; - const hasToolCalls = - !!chunkWithoutContent.choices?.[0]?.delta?.tool_calls; - const hasFinishReason = - !!chunkWithoutContent.choices?.[0]?.finish_reason; - const hasRole = - !!chunkWithoutContent.choices?.[0]?.delta?.role; + // Only send chunk if it has meaningful data (not just empty delta) + const hasUsage = !!chunkWithoutContent.usage; + const hasToolCalls = + !!chunkWithoutContent.choices?.[0]?.delta?.tool_calls; + const hasFinishReason = + !!chunkWithoutContent.choices?.[0]?.finish_reason; + const hasRole = + !!chunkWithoutContent.choices?.[0]?.delta?.role; - if (hasUsage || hasToolCalls || hasFinishReason || hasRole) { + if ( + hasUsage || + hasToolCalls || + hasFinishReason || + hasRole + ) { + await writeSSEAndCache({ + data: JSON.stringify(chunkWithoutContent), + id: String(eventId++), + }); + } + } else { await writeSSEAndCache({ - data: JSON.stringify(chunkWithoutContent), + data: JSON.stringify(transformedData), id: String(eventId++), }); } - } else { - await writeSSEAndCache({ - data: JSON.stringify(transformedData), - id: String(eventId++), - }); - } - // Extract usage data from transformedData to update tracking variables - if ( - transformedData.usage && - (usedProvider === "openai" || usedProvider === "azure") - ) { - const usage = transformedData.usage; - if ( - usage.prompt_tokens !== undefined && - usage.prompt_tokens > 0 - ) { - promptTokens = usage.prompt_tokens; - } + // Extract usage data from transformedData to update tracking variables if ( - usage.completion_tokens !== undefined && - usage.completion_tokens > 0 + transformedData.usage && + (usedProvider === "openai" || usedProvider === "azure") ) { - completionTokens = usage.completion_tokens; - } - if ( - usage.total_tokens !== undefined && - usage.total_tokens > 0 - ) { - totalTokens = usage.total_tokens; - } - if (usage.reasoning_tokens !== undefined) { - reasoningTokens = usage.reasoning_tokens; - } - } - - // Extract finishReason from transformedData to update tracking variable - if (transformedData.choices?.[0]?.finish_reason) { - finishReason = transformedData.choices[0].finish_reason; - sawProviderTerminalEvent = true; - sentDownstreamFinishReasonChunk = true; - } - - // Extract content for logging using helper function - // For providers with custom extraction logic (google-ai-studio, 
anthropic), - // use raw data. For others (like aws-bedrock), use transformed OpenAI format. - const contentChunk = extractContent( - isGoogleCompatibleProvider(usedProvider) || - usedProvider === "anthropic" - ? data - : transformedData, - usedProvider, - ); - if (contentChunk) { - fullContent += contentChunk; - - // Track time to first token if this is the first content chunk - if (!firstTokenReceived) { - timeToFirstToken = Date.now() - startTime; - firstTokenReceived = true; - } - } - - // Track image data size for Google providers (for token estimation) - if (isGoogleCompatibleProvider(usedProvider)) { - const parts = data.candidates?.[0]?.content?.parts ?? []; - for (const part of parts) { - if (part.inlineData?.data) { - // Base64 string length * 0.75 ≈ actual byte size - imageByteSize += Math.ceil( - part.inlineData.data.length * 0.75, - ); - outputImageCount++; + const usage = transformedData.usage; + if ( + usage.prompt_tokens !== undefined && + usage.prompt_tokens > 0 + ) { + promptTokens = usage.prompt_tokens; } - } - } - - // Track web search calls for cost calculation - // Check for web search results based on provider-specific data - if (usedProvider === "anthropic") { - // For Anthropic, count web_search_tool_result blocks - if ( - data.type === "content_block_start" && - data.content_block?.type === "web_search_tool_result" - ) { - webSearchCount++; - } - } else if (isGoogleCompatibleProvider(usedProvider)) { - // For Google, count when grounding metadata is present - if (data.candidates?.[0]?.groundingMetadata) { - const groundingMetadata = - data.candidates[0].groundingMetadata; if ( - groundingMetadata.webSearchQueries && - groundingMetadata.webSearchQueries.length > 0 && - webSearchCount === 0 + usage.completion_tokens !== undefined && + usage.completion_tokens > 0 ) { - // Only count once for the entire response - webSearchCount = - groundingMetadata.webSearchQueries.length; - } else if ( - groundingMetadata.groundingChunks && - webSearchCount === 0 + completionTokens = usage.completion_tokens; + } + if ( + usage.total_tokens !== undefined && + usage.total_tokens > 0 ) { - // Fallback: count once if we have grounding chunks - webSearchCount = 1; + totalTokens = usage.total_tokens; + } + if (usage.reasoning_tokens !== undefined) { + reasoningTokens = usage.reasoning_tokens; } } - } else if (usedProvider === "openai") { - // For OpenAI Responses API, count web_search_call.completed events - if (data.type === "response.web_search_call.completed") { - webSearchCount++; - } - } - - // Extract reasoning content for logging using helper function - // For providers with custom extraction logic (google-ai-studio, anthropic), - // use raw data. For others, use transformed OpenAI format. - const reasoningContentChunk = extractReasoning( - isGoogleCompatibleProvider(usedProvider) || - usedProvider === "anthropic" - ? 
data - : transformedData, - usedProvider, - ); - if (reasoningContentChunk) { - fullReasoningContent += reasoningContentChunk; - // Track time to first reasoning token if this is the first reasoning chunk - if (!firstReasoningTokenReceived) { - timeToFirstReasoningToken = Date.now() - startTime; - firstReasoningTokenReceived = true; + // Extract finishReason from transformedData to update tracking variable + if (transformedData.choices?.[0]?.finish_reason) { + finishReason = transformedData.choices[0].finish_reason; } - } - - const toolCallsChunk = extractToolCalls( - data, - usedProvider, - transformedData, - ); - if (toolCallsChunk && toolCallsChunk.length > 0) { - streamingToolCalls ??= []; - // Merge tool calls (accumulating function arguments) - for (const newCall of toolCallsChunk) { - let existingCall = null; - // For Anthropic content_block_delta events, match by content block index - if ( - usedProvider === "anthropic" && - newCall._contentBlockIndex !== undefined - ) { - existingCall = - streamingToolCalls[newCall._contentBlockIndex]; - } else { - // For other providers and Anthropic content_block_start, match by ID - // Note: Array may have sparse entries due to index-based assignment, so check for null/undefined - existingCall = streamingToolCalls.find( - (call) => call && call.id === newCall.id, - ); + // Extract content for logging using helper function + // For providers with custom extraction logic (google-ai-studio, anthropic), + // use raw data. For others (like aws-bedrock), use transformed OpenAI format. + const contentChunk = extractContent( + isGoogleCompatibleProvider(usedProvider) || + usedProvider === "anthropic" + ? data + : transformedData, + usedProvider, + ); + if (contentChunk) { + fullContent += contentChunk; + + // Track time to first token if this is the first content chunk + if (!firstTokenReceived) { + timeToFirstToken = Date.now() - startTime; + firstTokenReceived = true; } + } - if (existingCall) { - // Accumulate function arguments - if (newCall.function?.arguments) { - existingCall.function.arguments = - (existingCall.function.arguments ?? "") + - newCall.function.arguments; + // Track image data size for Google providers (for token estimation) + if (isGoogleCompatibleProvider(usedProvider)) { + const parts = data.candidates?.[0]?.content?.parts ?? 
[]; + for (const part of parts) { + if (part.inlineData?.data) { + // Base64 string length * 0.75 ≈ actual byte size + imageByteSize += Math.ceil( + part.inlineData.data.length * 0.75, + ); + outputImageCount++; } - } else { - // Clean up temporary fields and add new tool call - const cleanCall = { ...newCall }; - delete cleanCall._contentBlockIndex; - streamingToolCalls.push(cleanCall); } } - } - // Handle provider-specific finish reason extraction - switch (usedProvider) { - case "google-ai-studio": - case "google-vertex": - case "quartz": - case "obsidian": - // Preserve original Google finish reason for logging - if (data.promptFeedback?.blockReason) { - finishReason = data.promptFeedback.blockReason; - sawProviderTerminalEvent = true; - } else if (data.candidates?.[0]?.finishReason) { - finishReason = data.candidates[0].finishReason; - sawProviderTerminalEvent = true; - } - break; - case "anthropic": + // Track web search calls for cost calculation + // Check for web search results based on provider-specific data + if (usedProvider === "anthropic") { + // For Anthropic, count web_search_tool_result blocks if ( - data.type === "message_delta" && - data.delta?.stop_reason + data.type === "content_block_start" && + data.content_block?.type === "web_search_tool_result" ) { - finishReason = data.delta.stop_reason; - sawProviderTerminalEvent = true; - } else if ( - data.type === "message_stop" || - data.stop_reason - ) { - finishReason = data.stop_reason ?? "end_turn"; - sawProviderTerminalEvent = true; - } else if (data.delta?.stop_reason) { - finishReason = data.delta.stop_reason; - sawProviderTerminalEvent = true; + webSearchCount++; } - break; - default: // OpenAI format - if (data.choices && data.choices[0]?.finish_reason) { - finishReason = data.choices[0].finish_reason; + } else if (isGoogleCompatibleProvider(usedProvider)) { + // For Google, count when grounding metadata is present + if (data.candidates?.[0]?.groundingMetadata) { + const groundingMetadata = + data.candidates[0].groundingMetadata; + if ( + groundingMetadata.webSearchQueries && + groundingMetadata.webSearchQueries.length > 0 && + webSearchCount === 0 + ) { + // Only count once for the entire response + webSearchCount = + groundingMetadata.webSearchQueries.length; + } else if ( + groundingMetadata.groundingChunks && + webSearchCount === 0 + ) { + // Fallback: count once if we have grounding chunks + webSearchCount = 1; + } } - break; - } + } else if (usedProvider === "openai") { + // For OpenAI Responses API, count web_search_call.completed events + if (data.type === "response.web_search_call.completed") { + webSearchCount++; + } + } - // Extract token usage using helper function - const usage = extractTokenUsage( - data, - usedProvider, - fullContent, - imageByteSize, - ); - if (usage.promptTokens !== null) { - promptTokens = usage.promptTokens; - } - if (usage.completionTokens !== null) { - completionTokens = usage.completionTokens; - } - if (usage.totalTokens !== null) { - totalTokens = usage.totalTokens; - } - if (usage.reasoningTokens !== null) { - reasoningTokens = usage.reasoningTokens; - } - if (usage.cachedTokens !== null) { - cachedTokens = usage.cachedTokens; - } + // Extract reasoning content for logging using helper function + // For providers with custom extraction logic (google-ai-studio, anthropic), + // use raw data. For others, use transformed OpenAI format. + const reasoningContentChunk = extractReasoning( + isGoogleCompatibleProvider(usedProvider) || + usedProvider === "anthropic" + ? 
data + : transformedData, + usedProvider, + ); + if (reasoningContentChunk) { + fullReasoningContent += reasoningContentChunk; - // Estimate tokens if not provided and we have a finish reason - if (finishReason && (!promptTokens || !completionTokens)) { - if (!promptTokens) { - const estimation = estimateTokens( - usedProvider, - messages, - null, - null, - null, - ); - promptTokens = estimation.calculatedPromptTokens; + // Track time to first reasoning token if this is the first reasoning chunk + if (!firstReasoningTokenReceived) { + timeToFirstReasoningToken = Date.now() - startTime; + firstReasoningTokenReceived = true; + } } - if (!completionTokens) { - const textTokens = estimateTokensFromContent(fullContent); - // For images, estimate ~258 tokens per image + 1 token per 750 bytes - let imageTokens = 0; - if (imageByteSize > 0) { - imageTokens = 258 + Math.ceil(imageByteSize / 750); + const toolCallsChunk = extractToolCalls( + data, + usedProvider, + transformedData, + ); + if (toolCallsChunk && toolCallsChunk.length > 0) { + streamingToolCalls ??= []; + // Merge tool calls (accumulating function arguments) + for (const newCall of toolCallsChunk) { + let existingCall = null; + + // For Anthropic content_block_delta events, match by content block index + if ( + usedProvider === "anthropic" && + newCall._contentBlockIndex !== undefined + ) { + existingCall = + streamingToolCalls[newCall._contentBlockIndex]; + } else { + // For other providers and Anthropic content_block_start, match by ID + // Note: Array may have sparse entries due to index-based assignment, so check for null/undefined + existingCall = streamingToolCalls.find( + (call) => call && call.id === newCall.id, + ); + } + + if (existingCall) { + // Accumulate function arguments + if (newCall.function?.arguments) { + existingCall.function.arguments = + (existingCall.function.arguments ?? "") + + newCall.function.arguments; + } + } else { + // Clean up temporary fields and add new tool call + const cleanCall = { ...newCall }; + delete cleanCall._contentBlockIndex; + streamingToolCalls.push(cleanCall); + } } - completionTokens = textTokens + imageTokens; } - totalTokens = (promptTokens ?? 0) + (completionTokens ?? 0); - } + // Handle provider-specific finish reason extraction + switch (usedProvider) { + case "google-ai-studio": + case "google-vertex": + case "quartz": + case "obsidian": + // Preserve original Google finish reason for logging + if (data.promptFeedback?.blockReason) { + finishReason = data.promptFeedback.blockReason; + sawProviderTerminalEvent = true; + } else if (data.candidates?.[0]?.finishReason) { + finishReason = data.candidates[0].finishReason; + sawProviderTerminalEvent = true; + } + break; + case "anthropic": + if ( + data.type === "message_delta" && + data.delta?.stop_reason + ) { + finishReason = data.delta.stop_reason; + sawProviderTerminalEvent = true; + } else if ( + data.type === "message_stop" || + data.stop_reason + ) { + finishReason = data.stop_reason ?? 
"end_turn"; + sawProviderTerminalEvent = true; + } else if (data.delta?.stop_reason) { + finishReason = data.delta.stop_reason; + sawProviderTerminalEvent = true; + } + break; + default: // OpenAI format + if (data.choices && data.choices[0]?.finish_reason) { + finishReason = data.choices[0].finish_reason; + } + break; + } - processedLength = eventEnd; - } + // Extract token usage using helper function + const usage = extractTokenUsage( + data, + usedProvider, + fullContent, + imageByteSize, + ); + if (usage.promptTokens !== null) { + promptTokens = usage.promptTokens; + } + if (usage.completionTokens !== null) { + completionTokens = usage.completionTokens; + } + if (usage.totalTokens !== null) { + totalTokens = usage.totalTokens; + } + if (usage.reasoningTokens !== null) { + reasoningTokens = usage.reasoningTokens; + } + if (usage.cachedTokens !== null) { + cachedTokens = usage.cachedTokens; + } - searchStart = eventEnd; - } + // Estimate tokens if not provided and we have a finish reason + if (finishReason && (!promptTokens || !completionTokens)) { + if (!promptTokens) { + const estimation = estimateTokens( + usedProvider, + messages, + null, + null, + null, + ); + promptTokens = estimation.calculatedPromptTokens; + } - // Remove processed data from buffer - if (processedLength > 0) { - buffer = bufferCopy.slice(processedLength); - } + if (!completionTokens) { + const textTokens = estimateTokensFromContent(fullContent); + // For images, estimate ~258 tokens per image + 1 token per 750 bytes + let imageTokens = 0; + if (imageByteSize > 0) { + imageTokens = 258 + Math.ceil(imageByteSize / 750); + } + completionTokens = textTokens + imageTokens; + } - if (shouldTerminateStream) { - break; - } - } - } catch (error) { - if (error instanceof Error && error.name === "AbortError") { - canceled = true; - } else if (isTimeoutError(error)) { - const errorMessage = - error instanceof Error ? error.message : "Stream reading timeout"; - logger.warn("Stream reading timeout", { - error: errorMessage, - usedProvider, - requestedProvider, - usedModel, - initialRequestedModel, - unifiedFinishReason: getUnifiedFinishReason( - "upstream_error", - usedProvider, - ), - }); + totalTokens = (promptTokens ?? 0) + (completionTokens ?? 0); + } - try { - await stream.writeSSE({ - event: "error", - data: JSON.stringify({ - error: { - message: `Upstream provider timeout: ${errorMessage}`, - type: "upstream_timeout", - param: null, - code: "timeout", - }, - }), - id: String(eventId++), - }); - await stream.writeSSE({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - doneSent = true; - } catch (sseError) { - logger.error( - "Failed to send timeout error SSE", - sseError instanceof Error - ? sseError - : new Error(String(sseError)), - ); - } + processedLength = eventEnd; + } - streamingError = { - message: errorMessage, - type: "upstream_timeout", - code: "timeout", - details: { - name: "TimeoutError", - timestamp: new Date().toISOString(), - provider: usedProvider, - model: usedModel, - }, - }; - } else { - const normalizedStreamingError = normalizeStreamingError({ - error, - provider: usedProvider, - model: usedModel, - bufferSnapshot: buffer ? buffer.substring(0, 5000) : undefined, - phase: "upstream_read", - }); + searchStart = eventEnd; + } - logger.error( - "Error reading upstream stream", - error instanceof Error ? 
error : new Error(String(error)), - { - requestId, + // Remove processed data from buffer + if (processedLength > 0) { + buffer = bufferCopy.slice(processedLength); + } + + if (shouldTerminateStream) { + break; + } + } + } catch (error) { + if (error instanceof Error && error.name === "AbortError") { + canceled = true; + } else if (isTimeoutError(error)) { + const errorMessage = + error instanceof Error + ? error.message + : "Stream reading timeout"; + logger.warn("Stream reading timeout", { + error: errorMessage, usedProvider, requestedProvider, usedModel, initialRequestedModel, - upstreamStatus: res?.status ?? null, - upstreamStatusText: res?.statusText ?? null, - upstreamHeaders: res - ? { - contentType: res.headers.get("content-type"), - contentLength: res.headers.get("content-length"), - transferEncoding: res.headers.get("transfer-encoding"), - requestId: - res.headers.get("x-request-id") ?? - res.headers.get("request-id") ?? - res.headers.get("openai-request-id"), - } - : null, - streamingDiagnostics: normalizedStreamingError.log.details, - timeToFirstToken, - timeToFirstReasoningToken, - firstTokenReceived, - firstReasoningTokenReceived, unifiedFinishReason: getUnifiedFinishReason( - normalizedStreamingError.client.type === "gateway_error" - ? "gateway_error" - : "upstream_error", + "upstream_error", usedProvider, ), - }, - ); - - // Forward the error to the client with the buffered content that caused the error - try { - await stream.writeSSE({ - event: "error", - data: JSON.stringify({ - error: normalizedStreamingError.client, - }), - id: String(eventId++), }); - await stream.writeSSE({ - event: "done", - data: "[DONE]", - id: String(eventId++), + + try { + await stream.writeSSE({ + event: "error", + data: JSON.stringify({ + error: { + message: `Upstream provider timeout: ${errorMessage}`, + type: "upstream_timeout", + param: null, + code: "timeout", + }, + }), + id: String(eventId++), + }); + await stream.writeSSE({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } catch (sseError) { + logger.error( + "Failed to send timeout error SSE", + sseError instanceof Error + ? sseError + : new Error(String(sseError)), + ); + } + + streamingError = { + message: errorMessage, + type: "upstream_timeout", + code: "timeout", + details: { + name: "TimeoutError", + timestamp: new Date().toISOString(), + provider: usedProvider, + model: usedModel, + }, + }; + } else { + const normalizedStreamingError = normalizeStreamingError({ + error, + provider: usedProvider, + model: usedModel, + bufferSnapshot: buffer ? buffer.substring(0, 5000) : undefined, + phase: "upstream_read", }); - doneSent = true; - } catch (sseError) { + logger.error( - "Failed to send error SSE", - sseError instanceof Error - ? sseError - : new Error(String(sseError)), + "Error reading upstream stream", + error instanceof Error ? error : new Error(String(error)), + { + requestId, + usedProvider, + requestedProvider, + usedModel, + initialRequestedModel, + upstreamStatus: res?.status ?? null, + upstreamStatusText: res?.statusText ?? null, + upstreamHeaders: res + ? { + contentType: res.headers.get("content-type"), + contentLength: res.headers.get("content-length"), + transferEncoding: res.headers.get("transfer-encoding"), + requestId: + res.headers.get("x-request-id") ?? + res.headers.get("request-id") ?? 
+ res.headers.get("openai-request-id"), + } + : null, + streamingDiagnostics: normalizedStreamingError.log.details, + timeToFirstToken, + timeToFirstReasoningToken, + firstTokenReceived, + firstReasoningTokenReceived, + unifiedFinishReason: getUnifiedFinishReason( + normalizedStreamingError.client.type === "gateway_error" + ? "gateway_error" + : "upstream_error", + usedProvider, + ), + }, ); - } - streamingError = normalizedStreamingError.log; - } - } finally { - // Clean up the reader to prevent file descriptor leaks - try { - await reader.cancel(); - } catch { - // Ignore errors from cancel - the stream may already be aborted due to timeout - } - // Clean up the event listeners - c.req.raw.signal.removeEventListener("abort", onAbort); + // Forward the error to the client with the buffered content that caused the error + try { + await stream.writeSSE({ + event: "error", + data: JSON.stringify({ + error: normalizedStreamingError.client, + }), + id: String(eventId++), + }); + await stream.writeSSE({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } catch (sseError) { + logger.error( + "Failed to send error SSE", + sseError instanceof Error + ? sseError + : new Error(String(sseError)), + ); + } - // Log the streaming request - const duration = Date.now() - startTime; + streamingError = normalizedStreamingError.log; + } + } finally { + // Clean up the reader to prevent file descriptor leaks + try { + await reader.cancel(); + } catch { + // Ignore errors from cancel - the stream may already be aborted due to timeout + } + // Clean up the event listeners + c.req.raw.signal.removeEventListener("abort", onAbort); - // Calculate estimated tokens if not provided - let calculatedPromptTokens = promptTokens; - let calculatedCompletionTokens = completionTokens; - let calculatedTotalTokens = totalTokens; + // Log the streaming request + const duration = Date.now() - startTime; - // Estimate tokens for providers that don't provide them during streaming - if (!promptTokens || !completionTokens) { - if (!promptTokens && messages && messages.length > 0) { - calculatedPromptTokens = encodeChatMessages(messages); - } + // Calculate estimated tokens if not provided + let calculatedPromptTokens = promptTokens; + let calculatedCompletionTokens = completionTokens; + let calculatedTotalTokens = totalTokens; - if (!completionTokens && (fullContent || imageByteSize > 0)) { - // For images, estimate ~258 tokens per image + 1 token per 750 bytes - let imageTokens = 0; - if (imageByteSize > 0) { - imageTokens = 258 + Math.ceil(imageByteSize / 750); + // Estimate tokens for providers that don't provide them during streaming + if (!promptTokens || !completionTokens) { + if (!promptTokens && messages && messages.length > 0) { + calculatedPromptTokens = encodeChatMessages(messages); } - // Skip expensive token encoding for image responses - use simple estimation - // Token encoding on large base64 content causes CPU spikes - if (imageByteSize > 0) { - const textTokens = estimateTokensFromContent(fullContent); - calculatedCompletionTokens = textTokens + imageTokens; - } else { - try { - const textTokens = fullContent - ? encode(JSON.stringify(fullContent)).length - : 0; - calculatedCompletionTokens = textTokens + imageTokens; - } catch (error) { - // Fallback to simple estimation if encoding fails - logger.error( - "Failed to encode completion text in streaming", - error instanceof Error ? 
error : new Error(String(error)), - ); + if (!completionTokens && (fullContent || imageByteSize > 0)) { + // For images, estimate ~258 tokens per image + 1 token per 750 bytes + let imageTokens = 0; + if (imageByteSize > 0) { + imageTokens = 258 + Math.ceil(imageByteSize / 750); + } + + // Skip expensive token encoding for image responses - use simple estimation + // Token encoding on large base64 content causes CPU spikes + if (imageByteSize > 0) { const textTokens = estimateTokensFromContent(fullContent); calculatedCompletionTokens = textTokens + imageTokens; + } else { + try { + const textTokens = fullContent + ? encode(JSON.stringify(fullContent)).length + : 0; + calculatedCompletionTokens = textTokens + imageTokens; + } catch (error) { + // Fallback to simple estimation if encoding fails + logger.error( + "Failed to encode completion text in streaming", + error instanceof Error ? error : new Error(String(error)), + ); + const textTokens = estimateTokensFromContent(fullContent); + calculatedCompletionTokens = textTokens + imageTokens; + } } } + + calculatedTotalTokens = + (calculatedPromptTokens ?? 0) + + (calculatedCompletionTokens ?? 0); + } + + // Estimate reasoning tokens if not provided but reasoning content exists + let calculatedReasoningTokens = reasoningTokens; + if (!reasoningTokens && fullReasoningContent) { + try { + calculatedReasoningTokens = encode(fullReasoningContent).length; + } catch (error) { + // Fallback to simple estimation if encoding fails + logger.error( + "Failed to encode reasoning text in streaming", + error instanceof Error ? error : new Error(String(error)), + ); + calculatedReasoningTokens = + estimateTokensFromContent(fullReasoningContent); + } + } + + const streamHasVerifiedTerminalEvent = + sawUpstreamDoneSentinel || + sawProviderTerminalEvent || + handledTerminalProviderEvent; + const streamEndedWithoutTerminalEvent = + !streamingError && + !canceled && + (!streamHasVerifiedTerminalEvent || finishReason === null); + if (streamEndedWithoutTerminalEvent) { + const hasBufferedNonWhitespace = /\S/u.test(buffer); + const responseText = hasBufferedNonWhitespace + ? buffer.slice(0, 5000) + : "Stream ended before a terminal finish reason or [DONE] event"; + const errorMessage = + "Upstream stream terminated unexpectedly before completion"; + + logger.warn("[streaming] Stream ended without terminal event", { + provider: usedProvider, + model: usedModel, + bufferLength: buffer.length, + fullContentLength: fullContent.length, + hasToolCalls: + !!streamingToolCalls && streamingToolCalls.length > 0, + unifiedFinishReason: getUnifiedFinishReason( + "upstream_error", + usedProvider, + ), + }); + + streamingError = { + message: errorMessage, + type: "upstream_error", + code: "stream_truncated", + details: { + statusCode: 502, + statusText: "Upstream Stream Terminated", + responseText, + timestamp: new Date().toISOString(), + provider: usedProvider, + model: usedModel, + bufferLength: buffer.length, + }, + }; + finishReason = "upstream_error"; + + try { + await writeSSEAndCache({ + event: "error", + data: JSON.stringify({ + error: { + message: errorMessage, + type: "upstream_error", + code: "stream_truncated", + param: null, + responseText, + }, + }), + id: String(eventId++), + }); + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } catch (sseError) { + logger.error( + "Failed to send truncated stream error SSE", + sseError instanceof Error + ? 
sseError + : new Error(String(sseError)), + ); + } } - calculatedTotalTokens = - (calculatedPromptTokens ?? 0) + (calculatedCompletionTokens ?? 0); - } - - // Estimate reasoning tokens if not provided but reasoning content exists - let calculatedReasoningTokens = reasoningTokens; - if (!reasoningTokens && fullReasoningContent) { - try { - calculatedReasoningTokens = encode(fullReasoningContent).length; - } catch (error) { - // Fallback to simple estimation if encoding fails - logger.error( - "Failed to encode reasoning text in streaming", - error instanceof Error ? error : new Error(String(error)), - ); - calculatedReasoningTokens = - estimateTokensFromContent(fullReasoningContent); - } - } - - if ( - !streamingError && - !canceled && - finishReason === null && - sawOpenAiResponsesDoneEvent && - sawOpenAiResponsesCompletedStatus - ) { - sawProviderTerminalEvent = true; - finishReason = - streamingToolCalls && streamingToolCalls.length > 0 - ? "tool_calls" - : "stop"; - } - - const streamHasVerifiedTerminalEvent = - sawUpstreamDoneSentinel || - sawProviderTerminalEvent || - handledTerminalProviderEvent; - const streamEndedWithoutTerminalEvent = - !streamingError && - !canceled && - (!streamHasVerifiedTerminalEvent || finishReason === null); - if (streamEndedWithoutTerminalEvent) { - const hasBufferedNonWhitespace = /\S/u.test(buffer); - const responseText = hasBufferedNonWhitespace - ? buffer.slice(0, 5000) - : "Stream ended before a terminal finish reason or [DONE] event"; - const errorMessage = - "Upstream stream terminated unexpectedly before completion"; - - logger.warn("[streaming] Stream ended without terminal event", { - provider: usedProvider, - model: usedModel, - bufferLength: buffer.length, - fullContentLength: fullContent.length, - hasToolCalls: - !!streamingToolCalls && streamingToolCalls.length > 0, - unifiedFinishReason: getUnifiedFinishReason( - "upstream_error", - usedProvider, - ), - }); - - streamingError = { - message: errorMessage, - type: "upstream_error", - code: "stream_truncated", - details: { - statusCode: 502, - statusText: "Upstream Stream Terminated", - responseText, - timestamp: new Date().toISOString(), + // Check if the response finished successfully but has no content, tokens, or tool calls + // This indicates an empty response which should be marked as an error + // Do this check BEFORE sending usage chunks to ensure proper event ordering + // Exclude content_filter responses as they are intentionally empty (blocked by provider) + // For Google, check for original finish reasons that indicate content filtering + // These include both finishReason values and promptFeedback.blockReason values + const isGoogleContentFilterStreaming = + isGoogleCompatibleProvider(usedProvider) && + (finishReason === "SAFETY" || + finishReason === "PROHIBITED_CONTENT" || + finishReason === "RECITATION" || + finishReason === "BLOCKLIST" || + finishReason === "SPII" || + finishReason === "OTHER"); + const hasEmptyResponse = + !streamingError && + finishReason && + finishReason !== "content_filter" && + finishReason !== "incomplete" && + !isGoogleContentFilterStreaming && + (!calculatedCompletionTokens || + calculatedCompletionTokens === 0) && + (!calculatedReasoningTokens || calculatedReasoningTokens === 0) && + (!fullContent || fullContent.trim() === "") && + (!streamingToolCalls || streamingToolCalls.length === 0); + + let streamingCostsEarly: + | Awaited> + | undefined; + + if (hasEmptyResponse) { + logger.warn("[streaming] Empty response detected", { provider: usedProvider, 
model: usedModel, - bufferLength: buffer.length, - }, - }; - finishReason = "upstream_error"; - - try { - await writeSSEAndCache({ - event: "error", - data: JSON.stringify({ - error: { - message: errorMessage, - type: "upstream_error", - code: "stream_truncated", - param: null, - responseText, - }, - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), + finishReason, + calculatedCompletionTokens, + calculatedReasoningTokens, + fullContentLength: fullContent?.length ?? 0, + fullContentTrimmed: fullContent?.trim()?.length ?? 0, + streamingToolCallsCount: streamingToolCalls?.length ?? 0, + promptTokens, + completionTokens, + totalTokens, + reasoningTokens, + unifiedFinishReason: getUnifiedFinishReason( + "upstream_error", + usedProvider, + ), }); - doneSent = true; - } catch (sseError) { - logger.error( - "Failed to send truncated stream error SSE", - sseError instanceof Error - ? sseError - : new Error(String(sseError)), - ); - } - } + const errorMessage = + "Response finished successfully but returned no content or tool calls"; + streamingError = errorMessage; + finishReason = "upstream_error"; - // Check if the response finished successfully but has no content, tokens, or tool calls - // This indicates an empty response which should be marked as an error - // Do this check BEFORE sending usage chunks to ensure proper event ordering - // Exclude content_filter responses as they are intentionally empty (blocked by provider) - // For Google, check for original finish reasons that indicate content filtering - // These include both finishReason values and promptFeedback.blockReason values - const isGoogleContentFilterStreaming = - isGoogleCompatibleProvider(usedProvider) && - (finishReason === "SAFETY" || - finishReason === "PROHIBITED_CONTENT" || - finishReason === "RECITATION" || - finishReason === "BLOCKLIST" || - finishReason === "SPII" || - finishReason === "OTHER"); - const hasEmptyResponse = - !streamingError && - finishReason && - finishReason !== "content_filter" && - finishReason !== "incomplete" && - !isGoogleContentFilterStreaming && - (!calculatedCompletionTokens || calculatedCompletionTokens === 0) && - (!calculatedReasoningTokens || calculatedReasoningTokens === 0) && - (!fullContent || fullContent.trim() === "") && - (!streamingToolCalls || streamingToolCalls.length === 0); - - let streamingCostsEarly: - | Awaited> - | undefined; - - if (hasEmptyResponse) { - logger.warn("[streaming] Empty response detected", { - provider: usedProvider, - model: usedModel, - finishReason, - calculatedCompletionTokens, - calculatedReasoningTokens, - fullContentLength: fullContent?.length ?? 0, - fullContentTrimmed: fullContent?.trim()?.length ?? 0, - streamingToolCallsCount: streamingToolCalls?.length ?? 
0, - promptTokens, - completionTokens, - totalTokens, - reasoningTokens, - unifiedFinishReason: getUnifiedFinishReason( - "upstream_error", - usedProvider, - ), - }); - const errorMessage = - "Response finished successfully but returned no content or tool calls"; - streamingError = errorMessage; - finishReason = "upstream_error"; + // Send error event to client using writeSSEAndCache to cache the error + try { + await writeSSEAndCache({ + event: "error", + data: JSON.stringify({ + error: { + message: errorMessage, + type: "upstream_error", + code: "upstream_error", + param: null, + responseText: errorMessage, + }, + }), + id: String(eventId++), + }); + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } catch (sseError) { + logger.error( + "Failed to send upstream error SSE", + sseError instanceof Error + ? sseError + : new Error(String(sseError)), + ); + } + } else if (!streamingError && !doneSent) { + // Calculate costs before sending usage chunk so we can include cost data + const billCancelledRequestsEarly = shouldBillCancelledRequests(); + streamingCostsEarly = + canceled && !billCancelledRequestsEarly + ? { + inputCost: null, + outputCost: null, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + totalCost: null, + promptTokens: null, + completionTokens: null, + cachedTokens: null, + estimatedCost: false, + discount: undefined, + pricingTier: undefined, + } + : await calculateCosts( + usedModel, + usedProvider, + calculatedPromptTokens, + calculatedCompletionTokens, + cachedTokens, + { + prompt: messages + .map((m) => messageContentToString(m.content)) + .join("\n"), + completion: fullContent, + toolResults: streamingToolCalls ?? undefined, + }, + reasoningTokens, + outputImageCount, + image_config?.image_size, + inputImageCount, + webSearchCount, + project.organizationId, + ); - // Send error event to client using writeSSEAndCache to cache the error - try { - await writeSSEAndCache({ - event: "error", - data: JSON.stringify({ - error: { - message: errorMessage, - type: "upstream_error", - code: "upstream_error", - param: null, - responseText: errorMessage, - }, - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - doneSent = true; - } catch (sseError) { - logger.error( - "Failed to send upstream error SSE", - sseError instanceof Error - ? sseError - : new Error(String(sseError)), - ); - } - } else if (!streamingError && !doneSent) { - if ( - finishReason && - !sentDownstreamFinishReasonChunk && - !shouldBufferForHealing - ) { + // Always send final usage chunk with cost data for SDK compatibility try { - const finishChunk = { + const finalUsageChunk = { id: `chatcmpl-${Date.now()}`, object: "chat.completion.chunk", created: Math.floor(Date.now() / 1000), @@ -5960,28 +6019,204 @@ chat.openapi(completions, async (c) => { { index: 0, delta: {}, - finish_reason: finishReason, + finish_reason: null, }, ], + usage: (() => { + // Only add image input tokens for providers that + // exclude them from upstream usage (Google) + const providerExcludesImageInput = + isGoogleCompatibleProvider(usedProvider); + const imageInputAdj = providerExcludesImageInput + ? inputImageCount * 560 + : 0; + const adjPrompt = Math.max( + 1, + Math.round( + promptTokens && promptTokens > 0 + ? promptTokens + imageInputAdj + : (calculatedPromptTokens ?? 
1) + imageInputAdj, + ), + ); + const adjCompletion = Math.round( + completionTokens ?? calculatedCompletionTokens ?? 0, + ); + return { + prompt_tokens: adjPrompt, + completion_tokens: adjCompletion, + total_tokens: Math.max( + 1, + Math.round(adjPrompt + adjCompletion), + ), + ...(cachedTokens !== null && { + prompt_tokens_details: { + cached_tokens: cachedTokens, + }, + }), + cost_usd_total: streamingCostsEarly.totalCost, + cost_usd_input: streamingCostsEarly.inputCost, + cost_usd_output: streamingCostsEarly.outputCost, + cost_usd_cached_input: + streamingCostsEarly.cachedInputCost, + cost_usd_request: streamingCostsEarly.requestCost, + cost_usd_image_input: streamingCostsEarly.imageInputCost, + cost_usd_image_output: + streamingCostsEarly.imageOutputCost, + }; + })(), }; await writeSSEAndCache({ - data: JSON.stringify(finishChunk), + data: JSON.stringify(finalUsageChunk), id: String(eventId++), }); - sentDownstreamFinishReasonChunk = true; } catch (error) { logger.error( - "Error sending synthesized finish chunk", + "Error sending final usage chunk", error instanceof Error ? error : new Error(String(error)), ); } + + // Send healed content if buffering was enabled + if ( + shouldBufferForHealing && + bufferedContentChunks.length > 0 && + !streamingError + ) { + try { + // Combine buffered content and apply healing + const bufferedContent = bufferedContentChunks.join(""); + const healingResult = healJsonResponse(bufferedContent); + + // Store plugin results for logging + streamingPluginResults.responseHealing = { + healed: healingResult.healed, + healingMethod: healingResult.healingMethod, + }; + + if (healingResult.healed) { + logger.debug("Streaming response healing applied", { + method: healingResult.healingMethod, + originalLength: healingResult.originalContent.length, + healedLength: healingResult.content.length, + }); + // Update fullContent with healed version for logging + fullContent = healingResult.content; + } + + // Send the healed (or original if no healing needed) content as a single chunk + const healedContentChunk = { + id: lastChunkId ?? `chatcmpl-${Date.now()}`, + object: "chat.completion.chunk", + created: lastChunkCreated ?? Math.floor(Date.now() / 1000), + model: lastChunkModel ?? usedModel, + choices: [ + { + index: 0, + delta: { + content: healingResult.content, + }, + finish_reason: null, + }, + ], + }; + + await writeSSEAndCache({ + data: JSON.stringify(healedContentChunk), + id: String(eventId++), + }); + + // Send finish_reason chunk + const finishChunk = { + id: lastChunkId ?? `chatcmpl-${Date.now()}`, + object: "chat.completion.chunk", + created: lastChunkCreated ?? Math.floor(Date.now() / 1000), + model: lastChunkModel ?? usedModel, + choices: [ + { + index: 0, + delta: {}, + finish_reason: finishReason ?? "stop", + }, + ], + }; + + await writeSSEAndCache({ + data: JSON.stringify(finishChunk), + id: String(eventId++), + }); + } catch (error) { + logger.error( + "Error sending healed content chunk", + error instanceof Error ? error : new Error(String(error)), + ); + } + } + + // Send routing metadata for all attempts (including successful) + if (routingAttempts.length > 0 && !doneSent) { + try { + const routingChunk = { + id: `chatcmpl-${Date.now()}`, + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model: usedModel, + choices: [ + { + index: 0, + delta: {}, + finish_reason: null, + }, + ], + metadata: { + requested_model: initialRequestedModel, + requested_provider: requestedProvider ?? 
null,
+                      used_model: baseModelName,
+                      used_provider: usedProvider,
+                      ...(usedRegion && { used_region: usedRegion }),
+                      underlying_used_model: usedModel,
+                      routing: routingAttempts,
+                    },
+                  };
+                  await writeSSEAndCache({
+                    data: JSON.stringify(routingChunk),
+                    id: String(eventId++),
+                  });
+                } catch (error) {
+                  logger.error(
+                    "Error sending routing metadata chunk",
+                    error instanceof Error ? error : new Error(String(error)),
+                  );
+                }
+              }
+
+              // Always send [DONE] at the end of streaming if not already sent
+              if (!doneSent) {
+                try {
+                  await writeSSEAndCache({
+                    event: "done",
+                    data: "[DONE]",
+                    id: String(eventId++),
+                  });
+                } catch (error) {
+                  logger.error(
+                    "Error sending [DONE] event",
+                    error instanceof Error ? error : new Error(String(error)),
+                  );
+                }
+              }
+            }
          }

-          // Calculate costs before sending usage chunk so we can include cost data
-          const billCancelledRequestsEarly = shouldBillCancelledRequests();
-          streamingCostsEarly =
-            canceled && !billCancelledRequestsEarly
+          // Clean up keepalive before any potentially-throwing operations (enqueueChatLog, etc.)
+          // clearInterval is idempotent so calling it multiple times is safe
+          clearKeepalive();
+
+          // Reuse costs calculated earlier (before usage chunk was sent)
+          // If we came through the error path (hasEmptyResponse), calculate now
+          const billCancelledRequests = shouldBillCancelledRequests();
+          const costs =
+            streamingCostsEarly ??
+            (canceled && !billCancelledRequests
              ? {
                  inputCost: null,
                  outputCost: null,
@@ -6019,473 +6254,226 @@ chat.openapi(completions, async (c) => {
            inputImageCount,
            webSearchCount,
            project.organizationId,
-          );
+            ));
+
+          // Use costs.promptTokens as canonical value (includes image input
+          // tokens for providers that exclude them from upstream usage)
+          if (
+            costs.promptTokens !== null &&
+            costs.promptTokens !== undefined
+          ) {
+            const promptDelta =
+              (costs.promptTokens ?? 0) - (calculatedPromptTokens ??
0); + if (promptDelta > 0) { + calculatedPromptTokens = costs.promptTokens; + calculatedTotalTokens = + (calculatedTotalTokens ?? 0) + promptDelta; + } + } + + // Determine plugin results for logging (includes healing results if applicable) + const finalPluginResults = + Object.keys(streamingPluginResults).length > 0 + ? streamingPluginResults + : undefined; + + // Enhanced logging for Google models streaming to debug missing responses + if (isGoogleCompatibleProvider(usedProvider)) { + logger.debug("Google model streaming response completed", { + usedProvider, + usedModel, + hasContent: !!fullContent, + contentLength: fullContent.length, + finishReason, + promptTokens: calculatedPromptTokens, + completionTokens: calculatedCompletionTokens, + totalTokens: calculatedTotalTokens, + reasoningTokens, + streamingError: streamingError ? String(streamingError) : null, + canceled, + hasToolCalls: + !!streamingToolCalls && streamingToolCalls.length > 0, + }); + } + + // For cancelled requests, determine if we should include token counts for billing + const shouldIncludeTokensForBilling = + !canceled || (canceled && billCancelledRequests); + + enqueueChatLog( + c, + { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + rawResponse: streamingError ?? streamingRawResponseData, + upstreamRequest: requestBody, + upstreamResponse: streamingError ?? rawUpstreamData, + plugins: requestPluginIds, + pluginResults: finalPluginResults, + }, + { + id: routingAttempts.length > 0 ? finalLogId : undefined, + duration, + timeToFirstToken, + timeToFirstReasoningToken, + responseSize: fullContent.length, + content: fullContent, + reasoningContent: fullReasoningContent || null, + finishReason: canceled ? "canceled" : finishReason, + promptTokens: shouldIncludeTokensForBilling + ? (calculatedPromptTokens?.toString() ?? null) + : null, + completionTokens: shouldIncludeTokensForBilling + ? (calculatedCompletionTokens?.toString() ?? null) + : null, + totalTokens: shouldIncludeTokensForBilling + ? (calculatedTotalTokens?.toString() ?? null) + : null, + reasoningTokens: shouldIncludeTokensForBilling + ? (calculatedReasoningTokens?.toString() ?? null) + : null, + cachedTokens: shouldIncludeTokensForBilling + ? (cachedTokens?.toString() ?? null) + : null, + hasError: streamingError !== null, + errorDetails: streamingError + ? { + statusCode: + typeof streamingError === "object" && + streamingError !== null && + "details" in streamingError && + typeof streamingError.details === "object" && + streamingError.details !== null && + "statusCode" in streamingError.details && + typeof streamingError.details.statusCode === "number" + ? streamingError.details.statusCode + : 500, + statusText: + typeof streamingError === "object" && + streamingError !== null && + "details" in streamingError && + typeof streamingError.details === "object" && + streamingError.details !== null && + "statusText" in streamingError.details && + typeof streamingError.details.statusText === "string" + ? 
streamingError.details.statusText + : "Streaming Error", + responseText: + typeof streamingError === "object" && + streamingError !== null && + "details" in streamingError && + typeof streamingError.details === "object" && + streamingError.details !== null && + "responseText" in streamingError.details && + typeof streamingError.details.responseText === "string" + ? streamingError.details.responseText + : typeof streamingError === "object" && + streamingError !== null && + "details" in streamingError + ? JSON.stringify(streamingError) + : streamingError instanceof Error + ? streamingError.message + : String(streamingError), + } + : null, + streamed: true, + canceled: canceled, + inputCost: costs.inputCost, + outputCost: costs.outputCost, + cachedInputCost: costs.cachedInputCost, + requestCost: costs.requestCost, + webSearchCost: costs.webSearchCost, + imageInputTokens: costs.imageInputTokens?.toString() ?? null, + imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, + imageInputCost: costs.imageInputCost ?? null, + imageOutputCost: costs.imageOutputCost ?? null, + cost: costs.totalCost, + estimatedCost: costs.estimatedCost, + discount: costs.discount, + pricingTier: costs.pricingTier, + dataStorageCost: shouldIncludeTokensForBilling + ? calculateDataStorageCost( + calculatedPromptTokens, + cachedTokens, + calculatedCompletionTokens, + calculatedReasoningTokens, + retentionLevel, + ) + : "0", + cached: false, + toolResults: streamingToolCalls, + }, + ); - await writeSSEAndCache({ - data: JSON.stringify(finalUsageChunk), - id: String(eventId++), - }); - } catch (error) { - logger.error( - "Error sending final usage chunk", - error instanceof Error ? error : new Error(String(error)), - ); + // Report key health for environment-based tokens + if (envVarName !== undefined) { + if (streamingError !== null) { + reportKeyError(envVarName, configIndex, 500); + } else { + reportKeySuccess(envVarName, configIndex); + } } - // Send healed content if buffering was enabled + // Save streaming cache if enabled and not canceled and no errors if ( - shouldBufferForHealing && - bufferedContentChunks.length > 0 && + cachingEnabled && + streamingCacheKey && + !canceled && + finishReason && !streamingError ) { try { - // Combine buffered content and apply healing - const bufferedContent = bufferedContentChunks.join(""); - const healingResult = healJsonResponse(bufferedContent); - - // Store plugin results for logging - streamingPluginResults.responseHealing = { - healed: healingResult.healed, - healingMethod: healingResult.healingMethod, - }; - - if (healingResult.healed) { - logger.debug("Streaming response healing applied", { - method: healingResult.healingMethod, - originalLength: healingResult.originalContent.length, - healedLength: healingResult.content.length, - }); - // Update fullContent with healed version for logging - fullContent = healingResult.content; - } - - // Send the healed (or original if no healing needed) content as a single chunk - const healedContentChunk = { - id: lastChunkId ?? `chatcmpl-${Date.now()}`, - object: "chat.completion.chunk", - created: lastChunkCreated ?? Math.floor(Date.now() / 1000), - model: lastChunkModel ?? usedModel, - choices: [ - { - index: 0, - delta: { - content: healingResult.content, - }, - finish_reason: null, - }, - ], - }; - - await writeSSEAndCache({ - data: JSON.stringify(healedContentChunk), - id: String(eventId++), - }); - - // Send finish_reason chunk - const finishChunk = { - id: lastChunkId ?? 
`chatcmpl-${Date.now()}`, - object: "chat.completion.chunk", - created: lastChunkCreated ?? Math.floor(Date.now() / 1000), - model: lastChunkModel ?? usedModel, - choices: [ - { - index: 0, - delta: {}, - finish_reason: finishReason ?? "stop", - }, - ], - }; - - await writeSSEAndCache({ - data: JSON.stringify(finishChunk), - id: String(eventId++), - }); - } catch (error) { - logger.error( - "Error sending healed content chunk", - error instanceof Error ? error : new Error(String(error)), - ); - } - } - - // Send routing metadata for all attempts (including successful) - if (routingAttempts.length > 0 && !doneSent) { - try { - const routingChunk = { - id: `chatcmpl-${Date.now()}`, - object: "chat.completion.chunk", - created: Math.floor(Date.now() / 1000), - model: usedModel, - choices: [ - { - index: 0, - delta: {}, - finish_reason: null, - }, - ], + const streamingCacheData = { + chunks: streamingChunks, metadata: { - requested_model: initialRequestedModel, - requested_provider: requestedProvider ?? null, - used_model: baseModelName, - used_provider: usedProvider, - ...(usedRegion && { used_region: usedRegion }), - underlying_used_model: usedModel, - routing: routingAttempts, + model: usedModel, + provider: usedProvider, + finishReason: finishReason, + totalChunks: streamingChunks.length, + duration: duration, + completed: true, }, }; - await writeSSEAndCache({ - data: JSON.stringify(routingChunk), - id: String(eventId++), - }); - } catch (error) { - logger.error( - "Error sending routing metadata chunk", - error instanceof Error ? error : new Error(String(error)), - ); - } - } - // Always send [DONE] at the end of streaming if not already sent - if (!doneSent) { - try { - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); + await setStreamingCache( + streamingCacheKey, + streamingCacheData, + cacheDuration, + ); } catch (error) { logger.error( - "Error sending [DONE] event", + "Error saving streaming cache", error instanceof Error ? error : new Error(String(error)), ); } } } - - // Clean up keepalive before any potentially-throwing operations (insertLog, etc.) - // clearInterval is idempotent so calling it multiple times is safe - clearKeepalive(); - - // Reuse costs calculated earlier (before usage chunk was sent) - // If we came through the error path (hasEmptyResponse), calculate now - const billCancelledRequests = shouldBillCancelledRequests(); - const costs = - streamingCostsEarly ?? - (canceled && !billCancelledRequests - ? { - inputCost: null, - outputCost: null, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - totalCost: null, - promptTokens: null, - completionTokens: null, - cachedTokens: null, - estimatedCost: false, - discount: undefined, - pricingTier: undefined, - } - : await calculateCosts( - usedModel, - usedProvider, - calculatedPromptTokens, - calculatedCompletionTokens, - cachedTokens, - { - prompt: messages - .map((m) => messageContentToString(m.content)) - .join("\n"), - completion: fullContent, - toolResults: streamingToolCalls ?? 
undefined, - }, - reasoningTokens, - outputImageCount, - image_config?.image_size, - inputImageCount, - webSearchCount, - project.organizationId, - )); - - // Use costs.promptTokens as canonical value (includes image input - // tokens for providers that exclude them from upstream usage) - if (costs.promptTokens !== null && costs.promptTokens !== undefined) { - const promptDelta = - (costs.promptTokens ?? 0) - (calculatedPromptTokens ?? 0); - if (promptDelta > 0) { - calculatedPromptTokens = costs.promptTokens; - calculatedTotalTokens = - (calculatedTotalTokens ?? 0) + promptDelta; - } - } - - // Extract plugin IDs for logging - const streamingPluginIds = plugins?.map((p) => p.id) ?? []; - - // Determine plugin results for logging (includes healing results if applicable) - const finalPluginResults = - Object.keys(streamingPluginResults).length > 0 - ? streamingPluginResults - : undefined; - - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - streamingError ?? streamingRawResponseData, // Raw SSE data sent back to the client - requestBody, // The request sent to the provider - streamingError ?? rawUpstreamData, // Raw streaming data received from upstream provider - streamingPluginIds, - finalPluginResults, // Plugin results including healing (if enabled) - ); - - // Enhanced logging for Google models streaming to debug missing responses - if (isGoogleCompatibleProvider(usedProvider)) { - logger.debug("Google model streaming response completed", { - usedProvider, - usedModel, - hasContent: !!fullContent, - contentLength: fullContent.length, - finishReason, - promptTokens: calculatedPromptTokens, - completionTokens: calculatedCompletionTokens, - totalTokens: calculatedTotalTokens, - reasoningTokens, - streamingError: streamingError ? String(streamingError) : null, - canceled, - hasToolCalls: - !!streamingToolCalls && streamingToolCalls.length > 0, - }); - } - - // For cancelled requests, determine if we should include token counts for billing - const shouldIncludeTokensForBilling = - !canceled || (canceled && billCancelledRequests); - - await insertLog({ - ...baseLogEntry, - id: routingAttempts.length > 0 ? finalLogId : undefined, - duration, - timeToFirstToken, - timeToFirstReasoningToken, - responseSize: fullContent.length, - content: fullContent, - reasoningContent: fullReasoningContent || null, - finishReason: canceled ? "canceled" : finishReason, - promptTokens: shouldIncludeTokensForBilling - ? (calculatedPromptTokens?.toString() ?? null) - : null, - completionTokens: shouldIncludeTokensForBilling - ? (calculatedCompletionTokens?.toString() ?? null) - : null, - totalTokens: shouldIncludeTokensForBilling - ? (calculatedTotalTokens?.toString() ?? null) - : null, - reasoningTokens: shouldIncludeTokensForBilling - ? (calculatedReasoningTokens?.toString() ?? null) - : null, - cachedTokens: shouldIncludeTokensForBilling - ? (cachedTokens?.toString() ?? null) - : null, - hasError: streamingError !== null, - errorDetails: streamingError - ? 
{ - statusCode: - typeof streamingError === "object" && - streamingError !== null && - "details" in streamingError && - typeof streamingError.details === "object" && - streamingError.details !== null && - "statusCode" in streamingError.details && - typeof streamingError.details.statusCode === "number" - ? streamingError.details.statusCode - : 500, - statusText: - typeof streamingError === "object" && - streamingError !== null && - "details" in streamingError && - typeof streamingError.details === "object" && - streamingError.details !== null && - "statusText" in streamingError.details && - typeof streamingError.details.statusText === "string" - ? streamingError.details.statusText - : "Streaming Error", - responseText: - typeof streamingError === "object" && - streamingError !== null && - "details" in streamingError && - typeof streamingError.details === "object" && - streamingError.details !== null && - "responseText" in streamingError.details && - typeof streamingError.details.responseText === "string" - ? streamingError.details.responseText - : typeof streamingError === "object" && - streamingError !== null && - "details" in streamingError - ? JSON.stringify(streamingError) - : streamingError instanceof Error - ? streamingError.message - : String(streamingError), - } - : null, - streamed: true, - canceled: canceled, - inputCost: costs.inputCost, - outputCost: costs.outputCost, - cachedInputCost: costs.cachedInputCost, - requestCost: costs.requestCost, - webSearchCost: costs.webSearchCost, - imageInputTokens: costs.imageInputTokens?.toString() ?? null, - imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, - imageInputCost: costs.imageInputCost ?? null, - imageOutputCost: costs.imageOutputCost ?? null, - cost: costs.totalCost, - estimatedCost: costs.estimatedCost, - discount: costs.discount, - pricingTier: costs.pricingTier, - dataStorageCost: shouldIncludeTokensForBilling - ? calculateDataStorageCost( - calculatedPromptTokens, - cachedTokens, - calculatedCompletionTokens, - calculatedReasoningTokens, - retentionLevel, - ) - : "0", - cached: false, - tools, - toolResults: streamingToolCalls, - toolChoice: tool_choice, - }); - - // Report key health for environment-based tokens - if (envVarName !== undefined) { - if (streamingError !== null) { - reportKeyError(envVarName, configIndex, 500); - } else { - reportKeySuccess(envVarName, configIndex); - } - } - - // Save streaming cache if enabled and not canceled and no errors - if ( - cachingEnabled && - streamingCacheKey && - !canceled && - finishReason && - !streamingError - ) { - try { - const streamingCacheData = { - chunks: streamingChunks, - metadata: { - model: usedModel, - provider: usedProvider, - finishReason: finishReason, - totalChunks: streamingChunks.length, - duration: duration, - completed: true, - }, - }; - - await setStreamingCache( - streamingCacheKey, - streamingCacheData, - cacheDuration, - ); - } catch (error) { - logger.error( - "Error saving streaming cache", - error instanceof Error ? error : new Error(String(error)), - ); - } - } - } + })().finally(() => { + finishStreamCompletion(c); + }); }, async (error) => { if (error.name === "TimeoutError") { @@ -6726,10 +6714,6 @@ chat.openapi(completions, async (c) => { ), }); - // Log the error in the database - // Extract plugin IDs for logging (non-streaming fetch error) - const nonStreamingFetchErrorPluginIds = plugins?.map((p) => p.id) ?? 
[]; - // Check if we should retry before logging so we can mark the log as retried const willRetryFetchNonStreaming = shouldRetryRequest({ requestedProvider, @@ -6743,80 +6727,78 @@ chat.openapi(completions, async (c) => { usedProvider, }); - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - null, // No response for fetch error - requestBody, // The request that resulted in error - null, // No upstream response for fetch error - nonStreamingFetchErrorPluginIds, - undefined, // No plugin results for error case - ); - - await insertLog({ - ...baseLogEntry, - duration: perAttemptDuration, - timeToFirstToken: null, // Not applicable for error case - timeToFirstReasoningToken: null, // Not applicable for error case - responseSize: 0, - content: null, - reasoningContent: null, - finishReason: "upstream_error", - promptTokens: null, - completionTokens: null, - totalTokens: null, - reasoningTokens: null, - cachedTokens: null, - hasError: true, - streamed: false, - canceled: false, - errorDetails: { - statusCode: 0, - statusText: fetchError.name, - responseText: errorMessage, - cause: nonStreamingFetchCause, + enqueueChatLog( + c, + { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + rawResponse: null, + upstreamRequest: requestBody, + upstreamResponse: null, + plugins: requestPluginIds, + pluginResults: undefined, }, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - estimatedCost: false, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - retried: willRetryFetchNonStreaming, - retriedByLogId: willRetryFetchNonStreaming ? finalLogId : null, - }); + { + duration: perAttemptDuration, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "upstream_error", + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: true, + streamed: false, + canceled: false, + errorDetails: { + statusCode: 0, + statusText: fetchError.name, + responseText: errorMessage, + cause: nonStreamingFetchCause, + }, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + estimatedCost: false, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + retried: willRetryFetchNonStreaming, + retriedByLogId: willRetryFetchNonStreaming ? 
finalLogId : null, + }, + ); // Report key health for environment-based tokens if (envVarName !== undefined) { @@ -6858,10 +6840,6 @@ chat.openapi(completions, async (c) => { // If the request was canceled, log it and return a response if (canceled) { - // Log the canceled request - // Extract plugin IDs for logging (canceled non-streaming) - const canceledNonStreamingPluginIds = plugins?.map((p) => p.id) ?? []; - // Calculate costs for cancelled request if billing is enabled const billCancelled = shouldBillCancelledRequests(); let cancelledCosts: Awaited> | null = @@ -6902,90 +6880,93 @@ chat.openapi(completions, async (c) => { ); } - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - null, // No response for canceled request - requestBody, // The request that was prepared before cancellation - null, // No upstream response for canceled request - canceledNonStreamingPluginIds, - undefined, // No plugin results for canceled request + enqueueChatLog( + c, + { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + rawResponse: null, + upstreamRequest: requestBody, + upstreamResponse: null, + plugins: requestPluginIds, + pluginResults: undefined, + }, + { + duration, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "canceled", + promptTokens: billCancelled + ? ( + cancelledCosts?.promptTokens ?? estimatedPromptTokens + )?.toString() + : null, + completionTokens: billCancelled ? "0" : null, + totalTokens: billCancelled + ? ( + cancelledCosts?.promptTokens ?? estimatedPromptTokens + )?.toString() + : null, + reasoningTokens: null, + cachedTokens: null, + hasError: false, + streamed: false, + canceled: true, + errorDetails: null, + inputCost: cancelledCosts?.inputCost ?? null, + outputCost: cancelledCosts?.outputCost ?? null, + cachedInputCost: cancelledCosts?.cachedInputCost ?? null, + requestCost: cancelledCosts?.requestCost ?? null, + webSearchCost: cancelledCosts?.webSearchCost ?? null, + imageInputTokens: + cancelledCosts?.imageInputTokens?.toString() ?? null, + imageOutputTokens: + cancelledCosts?.imageOutputTokens?.toString() ?? null, + imageInputCost: cancelledCosts?.imageInputCost ?? null, + imageOutputCost: cancelledCosts?.imageOutputCost ?? null, + cost: cancelledCosts?.totalCost ?? null, + estimatedCost: cancelledCosts?.estimatedCost ?? false, + discount: cancelledCosts?.discount ?? null, + dataStorageCost: billCancelled + ? calculateDataStorageCost( + cancelledCosts?.promptTokens ?? 
estimatedPromptTokens, + null, + 0, + null, + retentionLevel, + ) + : "0", + cached: false, + toolResults: null, + }, ); - await insertLog({ - ...baseLogEntry, - duration, - timeToFirstToken: null, // Not applicable for canceled request - timeToFirstReasoningToken: null, // Not applicable for canceled request - responseSize: 0, - content: null, - reasoningContent: null, - finishReason: "canceled", - promptTokens: billCancelled - ? (cancelledCosts?.promptTokens ?? estimatedPromptTokens)?.toString() - : null, - completionTokens: billCancelled ? "0" : null, - totalTokens: billCancelled - ? (cancelledCosts?.promptTokens ?? estimatedPromptTokens)?.toString() - : null, - reasoningTokens: null, - cachedTokens: null, - hasError: false, - streamed: false, - canceled: true, - errorDetails: null, - inputCost: cancelledCosts?.inputCost ?? null, - outputCost: cancelledCosts?.outputCost ?? null, - cachedInputCost: cancelledCosts?.cachedInputCost ?? null, - requestCost: cancelledCosts?.requestCost ?? null, - webSearchCost: cancelledCosts?.webSearchCost ?? null, - imageInputTokens: cancelledCosts?.imageInputTokens?.toString() ?? null, - imageOutputTokens: - cancelledCosts?.imageOutputTokens?.toString() ?? null, - imageInputCost: cancelledCosts?.imageInputCost ?? null, - imageOutputCost: cancelledCosts?.imageOutputCost ?? null, - cost: cancelledCosts?.totalCost ?? null, - estimatedCost: cancelledCosts?.estimatedCost ?? false, - discount: cancelledCosts?.discount ?? null, - dataStorageCost: billCancelled - ? calculateDataStorageCost( - cancelledCosts?.promptTokens ?? estimatedPromptTokens, - null, - 0, - null, - retentionLevel, - ) - : "0", - cached: false, - toolResults: null, - }); - return c.json( { error: { @@ -7027,79 +7008,76 @@ chat.openapi(completions, async (c) => { ), }); - const bodyTimeoutPluginIds = plugins?.map((p) => p.id) ?? 
[]; - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping!, - usedProvider!, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - null, - requestBody, - null, - bodyTimeoutPluginIds, - undefined, - ); - - await insertLog({ - ...baseLogEntry, - duration: Date.now() - perAttemptStartTime, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: 0, - content: null, - reasoningContent: null, - finishReason: "upstream_error", - promptTokens: null, - completionTokens: null, - totalTokens: null, - reasoningTokens: null, - cachedTokens: null, - hasError: true, - streamed: false, - canceled: false, - errorDetails: { - statusCode: res.status, - statusText: "TimeoutError", - responseText: errorMessage, - cause: bodyErrorCause, + enqueueChatLog( + c, + { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + rawResponse: null, + upstreamRequest: requestBody, + upstreamResponse: null, + plugins: requestPluginIds, + pluginResults: undefined, }, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - estimatedCost: false, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - }); + { + duration: Date.now() - perAttemptStartTime, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "upstream_error", + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: true, + streamed: false, + canceled: false, + errorDetails: { + statusCode: res.status, + statusText: "TimeoutError", + responseText: errorMessage, + cause: bodyErrorCause, + }, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + estimatedCost: false, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + }, + ); return c.json( { @@ -7143,10 +7121,6 @@ chat.openapi(completions, async (c) => { }); } - // Log the request in the database - // Extract plugin IDs for logging - const providerErrorPluginIds = plugins?.map((p) => p.id) ?? 
[]; - // Check if we should retry before logging so we can mark the log as retried const willRetryHttpNonStreaming = shouldRetryRequest({ requestedProvider, @@ -7160,99 +7134,95 @@ chat.openapi(completions, async (c) => { usedProvider, }); - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - errorResponseText, // Our formatted error response - requestBody, // The request that resulted in error - errorResponseText, // Raw upstream error response - providerErrorPluginIds, - undefined, // No plugin results for error case - ); - - await insertLog({ - ...baseLogEntry, - duration: perAttemptDuration, - timeToFirstToken: null, // Not applicable for error case - timeToFirstReasoningToken: null, // Not applicable for error case - responseSize: errorResponseText.length, - content: null, - reasoningContent: null, - finishReason, - promptTokens: null, - completionTokens: null, - totalTokens: null, - reasoningTokens: null, - cachedTokens: null, - hasError: finishReason !== "content_filter", // content_filter is not an error - streamed: false, - canceled: false, - errorDetails: (() => { - // content_filter is not an error, no error details needed - if (finishReason === "content_filter") { - return null; - } - // For client errors, try to parse the original error and include the message - if (finishReason === "client_error") { - try { - const originalError = JSON.parse(errorResponseText); - return { - statusCode: res.status, - statusText: res.statusText, - responseText: errorResponseText, - message: originalError.error?.message ?? 
errorResponseText, - }; - } catch { - // If parsing fails, use default format + enqueueChatLog( + c, + { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + rawResponse: errorResponseText, + upstreamRequest: requestBody, + upstreamResponse: errorResponseText, + plugins: requestPluginIds, + pluginResults: undefined, + }, + { + duration: perAttemptDuration, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: errorResponseText.length, + content: null, + reasoningContent: null, + finishReason, + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: finishReason !== "content_filter", + streamed: false, + canceled: false, + errorDetails: (() => { + if (finishReason === "content_filter") { + return null; } - } - return { - statusCode: res.status, - statusText: res.statusText, - responseText: errorResponseText, - }; - })(), - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - estimatedCost: false, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - retried: willRetryHttpNonStreaming, - retriedByLogId: willRetryHttpNonStreaming ? finalLogId : null, - }); + if (finishReason === "client_error") { + try { + const originalError = JSON.parse(errorResponseText); + return { + statusCode: res.status, + statusText: res.statusText, + responseText: errorResponseText, + message: originalError.error?.message ?? errorResponseText, + }; + } catch { + // If parsing fails, use default format + } + } + return { + statusCode: res.status, + statusText: res.statusText, + responseText: errorResponseText, + }; + })(), + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + estimatedCost: false, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + retried: willRetryHttpNonStreaming, + retriedByLogId: willRetryHttpNonStreaming ? finalLogId : null, + }, + ); // Report key health for environment-based tokens // Don't report content_filter as a key error - it's intentional provider behavior @@ -7416,79 +7386,76 @@ chat.openapi(completions, async (c) => { ), }); - const bodyTimeoutPluginIds = plugins?.map((p) => p.id) ?? 
[]; - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted!, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - null, - requestBody, - null, - bodyTimeoutPluginIds, - undefined, - ); - - await insertLog({ - ...baseLogEntry, - duration: Date.now() - startTime, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: 0, - content: null, - reasoningContent: null, - finishReason: "upstream_error", - promptTokens: null, - completionTokens: null, - totalTokens: null, - reasoningTokens: null, - cachedTokens: null, - hasError: true, - streamed: false, - canceled: false, - errorDetails: { - statusCode: res.status, - statusText: "TimeoutError", - responseText: errorMessage, - cause: bodyReadCause, + enqueueChatLog( + c, + { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + rawResponse: null, + upstreamRequest: requestBody, + upstreamResponse: null, + plugins: requestPluginIds, + pluginResults: undefined, }, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - estimatedCost: false, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - }); + { + duration: Date.now() - startTime, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "upstream_error", + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: true, + streamed: false, + canceled: false, + errorDetails: { + statusCode: res.status, + statusText: "TimeoutError", + responseText: errorMessage, + cause: bodyReadCause, + }, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + estimatedCost: false, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + }, + ); return c.json( { @@ -7706,45 +7673,6 @@ chat.openapi(completions, async (c) => { usedRegion, ); - // Extract plugin IDs for logging - const pluginIds = plugins?.map((p) => p.id) ?? 
[]; - - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - transformedResponse, // Our formatted response that we return to user - requestBody, // The request sent to the provider - json, // Raw upstream response from provider - pluginIds, - Object.keys(pluginResults).length > 0 ? pluginResults : undefined, - ); - // Check if the non-streaming response is empty (no content, tokens, or tool calls) // Exclude content_filter responses as they are intentionally empty (blocked by provider) // For Google, check for original finish reasons that indicate content filtering @@ -7792,63 +7720,96 @@ chat.openapi(completions, async (c) => { } } - await insertLog({ - ...baseLogEntry, - id: routingAttempts.length > 0 ? finalLogId : undefined, - duration, - timeToFirstToken: null, // Not applicable for non-streaming requests - timeToFirstReasoningToken: null, // Not applicable for non-streaming requests - responseSize, - content: content, - reasoningContent: reasoningContent, - finishReason: hasEmptyNonStreamingResponse - ? "upstream_error" - : finishReason, - promptTokens: calculatedPromptTokens?.toString() ?? null, - completionTokens: calculatedCompletionTokens?.toString() ?? null, - totalTokens: - totalTokens ?? - ( - (calculatedPromptTokens ?? 0) + (calculatedCompletionTokens ?? 0) - ).toString(), - reasoningTokens: calculatedReasoningTokens?.toString() ?? null, - cachedTokens: cachedTokens?.toString() ?? null, - hasError: hasEmptyNonStreamingResponse, - streamed: false, - canceled: false, - errorDetails: hasEmptyNonStreamingResponse - ? { - statusCode: 500, - statusText: "Empty Response", - responseText: - "Response finished successfully but returned no content or tool calls", - } - : null, - inputCost: costs.inputCost, - outputCost: costs.outputCost, - cachedInputCost: costs.cachedInputCost, - requestCost: costs.requestCost, - webSearchCost: costs.webSearchCost, - imageInputTokens: costs.imageInputTokens?.toString() ?? null, - imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, - imageInputCost: costs.imageInputCost ?? null, - imageOutputCost: costs.imageOutputCost ?? 
null, - cost: costs.totalCost, - estimatedCost: costs.estimatedCost, - discount: costs.discount, - pricingTier: costs.pricingTier, - dataStorageCost: calculateDataStorageCost( - calculatedPromptTokens, - cachedTokens, - calculatedCompletionTokens, - calculatedReasoningTokens, - retentionLevel, - ), - cached: false, - tools, - toolResults, - toolChoice: tool_choice, - }); + enqueueChatLog( + c, + { + providerKeyId: providerKey?.id, + usedModel: usedModelFormatted, + usedModelMapping, + usedProvider, + requestedModel: initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoningEffort: reasoning_effort, + reasoningMaxTokens: reasoning_max_tokens, + effort, + responseFormat: response_format, + tools, + toolChoice: tool_choice, + source, + customHeaders, + debugMode, + userAgent, + imageConfig: image_config, + routingMetadata, + rawRequest: rawBody, + rawResponse: transformedResponse, + upstreamRequest: requestBody, + upstreamResponse: json, + plugins: requestPluginIds, + pluginResults: + Object.keys(pluginResults).length > 0 ? pluginResults : undefined, + }, + { + id: routingAttempts.length > 0 ? finalLogId : undefined, + duration, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize, + content: content, + reasoningContent: reasoningContent, + finishReason: hasEmptyNonStreamingResponse + ? "upstream_error" + : finishReason, + promptTokens: calculatedPromptTokens?.toString() ?? null, + completionTokens: calculatedCompletionTokens?.toString() ?? null, + totalTokens: + totalTokens ?? + ( + (calculatedPromptTokens ?? 0) + (calculatedCompletionTokens ?? 0) + ).toString(), + reasoningTokens: calculatedReasoningTokens?.toString() ?? null, + cachedTokens: cachedTokens?.toString() ?? null, + hasError: hasEmptyNonStreamingResponse, + streamed: false, + canceled: false, + errorDetails: hasEmptyNonStreamingResponse + ? { + statusCode: 500, + statusText: "Empty Response", + responseText: + "Response finished successfully but returned no content or tool calls", + } + : null, + inputCost: costs.inputCost, + outputCost: costs.outputCost, + cachedInputCost: costs.cachedInputCost, + requestCost: costs.requestCost, + webSearchCost: costs.webSearchCost, + imageInputTokens: costs.imageInputTokens?.toString() ?? null, + imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, + imageInputCost: costs.imageInputCost ?? null, + imageOutputCost: costs.imageOutputCost ?? 
null,
+				cost: costs.totalCost,
+				estimatedCost: costs.estimatedCost,
+				discount: costs.discount,
+				pricingTier: costs.pricingTier,
+				dataStorageCost: calculateDataStorageCost(
+					calculatedPromptTokens,
+					cachedTokens,
+					calculatedCompletionTokens,
+					calculatedReasoningTokens,
+					retentionLevel,
+				),
+				cached: false,
+				toolResults,
+			},
+		);
 
 		// Report key health for environment-based tokens
 		// Note: We don't report empty responses as key errors since they're not upstream errors
diff --git a/apps/gateway/src/chat/middleware/chat-completion-log.ts b/apps/gateway/src/chat/middleware/chat-completion-log.ts
new file mode 100644
index 0000000000..aee0971676
--- /dev/null
+++ b/apps/gateway/src/chat/middleware/chat-completion-log.ts
@@ -0,0 +1,144 @@
+import { createMiddleware } from "hono/factory";
+import { HTTPException } from "hono/http-exception";
+
+import {
+	buildBaseLogEntry,
+	type ChatCompletionLogState,
+} from "@/chat/tools/chat-log-context.js";
+import { insertLog as _insertLog } from "@/lib/logs.js";
+
+import { logger } from "@llmgateway/logger";
+
+import type { ServerTypes } from "@/vars.js";
+import type { LogInsertData } from "@llmgateway/db";
+
+function getSynthesizedClientErrorLog(
+	baseLogEntry: ReturnType<typeof buildBaseLogEntry>,
+	status: number,
+	error: unknown,
+): LogInsertData | null {
+	if (!baseLogEntry) {
+		return null;
+	}
+
+	const responseText =
+		error instanceof HTTPException
+			? error.message
+			: error instanceof Error
+				? error.message
+				: "Client error";
+
+	return {
+		...baseLogEntry,
+		content: null,
+		responseSize: responseText.length,
+		finishReason: "client_error",
+		unifiedFinishReason: "client_error",
+		promptTokens: null,
+		completionTokens: null,
+		totalTokens: null,
+		reasoningTokens: null,
+		cachedTokens: null,
+		hasError: true,
+		streamed:
+			typeof baseLogEntry.rawRequest === "object" &&
+			baseLogEntry.rawRequest !== null &&
+			"stream" in baseLogEntry.rawRequest
+				? Boolean(baseLogEntry.rawRequest.stream)
+				: false,
+		canceled: false,
+		errorDetails: {
+			statusCode: status,
+			statusText:
+				error instanceof HTTPException
+					? "Client Error"
+					: error instanceof Error
+						? error.name
+						: "Client Error",
+			responseText,
+		},
+		duration: 0,
+		timeToFirstToken: null,
+		timeToFirstReasoningToken: null,
+		inputCost: null,
+		outputCost: null,
+		cachedInputCost: null,
+		requestCost: null,
+		webSearchCost: null,
+		imageInputTokens: null,
+		imageOutputTokens: null,
+		imageInputCost: null,
+		imageOutputCost: null,
+		cost: null,
+		estimatedCost: false,
+		discount: null,
+		pricingTier: null,
+		dataStorageCost: "0",
+		cached: false,
+		toolResults: null,
+	};
+}
+
+export const chatCompletionLogMiddleware = createMiddleware<ServerTypes>(
+	async (c, next) => {
+		const state: ChatCompletionLogState = {
+			pendingLogs: [],
+			clientErrorSynthesized: false,
+		};
+		c.set("chatCompletionLogState", state);
+
+		try {
+			await next();
+		} catch (error) {
+			state.caughtError = error;
+			throw error;
+		} finally {
+			try {
+				await state.streamCompletion;
+			} catch (error) {
+				logger.error(
+					"Error waiting for chat stream completion before flushing logs",
+					error instanceof Error ? error : new Error(String(error)),
+				);
+			}
+
+			const status =
+				state.caughtError instanceof HTTPException
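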
+					? state.caughtError.status
+					: c.res.status;
+			const hasQueuedClientError = state.pendingLogs.some(
+				(log) =>
+					log.finishReason === "client_error" ||
+					log.unifiedFinishReason === "client_error",
+			);
+
+			if (status >= 400 && status < 500 && !hasQueuedClientError) {
+				const synthesizedLog = getSynthesizedClientErrorLog(
+					buildBaseLogEntry(c),
+					status,
+					state.caughtError,
+				);
+				if (synthesizedLog) {
+					state.pendingLogs.push(synthesizedLog);
+					state.clientErrorSynthesized = true;
+				}
+			}
+
+			for (const logData of state.pendingLogs) {
+				try {
+					await _insertLog({
+						...logData,
+						internalContentFilter: state.internalContentFilter
+							? true
+							: logData.internalContentFilter,
+					});
+				} catch (error) {
+					logger.error(
+						"Failed to flush queued chat completion log",
+						error instanceof Error ? error : new Error(String(error)),
+					);
+				}
+			}
+		}
+	},
+);
diff --git a/apps/gateway/src/chat/tools/chat-log-context.ts b/apps/gateway/src/chat/tools/chat-log-context.ts
new file mode 100644
index 0000000000..f914a1f019
--- /dev/null
+++ b/apps/gateway/src/chat/tools/chat-log-context.ts
@@ -0,0 +1,126 @@
+import { logger } from "@llmgateway/logger";
+
+import {
+	createLogEntry,
+	type CreateLogEntryOptions,
+} from "./create-log-entry.js";
+
+import type { ServerTypes } from "@/vars.js";
+import type { LogInsertData } from "@llmgateway/db";
+import type { Context } from "hono";
+
+export interface ChatCompletionLogState {
+	pendingLogs: LogInsertData[];
+	baseLogOptions?: Partial<CreateLogEntryOptions>;
+	streamCompletion?: Promise<void>;
+	resolveStreamCompletion?: () => void;
+	caughtError?: unknown;
+	internalContentFilter?: boolean;
+	clientErrorSynthesized?: boolean;
+}
+
+function getOrCreateChatCompletionLogState(
+	c: Context<ServerTypes>,
+): ChatCompletionLogState {
+	const existingState = c.get("chatCompletionLogState");
+	if (existingState) {
+		return existingState;
+	}
+
+	const nextState: ChatCompletionLogState = {
+		pendingLogs: [],
+		clientErrorSynthesized: false,
+	};
+	c.set("chatCompletionLogState", nextState);
+	return nextState;
+}
+
+export function getChatCompletionLogState(
+	c: Context<ServerTypes>,
+): ChatCompletionLogState | undefined {
+	return c.get("chatCompletionLogState");
+}
+
+export function updateBaseLogOptions(
+	c: Context<ServerTypes>,
+	patch: Partial<CreateLogEntryOptions>,
+) {
+	const state = getOrCreateChatCompletionLogState(c);
+	state.baseLogOptions = {
+		...state.baseLogOptions,
+		...patch,
+	};
+}
+
+function hasCompleteBaseLogOptions(
+	options?: Partial<CreateLogEntryOptions>,
+): options is CreateLogEntryOptions {
+	return Boolean(
+		options &&
+			typeof options.requestId === "string" &&
+			options.project &&
+			options.apiKey &&
+			typeof options.usedModel === "string" &&
+			typeof options.usedProvider === "string" &&
+			typeof options.requestedModel === "string" &&
+			Array.isArray(options.messages) &&
+			options.customHeaders !== undefined &&
+			typeof options.debugMode === "boolean",
+	);
+}
+
+export function buildBaseLogEntry(
+	c: Context<ServerTypes>,
+	patch: Partial<CreateLogEntryOptions> = {},
+) {
+	const state = getOrCreateChatCompletionLogState(c);
+	const mergedOptions = {
+		...state.baseLogOptions,
+		...patch,
+	};
+
+	if (!hasCompleteBaseLogOptions(mergedOptions)) {
+		return null;
+	}
+
+	return createLogEntry(mergedOptions);
+}
+
+export function enqueueChatLog(
+	c: Context<ServerTypes>,
+	basePatch: Partial<CreateLogEntryOptions>,
+	logFields: Omit<LogInsertData, keyof ReturnType<typeof createLogEntry>>,
+) {
+	const state = getOrCreateChatCompletionLogState(c);
+	const baseLogEntry = buildBaseLogEntry(c, basePatch);
+
+	if (!baseLogEntry) {
+		logger.warn(
+			"Skipping chat log enqueue because base log options are incomplete",
+			{
+				requestId: state.baseLogOptions?.requestId,
+			},
+		);
+		return;
+	}
+
+	state.pendingLogs.push({
+		...baseLogEntry,
+		...logFields,
+	});
+}
+
+export function registerStreamCompletion(c: Context<ServerTypes>) {
+	const state = getOrCreateChatCompletionLogState(c);
+	state.streamCompletion ??= new Promise<void>((resolve) => {
+		state.resolveStreamCompletion = resolve;
+	});
+
+	return state.streamCompletion;
+}
+
+export function finishStreamCompletion(c: Context<ServerTypes>) {
+	const state = getOrCreateChatCompletionLogState(c);
+	state.resolveStreamCompletion?.();
+	state.resolveStreamCompletion = undefined;
+}
diff --git a/apps/gateway/src/test-utils/test-helpers.ts b/apps/gateway/src/test-utils/test-helpers.ts
index c4e7a5991e..42cfcc9d04 100644
--- a/apps/gateway/src/test-utils/test-helpers.ts
+++ b/apps/gateway/src/test-utils/test-helpers.ts
@@ -7,6 +7,10 @@ export async function clearCache() {
 	await redisClient.flushdb();
 }
 
+export async function processPendingLogs() {
+	await processLogQueue();
+}
+
 /**
  * Helper function to wait for logs to be processed by the worker
  * @param expectedCount The expected number of logs
diff --git a/apps/gateway/src/vars.ts b/apps/gateway/src/vars.ts
index bb9187e75c..dd4d785c14 100644
--- a/apps/gateway/src/vars.ts
+++ b/apps/gateway/src/vars.ts
@@ -1,8 +1,10 @@
+import type { ChatCompletionLogState } from "@/chat/tools/chat-log-context.js";
 import type { Env } from "hono/types";
 
 export interface ServerTypes extends Env {
 	Variables: {
 		traceId?: string;
 		spanId?: string;
+		chatCompletionLogState?: ChatCompletionLogState;
 	};
 }

From 9d8f7ea8526df4aa316163848942ca84ea89f6f1 Mon Sep 17 00:00:00 2001
From: Luca Steeb
Date: Sun, 29 Mar 2026 19:24:47 +0700
Subject: [PATCH 02/14] fix: avoid chat log stream deadlock

---
 .../chat/middleware/chat-completion-log.ts    | 99 +++++++++++--------
 1 file changed, 56 insertions(+), 43 deletions(-)

diff --git a/apps/gateway/src/chat/middleware/chat-completion-log.ts b/apps/gateway/src/chat/middleware/chat-completion-log.ts
index aee0971676..c6cca72a15 100644
--- a/apps/gateway/src/chat/middleware/chat-completion-log.ts
+++ b/apps/gateway/src/chat/middleware/chat-completion-log.ts
@@ -11,6 +11,7 @@ import { logger } from "@llmgateway/logger";
 
 import type { ServerTypes } from "@/vars.js";
 import type { LogInsertData } from "@llmgateway/db";
+import type { Context } from "hono";
 
 function getSynthesizedClientErrorLog(
 	baseLogEntry: ReturnType<typeof buildBaseLogEntry>,
@@ -79,6 +80,58 @@ function getSynthesizedClientErrorLog(
 	};
 }
 
+async function flushChatCompletionLogs(
+	c: Context<ServerTypes>,
+	state: ChatCompletionLogState,
+) {
+	try {
+		await state.streamCompletion;
+	} catch (error) {
+		logger.error(
+			"Error waiting for chat stream completion before flushing logs",
+			error instanceof Error ? error : new Error(String(error)),
+		);
+	}
+
+	const status =
+		state.caughtError instanceof HTTPException
+			? state.caughtError.status
+			: c.res.status;
+	const hasQueuedClientError = state.pendingLogs.some(
+		(log) =>
+			log.finishReason === "client_error" ||
+			log.unifiedFinishReason === "client_error",
+	);
+
+	if (status >= 400 && status < 500 && !hasQueuedClientError) {
+		const synthesizedLog = getSynthesizedClientErrorLog(
+			buildBaseLogEntry(c),
+			status,
+			state.caughtError,
+		);
+		if (synthesizedLog) {
+			state.pendingLogs.push(synthesizedLog);
+			state.clientErrorSynthesized = true;
+		}
+	}
+
+	for (const logData of state.pendingLogs) {
+		try {
+			await _insertLog({
+				...logData,
+				internalContentFilter: state.internalContentFilter
+					? 
true + : logData.internalContentFilter, + }); + } catch (error) { + logger.error( + "Failed to flush queued chat completion log", + error instanceof Error ? error : new Error(String(error)), + ); + } + } +} + export const chatCompletionLogMiddleware = createMiddleware( async (c, next) => { const state: ChatCompletionLogState = { @@ -93,52 +146,12 @@ export const chatCompletionLogMiddleware = createMiddleware( state.caughtError = error; throw error; } finally { - try { - await state.streamCompletion; - } catch (error) { + void flushChatCompletionLogs(c, state).catch((error) => { logger.error( - "Error waiting for chat stream completion before flushing logs", + "Unexpected failure flushing queued chat completion logs", error instanceof Error ? error : new Error(String(error)), ); - } - - const status = - state.caughtError instanceof HTTPException - ? state.caughtError.status - : c.res.status; - const hasQueuedClientError = state.pendingLogs.some( - (log) => - log.finishReason === "client_error" || - log.unifiedFinishReason === "client_error", - ); - - if (status >= 400 && status < 500 && !hasQueuedClientError) { - const synthesizedLog = getSynthesizedClientErrorLog( - buildBaseLogEntry(c), - status, - state.caughtError, - ); - if (synthesizedLog) { - state.pendingLogs.push(synthesizedLog); - state.clientErrorSynthesized = true; - } - } - - for (const logData of state.pendingLogs) { - try { - await _insertLog({ - ...logData, - internalContentFilter: state.internalContentFilter - ? true - : logData.internalContentFilter, - }); - } catch (error) { - logger.error( - "Failed to flush queued chat completion log", - error instanceof Error ? error : new Error(String(error)), - ); - } - } + }); } }, ); From 4059a4016c22dcf4dc16544034242e6eebbdec6f Mon Sep 17 00:00:00 2001 From: Luca Steeb Date: Mon, 30 Mar 2026 00:33:54 +0700 Subject: [PATCH 03/14] fix: flush non-stream chat logs --- apps/gateway/src/api.spec.ts | 12 ++++++++++++ .../src/chat/middleware/chat-completion-log.ts | 16 ++++++++++------ 2 files changed, 22 insertions(+), 6 deletions(-) diff --git a/apps/gateway/src/api.spec.ts b/apps/gateway/src/api.spec.ts index 2bf6fe4d3c..390fab7f99 100644 --- a/apps/gateway/src/api.spec.ts +++ b/apps/gateway/src/api.spec.ts @@ -1162,6 +1162,7 @@ describe("api", () => { }); test("Reasoning effort error for unsupported model", async () => { + const requestId = "reasoning-effort-unsupported-request-id"; await db.insert(tables.apiKey).values({ id: "token-id", token: "real-token", @@ -1174,6 +1175,7 @@ describe("api", () => { method: "POST", headers: { "Content-Type": "application/json", + "x-request-id": requestId, Authorization: `Bearer real-token`, }, body: JSON.stringify({ @@ -1192,6 +1194,16 @@ describe("api", () => { const json = await res.json(); expect(json.message).toContain("does not support reasoning"); + + const log = await waitForLogByRequestId(requestId); + expect(log.finishReason).toBe("client_error"); + expect(log.unifiedFinishReason).toBe("client_error"); + + const matchingLogs = await db + .select() + .from(tables.log) + .where(eq(tables.log.requestId, requestId)); + expect(matchingLogs).toHaveLength(1); }); test("Max tokens validation error when exceeding model limit", async () => { diff --git a/apps/gateway/src/chat/middleware/chat-completion-log.ts b/apps/gateway/src/chat/middleware/chat-completion-log.ts index c6cca72a15..619d16f78f 100644 --- a/apps/gateway/src/chat/middleware/chat-completion-log.ts +++ b/apps/gateway/src/chat/middleware/chat-completion-log.ts @@ -146,12 +146,16 @@ 
export const chatCompletionLogMiddleware = createMiddleware( state.caughtError = error; throw error; } finally { - void flushChatCompletionLogs(c, state).catch((error) => { - logger.error( - "Unexpected failure flushing queued chat completion logs", - error instanceof Error ? error : new Error(String(error)), - ); - }); + if (state.streamCompletion) { + void flushChatCompletionLogs(c, state).catch((error) => { + logger.error( + "Unexpected failure flushing queued chat completion logs", + error instanceof Error ? error : new Error(String(error)), + ); + }); + } else { + await flushChatCompletionLogs(c, state); + } } }, ); From 13ba95e392a62392ad573a3043c55a60415329d7 Mon Sep 17 00:00:00 2001 From: Luca Steeb Date: Mon, 30 Mar 2026 18:22:41 +0700 Subject: [PATCH 04/14] refactor: drop insertLog alias --- apps/gateway/src/chat/middleware/chat-completion-log.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/apps/gateway/src/chat/middleware/chat-completion-log.ts b/apps/gateway/src/chat/middleware/chat-completion-log.ts index 619d16f78f..0dccf0dc3c 100644 --- a/apps/gateway/src/chat/middleware/chat-completion-log.ts +++ b/apps/gateway/src/chat/middleware/chat-completion-log.ts @@ -5,7 +5,7 @@ import { buildBaseLogEntry, type ChatCompletionLogState, } from "@/chat/tools/chat-log-context.js"; -import { insertLog as _insertLog } from "@/lib/logs.js"; +import { insertLog } from "@/lib/logs.js"; import { logger } from "@llmgateway/logger"; @@ -117,7 +117,7 @@ async function flushChatCompletionLogs( for (const logData of state.pendingLogs) { try { - await _insertLog({ + await insertLog({ ...logData, internalContentFilter: state.internalContentFilter ? true From e28767cff14d6801d173b1bf352703046f2a7038 Mon Sep 17 00:00:00 2001 From: steebchen Date: Mon, 30 Mar 2026 15:21:30 +0000 Subject: [PATCH 05/14] fix: handle streaming terminal events --- apps/gateway/src/chat/chat.ts | 8 +- .../tools/transform-streaming-to-openai.ts | 77 ++++++++----------- 2 files changed, 40 insertions(+), 45 deletions(-) diff --git a/apps/gateway/src/chat/chat.ts b/apps/gateway/src/chat/chat.ts index 79ab0dc17a..f83553188f 100644 --- a/apps/gateway/src/chat/chat.ts +++ b/apps/gateway/src/chat/chat.ts @@ -5531,8 +5531,12 @@ chat.openapi(completions, async (c) => { } break; default: // OpenAI format - if (data.choices && data.choices[0]?.finish_reason) { - finishReason = data.choices[0].finish_reason; + if ( + transformedData?.choices && + transformedData.choices[0]?.finish_reason + ) { + finishReason = transformedData.choices[0].finish_reason; + sawProviderTerminalEvent = true; } break; } diff --git a/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts b/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts index a04b82e36e..ab9c37277d 100644 --- a/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts +++ b/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts @@ -18,6 +18,26 @@ export function transformStreamingToOpenai( ): any { let transformedData = data; + const mapOpenAIResponsesUsage = (responseUsage: any) => { + if (!responseUsage) { + return null; + } + + return { + prompt_tokens: responseUsage.input_tokens ?? 0, + completion_tokens: responseUsage.output_tokens ?? 0, + total_tokens: responseUsage.total_tokens ?? 
0, + ...(responseUsage.output_tokens_details?.reasoning_tokens && { + reasoning_tokens: responseUsage.output_tokens_details.reasoning_tokens, + }), + ...(responseUsage.input_tokens_details?.cached_tokens && { + prompt_tokens_details: { + cached_tokens: responseUsage.input_tokens_details.cached_tokens, + }, + }), + }; + }; + switch (usedProvider) { case "anthropic": { if (data.type === "content_block_delta" && data.delta?.text) { @@ -769,7 +789,13 @@ export function transformStreamingToOpenai( case "response.output_text.done": case "response.web_search_call.in_progress": case "response.web_search_call.searching": - case "response.web_search_call.completed": + case "response.web_search_call.completed": { + const responseStatus = data.response?.status; + const isCompletedTerminalEvent = + responseStatus === "completed" && + (data.type === "response.content_part.done" || + data.type === "response.output_text.done" || + data.type === "response.output_item.done"); transformedData = { id: data.response?.id ?? `chatcmpl-${Date.now()}`, object: "chat.completion.chunk", @@ -780,12 +806,15 @@ export function transformStreamingToOpenai( { index: 0, delta: { role: "assistant" }, - finish_reason: null, + finish_reason: isCompletedTerminalEvent ? "stop" : null, }, ], - usage: null, + usage: isCompletedTerminalEvent + ? mapOpenAIResponsesUsage(data.response?.usage) + : null, }; break; + } case "response.reasoning_summary_part.added": case "response.reasoning_summary_text.delta": @@ -908,25 +937,6 @@ export function transformStreamingToOpenai( } case "response.completed": { - const responseUsage = data.response?.usage; - let usage = null; - if (responseUsage) { - usage = { - prompt_tokens: responseUsage.input_tokens ?? 0, - completion_tokens: responseUsage.output_tokens ?? 0, - total_tokens: responseUsage.total_tokens ?? 0, - ...(responseUsage.output_tokens_details?.reasoning_tokens && { - reasoning_tokens: - responseUsage.output_tokens_details.reasoning_tokens, - }), - ...(responseUsage.input_tokens_details?.cached_tokens && { - prompt_tokens_details: { - cached_tokens: - responseUsage.input_tokens_details.cached_tokens, - }, - }), - }; - } transformedData = { id: data.response?.id ?? `chatcmpl-${Date.now()}`, object: "chat.completion.chunk", @@ -940,31 +950,12 @@ export function transformStreamingToOpenai( finish_reason: "stop", }, ], - usage, + usage: mapOpenAIResponsesUsage(data.response?.usage), }; break; } case "response.incomplete": { - const incompleteUsage = data.response?.usage; - let usage = null; - if (incompleteUsage) { - usage = { - prompt_tokens: incompleteUsage.input_tokens ?? 0, - completion_tokens: incompleteUsage.output_tokens ?? 0, - total_tokens: incompleteUsage.total_tokens ?? 
0, - ...(incompleteUsage.output_tokens_details?.reasoning_tokens && { - reasoning_tokens: - incompleteUsage.output_tokens_details.reasoning_tokens, - }), - ...(incompleteUsage.input_tokens_details?.cached_tokens && { - prompt_tokens_details: { - cached_tokens: - incompleteUsage.input_tokens_details.cached_tokens, - }, - }), - }; - } const reason = data.response?.incomplete_details?.reason; // Map incomplete reason to appropriate finish_reason const mappedFinishReason = @@ -982,7 +973,7 @@ export function transformStreamingToOpenai( finish_reason: mappedFinishReason, }, ], - usage, + usage: mapOpenAIResponsesUsage(data.response?.usage), }; break; } From cb19acee228dd114baa9f8f2f4662b00a6847bbd Mon Sep 17 00:00:00 2001 From: steebchen Date: Mon, 30 Mar 2026 16:28:27 +0000 Subject: [PATCH 06/14] Revert "fix: handle streaming terminal events" This reverts commit e28767cff14d6801d173b1bf352703046f2a7038. --- apps/gateway/src/chat/chat.ts | 8 +- .../tools/transform-streaming-to-openai.ts | 77 +++++++++++-------- 2 files changed, 45 insertions(+), 40 deletions(-) diff --git a/apps/gateway/src/chat/chat.ts b/apps/gateway/src/chat/chat.ts index f83553188f..79ab0dc17a 100644 --- a/apps/gateway/src/chat/chat.ts +++ b/apps/gateway/src/chat/chat.ts @@ -5531,12 +5531,8 @@ chat.openapi(completions, async (c) => { } break; default: // OpenAI format - if ( - transformedData?.choices && - transformedData.choices[0]?.finish_reason - ) { - finishReason = transformedData.choices[0].finish_reason; - sawProviderTerminalEvent = true; + if (data.choices && data.choices[0]?.finish_reason) { + finishReason = data.choices[0].finish_reason; } break; } diff --git a/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts b/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts index ab9c37277d..a04b82e36e 100644 --- a/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts +++ b/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts @@ -18,26 +18,6 @@ export function transformStreamingToOpenai( ): any { let transformedData = data; - const mapOpenAIResponsesUsage = (responseUsage: any) => { - if (!responseUsage) { - return null; - } - - return { - prompt_tokens: responseUsage.input_tokens ?? 0, - completion_tokens: responseUsage.output_tokens ?? 0, - total_tokens: responseUsage.total_tokens ?? 0, - ...(responseUsage.output_tokens_details?.reasoning_tokens && { - reasoning_tokens: responseUsage.output_tokens_details.reasoning_tokens, - }), - ...(responseUsage.input_tokens_details?.cached_tokens && { - prompt_tokens_details: { - cached_tokens: responseUsage.input_tokens_details.cached_tokens, - }, - }), - }; - }; - switch (usedProvider) { case "anthropic": { if (data.type === "content_block_delta" && data.delta?.text) { @@ -789,13 +769,7 @@ export function transformStreamingToOpenai( case "response.output_text.done": case "response.web_search_call.in_progress": case "response.web_search_call.searching": - case "response.web_search_call.completed": { - const responseStatus = data.response?.status; - const isCompletedTerminalEvent = - responseStatus === "completed" && - (data.type === "response.content_part.done" || - data.type === "response.output_text.done" || - data.type === "response.output_item.done"); + case "response.web_search_call.completed": transformedData = { id: data.response?.id ?? `chatcmpl-${Date.now()}`, object: "chat.completion.chunk", @@ -806,15 +780,12 @@ export function transformStreamingToOpenai( { index: 0, delta: { role: "assistant" }, - finish_reason: isCompletedTerminalEvent ? 
"stop" : null, + finish_reason: null, }, ], - usage: isCompletedTerminalEvent - ? mapOpenAIResponsesUsage(data.response?.usage) - : null, + usage: null, }; break; - } case "response.reasoning_summary_part.added": case "response.reasoning_summary_text.delta": @@ -937,6 +908,25 @@ export function transformStreamingToOpenai( } case "response.completed": { + const responseUsage = data.response?.usage; + let usage = null; + if (responseUsage) { + usage = { + prompt_tokens: responseUsage.input_tokens ?? 0, + completion_tokens: responseUsage.output_tokens ?? 0, + total_tokens: responseUsage.total_tokens ?? 0, + ...(responseUsage.output_tokens_details?.reasoning_tokens && { + reasoning_tokens: + responseUsage.output_tokens_details.reasoning_tokens, + }), + ...(responseUsage.input_tokens_details?.cached_tokens && { + prompt_tokens_details: { + cached_tokens: + responseUsage.input_tokens_details.cached_tokens, + }, + }), + }; + } transformedData = { id: data.response?.id ?? `chatcmpl-${Date.now()}`, object: "chat.completion.chunk", @@ -950,12 +940,31 @@ export function transformStreamingToOpenai( finish_reason: "stop", }, ], - usage: mapOpenAIResponsesUsage(data.response?.usage), + usage, }; break; } case "response.incomplete": { + const incompleteUsage = data.response?.usage; + let usage = null; + if (incompleteUsage) { + usage = { + prompt_tokens: incompleteUsage.input_tokens ?? 0, + completion_tokens: incompleteUsage.output_tokens ?? 0, + total_tokens: incompleteUsage.total_tokens ?? 0, + ...(incompleteUsage.output_tokens_details?.reasoning_tokens && { + reasoning_tokens: + incompleteUsage.output_tokens_details.reasoning_tokens, + }), + ...(incompleteUsage.input_tokens_details?.cached_tokens && { + prompt_tokens_details: { + cached_tokens: + incompleteUsage.input_tokens_details.cached_tokens, + }, + }), + }; + } const reason = data.response?.incomplete_details?.reason; // Map incomplete reason to appropriate finish_reason const mappedFinishReason = @@ -973,7 +982,7 @@ export function transformStreamingToOpenai( finish_reason: mappedFinishReason, }, ], - usage: mapOpenAIResponsesUsage(data.response?.usage), + usage, }; break; } From b9c865fd938a06d7a605633ec1a092dda388ebce Mon Sep 17 00:00:00 2001 From: "Luca Steeb (bot)" Date: Sun, 5 Apr 2026 08:15:19 +0000 Subject: [PATCH 07/14] chore(autofix): apply diff --- .../chat/middleware/chat-completion-log.ts | 25 +++++++++++-------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/apps/gateway/src/chat/middleware/chat-completion-log.ts b/apps/gateway/src/chat/middleware/chat-completion-log.ts index 2011dc0aaf..ceaf26d18f 100644 --- a/apps/gateway/src/chat/middleware/chat-completion-log.ts +++ b/apps/gateway/src/chat/middleware/chat-completion-log.ts @@ -117,17 +117,20 @@ async function flushChatCompletionLogs( for (const logData of state.pendingLogs) { try { - await insertLog({ - ...logData, - ...(state.logIdOverride && !logData.retried - ? { id: state.logIdOverride } - : {}), - responsesApiData: - logData.responsesApiData ?? state.responsesApiData ?? null, - internalContentFilter: state.internalContentFilter - ? true - : logData.internalContentFilter, - }, { syncInsert: state.syncInsert }); + await insertLog( + { + ...logData, + ...(state.logIdOverride && !logData.retried + ? { id: state.logIdOverride } + : {}), + responsesApiData: + logData.responsesApiData ?? state.responsesApiData ?? null, + internalContentFilter: state.internalContentFilter + ? 
true + : logData.internalContentFilter, + }, + { syncInsert: state.syncInsert }, + ); } catch (error) { logger.error( "Failed to flush queued chat completion log", From 4ecd85778d2a05356842afbb667cc799af0804f7 Mon Sep 17 00:00:00 2001 From: Luca Steeb Date: Sun, 5 Apr 2026 17:57:38 +0700 Subject: [PATCH 08/14] fix: avoid duplicate client error logs --- .../middleware/chat-completion-log.spec.ts | 32 +++++++++++++++++++ .../chat/middleware/chat-completion-log.ts | 14 ++++---- 2 files changed, 40 insertions(+), 6 deletions(-) create mode 100644 apps/gateway/src/chat/middleware/chat-completion-log.spec.ts diff --git a/apps/gateway/src/chat/middleware/chat-completion-log.spec.ts b/apps/gateway/src/chat/middleware/chat-completion-log.spec.ts new file mode 100644 index 0000000000..0591052c5c --- /dev/null +++ b/apps/gateway/src/chat/middleware/chat-completion-log.spec.ts @@ -0,0 +1,32 @@ +import { describe, expect, it } from "vitest"; + +import { shouldSynthesizeClientError } from "./chat-completion-log.js"; + +describe("shouldSynthesizeClientError", () => { + it("synthesizes for 4xx responses when no logs are queued", () => { + expect(shouldSynthesizeClientError(400, [])).toBe(true); + expect(shouldSynthesizeClientError(429, [])).toBe(true); + }); + + it("skips synthesis when any terminal log is already queued", () => { + expect( + shouldSynthesizeClientError(400, [ + { + finishReason: "canceled", + } as never, + ]), + ).toBe(false); + expect( + shouldSynthesizeClientError(400, [ + { + finishReason: "content_filter", + } as never, + ]), + ).toBe(false); + }); + + it("skips synthesis for non-4xx responses", () => { + expect(shouldSynthesizeClientError(200, [])).toBe(false); + expect(shouldSynthesizeClientError(500, [])).toBe(false); + }); +}); diff --git a/apps/gateway/src/chat/middleware/chat-completion-log.ts b/apps/gateway/src/chat/middleware/chat-completion-log.ts index ceaf26d18f..1e520c1921 100644 --- a/apps/gateway/src/chat/middleware/chat-completion-log.ts +++ b/apps/gateway/src/chat/middleware/chat-completion-log.ts @@ -80,6 +80,13 @@ function getSynthesizedClientErrorLog( }; } +export function shouldSynthesizeClientError( + status: number, + pendingLogs: LogInsertData[], +): boolean { + return status >= 400 && status < 500 && pendingLogs.length === 0; +} + async function flushChatCompletionLogs( c: Context, state: ChatCompletionLogState, @@ -97,13 +104,8 @@ async function flushChatCompletionLogs( state.caughtError instanceof HTTPException ? 
state.caughtError.status : c.res.status; - const hasQueuedClientError = state.pendingLogs.some( - (log) => - log.finishReason === "client_error" || - log.unifiedFinishReason === "client_error", - ); - if (status >= 400 && status < 500 && !hasQueuedClientError) { + if (shouldSynthesizeClientError(status, state.pendingLogs)) { const synthesizedLog = getSynthesizedClientErrorLog( buildBaseLogEntry(c), status, From b852c5183c87788d74daf0140b10786c49f34991 Mon Sep 17 00:00:00 2001 From: Luca Steeb Date: Sun, 5 Apr 2026 18:00:07 +0700 Subject: [PATCH 09/14] fix: warn on client errors in activity --- .../activity/[logId]/log-detail-client.tsx | 30 ++++++++++-- apps/ui/src/components/dashboard/log-card.tsx | 48 +++++++++++++++---- 2 files changed, 63 insertions(+), 15 deletions(-) diff --git a/apps/ui/src/app/dashboard/[orgId]/[projectId]/activity/[logId]/log-detail-client.tsx b/apps/ui/src/app/dashboard/[orgId]/[projectId]/activity/[logId]/log-detail-client.tsx index 1a4f318e8b..74bdb73571 100644 --- a/apps/ui/src/app/dashboard/[orgId]/[projectId]/activity/[logId]/log-detail-client.tsx +++ b/apps/ui/src/app/dashboard/[orgId]/[projectId]/activity/[logId]/log-detail-client.tsx @@ -113,8 +113,14 @@ function StatusIndicator({ log }: { log: Partial }) { let color = "text-emerald-500"; let bgColor = "bg-emerald-500/10"; let label = "Completed"; + const isClientError = log.unifiedFinishReason === "client_error"; - if (log.hasError || log.unifiedFinishReason === "error") { + if (isClientError) { + StatusIcon = AlertCircle; + color = "text-orange-500"; + bgColor = "bg-orange-500/10"; + label = "Client Error"; + } else if (log.hasError || log.unifiedFinishReason === "error") { StatusIcon = AlertCircle; color = "text-red-500"; bgColor = "bg-red-500/10"; @@ -897,23 +903,37 @@ export function LogDetailClient({ {log.hasError && !!log.errorDetails && (
[JSX hunk garbled during extraction; the surviving text shows the error-details panel's rows being re-marked-up: "Status Code" with {log.errorDetails.statusCode}, "Status Text" with {log.errorDetails.statusText}, and "Error Message" with the value line below]
 									{log.errorDetails.responseText}
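
The JSX in this hunk (and in the card hunk below) did not survive extraction, so here is a minimal sketch of the status branching this commit introduces. The icon names and color classes are taken from the surviving TypeScript lines in these diffs; the "Failed" label and the helper wrapper itself are illustrative assumptions, not the literal component code:

// Sketch of the client_error-aware status branching added to both activity
// views. Colors mirror the surviving diff lines: orange marks a client
// error, red a hard failure, emerald a successful completion.
import { AlertCircle, CheckCircle2 } from "lucide-react";

interface LogStatusInput {
	hasError?: boolean;
	unifiedFinishReason?: string | null;
}

export function getLogStatusPresentation(log: LogStatusInput) {
	// Default: completed successfully.
	let StatusIcon = CheckCircle2;
	let color = "text-emerald-500";
	let bgColor = "bg-emerald-500/10";
	let label = "Completed";

	if (log.unifiedFinishReason === "client_error") {
		// Client errors render as orange warnings rather than red failures.
		StatusIcon = AlertCircle;
		color = "text-orange-500";
		bgColor = "bg-orange-500/10";
		label = "Client Error";
	} else if (log.hasError || log.unifiedFinishReason === "error") {
		StatusIcon = AlertCircle;
		color = "text-red-500";
		bgColor = "bg-red-500/10";
		label = "Failed"; // label assumed; not visible in the extracted diff
	}

	return { StatusIcon, color, bgColor, label };
}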
 								
diff --git a/apps/ui/src/components/dashboard/log-card.tsx b/apps/ui/src/components/dashboard/log-card.tsx index d4678f64fd..6a8bf561eb 100644 --- a/apps/ui/src/components/dashboard/log-card.tsx +++ b/apps/ui/src/components/dashboard/log-card.tsx @@ -101,8 +101,13 @@ export function LogCard({ let StatusIcon = CheckCircle2; let color = "text-green-500"; let bgColor = "bg-green-100"; + const isClientError = log.unifiedFinishReason === "client_error"; - if (log.hasError || log.unifiedFinishReason === "error") { + if (isClientError) { + StatusIcon = AlertCircle; + color = "text-orange-500"; + bgColor = "bg-orange-100"; + } else if (log.hasError || log.unifiedFinishReason === "error") { StatusIcon = AlertCircle; color = "text-red-500"; bgColor = "bg-red-100"; @@ -190,11 +195,18 @@ export function LogCard({ )} {log.unifiedFinishReason} @@ -1061,15 +1073,31 @@ export function LogCard({ )} {log.hasError && !!log.errorDetails && (
[JSX hunk garbled during extraction; the surviving text shows the card's "Error Details" heading plus its "Status Code" ({log.errorDetails.statusCode}) and "Status Text" ({log.errorDetails.statusText}) rows being re-marked-up, followed by the surviving lines below]
From d6cac672c56462ffec1d4adb56cbc8c20fc1d2d4 Mon Sep 17 00:00:00 2001 From: Luca Steeb Date: Sun, 5 Apr 2026 18:33:37 +0700 Subject: [PATCH 10/14] fix: log validation client errors --- apps/gateway/src/api.spec.ts | 54 +++++ .../chat/middleware/chat-completion-log.ts | 203 ++++++++++++++++-- .../src/chat/tools/chat-log-context.ts | 1 + 3 files changed, 241 insertions(+), 17 deletions(-) diff --git a/apps/gateway/src/api.spec.ts b/apps/gateway/src/api.spec.ts index 40a170579d..dd780a8aea 100644 --- a/apps/gateway/src/api.spec.ts +++ b/apps/gateway/src/api.spec.ts @@ -1208,6 +1208,60 @@ describe("api", () => { expect(matchingLogs).toHaveLength(1); }); + test("Schema validation errors are logged as client_error", async () => { + const requestId = "schema-validation-client-error-request-id"; + await db.insert(tables.apiKey).values({ + id: "token-id-schema-validation", + token: "real-token-schema-validation", + projectId: "project-id", + description: "Test API Key", + createdBy: "user-id", + }); + + const res = await app.request("/v1/chat/completions", { + method: "POST", + headers: { + "Content-Type": "application/json", + "x-request-id": requestId, + Authorization: "Bearer real-token-schema-validation", + }, + body: JSON.stringify({ + model: "gpt-4o-mini", + messages: [ + { + role: "user", + content: 5555, + }, + ], + }), + }); + + expect(res.status).toBe(400); + + const json = await res.json(); + expect(json.success).toBe(false); + expect(JSON.stringify(json)).toContain("invalid_union"); + + const log = await waitForLogByRequestId(requestId); + expect(log.finishReason).toBe("client_error"); + expect(log.unifiedFinishReason).toBe("client_error"); + expect(log.errorDetails?.statusCode).toBe(400); + expect(log.errorDetails?.responseText).toContain("invalid_union"); + expect(log.errorDetails?.responseText).toContain("messages"); + expect(log.messages).toEqual([ + { + role: "user", + content: 5555, + }, + ]); + + const matchingLogs = await db + .select() + .from(tables.log) + .where(eq(tables.log.requestId, requestId)); + expect(matchingLogs).toHaveLength(1); + }); + test("Max tokens validation error when exceeding model limit", async () => { await db.insert(tables.apiKey).values({ id: "token-id", diff --git a/apps/gateway/src/chat/middleware/chat-completion-log.ts b/apps/gateway/src/chat/middleware/chat-completion-log.ts index 1e520c1921..d5742b4d3d 100644 --- a/apps/gateway/src/chat/middleware/chat-completion-log.ts +++ b/apps/gateway/src/chat/middleware/chat-completion-log.ts @@ -4,30 +4,199 @@ import { HTTPException } from "hono/http-exception"; import { buildBaseLogEntry, type ChatCompletionLogState, + updateBaseLogOptions, } from "@/chat/tools/chat-log-context.js"; +import { extractCustomHeaders } from "@/chat/tools/extract-custom-headers.js"; +import { parseModelInput } from "@/chat/tools/parse-model-input.js"; +import { validateSource } from "@/chat/tools/validate-source.js"; +import { assertApiKeyWithinUsageLimits } from "@/lib/api-key-usage-limits.js"; +import { findApiKeyByToken, findProjectById } from "@/lib/cached-queries.js"; +import { parseApiToken } from "@/lib/extract-api-token.js"; import { insertLog } from "@/lib/logs.js"; +import { shortid } from "@llmgateway/db"; import { logger } from "@llmgateway/logger"; import type { ServerTypes } from "@/vars.js"; import type { LogInsertData } from "@llmgateway/db"; import type { Context } from "hono"; -function getSynthesizedClientErrorLog( - baseLogEntry: ReturnType, +function getRequestId(c: Context): string { + return 
c.req.header("x-request-id") ?? shortid(40); +} + +function getDebugMode(c: Context): boolean { + return ( + c.req.header("x-debug") === "true" || + process.env.FORCE_DEBUG_MODE === "true" || + process.env.NODE_ENV !== "production" + ); +} + +function getSource(c: Context): string | undefined { + let source = validateSource( + c.req.header("x-source"), + c.req.header("HTTP-Referer"), + ); + const userAgent = c.req.header("User-Agent"); + + if (!source && userAgent && /^claude-cli\/.+/.test(userAgent)) { + source = "claude.com/claude-code"; + } + + return source; +} + +function getRawRequestDetails(rawRequest: unknown): { + messages: unknown[]; + requestedModel: string; + requestedProvider?: string; + usedModelMapping?: string; + usedProvider: string; +} { + const messages = + typeof rawRequest === "object" && + rawRequest !== null && + "messages" in rawRequest && + Array.isArray(rawRequest.messages) + ? rawRequest.messages + : []; + + const requestedModel = + typeof rawRequest === "object" && + rawRequest !== null && + "model" in rawRequest && + typeof rawRequest.model === "string" + ? rawRequest.model + : "unknown"; + + if (requestedModel === "unknown") { + return { + messages, + requestedModel, + usedProvider: "llmgateway", + }; + } + + try { + const parsedModel = parseModelInput(requestedModel); + return { + messages, + requestedModel, + requestedProvider: parsedModel.requestedProvider, + usedModelMapping: parsedModel.requestedModel, + usedProvider: parsedModel.requestedProvider ?? "llmgateway", + }; + } catch { + return { + messages, + requestedModel, + usedProvider: "llmgateway", + }; + } +} + +async function buildFallbackBaseLogEntry( + c: Context, + state: ChatCompletionLogState, +): Promise | null> { + const existingBaseLogEntry = buildBaseLogEntry(c); + if (existingBaseLogEntry) { + return existingBaseLogEntry; + } + + const token = parseApiToken(c); + if (!token) { + return null; + } + + const apiKey = await findApiKeyByToken(token); + if (!apiKey || apiKey.status !== "active") { + return null; + } + + try { + assertApiKeyWithinUsageLimits(apiKey); + } catch { + return null; + } + + const project = await findProjectById(apiKey.projectId); + if (!project || project.status === "deleted") { + return null; + } + + const rawRequest = await state.rawRequestPreviewPromise?.catch( + () => undefined, + ); + const rawRequestDetails = getRawRequestDetails(rawRequest); + + updateBaseLogOptions(c, { + requestId: getRequestId(c), + project, + apiKey, + usedModel: rawRequestDetails.requestedModel, + usedModelMapping: rawRequestDetails.usedModelMapping, + usedProvider: rawRequestDetails.usedProvider, + requestedModel: rawRequestDetails.requestedModel, + requestedProvider: rawRequestDetails.requestedProvider, + messages: rawRequestDetails.messages, + customHeaders: extractCustomHeaders(c), + debugMode: getDebugMode(c), + userAgent: c.req.header("User-Agent") ?? undefined, + source: getSource(c), + rawRequest, + }); + + return buildBaseLogEntry(c); +} + +async function getSynthesizedClientErrorDetails( + c: Context, + error: unknown, +): Promise<{ + responseText: string; + statusText: string; +}> { + if (error instanceof HTTPException) { + return { + responseText: error.message, + statusText: error.res?.statusText ?? "Client Error", + }; + } + + try { + const responseText = await c.res.clone().text(); + return { + responseText: responseText || "Client error", + statusText: c.res.statusText ?? "Client Error", + }; + } catch { + return { + responseText: error instanceof Error ? 
error.message : "Client error", + statusText: + error instanceof Error + ? error.name + : (c.res.statusText ?? "Client Error"), + }; + } +} + +async function getSynthesizedClientErrorLog( + c: Context, + state: ChatCompletionLogState, status: number, error: unknown, -): LogInsertData | null { +): Promise { + const baseLogEntry = await buildFallbackBaseLogEntry(c, state); if (!baseLogEntry) { return null; } - const responseText = - error instanceof HTTPException - ? error.message - : error instanceof Error - ? error.message - : "Client error"; + const { responseText, statusText } = await getSynthesizedClientErrorDetails( + c, + error, + ); return { ...baseLogEntry, @@ -50,12 +219,7 @@ function getSynthesizedClientErrorLog( canceled: false, errorDetails: { statusCode: status, - statusText: - error instanceof HTTPException - ? "Client Error" - : error instanceof Error - ? error.name - : "Client Error", + statusText, responseText, }, duration: 0, @@ -106,8 +270,9 @@ async function flushChatCompletionLogs( : c.res.status; if (shouldSynthesizeClientError(status, state.pendingLogs)) { - const synthesizedLog = getSynthesizedClientErrorLog( - buildBaseLogEntry(c), + const synthesizedLog = await getSynthesizedClientErrorLog( + c, + state, status, state.caughtError, ); @@ -147,6 +312,10 @@ export const chatCompletionLogMiddleware = createMiddleware( const state: ChatCompletionLogState = { pendingLogs: [], clientErrorSynthesized: false, + rawRequestPreviewPromise: c.req.raw + .clone() + .json() + .catch(() => undefined), }; c.set("chatCompletionLogState", state); diff --git a/apps/gateway/src/chat/tools/chat-log-context.ts b/apps/gateway/src/chat/tools/chat-log-context.ts index 12fd6fd12c..ba507e9355 100644 --- a/apps/gateway/src/chat/tools/chat-log-context.ts +++ b/apps/gateway/src/chat/tools/chat-log-context.ts @@ -12,6 +12,7 @@ import type { Context } from "hono"; export interface ChatCompletionLogState { pendingLogs: LogInsertData[]; baseLogOptions?: Partial; + rawRequestPreviewPromise?: Promise; streamCompletion?: Promise; resolveStreamCompletion?: () => void; caughtError?: unknown; From a664ea420a9fb1fca37b80ed32d308e2fd5cf1fe Mon Sep 17 00:00:00 2001 From: Luca Steeb Date: Sun, 5 Apr 2026 18:44:47 +0700 Subject: [PATCH 11/14] fix: lazily parse fallback body --- .../chat/middleware/chat-completion-log.ts | 19 ++++++++++++------- .../src/chat/tools/chat-log-context.ts | 1 + 2 files changed, 13 insertions(+), 7 deletions(-) diff --git a/apps/gateway/src/chat/middleware/chat-completion-log.ts b/apps/gateway/src/chat/middleware/chat-completion-log.ts index d5742b4d3d..56c6781e8d 100644 --- a/apps/gateway/src/chat/middleware/chat-completion-log.ts +++ b/apps/gateway/src/chat/middleware/chat-completion-log.ts @@ -96,6 +96,16 @@ function getRawRequestDetails(rawRequest: unknown): { } } +async function getRawRequestPreview( + state: ChatCompletionLogState, +): Promise { + state.rawRequestPreviewPromise ??= state.rawRequestPreview + ?.json() + .catch(() => undefined); + + return state.rawRequestPreviewPromise; +} + async function buildFallbackBaseLogEntry( c: Context, state: ChatCompletionLogState, @@ -126,9 +136,7 @@ async function buildFallbackBaseLogEntry( return null; } - const rawRequest = await state.rawRequestPreviewPromise?.catch( - () => undefined, - ); + const rawRequest = await getRawRequestPreview(state); const rawRequestDetails = getRawRequestDetails(rawRequest); updateBaseLogOptions(c, { @@ -312,10 +320,7 @@ export const chatCompletionLogMiddleware = createMiddleware( const state: 
ChatCompletionLogState = { pendingLogs: [], clientErrorSynthesized: false, - rawRequestPreviewPromise: c.req.raw - .clone() - .json() - .catch(() => undefined), + rawRequestPreview: c.req.raw.clone(), }; c.set("chatCompletionLogState", state); diff --git a/apps/gateway/src/chat/tools/chat-log-context.ts b/apps/gateway/src/chat/tools/chat-log-context.ts index ba507e9355..0e860ebe35 100644 --- a/apps/gateway/src/chat/tools/chat-log-context.ts +++ b/apps/gateway/src/chat/tools/chat-log-context.ts @@ -12,6 +12,7 @@ import type { Context } from "hono"; export interface ChatCompletionLogState { pendingLogs: LogInsertData[]; baseLogOptions?: Partial; + rawRequestPreview?: Request; rawRequestPreviewPromise?: Promise; streamCompletion?: Promise; resolveStreamCompletion?: () => void; From 30e90abb1e8866552df077f93b937a102963ef2d Mon Sep 17 00:00:00 2001 From: "Luca Steeb (bot)" Date: Sun, 5 Apr 2026 11:51:25 +0000 Subject: [PATCH 12/14] chore(autofix): apply diff --- apps/gateway/src/chat/middleware/chat-completion-log.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/apps/gateway/src/chat/middleware/chat-completion-log.ts b/apps/gateway/src/chat/middleware/chat-completion-log.ts index 56c6781e8d..d1da03c180 100644 --- a/apps/gateway/src/chat/middleware/chat-completion-log.ts +++ b/apps/gateway/src/chat/middleware/chat-completion-log.ts @@ -103,7 +103,7 @@ async function getRawRequestPreview( ?.json() .catch(() => undefined); - return state.rawRequestPreviewPromise; + return await state.rawRequestPreviewPromise; } async function buildFallbackBaseLogEntry( From e6c4e6987630875885aff0cdcf1173856ca1748f Mon Sep 17 00:00:00 2001 From: Luca Steeb Date: Wed, 22 Apr 2026 18:31:21 +0700 Subject: [PATCH 13/14] fix: add dataStorageCost to cancelled streaming cost stub --- apps/gateway/src/chat/chat.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/apps/gateway/src/chat/chat.ts b/apps/gateway/src/chat/chat.ts index 0d4b0b5b61..6d8f80a32d 100644 --- a/apps/gateway/src/chat/chat.ts +++ b/apps/gateway/src/chat/chat.ts @@ -6523,6 +6523,7 @@ chat.openapi(completions, async (c) => { imageInputCost: null, imageOutputCost: null, totalCost: null, + dataStorageCost: null as number | null, promptTokens: null, completionTokens: null, cachedTokens: null, From 54e3806bb2789ebcd5743ef2ad8fdd013f5b0e49 Mon Sep 17 00:00:00 2001 From: Luca Steeb Date: Mon, 27 Apr 2026 02:52:30 +0700 Subject: [PATCH 14/14] fix: port retry/routing features into log middleware Adopts main's chat.ts wholesale and rewires its insertLog wrappers to push entries onto the middleware's pendingLogs queue, so the branch gets the immediate-SSE-error retry, same-provider alternate-key retry, buildRoutingAttempt logId/apiKeyHash stamping, and request_id metadata flow without losing the middleware-based 4xx synthesis. 
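
A condensed sketch of the lifecycle this builds on, with the control flow assumed from the state fields and the flush loop visible in the earlier patches (the real middleware also synthesizes 4xx client-error logs and applies logIdOverride, responsesApiData, and syncInsert at flush time):

// Assumed shape of chatCompletionLogMiddleware: create the per-request
// queue, run the handler, then flush every queued entry once any SSE
// stream has settled. Error handling and 4xx synthesis are elided.
import { createMiddleware } from "hono/factory";

import { insertLog } from "@/lib/logs.js";

import type { ChatCompletionLogState } from "@/chat/tools/chat-log-context.js";

export const chatCompletionLogSketch = createMiddleware(async (c, next) => {
	const state: ChatCompletionLogState = {
		pendingLogs: [],
		clientErrorSynthesized: false,
		rawRequestPreview: c.req.raw.clone(),
	};
	c.set("chatCompletionLogState", state);

	try {
		await next();
	} catch (error) {
		// Remembered so the flush can derive a status for 4xx synthesis.
		state.caughtError = error;
		throw error;
	} finally {
		// Streamed responses resolve this promise when the SSE loop ends.
		await state.streamCompletion;
		for (const logData of state.pendingLogs) {
			await insertLog(logData, { syncInsert: state.syncInsert });
		}
	}
});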
Co-Authored-By: Claude Opus 4.7 --- apps/gateway/src/chat/chat.ts | 4227 ++++++++++------- .../chat/middleware/chat-completion-log.ts | 6 + .../src/chat/tools/chat-log-context.ts | 1 + 3 files changed, 2577 insertions(+), 1657 deletions(-) diff --git a/apps/gateway/src/chat/chat.ts b/apps/gateway/src/chat/chat.ts index 6d8f80a32d..8615f01669 100644 --- a/apps/gateway/src/chat/chat.ts +++ b/apps/gateway/src/chat/chat.ts @@ -5,7 +5,13 @@ import { streamSSE } from "hono/streaming"; import { extractFirstSseEventData } from "@/chat/tools/extract-first-sse-event-data.js"; import { validateSource } from "@/chat/tools/validate-source.js"; -import { reportKeyError, reportKeySuccess } from "@/lib/api-key-health.js"; +import { getApiKeyFingerprint } from "@/lib/api-key-fingerprint.js"; +import { + reportKeyError, + reportKeySuccess, + reportTrackedKeyError, + reportTrackedKeySuccess, +} from "@/lib/api-key-health.js"; import { assertApiKeyWithinUsageLimits } from "@/lib/api-key-usage-limits.js"; import { findApiKeyByToken, @@ -22,6 +28,8 @@ import { throwIamException, validateModelAccess } from "@/lib/iam.js"; import { calculateDataStorageCost, getUnifiedFinishReason, + isContentFilterFinishReason, + insertLog as _insertLog, } from "@/lib/logs.js"; import { checkProviderRateLimit, @@ -59,6 +67,7 @@ import { type InferSelectModel, isCachingEnabled, metricsKey, + type LogInsertData, shortid, type tables, type ProviderMetrics, @@ -92,11 +101,8 @@ import { import { chatCompletionLogMiddleware } from "./middleware/chat-completion-log.js"; import { completionsRequestSchema } from "./schemas/completions.js"; import { - enqueueChatLog, finishStreamCompletion, registerStreamCompletion, - updateBaseLogOptions, - updateLogInsertOptions, } from "./tools/chat-log-context.js"; import { checkContentFilter, @@ -106,6 +112,7 @@ import { } from "./tools/check-content-filter.js"; import { convertImagesToBase64 } from "./tools/convert-images-to-base64.js"; import { countInputImages } from "./tools/count-input-images.js"; +import { createLogEntry } from "./tools/create-log-entry.js"; import { estimateTokensFromContent } from "./tools/estimate-tokens-from-content.js"; import { estimateTokens } from "./tools/estimate-tokens.js"; import { @@ -123,6 +130,7 @@ import { getProviderEnv } from "./tools/get-provider-env.js"; import { hasMeaningfulAssistantOutput } from "./tools/has-meaningful-assistant-output.js"; import { healJsonResponse } from "./tools/heal-json-response.js"; import { isModelTrulyFree } from "./tools/is-model-truly-free.js"; +import { mapFinishReasonToOpenai } from "./tools/map-finish-reason-to-openai.js"; import { messagesContainImages } from "./tools/messages-contain-images.js"; import { mightBeCompleteJson } from "./tools/might-be-complete-json.js"; import { normalizeStreamingError } from "./tools/normalize-streaming-error.js"; @@ -143,6 +151,7 @@ import { import { type RoutingAttempt, getErrorType, + isRetryableErrorType, MAX_RETRIES, providerRetryKey, selectNextProvider, @@ -152,7 +161,12 @@ import { encodeChatMessages, messageContentToString, } from "./tools/tokenizer.js"; -import { transformResponseToOpenai } from "./tools/transform-response-to-openai.js"; +import { + applyExtendedUsageFields, + stripRequestScopedMetadataFromOpenAiResponse, + transformResponseToOpenai, + withCurrentRequestMetadataOnOpenAiResponse, +} from "./tools/transform-response-to-openai.js"; import { transformStreamingToOpenai } from "./tools/transform-streaming-to-openai.js"; import { validateFreeModelUsage } from 
"./tools/validate-free-model-usage.js"; import { validateModelCapabilities } from "./tools/validate-model-capabilities.js"; @@ -167,6 +181,27 @@ import type { ServerTypes } from "@/vars.js"; * - Non-default regions only pass if a region-specific env key exists * (e.g. LLM_ALIBABA_API_KEY__US_VIRGINIA). */ +function toDataStorageCostNumber( + promptTokens: number | string | null | undefined, + cachedTokens: number | string | null | undefined, + completionTokens: number | string | null | undefined, + reasoningTokens: number | string | null | undefined, + retentionLevel: "retain" | "none" | null, +): number | null { + if (retentionLevel === "none") { + return null; + } + const str = calculateDataStorageCost( + promptTokens, + cachedTokens, + completionTokens, + reasoningTokens, + retentionLevel, + ); + const num = Number(str); + return Number.isFinite(num) ? num : null; +} + function filterRegionsByAvailableKeys( expandedProviders: ProviderModelMapping[], ): ProviderModelMapping[] { @@ -450,6 +485,48 @@ function addContentFilterRoutingMetadata( }; } +function withUsedApiKeyHash( + routingMetadata: RoutingMetadata | undefined, + usedApiKeyHash: string | undefined, +): RoutingMetadata | undefined { + if (!routingMetadata || !usedApiKeyHash) { + return routingMetadata; + } + + if (routingMetadata.usedApiKeyHash === usedApiKeyHash) { + return routingMetadata; + } + + return { + ...routingMetadata, + usedApiKeyHash, + }; +} + +function buildRoutingAttempt( + provider: string, + model: string, + statusCode: number, + errorType: string, + succeeded: boolean, + options?: { + region?: string; + apiKeyHash?: string; + logId?: string; + }, +): RoutingAttempt { + return { + provider, + model, + ...(options?.region && { region: options.region }), + status_code: statusCode, + error_type: errorType, + succeeded, + ...(options?.apiKeyHash && { apiKeyHash: options.apiKeyHash }), + ...(options?.logId && { logId: options.logId }), + }; +} + function usesGoogleQueryToken(provider: string): boolean { return ( provider === "google-ai-studio" || @@ -635,7 +712,7 @@ export async function inspectImmediateStreamingProviderError( try { await reader.cancel(); } catch { - // Ignore cancellation errors once the immediate error is extracted. + // Ignore cancellation errors - the response body is no longer needed. } return { @@ -654,7 +731,7 @@ export async function inspectImmediateStreamingProviderError( try { await reader.cancel(); } catch { - // Ignore cancellation errors when the replay stream setup fails. + // Ignore cancellation errors - the response body is no longer needed. 
} return { @@ -787,16 +864,40 @@ const completions = createRoute({ prompt_tokens_details: z .object({ cached_tokens: z.number(), + cache_write_tokens: z.number().optional(), + cache_creation_tokens: z.number().optional(), + audio_tokens: z.number().optional(), + video_tokens: z.number().optional(), + }) + .optional(), + completion_tokens_details: z + .object({ + reasoning_tokens: z.number().optional(), + image_tokens: z.number().optional(), + audio_tokens: z.number().optional(), + }) + .optional(), + cost: z.number().nullable().optional(), + cost_details: z + .object({ + upstream_inference_cost: z.number(), + upstream_inference_prompt_cost: z.number(), + upstream_inference_completions_cost: z.number(), + total_cost: z.number().nullable().optional(), + input_cost: z.number().nullable().optional(), + output_cost: z.number().nullable().optional(), + cached_input_cost: z.number().nullable().optional(), + request_cost: z.number().nullable().optional(), + web_search_cost: z.number().nullable().optional(), + image_input_cost: z.number().nullable().optional(), + image_output_cost: z.number().nullable().optional(), + data_storage_cost: z.number().nullable().optional(), }) .optional(), - cost_usd_total: z.number().nullable().optional(), - cost_usd_input: z.number().nullable().optional(), - cost_usd_output: z.number().nullable().optional(), - cost_usd_cached_input: z.number().nullable().optional(), info: z.string().optional(), - cost_usd_request: z.number().nullable().optional(), }), metadata: z.object({ + request_id: z.string(), requested_model: z.string(), requested_provider: z.string().nullable(), used_model: z.string(), @@ -811,6 +912,9 @@ const completions = createRoute({ region: z.string().optional(), status_code: z.number(), error_type: z.string(), + succeeded: z.boolean(), + apiKeyHash: z.string().optional(), + logId: z.string().optional(), }), ) .optional(), @@ -846,7 +950,7 @@ const completions = createRoute({ chat.openapi(completions, async (c) => { // Extract or generate request ID - const requestId = c.req.header("x-request-id") ?? shortid(40); + const requestId = c.req.header("x-request-id")?.trim() || shortid(40); // Parse JSON manually even if it's malformed let rawBody: unknown; @@ -1021,17 +1125,38 @@ chat.openapi(completions, async (c) => { // Extract custom X-LLMGateway-* headers const customHeaders = extractCustomHeaders(c); - const requestPluginIds = plugins?.map((plugin) => plugin.id) ?? []; + + // Read Responses API context from in-memory Map (set by /v1/responses proxy). + // Uses a lookup key passed via header; actual data is never in headers. + // External callers cannot exploit this: the key is a resp_ + shortid(24) that + // only exists in the Map for the duration of a single app.request() call, and + // getResponsesContext() deletes on read (one-time use). const responsesContextKey = c.req.header("x-responses-context-key"); const responsesContext = responsesContextKey ? getResponsesContext(responsesContextKey) : undefined; + const syncLogInsert = responsesContext?.syncInsert ?? false; const logIdOverride = responsesContext?.logId; - updateLogInsertOptions(c, { - syncInsert: responsesContext?.syncInsert ?? false, - logIdOverride, - responsesApiData: responsesContext?.responsesApiData ?? null, - }); + const responsesApiData: unknown = responsesContext?.responsesApiData ?? 
null; + + const chatLogState = c.get("chatCompletionLogState"); + if (chatLogState) { + chatLogState.syncInsert = syncLogInsert; + chatLogState.logIdOverride = logIdOverride; + chatLogState.responsesApiData = responsesApiData; + } + + // Queue a log entry for the middleware to flush after the request completes. + // The middleware applies logIdOverride/responsesApiData/syncInsert from state + // at flush time, so we just push the raw log data here. + const insertLogEntry = (logData: LogInsertData): Promise => { + if (chatLogState) { + chatLogState.pendingLogs.push(logData); + } else { + void _insertLog(logData); + } + return Promise.resolve(1); + }; // Check for X-No-Fallback header to disable provider fallback on low uptime const xNoFallbackHeaderSet = @@ -1160,9 +1285,6 @@ chat.openapi(completions, async (c) => { }); } - const validatedApiKey = apiKey; - const validatedProject = project; - // Check if project is deleted (archived) if (project.status === "deleted") { throw new HTTPException(410, { @@ -1232,35 +1354,18 @@ chat.openapi(completions, async (c) => { }); } - updateBaseLogOptions(c, { - requestId, - project, - apiKey, - usedModel: initialRequestedModel, - usedModelMapping: requestedModel, - usedProvider: requestedProvider ?? "llmgateway", - requestedModel: initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoningEffort: reasoning_effort, - reasoningMaxTokens: reasoning_max_tokens, - effort, - responseFormat: response_format, - tools, - toolChoice: tool_choice, - source, - customHeaders, - debugMode, - userAgent, - imageConfig: image_config, - rawRequest: rawBody, - plugins: requestPluginIds, - }); + const retryProjectContext = { + mode: project.mode, + organizationId: project.organizationId, + }; + const retryOrganizationContext = { + id: organization.id, + credits: organization.credits, + devPlan: organization.devPlan, + devPlanCreditsLimit: organization.devPlanCreditsLimit, + devPlanCreditsUsed: organization.devPlanCreditsUsed, + devPlanExpiresAt: organization.devPlanExpiresAt, + }; // Run guardrails check for enterprise organizations let guardrailResult: Awaited> | undefined; @@ -1302,9 +1407,6 @@ chat.openapi(completions, async (c) => { messages as Parameters[0], guardrailResult.redactions, ) as typeof messages; - updateBaseLogOptions(c, { - messages, - }); } // Log non-blocking violations (redact/warn) @@ -1481,8 +1583,8 @@ chat.openapi(completions, async (c) => { } } - // Find the cheapest model that meets our context size requirements. - // Only consider hardcoded models for auto selection unless free_models_only is set. 
+ // Find the cheapest model that meets our context size requirements + // Only consider hardcoded models for auto selection const allowedAutoModels = [ "claude-opus-4-6", "claude-sonnet-4-6", @@ -1491,175 +1593,151 @@ chat.openapi(completions, async (c) => { let selectedModel: ModelDefinition | undefined; let selectedProviders: any[] = []; + let lowestPrice = Number.MAX_VALUE; + const now = new Date(); // Cache current time for deprecation checks - async function findBestAutoRoutingCandidate( - candidateModels: ModelDefinition[], - ): Promise<{ - selectedModel: ModelDefinition; - selectedProviders: ProviderModelMapping[]; - } | null> { - let bestModel: ModelDefinition | undefined; - let bestProviders: ProviderModelMapping[] = []; - let lowestPrice = Number.MAX_VALUE; - const now = new Date(); // Cache current time for deprecation checks - - for (const modelDef of candidateModels) { - if (modelDef.id === "auto" || modelDef.id === "custom") { - continue; - } + for (const modelDef of models) { + if (modelDef.id === "auto" || modelDef.id === "custom") { + continue; + } - // When free_models_only is true, only consider models marked as free. - if (free_models_only) { - if (!("free" in modelDef && modelDef.free)) { - continue; - } - } else if (!allowedAutoModels.includes(modelDef.id)) { - continue; - } else if ( - estimatedInputTokens > 10_000 && - modelDef.id === "claude-haiku-4-5" - ) { - // Prefer Sonnet over Haiku for larger prompts once the input crosses 10k tokens. + // When free_models_only is true, only consider models marked as free + // Otherwise, only consider hardcoded allowed models + if (free_models_only) { + if (!("free" in modelDef && modelDef.free)) { continue; } + } else if (!allowedAutoModels.includes(modelDef.id)) { + continue; + } else if ( + estimatedInputTokens > 10_000 && + modelDef.id === "claude-haiku-4-5" + ) { + // Prefer Sonnet over Haiku for larger prompts once the input crosses 10k tokens + continue; + } - // Validate IAM rules for this candidate model and filter providers. - // We must re-evaluate per model because iamAllowedProviders was computed - // for the "auto" model which only has the "llmgateway" provider. - const candidateIam = await validateModelAccess( - validatedApiKey.id, - modelDef.id, - undefined, - modelDef, - ); - if (!candidateIam.allowed) { - continue; - } - const candidateAllowedProviders = candidateIam.allowedProviders; + // Validate IAM rules for this candidate model and filter providers. + // We must re-evaluate per model because iamAllowedProviders was computed + // for the "auto" model which only has the "llmgateway" provider. + const candidateIam = await validateModelAccess( + apiKey.id, + modelDef.id, + undefined, + modelDef, + ); + if (!candidateIam.allowed) { + continue; + } + const candidateAllowedProviders = candidateIam.allowedProviders; - const candidateProviders = preferConcreteRegionalMappings( - validatedProject.mode === "credits" - ? filterRegionsByAvailableKeys( - expandAllProviderRegions( - modelDef.providers as ProviderModelMapping[], - ), - ) - : expandAllProviderRegions( + const candidateProviders = preferConcreteRegionalMappings( + project.mode === "credits" + ? 
filterRegionsByAvailableKeys( + expandAllProviderRegions( modelDef.providers as ProviderModelMapping[], ), - ); - // Check if any of the model's providers are available - const availableModelProviders = candidateProviders.filter( - (provider) => - availableProviders.includes(provider.providerId) && - (!candidateAllowedProviders || - candidateAllowedProviders.includes(provider.providerId)), - ); - - // Filter by context size requirement, reasoning capability, and deprecation status - const suitableProviders = availableModelProviders.filter((provider) => { - // Skip deprecated provider mappings - if (provider.deprecatedAt && now > provider.deprecatedAt) { - return false; - } - - // Use the provider's context size, defaulting to a reasonable value if not specified - const modelContextSize = provider.contextSize ?? 8192; - const contextSizeMet = modelContextSize >= requiredContextSize; + ) + : expandAllProviderRegions( + modelDef.providers as ProviderModelMapping[], + ), + ); + // Check if any of the model's providers are available + const availableModelProviders = candidateProviders.filter( + (provider) => + availableProviders.includes(provider.providerId) && + (!candidateAllowedProviders || + candidateAllowedProviders.includes(provider.providerId)), + ); - // If no_reasoning is true, exclude reasoning models - if (no_reasoning && provider.reasoning === true) { - return false; - } + // Filter by context size requirement, reasoning capability, and deprecation status + const suitableProviders = availableModelProviders.filter((provider) => { + // Skip deprecated provider mappings + if (provider.deprecatedAt && now > provider.deprecatedAt!) { + return false; + } - // Check reasoning capability if reasoning_effort is specified - if (reasoning_effort !== undefined && provider.reasoning !== true) { - return false; - } + // Use the provider's context size, defaulting to a reasonable value if not specified + const modelContextSize = provider.contextSize ?? 
8192; + const contextSizeMet = modelContextSize >= requiredContextSize; - // Check reasoning.max_tokens support if specified - if ( - reasoning_max_tokens !== undefined && - provider.reasoningMaxTokens !== true - ) { - return false; - } + // If no_reasoning is true, exclude reasoning models + if (no_reasoning && provider.reasoning === true) { + return false; + } - // Check tool capability if tools or tool_choice is specified - if ( - (tools !== undefined || tool_choice !== undefined) && - provider.tools !== true - ) { - return false; - } + // Check reasoning capability if reasoning_effort is specified + if (reasoning_effort !== undefined && provider.reasoning !== true) { + return false; + } - // Check web search capability if web search tool is requested - if (webSearchTool && provider.webSearch !== true) { - return false; - } + // Check reasoning.max_tokens support if specified + if ( + reasoning_max_tokens !== undefined && + provider.reasoningMaxTokens !== true + ) { + return false; + } - // Check JSON output capability if json_object or json_schema response format is requested - if ( - response_format?.type === "json_object" || - response_format?.type === "json_schema" - ) { - if (provider.jsonOutput !== true) { - return false; - } - } + // Check tool capability if tools or tool_choice is specified + if ( + (tools !== undefined || tool_choice !== undefined) && + provider.tools !== true + ) { + return false; + } - // Check JSON schema output capability if json_schema response format is requested - if (response_format?.type === "json_schema") { - if (provider.jsonOutputSchema !== true) { - return false; - } - } + // Check web search capability if web search tool is requested + if (webSearchTool && provider.webSearch !== true) { + return false; + } - // Check vision capability if images are present in messages - if (hasImages && provider.vision !== true) { + // Check JSON output capability if json_object or json_schema response format is requested + if ( + response_format?.type === "json_object" || + response_format?.type === "json_schema" + ) { + if (provider.jsonOutput !== true) { return false; } + } - if ( - max_tokens !== undefined && - provider.maxOutput !== undefined && - max_tokens > provider.maxOutput - ) { + // Check JSON schema output capability if json_schema response format is requested + if (response_format?.type === "json_schema") { + if (provider.jsonOutputSchema !== true) { return false; } + } - return contextSizeMet; - }); - - if (suitableProviders.length > 0) { - // Find the cheapest among the suitable providers for this model - for (const provider of suitableProviders) { - const totalPrice = - ((provider.inputPrice ?? 0) + (provider.outputPrice ?? 0)) / 2; + // Check vision capability if images are present in messages + if (hasImages && provider.vision !== true) { + return false; + } - if (totalPrice < lowestPrice) { - lowestPrice = totalPrice; - bestModel = modelDef; - bestProviders = suitableProviders; - } - } + if ( + max_tokens !== undefined && + provider.maxOutput !== undefined && + max_tokens > provider.maxOutput + ) { + return false; } - } - if (!bestModel) { - return null; - } + return contextSizeMet; + }); - return { - selectedModel: bestModel, - selectedProviders: bestProviders, - }; - } + if (suitableProviders.length > 0) { + // Find the cheapest among the suitable providers for this model + for (const provider of suitableProviders) { + const totalPrice = + ((provider.inputPrice ?? 0) + (provider.outputPrice ?? 
0)) / 2; - const autoRoutingCandidate = await findBestAutoRoutingCandidate(models); - if (autoRoutingCandidate) { - selectedModel = autoRoutingCandidate.selectedModel; - selectedProviders = autoRoutingCandidate.selectedProviders; + if (totalPrice < lowestPrice) { + lowestPrice = totalPrice; + selectedModel = modelDef; + selectedProviders = suitableProviders; + } + } + } } let providerAgnosticSelectedProviders = selectedProviders; @@ -2218,6 +2296,7 @@ chat.openapi(completions, async (c) => { if (cheapestResult) { usedProvider = cheapestResult.provider.providerId; usedModel = cheapestResult.provider.modelName; + usedRegion = cheapestResult.provider.region; routingMetadata = { ...cheapestResult.metadata, selectionReason: "low-uptime-fallback", @@ -2677,16 +2756,13 @@ chat.openapi(completions, async (c) => { } } - updateBaseLogOptions(c, { - reasoningEffort: reasoning_effort, - }); - let url: string | undefined; // Get the provider key for the selected provider based on project mode let providerKey: InferSelectModel | undefined; let usedToken: string | undefined; + let usedApiKeyHash: string | undefined; let configIndex = 0; // Index for round-robin environment variables let envVarName: string | undefined; // Environment variable name for health tracking if ( @@ -2986,6 +3062,9 @@ chat.openapi(completions, async (c) => { }); } + usedApiKeyHash = getApiKeyFingerprint(usedToken); + routingMetadata = withUsedApiKeyHash(routingMetadata, usedApiKeyHash); + const contentFilterBlocked = contentFilterMode === "enabled" && contentFilterMatched && @@ -3000,27 +3079,66 @@ chat.openapi(completions, async (c) => { .length ? openAIContentFilterResult.responses : null; - updateBaseLogOptions(c, { - gatewayContentFilterResponse, - }); - const chatCompletionLogState = c.get("chatCompletionLogState"); - if (chatCompletionLogState) { - chatCompletionLogState.internalContentFilter = shouldTagContentFilter; + + if (chatLogState) { + if (shouldTagContentFilter) { + chatLogState.internalContentFilter = true; + } + chatLogState.gatewayContentFilterResponse = gatewayContentFilterResponse; } + const insertLog = ( + logData: Parameters[0], + _options?: Parameters[1], + ): Promise => { + if (chatLogState) { + chatLogState.pendingLogs.push(logData as LogInsertData); + } else { + const enriched = { + ...logData, + gatewayContentFilterResponse: + logData.gatewayContentFilterResponse ?? gatewayContentFilterResponse, + ...(shouldTagContentFilter ? 
{ internalContentFilter: true } : {}), + }; + void _insertLog(enriched); + } + return Promise.resolve(1); + }; + if (contentFilterBlocked) { const contentFilterResponseId = `chatcmpl-${Date.now()}`; const contentFilterCreated = Math.floor(Date.now() / 1000); - enqueueChatLog( - c, - { - providerKeyId: undefined, - usedModel: "", - usedModelMapping: undefined, - usedProvider: "llmgateway", - }, - { + // Log the filtered request + try { + await insertLog({ + ...createLogEntry( + requestId, + project, + apiKey, + undefined, + "", + undefined, + "llmgateway", + requestedModel, + requestedProvider, + messages as any[], + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + undefined, + undefined, + effort as "low" | "medium" | "high" | undefined, + response_format, + tools, + tool_choice, + source, + customHeaders, + c.req.header("x-debug") === "true", + c.req.header("user-agent"), + ), content: null, responseSize: 0, finishReason: "llmgateway_content_filter", @@ -3036,7 +3154,6 @@ chat.openapi(completions, async (c) => { errorDetails: null, duration: 0, timeToFirstToken: null, - timeToFirstReasoningToken: null, inputCost: 0, outputCost: 0, cachedInputCost: 0, @@ -3051,10 +3168,10 @@ chat.openapi(completions, async (c) => { discount: null, pricingTier: null, dataStorageCost: "0", - cached: false, - toolResults: null, - }, - ); + }); + } catch { + // Silently ignore logging failures + } if (stream) { void registerStreamCompletion(c); @@ -3146,6 +3263,7 @@ chat.openapi(completions, async (c) => { configIndex, isImageGeneration, usedRegion, + providerKey !== undefined, ); // If region is still unset but the provider supports regions, resolve the @@ -3279,6 +3397,46 @@ chat.openapi(completions, async (c) => { } } + // Log the cached streaming request with reconstructed content + // Extract plugin IDs for logging (cached streaming) + const cachedStreamingPluginIds = plugins?.map((p) => p.id) ?? 
[]; + + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping, + usedProvider, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + rawCachedResponseData, // Raw SSE data from cached response + null, // No upstream request for cached response + rawCachedResponseData, // Raw SSE data from cached response (same for both) + cachedStreamingPluginIds, + undefined, // No plugin results for cached response + ); + // Calculate costs for cached response const costs = await calculateCosts( usedModel, @@ -3295,90 +3453,56 @@ chat.openapi(completions, async (c) => { project.organizationId, ); - enqueueChatLog( - c, - { - providerKeyId: providerKey?.id, - usedModel: usedModelFormatted, - usedModelMapping, - usedProvider, - requestedModel: initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoningEffort: reasoning_effort, - reasoningMaxTokens: reasoning_max_tokens, - effort, - responseFormat: response_format, - tools, - toolChoice: tool_choice, - source, - customHeaders, - debugMode, - userAgent, - imageConfig: image_config, - routingMetadata, - rawRequest: rawBody, - rawResponse: rawCachedResponseData, - upstreamRequest: null, - upstreamResponse: rawCachedResponseData, - plugins: requestPluginIds, - pluginResults: undefined, - }, - { - duration: 0, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: cachedResponseSize, - content: fullContent || null, - reasoningContent: fullReasoningContent || null, - finishReason: cachedStreamingResponse.metadata.finishReason, - promptTokens: - (costs.promptTokens ?? promptTokens)?.toString() ?? null, - completionTokens: completionTokens?.toString() ?? null, - totalTokens: costs.imageInputTokens - ? ( - (costs.promptTokens ?? promptTokens ?? 0) + - (completionTokens ?? 0) + - (reasoningTokens ?? 0) - ).toString() - : (totalTokens?.toString() ?? null), - reasoningTokens: reasoningTokens?.toString() ?? null, - cachedTokens: cachedTokens?.toString() ?? null, - hasError: false, - streamed: true, - canceled: false, - errorDetails: null, - inputCost: costs.inputCost ?? 0, - outputCost: costs.outputCost ?? 0, - cachedInputCost: costs.cachedInputCost ?? 0, - requestCost: costs.requestCost ?? 0, - webSearchCost: costs.webSearchCost ?? 0, - imageInputTokens: costs.imageInputTokens?.toString() ?? null, - imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, - imageInputCost: costs.imageInputCost ?? null, - imageOutputCost: costs.imageOutputCost ?? null, - cost: costs.totalCost ?? 0, - estimatedCost: costs.estimatedCost, - discount: costs.discount ?? null, - pricingTier: costs.pricingTier ?? null, - dataStorageCost: calculateDataStorageCost( - costs.promptTokens ?? promptTokens, - cachedTokens, - completionTokens, - reasoningTokens, - retentionLevel, - ), - cached: true, - toolResults: - (cachedStreamingResponse.metadata as { toolResults?: any }) - ?.toolResults ?? 
null, - }, - ); + await insertLogEntry({ + ...baseLogEntry, + duration: 0, // No processing time for cached response + timeToFirstToken: null, // Not applicable for cached response + timeToFirstReasoningToken: null, // Not applicable for cached response + responseSize: cachedResponseSize, + content: fullContent || null, + reasoningContent: fullReasoningContent || null, + finishReason: cachedStreamingResponse.metadata.finishReason, + promptTokens: + (costs.promptTokens ?? promptTokens)?.toString() ?? null, + completionTokens: completionTokens?.toString() ?? null, + totalTokens: costs.imageInputTokens + ? ( + (costs.promptTokens ?? promptTokens ?? 0) + + (completionTokens ?? 0) + + (reasoningTokens ?? 0) + ).toString() + : (totalTokens?.toString() ?? null), + reasoningTokens: reasoningTokens?.toString() ?? null, + cachedTokens: cachedTokens?.toString() ?? null, + hasError: false, + streamed: true, + canceled: false, + errorDetails: null, + inputCost: costs.inputCost ?? 0, + outputCost: costs.outputCost ?? 0, + cachedInputCost: costs.cachedInputCost ?? 0, + requestCost: costs.requestCost ?? 0, + webSearchCost: costs.webSearchCost ?? 0, + imageInputTokens: costs.imageInputTokens?.toString() ?? null, + imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, + imageInputCost: costs.imageInputCost ?? null, + imageOutputCost: costs.imageOutputCost ?? null, + cost: costs.totalCost ?? 0, + estimatedCost: costs.estimatedCost, + discount: costs.discount ?? null, + pricingTier: costs.pricingTier ?? null, + dataStorageCost: calculateDataStorageCost( + costs.promptTokens ?? promptTokens, + cachedTokens, + completionTokens, + reasoningTokens, + retentionLevel, + ), + cached: true, + toolResults: + (cachedStreamingResponse.metadata as { toolResults?: any }) + ?.toolResults ?? null, + }); // Return cached streaming response by replaying chunks with original timing void registerStreamCompletion(c); @@ -3420,6 +3544,7 @@ chat.openapi(completions, async (c) => { } else { logger.error("Error replaying cached stream", error); } + finishStreamCompletion(c); }, ); } @@ -3427,18 +3552,59 @@ chat.openapi(completions, async (c) => { cacheKey = generateCacheKey(cachePayload); const cachedResponse = cacheKey ? await getCache(cacheKey) : null; if (cachedResponse) { + const responseForCurrentRequest = + withCurrentRequestMetadataOnOpenAiResponse(cachedResponse, requestId); + // Log the cached request const duration = 0; // No processing time needed + // Extract plugin IDs for logging (cached non-streaming) + const cachedPluginIds = plugins?.map((p) => p.id) ?? []; - // Calculate costs for cached response - const cachedCosts = await calculateCosts( - usedModel, + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping, usedProvider, - cachedResponse.usage?.prompt_tokens ?? null, - cachedResponse.usage?.completion_tokens ?? null, - cachedResponse.usage?.prompt_tokens_details?.cached_tokens ?? null, - undefined, - cachedResponse.usage?.reasoning_tokens ?? 
null, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + responseForCurrentRequest, + null, // No upstream request for cached response + responseForCurrentRequest, // upstream response is same as cached response + cachedPluginIds, + undefined, // No plugin results for cached response + ); + + // Calculate costs for cached response + const cachedCosts = await calculateCosts( + usedModel, + usedProvider, + cachedResponse.usage?.prompt_tokens ?? null, + cachedResponse.usage?.completion_tokens ?? null, + cachedResponse.usage?.prompt_tokens_details?.cached_tokens ?? null, + undefined, + cachedResponse.usage?.reasoning_tokens ?? null, 0, // outputImageCount undefined, // imageSize inputImageCount, @@ -3455,98 +3621,61 @@ chat.openapi(completions, async (c) => { (cachedReasoningContent?.length ?? 0) + 500; // overhead for metadata - enqueueChatLog( - c, - { - providerKeyId: providerKey?.id, - usedModel: usedModelFormatted, - usedModelMapping, - usedProvider, - requestedModel: initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoningEffort: reasoning_effort, - reasoningMaxTokens: reasoning_max_tokens, - effort, - responseFormat: response_format, - tools, - toolChoice: tool_choice, - source, - customHeaders, - debugMode, - userAgent, - imageConfig: image_config, - routingMetadata, - rawRequest: rawBody, - rawResponse: cachedResponse, - upstreamRequest: null, - upstreamResponse: cachedResponse, - plugins: requestPluginIds, - pluginResults: undefined, - }, - { - duration, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: estimatedCachedSize, - content: cachedContent ?? null, - reasoningContent: cachedReasoningContent ?? null, - finishReason: cachedResponse.choices?.[0]?.finish_reason ?? null, - promptTokens: - ( - cachedCosts.promptTokens ?? cachedResponse.usage?.prompt_tokens - )?.toString() ?? null, - completionTokens: cachedResponse.usage?.completion_tokens ?? null, - totalTokens: cachedCosts.imageInputTokens - ? ( - (cachedCosts.promptTokens ?? - cachedResponse.usage?.prompt_tokens ?? - 0) + - (cachedResponse.usage?.completion_tokens ?? 0) + - (cachedResponse.usage?.reasoning_tokens ?? 0) - ).toString() - : (cachedResponse.usage?.total_tokens ?? null), - reasoningTokens: cachedResponse.usage?.reasoning_tokens ?? null, - cachedTokens: - cachedResponse.usage?.prompt_tokens_details?.cached_tokens ?? - null, - hasError: false, - streamed: false, - canceled: false, - errorDetails: null, - inputCost: cachedCosts.inputCost ?? 0, - outputCost: cachedCosts.outputCost ?? 0, - cachedInputCost: cachedCosts.cachedInputCost ?? 0, - requestCost: cachedCosts.requestCost ?? 0, - webSearchCost: cachedCosts.webSearchCost ?? 0, - imageInputTokens: cachedCosts.imageInputTokens?.toString() ?? null, - imageOutputTokens: - cachedCosts.imageOutputTokens?.toString() ?? null, - imageInputCost: cachedCosts.imageInputCost ?? null, - imageOutputCost: cachedCosts.imageOutputCost ?? null, - cost: cachedCosts.totalCost ?? 0, - estimatedCost: cachedCosts.estimatedCost, - discount: cachedCosts.discount ?? null, - pricingTier: cachedCosts.pricingTier ?? null, - dataStorageCost: calculateDataStorageCost( - cachedCosts.promptTokens ?? 
cachedResponse.usage?.prompt_tokens, - cachedResponse.usage?.prompt_tokens_details?.cached_tokens, - cachedResponse.usage?.completion_tokens, - cachedResponse.usage?.reasoning_tokens, - retentionLevel, - ), - cached: true, - toolResults: - cachedResponse.choices?.[0]?.message?.tool_calls ?? null, - }, - ); + await insertLogEntry({ + ...baseLogEntry, + duration, + timeToFirstToken: null, // Not applicable for cached response + timeToFirstReasoningToken: null, // Not applicable for cached response + responseSize: estimatedCachedSize, + content: cachedContent ?? null, + reasoningContent: cachedReasoningContent ?? null, + finishReason: cachedResponse.choices?.[0]?.finish_reason ?? null, + promptTokens: + ( + cachedCosts.promptTokens ?? cachedResponse.usage?.prompt_tokens + )?.toString() ?? null, + completionTokens: cachedResponse.usage?.completion_tokens ?? null, + totalTokens: cachedCosts.imageInputTokens + ? ( + (cachedCosts.promptTokens ?? + cachedResponse.usage?.prompt_tokens ?? + 0) + + (cachedResponse.usage?.completion_tokens ?? 0) + + (cachedResponse.usage?.reasoning_tokens ?? 0) + ).toString() + : (cachedResponse.usage?.total_tokens ?? null), + reasoningTokens: cachedResponse.usage?.reasoning_tokens ?? null, + cachedTokens: + cachedResponse.usage?.prompt_tokens_details?.cached_tokens ?? null, + hasError: false, + streamed: false, + canceled: false, + errorDetails: null, + inputCost: cachedCosts.inputCost ?? 0, + outputCost: cachedCosts.outputCost ?? 0, + cachedInputCost: cachedCosts.cachedInputCost ?? 0, + requestCost: cachedCosts.requestCost ?? 0, + webSearchCost: cachedCosts.webSearchCost ?? 0, + imageInputTokens: cachedCosts.imageInputTokens?.toString() ?? null, + imageOutputTokens: cachedCosts.imageOutputTokens?.toString() ?? null, + imageInputCost: cachedCosts.imageInputCost ?? null, + imageOutputCost: cachedCosts.imageOutputCost ?? null, + cost: cachedCosts.totalCost ?? 0, + estimatedCost: cachedCosts.estimatedCost, + discount: cachedCosts.discount ?? null, + pricingTier: cachedCosts.pricingTier ?? null, + dataStorageCost: calculateDataStorageCost( + cachedCosts.promptTokens ?? cachedResponse.usage?.prompt_tokens, + cachedResponse.usage?.prompt_tokens_details?.cached_tokens, + cachedResponse.usage?.completion_tokens, + cachedResponse.usage?.reasoning_tokens, + retentionLevel, + ), + cached: true, + toolResults: cachedResponse.choices?.[0]?.message?.tool_calls ?? null, + }); - return c.json(cachedResponse); + return c.json(responseForCurrentRequest); } } } @@ -3577,13 +3706,20 @@ chat.openapi(completions, async (c) => { // Check if streaming is requested and if the model/provider combination supports it // For image generation models, we'll fake streaming by converting the response const fakeStreamingForImageGen = stream && isImageGeneration; - const effectiveStream = fakeStreamingForImageGen ? false : stream; + const streamingSupport = getModelStreamingSupport( + baseModelName, + usedProvider, + usedRegion, + ); + // When the provider only supports streaming, force it even if the client didn't request it. + // The upstream request uses effectiveStream; the client response uses stream. + const forceStream = streamingSupport === "only" && !stream; + const effectiveStream = fakeStreamingForImageGen + ? 
false
+			: stream || forceStream;

 	if (stream) {
-		if (
-			!isImageGeneration &&
-			getModelStreamingSupport(baseModelName, usedProvider) === false
-		) {
+		if (!isImageGeneration && streamingSupport === false) {
 			throw new HTTPException(400, {
 				message: `Model ${usedModel} with provider ${usedProvider} does not support streaming`,
 			});
 		}
@@ -3700,39 +3836,7 @@ chat.openapi(completions, async (c) => {
 		}
 	}

-	// For Moonshot provider, enrich assistant messages with cached reasoning_content
-	// This is needed for multi-turn tool call conversations with thinking models
-	// Moonshot requires reasoning_content in assistant messages with tool_calls
-	if (usedProvider === "moonshot") {
-		const { redisClient } = await import("@llmgateway/cache");
-		for (const message of messages) {
-			if (
-				message.role === "assistant" &&
-				message.tool_calls &&
-				Array.isArray(message.tool_calls) &&
-				message.tool_calls.length > 0 &&
-				!(message as any).reasoning_content // Only add if not already present
-			) {
-				// Get reasoning_content from the first tool call (all tool calls share the same reasoning)
-				const firstToolCall = message.tool_calls[0];
-				if (firstToolCall?.id) {
-					try {
-						const cachedReasoningContent = await redisClient.get(
-							`reasoning_content:${firstToolCall.id}`,
-						);
-						if (cachedReasoningContent) {
-							// Add reasoning_content to the message for Moonshot
-							(message as any).reasoning_content = cachedReasoningContent;
-						}
-					} catch {
-						// Silently fail - reasoning_content caching is optional
-					}
-				}
-			}
-		}
-	}
-
-	let requestBody: ProviderRequestBody = await prepareRequestBody(
+	let requestBody: ProviderRequestBody | FormData = await prepareRequestBody(
 		usedProvider,
 		upstreamModelName,
 		messages as BaseMessage[],
@@ -3761,6 +3865,7 @@ chat.openapi(completions, async (c) => {

 	// Validate effective max_tokens value after prepareRequestBody
 	if (
+		!(requestBody instanceof FormData) &&
 		hasMaxTokens(requestBody) &&
 		requestBody.max_tokens !== undefined &&
 		finalModelInfo
@@ -3790,16 +3895,171 @@ chat.openapi(completions, async (c) => {
 		isImageGeneration &&
 		usedProvider === "xai" &&
 		url &&
+		!(requestBody instanceof FormData) &&
 		("image" in requestBody || "images" in requestBody)
 	) {
 		url = url.replace("/v1/images/generations", "/v1/images/edits");
 	}

+	// Switch OpenAI image generation endpoint to /edits when input images are present.
+	// prepareRequestBody returns a FormData (multipart/form-data) only for this edits flow.
+	if (
+		isImageGeneration &&
+		usedProvider === "openai" &&
+		url &&
+		requestBody instanceof FormData
+	) {
+		url = url.replace("/v1/images/generations", "/v1/images/edits");
+	}
+
 	const startTime = Date.now();
+	const failedEnvKeyIndicesByProvider = new Map<string, Set<number>>();
+	const failedTrackedKeyIdsByProvider = new Map<string, Set<string>>();
+
+	function rememberFailedKey(
+		providerId: string,
+		region: string | undefined,
+		options: {
+			envVarName?: string;
+			configIndex?: number;
+			providerKeyId?: string;
+		},
+	): void {
+		const retryKey = providerRetryKey(providerId, region);
+
+		if (options.envVarName !== undefined && options.configIndex !== undefined) {
+			const failedIndices =
+				failedEnvKeyIndicesByProvider.get(retryKey) ?? new Set<number>();
+			failedIndices.add(options.configIndex);
+			failedEnvKeyIndicesByProvider.set(retryKey, failedIndices);
+		}
+
+		if (options.providerKeyId) {
+			const failedKeyIds =
+				failedTrackedKeyIdsByProvider.get(retryKey) ?? new Set<string>();
+			failedKeyIds.add(options.providerKeyId);
+			failedTrackedKeyIdsByProvider.set(retryKey, failedKeyIds);
+		}
+	}
+
+	async function resolveProviderContextForRetry(
+		providerMapping: {
+			providerId: string;
+			modelName: string;
+			region?: string;
+		},
+		streamValue: boolean,
+	) {
+		const retryKey = providerRetryKey(
+			providerMapping.providerId,
+			providerMapping.region,
+		);
+		return await resolveProviderContext(
+			providerMapping,
+			retryProjectContext,
+			retryOrganizationContext,
+			modelInfo,
+			originalRequestParams,
+			{
+				requestId,
+				stream: streamValue,
+				effectiveStream,
+				messages: messages as BaseMessage[],
+				response_format,
+				tools,
+				tool_choice,
+				reasoning_effort,
+				reasoning_max_tokens,
+				effort,
+				webSearchTool,
+				image_config,
+				sensitive_word_check,
+				maxImageSizeMB,
+				userPlan,
+				hasExistingToolCalls,
+				customProviderName,
+				webSearchEnabled: !!webSearchTool,
+				excludedEnvKeyIndices: failedEnvKeyIndicesByProvider.get(retryKey),
+				excludedProviderKeyIds: failedTrackedKeyIdsByProvider.get(retryKey),
+			},
+		);
+	}
+
+	function applyResolvedProviderContext(
+		ctx: Awaited<ReturnType<typeof resolveProviderContextForRetry>>,
+	): void {
+		usedProvider = ctx.usedProvider;
+		usedModel = ctx.usedModel;
+		usedModelFormatted = ctx.usedModelFormatted;
+		usedModelMapping = ctx.usedModelMapping;
+		baseModelName = ctx.baseModelName;
+		usedToken = ctx.usedToken;
+		usedApiKeyHash = ctx.usedApiKeyHash;
+		providerKey = ctx.providerKey;
+		configIndex = ctx.configIndex;
+		envVarName = ctx.envVarName;
+		url = ctx.url;
+		requestBody = ctx.requestBody;
+		useResponsesApi = ctx.useResponsesApi;
+		requestCanBeCanceled = ctx.requestCanBeCanceled;
+		isImageGeneration = ctx.isImageGeneration;
+		supportsReasoning = ctx.supportsReasoning;
+		splitTaggedReasoning = ctx.splitTaggedReasoning ?? false;
+		temperature = ctx.temperature;
+		max_tokens = ctx.max_tokens;
+		top_p = ctx.top_p;
+		frequency_penalty = ctx.frequency_penalty;
+		presence_penalty = ctx.presence_penalty;
+		usedRegion = ctx.usedRegion;
+		routingMetadata = withUsedApiKeyHash(routingMetadata, usedApiKeyHash);
+	}
+
+	async function tryResolveAlternateKeyForCurrentProvider(
+		streamValue: boolean,
+	): Promise<Awaited<ReturnType<typeof resolveProviderContextForRetry>> | null> {
+		if (!usedProvider || !usedModel) {
+			return null;
+		}
+
+		const currentProviderKeyId = providerKey?.id;
+		const currentEnvVarName = envVarName;
+		const currentConfigIndex = configIndex;
+		const currentToken = usedToken;
+
+		try {
+			const nextContext = await resolveProviderContextForRetry(
+				{
+					providerId: usedProvider,
+					modelName: usedModel,
+					region: usedRegion,
+				},
+				streamValue,
+			);
+
+			const isDifferentTrackedKey =
+				nextContext.providerKey?.id !== undefined &&
+				nextContext.providerKey.id !== currentProviderKeyId;
+			const isDifferentEnvKey =
+				nextContext.envVarName !== undefined &&
+				(nextContext.envVarName !== currentEnvVarName ||
+					nextContext.configIndex !== currentConfigIndex);
+			const isDifferentToken = nextContext.usedToken !== currentToken;
+
+			if (!isDifferentTrackedKey && !isDifferentEnvKey && !isDifferentToken) {
+				return null;
+			}
+
+			return nextContext;
+		} catch {
+			return null;
+		}
+	}

 	// Handle streaming response if requested
 	// For image generation models, we skip real streaming and use fake streaming later
-	if (effectiveStream) {
+	// For stream-only models where the client didn't request streaming, use the non-streaming path
+	// (effectiveStream forces streaming upstream, but the client gets a regular JSON response)
+	if (effectiveStream && !forceStream) {
 		void registerStreamCompletion(c);
 		return streamSSE(
 			c,
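// ---- Editor's note (not part of the patch) ---------------------------------
// The hunk above introduces same-provider key failover: every failed env-key
// index and tracked provider-key id is remembered per provider+region, and
// the next context resolution excludes those keys. A minimal standalone
// sketch of the pattern; the names below are illustrative only, the real
// code threads the exclusion sets through resolveProviderContext instead.
const failedKeyIdsSketch = new Map<string, Set<string>>();

function rememberFailureSketch(retryKey: string, keyId: string): void {
	const failed = failedKeyIdsSketch.get(retryKey) ?? new Set<string>();
	failed.add(keyId);
	failedKeyIdsSketch.set(retryKey, failed);
}

// Pick the first key that has not failed yet for this provider+region.
// Returning null means "no alternate key, fall through to provider fallback",
// mirroring tryResolveAlternateKeyForCurrentProvider's contract.
function pickAlternateKeySketch(
	retryKey: string,
	candidateKeyIds: string[],
): string | null {
	const failed = failedKeyIdsSketch.get(retryKey) ?? new Set<string>();
	return candidateKeyIds.find((id) => !failed.has(id)) ?? null;
}
// -----------------------------------------------------------------------------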
@@ -3907,6 +4167,13 @@ chat.openapi(completions, async (c) => {
 							0,
 							project.organizationId,
 						);
+					streamingCosts.dataStorageCost = toDataStorageCostNumber(
+						streamingCosts.promptTokens ?? promptTokenCount,
+						null,
+						0,
+						null,
+						retentionLevel,
+					);

 					await writeSSEAndCache({
 						data: JSON.stringify({
@@ -3926,6 +4193,27 @@ chat.openapi(completions, async (c) => {
 						id: String(eventId++),
 					});

+					const contentFilterUsage: Record<string, unknown> = {
+						prompt_tokens: promptTokenCount,
+						completion_tokens: 0,
+						total_tokens: promptTokenCount,
+					};
+					applyExtendedUsageFields(contentFilterUsage, {
+						costs: {
+							inputCost: streamingCosts.inputCost,
+							outputCost: streamingCosts.outputCost,
+							cachedInputCost: streamingCosts.cachedInputCost,
+							requestCost: streamingCosts.requestCost,
+							webSearchCost: streamingCosts.webSearchCost,
+							imageInputCost: streamingCosts.imageInputCost,
+							imageOutputCost: streamingCosts.imageOutputCost,
+							totalCost: streamingCosts.totalCost,
+							dataStorageCost: streamingCosts.dataStorageCost,
+						},
+						cachedTokens: null,
+						cacheCreationTokens: null,
+						reasoningTokens: null,
+					});
 					await writeSSEAndCache({
 						data: JSON.stringify({
 							id: `chatcmpl-${Date.now()}`,
@@ -3939,18 +4227,7 @@ chat.openapi(completions, async (c) => {
 									finish_reason: null,
 								},
 							],
-							usage: {
-								prompt_tokens: promptTokenCount,
-								completion_tokens: 0,
-								total_tokens: promptTokenCount,
-								cost_usd_total: streamingCosts.totalCost,
-								cost_usd_input: streamingCosts.inputCost,
-								cost_usd_output: streamingCosts.outputCost,
-								cost_usd_cached_input: streamingCosts.cachedInputCost,
-								cost_usd_request: streamingCosts.requestCost,
-								cost_usd_image_input: streamingCosts.imageInputCost,
-								cost_usd_image_output: streamingCosts.imageOutputCost,
-							},
+							usage: contentFilterUsage,
 						}),
 						id: String(eventId++),
 					});
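// ---- Editor's note (not part of the patch) ---------------------------------
// applyExtendedUsageFields is defined elsewhere in this series; only its call
// shape is visible here. A sketch of what such a helper would plausibly do,
// based on the inline `cost_usd_*` fields it replaces above; any behavior
// beyond mirroring those fields is an assumption.
interface UsageCostsSketch {
	inputCost: number | null;
	outputCost: number | null;
	cachedInputCost: number | null;
	requestCost: number | null;
	webSearchCost: number | null;
	imageInputCost: number | null;
	imageOutputCost: number | null;
	totalCost: number | null;
	dataStorageCost: number | null;
}

function applyExtendedUsageFieldsSketch(
	usage: Record<string, unknown>,
	opts: {
		costs: UsageCostsSketch | null;
		cachedTokens: number | null;
		cacheCreationTokens: number | null;
		reasoningTokens: number | null;
	},
): void {
	if (opts.costs) {
		// Attach gateway cost accounting under the same keys the removed
		// inline usage object used.
		usage.cost_usd_total = opts.costs.totalCost;
		usage.cost_usd_input = opts.costs.inputCost;
		usage.cost_usd_output = opts.costs.outputCost;
		usage.cost_usd_cached_input = opts.costs.cachedInputCost;
		usage.cost_usd_request = opts.costs.requestCost;
		usage.cost_usd_image_input = opts.costs.imageInputCost;
		usage.cost_usd_image_output = opts.costs.imageOutputCost;
	}
	if (opts.reasoningTokens !== null && opts.reasoningTokens > 0) {
		usage.reasoning_tokens = opts.reasoningTokens;
	}
}
// -----------------------------------------------------------------------------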
@@ -4040,65 +4317,11 @@ chat.openapi(completions, async (c) => {
 				}

 				try {
-					const ctx = await resolveProviderContext(
+					const ctx = await resolveProviderContextForRetry(
 						nextProvider,
-						{
-							mode: project.mode,
-							organizationId: project.organizationId,
-						},
-						{
-							id: organization.id,
-							credits: organization.credits,
-							devPlan: organization.devPlan,
-							devPlanCreditsLimit: organization.devPlanCreditsLimit,
-							devPlanCreditsUsed: organization.devPlanCreditsUsed,
-							devPlanExpiresAt: organization.devPlanExpiresAt,
-						},
-						modelInfo,
-						originalRequestParams,
-						{
-							requestId,
-							stream: true,
-							effectiveStream,
-							messages: messages as BaseMessage[],
-							response_format,
-							tools,
-							tool_choice,
-							reasoning_effort,
-							reasoning_max_tokens,
-							effort,
-							webSearchTool,
-							image_config,
-							sensitive_word_check,
-							maxImageSizeMB,
-							userPlan,
-							hasExistingToolCalls,
-							customProviderName,
-							webSearchEnabled: !!webSearchTool,
-						},
+						true,
 					);
-					usedProvider = ctx.usedProvider;
-					usedModel = ctx.usedModel;
-					usedModelFormatted = ctx.usedModelFormatted;
-					usedModelMapping = ctx.usedModelMapping;
-					baseModelName = ctx.baseModelName;
-					usedToken = ctx.usedToken;
-					providerKey = ctx.providerKey;
-					configIndex = ctx.configIndex;
-					envVarName = ctx.envVarName;
-					url = ctx.url;
-					requestBody = ctx.requestBody;
-					useResponsesApi = ctx.useResponsesApi;
-					requestCanBeCanceled = ctx.requestCanBeCanceled;
-					isImageGeneration = ctx.isImageGeneration;
-					supportsReasoning = ctx.supportsReasoning;
-					splitTaggedReasoning = ctx.splitTaggedReasoning ?? false;
-					temperature = ctx.temperature;
-					max_tokens = ctx.max_tokens;
-					top_p = ctx.top_p;
-					frequency_penalty = ctx.frequency_penalty;
-					presence_penalty = ctx.presence_penalty;
-					usedRegion = ctx.usedRegion;
+					applyResolvedProviderContext(ctx);
 				} catch {
 					failedProviderIds.add(
 						providerRetryKey(
@@ -4114,6 +4337,7 @@ chat.openapi(completions, async (c) => {

 				try {
 					const headers = getProviderHeaders(usedProvider, usedToken, {
+						requestId,
 						webSearchEnabled: !!webSearchTool,
 					});
 					headers["Content-Type"] = "application/json";
@@ -4171,6 +4395,20 @@ chat.openapi(completions, async (c) => {
 							),
 						});

+						// Log the timeout error in the database
+						const timeoutPluginIds = plugins?.map((p) => p.id) ?? [];
+
+						let sameProviderRetryContext: Awaited<
+							ReturnType<typeof tryResolveAlternateKeyForCurrentProvider>
+						> | null = null;
+						rememberFailedKey(usedProvider, usedRegion, {
+							envVarName,
+							configIndex,
+							providerKeyId: providerKey?.id,
+						});
+						sameProviderRetryContext =
+							await tryResolveAlternateKeyForCurrentProvider(true);
+
 						// Check if we should retry before logging so we can mark the log as retried
 						const willRetryTimeout = shouldRetryRequest({
 							requestedProvider,
@@ -4183,88 +4421,121 @@ chat.openapi(completions, async (c) => {
 								1,
 							usedProvider,
 						});
+						const willRetrySameProvider = sameProviderRetryContext !== null;
+						const willRetryRequest =
+							willRetrySameProvider || willRetryTimeout;

-						enqueueChatLog(
-							c,
-							{
-								providerKeyId: providerKey?.id,
-								usedModel: usedModelFormatted,
-								usedModelMapping,
-								usedProvider,
-								requestedModel: initialRequestedModel,
-								requestedProvider,
-								messages,
-								temperature,
-								max_tokens,
-								top_p,
-								frequency_penalty,
-								presence_penalty,
-								reasoningEffort: reasoning_effort,
-								reasoningMaxTokens: reasoning_max_tokens,
-								effort,
-								responseFormat: response_format,
-								tools,
-								toolChoice: tool_choice,
-								source,
-								customHeaders,
-								debugMode,
-								userAgent,
-								imageConfig: image_config,
-								routingMetadata,
-								rawRequest: rawBody,
-								rawResponse: null,
-								upstreamRequest: requestBody,
-								upstreamResponse: null,
-								plugins: requestPluginIds,
-								pluginResults: undefined,
-							},
-							{
-								duration: Date.now() - perAttemptStartTime,
-								timeToFirstToken: null,
-								timeToFirstReasoningToken: null,
-								responseSize: 0,
-								content: null,
-								reasoningContent: null,
-								finishReason: "upstream_error",
-								promptTokens: null,
-								completionTokens: null,
-								totalTokens: null,
-								reasoningTokens: null,
-								cachedTokens: null,
-								hasError: true,
-								streamed: true,
-								canceled: false,
-								errorDetails: {
-									statusCode: 0,
-									statusText: "TimeoutError",
-									responseText: errorMessage,
-									cause: timeoutCause,
-								},
-								cachedInputCost: null,
-								requestCost: null,
-								webSearchCost: null,
-								imageInputTokens: null,
-								imageOutputTokens: null,
-								imageInputCost: null,
-								imageOutputCost: null,
-								discount: null,
-								dataStorageCost: "0",
-								cached: false,
-								toolResults: null,
-								retried: willRetryTimeout,
-								retriedByLogId: willRetryTimeout ?
finalLogId : null, - }, + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping, + usedProvider, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + null, // No response for timeout error + requestBody, + null, // No upstream response for timeout error + timeoutPluginIds, + undefined, // No plugin results for error case ); + const attemptLogId = shortid(); + + await insertLogEntry({ + ...baseLogEntry, + id: attemptLogId, + duration: Date.now() - perAttemptStartTime, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "upstream_error", + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: true, + streamed: true, + canceled: false, + errorDetails: { + statusCode: 0, + statusText: "TimeoutError", + responseText: errorMessage, + cause: timeoutCause, + }, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + retried: willRetryRequest, + retriedByLogId: willRetryRequest ? finalLogId : null, + }); + + if (willRetrySameProvider && sameProviderRetryContext) { + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + 0, + getErrorType(0), + false, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: attemptLogId, + }, + ), + ); + applyResolvedProviderContext(sameProviderRetryContext); + retryAttempt--; + continue; + } if (willRetryTimeout) { - routingAttempts.push({ - provider: usedProvider, - model: baseModelName, - ...(usedRegion && { region: usedRegion }), - status_code: 0, - error_type: getErrorType(0), - succeeded: false, - }); + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + 0, + getErrorType(0), + false, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: attemptLogId, + }, + ), + ); failedProviderIds.add( providerRetryKey(usedProvider, usedRegion), ); @@ -4287,6 +4558,10 @@ chat.openapi(completions, async (c) => { error instanceof Error && error.name === "AbortError" ) { + // Log the canceled request + // Extract plugin IDs for logging (canceled request) + const canceledPluginIds = plugins?.map((p) => p.id) ?? 
[]; + // Calculate costs for cancelled request if billing is enabled const billCancelled = shouldBillCancelledRequests(); let cancelledCosts: Awaited< @@ -4329,95 +4604,97 @@ chat.openapi(completions, async (c) => { ); } - enqueueChatLog( - c, - { - providerKeyId: providerKey?.id, - usedModel: usedModelFormatted, - usedModelMapping, - usedProvider, - requestedModel: initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoningEffort: reasoning_effort, - reasoningMaxTokens: reasoning_max_tokens, - effort, - responseFormat: response_format, - tools, - toolChoice: tool_choice, - source, - customHeaders, - debugMode, - userAgent, - imageConfig: image_config, - routingMetadata, - rawRequest: rawBody, - rawResponse: null, - upstreamRequest: requestBody, - upstreamResponse: null, - plugins: requestPluginIds, - pluginResults: undefined, - }, - { - duration: Date.now() - perAttemptStartTime, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: 0, - content: null, - reasoningContent: null, - finishReason: "canceled", - promptTokens: billCancelled - ? ( - cancelledCosts?.promptTokens ?? estimatedPromptTokens - )?.toString() - : null, - completionTokens: billCancelled ? "0" : null, - totalTokens: billCancelled - ? ( - cancelledCosts?.promptTokens ?? estimatedPromptTokens - )?.toString() - : null, - reasoningTokens: null, - cachedTokens: null, - hasError: false, - streamed: true, - canceled: true, - errorDetails: null, - inputCost: cancelledCosts?.inputCost ?? null, - outputCost: cancelledCosts?.outputCost ?? null, - cachedInputCost: cancelledCosts?.cachedInputCost ?? null, - requestCost: cancelledCosts?.requestCost ?? null, - webSearchCost: cancelledCosts?.webSearchCost ?? null, - imageInputTokens: - cancelledCosts?.imageInputTokens?.toString() ?? null, - imageOutputTokens: - cancelledCosts?.imageOutputTokens?.toString() ?? null, - imageInputCost: cancelledCosts?.imageInputCost ?? null, - imageOutputCost: cancelledCosts?.imageOutputCost ?? null, - cost: cancelledCosts?.totalCost ?? null, - estimatedCost: cancelledCosts?.estimatedCost ?? false, - discount: cancelledCosts?.discount ?? null, - dataStorageCost: billCancelled - ? calculateDataStorageCost( - cancelledCosts?.promptTokens ?? 
estimatedPromptTokens, - null, - 0, - null, - retentionLevel, - ) - : "0", - cached: false, - toolResults: null, - }, + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping, + usedProvider, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + null, // No response for canceled request + requestBody, // The request that was sent before cancellation + null, // No upstream response for canceled request + canceledPluginIds, + undefined, // No plugin results for canceled request ); - // Send a cancellation event to the client - await writeSSEAndCache({ + await insertLogEntry({ + ...baseLogEntry, + duration: Date.now() - perAttemptStartTime, + timeToFirstToken: null, // Not applicable for canceled request + timeToFirstReasoningToken: null, // Not applicable for canceled request + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "canceled", + promptTokens: billCancelled + ? ( + cancelledCosts?.promptTokens ?? estimatedPromptTokens + )?.toString() + : null, + completionTokens: billCancelled ? "0" : null, + totalTokens: billCancelled + ? ( + cancelledCosts?.promptTokens ?? estimatedPromptTokens + )?.toString() + : null, + reasoningTokens: null, + cachedTokens: null, + hasError: false, + streamed: true, + canceled: true, + errorDetails: null, + inputCost: cancelledCosts?.inputCost ?? null, + outputCost: cancelledCosts?.outputCost ?? null, + cachedInputCost: cancelledCosts?.cachedInputCost ?? null, + requestCost: cancelledCosts?.requestCost ?? null, + webSearchCost: cancelledCosts?.webSearchCost ?? null, + imageInputTokens: + cancelledCosts?.imageInputTokens?.toString() ?? null, + imageOutputTokens: + cancelledCosts?.imageOutputTokens?.toString() ?? null, + imageInputCost: cancelledCosts?.imageInputCost ?? null, + imageOutputCost: cancelledCosts?.imageOutputCost ?? null, + cost: cancelledCosts?.totalCost ?? null, + estimatedCost: cancelledCosts?.estimatedCost ?? false, + discount: cancelledCosts?.discount ?? null, + dataStorageCost: billCancelled + ? calculateDataStorageCost( + cancelledCosts?.promptTokens ?? estimatedPromptTokens, + null, + 0, + null, + retentionLevel, + ) + : "0", + cached: false, + toolResults: null, + }); + + // Send a cancellation event to the client + await writeSSEAndCache({ event: "canceled", data: JSON.stringify({ message: "Request canceled by client", @@ -4448,6 +4725,23 @@ chat.openapi(completions, async (c) => { ), }); + // Log the error in the database + // Extract plugin IDs for logging (fetch error) + const fetchErrorPluginIds = plugins?.map((p) => p.id) ?? 
[]; + + let sameProviderRetryContext: Awaited< + ReturnType + > | null = null; + if (isRetryableErrorType("network_error")) { + rememberFailedKey(usedProvider, usedRegion, { + envVarName, + configIndex, + providerKeyId: providerKey?.id, + }); + sameProviderRetryContext = + await tryResolveAlternateKeyForCurrentProvider(true); + } + // Check if we should retry before logging so we can mark the log as retried const willRetryFetch = shouldRetryRequest({ requestedProvider, @@ -4460,93 +4754,129 @@ chat.openapi(completions, async (c) => { 1, usedProvider, }); + const willRetrySameProvider = sameProviderRetryContext !== null; + const willRetryRequest = + willRetrySameProvider || willRetryFetch; - enqueueChatLog( - c, - { - providerKeyId: providerKey?.id, - usedModel: usedModelFormatted, - usedModelMapping, - usedProvider, - requestedModel: initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoningEffort: reasoning_effort, - reasoningMaxTokens: reasoning_max_tokens, - effort, - responseFormat: response_format, - tools, - toolChoice: tool_choice, - source, - customHeaders, - debugMode, - userAgent, - imageConfig: image_config, - routingMetadata, - rawRequest: rawBody, - rawResponse: null, - upstreamRequest: requestBody, - upstreamResponse: null, - plugins: requestPluginIds, - pluginResults: undefined, - }, - { - duration: Date.now() - perAttemptStartTime, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: 0, - content: null, - reasoningContent: null, - finishReason: "upstream_error", - promptTokens: null, - completionTokens: null, - totalTokens: null, - reasoningTokens: null, - cachedTokens: null, - hasError: true, - streamed: true, - canceled: false, - errorDetails: { - statusCode: 0, - statusText: error.name, - responseText: errorMessage, - cause: fetchCause, - }, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - retried: willRetryFetch, - retriedByLogId: willRetryFetch ? 
finalLogId : null, - }, + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping, + usedProvider, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + null, // No response for fetch error + requestBody, // The request that resulted in error + null, // No upstream response for fetch error + fetchErrorPluginIds, + undefined, // No plugin results for error case ); + const attemptLogId = shortid(); - // Report key health for environment-based tokens + await insertLogEntry({ + ...baseLogEntry, + id: attemptLogId, + duration: Date.now() - perAttemptStartTime, + timeToFirstToken: null, // Not applicable for error case + timeToFirstReasoningToken: null, // Not applicable for error case + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "upstream_error", + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: true, + streamed: true, + canceled: false, + errorDetails: { + statusCode: 0, + statusText: error.name, + responseText: errorMessage, + cause: fetchCause, + }, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + retried: willRetryRequest, + retriedByLogId: willRetryRequest ? finalLogId : null, + }); + + // Report key health for the selected token source if (envVarName !== undefined) { reportKeyError(envVarName, configIndex, 0); } + if (providerKey?.id) { + reportTrackedKeyError(providerKey.id, 0); + } + + if (willRetrySameProvider && sameProviderRetryContext) { + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + 0, + getErrorType(0), + false, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: attemptLogId, + }, + ), + ); + applyResolvedProviderContext(sameProviderRetryContext); + retryAttempt--; + continue; + } if (willRetryFetch) { - routingAttempts.push({ - provider: usedProvider, - model: baseModelName, - ...(usedRegion && { region: usedRegion }), - status_code: 0, - error_type: getErrorType(0), - succeeded: false, - }); + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + 0, + getErrorType(0), + false, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: attemptLogId, + }, + ), + ); failedProviderIds.add( providerRetryKey(usedProvider, usedRegion), ); @@ -4611,6 +4941,23 @@ chat.openapi(completions, async (c) => { }); } + // Log the request in the database + // Extract plugin IDs for logging + const streamingErrorPluginIds = plugins?.map((p) => p.id) ?? 
[]; + + let sameProviderRetryContext: Awaited< + ReturnType + > | null = null; + if (isRetryableErrorType(finishReason)) { + rememberFailedKey(usedProvider, usedRegion, { + envVarName, + configIndex, + providerKeyId: providerKey?.id, + }); + sameProviderRetryContext = + await tryResolveAlternateKeyForCurrentProvider(true); + } + // Check if we should retry before logging so we can mark the log as retried const willRetryHttpError = shouldRetryRequest({ requestedProvider, @@ -4623,94 +4970,101 @@ chat.openapi(completions, async (c) => { 1, usedProvider, }); - - enqueueChatLog( - c, - { - providerKeyId: providerKey?.id, - usedModel: usedModelFormatted, - usedModelMapping, - usedProvider, - requestedModel: initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoningEffort: reasoning_effort, - reasoningMaxTokens: reasoning_max_tokens, - effort, - responseFormat: response_format, - tools, - toolChoice: tool_choice, - source, - customHeaders, - debugMode, - userAgent, - imageConfig: image_config, - routingMetadata, - rawRequest: rawBody, - rawResponse: null, - upstreamRequest: requestBody, - upstreamResponse: null, - plugins: requestPluginIds, - pluginResults: undefined, - }, - { - duration: Date.now() - perAttemptStartTime, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: errorResponseText.length, - content: null, - reasoningContent: null, - finishReason, - promptTokens: - finishReason === "content_filter" - ? ( - estimateTokens(usedProvider, messages, null, null, 0) - .calculatedPromptTokens ?? null - )?.toString() - : null, - completionTokens: null, - totalTokens: - finishReason === "content_filter" - ? ( - estimateTokens(usedProvider, messages, null, null, 0) - .calculatedPromptTokens ?? null - )?.toString() - : null, - reasoningTokens: null, - cachedTokens: null, - hasError: finishReason !== "content_filter", - streamed: true, - canceled: false, - errorDetails: - finishReason === "content_filter" - ? null - : { - statusCode: res.status, - statusText: res.statusText, - responseText: errorResponseText, - }, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - retried: willRetryHttpError, - retriedByLogId: willRetryHttpError ? 
finalLogId : null, - }, + const willRetrySameProvider = sameProviderRetryContext !== null; + const willRetryRequest = + willRetrySameProvider || willRetryHttpError; + + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping, + usedProvider, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + null, // No response for error case + requestBody, // The request that was sent and resulted in error + null, // No upstream response for error case + streamingErrorPluginIds, + undefined, // No plugin results for error case ); + const attemptLogId = shortid(); + + await insertLogEntry({ + ...baseLogEntry, + id: attemptLogId, + duration: Date.now() - perAttemptStartTime, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: errorResponseText.length, + content: null, + reasoningContent: null, + finishReason, + promptTokens: + finishReason === "content_filter" + ? ( + estimateTokens(usedProvider, messages, null, null, 0) + .calculatedPromptTokens ?? null + )?.toString() + : null, + completionTokens: null, + totalTokens: + finishReason === "content_filter" + ? ( + estimateTokens(usedProvider, messages, null, null, 0) + .calculatedPromptTokens ?? null + )?.toString() + : null, + reasoningTokens: null, + cachedTokens: null, + hasError: finishReason !== "content_filter", // content_filter is not an error + streamed: true, + canceled: false, + errorDetails: + finishReason === "content_filter" + ? null + : { + statusCode: res.status, + statusText: res.statusText, + responseText: errorResponseText, + }, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + retried: willRetryRequest, + retriedByLogId: willRetryRequest ? 
finalLogId : null, + }); - // Report key health for environment-based tokens + // Report key health for the selected token source // Don't report content_filter as a key error - it's intentional provider behavior if ( envVarName !== undefined && @@ -4723,16 +5077,49 @@ chat.openapi(completions, async (c) => { errorResponseText, ); } + if (providerKey?.id && finishReason !== "content_filter") { + reportTrackedKeyError( + providerKey.id, + res.status, + errorResponseText, + ); + } + + if (willRetrySameProvider && sameProviderRetryContext) { + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + res.status, + getErrorType(res.status), + false, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: attemptLogId, + }, + ), + ); + applyResolvedProviderContext(sameProviderRetryContext); + retryAttempt--; + continue; + } if (willRetryHttpError) { - routingAttempts.push({ - provider: usedProvider, - model: baseModelName, - ...(usedRegion && { region: usedRegion }), - status_code: res.status, - error_type: getErrorType(res.status), - succeeded: false, - }); + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + res.status, + getErrorType(res.status), + false, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: attemptLogId, + }, + ), + ); failedProviderIds.add( providerRetryKey(usedProvider, usedRegion), ); @@ -4801,19 +5188,241 @@ chat.openapi(completions, async (c) => { return; } + const inspectedStreamingResponse = + await inspectImmediateStreamingProviderError(res, usedProvider); + res = inspectedStreamingResponse.response; + if (inspectedStreamingResponse.immediateError) { + const { + errorCode, + errorMessage, + errorResponseText, + errorType, + inferredStatusCode, + statusText, + } = inspectedStreamingResponse.immediateError; + + logger.warn("Immediate streaming provider error", { + status: inferredStatusCode, + errorText: errorResponseText, + usedProvider, + requestedProvider, + usedModel, + initialRequestedModel, + organizationId: project.organizationId, + projectId: apiKey.projectId, + apiKeyId: apiKey.id, + unifiedFinishReason: getUnifiedFinishReason( + errorType, + usedProvider, + ), + }); + + const streamingErrorPluginIds = plugins?.map((p) => p.id) ?? []; + + let sameProviderRetryContext: Awaited< + ReturnType + > | null = null; + if (isRetryableErrorType(errorType)) { + rememberFailedKey(usedProvider, usedRegion, { + envVarName, + configIndex, + providerKeyId: providerKey?.id, + }); + sameProviderRetryContext = + await tryResolveAlternateKeyForCurrentProvider(true); + } + + const willRetryStreamingError = shouldRetryRequest({ + requestedProvider, + noFallback, + errorType, + retryCount: retryAttempt, + remainingProviders: + (routingMetadata?.providerScores.length ?? 
0) - + failedProviderIds.size - + 1, + usedProvider, + }); + const willRetrySameProvider = sameProviderRetryContext !== null; + const willRetryRequest = + willRetrySameProvider || willRetryStreamingError; + + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping, + usedProvider, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + null, + requestBody, + null, + streamingErrorPluginIds, + undefined, + ); + const attemptLogId = shortid(); + + await insertLogEntry({ + ...baseLogEntry, + id: attemptLogId, + duration: Date.now() - perAttemptStartTime, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: errorResponseText.length, + content: null, + reasoningContent: null, + finishReason: errorType, + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: errorType !== "content_filter", + streamed: true, + canceled: false, + errorDetails: + errorType === "content_filter" + ? null + : { + statusCode: inferredStatusCode, + statusText, + responseText: errorResponseText, + }, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + retried: willRetryRequest, + retriedByLogId: willRetryRequest ? finalLogId : null, + }); + + if (envVarName !== undefined && errorType !== "content_filter") { + reportKeyError( + envVarName, + configIndex, + inferredStatusCode, + errorResponseText, + ); + } + if (providerKey?.id && errorType !== "content_filter") { + reportTrackedKeyError( + providerKey.id, + inferredStatusCode, + errorResponseText, + ); + } + + if (willRetrySameProvider && sameProviderRetryContext) { + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + inferredStatusCode, + getErrorType(inferredStatusCode), + false, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: attemptLogId, + }, + ), + ); + applyResolvedProviderContext(sameProviderRetryContext); + retryAttempt--; + continue; + } + + if (willRetryStreamingError) { + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + inferredStatusCode, + getErrorType(inferredStatusCode), + false, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: attemptLogId, + }, + ), + ); + failedProviderIds.add( + providerRetryKey(usedProvider, usedRegion), + ); + continue; + } + + await writeSSEAndCache({ + event: "error", + data: JSON.stringify({ + error: { + message: errorMessage, + type: errorType, + code: errorCode, + param: null, + responseText: errorResponseText, + }, + }), + id: String(eventId++), + }); + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + clearKeepalive(); + return; + } + break; // Fetch succeeded, exit retry loop } // End of retry for loop // Add the final attempt (successful or last failed) to routing if (res && res.ok && usedProvider) { - routingAttempts.push({ - provider: usedProvider, - model: baseModelName, - ...(usedRegion && { region: usedRegion }), - status_code: res.status, - 
error_type: "none", - succeeded: true, - }); + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + res.status, + "none", + true, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: finalLogId, + }, + ), + ); } // Update routingMetadata with all routing attempts for DB logging @@ -4906,6 +5515,7 @@ chat.openapi(completions, async (c) => { let totalTokens = null; let reasoningTokens = null; let cachedTokens = null; + let cacheCreationTokens: number | null = null; let streamingToolCalls = null; let imageByteSize = 0; // Track total image data size for token estimation let outputImageCount = 0; // Track number of output images for cost calculation @@ -4915,6 +5525,7 @@ chat.openapi(completions, async (c) => { let sawProviderTerminalEvent = false; let sawOpenAiResponsesDoneEvent = false; let sawOpenAiResponsesCompletedStatus = false; + let sentDownstreamFinishReasonChunk = false; let handledTerminalProviderEvent = false; let buffer = ""; // Buffer for accumulating partial data across chunks (string for SSE) let binaryBuffer = new Uint8Array(0); // Buffer for binary event streams (AWS Bedrock) @@ -5328,10 +5939,70 @@ chat.openapi(completions, async (c) => { webSearchCount, project.organizationId, ); + streamingCosts.dataStorageCost = toDataStorageCostNumber( + streamingCosts.promptTokens ?? finalPromptTokens, + cachedTokens, + streamingCosts.completionTokens ?? finalCompletionTokens, + reasoningTokens, + retentionLevel, + ); // Include costs in response for all users const shouldIncludeCosts = true; + const finalStreamUsage: Record = { + prompt_tokens: Math.max( + 1, + streamingCosts.promptTokens ?? finalPromptTokens ?? 1, + ), + completion_tokens: + streamingCosts.completionTokens ?? + finalCompletionTokens ?? + 0, + total_tokens: Math.max( + 1, + (streamingCosts.promptTokens ?? + finalPromptTokens ?? + 0) + + (streamingCosts.completionTokens ?? + finalCompletionTokens ?? + 0) + + (reasoningTokens ?? 0), + ), + ...(reasoningTokens !== null && + reasoningTokens > 0 && { + reasoning_tokens: reasoningTokens, + }), + ...((cachedTokens !== null || + (cacheCreationTokens !== null && + cacheCreationTokens > 0)) && { + prompt_tokens_details: { + cached_tokens: cachedTokens ?? 0, + ...(cacheCreationTokens !== null && + cacheCreationTokens > 0 && { + cache_creation_tokens: cacheCreationTokens, + }), + }, + }), + }; + applyExtendedUsageFields(finalStreamUsage, { + costs: shouldIncludeCosts + ? { + inputCost: streamingCosts.inputCost, + outputCost: streamingCosts.outputCost, + cachedInputCost: streamingCosts.cachedInputCost, + requestCost: streamingCosts.requestCost, + webSearchCost: streamingCosts.webSearchCost, + imageInputCost: streamingCosts.imageInputCost, + imageOutputCost: streamingCosts.imageOutputCost, + totalCost: streamingCosts.totalCost, + dataStorageCost: streamingCosts.dataStorageCost, + } + : null, + cachedTokens, + cacheCreationTokens, + reasoningTokens, + }); const finalUsageChunk = { id: `chatcmpl-${Date.now()}`, object: "chat.completion.chunk", @@ -5344,35 +6015,7 @@ chat.openapi(completions, async (c) => { finish_reason: null, }, ], - usage: { - prompt_tokens: Math.max( - 1, - streamingCosts.promptTokens ?? finalPromptTokens ?? 1, - ), - completion_tokens: - streamingCosts.completionTokens ?? - finalCompletionTokens ?? - 0, - total_tokens: Math.max( - 1, - (streamingCosts.promptTokens ?? - finalPromptTokens ?? - 0) + - (streamingCosts.completionTokens ?? - finalCompletionTokens ?? - 0) + - (reasoningTokens ?? 
0), - ), - ...(shouldIncludeCosts && { - cost_usd_total: streamingCosts.totalCost, - cost_usd_input: streamingCosts.inputCost, - cost_usd_output: streamingCosts.outputCost, - cost_usd_cached_input: streamingCosts.cachedInputCost, - cost_usd_request: streamingCosts.requestCost, - cost_usd_image_input: streamingCosts.imageInputCost, - cost_usd_image_output: streamingCosts.imageOutputCost, - }), - }, + usage: finalStreamUsage, }; await writeSSEAndCache({ @@ -5468,6 +6111,28 @@ chat.openapi(completions, async (c) => { usedProvider === "aws-bedrock" ? extractAwsBedrockStreamError(data) : null; + if ( + data && + typeof data === "object" && + "response" in data && + data.response && + typeof data.response === "object" && + "status" in data.response && + data.response.status === "completed" + ) { + sawOpenAiResponsesCompletedStatus = true; + } + if ( + data && + typeof data === "object" && + "type" in data && + typeof data.type === "string" && + (data.type === "response.content_part.done" || + data.type === "response.output_item.done" || + data.type === "response.output_text.done") + ) { + sawOpenAiResponsesDoneEvent = true; + } const openAiCompatibleStreamError = !awsBedrockStreamError && data && @@ -5492,13 +6157,10 @@ chat.openapi(completions, async (c) => { ), ); } - const inferredStatusCode = - typeof openAiCompatibleStreamError.status_code === - "number" - ? openAiCompatibleStreamError.status_code - : typeof openAiCompatibleStreamError.status === "number" - ? openAiCompatibleStreamError.status - : 400; + const inferredStatusCode = inferStreamingErrorStatusCode( + openAiCompatibleStreamError, + errorResponseText, + ); const errorType = getFinishReasonFromError( inferredStatusCode, errorResponseText, @@ -5640,29 +6302,6 @@ chat.openapi(completions, async (c) => { continue; } - if ( - data && - typeof data === "object" && - "response" in data && - data.response && - typeof data.response === "object" && - "status" in data.response && - data.response.status === "completed" - ) { - sawOpenAiResponsesCompletedStatus = true; - } - if ( - data && - typeof data === "object" && - "type" in data && - typeof data.type === "string" && - (data.type === "response.content_part.done" || - data.type === "response.output_item.done" || - data.type === "response.output_text.done") - ) { - sawOpenAiResponsesDoneEvent = true; - } - if (splitTaggedReasoning) { const deltaContent = transformedData.choices?.[0]?.delta?.content; @@ -5886,6 +6525,8 @@ chat.openapi(completions, async (c) => { // Extract finishReason from transformedData to update tracking variable if (transformedData.choices?.[0]?.finish_reason) { finishReason = transformedData.choices[0].finish_reason; + sawProviderTerminalEvent = true; + sentDownstreamFinishReasonChunk = true; } // Extract content for logging using helper function @@ -6056,12 +6697,8 @@ chat.openapi(completions, async (c) => { } break; default: // OpenAI format - if ( - transformedData?.choices && - transformedData.choices[0]?.finish_reason - ) { - finishReason = transformedData.choices[0].finish_reason; - sawProviderTerminalEvent = true; + if (data.choices && data.choices[0]?.finish_reason) { + finishReason = data.choices[0].finish_reason; } break; } @@ -6088,6 +6725,9 @@ chat.openapi(completions, async (c) => { if (usage.cachedTokens !== null) { cachedTokens = usage.cachedTokens; } + if (usage.cacheCreationTokens !== null) { + cacheCreationTokens = usage.cacheCreationTokens; + } // Estimate tokens if not provided and we have a finish reason if (finishReason && (!promptTokens || 
!completionTokens)) { @@ -6353,10 +6993,18 @@ chat.openapi(completions, async (c) => { sawUpstreamDoneSentinel || sawProviderTerminalEvent || handledTerminalProviderEvent; + // A terminal finish reason (stop, tool_calls, length) also counts + // as a valid stream completion — some providers (e.g. MiniMax) + // send finish_reason but omit the [DONE] sentinel. + const hasTerminalFinishReason = + finishReason !== null && + finishReason !== "upstream_error" && + finishReason !== "gateway_error"; const streamEndedWithoutTerminalEvent = !streamingError && !canceled && - (!streamHasVerifiedTerminalEvent || finishReason === null); + !streamHasVerifiedTerminalEvent && + !hasTerminalFinishReason; if (streamEndedWithoutTerminalEvent) { const hasBufferedNonWhitespace = /\S/u.test(buffer); const responseText = hasBufferedNonWhitespace @@ -6427,23 +7075,14 @@ chat.openapi(completions, async (c) => { // Check if the response finished successfully but has no content, tokens, or tool calls // This indicates an empty response which should be marked as an error // Do this check BEFORE sending usage chunks to ensure proper event ordering - // Exclude content_filter responses as they are intentionally empty (blocked by provider) - // For Google, check for original finish reasons that indicate content filtering - // These include both finishReason values and promptFeedback.blockReason values - const isGoogleContentFilterStreaming = - isGoogleCompatibleProvider(usedProvider) && - (finishReason === "SAFETY" || - finishReason === "PROHIBITED_CONTENT" || - finishReason === "RECITATION" || - finishReason === "BLOCKLIST" || - finishReason === "SPII" || - finishReason === "OTHER"); + // Exclude content filter responses as they are intentionally empty. + const isContentFilterStreamingResponse = + isContentFilterFinishReason(finishReason, usedProvider); const hasEmptyResponse = !streamingError && finishReason && - finishReason !== "content_filter" && finishReason !== "incomplete" && - !isGoogleContentFilterStreaming && + !isContentFilterStreamingResponse && (!calculatedCompletionTokens || calculatedCompletionTokens === 0) && (!calculatedReasoningTokens || calculatedReasoningTokens === 0) && @@ -6507,7 +7146,44 @@ chat.openapi(completions, async (c) => { : new Error(String(sseError)), ); } - } else if (!streamingError && !doneSent) { + } else if (!streamingError && !doneSent) { + if ( + finishReason && + !sentDownstreamFinishReasonChunk && + !shouldBufferForHealing + ) { + try { + const finishChunk = { + id: `chatcmpl-${Date.now()}`, + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model: usedModel, + choices: [ + { + index: 0, + delta: {}, + finish_reason: mapFinishReasonToOpenai( + finishReason, + usedProvider, + !!streamingToolCalls && streamingToolCalls.length > 0, + ), + }, + ], + }; + + await writeSSEAndCache({ + data: JSON.stringify(finishChunk), + id: String(eventId++), + }); + sentDownstreamFinishReasonChunk = true; + } catch (error) { + logger.error( + "Error sending synthesized finish chunk", + error instanceof Error ? 
error : new Error(String(error)), + ); + } + } + // Calculate costs before sending usage chunk so we can include cost data const billCancelledRequestsEarly = shouldBillCancelledRequests(); streamingCostsEarly = @@ -6523,13 +7199,13 @@ chat.openapi(completions, async (c) => { imageInputCost: null, imageOutputCost: null, totalCost: null, - dataStorageCost: null as number | null, promptTokens: null, completionTokens: null, cachedTokens: null, estimatedCost: false, discount: undefined, pricingTier: undefined, + dataStorageCost: null as number | null, } : await calculateCosts( usedModel, @@ -6551,6 +7227,16 @@ chat.openapi(completions, async (c) => { webSearchCount, project.organizationId, ); + if (streamingCostsEarly.totalCost !== null) { + streamingCostsEarly.dataStorageCost = toDataStorageCostNumber( + streamingCostsEarly.promptTokens ?? calculatedPromptTokens, + cachedTokens, + streamingCostsEarly.completionTokens ?? + calculatedCompletionTokens, + reasoningTokens, + retentionLevel, + ); + } // Always send final usage chunk with cost data for SDK compatibility try { @@ -6585,28 +7271,46 @@ chat.openapi(completions, async (c) => { const adjCompletion = Math.round( completionTokens ?? calculatedCompletionTokens ?? 0, ); - return { + const earlyUsage: Record = { prompt_tokens: adjPrompt, completion_tokens: adjCompletion, total_tokens: Math.max( 1, Math.round(adjPrompt + adjCompletion), ), - ...(cachedTokens !== null && { + ...(reasoningTokens !== null && + reasoningTokens > 0 && { + reasoning_tokens: reasoningTokens, + }), + ...((cachedTokens !== null || + (cacheCreationTokens !== null && + cacheCreationTokens > 0)) && { prompt_tokens_details: { - cached_tokens: cachedTokens, + cached_tokens: cachedTokens ?? 0, + ...(cacheCreationTokens !== null && + cacheCreationTokens > 0 && { + cache_creation_tokens: cacheCreationTokens, + }), }, }), - cost_usd_total: streamingCostsEarly.totalCost, - cost_usd_input: streamingCostsEarly.inputCost, - cost_usd_output: streamingCostsEarly.outputCost, - cost_usd_cached_input: - streamingCostsEarly.cachedInputCost, - cost_usd_request: streamingCostsEarly.requestCost, - cost_usd_image_input: streamingCostsEarly.imageInputCost, - cost_usd_image_output: - streamingCostsEarly.imageOutputCost, }; + applyExtendedUsageFields(earlyUsage, { + costs: { + inputCost: streamingCostsEarly.inputCost, + outputCost: streamingCostsEarly.outputCost, + cachedInputCost: streamingCostsEarly.cachedInputCost, + requestCost: streamingCostsEarly.requestCost, + webSearchCost: streamingCostsEarly.webSearchCost, + imageInputCost: streamingCostsEarly.imageInputCost, + imageOutputCost: streamingCostsEarly.imageOutputCost, + totalCost: streamingCostsEarly.totalCost, + dataStorageCost: streamingCostsEarly.dataStorageCost, + }, + cachedTokens, + cacheCreationTokens, + reasoningTokens, + }); + return earlyUsage; })(), }; @@ -6680,7 +7384,11 @@ chat.openapi(completions, async (c) => { { index: 0, delta: {}, - finish_reason: finishReason ?? "stop", + finish_reason: mapFinishReasonToOpenai( + finishReason, + usedProvider, + !!streamingToolCalls && streamingToolCalls.length > 0, + ), }, ], }; @@ -6786,6 +7494,7 @@ chat.openapi(completions, async (c) => { estimatedCost: false, discount: undefined, pricingTier: undefined, + dataStorageCost: null as number | null, } : await calculateCosts( usedModel, @@ -6823,12 +7532,51 @@ chat.openapi(completions, async (c) => { } } + // Extract plugin IDs for logging + const streamingPluginIds = plugins?.map((p) => p.id) ?? 
[]; + // Determine plugin results for logging (includes healing results if applicable) const finalPluginResults = Object.keys(streamingPluginResults).length > 0 ? streamingPluginResults : undefined; + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping, + usedProvider, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + streamingError ?? streamingRawResponseData, // Raw SSE data sent back to the client + requestBody, // The request sent to the provider + streamingError ?? rawUpstreamData, // Raw streaming data received from upstream provider + streamingPluginIds, + finalPluginResults, // Plugin results including healing (if enabled) + ); + // Enhanced logging for Google models streaming to debug missing responses if (isGoogleCompatibleProvider(usedProvider)) { logger.debug("Google model streaming response completed", { @@ -6852,142 +7600,123 @@ chat.openapi(completions, async (c) => { const shouldIncludeTokensForBilling = !canceled || (canceled && billCancelledRequests); - enqueueChatLog( - c, - { - providerKeyId: providerKey?.id, - usedModel: usedModelFormatted, - usedModelMapping, - usedProvider, - requestedModel: initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoningEffort: reasoning_effort, - reasoningMaxTokens: reasoning_max_tokens, - effort, - responseFormat: response_format, - tools, - toolChoice: tool_choice, - source, - customHeaders, - debugMode, - userAgent, - imageConfig: image_config, - routingMetadata, - rawRequest: rawBody, - rawResponse: streamingError ?? streamingRawResponseData, - upstreamRequest: requestBody, - upstreamResponse: streamingError ?? rawUpstreamData, - plugins: requestPluginIds, - pluginResults: finalPluginResults, - }, - { - id: routingAttempts.length > 0 ? finalLogId : undefined, - duration, - timeToFirstToken, - timeToFirstReasoningToken, - responseSize: fullContent.length, - content: fullContent, - reasoningContent: fullReasoningContent || null, - finishReason: canceled ? "canceled" : finishReason, - promptTokens: shouldIncludeTokensForBilling - ? (calculatedPromptTokens?.toString() ?? null) - : null, - completionTokens: shouldIncludeTokensForBilling - ? (calculatedCompletionTokens?.toString() ?? null) - : null, - totalTokens: shouldIncludeTokensForBilling - ? (calculatedTotalTokens?.toString() ?? null) - : null, - reasoningTokens: shouldIncludeTokensForBilling - ? (calculatedReasoningTokens?.toString() ?? null) - : null, - cachedTokens: shouldIncludeTokensForBilling - ? (cachedTokens?.toString() ?? null) - : null, - hasError: streamingError !== null, - errorDetails: streamingError - ? { - statusCode: - typeof streamingError === "object" && - streamingError !== null && - "details" in streamingError && - typeof streamingError.details === "object" && - streamingError.details !== null && - "statusCode" in streamingError.details && - typeof streamingError.details.statusCode === "number" - ? 
streamingError.details.statusCode - : 500, - statusText: - typeof streamingError === "object" && - streamingError !== null && - "details" in streamingError && - typeof streamingError.details === "object" && - streamingError.details !== null && - "statusText" in streamingError.details && - typeof streamingError.details.statusText === "string" - ? streamingError.details.statusText - : "Streaming Error", - responseText: - typeof streamingError === "object" && - streamingError !== null && - "details" in streamingError && - typeof streamingError.details === "object" && - streamingError.details !== null && - "responseText" in streamingError.details && - typeof streamingError.details.responseText === "string" - ? streamingError.details.responseText - : typeof streamingError === "object" && - streamingError !== null && - "details" in streamingError - ? JSON.stringify(streamingError) - : streamingError instanceof Error - ? streamingError.message - : String(streamingError), - } - : null, - streamed: true, - canceled: canceled, - inputCost: costs.inputCost, - outputCost: costs.outputCost, - cachedInputCost: costs.cachedInputCost, - requestCost: costs.requestCost, - webSearchCost: costs.webSearchCost, - imageInputTokens: costs.imageInputTokens?.toString() ?? null, - imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, - imageInputCost: costs.imageInputCost ?? null, - imageOutputCost: costs.imageOutputCost ?? null, - cost: costs.totalCost, - estimatedCost: costs.estimatedCost, - discount: costs.discount, - pricingTier: costs.pricingTier, - dataStorageCost: shouldIncludeTokensForBilling - ? calculateDataStorageCost( - calculatedPromptTokens, - cachedTokens, - calculatedCompletionTokens, - calculatedReasoningTokens, - retentionLevel, - ) - : "0", - cached: false, - toolResults: streamingToolCalls, - }, - ); + const streamingErrorStatusCode = + typeof streamingError === "object" && + streamingError !== null && + "details" in streamingError && + typeof streamingError.details === "object" && + streamingError.details !== null && + "statusCode" in streamingError.details && + typeof streamingError.details.statusCode === "number" + ? streamingError.details.statusCode + : 500; + + await insertLogEntry({ + ...baseLogEntry, + id: routingAttempts.length > 0 ? finalLogId : undefined, + duration, + timeToFirstToken, + timeToFirstReasoningToken, + responseSize: fullContent.length, + content: fullContent, + reasoningContent: fullReasoningContent || null, + finishReason: canceled ? "canceled" : finishReason, + promptTokens: shouldIncludeTokensForBilling + ? (calculatedPromptTokens?.toString() ?? null) + : null, + completionTokens: shouldIncludeTokensForBilling + ? (calculatedCompletionTokens?.toString() ?? null) + : null, + totalTokens: shouldIncludeTokensForBilling + ? (calculatedTotalTokens?.toString() ?? null) + : null, + reasoningTokens: shouldIncludeTokensForBilling + ? (calculatedReasoningTokens?.toString() ?? null) + : null, + cachedTokens: shouldIncludeTokensForBilling + ? (cachedTokens?.toString() ?? null) + : null, + hasError: streamingError !== null, + errorDetails: streamingError + ? { + statusCode: streamingErrorStatusCode, + statusText: + typeof streamingError === "object" && + streamingError !== null && + "details" in streamingError && + typeof streamingError.details === "object" && + streamingError.details !== null && + "statusText" in streamingError.details && + typeof streamingError.details.statusText === "string" + ? 
streamingError.details.statusText + : "Streaming Error", + responseText: + typeof streamingError === "object" && + streamingError !== null && + "details" in streamingError && + typeof streamingError.details === "object" && + streamingError.details !== null && + "responseText" in streamingError.details && + typeof streamingError.details.responseText === "string" + ? streamingError.details.responseText + : typeof streamingError === "object" && + streamingError !== null && + "details" in streamingError + ? JSON.stringify(streamingError) + : streamingError instanceof Error + ? streamingError.message + : String(streamingError), + } + : null, + streamed: true, + canceled: canceled, + inputCost: costs.inputCost, + outputCost: costs.outputCost, + cachedInputCost: costs.cachedInputCost, + requestCost: costs.requestCost, + webSearchCost: costs.webSearchCost, + imageInputTokens: costs.imageInputTokens?.toString() ?? null, + imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, + imageInputCost: costs.imageInputCost ?? null, + imageOutputCost: costs.imageOutputCost ?? null, + cost: costs.totalCost, + estimatedCost: costs.estimatedCost, + discount: costs.discount, + pricingTier: costs.pricingTier, + dataStorageCost: shouldIncludeTokensForBilling + ? calculateDataStorageCost( + calculatedPromptTokens, + cachedTokens, + calculatedCompletionTokens, + calculatedReasoningTokens, + retentionLevel, + ) + : "0", + cached: false, + tools, + toolResults: streamingToolCalls, + toolChoice: tool_choice, + }); - // Report key health for environment-based tokens + // Report key health for the selected token source if (envVarName !== undefined) { if (streamingError !== null) { - reportKeyError(envVarName, configIndex, 500); + reportKeyError( + envVarName, + configIndex, + streamingErrorStatusCode, + ); } else { reportKeySuccess(envVarName, configIndex); } } + if (providerKey?.id) { + if (streamingError !== null) { + reportTrackedKeyError(providerKey.id, streamingErrorStatusCode); + } else { + reportTrackedKeySuccess(providerKey.id); + } + } // Save streaming cache if enabled and not canceled and no errors if ( @@ -7041,6 +7770,7 @@ chat.openapi(completions, async (c) => { } else { logger.error("Streaming request error (escaped handler)", error); } + finishStreamCompletion(c); }, ); } @@ -7115,65 +7845,8 @@ chat.openapi(completions, async (c) => { } try { - const ctx = await resolveProviderContext( - nextProvider, - { - mode: project.mode, - organizationId: project.organizationId, - }, - { - id: organization.id, - credits: organization.credits, - devPlan: organization.devPlan, - devPlanCreditsLimit: organization.devPlanCreditsLimit, - devPlanCreditsUsed: organization.devPlanCreditsUsed, - devPlanExpiresAt: organization.devPlanExpiresAt, - }, - modelInfo, - originalRequestParams, - { - requestId, - stream, - effectiveStream, - messages: messages as BaseMessage[], - response_format, - tools, - tool_choice, - reasoning_effort, - reasoning_max_tokens, - effort, - webSearchTool, - image_config, - sensitive_word_check, - maxImageSizeMB, - userPlan, - hasExistingToolCalls, - customProviderName, - webSearchEnabled: !!webSearchTool, - }, - ); - usedProvider = ctx.usedProvider; - usedModel = ctx.usedModel; - usedModelFormatted = ctx.usedModelFormatted; - usedModelMapping = ctx.usedModelMapping; - baseModelName = ctx.baseModelName; - usedToken = ctx.usedToken; - providerKey = ctx.providerKey; - configIndex = ctx.configIndex; - envVarName = ctx.envVarName; - url = ctx.url; - requestBody = ctx.requestBody; - 
useResponsesApi = ctx.useResponsesApi;
-			requestCanBeCanceled = ctx.requestCanBeCanceled;
-			isImageGeneration = ctx.isImageGeneration;
-			supportsReasoning = ctx.supportsReasoning;
-			splitTaggedReasoning = ctx.splitTaggedReasoning ?? false;
-			temperature = ctx.temperature;
-			max_tokens = ctx.max_tokens;
-			top_p = ctx.top_p;
-			frequency_penalty = ctx.frequency_penalty;
-			presence_penalty = ctx.presence_penalty;
-			usedRegion = ctx.usedRegion;
+			const ctx = await resolveProviderContextForRetry(nextProvider, stream);
+			applyResolvedProviderContext(ctx);
 		} catch {
 			failedProviderIds.add(
 				providerRetryKey(nextProvider.providerId, nextProvider.region),
@@ -7192,9 +7865,12 @@ chat.openapi(completions, async (c) => {

 		try {
 			const headers = getProviderHeaders(usedProvider, usedToken, {
+				requestId,
 				webSearchEnabled: !!webSearchTool,
 			});
-			headers["Content-Type"] = "application/json";
+			if (!(requestBody instanceof FormData)) {
+				headers["Content-Type"] = "application/json";
+			}

 			// Add effort beta header for Anthropic if effort parameter is specified
 			if (usedProvider === "anthropic" && effort !== undefined) {
@@ -7224,7 +7900,10 @@ chat.openapi(completions, async (c) => {
 				res = await fetch(url, {
 					method: "POST",
 					headers,
-					body: JSON.stringify(requestBody),
+					body:
+						requestBody instanceof FormData
+							? requestBody
+							: JSON.stringify(requestBody),
 					signal: fetchSignal,
 				});
 			} catch (error) {
@@ -7267,7 +7946,24 @@ chat.openapi(completions, async (c) => {
 					),
 				});

+				// Log the error in the database
+				// Extract plugin IDs for logging (non-streaming fetch error)
+				const nonStreamingFetchErrorPluginIds = plugins?.map((p) => p.id) ?? [];
+
 				// Check if we should retry before logging so we can mark the log as retried
+				let sameProviderRetryContext: Awaited<
+					ReturnType<typeof tryResolveAlternateKeyForCurrentProvider>
+				> | null = null;
+				if (isRetryableErrorType("network_error")) {
+					rememberFailedKey(usedProvider, usedRegion, {
+						envVarName,
+						configIndex,
+						providerKeyId: providerKey?.id,
+					});
+					sameProviderRetryContext =
+						await tryResolveAlternateKeyForCurrentProvider(stream);
+				}
+
 				const willRetryFetchNonStreaming = shouldRetryRequest({
 					requestedProvider,
 					noFallback,
@@ -7279,94 +7975,130 @@ chat.openapi(completions, async (c) => {
 					1,
 					usedProvider,
 				});
+				const willRetrySameProvider = sameProviderRetryContext !== null;
+				const willRetryRequest =
+					willRetrySameProvider || willRetryFetchNonStreaming;

-				enqueueChatLog(
-					c,
-					{
-						providerKeyId: providerKey?.id,
-						usedModel: usedModelFormatted,
-						usedModelMapping,
-						usedProvider,
-						requestedModel: initialRequestedModel,
-						requestedProvider,
-						messages,
-						temperature,
-						max_tokens,
-						top_p,
-						frequency_penalty,
-						presence_penalty,
-						reasoningEffort: reasoning_effort,
-						reasoningMaxTokens: reasoning_max_tokens,
-						effort,
-						responseFormat: response_format,
-						tools,
-						toolChoice: tool_choice,
-						source,
-						customHeaders,
-						debugMode,
-						userAgent,
-						imageConfig: image_config,
-						routingMetadata,
-						rawRequest: rawBody,
-						rawResponse: null,
-						upstreamRequest: requestBody,
-						upstreamResponse: null,
-						plugins: requestPluginIds,
-						pluginResults: undefined,
-					},
-					{
-						duration: perAttemptDuration,
-						timeToFirstToken: null,
-						timeToFirstReasoningToken: null,
-						responseSize: 0,
-						content: null,
-						reasoningContent: null,
-						finishReason: "upstream_error",
-						promptTokens: null,
-						completionTokens: null,
-						totalTokens: null,
-						reasoningTokens: null,
-						cachedTokens: null,
-						hasError: true,
-						streamed: false,
-						canceled: false,
-						errorDetails: {
-							statusCode: 0,
-							statusText: fetchError.name,
-							responseText: errorMessage,
-							cause:
nonStreamingFetchCause, - }, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - estimatedCost: false, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - retried: willRetryFetchNonStreaming, - retriedByLogId: willRetryFetchNonStreaming ? finalLogId : null, - }, + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping, + usedProvider, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + null, // No response for fetch error + requestBody, // The request that resulted in error + null, // No upstream response for fetch error + nonStreamingFetchErrorPluginIds, + undefined, // No plugin results for error case ); + const attemptLogId = shortid(); + + await insertLogEntry({ + ...baseLogEntry, + id: attemptLogId, + duration: perAttemptDuration, + timeToFirstToken: null, // Not applicable for error case + timeToFirstReasoningToken: null, // Not applicable for error case + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "upstream_error", + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: true, + streamed: false, + canceled: false, + errorDetails: { + statusCode: 0, + statusText: fetchError.name, + responseText: errorMessage, + cause: nonStreamingFetchCause, + }, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + estimatedCost: false, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + retried: willRetryRequest, + retriedByLogId: willRetryRequest ? 
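+					// tie this failed attempt to the final log entry the retry will write under finalLogId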
finalLogId : null, + }); - // Report key health for environment-based tokens + // Report key health for the selected token source if (envVarName !== undefined) { reportKeyError(envVarName, configIndex, 0); } + if (providerKey?.id) { + reportTrackedKeyError(providerKey.id, 0); + } + + if (willRetrySameProvider && sameProviderRetryContext) { + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + 0, + getErrorType(0), + false, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: attemptLogId, + }, + ), + ); + applyResolvedProviderContext(sameProviderRetryContext); + retryAttempt--; + continue; + } if (willRetryFetchNonStreaming) { - routingAttempts.push({ - provider: usedProvider, - model: baseModelName, - ...(usedRegion && { region: usedRegion }), - status_code: 0, - error_type: getErrorType(0), - succeeded: false, - }); + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + 0, + getErrorType(0), + false, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: attemptLogId, + }, + ), + ); failedProviderIds.add(providerRetryKey(usedProvider, usedRegion)); continue; } @@ -7393,6 +8125,10 @@ chat.openapi(completions, async (c) => { // If the request was canceled, log it and return a response if (canceled) { + // Log the canceled request + // Extract plugin IDs for logging (canceled non-streaming) + const canceledNonStreamingPluginIds = plugins?.map((p) => p.id) ?? []; + // Calculate costs for cancelled request if billing is enabled const billCancelled = shouldBillCancelledRequests(); let cancelledCosts: Awaited> | null = @@ -7433,93 +8169,90 @@ chat.openapi(completions, async (c) => { ); } - enqueueChatLog( - c, - { - providerKeyId: providerKey?.id, - usedModel: usedModelFormatted, - usedModelMapping, - usedProvider, - requestedModel: initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoningEffort: reasoning_effort, - reasoningMaxTokens: reasoning_max_tokens, - effort, - responseFormat: response_format, - tools, - toolChoice: tool_choice, - source, - customHeaders, - debugMode, - userAgent, - imageConfig: image_config, - routingMetadata, - rawRequest: rawBody, - rawResponse: null, - upstreamRequest: requestBody, - upstreamResponse: null, - plugins: requestPluginIds, - pluginResults: undefined, - }, - { - duration, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: 0, - content: null, - reasoningContent: null, - finishReason: "canceled", - promptTokens: billCancelled - ? ( - cancelledCosts?.promptTokens ?? estimatedPromptTokens - )?.toString() - : null, - completionTokens: billCancelled ? "0" : null, - totalTokens: billCancelled - ? ( - cancelledCosts?.promptTokens ?? estimatedPromptTokens - )?.toString() - : null, - reasoningTokens: null, - cachedTokens: null, - hasError: false, - streamed: false, - canceled: true, - errorDetails: null, - inputCost: cancelledCosts?.inputCost ?? null, - outputCost: cancelledCosts?.outputCost ?? null, - cachedInputCost: cancelledCosts?.cachedInputCost ?? null, - requestCost: cancelledCosts?.requestCost ?? null, - webSearchCost: cancelledCosts?.webSearchCost ?? null, - imageInputTokens: - cancelledCosts?.imageInputTokens?.toString() ?? null, - imageOutputTokens: - cancelledCosts?.imageOutputTokens?.toString() ?? null, - imageInputCost: cancelledCosts?.imageInputCost ?? null, - imageOutputCost: cancelledCosts?.imageOutputCost ?? null, - cost: cancelledCosts?.totalCost ?? 
null, - estimatedCost: cancelledCosts?.estimatedCost ?? false, - discount: cancelledCosts?.discount ?? null, - dataStorageCost: billCancelled - ? calculateDataStorageCost( - cancelledCosts?.promptTokens ?? estimatedPromptTokens, - null, - 0, - null, - retentionLevel, - ) - : "0", - cached: false, - toolResults: null, - }, + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping, + usedProvider, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + null, // No response for canceled request + requestBody, // The request that was prepared before cancellation + null, // No upstream response for canceled request + canceledNonStreamingPluginIds, + undefined, // No plugin results for canceled request ); + await insertLogEntry({ + ...baseLogEntry, + duration, + timeToFirstToken: null, // Not applicable for canceled request + timeToFirstReasoningToken: null, // Not applicable for canceled request + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "canceled", + promptTokens: billCancelled + ? (cancelledCosts?.promptTokens ?? estimatedPromptTokens)?.toString() + : null, + completionTokens: billCancelled ? "0" : null, + totalTokens: billCancelled + ? (cancelledCosts?.promptTokens ?? estimatedPromptTokens)?.toString() + : null, + reasoningTokens: null, + cachedTokens: null, + hasError: false, + streamed: false, + canceled: true, + errorDetails: null, + inputCost: cancelledCosts?.inputCost ?? null, + outputCost: cancelledCosts?.outputCost ?? null, + cachedInputCost: cancelledCosts?.cachedInputCost ?? null, + requestCost: cancelledCosts?.requestCost ?? null, + webSearchCost: cancelledCosts?.webSearchCost ?? null, + imageInputTokens: cancelledCosts?.imageInputTokens?.toString() ?? null, + imageOutputTokens: + cancelledCosts?.imageOutputTokens?.toString() ?? null, + imageInputCost: cancelledCosts?.imageInputCost ?? null, + imageOutputCost: cancelledCosts?.imageOutputCost ?? null, + cost: cancelledCosts?.totalCost ?? null, + estimatedCost: cancelledCosts?.estimatedCost ?? false, + discount: cancelledCosts?.discount ?? null, + dataStorageCost: billCancelled + ? calculateDataStorageCost( + cancelledCosts?.promptTokens ?? 
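+							// fall back to the locally estimated prompt tokens when cancelled-cost calculation was skipped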
estimatedPromptTokens, + null, + 0, + null, + retentionLevel, + ) + : "0", + cached: false, + toolResults: null, + }); + return c.json( { error: { @@ -7561,77 +8294,80 @@ chat.openapi(completions, async (c) => { ), }); - enqueueChatLog( - c, - { - providerKeyId: providerKey?.id, - usedModel: usedModelFormatted, - usedModelMapping, - usedProvider, - requestedModel: initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoningEffort: reasoning_effort, - reasoningMaxTokens: reasoning_max_tokens, - effort, - responseFormat: response_format, - tools, - toolChoice: tool_choice, - source, - customHeaders, - debugMode, - userAgent, - imageConfig: image_config, - routingMetadata, - rawRequest: rawBody, - rawResponse: null, - upstreamRequest: requestBody, - upstreamResponse: null, - plugins: requestPluginIds, - pluginResults: undefined, - }, - { - duration: Date.now() - perAttemptStartTime, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: 0, - content: null, - reasoningContent: null, - finishReason: "upstream_error", - promptTokens: null, - completionTokens: null, - totalTokens: null, - reasoningTokens: null, - cachedTokens: null, - hasError: true, - streamed: false, - canceled: false, - errorDetails: { - statusCode: res.status, - statusText: "TimeoutError", - responseText: errorMessage, - cause: bodyErrorCause, - }, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - estimatedCost: false, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - }, + const bodyTimeoutPluginIds = plugins?.map((p) => p.id) ?? []; + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping!, + usedProvider!, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + null, + requestBody, + null, + bodyTimeoutPluginIds, + undefined, ); + await insertLogEntry({ + ...baseLogEntry, + duration: Date.now() - perAttemptStartTime, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "upstream_error", + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: true, + streamed: false, + canceled: false, + errorDetails: { + statusCode: res.status, + statusText: "TimeoutError", + responseText: errorMessage, + cause: bodyErrorCause, + }, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + estimatedCost: false, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + }); + return c.json( { error: { @@ -7674,6 +8410,23 @@ chat.openapi(completions, async (c) => { }); } + // Log the request in the database + // Extract plugin IDs for logging + const providerErrorPluginIds = plugins?.map((p) => p.id) ?? 
[];
+
+				let sameProviderRetryContext: Awaited<
+					ReturnType<typeof tryResolveAlternateKeyForCurrentProvider>
+				> | null = null;
+				if (isRetryableErrorType(finishReason)) {
+					rememberFailedKey(usedProvider, usedRegion, {
+						envVarName,
+						configIndex,
+						providerKeyId: providerKey?.id,
+					});
+					sameProviderRetryContext =
+						await tryResolveAlternateKeyForCurrentProvider(stream);
+				}
+
 				// Check if we should retry before logging so we can mark the log as retried
 				const willRetryHttpNonStreaming = shouldRetryRequest({
 					requestedProvider,
@@ -7686,112 +8439,150 @@ chat.openapi(completions, async (c) => {
 					1,
 					usedProvider,
 				});
+				const willRetrySameProvider = sameProviderRetryContext !== null;
+				const willRetryRequest =
+					willRetrySameProvider || willRetryHttpNonStreaming;

-				enqueueChatLog(
-					c,
-					{
-						providerKeyId: providerKey?.id,
-						usedModel: usedModelFormatted,
-						usedModelMapping,
-						usedProvider,
-						requestedModel: initialRequestedModel,
-						requestedProvider,
-						messages,
-						temperature,
-						max_tokens,
-						top_p,
-						frequency_penalty,
-						presence_penalty,
-						reasoningEffort: reasoning_effort,
-						reasoningMaxTokens: reasoning_max_tokens,
-						effort,
-						responseFormat: response_format,
-						tools,
-						toolChoice: tool_choice,
-						source,
-						customHeaders,
-						debugMode,
-						userAgent,
-						imageConfig: image_config,
-						routingMetadata,
-						rawRequest: rawBody,
-						rawResponse: errorResponseText,
-						upstreamRequest: requestBody,
-						upstreamResponse: errorResponseText,
-						plugins: requestPluginIds,
-						pluginResults: undefined,
-					},
-					{
-						duration: perAttemptDuration,
-						timeToFirstToken: null,
-						timeToFirstReasoningToken: null,
-						responseSize: errorResponseText.length,
-						content: null,
-						reasoningContent: null,
-						finishReason,
-						promptTokens: null,
-						completionTokens: null,
-						totalTokens: null,
-						reasoningTokens: null,
-						cachedTokens: null,
-						hasError: finishReason !== "content_filter",
-						streamed: false,
-						canceled: false,
-						errorDetails: (() => {
-							if (finishReason === "content_filter") {
-								return null;
-							}
-							if (finishReason === "client_error") {
-								try {
-									const originalError = JSON.parse(errorResponseText);
-									return {
-										statusCode: res.status,
-										statusText: res.statusText,
-										responseText: errorResponseText,
-										message: originalError.error?.message ?? errorResponseText,
-									};
-								} catch {
-									// If parsing fails, use default format
-								}
-							}
-							return {
-								statusCode: res.status,
-								statusText: res.statusText,
-								responseText: errorResponseText,
-							};
-						})(),
-						cachedInputCost: null,
-						requestCost: null,
-						webSearchCost: null,
-						imageInputTokens: null,
-						imageOutputTokens: null,
-						imageInputCost: null,
-						imageOutputCost: null,
-						estimatedCost: false,
-						discount: null,
-						dataStorageCost: "0",
-						cached: false,
-						toolResults: null,
-						retried: willRetryHttpNonStreaming,
-						retriedByLogId: willRetryHttpNonStreaming ?
finalLogId : null, - }, + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping, + usedProvider, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + errorResponseText, // Our formatted error response + requestBody, // The request that resulted in error + errorResponseText, // Raw upstream error response + providerErrorPluginIds, + undefined, // No plugin results for error case ); + const attemptLogId = shortid(); + + await insertLogEntry({ + ...baseLogEntry, + id: attemptLogId, + duration: perAttemptDuration, + timeToFirstToken: null, // Not applicable for error case + timeToFirstReasoningToken: null, // Not applicable for error case + responseSize: errorResponseText.length, + content: null, + reasoningContent: null, + finishReason, + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: finishReason !== "content_filter", // content_filter is not an error + streamed: false, + canceled: false, + errorDetails: (() => { + // content_filter is not an error, no error details needed + if (finishReason === "content_filter") { + return null; + } + // For client errors, try to parse the original error and include the message + if (finishReason === "client_error") { + try { + const originalError = JSON.parse(errorResponseText); + return { + statusCode: res.status, + statusText: res.statusText, + responseText: errorResponseText, + message: originalError.error?.message ?? errorResponseText, + }; + } catch { + // If parsing fails, use default format + } + } + return { + statusCode: res.status, + statusText: res.statusText, + responseText: errorResponseText, + }; + })(), + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + estimatedCost: false, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + retried: willRetryRequest, + retriedByLogId: willRetryRequest ? 
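+					// willRetryRequest covers both same-provider key rotation and cross-provider fallback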
finalLogId : null, + }); - // Report key health for environment-based tokens + // Report key health for the selected token source // Don't report content_filter as a key error - it's intentional provider behavior if (envVarName !== undefined && finishReason !== "content_filter") { reportKeyError(envVarName, configIndex, res.status, errorResponseText); } + if (providerKey?.id && finishReason !== "content_filter") { + reportTrackedKeyError(providerKey.id, res.status, errorResponseText); + } + + if (willRetrySameProvider && sameProviderRetryContext) { + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + res.status, + getErrorType(res.status), + false, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: attemptLogId, + }, + ), + ); + applyResolvedProviderContext(sameProviderRetryContext); + retryAttempt--; + continue; + } if (willRetryHttpNonStreaming) { - routingAttempts.push({ - provider: usedProvider, - model: baseModelName, - ...(usedRegion && { region: usedRegion }), - status_code: res.status, - error_type: getErrorType(res.status), - succeeded: false, - }); + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + res.status, + getErrorType(res.status), + false, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: attemptLogId, + }, + ), + ); failedProviderIds.add(providerRetryKey(usedProvider, usedRegion)); continue; } @@ -7820,6 +8611,7 @@ chat.openapi(completions, async (c) => { total_tokens: 0, }, metadata: { + request_id: requestId, requested_model: initialRequestedModel, requested_provider: requestedProvider, used_model: baseModelName, @@ -7864,14 +8656,20 @@ chat.openapi(completions, async (c) => { // Add the final attempt (successful or last failed) to routing if (res && res.ok && usedProvider) { - routingAttempts.push({ - provider: usedProvider, - model: baseModelName, - ...(usedRegion && { region: usedRegion }), - status_code: res.status, - error_type: "none", - succeeded: true, - }); + routingAttempts.push( + buildRoutingAttempt( + usedProvider, + baseModelName, + res.status, + "none", + true, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: finalLogId, + }, + ), + ); } // Update routingMetadata with all routing attempts for DB logging @@ -7920,7 +8718,92 @@ chat.openapi(completions, async (c) => { let json: any; try { - json = await res.json(); + if (forceStream && res.body) { + // Stream-only model: upstream returned SSE but client expects JSON. + // Read the full stream and assemble a non-streaming response. + const text = await res.text(); + const lines = text.split("\n"); + let content = ""; + const toolCalls: any[] = []; + let finishReason: string | null = null; + let usage: any = null; + let responseId = ""; + let model = ""; + let created = 0; + + for (const line of lines) { + if (!line.startsWith("data: ") || line === "data: [DONE]") { + continue; + } + try { + const chunk = JSON.parse(line.slice(6)); + if (!responseId && chunk.id) { + responseId = chunk.id; + } + if (!model && chunk.model) { + model = chunk.model; + } + if (!created && chunk.created) { + created = chunk.created; + } + const delta = chunk.choices?.[0]?.delta; + if (delta?.content) { + content += delta.content; + } + if (delta?.tool_calls) { + for (const tc of delta.tool_calls) { + const idx = tc.index ?? 0; + if (!toolCalls[idx]) { + toolCalls[idx] = { + id: tc.id ?? "", + type: tc.type ?? "function", + function: { name: tc.function?.name ?? 
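+										// the function name may only arrive in a later delta; start empty and patch it below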
"", arguments: "" }, + }; + } else { + if (tc.id) { + toolCalls[idx].id = tc.id; + } + if (tc.function?.name) { + toolCalls[idx].function.name = tc.function.name; + } + } + if (tc.function?.arguments) { + toolCalls[idx].function.arguments += tc.function.arguments; + } + } + } + if (chunk.choices?.[0]?.finish_reason) { + finishReason = chunk.choices[0].finish_reason; + } + if (chunk.usage) { + usage = chunk.usage; + } + } catch { + // skip unparseable lines + } + } + + json = { + id: responseId, + object: "chat.completion", + created, + model, + choices: [ + { + index: 0, + message: { + role: "assistant", + content: content || null, + ...(toolCalls.length > 0 ? { tool_calls: toolCalls } : {}), + }, + finish_reason: finishReason ?? "stop", + }, + ], + ...(usage ? { usage } : {}), + }; + } else { + json = await res.json(); + } } catch (bodyError) { if (isTimeoutError(bodyError)) { const errorMessage = @@ -7939,77 +8822,80 @@ chat.openapi(completions, async (c) => { ), }); - enqueueChatLog( - c, - { - providerKeyId: providerKey?.id, - usedModel: usedModelFormatted, - usedModelMapping, - usedProvider, - requestedModel: initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoningEffort: reasoning_effort, - reasoningMaxTokens: reasoning_max_tokens, - effort, - responseFormat: response_format, - tools, - toolChoice: tool_choice, - source, - customHeaders, - debugMode, - userAgent, - imageConfig: image_config, - routingMetadata, - rawRequest: rawBody, - rawResponse: null, - upstreamRequest: requestBody, - upstreamResponse: null, - plugins: requestPluginIds, - pluginResults: undefined, - }, - { - duration: Date.now() - startTime, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: 0, - content: null, - reasoningContent: null, - finishReason: "upstream_error", - promptTokens: null, - completionTokens: null, - totalTokens: null, - reasoningTokens: null, - cachedTokens: null, - hasError: true, - streamed: false, - canceled: false, - errorDetails: { - statusCode: res.status, - statusText: "TimeoutError", - responseText: errorMessage, - cause: bodyReadCause, - }, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - estimatedCost: false, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - }, + const bodyTimeoutPluginIds = plugins?.map((p) => p.id) ?? 
[]; + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted!, + usedModelMapping, + usedProvider, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + null, + requestBody, + null, + bodyTimeoutPluginIds, + undefined, ); + await insertLogEntry({ + ...baseLogEntry, + duration: Date.now() - startTime, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: 0, + content: null, + reasoningContent: null, + finishReason: "upstream_error", + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: true, + streamed: false, + canceled: false, + errorDetails: { + statusCode: res.status, + statusText: "TimeoutError", + responseText: errorMessage, + cause: bodyReadCause, + }, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + estimatedCost: false, + discount: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + }); + return c.json( { error: { @@ -8050,6 +8936,7 @@ chat.openapi(completions, async (c) => { completionTokens, reasoningTokens, cachedTokens, + cacheCreationTokens, toolResults, images, annotations, @@ -8176,6 +9063,13 @@ chat.openapi(completions, async (c) => { webSearchCount, project.organizationId, ); + costs.dataStorageCost = toDataStorageCostNumber( + costs.promptTokens ?? calculatedPromptTokens, + cachedTokens, + costs.completionTokens ?? calculatedCompletionTokens, + calculatedReasoningTokens, + retentionLevel, + ); // Use costs.promptTokens as canonical value (includes image input // tokens for providers that exclude them from upstream usage) @@ -8224,31 +9118,66 @@ chat.openapi(completions, async (c) => { imageInputCost: costs.imageInputCost, imageOutputCost: costs.imageOutputCost, totalCost: costs.totalCost, + dataStorageCost: costs.dataStorageCost, } : null, false, // showUpgradeMessage - never show since Pro plan is removed annotations, routingAttempts.length > 0 ? routingAttempts : null, + requestId, usedRegion, + cacheCreationTokens, + ); + + // Extract plugin IDs for logging + const pluginIds = plugins?.map((p) => p.id) ?? []; + + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping, + usedProvider, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + transformedResponse, // Our formatted response that we return to user + requestBody, // The request sent to the provider + json, // Raw upstream response from provider + pluginIds, + Object.keys(pluginResults).length > 0 ? 
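+			// persist plugin results only when at least one plugin produced output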
pluginResults : undefined, ); // Check if the non-streaming response is empty (no content, tokens, or tool calls) - // Exclude content_filter responses as they are intentionally empty (blocked by provider) - // For Google, check for original finish reasons that indicate content filtering - // These include both finishReason values and promptFeedback.blockReason values - const isGoogleContentFilter = - isGoogleCompatibleProvider(usedProvider) && - (finishReason === "SAFETY" || - finishReason === "PROHIBITED_CONTENT" || - finishReason === "RECITATION" || - finishReason === "BLOCKLIST" || - finishReason === "SPII" || - finishReason === "OTHER"); + // Exclude content filter responses as they are intentionally empty. + const isContentFilterResponse = isContentFilterFinishReason( + finishReason, + usedProvider, + ); const hasEmptyNonStreamingResponse = !!finishReason && - finishReason !== "content_filter" && finishReason !== "incomplete" && - !isGoogleContentFilter && + !isContentFilterResponse && !hasMeaningfulAssistantOutput({ completionTokens: calculatedCompletionTokens, reasoningTokens: calculatedReasoningTokens, @@ -8283,105 +9212,89 @@ chat.openapi(completions, async (c) => { } } - enqueueChatLog( - c, - { - providerKeyId: providerKey?.id, - usedModel: usedModelFormatted, - usedModelMapping, - usedProvider, - requestedModel: initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoningEffort: reasoning_effort, - reasoningMaxTokens: reasoning_max_tokens, - effort, - responseFormat: response_format, - tools, - toolChoice: tool_choice, - source, - customHeaders, - debugMode, - userAgent, - imageConfig: image_config, - routingMetadata, - rawRequest: rawBody, - rawResponse: transformedResponse, - upstreamRequest: requestBody, - upstreamResponse: json, - plugins: requestPluginIds, - pluginResults: - Object.keys(pluginResults).length > 0 ? pluginResults : undefined, - }, - { - id: routingAttempts.length > 0 ? finalLogId : undefined, - duration, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize, - content: content, - reasoningContent: reasoningContent, - finishReason: hasEmptyNonStreamingResponse - ? "upstream_error" - : finishReason, - promptTokens: calculatedPromptTokens?.toString() ?? null, - completionTokens: calculatedCompletionTokens?.toString() ?? null, - totalTokens: - totalTokens ?? - ( - (calculatedPromptTokens ?? 0) + (calculatedCompletionTokens ?? 0) - ).toString(), - reasoningTokens: calculatedReasoningTokens?.toString() ?? null, - cachedTokens: cachedTokens?.toString() ?? null, - hasError: hasEmptyNonStreamingResponse, - streamed: false, - canceled: false, - errorDetails: hasEmptyNonStreamingResponse - ? { - statusCode: 500, - statusText: "Empty Response", - responseText: - "Response finished successfully but returned no content or tool calls", - } - : null, - inputCost: costs.inputCost, - outputCost: costs.outputCost, - cachedInputCost: costs.cachedInputCost, - requestCost: costs.requestCost, - webSearchCost: costs.webSearchCost, - imageInputTokens: costs.imageInputTokens?.toString() ?? null, - imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, - imageInputCost: costs.imageInputCost ?? null, - imageOutputCost: costs.imageOutputCost ?? 
null, - cost: costs.totalCost, - estimatedCost: costs.estimatedCost, - discount: costs.discount, - pricingTier: costs.pricingTier, - dataStorageCost: calculateDataStorageCost( - calculatedPromptTokens, - cachedTokens, - calculatedCompletionTokens, - calculatedReasoningTokens, - retentionLevel, - ), - cached: false, - toolResults, - }, - ); + // For image generation, store the base64 data URLs in content + // so the activity detail page can render the images + const base64Images = + convertedImages?.filter((img) => img.image_url.url.startsWith("data:")) ?? + []; + const logContent = + base64Images.length > 0 + ? base64Images.map((img) => img.image_url.url).join("\n") + : content; + + await insertLogEntry({ + ...baseLogEntry, + id: routingAttempts.length > 0 ? finalLogId : undefined, + duration, + timeToFirstToken: null, // Not applicable for non-streaming requests + timeToFirstReasoningToken: null, // Not applicable for non-streaming requests + responseSize, + content: logContent, + reasoningContent: reasoningContent, + finishReason: hasEmptyNonStreamingResponse + ? "upstream_error" + : finishReason, + promptTokens: calculatedPromptTokens?.toString() ?? null, + completionTokens: calculatedCompletionTokens?.toString() ?? null, + totalTokens: + totalTokens ?? + ( + (calculatedPromptTokens ?? 0) + (calculatedCompletionTokens ?? 0) + ).toString(), + reasoningTokens: calculatedReasoningTokens?.toString() ?? null, + cachedTokens: cachedTokens?.toString() ?? null, + hasError: hasEmptyNonStreamingResponse, + streamed: false, + canceled: false, + errorDetails: hasEmptyNonStreamingResponse + ? { + statusCode: 500, + statusText: "Empty Response", + responseText: + "Response finished successfully but returned no content or tool calls", + } + : null, + inputCost: costs.inputCost, + outputCost: costs.outputCost, + cachedInputCost: costs.cachedInputCost, + requestCost: costs.requestCost, + webSearchCost: costs.webSearchCost, + imageInputTokens: costs.imageInputTokens?.toString() ?? null, + imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, + imageInputCost: costs.imageInputCost ?? null, + imageOutputCost: costs.imageOutputCost ?? 
null, + cost: costs.totalCost, + estimatedCost: costs.estimatedCost, + discount: costs.discount, + pricingTier: costs.pricingTier, + dataStorageCost: calculateDataStorageCost( + calculatedPromptTokens, + cachedTokens, + calculatedCompletionTokens, + calculatedReasoningTokens, + retentionLevel, + ), + cached: false, + tools, + toolResults, + toolChoice: tool_choice, + }); - // Report key health for environment-based tokens + // Report key health for the selected token source // Note: We don't report empty responses as key errors since they're not upstream errors if (envVarName !== undefined) { reportKeySuccess(envVarName, configIndex); } + if (providerKey?.id) { + reportTrackedKeySuccess(providerKey.id); + } if (cachingEnabled && cacheKey && !stream && !hasEmptyNonStreamingResponse) { - await setCache(cacheKey, transformedResponse, cacheDuration); + await setCache( + cacheKey, + stripRequestScopedMetadataFromOpenAiResponse(transformedResponse), + cacheDuration, + ); } // For image generation models with streaming requested, convert to SSE format diff --git a/apps/gateway/src/chat/middleware/chat-completion-log.ts b/apps/gateway/src/chat/middleware/chat-completion-log.ts index d1da03c180..f69e6f0f76 100644 --- a/apps/gateway/src/chat/middleware/chat-completion-log.ts +++ b/apps/gateway/src/chat/middleware/chat-completion-log.ts @@ -303,6 +303,12 @@ async function flushChatCompletionLogs( internalContentFilter: state.internalContentFilter ? true : logData.internalContentFilter, + gatewayContentFilterResponse: + logData.gatewayContentFilterResponse ?? + (state.gatewayContentFilterResponse as + | LogInsertData["gatewayContentFilterResponse"] + | undefined) ?? + null, }, { syncInsert: state.syncInsert }, ); diff --git a/apps/gateway/src/chat/tools/chat-log-context.ts b/apps/gateway/src/chat/tools/chat-log-context.ts index 0e860ebe35..c9d14082f7 100644 --- a/apps/gateway/src/chat/tools/chat-log-context.ts +++ b/apps/gateway/src/chat/tools/chat-log-context.ts @@ -18,6 +18,7 @@ export interface ChatCompletionLogState { resolveStreamCompletion?: () => void; caughtError?: unknown; internalContentFilter?: boolean; + gatewayContentFilterResponse?: unknown; clientErrorSynthesized?: boolean; syncInsert?: boolean; logIdOverride?: string;
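
// Aside: the flush fallback in chat-completion-log.ts above chains three sources
// for gatewayContentFilterResponse. A minimal TypeScript sketch of that precedence,
// assuming the LogInsertData and ChatCompletionLogState types from this patch and
// that the import paths below are correct (illustrative only, not part of the change):
//
//   import type { LogInsertData } from "@llmgateway/db";
//   import type { ChatCompletionLogState } from "./chat-log-context.js";
//
//   type GatewayFilterResponse = LogInsertData["gatewayContentFilterResponse"];
//
//   function resolveGatewayContentFilterResponse(
//   	logData: Partial<Pick<LogInsertData, "gatewayContentFilterResponse">>,
//   	state: ChatCompletionLogState,
//   ): GatewayFilterResponse | null {
//   	// per-request log data wins, then middleware state, then an explicit null
//   	return (
//   		logData.gatewayContentFilterResponse ??
//   		(state.gatewayContentFilterResponse as GatewayFilterResponse | undefined) ??
//   		null
//   	);
//   }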