diff --git a/apps/gateway/src/api-individual.e2e.ts b/apps/gateway/src/api-individual.e2e.ts
index 471aee598..5e81bdcfa 100644
--- a/apps/gateway/src/api-individual.e2e.ts
+++ b/apps/gateway/src/api-individual.e2e.ts
@@ -322,6 +322,12 @@ describe("e2e individual tests", () => {
 			expect((log.errorDetails as { message?: string })?.message).toContain(
 				"the word 'json'",
 			);
+
+			const matchingLogs = await db
+				.select()
+				.from(tables.log)
+				.where(eq(tables.log.requestId, requestId));
+			expect(matchingLogs).toHaveLength(1);
 		},
 	);
diff --git a/apps/gateway/src/api.spec.ts b/apps/gateway/src/api.spec.ts
index 097eaeb70..d196e4e09 100644
--- a/apps/gateway/src/api.spec.ts
+++ b/apps/gateway/src/api.spec.ts
@@ -10,7 +10,12 @@ import {
 	resetKeyHealth,
 } from "./lib/api-key-health.js";
 import { createGatewayApiTestHarness } from "./test-utils/gateway-api-test-harness.js";
-import { readAll, waitForLogs } from "./test-utils/test-helpers.js";
+import {
+	readAll,
+	processPendingLogs,
+	waitForLogByRequestId,
+	waitForLogs,
+} from "./test-utils/test-helpers.js";
 
 describe("api", () => {
 	const harness = createGatewayApiTestHarness({
@@ -1616,6 +1621,7 @@ describe("api", () => {
 	});
 
 	test("Reasoning effort error for unsupported model", async () => {
+		const requestId = "reasoning-effort-unsupported-request-id";
 		await db.insert(tables.apiKey).values({
 			id: "token-id",
 			token: "real-token",
@@ -1628,6 +1634,7 @@ describe("api", () => {
 			method: "POST",
 			headers: {
 				"Content-Type": "application/json",
+				"x-request-id": requestId,
 				Authorization: `Bearer real-token`,
 			},
 			body: JSON.stringify({
@@ -1646,6 +1653,70 @@ describe("api", () => {
 
 		const json = await res.json();
 		expect(json.message).toContain("does not support reasoning");
+
+		const log = await waitForLogByRequestId(requestId);
+		expect(log.finishReason).toBe("client_error");
+		expect(log.unifiedFinishReason).toBe("client_error");
+
+		const matchingLogs = await db
+			.select()
+			.from(tables.log)
+			.where(eq(tables.log.requestId, requestId));
+		expect(matchingLogs).toHaveLength(1);
+	});
+
+	test("Schema validation errors are logged as client_error", async () => {
+		const requestId = "schema-validation-client-error-request-id";
+		await db.insert(tables.apiKey).values({
+			id: "token-id-schema-validation",
+			token: "real-token-schema-validation",
+			projectId: "project-id",
+			description: "Test API Key",
+			createdBy: "user-id",
+		});
+
+		const res = await app.request("/v1/chat/completions", {
+			method: "POST",
+			headers: {
+				"Content-Type": "application/json",
+				"x-request-id": requestId,
+				Authorization: "Bearer real-token-schema-validation",
+			},
+			body: JSON.stringify({
+				model: "gpt-4o-mini",
+				messages: [
+					{
+						role: "user",
+						content: 5555,
+					},
+				],
+			}),
+		});
+
+		expect(res.status).toBe(400);
+
+		const json = await res.json();
+		expect(json.success).toBe(false);
+		expect(JSON.stringify(json)).toContain("invalid_union");
+
+		const log = await waitForLogByRequestId(requestId);
+		expect(log.finishReason).toBe("client_error");
+		expect(log.unifiedFinishReason).toBe("client_error");
+		expect(log.errorDetails?.statusCode).toBe(400);
+		expect(log.errorDetails?.responseText).toContain("invalid_union");
+		expect(log.errorDetails?.responseText).toContain("messages");
+		expect(log.messages).toEqual([
+			{
+				role: "user",
+				content: 5555,
+			},
+		]);
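+
+		// The gateway should persist exactly one log row per request id; a second
+		// matching row below would suggest the request was logged twice (e.g.
+		// queued via the middleware and also inserted directly).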
+
+		const matchingLogs = await db
+			.select()
+			.from(tables.log)
+			.where(eq(tables.log.requestId, requestId));
+		expect(matchingLogs).toHaveLength(1);
 	});
 
 	test("Max tokens validation error when exceeding model limit", async () => {
@@ -1802,10 +1873,12 @@ describe("api", () => {
 	// test for missing Authorization header
 	test("/v1/chat/completions missing Authorization header", async () => {
+		const requestId = "missing-auth-request-id";
 		const res = await app.request("/v1/chat/completions", {
 			method: "POST",
 			headers: {
 				"Content-Type": "application/json",
+				"x-request-id": requestId,
 				// Intentionally not setting Authorization header
 			},
 			body: JSON.stringify({
@@ -1819,6 +1892,13 @@ describe("api", () => {
 			}),
 		});
 		expect(res.status).toBe(401);
+
+		await processPendingLogs();
+		const logs = await db
+			.select()
+			.from(tables.log)
+			.where(eq(tables.log.requestId, requestId));
+		expect(logs).toHaveLength(0);
 	});
 
 	// test for explicitly specifying a provider in the format "provider/model"
@@ -1954,6 +2034,7 @@ describe("api", () => {
 	// test for missing provider API key
 	test("/v1/chat/completions with missing provider API key", async () => {
+		const requestId = "missing-provider-key-request-id";
 		await db.insert(tables.apiKey).values({
 			id: "token-id",
 			token: "real-token",
@@ -1966,6 +2047,7 @@ describe("api", () => {
 			method: "POST",
 			headers: {
 				"Content-Type": "application/json",
+				"x-request-id": requestId,
 				Authorization: `Bearer real-token`,
 			},
 			body: JSON.stringify({
@@ -1983,6 +2065,16 @@ describe("api", () => {
 		expect(errorMessage).toMatchInlineSnapshot(
 			`"{"error":true,"status":400,"message":"No API key set for provider: openai. Please add a provider key in your settings or add credits and switch to credits or hybrid mode."}"`,
 		);
+
+		const log = await waitForLogByRequestId(requestId);
+		expect(log.finishReason).toBe("client_error");
+		expect(log.unifiedFinishReason).toBe("client_error");
+
+		const matchingLogs = await db
+			.select()
+			.from(tables.log)
+			.where(eq(tables.log.requestId, requestId));
+		expect(matchingLogs).toHaveLength(1);
 	});
 
 	// test for provider error response and error logging
diff --git a/apps/gateway/src/chat/chat.ts b/apps/gateway/src/chat/chat.ts
index 68ad2d2b8..93c8f7c8d 100644
--- a/apps/gateway/src/chat/chat.ts
+++ b/apps/gateway/src/chat/chat.ts
@@ -103,7 +103,12 @@ import {
 	stripRegionFromModelName,
 } from "@llmgateway/models";
 
+import { chatCompletionLogMiddleware } from "./middleware/chat-completion-log.js";
 import { completionsRequestSchema } from "./schemas/completions.js";
+import {
+	finishStreamCompletion,
+	registerStreamCompletion,
+} from "./tools/chat-log-context.js";
 import {
 	checkContentFilter,
 	getContentFilterMethod,
@@ -828,6 +833,8 @@ const sharedTextDecoder = new TextDecoder();
 
 export const chat = new OpenAPIHono();
 
+chat.use("/completions", chatCompletionLogMiddleware);
+
 const completions = createRoute({
 	operationId: "v1_chat_completions",
 	summary: "Chat Completions",
@@ -1189,18 +1196,24 @@ chat.openapi(completions, async (c) => {
 	const logIdOverride = responsesContext?.logId;
 	const responsesApiData: unknown = responsesContext?.responsesApiData ?? null;
 
-	// Wrapper that injects Responses API fields into every log entry.
-	// Only override the id for the final log entry (retried !== true) to avoid
-	// PK conflicts when the request retries across multiple providers.
-	const insertLogEntry = (logData: LogInsertData) =>
-		insertLog(
-			{
-				...logData,
-				...(logIdOverride && !logData.retried ? { id: logIdOverride } : {}),
-				responsesApiData,
-			},
-			{ syncInsert: syncLogInsert },
-		);
+	const chatLogState = c.get("chatCompletionLogState");
+	if (chatLogState) {
+		chatLogState.syncInsert = syncLogInsert;
+		chatLogState.logIdOverride = logIdOverride;
+		chatLogState.responsesApiData = responsesApiData;
+	}
+
+	// Queue a log entry for the middleware to flush after the request completes.
+	// The middleware applies logIdOverride/responsesApiData/syncInsert from state
+	// at flush time, so we just push the raw log data here.
+	const insertLogEntry = (logData: LogInsertData): Promise<number> => {
+		if (chatLogState) {
+			chatLogState.pendingLogs.push(logData);
+		} else {
+			void _insertLog(logData);
+		}
+		return Promise.resolve(1);
+	};
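+	// The middleware flush is assumed to mirror the old inline wrapper, i.e.
+	// roughly:
+	//
+	//   for (const logData of state.pendingLogs) {
+	//     await _insertLog(
+	//       {
+	//         ...logData,
+	//         ...(state.logIdOverride && !logData.retried
+	//           ? { id: state.logIdOverride }
+	//           : {}),
+	//         responsesApiData: state.responsesApiData,
+	//       },
+	//       { syncInsert: state.syncInsert },
+	//     );
+	//   }
+	//
+	// so the id override still only applies to the final (non-retried) entry.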
 
 	// Check for X-No-Fallback header to disable provider fallback on low uptime
 	const xNoFallbackHeaderSet =
@@ -3212,21 +3225,31 @@ chat.openapi(completions, async (c) => {
 			.length
 			? openAIContentFilterResult.responses
 			: null;
+
+	if (chatLogState) {
+		if (shouldTagContentFilter) {
+			chatLogState.internalContentFilter = true;
+		}
+		chatLogState.gatewayContentFilterResponse = gatewayContentFilterResponse;
+	}
+
 	const insertLog = (
 		logData: Parameters<typeof _insertLog>[0],
-		options?: Parameters<typeof _insertLog>[1],
-	) =>
-		_insertLog(
-			{
+		_options?: Parameters<typeof _insertLog>[1],
+	): Promise<number> => {
+		if (chatLogState) {
+			chatLogState.pendingLogs.push(logData as LogInsertData);
+		} else {
+			const enriched = {
 				...logData,
-				internalContentFilter: shouldTagContentFilter
-					? true
-					: logData.internalContentFilter,
 				gatewayContentFilterResponse:
 					logData.gatewayContentFilterResponse ??
 					gatewayContentFilterResponse,
-			},
-			options,
-		);
+				...(shouldTagContentFilter ? { internalContentFilter: true } : {}),
+			};
+			void _insertLog(enriched);
+		}
+		return Promise.resolve(1);
+	};
 
 	if (contentFilterBlocked) {
 		const contentFilterResponseId = `chatcmpl-${Date.now()}`;
@@ -3297,25 +3320,30 @@ chat.openapi(completions, async (c) => {
 		}
 
 		if (stream) {
+			void registerStreamCompletion(c);
 			return streamSSE(c, async (sseStream) => {
-				const chunk = {
-					id: contentFilterResponseId,
-					object: "chat.completion.chunk",
-					created: contentFilterCreated,
-					model: requestedModel,
-					choices: [
-						{
-							index: 0,
-							delta: {},
-							finish_reason: "content_filter",
-						},
-					],
-				};
-				await sseStream.writeSSE({
-					data: JSON.stringify(chunk),
-					id: "0",
-				});
-				await sseStream.writeSSE({ data: "[DONE]" });
+				try {
+					const chunk = {
+						id: contentFilterResponseId,
+						object: "chat.completion.chunk",
+						created: contentFilterCreated,
+						model: requestedModel,
+						choices: [
+							{
+								index: 0,
+								delta: {},
+								finish_reason: "content_filter",
+							},
+						],
+					};
+					await sseStream.writeSSE({
+						data: JSON.stringify(chunk),
+						id: "0",
+					});
+					await sseStream.writeSSE({ data: "[DONE]" });
+				} finally {
+					finishStreamCompletion(c);
+				}
 			});
 		}
 
@@ -3653,30 +3681,35 @@ chat.openapi(completions, async (c) => {
 		});
 
 		// Return cached streaming response by replaying chunks with original timing
+		void registerStreamCompletion(c);
 		return streamSSE(
 			c,
 			async (stream) => {
-				let previousTimestamp = 0;
+				try {
+					let previousTimestamp = 0;
 
-				for (const chunk of cachedStreamingResponse.chunks) {
-					// Calculate delay based on original chunk timing
-					const delay = Math.max(0, chunk.timestamp - previousTimestamp);
-					// Cap the delay to prevent excessively long waits (max 1 second)
-					const cappedDelay = Math.min(delay, 1000);
+					for (const chunk of cachedStreamingResponse.chunks) {
+						// Calculate delay based on original chunk timing
+						const delay = Math.max(0, chunk.timestamp - previousTimestamp);
+						// Cap the delay to prevent excessively long waits (max 1 second)
+						const cappedDelay = Math.min(delay, 1000);
 
-					if (cappedDelay > 0) {
-						await new Promise<void>((resolve) => {
-							setTimeout(() => resolve(), cappedDelay);
-						});
-					}
+						if (cappedDelay > 0) {
+							await new Promise<void>((resolve) => {
+								setTimeout(() => resolve(), cappedDelay);
+							});
+						}
 
-					await stream.writeSSE({
-						data: chunk.data,
-						id: String(chunk.eventId),
-						event: chunk.event,
-					});
+						await stream.writeSSE({
+							data: chunk.data,
+							id: String(chunk.eventId),
+							event: chunk.event,
+						});
 
-					previousTimestamp = chunk.timestamp;
+						previousTimestamp = chunk.timestamp;
+					}
+				} finally {
+					finishStreamCompletion(c);
 				}
 			},
 			async (error) => {
@@ -3687,6 +3720,7 @@ chat.openapi(completions, async (c) => {
 				} else {
 					logger.error("Error replaying cached stream", error);
 				}
+				finishStreamCompletion(c);
 			},
 		);
 	}
@@ -4282,354 +4316,911 @@ chat.openapi(completions, async (c) => {
 	// For stream-only models where the client didn't request streaming, use the non-streaming path
 	// (effectiveStream forces streaming upstream, but the client gets a regular JSON response)
 	if (effectiveStream && !forceStream) {
+		void registerStreamCompletion(c);
 		return streamSSE(
 			c,
 			async (stream) => {
-				let eventId = 0;
-				let canceled = false;
-				let streamingError: unknown = null;
-				let doneSent = false; // Track if [DONE] has been sent downstream
-
-				// Raw logging variables
-				let streamingRawResponseData = ""; // Raw SSE data sent back to the client
-
-				// Streaming cache variables
-				const streamingChunks: Array<{
-					data: string;
-					eventId: number;
-					event?: string;
-					timestamp: number;
-				}> = [];
-				const streamStartTime = Date.now();
-
-				// SSE keepalive to prevent proxy/load balancer timeouts
-				// Sends SSE comments (ignored by clients) every 15 seconds to keep connection alive
-				const KEEPALIVE_INTERVAL_MS = 15000;
-				const keepaliveInterval = setInterval(() => {
-					stream.write(": ping\n\n").catch(() => {
-						// Stream likely closed, cleanup will happen via abort handler or finally
-					});
-				}, KEEPALIVE_INTERVAL_MS);
-				const clearKeepalive = () => clearInterval(keepaliveInterval);
-
-				// Timing tracking variables
-				let timeToFirstToken: number | null = null;
-				let timeToFirstReasoningToken: number | null = null;
-				let firstTokenReceived = false;
-				let firstReasoningTokenReceived = false;
-
-				// Helper function to write SSE and capture for cache
-				const writeSSEAndCache = async (sseData: {
-					data: string;
-					event?: string;
-					id?: string;
-				}) => {
-					await stream.writeSSE(sseData);
-
-					// Collect raw response data for logging only in debug mode and within size limit
-					if (
-						debugMode &&
-						streamingRawResponseData.length < MAX_RAW_DATA_SIZE
-					) {
-						const sseString = `${sseData.event ? `event: ${sseData.event}\n` : ""}data: ${sseData.data}${sseData.id ? `\nid: ${sseData.id}` : ""}\n\n`;
-						streamingRawResponseData += sseString;
-					}
-
-					// Capture for streaming cache if enabled
-					if (cachingEnabled && streamingCacheKey) {
-						streamingChunks.push({
-							data: sseData.data,
-							eventId: sseData.id ? parseInt(sseData.id, 10) : eventId,
-							event: sseData.event,
-							timestamp: Date.now() - streamStartTime,
+				return await (async () => {
+					let eventId = 0;
+					let canceled = false;
+					let streamingError: unknown = null;
+					let doneSent = false; // Track if [DONE] has been sent downstream
+
+					// Raw logging variables
+					let streamingRawResponseData = ""; // Raw SSE data sent back to the client
+
+					// Streaming cache variables
+					const streamingChunks: Array<{
+						data: string;
+						eventId: number;
+						event?: string;
+						timestamp: number;
+					}> = [];
+					const streamStartTime = Date.now();
+
+					// SSE keepalive to prevent proxy/load balancer timeouts
+					// Sends SSE comments (ignored by clients) every 15 seconds to keep connection alive
+					const KEEPALIVE_INTERVAL_MS = 15000;
+					const keepaliveInterval = setInterval(() => {
+						stream.write(": ping\n\n").catch(() => {
+							// Stream likely closed, cleanup will happen via abort handler or finally
 						});
-					}
-				};
+					}, KEEPALIVE_INTERVAL_MS);
+					const clearKeepalive = () => clearInterval(keepaliveInterval);
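+
+					// On the wire each keepalive is a bare SSE comment line:
+					//
+					//   : ping
+					//
+					// Lines starting with ":" are comments per the SSE spec, so
+					// conforming clients silently discard them.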
+
+					// Timing tracking variables
+					let timeToFirstToken: number | null = null;
+					let timeToFirstReasoningToken: number | null = null;
+					let firstTokenReceived = false;
+					let firstReasoningTokenReceived = false;
+
+					// Helper function to write SSE and capture for cache
+					const writeSSEAndCache = async (sseData: {
+						data: string;
+						event?: string;
+						id?: string;
+					}) => {
+						await stream.writeSSE(sseData);
+
+						// Collect raw response data for logging only in debug mode and within size limit
+						if (
+							debugMode &&
+							streamingRawResponseData.length < MAX_RAW_DATA_SIZE
+						) {
+							const sseString = `${sseData.event ? `event: ${sseData.event}\n` : ""}data: ${sseData.data}${sseData.id ? `\nid: ${sseData.id}` : ""}\n\n`;
+							streamingRawResponseData += sseString;
+						}
 
-				const writeStreamingContentFilterResponse = async ({
-					billingModel,
-					billingProvider,
-					responseModel,
-					metadata,
-				}: {
-					billingModel: string;
-					billingProvider: Provider;
-					responseModel: string;
-					metadata?: Record<string, unknown>;
-				}) => {
-					const { calculatedPromptTokens } = estimateTokens(
-						billingProvider,
-						messages,
-						null,
-						null,
-						0,
-					);
-					const promptTokenCount = Math.max(
-						1,
-						Math.round(calculatedPromptTokens ?? 1),
-					);
-					const streamingCosts = await calculateCosts(
+						// Capture for streaming cache if enabled
+						if (cachingEnabled && streamingCacheKey) {
+							streamingChunks.push({
+								data: sseData.data,
+								eventId: sseData.id ? parseInt(sseData.id, 10) : eventId,
+								event: sseData.event,
+								timestamp: Date.now() - streamStartTime,
+							});
+						}
+					};
+
+					const writeStreamingContentFilterResponse = async ({
 						billingModel,
 						billingProvider,
-						promptTokenCount,
-						0,
-						null,
-						{
-							prompt: messages
-								.map((m) => messageContentToString(m.content))
-								.join("\n"),
-							completion: "",
-						},
-						null,
-						0,
-						image_config?.image_size,
-						inputImageCount,
-						0,
-						project.organizationId,
-						image_config?.image_quality,
-					);
-					streamingCosts.dataStorageCost = toDataStorageCostNumber(
-						streamingCosts.promptTokens ?? promptTokenCount,
-						null,
-						0,
-						null,
-						retentionLevel,
-					);
-
-					await writeSSEAndCache({
-						data: JSON.stringify({
-							id: `chatcmpl-${Date.now()}`,
-							object: "chat.completion.chunk",
-							created: Math.floor(Date.now() / 1000),
-							model: responseModel,
-							choices: [
-								{
-									index: 0,
-									delta: {},
-									finish_reason: "content_filter",
-								},
-							],
-							...(metadata && { metadata }),
-						}),
-						id: String(eventId++),
-					});
+						responseModel,
+						metadata,
+					}: {
+						billingModel: string;
+						billingProvider: Provider;
+						responseModel: string;
+						metadata?: Record<string, unknown>;
+					}) => {
+						const { calculatedPromptTokens } = estimateTokens(
+							billingProvider,
+							messages,
+							null,
+							null,
+							0,
+						);
+						const promptTokenCount = Math.max(
+							1,
+							Math.round(calculatedPromptTokens ?? 1),
+						);
+						const streamingCosts = await calculateCosts(
+							billingModel,
+							billingProvider,
+							promptTokenCount,
+							0,
+							null,
+							{
+								prompt: messages
+									.map((m) => messageContentToString(m.content))
+									.join("\n"),
+								completion: "",
+							},
+							null,
+							0,
+							image_config?.image_size,
+							inputImageCount,
+							0,
+							project.organizationId,
+							image_config?.image_quality,
+						);
+						streamingCosts.dataStorageCost = toDataStorageCostNumber(
+							streamingCosts.promptTokens ?? promptTokenCount,
+							null,
+							0,
+							null,
+							retentionLevel,
+						);
 
-					const contentFilterUsage: Record<string, unknown> = {
-						prompt_tokens: promptTokenCount,
-						completion_tokens: 0,
-						total_tokens: promptTokenCount,
-					};
-					applyExtendedUsageFields(contentFilterUsage, {
-						costs: {
-							inputCost: streamingCosts.inputCost,
-							outputCost: streamingCosts.outputCost,
-							cachedInputCost: streamingCosts.cachedInputCost,
-							cacheWriteInputCost: streamingCosts.cacheWriteInputCost,
-							requestCost: streamingCosts.requestCost,
-							webSearchCost: streamingCosts.webSearchCost,
-							imageInputCost: streamingCosts.imageInputCost,
-							imageOutputCost: streamingCosts.imageOutputCost,
-							totalCost: streamingCosts.totalCost,
-							dataStorageCost: streamingCosts.dataStorageCost,
-						},
-						cachedTokens: null,
-						cacheCreationTokens: null,
-						reasoningTokens: null,
-					});
-					await writeSSEAndCache({
-						data: JSON.stringify({
-							id: `chatcmpl-${Date.now()}`,
-							object: "chat.completion.chunk",
-							created: Math.floor(Date.now() / 1000),
-							model: responseModel,
-							choices: [
-								{
-									index: 0,
-									delta: {},
-									finish_reason: null,
-								},
-							],
-							usage: contentFilterUsage,
-						}),
-						id: String(eventId++),
-					});
+						await writeSSEAndCache({
+							data: JSON.stringify({
+								id: `chatcmpl-${Date.now()}`,
+								object: "chat.completion.chunk",
+								created: Math.floor(Date.now() / 1000),
+								model: responseModel,
+								choices: [
+									{
+										index: 0,
+										delta: {},
+										finish_reason: "content_filter",
+									},
+								],
+								...(metadata && { metadata }),
+							}),
+							id: String(eventId++),
+						});
 
-					await writeSSEAndCache({
-						event: "done",
-						data: "[DONE]",
-						id: String(eventId++),
-					});
-					doneSent = true;
-				};
+						const contentFilterUsage: Record<string, unknown> = {
+							prompt_tokens: promptTokenCount,
+							completion_tokens: 0,
+							total_tokens: promptTokenCount,
+						};
+						applyExtendedUsageFields(contentFilterUsage, {
+							costs: {
+								inputCost: streamingCosts.inputCost,
+								outputCost: streamingCosts.outputCost,
+								cachedInputCost: streamingCosts.cachedInputCost,
+								cacheWriteInputCost: streamingCosts.cacheWriteInputCost,
+								requestCost: streamingCosts.requestCost,
+								webSearchCost: streamingCosts.webSearchCost,
+								imageInputCost: streamingCosts.imageInputCost,
+								imageOutputCost: streamingCosts.imageOutputCost,
+								totalCost: streamingCosts.totalCost,
+								dataStorageCost: streamingCosts.dataStorageCost,
+							},
+							cachedTokens: null,
+							cacheCreationTokens: null,
+							reasoningTokens: null,
+						});
+						await writeSSEAndCache({
+							data: JSON.stringify({
+								id: `chatcmpl-${Date.now()}`,
+								object: "chat.completion.chunk",
+								created: Math.floor(Date.now() / 1000),
+								model: responseModel,
+								choices: [
+									{
+										index: 0,
+										delta: {},
+										finish_reason: null,
+									},
+								],
+								usage: contentFilterUsage,
+							}),
+							id: String(eventId++),
+						});
 
-				// Set up cancellation handling
-				const controller = new AbortController();
-				// Set up a listener for the request being aborted
-				const onAbort = () => {
-					clearKeepalive();
-					if (requestCanBeCanceled) {
-						canceled = true;
-						controller.abort();
-					}
-				};
+						await writeSSEAndCache({
+							event: "done",
+							data: "[DONE]",
+							id: String(eventId++),
+						});
+						doneSent = true;
+					};
 
-				// Add event listener for the abort event on the connection
-				c.req.raw.signal.addEventListener("abort", onAbort);
-
-				// --- Retry loop for provider fallback ---
-				const routingAttempts: RoutingAttempt[] = [];
-				const failedProviderIds = new Set<string>();
-				let res: Response | undefined;
-				const finalLogId = logIdOverride ?? shortid();
-				for (
-					let retryAttempt = 0;
-					retryAttempt <= MAX_RETRIES;
-					retryAttempt++
-				) {
-					const perAttemptStartTime = Date.now();
+					// Set up cancellation handling
+					const controller = new AbortController();
+					// Set up a listener for the request being aborted
+					const onAbort = () => {
+						clearKeepalive();
+						if (requestCanBeCanceled) {
+							canceled = true;
+							controller.abort();
+						}
+					};
 
-					// Type guard: narrow variables that TypeScript widens due to loop reassignment
-					if (
-						!usedProvider ||
-						!usedToken ||
-						!url ||
-						!usedModelFormatted ||
-						!usedModelMapping
+					// Add event listener for the abort event on the connection
+					c.req.raw.signal.addEventListener("abort", onAbort);
+
+					// --- Retry loop for provider fallback ---
+					const routingAttempts: RoutingAttempt[] = [];
+					const failedProviderIds = new Set<string>();
+					let res: Response | undefined;
+					const finalLogId = logIdOverride ?? shortid();
+					for (
+						let retryAttempt = 0;
+						retryAttempt <= MAX_RETRIES;
+						retryAttempt++
 					) {
-						throw new Error("Provider context not initialized");
-					}
-
-					if (retryAttempt > 0) {
-						// Re-add abort listener (catch block removes it on error)
-						c.req.raw.signal.addEventListener("abort", onAbort);
+						const perAttemptStartTime = Date.now();
 
-						const nextProvider = selectNextProvider(
-							routingMetadata?.providerScores ?? [],
-							failedProviderIds,
-							iamFilteredModelProviders,
-						);
-						if (!nextProvider) {
-							break;
+						// Type guard: narrow variables that TypeScript widens due to loop reassignment
+						if (
+							!usedProvider ||
+							!usedToken ||
+							!url ||
+							!usedModelFormatted ||
+							!usedModelMapping
+						) {
+							throw new Error("Provider context not initialized");
 						}
 
-						// Check if the fallback candidate is rate-limited
-						const retryRateLimitPeek = await peekProviderRateLimit(
-							project.organizationId,
-							nextProvider.providerId,
-							modelInfo.id,
-							nextProvider.modelName,
-						);
-						if (retryRateLimitPeek.rateLimited) {
-							failedProviderIds.add(
-								providerRetryKey(nextProvider.providerId, nextProvider.region),
-							);
-							// Mark as rate-limited in routing metadata
-							const scoreEntry = routingMetadata?.providerScores.find(
-								(s) => s.providerId === nextProvider.providerId,
+						if (retryAttempt > 0) {
+							// Re-add abort listener (catch block removes it on error)
+							c.req.raw.signal.addEventListener("abort", onAbort);
+
+							const nextProvider = selectNextProvider(
+								routingMetadata?.providerScores ?? [],
+								failedProviderIds,
+								iamFilteredModelProviders,
 							);
-							if (scoreEntry) {
-								scoreEntry.rate_limited = true;
+							if (!nextProvider) {
+								break;
 							}
-							// Don't consume a retry slot for rate-limit skips
-							retryAttempt--;
-							continue;
-						}
 
-						try {
-							const ctx = await resolveProviderContextForRetry(
-								nextProvider,
-								true,
+							// Check if the fallback candidate is rate-limited
+							const retryRateLimitPeek = await peekProviderRateLimit(
+								project.organizationId,
+								nextProvider.providerId,
+								modelInfo.id,
+								nextProvider.modelName,
 							);
-							applyResolvedProviderContext(ctx);
-						} catch {
-							failedProviderIds.add(
-								providerRetryKey(nextProvider.providerId, nextProvider.region),
-							);
-							// Don't consume a retry slot for context-resolution failures
-							retryAttempt--;
-							continue;
-						}
-					}
+							if (retryRateLimitPeek.rateLimited) {
+								failedProviderIds.add(
+									providerRetryKey(
+										nextProvider.providerId,
+										nextProvider.region,
+									),
+								);
+								// Mark as rate-limited in routing metadata
+								const scoreEntry = routingMetadata?.providerScores.find(
+									(s) => s.providerId === nextProvider.providerId,
+								);
+								if (scoreEntry) {
+									scoreEntry.rate_limited = true;
+								}
+								// Don't consume a retry slot for rate-limit skips
+								retryAttempt--;
+								continue;
+							}
 
-					try {
-						const headers = getProviderHeaders(usedProvider, usedToken, {
-							requestId,
-							webSearchEnabled: !!webSearchTool,
-						});
-						headers["Content-Type"] = "application/json";
-
-						// Add effort beta header for Anthropic if effort parameter is specified
-						if (usedProvider === "anthropic" && effort !== undefined) {
-							const currentBeta = headers["anthropic-beta"];
-							headers["anthropic-beta"] = currentBeta
-								? `${currentBeta},effort-2025-11-24`
-								: "effort-2025-11-24";
+							try {
+								const ctx = await resolveProviderContextForRetry(
+									nextProvider,
+									true,
+								);
+								applyResolvedProviderContext(ctx);
+							} catch {
+								failedProviderIds.add(
+									providerRetryKey(
+										nextProvider.providerId,
+										nextProvider.region,
+									),
+								);
+								// Don't consume a retry slot for context-resolution failures
+								retryAttempt--;
+								continue;
+							}
 						}
 
-						// Add structured outputs beta header for Anthropic if json_schema response_format is specified
-						if (
-							usedProvider === "anthropic" &&
-							response_format?.type === "json_schema"
-						) {
-							const currentBeta = headers["anthropic-beta"];
-							headers["anthropic-beta"] = currentBeta
-								? `${currentBeta},structured-outputs-2025-11-13`
-								: "structured-outputs-2025-11-13";
-						}
+						try {
+							const headers = getProviderHeaders(usedProvider, usedToken, {
+								requestId,
+								webSearchEnabled: !!webSearchTool,
+							});
+							headers["Content-Type"] = "application/json";
+
+							// Add effort beta header for Anthropic if effort parameter is specified
+							if (usedProvider === "anthropic" && effort !== undefined) {
+								const currentBeta = headers["anthropic-beta"];
+								headers["anthropic-beta"] = currentBeta
+									? `${currentBeta},effort-2025-11-24`
+									: "effort-2025-11-24";
+							}
 
-						// Create a combined signal for both timeout and cancellation
-						const fetchSignal = createStreamingCombinedSignal(
-							requestCanBeCanceled ? controller : undefined,
-						);
+							// Add structured outputs beta header for Anthropic if json_schema response_format is specified
+							if (
+								usedProvider === "anthropic" &&
+								response_format?.type === "json_schema"
+							) {
+								const currentBeta = headers["anthropic-beta"];
+								headers["anthropic-beta"] = currentBeta
+									? `${currentBeta},structured-outputs-2025-11-13`
+									: "structured-outputs-2025-11-13";
+							}
 
-						res = await fetch(url, {
-							method: "POST",
-							headers,
-							body: JSON.stringify(requestBody),
-							signal: fetchSignal,
-						});
-					} catch (error) {
-						// Clean up the event listeners
-						c.req.raw.signal.removeEventListener("abort", onAbort);
+							// Create a combined signal for both timeout and cancellation
+							const fetchSignal = createStreamingCombinedSignal(
+								requestCanBeCanceled ? controller : undefined,
+							);
 
-						// Check for timeout error first (AbortSignal.timeout throws TimeoutError)
-						if (isTimeoutError(error)) {
-							// Handle timeout error
-							const errorMessage =
-								error instanceof Error ? error.message : "Request timeout";
-							const timeoutCause = extractErrorCause(error);
-							logger.warn("Upstream request timeout", {
-								error: errorMessage,
-								cause: timeoutCause,
-								usedProvider,
-								requestedProvider,
-								usedModel,
-								initialRequestedModel,
-								unifiedFinishReason: getUnifiedFinishReason(
-									"upstream_error",
-									usedProvider,
-								),
+							res = await fetch(url, {
+								method: "POST",
+								headers,
+								body: JSON.stringify(requestBody),
+								signal: fetchSignal,
 							});
+						} catch (error) {
+							// Clean up the event listeners
+							c.req.raw.signal.removeEventListener("abort", onAbort);
+
+							// Check for timeout error first (AbortSignal.timeout throws TimeoutError)
+							if (isTimeoutError(error)) {
+								// Handle timeout error
+								const errorMessage =
+									error instanceof Error ? error.message : "Request timeout";
+								const timeoutCause = extractErrorCause(error);
+								logger.warn("Upstream request timeout", {
+									error: errorMessage,
+									cause: timeoutCause,
+									usedProvider,
+									requestedProvider,
+									usedModel,
+									initialRequestedModel,
+									unifiedFinishReason: getUnifiedFinishReason(
+										"upstream_error",
+										usedProvider,
+									),
+								});
 
-							// Log the timeout error in the database
-							const timeoutPluginIds = plugins?.map((p) => p.id) ?? [];
-
-							let sameProviderRetryContext: Awaited<
-								ReturnType<typeof tryResolveAlternateKeyForCurrentProvider>
-							> | null = null;
-							rememberFailedKey(usedProvider, usedRegion, {
-								envVarName,
-								configIndex,
-								providerKeyId: providerKey?.id,
-							});
-							sameProviderRetryContext =
-								await tryResolveAlternateKeyForCurrentProvider(true);
+								// Log the timeout error in the database
+								const timeoutPluginIds = plugins?.map((p) => p.id) ?? [];
 
-							// Check if we should retry before logging so we can mark the log as retried
-							const willRetryTimeout = shouldRetryRequest({
-								requestedProvider,
+								let sameProviderRetryContext: Awaited<
+									ReturnType<typeof tryResolveAlternateKeyForCurrentProvider>
+								> | null = null;
+								rememberFailedKey(usedProvider, usedRegion, {
+									envVarName,
+									configIndex,
+									providerKeyId: providerKey?.id,
+								});
+								sameProviderRetryContext =
+									await tryResolveAlternateKeyForCurrentProvider(true);
+
+								// Check if we should retry before logging so we can mark the log as retried
+								const willRetryTimeout = shouldRetryRequest({
+									requestedProvider,
+									noFallback,
+									errorType: "upstream_timeout",
+									retryCount: retryAttempt,
+									remainingProviders:
+										(routingMetadata?.providerScores.length ?? 0) -
+										failedProviderIds.size -
+										1,
+									usedProvider,
+								});
+								const willRetrySameProvider = sameProviderRetryContext !== null;
+								const willRetryRequest =
+									willRetrySameProvider || willRetryTimeout;
+
+								const baseLogEntry = createLogEntry(
+									requestId,
+									project,
+									apiKey,
+									providerKey?.id,
+									usedModelFormatted,
+									usedModelMapping,
+									usedProvider,
+									initialRequestedModel,
+									requestedProvider,
+									messages,
+									temperature,
+									max_tokens,
+									top_p,
+									frequency_penalty,
+									presence_penalty,
+									reasoning_effort,
+									reasoning_max_tokens,
+									effort,
+									response_format,
+									tools,
+									tool_choice,
+									source,
+									customHeaders,
+									debugMode,
+									userAgent,
+									image_config,
+									routingMetadata,
+									rawBody,
+									null, // No response for timeout error
+									requestBody,
+									null, // No upstream response for timeout error
+									timeoutPluginIds,
+									undefined, // No plugin results for error case
+								);
+								const attemptLogId = shortid();
+
+								await insertLogEntry({
+									...baseLogEntry,
+									id: attemptLogId,
+									duration: Date.now() - perAttemptStartTime,
+									timeToFirstToken: null,
+									timeToFirstReasoningToken: null,
+									responseSize: 0,
+									content: null,
+									reasoningContent: null,
+									finishReason: "upstream_error",
+									promptTokens: null,
+									completionTokens: null,
+									totalTokens: null,
+									reasoningTokens: null,
+									cachedTokens: null,
+									hasError: true,
+									streamed: true,
+									canceled: false,
+									errorDetails: {
+										statusCode: 0,
+										statusText: "TimeoutError",
+										responseText: errorMessage,
+										cause: timeoutCause,
+									},
+									cachedInputCost: null,
+									requestCost: null,
+									webSearchCost: null,
+									imageInputTokens: null,
+									imageOutputTokens: null,
+									imageInputCost: null,
+									imageOutputCost: null,
+									discount: null,
+									dataStorageCost: "0",
+									cached: false,
+									toolResults: null,
+									retried: willRetryRequest,
+									retriedByLogId: willRetryRequest ? finalLogId : null,
+								});
+
+								if (willRetrySameProvider && sameProviderRetryContext) {
+									routingAttempts.push(
+										buildRoutingAttempt(
+											usedProvider,
+											baseModelName,
+											0,
+											getErrorType(0),
+											false,
+											{
+												region: usedRegion,
+												apiKeyHash: usedApiKeyHash,
+												logId: attemptLogId,
+											},
+										),
+									);
+									applyResolvedProviderContext(sameProviderRetryContext);
+									retryAttempt--;
+									continue;
+								}
+
+								if (willRetryTimeout) {
+									routingAttempts.push(
+										buildRoutingAttempt(
+											usedProvider,
+											baseModelName,
+											0,
+											getErrorType(0),
+											false,
+											{
+												region: usedRegion,
+												apiKeyHash: usedApiKeyHash,
+												logId: attemptLogId,
+											},
+										),
+									);
+									failedProviderIds.add(
+										providerRetryKey(usedProvider, usedRegion),
+									);
+									continue;
+								}
+
+								await stream.writeSSE({
+									event: "error",
+									data: JSON.stringify({
+										error: {
+											message: `Upstream provider timeout: ${errorMessage}`,
+											type: "upstream_timeout",
+											code: "timeout",
+										},
+									}),
+									id: String(eventId++),
+								});
+								return;
+							} else if (
+								error instanceof Error &&
+								error.name === "AbortError"
+							) {
+								// Log the canceled request
+								// Extract plugin IDs for logging (canceled request)
+								const canceledPluginIds = plugins?.map((p) => p.id) ?? [];
+
+								// Calculate costs for cancelled request if billing is enabled
+								const billCancelled = shouldBillCancelledRequests();
+								let cancelledCosts: Awaited<
+									ReturnType<typeof calculateCosts>
+								> | null = null;
+								let estimatedPromptTokens: number | null = null;
+
+								if (billCancelled) {
+									// Estimate prompt tokens from messages
+									const tokenEstimation = estimateTokens(
+										usedProvider,
+										messages,
+										null,
+										null,
+										null,
+									);
+									estimatedPromptTokens =
+										tokenEstimation.calculatedPromptTokens;
+
+									// Calculate costs based on prompt tokens only (no completion yet)
+									// If web search tool was enabled, count it as 1 search for billing
+									cancelledCosts = await calculateCosts(
+										usedModel,
+										usedProvider,
+										estimatedPromptTokens,
+										0, // No completion tokens yet
+										null, // No cached tokens
+										{
+											prompt: messages
+												.map((m) => messageContentToString(m.content))
+												.join("\n"),
+											completion: "",
+										},
+										null, // No reasoning tokens
+										0, // No output images
+										undefined,
+										inputImageCount,
+										webSearchTool ? 1 : null, // Bill for web search if it was enabled
+										project.organizationId,
+									);
+								}
+
+								const baseLogEntry = createLogEntry(
+									requestId,
+									project,
+									apiKey,
+									providerKey?.id,
+									usedModelFormatted,
+									usedModelMapping,
+									usedProvider,
+									initialRequestedModel,
+									requestedProvider,
+									messages,
+									temperature,
+									max_tokens,
+									top_p,
+									frequency_penalty,
+									presence_penalty,
+									reasoning_effort,
+									reasoning_max_tokens,
+									effort,
+									response_format,
+									tools,
+									tool_choice,
+									source,
+									customHeaders,
+									debugMode,
+									userAgent,
+									image_config,
+									routingMetadata,
+									rawBody,
+									null, // No response for canceled request
+									requestBody, // The request that was sent before cancellation
+									null, // No upstream response for canceled request
+									canceledPluginIds,
+									undefined, // No plugin results for canceled request
+								);
+
+								await insertLogEntry({
+									...baseLogEntry,
+									duration: Date.now() - perAttemptStartTime,
+									timeToFirstToken: null, // Not applicable for canceled request
+									timeToFirstReasoningToken: null, // Not applicable for canceled request
+									responseSize: 0,
+									content: null,
+									reasoningContent: null,
+									finishReason: "canceled",
+									promptTokens: billCancelled
+										? (
+												cancelledCosts?.promptTokens ?? estimatedPromptTokens
+											)?.toString()
+										: null,
+									completionTokens: billCancelled ? "0" : null,
+									totalTokens: billCancelled
+										? (
+												cancelledCosts?.promptTokens ?? estimatedPromptTokens
+											)?.toString()
+										: null,
+									reasoningTokens: null,
+									cachedTokens: null,
+									hasError: false,
+									streamed: true,
+									canceled: true,
+									errorDetails: null,
+									inputCost: cancelledCosts?.inputCost ?? null,
+									outputCost: cancelledCosts?.outputCost ?? null,
+									cachedInputCost: cancelledCosts?.cachedInputCost ?? null,
+									requestCost: cancelledCosts?.requestCost ?? null,
+									webSearchCost: cancelledCosts?.webSearchCost ?? null,
+									imageInputTokens:
+										cancelledCosts?.imageInputTokens?.toString() ?? null,
+									imageOutputTokens:
+										cancelledCosts?.imageOutputTokens?.toString() ?? null,
+									imageInputCost: cancelledCosts?.imageInputCost ?? null,
+									imageOutputCost: cancelledCosts?.imageOutputCost ?? null,
+									cost: cancelledCosts?.totalCost ?? null,
+									estimatedCost: cancelledCosts?.estimatedCost ?? false,
+									discount: cancelledCosts?.discount ?? null,
+									dataStorageCost: billCancelled
+										? calculateDataStorageCost(
+												cancelledCosts?.promptTokens ?? estimatedPromptTokens,
+												null,
+												0,
+												null,
+												retentionLevel,
+											)
+										: "0",
+									cached: false,
+									toolResults: null,
+								});
+
+								// Send a cancellation event to the client
+								await writeSSEAndCache({
+									event: "canceled",
+									data: JSON.stringify({
+										message: "Request canceled by client",
+									}),
+									id: String(eventId++),
+								});
+								await writeSSEAndCache({
+									event: "done",
+									data: "[DONE]",
+									id: String(eventId++),
+								});
+								clearKeepalive();
+								return;
+							} else if (error instanceof Error) {
+								// Handle fetch errors (timeout, connection failures, etc.)
+								const errorMessage = error.message;
+								const fetchCause = extractErrorCause(error);
+								logger.warn("Fetch error", {
+									error: errorMessage,
+									cause: fetchCause,
+									usedProvider,
+									requestedProvider,
+									usedModel,
+									initialRequestedModel,
+									unifiedFinishReason: getUnifiedFinishReason(
+										"upstream_error",
+										usedProvider,
+									),
+								});
+
+								// Log the error in the database
+								// Extract plugin IDs for logging (fetch error)
+								const fetchErrorPluginIds = plugins?.map((p) => p.id) ?? [];
+
+								let sameProviderRetryContext: Awaited<
+									ReturnType<typeof tryResolveAlternateKeyForCurrentProvider>
+								> | null = null;
+								if (isRetryableErrorType("network_error")) {
+									rememberFailedKey(usedProvider, usedRegion, {
+										envVarName,
+										configIndex,
+										providerKeyId: providerKey?.id,
+									});
+									sameProviderRetryContext =
+										await tryResolveAlternateKeyForCurrentProvider(true);
+								}
+
+								// Check if we should retry before logging so we can mark the log as retried
+								const willRetryFetch = shouldRetryRequest({
+									requestedProvider,
+									noFallback,
+									errorType: "network_error",
+									retryCount: retryAttempt,
+									remainingProviders:
+										(routingMetadata?.providerScores.length ?? 0) -
+										failedProviderIds.size -
+										1,
+									usedProvider,
+								});
+								const willRetrySameProvider = sameProviderRetryContext !== null;
+								const willRetryRequest =
+									willRetrySameProvider || willRetryFetch;
+
+								const baseLogEntry = createLogEntry(
+									requestId,
+									project,
+									apiKey,
+									providerKey?.id,
+									usedModelFormatted,
+									usedModelMapping,
+									usedProvider,
+									initialRequestedModel,
+									requestedProvider,
+									messages,
+									temperature,
+									max_tokens,
+									top_p,
+									frequency_penalty,
+									presence_penalty,
+									reasoning_effort,
+									reasoning_max_tokens,
+									effort,
+									response_format,
+									tools,
+									tool_choice,
+									source,
+									customHeaders,
+									debugMode,
+									userAgent,
+									image_config,
+									routingMetadata,
+									rawBody,
+									null, // No response for fetch error
+									requestBody, // The request that resulted in error
+									null, // No upstream response for fetch error
+									fetchErrorPluginIds,
+									undefined, // No plugin results for error case
+								);
+								const attemptLogId = shortid();
+
+								await insertLogEntry({
+									...baseLogEntry,
+									id: attemptLogId,
+									duration: Date.now() - perAttemptStartTime,
+									timeToFirstToken: null, // Not applicable for error case
+									timeToFirstReasoningToken: null, // Not applicable for error case
+									responseSize: 0,
+									content: null,
+									reasoningContent: null,
+									finishReason: "upstream_error",
+									promptTokens: null,
+									completionTokens: null,
+									totalTokens: null,
+									reasoningTokens: null,
+									cachedTokens: null,
+									hasError: true,
+									streamed: true,
+									canceled: false,
+									errorDetails: {
+										statusCode: 0,
+										statusText: error.name,
+										responseText: errorMessage,
+										cause: fetchCause,
+									},
+									cachedInputCost: null,
+									requestCost: null,
+									webSearchCost: null,
+									imageInputTokens: null,
+									imageOutputTokens: null,
+									imageInputCost: null,
+									imageOutputCost: null,
+									discount: null,
+									dataStorageCost: "0",
+									cached: false,
+									toolResults: null,
+									retried: willRetryRequest,
+									retriedByLogId: willRetryRequest ? finalLogId : null,
+								});
+
+								// Report key health for the selected token source
+								if (envVarName !== undefined) {
+									reportKeyError(envVarName, configIndex, 0);
+								}
+								if (providerKey?.id) {
+									reportTrackedKeyError(providerKey.id, 0);
+								}
+
+								if (willRetrySameProvider && sameProviderRetryContext) {
+									routingAttempts.push(
+										buildRoutingAttempt(
+											usedProvider,
+											baseModelName,
+											0,
+											getErrorType(0),
+											false,
+											{
+												region: usedRegion,
+												apiKeyHash: usedApiKeyHash,
+												logId: attemptLogId,
+											},
+										),
+									);
+									applyResolvedProviderContext(sameProviderRetryContext);
+									retryAttempt--;
+									continue;
+								}
+
+								if (willRetryFetch) {
+									routingAttempts.push(
+										buildRoutingAttempt(
+											usedProvider,
+											baseModelName,
+											0,
+											getErrorType(0),
+											false,
+											{
+												region: usedRegion,
+												apiKeyHash: usedApiKeyHash,
+												logId: attemptLogId,
+											},
+										),
+									);
+									failedProviderIds.add(
+										providerRetryKey(usedProvider, usedRegion),
+									);
+									continue;
+								}
+
+								// Send error event to the client
+								await writeSSEAndCache({
+									event: "error",
+									data: JSON.stringify({
+										error: {
+											message: `Failed to connect to provider: ${errorMessage}`,
+											type: "upstream_error",
+											code: "fetch_failed",
+										},
+									}),
+									id: String(eventId++),
+								});
+								await writeSSEAndCache({
+									event: "done",
+									data: "[DONE]",
+									id: String(eventId++),
+								});
+								clearKeepalive();
+								return;
+							} else {
+								throw error;
+							}
+						}
+
+						if (!res.ok) {
+							const rawErrorResponseText = await res.text();
+							const errorResponseText =
+								usedProvider === "aws-bedrock"
+									? extractAwsBedrockHttpError(res, rawErrorResponseText)
+									: rawErrorResponseText;
+
+							// Determine the finish reason for error handling
+							const finishReason = getFinishReasonFromError(
+								res.status,
+								errorResponseText,
+							);
+
+							if (
+								finishReason !== "client_error" &&
+								finishReason !== "content_filter"
+							) {
+								logger.warn("Provider error", {
+									status: res.status,
+									errorText: errorResponseText,
+									usedProvider,
+									requestedProvider,
+									usedModel,
+									initialRequestedModel,
+									organizationId: project.organizationId,
+									projectId: apiKey.projectId,
+									apiKeyId: apiKey.id,
+									unifiedFinishReason: getUnifiedFinishReason(
+										finishReason,
+										usedProvider,
+									),
+								});
+							}
+
+							// Log the request in the database
+							// Extract plugin IDs for logging
+							const streamingErrorPluginIds = plugins?.map((p) => p.id) ?? [];
+
+							let sameProviderRetryContext: Awaited<
+								ReturnType<typeof tryResolveAlternateKeyForCurrentProvider>
+							> | null = null;
+							if (isRetryableErrorType(finishReason)) {
+								rememberFailedKey(usedProvider, usedRegion, {
+									envVarName,
+									configIndex,
+									providerKeyId: providerKey?.id,
+								});
+								sameProviderRetryContext =
+									await tryResolveAlternateKeyForCurrentProvider(true);
+							}
+
+							// Check if we should retry before logging so we can mark the log as retried
+							const willRetryHttpError = shouldRetryRequest({
+								requestedProvider,
 								noFallback,
-								errorType: "upstream_timeout",
+								errorType: finishReason,
 								retryCount: retryAttempt,
 								remainingProviders:
 									(routingMetadata?.providerScores.length ?? 0) -
 									failedProviderIds.size -
 									1,
 								usedProvider,
@@ -4639,7 +5230,7 @@ chat.openapi(completions, async (c) => {
 							});
 							const willRetrySameProvider = sameProviderRetryContext !== null;
 							const willRetryRequest =
-								willRetrySameProvider || willRetryTimeout;
+								willRetrySameProvider || willRetryHttpError;
 
 							const baseLogEntry = createLogEntry(
 								requestId,
@@ -4670,10 +5261,10 @@ chat.openapi(completions, async (c) => {
 								image_config,
 								routingMetadata,
 								rawBody,
-								null, // No response for timeout error
-								requestBody,
-								null, // No upstream response for timeout error
-								timeoutPluginIds,
+								null, // No response for error case
+								requestBody, // The request that was sent and resulted in error
+								null, // No upstream response for error case
+								streamingErrorPluginIds,
 								undefined, // No plugin results for error case
 							);
 							const attemptLogId = shortid();
@@ -4684,24 +5275,38 @@ chat.openapi(completions, async (c) => {
 								duration: Date.now() - perAttemptStartTime,
 								timeToFirstToken: null,
 								timeToFirstReasoningToken: null,
-								responseSize: 0,
+								responseSize: errorResponseText.length,
 								content: null,
 								reasoningContent: null,
-								finishReason: "upstream_error",
-								promptTokens: null,
+								finishReason,
+								promptTokens:
+									finishReason === "content_filter"
+										? (
+												estimateTokens(usedProvider, messages, null, null, 0)
+													.calculatedPromptTokens ?? null
+											)?.toString()
+										: null,
 								completionTokens: null,
-								totalTokens: null,
+								totalTokens:
+									finishReason === "content_filter"
+										? (
+												estimateTokens(usedProvider, messages, null, null, 0)
+													.calculatedPromptTokens ?? null
+											)?.toString()
+										: null,
 								reasoningTokens: null,
 								cachedTokens: null,
-								hasError: true,
+								hasError: finishReason !== "content_filter", // content_filter is not an error
 								streamed: true,
 								canceled: false,
-								errorDetails: {
-									statusCode: 0,
-									statusText: "TimeoutError",
-									responseText: errorMessage,
-									cause: timeoutCause,
-								},
+								errorDetails:
+									finishReason === "content_filter"
+										? null
+										: {
+												statusCode: res.status,
+												statusText: res.statusText,
+												responseText: errorResponseText,
+											},
 								cachedInputCost: null,
 								requestCost: null,
 								webSearchCost: null,
@@ -4717,13 +5322,34 @@ chat.openapi(completions, async (c) => {
 								retriedByLogId: willRetryRequest ? finalLogId : null,
 							});
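+
+							// Note: for content_filter the entry above bills estimated prompt
+							// tokens only; the provider refused before generating output, so
+							// completion tokens stay null and the log is not marked as an error.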
finalLogId : null, }); + // Report key health for the selected token source + // Don't report content_filter as a key error - it's intentional provider behavior + if ( + envVarName !== undefined && + finishReason !== "content_filter" + ) { + reportKeyError( + envVarName, + configIndex, + res.status, + errorResponseText, + ); + } + if (providerKey?.id && finishReason !== "content_filter") { + reportTrackedKeyError( + providerKey.id, + res.status, + errorResponseText, + ); + } + if (willRetrySameProvider && sameProviderRetryContext) { routingAttempts.push( buildRoutingAttempt( usedProvider, baseModelName, - 0, - getErrorType(0), + res.status, + getErrorType(res.status), false, { region: usedRegion, @@ -4737,13 +5363,13 @@ chat.openapi(completions, async (c) => { continue; } - if (willRetryTimeout) { + if (willRetryHttpError) { routingAttempts.push( buildRoutingAttempt( usedProvider, baseModelName, - 0, - getErrorType(0), + res.status, + getErrorType(res.status), false, { region: usedRegion, @@ -4758,193 +5384,103 @@ chat.openapi(completions, async (c) => { continue; } - await stream.writeSSE({ - event: "error", - data: JSON.stringify({ - error: { - message: `Upstream provider timeout: ${errorMessage}`, - type: "upstream_timeout", - code: "timeout", + // For content_filter, return a proper completion chunk (not an error) + // This handles Azure ResponsibleAIPolicyViolation and similar content filtering errors + if (finishReason === "content_filter") { + await writeStreamingContentFilterResponse({ + billingModel: usedModel, + billingProvider: usedProvider, + responseModel: `${usedProvider}/${baseModelName}`, + metadata: { + requested_model: initialRequestedModel, + requested_provider: requestedProvider, + used_model: baseModelName, + used_provider: usedProvider, + ...(usedRegion && { used_region: usedRegion }), + underlying_used_model: usedModel, }, - }), - id: String(eventId++), - }); - return; - } else if (error instanceof Error && error.name === "AbortError") { - // Log the canceled request - // Extract plugin IDs for logging (canceled request) - const canceledPluginIds = plugins?.map((p) => p.id) ?? 
[]; - - // Calculate costs for cancelled request if billing is enabled - const billCancelled = shouldBillCancelledRequests(); - let cancelledCosts: Awaited< - ReturnType - > | null = null; - let estimatedPromptTokens: number | null = null; - - if (billCancelled) { - // Estimate prompt tokens from messages - const tokenEstimation = estimateTokens( - usedProvider, - messages, - null, - null, - null, - ); - estimatedPromptTokens = tokenEstimation.calculatedPromptTokens; + }); + } else { + // For client errors, return the original provider error response + let errorData; + if (finishReason === "client_error") { + try { + errorData = JSON.parse(errorResponseText); + } catch { + // If we can't parse the original error, fall back to our format + errorData = { + error: { + message: `Error from provider: ${res.status} ${res.statusText} ${errorResponseText}`, + type: finishReason, + param: null, + code: finishReason, + responseText: errorResponseText, + }, + }; + } + } else { + errorData = { + error: { + message: `Error from provider: ${res.status} ${res.statusText} ${errorResponseText}`, + type: finishReason, + param: null, + code: finishReason, + responseText: errorResponseText, + }, + }; + } - // Calculate costs based on prompt tokens only (no completion yet) - // If web search tool was enabled, count it as 1 search for billing - cancelledCosts = await calculateCosts( - usedModel, - usedProvider, - estimatedPromptTokens, - 0, // No completion tokens yet - null, // No cached tokens - { - prompt: messages - .map((m) => messageContentToString(m.content)) - .join("\n"), - completion: "", - }, - null, // No reasoning tokens - 0, // No output images - undefined, - inputImageCount, - webSearchTool ? 1 : null, // Bill for web search if it was enabled - project.organizationId, - ); + await writeSSEAndCache({ + event: "error", + data: JSON.stringify(errorData), + id: String(eventId++), + }); + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); } - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - null, // No response for canceled request - requestBody, // The request that was sent before cancellation - null, // No upstream response for canceled request - canceledPluginIds, - undefined, // No plugin results for canceled request - ); - - await insertLogEntry({ - ...baseLogEntry, - duration: Date.now() - perAttemptStartTime, - timeToFirstToken: null, // Not applicable for canceled request - timeToFirstReasoningToken: null, // Not applicable for canceled request - responseSize: 0, - content: null, - reasoningContent: null, - finishReason: "canceled", - promptTokens: billCancelled - ? ( - cancelledCosts?.promptTokens ?? estimatedPromptTokens - )?.toString() - : null, - completionTokens: billCancelled ? "0" : null, - totalTokens: billCancelled - ? ( - cancelledCosts?.promptTokens ?? estimatedPromptTokens - )?.toString() - : null, - reasoningTokens: null, - cachedTokens: null, - hasError: false, - streamed: true, - canceled: true, - errorDetails: null, - inputCost: cancelledCosts?.inputCost ?? null, - outputCost: cancelledCosts?.outputCost ?? 
null, - cachedInputCost: cancelledCosts?.cachedInputCost ?? null, - requestCost: cancelledCosts?.requestCost ?? null, - webSearchCost: cancelledCosts?.webSearchCost ?? null, - imageInputTokens: - cancelledCosts?.imageInputTokens?.toString() ?? null, - imageOutputTokens: - cancelledCosts?.imageOutputTokens?.toString() ?? null, - imageInputCost: cancelledCosts?.imageInputCost ?? null, - imageOutputCost: cancelledCosts?.imageOutputCost ?? null, - cost: cancelledCosts?.totalCost ?? null, - estimatedCost: cancelledCosts?.estimatedCost ?? false, - discount: cancelledCosts?.discount ?? null, - dataStorageCost: billCancelled - ? calculateDataStorageCost( - cancelledCosts?.promptTokens ?? estimatedPromptTokens, - null, - 0, - null, - retentionLevel, - ) - : "0", - cached: false, - toolResults: null, - }); - - // Send a cancellation event to the client - await writeSSEAndCache({ - event: "canceled", - data: JSON.stringify({ - message: "Request canceled by client", - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); clearKeepalive(); return; - } else if (error instanceof Error) { - // Handle fetch errors (timeout, connection failures, etc.) - const errorMessage = error.message; - const fetchCause = extractErrorCause(error); - logger.warn("Fetch error", { - error: errorMessage, - cause: fetchCause, + } + + const inspectedStreamingResponse = + await inspectImmediateStreamingProviderError(res, usedProvider); + res = inspectedStreamingResponse.response; + if (inspectedStreamingResponse.immediateError) { + const { + errorCode, + errorMessage, + errorResponseText, + errorType, + inferredStatusCode, + statusText, + } = inspectedStreamingResponse.immediateError; + + logger.warn("Immediate streaming provider error", { + status: inferredStatusCode, + errorText: errorResponseText, usedProvider, requestedProvider, usedModel, initialRequestedModel, + organizationId: project.organizationId, + projectId: apiKey.projectId, + apiKeyId: apiKey.id, unifiedFinishReason: getUnifiedFinishReason( - "upstream_error", + errorType, usedProvider, ), }); - // Log the error in the database - // Extract plugin IDs for logging (fetch error) - const fetchErrorPluginIds = plugins?.map((p) => p.id) ?? []; + const streamingErrorPluginIds = plugins?.map((p) => p.id) ?? []; let sameProviderRetryContext: Awaited< ReturnType > | null = null; - if (isRetryableErrorType("network_error")) { + if (isRetryableErrorType(errorType)) { rememberFailedKey(usedProvider, usedRegion, { envVarName, configIndex, @@ -4954,11 +5490,10 @@ chat.openapi(completions, async (c) => { await tryResolveAlternateKeyForCurrentProvider(true); } - // Check if we should retry before logging so we can mark the log as retried - const willRetryFetch = shouldRetryRequest({ + const willRetryStreamingError = shouldRetryRequest({ requestedProvider, noFallback, - errorType: "network_error", + errorType, retryCount: retryAttempt, remainingProviders: (routingMetadata?.providerScores.length ?? 
0) - @@ -4967,7 +5502,8 @@ chat.openapi(completions, async (c) => { usedProvider, }); const willRetrySameProvider = sameProviderRetryContext !== null; - const willRetryRequest = willRetrySameProvider || willRetryFetch; + const willRetryRequest = + willRetrySameProvider || willRetryStreamingError; const baseLogEntry = createLogEntry( requestId, @@ -4998,11 +5534,11 @@ chat.openapi(completions, async (c) => { image_config, routingMetadata, rawBody, - null, // No response for fetch error - requestBody, // The request that resulted in error - null, // No upstream response for fetch error - fetchErrorPluginIds, - undefined, // No plugin results for error case + null, + requestBody, + null, + streamingErrorPluginIds, + undefined, ); const attemptLogId = shortid(); @@ -5010,26 +5546,28 @@ chat.openapi(completions, async (c) => { ...baseLogEntry, id: attemptLogId, duration: Date.now() - perAttemptStartTime, - timeToFirstToken: null, // Not applicable for error case - timeToFirstReasoningToken: null, // Not applicable for error case - responseSize: 0, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + responseSize: errorResponseText.length, content: null, reasoningContent: null, - finishReason: "upstream_error", + finishReason: errorType, promptTokens: null, completionTokens: null, totalTokens: null, reasoningTokens: null, cachedTokens: null, - hasError: true, + hasError: errorType !== "content_filter", streamed: true, canceled: false, - errorDetails: { - statusCode: 0, - statusText: error.name, - responseText: errorMessage, - cause: fetchCause, - }, + errorDetails: + errorType === "content_filter" + ? null + : { + statusCode: inferredStatusCode, + statusText, + responseText: errorResponseText, + }, cachedInputCost: null, requestCost: null, webSearchCost: null, @@ -5045,12 +5583,20 @@ chat.openapi(completions, async (c) => { retriedByLogId: willRetryRequest ? 
finalLogId : null, }); - // Report key health for the selected token source - if (envVarName !== undefined) { - reportKeyError(envVarName, configIndex, 0); + if (envVarName !== undefined && errorType !== "content_filter") { + reportKeyError( + envVarName, + configIndex, + inferredStatusCode, + errorResponseText, + ); } - if (providerKey?.id) { - reportTrackedKeyError(providerKey.id, 0); + if (providerKey?.id && errorType !== "content_filter") { + reportTrackedKeyError( + providerKey.id, + inferredStatusCode, + errorResponseText, + ); } if (willRetrySameProvider && sameProviderRetryContext) { @@ -5058,8 +5604,8 @@ chat.openapi(completions, async (c) => { buildRoutingAttempt( usedProvider, baseModelName, - 0, - getErrorType(0), + inferredStatusCode, + getErrorType(inferredStatusCode), false, { region: usedRegion, @@ -5073,13 +5619,13 @@ chat.openapi(completions, async (c) => { continue; } - if (willRetryFetch) { + if (willRetryStreamingError) { routingAttempts.push( buildRoutingAttempt( usedProvider, baseModelName, - 0, - getErrorType(0), + inferredStatusCode, + getErrorType(inferredStatusCode), false, { region: usedRegion, @@ -5094,14 +5640,15 @@ chat.openapi(completions, async (c) => { continue; } - // Send error event to the client await writeSSEAndCache({ event: "error", data: JSON.stringify({ error: { - message: `Failed to connect to provider: ${errorMessage}`, - type: "upstream_error", - code: "fetch_failed", + message: errorMessage, + type: errorType, + code: errorCode, + param: null, + responseText: errorResponseText, }, }), id: String(eventId++), @@ -5113,1553 +5660,1065 @@ chat.openapi(completions, async (c) => { }); clearKeepalive(); return; - } else { - throw error; } - } - if (!res.ok) { - const rawErrorResponseText = await res.text(); - const errorResponseText = - usedProvider === "aws-bedrock" - ? extractAwsBedrockHttpError(res, rawErrorResponseText) - : rawErrorResponseText; - - // Determine the finish reason for error handling - const finishReason = getFinishReasonFromError( - res.status, - errorResponseText, - ); + break; // Fetch succeeded, exit retry loop + } // End of retry for loop - if ( - finishReason !== "client_error" && - finishReason !== "content_filter" - ) { - logger.warn("Provider error", { - status: res.status, - errorText: errorResponseText, + // Add the final attempt (successful or last failed) to routing + if (res && res.ok && usedProvider) { + routingAttempts.push( + buildRoutingAttempt( usedProvider, - requestedProvider, - usedModel, - initialRequestedModel, - organizationId: project.organizationId, - projectId: apiKey.projectId, - apiKeyId: apiKey.id, - unifiedFinishReason: getUnifiedFinishReason( - finishReason, - usedProvider, - ), - }); - } + baseModelName, + res.status, + "none", + true, + { + region: usedRegion, + apiKeyHash: usedApiKeyHash, + logId: finalLogId, + }, + ), + ); + } - // Log the request in the database - // Extract plugin IDs for logging - const streamingErrorPluginIds = plugins?.map((p) => p.id) ?? 
[]; - - let sameProviderRetryContext: Awaited< - ReturnType - > | null = null; - if (isRetryableErrorType(finishReason)) { - rememberFailedKey(usedProvider, usedRegion, { - envVarName, - configIndex, - providerKeyId: providerKey?.id, - }); - sameProviderRetryContext = - await tryResolveAlternateKeyForCurrentProvider(true); - } + // Update routingMetadata with all routing attempts for DB logging + if (routingMetadata) { + // Enrich providerScores with failure info from routing attempts + const failedMap = new Map( + routingAttempts + .filter((a) => !a.succeeded) + .map((f) => [f.provider, f]), + ); + routingMetadata = { + ...routingMetadata, + routing: routingAttempts, + providerScores: routingMetadata.providerScores.map((score) => { + const failure = failedMap.get(score.providerId); + if (failure) { + return { + ...score, + failed: true, + status_code: failure.status_code, + error_type: failure.error_type, + }; + } + return score; + }), + }; + } - // Check if we should retry before logging so we can mark the log as retried - const willRetryHttpError = shouldRetryRequest({ - requestedProvider, - noFallback, - errorType: finishReason, - retryCount: retryAttempt, - remainingProviders: - (routingMetadata?.providerScores.length ?? 0) - - failedProviderIds.size - - 1, - usedProvider, + // If all retries exhausted without a successful response + if (!res || !res.ok) { + await writeSSEAndCache({ + event: "error", + data: JSON.stringify({ + error: { + message: "All provider attempts failed", + type: "upstream_error", + code: "all_providers_failed", + }, + }), + id: String(eventId++), + }); + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), }); - const willRetrySameProvider = sameProviderRetryContext !== null; - const willRetryRequest = - willRetrySameProvider || willRetryHttpError; + clearKeepalive(); + return; + } - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - null, // No response for error case - requestBody, // The request that was sent and resulted in error - null, // No upstream response for error case - streamingErrorPluginIds, - undefined, // No plugin results for error case - ); - const attemptLogId = shortid(); + // After retry loop: narrow provider variables for the rest of the streaming body + if ( + !usedProvider || + !usedToken || + !url || + !usedModelFormatted || + !usedModelMapping + ) { + throw new Error("Provider context not initialized"); + } - await insertLogEntry({ - ...baseLogEntry, - id: attemptLogId, - duration: Date.now() - perAttemptStartTime, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: errorResponseText.length, - content: null, - reasoningContent: null, - finishReason, - promptTokens: - finishReason === "content_filter" - ? ( - estimateTokens(usedProvider, messages, null, null, 0) - .calculatedPromptTokens ?? null - )?.toString() - : null, - completionTokens: null, - totalTokens: - finishReason === "content_filter" - ? ( - estimateTokens(usedProvider, messages, null, null, 0) - .calculatedPromptTokens ?? 
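The providerScores enrichment above reads cleanly as a pure function. A minimal sketch, assuming each score carries a providerId; note that when a provider failed more than once, the Map keeps the last attempt, which matches the construction above:

interface ProviderScore {
  providerId: string;
  failed?: boolean;
  status_code?: number;
  error_type?: string;
}

interface AttemptSummary {
  provider: string;
  succeeded: boolean;
  status_code: number;
  error_type: string;
}

function enrichScores(
  scores: ProviderScore[],
  attempts: AttemptSummary[],
): ProviderScore[] {
  // Later failures for the same provider overwrite earlier ones.
  const failedMap = new Map(
    attempts.filter((a) => !a.succeeded).map((a) => [a.provider, a] as const),
  );
  return scores.map((score) => {
    const failure = failedMap.get(score.providerId);
    return failure
      ? {
          ...score,
          failed: true,
          status_code: failure.status_code,
          error_type: failure.error_type,
        }
      : score;
  });
}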
null - )?.toString() - : null, - reasoningTokens: null, - cachedTokens: null, - hasError: finishReason !== "content_filter", // content_filter is not an error - streamed: true, - canceled: false, - errorDetails: - finishReason === "content_filter" - ? null - : { - statusCode: res.status, - statusText: res.statusText, - responseText: errorResponseText, - }, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - retried: willRetryRequest, - retriedByLogId: willRetryRequest ? finalLogId : null, + if (!res.body) { + await writeSSEAndCache({ + event: "error", + data: JSON.stringify({ + error: { + message: "No response body from provider", + type: "gateway_error", + param: null, + code: "gateway_error", + }, + }), + id: String(eventId++), + }); + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), }); + clearKeepalive(); + return; + } - // Report key health for the selected token source - // Don't report content_filter as a key error - it's intentional provider behavior - if (envVarName !== undefined && finishReason !== "content_filter") { - reportKeyError( - envVarName, - configIndex, - res.status, - errorResponseText, - ); - } - if (providerKey?.id && finishReason !== "content_filter") { - reportTrackedKeyError( - providerKey.id, - res.status, - errorResponseText, - ); - } + const reader = res.body.getReader(); + let fullContent = ""; + let fullReasoningContent = ""; + let finishReason = null; + let promptTokens = null; + let completionTokens = null; + let totalTokens = null; + let reasoningTokens = null; + let cachedTokens = null; + let cacheCreationTokens: number | null = null; + let cacheCreation5mTokens: number | null = null; + let cacheCreation1hTokens: number | null = null; + let streamingToolCalls = null; + let imageByteSize = 0; // Track total image data size for token estimation + let outputImageCount = 0; // Track number of output images for cost calculation + let webSearchCount = 0; // Track web search calls for cost calculation + const serverToolUseIndices = new Set(); // Track Anthropic server_tool_use block indices + let sawUpstreamDoneSentinel = false; + let sawProviderTerminalEvent = false; + let sawOpenAiResponsesDoneEvent = false; + let sawOpenAiResponsesCompletedStatus = false; + let sentDownstreamFinishReasonChunk = false; + let handledTerminalProviderEvent = false; + let buffer = ""; // Buffer for accumulating partial data across chunks (string for SSE) + let binaryBuffer = new Uint8Array(0); // Buffer for binary event streams (AWS Bedrock) + let rawUpstreamData = ""; // Raw data received from upstream provider + const isAwsBedrock = usedProvider === "aws-bedrock"; + const taggedReasoningStreamState = { + inReasoning: false, + pending: "", + }; + let shouldTerminateStream = false; - if (willRetrySameProvider && sameProviderRetryContext) { - routingAttempts.push( - buildRoutingAttempt( - usedProvider, - baseModelName, - res.status, - getErrorType(res.status), - false, - { - region: usedRegion, - apiKeyHash: usedApiKeyHash, - logId: attemptLogId, - }, - ), - ); - applyResolvedProviderContext(sameProviderRetryContext); - retryAttempt--; - continue; - } + // Response healing for streaming mode + const streamingResponseHealingEnabled = plugins?.some( + (p) => p.id === "response-healing", + ); + const streamingIsJsonResponseFormat = + 
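The response-healing setup that starts here (and concludes just below) collapses into a single predicate. A sketch with illustrative parameter names; the conditions mirror the inline expression, and only apply when the client asked for a JSON response format:

type ResponseFormat = { type?: "text" | "json_object" | "json_schema" };

function shouldBufferStreamForHealing(opts: {
  provider: string;
  responseFormat?: ResponseFormat;
  healingPluginEnabled: boolean;
  splitTaggedReasoning: boolean;
}): boolean {
  const isJson =
    opts.responseFormat?.type === "json_object" ||
    opts.responseFormat?.type === "json_schema";
  if (!isJson) return false;
  // anthropic and aws-bedrock need help specifically for json_object mode.
  const providerNeedsJsonObjectHelp =
    (opts.provider === "anthropic" || opts.provider === "aws-bedrock") &&
    opts.responseFormat?.type === "json_object";
  return (
    opts.healingPluginEnabled ||
    providerNeedsJsonObjectHelp ||
    opts.provider === "novita" ||
    opts.splitTaggedReasoning
  );
}

Buffering is the price of healing: chunks cannot be forwarded verbatim if the gateway may need to replay a repaired JSON body at the end of the stream.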
response_format?.type === "json_object" || + response_format?.type === "json_schema"; + const shouldBufferForHealing = + streamingIsJsonResponseFormat && + (streamingResponseHealingEnabled === true || + (usedProvider === "anthropic" && + response_format?.type === "json_object") || + (usedProvider === "aws-bedrock" && + response_format?.type === "json_object") || + usedProvider === "novita" || + splitTaggedReasoning); + + // Buffer for storing chunks when healing is enabled + // We need to buffer content, track last chunk info, and replay healed content at the end + const bufferedContentChunks: string[] = []; + let lastChunkId: string | null = null; + let lastChunkModel: string | null = null; + let lastChunkCreated: number | null = null; + const streamingPluginResults: { + responseHealing?: { + healed: boolean; + healingMethod?: string; + }; + } = {}; - if (willRetryHttpError) { - routingAttempts.push( - buildRoutingAttempt( - usedProvider, - baseModelName, - res.status, - getErrorType(res.status), - false, - { - region: usedRegion, - apiKeyHash: usedApiKeyHash, - logId: attemptLogId, - }, - ), - ); - failedProviderIds.add(providerRetryKey(usedProvider, usedRegion)); - continue; - } + try { + while (true) { + const { done, value } = await reader.read(); + if (done) { + break; + } - // For content_filter, return a proper completion chunk (not an error) - // This handles Azure ResponsibleAIPolicyViolation and similar content filtering errors - if (finishReason === "content_filter") { - await writeStreamingContentFilterResponse({ - billingModel: usedModel, - billingProvider: usedProvider, - responseModel: `${usedProvider}/${baseModelName}`, - metadata: { - requested_model: initialRequestedModel, - requested_provider: requestedProvider, - used_model: baseModelName, - used_provider: usedProvider, - ...(usedRegion && { used_region: usedRegion }), - underlying_used_model: usedModel, - }, - }); - } else { - // For client errors, return the original provider error response - let errorData; - if (finishReason === "client_error") { - try { - errorData = JSON.parse(errorResponseText); - } catch { - // If we can't parse the original error, fall back to our format - errorData = { - error: { - message: `Error from provider ${usedProvider}: ${res.status} ${res.statusText} ${errorResponseText}`, - type: finishReason, - param: null, - code: finishReason, - responseText: errorResponseText, - }, - }; + // For AWS Bedrock, convert binary event stream to SSE format + let chunk: string; + if (isAwsBedrock) { + // Append binary data to buffer + const newBuffer = new Uint8Array( + binaryBuffer.length + value.length, + ); + newBuffer.set(binaryBuffer); + newBuffer.set(value, binaryBuffer.length); + binaryBuffer = newBuffer; + + // Parse and convert available events + const { sse, bytesConsumed } = + convertAwsEventStreamToSSE(binaryBuffer); + chunk = sse; + + // Remove consumed bytes from binary buffer + if (bytesConsumed > 0) { + binaryBuffer = binaryBuffer.slice(bytesConsumed); } } else { - errorData = { - error: { - message: `Error from provider ${usedProvider}: ${res.status} ${res.statusText} ${errorResponseText}`, - type: finishReason, - param: null, - code: finishReason, - responseText: errorResponseText, - }, - }; + // Convert the Uint8Array to a string for SSE + chunk = sharedTextDecoder.decode(value, { stream: true }); } - await writeSSEAndCache({ - event: "error", - data: JSON.stringify(errorData), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: 
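The AWS Bedrock branch above uses a grow-then-consume byte buffer so partial binary events survive across reads. A sketch of the pattern; decode stands in for convertAwsEventStreamToSSE, whose { sse, bytesConsumed } contract is taken from the call site:

// Append a new network chunk to the accumulated binary buffer.
function appendBytes(buffer: Uint8Array, chunk: Uint8Array): Uint8Array {
  const next = new Uint8Array(buffer.length + chunk.length);
  next.set(buffer);
  next.set(chunk, buffer.length);
  return next;
}

// Convert whatever complete events are available and keep the remainder
// buffered; bytesConsumed tells us how much of the buffer was decodable.
function drain(
  buffer: Uint8Array,
  decode: (b: Uint8Array) => { sse: string; bytesConsumed: number },
): { sse: string; rest: Uint8Array } {
  const { sse, bytesConsumed } = decode(buffer);
  return {
    sse,
    rest: bytesConsumed > 0 ? buffer.slice(bytesConsumed) : buffer,
  };
}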
String(eventId++), - }); - } - - clearKeepalive(); - return; - } - - const inspectedStreamingResponse = - await inspectImmediateStreamingProviderError(res, usedProvider); - res = inspectedStreamingResponse.response; - if (inspectedStreamingResponse.immediateError) { - const { - errorCode, - errorMessage, - errorResponseText, - errorType, - inferredStatusCode, - statusText, - } = inspectedStreamingResponse.immediateError; - - logger.warn("Immediate streaming provider error", { - status: inferredStatusCode, - errorText: errorResponseText, - usedProvider, - requestedProvider, - usedModel, - initialRequestedModel, - organizationId: project.organizationId, - projectId: apiKey.projectId, - apiKeyId: apiKey.id, - unifiedFinishReason: getUnifiedFinishReason( - errorType, - usedProvider, - ), - }); - - const streamingErrorPluginIds = plugins?.map((p) => p.id) ?? []; - - let sameProviderRetryContext: Awaited< - ReturnType - > | null = null; - if (isRetryableErrorType(errorType)) { - rememberFailedKey(usedProvider, usedRegion, { - envVarName, - configIndex, - providerKeyId: providerKey?.id, - }); - sameProviderRetryContext = - await tryResolveAlternateKeyForCurrentProvider(true); - } - - const willRetryStreamingError = shouldRetryRequest({ - requestedProvider, - noFallback, - errorType, - retryCount: retryAttempt, - remainingProviders: - (routingMetadata?.providerScores.length ?? 0) - - failedProviderIds.size - - 1, - usedProvider, - }); - const willRetrySameProvider = sameProviderRetryContext !== null; - const willRetryRequest = - willRetrySameProvider || willRetryStreamingError; - - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - null, - requestBody, - null, - streamingErrorPluginIds, - undefined, - ); - const attemptLogId = shortid(); - - await insertLogEntry({ - ...baseLogEntry, - id: attemptLogId, - duration: Date.now() - perAttemptStartTime, - timeToFirstToken: null, - timeToFirstReasoningToken: null, - responseSize: errorResponseText.length, - content: null, - reasoningContent: null, - finishReason: errorType, - promptTokens: null, - completionTokens: null, - totalTokens: null, - reasoningTokens: null, - cachedTokens: null, - hasError: errorType !== "content_filter", - streamed: true, - canceled: false, - errorDetails: - errorType === "content_filter" - ? null - : { - statusCode: inferredStatusCode, - statusText, - responseText: errorResponseText, - }, - cachedInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - discount: null, - dataStorageCost: "0", - cached: false, - toolResults: null, - retried: willRetryRequest, - retriedByLogId: willRetryRequest ? 
finalLogId : null, - }); - - if (envVarName !== undefined && errorType !== "content_filter") { - reportKeyError( - envVarName, - configIndex, - inferredStatusCode, - errorResponseText, - ); - } - if (providerKey?.id && errorType !== "content_filter") { - reportTrackedKeyError( - providerKey.id, - inferredStatusCode, - errorResponseText, - ); - } - - if (willRetrySameProvider && sameProviderRetryContext) { - routingAttempts.push( - buildRoutingAttempt( - usedProvider, - baseModelName, - inferredStatusCode, - getErrorType(inferredStatusCode), - false, - { - region: usedRegion, - apiKeyHash: usedApiKeyHash, - logId: attemptLogId, - }, - ), - ); - applyResolvedProviderContext(sameProviderRetryContext); - retryAttempt--; - continue; - } - - if (willRetryStreamingError) { - routingAttempts.push( - buildRoutingAttempt( - usedProvider, - baseModelName, - inferredStatusCode, - getErrorType(inferredStatusCode), - false, - { - region: usedRegion, - apiKeyHash: usedApiKeyHash, - logId: attemptLogId, - }, - ), - ); - failedProviderIds.add(providerRetryKey(usedProvider, usedRegion)); - continue; - } - - await writeSSEAndCache({ - event: "error", - data: JSON.stringify({ - error: { - message: errorMessage, - type: errorType, - code: errorCode, - param: null, - responseText: errorResponseText, - }, - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - clearKeepalive(); - return; - } - - break; // Fetch succeeded, exit retry loop - } // End of retry for loop - - // Add the final attempt (successful or last failed) to routing - if (res && res.ok && usedProvider) { - routingAttempts.push( - buildRoutingAttempt( - usedProvider, - baseModelName, - res.status, - "none", - true, - { - region: usedRegion, - apiKeyHash: usedApiKeyHash, - logId: finalLogId, - }, - ), - ); - } - - // Update routingMetadata with all routing attempts for DB logging - if (routingMetadata) { - // Enrich providerScores with failure info from routing attempts - const failedMap = new Map( - routingAttempts - .filter((a) => !a.succeeded) - .map((f) => [f.provider, f]), - ); - routingMetadata = { - ...routingMetadata, - routing: routingAttempts, - providerScores: routingMetadata.providerScores.map((score) => { - const failure = failedMap.get(score.providerId); - if (failure) { - return { - ...score, - failed: true, - status_code: failure.status_code, - error_type: failure.error_type, - }; + // Log error on large chunks (1MB+) - should almost never happen + if (chunk.length > 1024 * 1024) { + logger.error( + `Large chunk received: ${(chunk.length / 1024 / 1024).toFixed(2)}MB`, + ); } - return score; - }), - }; - } - - // If all retries exhausted without a successful response - if (!res || !res.ok) { - await writeSSEAndCache({ - event: "error", - data: JSON.stringify({ - error: { - message: "All provider attempts failed", - type: "upstream_error", - code: "all_providers_failed", - }, - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - clearKeepalive(); - return; - } - - // After retry loop: narrow provider variables for the rest of the streaming body - if ( - !usedProvider || - !usedToken || - !url || - !usedModelFormatted || - !usedModelMapping - ) { - throw new Error("Provider context not initialized"); - } - - if (!res.body) { - await writeSSEAndCache({ - event: "error", - data: JSON.stringify({ - error: { - message: "No response body from provider", - type: "gateway_error", - param: 
null, - code: "gateway_error", - }, - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - clearKeepalive(); - return; - } - const reader = res.body.getReader(); - let fullContent = ""; - let fullReasoningContent = ""; - let finishReason = null; - let promptTokens = null; - let completionTokens = null; - let totalTokens = null; - let reasoningTokens = null; - let cachedTokens = null; - let cacheCreationTokens: number | null = null; - let cacheCreation5mTokens: number | null = null; - let cacheCreation1hTokens: number | null = null; - let streamingToolCalls = null; - let imageByteSize = 0; // Track total image data size for token estimation - let outputImageCount = 0; // Track number of output images for cost calculation - let webSearchCount = 0; // Track web search calls for cost calculation - const serverToolUseIndices = new Set(); // Track Anthropic server_tool_use block indices - let sawUpstreamDoneSentinel = false; - let sawProviderTerminalEvent = false; - let sawOpenAiResponsesDoneEvent = false; - let sawOpenAiResponsesCompletedStatus = false; - let sentDownstreamFinishReasonChunk = false; - let handledTerminalProviderEvent = false; - let buffer = ""; // Buffer for accumulating partial data across chunks (string for SSE) - let binaryBuffer = new Uint8Array(0); // Buffer for binary event streams (AWS Bedrock) - let rawUpstreamData = ""; // Raw data received from upstream provider - const isAwsBedrock = usedProvider === "aws-bedrock"; - const taggedReasoningStreamState = { - inReasoning: false, - pending: "", - }; - let shouldTerminateStream = false; - - // Response healing for streaming mode - const streamingResponseHealingEnabled = plugins?.some( - (p) => p.id === "response-healing", - ); - const streamingIsJsonResponseFormat = - response_format?.type === "json_object" || - response_format?.type === "json_schema"; - const shouldBufferForHealing = - streamingIsJsonResponseFormat && - (streamingResponseHealingEnabled === true || - (usedProvider === "anthropic" && - response_format?.type === "json_object") || - (usedProvider === "aws-bedrock" && - response_format?.type === "json_object") || - usedProvider === "novita" || - splitTaggedReasoning); - - // Buffer for storing chunks when healing is enabled - // We need to buffer content, track last chunk info, and replay healed content at the end - const bufferedContentChunks: string[] = []; - let lastChunkId: string | null = null; - let lastChunkModel: string | null = null; - let lastChunkCreated: number | null = null; - const streamingPluginResults: { - responseHealing?: { - healed: boolean; - healingMethod?: string; - }; - } = {}; - - try { - while (true) { - const { done, value } = await reader.read(); - if (done) { - break; - } - - // For AWS Bedrock, convert binary event stream to SSE format - let chunk: string; - if (isAwsBedrock) { - // Append binary data to buffer - const newBuffer = new Uint8Array( - binaryBuffer.length + value.length, - ); - newBuffer.set(binaryBuffer); - newBuffer.set(value, binaryBuffer.length); - binaryBuffer = newBuffer; - - // Parse and convert available events - const { sse, bytesConsumed } = - convertAwsEventStreamToSSE(binaryBuffer); - chunk = sse; - - // Remove consumed bytes from binary buffer - if (bytesConsumed > 0) { - binaryBuffer = binaryBuffer.slice(bytesConsumed); + buffer += chunk; + // Collect raw upstream data for logging only in debug mode and within size limit + if (debugMode && rawUpstreamData.length < 
MAX_RAW_DATA_SIZE) { + rawUpstreamData += chunk; } - } else { - // Convert the Uint8Array to a string for SSE - chunk = sharedTextDecoder.decode(value, { stream: true }); - } - // Log error on large chunks (1MB+) - should almost never happen - if (chunk.length > 1024 * 1024) { - logger.error( - `Large chunk received: ${(chunk.length / 1024 / 1024).toFixed(2)}MB`, - ); - } - - buffer += chunk; - // Collect raw upstream data for logging only in debug mode and within size limit - if (debugMode && rawUpstreamData.length < MAX_RAW_DATA_SIZE) { - rawUpstreamData += chunk; - } - - // Check buffer size to prevent memory exhaustion - if (buffer.length > MAX_BUFFER_SIZE) { - const bufferSizeMB = MAX_BUFFER_SIZE / 1024 / 1024; - logger.error( - `Buffer size exceeded ${bufferSizeMB}MB limit, aborting stream`, - ); - - // Send error to client - try { - await stream.writeSSE({ - event: "error", - data: JSON.stringify({ - error: { - message: `Streaming buffer exceeded ${bufferSizeMB}MB limit`, - type: "gateway_error", - param: null, - code: "buffer_overflow", - }, - }), - id: String(eventId++), - }); - await stream.writeSSE({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - doneSent = true; - } catch (sseError) { + // Check buffer size to prevent memory exhaustion + if (buffer.length > MAX_BUFFER_SIZE) { + const bufferSizeMB = MAX_BUFFER_SIZE / 1024 / 1024; logger.error( - "Failed to send buffer overflow error SSE", - sseError instanceof Error - ? sseError - : new Error(String(sseError)), + `Buffer size exceeded ${bufferSizeMB}MB limit, aborting stream`, ); - } - // Set error for logging - streamingError = { - message: `Streaming buffer exceeded ${bufferSizeMB}MB limit`, - type: "buffer_overflow", - code: "buffer_overflow", - details: { - bufferSize: buffer.length, - maxBufferSize: MAX_BUFFER_SIZE, - provider: usedProvider, - model: usedModel, - }, - }; + // Send error to client + try { + await stream.writeSSE({ + event: "error", + data: JSON.stringify({ + error: { + message: `Streaming buffer exceeded ${bufferSizeMB}MB limit`, + type: "gateway_error", + param: null, + code: "buffer_overflow", + }, + }), + id: String(eventId++), + }); + await stream.writeSSE({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } catch (sseError) { + logger.error( + "Failed to send buffer overflow error SSE", + sseError instanceof Error + ? 
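Two different caps are in play here: MAX_RAW_DATA_SIZE only stops debug capture, while MAX_BUFFER_SIZE aborts the stream (converted just below into an SSE error event plus a structured streamingError). A sketch with placeholder constants; the real values are defined elsewhere in this file:

const MAX_RAW_DATA_SIZE_SKETCH = 1 * 1024 * 1024; // placeholder value
const MAX_BUFFER_SIZE_SKETCH = 10 * 1024 * 1024; // placeholder value

// Soft cap: once reached, simply stop accumulating debug data.
function capDebugCapture(captured: string, chunk: string): string {
  return captured.length < MAX_RAW_DATA_SIZE_SKETCH ? captured + chunk : captured;
}

// Hard cap: the handler above does not throw; it emits an SSE error and
// breaks the read loop. A throw is used here only to keep the sketch short.
function assertBufferWithinLimit(buffer: string): void {
  if (buffer.length > MAX_BUFFER_SIZE_SKETCH) {
    throw new Error(
      `Streaming buffer exceeded ${MAX_BUFFER_SIZE_SKETCH / 1024 / 1024}MB limit`,
    );
  }
}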
sseError + : new Error(String(sseError)), + ); + } - break; - } + // Set error for logging + streamingError = { + message: `Streaming buffer exceeded ${bufferSizeMB}MB limit`, + type: "buffer_overflow", + code: "buffer_overflow", + details: { + bufferSize: buffer.length, + maxBufferSize: MAX_BUFFER_SIZE, + provider: usedProvider, + model: usedModel, + }, + }; - // Process SSE events from buffer - let processedLength = 0; - const bufferCopy = buffer; + break; + } - // Look for complete SSE events, handling events at buffer start - let searchStart = 0; - while (searchStart < bufferCopy.length) { - // Find "data: " - could be at start of buffer or after newline - let dataIndex = -1; + // Process SSE events from buffer + let processedLength = 0; + const bufferCopy = buffer; - if (searchStart === 0 && bufferCopy.startsWith("data: ")) { - // Event at buffer start - dataIndex = 0; - } else { - // Look for "\ndata: " pattern - const newlineDataIndex = bufferCopy.indexOf( - "\ndata: ", - searchStart, - ); - if (newlineDataIndex !== -1) { - dataIndex = newlineDataIndex + 1; // Skip the newline + // Look for complete SSE events, handling events at buffer start + let searchStart = 0; + while (searchStart < bufferCopy.length) { + // Find "data: " - could be at start of buffer or after newline + let dataIndex = -1; + + if (searchStart === 0 && bufferCopy.startsWith("data: ")) { + // Event at buffer start + dataIndex = 0; + } else { + // Look for "\ndata: " pattern + const newlineDataIndex = bufferCopy.indexOf( + "\ndata: ", + searchStart, + ); + if (newlineDataIndex !== -1) { + dataIndex = newlineDataIndex + 1; // Skip the newline + } } - } - if (dataIndex === -1) { - break; - } + if (dataIndex === -1) { + break; + } - // Find the end of this SSE event - // Look for next event or proper event termination - let eventEnd = -1; + // Find the end of this SSE event + // Look for next event or proper event termination + let eventEnd = -1; - // First, look for the next "data: " event (after a newline) - const nextEventIndex = bufferCopy.indexOf( - "\ndata: ", - dataIndex + 6, - ); - if (nextEventIndex !== -1) { - // Found next data event, but we still need to check if there are SSE fields in between - // For Anthropic, we might have: data: {...}\n\nevent: something\n\ndata: {...} - const betweenEvents = bufferCopy.slice( + // First, look for the next "data: " event (after a newline) + const nextEventIndex = bufferCopy.indexOf( + "\ndata: ", dataIndex + 6, - nextEventIndex, ); - const firstNewline = betweenEvents.indexOf("\n"); - - if (firstNewline !== -1) { - // Check if JSON up to first newline is valid - const jsonCandidate = betweenEvents - .slice(0, firstNewline) - .trim(); - // Quick heuristic check before expensive JSON.parse - let isValidJson = false; - if (mightBeCompleteJson(jsonCandidate)) { - try { - JSON.parse(jsonCandidate); - isValidJson = true; - } catch { - // JSON is not complete + if (nextEventIndex !== -1) { + // Found next data event, but we still need to check if there are SSE fields in between + // For Anthropic, we might have: data: {...}\n\nevent: something\n\ndata: {...} + const betweenEvents = bufferCopy.slice( + dataIndex + 6, + nextEventIndex, + ); + const firstNewline = betweenEvents.indexOf("\n"); + + if (firstNewline !== -1) { + // Check if JSON up to first newline is valid + const jsonCandidate = betweenEvents + .slice(0, firstNewline) + .trim(); + // Quick heuristic check before expensive JSON.parse + let isValidJson = false; + if (mightBeCompleteJson(jsonCandidate)) { + try { 
+ JSON.parse(jsonCandidate); + isValidJson = true; + } catch { + // JSON is not complete + } + } + if (isValidJson) { + // JSON is valid - end at first newline to exclude SSE fields + eventEnd = dataIndex + 6 + firstNewline; + } else { + // JSON is not complete, use the full segment to next data event + eventEnd = nextEventIndex; } - } - if (isValidJson) { - // JSON is valid - end at first newline to exclude SSE fields - eventEnd = dataIndex + 6 + firstNewline; } else { - // JSON is not complete, use the full segment to next data event + // No newline found, use full segment eventEnd = nextEventIndex; } } else { - // No newline found, use full segment - eventEnd = nextEventIndex; - } - } else { - // No next event found - check for proper event termination - // SSE events should end with at least one newline - const eventStartPos = dataIndex + 6; // Start of event data - - // For Anthropic SSE format, we need to be more careful about event boundaries - // Try to find the end of the JSON data by looking for the closing brace - const newlinePos = bufferCopy.indexOf("\n", eventStartPos); - if (newlinePos !== -1) { - // We found a newline - check if the JSON before it is valid - const jsonCandidate = bufferCopy - .slice(eventStartPos, newlinePos) - .trim(); - // Quick heuristic check before expensive JSON.parse - let isValidJson = false; - if (mightBeCompleteJson(jsonCandidate)) { - try { - JSON.parse(jsonCandidate); - isValidJson = true; - } catch { - // JSON is not complete + // No next event found - check for proper event termination + // SSE events should end with at least one newline + const eventStartPos = dataIndex + 6; // Start of event data + + // For Anthropic SSE format, we need to be more careful about event boundaries + // Try to find the end of the JSON data by looking for the closing brace + const newlinePos = bufferCopy.indexOf("\n", eventStartPos); + if (newlinePos !== -1) { + // We found a newline - check if the JSON before it is valid + const jsonCandidate = bufferCopy + .slice(eventStartPos, newlinePos) + .trim(); + // Quick heuristic check before expensive JSON.parse + let isValidJson = false; + if (mightBeCompleteJson(jsonCandidate)) { + try { + JSON.parse(jsonCandidate); + isValidJson = true; + } catch { + // JSON is not complete + } } - } - if (isValidJson) { - // JSON is valid - this newline marks the end of our data - eventEnd = newlinePos; - } else { - // JSON is not valid, check if there's more content after the newline - if (newlinePos + 1 >= bufferCopy.length) { - // Newline is at the end of buffer - event is incomplete - break; + if (isValidJson) { + // JSON is valid - this newline marks the end of our data + eventEnd = newlinePos; } else { - // There's content after the newline - // Check if it's another SSE field (like event:, id:, retry:, etc.) or if the event continues - const restOfBuffer = bufferCopy.slice(newlinePos + 1); - - // Check for SSE field patterns (event:, id:, retry:, etc.) - // Skip leading newlines efficiently without creating new strings - let trimStart = 0; - while ( - trimStart < restOfBuffer.length && - restOfBuffer[trimStart] === "\n" - ) { - trimStart++; - } + // JSON is not valid, check if there's more content after the newline + if (newlinePos + 1 >= bufferCopy.length) { + // Newline is at the end of buffer - event is incomplete + break; + } else { + // There's content after the newline + // Check if it's another SSE field (like event:, id:, retry:, etc.) 
or if the event continues + const restOfBuffer = bufferCopy.slice(newlinePos + 1); + + // Check for SSE field patterns (event:, id:, retry:, etc.) + // Skip leading newlines efficiently without creating new strings + let trimStart = 0; + while ( + trimStart < restOfBuffer.length && + restOfBuffer[trimStart] === "\n" + ) { + trimStart++; + } - if ( - restOfBuffer.startsWith("\n") || // Empty line - end of event - restOfBuffer.startsWith("data: ") // Next data field - ) { - // This is the end of our data event - eventEnd = newlinePos; - } else if (trimStart > 0) { - // Had leading newlines - check for SSE fields after them - const afterNewlines = restOfBuffer.substring(trimStart); if ( - afterNewlines.startsWith("event:") || - afterNewlines.startsWith("id:") || - afterNewlines.startsWith("retry:") || - SSE_FIELD_PATTERN.test(afterNewlines) + restOfBuffer.startsWith("\n") || // Empty line - end of event + restOfBuffer.startsWith("data: ") // Next data field ) { + // This is the end of our data event eventEnd = newlinePos; + } else if (trimStart > 0) { + // Had leading newlines - check for SSE fields after them + const afterNewlines = + restOfBuffer.substring(trimStart); + if ( + afterNewlines.startsWith("event:") || + afterNewlines.startsWith("id:") || + afterNewlines.startsWith("retry:") || + SSE_FIELD_PATTERN.test(afterNewlines) + ) { + eventEnd = newlinePos; + } else { + // Content continues on next line - use full buffer + eventEnd = bufferCopy.length; + } } else { - // Content continues on next line - use full buffer - eventEnd = bufferCopy.length; - } - } else { - // No leading newlines - check SSE field directly - if (SSE_FIELD_PATTERN.test(restOfBuffer)) { - eventEnd = newlinePos; - } else { - // Content continues on next line - use full buffer - eventEnd = bufferCopy.length; + // No leading newlines - check SSE field directly + if (SSE_FIELD_PATTERN.test(restOfBuffer)) { + eventEnd = newlinePos; + } else { + // Content continues on next line - use full buffer + eventEnd = bufferCopy.length; + } } } } - } - } else { - // No newline found after event data - event is incomplete - // Try to detect if we have a complete JSON object - const eventDataCandidate = bufferCopy.slice(eventStartPos); - if (eventDataCandidate.length > 0) { - // Quick heuristic check before expensive JSON.parse - const trimmedCandidate = eventDataCandidate.trim(); - if (mightBeCompleteJson(trimmedCandidate)) { - try { - JSON.parse(trimmedCandidate); - // If we can parse it, it's complete - eventEnd = bufferCopy.length; - } catch { - // JSON parsing failed - event is incomplete + } else { + // No newline found after event data - event is incomplete + // Try to detect if we have a complete JSON object + const eventDataCandidate = bufferCopy.slice(eventStartPos); + if (eventDataCandidate.length > 0) { + // Quick heuristic check before expensive JSON.parse + const trimmedCandidate = eventDataCandidate.trim(); + if (mightBeCompleteJson(trimmedCandidate)) { + try { + JSON.parse(trimmedCandidate); + // If we can parse it, it's complete + eventEnd = bufferCopy.length; + } catch { + // JSON parsing failed - event is incomplete + break; + } + } else { + // Heuristic says incomplete - don't bother parsing break; } } else { - // Heuristic says incomplete - don't bother parsing + // No event data yet break; } - } else { - // No event data yet - break; } } - } - const eventData = bufferCopy - .slice(dataIndex + 6, eventEnd) - .trim(); + const eventData = bufferCopy + .slice(dataIndex + 6, eventEnd) + .trim(); - // Debug logging 
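The boundary scan above calls mightBeCompleteJson before every JSON.parse so that obviously partial events are skipped cheaply. Its implementation is not part of this diff; the sketch below is one plausible heuristic with the same contract, paired with the parse-confirm step used at the call sites:

// Cheap pre-check: a complete JSON object/array must open and close with
// matching brackets. False negatives are impossible for the event shapes
// seen here (objects or arrays); false positives are caught by JSON.parse.
function mightBeCompleteJsonSketch(s: string): boolean {
  if (s.length < 2) return false;
  const first = s[0];
  const last = s[s.length - 1];
  if (first === "{") return last === "}";
  if (first === "[") return last === "]";
  return false;
}

function isCompleteJsonEvent(candidate: string): boolean {
  if (!mightBeCompleteJsonSketch(candidate)) return false;
  try {
    JSON.parse(candidate);
    return true;
  } catch {
    return false;
  }
}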
for troublesome events - // Only scan for SSE field contamination on small events to avoid - // O(n) scans on multi-MB payloads (e.g. base64 image data). - // Large events (>64KB) are almost always valid image/binary data. - if ( - eventData.length < 65536 && - (eventData.includes("event:") || eventData.includes("id:")) - ) { - logger.warn("Event data contains SSE field", { - eventData: - eventData.substring(0, 200) + - (eventData.length > 200 ? "..." : ""), - dataIndex, - eventEnd, - bufferLength: bufferCopy.length, - provider: usedProvider, - }); - } - - if (eventData === "[DONE]") { - sawUpstreamDoneSentinel = true; - // Set default finish_reason if not provided by the stream - // Some providers (like Novita) don't send finish_reason in streaming chunks - if (finishReason === null) { - // Default to "stop" unless we have tool calls - finishReason = - streamingToolCalls && streamingToolCalls.length > 0 - ? "tool_calls" - : "stop"; + // Debug logging for troublesome events + // Only scan for SSE field contamination on small events to avoid + // O(n) scans on multi-MB payloads (e.g. base64 image data). + // Large events (>64KB) are almost always valid image/binary data. + if ( + eventData.length < 65536 && + (eventData.includes("event:") || eventData.includes("id:")) + ) { + logger.warn("Event data contains SSE field", { + eventData: + eventData.substring(0, 200) + + (eventData.length > 200 ? "..." : ""), + dataIndex, + eventEnd, + bufferLength: bufferCopy.length, + provider: usedProvider, + }); } - // Calculate final usage if we don't have complete data - let finalPromptTokens = promptTokens; - let finalCompletionTokens = completionTokens; - let finalTotalTokens = totalTokens; + if (eventData === "[DONE]") { + sawUpstreamDoneSentinel = true; + // Set default finish_reason if not provided by the stream + // Some providers (like Novita) don't send finish_reason in streaming chunks + if (finishReason === null) { + // Default to "stop" unless we have tool calls + finishReason = + streamingToolCalls && streamingToolCalls.length > 0 + ? "tool_calls" + : "stop"; + } - // Estimate missing tokens if needed using helper function - if (finalPromptTokens === null || finalPromptTokens === 0) { - const estimation = estimateTokens( - usedProvider, - messages, - null, - null, - null, - ); - finalPromptTokens = estimation.calculatedPromptTokens; - } + // Calculate final usage if we don't have complete data + let finalPromptTokens = promptTokens; + let finalCompletionTokens = completionTokens; + let finalTotalTokens = totalTokens; - if (finalCompletionTokens === null) { - const textTokens = estimateTokensFromContent(fullContent); - // For images, estimate ~258 tokens per image + 1 token per 750 bytes - // This is based on Google's image token calculation - let imageTokens = 0; - if (imageByteSize > 0) { - // Base tokens per image (258) + additional tokens based on size - imageTokens = 258 + Math.ceil(imageByteSize / 750); + // Estimate missing tokens if needed using helper function + if (finalPromptTokens === null || finalPromptTokens === 0) { + const estimation = estimateTokens( + usedProvider, + messages, + null, + null, + null, + ); + finalPromptTokens = estimation.calculatedPromptTokens; } - finalCompletionTokens = textTokens + imageTokens; - } - if (finalTotalTokens === null) { - finalTotalTokens = - (finalPromptTokens ?? 0) + - (finalCompletionTokens ?? 0) + - (reasoningTokens ?? 
0); - } + if (finalCompletionTokens === null) { + const textTokens = estimateTokensFromContent(fullContent); + // For images, estimate ~258 tokens per image + 1 token per 750 bytes + // This is based on Google's image token calculation + let imageTokens = 0; + if (imageByteSize > 0) { + // Base tokens per image (258) + additional tokens based on size + imageTokens = 258 + Math.ceil(imageByteSize / 750); + } + finalCompletionTokens = textTokens + imageTokens; + } - // Send final usage chunk before [DONE] if we have any usage data - if ( - finalPromptTokens !== null || - finalCompletionTokens !== null || - finalTotalTokens !== null - ) { - // Calculate costs for streaming response - const streamingCosts = await calculateCosts( - usedModel, - usedProvider, - finalPromptTokens, - finalCompletionTokens, - cachedTokens, - { - prompt: messages - .map((m) => messageContentToString(m.content)) - .join("\n"), - completion: fullContent, - toolResults: streamingToolCalls ?? undefined, - }, - reasoningTokens, - outputImageCount, - image_config?.image_size, - inputImageCount, - webSearchCount, - project.organizationId, - image_config?.image_quality, - { - cacheWriteTokens: cacheCreationTokens, - cacheWrite1hTokens: cacheCreation1hTokens, - }, - ); - streamingCosts.dataStorageCost = toDataStorageCostNumber( - streamingCosts.promptTokens ?? finalPromptTokens, - cachedTokens, - streamingCosts.completionTokens ?? finalCompletionTokens, - reasoningTokens, - retentionLevel, - ); + if (finalTotalTokens === null) { + finalTotalTokens = + (finalPromptTokens ?? 0) + + (finalCompletionTokens ?? 0) + + (reasoningTokens ?? 0); + } + + // Send final usage chunk before [DONE] if we have any usage data + if ( + finalPromptTokens !== null || + finalCompletionTokens !== null || + finalTotalTokens !== null + ) { + // Calculate costs for streaming response + const streamingCosts = await calculateCosts( + usedModel, + usedProvider, + finalPromptTokens, + finalCompletionTokens, + cachedTokens, + { + prompt: messages + .map((m) => messageContentToString(m.content)) + .join("\n"), + completion: fullContent, + toolResults: streamingToolCalls ?? undefined, + }, + reasoningTokens, + outputImageCount, + image_config?.image_size, + inputImageCount, + webSearchCount, + project.organizationId, + image_config?.image_quality, + { + cacheWriteTokens: cacheCreationTokens, + cacheWrite1hTokens: cacheCreation1hTokens, + }, + ); + streamingCosts.dataStorageCost = toDataStorageCostNumber( + streamingCosts.promptTokens ?? finalPromptTokens, + cachedTokens, + streamingCosts.completionTokens ?? finalCompletionTokens, + reasoningTokens, + retentionLevel, + ); - // Include costs in response for all users - const shouldIncludeCosts = true; + // Include costs in response for all users + const shouldIncludeCosts = true; - const finalStreamUsage: Record = { - prompt_tokens: Math.max( - 1, - streamingCosts.promptTokens ?? finalPromptTokens ?? 1, - ), - completion_tokens: - streamingCosts.completionTokens ?? - finalCompletionTokens ?? - 0, - total_tokens: Math.max( - 1, - (streamingCosts.promptTokens ?? finalPromptTokens ?? 0) + - (streamingCosts.completionTokens ?? - finalCompletionTokens ?? + const finalStreamUsage: Record = { + prompt_tokens: Math.max( + 1, + streamingCosts.promptTokens ?? finalPromptTokens ?? 1, + ), + completion_tokens: + streamingCosts.completionTokens ?? + finalCompletionTokens ?? + 0, + total_tokens: Math.max( + 1, + (streamingCosts.promptTokens ?? + finalPromptTokens ?? 0) + - (reasoningTokens ?? 
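The completion-token fallback above is easy to sanity-check by hand: for a single 60 KB image (61,440 bytes), 258 + ceil(61440 / 750) = 258 + 82 = 340 tokens. A sketch that mirrors the code, including the detail that the 258 base is applied once to the accumulated imageByteSize rather than once per image:

// Fallback when the provider never reported completion tokens: text tokens
// estimated from the generated content, plus image tokens from total bytes
// (258 base + 1 token per 750 bytes, per the Google-based comment above).
function estimateCompletionTokensSketch(
  textTokens: number,
  imageByteSize: number,
): number {
  const imageTokens =
    imageByteSize > 0 ? 258 + Math.ceil(imageByteSize / 750) : 0;
  return textTokens + imageTokens;
}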
0), - ), - ...(reasoningTokens !== null && - reasoningTokens > 0 && { - reasoning_tokens: reasoningTokens, + (streamingCosts.completionTokens ?? + finalCompletionTokens ?? + 0) + + (reasoningTokens ?? 0), + ), + ...(reasoningTokens !== null && + reasoningTokens > 0 && { + reasoning_tokens: reasoningTokens, + }), + ...((cachedTokens !== null || + (cacheCreationTokens !== null && + cacheCreationTokens > 0)) && { + prompt_tokens_details: { + cached_tokens: cachedTokens ?? 0, + ...(cacheCreationTokens !== null && + cacheCreationTokens > 0 && { + cache_creation_tokens: cacheCreationTokens, + }), + ...(cacheCreationTokens !== null && + cacheCreationTokens > 0 && + (cacheCreation5mTokens !== null || + cacheCreation1hTokens !== null) && { + cache_creation: { + ephemeral_5m_input_tokens: + cacheCreation5mTokens ?? + Math.max( + 0, + cacheCreationTokens - + (cacheCreation1hTokens ?? 0), + ), + ephemeral_1h_input_tokens: + cacheCreation1hTokens ?? 0, + }, + }), + }, }), - ...((cachedTokens !== null || - (cacheCreationTokens !== null && - cacheCreationTokens > 0)) && { - prompt_tokens_details: { - cached_tokens: cachedTokens ?? 0, - ...(cacheCreationTokens !== null && - cacheCreationTokens > 0 && { - cache_creation_tokens: cacheCreationTokens, - }), - ...(cacheCreationTokens !== null && - cacheCreationTokens > 0 && - (cacheCreation5mTokens !== null || - cacheCreation1hTokens !== null) && { - cache_creation: { - ephemeral_5m_input_tokens: - cacheCreation5mTokens ?? - Math.max( - 0, - cacheCreationTokens - - (cacheCreation1hTokens ?? 0), - ), - ephemeral_1h_input_tokens: - cacheCreation1hTokens ?? 0, - }, + }; + applyExtendedUsageFields(finalStreamUsage, { + costs: shouldIncludeCosts + ? { + inputCost: streamingCosts.inputCost, + outputCost: streamingCosts.outputCost, + cachedInputCost: streamingCosts.cachedInputCost, + cacheWriteInputCost: + streamingCosts.cacheWriteInputCost, + requestCost: streamingCosts.requestCost, + webSearchCost: streamingCosts.webSearchCost, + imageInputCost: streamingCosts.imageInputCost, + imageOutputCost: streamingCosts.imageOutputCost, + totalCost: streamingCosts.totalCost, + dataStorageCost: streamingCosts.dataStorageCost, + } + : null, + cachedTokens, + cacheCreationTokens, + reasoningTokens, + }); + const finalUsageChunk = { + id: `chatcmpl-${Date.now()}`, + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model: usedModel, + choices: [ + { + index: 0, + delta: {}, + finish_reason: null, + }, + ], + usage: finalStreamUsage, + }; + + await writeSSEAndCache({ + data: JSON.stringify(finalUsageChunk), + id: String(eventId++), + }); + } + + if (!shouldBufferForHealing) { + if (splitTaggedReasoning) { + const flushedRemainder = flushTaggedStreamingRemainder( + taggedReasoningStreamState, + ); + if ( + flushedRemainder.content || + flushedRemainder.reasoning + ) { + await writeSSEAndCache({ + data: JSON.stringify({ + id: `chatcmpl-${Date.now()}`, + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model: usedModel, + choices: [ + { + index: 0, + delta: { + ...(flushedRemainder.content && { + content: flushedRemainder.content, + }), + ...(flushedRemainder.reasoning && { + reasoning: flushedRemainder.reasoning, + }), + }, + }, + ], }), + id: String(eventId++), + }); + } + } + + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } + + processedLength = eventEnd; + } else { + // Try to parse JSON data - it might span multiple lines + let data; + try { + data = 
JSON.parse(eventData); + } catch (e) { + // If JSON parsing fails, this might be an incomplete event + // Since we already validated JSON completeness above, this is likely a format issue + // Create structured error for logging + streamingError = { + message: e instanceof Error ? e.message : String(e), + type: "json_parse_error", + code: "json_parse_error", + details: { + name: e instanceof Error ? e.name : "ParseError", + eventData: eventData.substring(0, 5000), + provider: usedProvider, + model: usedModel, + eventLength: eventData.length, + bufferEnd: eventEnd, + bufferLength: bufferCopy.length, + timestamp: new Date().toISOString(), }, - }), - }; - applyExtendedUsageFields(finalStreamUsage, { - costs: shouldIncludeCosts - ? { - inputCost: streamingCosts.inputCost, - outputCost: streamingCosts.outputCost, - cachedInputCost: streamingCosts.cachedInputCost, - cacheWriteInputCost: - streamingCosts.cacheWriteInputCost, - requestCost: streamingCosts.requestCost, - webSearchCost: streamingCosts.webSearchCost, - imageInputCost: streamingCosts.imageInputCost, - imageOutputCost: streamingCosts.imageOutputCost, - totalCost: streamingCosts.totalCost, - dataStorageCost: streamingCosts.dataStorageCost, - } - : null, - cachedTokens, - cacheCreationTokens, - reasoningTokens, - }); - const finalUsageChunk = { - id: `chatcmpl-${Date.now()}`, - object: "chat.completion.chunk", - created: Math.floor(Date.now() / 1000), - model: usedModel, - choices: [ - { - index: 0, - delta: {}, - finish_reason: null, - }, - ], - usage: finalStreamUsage, - }; + }; + logger.warn("Failed to parse streaming JSON", { + error: e instanceof Error ? e.message : String(e), + eventData: + eventData.substring(0, 200) + + (eventData.length > 200 ? "..." : ""), + provider: usedProvider, + eventLength: eventData.length, + bufferEnd: eventEnd, + bufferLength: bufferCopy.length, + }); - await writeSSEAndCache({ - data: JSON.stringify(finalUsageChunk), - id: String(eventId++), - }); - } + processedLength = eventEnd; + searchStart = eventEnd; + continue; + } - if (!shouldBufferForHealing) { - if (splitTaggedReasoning) { - const flushedRemainder = flushTaggedStreamingRemainder( - taggedReasoningStreamState, - ); + const awsBedrockStreamError = + usedProvider === "aws-bedrock" + ? extractAwsBedrockStreamError(data) + : null; + if ( + data && + typeof data === "object" && + "response" in data && + data.response && + typeof data.response === "object" && + "status" in data.response && + data.response.status === "completed" + ) { + sawOpenAiResponsesCompletedStatus = true; + } + if ( + data && + typeof data === "object" && + "type" in data && + typeof data.type === "string" && + (data.type === "response.content_part.done" || + data.type === "response.output_item.done" || + data.type === "response.output_text.done") + ) { + sawOpenAiResponsesDoneEvent = true; + } + const openAiCompatibleStreamError = + !awsBedrockStreamError && + data && + typeof data === "object" && + "error" in data && + data.error && + typeof data.error === "object" + ? 
(data.error as Record) + : null; + if (openAiCompatibleStreamError) { + const errorResponseText = JSON.stringify(data); if ( - flushedRemainder.content || - flushedRemainder.reasoning + debugMode && + streamingRawResponseData.length < MAX_RAW_DATA_SIZE ) { + const rawProviderSseEvent = `data: ${errorResponseText}\n\n`; + streamingRawResponseData += rawProviderSseEvent.substring( + 0, + Math.max( + 0, + MAX_RAW_DATA_SIZE - streamingRawResponseData.length, + ), + ); + } + const inferredStatusCode = inferStreamingErrorStatusCode( + openAiCompatibleStreamError, + errorResponseText, + ); + const errorType = getFinishReasonFromError( + inferredStatusCode, + errorResponseText, + ); + const errorMessage = + typeof openAiCompatibleStreamError.message === "string" + ? openAiCompatibleStreamError.message + : "Upstream provider returned a streaming error"; + const errorCode = + typeof openAiCompatibleStreamError.code === "string" + ? openAiCompatibleStreamError.code + : typeof openAiCompatibleStreamError.type === "string" + ? openAiCompatibleStreamError.type + : errorType; + + logger.info("[streaming] Provider SSE error received", { + requestId, + provider: usedProvider, + model: usedModel, + errorType, + errorCode, + inferredStatusCode, + errorMessage, + errorPayload: errorResponseText.substring(0, 5000), + }); + + finishReason = errorType; + + if (errorType === "content_filter") { + await writeStreamingContentFilterResponse({ + billingModel: usedModel, + billingProvider: usedProvider, + responseModel: data.model ?? usedModel, + }); + handledTerminalProviderEvent = true; + } else { + streamingError = { + message: errorMessage, + type: errorType, + code: errorCode, + details: { + statusCode: inferredStatusCode, + statusText: + typeof openAiCompatibleStreamError.type === "string" + ? openAiCompatibleStreamError.type + : "stream_error", + responseText: errorResponseText, + }, + }; + await writeSSEAndCache({ + event: "error", data: JSON.stringify({ - id: `chatcmpl-${Date.now()}`, - object: "chat.completion.chunk", - created: Math.floor(Date.now() / 1000), - model: usedModel, - choices: [ - { - index: 0, - delta: { - ...(flushedRemainder.content && { - content: flushedRemainder.content, - }), - ...(flushedRemainder.reasoning && { - reasoning: flushedRemainder.reasoning, - }), - }, - }, - ], + error: { + message: errorMessage, + type: errorType, + code: errorCode, + param: + "param" in openAiCompatibleStreamError + ? (openAiCompatibleStreamError.param ?? null) + : null, + responseText: errorResponseText, + }, }), id: String(eventId++), }); } - } - - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - doneSent = true; - } - - processedLength = eventEnd; - } else { - // Try to parse JSON data - it might span multiple lines - let data; - try { - data = JSON.parse(eventData); - } catch (e) { - // If JSON parsing fails, this might be an incomplete event - // Since we already validated JSON completeness above, this is likely a format issue - // Create structured error for logging - streamingError = { - message: e instanceof Error ? e.message : String(e), - type: "json_parse_error", - code: "json_parse_error", - details: { - name: e instanceof Error ? 
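The in-stream error branch above derives a message and code through a fallback chain: error.message, then error.code, then error.type, then the classified finish reason. A sketch of just that chain; inferStreamingErrorStatusCode and getFinishReasonFromError are the real helpers and are not reimplemented here:

type StreamError = Record<string, unknown>;

// errorType is the already-classified finish reason for this error and
// serves as the last-resort code when the payload names nothing usable.
function describeStreamError(
  err: StreamError,
  errorType: string,
): { message: string; code: string } {
  const message =
    typeof err.message === "string"
      ? err.message
      : "Upstream provider returned a streaming error";
  const code =
    typeof err.code === "string"
      ? err.code
      : typeof err.type === "string"
        ? err.type
        : errorType;
  return { message, code };
}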
e.name : "ParseError", - eventData: eventData.substring(0, 5000), - provider: usedProvider, - model: usedModel, - eventLength: eventData.length, - bufferEnd: eventEnd, - bufferLength: bufferCopy.length, - timestamp: new Date().toISOString(), - }, - }; - logger.warn("Failed to parse streaming JSON", { - error: e instanceof Error ? e.message : String(e), - eventData: - eventData.substring(0, 200) + - (eventData.length > 200 ? "..." : ""), - provider: usedProvider, - eventLength: eventData.length, - bufferEnd: eventEnd, - bufferLength: bufferCopy.length, - }); - - processedLength = eventEnd; - searchStart = eventEnd; - continue; - } - const awsBedrockStreamError = - usedProvider === "aws-bedrock" - ? extractAwsBedrockStreamError(data) - : null; - if ( - data && - typeof data === "object" && - "response" in data && - data.response && - typeof data.response === "object" && - "status" in data.response && - data.response.status === "completed" - ) { - sawOpenAiResponsesCompletedStatus = true; - } - if ( - data && - typeof data === "object" && - "type" in data && - typeof data.type === "string" && - (data.type === "response.content_part.done" || - data.type === "response.output_item.done" || - data.type === "response.output_text.done") - ) { - sawOpenAiResponsesDoneEvent = true; - } - const openAiCompatibleStreamError = - !awsBedrockStreamError && - data && - typeof data === "object" && - "error" in data && - data.error && - typeof data.error === "object" - ? (data.error as Record) - : null; - if (openAiCompatibleStreamError) { - const errorResponseText = JSON.stringify(data); - if ( - debugMode && - streamingRawResponseData.length < MAX_RAW_DATA_SIZE - ) { - const rawProviderSseEvent = `data: ${errorResponseText}\n\n`; - streamingRawResponseData += rawProviderSseEvent.substring( - 0, - Math.max( - 0, - MAX_RAW_DATA_SIZE - streamingRawResponseData.length, - ), - ); + if (!doneSent) { + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } + shouldTerminateStream = true; + processedLength = eventEnd; + searchStart = eventEnd; + break; } - const inferredStatusCode = inferStreamingErrorStatusCode( - openAiCompatibleStreamError, - errorResponseText, - ); - const errorType = getFinishReasonFromError( - inferredStatusCode, - errorResponseText, - ); - const errorMessage = - typeof openAiCompatibleStreamError.message === "string" - ? openAiCompatibleStreamError.message - : "Upstream provider returned a streaming error"; - const errorCode = - typeof openAiCompatibleStreamError.code === "string" - ? openAiCompatibleStreamError.code - : typeof openAiCompatibleStreamError.type === "string" - ? openAiCompatibleStreamError.type - : errorType; - - logger.info("[streaming] Provider SSE error received", { - requestId, - provider: usedProvider, - model: usedModel, - errorType, - errorCode, - inferredStatusCode, - errorMessage, - errorPayload: errorResponseText.substring(0, 5000), - }); - - finishReason = errorType; + if (awsBedrockStreamError) { + const errorType = getFinishReasonFromError( + awsBedrockStreamError.statusCode, + awsBedrockStreamError.responseText, + ); - if (errorType === "content_filter") { - await writeStreamingContentFilterResponse({ - billingModel: usedModel, - billingProvider: usedProvider, - responseModel: data.model ?? 
usedModel, - }); - handledTerminalProviderEvent = true; - } else { streamingError = { - message: errorMessage, + message: awsBedrockStreamError.message, type: errorType, - code: errorCode, + code: awsBedrockStreamError.eventType, details: { - statusCode: inferredStatusCode, - statusText: - typeof openAiCompatibleStreamError.type === "string" - ? openAiCompatibleStreamError.type - : "stream_error", - responseText: errorResponseText, + statusCode: awsBedrockStreamError.statusCode, + statusText: awsBedrockStreamError.eventType, + responseText: awsBedrockStreamError.responseText, }, }; + finishReason = errorType; await writeSSEAndCache({ event: "error", data: JSON.stringify({ error: { - message: errorMessage, + message: awsBedrockStreamError.message, type: errorType, - code: errorCode, - param: - "param" in openAiCompatibleStreamError - ? (openAiCompatibleStreamError.param ?? null) - : null, - responseText: errorResponseText, + code: awsBedrockStreamError.eventType, + param: null, + responseText: awsBedrockStreamError.responseText, }, }), id: String(eventId++), }); - } - - if (!doneSent) { await writeSSEAndCache({ event: "done", data: "[DONE]", id: String(eventId++), }); doneSent = true; + shouldTerminateStream = true; + processedLength = eventEnd; + searchStart = eventEnd; + break; } - shouldTerminateStream = true; - processedLength = eventEnd; - searchStart = eventEnd; - break; - } - if (awsBedrockStreamError) { - const errorType = getFinishReasonFromError( - awsBedrockStreamError.statusCode, - awsBedrockStreamError.responseText, - ); - - streamingError = { - message: awsBedrockStreamError.message, - type: errorType, - code: awsBedrockStreamError.eventType, - details: { - statusCode: awsBedrockStreamError.statusCode, - statusText: awsBedrockStreamError.eventType, - responseText: awsBedrockStreamError.responseText, - }, - }; - finishReason = errorType; - await writeSSEAndCache({ - event: "error", - data: JSON.stringify({ - error: { - message: awsBedrockStreamError.message, - type: errorType, - code: awsBedrockStreamError.eventType, - param: null, - responseText: awsBedrockStreamError.responseText, - }, - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - doneSent = true; - shouldTerminateStream = true; - processedLength = eventEnd; - searchStart = eventEnd; - break; - } + // Transform streaming responses to OpenAI format for all providers + const transformedData = transformStreamingToOpenai( + usedProvider, + usedModel, + data, + messages, + serverToolUseIndices, + supportsReasoning, + ); - // Transform streaming responses to OpenAI format for all providers - const transformedData = transformStreamingToOpenai( - usedProvider, - usedModel, - data, - messages, - serverToolUseIndices, - supportsReasoning, - ); + // Skip null events (some providers have non-data events) + if (!transformedData) { + processedLength = eventEnd; + searchStart = eventEnd; + continue; + } - // Skip null events (some providers have non-data events) - if (!transformedData) { - processedLength = eventEnd; - searchStart = eventEnd; - continue; - } + if (splitTaggedReasoning) { + const deltaContent = + transformedData.choices?.[0]?.delta?.content; - if (splitTaggedReasoning) { - const deltaContent = - transformedData.choices?.[0]?.delta?.content; + if ( + typeof deltaContent === "string" && + deltaContent.length > 0 + ) { + const splitChunk = splitTaggedStreamingContentChunk( + deltaContent, + taggedReasoningStreamState, + ); - if ( - typeof 
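splitTaggedStreamingContentChunk, used just below, routes tagged spans of the content delta into the reasoning field. A simplified sketch assuming <think>...</think> delimiters; the real helper also buffers partial tags across chunk boundaries via the pending field of taggedReasoningStreamState, which this version deliberately omits:

interface TaggedState {
  inReasoning: boolean;
}

// Walks one delta chunk, toggling between content and reasoning whenever a
// complete open/close tag is found. Tags split across chunks are not handled.
function splitTaggedChunkSketch(
  chunk: string,
  state: TaggedState,
): { content: string; reasoning: string } {
  let content = "";
  let reasoning = "";
  let rest = chunk;
  while (rest.length > 0) {
    const tag = state.inReasoning ? "</think>" : "<think>";
    const idx = rest.indexOf(tag);
    if (idx === -1) {
      if (state.inReasoning) reasoning += rest;
      else content += rest;
      break;
    }
    const before = rest.slice(0, idx);
    if (state.inReasoning) reasoning += before;
    else content += before;
    state.inReasoning = !state.inReasoning;
    rest = rest.slice(idx + tag.length);
  }
  return { content, reasoning };
}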
deltaContent === "string" && - deltaContent.length > 0 - ) { - const splitChunk = splitTaggedStreamingContentChunk( - deltaContent, - taggedReasoningStreamState, - ); + if (splitChunk.content) { + transformedData.choices[0].delta.content = + splitChunk.content; + } else { + delete transformedData.choices[0].delta.content; + } - if (splitChunk.content) { - transformedData.choices[0].delta.content = - splitChunk.content; - } else { - delete transformedData.choices[0].delta.content; + if (splitChunk.reasoning) { + transformedData.choices[0].delta.reasoning = + (transformedData.choices[0].delta.reasoning ?? "") + + splitChunk.reasoning; + } } + } - if (splitChunk.reasoning) { - transformedData.choices[0].delta.reasoning = - (transformedData.choices[0].delta.reasoning ?? "") + - splitChunk.reasoning; + // For Anthropic, if we have partial usage data, complete it + if (usedProvider === "anthropic" && transformedData.usage) { + const usage = transformedData.usage; + if ( + usage.output_tokens !== undefined && + usage.prompt_tokens === undefined + ) { + // Estimate prompt tokens if not provided + const estimation = estimateTokens( + usedProvider, + messages, + null, + null, + null, + ); + const estimatedPromptTokens = + estimation.calculatedPromptTokens; + transformedData.usage = { + prompt_tokens: estimatedPromptTokens, + completion_tokens: usage.output_tokens, + total_tokens: + estimatedPromptTokens + usage.output_tokens, + }; } } - } - // For Anthropic, if we have partial usage data, complete it - if (usedProvider === "anthropic" && transformedData.usage) { - const usage = transformedData.usage; - if ( - usage.output_tokens !== undefined && - usage.prompt_tokens === undefined - ) { - // Estimate prompt tokens if not provided - const estimation = estimateTokens( + // For Google providers, add usage information when available + if (isGoogleCompatibleProvider(usedProvider)) { + const usage = extractTokenUsage( + data, usedProvider, - messages, - null, - null, - null, + fullContent, + imageByteSize, ); - const estimatedPromptTokens = - estimation.calculatedPromptTokens; - transformedData.usage = { - prompt_tokens: estimatedPromptTokens, - completion_tokens: usage.output_tokens, - total_tokens: estimatedPromptTokens + usage.output_tokens, - }; - } - } - - // For Google providers, add usage information when available - if (isGoogleCompatibleProvider(usedProvider)) { - const usage = extractTokenUsage( - data, - usedProvider, - fullContent, - imageByteSize, - ); - // If we have usage data from Google, add it to the streaming chunk - if ( - usage.promptTokens !== null || - usage.completionTokens !== null || - usage.totalTokens !== null - ) { - transformedData.usage = { - prompt_tokens: usage.promptTokens ?? 0, - completion_tokens: usage.completionTokens ?? 0, - total_tokens: usage.totalTokens ?? 0, - ...(usage.reasoningTokens !== null && { - reasoning_tokens: usage.reasoningTokens, - }), - }; + // If we have usage data from Google, add it to the streaming chunk + if ( + usage.promptTokens !== null || + usage.completionTokens !== null || + usage.totalTokens !== null + ) { + transformedData.usage = { + prompt_tokens: usage.promptTokens ?? 0, + completion_tokens: usage.completionTokens ?? 0, + total_tokens: usage.totalTokens ?? 
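
// [Editor's note] The Anthropic and Google branches above converge on the OpenAI usage
// shape, zero-filling whatever the provider left out. A sketch of that normalization,
// assuming nullable provider-side counters (simplified types, not the gateway's own):
interface ProviderUsage {
  promptTokens: number | null;
  completionTokens: number | null;
  totalTokens: number | null;
  reasoningTokens: number | null;
}

function toOpenAiUsage(u: ProviderUsage) {
  const prompt = u.promptTokens ?? 0;
  const completion = u.completionTokens ?? 0;
  return {
    prompt_tokens: prompt,
    completion_tokens: completion,
    total_tokens: u.totalTokens ?? prompt + completion,
    // reasoning_tokens is only attached when the provider actually reported it
    ...(u.reasoningTokens !== null && { reasoning_tokens: u.reasoningTokens }),
  };
}
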
0, + ...(usage.reasoningTokens !== null && { + reasoning_tokens: usage.reasoningTokens, + }), + }; + } } - } - // Normalize usage.prompt_tokens_details to always include cached_tokens - if (transformedData.usage) { - if (transformedData.usage.prompt_tokens_details) { - // Preserve all existing keys and only default cached_tokens - transformedData.usage.prompt_tokens_details = { - ...transformedData.usage.prompt_tokens_details, - cached_tokens: - transformedData.usage.prompt_tokens_details - .cached_tokens ?? 0, - }; - } else { - // Create prompt_tokens_details with cached_tokens set to 0 - transformedData.usage.prompt_tokens_details = { - cached_tokens: 0, - }; + // Normalize usage.prompt_tokens_details to always include cached_tokens + if (transformedData.usage) { + if (transformedData.usage.prompt_tokens_details) { + // Preserve all existing keys and only default cached_tokens + transformedData.usage.prompt_tokens_details = { + ...transformedData.usage.prompt_tokens_details, + cached_tokens: + transformedData.usage.prompt_tokens_details + .cached_tokens ?? 0, + }; + } else { + // Create prompt_tokens_details with cached_tokens set to 0 + transformedData.usage.prompt_tokens_details = { + cached_tokens: 0, + }; + } } - } - // For Anthropic streaming tool calls, enrich delta chunks with id/type/name - // from the initial content_block_start event. This ensures OpenAI SDK compatibility. - if (usedProvider === "anthropic") { - const toolCalls = - transformedData.choices?.[0]?.delta?.tool_calls; - if (toolCalls && toolCalls.length > 0) { - // First, extract tool calls to update our tracking - const rawToolCalls = extractToolCalls(data, usedProvider); - if (rawToolCalls && rawToolCalls.length > 0) { - streamingToolCalls ??= []; - for (const newCall of rawToolCalls) { - // For content_block_start events (have id), add to tracking - if (newCall.id) { - const contentBlockIndex: number = - typeof data.index === "number" - ? data.index - : streamingToolCalls.length; - // Store at the content block index position - streamingToolCalls[contentBlockIndex] = { - ...newCall, - _contentBlockIndex: contentBlockIndex, - }; - } - // For content_block_delta events, enrich with stored id/type/name - else if (newCall._contentBlockIndex !== undefined) { - const existingCall = - streamingToolCalls[newCall._contentBlockIndex]; - if (existingCall) { - // Enrich the transformed data with id, type, and function.name - for (const tc of toolCalls) { - if (tc.index === newCall._contentBlockIndex) { - tc.id = existingCall.id; - tc.type = "function"; - tc.function ??= {}; - tc.function.name = existingCall.function.name; + // For Anthropic streaming tool calls, enrich delta chunks with id/type/name + // from the initial content_block_start event. This ensures OpenAI SDK compatibility. + if (usedProvider === "anthropic") { + const toolCalls = + transformedData.choices?.[0]?.delta?.tool_calls; + if (toolCalls && toolCalls.length > 0) { + // First, extract tool calls to update our tracking + const rawToolCalls = extractToolCalls(data, usedProvider); + if (rawToolCalls && rawToolCalls.length > 0) { + streamingToolCalls ??= []; + for (const newCall of rawToolCalls) { + // For content_block_start events (have id), add to tracking + if (newCall.id) { + const contentBlockIndex: number = + typeof data.index === "number" + ? 
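
// [Editor's note] The cached_tokens defaulting above is a spread-with-fallback so that
// provider-specific keys survive while cached_tokens is always present. Minimal sketch:
type UsageWithDetails = {
  prompt_tokens_details?: { cached_tokens?: number; [key: string]: unknown };
};

function ensureCachedTokens(usage: UsageWithDetails): void {
  usage.prompt_tokens_details = {
    ...usage.prompt_tokens_details, // keep any keys the provider already set
    cached_tokens: usage.prompt_tokens_details?.cached_tokens ?? 0,
  };
}
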
data.index + : streamingToolCalls.length; + // Store at the content block index position + streamingToolCalls[contentBlockIndex] = { + ...newCall, + _contentBlockIndex: contentBlockIndex, + }; + } + // For content_block_delta events, enrich with stored id/type/name + else if (newCall._contentBlockIndex !== undefined) { + const existingCall = + streamingToolCalls[newCall._contentBlockIndex]; + if (existingCall) { + // Enrich the transformed data with id, type, and function.name + for (const tc of toolCalls) { + if (tc.index === newCall._contentBlockIndex) { + tc.id = existingCall.id; + tc.type = "function"; + tc.function ??= {}; + tc.function.name = existingCall.function.name; + } } } } @@ -6667,696 +6726,796 @@ chat.openapi(completions, async (c) => { } } } - } - // When buffering for healing, strip content from chunks and buffer it - // We still send metadata (usage, finish_reason, tool_calls) but buffer text content - if (shouldBufferForHealing) { - const deltaContent = - transformedData.choices?.[0]?.delta?.content; - if (deltaContent) { - bufferedContentChunks.push(deltaContent); - // Store chunk metadata for later use when sending healed content - lastChunkId = transformedData.id ?? lastChunkId; - lastChunkModel = transformedData.model ?? lastChunkModel; - lastChunkCreated = - transformedData.created ?? lastChunkCreated; - } + // When buffering for healing, strip content from chunks and buffer it + // We still send metadata (usage, finish_reason, tool_calls) but buffer text content + if (shouldBufferForHealing) { + const deltaContent = + transformedData.choices?.[0]?.delta?.content; + if (deltaContent) { + bufferedContentChunks.push(deltaContent); + // Store chunk metadata for later use when sending healed content + lastChunkId = transformedData.id ?? lastChunkId; + lastChunkModel = transformedData.model ?? lastChunkModel; + lastChunkCreated = + transformedData.created ?? 
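
// [Editor's note] The enrichment above exploits Anthropic's two-phase tool-call stream:
// content_block_start carries id and name once, while each content_block_delta carries
// only an argument fragment plus the block index. A sketch of the index-keyed lookup,
// with simplified hypothetical types:
interface TrackedToolCall {
  id: string;
  function: { name: string; arguments?: string };
}

const tracked: TrackedToolCall[] = []; // sparse array, indexed by content block index

function onBlockStart(index: number, call: TrackedToolCall): void {
  tracked[index] = call; // remember id/name for the delta chunks that follow
}

function enrichDelta(
  index: number,
  delta: { id?: string; type?: string; function?: { name?: string } },
): void {
  const start = tracked[index];
  if (start) {
    delta.id = start.id; // OpenAI SDKs expect id/type/name on every tool-call delta
    delta.type = "function";
    delta.function = { ...delta.function, name: start.function.name };
  }
}
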
lastChunkCreated; + } - // Create a copy without content in delta for streaming - const chunkWithoutContent = JSON.parse( - JSON.stringify(transformedData), - ); - if (chunkWithoutContent.choices?.[0]?.delta?.content) { - delete chunkWithoutContent.choices[0].delta.content; - } + // Create a copy without content in delta for streaming + const chunkWithoutContent = JSON.parse( + JSON.stringify(transformedData), + ); + if (chunkWithoutContent.choices?.[0]?.delta?.content) { + delete chunkWithoutContent.choices[0].delta.content; + } - // Only send chunk if it has meaningful data (not just empty delta) - const hasUsage = !!chunkWithoutContent.usage; - const hasToolCalls = - !!chunkWithoutContent.choices?.[0]?.delta?.tool_calls; - const hasFinishReason = - !!chunkWithoutContent.choices?.[0]?.finish_reason; - const hasRole = - !!chunkWithoutContent.choices?.[0]?.delta?.role; + // Only send chunk if it has meaningful data (not just empty delta) + const hasUsage = !!chunkWithoutContent.usage; + const hasToolCalls = + !!chunkWithoutContent.choices?.[0]?.delta?.tool_calls; + const hasFinishReason = + !!chunkWithoutContent.choices?.[0]?.finish_reason; + const hasRole = + !!chunkWithoutContent.choices?.[0]?.delta?.role; - if (hasUsage || hasToolCalls || hasFinishReason || hasRole) { + if ( + hasUsage || + hasToolCalls || + hasFinishReason || + hasRole + ) { + await writeSSEAndCache({ + data: JSON.stringify(chunkWithoutContent), + id: String(eventId++), + }); + } + } else { await writeSSEAndCache({ - data: JSON.stringify(chunkWithoutContent), + data: JSON.stringify(transformedData), id: String(eventId++), }); } - } else { - await writeSSEAndCache({ - data: JSON.stringify(transformedData), - id: String(eventId++), - }); - } - // Extract usage data from transformedData to update tracking variables - if ( - transformedData.usage && - (usedProvider === "openai" || usedProvider === "azure") - ) { - const usage = transformedData.usage; + // Extract usage data from transformedData to update tracking variables if ( - usage.prompt_tokens !== undefined && - usage.prompt_tokens > 0 + transformedData.usage && + (usedProvider === "openai" || usedProvider === "azure") ) { - promptTokens = usage.prompt_tokens; + const usage = transformedData.usage; + if ( + usage.prompt_tokens !== undefined && + usage.prompt_tokens > 0 + ) { + promptTokens = usage.prompt_tokens; + } + if ( + usage.completion_tokens !== undefined && + usage.completion_tokens > 0 + ) { + completionTokens = usage.completion_tokens; + } + if ( + usage.total_tokens !== undefined && + usage.total_tokens > 0 + ) { + totalTokens = usage.total_tokens; + } + if (usage.reasoning_tokens !== undefined) { + reasoningTokens = usage.reasoning_tokens; + } } - if ( - usage.completion_tokens !== undefined && - usage.completion_tokens > 0 - ) { - completionTokens = usage.completion_tokens; + + // Extract finishReason from transformedData to update tracking variable + if (transformedData.choices?.[0]?.finish_reason) { + finishReason = transformedData.choices[0].finish_reason; + sawProviderTerminalEvent = true; + sentDownstreamFinishReasonChunk = true; } - if ( - usage.total_tokens !== undefined && - usage.total_tokens > 0 - ) { - totalTokens = usage.total_tokens; + + // Extract content for logging using helper function + // For providers with custom extraction logic (google-ai-studio, anthropic), + // use raw data. For others (like aws-bedrock), use transformed OpenAI format. 
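
// [Editor's note] When healing buffers the text, a content-stripped chunk is only
// forwarded if it still carries signal. A sketch of that predicate, assuming the
// OpenAI chat.completion.chunk shape:
function isMeaningfulWithoutContent(chunk: {
  usage?: unknown;
  choices?: Array<{
    delta?: { role?: string; tool_calls?: unknown[] };
    finish_reason?: string | null;
  }>;
}): boolean {
  const choice = chunk.choices?.[0];
  return Boolean(
    chunk.usage ||
      choice?.delta?.tool_calls ||
      choice?.finish_reason ||
      choice?.delta?.role,
  );
}
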
+ const contentChunk = extractContent( + isGoogleCompatibleProvider(usedProvider) || + usedProvider === "anthropic" + ? data + : transformedData, + usedProvider, + ); + if (contentChunk) { + fullContent += contentChunk; + + // Track time to first token if this is the first content chunk + if (!firstTokenReceived) { + timeToFirstToken = Date.now() - startTime; + firstTokenReceived = true; + } } - if (usage.reasoning_tokens !== undefined) { - reasoningTokens = usage.reasoning_tokens; + + // Track image data size for Google providers (for token estimation) + if (isGoogleCompatibleProvider(usedProvider)) { + const parts = data.candidates?.[0]?.content?.parts ?? []; + for (const part of parts) { + if (part.inlineData?.data) { + // Base64 string length * 0.75 ≈ actual byte size + imageByteSize += Math.ceil( + part.inlineData.data.length * 0.75, + ); + outputImageCount++; + } + } } - } - // Extract finishReason from transformedData to update tracking variable - if (transformedData.choices?.[0]?.finish_reason) { - finishReason = transformedData.choices[0].finish_reason; - sawProviderTerminalEvent = true; - sentDownstreamFinishReasonChunk = true; - } + // Track web search calls for cost calculation + // Check for web search results based on provider-specific data + if (usedProvider === "anthropic") { + // For Anthropic, count web_search_tool_result blocks + if ( + data.type === "content_block_start" && + data.content_block?.type === "web_search_tool_result" + ) { + webSearchCount++; + } + } else if (isGoogleCompatibleProvider(usedProvider)) { + // For Google, count when grounding metadata is present + if (data.candidates?.[0]?.groundingMetadata) { + const groundingMetadata = + data.candidates[0].groundingMetadata; + if ( + groundingMetadata.webSearchQueries && + groundingMetadata.webSearchQueries.length > 0 && + webSearchCount === 0 + ) { + // Only count once for the entire response + webSearchCount = + groundingMetadata.webSearchQueries.length; + } else if ( + groundingMetadata.groundingChunks && + webSearchCount === 0 + ) { + // Fallback: count once if we have grounding chunks + webSearchCount = 1; + } + } + } else if (usedProvider === "openai") { + // For OpenAI Responses API, count web_search_call.completed events + if (data.type === "response.web_search_call.completed") { + webSearchCount++; + } + } - // Extract content for logging using helper function - // For providers with custom extraction logic (google-ai-studio, anthropic), - // use raw data. For others (like aws-bedrock), use transformed OpenAI format. - const contentChunk = extractContent( - isGoogleCompatibleProvider(usedProvider) || - usedProvider === "anthropic" - ? data - : transformedData, - usedProvider, - ); - if (contentChunk) { - fullContent += contentChunk; + // Extract reasoning content for logging using helper function + // For providers with custom extraction logic (google-ai-studio, anthropic), + // use raw data. For others, use transformed OpenAI format. + const reasoningContentChunk = extractReasoning( + isGoogleCompatibleProvider(usedProvider) || + usedProvider === "anthropic" + ? 
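
// [Editor's note] The 0.75 factor used for inlineData above comes from base64 packing
// 3 bytes into 4 characters; padding makes the estimate slightly high, which is fine
// for token math. Sketch:
function base64ByteSize(base64: string): number {
  return Math.ceil(base64.length * 0.75); // 4 chars -> 3 bytes, '=' padding ignored
}

// e.g. base64ByteSize("aGVsbG8=") === 6 for the 5-byte string "hello" — the right
// order of magnitude, which is all the downstream token estimate needs.
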
data + : transformedData, + usedProvider, + ); + if (reasoningContentChunk) { + fullReasoningContent += reasoningContentChunk; - // Track time to first token if this is the first content chunk - if (!firstTokenReceived) { - timeToFirstToken = Date.now() - startTime; - firstTokenReceived = true; + // Track time to first reasoning token if this is the first reasoning chunk + if (!firstReasoningTokenReceived) { + timeToFirstReasoningToken = Date.now() - startTime; + firstReasoningTokenReceived = true; + } } - } - // Track image data size for Google providers (for token estimation) - if (isGoogleCompatibleProvider(usedProvider)) { - const parts = data.candidates?.[0]?.content?.parts ?? []; - for (const part of parts) { - if (part.inlineData?.data) { - // Base64 string length * 0.75 ≈ actual byte size - imageByteSize += Math.ceil( - part.inlineData.data.length * 0.75, - ); - outputImageCount++; + const toolCallsChunk = extractToolCalls( + data, + usedProvider, + transformedData, + ); + if (toolCallsChunk && toolCallsChunk.length > 0) { + streamingToolCalls ??= []; + // Merge tool calls (accumulating function arguments) + for (const newCall of toolCallsChunk) { + let existingCall = null; + + // For Anthropic content_block_delta events, match by content block index + if ( + usedProvider === "anthropic" && + newCall._contentBlockIndex !== undefined + ) { + existingCall = + streamingToolCalls[newCall._contentBlockIndex]; + } else { + // For other providers and Anthropic content_block_start, match by ID + // Note: Array may have sparse entries due to index-based assignment, so check for null/undefined + existingCall = streamingToolCalls.find( + (call) => call && call.id === newCall.id, + ); + } + + if (existingCall) { + // Accumulate function arguments + if (newCall.function?.arguments) { + existingCall.function.arguments = + (existingCall.function.arguments ?? "") + + newCall.function.arguments; + } + } else { + // Clean up temporary fields and add new tool call + const cleanCall = { ...newCall }; + delete cleanCall._contentBlockIndex; + streamingToolCalls.push(cleanCall); + } } } - } - - // Track web search calls for cost calculation - // Check for web search results based on provider-specific data - if (usedProvider === "anthropic") { - // For Anthropic, count web_search_tool_result blocks - if ( - data.type === "content_block_start" && - data.content_block?.type === "web_search_tool_result" - ) { - webSearchCount++; + + // Handle provider-specific finish reason extraction + switch (usedProvider) { + case "google-ai-studio": + case "glacier": + case "google-vertex": + case "quartz": + // Preserve original Google finish reason for logging + if (data.promptFeedback?.blockReason) { + finishReason = data.promptFeedback.blockReason; + sawProviderTerminalEvent = true; + } else if (data.candidates?.[0]?.finishReason) { + finishReason = data.candidates[0].finishReason; + sawProviderTerminalEvent = true; + } + break; + case "anthropic": + if ( + data.type === "message_delta" && + data.delta?.stop_reason + ) { + finishReason = data.delta.stop_reason; + sawProviderTerminalEvent = true; + } else if ( + data.type === "message_stop" || + data.stop_reason + ) { + finishReason = data.stop_reason ?? 
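
// [Editor's note] Argument accumulation above is plain string concatenation across
// delta chunks; the JSON only becomes parseable once the stream finishes. A toy
// illustration with made-up fragments:
const fragments = ['{"locat', 'ion": "Be', 'rlin"}']; // as a provider might stream them
let argumentsJson = "";
for (const fragment of fragments) {
  argumentsJson += fragment; // never JSON.parse mid-stream; partial JSON throws
}
// JSON.parse(argumentsJson) -> { location: "Berlin" } once the last fragment arrives
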
"end_turn"; + sawProviderTerminalEvent = true; + } else if (data.delta?.stop_reason) { + finishReason = data.delta.stop_reason; + sawProviderTerminalEvent = true; + } + break; + default: // OpenAI format + if (data.choices && data.choices[0]?.finish_reason) { + finishReason = data.choices[0].finish_reason; + } + break; + } + + // Extract token usage using helper function + const usage = extractTokenUsage( + data, + usedProvider, + fullContent, + imageByteSize, + ); + if (usage.promptTokens !== null) { + promptTokens = usage.promptTokens; + } + if (usage.completionTokens !== null) { + completionTokens = usage.completionTokens; + } + if (usage.totalTokens !== null) { + totalTokens = usage.totalTokens; + } + if (usage.reasoningTokens !== null) { + reasoningTokens = usage.reasoningTokens; + } + if (usage.cachedTokens !== null) { + cachedTokens = usage.cachedTokens; } - } else if (isGoogleCompatibleProvider(usedProvider)) { - // For Google, count when grounding metadata is present - if (data.candidates?.[0]?.groundingMetadata) { - const groundingMetadata = - data.candidates[0].groundingMetadata; - if ( - groundingMetadata.webSearchQueries && - groundingMetadata.webSearchQueries.length > 0 && - webSearchCount === 0 - ) { - // Only count once for the entire response - webSearchCount = - groundingMetadata.webSearchQueries.length; - } else if ( - groundingMetadata.groundingChunks && - webSearchCount === 0 - ) { - // Fallback: count once if we have grounding chunks - webSearchCount = 1; - } + if (usage.cacheCreationTokens !== null) { + cacheCreationTokens = usage.cacheCreationTokens; } - } else if (usedProvider === "openai") { - // For OpenAI Responses API, count web_search_call.completed events - if (data.type === "response.web_search_call.completed") { - webSearchCount++; + if (usage.cacheCreation5mTokens !== null) { + cacheCreation5mTokens = usage.cacheCreation5mTokens; } - } - - // Extract reasoning content for logging using helper function - // For providers with custom extraction logic (google-ai-studio, anthropic), - // use raw data. For others, use transformed OpenAI format. - const reasoningContentChunk = extractReasoning( - isGoogleCompatibleProvider(usedProvider) || - usedProvider === "anthropic" - ? 
data - : transformedData, - usedProvider, - ); - if (reasoningContentChunk) { - fullReasoningContent += reasoningContentChunk; - - // Track time to first reasoning token if this is the first reasoning chunk - if (!firstReasoningTokenReceived) { - timeToFirstReasoningToken = Date.now() - startTime; - firstReasoningTokenReceived = true; + if (usage.cacheCreation1hTokens !== null) { + cacheCreation1hTokens = usage.cacheCreation1hTokens; + } + if ( + usage.totalTokens === null && + promptTokens !== null && + completionTokens !== null + ) { + totalTokens = promptTokens + completionTokens; } - } - - const toolCallsChunk = extractToolCalls( - data, - usedProvider, - transformedData, - ); - if (toolCallsChunk && toolCallsChunk.length > 0) { - streamingToolCalls ??= []; - // Merge tool calls (accumulating function arguments) - for (const newCall of toolCallsChunk) { - let existingCall = null; - // For Anthropic content_block_delta events, match by content block index - if ( - usedProvider === "anthropic" && - newCall._contentBlockIndex !== undefined - ) { - existingCall = - streamingToolCalls[newCall._contentBlockIndex]; - } else { - // For other providers and Anthropic content_block_start, match by ID - // Note: Array may have sparse entries due to index-based assignment, so check for null/undefined - existingCall = streamingToolCalls.find( - (call) => call && call.id === newCall.id, + // Estimate tokens if not provided and we have a finish reason + if (finishReason && (!promptTokens || !completionTokens)) { + if (!promptTokens) { + const estimation = estimateTokens( + usedProvider, + messages, + null, + null, + null, ); + promptTokens = estimation.calculatedPromptTokens; } - if (existingCall) { - // Accumulate function arguments - if (newCall.function?.arguments) { - existingCall.function.arguments = - (existingCall.function.arguments ?? "") + - newCall.function.arguments; + if (!completionTokens) { + const textTokens = estimateTokensFromContent(fullContent); + // For images, estimate ~258 tokens per image + 1 token per 750 bytes + let imageTokens = 0; + if (imageByteSize > 0) { + imageTokens = 258 + Math.ceil(imageByteSize / 750); } - } else { - // Clean up temporary fields and add new tool call - const cleanCall = { ...newCall }; - delete cleanCall._contentBlockIndex; - streamingToolCalls.push(cleanCall); - } - } - } - - // Handle provider-specific finish reason extraction - switch (usedProvider) { - case "google-ai-studio": - case "glacier": - case "google-vertex": - case "quartz": - // Preserve original Google finish reason for logging - if (data.promptFeedback?.blockReason) { - finishReason = data.promptFeedback.blockReason; - sawProviderTerminalEvent = true; - } else if (data.candidates?.[0]?.finishReason) { - finishReason = data.candidates[0].finishReason; - sawProviderTerminalEvent = true; - } - break; - case "anthropic": - if ( - data.type === "message_delta" && - data.delta?.stop_reason - ) { - finishReason = data.delta.stop_reason; - sawProviderTerminalEvent = true; - } else if ( - data.type === "message_stop" || - data.stop_reason - ) { - finishReason = data.stop_reason ?? 
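
// [Editor's note] The estimation fallback above uses the constants visible in this
// handler: roughly 258 tokens per generated image plus one token per 750 bytes of image
// data. Sketch of the combined estimate (estimateTextTokens stands in for
// estimateTokensFromContent, whose implementation is not shown here):
function estimateCompletionTokens(
  text: string,
  imageBytes: number,
  estimateTextTokens: (s: string) => number,
): number {
  const imageTokens = imageBytes > 0 ? 258 + Math.ceil(imageBytes / 750) : 0;
  return estimateTextTokens(text) + imageTokens;
}
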
"end_turn"; - sawProviderTerminalEvent = true; - } else if (data.delta?.stop_reason) { - finishReason = data.delta.stop_reason; - sawProviderTerminalEvent = true; - } - break; - default: // OpenAI format - if (data.choices && data.choices[0]?.finish_reason) { - finishReason = data.choices[0].finish_reason; + completionTokens = textTokens + imageTokens; } - break; - } - - // Extract token usage using helper function - const usage = extractTokenUsage( - data, - usedProvider, - fullContent, - imageByteSize, - ); - if (usage.promptTokens !== null) { - promptTokens = usage.promptTokens; - } - if (usage.completionTokens !== null) { - completionTokens = usage.completionTokens; - } - if (usage.totalTokens !== null) { - totalTokens = usage.totalTokens; - } - if (usage.reasoningTokens !== null) { - reasoningTokens = usage.reasoningTokens; - } - if (usage.cachedTokens !== null) { - cachedTokens = usage.cachedTokens; - } - if (usage.cacheCreationTokens !== null) { - cacheCreationTokens = usage.cacheCreationTokens; - } - if (usage.cacheCreation5mTokens !== null) { - cacheCreation5mTokens = usage.cacheCreation5mTokens; - } - if (usage.cacheCreation1hTokens !== null) { - cacheCreation1hTokens = usage.cacheCreation1hTokens; - } - if ( - usage.totalTokens === null && - promptTokens !== null && - completionTokens !== null - ) { - totalTokens = promptTokens + completionTokens; - } - - // Estimate tokens if not provided and we have a finish reason - if (finishReason && (!promptTokens || !completionTokens)) { - if (!promptTokens) { - const estimation = estimateTokens( - usedProvider, - messages, - null, - null, - null, - ); - promptTokens = estimation.calculatedPromptTokens; - } - if (!completionTokens) { - const textTokens = estimateTokensFromContent(fullContent); - // For images, estimate ~258 tokens per image + 1 token per 750 bytes - let imageTokens = 0; - if (imageByteSize > 0) { - imageTokens = 258 + Math.ceil(imageByteSize / 750); - } - completionTokens = textTokens + imageTokens; + totalTokens = (promptTokens ?? 0) + (completionTokens ?? 0); } - totalTokens = (promptTokens ?? 0) + (completionTokens ?? 0); + processedLength = eventEnd; } - processedLength = eventEnd; + searchStart = eventEnd; } - searchStart = eventEnd; - } - - // Remove processed data from buffer - if (processedLength > 0) { - buffer = bufferCopy.slice(processedLength); - } - - if (shouldTerminateStream) { - break; - } - } - } catch (error) { - if (error instanceof Error && error.name === "AbortError") { - canceled = true; - } else if (isTimeoutError(error)) { - const errorMessage = - error instanceof Error ? error.message : "Stream reading timeout"; - logger.warn("Stream reading timeout", { - error: errorMessage, - usedProvider, - requestedProvider, - usedModel, - initialRequestedModel, - unifiedFinishReason: getUnifiedFinishReason( - "upstream_error", - usedProvider, - ), - }); + // Remove processed data from buffer + if (processedLength > 0) { + buffer = bufferCopy.slice(processedLength); + } - try { - await stream.writeSSE({ - event: "error", - data: JSON.stringify({ - error: { - message: `Upstream provider timeout: ${errorMessage}`, - type: "upstream_timeout", - param: null, - code: "timeout", - }, - }), - id: String(eventId++), - }); - await stream.writeSSE({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - doneSent = true; - } catch (sseError) { - logger.error( - "Failed to send timeout error SSE", - sseError instanceof Error - ? 
sseError - : new Error(String(sseError)), - ); + if (shouldTerminateStream) { + break; + } } - - streamingError = { - message: errorMessage, - type: "upstream_timeout", - code: "timeout", - details: { - name: "TimeoutError", - timestamp: new Date().toISOString(), - provider: usedProvider, - model: usedModel, - }, - }; - } else { - const normalizedStreamingError = normalizeStreamingError({ - error, - provider: usedProvider, - model: usedModel, - bufferSnapshot: buffer ? buffer.substring(0, 5000) : undefined, - phase: "upstream_read", - }); - - logger.error( - "Error reading upstream stream", - error instanceof Error ? error : new Error(String(error)), - { - requestId, + } catch (error) { + if (error instanceof Error && error.name === "AbortError") { + canceled = true; + } else if (isTimeoutError(error)) { + const errorMessage = + error instanceof Error + ? error.message + : "Stream reading timeout"; + logger.warn("Stream reading timeout", { + error: errorMessage, usedProvider, requestedProvider, usedModel, initialRequestedModel, - upstreamStatus: res?.status ?? null, - upstreamStatusText: res?.statusText ?? null, - upstreamHeaders: res - ? { - contentType: res.headers.get("content-type"), - contentLength: res.headers.get("content-length"), - transferEncoding: res.headers.get("transfer-encoding"), - requestId: - res.headers.get("x-request-id") ?? - res.headers.get("request-id") ?? - res.headers.get("openai-request-id"), - } - : null, - streamingDiagnostics: normalizedStreamingError.log.details, - timeToFirstToken, - timeToFirstReasoningToken, - firstTokenReceived, - firstReasoningTokenReceived, unifiedFinishReason: getUnifiedFinishReason( - normalizedStreamingError.client.type === "gateway_error" - ? "gateway_error" - : "upstream_error", + "upstream_error", usedProvider, ), - }, - ); - - // Forward the error to the client with the buffered content that caused the error - try { - await stream.writeSSE({ - event: "error", - data: JSON.stringify({ - error: normalizedStreamingError.client, - }), - id: String(eventId++), }); - await stream.writeSSE({ - event: "done", - data: "[DONE]", - id: String(eventId++), + + try { + await stream.writeSSE({ + event: "error", + data: JSON.stringify({ + error: { + message: `Upstream provider timeout: ${errorMessage}`, + type: "upstream_timeout", + param: null, + code: "timeout", + }, + }), + id: String(eventId++), + }); + await stream.writeSSE({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } catch (sseError) { + logger.error( + "Failed to send timeout error SSE", + sseError instanceof Error + ? sseError + : new Error(String(sseError)), + ); + } + + streamingError = { + message: errorMessage, + type: "upstream_timeout", + code: "timeout", + details: { + name: "TimeoutError", + timestamp: new Date().toISOString(), + provider: usedProvider, + model: usedModel, + }, + }; + } else { + const normalizedStreamingError = normalizeStreamingError({ + error, + provider: usedProvider, + model: usedModel, + bufferSnapshot: buffer ? buffer.substring(0, 5000) : undefined, + phase: "upstream_read", }); - doneSent = true; - } catch (sseError) { + logger.error( - "Failed to send error SSE", - sseError instanceof Error - ? sseError - : new Error(String(sseError)), + "Error reading upstream stream", + error instanceof Error ? error : new Error(String(error)), + { + requestId, + usedProvider, + requestedProvider, + usedModel, + initialRequestedModel, + upstreamStatus: res?.status ?? null, + upstreamStatusText: res?.statusText ?? 
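
// [Editor's note] The request-id lookup in the diagnostics above tries several header
// spellings because providers disagree on the name. Sketch of the fallback chain
// (Headers is the standard fetch type):
function upstreamRequestId(headers: Headers): string | null {
  return (
    headers.get("x-request-id") ??
    headers.get("request-id") ??
    headers.get("openai-request-id")
  );
}
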
null, + upstreamHeaders: res + ? { + contentType: res.headers.get("content-type"), + contentLength: res.headers.get("content-length"), + transferEncoding: res.headers.get("transfer-encoding"), + requestId: + res.headers.get("x-request-id") ?? + res.headers.get("request-id") ?? + res.headers.get("openai-request-id"), + } + : null, + streamingDiagnostics: normalizedStreamingError.log.details, + timeToFirstToken, + timeToFirstReasoningToken, + firstTokenReceived, + firstReasoningTokenReceived, + unifiedFinishReason: getUnifiedFinishReason( + normalizedStreamingError.client.type === "gateway_error" + ? "gateway_error" + : "upstream_error", + usedProvider, + ), + }, ); + + // Forward the error to the client with the buffered content that caused the error + try { + await stream.writeSSE({ + event: "error", + data: JSON.stringify({ + error: normalizedStreamingError.client, + }), + id: String(eventId++), + }); + await stream.writeSSE({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } catch (sseError) { + logger.error( + "Failed to send error SSE", + sseError instanceof Error + ? sseError + : new Error(String(sseError)), + ); + } + + streamingError = normalizedStreamingError.log; } + } finally { + // Clean up the reader to prevent file descriptor leaks + try { + await reader.cancel(); + } catch { + // Ignore errors from cancel - the stream may already be aborted due to timeout + } + // Clean up the event listeners + c.req.raw.signal.removeEventListener("abort", onAbort); - streamingError = normalizedStreamingError.log; - } - } finally { - // Clean up the reader to prevent file descriptor leaks - try { - await reader.cancel(); - } catch { - // Ignore errors from cancel - the stream may already be aborted due to timeout - } - // Clean up the event listeners - c.req.raw.signal.removeEventListener("abort", onAbort); + // Log the streaming request + const duration = Date.now() - startTime; - // Log the streaming request - const duration = Date.now() - startTime; + // Calculate estimated tokens if not provided + let calculatedPromptTokens = promptTokens; + let calculatedCompletionTokens = completionTokens; + let calculatedTotalTokens = totalTokens; - // Calculate estimated tokens if not provided - let calculatedPromptTokens = promptTokens; - let calculatedCompletionTokens = completionTokens; - let calculatedTotalTokens = totalTokens; + // Estimate tokens for providers that don't provide them during streaming + if (!promptTokens || !completionTokens) { + if (!promptTokens && messages && messages.length > 0) { + calculatedPromptTokens = encodeChatMessages(messages); + } - // Estimate tokens for providers that don't provide them during streaming - if (!promptTokens || !completionTokens) { - if (!promptTokens && messages && messages.length > 0) { - calculatedPromptTokens = encodeChatMessages(messages); - } + if (!completionTokens && (fullContent || imageByteSize > 0)) { + // For images, estimate ~258 tokens per image + 1 token per 750 bytes + let imageTokens = 0; + if (imageByteSize > 0) { + imageTokens = 258 + Math.ceil(imageByteSize / 750); + } - if (!completionTokens && (fullContent || imageByteSize > 0)) { - // For images, estimate ~258 tokens per image + 1 token per 750 bytes - let imageTokens = 0; - if (imageByteSize > 0) { - imageTokens = 258 + Math.ceil(imageByteSize / 750); + const textTokens = estimateTokensFromContent(fullContent); + calculatedCompletionTokens = textTokens + imageTokens; } - const textTokens = estimateTokensFromContent(fullContent); - 
calculatedCompletionTokens = textTokens + imageTokens; + calculatedTotalTokens = + (calculatedPromptTokens ?? 0) + + (calculatedCompletionTokens ?? 0); } - calculatedTotalTokens = - (calculatedPromptTokens ?? 0) + (calculatedCompletionTokens ?? 0); - } + // Estimate reasoning tokens if not provided but reasoning content exists + let calculatedReasoningTokens = reasoningTokens; + if (!reasoningTokens && fullReasoningContent) { + calculatedReasoningTokens = + estimateTokensFromContent(fullReasoningContent); + } - // Estimate reasoning tokens if not provided but reasoning content exists - let calculatedReasoningTokens = reasoningTokens; - if (!reasoningTokens && fullReasoningContent) { - calculatedReasoningTokens = - estimateTokensFromContent(fullReasoningContent); - } + if ( + !streamingError && + !canceled && + finishReason === null && + sawOpenAiResponsesDoneEvent && + sawOpenAiResponsesCompletedStatus + ) { + sawProviderTerminalEvent = true; + finishReason = + streamingToolCalls && streamingToolCalls.length > 0 + ? "tool_calls" + : "stop"; + } - if ( - !streamingError && - !canceled && - finishReason === null && - sawOpenAiResponsesDoneEvent && - sawOpenAiResponsesCompletedStatus - ) { - sawProviderTerminalEvent = true; - finishReason = - streamingToolCalls && streamingToolCalls.length > 0 - ? "tool_calls" - : "stop"; - } + const streamHasVerifiedTerminalEvent = + sawUpstreamDoneSentinel || + sawProviderTerminalEvent || + handledTerminalProviderEvent; + // A terminal finish reason (stop, tool_calls, length) also counts + // as a valid stream completion — some providers (e.g. MiniMax) + // send finish_reason but omit the [DONE] sentinel. + const hasTerminalFinishReason = + finishReason !== null && + finishReason !== "upstream_error" && + finishReason !== "gateway_error"; + const streamEndedWithoutTerminalEvent = + !streamingError && + !canceled && + !streamHasVerifiedTerminalEvent && + !hasTerminalFinishReason; + if (streamEndedWithoutTerminalEvent) { + const hasBufferedNonWhitespace = /\S/u.test(buffer); + const responseText = hasBufferedNonWhitespace + ? buffer.slice(0, 5000) + : "Stream ended before a terminal finish reason or [DONE] event"; + const errorMessage = + "Upstream stream terminated unexpectedly before completion"; - const streamHasVerifiedTerminalEvent = - sawUpstreamDoneSentinel || - sawProviderTerminalEvent || - handledTerminalProviderEvent; - // A terminal finish reason (stop, tool_calls, length) also counts - // as a valid stream completion — some providers (e.g. MiniMax) - // send finish_reason but omit the [DONE] sentinel. - const hasTerminalFinishReason = - finishReason !== null && - finishReason !== "upstream_error" && - finishReason !== "gateway_error"; - const streamEndedWithoutTerminalEvent = - !streamingError && - !canceled && - !streamHasVerifiedTerminalEvent && - !hasTerminalFinishReason; - if (streamEndedWithoutTerminalEvent) { - const hasBufferedNonWhitespace = /\S/u.test(buffer); - const responseText = hasBufferedNonWhitespace - ? 
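
// [Editor's note] "Did the stream actually finish?" above is three signals OR-ed with a
// finish-reason fallback, because some providers (the comment names MiniMax) send a
// finish_reason but never the [DONE] sentinel. Sketch of the decision:
function streamCompleted(opts: {
  sawDoneSentinel: boolean;
  sawProviderTerminalEvent: boolean;
  finishReason: string | null;
}): boolean {
  const terminalFinish =
    opts.finishReason !== null &&
    opts.finishReason !== "upstream_error" &&
    opts.finishReason !== "gateway_error";
  return opts.sawDoneSentinel || opts.sawProviderTerminalEvent || terminalFinish;
}
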
buffer.slice(0, 5000) - : "Stream ended before a terminal finish reason or [DONE] event"; - const errorMessage = - "Upstream stream terminated unexpectedly before completion"; - - logger.warn("[streaming] Stream ended without terminal event", { - provider: usedProvider, - model: usedModel, - bufferLength: buffer.length, - fullContentLength: fullContent.length, - hasToolCalls: - !!streamingToolCalls && streamingToolCalls.length > 0, - unifiedFinishReason: getUnifiedFinishReason( - "upstream_error", - usedProvider, - ), - }); + logger.warn("[streaming] Stream ended without terminal event", { + provider: usedProvider, + model: usedModel, + bufferLength: buffer.length, + fullContentLength: fullContent.length, + hasToolCalls: + !!streamingToolCalls && streamingToolCalls.length > 0, + unifiedFinishReason: getUnifiedFinishReason( + "upstream_error", + usedProvider, + ), + }); + + streamingError = { + message: errorMessage, + type: "upstream_error", + code: "stream_truncated", + details: { + statusCode: 502, + statusText: "Upstream Stream Terminated", + responseText, + timestamp: new Date().toISOString(), + provider: usedProvider, + model: usedModel, + bufferLength: buffer.length, + }, + }; + finishReason = "upstream_error"; + + try { + await writeSSEAndCache({ + event: "error", + data: JSON.stringify({ + error: { + message: errorMessage, + type: "upstream_error", + code: "stream_truncated", + param: null, + responseText, + }, + }), + id: String(eventId++), + }); + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } catch (sseError) { + logger.error( + "Failed to send truncated stream error SSE", + sseError instanceof Error + ? sseError + : new Error(String(sseError)), + ); + } + } - streamingError = { - message: errorMessage, - type: "upstream_error", - code: "stream_truncated", - details: { - statusCode: 502, - statusText: "Upstream Stream Terminated", - responseText, - timestamp: new Date().toISOString(), + // Check if the response finished successfully but has no content, tokens, or tool calls + // This indicates an empty response which should be marked as an error + // Do this check BEFORE sending usage chunks to ensure proper event ordering + // Exclude content filter responses as they are intentionally empty. + const isContentFilterStreamingResponse = + isContentFilterFinishReason(finishReason, usedProvider); + const hasEmptyResponse = + !streamingError && + finishReason && + finishReason !== "incomplete" && + !isContentFilterStreamingResponse && + (!calculatedCompletionTokens || + calculatedCompletionTokens === 0) && + (!calculatedReasoningTokens || calculatedReasoningTokens === 0) && + (!fullContent || fullContent.trim() === "") && + (!streamingToolCalls || streamingToolCalls.length === 0); + + let streamingCostsEarly: + | Awaited> + | undefined; + + if (hasEmptyResponse) { + logger.warn("[streaming] Empty response detected", { provider: usedProvider, model: usedModel, - bufferLength: buffer.length, - }, - }; - finishReason = "upstream_error"; - - try { - await writeSSEAndCache({ - event: "error", - data: JSON.stringify({ - error: { - message: errorMessage, - type: "upstream_error", - code: "stream_truncated", - param: null, - responseText, - }, - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), + finishReason, + calculatedCompletionTokens, + calculatedReasoningTokens, + fullContentLength: fullContent?.length ?? 
0, + fullContentTrimmed: fullContent?.trim()?.length ?? 0, + streamingToolCallsCount: streamingToolCalls?.length ?? 0, + promptTokens, + completionTokens, + totalTokens, + reasoningTokens, + unifiedFinishReason: getUnifiedFinishReason( + "upstream_error", + usedProvider, + ), }); - doneSent = true; - } catch (sseError) { - logger.error( - "Failed to send truncated stream error SSE", - sseError instanceof Error - ? sseError - : new Error(String(sseError)), - ); - } - } + const errorMessage = + "Response finished successfully but returned no content or tool calls"; + streamingError = errorMessage; + finishReason = "upstream_error"; - // Check if the response finished successfully but has no content, tokens, or tool calls - // This indicates an empty response which should be marked as an error - // Do this check BEFORE sending usage chunks to ensure proper event ordering - // Exclude content filter responses as they are intentionally empty. - const isContentFilterStreamingResponse = isContentFilterFinishReason( - finishReason, - usedProvider, - ); - const hasEmptyResponse = - !streamingError && - finishReason && - finishReason !== "incomplete" && - !isContentFilterStreamingResponse && - (!calculatedCompletionTokens || calculatedCompletionTokens === 0) && - (!calculatedReasoningTokens || calculatedReasoningTokens === 0) && - (!fullContent || fullContent.trim() === "") && - (!streamingToolCalls || streamingToolCalls.length === 0); - - let streamingCostsEarly: - | Awaited> - | undefined; - - if (hasEmptyResponse) { - logger.warn("[streaming] Empty response detected", { - provider: usedProvider, - model: usedModel, - finishReason, - calculatedCompletionTokens, - calculatedReasoningTokens, - fullContentLength: fullContent?.length ?? 0, - fullContentTrimmed: fullContent?.trim()?.length ?? 0, - streamingToolCallsCount: streamingToolCalls?.length ?? 0, - promptTokens, - completionTokens, - totalTokens, - reasoningTokens, - unifiedFinishReason: getUnifiedFinishReason( - "upstream_error", - usedProvider, - ), - }); - const errorMessage = - "Response finished successfully but returned no content or tool calls"; - streamingError = errorMessage; - finishReason = "upstream_error"; + // Send error event to client using writeSSEAndCache to cache the error + try { + await writeSSEAndCache({ + event: "error", + data: JSON.stringify({ + error: { + message: errorMessage, + type: "upstream_error", + code: "upstream_error", + param: null, + responseText: errorMessage, + }, + }), + id: String(eventId++), + }); + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + doneSent = true; + } catch (sseError) { + logger.error( + "Failed to send upstream error SSE", + sseError instanceof Error + ? 
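
// [Editor's note] The empty-response check above treats "finished cleanly but produced
// nothing billable" as an upstream error; content-filter finishes are excluded earlier
// because their emptiness is intentional. Sketch of the core predicate:
function isEmptyResponse(opts: {
  completionTokens: number | null;
  reasoningTokens: number | null;
  content: string;
  toolCallCount: number;
}): boolean {
  return (
    !opts.completionTokens &&
    !opts.reasoningTokens &&
    opts.content.trim() === "" &&
    opts.toolCallCount === 0
  );
}
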
sseError + : new Error(String(sseError)), + ); + } + } else if (!streamingError && !doneSent) { + if ( + finishReason && + !sentDownstreamFinishReasonChunk && + !shouldBufferForHealing + ) { + try { + const finishChunk = { + id: `chatcmpl-${Date.now()}`, + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model: usedModel, + choices: [ + { + index: 0, + delta: {}, + finish_reason: mapFinishReasonToOpenai( + finishReason, + usedProvider, + !!streamingToolCalls && streamingToolCalls.length > 0, + ), + }, + ], + }; - // Send error event to client using writeSSEAndCache to cache the error - try { - await writeSSEAndCache({ - event: "error", - data: JSON.stringify({ - error: { - message: errorMessage, - type: "upstream_error", - code: "upstream_error", - param: null, - responseText: errorMessage, - }, - }), - id: String(eventId++), - }); - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); - doneSent = true; - } catch (sseError) { - logger.error( - "Failed to send upstream error SSE", - sseError instanceof Error - ? sseError - : new Error(String(sseError)), - ); - } - } else if (!streamingError && !doneSent) { - if ( - finishReason && - !sentDownstreamFinishReasonChunk && - !shouldBufferForHealing - ) { + await writeSSEAndCache({ + data: JSON.stringify(finishChunk), + id: String(eventId++), + }); + sentDownstreamFinishReasonChunk = true; + } catch (error) { + logger.error( + "Error sending synthesized finish chunk", + error instanceof Error ? error : new Error(String(error)), + ); + } + } + + // Calculate costs before sending usage chunk so we can include cost data + const billCancelledRequestsEarly = shouldBillCancelledRequests(); + streamingCostsEarly = + canceled && !billCancelledRequestsEarly + ? { + inputCost: null, + outputCost: null, + cachedInputCost: null, + cacheWriteInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + totalCost: null, + promptTokens: null, + completionTokens: null, + cachedTokens: null, + cacheWriteTokens: null, + estimatedCost: false, + discount: undefined, + pricingTier: undefined, + dataStorageCost: null as number | null, + } + : await calculateCosts( + usedModel, + usedProvider, + calculatedPromptTokens, + calculatedCompletionTokens, + cachedTokens, + { + prompt: messages + .map((m) => messageContentToString(m.content)) + .join("\n"), + completion: fullContent, + toolResults: streamingToolCalls ?? undefined, + }, + reasoningTokens, + outputImageCount, + image_config?.image_size, + inputImageCount, + webSearchCount, + project.organizationId, + image_config?.image_quality, + { + cacheWriteTokens: cacheCreationTokens, + cacheWrite1hTokens: cacheCreation1hTokens, + }, + ); + if (streamingCostsEarly.totalCost !== null) { + streamingCostsEarly.dataStorageCost = toDataStorageCostNumber( + streamingCostsEarly.promptTokens ?? calculatedPromptTokens, + cachedTokens, + streamingCostsEarly.completionTokens ?? 
+ calculatedCompletionTokens, + reasoningTokens, + retentionLevel, + ); + } + + // Always send final usage chunk with cost data for SDK compatibility try { - const finishChunk = { + const finalUsageChunk = { id: `chatcmpl-${Date.now()}`, object: "chat.completion.chunk", created: Math.floor(Date.now() / 1000), @@ -7365,32 +7524,252 @@ chat.openapi(completions, async (c) => { { index: 0, delta: {}, - finish_reason: mapFinishReasonToOpenai( - finishReason, - usedProvider, - !!streamingToolCalls && streamingToolCalls.length > 0, - ), + finish_reason: null, }, ], + usage: (() => { + // Only add image input tokens for providers that + // exclude them from upstream usage (Google) + const providerExcludesImageInput = + isGoogleCompatibleProvider(usedProvider); + const imageInputAdj = providerExcludesImageInput + ? inputImageCount * 560 + : 0; + const adjPrompt = Math.max( + 1, + Math.round( + promptTokens && promptTokens > 0 + ? promptTokens + imageInputAdj + : (calculatedPromptTokens ?? 1) + imageInputAdj, + ), + ); + const adjCompletion = Math.round( + completionTokens ?? calculatedCompletionTokens ?? 0, + ); + const earlyUsage: Record = { + prompt_tokens: adjPrompt, + completion_tokens: adjCompletion, + total_tokens: Math.max( + 1, + Math.round(adjPrompt + adjCompletion), + ), + ...(reasoningTokens !== null && + reasoningTokens > 0 && { + reasoning_tokens: reasoningTokens, + }), + ...((cachedTokens !== null || + (cacheCreationTokens !== null && + cacheCreationTokens > 0)) && { + prompt_tokens_details: { + cached_tokens: cachedTokens ?? 0, + ...(cacheCreationTokens !== null && + cacheCreationTokens > 0 && { + cache_creation_tokens: cacheCreationTokens, + }), + ...(cacheCreationTokens !== null && + cacheCreationTokens > 0 && + (cacheCreation5mTokens !== null || + cacheCreation1hTokens !== null) && { + cache_creation: { + ephemeral_5m_input_tokens: + cacheCreation5mTokens ?? + Math.max( + 0, + cacheCreationTokens - + (cacheCreation1hTokens ?? 0), + ), + ephemeral_1h_input_tokens: + cacheCreation1hTokens ?? 0, + }, + }), + }, + }), + }; + applyExtendedUsageFields(earlyUsage, { + costs: { + inputCost: streamingCostsEarly.inputCost, + outputCost: streamingCostsEarly.outputCost, + cachedInputCost: streamingCostsEarly.cachedInputCost, + cacheWriteInputCost: + streamingCostsEarly.cacheWriteInputCost, + requestCost: streamingCostsEarly.requestCost, + webSearchCost: streamingCostsEarly.webSearchCost, + imageInputCost: streamingCostsEarly.imageInputCost, + imageOutputCost: streamingCostsEarly.imageOutputCost, + totalCost: streamingCostsEarly.totalCost, + dataStorageCost: streamingCostsEarly.dataStorageCost, + }, + cachedTokens, + cacheCreationTokens, + reasoningTokens, + }); + return earlyUsage; + })(), }; await writeSSEAndCache({ - data: JSON.stringify(finishChunk), + data: JSON.stringify(finalUsageChunk), id: String(eventId++), }); - sentDownstreamFinishReasonChunk = true; } catch (error) { logger.error( - "Error sending synthesized finish chunk", + "Error sending final usage chunk", error instanceof Error ? 
error : new Error(String(error)), ); } + + // Send healed content if buffering was enabled + if ( + shouldBufferForHealing && + bufferedContentChunks.length > 0 && + !streamingError + ) { + try { + // Combine buffered content and apply healing + const bufferedContent = bufferedContentChunks.join(""); + const healingResult = healJsonResponse(bufferedContent); + + // Store plugin results for logging + streamingPluginResults.responseHealing = { + healed: healingResult.healed, + healingMethod: healingResult.healingMethod, + }; + + if (healingResult.healed) { + logger.debug("Streaming response healing applied", { + method: healingResult.healingMethod, + originalLength: healingResult.originalContent.length, + healedLength: healingResult.content.length, + }); + // Update fullContent with healed version for logging + fullContent = healingResult.content; + } + + // Send the healed (or original if no healing needed) content as a single chunk + const healedContentChunk = { + id: lastChunkId ?? `chatcmpl-${Date.now()}`, + object: "chat.completion.chunk", + created: lastChunkCreated ?? Math.floor(Date.now() / 1000), + model: lastChunkModel ?? usedModel, + choices: [ + { + index: 0, + delta: { + content: healingResult.content, + }, + finish_reason: null, + }, + ], + }; + + await writeSSEAndCache({ + data: JSON.stringify(healedContentChunk), + id: String(eventId++), + }); + + // Send finish_reason chunk + const finishChunk = { + id: lastChunkId ?? `chatcmpl-${Date.now()}`, + object: "chat.completion.chunk", + created: lastChunkCreated ?? Math.floor(Date.now() / 1000), + model: lastChunkModel ?? usedModel, + choices: [ + { + index: 0, + delta: {}, + finish_reason: mapFinishReasonToOpenai( + finishReason, + usedProvider, + !!streamingToolCalls && streamingToolCalls.length > 0, + ), + }, + ], + }; + + await writeSSEAndCache({ + data: JSON.stringify(finishChunk), + id: String(eventId++), + }); + } catch (error) { + logger.error( + "Error sending healed content chunk", + error instanceof Error ? error : new Error(String(error)), + ); + } + } + + // Send routing metadata for all attempts (including successful) + if (routingAttempts.length > 0 && !doneSent) { + try { + const routingChunk = { + id: `chatcmpl-${Date.now()}`, + object: "chat.completion.chunk", + created: Math.floor(Date.now() / 1000), + model: usedModel, + choices: [ + { + index: 0, + delta: {}, + finish_reason: null, + }, + ], + metadata: { + requested_model: initialRequestedModel, + requested_provider: requestedProvider ?? null, + used_model: baseModelName, + used_provider: usedProvider, + ...(usedRegion && { used_region: usedRegion }), + underlying_used_model: usedModel, + routing: routingAttempts, + }, + }; + await writeSSEAndCache({ + data: JSON.stringify(routingChunk), + id: String(eventId++), + }); + } catch (error) { + logger.error( + "Error sending routing metadata chunk", + error instanceof Error ? error : new Error(String(error)), + ); + } + } + + // Always send [DONE] at the end of streaming if not already sent + if (!doneSent) { + try { + await writeSSEAndCache({ + event: "done", + data: "[DONE]", + id: String(eventId++), + }); + } catch (error) { + logger.error( + "Error sending [DONE] event", + error instanceof Error ? 
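
// [Editor's note] Healing replays the buffered text as one synthetic chunk followed by a
// separate finish chunk, reusing the id/model/created captured while buffering. A sketch
// of the chunk construction (the healed text itself comes from healJsonResponse):
function contentChunk(content: string, model: string, id?: string, created?: number) {
  return {
    id: id ?? `chatcmpl-${Date.now()}`,
    object: "chat.completion.chunk",
    created: created ?? Math.floor(Date.now() / 1000),
    model,
    choices: [{ index: 0, delta: { content }, finish_reason: null }],
  };
}
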
error : new Error(String(error)), + ); + } + } } - // Calculate costs before sending usage chunk so we can include cost data - const billCancelledRequestsEarly = shouldBillCancelledRequests(); - streamingCostsEarly = - canceled && !billCancelledRequestsEarly + // Clean up keepalive before any potentially-throwing operations (insertLog, etc.) + // clearInterval is idempotent so calling it multiple times is safe + clearKeepalive(); + + if (splitTaggedReasoning && !fullReasoningContent) { + const splitContent = splitReasoningFromTaggedContent(fullContent); + if (splitContent.reasoningContent) { + fullContent = splitContent.content ?? ""; + fullReasoningContent = splitContent.reasoningContent; + } + } + + // Reuse costs calculated earlier (before usage chunk was sent) + // If we came through the error path (hasEmptyResponse), calculate now + const billCancelledRequests = shouldBillCancelledRequests(); + const costs = + streamingCostsEarly ?? + (canceled && !billCancelledRequests ? { inputCost: null, outputCost: null, @@ -7422,568 +7801,264 @@ chat.openapi(completions, async (c) => { prompt: messages .map((m) => messageContentToString(m.content)) .join("\n"), - completion: fullContent, - toolResults: streamingToolCalls ?? undefined, - }, - reasoningTokens, - outputImageCount, - image_config?.image_size, - inputImageCount, - webSearchCount, - project.organizationId, - image_config?.image_quality, - { - cacheWriteTokens: cacheCreationTokens, - cacheWrite1hTokens: cacheCreation1hTokens, - }, - ); - if (streamingCostsEarly.totalCost !== null) { - streamingCostsEarly.dataStorageCost = toDataStorageCostNumber( - streamingCostsEarly.promptTokens ?? calculatedPromptTokens, - cachedTokens, - streamingCostsEarly.completionTokens ?? - calculatedCompletionTokens, - reasoningTokens, - retentionLevel, - ); - } - - // Always send final usage chunk with cost data for SDK compatibility - try { - const finalUsageChunk = { - id: `chatcmpl-${Date.now()}`, - object: "chat.completion.chunk", - created: Math.floor(Date.now() / 1000), - model: usedModel, - choices: [ - { - index: 0, - delta: {}, - finish_reason: null, - }, - ], - usage: (() => { - // Only add image input tokens for providers that - // exclude them from upstream usage (Google) - const providerExcludesImageInput = - isGoogleCompatibleProvider(usedProvider); - const imageInputAdj = providerExcludesImageInput - ? inputImageCount * 560 - : 0; - const adjPrompt = Math.max( - 1, - Math.round( - promptTokens && promptTokens > 0 - ? promptTokens + imageInputAdj - : (calculatedPromptTokens ?? 1) + imageInputAdj, - ), - ); - const adjCompletion = Math.round( - completionTokens ?? calculatedCompletionTokens ?? 0, - ); - const earlyUsage: Record = { - prompt_tokens: adjPrompt, - completion_tokens: adjCompletion, - total_tokens: Math.max( - 1, - Math.round(adjPrompt + adjCompletion), - ), - ...(reasoningTokens !== null && - reasoningTokens > 0 && { - reasoning_tokens: reasoningTokens, - }), - ...((cachedTokens !== null || - (cacheCreationTokens !== null && - cacheCreationTokens > 0)) && { - prompt_tokens_details: { - cached_tokens: cachedTokens ?? 0, - ...(cacheCreationTokens !== null && - cacheCreationTokens > 0 && { - cache_creation_tokens: cacheCreationTokens, - }), - ...(cacheCreationTokens !== null && - cacheCreationTokens > 0 && - (cacheCreation5mTokens !== null || - cacheCreation1hTokens !== null) && { - cache_creation: { - ephemeral_5m_input_tokens: - cacheCreation5mTokens ?? - Math.max( - 0, - cacheCreationTokens - - (cacheCreation1hTokens ?? 
0), - ), - ephemeral_1h_input_tokens: - cacheCreation1hTokens ?? 0, - }, - }), - }, - }), - }; - applyExtendedUsageFields(earlyUsage, { - costs: { - inputCost: streamingCostsEarly.inputCost, - outputCost: streamingCostsEarly.outputCost, - cachedInputCost: streamingCostsEarly.cachedInputCost, - cacheWriteInputCost: - streamingCostsEarly.cacheWriteInputCost, - requestCost: streamingCostsEarly.requestCost, - webSearchCost: streamingCostsEarly.webSearchCost, - imageInputCost: streamingCostsEarly.imageInputCost, - imageOutputCost: streamingCostsEarly.imageOutputCost, - totalCost: streamingCostsEarly.totalCost, - dataStorageCost: streamingCostsEarly.dataStorageCost, + completion: fullContent, + toolResults: streamingToolCalls ?? undefined, }, - cachedTokens, - cacheCreationTokens, reasoningTokens, - }); - return earlyUsage; - })(), - }; - - await writeSSEAndCache({ - data: JSON.stringify(finalUsageChunk), - id: String(eventId++), - }); - } catch (error) { - logger.error( - "Error sending final usage chunk", - error instanceof Error ? error : new Error(String(error)), - ); - } + outputImageCount, + image_config?.image_size, + inputImageCount, + webSearchCount, + project.organizationId, + image_config?.image_quality, + { + cacheWriteTokens: cacheCreationTokens, + cacheWrite1hTokens: cacheCreation1hTokens, + }, + )); - // Send healed content if buffering was enabled + // Use costs.promptTokens as canonical value (includes image input + // tokens for providers that exclude them from upstream usage) if ( - shouldBufferForHealing && - bufferedContentChunks.length > 0 && - !streamingError + costs.promptTokens !== null && + costs.promptTokens !== undefined ) { - try { - // Combine buffered content and apply healing - const bufferedContent = bufferedContentChunks.join(""); - const healingResult = healJsonResponse(bufferedContent); - - // Store plugin results for logging - streamingPluginResults.responseHealing = { - healed: healingResult.healed, - healingMethod: healingResult.healingMethod, - }; + const promptDelta = + (costs.promptTokens ?? 0) - (calculatedPromptTokens ?? 0); + if (promptDelta > 0) { + calculatedPromptTokens = costs.promptTokens; + calculatedTotalTokens = + (calculatedTotalTokens ?? 0) + promptDelta; + } + } - if (healingResult.healed) { - logger.debug("Streaming response healing applied", { - method: healingResult.healingMethod, - originalLength: healingResult.originalContent.length, - healedLength: healingResult.content.length, - }); - // Update fullContent with healed version for logging - fullContent = healingResult.content; - } + // Extract plugin IDs for logging + const streamingPluginIds = plugins?.map((p) => p.id) ?? []; - // Send the healed (or original if no healing needed) content as a single chunk - const healedContentChunk = { - id: lastChunkId ?? `chatcmpl-${Date.now()}`, - object: "chat.completion.chunk", - created: lastChunkCreated ?? Math.floor(Date.now() / 1000), - model: lastChunkModel ?? usedModel, - choices: [ - { - index: 0, - delta: { - content: healingResult.content, - }, - finish_reason: null, - }, - ], - }; + // Determine plugin results for logging (includes healing results if applicable) + const finalPluginResults = + Object.keys(streamingPluginResults).length > 0 + ? 
streamingPluginResults + : undefined; - await writeSSEAndCache({ - data: JSON.stringify(healedContentChunk), - id: String(eventId++), - }); + const baseLogEntry = createLogEntry( + requestId, + project, + apiKey, + providerKey?.id, + usedModelFormatted, + usedModelMapping, + usedProvider, + initialRequestedModel, + requestedProvider, + messages, + temperature, + max_tokens, + top_p, + frequency_penalty, + presence_penalty, + reasoning_effort, + reasoning_max_tokens, + effort, + response_format, + tools, + tool_choice, + source, + customHeaders, + debugMode, + userAgent, + image_config, + routingMetadata, + rawBody, + streamingError ?? streamingRawResponseData, // Raw SSE data sent back to the client + requestBody, // The request sent to the provider + streamingError ?? rawUpstreamData, // Raw streaming data received from upstream provider + streamingPluginIds, + finalPluginResults, // Plugin results including healing (if enabled) + ); - // Send finish_reason chunk - const finishChunk = { - id: lastChunkId ?? `chatcmpl-${Date.now()}`, - object: "chat.completion.chunk", - created: lastChunkCreated ?? Math.floor(Date.now() / 1000), - model: lastChunkModel ?? usedModel, - choices: [ - { - index: 0, - delta: {}, - finish_reason: mapFinishReasonToOpenai( - finishReason, - usedProvider, - !!streamingToolCalls && streamingToolCalls.length > 0, - ), - }, - ], - }; + // Enhanced logging for Google models streaming to debug missing responses + if (isGoogleCompatibleProvider(usedProvider)) { + logger.debug("Google model streaming response completed", { + usedProvider, + usedModel, + hasContent: !!fullContent, + contentLength: fullContent.length, + finishReason, + promptTokens: calculatedPromptTokens, + completionTokens: calculatedCompletionTokens, + totalTokens: calculatedTotalTokens, + reasoningTokens, + streamingError: streamingError ? String(streamingError) : null, + canceled, + hasToolCalls: + !!streamingToolCalls && streamingToolCalls.length > 0, + }); + } - await writeSSEAndCache({ - data: JSON.stringify(finishChunk), - id: String(eventId++), - }); - } catch (error) { - logger.error( - "Error sending healed content chunk", - error instanceof Error ? error : new Error(String(error)), + // For cancelled requests, determine if we should include token counts for billing + const shouldIncludeTokensForBilling = + !canceled || (canceled && billCancelledRequests); + + const streamingErrorStatusCode = + typeof streamingError === "object" && + streamingError !== null && + "details" in streamingError && + typeof streamingError.details === "object" && + streamingError.details !== null && + "statusCode" in streamingError.details && + typeof streamingError.details.statusCode === "number" + ? streamingError.details.statusCode + : 500; + + await insertLogEntry({ + ...baseLogEntry, + id: routingAttempts.length > 0 ? finalLogId : undefined, + duration, + timeToFirstToken, + timeToFirstReasoningToken, + responseSize: fullContent.length, + content: fullContent, + reasoningContent: fullReasoningContent || null, + finishReason: canceled ? "canceled" : finishReason, + promptTokens: shouldIncludeTokensForBilling + ? (calculatedPromptTokens?.toString() ?? null) + : null, + completionTokens: shouldIncludeTokensForBilling + ? (calculatedCompletionTokens?.toString() ?? null) + : null, + totalTokens: shouldIncludeTokensForBilling + ? (calculatedTotalTokens?.toString() ?? null) + : null, + reasoningTokens: shouldIncludeTokensForBilling + ? (calculatedReasoningTokens?.toString() ?? 
null) + : null, + cachedTokens: shouldIncludeTokensForBilling + ? (cachedTokens?.toString() ?? null) + : null, + cacheWriteTokens: shouldIncludeTokensForBilling + ? (cacheCreationTokens?.toString() ?? null) + : null, + hasError: streamingError !== null, + errorDetails: streamingError + ? { + statusCode: streamingErrorStatusCode, + statusText: + typeof streamingError === "object" && + streamingError !== null && + "details" in streamingError && + typeof streamingError.details === "object" && + streamingError.details !== null && + "statusText" in streamingError.details && + typeof streamingError.details.statusText === "string" + ? streamingError.details.statusText + : "Streaming Error", + responseText: + typeof streamingError === "object" && + streamingError !== null && + "details" in streamingError && + typeof streamingError.details === "object" && + streamingError.details !== null && + "responseText" in streamingError.details && + typeof streamingError.details.responseText === "string" + ? streamingError.details.responseText + : typeof streamingError === "object" && + streamingError !== null && + "details" in streamingError + ? JSON.stringify(streamingError) + : streamingError instanceof Error + ? streamingError.message + : String(streamingError), + } + : null, + streamed: true, + canceled: canceled, + inputCost: costs.inputCost, + outputCost: costs.outputCost, + cachedInputCost: costs.cachedInputCost, + cacheWriteInputCost: costs.cacheWriteInputCost, + requestCost: costs.requestCost, + webSearchCost: costs.webSearchCost, + imageInputTokens: costs.imageInputTokens?.toString() ?? null, + imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, + imageInputCost: costs.imageInputCost ?? null, + imageOutputCost: costs.imageOutputCost ?? null, + cost: costs.totalCost, + estimatedCost: costs.estimatedCost, + discount: costs.discount, + pricingTier: costs.pricingTier, + dataStorageCost: shouldIncludeTokensForBilling + ? calculateDataStorageCost( + calculatedPromptTokens, + cachedTokens, + calculatedCompletionTokens, + calculatedReasoningTokens, + retentionLevel, + ) + : "0", + cached: false, + tools, + toolResults: streamingToolCalls, + toolChoice: tool_choice, + }); + + // Report key health for the selected token source + if (envVarName !== undefined) { + if (streamingError !== null) { + reportKeyError( + envVarName, + configIndex, + streamingErrorStatusCode, ); + } else { + reportKeySuccess(envVarName, configIndex); + } + } + if (providerKey?.id) { + if (streamingError !== null) { + reportTrackedKeyError(providerKey.id, streamingErrorStatusCode); + } else { + reportTrackedKeySuccess(providerKey.id); } } - // Send routing metadata for all attempts (including successful) - if (routingAttempts.length > 0 && !doneSent) { + // Save streaming cache if enabled and not canceled and no errors + if ( + cachingEnabled && + streamingCacheKey && + !canceled && + finishReason && + !streamingError + ) { try { - const routingChunk = { - id: `chatcmpl-${Date.now()}`, - object: "chat.completion.chunk", - created: Math.floor(Date.now() / 1000), - model: usedModel, - choices: [ - { - index: 0, - delta: {}, - finish_reason: null, - }, - ], + const streamingCacheData = { + chunks: streamingChunks, metadata: { - requested_model: initialRequestedModel, - requested_provider: requestedProvider ?? 
null, - used_model: baseModelName, - used_provider: usedProvider, - ...(usedRegion && { used_region: usedRegion }), - underlying_used_model: usedModel, - routing: routingAttempts, + model: usedModel, + provider: usedProvider, + finishReason: finishReason, + totalChunks: streamingChunks.length, + duration: duration, + completed: true, }, }; - await writeSSEAndCache({ - data: JSON.stringify(routingChunk), - id: String(eventId++), - }); - } catch (error) { - logger.error( - "Error sending routing metadata chunk", - error instanceof Error ? error : new Error(String(error)), - ); - } - } - // Always send [DONE] at the end of streaming if not already sent - if (!doneSent) { - try { - await writeSSEAndCache({ - event: "done", - data: "[DONE]", - id: String(eventId++), - }); + await setStreamingCache( + streamingCacheKey, + streamingCacheData, + cacheDuration, + ); } catch (error) { logger.error( - "Error sending [DONE] event", + "Error saving streaming cache", error instanceof Error ? error : new Error(String(error)), ); } } } - - // Clean up keepalive before any potentially-throwing operations (insertLog, etc.) - // clearInterval is idempotent so calling it multiple times is safe - clearKeepalive(); - - if (splitTaggedReasoning && !fullReasoningContent) { - const splitContent = splitReasoningFromTaggedContent(fullContent); - if (splitContent.reasoningContent) { - fullContent = splitContent.content ?? ""; - fullReasoningContent = splitContent.reasoningContent; - } - } - - // Reuse costs calculated earlier (before usage chunk was sent) - // If we came through the error path (hasEmptyResponse), calculate now - const billCancelledRequests = shouldBillCancelledRequests(); - const costs = - streamingCostsEarly ?? - (canceled && !billCancelledRequests - ? { - inputCost: null, - outputCost: null, - cachedInputCost: null, - cacheWriteInputCost: null, - requestCost: null, - webSearchCost: null, - imageInputTokens: null, - imageOutputTokens: null, - imageInputCost: null, - imageOutputCost: null, - totalCost: null, - promptTokens: null, - completionTokens: null, - cachedTokens: null, - cacheWriteTokens: null, - estimatedCost: false, - discount: undefined, - pricingTier: undefined, - dataStorageCost: null as number | null, - } - : await calculateCosts( - usedModel, - usedProvider, - calculatedPromptTokens, - calculatedCompletionTokens, - cachedTokens, - { - prompt: messages - .map((m) => messageContentToString(m.content)) - .join("\n"), - completion: fullContent, - toolResults: streamingToolCalls ?? undefined, - }, - reasoningTokens, - outputImageCount, - image_config?.image_size, - inputImageCount, - webSearchCount, - project.organizationId, - image_config?.image_quality, - { - cacheWriteTokens: cacheCreationTokens, - cacheWrite1hTokens: cacheCreation1hTokens, - }, - )); - - // Use costs.promptTokens as canonical value (includes image input - // tokens for providers that exclude them from upstream usage) - if (costs.promptTokens !== null && costs.promptTokens !== undefined) { - const promptDelta = - (costs.promptTokens ?? 0) - (calculatedPromptTokens ?? 0); - if (promptDelta > 0) { - calculatedPromptTokens = costs.promptTokens; - calculatedTotalTokens = - (calculatedTotalTokens ?? 0) + promptDelta; - } - } - - // Extract plugin IDs for logging - const streamingPluginIds = plugins?.map((p) => p.id) ?? []; - - // Determine plugin results for logging (includes healing results if applicable) - const finalPluginResults = - Object.keys(streamingPluginResults).length > 0 - ? 
streamingPluginResults - : undefined; - - const baseLogEntry = createLogEntry( - requestId, - project, - apiKey, - providerKey?.id, - usedModelFormatted, - usedModelMapping, - usedProvider, - initialRequestedModel, - requestedProvider, - messages, - temperature, - max_tokens, - top_p, - frequency_penalty, - presence_penalty, - reasoning_effort, - reasoning_max_tokens, - effort, - response_format, - tools, - tool_choice, - source, - customHeaders, - debugMode, - userAgent, - image_config, - routingMetadata, - rawBody, - streamingError ?? streamingRawResponseData, // Raw SSE data sent back to the client - requestBody, // The request sent to the provider - streamingError ?? rawUpstreamData, // Raw streaming data received from upstream provider - streamingPluginIds, - finalPluginResults, // Plugin results including healing (if enabled) - ); - - // Enhanced logging for Google models streaming to debug missing responses - if (isGoogleCompatibleProvider(usedProvider)) { - logger.debug("Google model streaming response completed", { - usedProvider, - usedModel, - hasContent: !!fullContent, - contentLength: fullContent.length, - finishReason, - promptTokens: calculatedPromptTokens, - completionTokens: calculatedCompletionTokens, - totalTokens: calculatedTotalTokens, - reasoningTokens, - streamingError: streamingError ? String(streamingError) : null, - canceled, - hasToolCalls: - !!streamingToolCalls && streamingToolCalls.length > 0, - }); - } - - // For cancelled requests, determine if we should include token counts for billing - const shouldIncludeTokensForBilling = - !canceled || (canceled && billCancelledRequests); - - const streamingErrorStatusCode = - typeof streamingError === "object" && - streamingError !== null && - "details" in streamingError && - typeof streamingError.details === "object" && - streamingError.details !== null && - "statusCode" in streamingError.details && - typeof streamingError.details.statusCode === "number" - ? streamingError.details.statusCode - : 500; - - await insertLogEntry({ - ...baseLogEntry, - id: routingAttempts.length > 0 ? finalLogId : undefined, - duration, - timeToFirstToken, - timeToFirstReasoningToken, - responseSize: fullContent.length, - content: fullContent, - reasoningContent: fullReasoningContent || null, - finishReason: canceled ? "canceled" : finishReason, - promptTokens: shouldIncludeTokensForBilling - ? (calculatedPromptTokens?.toString() ?? null) - : null, - completionTokens: shouldIncludeTokensForBilling - ? (calculatedCompletionTokens?.toString() ?? null) - : null, - totalTokens: shouldIncludeTokensForBilling - ? (calculatedTotalTokens?.toString() ?? null) - : null, - reasoningTokens: shouldIncludeTokensForBilling - ? (calculatedReasoningTokens?.toString() ?? null) - : null, - cachedTokens: shouldIncludeTokensForBilling - ? (cachedTokens?.toString() ?? null) - : null, - cacheWriteTokens: shouldIncludeTokensForBilling - ? (cacheCreationTokens?.toString() ?? null) - : null, - hasError: streamingError !== null, - errorDetails: streamingError - ? { - statusCode: streamingErrorStatusCode, - statusText: - typeof streamingError === "object" && - streamingError !== null && - "details" in streamingError && - typeof streamingError.details === "object" && - streamingError.details !== null && - "statusText" in streamingError.details && - typeof streamingError.details.statusText === "string" - ? 
streamingError.details.statusText - : "Streaming Error", - responseText: - typeof streamingError === "object" && - streamingError !== null && - "details" in streamingError && - typeof streamingError.details === "object" && - streamingError.details !== null && - "responseText" in streamingError.details && - typeof streamingError.details.responseText === "string" - ? streamingError.details.responseText - : typeof streamingError === "object" && - streamingError !== null && - "details" in streamingError - ? JSON.stringify(streamingError) - : streamingError instanceof Error - ? streamingError.message - : String(streamingError), - } - : null, - streamed: true, - canceled: canceled, - inputCost: costs.inputCost, - outputCost: costs.outputCost, - cachedInputCost: costs.cachedInputCost, - cacheWriteInputCost: costs.cacheWriteInputCost, - requestCost: costs.requestCost, - webSearchCost: costs.webSearchCost, - imageInputTokens: costs.imageInputTokens?.toString() ?? null, - imageOutputTokens: costs.imageOutputTokens?.toString() ?? null, - imageInputCost: costs.imageInputCost ?? null, - imageOutputCost: costs.imageOutputCost ?? null, - cost: costs.totalCost, - estimatedCost: costs.estimatedCost, - discount: costs.discount, - pricingTier: costs.pricingTier, - dataStorageCost: shouldIncludeTokensForBilling - ? calculateDataStorageCost( - calculatedPromptTokens, - cachedTokens, - calculatedCompletionTokens, - calculatedReasoningTokens, - retentionLevel, - ) - : "0", - cached: false, - tools, - toolResults: streamingToolCalls, - toolChoice: tool_choice, - }); - - // Report key health for the selected token source - if (envVarName !== undefined) { - if (streamingError !== null) { - reportKeyError(envVarName, configIndex, streamingErrorStatusCode); - } else { - reportKeySuccess(envVarName, configIndex); - } - } - if (providerKey?.id) { - if (streamingError !== null) { - reportTrackedKeyError(providerKey.id, streamingErrorStatusCode); - } else { - reportTrackedKeySuccess(providerKey.id); - } - } - - // Save streaming cache if enabled and not canceled and no errors - if ( - cachingEnabled && - streamingCacheKey && - !canceled && - finishReason && - !streamingError - ) { - try { - const streamingCacheData = { - chunks: streamingChunks, - metadata: { - model: usedModel, - provider: usedProvider, - finishReason: finishReason, - totalChunks: streamingChunks.length, - duration: duration, - completed: true, - }, - }; - - await setStreamingCache( - streamingCacheKey, - streamingCacheData, - cacheDuration, - ); - } catch (error) { - logger.error( - "Error saving streaming cache", - error instanceof Error ? error : new Error(String(error)), - ); - } - } - } + })().finally(() => { + finishStreamCompletion(c); + }); }, async (error) => { if (error.name === "TimeoutError") { @@ -7999,6 +8074,7 @@ chat.openapi(completions, async (c) => { } else { logger.error("Streaming request error (escaped handler)", error); } + finishStreamCompletion(c); }, ); } @@ -9315,8 +9391,6 @@ chat.openapi(completions, async (c) => { reasoningTokens, cachedTokens, cacheCreationTokens, - cacheCreation5mTokens, - cacheCreation1hTokens, imageInputTokens, imageOutputTokens, toolResults, @@ -9436,10 +9510,6 @@ chat.openapi(completions, async (c) => { webSearchCount, project.organizationId, image_config?.image_quality, - { - cacheWriteTokens: cacheCreationTokens, - cacheWrite1hTokens: cacheCreation1hTokens, - }, ); costs.dataStorageCost = toDataStorageCostNumber( costs.promptTokens ?? 
calculatedPromptTokens, @@ -9491,7 +9561,6 @@ chat.openapi(completions, async (c) => { inputCost: costs.inputCost, outputCost: costs.outputCost, cachedInputCost: costs.cachedInputCost, - cacheWriteInputCost: costs.cacheWriteInputCost, requestCost: costs.requestCost, webSearchCost: costs.webSearchCost, imageInputCost: costs.imageInputCost, @@ -9508,8 +9577,6 @@ chat.openapi(completions, async (c) => { cacheCreationTokens, imageInputTokens, imageOutputTokens, - cacheCreation5mTokens, - cacheCreation1hTokens, ); // Extract plugin IDs for logging @@ -9626,7 +9693,6 @@ chat.openapi(completions, async (c) => { ).toString(), reasoningTokens: calculatedReasoningTokens?.toString() ?? null, cachedTokens: cachedTokens?.toString() ?? null, - cacheWriteTokens: cacheCreationTokens?.toString() ?? null, hasError: hasEmptyNonStreamingResponse, streamed: false, canceled: false, @@ -9641,7 +9707,6 @@ chat.openapi(completions, async (c) => { inputCost: costs.inputCost, outputCost: costs.outputCost, cachedInputCost: costs.cachedInputCost, - cacheWriteInputCost: costs.cacheWriteInputCost, requestCost: costs.requestCost, webSearchCost: costs.webSearchCost, imageInputTokens: costs.imageInputTokens?.toString() ?? null, diff --git a/apps/gateway/src/chat/middleware/chat-completion-log.spec.ts b/apps/gateway/src/chat/middleware/chat-completion-log.spec.ts new file mode 100644 index 000000000..0591052c5 --- /dev/null +++ b/apps/gateway/src/chat/middleware/chat-completion-log.spec.ts @@ -0,0 +1,32 @@ +import { describe, expect, it } from "vitest"; + +import { shouldSynthesizeClientError } from "./chat-completion-log.js"; + +describe("shouldSynthesizeClientError", () => { + it("synthesizes for 4xx responses when no logs are queued", () => { + expect(shouldSynthesizeClientError(400, [])).toBe(true); + expect(shouldSynthesizeClientError(429, [])).toBe(true); + }); + + it("skips synthesis when any terminal log is already queued", () => { + expect( + shouldSynthesizeClientError(400, [ + { + finishReason: "canceled", + } as never, + ]), + ).toBe(false); + expect( + shouldSynthesizeClientError(400, [ + { + finishReason: "content_filter", + } as never, + ]), + ).toBe(false); + }); + + it("skips synthesis for non-4xx responses", () => { + expect(shouldSynthesizeClientError(200, [])).toBe(false); + expect(shouldSynthesizeClientError(500, [])).toBe(false); + }); +}); diff --git a/apps/gateway/src/chat/middleware/chat-completion-log.ts b/apps/gateway/src/chat/middleware/chat-completion-log.ts new file mode 100644 index 000000000..f69e6f0f7 --- /dev/null +++ b/apps/gateway/src/chat/middleware/chat-completion-log.ts @@ -0,0 +1,351 @@ +import { createMiddleware } from "hono/factory"; +import { HTTPException } from "hono/http-exception"; + +import { + buildBaseLogEntry, + type ChatCompletionLogState, + updateBaseLogOptions, +} from "@/chat/tools/chat-log-context.js"; +import { extractCustomHeaders } from "@/chat/tools/extract-custom-headers.js"; +import { parseModelInput } from "@/chat/tools/parse-model-input.js"; +import { validateSource } from "@/chat/tools/validate-source.js"; +import { assertApiKeyWithinUsageLimits } from "@/lib/api-key-usage-limits.js"; +import { findApiKeyByToken, findProjectById } from "@/lib/cached-queries.js"; +import { parseApiToken } from "@/lib/extract-api-token.js"; +import { insertLog } from "@/lib/logs.js"; + +import { shortid } from "@llmgateway/db"; +import { logger } from "@llmgateway/logger"; + +import type { ServerTypes } from "@/vars.js"; +import type { LogInsertData } from "@llmgateway/db"; 
+import type { Context } from "hono"; + +function getRequestId(c: Context): string { + return c.req.header("x-request-id") ?? shortid(40); +} + +function getDebugMode(c: Context): boolean { + return ( + c.req.header("x-debug") === "true" || + process.env.FORCE_DEBUG_MODE === "true" || + process.env.NODE_ENV !== "production" + ); +} + +function getSource(c: Context): string | undefined { + let source = validateSource( + c.req.header("x-source"), + c.req.header("HTTP-Referer"), + ); + const userAgent = c.req.header("User-Agent"); + + if (!source && userAgent && /^claude-cli\/.+/.test(userAgent)) { + source = "claude.com/claude-code"; + } + + return source; +} + +function getRawRequestDetails(rawRequest: unknown): { + messages: unknown[]; + requestedModel: string; + requestedProvider?: string; + usedModelMapping?: string; + usedProvider: string; +} { + const messages = + typeof rawRequest === "object" && + rawRequest !== null && + "messages" in rawRequest && + Array.isArray(rawRequest.messages) + ? rawRequest.messages + : []; + + const requestedModel = + typeof rawRequest === "object" && + rawRequest !== null && + "model" in rawRequest && + typeof rawRequest.model === "string" + ? rawRequest.model + : "unknown"; + + if (requestedModel === "unknown") { + return { + messages, + requestedModel, + usedProvider: "llmgateway", + }; + } + + try { + const parsedModel = parseModelInput(requestedModel); + return { + messages, + requestedModel, + requestedProvider: parsedModel.requestedProvider, + usedModelMapping: parsedModel.requestedModel, + usedProvider: parsedModel.requestedProvider ?? "llmgateway", + }; + } catch { + return { + messages, + requestedModel, + usedProvider: "llmgateway", + }; + } +} + +async function getRawRequestPreview( + state: ChatCompletionLogState, +): Promise { + state.rawRequestPreviewPromise ??= state.rawRequestPreview + ?.json() + .catch(() => undefined); + + return await state.rawRequestPreviewPromise; +} + +async function buildFallbackBaseLogEntry( + c: Context, + state: ChatCompletionLogState, +): Promise | null> { + const existingBaseLogEntry = buildBaseLogEntry(c); + if (existingBaseLogEntry) { + return existingBaseLogEntry; + } + + const token = parseApiToken(c); + if (!token) { + return null; + } + + const apiKey = await findApiKeyByToken(token); + if (!apiKey || apiKey.status !== "active") { + return null; + } + + try { + assertApiKeyWithinUsageLimits(apiKey); + } catch { + return null; + } + + const project = await findProjectById(apiKey.projectId); + if (!project || project.status === "deleted") { + return null; + } + + const rawRequest = await getRawRequestPreview(state); + const rawRequestDetails = getRawRequestDetails(rawRequest); + + updateBaseLogOptions(c, { + requestId: getRequestId(c), + project, + apiKey, + usedModel: rawRequestDetails.requestedModel, + usedModelMapping: rawRequestDetails.usedModelMapping, + usedProvider: rawRequestDetails.usedProvider, + requestedModel: rawRequestDetails.requestedModel, + requestedProvider: rawRequestDetails.requestedProvider, + messages: rawRequestDetails.messages, + customHeaders: extractCustomHeaders(c), + debugMode: getDebugMode(c), + userAgent: c.req.header("User-Agent") ?? 
undefined, + source: getSource(c), + rawRequest, + }); + + return buildBaseLogEntry(c); +} + +async function getSynthesizedClientErrorDetails( + c: Context, + error: unknown, +): Promise<{ + responseText: string; + statusText: string; +}> { + if (error instanceof HTTPException) { + return { + responseText: error.message, + statusText: error.res?.statusText ?? "Client Error", + }; + } + + try { + const responseText = await c.res.clone().text(); + return { + responseText: responseText || "Client error", + statusText: c.res.statusText ?? "Client Error", + }; + } catch { + return { + responseText: error instanceof Error ? error.message : "Client error", + statusText: + error instanceof Error + ? error.name + : (c.res.statusText ?? "Client Error"), + }; + } +} + +async function getSynthesizedClientErrorLog( + c: Context, + state: ChatCompletionLogState, + status: number, + error: unknown, +): Promise { + const baseLogEntry = await buildFallbackBaseLogEntry(c, state); + if (!baseLogEntry) { + return null; + } + + const { responseText, statusText } = await getSynthesizedClientErrorDetails( + c, + error, + ); + + return { + ...baseLogEntry, + content: null, + responseSize: responseText.length, + finishReason: "client_error", + unifiedFinishReason: "client_error", + promptTokens: null, + completionTokens: null, + totalTokens: null, + reasoningTokens: null, + cachedTokens: null, + hasError: true, + streamed: + typeof baseLogEntry.rawRequest === "object" && + baseLogEntry.rawRequest !== null && + "stream" in baseLogEntry.rawRequest + ? Boolean(baseLogEntry.rawRequest.stream) + : false, + canceled: false, + errorDetails: { + statusCode: status, + statusText, + responseText, + }, + duration: 0, + timeToFirstToken: null, + timeToFirstReasoningToken: null, + inputCost: null, + outputCost: null, + cachedInputCost: null, + requestCost: null, + webSearchCost: null, + imageInputTokens: null, + imageOutputTokens: null, + imageInputCost: null, + imageOutputCost: null, + cost: null, + estimatedCost: false, + discount: null, + pricingTier: null, + dataStorageCost: "0", + cached: false, + toolResults: null, + }; +} + +export function shouldSynthesizeClientError( + status: number, + pendingLogs: LogInsertData[], +): boolean { + return status >= 400 && status < 500 && pendingLogs.length === 0; +} + +async function flushChatCompletionLogs( + c: Context, + state: ChatCompletionLogState, +) { + try { + await state.streamCompletion; + } catch (error) { + logger.error( + "Error waiting for chat stream completion before flushing logs", + error instanceof Error ? error : new Error(String(error)), + ); + } + + const status = + state.caughtError instanceof HTTPException + ? state.caughtError.status + : c.res.status; + + if (shouldSynthesizeClientError(status, state.pendingLogs)) { + const synthesizedLog = await getSynthesizedClientErrorLog( + c, + state, + status, + state.caughtError, + ); + if (synthesizedLog) { + state.pendingLogs.push(synthesizedLog); + state.clientErrorSynthesized = true; + } + } + + for (const logData of state.pendingLogs) { + try { + await insertLog( + { + ...logData, + ...(state.logIdOverride && !logData.retried + ? { id: state.logIdOverride } + : {}), + responsesApiData: + logData.responsesApiData ?? state.responsesApiData ?? null, + internalContentFilter: state.internalContentFilter + ? true + : logData.internalContentFilter, + gatewayContentFilterResponse: + logData.gatewayContentFilterResponse ?? 
+ (state.gatewayContentFilterResponse as + | LogInsertData["gatewayContentFilterResponse"] + | undefined) ?? + null, + }, + { syncInsert: state.syncInsert }, + ); + } catch (error) { + logger.error( + "Failed to flush queued chat completion log", + error instanceof Error ? error : new Error(String(error)), + ); + } + } +} + +export const chatCompletionLogMiddleware = createMiddleware( + async (c, next) => { + const state: ChatCompletionLogState = { + pendingLogs: [], + clientErrorSynthesized: false, + rawRequestPreview: c.req.raw.clone(), + }; + c.set("chatCompletionLogState", state); + + try { + await next(); + } catch (error) { + state.caughtError = error; + throw error; + } finally { + if (state.streamCompletion) { + void flushChatCompletionLogs(c, state).catch((error) => { + logger.error( + "Unexpected failure flushing queued chat completion logs", + error instanceof Error ? error : new Error(String(error)), + ); + }); + } else { + await flushChatCompletionLogs(c, state); + } + } + }, +); diff --git a/apps/gateway/src/chat/tools/chat-log-context.ts b/apps/gateway/src/chat/tools/chat-log-context.ts new file mode 100644 index 000000000..c9d14082f --- /dev/null +++ b/apps/gateway/src/chat/tools/chat-log-context.ts @@ -0,0 +1,145 @@ +import { logger } from "@llmgateway/logger"; + +import { + createLogEntry, + type CreateLogEntryOptions, +} from "./create-log-entry.js"; + +import type { ServerTypes } from "@/vars.js"; +import type { LogInsertData } from "@llmgateway/db"; +import type { Context } from "hono"; + +export interface ChatCompletionLogState { + pendingLogs: LogInsertData[]; + baseLogOptions?: Partial; + rawRequestPreview?: Request; + rawRequestPreviewPromise?: Promise; + streamCompletion?: Promise; + resolveStreamCompletion?: () => void; + caughtError?: unknown; + internalContentFilter?: boolean; + gatewayContentFilterResponse?: unknown; + clientErrorSynthesized?: boolean; + syncInsert?: boolean; + logIdOverride?: string; + responsesApiData?: unknown; +} + +function getOrCreateChatCompletionLogState( + c: Context, +): ChatCompletionLogState { + const existingState = c.get("chatCompletionLogState"); + if (existingState) { + return existingState; + } + + const nextState: ChatCompletionLogState = { + pendingLogs: [], + clientErrorSynthesized: false, + }; + c.set("chatCompletionLogState", nextState); + return nextState; +} + +export function getChatCompletionLogState( + c: Context, +): ChatCompletionLogState | undefined { + return c.get("chatCompletionLogState"); +} + +export function updateBaseLogOptions( + c: Context, + patch: Partial, +) { + const state = getOrCreateChatCompletionLogState(c); + state.baseLogOptions = { + ...state.baseLogOptions, + ...patch, + }; +} + +export function updateLogInsertOptions( + c: Context, + patch: Pick< + ChatCompletionLogState, + "syncInsert" | "logIdOverride" | "responsesApiData" + >, +) { + const state = getOrCreateChatCompletionLogState(c); + state.syncInsert = patch.syncInsert; + state.logIdOverride = patch.logIdOverride; + state.responsesApiData = patch.responsesApiData; +} + +function hasCompleteBaseLogOptions( + options?: Partial, +): options is CreateLogEntryOptions { + return Boolean( + options && + typeof options.requestId === "string" && + options.project && + options.apiKey && + typeof options.usedModel === "string" && + typeof options.usedProvider === "string" && + typeof options.requestedModel === "string" && + Array.isArray(options.messages) && + options.customHeaders !== undefined && + typeof options.debugMode === "boolean", + ); +} 
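// Aside, not part of the patch: hasCompleteBaseLogOptions is a type guard over
// options that accumulate piecemeal across middleware and route handlers. A
// minimal self-contained sketch of that accumulate-then-guard pattern follows,
// using invented field names (the real CreateLogEntryOptions carries many more):
interface SketchOptions {
	requestId: string;
	usedModel: string;
}

let pending: Partial<SketchOptions> = {};

// Each layer patches in only what it knows at that point in the request.
function patchOptions(patch: Partial<SketchOptions>): void {
	pending = { ...pending, ...patch };
}

// The guard narrows Partial<SketchOptions> to SketchOptions, mirroring
// hasCompleteBaseLogOptions above.
function isComplete(o: Partial<SketchOptions>): o is SketchOptions {
	return typeof o.requestId === "string" && typeof o.usedModel === "string";
}

patchOptions({ requestId: "req-1" });
patchOptions({ usedModel: "gpt-4o-mini" });
if (isComplete(pending)) {
	// Narrowed, so a complete log entry can be built safely.
	console.log(`flushing log for ${pending.usedModel}`);
}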
+ +export function buildBaseLogEntry( + c: Context, + patch: Partial = {}, +) { + const state = getOrCreateChatCompletionLogState(c); + const mergedOptions = { + ...state.baseLogOptions, + ...patch, + }; + + if (!hasCompleteBaseLogOptions(mergedOptions)) { + return null; + } + + return createLogEntry(mergedOptions); +} + +export function enqueueChatLog( + c: Context, + basePatch: Partial, + logFields: Omit>, +) { + const state = getOrCreateChatCompletionLogState(c); + const baseLogEntry = buildBaseLogEntry(c, basePatch); + + if (!baseLogEntry) { + logger.warn( + "Skipping chat log enqueue because base log options are incomplete", + { + requestId: state.baseLogOptions?.requestId, + }, + ); + return; + } + + state.pendingLogs.push({ + ...baseLogEntry, + ...logFields, + }); +} + +export function registerStreamCompletion(c: Context) { + const state = getOrCreateChatCompletionLogState(c); + state.streamCompletion ??= new Promise((resolve) => { + state.resolveStreamCompletion = resolve; + }); + + return state.streamCompletion; +} + +export function finishStreamCompletion(c: Context) { + const state = getOrCreateChatCompletionLogState(c); + state.resolveStreamCompletion?.(); + state.resolveStreamCompletion = undefined; +} diff --git a/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts b/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts index 231b2b2a6..e7f447d5a 100644 --- a/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts +++ b/apps/gateway/src/chat/tools/transform-streaming-to-openai.ts @@ -63,6 +63,26 @@ export function transformStreamingToOpenai( ): any { let transformedData = data; + const mapOpenAIResponsesUsage = (responseUsage: any) => { + if (!responseUsage) { + return null; + } + + return { + prompt_tokens: responseUsage.input_tokens ?? 0, + completion_tokens: responseUsage.output_tokens ?? 0, + total_tokens: responseUsage.total_tokens ?? 0, + ...(responseUsage.output_tokens_details?.reasoning_tokens && { + reasoning_tokens: responseUsage.output_tokens_details.reasoning_tokens, + }), + ...(responseUsage.input_tokens_details?.cached_tokens && { + prompt_tokens_details: { + cached_tokens: responseUsage.input_tokens_details.cached_tokens, + }, + }), + }; + }; + const isKnownNonRenderableAwsBedrockDelta = (delta: any): boolean => { if (!delta || typeof delta !== "object") { return false; @@ -817,7 +837,13 @@ export function transformStreamingToOpenai( case "response.output_text.done": case "response.web_search_call.in_progress": case "response.web_search_call.searching": - case "response.web_search_call.completed": + case "response.web_search_call.completed": { + const responseStatus = data.response?.status; + const isCompletedTerminalEvent = + responseStatus === "completed" && + (data.type === "response.output_item.done" || + data.type === "response.content_part.done" || + data.type === "response.output_text.done"); transformedData = { id: data.response?.id ?? `chatcmpl-${Date.now()}`, object: "chat.completion.chunk", @@ -828,12 +854,15 @@ export function transformStreamingToOpenai( { index: 0, delta: { role: "assistant" }, - finish_reason: null, + finish_reason: isCompletedTerminalEvent ? "stop" : null, }, ], - usage: null, + usage: isCompletedTerminalEvent + ? 
mapOpenAIResponsesUsage(data.response?.usage) + : null, }; break; + } case "response.reasoning_summary_part.added": case "response.reasoning_summary_text.delta": @@ -956,25 +985,6 @@ export function transformStreamingToOpenai( } case "response.completed": { - const responseUsage = data.response?.usage; - let usage = null; - if (responseUsage) { - usage = { - prompt_tokens: responseUsage.input_tokens ?? 0, - completion_tokens: responseUsage.output_tokens ?? 0, - total_tokens: responseUsage.total_tokens ?? 0, - ...(responseUsage.output_tokens_details?.reasoning_tokens && { - reasoning_tokens: - responseUsage.output_tokens_details.reasoning_tokens, - }), - ...(responseUsage.input_tokens_details?.cached_tokens && { - prompt_tokens_details: { - cached_tokens: - responseUsage.input_tokens_details.cached_tokens, - }, - }), - }; - } transformedData = { id: data.response?.id ?? `chatcmpl-${Date.now()}`, object: "chat.completion.chunk", @@ -988,31 +998,12 @@ export function transformStreamingToOpenai( finish_reason: "stop", }, ], - usage, + usage: mapOpenAIResponsesUsage(data.response?.usage), }; break; } case "response.incomplete": { - const incompleteUsage = data.response?.usage; - let usage = null; - if (incompleteUsage) { - usage = { - prompt_tokens: incompleteUsage.input_tokens ?? 0, - completion_tokens: incompleteUsage.output_tokens ?? 0, - total_tokens: incompleteUsage.total_tokens ?? 0, - ...(incompleteUsage.output_tokens_details?.reasoning_tokens && { - reasoning_tokens: - incompleteUsage.output_tokens_details.reasoning_tokens, - }), - ...(incompleteUsage.input_tokens_details?.cached_tokens && { - prompt_tokens_details: { - cached_tokens: - incompleteUsage.input_tokens_details.cached_tokens, - }, - }), - }; - } const reason = data.response?.incomplete_details?.reason; // Map incomplete reason to appropriate finish_reason const mappedFinishReason = @@ -1030,7 +1021,7 @@ export function transformStreamingToOpenai( finish_reason: mappedFinishReason, }, ], - usage, + usage: mapOpenAIResponsesUsage(data.response?.usage), }; break; } diff --git a/apps/gateway/src/test-utils/test-helpers.ts b/apps/gateway/src/test-utils/test-helpers.ts index c4e7a5991..42cfcc9d0 100644 --- a/apps/gateway/src/test-utils/test-helpers.ts +++ b/apps/gateway/src/test-utils/test-helpers.ts @@ -7,6 +7,10 @@ export async function clearCache() { await redisClient.flushdb(); } +export async function processPendingLogs() { + await processLogQueue(); +} + /** * Helper function to wait for logs to be processed by the worker * @param expectedCount The expected number of logs diff --git a/apps/gateway/src/vars.ts b/apps/gateway/src/vars.ts index bb9187e75..dd4d785c1 100644 --- a/apps/gateway/src/vars.ts +++ b/apps/gateway/src/vars.ts @@ -1,8 +1,10 @@ +import type { ChatCompletionLogState } from "@/chat/tools/chat-log-context.js"; import type { Env } from "hono/types"; export interface ServerTypes extends Env { Variables: { traceId?: string; spanId?: string; + chatCompletionLogState?: ChatCompletionLogState; }; } diff --git a/apps/ui/src/app/dashboard/[orgId]/[projectId]/activity/[logId]/log-detail-client.tsx b/apps/ui/src/app/dashboard/[orgId]/[projectId]/activity/[logId]/log-detail-client.tsx index 92b43aa88..3962b29cd 100644 --- a/apps/ui/src/app/dashboard/[orgId]/[projectId]/activity/[logId]/log-detail-client.tsx +++ b/apps/ui/src/app/dashboard/[orgId]/[projectId]/activity/[logId]/log-detail-client.tsx @@ -155,8 +155,14 @@ function StatusIndicator({ log }: { log: Partial }) { let color = "text-emerald-500"; let bgColor = 
"bg-emerald-500/10"; let label = "Completed"; + const isClientError = log.unifiedFinishReason === "client_error"; - if (log.hasError || log.unifiedFinishReason === "error") { + if (isClientError) { + StatusIcon = AlertCircle; + color = "text-orange-500"; + bgColor = "bg-orange-500/10"; + label = "Client Error"; + } else if (log.hasError || log.unifiedFinishReason === "error") { StatusIcon = AlertCircle; color = "text-red-500"; bgColor = "bg-red-500/10"; @@ -434,6 +440,7 @@ export function LogDetailClient({ log.dataStorageCost !== null && log.dataStorageCost !== undefined && Number(log.dataStorageCost) > 0; + const isClientError = log.unifiedFinishReason === "client_error"; const throughput = log.duration && log.totalTokens @@ -1202,23 +1209,37 @@ export function LogDetailClient({ {log.hasError && !!log.errorDetails && (
								[hunk garbled in extraction: the JSX tags of the error-details grid were stripped, leaving only diff markers and text nodes. Per the hunk header (-1202,23 +1209,37), the change rewraps the three label/value pairs ("Status Code" with {log.errorDetails.statusCode}, "Status Text" with {log.errorDetails.statusText}, and "Error Message" with {log.errorDetails.responseText}) onto more lines; the rendered labels and values are unchanged.]
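Both status surfaces in this change (the detail page above and LogCard below) check unifiedFinishReason === "client_error" before the generic error branch. The ordering matters: synthesized client-error logs carry hasError = true, so testing hasError first would paint them red. A minimal sketch of the precedence, using the field names shown in the diff (the "Error" label string here is illustrative):

function statusLabel(log: {
	hasError?: boolean;
	unifiedFinishReason?: string;
}): string {
	if (log.unifiedFinishReason === "client_error") {
		return "Client Error"; // orange AlertCircle in both components
	}
	if (log.hasError || log.unifiedFinishReason === "error") {
		return "Error"; // red AlertCircle
	}
	return "Completed"; // green check
}

// A log synthesized by the middleware hits the client_error branch first:
console.log(statusLabel({ hasError: true, unifiedFinishReason: "client_error" }));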
diff --git a/apps/worker/src/worker.ts b/apps/worker/src/worker.ts index d11777b74..f4fd40c38 100644 --- a/apps/worker/src/worker.ts +++ b/apps/worker/src/worker.ts @@ -266,7 +266,7 @@ export async function processAutoTopUp(): Promise { // Filter organizations that need top-up based on credits vs threshold const filteredOrgs = orgsNeedingTopUp.filter((org) => { - const credits = Number(org.credits || 0); + const credits = Number(org.credits ?? 0); const threshold = Number(org.autoTopUpThreshold ?? 10); return credits < threshold; }); @@ -834,10 +834,10 @@ export async function batchProcessLogs(): Promise { // First, try to deduct from dev plan credits if available if (org && org.devPlan !== "none") { const devPlanCreditsLimit = new Decimal( - org.devPlanCreditsLimit || "0", + org.devPlanCreditsLimit ?? "0", ); const devPlanCreditsUsed = new Decimal( - org.devPlanCreditsUsed || "0", + org.devPlanCreditsUsed ?? "0", ); const devPlanRemaining = devPlanCreditsLimit.minus(devPlanCreditsUsed); diff --git a/packages/shared/src/components/log-card.tsx b/packages/shared/src/components/log-card.tsx index 5e708300e..a42432168 100644 --- a/packages/shared/src/components/log-card.tsx +++ b/packages/shared/src/components/log-card.tsx @@ -384,13 +384,18 @@ export function LogCard({ }); const detailUrl = getDetailUrl?.(log.id); + const isClientError = log.unifiedFinishReason === "client_error"; // Status icon logic let StatusIcon = CheckCircle2; let color = "text-green-500"; let bgColor = "bg-green-100 dark:bg-green-900/30"; - if (log.hasError || log.unifiedFinishReason === "error") { + if (isClientError) { + StatusIcon = AlertCircle; + color = "text-orange-500"; + bgColor = "bg-orange-100 dark:bg-orange-900/30"; + } else if (log.hasError || log.unifiedFinishReason === "error") { StatusIcon = AlertCircle; color = "text-red-500"; bgColor = "bg-red-100 dark:bg-red-900/30"; @@ -475,11 +480,18 @@ export function LogCard({ )} {log.unifiedFinishReason}