diff --git a/src/ai-sdk/examples/openai-image.ts b/src/ai-sdk/examples/openai-image.ts
new file mode 100644
index 00000000..d81605a4
--- /dev/null
+++ b/src/ai-sdk/examples/openai-image.ts
@@ -0,0 +1,36 @@
+import { generateImage } from "ai";
+import { openai } from "../index";
+
+async function main() {
+  console.log("generating image with gpt-image-1...");
+  const { images } = await generateImage({
+    model: openai.imageModel("gpt-image-1"),
+    prompt:
+      "a futuristic city at sunset, neon lights reflecting on wet streets",
+    size: "1024x1024",
+  });
+
+  console.log(`image generated: ${images[0]!.uint8Array.byteLength} bytes`);
+  await Bun.write("output/openai-gpt-image.png", images[0]!.uint8Array);
+
+  console.log("\ngenerating with dall-e-3...");
+  const { images: dalleImages } = await generateImage({
+    model: openai.imageModel("dall-e-3"),
+    prompt:
+      "a whimsical treehouse in an enchanted forest, fairy lights, cozy atmosphere",
+    aspectRatio: "16:9",
+    providerOptions: {
+      openai: {
+        quality: "hd",
+        style: "vivid",
+      },
+    },
+  });
+
+  console.log(`dall-e-3 image: ${dalleImages[0]!.uint8Array.byteLength} bytes`);
+  await Bun.write("output/openai-dalle3.png", dalleImages[0]!.uint8Array);
+
+  console.log("\ndone!");
+}
+
+main().catch(console.error);
diff --git a/src/ai-sdk/examples/openai-transcription.ts b/src/ai-sdk/examples/openai-transcription.ts
new file mode 100644
index 00000000..5d2348e5
--- /dev/null
+++ b/src/ai-sdk/examples/openai-transcription.ts
@@ -0,0 +1,37 @@
+import { experimental_transcribe as transcribe } from "ai";
+import { openai } from "../index";
+
+async function main() {
+  const audioPath = process.argv[2] || "media/sample-audio.mp3";
+  console.log(`transcribing ${audioPath} with whisper-1...`);
+
+  const audioFile = Bun.file(audioPath);
+  const audioBuffer = await audioFile.arrayBuffer();
+
+  const result = await transcribe({
+    model: openai.transcriptionModel("whisper-1"),
+    audio: new Uint8Array(audioBuffer),
+    providerOptions: {
+      openai: {
+        language: "en",
+        timestamp_granularities: ["segment"],
+      },
+    },
+  });
+
+  console.log("\ntranscription:");
+  console.log(result.text);
+
+  if (result.segments && result.segments.length > 0) {
+    console.log("\nsegments:");
+    for (const segment of result.segments) {
+      console.log(
+        ` [${segment.startSecond.toFixed(2)}s - ${segment.endSecond.toFixed(2)}s] ${segment.text}`,
+      );
+    }
+  }
+
+  console.log("\ndone!");
+}
+
+main().catch(console.error);
diff --git a/src/ai-sdk/providers/openai.ts b/src/ai-sdk/providers/openai.ts
index caeeb2c7..b9681782 100644
--- a/src/ai-sdk/providers/openai.ts
+++ b/src/ai-sdk/providers/openai.ts
@@ -3,18 +3,63 @@ import {
   type OpenAIProvider as OpenAIProviderBase,
   type OpenAIProviderSettings,
 } from "@ai-sdk/openai";
-import type { SharedV3Warning } from "@ai-sdk/provider";
+import type {
+  ImageModelV3,
+  ImageModelV3CallOptions,
+  SharedV3Warning,
+  TranscriptionModelV3,
+  TranscriptionModelV3CallOptions,
+} from "@ai-sdk/provider";
 import type { VideoModelV3, VideoModelV3CallOptions } from "../video-model";
 
 // re-export base types
 export type { OpenAIProviderSettings };
 
+// video models
 const VIDEO_MODELS = ["sora-2", "sora-2-pro"] as const;
 type VideoModelId = (typeof VIDEO_MODELS)[number];
 
-const SIZE_MAP: Record<string, string> = {
+// image models
+const IMAGE_MODELS = [
+  "gpt-image-1",
+  "gpt-image-1-mini",
+  "gpt-image-1.5",
+  "dall-e-2",
+  "dall-e-3",
+] as const;
+type ImageModelId = (typeof IMAGE_MODELS)[number];
+
+// transcription models
+const TRANSCRIPTION_MODELS = [
+  "whisper-1",
+  "gpt-4o-transcribe",
+  "gpt-4o-mini-transcribe",
+] as const;
+type TranscriptionModelId = (typeof TRANSCRIPTION_MODELS)[number];
+
+// sora video size mappings (NOTE(review): 1024x1792 / 1792x1024 are 4:7 / 7:4, not 9:21 / 21:9 - confirm these keys and the 1:1 entry against the sora api docs)
+const VIDEO_SIZE_MAP: Record<string, string> = {
   "9:16": "720x1280",
   "16:9": "1280x720",
+  "1:1": "1024x1024",
+  // additional supported sizes
+  "9:21": "1024x1792",
+  "21:9": "1792x1024",
+};
+
+// image size mappings per model
+const DALLE3_SIZE_MAP: Record<string, string> = {
+  "1:1": "1024x1024",
+  "16:9": "1792x1024",
+  "9:16": "1024x1792",
+};
+
+const DALLE2_SIZES = ["256x256", "512x512", "1024x1024"];
+
+const GPT_IMAGE_SIZE_MAP: Record<string, string> = {
+  "1:1": "1024x1024",
+  "3:2": "1536x1024",
+  "2:3": "1024x1536",
 };
 
 class OpenAIVideoModel implements VideoModelV3 {
@@ -65,14 +110,14 @@ class OpenAIVideoModel implements VideoModelV3 {
 
     // size from aspect ratio
     if (aspectRatio) {
-      const size = SIZE_MAP[aspectRatio];
+      const size = VIDEO_SIZE_MAP[aspectRatio];
       if (size) {
         formData.append("size", size);
       } else {
         warnings.push({
           type: "unsupported",
           feature: "aspectRatio",
-          details: `Aspect ratio ${aspectRatio} not directly supported. Use 9:16, 16:9, or 1:1.`,
+          details: `Aspect ratio ${aspectRatio} not directly supported. Supported: 9:16, 16:9, 1:1, 9:21, 21:9.`,
         });
       }
     }
@@ -136,7 +181,8 @@ class OpenAIVideoModel implements VideoModelV3 {
       warnings.push({
         type: "unsupported",
         feature: "resolution",
-        details: "Use aspectRatio instead. Sora supports 9:16, 16:9, 1:1.",
+        details:
+          "Use aspectRatio instead. Sora supports 9:16, 16:9, 1:1, 9:21, 21:9.",
       });
     }
 
@@ -223,8 +269,328 @@ class OpenAIVideoModel implements VideoModelV3 {
   }
 }
 
+/**
+ * Image model backed by the OpenAI `/images/generations` endpoint.
+ * Handles the gpt-image family as well as dall-e-2 and dall-e-3.
+ */
+class OpenAIImageModel implements ImageModelV3 {
+  readonly specificationVersion = "v3" as const;
+  readonly provider = "openai";
+  readonly modelId: string;
+  readonly maxImagesPerCall: number;
+
+  private apiKey: string;
+  private baseURL: string;
+
+  constructor(
+    modelId: string,
+    options: { apiKey?: string; baseURL?: string } = {},
+  ) {
+    this.modelId = modelId;
+    this.apiKey = options.apiKey ?? process.env.OPENAI_API_KEY ?? "";
+    this.baseURL = options.baseURL ?? "https://api.openai.com/v1";
+    // dall-e-3 only supports n=1, others support more
+    this.maxImagesPerCall = modelId === "dall-e-3" ? 1 : 10;
+  }
+
+  /**
+   * Sends one generation request and decodes the base64 payloads.
+   *
+   * @returns the decoded images plus a warning for every ignored option
+   * @throws Error when the api answers with a non-ok status or an item
+   *   carries no b64_json payload
+   */
+  async doGenerate(options: ImageModelV3CallOptions) {
+    const {
+      prompt,
+      n = 1,
+      size,
+      aspectRatio,
+      seed,
+      providerOptions,
+      abortSignal,
+    } = options;
+    const warnings: SharedV3Warning[] = [];
+
+    const isGptImage = this.modelId.startsWith("gpt-image");
+    const isDalle3 = this.modelId === "dall-e-3";
+    const isDalle2 = this.modelId === "dall-e-2";
+
+    // build request body
+    const body: Record<string, unknown> = {
+      model: this.modelId,
+      prompt: prompt ?? "",
+      n: isDalle3 ? 1 : n,
+    };
+
+    // handle size/aspectRatio (an explicit size always wins)
+    let resolvedSize: string | undefined;
+    if (size) {
+      resolvedSize = size;
+    } else if (aspectRatio) {
+      if (isGptImage) {
+        resolvedSize = GPT_IMAGE_SIZE_MAP[aspectRatio];
+      } else if (isDalle3) {
+        resolvedSize = DALLE3_SIZE_MAP[aspectRatio];
+      } else if (isDalle2) {
+        // dalle-2 only supports square
+        resolvedSize = "1024x1024";
+        if (aspectRatio !== "1:1") {
+          warnings.push({
+            type: "unsupported",
+            feature: "aspectRatio",
+            details: `DALL-E 2 only supports 1:1. Using 1024x1024.`,
+          });
+        }
+      }
+
+      // no mapping for this ratio: report it instead of dropping it silently
+      if (!resolvedSize) {
+        warnings.push({
+          type: "unsupported",
+          feature: "aspectRatio",
+          details: `Aspect ratio ${aspectRatio} is not supported by ${this.modelId}. Using the default size.`,
+        });
+      }
+    }
+
+    if (resolvedSize) {
+      body.size = resolvedSize;
+    }
+
+    // gpt-image specific options
+    if (isGptImage) {
+      // response_format is not supported for gpt-image models (they always
+      // return b64_json), so it is deliberately not sent
+      const gptOptions = providerOptions?.openai as
+        | Record<string, unknown>
+        | undefined;
+      if (gptOptions?.quality) {
+        body.quality = gptOptions.quality; // low, medium, high, auto
+      }
+      if (gptOptions?.output_format) {
+        body.output_format = gptOptions.output_format; // png, jpeg, webp
+      }
+      if (gptOptions?.background) {
+        body.background = gptOptions.background; // transparent, opaque, auto
+      }
+    } else {
+      // dall-e models: request base64 so the response can be decoded below
+      body.response_format = "b64_json";
+
+      if (isDalle3) {
+        const dalleOptions = providerOptions?.openai as
+          | Record<string, unknown>
+          | undefined;
+        if (dalleOptions?.quality) {
+          body.quality = dalleOptions.quality; // hd, standard
+        }
+        if (dalleOptions?.style) {
+          body.style = dalleOptions.style; // vivid, natural
+        }
+      }
+    }
+
+    // seed not supported
+    if (seed !== undefined) {
+      warnings.push({
+        type: "unsupported",
+        feature: "seed",
+        details: "Seed is not supported by OpenAI image models",
+      });
+    }
+
+    // dall-e-3 only supports n=1
+    if (isDalle3 && n > 1) {
+      warnings.push({
+        type: "unsupported",
+        feature: "n",
+        details: `DALL-E 3 only supports n=1. Requested ${n}, generating 1 image.`,
+      });
+    }
+
+    const response = await fetch(`${this.baseURL}/images/generations`, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${this.apiKey}`,
+        "Content-Type": "application/json",
+      },
+      body: JSON.stringify(body),
+      signal: abortSignal,
+    });
+
+    if (!response.ok) {
+      const error = await response.text();
+      throw new Error(`OpenAI image generation failed: ${error}`);
+    }
+
+    const result = (await response.json()) as {
+      data: Array<{ b64_json?: string; url?: string; revised_prompt?: string }>;
+    };
+
+    const images = result.data.map((item) => {
+      if (item.b64_json) {
+        return Uint8Array.from(atob(item.b64_json), (c) => c.charCodeAt(0));
+      }
+      throw new Error("Expected b64_json response from OpenAI");
+    });
+
+    return {
+      images,
+      warnings,
+      response: {
+        timestamp: new Date(),
+        modelId: this.modelId,
+        headers: undefined,
+      },
+    };
+  }
+}
+
+/**
+ * Transcription model backed by the OpenAI `/audio/transcriptions` endpoint.
+ * Supports whisper-1 and the gpt-4o transcription models.
+ */
+class OpenAITranscriptionModel implements TranscriptionModelV3 {
+  readonly specificationVersion = "v3" as const;
+  readonly provider = "openai";
+  readonly modelId: string;
+
+  private apiKey: string;
+  private baseURL: string;
+
+  constructor(
+    modelId: string,
+    options: { apiKey?: string; baseURL?: string } = {},
+  ) {
+    this.modelId = modelId;
+    this.apiKey = options.apiKey ?? process.env.OPENAI_API_KEY ?? "";
+    this.baseURL = options.baseURL ?? "https://api.openai.com/v1";
+  }
+
+  /**
+   * Uploads the audio as multipart form data and maps the response.
+   *
+   * @returns text, segments (empty when the api returns none), language,
+   *   duration and warnings
+   * @throws Error when the api answers with a non-ok status
+   */
+  async doGenerate(options: TranscriptionModelV3CallOptions) {
+    const { audio, mediaType, providerOptions, abortSignal } = options;
+    const warnings: SharedV3Warning[] = [];
+
+    // convert audio to blob (string input is treated as base64)
+    const audioBytes =
+      typeof audio === "string"
+        ? Uint8Array.from(atob(audio), (c) => c.charCodeAt(0))
+        : audio;
+
+    // determine file extension from media type
+    const extMap: Record<string, string> = {
+      "audio/flac": "flac",
+      "audio/mpeg": "mp3",
+      "audio/mp3": "mp3",
+      "audio/mp4": "mp4",
+      "audio/m4a": "m4a",
+      "audio/ogg": "ogg",
+      "audio/wav": "wav",
+      "audio/webm": "webm",
+    };
+    const ext = extMap[mediaType] ?? "mp3";
+
+    const blob = new Blob([audioBytes], { type: mediaType });
+
+    // build form data
+    const formData = new FormData();
+    formData.append("file", blob, `audio.${ext}`);
+    formData.append("model", this.modelId);
+
+    // verbose_json carries segment timestamps, but the gpt-4o transcription
+    // models only accept json, so the format depends on the model
+    const responseFormat = this.modelId.startsWith("gpt-4o")
+      ? "json"
+      : "verbose_json";
+    formData.append("response_format", responseFormat);
+
+    // provider options
+    const openaiOptions = providerOptions?.openai as
+      | Record<string, unknown>
+      | undefined;
+    if (openaiOptions?.language) {
+      formData.append("language", String(openaiOptions.language));
+    }
+    if (openaiOptions?.prompt) {
+      formData.append("prompt", String(openaiOptions.prompt));
+    }
+    if (openaiOptions?.temperature !== undefined) {
+      formData.append("temperature", String(openaiOptions.temperature));
+    }
+    // whisper-1 supports timestamp granularities
+    if (
+      this.modelId === "whisper-1" &&
+      openaiOptions?.timestamp_granularities
+    ) {
+      // accept a single value as well as a list
+      const raw = openaiOptions.timestamp_granularities;
+      const granularities: unknown[] = Array.isArray(raw) ? raw : [raw];
+      for (const g of granularities) {
+        formData.append("timestamp_granularities[]", String(g));
+      }
+    }
+
+    const response = await fetch(`${this.baseURL}/audio/transcriptions`, {
+      method: "POST",
+      headers: {
+        Authorization: `Bearer ${this.apiKey}`,
+      },
+      body: formData,
+      signal: abortSignal,
+    });
+
+    if (!response.ok) {
+      const error = await response.text();
+      throw new Error(`OpenAI transcription failed: ${error}`);
+    }
+
+    const result = (await response.json()) as {
+      text: string;
+      language?: string;
+      duration?: number;
+      segments?: Array<{
+        start: number;
+        end: number;
+        text: string;
+      }>;
+      words?: Array<{
+        start: number;
+        end: number;
+        word: string;
+      }>;
+    };
+
+    return {
+      text: result.text,
+      segments: (result.segments ?? []).map((seg) => ({
+        text: seg.text,
+        startSecond: seg.start,
+        endSecond: seg.end,
+      })),
+      language: result.language,
+      durationInSeconds: result.duration,
+      warnings,
+      response: {
+        timestamp: new Date(),
+        modelId: this.modelId,
+        headers: undefined,
+      },
+    };
+  }
+}
+
 export interface OpenAIProvider extends OpenAIProviderBase {
   videoModel(modelId: VideoModelId): VideoModelV3;
+  imageModel(modelId: ImageModelId): ImageModelV3;
+  transcriptionModel(modelId: TranscriptionModelId): TranscriptionModelV3;
 }
 
 export function createOpenAI(
@@ -245,6 +611,22 @@ export function createOpenAI(
       baseURL: settings.baseURL,
     });
 
+  // add imageModel method
+  provider.imageModel = (modelId: ImageModelId): ImageModelV3 =>
+    new OpenAIImageModel(modelId, {
+      apiKey: settings.apiKey,
+      baseURL: settings.baseURL,
+    });
+
+  // add transcriptionModel method
+  provider.transcriptionModel = (
+    modelId: TranscriptionModelId,
+  ): TranscriptionModelV3 =>
+    new OpenAITranscriptionModel(modelId, {
+      apiKey: settings.apiKey,
+      baseURL: settings.baseURL,
+    });
+
   return provider;
 }
 