From ef4e3cb3f370e668b039c7b07fdfd79b5d49616c Mon Sep 17 00:00:00 2001 From: waibiwaibi <141296631+waibiwaibig@users.noreply.github.com> Date: Sat, 20 Jun 2026 21:06:22 +0800 Subject: [PATCH 1/3] Add SenseVoice local STT model support --- .../server/speech/providers/local/models.ts | 5 + .../providers/local/sherpa/model-catalog.ts | 57 +++++++- .../local/sherpa/model-downloader.test.ts | 22 +++ .../sherpa/sherpa-offline-recognizer.test.ts | 89 ++++++++++++ .../local/sherpa/sherpa-offline-recognizer.ts | 128 ++++++++++++++---- .../speech/providers/local/worker-process.ts | 53 ++++++-- .../speech/speech-config-resolver.test.ts | 31 +++++ public-docs/voice.md | 31 ++++- 8 files changed, 373 insertions(+), 43 deletions(-) create mode 100644 packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.test.ts diff --git a/packages/server/src/server/speech/providers/local/models.ts b/packages/server/src/server/speech/providers/local/models.ts index fbbaaa8d6a..f4c94cfe3a 100644 --- a/packages/server/src/server/speech/providers/local/models.ts +++ b/packages/server/src/server/speech/providers/local/models.ts @@ -2,6 +2,7 @@ import { ensureSherpaOnnxModels, getSherpaOnnxModelDir } from "./sherpa/model-do import { DEFAULT_LOCAL_STT_MODEL, DEFAULT_LOCAL_TTS_MODEL, + getSherpaOnnxModelSpec, LocalSttModelIdSchema, LocalTtsModelIdSchema, listSherpaOnnxModels, @@ -30,6 +31,10 @@ export function getLocalSpeechModelDir(modelsDir: string, modelId: LocalSpeechMo return getSherpaOnnxModelDir(modelsDir, modelId); } +export function getLocalSpeechModelSpec(modelId: LocalSpeechModelId): LocalSpeechModelSpec { + return getSherpaOnnxModelSpec(modelId); +} + export async function ensureLocalSpeechModels(options: { modelsDir: string; modelIds: LocalSpeechModelId[]; diff --git a/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts b/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts index 06d838c49a..536a1fafe6 100644 --- a/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts +++ b/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts @@ -4,7 +4,23 @@ export type SherpaOnnxModelKind = "stt-offline" | "tts"; type DefaultModelRole = "stt" | "tts"; -interface SherpaOnnxCatalogEntry { +export type SherpaOfflineRecognizerModelSpec = + | { + kind: "nemo_transducer"; + encoder: string; + decoder: string; + joiner: string; + tokens: string; + } + | { + kind: "sense_voice"; + model: string; + tokens: string; + language: "auto"; + useInverseTextNormalization: boolean; + }; + +interface SherpaOnnxCatalogEntryBase { kind: SherpaOnnxModelKind; archiveUrl: string; extractedDir: string; @@ -13,6 +29,15 @@ interface SherpaOnnxCatalogEntry { defaultFor?: DefaultModelRole; } +type SherpaOnnxCatalogEntry = + | (SherpaOnnxCatalogEntryBase & { + kind: "stt-offline"; + recognizer: SherpaOfflineRecognizerModelSpec; + }) + | (SherpaOnnxCatalogEntryBase & { + kind: "tts"; + }); + export const SHERPA_ONNX_MODEL_CATALOG = { "parakeet-tdt-0.6b-v2-int8": { kind: "stt-offline", @@ -20,6 +45,13 @@ export const SHERPA_ONNX_MODEL_CATALOG = { "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2", extractedDir: "sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8", requiredFiles: ["encoder.int8.onnx", "decoder.int8.onnx", "joiner.int8.onnx", "tokens.txt"], + recognizer: { + kind: "nemo_transducer", + encoder: "encoder.int8.onnx", + decoder: "decoder.int8.onnx", + joiner: "joiner.int8.onnx", + tokens: "tokens.txt", + }, description: "NVIDIA Parakeet TDT v2 (offline NeMo transducer, English).", defaultFor: "stt", }, @@ -29,9 +61,32 @@ export const SHERPA_ONNX_MODEL_CATALOG = { "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8.tar.bz2", extractedDir: "sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8", requiredFiles: ["encoder.int8.onnx", "decoder.int8.onnx", "joiner.int8.onnx", "tokens.txt"], + recognizer: { + kind: "nemo_transducer", + encoder: "encoder.int8.onnx", + decoder: "decoder.int8.onnx", + joiner: "joiner.int8.onnx", + tokens: "tokens.txt", + }, description: "NVIDIA Parakeet TDT v3 (offline NeMo transducer, 25 European languages, auto-detected).", }, + "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09": { + kind: "stt-offline", + archiveUrl: + "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09.tar.bz2", + extractedDir: "sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09", + requiredFiles: ["model.int8.onnx", "tokens.txt"], + recognizer: { + kind: "sense_voice", + model: "model.int8.onnx", + tokens: "tokens.txt", + language: "auto", + useInverseTextNormalization: true, + }, + description: + "SenseVoice int8 (offline, Chinese/English/Japanese/Korean/Cantonese, auto-detected).", + }, "kokoro-en-v0_19": { kind: "tts", archiveUrl: diff --git a/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts b/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts index 993c959b4d..b8d6b4bbb8 100644 --- a/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts +++ b/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts @@ -6,6 +6,8 @@ import pino from "pino"; import { ensureSherpaOnnxModel, getSherpaOnnxModelDir } from "./model-downloader.js"; +const SENSE_VOICE_MODEL = "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09"; + function makeTmpDir(): string { return mkdtempSync(path.join(tmpdir(), "paseo-speech-models-")); } @@ -19,6 +21,9 @@ describe("sherpa model downloader", () => { "sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8", ); expect(getSherpaOnnxModelDir(modelsDir, "kokoro-en-v0_19")).toContain("kokoro-en-v0_19"); + expect(getSherpaOnnxModelDir(modelsDir, SENSE_VOICE_MODEL)).toContain( + "sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09", + ); }); test("ensureSherpaOnnxModel succeeds without downloading when files exist", async () => { @@ -38,4 +43,21 @@ describe("sherpa model downloader", () => { expect(out).toBe(modelDir); }); + + test("ensureSherpaOnnxModel accepts existing SenseVoice files without downloading", async () => { + const modelsDir = makeTmpDir(); + const modelDir = getSherpaOnnxModelDir(modelsDir, SENSE_VOICE_MODEL); + + mkdirSync(modelDir, { recursive: true }); + writeFileSync(path.join(modelDir, "model.int8.onnx"), "x"); + writeFileSync(path.join(modelDir, "tokens.txt"), "x"); + + const out = await ensureSherpaOnnxModel({ + modelsDir, + modelId: SENSE_VOICE_MODEL, + logger, + }); + + expect(out).toBe(modelDir); + }); }); diff --git a/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.test.ts b/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.test.ts new file mode 100644 index 0000000000..0680cc432b --- /dev/null +++ b/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.test.ts @@ -0,0 +1,89 @@ +import { mkdtempSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import path from "node:path"; + +import pino from "pino"; +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const mockState = vi.hoisted(() => ({ + offlineRecognizerConfigs: [] as unknown[], +})); + +vi.mock("./sherpa-onnx-node-loader.js", () => ({ + loadSherpaOnnxNode: () => ({ + OfflineRecognizer: class { + public readonly config: unknown; + + constructor(config: unknown) { + this.config = config; + mockState.offlineRecognizerConfigs.push(config); + } + + createStream() { + return {}; + } + + decode() {} + + getResult() { + return ""; + } + }, + }), +})); + +function makeTmpDir(): string { + return mkdtempSync(path.join(tmpdir(), "paseo-sherpa-recognizer-")); +} + +function touch(filePath: string): string { + writeFileSync(filePath, "x"); + return filePath; +} + +describe("SherpaOfflineRecognizerEngine", () => { + beforeEach(() => { + mockState.offlineRecognizerConfigs.length = 0; + }); + + it("initializes SenseVoice recognizers with sherpa-onnx senseVoice config", async () => { + const { SherpaOfflineRecognizerEngine } = await import("./sherpa-offline-recognizer.js"); + const dir = makeTmpDir(); + + const engine = new SherpaOfflineRecognizerEngine( + { + model: { + kind: "sense_voice", + model: touch(path.join(dir, "model.int8.onnx")), + tokens: touch(path.join(dir, "tokens.txt")), + language: "auto", + useInverseTextNormalization: true, + }, + numThreads: 2, + debug: 0, + }, + pino({ level: "silent" }), + ); + + expect(engine.sampleRate).toBe(16000); + expect(mockState.offlineRecognizerConfigs).toEqual([ + { + featConfig: { + sampleRate: 16000, + featureDim: 80, + }, + modelConfig: { + senseVoice: { + model: path.join(dir, "model.int8.onnx"), + language: "auto", + useInverseTextNormalization: 1, + }, + tokens: path.join(dir, "tokens.txt"), + numThreads: 2, + provider: "cpu", + debug: 0, + }, + }, + ]); + }); +}); diff --git a/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.ts b/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.ts index cfde73addf..0d49f609e4 100644 --- a/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.ts +++ b/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.ts @@ -9,7 +9,23 @@ function assertFileExists(filePath: string, label: string): void { } } -export interface SherpaOfflineRecognizerModel { +export type SherpaOfflineRecognizerModel = + | { + kind: "nemo_transducer"; + encoder: string; + decoder: string; + joiner: string; + tokens: string; + } + | { + kind: "sense_voice"; + model: string; + tokens: string; + language?: "auto"; + useInverseTextNormalization?: boolean; + }; + +interface NemoTransducerRecognizerModel { kind: "nemo_transducer"; encoder: string; decoder: string; @@ -17,6 +33,14 @@ export interface SherpaOfflineRecognizerModel { tokens: string; } +interface SenseVoiceRecognizerModel { + kind: "sense_voice"; + model: string; + tokens: string; + language?: "auto"; + useInverseTextNormalization?: boolean; +} + export interface SherpaOfflineRecognizerConfig { model: SherpaOfflineRecognizerModel; numThreads?: number; @@ -42,6 +66,75 @@ interface SherpaOfflineStreamNative { free?: () => void; } +function assertModelFilesExist(model: SherpaOfflineRecognizerModel): void { + if (model.kind === "nemo_transducer") { + assertFileExists(model.encoder, "offline encoder"); + assertFileExists(model.decoder, "offline decoder"); + assertFileExists(model.joiner, "offline joiner"); + assertFileExists(model.tokens, "tokens"); + return; + } + + assertFileExists(model.model, "SenseVoice model"); + assertFileExists(model.tokens, "tokens"); +} + +function buildNemoTransducerRecognizerConfig( + config: SherpaOfflineRecognizerConfig, + model: NemoTransducerRecognizerModel, +) { + return { + featConfig: { + sampleRate: config.sampleRate ?? 16000, + featureDim: config.featureDim ?? 80, + }, + modelConfig: { + transducer: { + encoder: model.encoder, + decoder: model.decoder, + joiner: model.joiner, + }, + tokens: model.tokens, + modelType: "nemo_transducer", + numThreads: config.numThreads ?? 1, + provider: config.provider ?? "cpu", + debug: config.debug ?? 0, + }, + decodingMethod: config.decodingMethod ?? "greedy_search", + maxActivePaths: config.maxActivePaths ?? 4, + }; +} + +function buildSenseVoiceRecognizerConfig( + config: SherpaOfflineRecognizerConfig, + model: SenseVoiceRecognizerModel, +) { + return { + featConfig: { + sampleRate: config.sampleRate ?? 16000, + featureDim: config.featureDim ?? 80, + }, + modelConfig: { + senseVoice: { + model: model.model, + language: model.language ?? "auto", + useInverseTextNormalization: model.useInverseTextNormalization === false ? 0 : 1, + }, + tokens: model.tokens, + numThreads: config.numThreads ?? 1, + provider: config.provider ?? "cpu", + debug: config.debug ?? 0, + }, + }; +} + +function buildRecognizerConfig(config: SherpaOfflineRecognizerConfig) { + if (config.model.kind === "nemo_transducer") { + return buildNemoTransducerRecognizerConfig(config, config.model); + } + return buildSenseVoiceRecognizerConfig(config, config.model); +} + export class SherpaOfflineRecognizerEngine { public readonly recognizer: SherpaOfflineRecognizerNative; public readonly sampleRate: number; @@ -54,33 +147,10 @@ export class SherpaOfflineRecognizerEngine { component: "offline-recognizer", }); - assertFileExists(config.model.encoder, "offline encoder"); - assertFileExists(config.model.decoder, "offline decoder"); - assertFileExists(config.model.joiner, "offline joiner"); - assertFileExists(config.model.tokens, "tokens"); + assertModelFilesExist(config.model); const sherpa = loadSherpaOnnxNode(); - - const recognizerConfig = { - featConfig: { - sampleRate: config.sampleRate ?? 16000, - featureDim: config.featureDim ?? 80, - }, - modelConfig: { - transducer: { - encoder: config.model.encoder, - decoder: config.model.decoder, - joiner: config.model.joiner, - }, - tokens: config.model.tokens, - modelType: "nemo_transducer", - numThreads: config.numThreads ?? 1, - provider: config.provider ?? "cpu", - debug: config.debug ?? 0, - }, - decodingMethod: config.decodingMethod ?? "greedy_search", - maxActivePaths: config.maxActivePaths ?? 4, - }; + const recognizerConfig = buildRecognizerConfig(config); this.recognizer = new ( sherpa as unknown as { @@ -94,7 +164,11 @@ export class SherpaOfflineRecognizerEngine { : recognizerConfig.featConfig.sampleRate; this.logger.info( - { sampleRate: this.sampleRate, numThreads: recognizerConfig.modelConfig.numThreads }, + { + sampleRate: this.sampleRate, + numThreads: recognizerConfig.modelConfig.numThreads, + modelKind: config.model.kind, + }, "Sherpa offline recognizer initialized", ); } diff --git a/packages/server/src/server/speech/providers/local/worker-process.ts b/packages/server/src/server/speech/providers/local/worker-process.ts index 957ee6aaa2..39c9ab93d1 100644 --- a/packages/server/src/server/speech/providers/local/worker-process.ts +++ b/packages/server/src/server/speech/providers/local/worker-process.ts @@ -2,8 +2,16 @@ import pino from "pino"; import type { StreamingTranscriptionSession } from "../../speech-provider.js"; import type { TurnDetectionSession } from "../../turn-detection-provider.js"; -import { getLocalSpeechModelDir, type LocalSttModelId, type LocalTtsModelId } from "./models.js"; -import { SherpaOfflineRecognizerEngine } from "./sherpa/sherpa-offline-recognizer.js"; +import { + getLocalSpeechModelDir, + getLocalSpeechModelSpec, + type LocalSttModelId, + type LocalTtsModelId, +} from "./models.js"; +import { + SherpaOfflineRecognizerEngine, + type SherpaOfflineRecognizerModel, +} from "./sherpa/sherpa-offline-recognizer.js"; import { SherpaOnnxParakeetSTT } from "./sherpa/sherpa-parakeet-stt.js"; import { SherpaParakeetRealtimeTranscriptionSession } from "./sherpa/sherpa-parakeet-realtime-session.js"; import { SherpaOnnxTTS } from "./sherpa/sherpa-tts.js"; @@ -72,6 +80,39 @@ function ttsKey(config: LocalSpeechWorkerConfig): string { ].join(":"); } +function localModelPath(modelDir: string, relPath: string): string { + return `${modelDir}/${relPath}`; +} + +function buildSttRecognizerModel( + modelDir: string, + modelId: LocalSttModelId, +): SherpaOfflineRecognizerModel { + const spec = getLocalSpeechModelSpec(modelId); + if (spec.kind !== "stt-offline") { + throw new Error(`Local model '${modelId}' is not an STT model`); + } + + const recognizer = spec.recognizer; + if (recognizer.kind === "nemo_transducer") { + return { + kind: "nemo_transducer", + encoder: localModelPath(modelDir, recognizer.encoder), + decoder: localModelPath(modelDir, recognizer.decoder), + joiner: localModelPath(modelDir, recognizer.joiner), + tokens: localModelPath(modelDir, recognizer.tokens), + }; + } + + return { + kind: "sense_voice", + model: localModelPath(modelDir, recognizer.model), + tokens: localModelPath(modelDir, recognizer.tokens), + language: recognizer.language, + useInverseTextNormalization: recognizer.useInverseTextNormalization, + }; +} + function getSttEngine( config: LocalSpeechWorkerConfig, model: "voice" | "dictation", @@ -85,13 +126,7 @@ function getSttEngine( const modelDir = getLocalSpeechModelDir(config.modelsDir, modelId); const created = new SherpaOfflineRecognizerEngine( { - model: { - kind: "nemo_transducer", - encoder: `${modelDir}/encoder.int8.onnx`, - decoder: `${modelDir}/decoder.int8.onnx`, - joiner: `${modelDir}/joiner.int8.onnx`, - tokens: `${modelDir}/tokens.txt`, - }, + model: buildSttRecognizerModel(modelDir, modelId), numThreads: 2, debug: 0, }, diff --git a/packages/server/src/server/speech/speech-config-resolver.test.ts b/packages/server/src/server/speech/speech-config-resolver.test.ts index 3d93b3673e..fa85e061ff 100644 --- a/packages/server/src/server/speech/speech-config-resolver.test.ts +++ b/packages/server/src/server/speech/speech-config-resolver.test.ts @@ -5,6 +5,8 @@ import { describe, expect, test } from "vitest"; import { PersistedConfigSchema } from "../persisted-config.js"; import { resolveSpeechConfig } from "./speech-config-resolver.js"; +const SENSE_VOICE_MODEL = "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09"; + describe("resolveSpeechConfig", () => { test("resolves local-first defaults without env overrides", () => { const paseoHome = "/tmp/paseo-home"; @@ -133,6 +135,35 @@ describe("resolveSpeechConfig", () => { expect(result.openai?.stt?.model).toBe("gpt-4o-transcribe"); }); + test("accepts SenseVoice as a local STT model for dictation and voice mode", () => { + const persisted = PersistedConfigSchema.parse({ + features: { + dictation: { + stt: { + provider: "local", + model: SENSE_VOICE_MODEL, + }, + }, + voiceMode: { + stt: { + provider: "local", + model: SENSE_VOICE_MODEL, + }, + }, + }, + }); + + const result = resolveSpeechConfig({ + paseoHome: "/tmp/paseo-home", + env: {} as NodeJS.ProcessEnv, + persisted, + }); + + expect(result.speech.local?.models.dictationStt).toBe(SENSE_VOICE_MODEL); + expect(result.speech.local?.models.voiceStt).toBe(SENSE_VOICE_MODEL); + expect(result.speech.local?.models.voiceTts).toBe("kokoro-en-v0_19"); + }); + test("resolves STT language from env, settings, and voice-to-dictation fallback", () => { const persisted = PersistedConfigSchema.parse({ features: { diff --git a/public-docs/voice.md b/public-docs/voice.md index bb6adbf1ba..b5c082f586 100644 --- a/public-docs/voice.md +++ b/public-docs/voice.md @@ -31,12 +31,15 @@ Missing models are downloaded at daemon startup into `$PASEO_HOME/models/local-s ### Local STT models and language support -| Model ID | Languages | -| --------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `parakeet-tdt-0.6b-v2-int8` | English only (default). Includes punctuation and capitalization. | -| `parakeet-tdt-0.6b-v3-int8` | 25 European languages, auto-detected: Bulgarian, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Greek, Hungarian, Italian, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian, Russian, Slovak, Slovenian, Spanish, Swedish, Ukrainian. | +| Model ID | Languages | +| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `parakeet-tdt-0.6b-v2-int8` | English only (default). Includes punctuation and capitalization. | +| `parakeet-tdt-0.6b-v3-int8` | 25 European languages, auto-detected: Bulgarian, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Greek, Hungarian, Italian, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian, Russian, Slovak, Slovenian, Spanish, Swedish, Ukrainian. | +| `sense-voice-zh-en-ja-ko-yue-int8-2025-09-09` | Chinese, English, Japanese, Korean, and Cantonese, auto-detected. | -**To use a non-English language, switch the local STT model to `parakeet-tdt-0.6b-v3-int8`.** v3 detects the spoken language automatically — there is no per-language setting for it. The `language` field below does **not** steer the local Parakeet model (v2 is English-only, v3 auto-detects); it only applies to the OpenAI STT provider. +For supported European languages, switch the local STT model to `parakeet-tdt-0.6b-v3-int8`. v3 detects the spoken language automatically — there is no per-language setting for it. The `language` field below does **not** steer the local Parakeet model (v2 is English-only, v3 auto-detects); it only applies to the OpenAI STT provider. + +For Chinese or Chinese/English mixed local STT, use `sense-voice-zh-en-ja-ko-yue-int8-2025-09-09`. ```json { @@ -72,7 +75,23 @@ For multilingual local dictation, set the model to v3 — it auto-detects the la } ``` -The `language` field applies only to the OpenAI STT provider: set `features.dictation.stt.language` for dictation and `features.voiceMode.stt.language` for realtime voice. If voice language is omitted, Paseo uses the dictation language before falling back to `en`. It has no effect on the local Parakeet models. +For Chinese/English mixed local dictation, set the model to SenseVoice: + +```json +{ + "version": 1, + "features": { + "dictation": { + "stt": { "provider": "local", "model": "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09" } + }, + "voiceMode": { + "stt": { "provider": "local", "model": "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09" } + } + } +} +``` + +The `language` field applies only to the OpenAI STT provider: set `features.dictation.stt.language` for dictation and `features.voiceMode.stt.language` for realtime voice. If voice language is omitted, Paseo uses the dictation language before falling back to `en`. It does not steer local Parakeet or SenseVoice models. ## OpenAI Speech Option From 60cdbc17f8a0831725d2a181da5c8a339e533781 Mon Sep 17 00:00:00 2001 From: waibiwaibi <141296631+waibiwaibig@users.noreply.github.com> Date: Sat, 20 Jun 2026 21:21:49 +0800 Subject: [PATCH 2/3] Use direct SenseVoice model mirrors for local downloads --- .../server/scripts/download-speech-models.ts | 16 ++++++ .../server/scripts/transcribe-local-wav.ts | 2 + .../providers/local/sherpa/model-catalog.ts | 20 +++++++ .../local/sherpa/model-downloader.test.ts | 36 ++++++++++++- .../local/sherpa/model-downloader.ts | 54 +++++++++++++++++++ public-docs/voice.md | 2 + 6 files changed, 128 insertions(+), 2 deletions(-) diff --git a/packages/server/scripts/download-speech-models.ts b/packages/server/scripts/download-speech-models.ts index b75086bb7d..8ba940453b 100644 --- a/packages/server/scripts/download-speech-models.ts +++ b/packages/server/scripts/download-speech-models.ts @@ -7,7 +7,23 @@ import { type LocalSpeechModelId, } from "../src/server/speech/providers/local/models.js"; +function usage(): string { + return [ + "Usage: npm run speech:download -- [--models-dir