From ef4e3cb3f370e668b039c7b07fdfd79b5d49616c Mon Sep 17 00:00:00 2001 From: waibiwaibi <141296631+waibiwaibig@users.noreply.github.com> Date: Sat, 20 Jun 2026 21:06:22 +0800 Subject: [PATCH 1/3] Add SenseVoice local STT model support --- .../server/speech/providers/local/models.ts | 5 + .../providers/local/sherpa/model-catalog.ts | 57 +++++++- .../local/sherpa/model-downloader.test.ts | 22 +++ .../sherpa/sherpa-offline-recognizer.test.ts | 89 ++++++++++++ .../local/sherpa/sherpa-offline-recognizer.ts | 128 ++++++++++++++---- .../speech/providers/local/worker-process.ts | 53 ++++++-- .../speech/speech-config-resolver.test.ts | 31 +++++ public-docs/voice.md | 31 ++++- 8 files changed, 373 insertions(+), 43 deletions(-) create mode 100644 packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.test.ts diff --git a/packages/server/src/server/speech/providers/local/models.ts b/packages/server/src/server/speech/providers/local/models.ts index fbbaaa8d6a..f4c94cfe3a 100644 --- a/packages/server/src/server/speech/providers/local/models.ts +++ b/packages/server/src/server/speech/providers/local/models.ts @@ -2,6 +2,7 @@ import { ensureSherpaOnnxModels, getSherpaOnnxModelDir } from "./sherpa/model-do import { DEFAULT_LOCAL_STT_MODEL, DEFAULT_LOCAL_TTS_MODEL, + getSherpaOnnxModelSpec, LocalSttModelIdSchema, LocalTtsModelIdSchema, listSherpaOnnxModels, @@ -30,6 +31,10 @@ export function getLocalSpeechModelDir(modelsDir: string, modelId: LocalSpeechMo return getSherpaOnnxModelDir(modelsDir, modelId); } +export function getLocalSpeechModelSpec(modelId: LocalSpeechModelId): LocalSpeechModelSpec { + return getSherpaOnnxModelSpec(modelId); +} + export async function ensureLocalSpeechModels(options: { modelsDir: string; modelIds: LocalSpeechModelId[]; diff --git a/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts b/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts index 06d838c49a..536a1fafe6 100644 --- a/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts +++ b/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts @@ -4,7 +4,23 @@ export type SherpaOnnxModelKind = "stt-offline" | "tts"; type DefaultModelRole = "stt" | "tts"; -interface SherpaOnnxCatalogEntry { +export type SherpaOfflineRecognizerModelSpec = + | { + kind: "nemo_transducer"; + encoder: string; + decoder: string; + joiner: string; + tokens: string; + } + | { + kind: "sense_voice"; + model: string; + tokens: string; + language: "auto"; + useInverseTextNormalization: boolean; + }; + +interface SherpaOnnxCatalogEntryBase { kind: SherpaOnnxModelKind; archiveUrl: string; extractedDir: string; @@ -13,6 +29,15 @@ interface SherpaOnnxCatalogEntry { defaultFor?: DefaultModelRole; } +type SherpaOnnxCatalogEntry = + | (SherpaOnnxCatalogEntryBase & { + kind: "stt-offline"; + recognizer: SherpaOfflineRecognizerModelSpec; + }) + | (SherpaOnnxCatalogEntryBase & { + kind: "tts"; + }); + export const SHERPA_ONNX_MODEL_CATALOG = { "parakeet-tdt-0.6b-v2-int8": { kind: "stt-offline", @@ -20,6 +45,13 @@ export const SHERPA_ONNX_MODEL_CATALOG = { "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2", extractedDir: "sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8", requiredFiles: ["encoder.int8.onnx", "decoder.int8.onnx", "joiner.int8.onnx", "tokens.txt"], + recognizer: { + kind: "nemo_transducer", + encoder: "encoder.int8.onnx", + decoder: "decoder.int8.onnx", + joiner: "joiner.int8.onnx", + tokens: "tokens.txt", + }, description: "NVIDIA Parakeet TDT v2 (offline NeMo transducer, English).", defaultFor: "stt", }, @@ -29,9 +61,32 @@ export const SHERPA_ONNX_MODEL_CATALOG = { "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8.tar.bz2", extractedDir: "sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8", requiredFiles: ["encoder.int8.onnx", "decoder.int8.onnx", "joiner.int8.onnx", "tokens.txt"], + recognizer: { + kind: "nemo_transducer", + encoder: "encoder.int8.onnx", + decoder: "decoder.int8.onnx", + joiner: "joiner.int8.onnx", + tokens: "tokens.txt", + }, description: "NVIDIA Parakeet TDT v3 (offline NeMo transducer, 25 European languages, auto-detected).", }, + "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09": { + kind: "stt-offline", + archiveUrl: + "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09.tar.bz2", + extractedDir: "sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09", + requiredFiles: ["model.int8.onnx", "tokens.txt"], + recognizer: { + kind: "sense_voice", + model: "model.int8.onnx", + tokens: "tokens.txt", + language: "auto", + useInverseTextNormalization: true, + }, + description: + "SenseVoice int8 (offline, Chinese/English/Japanese/Korean/Cantonese, auto-detected).", + }, "kokoro-en-v0_19": { kind: "tts", archiveUrl: diff --git a/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts b/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts index 993c959b4d..b8d6b4bbb8 100644 --- a/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts +++ b/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts @@ -6,6 +6,8 @@ import pino from "pino"; import { ensureSherpaOnnxModel, getSherpaOnnxModelDir } from "./model-downloader.js"; +const SENSE_VOICE_MODEL = "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09"; + function makeTmpDir(): string { return mkdtempSync(path.join(tmpdir(), "paseo-speech-models-")); } @@ -19,6 +21,9 @@ describe("sherpa model downloader", () => { "sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8", ); expect(getSherpaOnnxModelDir(modelsDir, "kokoro-en-v0_19")).toContain("kokoro-en-v0_19"); + expect(getSherpaOnnxModelDir(modelsDir, SENSE_VOICE_MODEL)).toContain( + "sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09", + ); }); test("ensureSherpaOnnxModel succeeds without downloading when files exist", async () => { @@ -38,4 +43,21 @@ describe("sherpa model downloader", () => { expect(out).toBe(modelDir); }); + + test("ensureSherpaOnnxModel accepts existing SenseVoice files without downloading", async () => { + const modelsDir = makeTmpDir(); + const modelDir = getSherpaOnnxModelDir(modelsDir, SENSE_VOICE_MODEL); + + mkdirSync(modelDir, { recursive: true }); + writeFileSync(path.join(modelDir, "model.int8.onnx"), "x"); + writeFileSync(path.join(modelDir, "tokens.txt"), "x"); + + const out = await ensureSherpaOnnxModel({ + modelsDir, + modelId: SENSE_VOICE_MODEL, + logger, + }); + + expect(out).toBe(modelDir); + }); }); diff --git a/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.test.ts b/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.test.ts new file mode 100644 index 0000000000..0680cc432b --- /dev/null +++ b/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.test.ts @@ -0,0 +1,89 @@ +import { mkdtempSync, writeFileSync } from "node:fs"; +import { tmpdir } from "node:os"; +import path from "node:path"; + +import pino from "pino"; +import { beforeEach, describe, expect, it, vi } from "vitest"; + +const mockState = vi.hoisted(() => ({ + offlineRecognizerConfigs: [] as unknown[], +})); + +vi.mock("./sherpa-onnx-node-loader.js", () => ({ + loadSherpaOnnxNode: () => ({ + OfflineRecognizer: class { + public readonly config: unknown; + + constructor(config: unknown) { + this.config = config; + mockState.offlineRecognizerConfigs.push(config); + } + + createStream() { + return {}; + } + + decode() {} + + getResult() { + return ""; + } + }, + }), +})); + +function makeTmpDir(): string { + return mkdtempSync(path.join(tmpdir(), "paseo-sherpa-recognizer-")); +} + +function touch(filePath: string): string { + writeFileSync(filePath, "x"); + return filePath; +} + +describe("SherpaOfflineRecognizerEngine", () => { + beforeEach(() => { + mockState.offlineRecognizerConfigs.length = 0; + }); + + it("initializes SenseVoice recognizers with sherpa-onnx senseVoice config", async () => { + const { SherpaOfflineRecognizerEngine } = await import("./sherpa-offline-recognizer.js"); + const dir = makeTmpDir(); + + const engine = new SherpaOfflineRecognizerEngine( + { + model: { + kind: "sense_voice", + model: touch(path.join(dir, "model.int8.onnx")), + tokens: touch(path.join(dir, "tokens.txt")), + language: "auto", + useInverseTextNormalization: true, + }, + numThreads: 2, + debug: 0, + }, + pino({ level: "silent" }), + ); + + expect(engine.sampleRate).toBe(16000); + expect(mockState.offlineRecognizerConfigs).toEqual([ + { + featConfig: { + sampleRate: 16000, + featureDim: 80, + }, + modelConfig: { + senseVoice: { + model: path.join(dir, "model.int8.onnx"), + language: "auto", + useInverseTextNormalization: 1, + }, + tokens: path.join(dir, "tokens.txt"), + numThreads: 2, + provider: "cpu", + debug: 0, + }, + }, + ]); + }); +}); diff --git a/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.ts b/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.ts index cfde73addf..0d49f609e4 100644 --- a/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.ts +++ b/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.ts @@ -9,7 +9,23 @@ function assertFileExists(filePath: string, label: string): void { } } -export interface SherpaOfflineRecognizerModel { +export type SherpaOfflineRecognizerModel = + | { + kind: "nemo_transducer"; + encoder: string; + decoder: string; + joiner: string; + tokens: string; + } + | { + kind: "sense_voice"; + model: string; + tokens: string; + language?: "auto"; + useInverseTextNormalization?: boolean; + }; + +interface NemoTransducerRecognizerModel { kind: "nemo_transducer"; encoder: string; decoder: string; @@ -17,6 +33,14 @@ export interface SherpaOfflineRecognizerModel { tokens: string; } +interface SenseVoiceRecognizerModel { + kind: "sense_voice"; + model: string; + tokens: string; + language?: "auto"; + useInverseTextNormalization?: boolean; +} + export interface SherpaOfflineRecognizerConfig { model: SherpaOfflineRecognizerModel; numThreads?: number; @@ -42,6 +66,75 @@ interface SherpaOfflineStreamNative { free?: () => void; } +function assertModelFilesExist(model: SherpaOfflineRecognizerModel): void { + if (model.kind === "nemo_transducer") { + assertFileExists(model.encoder, "offline encoder"); + assertFileExists(model.decoder, "offline decoder"); + assertFileExists(model.joiner, "offline joiner"); + assertFileExists(model.tokens, "tokens"); + return; + } + + assertFileExists(model.model, "SenseVoice model"); + assertFileExists(model.tokens, "tokens"); +} + +function buildNemoTransducerRecognizerConfig( + config: SherpaOfflineRecognizerConfig, + model: NemoTransducerRecognizerModel, +) { + return { + featConfig: { + sampleRate: config.sampleRate ?? 16000, + featureDim: config.featureDim ?? 80, + }, + modelConfig: { + transducer: { + encoder: model.encoder, + decoder: model.decoder, + joiner: model.joiner, + }, + tokens: model.tokens, + modelType: "nemo_transducer", + numThreads: config.numThreads ?? 1, + provider: config.provider ?? "cpu", + debug: config.debug ?? 0, + }, + decodingMethod: config.decodingMethod ?? "greedy_search", + maxActivePaths: config.maxActivePaths ?? 4, + }; +} + +function buildSenseVoiceRecognizerConfig( + config: SherpaOfflineRecognizerConfig, + model: SenseVoiceRecognizerModel, +) { + return { + featConfig: { + sampleRate: config.sampleRate ?? 16000, + featureDim: config.featureDim ?? 80, + }, + modelConfig: { + senseVoice: { + model: model.model, + language: model.language ?? "auto", + useInverseTextNormalization: model.useInverseTextNormalization === false ? 0 : 1, + }, + tokens: model.tokens, + numThreads: config.numThreads ?? 1, + provider: config.provider ?? "cpu", + debug: config.debug ?? 0, + }, + }; +} + +function buildRecognizerConfig(config: SherpaOfflineRecognizerConfig) { + if (config.model.kind === "nemo_transducer") { + return buildNemoTransducerRecognizerConfig(config, config.model); + } + return buildSenseVoiceRecognizerConfig(config, config.model); +} + export class SherpaOfflineRecognizerEngine { public readonly recognizer: SherpaOfflineRecognizerNative; public readonly sampleRate: number; @@ -54,33 +147,10 @@ export class SherpaOfflineRecognizerEngine { component: "offline-recognizer", }); - assertFileExists(config.model.encoder, "offline encoder"); - assertFileExists(config.model.decoder, "offline decoder"); - assertFileExists(config.model.joiner, "offline joiner"); - assertFileExists(config.model.tokens, "tokens"); + assertModelFilesExist(config.model); const sherpa = loadSherpaOnnxNode(); - - const recognizerConfig = { - featConfig: { - sampleRate: config.sampleRate ?? 16000, - featureDim: config.featureDim ?? 80, - }, - modelConfig: { - transducer: { - encoder: config.model.encoder, - decoder: config.model.decoder, - joiner: config.model.joiner, - }, - tokens: config.model.tokens, - modelType: "nemo_transducer", - numThreads: config.numThreads ?? 1, - provider: config.provider ?? "cpu", - debug: config.debug ?? 0, - }, - decodingMethod: config.decodingMethod ?? "greedy_search", - maxActivePaths: config.maxActivePaths ?? 4, - }; + const recognizerConfig = buildRecognizerConfig(config); this.recognizer = new ( sherpa as unknown as { @@ -94,7 +164,11 @@ export class SherpaOfflineRecognizerEngine { : recognizerConfig.featConfig.sampleRate; this.logger.info( - { sampleRate: this.sampleRate, numThreads: recognizerConfig.modelConfig.numThreads }, + { + sampleRate: this.sampleRate, + numThreads: recognizerConfig.modelConfig.numThreads, + modelKind: config.model.kind, + }, "Sherpa offline recognizer initialized", ); } diff --git a/packages/server/src/server/speech/providers/local/worker-process.ts b/packages/server/src/server/speech/providers/local/worker-process.ts index 957ee6aaa2..39c9ab93d1 100644 --- a/packages/server/src/server/speech/providers/local/worker-process.ts +++ b/packages/server/src/server/speech/providers/local/worker-process.ts @@ -2,8 +2,16 @@ import pino from "pino"; import type { StreamingTranscriptionSession } from "../../speech-provider.js"; import type { TurnDetectionSession } from "../../turn-detection-provider.js"; -import { getLocalSpeechModelDir, type LocalSttModelId, type LocalTtsModelId } from "./models.js"; -import { SherpaOfflineRecognizerEngine } from "./sherpa/sherpa-offline-recognizer.js"; +import { + getLocalSpeechModelDir, + getLocalSpeechModelSpec, + type LocalSttModelId, + type LocalTtsModelId, +} from "./models.js"; +import { + SherpaOfflineRecognizerEngine, + type SherpaOfflineRecognizerModel, +} from "./sherpa/sherpa-offline-recognizer.js"; import { SherpaOnnxParakeetSTT } from "./sherpa/sherpa-parakeet-stt.js"; import { SherpaParakeetRealtimeTranscriptionSession } from "./sherpa/sherpa-parakeet-realtime-session.js"; import { SherpaOnnxTTS } from "./sherpa/sherpa-tts.js"; @@ -72,6 +80,39 @@ function ttsKey(config: LocalSpeechWorkerConfig): string { ].join(":"); } +function localModelPath(modelDir: string, relPath: string): string { + return `${modelDir}/${relPath}`; +} + +function buildSttRecognizerModel( + modelDir: string, + modelId: LocalSttModelId, +): SherpaOfflineRecognizerModel { + const spec = getLocalSpeechModelSpec(modelId); + if (spec.kind !== "stt-offline") { + throw new Error(`Local model '${modelId}' is not an STT model`); + } + + const recognizer = spec.recognizer; + if (recognizer.kind === "nemo_transducer") { + return { + kind: "nemo_transducer", + encoder: localModelPath(modelDir, recognizer.encoder), + decoder: localModelPath(modelDir, recognizer.decoder), + joiner: localModelPath(modelDir, recognizer.joiner), + tokens: localModelPath(modelDir, recognizer.tokens), + }; + } + + return { + kind: "sense_voice", + model: localModelPath(modelDir, recognizer.model), + tokens: localModelPath(modelDir, recognizer.tokens), + language: recognizer.language, + useInverseTextNormalization: recognizer.useInverseTextNormalization, + }; +} + function getSttEngine( config: LocalSpeechWorkerConfig, model: "voice" | "dictation", @@ -85,13 +126,7 @@ function getSttEngine( const modelDir = getLocalSpeechModelDir(config.modelsDir, modelId); const created = new SherpaOfflineRecognizerEngine( { - model: { - kind: "nemo_transducer", - encoder: `${modelDir}/encoder.int8.onnx`, - decoder: `${modelDir}/decoder.int8.onnx`, - joiner: `${modelDir}/joiner.int8.onnx`, - tokens: `${modelDir}/tokens.txt`, - }, + model: buildSttRecognizerModel(modelDir, modelId), numThreads: 2, debug: 0, }, diff --git a/packages/server/src/server/speech/speech-config-resolver.test.ts b/packages/server/src/server/speech/speech-config-resolver.test.ts index 3d93b3673e..fa85e061ff 100644 --- a/packages/server/src/server/speech/speech-config-resolver.test.ts +++ b/packages/server/src/server/speech/speech-config-resolver.test.ts @@ -5,6 +5,8 @@ import { describe, expect, test } from "vitest"; import { PersistedConfigSchema } from "../persisted-config.js"; import { resolveSpeechConfig } from "./speech-config-resolver.js"; +const SENSE_VOICE_MODEL = "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09"; + describe("resolveSpeechConfig", () => { test("resolves local-first defaults without env overrides", () => { const paseoHome = "/tmp/paseo-home"; @@ -133,6 +135,35 @@ describe("resolveSpeechConfig", () => { expect(result.openai?.stt?.model).toBe("gpt-4o-transcribe"); }); + test("accepts SenseVoice as a local STT model for dictation and voice mode", () => { + const persisted = PersistedConfigSchema.parse({ + features: { + dictation: { + stt: { + provider: "local", + model: SENSE_VOICE_MODEL, + }, + }, + voiceMode: { + stt: { + provider: "local", + model: SENSE_VOICE_MODEL, + }, + }, + }, + }); + + const result = resolveSpeechConfig({ + paseoHome: "/tmp/paseo-home", + env: {} as NodeJS.ProcessEnv, + persisted, + }); + + expect(result.speech.local?.models.dictationStt).toBe(SENSE_VOICE_MODEL); + expect(result.speech.local?.models.voiceStt).toBe(SENSE_VOICE_MODEL); + expect(result.speech.local?.models.voiceTts).toBe("kokoro-en-v0_19"); + }); + test("resolves STT language from env, settings, and voice-to-dictation fallback", () => { const persisted = PersistedConfigSchema.parse({ features: { diff --git a/public-docs/voice.md b/public-docs/voice.md index bb6adbf1ba..b5c082f586 100644 --- a/public-docs/voice.md +++ b/public-docs/voice.md @@ -31,12 +31,15 @@ Missing models are downloaded at daemon startup into `$PASEO_HOME/models/local-s ### Local STT models and language support -| Model ID | Languages | -| --------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| `parakeet-tdt-0.6b-v2-int8` | English only (default). Includes punctuation and capitalization. | -| `parakeet-tdt-0.6b-v3-int8` | 25 European languages, auto-detected: Bulgarian, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Greek, Hungarian, Italian, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian, Russian, Slovak, Slovenian, Spanish, Swedish, Ukrainian. | +| Model ID | Languages | +| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `parakeet-tdt-0.6b-v2-int8` | English only (default). Includes punctuation and capitalization. | +| `parakeet-tdt-0.6b-v3-int8` | 25 European languages, auto-detected: Bulgarian, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Greek, Hungarian, Italian, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian, Russian, Slovak, Slovenian, Spanish, Swedish, Ukrainian. | +| `sense-voice-zh-en-ja-ko-yue-int8-2025-09-09` | Chinese, English, Japanese, Korean, and Cantonese, auto-detected. | -**To use a non-English language, switch the local STT model to `parakeet-tdt-0.6b-v3-int8`.** v3 detects the spoken language automatically — there is no per-language setting for it. The `language` field below does **not** steer the local Parakeet model (v2 is English-only, v3 auto-detects); it only applies to the OpenAI STT provider. +For supported European languages, switch the local STT model to `parakeet-tdt-0.6b-v3-int8`. v3 detects the spoken language automatically — there is no per-language setting for it. The `language` field below does **not** steer the local Parakeet model (v2 is English-only, v3 auto-detects); it only applies to the OpenAI STT provider. + +For Chinese or Chinese/English mixed local STT, use `sense-voice-zh-en-ja-ko-yue-int8-2025-09-09`. ```json { @@ -72,7 +75,23 @@ For multilingual local dictation, set the model to v3 — it auto-detects the la } ``` -The `language` field applies only to the OpenAI STT provider: set `features.dictation.stt.language` for dictation and `features.voiceMode.stt.language` for realtime voice. If voice language is omitted, Paseo uses the dictation language before falling back to `en`. It has no effect on the local Parakeet models. +For Chinese/English mixed local dictation, set the model to SenseVoice: + +```json +{ + "version": 1, + "features": { + "dictation": { + "stt": { "provider": "local", "model": "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09" } + }, + "voiceMode": { + "stt": { "provider": "local", "model": "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09" } + } + } +} +``` + +The `language` field applies only to the OpenAI STT provider: set `features.dictation.stt.language` for dictation and `features.voiceMode.stt.language` for realtime voice. If voice language is omitted, Paseo uses the dictation language before falling back to `en`. It does not steer local Parakeet or SenseVoice models. ## OpenAI Speech Option From 60cdbc17f8a0831725d2a181da5c8a339e533781 Mon Sep 17 00:00:00 2001 From: waibiwaibi <141296631+waibiwaibig@users.noreply.github.com> Date: Sat, 20 Jun 2026 21:21:49 +0800 Subject: [PATCH 2/3] Use direct SenseVoice model mirrors for local downloads --- .../server/scripts/download-speech-models.ts | 16 ++++++ .../server/scripts/transcribe-local-wav.ts | 2 + .../providers/local/sherpa/model-catalog.ts | 20 +++++++ .../local/sherpa/model-downloader.test.ts | 36 ++++++++++++- .../local/sherpa/model-downloader.ts | 54 +++++++++++++++++++ public-docs/voice.md | 2 + 6 files changed, 128 insertions(+), 2 deletions(-) diff --git a/packages/server/scripts/download-speech-models.ts b/packages/server/scripts/download-speech-models.ts index b75086bb7d..8ba940453b 100644 --- a/packages/server/scripts/download-speech-models.ts +++ b/packages/server/scripts/download-speech-models.ts @@ -7,7 +7,23 @@ import { type LocalSpeechModelId, } from "../src/server/speech/providers/local/models.js"; +function usage(): string { + return [ + "Usage: npm run speech:download -- [--models-dir ] [--model ]", + "", + "Examples:", + " npm run speech:download -- --model parakeet-tdt-0.6b-v2-int8", + " npm run speech:download -- --model sense-voice-zh-en-ja-ko-yue-int8-2025-09-09", + " npm run speech:download -- --models-dir /tmp/paseo-speech --model sense-voice-zh-en-ja-ko-yue-int8-2025-09-09", + ].join("\n"); +} + function parseArgs(argv: string[]): { modelsDir: string; modelIds: LocalSpeechModelId[] } { + if (argv.includes("--help") || argv.includes("-h")) { + process.stdout.write(`${usage()}\n`); + process.exit(0); + } + const home = resolvePaseoHome(); let modelsDir = process.env.PASEO_LOCAL_MODELS_DIR || `${home}/models/local-speech`; const modelIds: LocalSpeechModelId[] = []; diff --git a/packages/server/scripts/transcribe-local-wav.ts b/packages/server/scripts/transcribe-local-wav.ts index 0b323df029..cbe2debf9f 100644 --- a/packages/server/scripts/transcribe-local-wav.ts +++ b/packages/server/scripts/transcribe-local-wav.ts @@ -120,6 +120,8 @@ async function main(): Promise { const providers: RequestedSpeechProviders = { dictationStt: { provider: "local", explicit: true }, + // Not used for single-file transcription. + voiceTurnDetection: { provider: "local", explicit: false, enabled: false }, voiceStt: { provider: "local", explicit: true }, // Not used here, but required by the shared runtime config shape. voiceTts: { provider: "openai", explicit: false }, diff --git a/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts b/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts index 536a1fafe6..0b9fc5b071 100644 --- a/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts +++ b/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts @@ -25,6 +25,10 @@ interface SherpaOnnxCatalogEntryBase { archiveUrl: string; extractedDir: string; requiredFiles: string[]; + directFiles?: Array<{ + path: string; + urls: string[]; + }>; description: string; defaultFor?: DefaultModelRole; } @@ -77,6 +81,22 @@ export const SHERPA_ONNX_MODEL_CATALOG = { "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09.tar.bz2", extractedDir: "sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09", requiredFiles: ["model.int8.onnx", "tokens.txt"], + directFiles: [ + { + path: "model.int8.onnx", + urls: [ + "https://hf-mirror.com/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/model.int8.onnx?download=true", + "https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/model.int8.onnx?download=true", + ], + }, + { + path: "tokens.txt", + urls: [ + "https://hf-mirror.com/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/tokens.txt?download=true", + "https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/tokens.txt?download=true", + ], + }, + ], recognizer: { kind: "sense_voice", model: "model.int8.onnx", diff --git a/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts b/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts index b8d6b4bbb8..7676d2a6ef 100644 --- a/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts +++ b/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts @@ -1,5 +1,5 @@ -import { describe, expect, test } from "vitest"; -import { mkdtempSync, mkdirSync, writeFileSync } from "node:fs"; +import { afterEach, describe, expect, test, vi } from "vitest"; +import { existsSync, mkdtempSync, mkdirSync, readFileSync, writeFileSync } from "node:fs"; import { tmpdir } from "node:os"; import path from "node:path"; import pino from "pino"; @@ -15,6 +15,10 @@ function makeTmpDir(): string { const logger = pino({ level: "silent" }); describe("sherpa model downloader", () => { + afterEach(() => { + vi.unstubAllGlobals(); + }); + test("getSherpaOnnxModelDir maps modelId to extractedDir", () => { const modelsDir = "/tmp/models"; expect(getSherpaOnnxModelDir(modelsDir, "parakeet-tdt-0.6b-v2-int8")).toContain( @@ -60,4 +64,32 @@ describe("sherpa model downloader", () => { expect(out).toBe(modelDir); }); + + test("ensureSherpaOnnxModel downloads SenseVoice direct files before archive fallback", async () => { + const fetch = vi + .fn() + .mockResolvedValueOnce(new Response("model-bytes")) + .mockResolvedValueOnce(new Response("tokens-bytes")); + vi.stubGlobal("fetch", fetch); + + const modelsDir = makeTmpDir(); + const modelDir = getSherpaOnnxModelDir(modelsDir, SENSE_VOICE_MODEL); + + const out = await ensureSherpaOnnxModel({ + modelsDir, + modelId: SENSE_VOICE_MODEL, + logger, + }); + + expect(out).toBe(modelDir); + expect(existsSync(path.join(modelsDir, ".downloads"))).toBe(false); + expect(readFileSync(path.join(modelDir, "model.int8.onnx"), "utf8")).toBe("model-bytes"); + expect(readFileSync(path.join(modelDir, "tokens.txt"), "utf8")).toBe("tokens-bytes"); + expect(fetch).toHaveBeenCalledWith( + "https://hf-mirror.com/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/model.int8.onnx?download=true", + ); + expect(fetch).toHaveBeenCalledWith( + "https://hf-mirror.com/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/tokens.txt?download=true", + ); + }); }); diff --git a/packages/server/src/server/speech/providers/local/sherpa/model-downloader.ts b/packages/server/src/server/speech/providers/local/sherpa/model-downloader.ts index 393237d37c..54aa13e4de 100644 --- a/packages/server/src/server/speech/providers/local/sherpa/model-downloader.ts +++ b/packages/server/src/server/speech/providers/local/sherpa/model-downloader.ts @@ -83,6 +83,40 @@ async function extractTarArchive(archivePath: string, destDir: string): Promise< }); } +async function downloadDirectFiles(options: { + modelDir: string; + directFiles: Array<{ path: string; urls: string[] }>; + logger: pino.Logger; +}): Promise { + const { modelDir, directFiles, logger } = options; + for (const file of directFiles) { + const outputPath = path.join(modelDir, file.path); + if (await isNonEmptyFile(outputPath)) { + continue; + } + + let lastError: unknown = null; + for (const url of file.urls) { + try { + logger.info({ url, outputPath }, "Downloading model file"); + await downloadToFile({ + url, + outputPath, + }); + lastError = null; + break; + } catch (error) { + lastError = error; + logger.warn({ err: error, url, outputPath }, "Model file download failed"); + } + } + + if (lastError) { + throw lastError; + } + } +} + async function isNonEmptyFile(filePath: string): Promise { try { const s = await stat(filePath); @@ -111,6 +145,26 @@ export async function ensureSherpaOnnxModel( logger.info({ modelsDir: options.modelsDir }, "Starting model download"); try { + if (spec.directFiles) { + try { + await downloadDirectFiles({ + modelDir, + directFiles: spec.directFiles, + logger, + }); + if (await hasRequiredFiles(modelDir, spec.requiredFiles)) { + logger.info({ modelDir }, "Model direct file download completed"); + return modelDir; + } + logger.warn( + { modelDir, requiredFiles: spec.requiredFiles }, + "Downloaded direct model files, but required files are still missing", + ); + } catch (error) { + logger.warn({ err: error }, "Direct model file download failed; falling back to archive"); + } + } + const downloadsDir = path.join(options.modelsDir, ".downloads"); const archiveFilename = path.basename(new URL(spec.archiveUrl).pathname); const archivePath = path.join(downloadsDir, archiveFilename); diff --git a/public-docs/voice.md b/public-docs/voice.md index b5c082f586..63bdf95418 100644 --- a/public-docs/voice.md +++ b/public-docs/voice.md @@ -41,6 +41,8 @@ For supported European languages, switch the local STT model to `parakeet-tdt-0. For Chinese or Chinese/English mixed local STT, use `sense-voice-zh-en-ja-ko-yue-int8-2025-09-09`. +Paseo downloads SenseVoice from Hugging Face mirror direct files before falling back to the GitHub release archive. This avoids relying only on GitHub release assets for large local speech model setup. + ```json { "version": 1, From 88de37e2a9db05135680175324b826950ea45453 Mon Sep 17 00:00:00 2001 From: waibiwaibi <141296631+waibiwaibig@users.noreply.github.com> Date: Sat, 20 Jun 2026 22:25:00 +0800 Subject: [PATCH 3/3] Use path.join for local speech model paths --- .../src/server/speech/providers/local/worker-process.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/server/src/server/speech/providers/local/worker-process.ts b/packages/server/src/server/speech/providers/local/worker-process.ts index 39c9ab93d1..6fa48ca3e5 100644 --- a/packages/server/src/server/speech/providers/local/worker-process.ts +++ b/packages/server/src/server/speech/providers/local/worker-process.ts @@ -1,3 +1,5 @@ +import path from "node:path"; + import pino from "pino"; import type { StreamingTranscriptionSession } from "../../speech-provider.js"; @@ -81,7 +83,7 @@ function ttsKey(config: LocalSpeechWorkerConfig): string { } function localModelPath(modelDir: string, relPath: string): string { - return `${modelDir}/${relPath}`; + return path.join(modelDir, relPath); } function buildSttRecognizerModel(