diff --git a/packages/server/scripts/download-speech-models.ts b/packages/server/scripts/download-speech-models.ts
index b75086bb7d..8ba940453b 100644
--- a/packages/server/scripts/download-speech-models.ts
+++ b/packages/server/scripts/download-speech-models.ts
@@ -7,7 +7,23 @@ import {
type LocalSpeechModelId,
} from "../src/server/speech/providers/local/models.js";
+function usage(): string {
+ return [
+ "Usage: npm run speech:download -- [--models-dir <dir>] [--model <model-id>]",
+ "",
+ "Examples:",
+ " npm run speech:download -- --model parakeet-tdt-0.6b-v2-int8",
+ " npm run speech:download -- --model sense-voice-zh-en-ja-ko-yue-int8-2025-09-09",
+ " npm run speech:download -- --models-dir /tmp/paseo-speech --model sense-voice-zh-en-ja-ko-yue-int8-2025-09-09",
+ ].join("\n");
+}
+
function parseArgs(argv: string[]): { modelsDir: string; modelIds: LocalSpeechModelId[] } {
+ if (argv.includes("--help") || argv.includes("-h")) {
+ process.stdout.write(`${usage()}\n`);
+ process.exit(0);
+ }
+
const home = resolvePaseoHome();
let modelsDir = process.env.PASEO_LOCAL_MODELS_DIR || `${home}/models/local-speech`;
const modelIds: LocalSpeechModelId[] = [];
diff --git a/packages/server/scripts/transcribe-local-wav.ts b/packages/server/scripts/transcribe-local-wav.ts
index 0b323df029..cbe2debf9f 100644
--- a/packages/server/scripts/transcribe-local-wav.ts
+++ b/packages/server/scripts/transcribe-local-wav.ts
@@ -120,6 +120,8 @@ async function main(): Promise<void> {
const providers: RequestedSpeechProviders = {
dictationStt: { provider: "local", explicit: true },
+ // Not used for single-file transcription.
+ voiceTurnDetection: { provider: "local", explicit: false, enabled: false },
voiceStt: { provider: "local", explicit: true },
// Not used here, but required by the shared runtime config shape.
voiceTts: { provider: "openai", explicit: false },
diff --git a/packages/server/src/server/speech/providers/local/models.ts b/packages/server/src/server/speech/providers/local/models.ts
index fbbaaa8d6a..f4c94cfe3a 100644
--- a/packages/server/src/server/speech/providers/local/models.ts
+++ b/packages/server/src/server/speech/providers/local/models.ts
@@ -2,6 +2,7 @@ import { ensureSherpaOnnxModels, getSherpaOnnxModelDir } from "./sherpa/model-do
import {
DEFAULT_LOCAL_STT_MODEL,
DEFAULT_LOCAL_TTS_MODEL,
+ getSherpaOnnxModelSpec,
LocalSttModelIdSchema,
LocalTtsModelIdSchema,
listSherpaOnnxModels,
@@ -30,6 +31,10 @@ export function getLocalSpeechModelDir(modelsDir: string, modelId: LocalSpeechMo
return getSherpaOnnxModelDir(modelsDir, modelId);
}
+export function getLocalSpeechModelSpec(modelId: LocalSpeechModelId): LocalSpeechModelSpec {
+ return getSherpaOnnxModelSpec(modelId);
+}
+
export async function ensureLocalSpeechModels(options: {
modelsDir: string;
modelIds: LocalSpeechModelId[];
diff --git a/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts b/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts
index 06d838c49a..0b9fc5b071 100644
--- a/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts
+++ b/packages/server/src/server/speech/providers/local/sherpa/model-catalog.ts
@@ -4,15 +4,44 @@ export type SherpaOnnxModelKind = "stt-offline" | "tts";
type DefaultModelRole = "stt" | "tts";
-interface SherpaOnnxCatalogEntry {
+export type SherpaOfflineRecognizerModelSpec =
+ | {
+ kind: "nemo_transducer";
+ encoder: string;
+ decoder: string;
+ joiner: string;
+ tokens: string;
+ }
+ | {
+ kind: "sense_voice";
+ model: string;
+ tokens: string;
+ language: "auto";
+ useInverseTextNormalization: boolean;
+ };
+
+interface SherpaOnnxCatalogEntryBase {
kind: SherpaOnnxModelKind;
archiveUrl: string;
extractedDir: string;
requiredFiles: string[];
+ directFiles?: Array<{
+ path: string;
+ urls: string[];
+ }>;
description: string;
defaultFor?: DefaultModelRole;
}
+type SherpaOnnxCatalogEntry =
+ | (SherpaOnnxCatalogEntryBase & {
+ kind: "stt-offline";
+ recognizer: SherpaOfflineRecognizerModelSpec;
+ })
+ | (SherpaOnnxCatalogEntryBase & {
+ kind: "tts";
+ });
+
export const SHERPA_ONNX_MODEL_CATALOG = {
"parakeet-tdt-0.6b-v2-int8": {
kind: "stt-offline",
@@ -20,6 +49,13 @@ export const SHERPA_ONNX_MODEL_CATALOG = {
"https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2",
extractedDir: "sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8",
requiredFiles: ["encoder.int8.onnx", "decoder.int8.onnx", "joiner.int8.onnx", "tokens.txt"],
+ recognizer: {
+ kind: "nemo_transducer",
+ encoder: "encoder.int8.onnx",
+ decoder: "decoder.int8.onnx",
+ joiner: "joiner.int8.onnx",
+ tokens: "tokens.txt",
+ },
description: "NVIDIA Parakeet TDT v2 (offline NeMo transducer, English).",
defaultFor: "stt",
},
@@ -29,9 +65,48 @@ export const SHERPA_ONNX_MODEL_CATALOG = {
"https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8.tar.bz2",
extractedDir: "sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8",
requiredFiles: ["encoder.int8.onnx", "decoder.int8.onnx", "joiner.int8.onnx", "tokens.txt"],
+ recognizer: {
+ kind: "nemo_transducer",
+ encoder: "encoder.int8.onnx",
+ decoder: "decoder.int8.onnx",
+ joiner: "joiner.int8.onnx",
+ tokens: "tokens.txt",
+ },
description:
"NVIDIA Parakeet TDT v3 (offline NeMo transducer, 25 European languages, auto-detected).",
},
+ "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09": {
+ kind: "stt-offline",
+ archiveUrl:
+ "https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09.tar.bz2",
+ extractedDir: "sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09",
+ requiredFiles: ["model.int8.onnx", "tokens.txt"],
+ directFiles: [
+ {
+ path: "model.int8.onnx",
+ urls: [
+ "https://hf-mirror.com/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/model.int8.onnx?download=true",
+ "https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/model.int8.onnx?download=true",
+ ],
+ },
+ {
+ path: "tokens.txt",
+ urls: [
+ "https://hf-mirror.com/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/tokens.txt?download=true",
+ "https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/tokens.txt?download=true",
+ ],
+ },
+ ],
+ recognizer: {
+ kind: "sense_voice",
+ model: "model.int8.onnx",
+ tokens: "tokens.txt",
+ language: "auto",
+ useInverseTextNormalization: true,
+ },
+ description:
+ "SenseVoice int8 (offline, Chinese/English/Japanese/Korean/Cantonese, auto-detected).",
+ },
"kokoro-en-v0_19": {
kind: "tts",
archiveUrl:
diff --git a/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts b/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts
index 993c959b4d..7676d2a6ef 100644
--- a/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts
+++ b/packages/server/src/server/speech/providers/local/sherpa/model-downloader.test.ts
@@ -1,11 +1,13 @@
-import { describe, expect, test } from "vitest";
-import { mkdtempSync, mkdirSync, writeFileSync } from "node:fs";
+import { afterEach, describe, expect, test, vi } from "vitest";
+import { existsSync, mkdtempSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import path from "node:path";
import pino from "pino";
import { ensureSherpaOnnxModel, getSherpaOnnxModelDir } from "./model-downloader.js";
+const SENSE_VOICE_MODEL = "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09";
+
function makeTmpDir(): string {
return mkdtempSync(path.join(tmpdir(), "paseo-speech-models-"));
}
@@ -13,12 +15,19 @@ function makeTmpDir(): string {
const logger = pino({ level: "silent" });
describe("sherpa model downloader", () => {
+ afterEach(() => {
+ vi.unstubAllGlobals();
+ });
+
test("getSherpaOnnxModelDir maps modelId to extractedDir", () => {
const modelsDir = "/tmp/models";
expect(getSherpaOnnxModelDir(modelsDir, "parakeet-tdt-0.6b-v2-int8")).toContain(
"sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8",
);
expect(getSherpaOnnxModelDir(modelsDir, "kokoro-en-v0_19")).toContain("kokoro-en-v0_19");
+ expect(getSherpaOnnxModelDir(modelsDir, SENSE_VOICE_MODEL)).toContain(
+ "sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09",
+ );
});
test("ensureSherpaOnnxModel succeeds without downloading when files exist", async () => {
@@ -38,4 +47,49 @@ describe("sherpa model downloader", () => {
expect(out).toBe(modelDir);
});
+
+ test("ensureSherpaOnnxModel accepts existing SenseVoice files without downloading", async () => {
+ const modelsDir = makeTmpDir();
+ const modelDir = getSherpaOnnxModelDir(modelsDir, SENSE_VOICE_MODEL);
+
+ mkdirSync(modelDir, { recursive: true });
+ writeFileSync(path.join(modelDir, "model.int8.onnx"), "x");
+ writeFileSync(path.join(modelDir, "tokens.txt"), "x");
+
+ const out = await ensureSherpaOnnxModel({
+ modelsDir,
+ modelId: SENSE_VOICE_MODEL,
+ logger,
+ });
+
+ expect(out).toBe(modelDir);
+ });
+
+ test("ensureSherpaOnnxModel downloads SenseVoice direct files before archive fallback", async () => {
+ const fetch = vi
+ .fn()
+ .mockResolvedValueOnce(new Response("model-bytes"))
+ .mockResolvedValueOnce(new Response("tokens-bytes"));
+ vi.stubGlobal("fetch", fetch);
+
+ const modelsDir = makeTmpDir();
+ const modelDir = getSherpaOnnxModelDir(modelsDir, SENSE_VOICE_MODEL);
+
+ const out = await ensureSherpaOnnxModel({
+ modelsDir,
+ modelId: SENSE_VOICE_MODEL,
+ logger,
+ });
+
+ expect(out).toBe(modelDir);
+ expect(existsSync(path.join(modelsDir, ".downloads"))).toBe(false);
+ expect(readFileSync(path.join(modelDir, "model.int8.onnx"), "utf8")).toBe("model-bytes");
+ expect(readFileSync(path.join(modelDir, "tokens.txt"), "utf8")).toBe("tokens-bytes");
+ expect(fetch).toHaveBeenCalledWith(
+ "https://hf-mirror.com/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/model.int8.onnx?download=true",
+ );
+ expect(fetch).toHaveBeenCalledWith(
+ "https://hf-mirror.com/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/tokens.txt?download=true",
+ );
+ });
});
diff --git a/packages/server/src/server/speech/providers/local/sherpa/model-downloader.ts b/packages/server/src/server/speech/providers/local/sherpa/model-downloader.ts
index 393237d37c..54aa13e4de 100644
--- a/packages/server/src/server/speech/providers/local/sherpa/model-downloader.ts
+++ b/packages/server/src/server/speech/providers/local/sherpa/model-downloader.ts
@@ -83,6 +83,40 @@ async function extractTarArchive(archivePath: string, destDir: string): Promise<
});
}
+async function downloadDirectFiles(options: {
+ modelDir: string;
+ directFiles: Array<{ path: string; urls: string[] }>;
+ logger: pino.Logger;
+}): Promise<void> {
+ const { modelDir, directFiles, logger } = options;
+ for (const file of directFiles) {
+ const outputPath = path.join(modelDir, file.path);
+ if (await isNonEmptyFile(outputPath)) {
+ continue;
+ }
+
+ let lastError: unknown = null;
+ for (const url of file.urls) {
+ try {
+ logger.info({ url, outputPath }, "Downloading model file");
+ await downloadToFile({
+ url,
+ outputPath,
+ });
+ lastError = null;
+ break;
+ } catch (error) {
+ lastError = error;
+ logger.warn({ err: error, url, outputPath }, "Model file download failed");
+ }
+ }
+
+ if (lastError) {
+ throw lastError;
+ }
+ }
+}
+
async function isNonEmptyFile(filePath: string): Promise<boolean> {
try {
const s = await stat(filePath);
@@ -111,6 +145,26 @@ export async function ensureSherpaOnnxModel(
logger.info({ modelsDir: options.modelsDir }, "Starting model download");
try {
+ if (spec.directFiles) {
+ try {
+ await downloadDirectFiles({
+ modelDir,
+ directFiles: spec.directFiles,
+ logger,
+ });
+ if (await hasRequiredFiles(modelDir, spec.requiredFiles)) {
+ logger.info({ modelDir }, "Model direct file download completed");
+ return modelDir;
+ }
+ logger.warn(
+ { modelDir, requiredFiles: spec.requiredFiles },
+ "Downloaded direct model files, but required files are still missing",
+ );
+ } catch (error) {
+ logger.warn({ err: error }, "Direct model file download failed; falling back to archive");
+ }
+ }
+
const downloadsDir = path.join(options.modelsDir, ".downloads");
const archiveFilename = path.basename(new URL(spec.archiveUrl).pathname);
const archivePath = path.join(downloadsDir, archiveFilename);
diff --git a/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.test.ts b/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.test.ts
new file mode 100644
index 0000000000..0680cc432b
--- /dev/null
+++ b/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.test.ts
@@ -0,0 +1,89 @@
+import { mkdtempSync, writeFileSync } from "node:fs";
+import { tmpdir } from "node:os";
+import path from "node:path";
+
+import pino from "pino";
+import { beforeEach, describe, expect, it, vi } from "vitest";
+
+const mockState = vi.hoisted(() => ({
+ offlineRecognizerConfigs: [] as unknown[],
+}));
+
+vi.mock("./sherpa-onnx-node-loader.js", () => ({
+ loadSherpaOnnxNode: () => ({
+ OfflineRecognizer: class {
+ public readonly config: unknown;
+
+ constructor(config: unknown) {
+ this.config = config;
+ mockState.offlineRecognizerConfigs.push(config);
+ }
+
+ createStream() {
+ return {};
+ }
+
+ decode() {}
+
+ getResult() {
+ return "";
+ }
+ },
+ }),
+}));
+
+function makeTmpDir(): string {
+ return mkdtempSync(path.join(tmpdir(), "paseo-sherpa-recognizer-"));
+}
+
+function touch(filePath: string): string {
+ writeFileSync(filePath, "x");
+ return filePath;
+}
+
+describe("SherpaOfflineRecognizerEngine", () => {
+ beforeEach(() => {
+ mockState.offlineRecognizerConfigs.length = 0;
+ });
+
+ it("initializes SenseVoice recognizers with sherpa-onnx senseVoice config", async () => {
+ const { SherpaOfflineRecognizerEngine } = await import("./sherpa-offline-recognizer.js");
+ const dir = makeTmpDir();
+
+ const engine = new SherpaOfflineRecognizerEngine(
+ {
+ model: {
+ kind: "sense_voice",
+ model: touch(path.join(dir, "model.int8.onnx")),
+ tokens: touch(path.join(dir, "tokens.txt")),
+ language: "auto",
+ useInverseTextNormalization: true,
+ },
+ numThreads: 2,
+ debug: 0,
+ },
+ pino({ level: "silent" }),
+ );
+
+ expect(engine.sampleRate).toBe(16000);
+ expect(mockState.offlineRecognizerConfigs).toEqual([
+ {
+ featConfig: {
+ sampleRate: 16000,
+ featureDim: 80,
+ },
+ modelConfig: {
+ senseVoice: {
+ model: path.join(dir, "model.int8.onnx"),
+ language: "auto",
+ useInverseTextNormalization: 1,
+ },
+ tokens: path.join(dir, "tokens.txt"),
+ numThreads: 2,
+ provider: "cpu",
+ debug: 0,
+ },
+ },
+ ]);
+ });
+});
diff --git a/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.ts b/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.ts
index cfde73addf..0d49f609e4 100644
--- a/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.ts
+++ b/packages/server/src/server/speech/providers/local/sherpa/sherpa-offline-recognizer.ts
@@ -9,7 +9,23 @@ function assertFileExists(filePath: string, label: string): void {
}
}
-export interface SherpaOfflineRecognizerModel {
+export type SherpaOfflineRecognizerModel =
+ | {
+ kind: "nemo_transducer";
+ encoder: string;
+ decoder: string;
+ joiner: string;
+ tokens: string;
+ }
+ | {
+ kind: "sense_voice";
+ model: string;
+ tokens: string;
+ language?: "auto";
+ useInverseTextNormalization?: boolean;
+ };
+
+interface NemoTransducerRecognizerModel {
kind: "nemo_transducer";
encoder: string;
decoder: string;
@@ -17,6 +33,14 @@ export interface SherpaOfflineRecognizerModel {
tokens: string;
}
+interface SenseVoiceRecognizerModel {
+ kind: "sense_voice";
+ model: string;
+ tokens: string;
+ language?: "auto";
+ useInverseTextNormalization?: boolean;
+}
+
export interface SherpaOfflineRecognizerConfig {
model: SherpaOfflineRecognizerModel;
numThreads?: number;
@@ -42,6 +66,75 @@ interface SherpaOfflineStreamNative {
free?: () => void;
}
+function assertModelFilesExist(model: SherpaOfflineRecognizerModel): void {
+ if (model.kind === "nemo_transducer") {
+ assertFileExists(model.encoder, "offline encoder");
+ assertFileExists(model.decoder, "offline decoder");
+ assertFileExists(model.joiner, "offline joiner");
+ assertFileExists(model.tokens, "tokens");
+ return;
+ }
+
+ assertFileExists(model.model, "SenseVoice model");
+ assertFileExists(model.tokens, "tokens");
+}
+
+function buildNemoTransducerRecognizerConfig(
+ config: SherpaOfflineRecognizerConfig,
+ model: NemoTransducerRecognizerModel,
+) {
+ return {
+ featConfig: {
+ sampleRate: config.sampleRate ?? 16000,
+ featureDim: config.featureDim ?? 80,
+ },
+ modelConfig: {
+ transducer: {
+ encoder: model.encoder,
+ decoder: model.decoder,
+ joiner: model.joiner,
+ },
+ tokens: model.tokens,
+ modelType: "nemo_transducer",
+ numThreads: config.numThreads ?? 1,
+ provider: config.provider ?? "cpu",
+ debug: config.debug ?? 0,
+ },
+ decodingMethod: config.decodingMethod ?? "greedy_search",
+ maxActivePaths: config.maxActivePaths ?? 4,
+ };
+}
+
+function buildSenseVoiceRecognizerConfig(
+ config: SherpaOfflineRecognizerConfig,
+ model: SenseVoiceRecognizerModel,
+) {
+ return {
+ featConfig: {
+ sampleRate: config.sampleRate ?? 16000,
+ featureDim: config.featureDim ?? 80,
+ },
+ modelConfig: {
+ senseVoice: {
+ model: model.model,
+ language: model.language ?? "auto",
+ useInverseTextNormalization: model.useInverseTextNormalization === false ? 0 : 1,
+ },
+ tokens: model.tokens,
+ numThreads: config.numThreads ?? 1,
+ provider: config.provider ?? "cpu",
+ debug: config.debug ?? 0,
+ },
+ };
+}
+
+function buildRecognizerConfig(config: SherpaOfflineRecognizerConfig) {
+ if (config.model.kind === "nemo_transducer") {
+ return buildNemoTransducerRecognizerConfig(config, config.model);
+ }
+ return buildSenseVoiceRecognizerConfig(config, config.model);
+}
+
export class SherpaOfflineRecognizerEngine {
public readonly recognizer: SherpaOfflineRecognizerNative;
public readonly sampleRate: number;
@@ -54,33 +147,10 @@ export class SherpaOfflineRecognizerEngine {
component: "offline-recognizer",
});
- assertFileExists(config.model.encoder, "offline encoder");
- assertFileExists(config.model.decoder, "offline decoder");
- assertFileExists(config.model.joiner, "offline joiner");
- assertFileExists(config.model.tokens, "tokens");
+ assertModelFilesExist(config.model);
const sherpa = loadSherpaOnnxNode();
-
- const recognizerConfig = {
- featConfig: {
- sampleRate: config.sampleRate ?? 16000,
- featureDim: config.featureDim ?? 80,
- },
- modelConfig: {
- transducer: {
- encoder: config.model.encoder,
- decoder: config.model.decoder,
- joiner: config.model.joiner,
- },
- tokens: config.model.tokens,
- modelType: "nemo_transducer",
- numThreads: config.numThreads ?? 1,
- provider: config.provider ?? "cpu",
- debug: config.debug ?? 0,
- },
- decodingMethod: config.decodingMethod ?? "greedy_search",
- maxActivePaths: config.maxActivePaths ?? 4,
- };
+ const recognizerConfig = buildRecognizerConfig(config);
this.recognizer = new (
sherpa as unknown as {
@@ -94,7 +164,11 @@ export class SherpaOfflineRecognizerEngine {
: recognizerConfig.featConfig.sampleRate;
this.logger.info(
- { sampleRate: this.sampleRate, numThreads: recognizerConfig.modelConfig.numThreads },
+ {
+ sampleRate: this.sampleRate,
+ numThreads: recognizerConfig.modelConfig.numThreads,
+ modelKind: config.model.kind,
+ },
"Sherpa offline recognizer initialized",
);
}
diff --git a/packages/server/src/server/speech/providers/local/worker-process.ts b/packages/server/src/server/speech/providers/local/worker-process.ts
index 957ee6aaa2..6fa48ca3e5 100644
--- a/packages/server/src/server/speech/providers/local/worker-process.ts
+++ b/packages/server/src/server/speech/providers/local/worker-process.ts
@@ -1,9 +1,19 @@
+import path from "node:path";
+
import pino from "pino";
import type { StreamingTranscriptionSession } from "../../speech-provider.js";
import type { TurnDetectionSession } from "../../turn-detection-provider.js";
-import { getLocalSpeechModelDir, type LocalSttModelId, type LocalTtsModelId } from "./models.js";
-import { SherpaOfflineRecognizerEngine } from "./sherpa/sherpa-offline-recognizer.js";
+import {
+ getLocalSpeechModelDir,
+ getLocalSpeechModelSpec,
+ type LocalSttModelId,
+ type LocalTtsModelId,
+} from "./models.js";
+import {
+ SherpaOfflineRecognizerEngine,
+ type SherpaOfflineRecognizerModel,
+} from "./sherpa/sherpa-offline-recognizer.js";
import { SherpaOnnxParakeetSTT } from "./sherpa/sherpa-parakeet-stt.js";
import { SherpaParakeetRealtimeTranscriptionSession } from "./sherpa/sherpa-parakeet-realtime-session.js";
import { SherpaOnnxTTS } from "./sherpa/sherpa-tts.js";
@@ -72,6 +82,39 @@ function ttsKey(config: LocalSpeechWorkerConfig): string {
].join(":");
}
+function localModelPath(modelDir: string, relPath: string): string {
+ return path.join(modelDir, relPath);
+}
+
+function buildSttRecognizerModel(
+ modelDir: string,
+ modelId: LocalSttModelId,
+): SherpaOfflineRecognizerModel {
+ const spec = getLocalSpeechModelSpec(modelId);
+ if (spec.kind !== "stt-offline") {
+ throw new Error(`Local model '${modelId}' is not an STT model`);
+ }
+
+ const recognizer = spec.recognizer;
+ if (recognizer.kind === "nemo_transducer") {
+ return {
+ kind: "nemo_transducer",
+ encoder: localModelPath(modelDir, recognizer.encoder),
+ decoder: localModelPath(modelDir, recognizer.decoder),
+ joiner: localModelPath(modelDir, recognizer.joiner),
+ tokens: localModelPath(modelDir, recognizer.tokens),
+ };
+ }
+
+ return {
+ kind: "sense_voice",
+ model: localModelPath(modelDir, recognizer.model),
+ tokens: localModelPath(modelDir, recognizer.tokens),
+ language: recognizer.language,
+ useInverseTextNormalization: recognizer.useInverseTextNormalization,
+ };
+}
+
function getSttEngine(
config: LocalSpeechWorkerConfig,
model: "voice" | "dictation",
@@ -85,13 +128,7 @@ function getSttEngine(
const modelDir = getLocalSpeechModelDir(config.modelsDir, modelId);
const created = new SherpaOfflineRecognizerEngine(
{
- model: {
- kind: "nemo_transducer",
- encoder: `${modelDir}/encoder.int8.onnx`,
- decoder: `${modelDir}/decoder.int8.onnx`,
- joiner: `${modelDir}/joiner.int8.onnx`,
- tokens: `${modelDir}/tokens.txt`,
- },
+ model: buildSttRecognizerModel(modelDir, modelId),
numThreads: 2,
debug: 0,
},
diff --git a/packages/server/src/server/speech/speech-config-resolver.test.ts b/packages/server/src/server/speech/speech-config-resolver.test.ts
index 3d93b3673e..fa85e061ff 100644
--- a/packages/server/src/server/speech/speech-config-resolver.test.ts
+++ b/packages/server/src/server/speech/speech-config-resolver.test.ts
@@ -5,6 +5,8 @@ import { describe, expect, test } from "vitest";
import { PersistedConfigSchema } from "../persisted-config.js";
import { resolveSpeechConfig } from "./speech-config-resolver.js";
+const SENSE_VOICE_MODEL = "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09";
+
describe("resolveSpeechConfig", () => {
test("resolves local-first defaults without env overrides", () => {
const paseoHome = "/tmp/paseo-home";
@@ -133,6 +135,35 @@ describe("resolveSpeechConfig", () => {
expect(result.openai?.stt?.model).toBe("gpt-4o-transcribe");
});
+ test("accepts SenseVoice as a local STT model for dictation and voice mode", () => {
+ const persisted = PersistedConfigSchema.parse({
+ features: {
+ dictation: {
+ stt: {
+ provider: "local",
+ model: SENSE_VOICE_MODEL,
+ },
+ },
+ voiceMode: {
+ stt: {
+ provider: "local",
+ model: SENSE_VOICE_MODEL,
+ },
+ },
+ },
+ });
+
+ const result = resolveSpeechConfig({
+ paseoHome: "/tmp/paseo-home",
+ env: {} as NodeJS.ProcessEnv,
+ persisted,
+ });
+
+ expect(result.speech.local?.models.dictationStt).toBe(SENSE_VOICE_MODEL);
+ expect(result.speech.local?.models.voiceStt).toBe(SENSE_VOICE_MODEL);
+ expect(result.speech.local?.models.voiceTts).toBe("kokoro-en-v0_19");
+ });
+
test("resolves STT language from env, settings, and voice-to-dictation fallback", () => {
const persisted = PersistedConfigSchema.parse({
features: {
diff --git a/public-docs/voice.md b/public-docs/voice.md
index bb6adbf1ba..63bdf95418 100644
--- a/public-docs/voice.md
+++ b/public-docs/voice.md
@@ -31,12 +31,17 @@ Missing models are downloaded at daemon startup into `$PASEO_HOME/models/local-s
### Local STT models and language support
-| Model ID | Languages |
-| --------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| `parakeet-tdt-0.6b-v2-int8` | English only (default). Includes punctuation and capitalization. |
-| `parakeet-tdt-0.6b-v3-int8` | 25 European languages, auto-detected: Bulgarian, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Greek, Hungarian, Italian, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian, Russian, Slovak, Slovenian, Spanish, Swedish, Ukrainian. |
+| Model ID | Languages |
+| --------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| `parakeet-tdt-0.6b-v2-int8` | English only (default). Includes punctuation and capitalization. |
+| `parakeet-tdt-0.6b-v3-int8` | 25 European languages, auto-detected: Bulgarian, Croatian, Czech, Danish, Dutch, English, Estonian, Finnish, French, German, Greek, Hungarian, Italian, Latvian, Lithuanian, Maltese, Polish, Portuguese, Romanian, Russian, Slovak, Slovenian, Spanish, Swedish, Ukrainian. |
+| `sense-voice-zh-en-ja-ko-yue-int8-2025-09-09` | Chinese, English, Japanese, Korean, and Cantonese, auto-detected. |
-**To use a non-English language, switch the local STT model to `parakeet-tdt-0.6b-v3-int8`.** v3 detects the spoken language automatically — there is no per-language setting for it. The `language` field below does **not** steer the local Parakeet model (v2 is English-only, v3 auto-detects); it only applies to the OpenAI STT provider.
+For supported European languages, switch the local STT model to `parakeet-tdt-0.6b-v3-int8`. v3 detects the spoken language automatically — there is no per-language setting for it. The `language` field below does **not** steer the local Parakeet model (v2 is English-only, v3 auto-detects); it only applies to the OpenAI STT provider.
+
+For Chinese or mixed Chinese/English speech with the local STT provider, use `sense-voice-zh-en-ja-ko-yue-int8-2025-09-09`.
+
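+Paseo first tries to download the SenseVoice model files directly from Hugging Face (the `hf-mirror.com` mirror, then `huggingface.co`) and falls back to the GitHub release archive only if the direct download does not succeed. This way, setting up this large local speech model does not depend solely on GitHub release assets.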
```json
{
@@ -72,7 +77,23 @@ For multilingual local dictation, set the model to v3 — it auto-detects the la
}
```
-The `language` field applies only to the OpenAI STT provider: set `features.dictation.stt.language` for dictation and `features.voiceMode.stt.language` for realtime voice. If voice language is omitted, Paseo uses the dictation language before falling back to `en`. It has no effect on the local Parakeet models.
+For mixed Chinese/English local dictation and voice mode, set the model to SenseVoice:
+
+```json
+{
+ "version": 1,
+ "features": {
+ "dictation": {
+ "stt": { "provider": "local", "model": "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09" }
+ },
+ "voiceMode": {
+ "stt": { "provider": "local", "model": "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09" }
+ }
+ }
+}
+```
+
+The `language` field applies only to the OpenAI STT provider: set `features.dictation.stt.language` for dictation and `features.voiceMode.stt.language` for realtime voice. If voice language is omitted, Paseo uses the dictation language before falling back to `en`. It does not steer local Parakeet or SenseVoice models.
## OpenAI Speech Option