Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions packages/server/scripts/download-speech-models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,23 @@ import {
type LocalSpeechModelId,
} from "../src/server/speech/providers/local/models.js";

/**
 * Builds the help text for the speech model download script.
 *
 * @returns The usage synopsis, a blank line, an "Examples:" header, and the
 *   example invocations, joined with newlines (no trailing newline).
 */
function usage(): string {
  const synopsis = "Usage: npm run speech:download -- [--models-dir <dir>] [--model <modelId>]";
  const exampleInvocations = [
    " npm run speech:download -- --model parakeet-tdt-0.6b-v2-int8",
    " npm run speech:download -- --model sense-voice-zh-en-ja-ko-yue-int8-2025-09-09",
    " npm run speech:download -- --models-dir /tmp/paseo-speech --model sense-voice-zh-en-ja-ko-yue-int8-2025-09-09",
  ];
  return `${synopsis}\n\nExamples:\n${exampleInvocations.join("\n")}`;
}

function parseArgs(argv: string[]): { modelsDir: string; modelIds: LocalSpeechModelId[] } {
if (argv.includes("--help") || argv.includes("-h")) {
process.stdout.write(`${usage()}\n`);
process.exit(0);
}

const home = resolvePaseoHome();
let modelsDir = process.env.PASEO_LOCAL_MODELS_DIR || `${home}/models/local-speech`;
const modelIds: LocalSpeechModelId[] = [];
Expand Down
2 changes: 2 additions & 0 deletions packages/server/scripts/transcribe-local-wav.ts
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,8 @@ async function main(): Promise<void> {

const providers: RequestedSpeechProviders = {
dictationStt: { provider: "local", explicit: true },
// Not used for single-file transcription.
voiceTurnDetection: { provider: "local", explicit: false, enabled: false },
voiceStt: { provider: "local", explicit: true },
// Not used here, but required by the shared runtime config shape.
voiceTts: { provider: "openai", explicit: false },
Expand Down
5 changes: 5 additions & 0 deletions packages/server/src/server/speech/providers/local/models.ts
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ import { ensureSherpaOnnxModels, getSherpaOnnxModelDir } from "./sherpa/model-do
import {
DEFAULT_LOCAL_STT_MODEL,
DEFAULT_LOCAL_TTS_MODEL,
getSherpaOnnxModelSpec,
LocalSttModelIdSchema,
LocalTtsModelIdSchema,
listSherpaOnnxModels,
Expand Down Expand Up @@ -30,6 +31,10 @@ export function getLocalSpeechModelDir(modelsDir: string, modelId: LocalSpeechMo
return getSherpaOnnxModelDir(modelsDir, modelId);
}

/**
 * Looks up the spec for a local speech model.
 *
 * Delegates directly to `getSherpaOnnxModelSpec`; no extra validation or
 * transformation happens here.
 *
 * @param modelId Identifier of the local speech model to look up.
 * @returns Whatever spec `getSherpaOnnxModelSpec` returns for `modelId`.
 */
export function getLocalSpeechModelSpec(modelId: LocalSpeechModelId): LocalSpeechModelSpec {
  const sherpaSpec = getSherpaOnnxModelSpec(modelId);
  return sherpaSpec;
}

export async function ensureLocalSpeechModels(options: {
modelsDir: string;
modelIds: LocalSpeechModelId[];
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,58 @@ export type SherpaOnnxModelKind = "stt-offline" | "tts";

type DefaultModelRole = "stt" | "tts";

interface SherpaOnnxCatalogEntry {
/**
 * Recognizer settings for an offline speech-to-text model, discriminated by `kind`.
 *
 * NOTE(review): the string fields look like file names inside the model
 * directory (the catalog values match its `requiredFiles`) — confirm against
 * the code that constructs the recognizer.
 */
export type SherpaOfflineRecognizerModelSpec =
| {
// Transducer variant: separate encoder, decoder, and joiner entries plus a tokens entry.
kind: "nemo_transducer";
encoder: string;
decoder: string;
joiner: string;
tokens: string;
}
| {
// SenseVoice variant: a single model entry plus a tokens entry.
kind: "sense_voice";
model: string;
tokens: string;
// Only the literal "auto" is accepted by this type.
language: "auto";
useInverseTextNormalization: boolean;
};

/**
 * Fields shared by every entry in the sherpa-onnx model catalog.
 */
interface SherpaOnnxCatalogEntryBase {
kind: SherpaOnnxModelKind;
// URL of the model archive.
archiveUrl: string;
// NOTE(review): presumably the directory name the archive extracts to under the models dir — confirm.
extractedDir: string;
// Files that must be present for the model to count as available.
requiredFiles: string[];
// Optional per-file downloads; each file lists one or more candidate URLs.
directFiles?: Array<{
path: string;
urls: string[];
}>;
description: string;
// When set, marks this entry as the default model for that role.
defaultFor?: DefaultModelRole;
}

/**
 * A catalog entry, discriminated by `kind`: offline STT entries must carry a
 * `recognizer` spec, while TTS entries add nothing beyond the base fields.
 */
type SherpaOnnxCatalogEntry =
| (SherpaOnnxCatalogEntryBase & {
kind: "stt-offline";
recognizer: SherpaOfflineRecognizerModelSpec;
})
| (SherpaOnnxCatalogEntryBase & {
kind: "tts";
});

export const SHERPA_ONNX_MODEL_CATALOG = {
"parakeet-tdt-0.6b-v2-int8": {
kind: "stt-offline",
archiveUrl:
"https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8.tar.bz2",
extractedDir: "sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8",
requiredFiles: ["encoder.int8.onnx", "decoder.int8.onnx", "joiner.int8.onnx", "tokens.txt"],
recognizer: {
kind: "nemo_transducer",
encoder: "encoder.int8.onnx",
decoder: "decoder.int8.onnx",
joiner: "joiner.int8.onnx",
tokens: "tokens.txt",
},
description: "NVIDIA Parakeet TDT v2 (offline NeMo transducer, English).",
defaultFor: "stt",
},
Expand All @@ -29,9 +65,48 @@ export const SHERPA_ONNX_MODEL_CATALOG = {
"https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8.tar.bz2",
extractedDir: "sherpa-onnx-nemo-parakeet-tdt-0.6b-v3-int8",
requiredFiles: ["encoder.int8.onnx", "decoder.int8.onnx", "joiner.int8.onnx", "tokens.txt"],
recognizer: {
kind: "nemo_transducer",
encoder: "encoder.int8.onnx",
decoder: "decoder.int8.onnx",
joiner: "joiner.int8.onnx",
tokens: "tokens.txt",
},
description:
"NVIDIA Parakeet TDT v3 (offline NeMo transducer, 25 European languages, auto-detected).",
},
"sense-voice-zh-en-ja-ko-yue-int8-2025-09-09": {
kind: "stt-offline",
archiveUrl:
"https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09.tar.bz2",
extractedDir: "sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09",
requiredFiles: ["model.int8.onnx", "tokens.txt"],
directFiles: [
{
path: "model.int8.onnx",
urls: [
"https://hf-mirror.com/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/model.int8.onnx?download=true",
"https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/model.int8.onnx?download=true",
],
},
{
path: "tokens.txt",
urls: [
"https://hf-mirror.com/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/tokens.txt?download=true",
"https://huggingface.co/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/tokens.txt?download=true",
],
},
],
recognizer: {
kind: "sense_voice",
model: "model.int8.onnx",
tokens: "tokens.txt",
language: "auto",
useInverseTextNormalization: true,
},
description:
"SenseVoice int8 (offline, Chinese/English/Japanese/Korean/Cantonese, auto-detected).",
},
"kokoro-en-v0_19": {
kind: "tts",
archiveUrl:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,24 +1,33 @@
import { describe, expect, test } from "vitest";
import { mkdtempSync, mkdirSync, writeFileSync } from "node:fs";
import { afterEach, describe, expect, test, vi } from "vitest";
import { existsSync, mkdtempSync, mkdirSync, readFileSync, writeFileSync } from "node:fs";
import { tmpdir } from "node:os";
import path from "node:path";
import pino from "pino";

import { ensureSherpaOnnxModel, getSherpaOnnxModelDir } from "./model-downloader.js";

const SENSE_VOICE_MODEL = "sense-voice-zh-en-ja-ko-yue-int8-2025-09-09";

/**
 * Creates a fresh, uniquely named directory under the OS temp directory.
 *
 * @returns The path of the new directory; its name starts with
 *   "paseo-speech-models-".
 */
function makeTmpDir(): string {
  const prefix = path.join(tmpdir(), "paseo-speech-models-");
  return mkdtempSync(prefix);
}

const logger = pino({ level: "silent" });

describe("sherpa model downloader", () => {
afterEach(() => {
vi.unstubAllGlobals();
});

// Checks that the resolved directory for each model id contains the expected
// extracted-directory name.
test("getSherpaOnnxModelDir maps modelId to extractedDir", () => {
const modelsDir = "/tmp/models";
expect(getSherpaOnnxModelDir(modelsDir, "parakeet-tdt-0.6b-v2-int8")).toContain(
"sherpa-onnx-nemo-parakeet-tdt-0.6b-v2-int8",
);
expect(getSherpaOnnxModelDir(modelsDir, "kokoro-en-v0_19")).toContain("kokoro-en-v0_19");
expect(getSherpaOnnxModelDir(modelsDir, SENSE_VOICE_MODEL)).toContain(
"sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09",
);
});

test("ensureSherpaOnnxModel succeeds without downloading when files exist", async () => {
Expand All @@ -38,4 +47,49 @@ describe("sherpa model downloader", () => {

expect(out).toBe(modelDir);
});

// Pre-populates the SenseVoice model directory and expects ensureSherpaOnnxModel
// to resolve to that directory.
test("ensureSherpaOnnxModel accepts existing SenseVoice files without downloading", async () => {
const modelsDir = makeTmpDir();
const modelDir = getSherpaOnnxModelDir(modelsDir, SENSE_VOICE_MODEL);

// Write small non-empty placeholders for both files this model requires.
mkdirSync(modelDir, { recursive: true });
writeFileSync(path.join(modelDir, "model.int8.onnx"), "x");
writeFileSync(path.join(modelDir, "tokens.txt"), "x");

const out = await ensureSherpaOnnxModel({
modelsDir,
modelId: SENSE_VOICE_MODEL,
logger,
});

expect(out).toBe(modelDir);
});

// Stubs the global fetch and expects both SenseVoice files to be written from
// the first listed (hf-mirror) URL of each direct file.
test("ensureSherpaOnnxModel downloads SenseVoice direct files before archive fallback", async () => {
// One mocked response per fetch call, in call order: model bytes first, then tokens bytes.
const fetch = vi
.fn()
.mockResolvedValueOnce(new Response("model-bytes"))
.mockResolvedValueOnce(new Response("tokens-bytes"));
vi.stubGlobal("fetch", fetch);

const modelsDir = makeTmpDir();
const modelDir = getSherpaOnnxModelDir(modelsDir, SENSE_VOICE_MODEL);

const out = await ensureSherpaOnnxModel({
modelsDir,
modelId: SENSE_VOICE_MODEL,
logger,
});

expect(out).toBe(modelDir);
// NOTE(review): the archive path uses a ".downloads" directory under modelsDir, so its
// absence presumably shows the archive fallback did not run — confirm against the downloader.
expect(existsSync(path.join(modelsDir, ".downloads"))).toBe(false);
expect(readFileSync(path.join(modelDir, "model.int8.onnx"), "utf8")).toBe("model-bytes");
expect(readFileSync(path.join(modelDir, "tokens.txt"), "utf8")).toBe("tokens-bytes");
expect(fetch).toHaveBeenCalledWith(
"https://hf-mirror.com/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/model.int8.onnx?download=true",
);
expect(fetch).toHaveBeenCalledWith(
"https://hf-mirror.com/csukuangfj/sherpa-onnx-sense-voice-zh-en-ja-ko-yue-int8-2025-09-09/resolve/main/tokens.txt?download=true",
);
});
});
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,40 @@ async function extractTarArchive(archivePath: string, destDir: string): Promise<
});
}

/**
 * Downloads each direct model file into `modelDir`, trying its URLs in order.
 *
 * Files for which `isNonEmptyFile` already reports true are skipped. For every
 * other file the URLs are attempted one at a time; the first successful
 * download wins and the remaining URLs are not tried. Each failed attempt is
 * logged as a warning before moving on to the next URL.
 *
 * @param options.modelDir Directory that each file's `path` is joined onto.
 * @param options.directFiles Files to fetch, each with its candidate URLs.
 * @param options.logger Logger used for progress and failure messages.
 * @throws The error from the last attempted URL when every URL for a file
 *   fails, or an `Error` when no attempt succeeded and there is no error value
 *   to rethrow (for example, the file has an empty `urls` list).
 */
async function downloadDirectFiles(options: {
  modelDir: string;
  directFiles: Array<{ path: string; urls: string[] }>;
  logger: pino.Logger;
}): Promise<void> {
  const { modelDir, directFiles, logger } = options;
  for (const file of directFiles) {
    const outputPath = path.join(modelDir, file.path);
    if (await isNonEmptyFile(outputPath)) {
      continue;
    }

    // Track success explicitly. Inferring it from the truthiness of the last
    // error would treat an empty URL list (or a falsy thrown value) as a
    // successful download.
    let downloaded = false;
    let lastError: unknown;
    for (const url of file.urls) {
      try {
        logger.info({ url, outputPath }, "Downloading model file");
        await downloadToFile({
          url,
          outputPath,
        });
        downloaded = true;
        break;
      } catch (error) {
        lastError = error;
        logger.warn({ err: error, url, outputPath }, "Model file download failed");
      }
    }

    if (!downloaded) {
      throw (
        lastError ??
        new Error(`Unable to download model file ${file.path}: no usable download URL`)
      );
    }
  }
}

async function isNonEmptyFile(filePath: string): Promise<boolean> {
try {
const s = await stat(filePath);
Expand Down Expand Up @@ -111,6 +145,26 @@ export async function ensureSherpaOnnxModel(
logger.info({ modelsDir: options.modelsDir }, "Starting model download");

try {
if (spec.directFiles) {
try {
await downloadDirectFiles({
modelDir,
directFiles: spec.directFiles,
logger,
});
if (await hasRequiredFiles(modelDir, spec.requiredFiles)) {
logger.info({ modelDir }, "Model direct file download completed");
return modelDir;
}
logger.warn(
{ modelDir, requiredFiles: spec.requiredFiles },
"Downloaded direct model files, but required files are still missing",
);
} catch (error) {
logger.warn({ err: error }, "Direct model file download failed; falling back to archive");
}
}

const downloadsDir = path.join(options.modelsDir, ".downloads");
const archiveFilename = path.basename(new URL(spec.archiveUrl).pathname);
const archivePath = path.join(downloadsDir, archiveFilename);
Expand Down
Loading