
File Transcriber

FileTranscriber transcribes audio from media files like mp3, mp4, wav, ogg, etc. The supported file types depend on the browser: every format that can be played by the <audio> or <video> tag should work.
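Whether a given container or codec is actually decodable varies between browsers. A minimal sketch for checking this up front, using only the standard canPlayType() DOM API (not part of @transcribe/transcriber):

// Rough feature check: asks the browser whether it can likely decode a MIME type.
// canPlayType() returns "probably", "maybe", or "" (empty string = not supported).
function canBrowserPlay(mimeType) {
  const probe = document.createElement("audio");
  return probe.canPlayType(mimeType) !== "";
}

console.log(canBrowserPlay("audio/mpeg")); // mp3
console.log(canBrowserPlay("audio/ogg; codecs=vorbis"));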

A full example using the FileTranscriber:

import createModule from "/your/project/shout.wasm.js"; // make sure to exclude from your bundler
// import createModule from "@transcribe/shout"; // if you use import map
import { FileTranscriber } from "@transcribe/transcriber";

// create new instance
const transcriber = new FileTranscriber({
  createModule, // create module function from emscripten wasm build
  model: "/path/to/model.bin", // can be path to model file, or File() object
  workerPath: "/path/to/shout", // set path to directory of shout.wasm.worker.mjs

  dtwType: "tiny", // optional, use for word level timestamps, must match model type (tiny, tiny.en, base, base.en, ...)

  // custom build callbacks
  onReady: () => console.log("ready"), // called after init(), i.e. the transcriber is ready
  onProgress: (progress) => console.log(progress), // progress 0..100
  onSegment: (segment) => console.log(segment), // called when a new segment is transcribed
  onComplete: (result) => console.log(result), // on transcription done
  onCanceled: () => console.log("canceled"), // called after transcriber.cancel() once the wasm operation has actually been canceled

  print: (message) => console.log(message), // messages printed to stdout
  printErr: (message) => console.error(message), // messages printed to stderr

  // other emscripten Module callbacks
  preInit: () => {},
  preRun: () => {},
  onAbort: () => {},
  onExit: (exitStatus) => {},
  locateFile: (file) => `path/${file}`, // used by shout.wasm.js to determine location of worker file; you don't need this if you've set `workerPath`
});

// init wasm (loads model file and creates a new shout instance)
await transcriber.init();

// transcribe audio/video file
const result = await transcriber.transcribe(
  "my.mp3", // path to media file, or File() object
  {
    lang: "en", // language of the speech to transcribe
    threads: 2, // number of threads to use (choose based on hardware and number of concurrent instances)
    translate: false, // translate the transcription to English
    max_len: 0, // limit the max number of characters in one token, 0 -> no limit
    split_on_words: false, // split on word boundaries rather than tokens
    suppress_non_speech: false, // suppress non-speech tokens
    token_timestamps: true, // calculate token-level timestamps
  }
);

console.log(result);
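
// Example of walking over the segments (assumption, not confirmed by this page:
// the result follows a whisper.cpp-style JSON layout with a `transcription`
// array of segments; verify against the object logged above before relying on it)
for (const segment of result.transcription ?? []) {
  console.log(segment.text);
}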

// destroy the wasm instance and free its memory when all transcriptions are done
transcriber.destroy();
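
Because transcribe() also accepts a File() object, a media file picked by the user can be passed in directly. A minimal sketch, assuming a hypothetical <input type="file" id="media"> element on the page and a transcriber that has already been initialized as above:

const input = document.querySelector("#media"); // hypothetical file input, not part of the library

input.addEventListener("change", async () => {
  const file = input.files[0];
  if (!file) return;

  // pass the File object straight to transcribe(); options as in the example above
  const result = await transcriber.transcribe(file, { lang: "en" });
  console.log(result);
});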