Real-time local speech-to-text transcription using decibri and sherpa-onnx. Runs entirely offline with no API key, no cloud service, and no network dependency.
This integration captures live audio from your microphone using decibri and feeds it to a sherpa-onnx streaming speech recognizer. Text appears in your terminal as you speak. Everything runs locally on your machine.
Choose this when you need real-time transcription without sending audio to a cloud API. It is ideal for privacy-sensitive applications, offline environments, or low-latency voice pipelines.
Sherpa-ONNX requires a pre-trained model. Download a streaming (online) model from the sherpa-onnx releases. For example, the Zipformer transducer model trained on English:
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
tar xvf sherpa-onnx-streaming-zipformer-en-20M-2023-02-17.tar.bz2
This creates a directory containing the model files — e.g. encoder-epoch-99-avg-1.onnx, decoder-epoch-99-avg-1.onnx, joiner-epoch-99-avg-1.onnx — plus tokens.txt. The exact filenames vary by model release, so make sure the paths in the configuration below match what was extracted.
Define the paths to your model files and set the sample rate. Sherpa-ONNX streaming models typically expect 16 kHz mono audio.
// Microphone capture (decibri) and the local sherpa-onnx ASR runtime.
const Decibri = require('decibri');
const sherpa = require('sherpa-onnx');
// Directory created by extracting the downloaded model archive (see setup above).
const modelDir = './sherpa-onnx-streaming-zipformer-en-20M-2023-02-17';
// Recognizer configuration. NOTE(review): field names follow the sherpa-onnx
// Node API config schema — verify against the installed sherpa-onnx version.
const config = {
// 16 kHz input with 80-dim features; must match the audio fed to the stream.
featConfig: { sampleRate: 16000, featureDim: 80 },
modelConfig: {
// Streaming Zipformer transducer: encoder/decoder/joiner ONNX graphs.
transducer: {
encoder: `${modelDir}/encoder-epoch-99-avg-1.onnx`,
decoder: `${modelDir}/decoder-epoch-99-avg-1.onnx`,
joiner: `${modelDir}/joiner-epoch-99-avg-1.onnx`,
},
// Token table mapping model output IDs to text.
tokens: `${modelDir}/tokens.txt`,
numThreads: 2,
// CPU inference — no GPU required for this small streaming model.
provider: 'cpu',
modelType: 'zipformer',
},
};
Instantiate the sherpa-onnx online (streaming) recognizer with your configuration, then create a recognition stream.
// Build the streaming (online) recognizer from the config, then open one
// recognition stream that will receive the live audio chunks.
const recognizer = new sherpa.OnlineRecognizer(config);
const stream = recognizer.createStream();
Create a decibri instance at 16 kHz mono. The default format is 16-bit signed integer PCM, which is what we need. Each chunk is 100 ms of audio (1,600 samples).
// 16 kHz mono capture; decibri's default 16-bit signed PCM output is
// converted to floats below before being fed to the recognizer.
const mic = new Decibri({ sampleRate: 16000, channels: 1 });
Convert each incoming Int16 buffer to Float32 (sherpa-onnx expects float samples in the range -1.0 to 1.0), feed it to the recognizer, and print partial results as they arrive.
// Most recent partial transcript, kept to avoid re-printing identical text.
let lastText = '';

/**
 * Convert a Buffer of 16-bit signed PCM to Float32 samples in [-1.0, 1.0),
 * as sherpa-onnx expects. A trailing odd byte (incomplete sample) is dropped.
 * @param {Buffer} pcm - raw Int16 PCM audio
 * @returns {Float32Array} normalized float samples
 */
function int16ToFloat32(pcm) {
  // Math.floor guards against an odd-length chunk, which would otherwise make
  // the Int16Array constructor throw (its length must be an integer).
  const sampleCount = Math.floor(pcm.byteLength / 2);
  const int16 = new Int16Array(pcm.buffer, pcm.byteOffset, sampleCount);
  const float32 = new Float32Array(sampleCount);
  for (let i = 0; i < sampleCount; i++) {
    float32[i] = int16[i] / 32768;
  }
  return float32;
}

mic.on('data', (chunk) => {
  // Feed the converted audio to the recognizer and drain the decoder.
  stream.acceptWaveform(16000, int16ToFloat32(chunk));
  while (recognizer.isReady(stream)) {
    recognizer.decode(stream);
  }
  // Overwrite the current terminal line with the latest partial result.
  const text = recognizer.getResult(stream).text.trim();
  if (text && text !== lastText) {
    lastText = text;
    process.stdout.write('\r' + text);
  }
});
Stop the microphone and free resources when the user presses Ctrl+C.
// Graceful shutdown on Ctrl+C: stop the microphone, flush the last recognized
// text onto its own line, then release the native recognizer resources.
process.on('SIGINT', () => {
  mic.stop();
  const remaining = recognizer.getResult(stream).text.trim();
  if (remaining) {
    console.log('\n' + remaining);
  }
  stream.free();
  recognizer.free();
  process.exit(0);
});

console.log('Listening... (Ctrl+C to stop)');
// Real-time local speech-to-text: capture 16 kHz mono PCM with decibri and
// decode it with a sherpa-onnx streaming Zipformer transducer — fully offline.
const Decibri = require('decibri');
const sherpa = require('sherpa-onnx');

// Streaming models expect 16 kHz mono audio. One constant keeps the feature
// config, the microphone, and acceptWaveform() from ever disagreeing.
const SAMPLE_RATE = 16000;

// Directory produced by extracting the downloaded model archive.
const modelDir = './sherpa-onnx-streaming-zipformer-en-20M-2023-02-17';

const config = {
  featConfig: { sampleRate: SAMPLE_RATE, featureDim: 80 },
  modelConfig: {
    // Streaming Zipformer transducer: encoder/decoder/joiner ONNX graphs.
    transducer: {
      encoder: `${modelDir}/encoder-epoch-99-avg-1.onnx`,
      decoder: `${modelDir}/decoder-epoch-99-avg-1.onnx`,
      joiner: `${modelDir}/joiner-epoch-99-avg-1.onnx`,
    },
    tokens: `${modelDir}/tokens.txt`,
    numThreads: 2,
    provider: 'cpu',
    modelType: 'zipformer',
  },
};

const recognizer = new sherpa.OnlineRecognizer(config);
const stream = recognizer.createStream();

// decibri defaults to 16-bit signed PCM; int16ToFloat32 converts it below.
const mic = new Decibri({ sampleRate: SAMPLE_RATE, channels: 1 });

/**
 * Convert a Buffer of 16-bit signed PCM to Float32 samples in [-1.0, 1.0),
 * the format sherpa-onnx expects. A trailing odd byte (incomplete sample) is
 * dropped rather than making the Int16Array constructor throw on a
 * fractional length.
 * @param {Buffer} pcm - raw Int16 PCM audio
 * @returns {Float32Array} normalized float samples
 */
function int16ToFloat32(pcm) {
  const sampleCount = Math.floor(pcm.byteLength / 2);
  const int16 = new Int16Array(pcm.buffer, pcm.byteOffset, sampleCount);
  const float32 = new Float32Array(sampleCount);
  for (let i = 0; i < sampleCount; i++) {
    float32[i] = int16[i] / 32768;
  }
  return float32;
}

// Most recent partial transcript, kept to avoid re-printing identical text.
let lastText = '';
mic.on('data', (chunk) => {
  stream.acceptWaveform(SAMPLE_RATE, int16ToFloat32(chunk));
  while (recognizer.isReady(stream)) {
    recognizer.decode(stream);
  }
  // Overwrite the current terminal line with the latest partial result.
  const text = recognizer.getResult(stream).text.trim();
  if (text && text !== lastText) {
    lastText = text;
    process.stdout.write('\r' + text);
  }
});

// Ctrl+C: stop capture, print the final transcript, free native resources.
process.on('SIGINT', () => {
  mic.stop();
  const finalText = recognizer.getResult(stream).text.trim();
  if (finalText) console.log('\n' + finalText);
  stream.free();
  recognizer.free();
  process.exit(0);
});

console.log('Listening... (Ctrl+C to stop)');