Detect when someone is speaking versus silence in real time using decibri and sherpa-onnx Silero VAD. Runs entirely offline with no API key, no cloud service, and no network dependency.
This integration captures live microphone audio using decibri and feeds it to the Silero VAD model via sherpa-onnx. The VAD segments the audio stream into speech and non-speech regions, reporting when speech starts and ends with sample-level timestamps.
Choose this when you need to know when someone is speaking rather than what they are saying. Common uses include gating audio before sending to a speech recognizer, detecting turn-taking in conversations, or triggering recording only when speech is present.
Note that decibri also ships a built-in VAD (enabled via the `vad: true` option) that uses simple RMS energy thresholding. The sherpa-onnx Silero VAD shown here is a neural network model that is significantly more accurate, especially in noisy environments. Use decibri's built-in VAD for simple cases; use Silero VAD when accuracy matters.
Download the Silero VAD ONNX model from the sherpa-onnx releases:
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/asr-models/silero_vad.onnx
This is a single small file (~2 MB).
Configure the Silero VAD model. The key parameters are the detection threshold and the minimum silence/speech duration.
// decibri captures live microphone audio; sherpa-onnx hosts the Silero VAD model.
const Decibri = require('decibri');
const sherpa = require('sherpa-onnx');

// Silero VAD settings: the detection threshold plus the minimum
// silence/speech durations (seconds) that decide segment boundaries.
const sileroVad = {
  model: './silero_vad.onnx', // path to the downloaded ONNX model
  threshold: 0.5,             // speech-probability cutoff
  minSilenceDuration: 0.25,
  minSpeechDuration: 0.25,
  windowSize: 512,            // samples per inference window (32 ms @ 16 kHz)
};

const config = {
  sileroVad,
  sampleRate: 16000,
  debug: false,
  bufferSizeInSeconds: 60,
};
Instantiate the voice activity detector.
// Instantiate the Silero voice activity detector from the config above.
const vad = new sherpa.Vad(config);
Create a decibri instance at 16 kHz mono. The VAD processes audio in windows of 512 samples (32 ms at 16 kHz), but decibri delivers larger chunks which we feed in windows.
// 16 kHz mono capture to match the sample rate the VAD was configured with.
const mic = new Decibri({ sampleRate: 16000, channels: 1 });
Convert each incoming Int16 buffer to Float32, feed it to the VAD in window-sized chunks, and check for completed speech segments.
const windowSize = 512;   // samples per VAD window (32 ms at 16 kHz)
const sampleRate = 16000; // must match both mic and VAD config
let speechActive = false;
// Tail samples from the previous chunk that did not fill a whole window.
// The original code discarded up to windowSize-1 samples per chunk; we
// carry them forward instead so no audio is lost at chunk boundaries.
let pending = new Float32Array(0);

mic.on('data', (chunk) => {
  // Convert Int16 PCM to Float32 in [-1, 1), prepending any carried samples.
  const int16 = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.length / 2);
  const float32 = new Float32Array(pending.length + int16.length);
  float32.set(pending, 0);
  for (let i = 0; i < int16.length; i++) {
    float32[pending.length + i] = int16[i] / 32768;
  }

  // Feed full windows to the VAD; keep the remainder for the next chunk.
  let offset = 0;
  for (; offset + windowSize <= float32.length; offset += windowSize) {
    vad.acceptWaveform(float32.subarray(offset, offset + windowSize));

    // Report speech start/end transitions.
    if (vad.isSpeechDetected() && !speechActive) {
      speechActive = true;
      console.log('[speech start]');
    }
    if (!vad.isSpeechDetected() && speechActive) {
      speechActive = false;
      console.log('[speech end]');
    }

    // Drain completed speech segments.
    while (!vad.isEmpty()) {
      const segment = vad.front();
      const duration = (segment.samples.length / sampleRate).toFixed(2);
      console.log(` segment: ${duration}s of speech`);
      vad.pop();
    }
  }
  pending = float32.slice(offset); // < windowSize samples carried over
});
Stop the microphone when the user presses Ctrl+C.
// Graceful shutdown on Ctrl+C: stop capture, free native VAD resources, exit.
const shutdown = () => {
  mic.stop();
  vad.free();
  process.exit(0);
};
process.on('SIGINT', shutdown);

console.log('Listening for speech... (Ctrl+C to stop)');
// Offline voice activity detection: decibri microphone capture feeding the
// Silero VAD model via sherpa-onnx. Prints speech start/end transitions and
// the duration of each completed speech segment. No network access required.
const Decibri = require('decibri');
const sherpa = require('sherpa-onnx');

const config = {
  sileroVad: {
    model: './silero_vad.onnx', // downloaded Silero VAD ONNX model (~2 MB)
    threshold: 0.5,             // speech-probability cutoff
    minSilenceDuration: 0.25,   // seconds of silence before a segment closes
    minSpeechDuration: 0.25,    // minimum seconds for a valid speech segment
    windowSize: 512,            // samples per inference window (32 ms @ 16 kHz)
  },
  sampleRate: 16000,
  debug: false,
  bufferSizeInSeconds: 60,
};

const vad = new sherpa.Vad(config);
const mic = new Decibri({ sampleRate: 16000, channels: 1 });

const windowSize = 512;
let speechActive = false;
// Tail samples from the previous chunk that did not fill a whole window.
// Carried forward so no audio is silently dropped between 'data' events
// (the unfixed version discarded up to windowSize-1 samples per chunk).
let pending = new Float32Array(0);

mic.on('data', (chunk) => {
  // Convert Int16 PCM to Float32 in [-1, 1), prepending carried samples.
  const int16 = new Int16Array(chunk.buffer, chunk.byteOffset, chunk.length / 2);
  const float32 = new Float32Array(pending.length + int16.length);
  float32.set(pending, 0);
  for (let i = 0; i < int16.length; i++) {
    float32[pending.length + i] = int16[i] / 32768;
  }

  // Feed full windows to the VAD; keep the remainder for the next chunk.
  let offset = 0;
  for (; offset + windowSize <= float32.length; offset += windowSize) {
    vad.acceptWaveform(float32.subarray(offset, offset + windowSize));
    if (vad.isSpeechDetected() && !speechActive) {
      speechActive = true;
      console.log('[speech start]');
    }
    if (!vad.isSpeechDetected() && speechActive) {
      speechActive = false;
      console.log('[speech end]');
    }
    // Drain completed speech segments.
    while (!vad.isEmpty()) {
      const segment = vad.front();
      const duration = (segment.samples.length / 16000).toFixed(2);
      console.log(` segment: ${duration}s of speech`);
      vad.pop();
    }
  }
  pending = float32.slice(offset); // < windowSize samples carried over
});

// Clean shutdown on Ctrl+C: stop capture, release native VAD resources.
process.on('SIGINT', () => {
  mic.stop();
  vad.free();
  process.exit(0);
});

console.log('Listening for speech... (Ctrl+C to stop)');