Stream live microphone audio to AWS Transcribe for real-time cloud transcription using decibri and the official AWS SDK.
This integration captures live audio from your microphone using decibri and streams it to Amazon Transcribe's cloud API via HTTP/2. Transcription results return in real-time as you speak, with both partial (in-progress) and final (confirmed) results. There is no model download, no local inference, and no format conversion required.
Choose this when you need managed cloud transcription with AWS IAM integration, support for 100+ languages, or features like automatic language detection and medical transcription. For use cases where audio must stay on your device, see the sherpa-onnx or whisper.cpp local integrations instead.
AWS Transcribe uses IAM credentials, not a simple API key. You need an AWS account with an IAM user that has the transcribe:StartStreamTranscription permission.
Configure credentials using one of these methods:
Option 1: Environment variables
export AWS_ACCESS_KEY_ID=AKIA...
export AWS_SECRET_ACCESS_KEY=your_secret
export AWS_REGION=us-east-1
Option 2: AWS credentials file
# ~/.aws/credentials
[default]
aws_access_key_id = AKIA...
aws_secret_access_key = your_secret
region = us-east-1
Option 3: .env file with dotenv
AWS_ACCESS_KEY_ID=AKIA...
AWS_SECRET_ACCESS_KEY=your_secret
AWS_REGION=us-east-1
The SDK resolves credentials automatically in this order: environment variables, shared credentials file, IAM role (for EC2/Lambda). If you get a service endpoint error, check that your region supports AWS Transcribe Streaming—not all regions do.
Your IAM user or role needs permission for the transcribe:StartStreamTranscription action. For testing, you can use the managed policy AmazonTranscribeFullAccess. For production, use a minimal policy:
{
"Version": "2012-10-17",
"Statement": [
{
"Effect": "Allow",
"Action": "transcribe:StartStreamTranscription",
"Resource": "*"
}
]
}
The dotenv package loads your credentials from a .env file. If you set environment variables another way or use the AWS credentials file, you can skip it.
No model download is required. All processing happens in Amazon's cloud.
Import decibri, the AWS Transcribe Streaming SDK, and dotenv. Create a client with your region.
'use strict';
require('dotenv').config();
const Decibri = require('decibri');
const {
TranscribeStreamingClient,
StartStreamTranscriptionCommand,
} = require('@aws-sdk/client-transcribe-streaming');
const client = new TranscribeStreamingClient({
region: process.env.AWS_REGION || 'us-east-1',
});
AWS Transcribe requires an async iterator that yields AudioEvent objects. This generator bridges decibri's data events into the format the SDK expects. This is the key difference from other integrations, which send chunks directly over a WebSocket.
function createAudioStream(mic) {
const chunks = [];
let done = false;
let resolve;
mic.on('data', (chunk) => {
chunks.push(chunk);
if (resolve) {
resolve();
resolve = null;
}
});
mic.on('end', () => {
done = true;
if (resolve) {
resolve();
resolve = null;
}
});
return {
[Symbol.asyncIterator]() {
return {
async next() {
while (chunks.length === 0 && !done) {
await new Promise((r) => { resolve = r; });
}
if (chunks.length > 0) {
return { value: { AudioEvent: { AudioChunk: chunks.shift() } }, done: false };
}
return { done: true };
},
};
},
};
}
Create a decibri instance at 16 kHz mono. The default format is 16-bit signed integer PCM, which matches AWS Transcribe's pcm encoding directly.
const mic = new Decibri({ sampleRate: 16000, channels: 1 });
Send a StartStreamTranscriptionCommand with the audio parameters and the async generator as the AudioStream.
const command = new StartStreamTranscriptionCommand({
LanguageCode: 'en-US',
MediaEncoding: 'pcm',
MediaSampleRateHertz: '16000',
AudioStream: createAudioStream(mic),
});
const response = await client.send(command);
MediaSampleRateHertz is a number and must exactly match the sampleRate you configured on the decibri instance; the AWS SDK's typings declare this field as a number.
Iterate over the response stream. AWS Transcribe returns both partial and final results:
Partial results (IsPartial: true): words as they're being recognized. Updates rapidly. Good for live caption display. Final results (IsPartial: false): confirmed transcription that won't change. Use these for command processing.
for await (const event of response.TranscriptResultStream) {
if (event.TranscriptEvent) {
const results = event.TranscriptEvent.Transcript.Results;
for (const result of results) {
const transcript = result.Alternatives[0].Transcript;
if (result.IsPartial) {
process.stdout.write(`\r [partial] ${transcript} `);
} else {
console.log(`\n [final] ${transcript}`);
}
}
}
}
Stop the microphone and exit when the user presses Ctrl+C.
process.on('SIGINT', () => {
console.log('\nStopping...');
process.exit(0);
});
'use strict';
require('dotenv').config();
const Decibri = require('decibri');
const {
TranscribeStreamingClient,
StartStreamTranscriptionCommand,
} = require('@aws-sdk/client-transcribe-streaming');
// ── Audio stream generator ──────────────────────────────────
function createAudioStream(mic) {
const chunks = [];
let done = false;
let resolve;
mic.on('data', (chunk) => {
chunks.push(chunk);
if (resolve) {
resolve();
resolve = null;
}
});
mic.on('end', () => {
done = true;
if (resolve) {
resolve();
resolve = null;
}
});
return {
[Symbol.asyncIterator]() {
return {
async next() {
while (chunks.length === 0 && !done) {
await new Promise((r) => { resolve = r; });
}
if (chunks.length > 0) {
return { value: { AudioEvent: { AudioChunk: chunks.shift() } }, done: false };
}
return { done: true };
},
};
},
};
}
// ── Main ────────────────────────────────────────────────────
async function main() {
const client = new TranscribeStreamingClient({
region: process.env.AWS_REGION || 'us-east-1',
});
const mic = new Decibri({ sampleRate: 16000, channels: 1 });
console.log('Starting transcription. Speak into your microphone...');
console.log('Press Ctrl+C to stop.\n');
const command = new StartStreamTranscriptionCommand({
LanguageCode: 'en-US',
MediaEncoding: 'pcm',
MediaSampleRateHertz: '16000',
AudioStream: createAudioStream(mic),
});
const response = await client.send(command);
for await (const event of response.TranscriptResultStream) {
if (event.TranscriptEvent) {
const results = event.TranscriptEvent.Transcript.Results;
for (const result of results) {
const transcript = result.Alternatives[0].Transcript;
if (result.IsPartial) {
process.stdout.write(`\r [partial] ${transcript} `);
} else {
console.log(`\n [final] ${transcript}`);
}
}
}
}
}
// ── Cleanup on Ctrl+C ──────────────────────────────────────
process.on('SIGINT', () => {
console.log('\nStopping...');
process.exit(0);
});
main().catch(console.error);
The command options control how AWS Transcribe processes your audio. Here are the most useful ones:
| Option | Default | Description |
|---|---|---|
LanguageCode |
'en-US' |
Language code. Supports 100+ languages (e.g. 'es-US', 'fr-FR', 'de-DE'). |
MediaEncoding |
'pcm' |
Audio encoding. Use 'pcm' for decibri's Int16 output. |
MediaSampleRateHertz |
16000 |
Sample rate in hertz, as a number. Must match decibri's sampleRate. Supports 8000–48000 Hz. |
IdentifyLanguage |
omitted | Set to true to enable automatic language detection. Mutually exclusive with LanguageCode—use one or the other, not both. |
See the AWS Transcribe SDK documentation for the complete list of options.