fix: audio pipeline — 16 kHz AudioContext, 4096-sample buffering, SERVER_READY handshake
Root causes of disconnection and slow transcription:
- The AudioWorklet posted a message for every 128-sample render quantum at the
native ~48 kHz rate, i.e. ~375 tiny WebSocket messages/sec. The server was flooded
with tiny frames during silence → keepalive ping timed out → connection dropped.
- JS resampling 48 kHz → 16 kHz added CPU overhead on every chunk.
- Audio started on ws.onopen before server sent SERVER_READY, so early
frames were dropped.
Fixes:
- audioWorklet.js: accumulate 4096 samples before posting (256 ms/chunk
at 16 kHz, ~4 messages/sec), transfer ArrayBuffer zero-copy.
- transcriptionService: AudioContext({ sampleRate: 16000 }) — browser
handles native resampling, no JS resampling needed. Remove
resampleTo16kHZ entirely.
- Wait for SERVER_READY message before calling setupAudioProcessing().
- Send 'END_OF_AUDIO' string on stop so server can finalise last segment.
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
308889937c
commit
4139eb8fd3
@ -1,12 +1,26 @@
|
|||||||
class AudioProcessor extends AudioWorkletProcessor {
|
class AudioProcessor extends AudioWorkletProcessor {
|
||||||
|
constructor() {
|
||||||
|
super();
|
||||||
|
this._buffer = new Float32Array(4096);
|
||||||
|
this._bufferIndex = 0;
|
||||||
|
}
|
||||||
|
|
||||||
process(inputs) {
|
process(inputs) {
|
||||||
const input = inputs[0];
|
const input = inputs[0];
|
||||||
if (input.length > 0) {
|
if (input.length > 0) {
|
||||||
const audioData = input[0];
|
const samples = input[0];
|
||||||
this.port.postMessage(audioData);
|
for (let i = 0; i < samples.length; i++) {
|
||||||
|
this._buffer[this._bufferIndex++] = samples[i];
|
||||||
|
if (this._bufferIndex >= 4096) {
|
||||||
|
// Transfer ownership (zero-copy) to main thread
|
||||||
|
this.port.postMessage(this._buffer.buffer, [this._buffer.buffer]);
|
||||||
|
this._buffer = new Float32Array(4096);
|
||||||
|
this._bufferIndex = 0;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
registerProcessor('audio-processor', AudioProcessor);
|
registerProcessor('audio-processor', AudioProcessor);
|
||||||
|
|||||||
@ -33,7 +33,7 @@ export class TranscriptionService {
|
|||||||
|
|
||||||
// Call getUserMedia directly — this triggers the browser permission prompt.
|
// Call getUserMedia directly — this triggers the browser permission prompt.
|
||||||
// The old code called enumerateDevices() first to find a device ID, but
|
// The old code called enumerateDevices() first to find a device ID, but
|
||||||
// without microphone permission deviceId is always "" (empty string, falsy),
|
// without microphone permission deviceId is always "" (empty string, falsy),
|
||||||
// causing an early return that never prompted the user for permission.
|
// causing an early return that never prompted the user for permission.
|
||||||
const audioConstraints: MediaTrackConstraints = this.selectedDeviceId
|
const audioConstraints: MediaTrackConstraints = this.selectedDeviceId
|
||||||
? { deviceId: { exact: this.selectedDeviceId } }
|
? { deviceId: { exact: this.selectedDeviceId } }
|
||||||
@ -59,18 +59,15 @@ export class TranscriptionService {
|
|||||||
ws.onopen = () => {
|
ws.onopen = () => {
|
||||||
clearTimeout(connectionTimeout);
|
clearTimeout(connectionTimeout);
|
||||||
logger.info('transcription-service', '✅ WebSocket connected');
|
logger.info('transcription-service', '✅ WebSocket connected');
|
||||||
|
|
||||||
// Send initial configuration message
|
// Send initial configuration — audio capture starts only after SERVER_READY.
|
||||||
const message = JSON.stringify({
|
ws.send(JSON.stringify({
|
||||||
uid: uuid,
|
uid: uuid,
|
||||||
language: config.language || 'en',
|
language: config.language || 'en',
|
||||||
task: config.task || 'transcribe',
|
task: config.task || 'transcribe',
|
||||||
model: config.modelSize || 'small',
|
model: config.modelSize || 'base',
|
||||||
use_vad: config.useVad ?? true,
|
use_vad: config.useVad ?? true,
|
||||||
});
|
}));
|
||||||
|
|
||||||
ws.send(message);
|
|
||||||
this.setupAudioProcessing();
|
|
||||||
};
|
};
|
||||||
|
|
||||||
ws.onerror = (error) => {
|
ws.onerror = (error) => {
|
||||||
@ -88,6 +85,13 @@ export class TranscriptionService {
|
|||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (data.message === 'SERVER_READY') {
|
||||||
|
// Server is ready — now safe to start streaming audio.
|
||||||
|
logger.info('transcription-service', '🟢 Server ready, starting audio capture');
|
||||||
|
this.setupAudioProcessing();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
if (data.status === 'WAIT') {
|
if (data.status === 'WAIT') {
|
||||||
logger.info('transcription-service', `⏳ Wait time: ${Math.round(data.message)} minutes`);
|
logger.info('transcription-service', `⏳ Wait time: ${Math.round(data.message)} minutes`);
|
||||||
this.cleanup();
|
this.cleanup();
|
||||||
@ -105,7 +109,7 @@ export class TranscriptionService {
|
|||||||
const lastIdx = segments.length - 1;
|
const lastIdx = segments.length - 1;
|
||||||
|
|
||||||
// Only emit segments we have not finalized yet — avoids re-processing the
|
// Only emit segments we have not finalized yet — avoids re-processing the
|
||||||
// full array on every message (which caused the "stuck last segment" bug).
|
// full array on every message (which caused the "stuck last segment" bug).
|
||||||
for (let i = this.finalizedSegmentCount; i < lastIdx; i++) {
|
for (let i = this.finalizedSegmentCount; i < lastIdx; i++) {
|
||||||
const seg = segments[i];
|
const seg = segments[i];
|
||||||
this.onTranscriptionUpdate(seg.text, true, {
|
this.onTranscriptionUpdate(seg.text, true, {
|
||||||
@ -135,19 +139,21 @@ export class TranscriptionService {
|
|||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
this.audioContext = new AudioContext();
|
// Request 16 kHz from the browser — it resamples natively so we send
|
||||||
|
// the correct rate to the server without any JS resampling overhead.
|
||||||
// Load and register the audio worklet
|
this.audioContext = new AudioContext({ sampleRate: 16000 });
|
||||||
|
|
||||||
await this.audioContext.audioWorklet.addModule('/audioWorklet.js');
|
await this.audioContext.audioWorklet.addModule('/audioWorklet.js');
|
||||||
|
|
||||||
this.mediaStreamSource = this.audioContext.createMediaStreamSource(this.stream);
|
this.mediaStreamSource = this.audioContext.createMediaStreamSource(this.stream);
|
||||||
this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');
|
this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');
|
||||||
|
|
||||||
// Handle audio data from the worklet
|
// The worklet accumulates 4096 samples (256 ms at 16 kHz) before posting,
|
||||||
|
// matching the reference frontend chunk size and eliminating the tiny-frame
|
||||||
|
// flood that was overwhelming the server during silence.
|
||||||
this.workletNode.port.onmessage = (event) => {
|
this.workletNode.port.onmessage = (event) => {
|
||||||
if (this.socket?.readyState === WebSocket.OPEN) {
|
if (this.socket?.readyState === WebSocket.OPEN) {
|
||||||
const resampledData = this.resampleTo16kHZ(event.data, this.audioContext!.sampleRate);
|
this.socket.send(event.data); // event.data is a transferred ArrayBuffer
|
||||||
this.socket.send(resampledData);
|
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@ -158,23 +164,11 @@ export class TranscriptionService {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
private resampleTo16kHZ(audioData: Float32Array, origSampleRate: number): Float32Array {
|
|
||||||
const ratio = origSampleRate / 16000;
|
|
||||||
const newLength = Math.round(audioData.length / ratio);
|
|
||||||
const result = new Float32Array(newLength);
|
|
||||||
|
|
||||||
for (let i = 0; i < newLength; i++) {
|
|
||||||
const pos = i * ratio;
|
|
||||||
const leftPos = Math.floor(pos);
|
|
||||||
const rightPos = Math.ceil(pos);
|
|
||||||
const weight = pos - leftPos;
|
|
||||||
result[i] = audioData[leftPos] * (1 - weight) + (audioData[rightPos] || 0) * weight;
|
|
||||||
}
|
|
||||||
|
|
||||||
return result;
|
|
||||||
}
|
|
||||||
|
|
||||||
stopTranscription() {
|
stopTranscription() {
|
||||||
|
// Signal the server cleanly so it can finalise the last segment.
|
||||||
|
if (this.socket?.readyState === WebSocket.OPEN) {
|
||||||
|
this.socket.send('END_OF_AUDIO');
|
||||||
|
}
|
||||||
this.cleanup();
|
this.cleanup();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user