fix: audio pipeline — 16 kHz AudioContext, 4096-sample buffering, SERVER_READY handshake

Root causes of disconnection and slow transcription:
- The AudioWorklet posted a message for every 128-sample render quantum at
  the native rate (~48 kHz), i.e. ~375 tiny WebSocket messages/sec. The server
  was flooded with tiny frames during silence → keepalive ping timed out → connection dropped.
- JS resampling 48 kHz → 16 kHz added CPU overhead on every chunk.
- Audio capture started in ws.onopen, before the server had sent SERVER_READY,
  so early frames were dropped.

Fixes:
- audioWorklet.js: accumulate 4096 samples before posting (256 ms/chunk
  at 16 kHz, ~4 messages/sec), transfer ArrayBuffer zero-copy.
- transcriptionService: AudioContext({ sampleRate: 16000 }) — browser
  handles native resampling, no JS resampling needed. Remove
  resampleTo16kHZ entirely.
- Wait for SERVER_READY message before calling setupAudioProcessing().
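- Send 'END_OF_AUDIO' string on stop so the server can finalise the last segment.
- Default model in the initial config message changes from 'small' to 'base'
  when config.modelSize is not set.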

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
kcar 2026-05-23 07:26:20 +00:00
parent 308889937c
commit 4139eb8fd3
2 changed files with 44 additions and 36 deletions

View File

@ -1,12 +1,26 @@
class AudioProcessor extends AudioWorkletProcessor { class AudioProcessor extends AudioWorkletProcessor {
constructor() {
super();
this._buffer = new Float32Array(4096);
this._bufferIndex = 0;
}
process(inputs) { process(inputs) {
const input = inputs[0]; const input = inputs[0];
if (input.length > 0) { if (input.length > 0) {
const audioData = input[0]; const samples = input[0];
this.port.postMessage(audioData); for (let i = 0; i < samples.length; i++) {
this._buffer[this._bufferIndex++] = samples[i];
if (this._bufferIndex >= 4096) {
// Transfer ownership (zero-copy) to main thread
this.port.postMessage(this._buffer.buffer, [this._buffer.buffer]);
this._buffer = new Float32Array(4096);
this._bufferIndex = 0;
}
}
} }
return true; return true;
} }
} }
registerProcessor('audio-processor', AudioProcessor); registerProcessor('audio-processor', AudioProcessor);

View File

@ -33,7 +33,7 @@ export class TranscriptionService {
// Call getUserMedia directly — this triggers the browser permission prompt. // Call getUserMedia directly — this triggers the browser permission prompt.
// The old code called enumerateDevices() first to find a device ID, but // The old code called enumerateDevices() first to find a device ID, but
// without microphone permission deviceId is always "" (empty string, falsy), // without microphone permission deviceId is always (empty string, falsy),
// causing an early return that never prompted the user for permission. // causing an early return that never prompted the user for permission.
const audioConstraints: MediaTrackConstraints = this.selectedDeviceId const audioConstraints: MediaTrackConstraints = this.selectedDeviceId
? { deviceId: { exact: this.selectedDeviceId } } ? { deviceId: { exact: this.selectedDeviceId } }
@ -59,18 +59,15 @@ export class TranscriptionService {
ws.onopen = () => { ws.onopen = () => {
clearTimeout(connectionTimeout); clearTimeout(connectionTimeout);
logger.info('transcription-service', '✅ WebSocket connected'); logger.info('transcription-service', '✅ WebSocket connected');
// Send initial configuration message // Send initial configuration — audio capture starts only after SERVER_READY.
const message = JSON.stringify({ ws.send(JSON.stringify({
uid: uuid, uid: uuid,
language: config.language || 'en', language: config.language || 'en',
task: config.task || 'transcribe', task: config.task || 'transcribe',
model: config.modelSize || 'small', model: config.modelSize || 'base',
use_vad: config.useVad ?? true, use_vad: config.useVad ?? true,
}); }));
ws.send(message);
this.setupAudioProcessing();
}; };
ws.onerror = (error) => { ws.onerror = (error) => {
@ -88,6 +85,13 @@ export class TranscriptionService {
return; return;
} }
if (data.message === 'SERVER_READY') {
// Server is ready — now safe to start streaming audio.
logger.info('transcription-service', '🟢 Server ready, starting audio capture');
this.setupAudioProcessing();
return;
}
if (data.status === 'WAIT') { if (data.status === 'WAIT') {
logger.info('transcription-service', `⏳ Wait time: ${Math.round(data.message)} minutes`); logger.info('transcription-service', `⏳ Wait time: ${Math.round(data.message)} minutes`);
this.cleanup(); this.cleanup();
@ -105,7 +109,7 @@ export class TranscriptionService {
const lastIdx = segments.length - 1; const lastIdx = segments.length - 1;
// Only emit segments we have not finalized yet — avoids re-processing the // Only emit segments we have not finalized yet — avoids re-processing the
// full array on every message (which caused the "stuck last segment" bug). // full array on every message (which caused the stuck last segment bug).
for (let i = this.finalizedSegmentCount; i < lastIdx; i++) { for (let i = this.finalizedSegmentCount; i < lastIdx; i++) {
const seg = segments[i]; const seg = segments[i];
this.onTranscriptionUpdate(seg.text, true, { this.onTranscriptionUpdate(seg.text, true, {
@ -135,19 +139,21 @@ export class TranscriptionService {
} }
try { try {
this.audioContext = new AudioContext(); // Request 16 kHz from the browser — it resamples natively so we send
// the correct rate to the server without any JS resampling overhead.
// Load and register the audio worklet this.audioContext = new AudioContext({ sampleRate: 16000 });
await this.audioContext.audioWorklet.addModule('/audioWorklet.js'); await this.audioContext.audioWorklet.addModule('/audioWorklet.js');
this.mediaStreamSource = this.audioContext.createMediaStreamSource(this.stream); this.mediaStreamSource = this.audioContext.createMediaStreamSource(this.stream);
this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor'); this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');
// Handle audio data from the worklet // The worklet accumulates 4096 samples (256 ms at 16 kHz) before posting,
// matching the reference frontend chunk size and eliminating the tiny-frame
// flood that was overwhelming the server during silence.
this.workletNode.port.onmessage = (event) => { this.workletNode.port.onmessage = (event) => {
if (this.socket?.readyState === WebSocket.OPEN) { if (this.socket?.readyState === WebSocket.OPEN) {
const resampledData = this.resampleTo16kHZ(event.data, this.audioContext!.sampleRate); this.socket.send(event.data); // event.data is a transferred ArrayBuffer
this.socket.send(resampledData);
} }
}; };
@ -158,23 +164,11 @@ export class TranscriptionService {
} }
} }
private resampleTo16kHZ(audioData: Float32Array, origSampleRate: number): Float32Array {
const ratio = origSampleRate / 16000;
const newLength = Math.round(audioData.length / ratio);
const result = new Float32Array(newLength);
for (let i = 0; i < newLength; i++) {
const pos = i * ratio;
const leftPos = Math.floor(pos);
const rightPos = Math.ceil(pos);
const weight = pos - leftPos;
result[i] = audioData[leftPos] * (1 - weight) + (audioData[rightPos] || 0) * weight;
}
return result;
}
stopTranscription() { stopTranscription() {
// Signal the server cleanly so it can finalise the last segment.
if (this.socket?.readyState === WebSocket.OPEN) {
this.socket.send('END_OF_AUDIO');
}
this.cleanup(); this.cleanup();
} }