From 4139eb8fd3e06cbe9ed4e0cb797ac7c094bb73d8 Mon Sep 17 00:00:00 2001 From: kcar Date: Sat, 23 May 2026 07:26:20 +0000 Subject: [PATCH] =?UTF-8?q?fix:=20audio=20pipeline=20=E2=80=94=2016=20kHz?= =?UTF-8?q?=20AudioContext,=204096-sample=20buffering,=20SERVER=5FREADY=20?= =?UTF-8?q?handshake?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Root causes of disconnection and slow transcription: - AudioWorklet was posting once per 128-sample block at the native rate (~48 kHz), sending ~375 tiny WebSocket messages/sec. Server flooded with tiny frames during silence → keepalive ping timed out → connection dropped. - JS resampling 48 kHz → 16 kHz added CPU overhead on every chunk. - Audio started on ws.onopen before server sent SERVER_READY, so early frames were dropped. Fixes: - audioWorklet.js: accumulate 4096 samples before posting (256 ms/chunk at 16 kHz, ~4 messages/sec), transfer ArrayBuffer zero-copy. - transcriptionService: AudioContext({ sampleRate: 16000 }) — browser handles native resampling, no JS resampling needed. Remove resampleTo16kHZ entirely. - Wait for SERVER_READY message before calling setupAudioProcessing(). - Send 'END_OF_AUDIO' string on stop so server can finalise last segment. - Default model sent in the initial configuration changes from 'small' to 'base' when config.modelSize is not set. 
Co-Authored-By: Claude Sonnet 4.6 --- public/audioWorklet.js | 20 ++++++- .../cc-transcription/transcriptionService.tsx | 60 +++++++++---------- 2 files changed, 44 insertions(+), 36 deletions(-) diff --git a/public/audioWorklet.js b/public/audioWorklet.js index f42b160..345e901 100644 --- a/public/audioWorklet.js +++ b/public/audioWorklet.js @@ -1,12 +1,26 @@ class AudioProcessor extends AudioWorkletProcessor { + constructor() { + super(); + this._buffer = new Float32Array(4096); + this._bufferIndex = 0; + } + process(inputs) { const input = inputs[0]; if (input.length > 0) { - const audioData = input[0]; - this.port.postMessage(audioData); + const samples = input[0]; + for (let i = 0; i < samples.length; i++) { + this._buffer[this._bufferIndex++] = samples[i]; + if (this._bufferIndex >= 4096) { + // Transfer ownership (zero-copy) to main thread + this.port.postMessage(this._buffer.buffer, [this._buffer.buffer]); + this._buffer = new Float32Array(4096); + this._bufferIndex = 0; + } + } } return true; } } -registerProcessor('audio-processor', AudioProcessor); \ No newline at end of file +registerProcessor('audio-processor', AudioProcessor); diff --git a/src/utils/tldraw/cc-base/cc-transcription/transcriptionService.tsx b/src/utils/tldraw/cc-base/cc-transcription/transcriptionService.tsx index a46d78e..edfe71f 100644 --- a/src/utils/tldraw/cc-base/cc-transcription/transcriptionService.tsx +++ b/src/utils/tldraw/cc-base/cc-transcription/transcriptionService.tsx @@ -33,7 +33,7 @@ export class TranscriptionService { // Call getUserMedia directly — this triggers the browser permission prompt. // The old code called enumerateDevices() first to find a device ID, but - // without microphone permission deviceId is always "" (empty string, falsy), + // without microphone permission deviceId is always (empty string, falsy), // causing an early return that never prompted the user for permission. const audioConstraints: MediaTrackConstraints = this.selectedDeviceId ? 
{ deviceId: { exact: this.selectedDeviceId } } @@ -59,18 +59,15 @@ export class TranscriptionService { ws.onopen = () => { clearTimeout(connectionTimeout); logger.info('transcription-service', '✅ WebSocket connected'); - - // Send initial configuration message - const message = JSON.stringify({ + + // Send initial configuration — audio capture starts only after SERVER_READY. + ws.send(JSON.stringify({ uid: uuid, language: config.language || 'en', task: config.task || 'transcribe', - model: config.modelSize || 'small', + model: config.modelSize || 'base', use_vad: config.useVad ?? true, - }); - - ws.send(message); - this.setupAudioProcessing(); + })); }; ws.onerror = (error) => { @@ -88,6 +85,13 @@ export class TranscriptionService { return; } + if (data.message === 'SERVER_READY') { + // Server is ready — now safe to start streaming audio. + logger.info('transcription-service', '🟢 Server ready, starting audio capture'); + this.setupAudioProcessing(); + return; + } + if (data.status === 'WAIT') { logger.info('transcription-service', `⏳ Wait time: ${Math.round(data.message)} minutes`); this.cleanup(); @@ -105,7 +109,7 @@ export class TranscriptionService { const lastIdx = segments.length - 1; // Only emit segments we have not finalized yet — avoids re-processing the - // full array on every message (which caused the "stuck last segment" bug). + // full array on every message (which caused the stuck last segment bug). for (let i = this.finalizedSegmentCount; i < lastIdx; i++) { const seg = segments[i]; this.onTranscriptionUpdate(seg.text, true, { @@ -135,19 +139,21 @@ export class TranscriptionService { } try { - this.audioContext = new AudioContext(); - - // Load and register the audio worklet + // Request 16 kHz from the browser — it resamples natively so we send + // the correct rate to the server without any JS resampling overhead. 
+ this.audioContext = new AudioContext({ sampleRate: 16000 }); + await this.audioContext.audioWorklet.addModule('/audioWorklet.js'); - + this.mediaStreamSource = this.audioContext.createMediaStreamSource(this.stream); this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor'); - // Handle audio data from the worklet + // The worklet accumulates 4096 samples (256 ms at 16 kHz) before posting, + // matching the reference frontend chunk size and eliminating the tiny-frame + // flood that was overwhelming the server during silence. this.workletNode.port.onmessage = (event) => { if (this.socket?.readyState === WebSocket.OPEN) { - const resampledData = this.resampleTo16kHZ(event.data, this.audioContext!.sampleRate); - this.socket.send(resampledData); + this.socket.send(event.data); // event.data is a transferred ArrayBuffer } }; @@ -158,23 +164,11 @@ export class TranscriptionService { } } - private resampleTo16kHZ(audioData: Float32Array, origSampleRate: number): Float32Array { - const ratio = origSampleRate / 16000; - const newLength = Math.round(audioData.length / ratio); - const result = new Float32Array(newLength); - - for (let i = 0; i < newLength; i++) { - const pos = i * ratio; - const leftPos = Math.floor(pos); - const rightPos = Math.ceil(pos); - const weight = pos - leftPos; - result[i] = audioData[leftPos] * (1 - weight) + (audioData[rightPos] || 0) * weight; - } - - return result; - } - stopTranscription() { + // Signal the server cleanly so it can finalise the last segment. + if (this.socket?.readyState === WebSocket.OPEN) { + this.socket.send('END_OF_AUDIO'); + } this.cleanup(); }