fix: audio pipeline — 16 kHz AudioContext, 4096-sample buffering, SERVER_READY handshake

Root causes of disconnection and slow transcription:
- The AudioWorklet fired every 128 samples at the native rate (~48 kHz), sending ~375 tiny WebSocket messages/sec. The server was flooded with tiny frames during silence → the keepalive ping timed out → the connection dropped.
- JS resampling from 48 kHz to 16 kHz added CPU overhead on every chunk.
- Audio capture started on ws.onopen, before the server sent SERVER_READY, so early frames were dropped.

Fixes:
- audioWorklet.js: accumulate 4096 samples before posting (256 ms/chunk at 16 kHz, ~4 messages/sec) and transfer the ArrayBuffer zero-copy.
- transcriptionService: create the AudioContext with { sampleRate: 16000 } so the browser resamples natively and no JS resampling is needed. Remove resampleTo16kHZ entirely.
- Wait for the SERVER_READY message before calling setupAudioProcessing().
- Send the 'END_OF_AUDIO' string on stop so the server can finalise the last segment.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-23 07:26:20 +00:00 · 2026-05-23 07:26:20 +00:00 · 4139eb8fd3
commit 4139eb8fd3
parent 308889937c
2 changed files with 44 additions and 36 deletions
--- a/public/audioWorklet.js
+++ b/public/audioWorklet.js
@ -1,12 +1,26 @@
 // AudioWorklet processor that batches incoming samples into fixed 4096-sample
 // chunks and posts each full chunk to the main thread over this.port.
 class AudioProcessor extends AudioWorkletProcessor {
  constructor() {
    super();
    // Accumulation buffer for one outgoing chunk, and the next write position in it.
    this._buffer = new Float32Array(4096);
    this._bufferIndex = 0;
  }
  // Called with the node's inputs; only inputs[0][0] (first channel of the
  // first input) is read. Always returns true.
  process(inputs) {
    const input = inputs[0];
    // An input with no channels carries nothing to buffer.
    if (input.length > 0) {
-      const audioData = input[0];
+      const samples = input[0];
-      this.port.postMessage(audioData);
      // Copy sample by sample so a chunk boundary can fall anywhere inside
      // the incoming block.
+      for (let i = 0; i < samples.length; i++) {
        this._buffer[this._bufferIndex++] = samples[i];
        if (this._bufferIndex >= 4096) {
          // Transfer ownership (zero-copy) to main thread
          this.port.postMessage(this._buffer.buffer, [this._buffer.buffer]);
          // The posted ArrayBuffer was handed over as a transferable, so a
          // fresh buffer is allocated for the samples that follow.
          this._buffer = new Float32Array(4096);
          this._bufferIndex = 0;
        }
      }
    }
    return true;
  }
 }
-registerProcessor('audio-processor', AudioProcessor); 
+registerProcessor('audio-processor', AudioProcessor);
--- a/src/utils/tldraw/cc-base/cc-transcription/transcriptionService.tsx
+++ b/src/utils/tldraw/cc-base/cc-transcription/transcriptionService.tsx
@ -33,7 +33,7 @@ export class TranscriptionService {
      // Call getUserMedia directly — this triggers the browser permission prompt.
      // The old code called enumerateDevices() first to find a device ID, but
-      // without microphone permission deviceId is always "" (empty string, falsy),
+      // without microphone permission deviceId is always "" (empty string, falsy),
      // causing an early return that never prompted the user for permission.
      const audioConstraints: MediaTrackConstraints = this.selectedDeviceId
        ? { deviceId: { exact: this.selectedDeviceId } }
@ -59,18 +59,15 @@ export class TranscriptionService {
      ws.onopen = () => {
        clearTimeout(connectionTimeout);
        logger.info('transcription-service', '✅ WebSocket connected');
-        
+
-        // Send initial configuration message
+        // Send initial configuration — audio capture starts only after SERVER_READY.
-        const message = JSON.stringify({
+        ws.send(JSON.stringify({
          uid: uuid,
          language: config.language || 'en',
          task: config.task || 'transcribe',
-          model: config.modelSize || 'small',
+          model: config.modelSize || 'base',
          use_vad: config.useVad ?? true,
-        });
+        }));
-        ws.send(message);
-        this.setupAudioProcessing();
      };
      ws.onerror = (error) => {
@ -88,6 +85,13 @@ export class TranscriptionService {
          return;
        }
        if (data.message === 'SERVER_READY') {
          // Server is ready — now safe to start streaming audio.
          logger.info('transcription-service', '🟢 Server ready, starting audio capture');
          this.setupAudioProcessing();
          return;
        }
        if (data.status === 'WAIT') {
          logger.info('transcription-service', `⏳ Wait time: ${Math.round(data.message)} minutes`);
          this.cleanup();
@ -105,7 +109,7 @@ export class TranscriptionService {
          const lastIdx = segments.length - 1;
          // Only emit segments we have not finalized yet — avoids re-processing the
-          // full array on every message (which caused the "stuck last segment" bug).
+          // full array on every message (which caused the stuck last segment bug).
          for (let i = this.finalizedSegmentCount; i < lastIdx; i++) {
            const seg = segments[i];
            this.onTranscriptionUpdate(seg.text, true, {
@ -135,19 +139,21 @@ export class TranscriptionService {
    }
    try {
-      this.audioContext = new AudioContext();
+      // Request 16 kHz from the browser — it resamples natively so we send
-      
+      // the correct rate to the server without any JS resampling overhead.
-      // Load and register the audio worklet
+      this.audioContext = new AudioContext({ sampleRate: 16000 });
      await this.audioContext.audioWorklet.addModule('/audioWorklet.js');
-      
+
      this.mediaStreamSource = this.audioContext.createMediaStreamSource(this.stream);
      this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');
-      // Handle audio data from the worklet
+      // The worklet accumulates 4096 samples (256 ms at 16 kHz) before posting,
      // matching the reference frontend chunk size and eliminating the tiny-frame
      // flood that was overwhelming the server during silence.
      this.workletNode.port.onmessage = (event) => {
        if (this.socket?.readyState === WebSocket.OPEN) {
-          const resampledData = this.resampleTo16kHZ(event.data, this.audioContext!.sampleRate);
+          this.socket.send(event.data); // event.data is a transferred ArrayBuffer
-          this.socket.send(resampledData);
        }
      };
@ -158,23 +164,11 @@ export class TranscriptionService {
    }
  }
  /**
   * Resamples mono PCM samples to 16 kHz using linear interpolation.
   *
   * No low-pass filtering is applied, so content above 8 kHz in the source
   * can alias when downsampling.
   *
   * @param audioData - Source samples at `origSampleRate`.
   * @param origSampleRate - Sample rate of `audioData` in Hz; must be finite and positive.
   * @returns A new array of `Math.round(audioData.length / (origSampleRate / 16000))` samples.
   * @throws RangeError if `origSampleRate` is not a finite, positive number.
   */
  private resampleTo16kHZ(audioData: Float32Array, origSampleRate: number): Float32Array {
    const targetSampleRate = 16000;

    // Fail with a clear message instead of an opaque "invalid typed array length"
    // (rate <= 0) or a silently empty result (NaN / Infinity).
    if (!Number.isFinite(origSampleRate) || origSampleRate <= 0) {
      throw new RangeError(`Invalid source sample rate: ${origSampleRate}`);
    }
    if (audioData.length === 0) {
      return new Float32Array(0);
    }
    // Already at the target rate: return a copy so callers still get a fresh array.
    if (origSampleRate === targetSampleRate) {
      return audioData.slice();
    }

    const ratio = origSampleRate / targetSampleRate;
    const newLength = Math.round(audioData.length / ratio);
    const result = new Float32Array(newLength);
    const lastIndex = audioData.length - 1;

    for (let i = 0; i < newLength; i++) {
      const pos = i * ratio;
      const leftPos = Math.min(Math.floor(pos), lastIndex);
      // Clamp the right neighbour to the final sample. Falling back to 0 when it
      // lands past the end would blend the last output sample toward silence.
      const rightPos = Math.min(leftPos + 1, lastIndex);
      const weight = Math.min(pos - leftPos, 1);
      const left = audioData[leftPos] ?? 0;
      const right = audioData[rightPos] ?? 0;
      result[i] = left * (1 - weight) + right * weight;
    }
    return result;
  }
  /**
   * Stops the transcription session.
   *
   * If the WebSocket is still open, the literal text frame `END_OF_AUDIO` is
   * sent first so the server can finalise the last segment; `cleanup()` then
   * runs unconditionally.
   */
  stopTranscription() {
    const ws = this.socket;
    // Only an open socket can accept the end-of-stream marker.
    if (ws && ws.readyState === WebSocket.OPEN) {
      ws.send('END_OF_AUDIO');
    }
    this.cleanup();
  }