From 4139eb8fd3e06cbe9ed4e0cb797ac7c094bb73d8 Mon Sep 17 00:00:00 2001
From: kcar <kcar@kevlarai.com>
Date: Sat, 23 May 2026 07:26:20 +0000
Subject: [PATCH] =?UTF-8?q?fix:=20audio=20pipeline=20=E2=80=94=2016=20kHz?=
 =?UTF-8?q?=20AudioContext,=204096-sample=20buffering,=20SERVER=5FREADY=20?=
 =?UTF-8?q?handshake?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Root causes of disconnection and slow transcription:
- The AudioWorklet posted a message on every 128-sample callback at the
  native rate (~48 kHz), i.e. ~375 tiny WebSocket messages/sec. The
  server was flooded with tiny frames during silence → keepalive ping
  timed out → connection dropped.
- JS resampling 48 kHz → 16 kHz added CPU overhead on every chunk.
- Audio started on ws.onopen before server sent SERVER_READY, so early
  frames were dropped.

Fixes:
- audioWorklet.js: accumulate 4096 samples before posting (256 ms/chunk
  at 16 kHz, ~4 messages/sec), transfer ArrayBuffer zero-copy.
- transcriptionService: AudioContext({ sampleRate: 16000 }) — browser
  handles native resampling, no JS resampling needed. Remove
  resampleTo16kHZ entirely.
- Wait for SERVER_READY message before calling setupAudioProcessing().
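- Send 'END_OF_AUDIO' string on stop so server can finalise last segment.
- Default model sent in the config message changes from 'small' to
  'base' when config.modelSize is not set.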

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 public/audioWorklet.js                        | 20 ++++++-
 .../cc-transcription/transcriptionService.tsx | 60 +++++++++----------
 2 files changed, 44 insertions(+), 36 deletions(-)

diff --git a/public/audioWorklet.js b/public/audioWorklet.js
index f42b160..345e901 100644
--- a/public/audioWorklet.js
+++ b/public/audioWorklet.js
@@ -1,12 +1,26 @@
 class AudioProcessor extends AudioWorkletProcessor {
+  constructor() {
+    super();
+    this._buffer = new Float32Array(4096);
+    this._bufferIndex = 0;
+  }
+
   process(inputs) {
     const input = inputs[0];
     if (input.length > 0) {
-      const audioData = input[0];
-      this.port.postMessage(audioData);
+      const samples = input[0];
+      for (let i = 0; i < samples.length; i++) {
+        this._buffer[this._bufferIndex++] = samples[i];
+        if (this._bufferIndex >= 4096) {
+          // Transfer ownership (zero-copy) to main thread
+          this.port.postMessage(this._buffer.buffer, [this._buffer.buffer]);
+          this._buffer = new Float32Array(4096);
+          this._bufferIndex = 0;
+        }
+      }
     }
     return true;
   }
 }
 
-registerProcessor('audio-processor', AudioProcessor); 
\ No newline at end of file
+registerProcessor('audio-processor', AudioProcessor);
diff --git a/src/utils/tldraw/cc-base/cc-transcription/transcriptionService.tsx b/src/utils/tldraw/cc-base/cc-transcription/transcriptionService.tsx
index a46d78e..edfe71f 100644
--- a/src/utils/tldraw/cc-base/cc-transcription/transcriptionService.tsx
+++ b/src/utils/tldraw/cc-base/cc-transcription/transcriptionService.tsx
@@ -33,7 +33,7 @@ export class TranscriptionService {
 
       // Call getUserMedia directly — this triggers the browser permission prompt.
       // The old code called enumerateDevices() first to find a device ID, but
-      // without microphone permission deviceId is always "" (empty string, falsy),
+      // without microphone permission deviceId is always "" (empty string, falsy),
       // causing an early return that never prompted the user for permission.
       const audioConstraints: MediaTrackConstraints = this.selectedDeviceId
         ? { deviceId: { exact: this.selectedDeviceId } }
@@ -59,18 +59,15 @@ export class TranscriptionService {
       ws.onopen = () => {
         clearTimeout(connectionTimeout);
         logger.info('transcription-service', '✅ WebSocket connected');
-        
-        // Send initial configuration message
-        const message = JSON.stringify({
+
+        // Send initial configuration — audio capture starts only after SERVER_READY.
+        ws.send(JSON.stringify({
           uid: uuid,
           language: config.language || 'en',
           task: config.task || 'transcribe',
-          model: config.modelSize || 'small',
+          model: config.modelSize || 'base',
           use_vad: config.useVad ?? true,
-        });
-        
-        ws.send(message);
-        this.setupAudioProcessing();
+        }));
       };
 
       ws.onerror = (error) => {
@@ -88,6 +85,13 @@ export class TranscriptionService {
           return;
         }
 
+        if (data.message === 'SERVER_READY') {
+          // Server is ready — now safe to start streaming audio.
+          logger.info('transcription-service', '🟢 Server ready, starting audio capture');
+          this.setupAudioProcessing();
+          return;
+        }
+
         if (data.status === 'WAIT') {
           logger.info('transcription-service', `⏳ Wait time: ${Math.round(data.message)} minutes`);
           this.cleanup();
@@ -105,7 +109,7 @@ export class TranscriptionService {
           const lastIdx = segments.length - 1;
 
           // Only emit segments we have not finalized yet — avoids re-processing the
-          // full array on every message (which caused the "stuck last segment" bug).
+          // full array on every message (which caused the "stuck last segment" bug).
           for (let i = this.finalizedSegmentCount; i < lastIdx; i++) {
             const seg = segments[i];
             this.onTranscriptionUpdate(seg.text, true, {
@@ -135,19 +139,21 @@ export class TranscriptionService {
     }
 
     try {
-      this.audioContext = new AudioContext();
-      
-      // Load and register the audio worklet
+      // Request 16 kHz from the browser — it resamples natively so we send
+      // the correct rate to the server without any JS resampling overhead.
+      this.audioContext = new AudioContext({ sampleRate: 16000 });
+
       await this.audioContext.audioWorklet.addModule('/audioWorklet.js');
-      
+
       this.mediaStreamSource = this.audioContext.createMediaStreamSource(this.stream);
       this.workletNode = new AudioWorkletNode(this.audioContext, 'audio-processor');
 
-      // Handle audio data from the worklet
+      // The worklet accumulates 4096 samples (256 ms at 16 kHz) before posting,
+      // matching the reference frontend chunk size and eliminating the tiny-frame
+      // flood that was overwhelming the server during silence.
       this.workletNode.port.onmessage = (event) => {
         if (this.socket?.readyState === WebSocket.OPEN) {
-          const resampledData = this.resampleTo16kHZ(event.data, this.audioContext!.sampleRate);
-          this.socket.send(resampledData);
+          this.socket.send(event.data); // event.data is a transferred ArrayBuffer
         }
       };
 
@@ -158,23 +164,11 @@ export class TranscriptionService {
     }
   }
 
-  private resampleTo16kHZ(audioData: Float32Array, origSampleRate: number): Float32Array {
-    const ratio = origSampleRate / 16000;
-    const newLength = Math.round(audioData.length / ratio);
-    const result = new Float32Array(newLength);
-    
-    for (let i = 0; i < newLength; i++) {
-      const pos = i * ratio;
-      const leftPos = Math.floor(pos);
-      const rightPos = Math.ceil(pos);
-      const weight = pos - leftPos;
-      result[i] = audioData[leftPos] * (1 - weight) + (audioData[rightPos] || 0) * weight;
-    }
-    
-    return result;
-  }
-
   stopTranscription() {
+    // Signal the server cleanly so it can finalise the last segment.
+    if (this.socket?.readyState === WebSocket.OPEN) {
+      this.socket.send('END_OF_AUDIO');
+    }
     this.cleanup();
   }