From 65e8426cf7d6b9038810fb9bab126cf2ae6ed777 Mon Sep 17 00:00:00 2001
From: kcar <kcar@kevlarai.com>
Date: Wed, 27 May 2026 15:51:33 +0000
Subject: [PATCH] fix: tune WhisperLive buffering and CUDA detection

---
 whisper_live/server.py | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

diff --git a/whisper_live/server.py b/whisper_live/server.py
index f9bb594..af691f8 100644
--- a/whisper_live/server.py
+++ b/whisper_live/server.py
@@ -427,7 +427,7 @@ class ServeClientBase(object):
         self.show_prev_out_thresh = 5   # if pause(no output from whisper) show previous output for 5 seconds
         self.add_pause_thresh = 3       # add a blank to segment list as a pause(no speech) for 3 seconds
         self.transcript = []
-        self.send_last_n_segments = 10
+        self.send_last_n_segments = 30
 
         # text formatting
         self.pick_previous_segments = 2
@@ -461,9 +461,9 @@ class ServeClientBase(object):
 
         """
         self.lock.acquire()
-        if self.frames_np is not None and self.frames_np.shape[0] > 45*self.RATE:
-            self.frames_offset += 30.0
-            self.frames_np = self.frames_np[int(30*self.RATE):]
+        if self.frames_np is not None and self.frames_np.shape[0] > 90*self.RATE:
+            self.frames_offset += 60.0
+            self.frames_np = self.frames_np[int(60*self.RATE):]
             # check timestamp offset(should be >= self.frame_offset)
             # this basically means that there is no speech as timestamp offset hasnt updated
             # and is less than frame_offset
@@ -482,7 +482,7 @@ class ServeClientBase(object):
         no valid segment for the last 30 seconds from whisper
         """
         with self.lock:
-            if self.frames_np[int((self.timestamp_offset - self.frames_offset)*self.RATE):].shape[0] > 25 * self.RATE:
+            if self.frames_np[int((self.timestamp_offset - self.frames_offset)*self.RATE):].shape[0] > 60 * self.RATE:
                 duration = self.frames_np.shape[0] / self.RATE
                 self.timestamp_offset = self.frames_offset + duration - 5
 
@@ -807,10 +807,19 @@ class ServeClientFasterWhisper(ServeClientBase):
         self.same_output_threshold = 10
         self.end_time_for_same_output = None
 
-        device = "cuda" if torch.cuda.is_available() else "cpu"
+        # torch.cuda.is_available() can return False when torch was built against a
+        # newer CUDA than the installed driver supports. Probe CUDA via ctranslate2
+        # instead, since faster_whisper runs inference on ctranslate2, not torch.
+        try:
+            import ctranslate2 as _ct2
+            _cuda_types = _ct2.get_supported_compute_types("cuda")
+            device = "cuda" if _cuda_types else "cpu"
+        except Exception:
+            device = "cpu"
+
         if device == "cuda":
-            major, _ = torch.cuda.get_device_capability(device)
-            self.compute_type = "float16" if major >= 7 else "float32"
+            # Deliberately int8 on CUDA too (same as the CPU branch) to stay within the shared GPU memory budget (GPU 1 is shared with TTS/ComfyUI)
+            self.compute_type = "int8"
         else:
             self.compute_type = "int8"