fix: tune WhisperLive buffering and CUDA detection

2026-05-27 15:51:33 +00:00 · 2026-05-27 15:51:33 +00:00 · 65e8426cf7
commit 65e8426cf7
parent 83b09b7a4a
1 changed file with 17 additions and 8 deletions
--- a/whisper_live/server.py
+++ b/whisper_live/server.py
@@ -427,7 +427,7 @@ class ServeClientBase(object):
        self.show_prev_out_thresh = 5   # if pause(no output from whisper) show previous output for 5 seconds
        self.add_pause_thresh = 3       # add a blank to segment list as a pause(no speech) for 3 seconds
        self.transcript = []
-        self.send_last_n_segments = 10
+        self.send_last_n_segments = 30

        # text formatting
        self.pick_previous_segments = 2
@@ -461,9 +461,9 @@ class ServeClientBase(object):

        """
        self.lock.acquire()
-        if self.frames_np is not None and self.frames_np.shape[0] > 45*self.RATE:
-            self.frames_offset += 30.0
-            self.frames_np = self.frames_np[int(30*self.RATE):]
+        if self.frames_np is not None and self.frames_np.shape[0] > 90*self.RATE:
+            self.frames_offset += 60.0
+            self.frames_np = self.frames_np[int(60*self.RATE):]
            # check timestamp offset(should be >= self.frame_offset)
            # this basically means that there is no speech as timestamp offset hasnt updated
            # and is less than frame_offset
@@ -482,7 +482,7 @@ class ServeClientBase(object):
        no valid segment for the last 30 seconds from whisper
        """
        with self.lock:
-            if self.frames_np[int((self.timestamp_offset - self.frames_offset)*self.RATE):].shape[0] > 25 * self.RATE:
+            if self.frames_np[int((self.timestamp_offset - self.frames_offset)*self.RATE):].shape[0] > 60 * self.RATE:
                duration = self.frames_np.shape[0] / self.RATE
                self.timestamp_offset = self.frames_offset + duration - 5

@@ -807,10 +807,19 @@ class ServeClientFasterWhisper(ServeClientBase):
        self.same_output_threshold = 10
        self.end_time_for_same_output = None

-        device = "cuda" if torch.cuda.is_available() else "cpu"
+        # torch.cuda.is_available() fails when torch was compiled against a newer CUDA
+        # than the driver provides. Use ctranslate2's own CUDA probe instead, since
+        # faster_whisper relies on ctranslate2 — not torch — for inference.
+        try:
+            import ctranslate2 as _ct2
+            _cuda_types = _ct2.get_supported_compute_types("cuda")
+            device = "cuda" if _cuda_types else "cpu"
+        except Exception:
+            device = "cpu"
+
        if device == "cuda":
-            major, _ = torch.cuda.get_device_capability(device)
-            self.compute_type = "float16" if major >= 7 else "float32"
+            # Use int8 to stay within shared GPU memory budget (GPU 1 is shared with TTS/ComfyUI)
+            self.compute_type = "int8"
        else:
            self.compute_type = "int8"