From 65e8426cf7d6b9038810fb9bab126cf2ae6ed777 Mon Sep 17 00:00:00 2001 From: kcar Date: Wed, 27 May 2026 15:51:33 +0000 Subject: [PATCH] fix: tune WhisperLive buffering and CUDA detection --- whisper_live/server.py | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/whisper_live/server.py b/whisper_live/server.py index f9bb594..af691f8 100644 --- a/whisper_live/server.py +++ b/whisper_live/server.py @@ -427,7 +427,7 @@ class ServeClientBase(object): self.show_prev_out_thresh = 5 # if pause(no output from whisper) show previous output for 5 seconds self.add_pause_thresh = 3 # add a blank to segment list as a pause(no speech) for 3 seconds self.transcript = [] - self.send_last_n_segments = 10 + self.send_last_n_segments = 30 # text formatting self.pick_previous_segments = 2 @@ -461,9 +461,9 @@ class ServeClientBase(object): """ self.lock.acquire() - if self.frames_np is not None and self.frames_np.shape[0] > 45*self.RATE: - self.frames_offset += 30.0 - self.frames_np = self.frames_np[int(30*self.RATE):] + if self.frames_np is not None and self.frames_np.shape[0] > 90*self.RATE: + self.frames_offset += 60.0 + self.frames_np = self.frames_np[int(60*self.RATE):] # check timestamp offset(should be >= self.frame_offset) # this basically means that there is no speech as timestamp offset hasnt updated # and is less than frame_offset @@ -482,7 +482,7 @@ class ServeClientBase(object): no valid segment for the last 30 seconds from whisper """ with self.lock: - if self.frames_np[int((self.timestamp_offset - self.frames_offset)*self.RATE):].shape[0] > 25 * self.RATE: + if self.frames_np[int((self.timestamp_offset - self.frames_offset)*self.RATE):].shape[0] > 60 * self.RATE: duration = self.frames_np.shape[0] / self.RATE self.timestamp_offset = self.frames_offset + duration - 5 @@ -807,10 +807,19 @@ class ServeClientFasterWhisper(ServeClientBase): self.same_output_threshold = 10 self.end_time_for_same_output = None - device = "cuda" if 
torch.cuda.is_available() else "cpu" + # torch.cuda.is_available() returns False when torch was compiled against a newer CUDA + # than the driver provides. Use ctranslate2's own CUDA probe instead, since + # faster_whisper relies on ctranslate2 — not torch — for inference. + try: + import ctranslate2 as _ct2 + _cuda_types = _ct2.get_supported_compute_types("cuda") + device = "cuda" if _cuda_types else "cpu" + except Exception: + device = "cpu" + if device == "cuda": - major, _ = torch.cuda.get_device_capability(device) - self.compute_type = "float16" if major >= 7 else "float32" + # Use int8 to stay within shared GPU memory budget (GPU 1 is shared with TTS/ComfyUI) + self.compute_type = "int8" else: self.compute_type = "int8"