# WhisperLive-Server/hybrid_server.py
#
# 1230 lines
# 48 KiB
# Python

import argparse
import ssl
import os
import socket
import threading
import tempfile
from pathlib import Path
from flask import Flask, request, jsonify, send_file, Response
from flask_sock import Sock
from werkzeug.utils import secure_filename
import websocket as ws_client
import json
import logging
def format_time_srt(s):
    """Format a time offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    The offset is first rounded to a whole number of milliseconds so that
    binary floating-point error cannot drop a millisecond (for example
    ``1.001`` must render as ``,001``, not ``,000``). Negative offsets are
    clamped to zero because a subtitle timestamp cannot be negative.
    """
    total_ms = max(0, int(round(float(s) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
def format_time_vtt(s):
    """Format a time offset in seconds as a WebVTT timestamp (``HH:MM:SS.mmm``).

    The offset is first rounded to a whole number of milliseconds so that
    binary floating-point error cannot drop a millisecond (for example
    ``1.001`` must render as ``.001``, not ``.000``). Negative offsets are
    clamped to zero because a cue timestamp cannot be negative.
    """
    total_ms = max(0, int(round(float(s) * 1000)))
    hours, remainder = divmod(total_ms, 3_600_000)
    minutes, remainder = divmod(remainder, 60_000)
    seconds, milliseconds = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{seconds:02}.{milliseconds:03}"
def generate_srt(segments):
    """Render transcript segments as SubRip (SRT) subtitle text.

    ``segments`` is an iterable of mappings with ``'start'``, ``'end'`` and
    ``'text'`` keys. Cues are numbered from 1, and each cue is followed by a
    blank line. An empty iterable yields an empty string.
    """
    cues = []
    for index, seg in enumerate(segments, start=1):
        begin = format_time_srt(float(seg['start']))
        finish = format_time_srt(float(seg['end']))
        body = seg['text'].strip()
        cues.append(f"{index}\n{begin} --> {finish}\n{body}\n\n")
    return "".join(cues)
def generate_vtt(segments):
    """Render transcript segments as WebVTT subtitle text.

    ``segments`` is an iterable of mappings with ``'start'``, ``'end'`` and
    ``'text'`` keys. The output always begins with the ``WEBVTT`` header,
    followed by one unnumbered cue per segment.
    """
    cues = ["WEBVTT\n\n"]
    for seg in segments:
        begin = format_time_vtt(float(seg['start']))
        finish = format_time_vtt(float(seg['end']))
        body = seg['text'].strip()
        cues.append(f"{begin} --> {finish}\n{body}\n\n")
    return "".join(cues)
# Configure logging: set up the root logger at INFO level and create the
# module-level logger used for diagnostics throughout this file.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
def check_port_availability(port):
    """Return True when nothing on this machine accepts TCP connections on ``port``.

    The check attempts a loopback connection: a successful connect means some
    process is already listening, so the port is reported as unavailable.

    The probe targets 127.0.0.1 rather than 0.0.0.0 because 0.0.0.0 is a bind
    wildcard, not a portable connect destination. The socket is managed by a
    ``with`` block so it is closed even if the probe raises, and a short
    timeout keeps a filtered port from stalling start-up.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        probe.settimeout(1.0)
        # connect_ex returns 0 on success and an errno value otherwise.
        return probe.connect_ex(('127.0.0.1', port)) != 0
class HybridWhisperServer:
    """Flask HTTP front end combined with a WhisperLive WebSocket server."""

    def __init__(self, websocket_port, http_port, backend="faster_whisper",
                 faster_whisper_custom_model_path=None, whisper_tensorrt_path=None,
                 trt_multilingual=False, single_model=True, ssl_context=None):
        """Store the configuration, build the Flask app and load the models.

        The constructor registers all HTTP/WebSocket routes, instantiates the
        WhisperLive ``TranscriptionServer`` and, for the ``faster_whisper``
        backend, loads one ``WhisperModel`` that is shared by every HTTP
        transcription request.
        """
        # Plain configuration, kept verbatim for the run_* methods.
        self.websocket_port = websocket_port
        self.http_port = http_port
        self.backend = backend
        self.faster_whisper_custom_model_path = faster_whisper_custom_model_path
        self.whisper_tensorrt_path = whisper_tensorrt_path
        self.trt_multilingual = trt_multilingual
        self.single_model = single_model
        self.ssl_context = ssl_context

        # Flask application with a 100MB upload cap and WebSocket support.
        flask_app = Flask(__name__)
        flask_app.config['MAX_CONTENT_LENGTH'] = 100 * 1024 * 1024
        self.app = flask_app
        self.sock = Sock(flask_app)
        self.setup_routes()

        # WhisperLive is imported here rather than at module import time.
        from whisper_live.server import TranscriptionServer
        self.whisper_server = TranscriptionServer()

        # Shared transcriber for HTTP requests; stays None for other backends.
        self.shared_transcriber = None
        if self.backend == "faster_whisper":
            from whisper_live.transcriber import WhisperModel
            # A custom model path wins; otherwise fall back to the "base" model.
            model_size = self.faster_whisper_custom_model_path or "base"
            self.shared_transcriber = WhisperModel(model_size)
def setup_routes(self):
    """Register the HTTP and WebSocket route handlers on the Flask app."""

    @self.app.route('/health', methods=['GET'])
    def health_check():
        """Report that the service is up."""
        payload = {'status': 'healthy', 'service': 'WhisperLive Hybrid Server'}
        return jsonify(payload)
@self.app.route('/', methods=['GET'])
def serve_test_form():
    """Serve the single-page HTML dashboard used to try the service.

    The page offers three tabs: file upload (posts to the OpenAI-compatible
    endpoints), live microphone transcription over the WebSocket bridge, and
    copy-paste API usage snippets.

    Returns:
        A ``(body, status, headers)`` tuple with the HTML document, status
        200 and a ``text/html`` content type.
    """
    # The template is a RAW string on purpose. The page contains backslash
    # sequences meant for the browser (the cURL line continuations and the
    # escape sequences inside the JavaScript template literals). In a normal
    # Python string those are consumed before the page is sent, which joined
    # the cURL example onto one line and dropped its backslashes.
    #
    # Client-side fixes carried by this template:
    #   * transcript and error text is HTML-escaped before insertion;
    #   * stopRecording() keeps its own reference to the socket, so the
    #     delayed close no longer runs against a variable already set to null;
    #   * switchTab() receives the clicked button instead of relying on the
    #     implicit global `event`;
    #   * microphone failures are caught and reported.
    html_content = r"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>WhisperLive Dashboard</title>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
<style>
:root {
--primary: #4f46e5;
--primary-hover: #4338ca;
--bg-color: #0f172a;
--card-bg: rgba(30, 41, 59, 0.7);
--text-main: #f8fafc;
--text-muted: #94a3b8;
--border: rgba(255, 255, 255, 0.1);
--success: #10b981;
--danger: #ef4444;
--warning: #f59e0b;
}
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: 'Inter', sans-serif;
background-color: var(--bg-color);
color: var(--text-main);
min-height: 100vh;
background-image:
radial-gradient(at 0% 0%, rgba(79, 70, 229, 0.15) 0px, transparent 50%),
radial-gradient(at 100% 100%, rgba(16, 185, 129, 0.1) 0px, transparent 50%);
background-attachment: fixed;
padding: 2rem;
}
.container {
max-width: 1000px;
margin: 0 auto;
}
.header {
text-align: center;
margin-bottom: 2rem;
}
.header h1 {
font-size: 2.5rem;
font-weight: 700;
background: linear-gradient(to right, #818cf8, #34d399);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 0.5rem;
}
.header p {
color: var(--text-muted);
}
.glass-panel {
background: var(--card-bg);
backdrop-filter: blur(12px);
-webkit-backdrop-filter: blur(12px);
border: 1px solid var(--border);
border-radius: 1rem;
padding: 1.5rem;
margin-bottom: 1.5rem;
box-shadow: 0 10px 25px -5px rgba(0, 0, 0, 0.3);
}
/* Config Section */
.config-grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 1rem;
}
@media (max-width: 768px) {
.config-grid {
grid-template-columns: 1fr;
}
}
.form-group {
margin-bottom: 1rem;
}
.form-group label {
display: block;
font-size: 0.875rem;
font-weight: 500;
margin-bottom: 0.5rem;
color: var(--text-muted);
}
input[type="text"],
input[type="file"],
select {
width: 100%;
padding: 0.75rem 1rem;
background: rgba(15, 23, 42, 0.6);
border: 1px solid var(--border);
border-radius: 0.5rem;
color: var(--text-main);
font-size: 0.875rem;
transition: all 0.2s;
}
input[type="text"]:focus,
select:focus {
outline: none;
border-color: var(--primary);
box-shadow: 0 0 0 2px rgba(79, 70, 229, 0.2);
}
/* Tabs */
.tabs {
display: flex;
gap: 0.5rem;
margin-bottom: 1rem;
border-bottom: 1px solid var(--border);
padding-bottom: 0.5rem;
}
.tab-btn {
background: transparent;
border: none;
color: var(--text-muted);
padding: 0.75rem 1.5rem;
font-size: 1rem;
font-weight: 500;
cursor: pointer;
border-radius: 0.5rem;
transition: all 0.2s;
}
.tab-btn:hover {
color: var(--text-main);
background: rgba(255, 255, 255, 0.05);
}
.tab-btn.active {
color: var(--text-main);
background: var(--primary);
box-shadow: 0 4px 6px -1px rgba(79, 70, 229, 0.4);
}
.tab-content {
display: none;
animation: fadeIn 0.3s ease-in-out;
}
.tab-content.active {
display: block;
}
@keyframes fadeIn {
from {
opacity: 0;
transform: translateY(5px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
/* Buttons */
.btn {
background: var(--primary);
color: white;
border: none;
padding: 0.75rem 1.5rem;
border-radius: 0.5rem;
font-weight: 600;
cursor: pointer;
transition: all 0.2s;
display: inline-flex;
align-items: center;
justify-content: center;
gap: 0.5rem;
width: 100%;
}
.btn:hover {
background: var(--primary-hover);
}
.btn:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.btn-danger {
background: var(--danger);
}
.btn-danger:hover {
background: #dc2626;
}
.btn-success {
background: var(--success);
}
.btn-success:hover {
background: #059669;
}
/* Results / Live View */
.transcript-box {
background: rgba(15, 23, 42, 0.6);
border: 1px solid var(--border);
border-radius: 0.5rem;
padding: 1.5rem;
min-height: 200px;
max-height: 400px;
overflow-y: auto;
margin-top: 1rem;
line-height: 1.6;
}
.segment {
margin-bottom: 0.75rem;
padding-bottom: 0.75rem;
border-bottom: 1px solid rgba(255, 255, 255, 0.05);
}
.segment:last-child {
border-bottom: none;
margin-bottom: 0;
padding-bottom: 0;
}
.segment-time {
font-size: 0.75rem;
color: var(--primary);
font-weight: 600;
margin-bottom: 0.25rem;
}
.status-badge {
display: inline-flex;
align-items: center;
gap: 0.3rem;
padding: 0.25rem 0.75rem;
border-radius: 9999px;
font-size: 0.75rem;
font-weight: 600;
}
.status-offline {
background: rgba(239, 68, 68, 0.2);
color: #fca5a5;
}
.status-online {
background: rgba(16, 185, 129, 0.2);
color: #6ee7b7;
}
.status-recording {
background: rgba(239, 68, 68, 0.2);
color: #fca5a5;
animation: pulse 2s infinite;
}
@keyframes pulse {
0% {
box-shadow: 0 0 0 0 rgba(239, 68, 68, 0.4);
}
70% {
box-shadow: 0 0 0 10px rgba(239, 68, 68, 0);
}
100% {
box-shadow: 0 0 0 0 rgba(239, 68, 68, 0);
}
}
/* Code snippets */
pre {
background: #1e293b;
padding: 1rem;
border-radius: 0.5rem;
overflow-x: auto;
font-size: 0.875rem;
color: #e2e8f0;
border: 1px solid var(--border);
margin-bottom: 1rem;
}
code {
font-family: 'Courier New', Courier, monospace;
}
.loading-spinner {
display: none;
width: 24px;
height: 24px;
border: 3px solid rgba(255, 255, 255, 0.3);
border-radius: 50%;
border-top-color: white;
animation: spin 1s ease-in-out infinite;
}
@keyframes spin {
to {
transform: rotate(360deg);
}
}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>WhisperLive</h1>
<p>High-Performance Real-Time Audio Transcription</p>
</div>
<!-- Configuration Panel -->
<div class="glass-panel">
<h3 style="margin-bottom: 1rem; font-size: 1.1rem;">Connection Settings</h3>
<div class="config-grid">
<div class="form-group">
<label>HTTP API URL (For File Upload &amp; API)</label>
<input type="text" id="httpUrl" value="https://whisperlive.classroomcopilot.ai">
</div>
<div class="form-group">
<label>WebSocket URL (For Live Audio)</label>
<input type="text" id="wsUrl" value="wss://whisperlive.classroomcopilot.ai/ws">
</div>
</div>
<div style="margin-top: 0.5rem; font-size: 0.8rem; color: var(--text-muted);">
HTTP Status: <span id="httpStatus" class="status-badge status-offline">Checking...</span>
</div>
</div>
<!-- Main Workspace -->
<div class="glass-panel">
<div class="tabs">
<button class="tab-btn active" onclick="switchTab('file-tab', this)">File Upload</button>
<button class="tab-btn" onclick="switchTab('live-tab', this)">Live Microphone</button>
<button class="tab-btn" onclick="switchTab('api-tab', this)">API Usage</button>
</div>
<!-- Tab 1: File Upload -->
<div id="file-tab" class="tab-content active">
<form id="fileForm">
<div class="form-group">
<label>Audio File</label>
<input type="file" id="audioFile" accept=".wav,.mp3,.flac,.m4a,.ogg,.webm,.opus" required>
</div>
<div class="config-grid">
<div class="form-group">
<label>Language</label>
<select id="fileLanguage">
<option value="">Auto-detect</option>
<option value="en">English</option>
<option value="es">Spanish</option>
<option value="fr">French</option>
</select>
</div>
<div class="form-group">
<label>Task</label>
<select id="fileTask">
<option value="transcribe">Transcribe</option>
<option value="translate">Translate to English</option>
</select>
</div>
</div>
<button type="submit" class="btn" id="fileSubmitBtn">
<span>Transcribe File</span>
<div class="loading-spinner" id="fileSpinner"></div>
</button>
</form>
<div id="fileResult" style="display: none;">
<div class="transcript-box" id="fileTranscript"></div>
</div>
</div>
<!-- Tab 2: Live Recording -->
<div id="live-tab" class="tab-content">
<div class="config-grid" style="margin-bottom: 1.5rem;">
<div class="form-group">
<label>Language</label>
<select id="liveLanguage">
<option value="en">English</option>
<option value="es">Spanish</option>
<option value="fr">French</option>
</select>
</div>
<div class="form-group">
<label>Task</label>
<select id="liveTask">
<option value="transcribe">Transcribe</option>
<option value="translate">Translate to English</option>
</select>
</div>
</div>
<div style="display: flex; gap: 1rem; align-items: center;">
<button id="recordBtn" class="btn btn-success" style="width: auto;">
<span id="recordIcon">🎤</span> <span id="recordText">Start Recording</span>
</button>
<span id="liveStatus" class="status-badge status-offline" style="display: none;">Not
connected</span>
</div>
<div class="transcript-box" id="liveTranscript">
<div style="color: var(--text-muted); text-align: center; margin-top: 3rem;">
Click Start Recording to begin live transcription...
</div>
</div>
</div>
<!-- Tab 3: API Usage -->
<div id="api-tab" class="tab-content">
<h3 style="margin-bottom: 1rem;">OpenAI Compatible API</h3>
<p style="color: var(--text-muted); margin-bottom: 1rem; font-size: 0.9rem;">
WhisperLive acts as a drop-in replacement for OpenAI's Whisper API. You can use any standard OpenAI
client by changing the base URL.
</p>
<h4 style="margin-bottom: 0.5rem; color: #cbd5e1;">Python (openai package)</h4>
<pre><code id="pythonSnippet">from openai import OpenAI
client = OpenAI(
api_key="sk-no-key-required",
base_url="https://whisperlive.classroomcopilot.ai/v1/"
)
with open("audio.wav", "rb") as file:
transcription = client.audio.transcriptions.create(
file=file,
model="base",
response_format="verbose_json"
)
print(transcription.text)</code></pre>
<h4 style="margin-bottom: 0.5rem; color: #cbd5e1;">cURL</h4>
<pre><code id="curlSnippet">curl https://whisperlive.classroomcopilot.ai/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@audio.wav" \
-F model="base" \
-F response_format="verbose_json"</code></pre>
</div>
</div>
</div>
<script>
// DOM Elements
const httpUrlInput = document.getElementById('httpUrl');
const wsUrlInput = document.getElementById('wsUrl');
const httpStatus = document.getElementById('httpStatus');
// Initialization
window.onload = () => {
// Check if on same domain to set default URL intelligently, else leave defaults
if (window.location.hostname !== '' && window.location.hostname !== 'localhost') {
httpUrlInput.value = window.location.origin;
wsUrlInput.value = window.location.origin.replace(/^http/, 'ws') + '/ws';
}
checkHealth();
updateSnippets();
};
httpUrlInput.addEventListener('change', () => { checkHealth(); updateSnippets(); });
// Tab Switching (the clicked button is passed in explicitly)
function switchTab(tabId, button) {
document.querySelectorAll('.tab-content').forEach(t => t.classList.remove('active'));
document.querySelectorAll('.tab-btn').forEach(b => b.classList.remove('active'));
document.getElementById(tabId).classList.add('active');
if (button) button.classList.add('active');
}
// Health Check
async function checkHealth() {
try {
const res = await fetch(`${httpUrlInput.value}/health`);
if (res.ok) {
httpStatus.className = 'status-badge status-online';
httpStatus.textContent = '✅ Online';
} else throw new Error();
} catch (e) {
httpStatus.className = 'status-badge status-offline';
httpStatus.textContent = '❌ Offline';
}
}
// Update Code Snippets
function updateSnippets() {
const baseUrl = httpUrlInput.value.endsWith('/') ? httpUrlInput.value.slice(0, -1) : httpUrlInput.value;
document.getElementById('pythonSnippet').textContent = `from openai import OpenAI\n\nclient = OpenAI(\n api_key="sk-no-key-required",\n base_url="${baseUrl}/v1/"\n)\n\nwith open("audio.wav", "rb") as file:\n transcription = client.audio.transcriptions.create(\n file=file,\n model="base",\n response_format="verbose_json"\n )\n \nprint(transcription.text)`;
document.getElementById('curlSnippet').textContent = `curl ${baseUrl}/v1/audio/transcriptions \\\n -H "Content-Type: multipart/form-data" \\\n -F file="@audio.wav" \\\n -F model="base" \\\n -F response_format="verbose_json"`;
}
// Utility: Format Time
function formatTime(seconds) {
if (!seconds) return "0:00";
const mins = Math.floor(seconds / 60);
const secs = (seconds % 60).toFixed(2);
return `${mins}:${secs.padStart(5, '0')}`;
}
// Utility: escape text before placing it inside an HTML string
function escapeHtml(value) {
const holder = document.createElement('div');
holder.textContent = (value === undefined || value === null) ? '' : String(value);
return holder.innerHTML;
}
// ==========================================
// FEATURE 1: FILE TRANSCRIPTION
// ==========================================
document.getElementById('fileForm').addEventListener('submit', async (e) => {
e.preventDefault();
const file = document.getElementById('audioFile').files[0];
if (!file) return;
const btn = document.getElementById('fileSubmitBtn');
const spinner = document.getElementById('fileSpinner');
const resultBox = document.getElementById('fileResult');
const transcriptBox = document.getElementById('fileTranscript');
btn.disabled = true;
spinner.style.display = 'block';
resultBox.style.display = 'none';
const formData = new FormData();
formData.append('file', file);
formData.append('model', 'base');
formData.append('response_format', 'verbose_json');
const lang = document.getElementById('fileLanguage').value;
if (lang) formData.append('language', lang);
const task = document.getElementById('fileTask').value;
const baseUrl = httpUrlInput.value.endsWith('/') ? httpUrlInput.value.slice(0, -1) : httpUrlInput.value;
const endpoint = task === 'translate' ? `${baseUrl}/v1/audio/translations` : `${baseUrl}/v1/audio/transcriptions`;
try {
const response = await fetch(endpoint, { method: 'POST', body: formData });
const data = await response.json();
resultBox.style.display = 'block';
if (response.ok) {
let html = '';
if (data.segments && data.segments.length > 0) {
data.segments.forEach(seg => {
html += `<div class="segment"><div class="segment-time">${formatTime(seg.start)} - ${formatTime(seg.end)}</div><div class="segment-text">${escapeHtml(seg.text)}</div></div>`;
});
} else if (data.text) {
html += `<div class="segment"><div class="segment-text">${escapeHtml(data.text)}</div></div>`;
}
transcriptBox.innerHTML = html;
} else {
transcriptBox.innerHTML = `<div style="color: var(--danger)">Error: ${escapeHtml(data.error?.message || JSON.stringify(data.error))}</div>`;
}
} catch (error) {
resultBox.style.display = 'block';
transcriptBox.innerHTML = `<div style="color: var(--danger)">Network Error: ${escapeHtml(error.message)}</div>`;
} finally {
btn.disabled = false;
spinner.style.display = 'none';
}
});
// ==========================================
// FEATURE 2: LIVE WEBSOCKET TRANSCRIPTION
// ==========================================
let ws = null;
let audioContext = null;
let mediaStream = null;
let processor = null;
let isRecording = false;
const recordBtn = document.getElementById('recordBtn');
const liveStatus = document.getElementById('liveStatus');
const liveTranscript = document.getElementById('liveTranscript');
recordBtn.addEventListener('click', async () => {
if (isRecording) {
stopRecording();
} else {
startRecording();
}
});
async function startRecording() {
liveTranscript.innerHTML = '';
liveStatus.style.display = 'inline-flex';
liveStatus.className = 'status-badge status-offline';
liveStatus.textContent = 'Connecting...';
try {
// 1. Connect WebSocket
ws = new WebSocket(wsUrlInput.value);
ws.onopen = () => {
// Send options to server
const options = {
uid: "web-" + Math.random().toString(36).substring(7),
language: document.getElementById('liveLanguage').value,
task: document.getElementById('liveTask').value,
model: "base",
use_vad: true
};
ws.send(JSON.stringify(options));
};
ws.onmessage = async (event) => {
const data = JSON.parse(event.data);
if (data.message === "SERVER_READY") {
liveStatus.className = 'status-badge status-recording';
liveStatus.innerHTML = '🔴 Recording';
try {
await startAudioCapture();
} catch (err) {
// Microphone access was denied or is unavailable.
console.error(err);
stopRecording();
liveStatus.className = 'status-badge status-offline';
liveStatus.textContent = 'Microphone Error';
}
} else if (data.segments) {
renderLiveSegments(data.segments);
} else if (data.status === "WAIT") {
liveStatus.textContent = `Waiting in queue (Est: ${data.message} min)`;
} else if (data.message === "DISCONNECT") {
stopRecording();
liveStatus.className = 'status-badge status-offline';
liveStatus.textContent = 'Disconnected by server';
}
};
ws.onerror = (err) => {
console.error('WebSocket Error', err);
stopRecording();
liveStatus.className = 'status-badge status-offline';
liveStatus.textContent = 'Connection Error';
};
ws.onclose = () => {
stopRecording();
};
// Update UI
isRecording = true;
recordBtn.className = 'btn btn-danger';
document.getElementById('recordIcon').textContent = '';
document.getElementById('recordText').textContent = 'Stop Recording';
} catch (err) {
// Only the WebSocket set-up runs inside this try block.
console.error(err);
liveStatus.className = 'status-badge status-offline';
liveStatus.textContent = 'Connection Error';
stopRecording();
}
}
async function startAudioCapture() {
mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });
audioContext = new (window.AudioContext || window.webkitAudioContext)({ sampleRate: 16000 });
const source = audioContext.createMediaStreamSource(mediaStream);
// Create a ScriptProcessorNode with bufferSize of 4096 and a single input/output channel
processor = audioContext.createScriptProcessor(4096, 1, 1);
processor.onaudioprocess = function (e) {
if (!isRecording || !ws || ws.readyState !== WebSocket.OPEN) return;
const float32Array = e.inputBuffer.getChannelData(0);
ws.send(float32Array.buffer);
};
source.connect(processor);
processor.connect(audioContext.destination);
}
function stopRecording() {
isRecording = false;
if (processor) {
processor.disconnect();
processor = null;
}
if (mediaStream) {
mediaStream.getTracks().forEach(track => track.stop());
mediaStream = null;
}
if (audioContext) {
audioContext.close();
audioContext = null;
}
if (ws) {
// Keep a local reference: `ws` is cleared below, before the delayed close runs.
const socket = ws;
if (socket.readyState === WebSocket.OPEN) {
socket.send("END_OF_AUDIO");
setTimeout(() => socket.close(), 1000);
} else if (socket.readyState === WebSocket.CONNECTING) {
// Abandon a connection that never opened, without firing the error UI.
socket.onopen = null;
socket.onmessage = null;
socket.onerror = null;
socket.onclose = null;
socket.close();
}
ws = null;
}
recordBtn.className = 'btn btn-success';
document.getElementById('recordIcon').textContent = '🎤';
document.getElementById('recordText').textContent = 'Start Recording';
if (liveStatus.textContent === '🔴 Recording') {
liveStatus.className = 'status-badge status-offline';
liveStatus.textContent = 'Stopped';
}
}
let liveSegments = [];
function renderLiveSegments(segments) {
let html = '';
segments.forEach(seg => {
const timeHtml = (seg.start !== undefined && seg.end !== undefined)
? `<div class="segment-time">${formatTime(seg.start)} - ${formatTime(seg.end)}</div>`
: '';
html += `<div class="segment">${timeHtml}<div class="segment-text">${escapeHtml(seg.text)}</div></div>`;
});
liveTranscript.innerHTML = html;
liveTranscript.scrollTop = liveTranscript.scrollHeight;
}
</script>
</body>
</html>
"""
    return html_content, 200, {'Content-Type': 'text/html'}
@self.app.route('/transcribe', methods=['POST'])
def transcribe_file():
    """Transcribe an uploaded audio file and return its segments as JSON.

    Expects multipart form data with a ``file`` part and optional
    ``language`` and ``task`` fields. The optional ``model`` field is
    accepted but ignored: the shared transcriber is always used.

    Responses:
        200 with ``success``, ``segments``, ``info`` and ``filename``;
        400 when the file is missing, unnamed or has an unsupported
        extension; 501 for backends other than ``faster_whisper``;
        500 when the transcriber is not initialised or transcription fails.
    """
    # A tuple (not a set) keeps the order in the error message stable.
    allowed_extensions = ('wav', 'mp3', 'flac', 'm4a', 'ogg', 'webm', 'opus', 'oga')
    try:
        if 'file' not in request.files:
            return jsonify({'error': 'No file provided'}), 400
        upload = request.files['file']
        if upload.filename == '':
            return jsonify({'error': 'No file selected'}), 400

        # An empty form field is treated like an omitted one (no fixed language).
        language = request.form.get('language') or None
        task = request.form.get('task', 'transcribe')  # 'transcribe' or 'translate'

        # Validate file type
        if not upload.filename.lower().endswith(tuple('.' + ext for ext in allowed_extensions)):
            return jsonify({'error': f'Unsupported file type. Allowed: {", ".join(allowed_extensions)}'}), 400

        # Reject requests that cannot be served before writing the upload to disk.
        if self.backend != "faster_whisper":
            return jsonify({'error': 'Backend not yet supported for file transcription'}), 501
        if self.shared_transcriber is None:
            return jsonify({'error': 'Transcriber not initialized'}), 500

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=Path(upload.filename).suffix)
        temp_path = temp_file.name
        try:
            # Write through the open handle and close it before transcribing,
            # so the file is complete and not held open while it is read.
            with temp_file:
                upload.save(temp_file)

            segments, info = self.shared_transcriber.transcribe(
                temp_path,
                language=language,
                task=task
            )

            # Convert segments to a serializable format. This iteration runs
            # while the temporary file still exists.
            transcript_segments = [
                {
                    'start': segment.start,
                    'end': segment.end,
                    'text': segment.text,
                    'no_speech_prob': segment.no_speech_prob
                }
                for segment in segments
            ]
            transcription_info = {
                'language': info.language,
                'language_probability': info.language_probability,
                'duration': info.duration,
                'duration_after_vad': info.duration_after_vad,
                'transcription_options': info.transcription_options
            }
            return jsonify({
                'success': True,
                'segments': transcript_segments,
                'info': transcription_info,
                'filename': upload.filename
            })
        finally:
            # Clean up the temporary file on success and on every failure path,
            # including a failed save.
            if os.path.exists(temp_path):
                os.unlink(temp_path)
    except Exception as e:
        logger.exception(f"Error transcribing file: {str(e)}")
        return jsonify({'error': f'Transcription failed: {str(e)}'}), 500
@self.app.route('/transcribe/url', methods=['POST'])
def transcribe_url():
    """Validate a URL transcription request (placeholder endpoint).

    Expects a JSON object with a ``url`` key. The URL is only validated and
    echoed back; no transcription is performed yet.

    Responses:
        200 with a placeholder message; 400 when the body is not a JSON
        object, has no ``url`` key, or the URL does not start with one of
        the accepted schemes; 500 on unexpected errors.
    """
    try:
        # silent=True yields None for a missing or malformed JSON body, so
        # that case is answered with the 400 below instead of raising here
        # and being reported as a 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict) or 'url' not in data:
            return jsonify({'error': 'No URL provided'}), 400

        url = data['url']
        # Validate URL; a non-string value cannot be a valid URL.
        if not isinstance(url, str) or not url.startswith(('http://', 'https://', 'rtsp://', 'hls://')):
            return jsonify({'error': 'Invalid URL format'}), 400

        # For now, we'll return a message that this endpoint is available
        # but the actual implementation would depend on your specific needs
        return jsonify({
            'message': 'URL transcription endpoint available',
            'url': url,
            'note': 'This endpoint is ready for implementation based on your specific requirements'
        })
    except Exception as e:
        logger.exception(f"Error processing URL transcription request: {str(e)}")
        return jsonify({'error': f'URL transcription failed: {str(e)}'}), 500
def handle_openai_audio_request(task_type):
    """Serve an OpenAI-style audio request for the given task.

    Args:
        task_type: Value passed to the transcriber as ``task``; the routes
            in this file call this with 'transcribe' or 'translate'.

    Reads the multipart ``file`` part plus the optional ``language``,
    ``prompt``, ``response_format`` and ``temperature`` form fields. The
    ``model`` field is accepted but ignored: the shared transcriber is
    always used.

    Returns:
        A Flask response whose shape depends on ``response_format``:
        ``json`` (default and fallback for unknown values), ``text``,
        ``srt``, ``vtt`` or ``verbose_json``. Errors use the nested
        ``{'error': {...}}`` envelope with status 400, 500 or 501.
    """
    allowed_extensions = ('wav', 'mp3', 'flac', 'm4a', 'ogg', 'webm', 'mp4', 'mpeg', 'mpga', 'opus', 'oga')
    try:
        if 'file' not in request.files:
            return jsonify({'error': {'message': 'No file provided', 'type': 'invalid_request_error', 'code': 'invalid_parameters'}}), 400
        upload = request.files['file']
        if upload.filename == '':
            return jsonify({'error': {'message': 'No file selected', 'type': 'invalid_request_error', 'code': 'invalid_parameters'}}), 400

        # Get OpenAI specific parameters. An empty language field is treated
        # like an omitted one.
        language = request.form.get('language') or None
        prompt = request.form.get('prompt', None)
        response_format = request.form.get('response_format', 'json')
        try:
            temperature = float(request.form.get('temperature', 0))
        except ValueError:
            temperature = 0.0

        if not upload.filename.lower().endswith(tuple('.' + ext for ext in allowed_extensions)):
            return jsonify({'error': {'message': 'Unsupported file type.', 'type': 'invalid_request_error', 'code': 'invalid_file_format'}}), 400

        # Reject requests that cannot be served before writing the upload to disk.
        if self.backend != "faster_whisper":
            return jsonify({'error': {'message': 'Backend not yet supported for file transcription', 'type': 'internal_server_error'}}), 501
        if self.shared_transcriber is None:
            return jsonify({'error': {'message': 'Transcriber not initialized', 'type': 'internal_server_error'}}), 500

        temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=Path(upload.filename).suffix)
        temp_path = temp_file.name
        try:
            # Write through the open handle and close it before transcribing.
            with temp_file:
                upload.save(temp_file)

            kwargs = {
                "language": language,
                "task": task_type,
                "temperature": temperature
            }
            if prompt:
                kwargs["initial_prompt"] = prompt
            segments, info = self.shared_transcriber.transcribe(temp_path, **kwargs)

            # Iterating the segments happens while the temporary file exists.
            transcript_segments = [
                {
                    'id': segment.id,
                    'seek': segment.seek,
                    'start': segment.start,
                    'end': segment.end,
                    'text': segment.text,
                    'tokens': segment.tokens,
                    'temperature': segment.temperature,
                    'avg_logprob': segment.avg_logprob,
                    'compression_ratio': segment.compression_ratio,
                    'no_speech_prob': segment.no_speech_prob
                }
                for segment in segments
            ]
            full_text = "".join(seg['text'] for seg in transcript_segments).strip()

            if response_format == 'text':
                return Response(full_text, mimetype='text/plain')
            if response_format == 'srt':
                return Response(generate_srt(transcript_segments), mimetype='text/plain')
            if response_format == 'vtt':
                return Response(generate_vtt(transcript_segments), mimetype='text/plain')
            if response_format == 'verbose_json':
                return jsonify({
                    'task': task_type,
                    'language': info.language,
                    'duration': info.duration,
                    'text': full_text,
                    'segments': transcript_segments
                })
            # 'json' and any unrecognised format share the minimal payload.
            return jsonify({'text': full_text})
        finally:
            # Remove the temporary file on every path, including a failed save.
            if os.path.exists(temp_path):
                os.unlink(temp_path)
    except Exception as e:
        logger.exception(f"Error processing OpenAI audio request: {str(e)}")
        return jsonify({'error': {'message': f'Transcription failed: {str(e)}', 'type': 'internal_server_error'}}), 500
@self.app.route('/v1/audio/transcriptions', methods=['POST'])
def openai_transcriptions():
    """Handle POST /v1/audio/transcriptions with the 'transcribe' task."""
    task = 'transcribe'
    return handle_openai_audio_request(task)
@self.app.route('/v1/audio/translations', methods=['POST'])
def openai_translations():
    """Handle POST /v1/audio/translations with the 'translate' task."""
    task = 'translate'
    return handle_openai_audio_request(task)
@self.app.route('/v1/models', methods=['GET'])
def list_models():
    """Return a static, OpenAI-style list of model descriptors.

    The list is hard-coded; it does not reflect which model the server has
    actually loaded.
    """
    # Standard Whisper models supported by faster-whisper
    model_names = (
        "whisper-1", "tiny", "tiny.en", "base", "base.en",
        "small", "small.en", "medium", "medium.en",
        "large", "large-v1", "large-v2", "large-v3",
    )
    descriptors = [
        {
            "id": model_id,
            "object": "model",
            "created": 1677532384,
            "owned_by": "openai" if model_id == "whisper-1" else "local",
            "permission": [],
            "root": model_id,
            "parent": None,
        }
        for model_id in model_names
    ]
    return jsonify({"object": "list", "data": descriptors})
# ===== WebSocket Bridge =====
# Bridges browser WebSocket connections arriving on the HTTP port to the
# internal WhisperLive WebSocket server listening on self.websocket_port,
# so live transcription can be reached through the same port as the HTTP API.
@self.sock.route('/ws')
def ws_bridge(ws):
    """Bridge WebSocket from HTTP port to internal WhisperLive WS server.

    A background thread relays frames from the internal server to the
    browser while this handler relays frames from the browser to the
    internal server. The text frame "END_OF_AUDIO" from the browser is
    forwarded as a binary frame. When either side ends, both connections
    are closed.
    """
    # The internal server is started with self.ssl_context (see
    # run_websocket_server), so the bridge must use TLS whenever one is set.
    # NOTE(review): assumes TranscriptionServer.run serves TLS when given an
    # ssl_context - confirm against whisper_live.
    use_tls = self.ssl_context is not None
    scheme = "wss" if use_tls else "ws"
    internal_url = f"{scheme}://127.0.0.1:{self.websocket_port}"
    connect_options = {}
    if use_tls:
        # Loopback hop: the certificate will not name 127.0.0.1, so hostname
        # and chain verification are skipped for this local connection only.
        connect_options["sslopt"] = {"cert_reqs": ssl.CERT_NONE, "check_hostname": False}

    logger.info(f"WebSocket bridge: new connection, proxying to {internal_url}")
    internal = None
    upstream_finished = threading.Event()
    try:
        internal = ws_client.create_connection(internal_url, **connect_options)

        # Thread: internal server -> browser
        def server_to_browser():
            try:
                while True:
                    opcode, data = internal.recv_data()
                    if opcode == ws_client.ABNF.OPCODE_TEXT:
                        ws.send(data.decode('utf-8'))
                    elif opcode == ws_client.ABNF.OPCODE_BINARY:
                        ws.send(data)
                    elif opcode == ws_client.ABNF.OPCODE_CLOSE:
                        break
            except Exception as relay_error:
                # Expected when either socket is closed underneath the relay.
                logger.debug(f"WebSocket bridge: relay stopped: {relay_error}")
            finally:
                upstream_finished.set()
                # Close the browser side so the blocking receive below returns
                # instead of waiting forever on a dead upstream.
                try:
                    ws.close()
                except Exception:
                    pass  # browser connection already closed

        relay_thread = threading.Thread(target=server_to_browser, daemon=True)
        relay_thread.start()

        # Main thread: browser -> internal server
        while True:
            data = ws.receive()
            if data is None:
                break
            if isinstance(data, bytes):
                internal.send_binary(data)
            elif data == "END_OF_AUDIO":
                internal.send_binary(b"END_OF_AUDIO")
            else:
                internal.send(data)
    except Exception as e:
        if upstream_finished.is_set():
            # The upstream ended first and the browser socket was closed on
            # purpose; this is a normal shutdown, not a bridge failure.
            logger.info(f"WebSocket bridge: upstream closed ({e})")
        else:
            logger.error(f"WebSocket bridge error: {e}")
    finally:
        if internal:
            try:
                internal.close()
            except Exception:
                pass  # best-effort cleanup
        logger.info("WebSocket bridge: connection closed")
def run_websocket_server(self):
    """Run the WhisperLive WebSocket server with the stored configuration.

    The server listens on all interfaces at ``self.websocket_port``.
    ``start`` runs this method in a background thread.
    """
    logger.info(f"Starting WebSocket server on port {self.websocket_port}")
    server_options = {
        "port": self.websocket_port,
        "backend": self.backend,
        "faster_whisper_custom_model_path": self.faster_whisper_custom_model_path,
        "whisper_tensorrt_path": self.whisper_tensorrt_path,
        "trt_multilingual": self.trt_multilingual,
        "single_model": self.single_model,
        "ssl_context": self.ssl_context,
    }
    self.whisper_server.run("0.0.0.0", **server_options)
def run_http_server(self):
    """Run the Flask HTTP server on all interfaces at ``self.http_port``.

    Debug mode is off and requests are served in threads.
    """
    port = self.http_port
    logger.info(f"Starting HTTP server on port {port}")
    self.app.run(host='0.0.0.0', port=port, debug=False, threaded=True)
def start(self):
    """Start both servers.

    The WebSocket server runs in a daemon thread; the HTTP server then runs
    in the calling thread.
    """
    background = threading.Thread(target=self.run_websocket_server, daemon=True)
    background.start()
    # Runs in the calling thread.
    self.run_http_server()
def main():
    """Parse command-line options and launch the hybrid server.

    Raises:
        ValueError: if the tensorrt backend is selected without a model path.
        Exception: re-raised when the SSL certificate or key cannot be loaded.
    """
    parser = argparse.ArgumentParser(description='WhisperLive Hybrid Server (WebSocket + HTTP)')
    parser.add_argument('--websocket-port', '-wp',
                        type=int,
                        default=int(os.getenv('PORT_WHISPERLIVE', 9090)),
                        help="WebSocket port to run the server on.")
    parser.add_argument('--http-port', '-hp',
                        type=int,
                        default=int(os.getenv('HTTP_PORT', 8080)),
                        help="HTTP port to run the server on.")
    parser.add_argument('--backend', '-b',
                        type=str,
                        default='faster_whisper',
                        help='Backends from ["tensorrt", "faster_whisper"]')
    parser.add_argument('--faster_whisper_custom_model_path', '-fw',
                        type=str, default=None,
                        help="Custom Faster Whisper Model")
    parser.add_argument('--trt_model_path', '-trt',
                        type=str,
                        default=None,
                        help='Whisper TensorRT model path')
    parser.add_argument('--trt_multilingual', '-m',
                        action="store_true",
                        help='Boolean only for TensorRT model. True if multilingual.')
    # The help text names the files that are actually loaded below.
    parser.add_argument('--ssl_cert_path', '-ssl',
                        type=str,
                        default=None,
                        help='Path to the directory containing cert.pem and privkey.pem if ssl should be used.')
    parser.add_argument('--omp_num_threads', '-omp',
                        type=int,
                        default=1,
                        help="Number of threads to use for OpenMP")
    parser.add_argument('--no_single_model', '-nsm',
                        action='store_true',
                        help='Set this if every connection should instantiate its own model. Only relevant for custom model, passed using -trt or -fw.')
    args = parser.parse_args()

    if args.backend == "tensorrt" and args.trt_model_path is None:
        raise ValueError("Please Provide a valid tensorrt model path")

    websocket_port = args.websocket_port
    http_port = args.http_port

    # Warn only; start-up continues even if a port looks busy.
    if not check_port_availability(websocket_port):
        print(f"Warning: WebSocket port {websocket_port} might already be in use!")
    if not check_port_availability(http_port):
        print(f"Warning: HTTP port {http_port} might already be in use!")

    ssl_context = None
    if args.ssl_cert_path is not None:
        try:
            ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
            ssl_context.load_cert_chain(
                certfile=os.path.join(args.ssl_cert_path, "cert.pem"),
                keyfile=os.path.join(args.ssl_cert_path, "privkey.pem")
            )
            print("SSL context created successfully")
        except Exception as e:
            print(f"Failed to load SSL certificates: {str(e)}")
            raise

    # An OMP_NUM_THREADS value already in the environment takes precedence.
    if "OMP_NUM_THREADS" not in os.environ:
        print(f"Setting OMP_NUM_THREADS to {args.omp_num_threads}")
        os.environ["OMP_NUM_THREADS"] = str(args.omp_num_threads)

    print(f"Running hybrid server with args: {args}")
    server = HybridWhisperServer(
        websocket_port=websocket_port,
        http_port=http_port,
        backend=args.backend,
        faster_whisper_custom_model_path=args.faster_whisper_custom_model_path,
        whisper_tensorrt_path=args.trt_model_path,
        trt_multilingual=args.trt_multilingual,
        single_model=not args.no_single_model,
        ssl_context=ssl_context
    )
    print(f"Starting hybrid server with WebSocket on port {websocket_port} and HTTP on port {http_port}")
    print(f"Backend: {args.backend}, SSL: {args.ssl_cert_path is not None}")
    server.start()


if __name__ == "__main__":
    main()