feat: apply local modifications to WhisperLive-Server

This commit is contained in:
kcar 2026-05-13 22:33:35 +00:00
parent 05648af633
commit 83edfff9d3
17 changed files with 4274 additions and 1352 deletions

View File

@ -1,51 +0,0 @@
FROM python:3.10-bookworm
# Build-time only: keeps apt from prompting during package installation.
ARG DEBIAN_FRONTEND=noninteractive

# Create the log directory and pre-create the two log files as world-writable
# (mode 666).
RUN mkdir -p /app/logs && \
    touch /app/logs/whisperlive.log /app/logs/connections.log && \
    chmod 666 /app/logs/whisperlive.log /app/logs/connections.log

# Install the library required for pyaudio. apt-get is used instead of apt
# because apt's command-line interface is not meant for scripts; the package
# lists are removed in the same layer so they never reach the image.
RUN apt-get update && \
    apt-get install -y --no-install-recommends portaudio19-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# update pip to support for whl.metadata -> less downloading
RUN pip install --no-cache-dir -U "pip>=24"

# create a working directory
WORKDIR /app

# Install the requirements for running the whisper-live server. The manifest is
# copied on its own so this layer stays cached until server.txt changes.
COPY requirements/server.txt /app/
RUN pip install --no-cache-dir -r server.txt && rm server.txt

# Copy application files
COPY whisper_live /app/whisper_live
COPY run_server.py /app

# Port options.
# An ARG is only visible to instructions that come after it, so the ports are
# declared before the EXPOSE lines that reference them. The defaults mirror the
# values in the project's .env and can be overridden with --build-arg.
ARG PORT_WHISPERLIVE=5050
ENV PORT_WHISPERLIVE=${PORT_WHISPERLIVE}
ARG PORT_WHISPERLIVE_SSL=5053
ENV PORT_WHISPERLIVE_SSL=${PORT_WHISPERLIVE_SSL}
EXPOSE ${PORT_WHISPERLIVE}
EXPOSE ${PORT_WHISPERLIVE_SSL}

# SSL options
ARG WHISPERLIVE_SSL
ENV WHISPERLIVE_SSL=${WHISPERLIVE_SSL}

# Model options
ARG WHISPL_USE_CUSTOM_MODEL
ENV WHISPL_USE_CUSTOM_MODEL=${WHISPL_USE_CUSTOM_MODEL}
ARG FASTERWHISPER_MODEL
ENV FASTERWHISPER_MODEL=${FASTERWHISPER_MODEL}

# A shell is needed to expand the environment variables at container start.
# `exec` replaces that shell with the python process, so python receives the
# stop signal from `docker stop` directly instead of being killed on timeout.
CMD ["sh", "-c", "\
if [ \"$WHISPERLIVE_SSL\" = \"true\" ]; then \
    exec python3 -u run_server.py --port $PORT_WHISPERLIVE_SSL --backend faster_whisper --faster_whisper_custom_model_path /app/models/$FASTERWHISPER_MODEL --ssl_cert_path /app/ssl; \
else \
    exec python3 -u run_server.py --port $PORT_WHISPERLIVE --backend faster_whisper --faster_whisper_custom_model_path /app/models/$FASTERWHISPER_MODEL --no_single_model; \
fi"]

View File

@ -1,45 +0,0 @@
FROM python:3.10-bookworm
# Build-time only: keeps apt from prompting during package installation.
ARG DEBIAN_FRONTEND=noninteractive

# Create the log directory and pre-create the two log files as world-writable
# (mode 666).
RUN mkdir -p /app/logs && \
    touch /app/logs/whisperlive.log /app/logs/connections.log && \
    chmod 666 /app/logs/whisperlive.log /app/logs/connections.log

# Install the library required for pyaudio. apt-get replaces apt (whose CLI is
# not meant for scripts); package lists are removed in the same layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends portaudio19-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# update pip to support for whl.metadata -> less downloading
RUN pip install --no-cache-dir -U "pip>=24"

# create a working directory
WORKDIR /app

# Install the requirements for running the whisper-live server; --no-cache-dir
# keeps pip's download cache out of the image layer.
COPY requirements/server.txt /app/
RUN pip install --no-cache-dir -r server.txt && rm server.txt

# Copy application files
COPY whisper_live /app/whisper_live
COPY run_server.py /app

# Port options.
# The ARG lines must precede EXPOSE: an ARG is only visible to instructions
# that follow it, so referencing the ports earlier expands to an empty string.
# Defaults mirror the project's .env; override with --build-arg.
ARG PORT_WHISPERLIVE=5050
ENV PORT_WHISPERLIVE=${PORT_WHISPERLIVE}
ARG PORT_WHISPERLIVE_SSL=5053
ENV PORT_WHISPERLIVE_SSL=${PORT_WHISPERLIVE_SSL}
EXPOSE ${PORT_WHISPERLIVE}
EXPOSE ${PORT_WHISPERLIVE_SSL}

# Model and SSL options
ARG FASTERWHISPER_MODEL
ENV FASTERWHISPER_MODEL=${FASTERWHISPER_MODEL}
ARG WHISPERLIVE_SSL
ENV WHISPERLIVE_SSL=${WHISPERLIVE_SSL}

# The shell expands the environment variables at container start; `exec` then
# replaces the shell with python so the server receives stop signals directly.
CMD ["sh", "-c", "\
if [ \"$WHISPERLIVE_SSL\" = \"true\" ]; then \
    exec python3 -u run_server.py --port $PORT_WHISPERLIVE_SSL --backend faster_whisper --faster_whisper_custom_model_path /app/models/$FASTERWHISPER_MODEL --ssl_cert_path /app/ssl; \
else \
    exec python3 -u run_server.py --port $PORT_WHISPERLIVE --backend faster_whisper --faster_whisper_custom_model_path /app/models/$FASTERWHISPER_MODEL; \
fi"]

View File

@ -1,49 +0,0 @@
FROM python:3.10-bookworm
# Build-time only: keeps apt from prompting during package installation.
ARG DEBIAN_FRONTEND=noninteractive

# Create the log directory and pre-create the two log files as world-writable
# (mode 666).
RUN mkdir -p /app/logs && \
    touch /app/logs/whisperlive.log /app/logs/connections.log && \
    chmod 666 /app/logs/whisperlive.log /app/logs/connections.log

# Install the library required for pyaudio. apt-get replaces apt (whose CLI is
# not meant for scripts); package lists are removed in the same layer.
RUN apt-get update && \
    apt-get install -y --no-install-recommends portaudio19-dev && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

# update pip to support for whl.metadata -> less downloading
RUN pip install --no-cache-dir -U "pip>=24"

# create a working directory
WORKDIR /app

# Install the requirements for running the whisper-live server; --no-cache-dir
# keeps pip's download cache out of the image layer.
COPY requirements/server.txt /app/
RUN pip install --no-cache-dir -r server.txt && rm server.txt

# make the paths of the nvidia libs installed as wheels visible. equivalent to:
# export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
ENV LD_LIBRARY_PATH="/usr/local/lib/python3.10/site-packages/nvidia/cublas/lib:/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib"

# Copy application files
COPY whisper_live /app/whisper_live
COPY run_server.py /app

# Port options.
# The ARG lines must precede EXPOSE: an ARG is only visible to instructions
# that follow it, so referencing the ports earlier expands to an empty string.
# Defaults mirror the project's .env; override with --build-arg.
ARG PORT_WHISPERLIVE=5050
ENV PORT_WHISPERLIVE=${PORT_WHISPERLIVE}
ARG PORT_WHISPERLIVE_SSL=5053
ENV PORT_WHISPERLIVE_SSL=${PORT_WHISPERLIVE_SSL}
EXPOSE ${PORT_WHISPERLIVE}
EXPOSE ${PORT_WHISPERLIVE_SSL}

# Model and SSL options
ARG FASTERWHISPER_MODEL
ENV FASTERWHISPER_MODEL=${FASTERWHISPER_MODEL}
ARG WHISPERLIVE_SSL
ENV WHISPERLIVE_SSL=${WHISPERLIVE_SSL}

# The shell expands the environment variables at container start; `exec` then
# replaces the shell with python so the server receives stop signals directly.
CMD ["sh", "-c", "\
if [ \"$WHISPERLIVE_SSL\" = \"true\" ]; then \
    exec python3 -u run_server.py --port $PORT_WHISPERLIVE_SSL --backend faster_whisper --faster_whisper_custom_model_path /app/models/$FASTERWHISPER_MODEL --ssl_cert_path /app/ssl; \
else \
    exec python3 -u run_server.py --port $PORT_WHISPERLIVE --backend faster_whisper --faster_whisper_custom_model_path /app/models/$FASTERWHISPER_MODEL; \
fi"]

File diff suppressed because it is too large Load Diff

5
.env
View File

@ -1,9 +1,10 @@
# Whisper live settings
# WebSocket scheme and host name of the deployment.
APP_WS_PROTOCOL=wss
# NOTE(review): APP_URL is assigned twice; presumably the later assignment
# wins with the loader in use — confirm and drop the stale entry.
APP_URL=kevlarai.com
APP_URL=classroomcopilot.ai
# WebSocket port of the transcription server.
# NOTE(review): PORT_WHISPERLIVE is also assigned twice (5050, then 5000) —
# confirm which value is intended and keep only that one.
PORT_WHISPERLIVE=5050
PORT_WHISPERLIVE=5000
# Port used when the server runs with SSL enabled.
PORT_WHISPERLIVE_SSL=5053
# Port of the HTTP (file transcription) endpoints.
HTTP_PORT=8080
# "true" selects the SSL start-up path; any other value selects the plain one.
WHISPERLIVE_SSL=false
WHISPL_USE_CUSTOM_MODEL=false

View File

@ -20,22 +20,24 @@ WORKDIR /app
# install the requirements for running the whisper-live server
COPY requirements/server.txt /app/
# NOTE(review): this step deletes server.txt, yet a later RUN installs from
# server.txt again; the two look like alternative versions of the same step —
# confirm that only one of them is meant to remain.
RUN pip install -r server.txt && rm server.txt
# Pin setuptools below 70 and install wheel before the requirements.
RUN pip install --no-cache-dir "setuptools<70.0.0" wheel
RUN pip install -r server.txt
# Build openai-whisper without build isolation — presumably so its build uses
# the setuptools version pinned above; confirm.
RUN pip install --no-build-isolation openai-whisper==20240930
# NOTE(review): removing the file in its own RUN does not shrink the image; the
# file still exists in the earlier COPY layer.
RUN rm server.txt
# make the paths of the nvidia libs installed as wheels visible
# NOTE(review): LD_LIBRARY_PATH is set twice below; the second assignment adds
# torch/lib and appends the previous value of LD_LIBRARY_PATH.
ENV LD_LIBRARY_PATH="/usr/local/lib/python3.10/site-packages/nvidia/cublas/lib:/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib"
RUN pip install --no-cache-dir nvidia-cublas-cu12 nvidia-cudnn-cu12
ENV LD_LIBRARY_PATH="/usr/local/lib/python3.10/site-packages/nvidia/cublas/lib:/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib:/usr/local/lib/python3.10/site-packages/torch/lib:${LD_LIBRARY_PATH}"
COPY whisper_live /app/whisper_live
COPY run_server.py /app
COPY hybrid_server.py /app
# Copy application files
# NOTE(review): EXPOSE references PORT_WHISPERLIVE before the ARG line that
# declares it; an ARG is only visible to instructions that follow it.
EXPOSE ${PORT_WHISPERLIVE}
ARG PORT_WHISPERLIVE
ENV PORT_WHISPERLIVE=${PORT_WHISPERLIVE}
ARG FASTERWHISPER_MODEL
ENV FASTERWHISPER_MODEL=${FASTERWHISPER_MODEL}
# Expose both WebSocket and HTTP ports
EXPOSE 5000 8080
# NOTE(review): two CMD instructions follow and only the last CMD in a
# Dockerfile takes effect. The exec-form CMD below also does no shell
# expansion, so "${PORT_WHISPERLIVE}" would be passed to python literally.
CMD ["python3", "-u", "run_server.py", "--port", "${PORT_WHISPERLIVE}", "--backend", "faster_whisper"]
# Use the hybrid server by default
# NOTE(review): ports 5000 and 8080 are hard-coded here, so PORT_WHISPERLIVE
# does not influence this command line. Shell form also runs python under
# /bin/sh -c; an `exec` prefix or exec form would let it receive stop signals.
CMD python3 -u hybrid_server.py --websocket-port 5000 --http-port 8080 --backend faster_whisper
# CMD ["python3", "-u", "run_server.py", "--port", "${PORT_WHISPERLIVE}", "--backend", "faster_whisper", "--faster_whisper_custom_model_path", "/app/models/${FASTERWHISPER_MODEL}", "--ssl_cert_path", "/app/ssl"]

260
HYBRID_SERVER_README.md Normal file
View File

@ -0,0 +1,260 @@
# WhisperLive Hybrid Server
This hybrid server extends the original WhisperLive-Server to support both WebSocket connections (for real-time audio streaming) and HTTP endpoints (for file transcription) in a single container.
## Features
- **WebSocket Server**: Original real-time audio transcription functionality
- **HTTP Server**: New file upload and transcription endpoints
- **Single Container**: Both services run in the same Docker container
- **GPU Sharing**: Both services share the same GPU resources
## Architecture
The hybrid server runs two services simultaneously:
1. **WebSocket Server**: Handles real-time audio streaming transcription
2. **HTTP Server**: Handles file uploads and transcription requests
Both services use the same WhisperLive transcriber instance, ensuring efficient resource usage.
## Ports
- **WebSocket Port**: Default 5050 (configurable via `PORT_WHISPERLIVE`)
- **HTTP Port**: Default 8080 (configurable via `HTTP_PORT`)
## HTTP Endpoints
### 1. Health Check
```
GET /health
```
Returns server health status.
**Response:**
```json
{
"status": "healthy",
"service": "WhisperLive Hybrid Server"
}
```
### 2. OpenAI Compatible Endpoints
```
POST /v1/audio/transcriptions
POST /v1/audio/translations
```
Fully compatible drop-in replacements for the standard OpenAI Whisper API.
**Parameters:**
- `file` (required): Audio file (WAV, MP3, FLAC, M4A, OGG, WEBM, MP4, MPEG, MPGA)
- `model` (optional): Model size (default: "base")
- `language` (optional): Language code (e.g., "en", "es", "fr")
- `prompt` (optional): Text to guide the model's style
- `response_format` (optional): "json", "text", "srt", "verbose_json", "vtt" (default: "json")
- `temperature` (optional): Sampling temperature (0.0 to 1.0)
**Example Request:**
```bash
curl -X POST http://localhost:8080/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F "file=@audio.wav" \
-F "model=whisper-1" \
-F "response_format=json"
```
**Response (JSON format):**
```json
{
"text": "Hello, this is a test."
}
```
### 3. Legacy File Transcription
```
POST /transcribe
```
Transcribes an uploaded audio file.
**Parameters:**
- `file` (required): Audio file (WAV, MP3, FLAC, M4A, OGG, WEBM)
- `language` (optional): Language code (e.g., "en", "es", "fr")
- `task` (optional): "transcribe" or "translate" (default: "transcribe")
- `model` (optional): Model size (default: "base")
**Example Request:**
```bash
curl -X POST http://localhost:8080/transcribe \
-F "file=@audio.wav" \
-F "language=en" \
-F "task=transcribe" \
-F "model=base"
```
**Response:**
```json
{
"success": true,
"segments": [
{
"start": 0.0,
"end": 2.5,
"text": "Hello, this is a test.",
"no_speech_prob": 0.1
}
],
"info": {
"language": "en",
"language_probability": 0.95,
"duration": 10.5,
"duration_after_vad": 10.5,
"transcription_options": {}
},
"filename": "audio.wav"
}
```
### 4. URL Transcription (Placeholder)
```
POST /transcribe/url
```
Endpoint for transcribing audio from URLs (ready for implementation).
## Usage Examples
### Python Client
```python
import requests
# Transcribe a file
with open('audio.wav', 'rb') as f:
response = requests.post('http://localhost:8080/transcribe',
files={'file': f},
data={'language': 'en', 'model': 'base'})
if response.status_code == 200:
result = response.json()
print(f"Transcription: {result['segments']}")
```
### JavaScript/Node.js
```javascript
const FormData = require('form-data');
const fs = require('fs');
const form = new FormData();
form.append('file', fs.createReadStream('audio.wav'));
form.append('language', 'en');
form.append('model', 'base');
fetch('http://localhost:8080/transcribe', {
method: 'POST',
body: form
})
.then(response => response.json())
.then(result => console.log(result));
```
### cURL
```bash
# Basic transcription
curl -X POST http://localhost:8080/transcribe \
-F "file=@audio.wav"
# With parameters
curl -X POST http://localhost:8080/transcribe \
-F "file=@audio.wav" \
-F "language=es" \
-F "task=translate" \
-F "model=small"
```
## Configuration
### Environment Variables
- `PORT_WHISPERLIVE`: WebSocket port (default: 5050)
- `HTTP_PORT`: HTTP port (default: 8080)
- `FASTERWHISPER_MODEL`: Custom model path
- `OMP_NUM_THREADS`: OpenMP thread count
### Docker Compose
```yaml
services:
whisperlive:
ports:
- "5050:5050" # WebSocket
- "8080:8080" # HTTP
environment:
PORT_WHISPERLIVE: 5050
HTTP_PORT: 8080
```
## Testing
### 1. Test Script
Run the Python test script:
```bash
python3 test_http_endpoints.py
```
### 2. Web Interface
Open `test_form.html` in a web browser to test the HTTP endpoints with a user-friendly interface.
### 3. Health Check
```bash
curl http://localhost:8080/health
```
## Backend Support
Currently, the HTTP endpoints support:
- **faster_whisper**: Full support for all features
- **tensorrt**: Basic support (needs adaptation)
- **openvino**: Basic support (needs adaptation)
## File Size Limits
- Maximum file size: 100MB
- Supported formats: WAV, MP3, FLAC, M4A, OGG, WEBM
## Performance Considerations
- File transcription uses the same model instance as WebSocket connections
- Temporary files are automatically cleaned up after processing
- Both services share GPU memory efficiently
- HTTP requests are processed in separate threads
## Troubleshooting
### Common Issues
1. **Port Already in Use**
- Check if ports 5050 or 8080 are available
- Use different ports via environment variables
2. **File Upload Errors**
- Ensure file size is under 100MB
- Check file format is supported
- Verify file is not corrupted
3. **GPU Memory Issues**
- Monitor GPU memory usage
- Consider using smaller model sizes
- Restart container if needed
### Logs
Check container logs for detailed error information:
```bash
docker logs whisperlive
```
## Migration from Original Server
The hybrid server is fully backward compatible. Your existing WebSocket clients will continue to work without changes. The HTTP endpoints are additional functionality that doesn't interfere with the original service.
## Future Enhancements
- [ ] Support for more audio formats
- [ ] Batch file processing
- [ ] Progress tracking for long files
- [ ] Authentication and rate limiting
- [ ] WebSocket support for file transcription progress

Binary file not shown.

270
batch_transcribe.py Normal file
View File

@ -0,0 +1,270 @@
#!/usr/bin/env python3
"""
Batch Transcription Script for WhisperLive
Processes all audio files in a folder using the HTTP transcription endpoint
"""
import os
import sys
import json
import time
import argparse
import requests
from pathlib import Path
from typing import List, Dict, Optional
import logging
# Root logging setup: INFO level, records rendered as "time - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-level logger shared by the rest of this script.
logger = logging.getLogger(__name__)
class BatchTranscriber:
    """Client for batch transcription over the server's HTTP ``/transcribe`` endpoint.

    Collects audio files from a folder, uploads them one at a time and writes
    each result to disk as ``txt``, ``json``, ``srt`` or ``vtt``.
    """

    def __init__(self, server_url: str = "http://localhost:8080"):
        """Store the server base URL and the file extensions treated as audio.

        Args:
            server_url: Base URL of the server. Trailing slashes are dropped so
                endpoint paths can be appended with a single "/".
        """
        self.server_url = server_url.rstrip('/')
        self.supported_formats = {'.wav', '.mp3', '.flac', '.m4a', '.ogg', '.webm'}

    def get_audio_files(self, folder_path: str) -> List[Path]:
        """Return the audio files directly inside ``folder_path``, sorted by path.

        Only regular files whose lower-cased suffix is in
        ``self.supported_formats`` are returned; sub-folders are not searched.

        Raises:
            FileNotFoundError: If ``folder_path`` does not exist.
        """
        folder = Path(folder_path)
        if not folder.exists():
            raise FileNotFoundError(f"Folder not found: {folder_path}")
        return sorted(
            entry for entry in folder.iterdir()
            if entry.is_file() and entry.suffix.lower() in self.supported_formats
        )

    def transcribe_file(self, file_path: Path, language: Optional[str] = None,
                        task: str = "transcribe", model: str = "base") -> Dict:
        """Upload one audio file to ``/transcribe`` and return the parsed reply.

        Args:
            file_path: Audio file to upload.
            language: Optional language code; omitted from the form when None.
            task: Value of the ``task`` form field.
            model: Value of the ``model`` form field.

        Returns:
            The decoded JSON body on HTTP 200. Otherwise a dict with an
            ``'error'`` key (plus ``'status_code'`` for non-200 replies).
            Exceptions are logged and reported the same way, never raised.
        """
        try:
            logger.info(f"Transcribing: {file_path.name}")
            data = {'task': task, 'model': model}
            if language is not None:
                data['language'] = language
            with open(file_path, 'rb') as f:
                response = requests.post(f"{self.server_url}/transcribe",
                                         files={'file': f}, data=data, timeout=300)
            if response.status_code == 200:
                result = response.json()
                logger.info(f"✅ Successfully transcribed: {file_path.name}")
                return result
            error_msg = response.text
            logger.error(f"❌ Failed to transcribe {file_path.name}: {error_msg}")
            return {'error': error_msg, 'status_code': response.status_code}
        except Exception as e:
            logger.error(f"❌ Error transcribing {file_path.name}: {str(e)}")
            return {'error': str(e)}

    def save_transcript(self, transcript_data: Dict, output_path: Path,
                        format_type: str = "txt") -> bool:
        """Write ``transcript_data`` to ``output_path`` in the requested format.

        Args:
            transcript_data: Result dict; its ``'segments'`` (and, for ``txt``,
                ``'info'``) entries are read.
            output_path: Destination file, written as UTF-8.
            format_type: One of ``"txt"``, ``"json"``, ``"srt"``, ``"vtt"``.

        Returns:
            True when the file was written. False when ``transcript_data``
            carries an ``'error'`` key, the format is unknown, or writing fails.
        """
        try:
            if 'error' in transcript_data:
                return False
            if format_type == "txt":
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write(f"Transcription of: {transcript_data.get('filename', 'Unknown')}\n")
                    f.write(f"Language: {transcript_data['info'].get('language', 'Auto-detected')}\n")
                    f.write(f"Duration: {transcript_data['info'].get('duration', 0):.2f} seconds\n")
                    f.write("=" * 50 + "\n\n")
                    for segment in transcript_data['segments']:
                        f.write(f"[{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['text']}\n")
            elif format_type == "json":
                with open(output_path, 'w', encoding='utf-8') as f:
                    json.dump(transcript_data, f, indent=2, ensure_ascii=False)
            elif format_type == "srt":
                with open(output_path, 'w', encoding='utf-8') as f:
                    for i, segment in enumerate(transcript_data['segments'], 1):
                        start_time = self.format_srt_time(segment['start'])
                        end_time = self.format_srt_time(segment['end'])
                        f.write(f"{i}\n{start_time} --> {end_time}\n{segment['text']}\n\n")
            elif format_type == "vtt":
                with open(output_path, 'w', encoding='utf-8') as f:
                    f.write("WEBVTT\n\n")
                    for segment in transcript_data['segments']:
                        start_time = self.format_vtt_time(segment['start'])
                        end_time = self.format_vtt_time(segment['end'])
                        f.write(f"{start_time} --> {end_time}\n{segment['text']}\n\n")
            else:
                # An unknown format used to fall through and report success
                # although nothing had been written.
                logger.error(f"❌ Unsupported transcript format: {format_type}")
                return False
            logger.info(f"💾 Saved transcript: {output_path}")
            return True
        except Exception as e:
            logger.error(f"❌ Error saving transcript {output_path}: {str(e)}")
            return False

    @staticmethod
    def _split_timestamp(seconds: float):
        """Split ``seconds`` into ``(hours, minutes, seconds, milliseconds)`` ints.

        The value is rounded to whole milliseconds first. Truncating
        ``(seconds % 1) * 1000`` loses a millisecond on values such as 1.001,
        whose binary representation sits just below the decimal value.
        Negative inputs are clamped to zero.
        """
        total_ms = max(0, round(seconds * 1000))
        total_secs, millisecs = divmod(total_ms, 1000)
        total_mins, secs = divmod(total_secs, 60)
        hours, minutes = divmod(total_mins, 60)
        return hours, minutes, secs, millisecs

    def format_srt_time(self, seconds: float) -> str:
        """Format ``seconds`` as an SRT timestamp, ``HH:MM:SS,mmm``."""
        hours, minutes, secs, millisecs = self._split_timestamp(seconds)
        return f"{hours:02d}:{minutes:02d}:{secs:02d},{millisecs:03d}"

    def format_vtt_time(self, seconds: float) -> str:
        """Format ``seconds`` as a WebVTT timestamp, ``HH:MM:SS.mmm``."""
        hours, minutes, secs, millisecs = self._split_timestamp(seconds)
        return f"{hours:02d}:{minutes:02d}:{secs:02d}.{millisecs:03d}"

    def batch_transcribe(self, input_folder: str, output_folder: str,
                         language: Optional[str] = None, task: str = "transcribe",
                         model: str = "base", format_type: str = "txt",
                         delay: float = 1.0) -> Dict:
        """Transcribe every audio file in ``input_folder`` into ``output_folder``.

        Args:
            input_folder: Folder scanned by ``get_audio_files``.
            output_folder: Created if missing; receives ``<stem>.<format_type>``.
            language, task, model: Forwarded to ``transcribe_file``.
            format_type: Forwarded to ``save_transcript``.
            delay: Seconds to sleep between consecutive requests.

        Returns:
            Dict with ``'processed'``, ``'successful'`` and ``'failed'`` counts
            and a ``'files'`` list holding one status entry per input file.
        """
        # Create output folder if it doesn't exist
        output_path = Path(output_folder)
        output_path.mkdir(parents=True, exist_ok=True)

        audio_files = self.get_audio_files(input_folder)
        if not audio_files:
            logger.warning(f"No audio files found in: {input_folder}")
            # Same shape as the normal result, so callers can always read 'files'.
            return {'processed': 0, 'successful': 0, 'failed': 0, 'files': []}
        logger.info(f"Found {len(audio_files)} audio files to process")

        results = {
            'processed': len(audio_files),
            'successful': 0,
            'failed': 0,
            'files': []
        }
        for i, audio_file in enumerate(audio_files, 1):
            logger.info(f"Processing {i}/{len(audio_files)}: {audio_file.name}")
            transcript_data = self.transcribe_file(audio_file, language, task, model)
            if 'error' not in transcript_data:
                output_file = output_path / f"{audio_file.stem}.{format_type}"
                saved = self.save_transcript(transcript_data, output_file, format_type)
                results['successful' if saved else 'failed'] += 1
                results['files'].append({
                    'input': str(audio_file),
                    'output': str(output_file),
                    'status': 'success' if saved else 'failed'
                })
            else:
                results['failed'] += 1
                results['files'].append({
                    'input': str(audio_file),
                    'output': None,
                    'status': 'failed',
                    'error': transcript_data.get('error', 'Unknown error')
                })
            # Add delay between requests to avoid overwhelming the server
            if i < len(audio_files):
                time.sleep(delay)
        return results
def main():
    """Command-line entry point: parse options, verify the server, run the batch.

    Exits with status 1 when the health check fails, the server cannot be
    reached, the user interrupts the run, or an unexpected error occurs.
    """
    cli = argparse.ArgumentParser(description='Batch transcribe audio files using WhisperLive')
    cli.add_argument('input_folder', help='Folder containing audio files')
    cli.add_argument('output_folder', help='Folder to save transcripts')
    cli.add_argument('--server', '-s', default='http://localhost:8080',
                     help='WhisperLive server URL (default: http://localhost:8080)')
    cli.add_argument('--language', '-l', help='Language code (e.g., en, es, fr)')
    cli.add_argument('--task', '-t', choices=['transcribe', 'translate'], default='transcribe',
                     help='Task to perform (default: transcribe)')
    cli.add_argument('--model', '-m', default='base',
                     help='Model size (default: base)')
    cli.add_argument('--format', '-f', choices=['txt', 'json', 'srt', 'vtt'], default='txt',
                     help='Output format (default: txt)')
    cli.add_argument('--delay', '-d', type=float, default=1.0,
                     help='Delay between requests in seconds (default: 1.0)')
    cli.add_argument('--verbose', '-v', action='store_true',
                     help='Verbose output')
    opts = cli.parse_args()

    if opts.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    try:
        client = BatchTranscriber(opts.server)

        # Refuse to start the batch unless the server answers its health probe.
        try:
            health = requests.get(f"{opts.server}/health", timeout=5)
        except requests.exceptions.RequestException as exc:
            logger.error(f"❌ Cannot connect to server: {exc}")
            sys.exit(1)
        else:
            if health.status_code != 200:
                logger.error(f"Server health check failed: {health.status_code}")
                sys.exit(1)
            logger.info("✅ Server health check passed")

        summary = client.batch_transcribe(
            input_folder=opts.input_folder,
            output_folder=opts.output_folder,
            language=opts.language,
            task=opts.task,
            model=opts.model,
            format_type=opts.format,
            delay=opts.delay
        )

        # Report the outcome.
        rule = "=" * 50
        logger.info("\n" + rule)
        logger.info("BATCH TRANSCRIPTION COMPLETED")
        logger.info(rule)
        logger.info(f"Total files processed: {summary['processed']}")
        logger.info(f"Successful: {summary['successful']}")
        logger.info(f"Failed: {summary['failed']}")
        logger.info(f"Output folder: {opts.output_folder}")
        logger.info(f"Output format: {opts.format}")

        if summary['failed'] > 0:
            logger.warning("\nFailed files:")
            for entry in summary['files']:
                if entry['status'] != 'failed':
                    continue
                logger.warning(f" - {entry['input']}: {entry.get('error', 'Unknown error')}")

        if summary['successful'] > 0:
            logger.info(f"\n✅ Successfully processed {summary['successful']} files!")
    except KeyboardInterrupt:
        logger.info("\n⚠️ Process interrupted by user")
        sys.exit(1)
    except Exception as exc:
        logger.error(f"❌ Unexpected error: {str(exc)}")
        sys.exit(1)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()

View File

@ -15,6 +15,8 @@ services:
LOG_PATH: /app/logs
NVIDIA_VISIBLE_DEVICES: all
NVIDIA_DRIVER_CAPABILITIES: compute,utility
# WebSocket port, passed through from the Compose environment / .env file.
PORT_WHISPERLIVE: ${PORT_WHISPERLIVE}
# HTTP port; falls back to 8080 when HTTP_PORT is unset or empty.
HTTP_PORT: ${HTTP_PORT:-8080}
volumes:
- ./models:/app/models
- ./ssl:/app/ssl
@ -26,11 +28,15 @@ services:
- driver: nvidia
count: 1
capabilities: [gpu]
# NOTE(review): `options` are driver-specific key/value pairs; confirm that the
# nvidia driver actually honours a "memory" option before relying on this cap.
options:
memory: "4G" # Match the main docker-compose.yml allocation
ports:
# NOTE(review): the next two entries describe the same mapping (unquoted and
# quoted form); presumably only the quoted one is meant to remain — confirm.
- ${PORT_WHISPERLIVE}:${PORT_WHISPERLIVE}
- "${PORT_WHISPERLIVE}:${PORT_WHISPERLIVE}"
# Host HTTP port (default 8080) mapped to container port 8080. The container
# side is fixed, so changing HTTP_PORT only moves the host-side port.
- "${HTTP_PORT:-8080}:8080"
restart: unless-stopped
networks:
- audio-network
- default
networks:
audio-network:
default:
driver: bridge

1229
hybrid_server.py Normal file

File diff suppressed because it is too large Load Diff

866
openapi.json Normal file
View File

@ -0,0 +1,866 @@
{
"openapi": "3.1.0",
"info": {
"title": "WhisperLive API",
"description": "A high-performance speech-to-text API based on OpenAI's Whisper model.\nSupports real-time transcription via WebSocket and batch processing via HTTP.\n\n## Features\n- Real-time audio transcription\n- Batch file processing\n- Multiple language support\n- Translation capabilities\n- Multiple model sizes\n- WebSocket and HTTP interfaces\n",
"version": "1.0.0",
"contact": {
"name": "WhisperLive Support",
"url": "https://github.com/collabora/WhisperLive"
},
"license": {
"name": "MIT",
"url": "https://opensource.org/licenses/MIT"
}
},
"servers": [
{
"url": "http://localhost:8080",
"description": "Local development server"
},
{
"url": "https://api.whisperlive.com/v1",
"description": "Production server"
}
],
"security": [
{
"ApiKeyAuth": []
}
],
"paths": {
"/v1/audio/transcriptions": {
"post": {
"summary": "Create transcription",
"description": "Transcribes audio into the input language. The response will include the transcribed text\nand additional metadata such as language detection, confidence scores, and timestamps.\n",
"operationId": "createTranscription",
"tags": [
"Audio"
],
"requestBody": {
"required": true,
"content": {
"multipart/form-data": {
"schema": {
"type": "object",
"required": [
"file"
],
"properties": {
"file": {
"type": "string",
"format": "binary",
"description": "The audio file object (not file name) to transcribe, in one of these formats: \nflac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.\n"
},
"model": {
"type": "string",
"enum": [
"tiny",
"base",
"small",
"medium",
"large"
],
"default": "base",
"description": "ID of the model to use. One of: tiny, base, small, medium, large."
},
"language": {
"type": "string",
"pattern": "^[a-z]{2}$",
"description": "The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.\nSupported languages: en, es, fr, de, it, pt, ru, ja, ko, zh, hi, ar\n"
},
"prompt": {
"type": "string",
"description": "An optional text to guide the model's style or continue a previous audio segment.\nThe prompt should match the audio language.\n"
},
"response_format": {
"type": "string",
"enum": [
"json",
"text",
"srt",
"verbose_json",
"vtt"
],
"default": "json",
"description": "The format of the transcript output."
},
"temperature": {
"type": "number",
"minimum": 0,
"maximum": 1,
"default": 0,
"description": "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."
},
"timestamp_granularities": {
"type": "array",
"items": {
"type": "string",
"enum": [
"word",
"segment"
]
},
"description": "The timestamp granularities to populate for this transcription."
}
}
}
}
}
},
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"oneOf": [
{
"$ref": "#/components/schemas/TranscriptionResponse"
},
{
"$ref": "#/components/schemas/TranscriptionTextResponse"
},
{
"$ref": "#/components/schemas/TranscriptionSrtResponse"
},
{
"$ref": "#/components/schemas/TranscriptionVttResponse"
}
]
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest"
},
"401": {
"$ref": "#/components/responses/Unauthorized"
},
"413": {
"$ref": "#/components/responses/FileTooLarge"
},
"422": {
"$ref": "#/components/responses/ValidationError"
},
"429": {
"$ref": "#/components/responses/RateLimitExceeded"
},
"500": {
"$ref": "#/components/responses/InternalServerError"
}
}
}
},
"/v1/audio/translations": {
"post": {
"summary": "Create translation",
"description": "Translates audio into English. The response will include the translated text\nand additional metadata such as confidence scores and timestamps.\n",
"operationId": "createTranslation",
"tags": [
"Audio"
],
"requestBody": {
"required": true,
"content": {
"multipart/form-data": {
"schema": {
"type": "object",
"required": [
"file"
],
"properties": {
"file": {
"type": "string",
"format": "binary",
"description": "The audio file object (not file name) to translate, in one of these formats: \nflac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.\n"
},
"model": {
"type": "string",
"enum": [
"tiny",
"base",
"small",
"medium",
"large"
],
"default": "base",
"description": "ID of the model to use. One of: tiny, base, small, medium, large."
},
"prompt": {
"type": "string",
"description": "An optional text to guide the model's style or continue a previous audio segment.\nThe prompt should be in English.\n"
},
"response_format": {
"type": "string",
"enum": [
"json",
"text",
"srt",
"verbose_json",
"vtt"
],
"default": "json",
"description": "The format of the transcript output."
},
"temperature": {
"type": "number",
"minimum": 0,
"maximum": 1,
"default": 0,
"description": "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."
},
"timestamp_granularities": {
"type": "array",
"items": {
"type": "string",
"enum": [
"word",
"segment"
]
},
"description": "The timestamp granularities to populate for this translation."
}
}
}
}
}
},
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"oneOf": [
{
"$ref": "#/components/schemas/TranscriptionResponse"
},
{
"$ref": "#/components/schemas/TranscriptionTextResponse"
},
{
"$ref": "#/components/schemas/TranscriptionSrtResponse"
},
{
"$ref": "#/components/schemas/TranscriptionVttResponse"
}
]
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest"
},
"401": {
"$ref": "#/components/responses/Unauthorized"
},
"413": {
"$ref": "#/components/responses/FileTooLarge"
},
"422": {
"$ref": "#/components/responses/ValidationError"
},
"429": {
"$ref": "#/components/responses/RateLimitExceeded"
},
"500": {
"$ref": "#/components/responses/InternalServerError"
}
}
}
},
"/v1/models": {
"get": {
"summary": "List models",
"description": "Lists the currently available models, and provides basic information about each one such as the owner and availability.",
"operationId": "listModels",
"tags": [
"Models"
],
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ListModelsResponse"
}
}
}
},
"401": {
"$ref": "#/components/responses/Unauthorized"
},
"500": {
"$ref": "#/components/responses/InternalServerError"
}
}
}
},
"/v1/models/{model}": {
"get": {
"summary": "Retrieve model",
"description": "Retrieves a model instance, providing basic information about the model such as the owner and permissioning.",
"operationId": "retrieveModel",
"tags": [
"Models"
],
"parameters": [
{
"name": "model",
"in": "path",
"required": true,
"description": "The ID of the model to use for this request",
"schema": {
"type": "string",
"enum": [
"tiny",
"base",
"small",
"medium",
"large"
]
}
}
],
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Model"
}
}
}
},
"401": {
"$ref": "#/components/responses/Unauthorized"
},
"404": {
"$ref": "#/components/responses/NotFound"
},
"500": {
"$ref": "#/components/responses/InternalServerError"
}
}
}
},
"/v1/health": {
"get": {
"summary": "Health check",
"description": "Check the health status of the API server",
"operationId": "healthCheck",
"tags": [
"System"
],
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HealthResponse"
}
}
}
}
}
}
},
"/v1/websocket": {
"get": {
"summary": "WebSocket connection",
"description": "Establishes a WebSocket connection for real-time audio transcription.\nSend audio data as binary frames and receive transcription results.\n",
"operationId": "websocketConnection",
"tags": [
"Real-time"
],
"parameters": [
{
"name": "model",
"in": "query",
"description": "The model to use for transcription",
"schema": {
"type": "string",
"enum": [
"tiny",
"base",
"small",
"medium",
"large"
],
"default": "base"
}
},
{
"name": "language",
"in": "query",
"description": "The language of the input audio",
"schema": {
"type": "string",
"pattern": "^[a-z]{2}$"
}
},
{
"name": "task",
"in": "query",
"description": "The task to perform",
"schema": {
"type": "string",
"enum": [
"transcribe",
"translate"
],
"default": "transcribe"
}
}
],
"responses": {
"101": {
"description": "Switching Protocols",
"headers": {
"Upgrade": {
"schema": {
"type": "string",
"example": "websocket"
}
},
"Connection": {
"schema": {
"type": "string",
"example": "Upgrade"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest"
},
"401": {
"$ref": "#/components/responses/Unauthorized"
}
}
}
}
},
"components": {
"securitySchemes": {
"ApiKeyAuth": {
"type": "apiKey",
"in": "header",
"name": "Authorization",
"description": "API key authentication. Include your API key in the Authorization header.\nExample: `Authorization: Bearer your-api-key-here`\n"
}
},
"schemas": {
"TranscriptionResponse": {
"type": "object",
"properties": {
"text": {
"type": "string",
"description": "The transcribed text"
},
"language": {
"type": "string",
"description": "The language of the input audio"
},
"duration": {
"type": "number",
"description": "The duration of the input audio in seconds"
},
"words": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Word"
},
"description": "Extracted words and their corresponding timestamps"
},
"segments": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Segment"
},
"description": "Segments of the transcribed text with timestamps"
}
},
"required": [
"text"
]
},
"TranscriptionTextResponse": {
"type": "string",
"description": "The transcribed text as plain text"
},
"TranscriptionSrtResponse": {
"type": "string",
"description": "The transcribed text in SRT subtitle format"
},
"TranscriptionVttResponse": {
"type": "string",
"description": "The transcribed text in VTT subtitle format"
},
"Word": {
"type": "object",
"properties": {
"word": {
"type": "string",
"description": "The text content of the word"
},
"start": {
"type": "number",
"description": "Start time of the word in seconds"
},
"end": {
"type": "number",
"description": "End time of the word in seconds"
},
"probability": {
"type": "number",
"description": "Confidence score of the word (0-1)"
}
},
"required": [
"word",
"start",
"end"
]
},
"Segment": {
"type": "object",
"properties": {
"id": {
"type": "integer",
"description": "Unique identifier for the segment"
},
"seek": {
"type": "number",
"description": "Seek offset of the segment in seconds"
},
"start": {
"type": "number",
"description": "Start time of the segment in seconds"
},
"end": {
"type": "number",
"description": "End time of the segment in seconds"
},
"text": {
"type": "string",
"description": "The text content of the segment"
},
"tokens": {
"type": "array",
"items": {
"type": "integer"
},
"description": "Array of token IDs for the segment"
},
"temperature": {
"type": "number",
"description": "Temperature parameter used for generating this segment"
},
"avg_logprob": {
"type": "number",
"description": "Average log probability of the segment"
},
"compression_ratio": {
"type": "number",
"description": "Compression ratio of the segment"
},
"no_speech_prob": {
"type": "number",
"description": "Probability of no speech in this segment"
},
"words": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Word"
},
"description": "Words in this segment"
}
},
"required": [
"id",
"seek",
"start",
"end",
"text"
]
},
"Model": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "The model identifier"
},
"object": {
"type": "string",
"enum": [
"model"
],
"description": "The object type, which is always \"model\""
},
"created": {
"type": "integer",
"description": "The Unix timestamp (in seconds) when the model was created"
},
"owned_by": {
"type": "string",
"description": "The organization that owns the model"
},
"permission": {
"type": "array",
"items": {
"type": "object"
},
"description": "The permissions associated with the model"
},
"root": {
"type": "string",
"description": "The root of the model"
},
"parent": {
"type": "string",
"description": "The parent of the model"
}
},
"required": [
"id",
"object",
"created",
"owned_by"
]
},
"ListModelsResponse": {
"type": "object",
"properties": {
"object": {
"type": "string",
"enum": [
"list"
],
"description": "The object type, which is always \"list\""
},
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Model"
},
"description": "The list of models"
}
},
"required": [
"object",
"data"
]
},
"HealthResponse": {
"type": "object",
"properties": {
"status": {
"type": "string",
"enum": [
"healthy",
"unhealthy"
],
"description": "The health status of the service"
},
"service": {
"type": "string",
"description": "The name of the service"
},
"version": {
"type": "string",
"description": "The version of the service"
},
"timestamp": {
"type": "string",
"format": "date-time",
"description": "The current timestamp"
},
"uptime": {
"type": "number",
"description": "The uptime in seconds"
}
},
"required": [
"status",
"service"
]
},
"Error": {
"type": "object",
"properties": {
"error": {
"type": "object",
"properties": {
"message": {
"type": "string",
"description": "A human-readable error message"
},
"type": {
"type": "string",
"description": "The type of error"
},
"code": {
"type": "string",
"description": "The error code"
},
"param": {
"type": "string",
"description": "The parameter that caused the error"
}
}
}
},
"required": [
"error"
]
}
},
"responses": {
"BadRequest": {
"description": "Bad Request",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "Invalid request parameters",
"type": "invalid_request_error",
"code": "invalid_parameters"
}
}
}
}
},
"Unauthorized": {
"description": "Unauthorized",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "Invalid API key",
"type": "authentication_error",
"code": "invalid_api_key"
}
}
}
}
},
"FileTooLarge": {
"description": "File Too Large",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "File size exceeds maximum allowed size",
"type": "invalid_request_error",
"code": "file_too_large"
}
}
}
}
},
"ValidationError": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "Invalid file format",
"type": "invalid_request_error",
"code": "invalid_file_format"
}
}
}
}
},
"RateLimitExceeded": {
"description": "Rate Limit Exceeded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "Rate limit exceeded",
"type": "rate_limit_error",
"code": "rate_limit_exceeded"
}
}
}
}
},
"InternalServerError": {
"description": "Internal Server Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "An internal server error occurred",
"type": "server_error",
"code": "internal_error"
}
}
}
}
},
"NotFound": {
"description": "Not Found",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "Model not found",
"type": "invalid_request_error",
"code": "model_not_found"
}
}
}
}
}
}
},
"tags": [
{
"name": "Audio",
"description": "Audio transcription and translation operations"
},
{
"name": "Models",
"description": "Model management operations"
},
{
"name": "System",
"description": "System health and status operations"
},
{
"name": "Real-time",
"description": "Real-time audio processing via WebSocket"
}
]
}

View File

@ -9,5 +9,7 @@ av
jiwer
evaluate
numpy<2
openai-whisper==20240930
tokenizers==0.20.3
flask==3.0.0
flask-sock
websocket-client

727
scratch/dashboard.html Normal file
View File

@ -0,0 +1,727 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>WhisperLive Dashboard</title>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
<style>
:root {
--primary: #4f46e5;
--primary-hover: #4338ca;
--bg-color: #0f172a;
--card-bg: rgba(30, 41, 59, 0.7);
--text-main: #f8fafc;
--text-muted: #94a3b8;
--border: rgba(255, 255, 255, 0.1);
--success: #10b981;
--danger: #ef4444;
--warning: #f59e0b;
}
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: 'Inter', sans-serif;
background-color: var(--bg-color);
color: var(--text-main);
min-height: 100vh;
background-image:
radial-gradient(at 0% 0%, rgba(79, 70, 229, 0.15) 0px, transparent 50%),
radial-gradient(at 100% 100%, rgba(16, 185, 129, 0.1) 0px, transparent 50%);
background-attachment: fixed;
padding: 2rem;
}
.container {
max-width: 1000px;
margin: 0 auto;
}
.header {
text-align: center;
margin-bottom: 2rem;
}
.header h1 {
font-size: 2.5rem;
font-weight: 700;
background: linear-gradient(to right, #818cf8, #34d399);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 0.5rem;
}
.header p {
color: var(--text-muted);
}
.glass-panel {
background: var(--card-bg);
backdrop-filter: blur(12px);
-webkit-backdrop-filter: blur(12px);
border: 1px solid var(--border);
border-radius: 1rem;
padding: 1.5rem;
margin-bottom: 1.5rem;
box-shadow: 0 10px 25px -5px rgba(0, 0, 0, 0.3);
}
/* Config Section */
.config-grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 1rem;
}
@media (max-width: 768px) {
.config-grid {
grid-template-columns: 1fr;
}
}
.form-group {
margin-bottom: 1rem;
}
.form-group label {
display: block;
font-size: 0.875rem;
font-weight: 500;
margin-bottom: 0.5rem;
color: var(--text-muted);
}
input[type="text"],
input[type="file"],
select {
width: 100%;
padding: 0.75rem 1rem;
background: rgba(15, 23, 42, 0.6);
border: 1px solid var(--border);
border-radius: 0.5rem;
color: var(--text-main);
font-size: 0.875rem;
transition: all 0.2s;
}
input[type="text"]:focus,
select:focus {
outline: none;
border-color: var(--primary);
box-shadow: 0 0 0 2px rgba(79, 70, 229, 0.2);
}
/* Tabs */
.tabs {
display: flex;
gap: 0.5rem;
margin-bottom: 1rem;
border-bottom: 1px solid var(--border);
padding-bottom: 0.5rem;
}
.tab-btn {
background: transparent;
border: none;
color: var(--text-muted);
padding: 0.75rem 1.5rem;
font-size: 1rem;
font-weight: 500;
cursor: pointer;
border-radius: 0.5rem;
transition: all 0.2s;
}
.tab-btn:hover {
color: var(--text-main);
background: rgba(255, 255, 255, 0.05);
}
.tab-btn.active {
color: var(--text-main);
background: var(--primary);
box-shadow: 0 4px 6px -1px rgba(79, 70, 229, 0.4);
}
.tab-content {
display: none;
animation: fadeIn 0.3s ease-in-out;
}
.tab-content.active {
display: block;
}
@keyframes fadeIn {
from {
opacity: 0;
transform: translateY(5px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
/* Buttons */
.btn {
background: var(--primary);
color: white;
border: none;
padding: 0.75rem 1.5rem;
border-radius: 0.5rem;
font-weight: 600;
cursor: pointer;
transition: all 0.2s;
display: inline-flex;
align-items: center;
justify-content: center;
gap: 0.5rem;
width: 100%;
}
.btn:hover {
background: var(--primary-hover);
}
.btn:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.btn-danger {
background: var(--danger);
}
.btn-danger:hover {
background: #dc2626;
}
.btn-success {
background: var(--success);
}
.btn-success:hover {
background: #059669;
}
/* Results / Live View */
.transcript-box {
background: rgba(15, 23, 42, 0.6);
border: 1px solid var(--border);
border-radius: 0.5rem;
padding: 1.5rem;
min-height: 200px;
max-height: 400px;
overflow-y: auto;
margin-top: 1rem;
line-height: 1.6;
}
.segment {
margin-bottom: 0.75rem;
padding-bottom: 0.75rem;
border-bottom: 1px solid rgba(255, 255, 255, 0.05);
}
.segment:last-child {
border-bottom: none;
margin-bottom: 0;
padding-bottom: 0;
}
.segment-time {
font-size: 0.75rem;
color: var(--primary);
font-weight: 600;
margin-bottom: 0.25rem;
}
.status-badge {
display: inline-flex;
align-items: center;
gap: 0.3rem;
padding: 0.25rem 0.75rem;
border-radius: 9999px;
font-size: 0.75rem;
font-weight: 600;
}
.status-offline {
background: rgba(239, 68, 68, 0.2);
color: #fca5a5;
}
.status-online {
background: rgba(16, 185, 129, 0.2);
color: #6ee7b7;
}
.status-recording {
background: rgba(239, 68, 68, 0.2);
color: #fca5a5;
animation: pulse 2s infinite;
}
@keyframes pulse {
0% {
box-shadow: 0 0 0 0 rgba(239, 68, 68, 0.4);
}
70% {
box-shadow: 0 0 0 10px rgba(239, 68, 68, 0);
}
100% {
box-shadow: 0 0 0 0 rgba(239, 68, 68, 0);
}
}
/* Code snippets */
pre {
background: #1e293b;
padding: 1rem;
border-radius: 0.5rem;
overflow-x: auto;
font-size: 0.875rem;
color: #e2e8f0;
border: 1px solid var(--border);
margin-bottom: 1rem;
}
code {
font-family: 'Courier New', Courier, monospace;
}
.loading-spinner {
display: none;
width: 24px;
height: 24px;
border: 3px solid rgba(255, 255, 255, 0.3);
border-radius: 50%;
border-top-color: white;
animation: spin 1s ease-in-out infinite;
}
@keyframes spin {
to {
transform: rotate(360deg);
}
}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>WhisperLive</h1>
<p>High-Performance Real-Time Audio Transcription</p>
</div>
<!-- Configuration Panel -->
<div class="glass-panel">
<h3 style="margin-bottom: 1rem; font-size: 1.1rem;">Connection Settings</h3>
<div class="config-grid">
<div class="form-group">
<label>HTTP API URL (For File Upload & API)</label>
<input type="text" id="httpUrl" value="https://whisperlive.classroomcopilot.ai">
</div>
<div class="form-group">
<label>WebSocket URL (For Live Audio)</label>
<input type="text" id="wsUrl" value="wss://whisperlive.classroomcopilot.ai/ws">
</div>
</div>
<div style="margin-top: 0.5rem; font-size: 0.8rem; color: var(--text-muted);">
HTTP Status: <span id="httpStatus" class="status-badge status-offline">Checking...</span>
</div>
</div>
<!-- Main Workspace -->
<div class="glass-panel">
<div class="tabs">
<button class="tab-btn active" onclick="switchTab('file-tab')">File Upload</button>
<button class="tab-btn" onclick="switchTab('live-tab')">Live Microphone</button>
<button class="tab-btn" onclick="switchTab('api-tab')">API Usage</button>
</div>
<!-- Tab 1: File Upload -->
<div id="file-tab" class="tab-content active">
<form id="fileForm">
<div class="form-group">
<label>Audio File</label>
<input type="file" id="audioFile" accept=".wav,.mp3,.flac,.m4a,.ogg,.webm" required>
</div>
<div class="config-grid">
<div class="form-group">
<label>Language</label>
<select id="fileLanguage">
<option value="">Auto-detect</option>
<option value="en">English</option>
<option value="es">Spanish</option>
<option value="fr">French</option>
</select>
</div>
<div class="form-group">
<label>Task</label>
<select id="fileTask">
<option value="transcribe">Transcribe</option>
<option value="translate">Translate to English</option>
</select>
</div>
</div>
<button type="submit" class="btn" id="fileSubmitBtn">
<span>Transcribe File</span>
<div class="loading-spinner" id="fileSpinner"></div>
</button>
</form>
<div id="fileResult" style="display: none;">
<div class="transcript-box" id="fileTranscript"></div>
</div>
</div>
<!-- Tab 2: Live Recording -->
<div id="live-tab" class="tab-content">
<div class="config-grid" style="margin-bottom: 1.5rem;">
<div class="form-group">
<label>Language</label>
<select id="liveLanguage">
<option value="en">English</option>
<option value="es">Spanish</option>
<option value="fr">French</option>
</select>
</div>
<div class="form-group">
<label>Task</label>
<select id="liveTask">
<option value="transcribe">Transcribe</option>
<option value="translate">Translate to English</option>
</select>
</div>
</div>
<div style="display: flex; gap: 1rem; align-items: center;">
<button id="recordBtn" class="btn btn-success" style="width: auto;">
<span id="recordIcon">🎤</span> <span id="recordText">Start Recording</span>
</button>
<span id="liveStatus" class="status-badge status-offline" style="display: none;">Not
connected</span>
</div>
<div class="transcript-box" id="liveTranscript">
<div style="color: var(--text-muted); text-align: center; margin-top: 3rem;">
Click Start Recording to begin live transcription...
</div>
</div>
</div>
<!-- Tab 3: API Usage -->
<div id="api-tab" class="tab-content">
<h3 style="margin-bottom: 1rem;">OpenAI Compatible API</h3>
<p style="color: var(--text-muted); margin-bottom: 1rem; font-size: 0.9rem;">
WhisperLive acts as a drop-in replacement for OpenAI's Whisper API. You can use any standard OpenAI
client by changing the base URL.
</p>
<h4 style="margin-bottom: 0.5rem; color: #cbd5e1;">Python (openai package)</h4>
<pre><code id="pythonSnippet">from openai import OpenAI
client = OpenAI(
api_key="sk-no-key-required",
base_url="https://whisperlive.classroomcopilot.ai/v1/"
)
with open("audio.wav", "rb") as file:
transcription = client.audio.transcriptions.create(
file=file,
model="base",
response_format="verbose_json"
)
print(transcription.text)</code></pre>
<h4 style="margin-bottom: 0.5rem; color: #cbd5e1;">cURL</h4>
<pre><code id="curlSnippet">curl https://whisperlive.classroomcopilot.ai/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@audio.wav" \
-F model="base" \
-F response_format="verbose_json"</code></pre>
</div>
</div>
</div>
<script>
// Shared references to the connection-settings controls.
const httpUrlInput = document.getElementById('httpUrl');
const wsUrlInput = document.getElementById('wsUrl');
const httpStatus = document.getElementById('httpStatus');

// Start-up: when the page is served from a non-local host, point both URL
// fields at that host; otherwise the defaults written in the markup are kept.
window.onload = function () {
    const { hostname, origin } = window.location;
    const servedFromHost = hostname !== '' && hostname !== 'localhost';
    if (servedFromHost) {
        httpUrlInput.value = origin;
        // http -> ws and https -> wss, followed by the /ws path.
        wsUrlInput.value = origin.replace(/^http/, 'ws') + '/ws';
    }
    checkHealth();
    updateSnippets();
};

// Re-probe the server and refresh the code samples whenever the HTTP URL is edited.
httpUrlInput.addEventListener('change', function () {
    checkHealth();
    updateSnippets();
});
// Tab Switching
/**
 * Shows the tab panel with the given id and highlights its tab button.
 *
 * The button is located through its inline `onclick="switchTab('<id>')"`
 * attribute rather than the deprecated implicit global `event`, so the
 * function also works when it is called programmatically.
 *
 * @param {string} tabId - id of the `.tab-content` element to activate.
 */
function switchTab(tabId) {
    const panel = document.getElementById(tabId);
    // Unknown id: leave the current tab untouched instead of clearing it and throwing.
    if (!panel) return;

    const buttons = Array.from(document.querySelectorAll('.tab-btn'));
    document.querySelectorAll('.tab-content').forEach(t => t.classList.remove('active'));
    buttons.forEach(b => b.classList.remove('active'));
    panel.classList.add('active');

    let trigger = buttons.find(b => (b.getAttribute('onclick') || '').includes(`'${tabId}'`));
    if (!trigger && window.event && window.event.target && window.event.target.closest) {
        // Fallback for buttons wired up differently: use the clicked element.
        trigger = window.event.target.closest('.tab-btn');
    }
    if (trigger) trigger.classList.add('active');
}
// Health Check
/**
 * Probes `<HTTP API URL>/health` and updates the HTTP status badge.
 *
 * Any non-2xx response or fetch failure is shown as "Offline".
 * Trailing slashes are stripped from the configured URL first, matching how
 * the other request builders in this page normalise it, so a value such as
 * "https://host/" does not produce "https://host//health".
 *
 * NOTE(review): the project's OpenAPI description lists the health endpoint
 * as /v1/health — confirm that the server also answers on /health.
 */
async function checkHealth() {
    const baseUrl = httpUrlInput.value.trim().replace(/\/+$/, '');
    try {
        const res = await fetch(`${baseUrl}/health`);
        if (!res.ok) throw new Error(`HTTP ${res.status}`);
        httpStatus.className = 'status-badge status-online';
        httpStatus.textContent = '✅ Online';
    } catch (e) {
        httpStatus.className = 'status-badge status-offline';
        httpStatus.textContent = '❌ Offline';
    }
}
// Update Code Snippets
/**
 * Rewrites the Python and cURL samples on the "API Usage" tab so that they
 * point at the HTTP API URL currently entered in the settings panel
 * (one trailing slash, if present, is dropped first).
 */
function updateSnippets() {
    const root = httpUrlInput.value.replace(/\/$/, '');
    const pythonEl = document.getElementById('pythonSnippet');
    const curlEl = document.getElementById('curlSnippet');
    pythonEl.textContent = `from openai import OpenAI\n\nclient = OpenAI(\n api_key="sk-no-key-required",\n base_url="${root}/v1/"\n)\n\nwith open("audio.wav", "rb") as file:\n transcription = client.audio.transcriptions.create(\n file=file,\n model="base",\n response_format="verbose_json"\n )\n \nprint(transcription.text)`;
    curlEl.textContent = `curl ${root}/v1/audio/transcriptions \\\n -H "Content-Type: multipart/form-data" \\\n -F file="@audio.wav" \\\n -F model="base" \\\n -F response_format="verbose_json"`;
}
// Utility: Format Time
/**
 * Formats a duration in seconds as "M:SS.ss".
 *
 * The value is rounded to whole hundredths of a second BEFORE it is split
 * into minutes and seconds, so a carry propagates into the minutes
 * (59.999 -> "1:00.00" rather than "0:60.00").
 *
 * @param {number} seconds - duration in seconds.
 * @returns {string} formatted time; "0:00" for falsy or non-finite input.
 */
function formatTime(seconds) {
    if (!seconds) return "0:00";
    const totalCentis = Math.round(Number(seconds) * 100);
    if (!Number.isFinite(totalCentis)) return "0:00";
    const mins = Math.floor(totalCentis / 6000);
    const secs = ((totalCentis % 6000) / 100).toFixed(2);
    // "5.50" -> "05.50" so the seconds field always has two integer digits.
    return `${mins}:${secs.padStart(5, '0')}`;
}
// ==========================================
// FEATURE 1: FILE TRANSCRIPTION
// ==========================================
/**
 * Upload handler for the "File Upload" tab.
 *
 * Posts the selected audio file as multipart form data to
 * `<HTTP API URL>/v1/audio/transcriptions` (or `/v1/audio/translations` when
 * the task is "translate") and renders the returned segments, or the plain
 * text when no segments are present.
 *
 * Everything received from the server (segment text, error messages) is
 * inserted with `textContent`, never as HTML, so markup contained in a
 * transcript or error payload cannot be injected into the page.
 */
document.getElementById('fileForm').addEventListener('submit', async (e) => {
    e.preventDefault();
    const file = document.getElementById('audioFile').files[0];
    if (!file) return;
    const btn = document.getElementById('fileSubmitBtn');
    const spinner = document.getElementById('fileSpinner');
    const resultBox = document.getElementById('fileResult');
    const transcriptBox = document.getElementById('fileTranscript');

    // Creates a <div> with an optional class whose content is plain text.
    const textDiv = (className, text) => {
        const div = document.createElement('div');
        if (className) div.className = className;
        div.textContent = text;
        return div;
    };
    // Replaces the transcript box content with a single red message line.
    const showError = (message) => {
        const div = textDiv('', message);
        div.setAttribute('style', 'color: var(--danger)');
        transcriptBox.textContent = '';
        transcriptBox.appendChild(div);
    };

    // Busy state: block double submits and hide the previous result.
    btn.disabled = true;
    spinner.style.display = 'block';
    resultBox.style.display = 'none';

    const formData = new FormData();
    formData.append('file', file);
    formData.append('model', 'base');
    formData.append('response_format', 'verbose_json');
    const lang = document.getElementById('fileLanguage').value;
    // An empty value means auto-detect: the field is simply not sent.
    if (lang) formData.append('language', lang);
    const task = document.getElementById('fileTask').value;
    const baseUrl = httpUrlInput.value.endsWith('/') ? httpUrlInput.value.slice(0, -1) : httpUrlInput.value;
    const endpoint = task === 'translate' ? `${baseUrl}/v1/audio/translations` : `${baseUrl}/v1/audio/transcriptions`;

    try {
        const response = await fetch(endpoint, { method: 'POST', body: formData });
        const data = await response.json();
        resultBox.style.display = 'block';
        if (response.ok) {
            const fragment = document.createDocumentFragment();
            if (data.segments && data.segments.length > 0) {
                data.segments.forEach(seg => {
                    const row = textDiv('segment', '');
                    row.appendChild(textDiv('segment-time', `${formatTime(seg.start)} - ${formatTime(seg.end)}`));
                    row.appendChild(textDiv('segment-text', seg.text ?? ''));
                    fragment.appendChild(row);
                });
            } else if (data.text) {
                const row = textDiv('segment', '');
                row.appendChild(textDiv('segment-text', data.text));
                fragment.appendChild(row);
            }
            transcriptBox.textContent = '';
            transcriptBox.appendChild(fragment);
        } else {
            showError(`Error: ${data.error?.message || JSON.stringify(data.error)}`);
        }
    } catch (error) {
        // Reached for fetch failures and for response bodies that are not valid JSON.
        resultBox.style.display = 'block';
        showError(`Network Error: ${error.message}`);
    } finally {
        btn.disabled = false;
        spinner.style.display = 'none';
    }
});
// ==========================================
// FEATURE 2: LIVE WEBSOCKET TRANSCRIPTION
// ==========================================
// Mutable state of the live session; assigned by startRecording(),
// startAudioCapture() and stopRecording().
let ws = null;            // WebSocket to the transcription server
let audioContext = null;  // Web Audio context used for microphone capture
let mediaStream = null;   // stream returned by getUserMedia
let processor = null;     // script processor node forwarding audio buffers
let isRecording = false;  // true between startRecording() and stopRecording()

// Controls of the "Live Microphone" tab.
const recordBtn = document.getElementById('recordBtn');
const liveStatus = document.getElementById('liveStatus');
const liveTranscript = document.getElementById('liveTranscript');

// The single record button toggles between starting and stopping a session.
recordBtn.addEventListener('click', async () => {
    if (!isRecording) {
        startRecording();
        return;
    }
    stopRecording();
});
/**
 * Starts a live transcription session.
 *
 * Opens a WebSocket to the configured URL, sends the session options as JSON
 * once it is open, and reacts to server messages:
 *   - message "SERVER_READY": start microphone capture;
 *   - `segments`: render them;
 *   - status "WAIT": show the queue estimate carried in `message`;
 *   - message "DISCONNECT": stop the session.
 *
 * Handlers act only while their socket is still the current one (`ws`), so a
 * late event from an earlier session cannot tear down a newer one. A failure
 * to start the microphone is caught and reported as "Microphone Error"; a
 * failure to create the socket is reported as "Connection Error".
 */
async function startRecording() {
    liveTranscript.innerHTML = '';
    liveStatus.style.display = 'inline-flex';
    liveStatus.className = 'status-badge status-offline';
    liveStatus.textContent = 'Connecting...';
    try {
        // 1. Connect WebSocket
        const socket = new WebSocket(wsUrlInput.value);
        ws = socket;
        const isCurrent = () => ws === socket;

        socket.onopen = () => {
            // Send options to server
            const options = {
                uid: "web-" + Math.random().toString(36).substring(7),
                language: document.getElementById('liveLanguage').value,
                task: document.getElementById('liveTask').value,
                model: "base",
                use_vad: true
            };
            socket.send(JSON.stringify(options));
        };

        socket.onmessage = async (event) => {
            let data;
            try {
                data = JSON.parse(event.data);
            } catch (parseErr) {
                // Not a JSON control message; nothing to act on.
                console.error('Unparseable WebSocket message', parseErr);
                return;
            }
            if (data.message === "SERVER_READY") {
                if (!isCurrent()) return;
                liveStatus.className = 'status-badge status-recording';
                liveStatus.innerHTML = '🔴 Recording';
                try {
                    await startAudioCapture();
                } catch (micErr) {
                    // e.g. permission denied or no input device available.
                    console.error(micErr);
                    stopRecording();
                    liveStatus.className = 'status-badge status-offline';
                    liveStatus.textContent = 'Microphone Error';
                }
            } else if (data.segments) {
                renderLiveSegments(data.segments);
            } else if (data.status === "WAIT") {
                liveStatus.textContent = `Waiting in queue (Est: ${data.message} min)`;
            } else if (data.message === "DISCONNECT") {
                if (!isCurrent()) return;
                stopRecording();
                liveStatus.className = 'status-badge status-offline';
                liveStatus.textContent = 'Disconnected by server';
            }
        };

        socket.onerror = (err) => {
            console.error('WebSocket Error', err);
            if (!isCurrent()) return;
            stopRecording();
            liveStatus.className = 'status-badge status-offline';
            liveStatus.textContent = 'Connection Error';
        };

        socket.onclose = () => {
            if (!isCurrent()) return;
            stopRecording();
        };

        // Update UI
        isRecording = true;
        recordBtn.className = 'btn btn-danger';
        document.getElementById('recordIcon').textContent = '⏹';
        document.getElementById('recordText').textContent = 'Stop Recording';
    } catch (err) {
        // Only the WebSocket constructor can throw synchronously in this block
        // (for example on a malformed URL), so this is a connection problem.
        console.error(err);
        liveStatus.className = 'status-badge status-offline';
        liveStatus.textContent = 'Connection Error';
        stopRecording();
    }
}
/**
 * Requests microphone access and forwards captured audio to the WebSocket.
 *
 * Builds the graph: microphone source -> script processor -> destination,
 * on an audio context created with a requested sample rate of 16000. Each
 * processed buffer's first channel is sent as a binary frame while a
 * recording is active and the socket is open.
 */
async function startAudioCapture() {
    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });

    const ContextCtor = window.AudioContext || window.webkitAudioContext;
    audioContext = new ContextCtor({ sampleRate: 16000 });
    const micSource = audioContext.createMediaStreamSource(mediaStream);

    // Buffer size 4096, one input channel, one output channel.
    processor = audioContext.createScriptProcessor(4096, 1, 1);
    processor.onaudioprocess = (audioEvent) => {
        if (isRecording && ws.readyState === WebSocket.OPEN) {
            const samples = audioEvent.inputBuffer.getChannelData(0);
            ws.send(samples.buffer);
        }
    };

    micSource.connect(processor);
    processor.connect(audioContext.destination);
}
/**
 * Stops the live session and releases everything it holds: the processor
 * node, the microphone tracks, the audio context and the WebSocket, then
 * resets the record button and status badge.
 *
 * The socket is copied to a local before the shared `ws` variable is
 * cleared, so the delayed close acts on the real socket; previously the
 * timer dereferenced `ws` after it had been set to null, which threw and left
 * the connection open. A socket that is still connecting is closed at once.
 */
function stopRecording() {
    isRecording = false;
    if (processor) {
        processor.disconnect();
        processor = null;
    }
    if (mediaStream) {
        mediaStream.getTracks().forEach(track => track.stop());
        mediaStream = null;
    }
    if (audioContext) {
        // close() returns a promise; a rejection here is not actionable.
        audioContext.close().catch(() => {});
        audioContext = null;
    }
    if (ws) {
        const socket = ws;
        ws = null;
        // This session is finished: its close/error events must not call
        // back into stopRecording() and disturb a session started later.
        socket.onclose = null;
        socket.onerror = null;
        if (socket.readyState === WebSocket.OPEN) {
            socket.send("END_OF_AUDIO");
            // Leave the server a moment to answer before closing.
            setTimeout(() => socket.close(), 1000);
        } else if (socket.readyState === WebSocket.CONNECTING) {
            socket.close();
        }
    }
    recordBtn.className = 'btn btn-success';
    document.getElementById('recordIcon').textContent = '🎤';
    document.getElementById('recordText').textContent = 'Start Recording';
    // Only replace the badge when it still shows the recording state, so
    // error messages set by callers are preserved.
    if (liveStatus.textContent === '🔴 Recording') {
        liveStatus.className = 'status-badge status-offline';
        liveStatus.textContent = 'Stopped';
    }
}
let liveSegments = [];
/**
 * Replaces the live transcript with the given segments and scrolls to the
 * bottom.
 *
 * Segment text comes from the server and is inserted with `textContent`
 * rather than as HTML, so markup inside a transcript cannot be injected into
 * the page.
 *
 * @param {Array<Object>} segments - objects with `text` and optional
 *   `start` / `end` values; the time line is omitted when either is undefined.
 */
function renderLiveSegments(segments) {
    const fragment = document.createDocumentFragment();
    segments.forEach(seg => {
        const row = document.createElement('div');
        row.className = 'segment';
        if (seg.start !== undefined && seg.end !== undefined) {
            const time = document.createElement('div');
            time.className = 'segment-time';
            time.textContent = `${formatTime(seg.start)} - ${formatTime(seg.end)}`;
            row.appendChild(time);
        }
        const text = document.createElement('div');
        text.className = 'segment-text';
        text.textContent = seg.text ?? '';
        row.appendChild(text);
        fragment.appendChild(row);
    });
    liveTranscript.textContent = '';
    liveTranscript.appendChild(fragment);
    liveTranscript.scrollTop = liveTranscript.scrollHeight;
}
</script>
</body>
</html>

9
scratch/test_ws.py Normal file
View File

@ -0,0 +1,9 @@
import websockets
from websockets.sync.server import serve
def handler(websocket):
    """Print the request path of the incoming connection, then send "Hello"."""
    request_path = websocket.request.path
    print("Path:", request_path)
    websocket.send("Hello")


# Serve on the loopback interface only, and block until interrupted.
bind_host, bind_port = "127.0.0.1", 8765
with serve(handler, bind_host, bind_port) as server:
    server.serve_forever()

727
test_form.html Normal file
View File

@ -0,0 +1,727 @@
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>WhisperLive Dashboard</title>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap" rel="stylesheet">
<style>
:root {
--primary: #4f46e5;
--primary-hover: #4338ca;
--bg-color: #0f172a;
--card-bg: rgba(30, 41, 59, 0.7);
--text-main: #f8fafc;
--text-muted: #94a3b8;
--border: rgba(255, 255, 255, 0.1);
--success: #10b981;
--danger: #ef4444;
--warning: #f59e0b;
}
* {
box-sizing: border-box;
margin: 0;
padding: 0;
}
body {
font-family: 'Inter', sans-serif;
background-color: var(--bg-color);
color: var(--text-main);
min-height: 100vh;
background-image:
radial-gradient(at 0% 0%, rgba(79, 70, 229, 0.15) 0px, transparent 50%),
radial-gradient(at 100% 100%, rgba(16, 185, 129, 0.1) 0px, transparent 50%);
background-attachment: fixed;
padding: 2rem;
}
.container {
max-width: 1000px;
margin: 0 auto;
}
.header {
text-align: center;
margin-bottom: 2rem;
}
.header h1 {
font-size: 2.5rem;
font-weight: 700;
background: linear-gradient(to right, #818cf8, #34d399);
-webkit-background-clip: text;
-webkit-text-fill-color: transparent;
margin-bottom: 0.5rem;
}
.header p {
color: var(--text-muted);
}
.glass-panel {
background: var(--card-bg);
backdrop-filter: blur(12px);
-webkit-backdrop-filter: blur(12px);
border: 1px solid var(--border);
border-radius: 1rem;
padding: 1.5rem;
margin-bottom: 1.5rem;
box-shadow: 0 10px 25px -5px rgba(0, 0, 0, 0.3);
}
/* Config Section */
.config-grid {
display: grid;
grid-template-columns: 1fr 1fr;
gap: 1rem;
}
@media (max-width: 768px) {
.config-grid {
grid-template-columns: 1fr;
}
}
.form-group {
margin-bottom: 1rem;
}
.form-group label {
display: block;
font-size: 0.875rem;
font-weight: 500;
margin-bottom: 0.5rem;
color: var(--text-muted);
}
input[type="text"],
input[type="file"],
select {
width: 100%;
padding: 0.75rem 1rem;
background: rgba(15, 23, 42, 0.6);
border: 1px solid var(--border);
border-radius: 0.5rem;
color: var(--text-main);
font-size: 0.875rem;
transition: all 0.2s;
}
input[type="text"]:focus,
select:focus {
outline: none;
border-color: var(--primary);
box-shadow: 0 0 0 2px rgba(79, 70, 229, 0.2);
}
/* Tabs */
.tabs {
display: flex;
gap: 0.5rem;
margin-bottom: 1rem;
border-bottom: 1px solid var(--border);
padding-bottom: 0.5rem;
}
.tab-btn {
background: transparent;
border: none;
color: var(--text-muted);
padding: 0.75rem 1.5rem;
font-size: 1rem;
font-weight: 500;
cursor: pointer;
border-radius: 0.5rem;
transition: all 0.2s;
}
.tab-btn:hover {
color: var(--text-main);
background: rgba(255, 255, 255, 0.05);
}
.tab-btn.active {
color: var(--text-main);
background: var(--primary);
box-shadow: 0 4px 6px -1px rgba(79, 70, 229, 0.4);
}
.tab-content {
display: none;
animation: fadeIn 0.3s ease-in-out;
}
.tab-content.active {
display: block;
}
@keyframes fadeIn {
from {
opacity: 0;
transform: translateY(5px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
/* Buttons */
.btn {
background: var(--primary);
color: white;
border: none;
padding: 0.75rem 1.5rem;
border-radius: 0.5rem;
font-weight: 600;
cursor: pointer;
transition: all 0.2s;
display: inline-flex;
align-items: center;
justify-content: center;
gap: 0.5rem;
width: 100%;
}
.btn:hover {
background: var(--primary-hover);
}
.btn:disabled {
opacity: 0.5;
cursor: not-allowed;
}
.btn-danger {
background: var(--danger);
}
.btn-danger:hover {
background: #dc2626;
}
.btn-success {
background: var(--success);
}
.btn-success:hover {
background: #059669;
}
/* Results / Live View */
.transcript-box {
background: rgba(15, 23, 42, 0.6);
border: 1px solid var(--border);
border-radius: 0.5rem;
padding: 1.5rem;
min-height: 200px;
max-height: 400px;
overflow-y: auto;
margin-top: 1rem;
line-height: 1.6;
}
.segment {
margin-bottom: 0.75rem;
padding-bottom: 0.75rem;
border-bottom: 1px solid rgba(255, 255, 255, 0.05);
}
.segment:last-child {
border-bottom: none;
margin-bottom: 0;
padding-bottom: 0;
}
.segment-time {
font-size: 0.75rem;
color: var(--primary);
font-weight: 600;
margin-bottom: 0.25rem;
}
.status-badge {
display: inline-flex;
align-items: center;
gap: 0.3rem;
padding: 0.25rem 0.75rem;
border-radius: 9999px;
font-size: 0.75rem;
font-weight: 600;
}
.status-offline {
background: rgba(239, 68, 68, 0.2);
color: #fca5a5;
}
.status-online {
background: rgba(16, 185, 129, 0.2);
color: #6ee7b7;
}
.status-recording {
background: rgba(239, 68, 68, 0.2);
color: #fca5a5;
animation: pulse 2s infinite;
}
@keyframes pulse {
0% {
box-shadow: 0 0 0 0 rgba(239, 68, 68, 0.4);
}
70% {
box-shadow: 0 0 0 10px rgba(239, 68, 68, 0);
}
100% {
box-shadow: 0 0 0 0 rgba(239, 68, 68, 0);
}
}
/* Code snippets */
pre {
background: #1e293b;
padding: 1rem;
border-radius: 0.5rem;
overflow-x: auto;
font-size: 0.875rem;
color: #e2e8f0;
border: 1px solid var(--border);
margin-bottom: 1rem;
}
code {
font-family: 'Courier New', Courier, monospace;
}
.loading-spinner {
display: none;
width: 24px;
height: 24px;
border: 3px solid rgba(255, 255, 255, 0.3);
border-radius: 50%;
border-top-color: white;
animation: spin 1s ease-in-out infinite;
}
@keyframes spin {
to {
transform: rotate(360deg);
}
}
</style>
</head>
<body>
<div class="container">
<div class="header">
<h1>WhisperLive</h1>
<p>High-Performance Real-Time Audio Transcription</p>
</div>
<!-- Configuration Panel -->
<div class="glass-panel">
<h3 style="margin-bottom: 1rem; font-size: 1.1rem;">Connection Settings</h3>
<div class="config-grid">
<div class="form-group">
<label>HTTP API URL (For File Upload & API)</label>
<input type="text" id="httpUrl" value="https://whisperlive.classroomcopilot.ai">
</div>
<div class="form-group">
<label>WebSocket URL (For Live Audio)</label>
<input type="text" id="wsUrl" value="wss://whisperlive.classroomcopilot.ai/ws">
</div>
</div>
<div style="margin-top: 0.5rem; font-size: 0.8rem; color: var(--text-muted);">
HTTP Status: <span id="httpStatus" class="status-badge status-offline">Checking...</span>
</div>
</div>
<!-- Main Workspace -->
<div class="glass-panel">
<div class="tabs">
<button class="tab-btn active" onclick="switchTab('file-tab')">File Upload</button>
<button class="tab-btn" onclick="switchTab('live-tab')">Live Microphone</button>
<button class="tab-btn" onclick="switchTab('api-tab')">API Usage</button>
</div>
<!-- Tab 1: File Upload -->
<div id="file-tab" class="tab-content active">
<form id="fileForm">
<div class="form-group">
<label>Audio File</label>
<input type="file" id="audioFile" accept=".wav,.mp3,.flac,.m4a,.ogg,.webm" required>
</div>
<div class="config-grid">
<div class="form-group">
<label>Language</label>
<select id="fileLanguage">
<option value="">Auto-detect</option>
<option value="en">English</option>
<option value="es">Spanish</option>
<option value="fr">French</option>
</select>
</div>
<div class="form-group">
<label>Task</label>
<select id="fileTask">
<option value="transcribe">Transcribe</option>
<option value="translate">Translate to English</option>
</select>
</div>
</div>
<button type="submit" class="btn" id="fileSubmitBtn">
<span>Transcribe File</span>
<div class="loading-spinner" id="fileSpinner"></div>
</button>
</form>
<div id="fileResult" style="display: none;">
<div class="transcript-box" id="fileTranscript"></div>
</div>
</div>
<!-- Tab 2: Live Recording -->
<div id="live-tab" class="tab-content">
<div class="config-grid" style="margin-bottom: 1.5rem;">
<div class="form-group">
<label>Language</label>
<select id="liveLanguage">
<option value="en">English</option>
<option value="es">Spanish</option>
<option value="fr">French</option>
</select>
</div>
<div class="form-group">
<label>Task</label>
<select id="liveTask">
<option value="transcribe">Transcribe</option>
<option value="translate">Translate to English</option>
</select>
</div>
</div>
<div style="display: flex; gap: 1rem; align-items: center;">
<button id="recordBtn" class="btn btn-success" style="width: auto;">
<span id="recordIcon">🎤</span> <span id="recordText">Start Recording</span>
</button>
<span id="liveStatus" class="status-badge status-offline" style="display: none;">Not
connected</span>
</div>
<div class="transcript-box" id="liveTranscript">
<div style="color: var(--text-muted); text-align: center; margin-top: 3rem;">
Click Start Recording to begin live transcription...
</div>
</div>
</div>
<!-- Tab 3: API Usage -->
<div id="api-tab" class="tab-content">
<h3 style="margin-bottom: 1rem;">OpenAI Compatible API</h3>
<p style="color: var(--text-muted); margin-bottom: 1rem; font-size: 0.9rem;">
WhisperLive acts as a drop-in replacement for OpenAI's Whisper API. You can use any standard OpenAI
client by changing the base URL.
</p>
<h4 style="margin-bottom: 0.5rem; color: #cbd5e1;">Python (openai package)</h4>
<pre><code id="pythonSnippet">from openai import OpenAI
client = OpenAI(
api_key="sk-no-key-required",
base_url="https://whisperlive.classroomcopilot.ai/v1/"
)
with open("audio.wav", "rb") as file:
transcription = client.audio.transcriptions.create(
file=file,
model="base",
response_format="verbose_json"
)
print(transcription.text)</code></pre>
<h4 style="margin-bottom: 0.5rem; color: #cbd5e1;">cURL</h4>
<pre><code id="curlSnippet">curl https://whisperlive.classroomcopilot.ai/v1/audio/transcriptions \
-H "Content-Type: multipart/form-data" \
-F file="@audio.wav" \
-F model="base" \
-F response_format="verbose_json"</code></pre>
</div>
</div>
</div>
<script>
// DOM Elements
// Cached references to the connection-settings controls.
const httpUrlInput = document.getElementById('httpUrl');
const wsUrlInput = document.getElementById('wsUrl');
const httpStatus = document.getElementById('httpStatus');

// Page start-up: when the page is served from a real host (neither a bare
// file nor localhost) point both URL fields at that host, then probe the
// server and refresh the example snippets.
window.onload = () => {
    const { hostname, origin } = window.location;
    const servedFromRealHost = hostname !== '' && hostname !== 'localhost';
    if (servedFromRealHost) {
        httpUrlInput.value = origin;
        // http -> ws and https -> wss, because only the leading "http" is swapped.
        wsUrlInput.value = origin.replace(/^http/, 'ws') + '/ws';
    }
    checkHealth();
    updateSnippets();
};

// Re-check the server and rebuild the snippets whenever the HTTP URL is edited.
httpUrlInput.addEventListener('change', () => {
    checkHealth();
    updateSnippets();
});
// Tab Switching
/**
 * Show the tab panel with the given id and highlight its tab button.
 *
 * The button is located through its inline `onclick` attribute (which names
 * the tab id) instead of the deprecated implicit global `event`, so the
 * function also works when called from script rather than from a click.
 *
 * @param {string} tabId - id of the `.tab-content` element to activate.
 */
function switchTab(tabId) {
    const panel = document.getElementById(tabId);
    // Unknown id: leave the current tab untouched instead of blanking the UI.
    if (!panel) return;

    document.querySelectorAll('.tab-content').forEach(t => t.classList.remove('active'));
    panel.classList.add('active');

    document.querySelectorAll('.tab-btn').forEach(btn => {
        const handler = btn.getAttribute('onclick') || '';
        btn.classList.toggle('active', handler.includes(`'${tabId}'`));
    });
}
// Health Check
/**
 * Probe `<HTTP API URL>/health` and reflect the outcome in the status badge.
 *
 * Any non-2xx response or fetch failure is shown as offline. A single
 * trailing slash on the configured URL is dropped first so the request does
 * not go to `//health`.
 */
async function checkHealth() {
    const baseUrl = httpUrlInput.value.endsWith('/') ? httpUrlInput.value.slice(0, -1) : httpUrlInput.value;
    try {
        const res = await fetch(`${baseUrl}/health`);
        if (!res.ok) throw new Error(`Health check returned HTTP ${res.status}`);
        httpStatus.className = 'status-badge status-online';
        httpStatus.textContent = '✅ Online';
    } catch (e) {
        httpStatus.className = 'status-badge status-offline';
        httpStatus.textContent = '❌ Offline';
    }
}
// Update Code Snippets
/**
 * Rewrite the Python and cURL examples on the "API Usage" tab so they point
 * at the HTTP API URL currently entered in the settings panel.
 */
function updateSnippets() {
    // Drop one trailing slash so the generated URLs never contain "//v1".
    const baseUrl = httpUrlInput.value.replace(/\/$/, '');
    const pythonTarget = document.getElementById('pythonSnippet');
    const curlTarget = document.getElementById('curlSnippet');

    pythonTarget.textContent = `from openai import OpenAI\n\nclient = OpenAI(\n api_key="sk-no-key-required",\n base_url="${baseUrl}/v1/"\n)\n\nwith open("audio.wav", "rb") as file:\n transcription = client.audio.transcriptions.create(\n file=file,\n model="base",\n response_format="verbose_json"\n )\n \nprint(transcription.text)`;
    curlTarget.textContent = `curl ${baseUrl}/v1/audio/transcriptions \\\n -H "Content-Type: multipart/form-data" \\\n -F file="@audio.wav" \\\n -F model="base" \\\n -F response_format="verbose_json"`;
}
// Utility: Format Time
/**
 * Format a duration in seconds as `M:SS.cc` (minutes, zero-padded seconds,
 * hundredths).
 *
 * The value is rounded to whole hundredths *before* it is split into minutes
 * and seconds, so a value such as 59.999 becomes "1:00.00" rather than the
 * impossible "0:60.00". Numeric strings are accepted because the arithmetic
 * coerces them.
 *
 * @param {number|string} seconds - duration in seconds.
 * @returns {string} formatted time; "0:00" for 0 or any other falsy input.
 */
function formatTime(seconds) {
    if (!seconds) return "0:00";
    const totalHundredths = Math.round(seconds * 100);
    const mins = Math.floor(totalHundredths / 6000);
    const secs = ((totalHundredths % 6000) / 100).toFixed(2);
    return `${mins}:${secs.padStart(5, '0')}`;
}
// ==========================================
// FEATURE 1: FILE TRANSCRIPTION
// ==========================================
// Submit handler for the file-upload form.
//
// Posts the chosen audio file as multipart form data to the transcription
// endpoint (or the translation endpoint when the task is "translate") and
// renders the returned segments. Everything that comes back from the server
// is inserted with textContent, never innerHTML, so transcript text or error
// messages containing markup are displayed literally instead of being parsed.
document.getElementById('fileForm').addEventListener('submit', async (e) => {
    e.preventDefault();
    const file = document.getElementById('audioFile').files[0];
    if (!file) return;

    const btn = document.getElementById('fileSubmitBtn');
    const spinner = document.getElementById('fileSpinner');
    const resultBox = document.getElementById('fileResult');
    const transcriptBox = document.getElementById('fileTranscript');

    // Build a <div> whose content is plain text.
    const makeDiv = (className, text) => {
        const el = document.createElement('div');
        if (className) el.className = className;
        if (text !== undefined && text !== null) el.textContent = text;
        return el;
    };
    // Replace the transcript with a single red error line.
    const showError = (message) => {
        const el = makeDiv('', message);
        el.style.color = 'var(--danger)';
        transcriptBox.replaceChildren(el);
    };

    btn.disabled = true;
    spinner.style.display = 'block';
    resultBox.style.display = 'none';

    const formData = new FormData();
    formData.append('file', file);
    formData.append('model', 'base');
    formData.append('response_format', 'verbose_json');
    const lang = document.getElementById('fileLanguage').value;
    // An empty value means auto-detect, so the field is simply not sent.
    if (lang) formData.append('language', lang);

    const task = document.getElementById('fileTask').value;
    const baseUrl = httpUrlInput.value.endsWith('/') ? httpUrlInput.value.slice(0, -1) : httpUrlInput.value;
    const endpoint = task === 'translate' ? `${baseUrl}/v1/audio/translations` : `${baseUrl}/v1/audio/transcriptions`;

    try {
        const response = await fetch(endpoint, { method: 'POST', body: formData });

        // The body may not be JSON (e.g. an HTML error page); treat that as
        // "no payload" rather than letting it surface as a network error.
        let data = null;
        try {
            data = await response.json();
        } catch (parseErr) {
            data = null;
        }

        resultBox.style.display = 'block';
        if (response.ok && data) {
            const fragment = document.createDocumentFragment();
            if (data.segments && data.segments.length > 0) {
                data.segments.forEach(seg => {
                    const row = makeDiv('segment');
                    row.appendChild(makeDiv('segment-time', `${formatTime(seg.start)} - ${formatTime(seg.end)}`));
                    row.appendChild(makeDiv('segment-text', seg.text));
                    fragment.appendChild(row);
                });
            } else if (data.text) {
                const row = makeDiv('segment');
                row.appendChild(makeDiv('segment-text', data.text));
                fragment.appendChild(row);
            }
            transcriptBox.replaceChildren(fragment);
        } else {
            let detail;
            if (data?.error?.message) {
                detail = data.error.message;
            } else if (data?.error) {
                detail = JSON.stringify(data.error);
            } else if (response.ok) {
                detail = 'Unexpected non-JSON response';
            } else {
                detail = `HTTP ${response.status}`;
            }
            showError(`Error: ${detail}`);
        }
    } catch (error) {
        resultBox.style.display = 'block';
        showError(`Network Error: ${error.message}`);
    } finally {
        btn.disabled = false;
        spinner.style.display = 'none';
    }
});
// ==========================================
// FEATURE 2: LIVE WEBSOCKET TRANSCRIPTION
// ==========================================
// Shared state for the live-transcription session. These are reassigned by
// startRecording / startAudioCapture / stopRecording.
let ws = null;            // active WebSocket, or null when idle
let audioContext = null;  // AudioContext used for microphone capture
let mediaStream = null;   // MediaStream returned by getUserMedia
let processor = null;     // ScriptProcessorNode forwarding audio to the socket
let isRecording = false;  // true between startRecording and stopRecording

const recordBtn = document.getElementById('recordBtn');
const liveStatus = document.getElementById('liveStatus');
const liveTranscript = document.getElementById('liveTranscript');

// The single record button toggles between starting and stopping a session.
recordBtn.addEventListener('click', async () => {
    if (!isRecording) {
        startRecording();
        return;
    }
    stopRecording();
});
/**
 * Open the live-transcription WebSocket and put the UI into recording mode.
 *
 * On open, the session options (random uid, language, task, model, VAD flag)
 * are sent as JSON. Microphone capture starts only after the server sends
 * SERVER_READY. All handlers are bound to the socket created by *this* call
 * and ignore events once the global `ws` no longer refers to it, so a late
 * close/error/message from a previous session cannot tear down a newer one.
 */
async function startRecording() {
    liveTranscript.innerHTML = '';
    liveStatus.style.display = 'inline-flex';
    liveStatus.className = 'status-badge status-offline';
    liveStatus.textContent = 'Connecting...';
    try {
        // 1. Connect WebSocket
        const socket = new WebSocket(wsUrlInput.value);
        ws = socket;
        // True while this call's socket is still the active session.
        const isCurrent = () => ws === socket;

        socket.onopen = () => {
            if (!isCurrent()) {
                // Session was stopped while still connecting; do not start it.
                socket.close();
                return;
            }
            // Send options to server
            const options = {
                uid: "web-" + Math.random().toString(36).substring(7),
                language: document.getElementById('liveLanguage').value,
                task: document.getElementById('liveTask').value,
                model: "base",
                use_vad: true
            };
            socket.send(JSON.stringify(options));
        };

        socket.onmessage = async (event) => {
            if (!isCurrent()) return;

            let data;
            try {
                data = JSON.parse(event.data);
            } catch (parseErr) {
                console.error('Ignoring non-JSON WebSocket message', parseErr);
                return;
            }

            if (data.message === "SERVER_READY") {
                let captureError = null;
                try {
                    await startAudioCapture();
                } catch (err) {
                    // e.g. microphone permission denied or no input device
                    captureError = err;
                }
                const stale = !isCurrent();
                if (captureError || stale) {
                    if (captureError) console.error(captureError);
                    // Release whatever capture state was created, unless a
                    // newer session has started and now owns that state.
                    if (!stale || !isRecording) stopRecording();
                    if (captureError && !stale) {
                        liveStatus.className = 'status-badge status-offline';
                        liveStatus.textContent = 'Microphone Error';
                    }
                    return;
                }
                liveStatus.className = 'status-badge status-recording';
                liveStatus.innerHTML = '🔴 Recording';
            } else if (data.segments) {
                renderLiveSegments(data.segments);
            } else if (data.status === "WAIT") {
                liveStatus.textContent = `Waiting in queue (Est: ${data.message} min)`;
            } else if (data.message === "DISCONNECT") {
                stopRecording();
                liveStatus.className = 'status-badge status-offline';
                liveStatus.textContent = 'Disconnected by server';
            }
        };

        socket.onerror = (err) => {
            if (!isCurrent()) return;
            console.error('WebSocket Error', err);
            stopRecording();
            liveStatus.className = 'status-badge status-offline';
            liveStatus.textContent = 'Connection Error';
        };

        socket.onclose = () => {
            if (isCurrent()) stopRecording();
        };

        // Update UI
        isRecording = true;
        recordBtn.className = 'btn btn-danger';
        document.getElementById('recordIcon').textContent = '⏹';
        document.getElementById('recordText').textContent = 'Stop Recording';
    } catch (err) {
        // Only the WebSocket constructor and DOM updates run synchronously
        // in the try body, so a failure here is a connection-setup problem
        // (for example a malformed URL), not a microphone problem.
        console.error(err);
        stopRecording();
        liveStatus.className = 'status-badge status-offline';
        liveStatus.textContent = 'Connection Error';
    }
}
/**
 * Ask for microphone access and stream the captured audio to the open
 * WebSocket.
 *
 * An AudioContext is requested at a 16000 Hz sample rate and a
 * ScriptProcessorNode (4096-frame buffer, one input and one output channel)
 * forwards the Float32 samples of channel 0 as binary messages. Stores the
 * stream, context and processor in the shared session variables.
 */
async function startAudioCapture() {
    mediaStream = await navigator.mediaDevices.getUserMedia({ audio: true });

    const AudioContextCtor = window.AudioContext || window.webkitAudioContext;
    audioContext = new AudioContextCtor({ sampleRate: 16000 });
    const micSource = audioContext.createMediaStreamSource(mediaStream);

    processor = audioContext.createScriptProcessor(4096, 1, 1);
    processor.onaudioprocess = (audioEvent) => {
        // isRecording is tested first so `ws` is never dereferenced after a stop.
        const canSend = isRecording && ws.readyState === WebSocket.OPEN;
        if (!canSend) return;
        const samples = audioEvent.inputBuffer.getChannelData(0);
        ws.send(samples.buffer);
    };

    micSource.connect(processor);
    processor.connect(audioContext.destination);
}
/**
 * Stop the live session: release the audio graph and microphone, end the
 * WebSocket session, and reset the record button.
 *
 * Safe to call repeatedly and when nothing is running, since every resource
 * is null-checked.
 */
function stopRecording() {
    isRecording = false;
    if (processor) {
        processor.disconnect();
        processor = null;
    }
    if (mediaStream) {
        mediaStream.getTracks().forEach(track => track.stop());
        mediaStream = null;
    }
    if (audioContext) {
        audioContext.close();
        audioContext = null;
    }
    if (ws) {
        // Keep a local reference: the global is cleared immediately, but the
        // delayed close below still needs the socket object.
        const socket = ws;
        ws = null;
        if (socket.readyState === WebSocket.OPEN) {
            socket.send("END_OF_AUDIO");
            // Close one second after the end-of-audio marker is sent.
            setTimeout(() => socket.close(), 1000);
        } else if (socket.readyState === WebSocket.CONNECTING) {
            // Never opened: abort the attempt so it does not linger.
            socket.close();
        }
    }
    recordBtn.className = 'btn btn-success';
    document.getElementById('recordIcon').textContent = '🎤';
    document.getElementById('recordText').textContent = 'Start Recording';
    // Only replace the badge when it still shows the recording state, so a
    // more specific message set by the caller is not overwritten.
    if (liveStatus.textContent === '🔴 Recording') {
        liveStatus.className = 'status-badge status-offline';
        liveStatus.textContent = 'Stopped';
    }
}
let liveSegments = [];

/**
 * Replace the live transcript with the given segments and scroll to the end.
 *
 * Segment text arrives from the server, so it is inserted with textContent
 * rather than innerHTML: any markup in the text is shown literally instead of
 * being parsed as HTML.
 *
 * @param {Array<{start?: *, end?: *, text: string}>} segments - segments to
 *     display; the time line is omitted when start or end is undefined.
 */
function renderLiveSegments(segments) {
    const fragment = document.createDocumentFragment();
    segments.forEach(seg => {
        const row = document.createElement('div');
        row.className = 'segment';

        if (seg.start !== undefined && seg.end !== undefined) {
            const time = document.createElement('div');
            time.className = 'segment-time';
            time.textContent = `${formatTime(seg.start)} - ${formatTime(seg.end)}`;
            row.appendChild(time);
        }

        const text = document.createElement('div');
        text.className = 'segment-text';
        text.textContent = seg.text;
        row.appendChild(text);

        fragment.appendChild(row);
    });
    liveTranscript.replaceChildren(fragment);
    liveTranscript.scrollTop = liveTranscript.scrollHeight;
}
</script>
</body>
</html>

159
test_http_endpoints.py Normal file
View File

@ -0,0 +1,159 @@
#!/usr/bin/env python3
"""
Test script for WhisperLive HTTP endpoints
This script demonstrates how to use the new HTTP API for file transcription
"""
import requests
import json
import os
from pathlib import Path
# Configuration
HTTP_BASE_URL = "http://localhost:8080" # Adjust if using different port
WEBSOCKET_PORT = 5050 # Your existing WebSocket port
def test_health_endpoint():
    """Check that ``GET {HTTP_BASE_URL}/health`` answers with HTTP 200.

    Prints the status code and the JSON body.

    Returns:
        bool: True when the status code is 200; False for any other status
        or when the request or JSON decoding raises.
    """
    print("Testing health endpoint...")
    try:
        # Bound the wait explicitly so an unreachable or wedged server makes
        # the check fail instead of blocking the script.
        response = requests.get(f"{HTTP_BASE_URL}/health", timeout=10)
        print(f"Status: {response.status_code}")
        print(f"Response: {response.json()}")
        return response.status_code == 200
    except Exception as e:
        print(f"Error: {e}")
        return False
def test_file_transcription(audio_file_path, language=None, task="transcribe", model="base"):
"""Test file transcription endpoint"""
print(f"\nTesting file transcription endpoint...")
print(f"File: {audio_file_path}")
print(f"Language: {language or 'auto-detect'}")
print(f"Task: {task}")
print(f"Model: {model}")
if not os.path.exists(audio_file_path):
print(f"Error: File {audio_file_path} not found")
return False
try:
# Prepare the request
files = {'file': open(audio_file_path, 'rb')}
data = {
'language': language,
'task': task,
'model': model
}
# Make the request
response = requests.post(f"{HTTP_BASE_URL}/transcribe", files=files, data=data)
print(f"Status: {response.status_code}")
if response.status_code == 200:
result = response.json()
print("Transcription successful!")
print(f"Filename: {result.get('filename')}")
print(f"Language: {result['info'].get('language')}")
print(f"Duration: {result['info'].get('duration')} seconds")
print(f"Number of segments: {len(result['segments'])}")
# Print first few segments
for i, segment in enumerate(result['segments'][:3]):
print(f"Segment {i+1}: [{segment['start']:.2f}s - {segment['end']:.2f}s] {segment['text']}")
if len(result['segments']) > 3:
print(f"... and {len(result['segments']) - 3} more segments")
return True
else:
print(f"Error: {response.text}")
return False
except Exception as e:
print(f"Error: {e}")
return False
def test_url_transcription():
    """Exercise ``POST {HTTP_BASE_URL}/transcribe/url`` with a placeholder URL.

    Sends a fixed JSON payload, prints the status code and JSON body, and
    reports whether the server answered 200. Any exception is printed and
    reported as failure.

    Returns:
        bool: True only for an HTTP 200 response.
    """
    print(f"\nTesting URL transcription endpoint...")
    payload = {
        'url': 'https://example.com/audio.mp3',
        'language': 'en',
        'task': 'transcribe',
        'model': 'base'
    }
    try:
        reply = requests.post(f"{HTTP_BASE_URL}/transcribe/url", json=payload)
        print(f"Status: {reply.status_code}")
        print(f"Response: {reply.json()}")
        succeeded = reply.status_code == 200
    except Exception as e:
        print(f"Error: {e}")
        succeeded = False
    return succeeded
def test_openai_endpoint(audio_file_path):
"""Test the OpenAI compatible endpoint"""
print(f"\nTesting OpenAI compatible endpoint...")
print(f"File: {audio_file_path}")
if not os.path.exists(audio_file_path):
print(f"Error: File {audio_file_path} not found")
return False
try:
files = {'file': open(audio_file_path, 'rb')}
data = {
'model': 'whisper-1',
'response_format': 'json'
}
response = requests.post(f"{HTTP_BASE_URL}/v1/audio/transcriptions", files=files, data=data)
print(f"Status: {response.status_code}")
if response.status_code == 200:
result = response.json()
print("OpenAI endpoint successful!")
print(f"Response: {result}")
return True
else:
print(f"Error: {response.text}")
return False
except Exception as e:
print(f"Error: {e}")
return False
def main():
    """Run every endpoint check in sequence.

    Stops right after the health check when it fails. Otherwise runs the two
    file-upload checks if the bundled sample audio exists (printing a hint
    when it does not), then the URL check, and prints a closing banner.
    """
    banner = "=" * 40
    print("WhisperLive HTTP Endpoints Test")
    print(banner)

    # Nothing else can succeed without a reachable server.
    server_up = test_health_endpoint()
    if not server_up:
        print("Health check failed. Make sure the server is running.")
        return

    # Sample clip used for the upload checks; adjust the path as needed.
    sample_audio = "assets/jfk.flac"
    if not os.path.exists(sample_audio):
        print(f"\nSample audio file not found at {sample_audio}")
        print("You can test with any audio file by calling:")
        print("test_file_transcription('path/to/your/audio.wav')")
    else:
        test_file_transcription(sample_audio, language="en", task="transcribe", model="base")
        test_openai_endpoint(sample_audio)

    # The URL check needs no local file, so it always runs.
    test_url_transcription()

    print("\n" + banner)
    print("Test completed!")


if __name__ == "__main__":
    main()