First commit of files

K Car 2025-06-07 13:06:08 +01:00
commit 05648af633
61 changed files with 10357 additions and 0 deletions


@@ -0,0 +1,51 @@
FROM python:3.10-bookworm
ARG DEBIAN_FRONTEND=noninteractive
# Create log directories with proper permissions
RUN mkdir -p /app/logs && \
touch /app/logs/whisperlive.log && \
touch /app/logs/connections.log && \
chmod 666 /app/logs/whisperlive.log && \
chmod 666 /app/logs/connections.log
# install lib required for pyaudio
RUN apt update && apt install -y portaudio19-dev && apt-get clean && rm -rf /var/lib/apt/lists/*
# update pip to add support for whl.metadata -> less downloading
RUN pip install --no-cache-dir -U "pip>=24"
# create a working directory
WORKDIR /app
# install the requirements for running the whisper-live server
COPY requirements/server.txt /app/
RUN pip install -r server.txt && rm server.txt
COPY whisper_live /app/whisper_live
COPY run_server.py /app
# Port options (declare the args before EXPOSE so the variables are defined when referenced)
ARG PORT_WHISPERLIVE
ENV PORT_WHISPERLIVE=${PORT_WHISPERLIVE}
ARG PORT_WHISPERLIVE_SSL
ENV PORT_WHISPERLIVE_SSL=${PORT_WHISPERLIVE_SSL}
EXPOSE ${PORT_WHISPERLIVE}
EXPOSE ${PORT_WHISPERLIVE_SSL}
# SSL options
ARG WHISPERLIVE_SSL
ENV WHISPERLIVE_SSL=${WHISPERLIVE_SSL}
# Model options
ARG WHISPL_USE_CUSTOM_MODEL
ENV WHISPL_USE_CUSTOM_MODEL=${WHISPL_USE_CUSTOM_MODEL}
ARG FASTERWHISPER_MODEL
ENV FASTERWHISPER_MODEL=${FASTERWHISPER_MODEL}
CMD ["sh", "-c", "\
if [ \"$WHISPERLIVE_SSL\" = \"true\" ]; then \
python3 -u run_server.py --port $PORT_WHISPERLIVE_SSL --backend faster_whisper --faster_whisper_custom_model_path /app/models/$FASTERWHISPER_MODEL --ssl_cert_path /app/ssl; \
else \
python3 -u run_server.py --port $PORT_WHISPERLIVE --backend faster_whisper --faster_whisper_custom_model_path /app/models/$FASTERWHISPER_MODEL --no_single_model; \
fi"]


@@ -0,0 +1,45 @@
FROM python:3.10-bookworm
ARG DEBIAN_FRONTEND=noninteractive
# Create log directories with proper permissions
RUN mkdir -p /app/logs && \
touch /app/logs/whisperlive.log && \
touch /app/logs/connections.log && \
chmod 666 /app/logs/whisperlive.log && \
chmod 666 /app/logs/connections.log
# install lib required for pyaudio
RUN apt update && apt install -y portaudio19-dev && apt-get clean && rm -rf /var/lib/apt/lists/*
# update pip to add support for whl.metadata -> less downloading
RUN pip install --no-cache-dir -U "pip>=24"
# create a working directory
WORKDIR /app
# install the requirements for running the whisper-live server
COPY requirements/server.txt /app/
RUN pip install -r server.txt && rm server.txt
COPY whisper_live /app/whisper_live
COPY run_server.py /app
# Port options (declare the args before EXPOSE so the variables are defined when referenced)
ARG PORT_WHISPERLIVE
ENV PORT_WHISPERLIVE=${PORT_WHISPERLIVE}
ARG PORT_WHISPERLIVE_SSL
ENV PORT_WHISPERLIVE_SSL=${PORT_WHISPERLIVE_SSL}
EXPOSE ${PORT_WHISPERLIVE}
EXPOSE ${PORT_WHISPERLIVE_SSL}
ARG FASTERWHISPER_MODEL
ENV FASTERWHISPER_MODEL=${FASTERWHISPER_MODEL}
ARG WHISPERLIVE_SSL
ENV WHISPERLIVE_SSL=${WHISPERLIVE_SSL}
CMD ["sh", "-c", "\
if [ \"$WHISPERLIVE_SSL\" = \"true\" ]; then \
python3 -u run_server.py --port $PORT_WHISPERLIVE_SSL --backend faster_whisper --faster_whisper_custom_model_path /app/models/$FASTERWHISPER_MODEL --ssl_cert_path /app/ssl; \
else \
python3 -u run_server.py --port $PORT_WHISPERLIVE --backend faster_whisper --faster_whisper_custom_model_path /app/models/$FASTERWHISPER_MODEL; \
fi"]


@@ -0,0 +1,49 @@
FROM python:3.10-bookworm
ARG DEBIAN_FRONTEND=noninteractive
# Create log directories with proper permissions
RUN mkdir -p /app/logs && \
touch /app/logs/whisperlive.log && \
touch /app/logs/connections.log && \
chmod 666 /app/logs/whisperlive.log && \
chmod 666 /app/logs/connections.log
# install lib required for pyaudio
RUN apt update && apt install -y portaudio19-dev && apt-get clean && rm -rf /var/lib/apt/lists/*
# update pip to add support for whl.metadata -> less downloading
RUN pip install --no-cache-dir -U "pip>=24"
# create a working directory
WORKDIR /app
# install the requirements for running the whisper-live server
COPY requirements/server.txt /app/
RUN pip install -r server.txt && rm server.txt
# make the paths of the nvidia libs installed as wheels visible. equivalent to:
# export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
ENV LD_LIBRARY_PATH="/usr/local/lib/python3.10/site-packages/nvidia/cublas/lib:/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib"
COPY whisper_live /app/whisper_live
COPY run_server.py /app
# Port options (declare the args before EXPOSE so the variables are defined when referenced)
ARG PORT_WHISPERLIVE
ENV PORT_WHISPERLIVE=${PORT_WHISPERLIVE}
ARG PORT_WHISPERLIVE_SSL
ENV PORT_WHISPERLIVE_SSL=${PORT_WHISPERLIVE_SSL}
EXPOSE ${PORT_WHISPERLIVE}
EXPOSE ${PORT_WHISPERLIVE_SSL}
ARG FASTERWHISPER_MODEL
ENV FASTERWHISPER_MODEL=${FASTERWHISPER_MODEL}
ARG WHISPERLIVE_SSL
ENV WHISPERLIVE_SSL=${WHISPERLIVE_SSL}
CMD ["sh", "-c", "\
if [ \"$WHISPERLIVE_SSL\" = \"true\" ]; then \
python3 -u run_server.py --port $PORT_WHISPERLIVE_SSL --backend faster_whisper --faster_whisper_custom_model_path /app/models/$FASTERWHISPER_MODEL --ssl_cert_path /app/ssl; \
else \
python3 -u run_server.py --port $PORT_WHISPERLIVE --backend faster_whisper --faster_whisper_custom_model_path /app/models/$FASTERWHISPER_MODEL; \
fi"]

1191
.archive/docker-compose.yml Normal file

File diff suppressed because it is too large.

11
.env Normal file

@@ -0,0 +1,11 @@
# Whisper live settings
APP_WS_PROTOCOL=wss
APP_URL=kevlarai.com
PORT_WHISPERLIVE=5050
PORT_WHISPERLIVE_SSL=5053
WHISPERLIVE_SSL=false
WHISPL_USE_CUSTOM_MODEL=false
FASTERWHISPER_MODEL=faster-whisper-large-v3
WHISPERLIVE_URL=${APP_WS_PROTOCOL}://whisperlive.${APP_URL}

42
Dockerfile Normal file

@@ -0,0 +1,42 @@
FROM python:3.10-bookworm
ARG DEBIAN_FRONTEND=noninteractive
# Create log directories with proper permissions
RUN mkdir -p /app/logs && \
touch /app/logs/whisperlive.log && \
touch /app/logs/connections.log && \
chmod 666 /app/logs/whisperlive.log && \
chmod 666 /app/logs/connections.log
# install lib required for pyaudio
RUN apt update && apt install -y portaudio19-dev && apt-get clean && rm -rf /var/lib/apt/lists/*
# update pip to add support for whl.metadata -> less downloading
RUN pip install --no-cache-dir -U "pip>=24"
# create a working directory
WORKDIR /app
# install the requirements for running the whisper-live server
COPY requirements/server.txt /app/
RUN pip install -r server.txt && rm server.txt
# make the paths of the nvidia libs installed as wheels visible
ENV LD_LIBRARY_PATH="/usr/local/lib/python3.10/site-packages/nvidia/cublas/lib:/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib"
COPY whisper_live /app/whisper_live
COPY run_server.py /app
# Port options (declare the arg before EXPOSE so the variable is defined when referenced)
ARG PORT_WHISPERLIVE
ENV PORT_WHISPERLIVE=${PORT_WHISPERLIVE}
EXPOSE ${PORT_WHISPERLIVE}
ARG FASTERWHISPER_MODEL
ENV FASTERWHISPER_MODEL=${FASTERWHISPER_MODEL}
CMD ["python3", "-u", "run_server.py", "--port", "${PORT_WHISPERLIVE}", "--backend", "faster_whisper"]
# CMD ["python3", "-u", "run_server.py", "--port", "${PORT_WHISPERLIVE}", "--backend", "faster_whisper", "--faster_whisper_custom_model_path", "/app/models/${FASTERWHISPER_MODEL}", "--ssl_cert_path", "/app/ssl"]
# CMD ["python3", "-u", "run_server.py", "--port", "${PORT_WHISPERLIVE_SSL}", "--backend", "faster_whisper", "--faster_whisper_custom_model_path", "/app/models/${FASTERWHISPER_MODEL}", "--ssl_cert_path", "/app/ssl"]

21
LICENSE Normal file

@@ -0,0 +1,21 @@
MIT License
Copyright (c) 2023 Vineet Suryan, Collabora Ltd.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

219
README.md Normal file

@@ -0,0 +1,219 @@
# WhisperLive
<h2 align="center">
<a href="https://www.youtube.com/watch?v=0PHWCApIcCI"><img
src="https://img.youtube.com/vi/0PHWCApIcCI/0.jpg" style="background-color:rgba(0,0,0,0);" height=300 alt="WhisperLive"></a>
<a href="https://www.youtube.com/watch?v=0f5oiG4oPWQ"><img
src="https://img.youtube.com/vi/0f5oiG4oPWQ/0.jpg" style="background-color:rgba(0,0,0,0);" height=300 alt="WhisperLive"></a>
<br><br>A nearly-live implementation of OpenAI's Whisper.
<br><br>
</h2>
This project is a real-time transcription application that uses the OpenAI Whisper model
to convert speech input into text output. It can be used to transcribe both live audio
input from a microphone and pre-recorded audio files.
- [Installation](#installation)
- [Getting Started](#getting-started)
- [Running the Server](#running-the-server)
- [Running the Client](#running-the-client)
- [Browser Extensions](#browser-extensions)
- [Whisper Live Server in Docker](#whisper-live-server-in-docker)
- [Future Work](#future-work)
- [Blog Posts](#blog-posts)
- [Contact](#contact)
- [Citations](#citations)
## Installation
- Install PyAudio
```bash
bash scripts/setup.sh
```
- Install whisper-live from pip
```bash
pip install whisper-live
```
### Setting up NVIDIA/TensorRT-LLM for TensorRT backend
- Please follow the [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) for setting up [NVIDIA/TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM) and for building the Whisper-TensorRT engine.
## Getting Started
The server supports three backends: `faster_whisper`, `tensorrt`, and `openvino`. If running the `tensorrt` backend, follow the [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md)
### Running the Server
- [Faster Whisper](https://github.com/SYSTRAN/faster-whisper) backend
```bash
python3 run_server.py --port 9090 \
--backend faster_whisper
# running with custom model and cache_dir to save auto-converted ctranslate2 models
python3 run_server.py --port 9090 \
--backend faster_whisper \
-fw "/path/to/custom/faster/whisper/model"
-c ~/.cache/whisper-live/
```
- TensorRT backend. Currently, we recommend using only the Docker setup for TensorRT. Follow the [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md), which works as expected. Make sure to build your TensorRT engines before running the server with the TensorRT backend.
```bash
# Run English only model
python3 run_server.py -p 9090 \
-b tensorrt \
-trt /home/TensorRT-LLM/examples/whisper/whisper_small_en
# Run Multilingual model
python3 run_server.py -p 9090 \
-b tensorrt \
-trt /home/TensorRT-LLM/examples/whisper/whisper_small \
-m
```
- WhisperLive now supports the [OpenVINO](https://github.com/openvinotoolkit/openvino) backend for efficient inference on Intel CPUs, iGPUs and dGPUs. So far, we have tested the models uploaded to [Hugging Face by OpenVINO](https://huggingface.co/OpenVINO?search_models=whisper).
- > **Docker Recommended:** Running WhisperLive with OpenVINO inside Docker automatically enables GPU support (iGPU/dGPU) without requiring additional host setup.
- > **Native (non-Docker) Use:** If you prefer running outside Docker, ensure the Intel drivers and OpenVINO runtime are installed and properly configured on your system. Refer to the documentation for [installing OpenVINO](https://docs.openvino.ai/2025/get-started/install-openvino.html?PACKAGE=OPENVINO_BASE&VERSION=v_2025_0_0&OP_SYSTEM=LINUX&DISTRIBUTION=PIP#).
```
python3 run_server.py -p 9090 -b openvino
```
#### Controlling OpenMP Threads
To control the number of threads used by OpenMP, you can set the `OMP_NUM_THREADS` environment variable. This is useful for managing CPU resources and ensuring consistent performance. If not specified, `OMP_NUM_THREADS` is set to `1` by default. You can change this by using the `--omp_num_threads` argument:
```bash
python3 run_server.py --port 9090 \
--backend faster_whisper \
--omp_num_threads 4
```
#### Single model mode
By default, when running the server without specifying a model, the server will instantiate a new whisper model for every client connection. This has the advantage that the server can use different model sizes, based on the client's requested model size. On the other hand, it also means you have to wait for the model to be loaded upon client connection and you will have increased (V)RAM usage.
When serving a custom TensorRT model using the `-trt` or a custom faster_whisper model using the `-fw` option, the server will instead only instantiate the custom model once and then reuse it for all client connections.
If you don't want this, set `--no_single_model`.
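For example, the two modes side by side (the model path is a placeholder; a sketch based on the flags described above):
```bash
# Single-model mode: the custom model is loaded once and shared by all clients
python3 run_server.py --port 9090 --backend faster_whisper \
    -fw "/path/to/custom/faster/whisper/model"

# Per-client models instead, even with a custom model specified
python3 run_server.py --port 9090 --backend faster_whisper \
    -fw "/path/to/custom/faster/whisper/model" --no_single_model
```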
### Running the Client
- Initialize the client with the parameters below:
- `lang`: Language of the input audio, applicable only if using a multilingual model.
- `translate`: If set to `True` then translate from any language to `en`.
- `model`: Whisper model size.
- `use_vad`: Whether to use `Voice Activity Detection` on the server.
- `save_output_recording`: Set to True to save the microphone input as a `.wav` file during live transcription. This option is helpful for recording sessions for later playback or analysis. Defaults to `False`.
- `output_recording_filename`: Specifies the `.wav` file path where the microphone input will be saved if `save_output_recording` is set to `True`.
- `max_clients`: Specifies the maximum number of clients the server should allow. Defaults to 4.
- `max_connection_time`: Maximum connection time for each client in seconds. Defaults to 600.
- `mute_audio_playback`: Whether to mute audio playback when transcribing an audio file. Defaults to False.
```python
from whisper_live.client import TranscriptionClient
client = TranscriptionClient(
"localhost",
9090,
lang="en",
translate=False,
model="small", # also support hf_model => `Systran/faster-whisper-small`
use_vad=False,
save_output_recording=True, # Only used for microphone input, False by Default
output_recording_filename="./output_recording.wav", # Only used for microphone input
max_clients=4,
max_connection_time=600,
mute_audio_playback=False, # Only used for file input, False by Default
)
```
This connects to the server running on localhost at port 9090. When using a multilingual model, the language of the transcription will be detected automatically. You can also use the language option to specify the target language for the transcription, in this case, English ("en"). The translate option should be set to `True` if we want to translate from the source language to English and `False` if we want to transcribe in the source language.
- Transcribe an audio file:
```python
client("tests/jfk.wav")
```
- To transcribe from microphone:
```python
client()
```
- To transcribe from an RTSP stream:
```python
client(rtsp_url="rtsp://admin:admin@192.168.0.1/rtsp")
```
- To transcribe from an HLS stream:
```python
client(hls_url="http://as-hls-ww-live.akamaized.net/pool_904/live/ww/bbc_1xtra/bbc_1xtra.isml/bbc_1xtra-audio%3d96000.norewind.m3u8")
```
## Browser Extensions
- Run the server with your desired backend as shown [here](https://github.com/collabora/WhisperLive?tab=readme-ov-file#running-the-server).
- Transcribe audio directly from your browser using our Chrome or Firefox extensions. Refer to [Audio-Transcription-Chrome](https://github.com/collabora/whisper-live/tree/main/Audio-Transcription-Chrome#readme) and https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md
## Whisper Live Server in Docker
- GPU
- Faster-Whisper
```bash
docker run -it --gpus all -p 9090:9090 ghcr.io/collabora/whisperlive-gpu:latest
```
- TensorRT. Refer to [TensorRT_whisper readme](https://github.com/collabora/WhisperLive/blob/main/TensorRT_whisper.md) for setup and more tensorrt backend configurations.
```bash
docker build . -f docker/Dockerfile.tensorrt -t whisperlive-tensorrt
docker run -p 9090:9090 --runtime=nvidia --entrypoint /bin/bash -it whisperlive-tensorrt
# Build small.en engine
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en # float16
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int8 # int8 weight only quantization
bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en int4 # int4 weight only quantization
# Run server with small.en; pass a single --trt_model_path matching the engine built above
python3 run_server.py --port 9090 \
    --backend tensorrt \
    --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_float16"
# for the quantized engines, use instead:
#   --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int8"
#   --trt_model_path "/app/TensorRT-LLM-examples/whisper/whisper_small_en_int4"
```
- OpenVINO
```
docker run -it --device=/dev/dri -p 9090:9090 ghcr.io/collabora/whisperlive-openvino
```
- CPU
- Faster-whisper
```bash
docker run -it -p 9090:9090 ghcr.io/collabora/whisperlive-cpu:latest
```
## Future Work
- [ ] Add translation to other languages on top of transcription.
## Blog Posts
- [Transforming speech technology with WhisperLive](https://www.collabora.com/news-and-blog/blog/2024/05/28/transforming-speech-technology-with-whisperlive/)
- [WhisperFusion: Ultra-low latency conversations with an AI chatbot](https://www.collabora.com/news-and-blog/news-and-events/whisperfusion-ultra-low-latency-conversations-with-an-ai-chatbot.html) powered by WhisperLive
- [Breaking language barriers 2.0: Moving closer towards fully reliable, production-ready Hindi ASR](https://www.collabora.com/news-and-blog/news-and-events/breaking-language-barriers-20-moving-closer-production-ready-hindi-asr.html), which is used in WhisperLive for Hindi.
## Contact
We are available to help you with both Open Source and proprietary AI projects. You can reach us via the Collabora website or [vineet.suryan@collabora.com](mailto:vineet.suryan@collabora.com) and [marcus.edel@collabora.com](mailto:marcus.edel@collabora.com).
## Citations
```bibtex
@article{Whisper,
title = {Robust Speech Recognition via Large-Scale Weak Supervision},
url = {https://arxiv.org/abs/2212.04356},
author = {Radford, Alec and Kim, Jong Wook and Xu, Tao and Brockman, Greg and McLeavey, Christine and Sutskever, Ilya},
publisher = {arXiv},
year = {2022},
}
```
```bibtex
@misc{SileroVAD,
author = {Silero Team},
title = {Silero VAD: pre-trained enterprise-grade Voice Activity Detector (VAD), Number Detector and Language Classifier},
year = {2021},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/snakers4/silero-vad}},
email = {hello@silero.ai}
}
```

0
__init__.py Normal file

BIN
assets/jfk.flac Normal file

Binary file not shown.

16
check_cudnn.py Normal file

@@ -0,0 +1,16 @@
import tensorflow as tf

# Was TF compiled with CUDA support?
if tf.test.is_built_with_cuda():
    print("TF is built with CUDA")
else:
    print("TF is not built with CUDA")

# Is a GPU actually visible at runtime? (tf.test.is_gpu_available is deprecated)
if tf.config.list_physical_devices("GPU"):
    print("CUDA is available in TF")
else:
    print("CUDA is not available in TF")

# tf.test has no is_built_with_cudnn(); the build info exposes the cuDNN version instead
if tf.sysconfig.get_build_info().get("cudnn_version"):
    print("cuDNN is available")
else:
    print("cuDNN is not available")

36
docker-compose.yaml Normal file

@@ -0,0 +1,36 @@
version: '3.8'
services:
whisperlive:
container_name: whisperlive
build:
context: .
dockerfile: Dockerfile
args:
PORT_WHISPERLIVE: ${PORT_WHISPERLIVE}
FASTERWHISPER_MODEL: ${FASTERWHISPER_MODEL}
env_file:
- .env
environment:
LOG_PATH: /app/logs
NVIDIA_VISIBLE_DEVICES: all
NVIDIA_DRIVER_CAPABILITIES: compute,utility
volumes:
- ./models:/app/models
- ./ssl:/app/ssl
- ./logs:/app/logs
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
ports:
- ${PORT_WHISPERLIVE}:${PORT_WHISPERLIVE}
networks:
- audio-network
networks:
audio-network:
driver: bridge
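Assuming the `.env` shown above sits next to this compose file and a converted model directory exists at `./models/faster-whisper-large-v3`, bringing the service up would look roughly like this (not part of the commit):

```bash
# Build the image with the args interpolated from .env and start the service
docker compose up -d --build whisperlive

# Tail the container output; file logs, if the server writes them, land in ./logs via the volume mount
docker compose logs -f whisperlive
```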

35
docker/Dockerfile Normal file

@@ -0,0 +1,35 @@
FROM python:3.10-bookworm
ARG DEBIAN_FRONTEND=noninteractive
# install lib required for pyaudio
RUN apt update && apt install -y portaudio19-dev && apt-get clean && rm -rf /var/lib/apt/lists/*
# update pip to add support for whl.metadata -> less downloading
RUN pip install --no-cache-dir -U "pip>=24"
# create a working directory
RUN mkdir /app
WORKDIR /app
# install the requirements for running the whisper-live server
COPY requirements/server.txt /app/
RUN pip install --no-cache-dir -r server.txt && rm server.txt
# make the paths of the nvidia libs installed as wheels visible. equivalent to:
# export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
ENV LD_LIBRARY_PATH="/usr/local/lib/python3.10/site-packages/nvidia/cublas/lib:/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib"
COPY whisper_live /app/whisper_live
COPY models /app/models
COPY run_server.py /app
# declare the port arg before EXPOSE so the variable is defined when referenced
ARG WHISPERLIVE_PORT
ENV WHISPERLIVE_PORT=${WHISPERLIVE_PORT}
EXPOSE ${WHISPERLIVE_PORT}
ARG FASTERWHISPER_MODEL
ENV FASTERWHISPER_MODEL=${FASTERWHISPER_MODEL}
CMD python3 run_server.py --port $WHISPERLIVE_PORT --backend faster_whisper --faster_whisper_custom_model_path /app/models/$FASTERWHISPER_MODEL

25
docker/Dockerfile.cpu Normal file

@@ -0,0 +1,25 @@
FROM python:3.10-bookworm
ARG DEBIAN_FRONTEND=noninteractive
# install lib required for pyaudio
RUN apt update && apt install -y portaudio19-dev && apt-get clean && rm -rf /var/lib/apt/lists/*
# update pip to add support for whl.metadata -> less downloading
RUN pip install --no-cache-dir -U "pip>=24"
# create a working directory
RUN mkdir /app
WORKDIR /app
# install pytorch, but without the nvidia-libs that are only necessary for gpu
RUN pip install --no-cache-dir torch --index-url https://download.pytorch.org/whl/cpu
# install the requirements for running the whisper-live server
COPY requirements/server.txt /app/
RUN pip install --no-cache-dir -r server.txt && rm server.txt
COPY whisper_live /app/whisper_live
COPY run_server.py /app
CMD ["python", "run_server.py"]

26
docker/Dockerfile.gpu Normal file

@@ -0,0 +1,26 @@
FROM python:3.10-bookworm
ARG DEBIAN_FRONTEND=noninteractive
# install lib required for pyaudio
RUN apt update && apt install -y portaudio19-dev && apt-get clean && rm -rf /var/lib/apt/lists/*
# update pip to add support for whl.metadata -> less downloading
RUN pip install --no-cache-dir -U "pip>=24"
# create a working directory
RUN mkdir /app
WORKDIR /app
# install the requirements for running the whisper-live server
COPY requirements/server.txt /app/
RUN pip install --no-cache-dir -r server.txt && rm server.txt
# make the paths of the nvidia libs installed as wheels visible. equivalent to:
# export LD_LIBRARY_PATH=`python3 -c 'import os; import nvidia.cublas.lib; import nvidia.cudnn.lib; print(os.path.dirname(nvidia.cublas.lib.__file__) + ":" + os.path.dirname(nvidia.cudnn.lib.__file__))'`
ENV LD_LIBRARY_PATH="/usr/local/lib/python3.10/site-packages/nvidia/cublas/lib:/usr/local/lib/python3.10/site-packages/nvidia/cudnn/lib"
COPY whisper_live /app/whisper_live
COPY run_server.py /app
CMD ["python", "run_server.py"]


@@ -0,0 +1,37 @@
FROM nvidia/cuda:12.2.2-cudnn8-runtime-ubuntu22.04
ARG DEBIAN_FRONTEND=noninteractive
# Remove any third-party apt sources to avoid issues with expiring keys.
RUN rm -f /etc/apt/sources.list.d/*.list
# Install some basic utilities.
RUN apt-get update && apt-get install -y \
python3.10 python3-pip openmpi-bin libopenmpi-dev git wget \
&& rm -rf /var/lib/apt/lists/*
RUN pip3 install --no-cache-dir -U tensorrt_llm==0.9.0 --extra-index-url https://pypi.nvidia.com
WORKDIR /app
RUN git clone -b v0.9.0 --depth 1 https://github.com/NVIDIA/TensorRT-LLM.git && \
mv TensorRT-LLM/examples ./TensorRT-LLM-examples && \
rm -rf TensorRT-LLM
COPY assets/ ./assets
RUN wget -nc -P assets/ https://raw.githubusercontent.com/openai/whisper/main/whisper/assets/mel_filters.npz
COPY scripts/setup.sh ./
RUN apt update && bash setup.sh && rm setup.sh
COPY requirements/server.txt .
RUN pip install --no-cache-dir -r server.txt && rm server.txt
COPY whisper_live ./whisper_live
COPY scripts/build_whisper_tensorrt.sh .
COPY run_server.py .
# Build the TensorRT engine
RUN bash build_whisper_tensorrt.sh /app/TensorRT-LLM-examples small.en
# Set the command to run the server
CMD ["python3", "run_server.py", "--port", "9090", "--backend", "tensorrt", "--trt_model_path", "/app/TensorRT-LLM-examples/whisper/whisper_small_en"]


@@ -0,0 +1,28 @@
services:
whisperlive-server:
runtime: nvidia
build:
context: ./backend/whisperlive/server
dockerfile: Dockerfile.tensorrt # Override to use Dockerfile.tensorrt
args:
WHISPERLIVE_PORT: ${WHISPERLIVE_PORT}
env_file:
- ./.env
environment:
WHISPERLIVE_PORT: ${WHISPERLIVE_PORT}
NVIDIA_VISIBLE_DEVICES: all
NVIDIA_DRIVER_CAPABILITIES: compute,utility
volumes:
- data_volume:/data
deploy:
resources:
reservations:
devices:
- driver: nvidia
count: 1
capabilities: [gpu]
ports:
- ${WHISPERLIVE_PORT}:${WHISPERLIVE_PORT}
networks:
- app-network

0
docs/.nojekyll Normal file

Binary file not shown.

BIN
docs/doctrees/index.doctree Normal file

Binary file not shown.

4
docs/html/.buildinfo Normal file

@@ -0,0 +1,4 @@
# Sphinx build info version 1
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
config: 7b818b47e6f359b937e5a2517f120d43
tags: 645f666f9bcd5a90fca523b33c5a78b7


@@ -0,0 +1,26 @@
.. whisper_live documentation master file, created by
sphinx-quickstart on Fri Sep 22 11:39:30 2023.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to Whisper Live documentation!
========================================
.. toctree::
:maxdepth: 2
.. automodule:: whisper_live.server
:members:
.. automodule:: whisper_live.client
:members:
Indices and tables
==================
* :ref:`genindex`
* :ref:`modindex`
* :ref:`search`
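The prebuilt `docs/html` and `docs/doctrees` directories elsewhere in this commit are the usual Sphinx outputs for a source like this; regenerating them would look roughly as follows (the source directory is an assumption, as it is not shown in this diff):

```bash
# Rebuild the HTML docs; <docs-source-dir> is wherever conf.py and this index.rst live
sphinx-build -b html -d docs/doctrees <docs-source-dir> docs/html
```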


@@ -0,0 +1,703 @@
@import url("basic.css");
/* -- page layout ----------------------------------------------------------- */
body {
font-family: Georgia, serif;
font-size: 17px;
background-color: #fff;
color: #000;
margin: 0;
padding: 0;
}
div.document {
width: 940px;
margin: 30px auto 0 auto;
}
div.documentwrapper {
float: left;
width: 100%;
}
div.bodywrapper {
margin: 0 0 0 220px;
}
div.sphinxsidebar {
width: 220px;
font-size: 14px;
line-height: 1.5;
}
hr {
border: 1px solid #B1B4B6;
}
div.body {
background-color: #fff;
color: #3E4349;
padding: 0 30px 0 30px;
}
div.body > .section {
text-align: left;
}
div.footer {
width: 940px;
margin: 20px auto 30px auto;
font-size: 14px;
color: #888;
text-align: right;
}
div.footer a {
color: #888;
}
p.caption {
font-family: inherit;
font-size: inherit;
}
div.relations {
display: none;
}
div.sphinxsidebar a {
color: #444;
text-decoration: none;
border-bottom: 1px dotted #999;
}
div.sphinxsidebar a:hover {
border-bottom: 1px solid #999;
}
div.sphinxsidebarwrapper {
padding: 18px 10px;
}
div.sphinxsidebarwrapper p.logo {
padding: 0;
margin: -10px 0 0 0px;
text-align: center;
}
div.sphinxsidebarwrapper h1.logo {
margin-top: -10px;
text-align: center;
margin-bottom: 5px;
text-align: left;
}
div.sphinxsidebarwrapper h1.logo-name {
margin-top: 0px;
}
div.sphinxsidebarwrapper p.blurb {
margin-top: 0;
font-style: normal;
}
div.sphinxsidebar h3,
div.sphinxsidebar h4 {
font-family: Georgia, serif;
color: #444;
font-size: 24px;
font-weight: normal;
margin: 0 0 5px 0;
padding: 0;
}
div.sphinxsidebar h4 {
font-size: 20px;
}
div.sphinxsidebar h3 a {
color: #444;
}
div.sphinxsidebar p.logo a,
div.sphinxsidebar h3 a,
div.sphinxsidebar p.logo a:hover,
div.sphinxsidebar h3 a:hover {
border: none;
}
div.sphinxsidebar p {
color: #555;
margin: 10px 0;
}
div.sphinxsidebar ul {
margin: 10px 0;
padding: 0;
color: #000;
}
div.sphinxsidebar ul li.toctree-l1 > a {
font-size: 120%;
}
div.sphinxsidebar ul li.toctree-l2 > a {
font-size: 110%;
}
div.sphinxsidebar input {
border: 1px solid #CCC;
font-family: Georgia, serif;
font-size: 1em;
}
div.sphinxsidebar hr {
border: none;
height: 1px;
color: #AAA;
background: #AAA;
text-align: left;
margin-left: 0;
width: 50%;
}
div.sphinxsidebar .badge {
border-bottom: none;
}
div.sphinxsidebar .badge:hover {
border-bottom: none;
}
/* To address an issue with donation coming after search */
div.sphinxsidebar h3.donation {
margin-top: 10px;
}
/* -- body styles ----------------------------------------------------------- */
a {
color: #004B6B;
text-decoration: underline;
}
a:hover {
color: #6D4100;
text-decoration: underline;
}
div.body h1,
div.body h2,
div.body h3,
div.body h4,
div.body h5,
div.body h6 {
font-family: Georgia, serif;
font-weight: normal;
margin: 30px 0px 10px 0px;
padding: 0;
}
div.body h1 { margin-top: 0; padding-top: 0; font-size: 240%; }
div.body h2 { font-size: 180%; }
div.body h3 { font-size: 150%; }
div.body h4 { font-size: 130%; }
div.body h5 { font-size: 100%; }
div.body h6 { font-size: 100%; }
a.headerlink {
color: #DDD;
padding: 0 4px;
text-decoration: none;
}
a.headerlink:hover {
color: #444;
background: #EAEAEA;
}
div.body p, div.body dd, div.body li {
line-height: 1.4em;
}
div.admonition {
margin: 20px 0px;
padding: 10px 30px;
background-color: #EEE;
border: 1px solid #CCC;
}
div.admonition tt.xref, div.admonition code.xref, div.admonition a tt {
background-color: #FBFBFB;
border-bottom: 1px solid #fafafa;
}
div.admonition p.admonition-title {
font-family: Georgia, serif;
font-weight: normal;
font-size: 24px;
margin: 0 0 10px 0;
padding: 0;
line-height: 1;
}
div.admonition p.last {
margin-bottom: 0;
}
div.highlight {
background-color: #fff;
}
dt:target, .highlight {
background: #FAF3E8;
}
div.warning {
background-color: #FCC;
border: 1px solid #FAA;
}
div.danger {
background-color: #FCC;
border: 1px solid #FAA;
-moz-box-shadow: 2px 2px 4px #D52C2C;
-webkit-box-shadow: 2px 2px 4px #D52C2C;
box-shadow: 2px 2px 4px #D52C2C;
}
div.error {
background-color: #FCC;
border: 1px solid #FAA;
-moz-box-shadow: 2px 2px 4px #D52C2C;
-webkit-box-shadow: 2px 2px 4px #D52C2C;
box-shadow: 2px 2px 4px #D52C2C;
}
div.caution {
background-color: #FCC;
border: 1px solid #FAA;
}
div.attention {
background-color: #FCC;
border: 1px solid #FAA;
}
div.important {
background-color: #EEE;
border: 1px solid #CCC;
}
div.note {
background-color: #EEE;
border: 1px solid #CCC;
}
div.tip {
background-color: #EEE;
border: 1px solid #CCC;
}
div.hint {
background-color: #EEE;
border: 1px solid #CCC;
}
div.seealso {
background-color: #EEE;
border: 1px solid #CCC;
}
div.topic {
background-color: #EEE;
}
p.admonition-title {
display: inline;
}
p.admonition-title:after {
content: ":";
}
pre, tt, code {
font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace;
font-size: 0.9em;
}
.hll {
background-color: #FFC;
margin: 0 -12px;
padding: 0 12px;
display: block;
}
img.screenshot {
}
tt.descname, tt.descclassname, code.descname, code.descclassname {
font-size: 0.95em;
}
tt.descname, code.descname {
padding-right: 0.08em;
}
img.screenshot {
-moz-box-shadow: 2px 2px 4px #EEE;
-webkit-box-shadow: 2px 2px 4px #EEE;
box-shadow: 2px 2px 4px #EEE;
}
table.docutils {
border: 1px solid #888;
-moz-box-shadow: 2px 2px 4px #EEE;
-webkit-box-shadow: 2px 2px 4px #EEE;
box-shadow: 2px 2px 4px #EEE;
}
table.docutils td, table.docutils th {
border: 1px solid #888;
padding: 0.25em 0.7em;
}
table.field-list, table.footnote {
border: none;
-moz-box-shadow: none;
-webkit-box-shadow: none;
box-shadow: none;
}
table.footnote {
margin: 15px 0;
width: 100%;
border: 1px solid #EEE;
background: #FDFDFD;
font-size: 0.9em;
}
table.footnote + table.footnote {
margin-top: -15px;
border-top: none;
}
table.field-list th {
padding: 0 0.8em 0 0;
}
table.field-list td {
padding: 0;
}
table.field-list p {
margin-bottom: 0.8em;
}
/* Cloned from
* https://github.com/sphinx-doc/sphinx/commit/ef60dbfce09286b20b7385333d63a60321784e68
*/
.field-name {
-moz-hyphens: manual;
-ms-hyphens: manual;
-webkit-hyphens: manual;
hyphens: manual;
}
table.footnote td.label {
width: .1px;
padding: 0.3em 0 0.3em 0.5em;
}
table.footnote td {
padding: 0.3em 0.5em;
}
dl {
margin-left: 0;
margin-right: 0;
margin-top: 0;
padding: 0;
}
dl dd {
margin-left: 30px;
}
blockquote {
margin: 0 0 0 30px;
padding: 0;
}
ul, ol {
/* Matches the 30px from the narrow-screen "li > ul" selector below */
margin: 10px 0 10px 30px;
padding: 0;
}
pre {
background: #EEE;
padding: 7px 30px;
margin: 15px 0px;
line-height: 1.3em;
}
div.viewcode-block:target {
background: #ffd;
}
dl pre, blockquote pre, li pre {
margin-left: 0;
padding-left: 30px;
}
tt, code {
background-color: #ecf0f3;
color: #222;
/* padding: 1px 2px; */
}
tt.xref, code.xref, a tt {
background-color: #FBFBFB;
border-bottom: 1px solid #fff;
}
a.reference {
text-decoration: none;
border-bottom: 1px dotted #004B6B;
}
/* Don't put an underline on images */
a.image-reference, a.image-reference:hover {
border-bottom: none;
}
a.reference:hover {
border-bottom: 1px solid #6D4100;
}
a.footnote-reference {
text-decoration: none;
font-size: 0.7em;
vertical-align: top;
border-bottom: 1px dotted #004B6B;
}
a.footnote-reference:hover {
border-bottom: 1px solid #6D4100;
}
a:hover tt, a:hover code {
background: #EEE;
}
@media screen and (max-width: 870px) {
div.sphinxsidebar {
display: none;
}
div.document {
width: 100%;
}
div.documentwrapper {
margin-left: 0;
margin-top: 0;
margin-right: 0;
margin-bottom: 0;
}
div.bodywrapper {
margin-top: 0;
margin-right: 0;
margin-bottom: 0;
margin-left: 0;
}
ul {
margin-left: 0;
}
li > ul {
/* Matches the 30px from the "ul, ol" selector above */
margin-left: 30px;
}
.document {
width: auto;
}
.footer {
width: auto;
}
.bodywrapper {
margin: 0;
}
.footer {
width: auto;
}
.github {
display: none;
}
}
@media screen and (max-width: 875px) {
body {
margin: 0;
padding: 20px 30px;
}
div.documentwrapper {
float: none;
background: #fff;
}
div.sphinxsidebar {
display: block;
float: none;
width: 102.5%;
margin: 50px -30px -20px -30px;
padding: 10px 20px;
background: #333;
color: #FFF;
}
div.sphinxsidebar h3, div.sphinxsidebar h4, div.sphinxsidebar p,
div.sphinxsidebar h3 a {
color: #fff;
}
div.sphinxsidebar a {
color: #AAA;
}
div.sphinxsidebar p.logo {
display: none;
}
div.document {
width: 100%;
margin: 0;
}
div.footer {
display: none;
}
div.bodywrapper {
margin: 0;
}
div.body {
min-height: 0;
padding: 0;
}
.rtd_doc_footer {
display: none;
}
.document {
width: auto;
}
.footer {
width: auto;
}
.footer {
width: auto;
}
.github {
display: none;
}
}
/* misc. */
.revsys-inline {
display: none!important;
}
/* Make nested-list/multi-paragraph items look better in Releases changelog
* pages. Without this, docutils' magical list fuckery causes inconsistent
* formatting between different release sub-lists.
*/
div#changelog > div.section > ul > li > p:only-child {
margin-bottom: 0;
}
/* Hide fugly table cell borders in ..bibliography:: directive output */
table.docutils.citation, table.docutils.citation td, table.docutils.citation th {
border: none;
/* Below needed in some edge cases; if not applied, bottom shadows appear */
-moz-box-shadow: none;
-webkit-box-shadow: none;
box-shadow: none;
}
/* relbar */
.related {
line-height: 30px;
width: 100%;
font-size: 0.9rem;
}
.related.top {
border-bottom: 1px solid #EEE;
margin-bottom: 20px;
}
.related.bottom {
border-top: 1px solid #EEE;
}
.related ul {
padding: 0;
margin: 0;
list-style: none;
}
.related li {
display: inline;
}
nav#rellinks {
float: right;
}
nav#rellinks li+li:before {
content: "|";
}
nav#breadcrumbs li+li:before {
content: "\00BB";
}
/* Hide certain items when printing */
@media print {
div.related {
display: none;
}
}

925
docs/html/_static/basic.css Normal file

@@ -0,0 +1,925 @@
/*
* basic.css
* ~~~~~~~~~
*
* Sphinx stylesheet -- basic theme.
*
* :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
* :license: BSD, see LICENSE for details.
*
*/
/* -- main layout ----------------------------------------------------------- */
div.clearer {
clear: both;
}
div.section::after {
display: block;
content: '';
clear: left;
}
/* -- relbar ---------------------------------------------------------------- */
div.related {
width: 100%;
font-size: 90%;
}
div.related h3 {
display: none;
}
div.related ul {
margin: 0;
padding: 0 0 0 10px;
list-style: none;
}
div.related li {
display: inline;
}
div.related li.right {
float: right;
margin-right: 5px;
}
/* -- sidebar --------------------------------------------------------------- */
div.sphinxsidebarwrapper {
padding: 10px 5px 0 10px;
}
div.sphinxsidebar {
float: left;
width: 230px;
margin-left: -100%;
font-size: 90%;
word-wrap: break-word;
overflow-wrap : break-word;
}
div.sphinxsidebar ul {
list-style: none;
}
div.sphinxsidebar ul ul,
div.sphinxsidebar ul.want-points {
margin-left: 20px;
list-style: square;
}
div.sphinxsidebar ul ul {
margin-top: 0;
margin-bottom: 0;
}
div.sphinxsidebar form {
margin-top: 10px;
}
div.sphinxsidebar input {
border: 1px solid #98dbcc;
font-family: sans-serif;
font-size: 1em;
}
div.sphinxsidebar #searchbox form.search {
overflow: hidden;
}
div.sphinxsidebar #searchbox input[type="text"] {
float: left;
width: 80%;
padding: 0.25em;
box-sizing: border-box;
}
div.sphinxsidebar #searchbox input[type="submit"] {
float: left;
width: 20%;
border-left: none;
padding: 0.25em;
box-sizing: border-box;
}
img {
border: 0;
max-width: 100%;
}
/* -- search page ----------------------------------------------------------- */
ul.search {
margin: 10px 0 0 20px;
padding: 0;
}
ul.search li {
padding: 5px 0 5px 20px;
background-image: url(file.png);
background-repeat: no-repeat;
background-position: 0 7px;
}
ul.search li a {
font-weight: bold;
}
ul.search li p.context {
color: #888;
margin: 2px 0 0 30px;
text-align: left;
}
ul.keywordmatches li.goodmatch a {
font-weight: bold;
}
/* -- index page ------------------------------------------------------------ */
table.contentstable {
width: 90%;
margin-left: auto;
margin-right: auto;
}
table.contentstable p.biglink {
line-height: 150%;
}
a.biglink {
font-size: 1.3em;
}
span.linkdescr {
font-style: italic;
padding-top: 5px;
font-size: 90%;
}
/* -- general index --------------------------------------------------------- */
table.indextable {
width: 100%;
}
table.indextable td {
text-align: left;
vertical-align: top;
}
table.indextable ul {
margin-top: 0;
margin-bottom: 0;
list-style-type: none;
}
table.indextable > tbody > tr > td > ul {
padding-left: 0em;
}
table.indextable tr.pcap {
height: 10px;
}
table.indextable tr.cap {
margin-top: 10px;
background-color: #f2f2f2;
}
img.toggler {
margin-right: 3px;
margin-top: 3px;
cursor: pointer;
}
div.modindex-jumpbox {
border-top: 1px solid #ddd;
border-bottom: 1px solid #ddd;
margin: 1em 0 1em 0;
padding: 0.4em;
}
div.genindex-jumpbox {
border-top: 1px solid #ddd;
border-bottom: 1px solid #ddd;
margin: 1em 0 1em 0;
padding: 0.4em;
}
/* -- domain module index --------------------------------------------------- */
table.modindextable td {
padding: 2px;
border-collapse: collapse;
}
/* -- general body styles --------------------------------------------------- */
div.body {
min-width: 360px;
max-width: 800px;
}
div.body p, div.body dd, div.body li, div.body blockquote {
-moz-hyphens: auto;
-ms-hyphens: auto;
-webkit-hyphens: auto;
hyphens: auto;
}
a.headerlink {
visibility: hidden;
}
a:visited {
color: #551A8B;
}
h1:hover > a.headerlink,
h2:hover > a.headerlink,
h3:hover > a.headerlink,
h4:hover > a.headerlink,
h5:hover > a.headerlink,
h6:hover > a.headerlink,
dt:hover > a.headerlink,
caption:hover > a.headerlink,
p.caption:hover > a.headerlink,
div.code-block-caption:hover > a.headerlink {
visibility: visible;
}
div.body p.caption {
text-align: inherit;
}
div.body td {
text-align: left;
}
.first {
margin-top: 0 !important;
}
p.rubric {
margin-top: 30px;
font-weight: bold;
}
img.align-left, figure.align-left, .figure.align-left, object.align-left {
clear: left;
float: left;
margin-right: 1em;
}
img.align-right, figure.align-right, .figure.align-right, object.align-right {
clear: right;
float: right;
margin-left: 1em;
}
img.align-center, figure.align-center, .figure.align-center, object.align-center {
display: block;
margin-left: auto;
margin-right: auto;
}
img.align-default, figure.align-default, .figure.align-default {
display: block;
margin-left: auto;
margin-right: auto;
}
.align-left {
text-align: left;
}
.align-center {
text-align: center;
}
.align-default {
text-align: center;
}
.align-right {
text-align: right;
}
/* -- sidebars -------------------------------------------------------------- */
div.sidebar,
aside.sidebar {
margin: 0 0 0.5em 1em;
border: 1px solid #ddb;
padding: 7px;
background-color: #ffe;
width: 40%;
float: right;
clear: right;
overflow-x: auto;
}
p.sidebar-title {
font-weight: bold;
}
nav.contents,
aside.topic,
div.admonition, div.topic, blockquote {
clear: left;
}
/* -- topics ---------------------------------------------------------------- */
nav.contents,
aside.topic,
div.topic {
border: 1px solid #ccc;
padding: 7px;
margin: 10px 0 10px 0;
}
p.topic-title {
font-size: 1.1em;
font-weight: bold;
margin-top: 10px;
}
/* -- admonitions ----------------------------------------------------------- */
div.admonition {
margin-top: 10px;
margin-bottom: 10px;
padding: 7px;
}
div.admonition dt {
font-weight: bold;
}
p.admonition-title {
margin: 0px 10px 5px 0px;
font-weight: bold;
}
div.body p.centered {
text-align: center;
margin-top: 25px;
}
/* -- content of sidebars/topics/admonitions -------------------------------- */
div.sidebar > :last-child,
aside.sidebar > :last-child,
nav.contents > :last-child,
aside.topic > :last-child,
div.topic > :last-child,
div.admonition > :last-child {
margin-bottom: 0;
}
div.sidebar::after,
aside.sidebar::after,
nav.contents::after,
aside.topic::after,
div.topic::after,
div.admonition::after,
blockquote::after {
display: block;
content: '';
clear: both;
}
/* -- tables ---------------------------------------------------------------- */
table.docutils {
margin-top: 10px;
margin-bottom: 10px;
border: 0;
border-collapse: collapse;
}
table.align-center {
margin-left: auto;
margin-right: auto;
}
table.align-default {
margin-left: auto;
margin-right: auto;
}
table caption span.caption-number {
font-style: italic;
}
table caption span.caption-text {
}
table.docutils td, table.docutils th {
padding: 1px 8px 1px 5px;
border-top: 0;
border-left: 0;
border-right: 0;
border-bottom: 1px solid #aaa;
}
th {
text-align: left;
padding-right: 5px;
}
table.citation {
border-left: solid 1px gray;
margin-left: 1px;
}
table.citation td {
border-bottom: none;
}
th > :first-child,
td > :first-child {
margin-top: 0px;
}
th > :last-child,
td > :last-child {
margin-bottom: 0px;
}
/* -- figures --------------------------------------------------------------- */
div.figure, figure {
margin: 0.5em;
padding: 0.5em;
}
div.figure p.caption, figcaption {
padding: 0.3em;
}
div.figure p.caption span.caption-number,
figcaption span.caption-number {
font-style: italic;
}
div.figure p.caption span.caption-text,
figcaption span.caption-text {
}
/* -- field list styles ----------------------------------------------------- */
table.field-list td, table.field-list th {
border: 0 !important;
}
.field-list ul {
margin: 0;
padding-left: 1em;
}
.field-list p {
margin: 0;
}
.field-name {
-moz-hyphens: manual;
-ms-hyphens: manual;
-webkit-hyphens: manual;
hyphens: manual;
}
/* -- hlist styles ---------------------------------------------------------- */
table.hlist {
margin: 1em 0;
}
table.hlist td {
vertical-align: top;
}
/* -- object description styles --------------------------------------------- */
.sig {
font-family: 'Consolas', 'Menlo', 'DejaVu Sans Mono', 'Bitstream Vera Sans Mono', monospace;
}
.sig-name, code.descname {
background-color: transparent;
font-weight: bold;
}
.sig-name {
font-size: 1.1em;
}
code.descname {
font-size: 1.2em;
}
.sig-prename, code.descclassname {
background-color: transparent;
}
.optional {
font-size: 1.3em;
}
.sig-paren {
font-size: larger;
}
.sig-param.n {
font-style: italic;
}
/* C++ specific styling */
.sig-inline.c-texpr,
.sig-inline.cpp-texpr {
font-family: unset;
}
.sig.c .k, .sig.c .kt,
.sig.cpp .k, .sig.cpp .kt {
color: #0033B3;
}
.sig.c .m,
.sig.cpp .m {
color: #1750EB;
}
.sig.c .s, .sig.c .sc,
.sig.cpp .s, .sig.cpp .sc {
color: #067D17;
}
/* -- other body styles ----------------------------------------------------- */
ol.arabic {
list-style: decimal;
}
ol.loweralpha {
list-style: lower-alpha;
}
ol.upperalpha {
list-style: upper-alpha;
}
ol.lowerroman {
list-style: lower-roman;
}
ol.upperroman {
list-style: upper-roman;
}
:not(li) > ol > li:first-child > :first-child,
:not(li) > ul > li:first-child > :first-child {
margin-top: 0px;
}
:not(li) > ol > li:last-child > :last-child,
:not(li) > ul > li:last-child > :last-child {
margin-bottom: 0px;
}
ol.simple ol p,
ol.simple ul p,
ul.simple ol p,
ul.simple ul p {
margin-top: 0;
}
ol.simple > li:not(:first-child) > p,
ul.simple > li:not(:first-child) > p {
margin-top: 0;
}
ol.simple p,
ul.simple p {
margin-bottom: 0;
}
aside.footnote > span,
div.citation > span {
float: left;
}
aside.footnote > span:last-of-type,
div.citation > span:last-of-type {
padding-right: 0.5em;
}
aside.footnote > p {
margin-left: 2em;
}
div.citation > p {
margin-left: 4em;
}
aside.footnote > p:last-of-type,
div.citation > p:last-of-type {
margin-bottom: 0em;
}
aside.footnote > p:last-of-type:after,
div.citation > p:last-of-type:after {
content: "";
clear: both;
}
dl.field-list {
display: grid;
grid-template-columns: fit-content(30%) auto;
}
dl.field-list > dt {
font-weight: bold;
word-break: break-word;
padding-left: 0.5em;
padding-right: 5px;
}
dl.field-list > dd {
padding-left: 0.5em;
margin-top: 0em;
margin-left: 0em;
margin-bottom: 0em;
}
dl {
margin-bottom: 15px;
}
dd > :first-child {
margin-top: 0px;
}
dd ul, dd table {
margin-bottom: 10px;
}
dd {
margin-top: 3px;
margin-bottom: 10px;
margin-left: 30px;
}
.sig dd {
margin-top: 0px;
margin-bottom: 0px;
}
.sig dl {
margin-top: 0px;
margin-bottom: 0px;
}
dl > dd:last-child,
dl > dd:last-child > :last-child {
margin-bottom: 0;
}
dt:target, span.highlighted {
background-color: #fbe54e;
}
rect.highlighted {
fill: #fbe54e;
}
dl.glossary dt {
font-weight: bold;
font-size: 1.1em;
}
.versionmodified {
font-style: italic;
}
.system-message {
background-color: #fda;
padding: 5px;
border: 3px solid red;
}
.footnote:target {
background-color: #ffa;
}
.line-block {
display: block;
margin-top: 1em;
margin-bottom: 1em;
}
.line-block .line-block {
margin-top: 0;
margin-bottom: 0;
margin-left: 1.5em;
}
.guilabel, .menuselection {
font-family: sans-serif;
}
.accelerator {
text-decoration: underline;
}
.classifier {
font-style: oblique;
}
.classifier:before {
font-style: normal;
margin: 0 0.5em;
content: ":";
display: inline-block;
}
abbr, acronym {
border-bottom: dotted 1px;
cursor: help;
}
.translated {
background-color: rgba(207, 255, 207, 0.2)
}
.untranslated {
background-color: rgba(255, 207, 207, 0.2)
}
/* -- code displays --------------------------------------------------------- */
pre {
overflow: auto;
overflow-y: hidden; /* fixes display issues on Chrome browsers */
}
pre, div[class*="highlight-"] {
clear: both;
}
span.pre {
-moz-hyphens: none;
-ms-hyphens: none;
-webkit-hyphens: none;
hyphens: none;
white-space: nowrap;
}
div[class*="highlight-"] {
margin: 1em 0;
}
td.linenos pre {
border: 0;
background-color: transparent;
color: #aaa;
}
table.highlighttable {
display: block;
}
table.highlighttable tbody {
display: block;
}
table.highlighttable tr {
display: flex;
}
table.highlighttable td {
margin: 0;
padding: 0;
}
table.highlighttable td.linenos {
padding-right: 0.5em;
}
table.highlighttable td.code {
flex: 1;
overflow: hidden;
}
.highlight .hll {
display: block;
}
div.highlight pre,
table.highlighttable pre {
margin: 0;
}
div.code-block-caption + div {
margin-top: 0;
}
div.code-block-caption {
margin-top: 1em;
padding: 2px 5px;
font-size: small;
}
div.code-block-caption code {
background-color: transparent;
}
table.highlighttable td.linenos,
span.linenos,
div.highlight span.gp { /* gp: Generic.Prompt */
user-select: none;
-webkit-user-select: text; /* Safari fallback only */
-webkit-user-select: none; /* Chrome/Safari */
-moz-user-select: none; /* Firefox */
-ms-user-select: none; /* IE10+ */
}
div.code-block-caption span.caption-number {
padding: 0.1em 0.3em;
font-style: italic;
}
div.code-block-caption span.caption-text {
}
div.literal-block-wrapper {
margin: 1em 0;
}
code.xref, a code {
background-color: transparent;
font-weight: bold;
}
h1 code, h2 code, h3 code, h4 code, h5 code, h6 code {
background-color: transparent;
}
.viewcode-link {
float: right;
}
.viewcode-back {
float: right;
font-family: sans-serif;
}
div.viewcode-block:target {
margin: -1px -10px;
padding: 0 10px;
}
/* -- math display ---------------------------------------------------------- */
img.math {
vertical-align: middle;
}
div.body div.math p {
text-align: center;
}
span.eqno {
float: right;
}
span.eqno a.headerlink {
position: absolute;
z-index: 1;
}
div.math:hover a.headerlink {
visibility: visible;
}
/* -- printout stylesheet --------------------------------------------------- */
@media print {
div.document,
div.documentwrapper,
div.bodywrapper {
margin: 0 !important;
width: 100%;
}
div.sphinxsidebar,
div.related,
div.footer,
#top-link {
display: none;
}
}


@@ -0,0 +1 @@
/* This file intentionally left blank. */


@@ -0,0 +1,156 @@
/*
* doctools.js
* ~~~~~~~~~~~
*
* Base JavaScript utilities for all Sphinx HTML documentation.
*
* :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
* :license: BSD, see LICENSE for details.
*
*/
"use strict";
const BLACKLISTED_KEY_CONTROL_ELEMENTS = new Set([
"TEXTAREA",
"INPUT",
"SELECT",
"BUTTON",
]);
const _ready = (callback) => {
if (document.readyState !== "loading") {
callback();
} else {
document.addEventListener("DOMContentLoaded", callback);
}
};
/**
* Small JavaScript module for the documentation.
*/
const Documentation = {
init: () => {
Documentation.initDomainIndexTable();
Documentation.initOnKeyListeners();
},
/**
* i18n support
*/
TRANSLATIONS: {},
PLURAL_EXPR: (n) => (n === 1 ? 0 : 1),
LOCALE: "unknown",
// gettext and ngettext don't access this so that the functions
// can safely bound to a different name (_ = Documentation.gettext)
gettext: (string) => {
const translated = Documentation.TRANSLATIONS[string];
switch (typeof translated) {
case "undefined":
return string; // no translation
case "string":
return translated; // translation exists
default:
return translated[0]; // (singular, plural) translation tuple exists
}
},
ngettext: (singular, plural, n) => {
const translated = Documentation.TRANSLATIONS[singular];
if (typeof translated !== "undefined")
return translated[Documentation.PLURAL_EXPR(n)];
return n === 1 ? singular : plural;
},
addTranslations: (catalog) => {
Object.assign(Documentation.TRANSLATIONS, catalog.messages);
Documentation.PLURAL_EXPR = new Function(
"n",
`return (${catalog.plural_expr})`
);
Documentation.LOCALE = catalog.locale;
},
/**
* helper function to focus on search bar
*/
focusSearchBar: () => {
document.querySelectorAll("input[name=q]")[0]?.focus();
},
/**
* Initialise the domain index toggle buttons
*/
initDomainIndexTable: () => {
const toggler = (el) => {
const idNumber = el.id.substr(7);
const toggledRows = document.querySelectorAll(`tr.cg-${idNumber}`);
if (el.src.substr(-9) === "minus.png") {
el.src = `${el.src.substr(0, el.src.length - 9)}plus.png`;
toggledRows.forEach((el) => (el.style.display = "none"));
} else {
el.src = `${el.src.substr(0, el.src.length - 8)}minus.png`;
toggledRows.forEach((el) => (el.style.display = ""));
}
};
const togglerElements = document.querySelectorAll("img.toggler");
togglerElements.forEach((el) =>
el.addEventListener("click", (event) => toggler(event.currentTarget))
);
togglerElements.forEach((el) => (el.style.display = ""));
if (DOCUMENTATION_OPTIONS.COLLAPSE_INDEX) togglerElements.forEach(toggler);
},
initOnKeyListeners: () => {
// only install a listener if it is really needed
if (
!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS &&
!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS
)
return;
document.addEventListener("keydown", (event) => {
// bail for input elements
if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return;
// bail with special keys
if (event.altKey || event.ctrlKey || event.metaKey) return;
if (!event.shiftKey) {
switch (event.key) {
case "ArrowLeft":
if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break;
const prevLink = document.querySelector('link[rel="prev"]');
if (prevLink && prevLink.href) {
window.location.href = prevLink.href;
event.preventDefault();
}
break;
case "ArrowRight":
if (!DOCUMENTATION_OPTIONS.NAVIGATION_WITH_KEYS) break;
const nextLink = document.querySelector('link[rel="next"]');
if (nextLink && nextLink.href) {
window.location.href = nextLink.href;
event.preventDefault();
}
break;
}
}
// some keyboard layouts may need Shift to get /
switch (event.key) {
case "/":
if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) break;
Documentation.focusSearchBar();
event.preventDefault();
}
});
},
};
// quick alias for translations
const _ = Documentation.gettext;
_ready(Documentation.init);


@@ -0,0 +1,13 @@
const DOCUMENTATION_OPTIONS = {
VERSION: '',
LANGUAGE: 'en',
COLLAPSE_INDEX: false,
BUILDER: 'html',
FILE_SUFFIX: '.html',
LINK_SUFFIX: '.html',
HAS_SOURCE: true,
SOURCELINK_SUFFIX: '.txt',
NAVIGATION_WITH_KEYS: false,
SHOW_SEARCH_SUMMARY: true,
ENABLE_SEARCH_SHORTCUTS: true,
};

BIN
docs/html/_static/file.png Normal file

Binary file not shown.



@@ -0,0 +1,199 @@
/*
* language_data.js
* ~~~~~~~~~~~~~~~~
*
* This script contains the language-specific data used by searchtools.js,
* namely the list of stopwords, stemmer, scorer and splitter.
*
* :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
* :license: BSD, see LICENSE for details.
*
*/
var stopwords = ["a", "and", "are", "as", "at", "be", "but", "by", "for", "if", "in", "into", "is", "it", "near", "no", "not", "of", "on", "or", "such", "that", "the", "their", "then", "there", "these", "they", "this", "to", "was", "will", "with"];
/* Non-minified version is copied as a separate JS file, if available */
/**
* Porter Stemmer
*/
var Stemmer = function() {
var step2list = {
ational: 'ate',
tional: 'tion',
enci: 'ence',
anci: 'ance',
izer: 'ize',
bli: 'ble',
alli: 'al',
entli: 'ent',
eli: 'e',
ousli: 'ous',
ization: 'ize',
ation: 'ate',
ator: 'ate',
alism: 'al',
iveness: 'ive',
fulness: 'ful',
ousness: 'ous',
aliti: 'al',
iviti: 'ive',
biliti: 'ble',
logi: 'log'
};
var step3list = {
icate: 'ic',
ative: '',
alize: 'al',
iciti: 'ic',
ical: 'ic',
ful: '',
ness: ''
};
var c = "[^aeiou]"; // consonant
var v = "[aeiouy]"; // vowel
var C = c + "[^aeiouy]*"; // consonant sequence
var V = v + "[aeiou]*"; // vowel sequence
var mgr0 = "^(" + C + ")?" + V + C; // [C]VC... is m>0
var meq1 = "^(" + C + ")?" + V + C + "(" + V + ")?$"; // [C]VC[V] is m=1
var mgr1 = "^(" + C + ")?" + V + C + V + C; // [C]VCVC... is m>1
var s_v = "^(" + C + ")?" + v; // vowel in stem
this.stemWord = function (w) {
var stem;
var suffix;
var firstch;
var origword = w;
if (w.length < 3)
return w;
var re;
var re2;
var re3;
var re4;
firstch = w.substr(0,1);
if (firstch == "y")
w = firstch.toUpperCase() + w.substr(1);
// Step 1a
re = /^(.+?)(ss|i)es$/;
re2 = /^(.+?)([^s])s$/;
if (re.test(w))
w = w.replace(re,"$1$2");
else if (re2.test(w))
w = w.replace(re2,"$1$2");
// Step 1b
re = /^(.+?)eed$/;
re2 = /^(.+?)(ed|ing)$/;
if (re.test(w)) {
var fp = re.exec(w);
re = new RegExp(mgr0);
if (re.test(fp[1])) {
re = /.$/;
w = w.replace(re,"");
}
}
else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1];
re2 = new RegExp(s_v);
if (re2.test(stem)) {
w = stem;
re2 = /(at|bl|iz)$/;
re3 = new RegExp("([^aeiouylsz])\\1$");
re4 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re2.test(w))
w = w + "e";
else if (re3.test(w)) {
re = /.$/;
w = w.replace(re,"");
}
else if (re4.test(w))
w = w + "e";
}
}
// Step 1c
re = /^(.+?)y$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(s_v);
if (re.test(stem))
w = stem + "i";
}
// Step 2
re = /^(.+?)(ational|tional|enci|anci|izer|bli|alli|entli|eli|ousli|ization|ation|ator|alism|iveness|fulness|ousness|aliti|iviti|biliti|logi)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem))
w = stem + step2list[suffix];
}
// Step 3
re = /^(.+?)(icate|ative|alize|iciti|ical|ful|ness)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
suffix = fp[2];
re = new RegExp(mgr0);
if (re.test(stem))
w = stem + step3list[suffix];
}
// Step 4
re = /^(.+?)(al|ance|ence|er|ic|able|ible|ant|ement|ment|ent|ou|ism|ate|iti|ous|ive|ize)$/;
re2 = /^(.+?)(s|t)(ion)$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
if (re.test(stem))
w = stem;
}
else if (re2.test(w)) {
var fp = re2.exec(w);
stem = fp[1] + fp[2];
re2 = new RegExp(mgr1);
if (re2.test(stem))
w = stem;
}
// Step 5
re = /^(.+?)e$/;
if (re.test(w)) {
var fp = re.exec(w);
stem = fp[1];
re = new RegExp(mgr1);
re2 = new RegExp(meq1);
re3 = new RegExp("^" + C + v + "[^aeiouwxy]$");
if (re.test(stem) || (re2.test(stem) && !(re3.test(stem))))
w = stem;
}
re = /ll$/;
re2 = new RegExp(mgr1);
if (re.test(w) && re2.test(w)) {
re = /.$/;
w = w.replace(re,"");
}
// and turn initial Y back to y
if (firstch == "y")
w = firstch.toLowerCase() + w.substr(1);
return w;
}
}

BIN docs/html/_static/minus.png Normal file (binary image, 90 B; not shown)

BIN docs/html/_static/plus.png Normal file (binary image, 90 B; not shown)

View File

@ -0,0 +1,84 @@
pre { line-height: 125%; }
td.linenos .normal { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
span.linenos { color: inherit; background-color: transparent; padding-left: 5px; padding-right: 5px; }
td.linenos .special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
span.linenos.special { color: #000000; background-color: #ffffc0; padding-left: 5px; padding-right: 5px; }
.highlight .hll { background-color: #ffffcc }
.highlight { background: #f8f8f8; }
.highlight .c { color: #8f5902; font-style: italic } /* Comment */
.highlight .err { color: #a40000; border: 1px solid #ef2929 } /* Error */
.highlight .g { color: #000000 } /* Generic */
.highlight .k { color: #004461; font-weight: bold } /* Keyword */
.highlight .l { color: #000000 } /* Literal */
.highlight .n { color: #000000 } /* Name */
.highlight .o { color: #582800 } /* Operator */
.highlight .x { color: #000000 } /* Other */
.highlight .p { color: #000000; font-weight: bold } /* Punctuation */
.highlight .ch { color: #8f5902; font-style: italic } /* Comment.Hashbang */
.highlight .cm { color: #8f5902; font-style: italic } /* Comment.Multiline */
.highlight .cp { color: #8f5902 } /* Comment.Preproc */
.highlight .cpf { color: #8f5902; font-style: italic } /* Comment.PreprocFile */
.highlight .c1 { color: #8f5902; font-style: italic } /* Comment.Single */
.highlight .cs { color: #8f5902; font-style: italic } /* Comment.Special */
.highlight .gd { color: #a40000 } /* Generic.Deleted */
.highlight .ge { color: #000000; font-style: italic } /* Generic.Emph */
.highlight .ges { color: #000000 } /* Generic.EmphStrong */
.highlight .gr { color: #ef2929 } /* Generic.Error */
.highlight .gh { color: #000080; font-weight: bold } /* Generic.Heading */
.highlight .gi { color: #00A000 } /* Generic.Inserted */
.highlight .go { color: #888888 } /* Generic.Output */
.highlight .gp { color: #745334 } /* Generic.Prompt */
.highlight .gs { color: #000000; font-weight: bold } /* Generic.Strong */
.highlight .gu { color: #800080; font-weight: bold } /* Generic.Subheading */
.highlight .gt { color: #a40000; font-weight: bold } /* Generic.Traceback */
.highlight .kc { color: #004461; font-weight: bold } /* Keyword.Constant */
.highlight .kd { color: #004461; font-weight: bold } /* Keyword.Declaration */
.highlight .kn { color: #004461; font-weight: bold } /* Keyword.Namespace */
.highlight .kp { color: #004461; font-weight: bold } /* Keyword.Pseudo */
.highlight .kr { color: #004461; font-weight: bold } /* Keyword.Reserved */
.highlight .kt { color: #004461; font-weight: bold } /* Keyword.Type */
.highlight .ld { color: #000000 } /* Literal.Date */
.highlight .m { color: #990000 } /* Literal.Number */
.highlight .s { color: #4e9a06 } /* Literal.String */
.highlight .na { color: #c4a000 } /* Name.Attribute */
.highlight .nb { color: #004461 } /* Name.Builtin */
.highlight .nc { color: #000000 } /* Name.Class */
.highlight .no { color: #000000 } /* Name.Constant */
.highlight .nd { color: #888888 } /* Name.Decorator */
.highlight .ni { color: #ce5c00 } /* Name.Entity */
.highlight .ne { color: #cc0000; font-weight: bold } /* Name.Exception */
.highlight .nf { color: #000000 } /* Name.Function */
.highlight .nl { color: #f57900 } /* Name.Label */
.highlight .nn { color: #000000 } /* Name.Namespace */
.highlight .nx { color: #000000 } /* Name.Other */
.highlight .py { color: #000000 } /* Name.Property */
.highlight .nt { color: #004461; font-weight: bold } /* Name.Tag */
.highlight .nv { color: #000000 } /* Name.Variable */
.highlight .ow { color: #004461; font-weight: bold } /* Operator.Word */
.highlight .pm { color: #000000; font-weight: bold } /* Punctuation.Marker */
.highlight .w { color: #f8f8f8; text-decoration: underline } /* Text.Whitespace */
.highlight .mb { color: #990000 } /* Literal.Number.Bin */
.highlight .mf { color: #990000 } /* Literal.Number.Float */
.highlight .mh { color: #990000 } /* Literal.Number.Hex */
.highlight .mi { color: #990000 } /* Literal.Number.Integer */
.highlight .mo { color: #990000 } /* Literal.Number.Oct */
.highlight .sa { color: #4e9a06 } /* Literal.String.Affix */
.highlight .sb { color: #4e9a06 } /* Literal.String.Backtick */
.highlight .sc { color: #4e9a06 } /* Literal.String.Char */
.highlight .dl { color: #4e9a06 } /* Literal.String.Delimiter */
.highlight .sd { color: #8f5902; font-style: italic } /* Literal.String.Doc */
.highlight .s2 { color: #4e9a06 } /* Literal.String.Double */
.highlight .se { color: #4e9a06 } /* Literal.String.Escape */
.highlight .sh { color: #4e9a06 } /* Literal.String.Heredoc */
.highlight .si { color: #4e9a06 } /* Literal.String.Interpol */
.highlight .sx { color: #4e9a06 } /* Literal.String.Other */
.highlight .sr { color: #4e9a06 } /* Literal.String.Regex */
.highlight .s1 { color: #4e9a06 } /* Literal.String.Single */
.highlight .ss { color: #4e9a06 } /* Literal.String.Symbol */
.highlight .bp { color: #3465a4 } /* Name.Builtin.Pseudo */
.highlight .fm { color: #000000 } /* Name.Function.Magic */
.highlight .vc { color: #000000 } /* Name.Variable.Class */
.highlight .vg { color: #000000 } /* Name.Variable.Global */
.highlight .vi { color: #000000 } /* Name.Variable.Instance */
.highlight .vm { color: #000000 } /* Name.Variable.Magic */
.highlight .il { color: #990000 } /* Literal.Number.Integer.Long */

View File

@ -0,0 +1,574 @@
/*
* searchtools.js
* ~~~~~~~~~~~~~~~~
*
* Sphinx JavaScript utilities for the full-text search.
*
* :copyright: Copyright 2007-2023 by the Sphinx team, see AUTHORS.
* :license: BSD, see LICENSE for details.
*
*/
"use strict";
/**
* Simple result scoring code.
*/
if (typeof Scorer === "undefined") {
var Scorer = {
// Implement the following function to further tweak the score for each result
// The function takes a result array [docname, title, anchor, descr, score, filename]
// and returns the new score.
/*
score: result => {
const [docname, title, anchor, descr, score, filename] = result
return score
},
*/
// query matches the full name of an object
objNameMatch: 11,
// or matches in the last dotted part of the object name
objPartialMatch: 6,
// Additive scores depending on the priority of the object
objPrio: {
0: 15, // used to be importantResults
1: 5, // used to be objectResults
2: -5, // used to be unimportantResults
},
// Used when the priority is not in the mapping.
objPrioDefault: 0,
// query found in title
title: 15,
partialTitle: 7,
// query found in terms
term: 5,
partialTerm: 2,
};
}
const _removeChildren = (element) => {
while (element && element.lastChild) element.removeChild(element.lastChild);
};
/**
* See https://developer.mozilla.org/en-US/docs/Web/JavaScript/Guide/Regular_Expressions#escaping
*/
const _escapeRegExp = (string) =>
string.replace(/[.*+\-?^${}()|[\]\\]/g, "\\$&"); // $& means the whole matched string
const _displayItem = (item, searchTerms, highlightTerms) => {
const docBuilder = DOCUMENTATION_OPTIONS.BUILDER;
const docFileSuffix = DOCUMENTATION_OPTIONS.FILE_SUFFIX;
const docLinkSuffix = DOCUMENTATION_OPTIONS.LINK_SUFFIX;
const showSearchSummary = DOCUMENTATION_OPTIONS.SHOW_SEARCH_SUMMARY;
const contentRoot = document.documentElement.dataset.content_root;
const [docName, title, anchor, descr, score, _filename] = item;
let listItem = document.createElement("li");
let requestUrl;
let linkUrl;
if (docBuilder === "dirhtml") {
// dirhtml builder
let dirname = docName + "/";
if (dirname.match(/\/index\/$/))
dirname = dirname.substring(0, dirname.length - 6);
else if (dirname === "index/") dirname = "";
requestUrl = contentRoot + dirname;
linkUrl = requestUrl;
} else {
// normal html builders
requestUrl = contentRoot + docName + docFileSuffix;
linkUrl = docName + docLinkSuffix;
}
let linkEl = listItem.appendChild(document.createElement("a"));
linkEl.href = linkUrl + anchor;
linkEl.dataset.score = score;
linkEl.innerHTML = title;
if (descr) {
listItem.appendChild(document.createElement("span")).innerHTML =
" (" + descr + ")";
// highlight search terms in the description
if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js
highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted"));
}
else if (showSearchSummary)
fetch(requestUrl)
.then((responseData) => responseData.text())
.then((data) => {
if (data)
listItem.appendChild(
Search.makeSearchSummary(data, searchTerms)
);
// highlight search terms in the summary
if (SPHINX_HIGHLIGHT_ENABLED) // set in sphinx_highlight.js
highlightTerms.forEach((term) => _highlightText(listItem, term, "highlighted"));
});
Search.output.appendChild(listItem);
};
const _finishSearch = (resultCount) => {
Search.stopPulse();
Search.title.innerText = _("Search Results");
if (!resultCount)
Search.status.innerText = Documentation.gettext(
"Your search did not match any documents. Please make sure that all words are spelled correctly and that you've selected enough categories."
);
else
Search.status.innerText = _(
`Search finished, found ${resultCount} page(s) matching the search query.`
);
};
const _displayNextItem = (
results,
resultCount,
searchTerms,
highlightTerms,
) => {
// results left, load the summary and display it
// this is intended to be dynamic (don't sub resultsCount)
if (results.length) {
_displayItem(results.pop(), searchTerms, highlightTerms);
setTimeout(
() => _displayNextItem(results, resultCount, searchTerms, highlightTerms),
5
);
}
// search finished, update title and status message
else _finishSearch(resultCount);
};
/**
* Default splitQuery function. Can be overridden in ``sphinx.search`` with a
* custom function per language.
*
* The regular expression works by splitting the string on consecutive characters
* that are not Unicode letters, numbers, underscores, or emoji characters.
* This is the same as ``\W+`` in Python, preserving the surrogate pair area.
*/
if (typeof splitQuery === "undefined") {
var splitQuery = (query) => query
.split(/[^\p{Letter}\p{Number}_\p{Emoji_Presentation}]+/gu)
.filter(term => term) // remove remaining empty strings
}
/**
* Search Module
*/
const Search = {
_index: null,
_queued_query: null,
_pulse_status: -1,
htmlToText: (htmlString) => {
const htmlElement = new DOMParser().parseFromString(htmlString, 'text/html');
htmlElement.querySelectorAll(".headerlink").forEach((el) => { el.remove() });
const docContent = htmlElement.querySelector('[role="main"]');
if (docContent) return docContent.textContent; // querySelector yields null (not undefined) when the block is missing
console.warn(
"Content block not found. Sphinx search tries to obtain it via '[role=main]'. Could you check your theme or template."
);
return "";
},
init: () => {
const query = new URLSearchParams(window.location.search).get("q");
document
.querySelectorAll('input[name="q"]')
.forEach((el) => (el.value = query));
if (query) Search.performSearch(query);
},
loadIndex: (url) =>
(document.body.appendChild(document.createElement("script")).src = url),
setIndex: (index) => {
Search._index = index;
if (Search._queued_query !== null) {
const query = Search._queued_query;
Search._queued_query = null;
Search.query(query);
}
},
hasIndex: () => Search._index !== null,
deferQuery: (query) => (Search._queued_query = query),
stopPulse: () => (Search._pulse_status = -1),
startPulse: () => {
if (Search._pulse_status >= 0) return;
const pulse = () => {
Search._pulse_status = (Search._pulse_status + 1) % 4;
Search.dots.innerText = ".".repeat(Search._pulse_status);
if (Search._pulse_status >= 0) window.setTimeout(pulse, 500);
};
pulse();
},
/**
* perform a search for something (or wait until index is loaded)
*/
performSearch: (query) => {
// create the required interface elements
const searchText = document.createElement("h2");
searchText.textContent = _("Searching");
const searchSummary = document.createElement("p");
searchSummary.classList.add("search-summary");
searchSummary.innerText = "";
const searchList = document.createElement("ul");
searchList.classList.add("search");
const out = document.getElementById("search-results");
Search.title = out.appendChild(searchText);
Search.dots = Search.title.appendChild(document.createElement("span"));
Search.status = out.appendChild(searchSummary);
Search.output = out.appendChild(searchList);
const searchProgress = document.getElementById("search-progress");
// Some themes don't use the search progress node
if (searchProgress) {
searchProgress.innerText = _("Preparing search...");
}
Search.startPulse();
// index already loaded, the browser was quick!
if (Search.hasIndex()) Search.query(query);
else Search.deferQuery(query);
},
/**
* execute search (requires search index to be loaded)
*/
query: (query) => {
const filenames = Search._index.filenames;
const docNames = Search._index.docnames;
const titles = Search._index.titles;
const allTitles = Search._index.alltitles;
const indexEntries = Search._index.indexentries;
// stem the search terms and add them to the correct list
const stemmer = new Stemmer();
const searchTerms = new Set();
const excludedTerms = new Set();
const highlightTerms = new Set();
const objectTerms = new Set(splitQuery(query.toLowerCase().trim()));
splitQuery(query.trim()).forEach((queryTerm) => {
const queryTermLower = queryTerm.toLowerCase();
// maybe skip this "word"
// stopwords array is from language_data.js
if (
stopwords.indexOf(queryTermLower) !== -1 ||
queryTerm.match(/^\d+$/)
)
return;
// stem the word
let word = stemmer.stemWord(queryTermLower);
// select the correct list
if (word[0] === "-") excludedTerms.add(word.substr(1));
else {
searchTerms.add(word);
highlightTerms.add(queryTermLower);
}
});
if (SPHINX_HIGHLIGHT_ENABLED) { // set in sphinx_highlight.js
localStorage.setItem("sphinx_highlight_terms", [...highlightTerms].join(" "))
}
// console.debug("SEARCH: searching for:");
// console.info("required: ", [...searchTerms]);
// console.info("excluded: ", [...excludedTerms]);
// array of [docname, title, anchor, descr, score, filename]
let results = [];
_removeChildren(document.getElementById("search-progress"));
const queryLower = query.toLowerCase();
for (const [title, foundTitles] of Object.entries(allTitles)) {
if (title.toLowerCase().includes(queryLower) && (queryLower.length >= title.length/2)) {
for (const [file, id] of foundTitles) {
let score = Math.round(100 * queryLower.length / title.length)
results.push([
docNames[file],
titles[file] !== title ? `${titles[file]} > ${title}` : title,
id !== null ? "#" + id : "",
null,
score,
filenames[file],
]);
}
}
}
// search for explicit entries in index directives
for (const [entry, foundEntries] of Object.entries(indexEntries)) {
if (entry.includes(queryLower) && (queryLower.length >= entry.length/2)) {
for (const [file, id] of foundEntries) {
let score = Math.round(100 * queryLower.length / entry.length)
results.push([
docNames[file],
titles[file],
id ? "#" + id : "",
null,
score,
filenames[file],
]);
}
}
}
// lookup as object
objectTerms.forEach((term) =>
results.push(...Search.performObjectSearch(term, objectTerms))
);
// lookup as search terms in fulltext
results.push(...Search.performTermsSearch(searchTerms, excludedTerms));
// let the scorer override scores with a custom scoring function
if (Scorer.score) results.forEach((item) => (item[4] = Scorer.score(item)));
// now sort the results by score (in opposite order of appearance, since the
// display function below uses pop() to retrieve items) and then
// alphabetically
results.sort((a, b) => {
const leftScore = a[4];
const rightScore = b[4];
if (leftScore === rightScore) {
// same score: sort alphabetically
const leftTitle = a[1].toLowerCase();
const rightTitle = b[1].toLowerCase();
if (leftTitle === rightTitle) return 0;
return leftTitle > rightTitle ? -1 : 1; // inverted is intentional
}
return leftScore > rightScore ? 1 : -1;
});
// remove duplicate search results
// note the reversing of results, so that in the case of duplicates, the highest-scoring entry is kept
let seen = new Set();
results = results.reverse().reduce((acc, result) => {
let resultStr = result.slice(0, 4).concat([result[5]]).map(v => String(v)).join(',');
if (!seen.has(resultStr)) {
acc.push(result);
seen.add(resultStr);
}
return acc;
}, []);
results = results.reverse();
// for debugging
//Search.lastresults = results.slice(); // a copy
// console.info("search results:", Search.lastresults);
// print the results
_displayNextItem(results, results.length, searchTerms, highlightTerms);
},
/**
* search for object names
*/
performObjectSearch: (object, objectTerms) => {
const filenames = Search._index.filenames;
const docNames = Search._index.docnames;
const objects = Search._index.objects;
const objNames = Search._index.objnames;
const titles = Search._index.titles;
const results = [];
const objectSearchCallback = (prefix, match) => {
const name = match[4]
const fullname = (prefix ? prefix + "." : "") + name;
const fullnameLower = fullname.toLowerCase();
if (fullnameLower.indexOf(object) < 0) return;
let score = 0;
const parts = fullnameLower.split(".");
// check for different match types: exact matches of full name or
// "last name" (i.e. last dotted part)
if (fullnameLower === object || parts.slice(-1)[0] === object)
score += Scorer.objNameMatch;
else if (parts.slice(-1)[0].indexOf(object) > -1)
score += Scorer.objPartialMatch; // matches in last name
const objName = objNames[match[1]][2];
const title = titles[match[0]];
// If more than one term searched for, we require other words to be
// found in the name/title/description
const otherTerms = new Set(objectTerms);
otherTerms.delete(object);
if (otherTerms.size > 0) {
const haystack = `${prefix} ${name} ${objName} ${title}`.toLowerCase();
if (
[...otherTerms].some((otherTerm) => haystack.indexOf(otherTerm) < 0)
)
return;
}
let anchor = match[3];
if (anchor === "") anchor = fullname;
else if (anchor === "-") anchor = objNames[match[1]][1] + "-" + fullname;
const descr = objName + _(", in ") + title;
// add custom score for some objects according to scorer
if (Scorer.objPrio.hasOwnProperty(match[2]))
score += Scorer.objPrio[match[2]];
else score += Scorer.objPrioDefault;
results.push([
docNames[match[0]],
fullname,
"#" + anchor,
descr,
score,
filenames[match[0]],
]);
};
Object.keys(objects).forEach((prefix) =>
objects[prefix].forEach((array) =>
objectSearchCallback(prefix, array)
)
);
return results;
},
/**
* search for full-text terms in the index
*/
performTermsSearch: (searchTerms, excludedTerms) => {
// prepare search
const terms = Search._index.terms;
const titleTerms = Search._index.titleterms;
const filenames = Search._index.filenames;
const docNames = Search._index.docnames;
const titles = Search._index.titles;
const scoreMap = new Map();
const fileMap = new Map();
// perform the search on the required terms
searchTerms.forEach((word) => {
const files = [];
const arr = [
{ files: terms[word], score: Scorer.term },
{ files: titleTerms[word], score: Scorer.title },
];
// add support for partial matches
if (word.length > 2) {
const escapedWord = _escapeRegExp(word);
Object.keys(terms).forEach((term) => {
if (term.match(escapedWord) && !terms[word])
arr.push({ files: terms[term], score: Scorer.partialTerm });
});
Object.keys(titleTerms).forEach((term) => {
if (term.match(escapedWord) && !titleTerms[word])
arr.push({ files: titleTerms[term], score: Scorer.partialTitle });
});
}
// no match but word was a required one
if (arr.every((record) => record.files === undefined)) return;
// found search word in contents
arr.forEach((record) => {
if (record.files === undefined) return;
let recordFiles = record.files;
if (recordFiles.length === undefined) recordFiles = [recordFiles];
files.push(...recordFiles);
// set score for the word in each file
recordFiles.forEach((file) => {
if (!scoreMap.has(file)) scoreMap.set(file, {});
scoreMap.get(file)[word] = record.score;
});
});
// create the mapping
files.forEach((file) => {
if (!fileMap.has(file)) fileMap.set(file, [word]);
else if (fileMap.get(file).indexOf(word) === -1) fileMap.get(file).push(word);
});
});
// now check if the files don't contain excluded terms
const results = [];
for (const [file, wordList] of fileMap) {
// check if all requirements are matched
// as search terms with length < 3 are discarded
const filteredTermCount = [...searchTerms].filter(
(term) => term.length > 2
).length;
if (
wordList.length !== searchTerms.size &&
wordList.length !== filteredTermCount
)
continue;
// ensure that none of the excluded terms is in the search result
if (
[...excludedTerms].some(
(term) =>
terms[term] === file ||
titleTerms[term] === file ||
(terms[term] || []).includes(file) ||
(titleTerms[term] || []).includes(file)
)
)
break;
// select one (max) score for the file.
const score = Math.max(...wordList.map((w) => scoreMap.get(file)[w]));
// add result to the result list
results.push([
docNames[file],
titles[file],
"",
null,
score,
filenames[file],
]);
}
return results;
},
/**
* helper function to return a node containing the
* search summary for a given text. keywords is a list
* of stemmed words.
*/
makeSearchSummary: (htmlText, keywords) => {
const text = Search.htmlToText(htmlText);
if (text === "") return null;
const textLower = text.toLowerCase();
const actualStartPosition = [...keywords]
.map((k) => textLower.indexOf(k.toLowerCase()))
.filter((i) => i > -1)
.slice(-1)[0];
const startWithContext = Math.max(actualStartPosition - 120, 0);
const top = startWithContext === 0 ? "" : "...";
const tail = startWithContext + 240 < text.length ? "..." : "";
let summary = document.createElement("p");
summary.classList.add("context");
summary.textContent = top + text.substr(startWithContext, 240).trim() + tail;
return summary;
},
};
_ready(Search.init);

View File

@ -0,0 +1,154 @@
/* Highlighting utilities for Sphinx HTML documentation. */
"use strict";
const SPHINX_HIGHLIGHT_ENABLED = true
/**
* highlight a given string on a node by wrapping it in
* span elements with the given class name.
*/
const _highlight = (node, addItems, text, className) => {
if (node.nodeType === Node.TEXT_NODE) {
const val = node.nodeValue;
const parent = node.parentNode;
const pos = val.toLowerCase().indexOf(text);
if (
pos >= 0 &&
!parent.classList.contains(className) &&
!parent.classList.contains("nohighlight")
) {
let span;
const closestNode = parent.closest("body, svg, foreignObject");
const isInSVG = closestNode && closestNode.matches("svg");
if (isInSVG) {
span = document.createElementNS("http://www.w3.org/2000/svg", "tspan");
} else {
span = document.createElement("span");
span.classList.add(className);
}
span.appendChild(document.createTextNode(val.substr(pos, text.length)));
const rest = document.createTextNode(val.substr(pos + text.length));
parent.insertBefore(
span,
parent.insertBefore(
rest,
node.nextSibling
)
);
node.nodeValue = val.substr(0, pos);
/* There may be more occurrences of search term in this node. So call this
* function recursively on the remaining fragment.
*/
_highlight(rest, addItems, text, className);
if (isInSVG) {
const rect = document.createElementNS(
"http://www.w3.org/2000/svg",
"rect"
);
const bbox = parent.getBBox();
rect.x.baseVal.value = bbox.x;
rect.y.baseVal.value = bbox.y;
rect.width.baseVal.value = bbox.width;
rect.height.baseVal.value = bbox.height;
rect.setAttribute("class", className);
addItems.push({ parent: parent, target: rect });
}
}
} else if (node.matches && !node.matches("button, select, textarea")) {
node.childNodes.forEach((el) => _highlight(el, addItems, text, className));
}
};
const _highlightText = (thisNode, text, className) => {
let addItems = [];
_highlight(thisNode, addItems, text, className);
addItems.forEach((obj) =>
obj.parent.insertAdjacentElement("beforebegin", obj.target)
);
};
/**
* Small JavaScript module for the documentation.
*/
const SphinxHighlight = {
/**
* highlight the search words provided in localstorage in the text
*/
highlightSearchWords: () => {
if (!SPHINX_HIGHLIGHT_ENABLED) return; // bail if no highlight
// get and clear terms from localstorage
const url = new URL(window.location);
const highlight =
localStorage.getItem("sphinx_highlight_terms")
|| url.searchParams.get("highlight")
|| "";
localStorage.removeItem("sphinx_highlight_terms")
url.searchParams.delete("highlight");
window.history.replaceState({}, "", url);
// get individual terms from highlight string
const terms = highlight.toLowerCase().split(/\s+/).filter(x => x);
if (terms.length === 0) return; // nothing to do
// There should never be more than one element matching "div.body"
const divBody = document.querySelectorAll("div.body");
const body = divBody.length ? divBody[0] : document.querySelector("body");
window.setTimeout(() => {
terms.forEach((term) => _highlightText(body, term, "highlighted"));
}, 10);
const searchBox = document.getElementById("searchbox");
if (searchBox === null) return;
searchBox.appendChild(
document
.createRange()
.createContextualFragment(
'<p class="highlight-link">' +
'<a href="javascript:SphinxHighlight.hideSearchWords()">' +
_("Hide Search Matches") +
"</a></p>"
)
);
},
/**
* helper function to hide the search marks again
*/
hideSearchWords: () => {
document
.querySelectorAll("#searchbox .highlight-link")
.forEach((el) => el.remove());
document
.querySelectorAll("span.highlighted")
.forEach((el) => el.classList.remove("highlighted"));
localStorage.removeItem("sphinx_highlight_terms")
},
initEscapeListener: () => {
// only install a listener if it is really needed
if (!DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS) return;
document.addEventListener("keydown", (event) => {
// bail for input elements
if (BLACKLISTED_KEY_CONTROL_ELEMENTS.has(document.activeElement.tagName)) return;
// bail with special keys
if (event.shiftKey || event.altKey || event.ctrlKey || event.metaKey) return;
if (DOCUMENTATION_OPTIONS.ENABLE_SEARCH_SHORTCUTS && (event.key === "Escape")) {
SphinxHighlight.hideSearchWords();
event.preventDefault();
}
});
},
};
_ready(() => {
/* Do not call highlightSearchWords() when we are on the search page.
* It will highlight words from the *previous* search query.
*/
if (typeof Search === "undefined") SphinxHighlight.highlightSearchWords();
SphinxHighlight.initEscapeListener();
});

281
docs/html/genindex.html Normal file
View File

@ -0,0 +1,281 @@
<!DOCTYPE html>
<html lang="en" data-content_root="./">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Index &#8212; whisper_live documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=4f649999" />
<link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=039e1c02" />
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<link rel="index" title="Index" href="#" />
<link rel="search" title="Search" href="search.html" />
<link rel="stylesheet" href="_static/custom.css" type="text/css" />
<meta name="viewport" content="width=device-width, initial-scale=0.9, maximum-scale=0.9" />
</head><body>
<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">
<div class="body" role="main">
<h1 id="index">Index</h1>
<div class="genindex-jumpbox">
<a href="#A"><strong>A</strong></a>
| <a href="#B"><strong>B</strong></a>
| <a href="#C"><strong>C</strong></a>
| <a href="#D"><strong>D</strong></a>
| <a href="#F"><strong>F</strong></a>
| <a href="#G"><strong>G</strong></a>
| <a href="#M"><strong>M</strong></a>
| <a href="#O"><strong>O</strong></a>
| <a href="#P"><strong>P</strong></a>
| <a href="#R"><strong>R</strong></a>
| <a href="#S"><strong>S</strong></a>
| <a href="#T"><strong>T</strong></a>
| <a href="#U"><strong>U</strong></a>
| <a href="#W"><strong>W</strong></a>
</div>
<h2 id="A">A</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.server.ServeClient.add_frames">add_frames() (whisper_live.server.ServeClient method)</a>
</li>
</ul></td>
</tr></table>
<h2 id="B">B</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.client.Client.bytes_to_float_array">bytes_to_float_array() (whisper_live.client.Client static method)</a>
</li>
</ul></td>
</tr></table>
<h2 id="C">C</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.server.ServeClient.cleanup">cleanup() (whisper_live.server.ServeClient method)</a>
</li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.client.Client">Client (class in whisper_live.client)</a>
</li>
<li><a href="index.html#whisper_live.client.Client.close_websocket">close_websocket() (whisper_live.client.Client method)</a>
</li>
</ul></td>
</tr></table>
<h2 id="D">D</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.server.ServeClient.disconnect">disconnect() (whisper_live.server.ServeClient method)</a>
</li>
</ul></td>
</tr></table>
<h2 id="F">F</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.server.ServeClient.fill_output">fill_output() (whisper_live.server.ServeClient method)</a>
</li>
</ul></td>
</tr></table>
<h2 id="G">G</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.client.Client.get_client_socket">get_client_socket() (whisper_live.client.Client method)</a>
</li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.server.TranscriptionServer.get_wait_time">get_wait_time() (whisper_live.server.TranscriptionServer method)</a>
</li>
</ul></td>
</tr></table>
<h2 id="M">M</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li>
module
<ul>
<li><a href="index.html#module-whisper_live.client">whisper_live.client</a>
</li>
<li><a href="index.html#module-whisper_live.server">whisper_live.server</a>
</li>
</ul></li>
</ul></td>
</tr></table>
<h2 id="O">O</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.client.Client.on_message">on_message() (whisper_live.client.Client method)</a>
</li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.client.Client.on_open">on_open() (whisper_live.client.Client method)</a>
</li>
</ul></td>
</tr></table>
<h2 id="P">P</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.client.Client.play_file">play_file() (whisper_live.client.Client method)</a>
</li>
</ul></td>
</tr></table>
<h2 id="R">R</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.client.Client.record">record() (whisper_live.client.Client method)</a>
</li>
<li><a href="index.html#whisper_live.server.TranscriptionServer.recv_audio">recv_audio() (whisper_live.server.TranscriptionServer method)</a>
</li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.client.resample">resample() (in module whisper_live.client)</a>
</li>
<li><a href="index.html#whisper_live.server.TranscriptionServer.run">run() (whisper_live.server.TranscriptionServer method)</a>
</li>
</ul></td>
</tr></table>
<h2 id="S">S</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.client.Client.send_packet_to_server">send_packet_to_server() (whisper_live.client.Client method)</a>
</li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.server.ServeClient">ServeClient (class in whisper_live.server)</a>
</li>
<li><a href="index.html#whisper_live.server.ServeClient.speech_to_text">speech_to_text() (whisper_live.server.ServeClient method)</a>
</li>
</ul></td>
</tr></table>
<h2 id="T">T</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.client.TranscriptionClient">TranscriptionClient (class in whisper_live.client)</a>
</li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.server.TranscriptionServer">TranscriptionServer (class in whisper_live.server)</a>
</li>
</ul></td>
</tr></table>
<h2 id="U">U</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.server.ServeClient.update_segments">update_segments() (whisper_live.server.ServeClient method)</a>
</li>
</ul></td>
</tr></table>
<h2 id="W">W</h2>
<table style="width: 100%" class="indextable genindextable"><tr>
<td style="width: 33%; vertical-align: top;"><ul>
<li>
whisper_live.client
<ul>
<li><a href="index.html#module-whisper_live.client">module</a>
</li>
</ul></li>
<li>
whisper_live.server
<ul>
<li><a href="index.html#module-whisper_live.server">module</a>
</li>
</ul></li>
</ul></td>
<td style="width: 33%; vertical-align: top;"><ul>
<li><a href="index.html#whisper_live.client.Client.write_audio_frames_to_file">write_audio_frames_to_file() (whisper_live.client.Client method)</a>
</li>
<li><a href="index.html#whisper_live.client.Client.write_output_recording">write_output_recording() (whisper_live.client.Client method)</a>
</li>
</ul></td>
</tr></table>
</div>
</div>
</div>
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
<div class="sphinxsidebarwrapper">
<h1 class="logo"><a href="index.html">whisper_live</a></h1>
<h3>Navigation</h3>
<div class="relations">
<h3>Related Topics</h3>
<ul>
<li><a href="index.html">Documentation overview</a><ul>
</ul></li>
</ul>
</div>
<div id="searchbox" style="display: none" role="search">
<h3 id="searchlabel">Quick search</h3>
<div class="searchformwrapper">
<form class="search" action="search.html" method="get">
<input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
<input type="submit" value="Go" />
</form>
</div>
</div>
<script>document.getElementById('searchbox').style.display = "block"</script>
</div>
</div>
<div class="clearer"></div>
</div>
<div class="footer">
&copy;2023, Collabora.
|
Powered by <a href="http://sphinx-doc.org/">Sphinx 7.2.6</a>
&amp; <a href="https://github.com/bitprophet/alabaster">Alabaster 0.7.13</a>
</div>
</body>
</html>

468
docs/html/index.html Normal file
View File

@ -0,0 +1,468 @@
<!DOCTYPE html>
<html lang="en" data-content_root="./">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<title>Welcome to Whisper Live documentation! &#8212; whisper_live documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=4f649999" />
<link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=039e1c02" />
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="stylesheet" href="_static/custom.css" type="text/css" />
<meta name="viewport" content="width=device-width, initial-scale=0.9, maximum-scale=0.9" />
</head><body>
<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">
<div class="body" role="main">
<section id="welcome-to-whisper-live-documentation">
<h1>Welcome to Whisper Live documentation!<a class="headerlink" href="#welcome-to-whisper-live-documentation" title="Link to this heading"></a></h1>
<div class="toctree-wrapper compound">
</div>
<dl class="py class" id="module-whisper_live.server">
<dt class="sig sig-object py" id="whisper_live.server.ServeClient">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">whisper_live.server.</span></span><span class="sig-name descname"><span class="pre">ServeClient</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">websocket</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">task</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'transcribe'</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">device</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">multilingual</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">language</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">client_uid</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.server.ServeClient" title="Link to this definition"></a></dt>
<dd><dl class="simple">
<dt>Attributes:</dt><dd><p>RATE (int): The audio sampling rate (constant) set to 16000.
SERVER_READY (str): A constant message indicating that the server is ready.
DISCONNECT (str): A constant message indicating that the client should disconnect.
client_uid (str): A unique identifier for the client.
data (bytes): Accumulated audio data.
frames (bytes): Accumulated audio frames.
language (str): The language for transcription.
task (str): The task type, e.g., “transcribe.”
transcriber (WhisperModel): The Whisper model for speech-to-text.
timestamp_offset (float): The offset in audio timestamps.
frames_np (numpy.ndarray): NumPy array to store audio frames.
frames_offset (float): The offset in audio frames.
text (list): List of transcribed text segments.
current_out (str): The current incomplete transcription.
prev_out (str): The previous incomplete transcription.
t_start (float): Timestamp for the start of transcription.
exit (bool): A flag to exit the transcription thread.
same_output_threshold (int): Threshold for consecutive same output segments.
show_prev_out_thresh (int): Threshold for showing previous output segments.
add_pause_thresh (int): Threshold for adding a pause (blank) segment.
transcript (list): List of transcribed segments.
send_last_n_segments (int): Number of last segments to send to the client.
wrapper (textwrap.TextWrapper): Text wrapper for formatting text.
pick_previous_segments (int): Number of previous segments to include in the output.
websocket: The WebSocket connection for the client.</p>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.server.ServeClient.add_frames">
<span class="sig-name descname"><span class="pre">add_frames</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">frame_np</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.server.ServeClient.add_frames" title="Link to this definition"></a></dt>
<dd><p>Add audio frames to the ongoing audio stream buffer.</p>
<p>This method is responsible for maintaining the audio stream buffer, allowing the continuous addition
of audio frames as they are received. It also ensures that the buffer does not exceed a specified size
to prevent excessive memory usage.</p>
<p>If the buffer size exceeds a threshold (45 seconds of audio data), it discards the oldest 30 seconds
of audio data to maintain a reasonable buffer size. If the buffer is empty, it initializes it with the provided
audio frame. The audio stream buffer is used for real-time processing of audio data for transcription.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>frame_np (numpy.ndarray): The audio frame data as a NumPy array.</p>
</dd>
</dl>
</dd></dl>
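A minimal sketch of the buffer policy described above, assuming NumPy and the documented 16 kHz rate; the helper name and the tuple-style return are illustrative, not the actual method:
import numpy as np

RATE = 16000  # sampling rate documented for ServeClient

def add_frames_sketch(frames_np, frame_np, frames_offset):
    # Keep roughly 45 s of audio; when exceeded, drop the oldest 30 s and
    # advance the frame offset accordingly.
    if frames_np is not None and frames_np.shape[0] > 45 * RATE:
        frames_offset += 30.0
        frames_np = frames_np[int(30 * RATE):]
    if frames_np is None:
        frames_np = frame_np.copy()          # first frame initialises the buffer
    else:
        frames_np = np.concatenate((frames_np, frame_np))
    return frames_np, frames_offset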
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.server.ServeClient.cleanup">
<span class="sig-name descname"><span class="pre">cleanup</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.server.ServeClient.cleanup" title="Link to this definition"></a></dt>
<dd><p>Perform cleanup tasks before exiting the transcription service.</p>
<p>This method performs necessary cleanup tasks, including stopping the transcription thread, marking
the exit flag to indicate the transcription thread should exit gracefully, and destroying resources
associated with the transcription process.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.server.ServeClient.disconnect">
<span class="sig-name descname"><span class="pre">disconnect</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.server.ServeClient.disconnect" title="Link to this definition"></a></dt>
<dd><p>Notify the client of disconnection and send a disconnect message.</p>
<p>This method sends a disconnect message to the client via the WebSocket connection to notify them
that the transcription service is disconnecting gracefully.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.server.ServeClient.fill_output">
<span class="sig-name descname"><span class="pre">fill_output</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">output</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.server.ServeClient.fill_output" title="Link to this definition"></a></dt>
<dd><p>Format the current incomplete transcription output by combining it with previous complete segments.
The resulting transcription is wrapped into two lines, each containing a maximum of 50 characters.</p>
<p>It ensures that the combined transcription fits within two lines, with a maximum of 50 characters per line.
Segments are concatenated in the order they exist in the list of previous segments, with the most
recent complete segment first and older segments prepended as needed to maintain the character limit.
If a 3-second pause is detected in the previous segments, any text preceding it is discarded to ensure
the transcription starts with the most recent complete content. The resulting transcription is returned
as a single string.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>output(str): The current incomplete transcription segment.</p>
</dd>
<dt>Returns:</dt><dd><p>str: A formatted transcription wrapped in two lines.</p>
</dd>
</dl>
</dd></dl>
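The two-line, 50-character wrapping described above can be approximated with textwrap; this is an illustrative sketch that omits the 3-second pause handling, not the method itself:
import textwrap

_wrapper = textwrap.TextWrapper(width=50)  # maximum of 50 characters per line

def fill_output_sketch(previous_segments, output):
    # Combine previous complete segments with the current partial output and
    # keep only the last two wrapped lines.
    text = " ".join(previous_segments + [output]).strip()
    lines = _wrapper.wrap(text=text)
    return "\n".join(lines[-2:])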
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.server.ServeClient.speech_to_text">
<span class="sig-name descname"><span class="pre">speech_to_text</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.server.ServeClient.speech_to_text" title="Link to this definition"></a></dt>
<dd><p>Process an audio stream in an infinite loop, continuously transcribing the speech.</p>
<p>This method continuously receives audio frames, performs real-time transcription, and sends
transcribed segments to the client via a WebSocket connection.</p>
<p>If the client's language is not detected, it waits for 30 seconds of audio input to make a language prediction.
It utilizes the Whisper ASR model to transcribe the audio, continuously processing and streaming results. Segments
are sent to the client in real-time, and a history of segments is maintained to provide context. Pauses in speech
(no output from Whisper) are handled by showing the previous output for a set duration. A blank segment is added if
there is no speech for a specified duration to indicate a pause.</p>
<dl class="simple">
<dt>Raises:</dt><dd><p>Exception: If there is an issue with audio processing or WebSocket communication.</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.server.ServeClient.update_segments">
<span class="sig-name descname"><span class="pre">update_segments</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">segments</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">duration</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.server.ServeClient.update_segments" title="Link to this definition"></a></dt>
<dd><p>Processes the segments from Whisper. Appends all the segments to the list
except for the last one, which is assumed to be incomplete.</p>
<p>Updates the ongoing transcript with transcribed segments, including their start and end times.
Complete segments are appended to the transcript in chronological order. Incomplete segments
(assumed to be the last one) are processed to identify repeated content. If the same incomplete
segment is seen multiple times, it updates the offset and appends the segment to the transcript.
A threshold is used to detect repeated content and ensure it is only included once in the transcript.
The timestamp offset is updated based on the duration of processed segments. The method returns the
last processed segment, allowing it to be sent to the client for real-time updates.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>segments(dict) : dictionary of segments as returned by whisper
duration(float): duration of the current chunk</p>
</dd>
<dt>Returns:</dt><dd><dl class="simple">
<dt>dict or None: The last processed segment with its start time, end time, and transcribed text.</dt><dd><p>Returns None if there are no valid segments to process.</p>
</dd>
</dl>
</dd>
</dl>
</dd></dl>
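As a rough illustration of the bookkeeping described above: the repetition threshold and the chunk duration are omitted, each segment is treated as a mapping with start, end and text keys (an assumption), and the offset update shown here is one plausible reading of "based on the duration of processed segments":
def update_segments_sketch(segments, timestamp_offset):
    # Append every segment except the last one, which is assumed incomplete,
    # then advance the running timestamp offset past the completed segments.
    transcript = []
    for s in segments[:-1]:
        transcript.append({
            "start": timestamp_offset + s["start"],
            "end": timestamp_offset + s["end"],
            "text": s["text"],
        })
    if len(segments) > 1:
        timestamp_offset += segments[-2]["end"]
    last_segment = segments[-1] if segments else None
    return transcript, last_segment, timestamp_offset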
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="whisper_live.server.TranscriptionServer">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">whisper_live.server.</span></span><span class="sig-name descname"><span class="pre">TranscriptionServer</span></span><a class="headerlink" href="#whisper_live.server.TranscriptionServer" title="Link to this definition"></a></dt>
<dd><p>Represents a transcription server that handles incoming audio from clients.</p>
<dl class="simple">
<dt>Attributes:</dt><dd><p>RATE (int): The audio sampling rate (constant) set to 16000.
vad_model (torch.Module): The voice activity detection model.
vad_threshold (float): The voice activity detection threshold.
clients (dict): A dictionary to store connected clients.
websockets (dict): A dictionary to store WebSocket connections.
clients_start_time (dict): A dictionary to track client start times.
max_clients (int): Maximum allowed connected clients.
max_connection_time (int): Maximum allowed connection time in seconds.</p>
</dd>
</dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.server.TranscriptionServer.get_wait_time">
<span class="sig-name descname"><span class="pre">get_wait_time</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.server.TranscriptionServer.get_wait_time" title="Link to this definition"></a></dt>
<dd><p>Calculate and return the estimated wait time for clients.</p>
<dl class="simple">
<dt>Returns:</dt><dd><p>float: The estimated wait time in minutes.</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.server.TranscriptionServer.recv_audio">
<span class="sig-name descname"><span class="pre">recv_audio</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">websocket</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.server.TranscriptionServer.recv_audio" title="Link to this definition"></a></dt>
<dd><p>Receive audio chunks from a client in an infinite loop.</p>
<p>Continuously receives audio frames from a connected client
over a WebSocket connection. It processes the audio frames using a
voice activity detection (VAD) model to determine if they contain speech
or not. If the audio frame contains speech, it is added to the client's
audio data for ASR.
If the maximum number of clients is reached, the method sends a
“WAIT” status to the client, indicating that they should wait
until a slot is available.
If a client's connection exceeds the maximum allowed time, it will
be disconnected, and the client's resources will be cleaned up.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>websocket (WebSocket): The WebSocket connection for the client.</p>
</dd>
<dt>Raises:</dt><dd><p>Exception: If there is an error during the audio frame processing.</p>
</dd>
</dl>
</dd></dl>
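The back-pressure path described above ("WAIT" plus an estimated wait time from get_wait_time()) might be sketched as follows; the JSON field names are assumptions, not taken from the source:
import json

def maybe_defer_client_sketch(websocket, n_clients, max_clients, wait_minutes):
    # Hypothetical shape of the server's WAIT message when all slots are taken.
    if n_clients >= max_clients:
        websocket.send(json.dumps({"status": "WAIT", "message": wait_minutes}))
        return True   # caller should not start transcription for this connection yet
    return False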
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.server.TranscriptionServer.run">
<span class="sig-name descname"><span class="pre">run</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">host</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">port</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">9090</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.server.TranscriptionServer.run" title="Link to this definition"></a></dt>
<dd><p>Run the transcription server.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>host (str): The host address to bind the server.
port (int): The port number to bind the server.</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
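Based only on the signatures shown above, starting the server could look like the following; the host and port values are placeholders:
from whisper_live.server import TranscriptionServer

server = TranscriptionServer()
server.run("0.0.0.0", port=9090)   # 9090 is the documented default port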
<dl class="py class" id="module-whisper_live.client">
<dt class="sig sig-object py" id="whisper_live.client.Client">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">whisper_live.client.</span></span><span class="sig-name descname"><span class="pre">Client</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">host</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">port</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">is_multilingual</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lang</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">translate</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.client.Client" title="Link to this definition"></a></dt>
<dd><p>Handles audio recording, streaming, and communication with a server using WebSocket.</p>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.client.Client.bytes_to_float_array">
<em class="property"><span class="pre">static</span><span class="w"> </span></em><span class="sig-name descname"><span class="pre">bytes_to_float_array</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">audio_bytes</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.client.Client.bytes_to_float_array" title="Link to this definition"></a></dt>
<dd><p>Convert audio data from bytes to a NumPy float array.</p>
<p>It assumes that the audio data is in 16-bit PCM format. The audio data is normalized to
have values between -1 and 1.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>audio_bytes (bytes): Audio data in bytes.</p>
</dd>
<dt>Returns:</dt><dd><p>np.ndarray: A NumPy array containing the audio data as float values normalized between -1 and 1.</p>
</dd>
</dl>
</dd></dl>
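A minimal NumPy equivalent of the conversion described above, assuming little-endian signed 16-bit PCM input:
import numpy as np

def bytes_to_float_array_sketch(audio_bytes):
    samples = np.frombuffer(audio_bytes, dtype=np.int16)
    return samples.astype(np.float32) / 32768.0   # scale into [-1.0, 1.0)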
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.client.Client.close_websocket">
<span class="sig-name descname"><span class="pre">close_websocket</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.client.Client.close_websocket" title="Link to this definition"></a></dt>
<dd><p>Close the WebSocket connection and join the WebSocket thread.</p>
<p>First attempts to close the WebSocket connection using <cite>self.client_socket.close()</cite>. After
closing the connection, it joins the WebSocket thread to ensure proper termination.</p>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.client.Client.get_client_socket">
<span class="sig-name descname"><span class="pre">get_client_socket</span></span><span class="sig-paren">(</span><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.client.Client.get_client_socket" title="Link to this definition"></a></dt>
<dd><p>Get the WebSocket client socket instance.</p>
<dl class="simple">
<dt>Returns:</dt><dd><p>WebSocketApp: The WebSocket client socket instance currently in use by the client.</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.client.Client.on_message">
<span class="sig-name descname"><span class="pre">on_message</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">ws</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">message</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.client.Client.on_message" title="Link to this definition"></a></dt>
<dd><p>Callback function called when a message is received from the server.</p>
<p>It updates various attributes of the client based on the received message, including
recording status, language detection, and server messages. If a disconnect message
is received, it sets the recording status to False.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>ws (websocket.WebSocketApp): The WebSocket client instance.
message (str): The received message from the server.</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.client.Client.on_open">
<span class="sig-name descname"><span class="pre">on_open</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">ws</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.client.Client.on_open" title="Link to this definition"></a></dt>
<dd><p>Callback function called when the WebSocket connection is successfully opened.</p>
<p>Sends an initial configuration message to the server, including client UID, multilingual mode,
language selection, and task type.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>ws (websocket.WebSocketApp): The WebSocket client instance.</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.client.Client.play_file">
<span class="sig-name descname"><span class="pre">play_file</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">filename</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.client.Client.play_file" title="Link to this definition"></a></dt>
<dd><p>Play an audio file and send it to the server for processing.</p>
<p>Reads an audio file, plays it through the audio output, and simultaneously sends
the audio data to the server for processing. It uses PyAudio to create an audio
stream for playback. The audio data is read from the file in chunks, converted to
floating-point format, and sent to the server using WebSocket communication.
This method is typically used to stream pre-recorded audio
to the server in real time.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>filename (str): The path to the audio file to be played and sent to the server.</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.client.Client.record">
<span class="sig-name descname"><span class="pre">record</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">out_file</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">'output_recording.wav'</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.client.Client.record" title="Link to this definition"></a></dt>
<dd><p>Record audio data from the input stream and save it to a WAV file.</p>
<p>Continuously records audio data from the input stream, sends it to the server via a WebSocket
connection, and simultaneously saves it to multiple WAV files in chunks. It stops recording when
the <cite>RECORD_SECONDS</cite> duration is reached or when the <cite>RECORDING</cite> flag is set to <cite>False</cite>.</p>
<p>Audio data is saved in chunks to the “chunks” directory, with each chunk stored as a separate WAV file.
The recording can be interrupted with a KeyboardInterrupt (e.g., pressing Ctrl+C). After recording,
the method combines all the saved audio chunks into the specified <cite>out_file</cite>.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>out_file (str, optional): The name of the output WAV file to save the entire recording. Default is “output_recording.wav”.</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.client.Client.send_packet_to_server">
<span class="sig-name descname"><span class="pre">send_packet_to_server</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">message</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.client.Client.send_packet_to_server" title="Link to this definition"></a></dt>
<dd><p>Send an audio packet to the server using WebSocket.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>message (bytes): The audio data packet in bytes to be sent to the server.</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.client.Client.write_audio_frames_to_file">
<span class="sig-name descname"><span class="pre">write_audio_frames_to_file</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">frames</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">file_name</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.client.Client.write_audio_frames_to_file" title="Link to this definition"></a></dt>
<dd><p>Write audio frames to a WAV file.</p>
<p>The WAV file is created or overwritten with the specified name. The audio frames should be
in the correct format and match the specified channel, sample width, and sample rate.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>frames (bytes): The audio frames to be written to the file.
file_name (str): The name of the WAV file to which the frames will be written.</p>
</dd>
</dl>
</dd></dl>
<dl class="py method">
<dt class="sig sig-object py" id="whisper_live.client.Client.write_output_recording">
<span class="sig-name descname"><span class="pre">write_output_recording</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">n_audio_file</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">out_file</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.client.Client.write_output_recording" title="Link to this definition"></a></dt>
<dd><p>Combine and save recorded audio chunks into a single WAV file.</p>
<p>The individual audio chunk files are expected to be located in the “chunks” directory. Reads each chunk
file, appends its audio data to the final recording, and then deletes the chunk file. After combining
and saving, the final recording is stored in the specified <cite>out_file</cite>.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>n_audio_file (int): The number of audio chunk files to combine.
out_file (str): The name of the output WAV file to save the final recording.</p>
</dd>
</dl>
</dd></dl>
</dd></dl>
<dl class="py class">
<dt class="sig sig-object py" id="whisper_live.client.TranscriptionClient">
<em class="property"><span class="pre">class</span><span class="w"> </span></em><span class="sig-prename descclassname"><span class="pre">whisper_live.client.</span></span><span class="sig-name descname"><span class="pre">TranscriptionClient</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">host</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">port</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">is_multilingual</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">lang</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">None</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">translate</span></span><span class="o"><span class="pre">=</span></span><span class="default_value"><span class="pre">False</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.client.TranscriptionClient" title="Link to this definition"></a></dt>
<dd><p>Client for handling audio transcription tasks via a WebSocket connection.</p>
<p>Acts as a high-level client for audio transcription tasks using a WebSocket connection. It can be used
to send audio data for transcription to a server and receive transcribed text segments.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>host (str): The hostname or IP address of the server.
port (int): The port number to connect to on the server.
is_multilingual (bool, optional): Indicates whether the transcription should support multiple languages (default is False).
lang (str, optional): The primary language for transcription (used if <cite>is_multilingual</cite> is False). Defaults to None, in which case English (en) is used.
translate (bool, optional): Indicates whether translation tasks are required (default is False).</p>
</dd>
<dt>Attributes:</dt><dd><p>client (Client): An instance of the underlying Client class responsible for handling the WebSocket connection.</p>
</dd>
<dt>Example:</dt><dd><p>To create a TranscriptionClient and start transcription on microphone audio:</p>
<div class="highlight"><pre>transcription_client = TranscriptionClient(host="localhost", port=9090, is_multilingual=True)
transcription_client()</pre></div>
</dd>
</dl>
</dd></dl>
<dl class="py function">
<dt class="sig sig-object py" id="whisper_live.client.resample">
<span class="sig-prename descclassname"><span class="pre">whisper_live.client.</span></span><span class="sig-name descname"><span class="pre">resample</span></span><span class="sig-paren">(</span><em class="sig-param"><span class="n"><span class="pre">file</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">str</span></span></em>, <em class="sig-param"><span class="n"><span class="pre">sr</span></span><span class="p"><span class="pre">:</span></span><span class="w"> </span><span class="n"><span class="pre">int</span></span><span class="w"> </span><span class="o"><span class="pre">=</span></span><span class="w"> </span><span class="default_value"><span class="pre">16000</span></span></em><span class="sig-paren">)</span><a class="headerlink" href="#whisper_live.client.resample" title="Link to this definition"></a></dt>
<dd><p># <a class="reference external" href="https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/audio.py#L22">https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/audio.py#L22</a>
Open an audio file, read it as a mono waveform, resample it if necessary, and
save the resampled audio.</p>
<dl class="simple">
<dt>Args:</dt><dd><p>file (str): The audio file to open
sr (int): The sample rate to resample the audio if necessary</p>
</dd>
<dt>Returns:</dt><dd><p>resampled_file (str): The resampled audio file</p>
</dd>
</dl>
</dd></dl>
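<p>A hedged usage sketch (the input file name below is only an example):</p>
<div class="highlight"><pre>from whisper_live.client import resample

# Returns the path of a mono 16 kHz copy of the input audio.
resampled_path = resample("interview.mp3", sr=16000)</pre></div>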
</section>
<section id="indices-and-tables">
<h1>Indices and tables<a class="headerlink" href="#indices-and-tables" title="Link to this heading"></a></h1>
<ul class="simple">
<li><p><a class="reference internal" href="genindex.html"><span class="std std-ref">Index</span></a></p></li>
<li><p><a class="reference internal" href="py-modindex.html"><span class="std std-ref">Module Index</span></a></p></li>
<li><p><a class="reference internal" href="search.html"><span class="std std-ref">Search Page</span></a></p></li>
</ul>
</section>
</div>
</div>
</div>
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
<div class="sphinxsidebarwrapper">
<h1 class="logo"><a href="#">whisper_live</a></h1>
<h3>Navigation</h3>
<div class="relations">
<h3>Related Topics</h3>
<ul>
<li><a href="#">Documentation overview</a><ul>
</ul></li>
</ul>
</div>
<div id="searchbox" style="display: none" role="search">
<h3 id="searchlabel">Quick search</h3>
<div class="searchformwrapper">
<form class="search" action="search.html" method="get">
<input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
<input type="submit" value="Go" />
</form>
</div>
</div>
<script>document.getElementById('searchbox').style.display = "block"</script>
</div>
</div>
<div class="clearer"></div>
</div>
<div class="footer">
&copy;2023, Collabora.
|
Powered by <a href="http://sphinx-doc.org/">Sphinx 7.2.6</a>
&amp; <a href="https://github.com/bitprophet/alabaster">Alabaster 0.7.13</a>
|
<a href="_sources/index.rst.txt"
rel="nofollow">Page source</a>
</div>
</body>
</html>

5
docs/html/objects.inv Normal file
View File

@ -0,0 +1,5 @@
# Sphinx inventory version 2
# Project: whisper_live
# Version:
# The remainder of this file is compressed using zlib.
(remainder of file is zlib-compressed binary data, not shown)

123
docs/html/py-modindex.html Normal file
View File

@ -0,0 +1,123 @@
<!DOCTYPE html>
<html lang="en" data-content_root="./">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Python Module Index &#8212; whisper_live documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=4f649999" />
<link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=039e1c02" />
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="stylesheet" href="_static/custom.css" type="text/css" />
<meta name="viewport" content="width=device-width, initial-scale=0.9, maximum-scale=0.9" />
</head><body>
<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">
<div class="body" role="main">
<h1>Python Module Index</h1>
<div class="modindex-jumpbox">
<a href="#cap-w"><strong>w</strong></a>
</div>
<table class="indextable modindextable">
<tr class="pcap"><td></td><td>&#160;</td><td></td></tr>
<tr class="cap" id="cap-w"><td></td><td>
<strong>w</strong></td><td></td></tr>
<tr>
<td><img src="_static/minus.png" class="toggler"
id="toggle-1" style="display: none" alt="-" /></td>
<td>
<code class="xref">whisper_live</code></td><td>
<em></em></td></tr>
<tr class="cg-1">
<td></td>
<td>&#160;&#160;&#160;
<a href="index.html#module-whisper_live.client"><code class="xref">whisper_live.client</code></a></td><td>
<em></em></td></tr>
<tr class="cg-1">
<td></td>
<td>&#160;&#160;&#160;
<a href="index.html#module-whisper_live.server"><code class="xref">whisper_live.server</code></a></td><td>
<em></em></td></tr>
</table>
</div>
</div>
</div>
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
<div class="sphinxsidebarwrapper">
<h1 class="logo"><a href="index.html">whisper_live</a></h1>
<h3>Navigation</h3>
<div class="relations">
<h3>Related Topics</h3>
<ul>
<li><a href="index.html">Documentation overview</a><ul>
</ul></li>
</ul>
</div>
<div id="searchbox" style="display: none" role="search">
<h3 id="searchlabel">Quick search</h3>
<div class="searchformwrapper">
<form class="search" action="search.html" method="get">
<input type="text" name="q" aria-labelledby="searchlabel" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
<input type="submit" value="Go" />
</form>
</div>
</div>
<script>document.getElementById('searchbox').style.display = "block"</script>
</div>
</div>
<div class="clearer"></div>
</div>
<div class="footer">
&copy;2023, Collabora.
|
Powered by <a href="http://sphinx-doc.org/">Sphinx 7.2.6</a>
&amp; <a href="https://github.com/bitprophet/alabaster">Alabaster 0.7.13</a>
</div>
</body>
</html>

117
docs/html/search.html Normal file
View File

@ -0,0 +1,117 @@
<!DOCTYPE html>
<html lang="en" data-content_root="./">
<head>
<meta charset="utf-8" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Search &#8212; whisper_live documentation</title>
<link rel="stylesheet" type="text/css" href="_static/pygments.css?v=4f649999" />
<link rel="stylesheet" type="text/css" href="_static/alabaster.css?v=039e1c02" />
<script src="_static/documentation_options.js?v=5929fcd5"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="_static/searchtools.js"></script>
<script src="_static/language_data.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="#" />
<script src="searchindex.js" defer></script>
<link rel="stylesheet" href="_static/custom.css" type="text/css" />
<meta name="viewport" content="width=device-width, initial-scale=0.9, maximum-scale=0.9" />
</head><body>
<div class="document">
<div class="documentwrapper">
<div class="bodywrapper">
<div class="body" role="main">
<h1 id="search-documentation">Search</h1>
<noscript>
<div class="admonition warning">
<p>
Please activate JavaScript to enable the search
functionality.
</p>
</div>
</noscript>
<p>
Searching for multiple words only shows matches that contain
all words.
</p>
<form action="" method="get">
<input type="text" name="q" aria-labelledby="search-documentation" value="" autocomplete="off" autocorrect="off" autocapitalize="off" spellcheck="false"/>
<input type="submit" value="search" />
<span id="search-progress" style="padding-left: 10px"></span>
</form>
<div id="search-results">
</div>
</div>
</div>
</div>
<div class="sphinxsidebar" role="navigation" aria-label="main navigation">
<div class="sphinxsidebarwrapper">
<h1 class="logo"><a href="index.html">whisper_live</a></h1>
<h3>Navigation</h3>
<div class="relations">
<h3>Related Topics</h3>
<ul>
<li><a href="index.html">Documentation overview</a><ul>
</ul></li>
</ul>
</div>
</div>
</div>
<div class="clearer"></div>
</div>
<div class="footer">
&copy;2023, Collabora.
|
Powered by <a href="http://sphinx-doc.org/">Sphinx 7.2.6</a>
&amp; <a href="https://github.com/bitprophet/alabaster">Alabaster 0.7.13</a>
</div>
</body>
</html>

1
docs/html/searchindex.js Normal file

File diff suppressed because one or more lines are too long

1
docs/index.html Normal file
View File

@ -0,0 +1 @@
<meta http-equiv="refresh" content="0; url=./html/index.html" />

13
requirements/server.txt Normal file
View File

@ -0,0 +1,13 @@
faster-whisper==1.1.0
websockets
onnxruntime==1.17.0
numba
kaldialign
soundfile
scipy
av
jiwer
evaluate
numpy<2
openai-whisper==20240930
tokenizers==0.20.3

84
run_server.py Normal file
View File

@ -0,0 +1,84 @@
import argparse
import ssl
import os
import socket
def check_port_availability(port):
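    """Return True if nothing is currently accepting connections on the given port (best-effort check)."""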
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
result = sock.connect_ex(('0.0.0.0', port))
sock.close()
return result != 0
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--port', '-p',
type=int,
default=int(os.getenv('PORT_WHISPERLIVE', '9090')),
help="Websocket port to run the server on.")
parser.add_argument('--backend', '-b',
type=str,
default='faster_whisper',
help='Backends from ["tensorrt", "faster_whisper"]')
parser.add_argument('--faster_whisper_custom_model_path', '-fw',
type=str, default=None,
help="Custom Faster Whisper Model")
parser.add_argument('--trt_model_path', '-trt',
type=str,
default=None,
help='Whisper TensorRT model path')
parser.add_argument('--trt_multilingual', '-m',
action="store_true",
help='Boolean only for TensorRT model. True if multilingual.')
parser.add_argument('--ssl_cert_path', '-ssl',
type=str,
default=None,
help='Path to cert.pem and key.pem if ssl should be used.')
parser.add_argument('--omp_num_threads', '-omp',
type=int,
default=1,
help="Number of threads to use for OpenMP")
parser.add_argument('--no_single_model', '-nsm',
action='store_true',
help='Set this if every connection should instantiate its own model. Only relevant for custom model, passed using -trt or -fw.')
args = parser.parse_args()
if args.backend == "tensorrt":
if args.trt_model_path is None:
raise ValueError("Please Provide a valid tensorrt model path")
port = args.port
if not check_port_availability(port):
print(f"Warning: Port {port} might already be in use!")
ssl_context = None
if args.ssl_cert_path is not None:
try:
ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
ssl_context.load_cert_chain(
certfile=f"{args.ssl_cert_path}/cert.pem",
keyfile=f"{args.ssl_cert_path}/privkey.pem"
)
print("SSL context created successfully")
except Exception as e:
print(f"Failed to load SSL certificates: {str(e)}")
raise
if "OMP_NUM_THREADS" not in os.environ:
print(f"Setting OMP_NUM_THREADS to {args.omp_num_threads}")
os.environ["OMP_NUM_THREADS"] = str(args.omp_num_threads)
from whisper_live.server import TranscriptionServer
print(f"Running server with args: {args}")
server = TranscriptionServer()
print(f"Starting server on port {args.port} with backend {args.backend} using SSL: {args.ssl_cert_path is not None}")
server.run(
"0.0.0.0",
port=args.port,
backend=args.backend,
faster_whisper_custom_model_path=args.faster_whisper_custom_model_path,
whisper_tensorrt_path=args.trt_model_path,
trt_multilingual=args.trt_multilingual,
single_model=not args.no_single_model,
ssl_context=ssl_context
)
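For reference, a minimal sketch of starting the server programmatically, mirroring the call above with the default CLI arguments (host and port values are illustrative):
from whisper_live.server import TranscriptionServer

server = TranscriptionServer()
# Same call shape as above: faster_whisper backend, no custom model, no SSL.
server.run(
    "0.0.0.0",
    port=9090,
    backend="faster_whisper",
    faster_whisper_custom_model_path=None,
    whisper_tensorrt_path=None,
    trt_multilingual=False,
    single_model=True,
    ssl_context=None
)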

View File

@ -0,0 +1,77 @@
#!/bin/bash
download_and_build_model() {
local model_name="$1"
local model_url=""
case "$model_name" in
"tiny.en")
model_url="https://openaipublic.azureedge.net/main/whisper/models/d3dd57d32accea0b295c96e26691aa14d8822fac7d9d27d5dc00b4ca2826dd03/tiny.en.pt"
;;
"tiny")
model_url="https://openaipublic.azureedge.net/main/whisper/models/65147644a518d12f04e32d6f3b26facc3f8dd46e5390956a9424a650c0ce22b9/tiny.pt"
;;
"base.en")
model_url="https://openaipublic.azureedge.net/main/whisper/models/25a8566e1d0c1e2231d1c762132cd20e0f96a85d16145c3a00adf5d1ac670ead/base.en.pt"
;;
"base")
model_url="https://openaipublic.azureedge.net/main/whisper/models/ed3a0b6b1c0edf879ad9b11b1af5a0e6ab5db9205f891f668f8b0e6c6326e34e/base.pt"
;;
"small.en")
model_url="https://openaipublic.azureedge.net/main/whisper/models/f953ad0fd29cacd07d5a9eda5624af0f6bcf2258be67c92b79389873d91e0872/small.en.pt"
;;
"small")
model_url="https://openaipublic.azureedge.net/main/whisper/models/9ecf779972d90ba49c06d968637d720dd632c55bbf19d441fb42bf17a411e794/small.pt"
;;
"medium.en")
model_url="https://openaipublic.azureedge.net/main/whisper/models/d7440d1dc186f76616474e0ff0b3b6b879abc9d1a4926b7adfa41db2d497ab4f/medium.en.pt"
;;
"medium")
model_url="https://openaipublic.azureedge.net/main/whisper/models/345ae4da62f9b3d59415adc60127b97c714f32e89e936602e85993674d08dcb1/medium.pt"
;;
"large-v1")
model_url="https://openaipublic.azureedge.net/main/whisper/models/e4b87e7e0bf463eb8e6956e646f1e277e901512310def2c24bf0e11bd3c28e9a/large-v1.pt"
;;
"large-v2")
model_url="https://openaipublic.azureedge.net/main/whisper/models/81f7c96c852ee8fc832187b0132e569d6c3065a3252ed18e56effd0b6a73e524/large-v2.pt"
;;
"large-v3" | "large")
model_url="https://openaipublic.azureedge.net/main/whisper/models/e5b1a55b89c1367dacf97e3e19bfd829a01529dbfdeefa8caeb59b3f1b81dadb/large-v3.pt"
;;
*)
echo "Invalid model name: $model_name"
exit 1
;;
esac
echo "Downloading $model_name..."
# wget --directory-prefix=assets "$model_url"
# echo "Download completed: ${model_name}.pt"
if [ ! -f "assets/${model_name}.pt" ]; then
wget --directory-prefix=assets "$model_url"
echo "Download completed: ${model_name}.pt"
else
echo "${model_name}.pt already exists in assets directory."
fi
local output_dir="whisper_${model_name//./_}"
echo "$output_dir"
echo "Running build script for $model_name with output directory $output_dir"
python3 build.py --output_dir "$output_dir" --use_gpt_attention_plugin --use_gemm_plugin --use_bert_attention_plugin --model_name "$model_name"
echo "Whisper $model_name TensorRT engine built."
echo "========================================="
echo "Model is located at: $(pwd)/$output_dir"
}
if [ "$#" -lt 1 ]; then
echo "Usage: $0 <path-to-tensorrt-examples-dir> [model-name]"
exit 1
fi
tensorrt_examples_dir="$1"
model_name="${2:-small.en}"
cd "$tensorrt_examples_dir/whisper" || exit 1
pip install --no-deps -r requirements.txt
download_and_build_model "$model_name"

4
scripts/setup.sh Normal file
View File

@ -0,0 +1,4 @@
#!/bin/bash
apt-get update
apt-get install -y portaudio19-dev ffmpeg wget

60
setup.py Normal file
View File

@ -0,0 +1,60 @@
import pathlib
from setuptools import find_packages, setup
from whisper_live.__version__ import __version__
# The directory containing this file
HERE = pathlib.Path(__file__).parent
# The text of the README file
README = (HERE / "README.md").read_text()
# This call to setup() does all the work
setup(
name="whisper_live",
version=__version__,
description="A nearly-live implementation of OpenAI's Whisper.",
long_description=README,
long_description_content_type="text/markdown",
include_package_data=True,
url="https://github.com/collabora/WhisperLive",
author="Collabora Ltd",
author_email="vineet.suryan@collabora.com",
license="MIT",
classifiers=[
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
"Intended Audience :: Science/Research",
"License :: OSI Approved :: MIT License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3 :: Only",
"Programming Language :: Python :: 3.8",
"Programming Language :: Python :: 3.9",
"Topic :: Scientific/Engineering :: Artificial Intelligence",
],
packages=find_packages(
exclude=(
"examples",
"Audio-Transcription-Chrome",
"Audio-Transcription-Firefox",
"requirements",
"whisper-finetuning"
)
),
install_requires=[
"PyAudio",
"faster-whisper==1.1.0",
"torch",
"torchaudio",
"websockets",
"onnxruntime==1.17.0",
"scipy",
"websocket-client",
"numba",
"openai-whisper==20240930", #TODO: understand this
"kaldialign",
"soundfile",
"tokenizers==0.20.3"
],
python_requires=">=3.8"
)

0
tests/__init__.py Normal file
View File

156
tests/test_client.py Normal file
View File

@ -0,0 +1,156 @@
import json
import os
import scipy.io.wavfile
import websocket
import copy
import unittest
from unittest.mock import patch, MagicMock
from whisper_live.client import Client, TranscriptionClient, TranscriptionTeeClient
from whisper_live.utils import resample
from pathlib import Path
class BaseTestCase(unittest.TestCase):
@patch('whisper_live.client.websocket.WebSocketApp')
@patch('whisper_live.client.pyaudio.PyAudio')
def setUp(self, mock_pyaudio, mock_websocket):
self.mock_pyaudio_instance = MagicMock()
mock_pyaudio.return_value = self.mock_pyaudio_instance
self.mock_stream = MagicMock()
self.mock_pyaudio_instance.open.return_value = self.mock_stream
self.mock_ws_app = mock_websocket.return_value
self.mock_ws_app.send = MagicMock()
self.client = TranscriptionClient(host='localhost', port=9090, lang="en").client
self.mock_pyaudio = mock_pyaudio
self.mock_websocket = mock_websocket
self.mock_audio_packet = b'\x00\x01\x02\x03'
def tearDown(self):
self.client.close_websocket()
self.mock_pyaudio.stop()
self.mock_websocket.stop()
del self.client
class TestClientWebSocketCommunication(BaseTestCase):
def test_websocket_communication(self):
expected_url = 'ws://localhost:9090'
self.mock_websocket.assert_called()
self.assertEqual(self.mock_websocket.call_args[0][0], expected_url)
class TestClientCallbacks(BaseTestCase):
def test_on_open(self):
expected_message = json.dumps({
"uid": self.client.uid,
"language": self.client.language,
"task": self.client.task,
"model": self.client.model,
"use_vad": True
})
self.client.on_open(self.mock_ws_app)
self.mock_ws_app.send.assert_called_with(expected_message)
def test_on_message(self):
message = json.dumps(
{
"uid": self.client.uid,
"message": "SERVER_READY",
"backend": "faster_whisper"
}
)
self.client.on_message(self.mock_ws_app, message)
message = json.dumps({
"uid": self.client.uid,
"segments": [
{"start": 0, "end": 1, "text": "Test transcript"},
{"start": 1, "end": 2, "text": "Test transcript 2"},
{"start": 2, "end": 3, "text": "Test transcript 3"}
]
})
self.client.on_message(self.mock_ws_app, message)
# Assert that the transcript was updated correctly
self.assertEqual(len(self.client.transcript), 2)
self.assertEqual(self.client.transcript[1]['text'], "Test transcript 2")
def test_on_close(self):
close_status_code = 1000
close_msg = "Normal closure"
self.client.on_close(self.mock_ws_app, close_status_code, close_msg)
self.assertFalse(self.client.recording)
self.assertFalse(self.client.server_error)
self.assertFalse(self.client.waiting)
def test_on_error(self):
error_message = "Test Error"
self.client.on_error(self.mock_ws_app, error_message)
self.assertTrue(self.client.server_error)
self.assertEqual(self.client.error_message, error_message)
class TestAudioResampling(unittest.TestCase):
def test_resample_audio(self):
original_audio = "assets/jfk.flac"
expected_sr = 16000
resampled_audio = resample(original_audio, expected_sr)
sr, _ = scipy.io.wavfile.read(resampled_audio)
self.assertEqual(sr, expected_sr)
os.remove(resampled_audio)
class TestSendingAudioPacket(BaseTestCase):
def test_send_packet(self):
self.client.send_packet_to_server(self.mock_audio_packet)
self.client.client_socket.send.assert_called_with(self.mock_audio_packet, websocket.ABNF.OPCODE_BINARY)
class TestTee(BaseTestCase):
@patch('whisper_live.client.websocket.WebSocketApp')
@patch('whisper_live.client.pyaudio.PyAudio')
def setUp(self, mock_audio, mock_websocket):
super().setUp()
self.client2 = Client(host='localhost', port=9090, lang="es", translate=False, srt_file_path="transcript.srt")
self.client3 = Client(host='localhost', port=9090, lang="es", translate=True, srt_file_path="translation.srt")
# need a separate mock for each websocket
self.client3.client_socket = copy.deepcopy(self.client3.client_socket)
self.tee = TranscriptionTeeClient([self.client2, self.client3])
def tearDown(self):
self.tee.close_all_clients()
del self.tee
super().tearDown()
def test_invalid_constructor(self):
with self.assertRaises(Exception) as context:
TranscriptionTeeClient([])
def test_multicast_unconditional(self):
self.tee.multicast_packet(self.mock_audio_packet, True)
for client in self.tee.clients:
client.client_socket.send.assert_called_with(self.mock_audio_packet, websocket.ABNF.OPCODE_BINARY)
def test_multicast_conditional(self):
self.client2.recording = False
self.client3.recording = True
self.tee.multicast_packet(self.mock_audio_packet, False)
self.client2.client_socket.send.assert_not_called()
self.client3.client_socket.send.assert_called_with(self.mock_audio_packet, websocket.ABNF.OPCODE_BINARY)
def test_close_all(self):
self.tee.close_all_clients()
for client in self.tee.clients:
client.client_socket.close.assert_called()
def test_write_all_srt(self):
for client in self.tee.clients:
client.server_backend = "faster_whisper"
self.tee.write_all_clients_srt()
self.assertTrue(Path("transcript.srt").is_file())
self.assertTrue(Path("translation.srt").is_file())

150
tests/test_server.py Normal file
View File

@ -0,0 +1,150 @@
import subprocess
import time
import json
import unittest
from unittest import mock
import numpy as np
import evaluate
from websockets.exceptions import ConnectionClosed
from whisper_live.server import TranscriptionServer
from whisper_live.client import Client, TranscriptionClient, TranscriptionTeeClient
from whisper.normalizers import EnglishTextNormalizer
class TestTranscriptionServerInitialization(unittest.TestCase):
def test_initialization(self):
server = TranscriptionServer()
self.assertEqual(server.client_manager.max_clients, 4)
self.assertEqual(server.client_manager.max_connection_time, 600)
self.assertDictEqual(server.client_manager.clients, {})
self.assertDictEqual(server.client_manager.start_times, {})
class TestGetWaitTime(unittest.TestCase):
def setUp(self):
self.server = TranscriptionServer()
self.server.client_manager.start_times = {
'client1': time.time() - 120,
'client2': time.time() - 300
}
self.server.client_manager.max_connection_time = 600
def test_get_wait_time(self):
expected_wait_time = (600 - (time.time() - self.server.client_manager.start_times['client2'])) / 60
print(self.server.client_manager.get_wait_time(), expected_wait_time)
self.assertAlmostEqual(self.server.client_manager.get_wait_time(), expected_wait_time, places=2)
class TestServerConnection(unittest.TestCase):
def setUp(self):
self.server = TranscriptionServer()
@mock.patch('websockets.WebSocketCommonProtocol')
def test_connection(self, mock_websocket):
mock_websocket.recv.return_value = json.dumps({
'uid': 'test_client',
'language': 'en',
'task': 'transcribe',
'model': 'tiny.en'
})
self.server.recv_audio(mock_websocket, "faster_whisper")
@mock.patch('websockets.WebSocketCommonProtocol')
def test_recv_audio_exception_handling(self, mock_websocket):
mock_websocket.recv.side_effect = [json.dumps({
'uid': 'test_client',
'language': 'en',
'task': 'transcribe',
'model': 'tiny.en'
}), np.array([1, 2, 3]).tobytes()]
with self.assertLogs(level="ERROR"):
self.server.recv_audio(mock_websocket, "faster_whisper")
self.assertNotIn(mock_websocket, self.server.client_manager.clients)
class TestServerInferenceAccuracy(unittest.TestCase):
@classmethod
def setUpClass(cls):
cls.mock_pyaudio_patch = mock.patch('pyaudio.PyAudio')
cls.mock_pyaudio = cls.mock_pyaudio_patch.start()
cls.mock_pyaudio.return_value.open.return_value = mock.MagicMock()
cls.server_process = subprocess.Popen(["python", "run_server.py"])
time.sleep(2)
@classmethod
def tearDownClass(cls):
cls.server_process.terminate()
cls.server_process.wait()
def setUp(self):
self.metric = evaluate.load("wer")
self.normalizer = EnglishTextNormalizer()
def check_prediction(self, srt_path):
gt = "And so my fellow Americans, ask not, what your country can do for you. Ask what you can do for your country!"
with open(srt_path, "r") as f:
lines = f.readlines()
prediction = " ".join([line.strip() for line in lines[2::4]])
prediction_normalized = self.normalizer(prediction)
gt_normalized = self.normalizer(gt)
# calculate WER
wer = self.metric.compute(
predictions=[prediction_normalized],
references=[gt_normalized]
)
self.assertLess(wer, 0.05)
def test_inference(self):
client = TranscriptionClient(
"localhost", "9090", model="base.en", lang="en",
)
client("assets/jfk.flac")
self.check_prediction("output.srt")
def test_simultaneous_inference(self):
client1 = Client(
"localhost", "9090", model="base.en", lang="en", srt_file_path="transcript1.srt")
client2 = Client(
"localhost", "9090", model="base.en", lang="en", srt_file_path="transcript2.srt")
tee = TranscriptionTeeClient([client1, client2])
tee("assets/jfk.flac")
self.check_prediction("transcript1.srt")
self.check_prediction("transcript2.srt")
class TestExceptionHandling(unittest.TestCase):
def setUp(self):
self.server = TranscriptionServer()
@mock.patch('websockets.WebSocketCommonProtocol')
def test_connection_closed_exception(self, mock_websocket):
mock_websocket.recv.side_effect = ConnectionClosed(1001, "testing connection closed")
with self.assertLogs(level="INFO") as log:
self.server.recv_audio(mock_websocket, "faster_whisper")
self.assertTrue(any("Connection closed by client" in message for message in log.output))
@mock.patch('websockets.WebSocketCommonProtocol')
def test_json_decode_exception(self, mock_websocket):
mock_websocket.recv.return_value = "invalid json"
with self.assertLogs(level="ERROR") as log:
self.server.recv_audio(mock_websocket, "faster_whisper")
self.assertTrue(any("Failed to decode JSON from client" in message for message in log.output))
@mock.patch('websockets.WebSocketCommonProtocol')
def test_unexpected_exception_handling(self, mock_websocket):
mock_websocket.recv.side_effect = RuntimeError("Unexpected error")
with self.assertLogs(level="ERROR") as log:
self.server.recv_audio(mock_websocket, "faster_whisper")
for message in log.output:
print(message)
print()
self.assertTrue(any("Unexpected error" in message for message in log.output))

26
tests/test_vad.py Normal file
View File

@ -0,0 +1,26 @@
import unittest
import numpy as np
from whisper_live.tensorrt_utils import load_audio
from whisper_live.vad import VoiceActivityDetector
class TestVoiceActivityDetection(unittest.TestCase):
def setUp(self):
self.vad = VoiceActivityDetector()
self.sample_rate = 16000
def generate_silence(self, duration_seconds):
return np.zeros(int(self.sample_rate * duration_seconds), dtype=np.float32)
def load_speech_segment(self, filepath):
return load_audio(filepath)
def test_vad_silence_detection(self):
silence = self.generate_silence(3)
is_speech_present = self.vad(silence.copy())
self.assertFalse(is_speech_present, "VAD incorrectly identified silence as speech.")
def test_vad_speech_detection(self):
audio_tensor = load_audio("assets/jfk.flac")
is_speech_present = self.vad(audio_tensor)
self.assertTrue(is_speech_present, "VAD failed to identify speech segment.")

0
whisper_live/__init__.py Normal file
View File

Binary file not shown.

Binary file not shown.

View File

@ -0,0 +1 @@
__version__ = "0.4.1"

1139
whisper_live/server.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,365 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
import os
from collections import defaultdict
from functools import lru_cache
from pathlib import Path
from subprocess import CalledProcessError, run
from typing import Dict, Iterable, List, Optional, TextIO, Tuple, Union
import kaldialign
import numpy as np
import soundfile
import torch
import torch.nn.functional as F
Pathlike = Union[str, Path]
SAMPLE_RATE = 16000
N_FFT = 400
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk
def load_audio(file: str, sr: int = SAMPLE_RATE):
"""
Open an audio file and read as mono waveform, resampling as necessary
Parameters
----------
file: str
The audio file to open
sr: int
The sample rate to resample the audio if necessary
Returns
-------
A NumPy array containing the audio waveform, in float32 dtype.
"""
# This launches a subprocess to decode audio while down-mixing
# and resampling as necessary. Requires the ffmpeg CLI in PATH.
# fmt: off
cmd = [
"ffmpeg", "-nostdin", "-threads", "0", "-i", file, "-f", "s16le", "-ac",
"1", "-acodec", "pcm_s16le", "-ar",
str(sr), "-"
]
# fmt: on
try:
out = run(cmd, capture_output=True, check=True).stdout
except CalledProcessError as e:
raise RuntimeError(f"Failed to load audio: {e.stderr.decode()}") from e
return np.frombuffer(out, np.int16).flatten().astype(np.float32) / 32768.0
def load_audio_wav_format(wav_path):
# make sure audio in .wav format
assert wav_path.endswith(
'.wav'), f"Only support .wav format, but got {wav_path}"
waveform, sample_rate = soundfile.read(wav_path)
assert sample_rate == 16000, f"Only a 16 kHz sample rate is supported, but got {sample_rate}"
return waveform, sample_rate
def pad_or_trim(array, length: int = N_SAMPLES, *, axis: int = -1):
"""
Pad or trim the audio array to N_SAMPLES, as expected by the encoder.
"""
if torch.is_tensor(array):
if array.shape[axis] > length:
array = array.index_select(dim=axis,
index=torch.arange(length,
device=array.device))
if array.shape[axis] < length:
pad_widths = [(0, 0)] * array.ndim
pad_widths[axis] = (0, length - array.shape[axis])
array = F.pad(array,
[pad for sizes in pad_widths[::-1] for pad in sizes])
else:
if array.shape[axis] > length:
array = array.take(indices=range(length), axis=axis)
if array.shape[axis] < length:
pad_widths = [(0, 0)] * array.ndim
pad_widths[axis] = (0, length - array.shape[axis])
array = np.pad(array, pad_widths)
return array
@lru_cache(maxsize=None)
def mel_filters(device,
n_mels: int,
mel_filters_dir: str = None) -> torch.Tensor:
"""
load the mel filterbank matrix for projecting STFT into a Mel spectrogram.
Allows decoupling librosa dependency; saved using:
np.savez_compressed(
"mel_filters.npz",
mel_80=librosa.filters.mel(sr=16000, n_fft=400, n_mels=80),
)
"""
assert n_mels in {80, 128}, f"Unsupported n_mels: {n_mels}"
if mel_filters_dir is None:
mel_filters_path = os.path.join(os.path.dirname(__file__), "assets",
"mel_filters.npz")
else:
mel_filters_path = os.path.join(mel_filters_dir, "mel_filters.npz")
with np.load(mel_filters_path) as f:
return torch.from_numpy(f[f"mel_{n_mels}"]).to(device)
def log_mel_spectrogram(
audio: Union[str, np.ndarray, torch.Tensor],
n_mels: int,
padding: int = 0,
device: Optional[Union[str, torch.device]] = None,
return_duration: bool = False,
mel_filters_dir: str = None,
):
"""
Compute the log-Mel spectrogram of the given audio.
Parameters
----------
audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
n_mels: int
The number of Mel-frequency filters, only 80 and 128 are supported
padding: int
Number of zero samples to pad to the right
device: Optional[Union[str, torch.device]]
If given, the audio tensor is moved to this device before STFT
Returns
-------
torch.Tensor, shape = (80 or 128, n_frames)
A Tensor that contains the Mel spectrogram
"""
if not torch.is_tensor(audio):
if isinstance(audio, str):
if audio.endswith('.wav'):
audio, _ = load_audio_wav_format(audio)
else:
audio = load_audio(audio)
assert isinstance(audio,
np.ndarray), f"Unsupported audio type: {type(audio)}"
duration = audio.shape[-1] / SAMPLE_RATE
audio = pad_or_trim(audio, N_SAMPLES)
audio = audio.astype(np.float32)
audio = torch.from_numpy(audio)
if device is not None:
audio = audio.to(device)
if padding > 0:
audio = F.pad(audio, (0, padding))
window = torch.hann_window(N_FFT).to(audio.device)
stft = torch.stft(audio,
N_FFT,
HOP_LENGTH,
window=window,
return_complex=True)
magnitudes = stft[..., :-1].abs()**2
filters = mel_filters(audio.device, n_mels, mel_filters_dir)
mel_spec = filters @ magnitudes
log_spec = torch.clamp(mel_spec, min=1e-10).log10()
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
log_spec = (log_spec + 4.0) / 4.0
if return_duration:
return log_spec, duration
else:
return log_spec
def store_transcripts(filename: Pathlike, texts: Iterable[Tuple[str, str,
str]]) -> None:
"""Save predicted results and reference transcripts to a file.
https://github.com/k2-fsa/icefall/blob/master/icefall/utils.py
Args:
filename:
File to save the results to.
texts:
An iterable of tuples. The first element is the cur_id, the second is
the reference transcript and the third element is the predicted result.
Returns:
Return None.
"""
with open(filename, "w") as f:
for cut_id, ref, hyp in texts:
print(f"{cut_id}:\tref={ref}", file=f)
print(f"{cut_id}:\thyp={hyp}", file=f)
def write_error_stats( # noqa: C901
f: TextIO,
test_set_name: str,
results: List[Tuple[str, List[str], List[str]]],
enable_log: bool = True,
) -> float:
"""Write statistics based on predicted results and reference transcripts.
https://github.com/k2-fsa/icefall/blob/master/icefall/utils.py
It will write the following to the given file:
- WER
- number of insertions, deletions, substitutions, corrects and total
reference words. For example::
Errors: 23 insertions, 57 deletions, 212 substitutions, over 2606
reference words (2337 correct)
- The difference between the reference transcript and predicted result.
An instance is given below::
THE ASSOCIATION OF (EDISON->ADDISON) ILLUMINATING COMPANIES
The above example shows that the reference word is `EDISON`,
but it is predicted to `ADDISON` (a substitution error).
Another example is::
FOR THE FIRST DAY (SIR->*) I THINK
The reference word `SIR` is missing in the predicted
results (a deletion error).
results:
An iterable of tuples. The first element is the cur_id, the second is
the reference transcript and the third element is the predicted result.
enable_log:
If True, also print detailed WER to the console.
Otherwise, it is written only to the given file.
Returns:
The total error rate (WER) as a float, expressed as a percentage.
"""
subs: Dict[Tuple[str, str], int] = defaultdict(int)
ins: Dict[str, int] = defaultdict(int)
dels: Dict[str, int] = defaultdict(int)
# `words` stores counts per word, as follows:
# corr, ref_sub, hyp_sub, ins, dels
words: Dict[str, List[int]] = defaultdict(lambda: [0, 0, 0, 0, 0])
num_corr = 0
ERR = "*"
for cut_id, ref, hyp in results:
ali = kaldialign.align(ref, hyp, ERR)
for ref_word, hyp_word in ali:
if ref_word == ERR:
ins[hyp_word] += 1
words[hyp_word][3] += 1
elif hyp_word == ERR:
dels[ref_word] += 1
words[ref_word][4] += 1
elif hyp_word != ref_word:
subs[(ref_word, hyp_word)] += 1
words[ref_word][1] += 1
words[hyp_word][2] += 1
else:
words[ref_word][0] += 1
num_corr += 1
ref_len = sum([len(r) for _, r, _ in results])
sub_errs = sum(subs.values())
ins_errs = sum(ins.values())
del_errs = sum(dels.values())
tot_errs = sub_errs + ins_errs + del_errs
tot_err_rate = "%.2f" % (100.0 * tot_errs / ref_len)
if enable_log:
logging.info(f"[{test_set_name}] %WER {tot_errs / ref_len:.2%} "
f"[{tot_errs} / {ref_len}, {ins_errs} ins, "
f"{del_errs} del, {sub_errs} sub ]")
print(f"%WER = {tot_err_rate}", file=f)
print(
f"Errors: {ins_errs} insertions, {del_errs} deletions, "
f"{sub_errs} substitutions, over {ref_len} reference "
f"words ({num_corr} correct)",
file=f,
)
print(
"Search below for sections starting with PER-UTT DETAILS:, "
"SUBSTITUTIONS:, DELETIONS:, INSERTIONS:, PER-WORD STATS:",
file=f,
)
print("", file=f)
print("PER-UTT DETAILS: corr or (ref->hyp) ", file=f)
for cut_id, ref, hyp in results:
ali = kaldialign.align(ref, hyp, ERR)
combine_successive_errors = True
if combine_successive_errors:
ali = [[[x], [y]] for x, y in ali]
for i in range(len(ali) - 1):
if ali[i][0] != ali[i][1] and ali[i + 1][0] != ali[i + 1][1]:
ali[i + 1][0] = ali[i][0] + ali[i + 1][0]
ali[i + 1][1] = ali[i][1] + ali[i + 1][1]
ali[i] = [[], []]
ali = [[
list(filter(lambda a: a != ERR, x)),
list(filter(lambda a: a != ERR, y)),
] for x, y in ali]
ali = list(filter(lambda x: x != [[], []], ali))
ali = [[
ERR if x == [] else " ".join(x),
ERR if y == [] else " ".join(y),
] for x, y in ali]
print(
f"{cut_id}:\t" + " ".join((ref_word if ref_word == hyp_word else
f"({ref_word}->{hyp_word})"
for ref_word, hyp_word in ali)),
file=f,
)
print("", file=f)
print("SUBSTITUTIONS: count ref -> hyp", file=f)
for count, (ref, hyp) in sorted([(v, k) for k, v in subs.items()],
reverse=True):
print(f"{count} {ref} -> {hyp}", file=f)
print("", file=f)
print("DELETIONS: count ref", file=f)
for count, ref in sorted([(v, k) for k, v in dels.items()], reverse=True):
print(f"{count} {ref}", file=f)
print("", file=f)
print("INSERTIONS: count hyp", file=f)
for count, hyp in sorted([(v, k) for k, v in ins.items()], reverse=True):
print(f"{count} {hyp}", file=f)
print("", file=f)
print("PER-WORD STATS: word corr tot_errs count_in_ref count_in_hyp",
file=f)
for _, word, counts in sorted([(sum(v[1:]), k, v)
for k, v in words.items()],
reverse=True):
(corr, ref_sub, hyp_sub, ins, dels) = counts
tot_errs = ref_sub + hyp_sub + ins + dels
ref_count = corr + ref_sub + dels
hyp_count = corr + hyp_sub + ins
print(f"{word} {corr} {tot_errs} {ref_count} {hyp_count}", file=f)
return float(tot_err_rate)
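For illustration, a hedged sketch of how the two WER helpers above can be combined (utterance ids and transcripts are made up):
# Each entry is (utterance id, reference words, hypothesis words).
results = [
    ("utt-1", "the quick brown fox".split(), "the quick brown box".split()),
    ("utt-2", "hello world".split(), "hello world".split()),
]
store_transcripts("recogs.txt", results)
with open("errs.txt", "w") as f:
    wer = write_error_stats(f, "dev", results, enable_log=False)
print(f"WER: {wer}%")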

1889
whisper_live/transcriber.py Normal file

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,320 @@
import json
import re
from collections import OrderedDict
from pathlib import Path
from typing import Union
import torch
import numpy as np
import torch.nn.functional as F
from whisper.tokenizer import get_tokenizer
from whisper_live.tensorrt_utils import (mel_filters, load_audio_wav_format, pad_or_trim, load_audio)
import tensorrt_llm
import tensorrt_llm.logger as logger
from tensorrt_llm._utils import (str_dtype_to_torch, str_dtype_to_trt,
trt_dtype_to_torch)
from tensorrt_llm.runtime import ModelConfig, SamplingConfig
from tensorrt_llm.runtime.session import Session, TensorInfo
SAMPLE_RATE = 16000
N_FFT = 400
HOP_LENGTH = 160
CHUNK_LENGTH = 30
N_SAMPLES = CHUNK_LENGTH * SAMPLE_RATE # 480000 samples in a 30-second chunk
class WhisperEncoding:
def __init__(self, engine_dir):
self.session = self.get_session(engine_dir)
def get_session(self, engine_dir):
config_path = engine_dir / 'encoder_config.json'
with open(config_path, 'r') as f:
config = json.load(f)
dtype = config['builder_config']['precision']
n_mels = config['builder_config']['n_mels']
num_languages = config['builder_config']['num_languages']
self.dtype = dtype
self.n_mels = n_mels
self.num_languages = num_languages
serialize_path = engine_dir / f'whisper_encoder_{self.dtype}_tp1_rank0.engine'
with open(serialize_path, 'rb') as f:
session = Session.from_serialized_engine(f.read())
return session
def get_audio_features(self, mel):
inputs = OrderedDict()
output_list = []
inputs.update({'x': mel})
output_list.append(
TensorInfo('x', str_dtype_to_trt(self.dtype), mel.shape))
output_info = (self.session).infer_shapes(output_list)
logger.debug(f'output info {output_info}')
outputs = {
t.name: torch.empty(tuple(t.shape),
dtype=trt_dtype_to_torch(t.dtype),
device='cuda')
for t in output_info
}
stream = torch.cuda.current_stream()
ok = self.session.run(inputs=inputs,
outputs=outputs,
stream=stream.cuda_stream)
assert ok, 'Engine execution failed'
stream.synchronize()
audio_features = outputs['output']
return audio_features
class WhisperDecoding:
def __init__(self, engine_dir, runtime_mapping, debug_mode=False):
self.decoder_config = self.get_config(engine_dir)
self.decoder_generation_session = self.get_session(
engine_dir, runtime_mapping, debug_mode)
def get_config(self, engine_dir):
config_path = engine_dir / 'decoder_config.json'
with open(config_path, 'r') as f:
config = json.load(f)
decoder_config = OrderedDict()
decoder_config.update(config['plugin_config'])
decoder_config.update(config['builder_config'])
return decoder_config
def get_session(self, engine_dir, runtime_mapping, debug_mode=False):
dtype = self.decoder_config['precision']
serialize_path = engine_dir / f'whisper_decoder_{dtype}_tp1_rank0.engine'
with open(serialize_path, "rb") as f:
decoder_engine_buffer = f.read()
decoder_model_config = ModelConfig(
num_heads=self.decoder_config['num_heads'],
num_kv_heads=self.decoder_config['num_heads'],
hidden_size=self.decoder_config['hidden_size'],
vocab_size=self.decoder_config['vocab_size'],
num_layers=self.decoder_config['num_layers'],
gpt_attention_plugin=self.decoder_config['gpt_attention_plugin'],
remove_input_padding=self.decoder_config['remove_input_padding'],
cross_attention=self.decoder_config['cross_attention'],
has_position_embedding=self.
decoder_config['has_position_embedding'],
has_token_type_embedding=self.
decoder_config['has_token_type_embedding'],
)
decoder_generation_session = tensorrt_llm.runtime.GenerationSession(
decoder_model_config,
decoder_engine_buffer,
runtime_mapping,
debug_mode=debug_mode)
return decoder_generation_session
def generate(self,
decoder_input_ids,
encoder_outputs,
eot_id,
max_new_tokens=40,
num_beams=1):
encoder_input_lengths = torch.tensor(
[encoder_outputs.shape[1] for x in range(encoder_outputs.shape[0])],
dtype=torch.int32,
device='cuda')
decoder_input_lengths = torch.tensor([
decoder_input_ids.shape[-1]
for _ in range(decoder_input_ids.shape[0])
],
dtype=torch.int32,
device='cuda')
decoder_max_input_length = torch.max(decoder_input_lengths).item()
# generation config
sampling_config = SamplingConfig(end_id=eot_id,
pad_id=eot_id,
num_beams=num_beams)
self.decoder_generation_session.setup(
decoder_input_lengths.size(0),
decoder_max_input_length,
max_new_tokens,
beam_width=num_beams,
encoder_max_input_length=encoder_outputs.shape[1])
torch.cuda.synchronize()
decoder_input_ids = decoder_input_ids.type(torch.int32).cuda()
output_ids = self.decoder_generation_session.decode(
decoder_input_ids,
decoder_input_lengths,
sampling_config,
encoder_output=encoder_outputs,
encoder_input_lengths=encoder_input_lengths,
)
torch.cuda.synchronize()
# get the list of int from output_ids tensor
output_ids = output_ids.cpu().numpy().tolist()
return output_ids
class WhisperTRTLLM(object):
def __init__(self, engine_dir, assets_dir=None, device=None, is_multilingual=False,
language="en", task="transcribe"):
world_size = 1
runtime_rank = tensorrt_llm.mpi_rank()
runtime_mapping = tensorrt_llm.Mapping(world_size, runtime_rank)
torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)
engine_dir = Path(engine_dir)
self.encoder = WhisperEncoding(engine_dir)
self.decoder = WhisperDecoding(engine_dir,
runtime_mapping,
debug_mode=False)
self.n_mels = self.encoder.n_mels
# self.tokenizer = get_tokenizer(num_languages=self.encoder.num_languages,
# tokenizer_dir=assets_dir)
self.device = device
self.tokenizer = get_tokenizer(
is_multilingual,
num_languages=self.encoder.num_languages,
language=language,
task=task,
)
self.filters = mel_filters(self.device, self.encoder.n_mels, assets_dir)
def log_mel_spectrogram(
self,
audio: Union[str, np.ndarray, torch.Tensor],
padding: int = 0,
return_duration=True
):
"""
Compute the log-Mel spectrogram of the given audio.
Parameters
----------
audio: Union[str, np.ndarray, torch.Tensor], shape = (*)
The path to audio or either a NumPy array or Tensor containing the audio waveform in 16 kHz
padding: int
Number of zero samples to pad to the right
return_duration: bool
If True, also return the duration of the input audio in seconds
Returns
-------
torch.Tensor, shape = (80 or 128, n_frames)
A Tensor that contains the Mel spectrogram (and the audio duration in seconds, if return_duration is True)
"""
if not torch.is_tensor(audio):
if isinstance(audio, str):
if audio.endswith('.wav'):
audio, _ = load_audio_wav_format(audio)
else:
audio = load_audio(audio)
assert isinstance(audio, np.ndarray), f"Unsupported audio type: {type(audio)}"
duration = audio.shape[-1] / SAMPLE_RATE
audio = pad_or_trim(audio, N_SAMPLES)
audio = audio.astype(np.float32)
audio = torch.from_numpy(audio)
if self.device is not None:
audio = audio.to(self.device)
if padding > 0:
audio = F.pad(audio, (0, padding))
window = torch.hann_window(N_FFT).to(audio.device)
stft = torch.stft(audio, N_FFT, HOP_LENGTH, window=window, return_complex=True)
magnitudes = stft[..., :-1].abs()**2
mel_spec = self.filters @ magnitudes
log_spec = torch.clamp(mel_spec, min=1e-10).log10()
log_spec = torch.maximum(log_spec, log_spec.max() - 8.0)
log_spec = (log_spec + 4.0) / 4.0
if return_duration:
return log_spec, duration
else:
return log_spec
def process_batch(
self,
mel,
text_prefix="<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
num_beams=1):
prompt_id = self.tokenizer.encode(
text_prefix, allowed_special=set(self.tokenizer.special_tokens.keys()))
prompt_id = torch.tensor(prompt_id)
batch_size = mel.shape[0]
decoder_input_ids = prompt_id.repeat(batch_size, 1)
encoder_output = self.encoder.get_audio_features(mel)
output_ids = self.decoder.generate(decoder_input_ids,
encoder_output,
self.tokenizer.eot,
max_new_tokens=96,
num_beams=num_beams)
texts = []
for i in range(len(output_ids)):
text = self.tokenizer.decode(output_ids[i][0]).strip()
texts.append(text)
return texts
def transcribe(
self,
mel,
text_prefix="<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
dtype='float16',
batch_size=1,
num_beams=1,
):
mel = mel.type(str_dtype_to_torch(dtype))
mel = mel.unsqueeze(0)
predictions = self.process_batch(mel, text_prefix, num_beams)
prediction = predictions[0]
# remove all special tokens in the prediction
prediction = re.sub(r'<\|.*?\|>', '', prediction)
return prediction.strip()
def decode_wav_file(
model,
mel,
text_prefix="<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
dtype='float16',
batch_size=1,
num_beams=1,
normalizer=None,
mel_filters_dir=None):
mel = mel.type(str_dtype_to_torch(dtype))
mel = mel.unsqueeze(0)
# repeat the mel spectrogram to match the batch size
mel = mel.repeat(batch_size, 1, 1)
predictions = model.process_batch(mel, text_prefix, num_beams)
prediction = predictions[0]
# remove all special tokens in the prediction
prediction = re.sub(r'<\|.*?\|>', '', prediction)
if normalizer:
prediction = normalizer(prediction)
return prediction.strip()
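A hedged usage sketch for the class above, assuming a prebuilt TensorRT-LLM engine directory and local assets (all paths are illustrative):
model = WhisperTRTLLM("engines/whisper_small_en", assets_dir="assets", device="cuda")
# log_mel_spectrogram returns (mel, duration) by default.
mel, duration = model.log_mel_spectrogram("assets/jfk.flac")
print(model.transcribe(mel))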

82
whisper_live/utils.py Normal file
View File

@ -0,0 +1,82 @@
import os
import textwrap
import scipy
import numpy as np
import av
from pathlib import Path
def clear_screen():
"""Clears the console screen."""
os.system("cls" if os.name == "nt" else "clear")
def print_transcript(text):
"""Prints formatted transcript text."""
wrapper = textwrap.TextWrapper(width=60)
for line in wrapper.wrap(text="".join(text)):
print(line)
def format_time(s):
"""Convert seconds (float) to SRT time format."""
hours = int(s // 3600)
minutes = int((s % 3600) // 60)
seconds = int(s % 60)
milliseconds = int((s - int(s)) * 1000)
return f"{hours:02}:{minutes:02}:{seconds:02},{milliseconds:03}"
def create_srt_file(segments, resampled_file):
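    """Write transcription segments (dicts with 'start', 'end' and 'text') to an SRT file at the given path."""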
with open(resampled_file, 'w', encoding='utf-8') as srt_file:
segment_number = 1
for segment in segments:
start_time = format_time(float(segment['start']))
end_time = format_time(float(segment['end']))
text = segment['text']
srt_file.write(f"{segment_number}\n")
srt_file.write(f"{start_time} --> {end_time}\n")
srt_file.write(f"{text}\n\n")
segment_number += 1


def resample(file: str, sr: int = 16000):
    """
    Resample the audio file to the target sample rate (16 kHz by default).

    Args:
        file (str): The audio file to open
        sr (int): The sample rate to resample the audio to, if necessary

    Returns:
        resampled_file (str): The path of the resampled audio file
    """
    container = av.open(file)
    stream = next(s for s in container.streams if s.type == 'audio')

    resampler = av.AudioResampler(
        format='s16',
        layout='mono',
        rate=sr,
    )

    resampled_file = Path(file).stem + "_resampled.wav"
    output_container = av.open(resampled_file, mode='w')
    output_stream = output_container.add_stream('pcm_s16le', rate=sr)
    output_stream.layout = 'mono'

    for frame in container.decode(audio=0):
        frame.pts = None
        resampled_frames = resampler.resample(frame)
        if resampled_frames is not None:
            for resampled_frame in resampled_frames:
                for packet in output_stream.encode(resampled_frame):
                    output_container.mux(packet)

    for packet in output_stream.encode(None):
        output_container.mux(packet)

    output_container.close()
    return resampled_file
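

# Illustrative usage (not part of the original file; file names are placeholders):
#   wav_path = resample("audio.mp3", sr=16000)   # writes "audio_resampled.wav"
#   create_srt_file([{"start": 0.0, "end": 1.2, "text": "hello"}], "audio.srt")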

155
whisper_live/vad.py Normal file
View File

@ -0,0 +1,155 @@
# original: https://github.com/snakers4/silero-vad/blob/master/utils_vad.py
import os
import subprocess
import torch
import numpy as np
import onnxruntime
import warnings


class VoiceActivityDetection():

    def __init__(self, force_onnx_cpu=True):
        path = self.download()

        opts = onnxruntime.SessionOptions()
        opts.log_severity_level = 3

        opts.inter_op_num_threads = 1
        opts.intra_op_num_threads = 1

        if force_onnx_cpu and 'CPUExecutionProvider' in onnxruntime.get_available_providers():
            self.session = onnxruntime.InferenceSession(path, providers=['CPUExecutionProvider'], sess_options=opts)
        else:
            self.session = onnxruntime.InferenceSession(path, providers=['CUDAExecutionProvider'], sess_options=opts)

        self.reset_states()
        self.sample_rates = [8000, 16000]

    def _validate_input(self, x, sr: int):
        if x.dim() == 1:
            x = x.unsqueeze(0)
        if x.dim() > 2:
            raise ValueError(f"Too many dimensions for input audio chunk {x.dim()}")

        if sr != 16000 and (sr % 16000 == 0):
            step = sr // 16000
            x = x[:, ::step]
            sr = 16000

        if sr not in self.sample_rates:
            raise ValueError(f"Supported sampling rates: {self.sample_rates} (or a multiple of 16000)")
        if sr / x.shape[1] > 31.25:
            raise ValueError("Input audio chunk is too short")

        return x, sr

    def reset_states(self, batch_size=1):
        self._state = torch.zeros((2, batch_size, 128)).float()
        self._context = torch.zeros(0)
        self._last_sr = 0
        self._last_batch_size = 0

    def __call__(self, x, sr: int):
        x, sr = self._validate_input(x, sr)
        num_samples = 512 if sr == 16000 else 256

        if x.shape[-1] != num_samples:
            raise ValueError(f"Provided number of samples is {x.shape[-1]} (Supported values: 256 for 8000 sample rate, 512 for 16000)")

        batch_size = x.shape[0]
        context_size = 64 if sr == 16000 else 32

        if not self._last_batch_size:
            self.reset_states(batch_size)
        if (self._last_sr) and (self._last_sr != sr):
            self.reset_states(batch_size)
        if (self._last_batch_size) and (self._last_batch_size != batch_size):
            self.reset_states(batch_size)

        if not len(self._context):
            self._context = torch.zeros(batch_size, context_size)

        x = torch.cat([self._context, x], dim=1)

        if sr in [8000, 16000]:
            ort_inputs = {'input': x.numpy(), 'state': self._state.numpy(), 'sr': np.array(sr, dtype='int64')}
            ort_outs = self.session.run(None, ort_inputs)
            out, state = ort_outs
            self._state = torch.from_numpy(state)
        else:
            raise ValueError()

        self._context = x[..., -context_size:]
        self._last_sr = sr
        self._last_batch_size = batch_size

        out = torch.from_numpy(out)
        return out
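
    # Example (illustrative, not part of the original file): one 512-sample
    # chunk at 16 kHz (plus the 64-sample context kept internally) produces a
    # single speech probability per batch item:
    #   vad = VoiceActivityDetection()
    #   prob = vad(torch.zeros(1, 512), 16000)   # -> tensor of shape (1, 1)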

    def audio_forward(self, x, sr: int):
        outs = []
        x, sr = self._validate_input(x, sr)
        self.reset_states()
        num_samples = 512 if sr == 16000 else 256

        if x.shape[1] % num_samples:
            pad_num = num_samples - (x.shape[1] % num_samples)
            x = torch.nn.functional.pad(x, (0, pad_num), 'constant', value=0.0)

        for i in range(0, x.shape[1], num_samples):
            wavs_batch = x[:, i:i+num_samples]
            out_chunk = self.__call__(wavs_batch, sr)
            outs.append(out_chunk)

        stacked = torch.cat(outs, dim=1)
        return stacked.cpu()

    @staticmethod
    def download(model_url="https://github.com/snakers4/silero-vad/raw/v5.0/files/silero_vad.onnx"):
        target_dir = os.path.expanduser("~/.cache/whisper-live/")

        # Ensure the target directory exists
        os.makedirs(target_dir, exist_ok=True)

        # Define the target file path
        model_filename = os.path.join(target_dir, "silero_vad.onnx")

        # Check if the model file already exists
        if not os.path.exists(model_filename):
            # If it doesn't exist, download the model using wget
            try:
                subprocess.run(["wget", "-O", model_filename, model_url], check=True)
            except subprocess.CalledProcessError:
                print("Failed to download the model using wget.")

        return model_filename


class VoiceActivityDetector:
    def __init__(self, threshold=0.5, frame_rate=16000):
        """
        Initializes the VoiceActivityDetector with a voice activity detection model and a threshold.

        Args:
            threshold (float, optional): The probability threshold for detecting voice activity. Defaults to 0.5.
            frame_rate (int, optional): The sample rate of the audio frames passed to the detector. Defaults to 16000.
        """
        self.model = VoiceActivityDetection()
        self.threshold = threshold
        self.frame_rate = frame_rate

    def __call__(self, audio_frame):
        """
        Determines if the given audio frame contains speech by comparing the detected speech probability against
        the threshold.

        Args:
            audio_frame (np.ndarray): The audio frame to be analyzed for voice activity. It is expected to be a
                NumPy array of audio samples.

        Returns:
            bool: True if the speech probability exceeds the threshold, indicating the presence of voice activity;
                False otherwise.
        """
        speech_probs = self.model.audio_forward(torch.from_numpy(audio_frame.copy()), self.frame_rate)[0]
        return torch.any(speech_probs > self.threshold).item()
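

# Illustrative usage (not part of the original file): one second of silent
# 16 kHz audio should yield no voice activity. The first call downloads the
# Silero VAD ONNX model to ~/.cache/whisper-live/ via wget.
if __name__ == "__main__":
    detector = VoiceActivityDetector(threshold=0.5, frame_rate=16000)
    silence = np.zeros(16000, dtype=np.float32)
    print("speech detected:", detector(silence))  # expected: False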

169
workflows/ci.yml Normal file
View File

@ -0,0 +1,169 @@
name: Test & Build CI/CD
on:
push:
branches:
- main
tags:
- v*
pull_request:
branches: [ main ]
types: [opened, synchronize, reopened]
jobs:
run-tests:
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: [3.8, 3.9, '3.10', 3.11]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Cache Python dependencies
uses: actions/cache@v2
with:
path: |
~/.cache/pip
!~/.cache/pip/log
key: ${{ runner.os }}-pip-${{ matrix.python-version }}-${{ hashFiles('requirements/server.txt', 'requirements/client.txt') }}
restore-keys: |
${{ runner.os }}-pip-${{ matrix.python-version }}-
- name: Install system dependencies
run: sudo apt-get update && sudo apt-get install -y ffmpeg portaudio19-dev
- name: Install Python dependencies
run: |
python -m pip install --upgrade pip
pip install -r requirements/server.txt --extra-index-url https://download.pytorch.org/whl/cpu
pip install -r requirements/client.txt
- name: Run tests
run: |
echo "Running tests with Python ${{ matrix.python-version }}"
python -m unittest discover -s tests
check-code-format:
runs-on: ubuntu-22.04
strategy:
matrix:
python-version: [3.8, 3.9, '3.10', 3.11]
steps:
- uses: actions/checkout@v2
- name: Set up Python ${{ matrix.python-version }}
uses: actions/setup-python@v2
with:
python-version: ${{ matrix.python-version }}
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install flake8
- name: Lint with flake8
run: |
# stop the build if there are Python syntax errors or undefined names
flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics
# exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide
flake8 . --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics
build-and-push-docker-cpu:
needs: [run-tests, check-code-format]
runs-on: ubuntu-22.04
if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/'))
steps:
- uses: actions/checkout@v2
- name: Log in to GitHub Container Registry
uses: docker/login-action@v1
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GHCR_TOKEN }}
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Build and push Docker image
uses: docker/build-push-action@v2
with:
context: .
file: docker/Dockerfile.cpu
push: true
tags: ghcr.io/collabora/whisperlive-cpu:latest
build-and-push-docker-gpu:
needs: [run-tests, check-code-format, build-and-push-docker-cpu]
timeout-minutes: 20
runs-on: ubuntu-22.04
if: github.event_name == 'push' && (github.ref == 'refs/heads/main' || startsWith(github.ref, 'refs/tags/'))
steps:
- uses: actions/checkout@v2
- name: Log in to GitHub Container Registry
uses: docker/login-action@v1
with:
registry: ghcr.io
username: ${{ github.repository_owner }}
password: ${{ secrets.GHCR_TOKEN }}
- name: Docker Prune
run: docker system prune -af
- name: Set up Docker Buildx
uses: docker/setup-buildx-action@v1
- name: Build and push Docker GPU image
uses: docker/build-push-action@v2
with:
context: .
file: docker/Dockerfile.gpu
push: true
tags: ghcr.io/collabora/whisperlive-gpu:latest
publish-to-pypi:
needs: [run-tests, check-code-format]
runs-on: ubuntu-22.04
if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
steps:
- uses: actions/checkout@v2
- name: Set up Python 3.8
uses: actions/setup-python@v2
with:
python-version: 3.8
- name: Cache Python dependencies
uses: actions/cache@v2
with:
path: |
~/.cache/pip
!~/.cache/pip/log
key: ubuntu-latest-pip-3.8-${{ hashFiles('requirements/server.txt', 'requirements/client.txt') }}
restore-keys: |
ubuntu-latest-pip-3.8-
- name: Install system dependencies
run: sudo apt-get update && sudo apt-get install -y ffmpeg portaudio19-dev
- name: Install Python dependencies
run: |
pip install -r requirements/server.txt
pip install -r requirements/client.txt
pip install wheel
- name: Build package
run: python setup.py sdist bdist_wheel
- name: Publish package to PyPI
uses: pypa/gh-action-pypi-publish@release/v1
with:
user: __token__
password: ${{ secrets.PYPI_API_TOKEN }}