{ "openapi": "3.1.0", "info": { "title": "WhisperLive API", "description": "A high-performance speech-to-text API based on OpenAI's Whisper model.\nSupports real-time transcription via WebSocket and batch processing via HTTP.\n\n## Features\n- Real-time audio transcription\n- Batch file processing\n- Multiple language support\n- Translation capabilities\n- Multiple model sizes\n- WebSocket and HTTP interfaces\n", "version": "1.0.0", "contact": { "name": "WhisperLive Support", "url": "https://github.com/collabora/WhisperLive" }, "license": { "name": "MIT", "url": "https://opensource.org/licenses/MIT" } }, "servers": [ { "url": "http://localhost:8080", "description": "Local development server" }, { "url": "https://api.whisperlive.com/v1", "description": "Production server" } ], "security": [ { "ApiKeyAuth": [] } ], "paths": { "/v1/audio/transcriptions": { "post": { "summary": "Create transcription", "description": "Transcribes audio into the input language. The response will include the transcribed text\nand additional metadata such as language detection, confidence scores, and timestamps.\n", "operationId": "createTranscription", "tags": [ "Audio" ], "requestBody": { "required": true, "content": { "multipart/form-data": { "schema": { "type": "object", "required": [ "file" ], "properties": { "file": { "type": "string", "format": "binary", "description": "The audio file object (not file name) to transcribe, in one of these formats: \nflac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.\n" }, "model": { "type": "string", "enum": [ "tiny", "base", "small", "medium", "large" ], "default": "base", "description": "ID of the Whisper model to use: tiny, base, small, medium, or large." },
"language": { "type": "string", "pattern": "^[a-z]{2}$", "description": "The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.\nSupported languages: en, es, fr, de, it, pt, ru, ja, ko, zh, hi, ar\n" }, "prompt": { "type": "string", "description": "An optional text to guide the model's style or continue a previous audio segment.\nThe prompt should match the audio language.\n" }, "response_format": { "type": "string", "enum": [ "json", "text", "srt", "verbose_json", "vtt" ], "default": "json", "description": "The format of the transcript output." }, "temperature": { "type": "number", "minimum": 0, "maximum": 1, "default": 0, "description": "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic." }, "timestamp_granularities": { "type": "array", "items": { "type": "string", "enum": [ "word", "segment" ] }, "description": "The timestamp granularities to populate for this transcription." } } } } } }, "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { "oneOf": [ { "$ref": "#/components/schemas/TranscriptionResponse" }, { "$ref": "#/components/schemas/TranscriptionTextResponse" }, { "$ref": "#/components/schemas/TranscriptionSrtResponse" }, { "$ref": "#/components/schemas/TranscriptionVttResponse" } ] } } } }, "400": { "$ref": "#/components/responses/BadRequest" }, "401": { "$ref": "#/components/responses/Unauthorized" }, "413": { "$ref": "#/components/responses/FileTooLarge" }, "422": { "$ref": "#/components/responses/ValidationError" }, "429": { "$ref": "#/components/responses/RateLimitExceeded" }, "500": { "$ref": "#/components/responses/InternalServerError" } } } },
"/v1/audio/translations": { "post": { "summary": "Create translation", "description": "Translates audio into English. The response will include the translated text\nand additional metadata such as confidence scores and timestamps.\n", "operationId": "createTranslation", "tags": [ "Audio" ], "requestBody": { "required": true, "content": { "multipart/form-data": { "schema": { "type": "object", "required": [ "file" ], "properties": { "file": { "type": "string", "format": "binary", "description": "The audio file object (not file name) to translate, in one of these formats: \nflac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.\n" }, "model": { "type": "string", "enum": [ "tiny", "base", "small", "medium", "large" ], "default": "base", "description": "ID of the Whisper model to use: tiny, base, small, medium, or large." }, "prompt": { "type": "string", "description": "An optional text to guide the model's style or continue a previous audio segment.\nThe prompt should be in English.\n" }, "response_format": { "type": "string", "enum": [ "json", "text", "srt", "verbose_json", "vtt" ], "default": "json", "description": "The format of the transcript output." }, "temperature": { "type": "number", "minimum": 0, "maximum": 1, "default": 0, "description": "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic." }, "timestamp_granularities": { "type": "array", "items": { "type": "string", "enum": [ "word", "segment" ] }, "description": "The timestamp granularities to populate for this translation."
} } } } } }, "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { "oneOf": [ { "$ref": "#/components/schemas/TranscriptionResponse" }, { "$ref": "#/components/schemas/TranscriptionTextResponse" }, { "$ref": "#/components/schemas/TranscriptionSrtResponse" }, { "$ref": "#/components/schemas/TranscriptionVttResponse" } ] } } } }, "400": { "$ref": "#/components/responses/BadRequest" }, "401": { "$ref": "#/components/responses/Unauthorized" }, "413": { "$ref": "#/components/responses/FileTooLarge" }, "422": { "$ref": "#/components/responses/ValidationError" }, "429": { "$ref": "#/components/responses/RateLimitExceeded" }, "500": { "$ref": "#/components/responses/InternalServerError" } } } }, "/v1/models": { "get": { "summary": "List models", "description": "Lists the currently available models, and provides basic information about each one such as the owner and availability.", "operationId": "listModels", "tags": [ "Models" ], "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/ListModelsResponse" } } } }, "401": { "$ref": "#/components/responses/Unauthorized" }, "500": { "$ref": "#/components/responses/InternalServerError" } } } }, "/v1/models/{model}": { "get": { "summary": "Retrieve model", "description": "Retrieves a model instance, providing basic information about the model such as the owner and permissioning.", "operationId": "retrieveModel", "tags": [ "Models" ], "parameters": [ { "name": "model", "in": "path", "required": true, "description": "The ID of the model to retrieve", "schema": { "type": "string", "enum": [ "tiny", "base", "small", "medium", "large" ] } } ], "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/Model" } } } }, "401": { "$ref": "#/components/responses/Unauthorized" }, "404": { "$ref": "#/components/responses/NotFound" }, "500": { "$ref":
"#/components/responses/InternalServerError" } } } }, "/v1/health": { "get": { "summary": "Health check", "description": "Check the health status of the API server", "operationId": "healthCheck", "tags": [ "System" ], "responses": { "200": { "description": "OK", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/HealthResponse" } } } } } } }, "/v1/websocket": { "get": { "summary": "WebSocket connection", "description": "Establishes a WebSocket connection for real-time audio transcription.\nSend audio data as binary frames and receive transcription results.\n", "operationId": "websocketConnection", "tags": [ "Real-time" ], "parameters": [ { "name": "model", "in": "query", "description": "The model to use for transcription", "schema": { "type": "string", "enum": [ "tiny", "base", "small", "medium", "large" ], "default": "base" } }, { "name": "language", "in": "query", "description": "The language of the input audio", "schema": { "type": "string", "pattern": "^[a-z]{2}$" } }, { "name": "task", "in": "query", "description": "The task to perform", "schema": { "type": "string", "enum": [ "transcribe", "translate" ], "default": "transcribe" } } ], "responses": { "101": { "description": "Switching Protocols", "headers": { "Upgrade": { "schema": { "type": "string", "example": "websocket" } }, "Connection": { "schema": { "type": "string", "example": "Upgrade" } } } }, "400": { "$ref": "#/components/responses/BadRequest" }, "401": { "$ref": "#/components/responses/Unauthorized" } } } } },
"components": { "securitySchemes": { "ApiKeyAuth": { "type": "apiKey", "in": "header", "name": "Authorization", "description": "API key authentication. Include your API key in the Authorization header.\nExample: `Authorization: Bearer your-api-key-here`\n" } }, "schemas": { "TranscriptionResponse": { "type": "object", "properties": { "text": { "type": "string", "description": "The transcribed text" }, "language": { "type": "string", "description": "The language of the input audio" }, "duration": { "type": "number", "description": "The duration of the input audio in seconds" }, "words": { "type": "array", "items": { "$ref": "#/components/schemas/Word" }, "description": "Extracted words and their corresponding timestamps" }, "segments": { "type": "array", "items": { "$ref": "#/components/schemas/Segment" }, "description": "Segments of the transcribed text with timestamps" } }, "required": [ "text" ] }, "TranscriptionTextResponse": { "type": "string", "description": "The transcribed text as plain text" }, "TranscriptionSrtResponse": { "type": "string", "description": "The transcribed text in SRT subtitle format" }, "TranscriptionVttResponse": { "type": "string", "description": "The transcribed text in VTT subtitle format" }, "Word": { "type": "object", "properties": { "word": { "type": "string", "description": "The text content of the word" }, "start": { "type": "number", "description": "Start time of the word in seconds" }, "end": { "type": "number", "description": "End time of the word in seconds" }, "probability": { "type": "number", "description": "Confidence score of the word (0-1)" } }, "required": [ "word", "start", "end" ] }, "Segment": { "type": "object", "properties": { "id": { "type": "integer", "description": "Unique identifier for the segment" }, "seek": { "type": "number", "description": "Seek offset of the segment in seconds" }, "start": { "type": "number", "description": "Start time of the segment in seconds" }, "end": { "type": "number", "description": "End time of the segment in seconds" }, "text": { "type": "string", "description": "The text content of the segment" }, "tokens": { "type": "array",
"items": { "type": "integer" }, "description": "Array of token IDs for the segment" }, "temperature": { "type": "number", "description": "Temperature parameter used for generating this segment" }, "avg_logprob": { "type": "number", "description": "Average log probability of the segment" }, "compression_ratio": { "type": "number", "description": "Compression ratio of the segment" }, "no_speech_prob": { "type": "number", "description": "Probability of no speech in this segment" }, "words": { "type": "array", "items": { "$ref": "#/components/schemas/Word" }, "description": "Words in this segment" } }, "required": [ "id", "seek", "start", "end", "text" ] }, "Model": { "type": "object", "properties": { "id": { "type": "string", "description": "The model identifier" }, "object": { "type": "string", "enum": [ "model" ], "description": "The object type, which is always \"model\"" }, "created": { "type": "integer", "description": "The Unix timestamp (in seconds) when the model was created" }, "owned_by": { "type": "string", "description": "The organization that owns the model" }, "permission": { "type": "array", "items": { "type": "object" }, "description": "The permissions associated with the model" }, "root": { "type": "string", "description": "The root of the model" }, "parent": { "type": "string", "description": "The parent of the model" } }, "required": [ "id", "object", "created", "owned_by" ] }, "ListModelsResponse": { "type": "object", "properties": { "object": { "type": "string", "enum": [ "list" ], "description": "The object type, which is always \"list\"" }, "data": { "type": "array", "items": { "$ref": "#/components/schemas/Model" }, "description": "The list of models" } }, "required": [ "object", "data" ] }, "HealthResponse": { "type": "object", "properties": { "status": { "type": "string", "enum": [ "healthy", "unhealthy" ], "description": "The health status of the service" }, "service": { "type": "string", "description": "The name of the service" }, 
"version": { "type": "string", "description": "The version of the service" }, "timestamp": { "type": "string", "format": "date-time", "description": "The current timestamp" }, "uptime": { "type": "number", "description": "The uptime in seconds" } }, "required": [ "status", "service" ] }, "Error": { "type": "object", "properties": { "error": { "type": "object", "properties": { "message": { "type": "string", "description": "A human-readable error message" }, "type": { "type": "string", "description": "The type of error" }, "code": { "type": "string", "description": "The error code" }, "param": { "type": "string", "description": "The parameter that caused the error" } } } }, "required": [ "error" ] } }, "responses": { "BadRequest": { "description": "Bad Request", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/Error" }, "example": { "error": { "message": "Invalid request parameters", "type": "invalid_request_error", "code": "invalid_parameters" } } } } }, "Unauthorized": { "description": "Unauthorized", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/Error" }, "example": { "error": { "message": "Invalid API key", "type": "authentication_error", "code": "invalid_api_key" } } } } }, "FileTooLarge": { "description": "File Too Large", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/Error" }, "example": { "error": { "message": "File size exceeds maximum allowed size", "type": "invalid_request_error", "code": "file_too_large" } } } } }, "ValidationError": { "description": "Validation Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/Error" }, "example": { "error": { "message": "Invalid file format", "type": "invalid_request_error", "code": "invalid_file_format" } } } } }, "RateLimitExceeded": { "description": "Rate Limit Exceeded", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/Error" }, "example": { "error": { "message": 
"Rate limit exceeded", "type": "rate_limit_error", "code": "rate_limit_exceeded" } } } } }, "InternalServerError": { "description": "Internal Server Error", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/Error" }, "example": { "error": { "message": "An internal server error occurred", "type": "server_error", "code": "internal_error" } } } } }, "NotFound": { "description": "Not Found", "content": { "application/json": { "schema": { "$ref": "#/components/schemas/Error" }, "example": { "error": { "message": "Model not found", "type": "invalid_request_error", "code": "model_not_found" } } } } } } }, "tags": [ { "name": "Audio", "description": "Audio transcription and translation operations" }, { "name": "Models", "description": "Model management operations" }, { "name": "System", "description": "System health and status operations" }, { "name": "Real-time", "description": "Real-time audio processing via WebSocket" } ] }