WhisperLive-Server/openapi.json

866 lines
25 KiB
JSON

{
"openapi": "3.1.0",
"info": {
"title": "WhisperLive API",
"description": "A high-performance speech-to-text API based on OpenAI's Whisper model.\nSupports real-time transcription via WebSocket and batch processing via HTTP.\n\n## Features\n- Real-time audio transcription\n- Batch file processing\n- Multiple language support\n- Translation capabilities\n- Multiple model sizes\n- WebSocket and HTTP interfaces\n",
"version": "1.0.0",
"contact": {
"name": "WhisperLive Support",
"url": "https://github.com/collabora/WhisperLive"
},
"license": {
"name": "MIT",
"url": "https://opensource.org/licenses/MIT"
}
},
"servers": [
{
"url": "http://localhost:8080",
"description": "Local development server"
},
{
"url": "https://api.whisperlive.com/v1",
"description": "Production server"
}
],
"security": [
{
"ApiKeyAuth": []
}
],
"paths": {
"/v1/audio/transcriptions": {
"post": {
"summary": "Create transcription",
"description": "Transcribes audio into the input language. The response will include the transcribed text\nand additional metadata such as language detection, confidence scores, and timestamps.\n",
"operationId": "createTranscription",
"tags": [
"Audio"
],
"requestBody": {
"required": true,
"content": {
"multipart/form-data": {
"schema": {
"type": "object",
"required": [
"file"
],
"properties": {
"file": {
"type": "string",
"format": "binary",
"description": "The audio file object (not file name) to transcribe, in one of these formats: \nflac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.\n"
},
"model": {
"type": "string",
"enum": [
"tiny",
"base",
"small",
"medium",
"large"
],
"default": "base",
"description": "ID of the model to use. Only whisper-1 is currently available."
},
"language": {
"type": "string",
"pattern": "^[a-z]{2}$",
"description": "The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.\nSupported languages: en, es, fr, de, it, pt, ru, ja, ko, zh, hi, ar\n"
},
"prompt": {
"type": "string",
"description": "An optional text to guide the model's style or continue a previous audio segment.\nThe prompt should match the audio language.\n"
},
"response_format": {
"type": "string",
"enum": [
"json",
"text",
"srt",
"verbose_json",
"vtt"
],
"default": "json",
"description": "The format of the transcript output."
},
"temperature": {
"type": "number",
"minimum": 0,
"maximum": 1,
"default": 0,
"description": "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."
},
"timestamp_granularities": {
"type": "array",
"items": {
"type": "string",
"enum": [
"word",
"segment"
]
},
"description": "The timestamp granularities to populate for this transcription."
}
}
}
}
}
},
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"oneOf": [
{
"$ref": "#/components/schemas/TranscriptionResponse"
},
{
"$ref": "#/components/schemas/TranscriptionTextResponse"
},
{
"$ref": "#/components/schemas/TranscriptionSrtResponse"
},
{
"$ref": "#/components/schemas/TranscriptionVttResponse"
}
]
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest"
},
"401": {
"$ref": "#/components/responses/Unauthorized"
},
"413": {
"$ref": "#/components/responses/FileTooLarge"
},
"422": {
"$ref": "#/components/responses/ValidationError"
},
"429": {
"$ref": "#/components/responses/RateLimitExceeded"
},
"500": {
"$ref": "#/components/responses/InternalServerError"
}
}
}
},
"/v1/audio/translations": {
"post": {
"summary": "Create translation",
"description": "Translates audio into English. The response will include the translated text\nand additional metadata such as confidence scores and timestamps.\n",
"operationId": "createTranslation",
"tags": [
"Audio"
],
"requestBody": {
"required": true,
"content": {
"multipart/form-data": {
"schema": {
"type": "object",
"required": [
"file"
],
"properties": {
"file": {
"type": "string",
"format": "binary",
"description": "The audio file object (not file name) to translate, in one of these formats: \nflac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.\n"
},
"model": {
"type": "string",
"enum": [
"tiny",
"base",
"small",
"medium",
"large"
],
"default": "base",
"description": "ID of the model to use. Only whisper-1 is currently available."
},
"prompt": {
"type": "string",
"description": "An optional text to guide the model's style or continue a previous audio segment.\nThe prompt should be in English.\n"
},
"response_format": {
"type": "string",
"enum": [
"json",
"text",
"srt",
"verbose_json",
"vtt"
],
"default": "json",
"description": "The format of the transcript output."
},
"temperature": {
"type": "number",
"minimum": 0,
"maximum": 1,
"default": 0,
"description": "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."
},
"timestamp_granularities": {
"type": "array",
"items": {
"type": "string",
"enum": [
"word",
"segment"
]
},
"description": "The timestamp granularities to populate for this translation."
}
}
}
}
}
},
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"oneOf": [
{
"$ref": "#/components/schemas/TranscriptionResponse"
},
{
"$ref": "#/components/schemas/TranscriptionTextResponse"
},
{
"$ref": "#/components/schemas/TranscriptionSrtResponse"
},
{
"$ref": "#/components/schemas/TranscriptionVttResponse"
}
]
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest"
},
"401": {
"$ref": "#/components/responses/Unauthorized"
},
"413": {
"$ref": "#/components/responses/FileTooLarge"
},
"422": {
"$ref": "#/components/responses/ValidationError"
},
"429": {
"$ref": "#/components/responses/RateLimitExceeded"
},
"500": {
"$ref": "#/components/responses/InternalServerError"
}
}
}
},
"/v1/models": {
"get": {
"summary": "List models",
"description": "Lists the currently available models, and provides basic information about each one such as the owner and availability.",
"operationId": "listModels",
"tags": [
"Models"
],
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/ListModelsResponse"
}
}
}
},
"401": {
"$ref": "#/components/responses/Unauthorized"
},
"500": {
"$ref": "#/components/responses/InternalServerError"
}
}
}
},
"/v1/models/{model}": {
"get": {
"summary": "Retrieve model",
"description": "Retrieves a model instance, providing basic information about the model such as the owner and permissioning.",
"operationId": "retrieveModel",
"tags": [
"Models"
],
"parameters": [
{
"name": "model",
"in": "path",
"required": true,
"description": "The ID of the model to use for this request",
"schema": {
"type": "string",
"enum": [
"tiny",
"base",
"small",
"medium",
"large"
]
}
}
],
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Model"
}
}
}
},
"401": {
"$ref": "#/components/responses/Unauthorized"
},
"404": {
"$ref": "#/components/responses/NotFound"
},
"500": {
"$ref": "#/components/responses/InternalServerError"
}
}
}
},
"/v1/health": {
"get": {
"summary": "Health check",
"description": "Check the health status of the API server",
"operationId": "healthCheck",
"tags": [
"System"
],
"responses": {
"200": {
"description": "OK",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/HealthResponse"
}
}
}
}
}
}
},
"/v1/websocket": {
"get": {
"summary": "WebSocket connection",
"description": "Establishes a WebSocket connection for real-time audio transcription.\nSend audio data as binary frames and receive transcription results.\n",
"operationId": "websocketConnection",
"tags": [
"Real-time"
],
"parameters": [
{
"name": "model",
"in": "query",
"description": "The model to use for transcription",
"schema": {
"type": "string",
"enum": [
"tiny",
"base",
"small",
"medium",
"large"
],
"default": "base"
}
},
{
"name": "language",
"in": "query",
"description": "The language of the input audio",
"schema": {
"type": "string",
"pattern": "^[a-z]{2}$"
}
},
{
"name": "task",
"in": "query",
"description": "The task to perform",
"schema": {
"type": "string",
"enum": [
"transcribe",
"translate"
],
"default": "transcribe"
}
}
],
"responses": {
"101": {
"description": "Switching Protocols",
"headers": {
"Upgrade": {
"schema": {
"type": "string",
"example": "websocket"
}
},
"Connection": {
"schema": {
"type": "string",
"example": "Upgrade"
}
}
}
},
"400": {
"$ref": "#/components/responses/BadRequest"
},
"401": {
"$ref": "#/components/responses/Unauthorized"
}
}
}
}
},
"components": {
"securitySchemes": {
"ApiKeyAuth": {
"type": "apiKey",
"in": "header",
"name": "Authorization",
"description": "API key authentication. Include your API key in the Authorization header.\nExample: `Authorization: Bearer your-api-key-here`\n"
}
},
"schemas": {
"TranscriptionResponse": {
"type": "object",
"properties": {
"text": {
"type": "string",
"description": "The transcribed text"
},
"language": {
"type": "string",
"description": "The language of the input audio"
},
"duration": {
"type": "number",
"description": "The duration of the input audio in seconds"
},
"words": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Word"
},
"description": "Extracted words and their corresponding timestamps"
},
"segments": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Segment"
},
"description": "Segments of the transcribed text with timestamps"
}
},
"required": [
"text"
]
},
"TranscriptionTextResponse": {
"type": "string",
"description": "The transcribed text as plain text"
},
"TranscriptionSrtResponse": {
"type": "string",
"description": "The transcribed text in SRT subtitle format"
},
"TranscriptionVttResponse": {
"type": "string",
"description": "The transcribed text in VTT subtitle format"
},
"Word": {
"type": "object",
"properties": {
"word": {
"type": "string",
"description": "The text content of the word"
},
"start": {
"type": "number",
"description": "Start time of the word in seconds"
},
"end": {
"type": "number",
"description": "End time of the word in seconds"
},
"probability": {
"type": "number",
"description": "Confidence score of the word (0-1)"
}
},
"required": [
"word",
"start",
"end"
]
},
"Segment": {
"type": "object",
"properties": {
"id": {
"type": "integer",
"description": "Unique identifier for the segment"
},
"seek": {
"type": "number",
"description": "Seek offset of the segment in seconds"
},
"start": {
"type": "number",
"description": "Start time of the segment in seconds"
},
"end": {
"type": "number",
"description": "End time of the segment in seconds"
},
"text": {
"type": "string",
"description": "The text content of the segment"
},
"tokens": {
"type": "array",
"items": {
"type": "integer"
},
"description": "Array of token IDs for the segment"
},
"temperature": {
"type": "number",
"description": "Temperature parameter used for generating this segment"
},
"avg_logprob": {
"type": "number",
"description": "Average log probability of the segment"
},
"compression_ratio": {
"type": "number",
"description": "Compression ratio of the segment"
},
"no_speech_prob": {
"type": "number",
"description": "Probability of no speech in this segment"
},
"words": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Word"
},
"description": "Words in this segment"
}
},
"required": [
"id",
"seek",
"start",
"end",
"text"
]
},
"Model": {
"type": "object",
"properties": {
"id": {
"type": "string",
"description": "The model identifier"
},
"object": {
"type": "string",
"enum": [
"model"
],
"description": "The object type, which is always \"model\""
},
"created": {
"type": "integer",
"description": "The Unix timestamp (in seconds) when the model was created"
},
"owned_by": {
"type": "string",
"description": "The organization that owns the model"
},
"permission": {
"type": "array",
"items": {
"type": "object"
},
"description": "The permissions associated with the model"
},
"root": {
"type": "string",
"description": "The root of the model"
},
"parent": {
"type": "string",
"description": "The parent of the model"
}
},
"required": [
"id",
"object",
"created",
"owned_by"
]
},
"ListModelsResponse": {
"type": "object",
"properties": {
"object": {
"type": "string",
"enum": [
"list"
],
"description": "The object type, which is always \"list\""
},
"data": {
"type": "array",
"items": {
"$ref": "#/components/schemas/Model"
},
"description": "The list of models"
}
},
"required": [
"object",
"data"
]
},
"HealthResponse": {
"type": "object",
"properties": {
"status": {
"type": "string",
"enum": [
"healthy",
"unhealthy"
],
"description": "The health status of the service"
},
"service": {
"type": "string",
"description": "The name of the service"
},
"version": {
"type": "string",
"description": "The version of the service"
},
"timestamp": {
"type": "string",
"format": "date-time",
"description": "The current timestamp"
},
"uptime": {
"type": "number",
"description": "The uptime in seconds"
}
},
"required": [
"status",
"service"
]
},
"Error": {
"type": "object",
"properties": {
"error": {
"type": "object",
"properties": {
"message": {
"type": "string",
"description": "A human-readable error message"
},
"type": {
"type": "string",
"description": "The type of error"
},
"code": {
"type": "string",
"description": "The error code"
},
"param": {
"type": "string",
"description": "The parameter that caused the error"
}
}
}
},
"required": [
"error"
]
}
},
"responses": {
"BadRequest": {
"description": "Bad Request",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "Invalid request parameters",
"type": "invalid_request_error",
"code": "invalid_parameters"
}
}
}
}
},
"Unauthorized": {
"description": "Unauthorized",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "Invalid API key",
"type": "authentication_error",
"code": "invalid_api_key"
}
}
}
}
},
"FileTooLarge": {
"description": "File Too Large",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "File size exceeds maximum allowed size",
"type": "invalid_request_error",
"code": "file_too_large"
}
}
}
}
},
"ValidationError": {
"description": "Validation Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "Invalid file format",
"type": "invalid_request_error",
"code": "invalid_file_format"
}
}
}
}
},
"RateLimitExceeded": {
"description": "Rate Limit Exceeded",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "Rate limit exceeded",
"type": "rate_limit_error",
"code": "rate_limit_exceeded"
}
}
}
}
},
"InternalServerError": {
"description": "Internal Server Error",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "An internal server error occurred",
"type": "server_error",
"code": "internal_error"
}
}
}
}
},
"NotFound": {
"description": "Not Found",
"content": {
"application/json": {
"schema": {
"$ref": "#/components/schemas/Error"
},
"example": {
"error": {
"message": "Model not found",
"type": "invalid_request_error",
"code": "model_not_found"
}
}
}
}
}
}
},
"tags": [
{
"name": "Audio",
"description": "Audio transcription and translation operations"
},
{
"name": "Models",
"description": "Model management operations"
},
{
"name": "System",
"description": "System health and status operations"
},
{
"name": "Real-time",
"description": "Real-time audio processing via WebSocket"
}
]
}