866 lines
25 KiB
JSON
866 lines
25 KiB
JSON
{
|
|
"openapi": "3.1.0",
|
|
"info": {
|
|
"title": "WhisperLive API",
|
|
"description": "A high-performance speech-to-text API based on OpenAI's Whisper model.\nSupports real-time transcription via WebSocket and batch processing via HTTP.\n\n## Features\n- Real-time audio transcription\n- Batch file processing\n- Multiple language support\n- Translation capabilities\n- Multiple model sizes\n- WebSocket and HTTP interfaces\n",
|
|
"version": "1.0.0",
|
|
"contact": {
|
|
"name": "WhisperLive Support",
|
|
"url": "https://github.com/collabora/WhisperLive"
|
|
},
|
|
"license": {
|
|
"name": "MIT",
|
|
"url": "https://opensource.org/licenses/MIT"
|
|
}
|
|
},
|
|
"servers": [
|
|
{
|
|
"url": "http://localhost:8080",
|
|
"description": "Local development server"
|
|
},
|
|
{
|
|
"url": "https://api.whisperlive.com/v1",
|
|
"description": "Production server"
|
|
}
|
|
],
|
|
"security": [
|
|
{
|
|
"ApiKeyAuth": []
|
|
}
|
|
],
|
|
"paths": {
|
|
"/v1/audio/transcriptions": {
|
|
"post": {
|
|
"summary": "Create transcription",
|
|
"description": "Transcribes audio into the input language. The response will include the transcribed text\nand additional metadata such as language detection, confidence scores, and timestamps.\n",
|
|
"operationId": "createTranscription",
|
|
"tags": [
|
|
"Audio"
|
|
],
|
|
"requestBody": {
|
|
"required": true,
|
|
"content": {
|
|
"multipart/form-data": {
|
|
"schema": {
|
|
"type": "object",
|
|
"required": [
|
|
"file"
|
|
],
|
|
"properties": {
|
|
"file": {
|
|
"type": "string",
|
|
"format": "binary",
|
|
"description": "The audio file object (not file name) to transcribe, in one of these formats: \nflac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.\n"
|
|
},
|
|
"model": {
|
|
"type": "string",
|
|
"enum": [
|
|
"tiny",
|
|
"base",
|
|
"small",
|
|
"medium",
|
|
"large"
|
|
],
|
|
"default": "base",
|
|
"description": "ID of the model to use. One of: tiny, base, small, medium, large. Defaults to base."
|
|
},
|
|
"language": {
|
|
"type": "string",
|
|
"pattern": "^[a-z]{2}$",
|
|
"description": "The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.\nSupported languages: en, es, fr, de, it, pt, ru, ja, ko, zh, hi, ar\n"
|
|
},
|
|
"prompt": {
|
|
"type": "string",
|
|
"description": "An optional text to guide the model's style or continue a previous audio segment.\nThe prompt should match the audio language.\n"
|
|
},
|
|
"response_format": {
|
|
"type": "string",
|
|
"enum": [
|
|
"json",
|
|
"text",
|
|
"srt",
|
|
"verbose_json",
|
|
"vtt"
|
|
],
|
|
"default": "json",
|
|
"description": "The format of the transcript output."
|
|
},
|
|
"temperature": {
|
|
"type": "number",
|
|
"minimum": 0,
|
|
"maximum": 1,
|
|
"default": 0,
|
|
"description": "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."
|
|
},
|
|
"timestamp_granularities": {
|
|
"type": "array",
|
|
"items": {
|
|
"type": "string",
|
|
"enum": [
|
|
"word",
|
|
"segment"
|
|
]
|
|
},
|
|
"description": "The timestamp granularities to populate for this transcription."
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"responses": {
|
|
"200": {
|
|
"description": "OK",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"oneOf": [
|
|
{
|
|
"$ref": "#/components/schemas/TranscriptionResponse"
|
|
},
|
|
{
|
|
"$ref": "#/components/schemas/TranscriptionTextResponse"
|
|
},
|
|
{
|
|
"$ref": "#/components/schemas/TranscriptionSrtResponse"
|
|
},
|
|
{
|
|
"$ref": "#/components/schemas/TranscriptionVttResponse"
|
|
}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"400": {
|
|
"$ref": "#/components/responses/BadRequest"
|
|
},
|
|
"401": {
|
|
"$ref": "#/components/responses/Unauthorized"
|
|
},
|
|
"413": {
|
|
"$ref": "#/components/responses/FileTooLarge"
|
|
},
|
|
"422": {
|
|
"$ref": "#/components/responses/ValidationError"
|
|
},
|
|
"429": {
|
|
"$ref": "#/components/responses/RateLimitExceeded"
|
|
},
|
|
"500": {
|
|
"$ref": "#/components/responses/InternalServerError"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"/v1/audio/translations": {
|
|
"post": {
|
|
"summary": "Create translation",
|
|
"description": "Translates audio into English. The response will include the translated text\nand additional metadata such as confidence scores and timestamps.\n",
|
|
"operationId": "createTranslation",
|
|
"tags": [
|
|
"Audio"
|
|
],
|
|
"requestBody": {
|
|
"required": true,
|
|
"content": {
|
|
"multipart/form-data": {
|
|
"schema": {
|
|
"type": "object",
|
|
"required": [
|
|
"file"
|
|
],
|
|
"properties": {
|
|
"file": {
|
|
"type": "string",
|
|
"format": "binary",
|
|
"description": "The audio file object (not file name) to translate, in one of these formats: \nflac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.\n"
|
|
},
|
|
"model": {
|
|
"type": "string",
|
|
"enum": [
|
|
"tiny",
|
|
"base",
|
|
"small",
|
|
"medium",
|
|
"large"
|
|
],
|
|
"default": "base",
|
|
"description": "ID of the model to use. One of: tiny, base, small, medium, large. Defaults to base."
|
|
},
|
|
"prompt": {
|
|
"type": "string",
|
|
"description": "An optional text to guide the model's style or continue a previous audio segment.\nThe prompt should be in English.\n"
|
|
},
|
|
"response_format": {
|
|
"type": "string",
|
|
"enum": [
|
|
"json",
|
|
"text",
|
|
"srt",
|
|
"verbose_json",
|
|
"vtt"
|
|
],
|
|
"default": "json",
|
|
"description": "The format of the transcript output."
|
|
},
|
|
"temperature": {
|
|
"type": "number",
|
|
"minimum": 0,
|
|
"maximum": 1,
|
|
"default": 0,
|
|
"description": "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."
|
|
},
|
|
"timestamp_granularities": {
|
|
"type": "array",
|
|
"items": {
|
|
"type": "string",
|
|
"enum": [
|
|
"word",
|
|
"segment"
|
|
]
|
|
},
|
|
"description": "The timestamp granularities to populate for this translation."
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"responses": {
|
|
"200": {
|
|
"description": "OK",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"oneOf": [
|
|
{
|
|
"$ref": "#/components/schemas/TranscriptionResponse"
|
|
},
|
|
{
|
|
"$ref": "#/components/schemas/TranscriptionTextResponse"
|
|
},
|
|
{
|
|
"$ref": "#/components/schemas/TranscriptionSrtResponse"
|
|
},
|
|
{
|
|
"$ref": "#/components/schemas/TranscriptionVttResponse"
|
|
}
|
|
]
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"400": {
|
|
"$ref": "#/components/responses/BadRequest"
|
|
},
|
|
"401": {
|
|
"$ref": "#/components/responses/Unauthorized"
|
|
},
|
|
"413": {
|
|
"$ref": "#/components/responses/FileTooLarge"
|
|
},
|
|
"422": {
|
|
"$ref": "#/components/responses/ValidationError"
|
|
},
|
|
"429": {
|
|
"$ref": "#/components/responses/RateLimitExceeded"
|
|
},
|
|
"500": {
|
|
"$ref": "#/components/responses/InternalServerError"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"/v1/models": {
|
|
"get": {
|
|
"summary": "List models",
|
|
"description": "Lists the currently available models, and provides basic information about each one such as the owner and availability.",
|
|
"operationId": "listModels",
|
|
"tags": [
|
|
"Models"
|
|
],
|
|
"responses": {
|
|
"200": {
|
|
"description": "OK",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/ListModelsResponse"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"401": {
|
|
"$ref": "#/components/responses/Unauthorized"
|
|
},
|
|
"500": {
|
|
"$ref": "#/components/responses/InternalServerError"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"/v1/models/{model}": {
|
|
"get": {
|
|
"summary": "Retrieve model",
|
|
"description": "Retrieves a model instance, providing basic information about the model such as the owner and permissioning.",
|
|
"operationId": "retrieveModel",
|
|
"tags": [
|
|
"Models"
|
|
],
|
|
"parameters": [
|
|
{
|
|
"name": "model",
|
|
"in": "path",
|
|
"required": true,
|
|
"description": "The ID of the model to use for this request",
|
|
"schema": {
|
|
"type": "string",
|
|
"enum": [
|
|
"tiny",
|
|
"base",
|
|
"small",
|
|
"medium",
|
|
"large"
|
|
]
|
|
}
|
|
}
|
|
],
|
|
"responses": {
|
|
"200": {
|
|
"description": "OK",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/Model"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"401": {
|
|
"$ref": "#/components/responses/Unauthorized"
|
|
},
|
|
"404": {
|
|
"$ref": "#/components/responses/NotFound"
|
|
},
|
|
"500": {
|
|
"$ref": "#/components/responses/InternalServerError"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"/v1/health": {
|
|
"get": {
|
|
"summary": "Health check",
|
|
"description": "Check the health status of the API server",
|
|
"operationId": "healthCheck",
|
|
"tags": [
|
|
"System"
|
|
],
|
|
"responses": {
|
|
"200": {
|
|
"description": "OK",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/HealthResponse"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"/v1/websocket": {
|
|
"get": {
|
|
"summary": "WebSocket connection",
|
|
"description": "Establishes a WebSocket connection for real-time audio transcription.\nSend audio data as binary frames and receive transcription results.\n",
|
|
"operationId": "websocketConnection",
|
|
"tags": [
|
|
"Real-time"
|
|
],
|
|
"parameters": [
|
|
{
|
|
"name": "model",
|
|
"in": "query",
|
|
"description": "The model to use for transcription",
|
|
"schema": {
|
|
"type": "string",
|
|
"enum": [
|
|
"tiny",
|
|
"base",
|
|
"small",
|
|
"medium",
|
|
"large"
|
|
],
|
|
"default": "base"
|
|
}
|
|
},
|
|
{
|
|
"name": "language",
|
|
"in": "query",
|
|
"description": "The language of the input audio",
|
|
"schema": {
|
|
"type": "string",
|
|
"pattern": "^[a-z]{2}$"
|
|
}
|
|
},
|
|
{
|
|
"name": "task",
|
|
"in": "query",
|
|
"description": "The task to perform",
|
|
"schema": {
|
|
"type": "string",
|
|
"enum": [
|
|
"transcribe",
|
|
"translate"
|
|
],
|
|
"default": "transcribe"
|
|
}
|
|
}
|
|
],
|
|
"responses": {
|
|
"101": {
|
|
"description": "Switching Protocols",
|
|
"headers": {
|
|
"Upgrade": {
|
|
"schema": {
|
|
"type": "string",
|
|
"example": "websocket"
|
|
}
|
|
},
|
|
"Connection": {
|
|
"schema": {
|
|
"type": "string",
|
|
"example": "Upgrade"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"400": {
|
|
"$ref": "#/components/responses/BadRequest"
|
|
},
|
|
"401": {
|
|
"$ref": "#/components/responses/Unauthorized"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"components": {
|
|
"securitySchemes": {
|
|
"ApiKeyAuth": {
|
|
"type": "apiKey",
|
|
"in": "header",
|
|
"name": "Authorization",
|
|
"description": "API key authentication. Include your API key in the Authorization header.\nExample: `Authorization: Bearer your-api-key-here`\n"
|
|
}
|
|
},
|
|
"schemas": {
|
|
"TranscriptionResponse": {
|
|
"type": "object",
|
|
"properties": {
|
|
"text": {
|
|
"type": "string",
|
|
"description": "The transcribed text"
|
|
},
|
|
"language": {
|
|
"type": "string",
|
|
"description": "The language of the input audio"
|
|
},
|
|
"duration": {
|
|
"type": "number",
|
|
"description": "The duration of the input audio in seconds"
|
|
},
|
|
"words": {
|
|
"type": "array",
|
|
"items": {
|
|
"$ref": "#/components/schemas/Word"
|
|
},
|
|
"description": "Extracted words and their corresponding timestamps"
|
|
},
|
|
"segments": {
|
|
"type": "array",
|
|
"items": {
|
|
"$ref": "#/components/schemas/Segment"
|
|
},
|
|
"description": "Segments of the transcribed text with timestamps"
|
|
}
|
|
},
|
|
"required": [
|
|
"text"
|
|
]
|
|
},
|
|
"TranscriptionTextResponse": {
|
|
"type": "string",
|
|
"description": "The transcribed text as plain text"
|
|
},
|
|
"TranscriptionSrtResponse": {
|
|
"type": "string",
|
|
"description": "The transcribed text in SRT subtitle format"
|
|
},
|
|
"TranscriptionVttResponse": {
|
|
"type": "string",
|
|
"description": "The transcribed text in VTT subtitle format"
|
|
},
|
|
"Word": {
|
|
"type": "object",
|
|
"properties": {
|
|
"word": {
|
|
"type": "string",
|
|
"description": "The text content of the word"
|
|
},
|
|
"start": {
|
|
"type": "number",
|
|
"description": "Start time of the word in seconds"
|
|
},
|
|
"end": {
|
|
"type": "number",
|
|
"description": "End time of the word in seconds"
|
|
},
|
|
"probability": {
|
|
"type": "number",
|
|
"description": "Confidence score of the word (0-1)"
|
|
}
|
|
},
|
|
"required": [
|
|
"word",
|
|
"start",
|
|
"end"
|
|
]
|
|
},
|
|
"Segment": {
|
|
"type": "object",
|
|
"properties": {
|
|
"id": {
|
|
"type": "integer",
|
|
"description": "Unique identifier for the segment"
|
|
},
|
|
"seek": {
|
|
"type": "number",
|
|
"description": "Seek offset of the segment in seconds"
|
|
},
|
|
"start": {
|
|
"type": "number",
|
|
"description": "Start time of the segment in seconds"
|
|
},
|
|
"end": {
|
|
"type": "number",
|
|
"description": "End time of the segment in seconds"
|
|
},
|
|
"text": {
|
|
"type": "string",
|
|
"description": "The text content of the segment"
|
|
},
|
|
"tokens": {
|
|
"type": "array",
|
|
"items": {
|
|
"type": "integer"
|
|
},
|
|
"description": "Array of token IDs for the segment"
|
|
},
|
|
"temperature": {
|
|
"type": "number",
|
|
"description": "Temperature parameter used for generating this segment"
|
|
},
|
|
"avg_logprob": {
|
|
"type": "number",
|
|
"description": "Average log probability of the segment"
|
|
},
|
|
"compression_ratio": {
|
|
"type": "number",
|
|
"description": "Compression ratio of the segment"
|
|
},
|
|
"no_speech_prob": {
|
|
"type": "number",
|
|
"description": "Probability of no speech in this segment"
|
|
},
|
|
"words": {
|
|
"type": "array",
|
|
"items": {
|
|
"$ref": "#/components/schemas/Word"
|
|
},
|
|
"description": "Words in this segment"
|
|
}
|
|
},
|
|
"required": [
|
|
"id",
|
|
"seek",
|
|
"start",
|
|
"end",
|
|
"text"
|
|
]
|
|
},
|
|
"Model": {
|
|
"type": "object",
|
|
"properties": {
|
|
"id": {
|
|
"type": "string",
|
|
"description": "The model identifier"
|
|
},
|
|
"object": {
|
|
"type": "string",
|
|
"enum": [
|
|
"model"
|
|
],
|
|
"description": "The object type, which is always \"model\""
|
|
},
|
|
"created": {
|
|
"type": "integer",
|
|
"description": "The Unix timestamp (in seconds) when the model was created"
|
|
},
|
|
"owned_by": {
|
|
"type": "string",
|
|
"description": "The organization that owns the model"
|
|
},
|
|
"permission": {
|
|
"type": "array",
|
|
"items": {
|
|
"type": "object"
|
|
},
|
|
"description": "The permissions associated with the model"
|
|
},
|
|
"root": {
|
|
"type": "string",
|
|
"description": "The root of the model"
|
|
},
|
|
"parent": {
|
|
"type": "string",
|
|
"description": "The parent of the model"
|
|
}
|
|
},
|
|
"required": [
|
|
"id",
|
|
"object",
|
|
"created",
|
|
"owned_by"
|
|
]
|
|
},
|
|
"ListModelsResponse": {
|
|
"type": "object",
|
|
"properties": {
|
|
"object": {
|
|
"type": "string",
|
|
"enum": [
|
|
"list"
|
|
],
|
|
"description": "The object type, which is always \"list\""
|
|
},
|
|
"data": {
|
|
"type": "array",
|
|
"items": {
|
|
"$ref": "#/components/schemas/Model"
|
|
},
|
|
"description": "The list of models"
|
|
}
|
|
},
|
|
"required": [
|
|
"object",
|
|
"data"
|
|
]
|
|
},
|
|
"HealthResponse": {
|
|
"type": "object",
|
|
"properties": {
|
|
"status": {
|
|
"type": "string",
|
|
"enum": [
|
|
"healthy",
|
|
"unhealthy"
|
|
],
|
|
"description": "The health status of the service"
|
|
},
|
|
"service": {
|
|
"type": "string",
|
|
"description": "The name of the service"
|
|
},
|
|
"version": {
|
|
"type": "string",
|
|
"description": "The version of the service"
|
|
},
|
|
"timestamp": {
|
|
"type": "string",
|
|
"format": "date-time",
|
|
"description": "The current timestamp"
|
|
},
|
|
"uptime": {
|
|
"type": "number",
|
|
"description": "The uptime in seconds"
|
|
}
|
|
},
|
|
"required": [
|
|
"status",
|
|
"service"
|
|
]
|
|
},
|
|
"Error": {
|
|
"type": "object",
|
|
"properties": {
|
|
"error": {
|
|
"type": "object",
|
|
"properties": {
|
|
"message": {
|
|
"type": "string",
|
|
"description": "A human-readable error message"
|
|
},
|
|
"type": {
|
|
"type": "string",
|
|
"description": "The type of error"
|
|
},
|
|
"code": {
|
|
"type": "string",
|
|
"description": "The error code"
|
|
},
|
|
"param": {
|
|
"type": "string",
|
|
"description": "The parameter that caused the error"
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"required": [
|
|
"error"
|
|
]
|
|
}
|
|
},
|
|
"responses": {
|
|
"BadRequest": {
|
|
"description": "Bad Request",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/Error"
|
|
},
|
|
"example": {
|
|
"error": {
|
|
"message": "Invalid request parameters",
|
|
"type": "invalid_request_error",
|
|
"code": "invalid_parameters"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"Unauthorized": {
|
|
"description": "Unauthorized",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/Error"
|
|
},
|
|
"example": {
|
|
"error": {
|
|
"message": "Invalid API key",
|
|
"type": "authentication_error",
|
|
"code": "invalid_api_key"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"FileTooLarge": {
|
|
"description": "File Too Large",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/Error"
|
|
},
|
|
"example": {
|
|
"error": {
|
|
"message": "File size exceeds maximum allowed size",
|
|
"type": "invalid_request_error",
|
|
"code": "file_too_large"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"ValidationError": {
|
|
"description": "Validation Error",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/Error"
|
|
},
|
|
"example": {
|
|
"error": {
|
|
"message": "Invalid file format",
|
|
"type": "invalid_request_error",
|
|
"code": "invalid_file_format"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"RateLimitExceeded": {
|
|
"description": "Rate Limit Exceeded",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/Error"
|
|
},
|
|
"example": {
|
|
"error": {
|
|
"message": "Rate limit exceeded",
|
|
"type": "rate_limit_error",
|
|
"code": "rate_limit_exceeded"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"InternalServerError": {
|
|
"description": "Internal Server Error",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/Error"
|
|
},
|
|
"example": {
|
|
"error": {
|
|
"message": "An internal server error occurred",
|
|
"type": "server_error",
|
|
"code": "internal_error"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"NotFound": {
|
|
"description": "Not Found",
|
|
"content": {
|
|
"application/json": {
|
|
"schema": {
|
|
"$ref": "#/components/schemas/Error"
|
|
},
|
|
"example": {
|
|
"error": {
|
|
"message": "Model not found",
|
|
"type": "invalid_request_error",
|
|
"code": "model_not_found"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
},
|
|
"tags": [
|
|
{
|
|
"name": "Audio",
|
|
"description": "Audio transcription and translation operations"
|
|
},
|
|
{
|
|
"name": "Models",
|
|
"description": "Model management operations"
|
|
},
|
|
{
|
|
"name": "System",
|
|
"description": "System health and status operations"
|
|
},
|
|
{
|
|
"name": "Real-time",
|
|
"description": "Real-time audio processing via WebSocket"
|
|
}
|
|
]
|
|
} |