WhisperLive-Server/openapi.json

{
  "openapi": "3.1.0",
  "info": {
    "title": "WhisperLive API",
    "description": "A high-performance speech-to-text API based on OpenAI's Whisper model.\nSupports real-time transcription via WebSocket and batch processing via HTTP.\n\n## Features\n- Real-time audio transcription\n- Batch file processing\n- Multiple language support\n- Translation capabilities\n- Multiple model sizes\n- WebSocket and HTTP interfaces\n",
    "version": "1.0.0",
    "contact": {
      "name": "WhisperLive Support",
      "url": "https://github.com/collabora/WhisperLive"
    },
    "license": {
      "name": "MIT",
      "url": "https://opensource.org/licenses/MIT"
    }
  },
  "servers": [
    {
      "url": "http://localhost:8080",
      "description": "Local development server"
    },
    {
      "url": "https://api.whisperlive.com/v1",
      "description": "Production server"
    }
  ],
  "security": [
    {
      "ApiKeyAuth": []
    }
  ],
  "paths": {
    "/v1/audio/transcriptions": {
      "post": {
        "summary": "Create transcription",
        "description": "Transcribes audio into the input language. The response will include the transcribed text\nand additional metadata such as language detection, confidence scores, and timestamps.\n",
        "operationId": "createTranscription",
        "tags": [
          "Audio"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "multipart/form-data": {
              "schema": {
                "type": "object",
                "required": [
                  "file"
                ],
                "properties": {
                  "file": {
                    "type": "string",
                    "format": "binary",
                    "description": "The audio file object (not file name) to transcribe, in one of these formats: \nflac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.\n"
                  },
                  "model": {
                    "type": "string",
                    "enum": [
                      "tiny",
                      "base",
                      "small",
                      "medium",
                      "large"
                    ],
                    "default": "base",
                    "description": "ID of the model to use. Only whisper-1 is currently available."
                  },
                  "language": {
                    "type": "string",
                    "pattern": "^[a-z]{2}$",
                    "description": "The language of the input audio. Supplying the input language in ISO-639-1 format will improve accuracy and latency.\nSupported languages: en, es, fr, de, it, pt, ru, ja, ko, zh, hi, ar\n"
                  },
                  "prompt": {
                    "type": "string",
                    "description": "An optional text to guide the model's style or continue a previous audio segment.\nThe prompt should match the audio language.\n"
                  },
                  "response_format": {
                    "type": "string",
                    "enum": [
                      "json",
                      "text",
                      "srt",
                      "verbose_json",
                      "vtt"
                    ],
                    "default": "json",
                    "description": "The format of the transcript output."
                  },
                  "temperature": {
                    "type": "number",
                    "minimum": 0,
                    "maximum": 1,
                    "default": 0,
                    "description": "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."
                  },
                  "timestamp_granularities": {
                    "type": "array",
                    "items": {
                      "type": "string",
                      "enum": [
                        "word",
                        "segment"
                      ]
                    },
                    "description": "The timestamp granularities to populate for this transcription."
                  }
                }
              }
            }
          }
        },
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "oneOf": [
                    {
                      "$ref": "#/components/schemas/TranscriptionResponse"
                    },
                    {
                      "$ref": "#/components/schemas/TranscriptionTextResponse"
                    },
                    {
                      "$ref": "#/components/schemas/TranscriptionSrtResponse"
                    },
                    {
                      "$ref": "#/components/schemas/TranscriptionVttResponse"
                    }
                  ]
                }
              }
            }
          },
          "400": {
            "$ref": "#/components/responses/BadRequest"
          },
          "401": {
            "$ref": "#/components/responses/Unauthorized"
          },
          "413": {
            "$ref": "#/components/responses/FileTooLarge"
          },
          "422": {
            "$ref": "#/components/responses/ValidationError"
          },
          "429": {
            "$ref": "#/components/responses/RateLimitExceeded"
          },
          "500": {
            "$ref": "#/components/responses/InternalServerError"
          }
        }
      }
    },
    "/v1/audio/translations": {
      "post": {
        "summary": "Create translation",
        "description": "Translates audio into English. The response will include the translated text\nand additional metadata such as confidence scores and timestamps.\n",
        "operationId": "createTranslation",
        "tags": [
          "Audio"
        ],
        "requestBody": {
          "required": true,
          "content": {
            "multipart/form-data": {
              "schema": {
                "type": "object",
                "required": [
                  "file"
                ],
                "properties": {
                  "file": {
                    "type": "string",
                    "format": "binary",
                    "description": "The audio file object (not file name) to translate, in one of these formats: \nflac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.\n"
                  },
                  "model": {
                    "type": "string",
                    "enum": [
                      "tiny",
                      "base",
                      "small",
                      "medium",
                      "large"
                    ],
                    "default": "base",
                    "description": "ID of the model to use. Only whisper-1 is currently available."
                  },
                  "prompt": {
                    "type": "string",
                    "description": "An optional text to guide the model's style or continue a previous audio segment.\nThe prompt should be in English.\n"
                  },
                  "response_format": {
                    "type": "string",
                    "enum": [
                      "json",
                      "text",
                      "srt",
                      "verbose_json",
                      "vtt"
                    ],
                    "default": "json",
                    "description": "The format of the transcript output."
                  },
                  "temperature": {
                    "type": "number",
                    "minimum": 0,
                    "maximum": 1,
                    "default": 0,
                    "description": "The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic."
                  },
                  "timestamp_granularities": {
                    "type": "array",
                    "items": {
                      "type": "string",
                      "enum": [
                        "word",
                        "segment"
                      ]
                    },
                    "description": "The timestamp granularities to populate for this translation."
                  }
                }
              }
            }
          }
        },
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "oneOf": [
                    {
                      "$ref": "#/components/schemas/TranscriptionResponse"
                    },
                    {
                      "$ref": "#/components/schemas/TranscriptionTextResponse"
                    },
                    {
                      "$ref": "#/components/schemas/TranscriptionSrtResponse"
                    },
                    {
                      "$ref": "#/components/schemas/TranscriptionVttResponse"
                    }
                  ]
                }
              }
            }
          },
          "400": {
            "$ref": "#/components/responses/BadRequest"
          },
          "401": {
            "$ref": "#/components/responses/Unauthorized"
          },
          "413": {
            "$ref": "#/components/responses/FileTooLarge"
          },
          "422": {
            "$ref": "#/components/responses/ValidationError"
          },
          "429": {
            "$ref": "#/components/responses/RateLimitExceeded"
          },
          "500": {
            "$ref": "#/components/responses/InternalServerError"
          }
        }
      }
    },
    "/v1/models": {
      "get": {
        "summary": "List models",
        "description": "Lists the currently available models, and provides basic information about each one such as the owner and availability.",
        "operationId": "listModels",
        "tags": [
          "Models"
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/ListModelsResponse"
                }
              }
            }
          },
          "401": {
            "$ref": "#/components/responses/Unauthorized"
          },
          "500": {
            "$ref": "#/components/responses/InternalServerError"
          }
        }
      }
    },
    "/v1/models/{model}": {
      "get": {
        "summary": "Retrieve model",
        "description": "Retrieves a model instance, providing basic information about the model such as the owner and permissioning.",
        "operationId": "retrieveModel",
        "tags": [
          "Models"
        ],
        "parameters": [
          {
            "name": "model",
            "in": "path",
            "required": true,
            "description": "The ID of the model to use for this request",
            "schema": {
              "type": "string",
              "enum": [
                "tiny",
                "base",
                "small",
                "medium",
                "large"
              ]
            }
          }
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/Model"
                }
              }
            }
          },
          "401": {
            "$ref": "#/components/responses/Unauthorized"
          },
          "404": {
            "$ref": "#/components/responses/NotFound"
          },
          "500": {
            "$ref": "#/components/responses/InternalServerError"
          }
        }
      }
    },
    "/v1/health": {
      "get": {
        "summary": "Health check",
        "description": "Check the health status of the API server",
        "operationId": "healthCheck",
        "tags": [
          "System"
        ],
        "responses": {
          "200": {
            "description": "OK",
            "content": {
              "application/json": {
                "schema": {
                  "$ref": "#/components/schemas/HealthResponse"
                }
              }
            }
          }
        }
      }
    },
    "/v1/websocket": {
      "get": {
        "summary": "WebSocket connection",
        "description": "Establishes a WebSocket connection for real-time audio transcription.\nSend audio data as binary frames and receive transcription results.\n",
        "operationId": "websocketConnection",
        "tags": [
          "Real-time"
        ],
        "parameters": [
          {
            "name": "model",
            "in": "query",
            "description": "The model to use for transcription",
            "schema": {
              "type": "string",
              "enum": [
                "tiny",
                "base",
                "small",
                "medium",
                "large"
              ],
              "default": "base"
            }
          },
          {
            "name": "language",
            "in": "query",
            "description": "The language of the input audio",
            "schema": {
              "type": "string",
              "pattern": "^[a-z]{2}$"
            }
          },
          {
            "name": "task",
            "in": "query",
            "description": "The task to perform",
            "schema": {
              "type": "string",
              "enum": [
                "transcribe",
                "translate"
              ],
              "default": "transcribe"
            }
          }
        ],
        "responses": {
          "101": {
            "description": "Switching Protocols",
            "headers": {
              "Upgrade": {
                "schema": {
                  "type": "string",
                  "example": "websocket"
                }
              },
              "Connection": {
                "schema": {
                  "type": "string",
                  "example": "Upgrade"
                }
              }
            }
          },
          "400": {
            "$ref": "#/components/responses/BadRequest"
          },
          "401": {
            "$ref": "#/components/responses/Unauthorized"
          }
        }
      }
    }
  },
  "components": {
    "securitySchemes": {
      "ApiKeyAuth": {
        "type": "apiKey",
        "in": "header",
        "name": "Authorization",
        "description": "API key authentication. Include your API key in the Authorization header.\nExample: `Authorization: Bearer your-api-key-here`\n"
      }
    },
    "schemas": {
      "TranscriptionResponse": {
        "type": "object",
        "properties": {
          "text": {
            "type": "string",
            "description": "The transcribed text"
          },
          "language": {
            "type": "string",
            "description": "The language of the input audio"
          },
          "duration": {
            "type": "number",
            "description": "The duration of the input audio in seconds"
          },
          "words": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Word"
            },
            "description": "Extracted words and their corresponding timestamps"
          },
          "segments": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Segment"
            },
            "description": "Segments of the transcribed text with timestamps"
          }
        },
        "required": [
          "text"
        ]
      },
      "TranscriptionTextResponse": {
        "type": "string",
        "description": "The transcribed text as plain text"
      },
      "TranscriptionSrtResponse": {
        "type": "string",
        "description": "The transcribed text in SRT subtitle format"
      },
      "TranscriptionVttResponse": {
        "type": "string",
        "description": "The transcribed text in VTT subtitle format"
      },
      "Word": {
        "type": "object",
        "properties": {
          "word": {
            "type": "string",
            "description": "The text content of the word"
          },
          "start": {
            "type": "number",
            "description": "Start time of the word in seconds"
          },
          "end": {
            "type": "number",
            "description": "End time of the word in seconds"
          },
          "probability": {
            "type": "number",
            "description": "Confidence score of the word (0-1)"
          }
        },
        "required": [
          "word",
          "start",
          "end"
        ]
      },
      "Segment": {
        "type": "object",
        "properties": {
          "id": {
            "type": "integer",
            "description": "Unique identifier for the segment"
          },
          "seek": {
            "type": "number",
            "description": "Seek offset of the segment in seconds"
          },
          "start": {
            "type": "number",
            "description": "Start time of the segment in seconds"
          },
          "end": {
            "type": "number",
            "description": "End time of the segment in seconds"
          },
          "text": {
            "type": "string",
            "description": "The text content of the segment"
          },
          "tokens": {
            "type": "array",
            "items": {
              "type": "integer"
            },
            "description": "Array of token IDs for the segment"
          },
          "temperature": {
            "type": "number",
            "description": "Temperature parameter used for generating this segment"
          },
          "avg_logprob": {
            "type": "number",
            "description": "Average log probability of the segment"
          },
          "compression_ratio": {
            "type": "number",
            "description": "Compression ratio of the segment"
          },
          "no_speech_prob": {
            "type": "number",
            "description": "Probability of no speech in this segment"
          },
          "words": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Word"
            },
            "description": "Words in this segment"
          }
        },
        "required": [
          "id",
          "seek",
          "start",
          "end",
          "text"
        ]
      },
      "Model": {
        "type": "object",
        "properties": {
          "id": {
            "type": "string",
            "description": "The model identifier"
          },
          "object": {
            "type": "string",
            "enum": [
              "model"
            ],
            "description": "The object type, which is always \"model\""
          },
          "created": {
            "type": "integer",
            "description": "The Unix timestamp (in seconds) when the model was created"
          },
          "owned_by": {
            "type": "string",
            "description": "The organization that owns the model"
          },
          "permission": {
            "type": "array",
            "items": {
              "type": "object"
            },
            "description": "The permissions associated with the model"
          },
          "root": {
            "type": "string",
            "description": "The root of the model"
          },
          "parent": {
            "type": "string",
            "description": "The parent of the model"
          }
        },
        "required": [
          "id",
          "object",
          "created",
          "owned_by"
        ]
      },
      "ListModelsResponse": {
        "type": "object",
        "properties": {
          "object": {
            "type": "string",
            "enum": [
              "list"
            ],
            "description": "The object type, which is always \"list\""
          },
          "data": {
            "type": "array",
            "items": {
              "$ref": "#/components/schemas/Model"
            },
            "description": "The list of models"
          }
        },
        "required": [
          "object",
          "data"
        ]
      },
      "HealthResponse": {
        "type": "object",
        "properties": {
          "status": {
            "type": "string",
            "enum": [
              "healthy",
              "unhealthy"
            ],
            "description": "The health status of the service"
          },
          "service": {
            "type": "string",
            "description": "The name of the service"
          },
          "version": {
            "type": "string",
            "description": "The version of the service"
          },
          "timestamp": {
            "type": "string",
            "format": "date-time",
            "description": "The current timestamp"
          },
          "uptime": {
            "type": "number",
            "description": "The uptime in seconds"
          }
        },
        "required": [
          "status",
          "service"
        ]
      },
      "Error": {
        "type": "object",
        "properties": {
          "error": {
            "type": "object",
            "properties": {
              "message": {
                "type": "string",
                "description": "A human-readable error message"
              },
              "type": {
                "type": "string",
                "description": "The type of error"
              },
              "code": {
                "type": "string",
                "description": "The error code"
              },
              "param": {
                "type": "string",
                "description": "The parameter that caused the error"
              }
            }
          }
        },
        "required": [
          "error"
        ]
      }
    },
    "responses": {
      "BadRequest": {
        "description": "Bad Request",
        "content": {
          "application/json": {
            "schema": {
              "$ref": "#/components/schemas/Error"
            },
            "example": {
              "error": {
                "message": "Invalid request parameters",
                "type": "invalid_request_error",
                "code": "invalid_parameters"
              }
            }
          }
        }
      },
      "Unauthorized": {
        "description": "Unauthorized",
        "content": {
          "application/json": {
            "schema": {
              "$ref": "#/components/schemas/Error"
            },
            "example": {
              "error": {
                "message": "Invalid API key",
                "type": "authentication_error",
                "code": "invalid_api_key"
              }
            }
          }
        }
      },
      "FileTooLarge": {
        "description": "File Too Large",
        "content": {
          "application/json": {
            "schema": {
              "$ref": "#/components/schemas/Error"
            },
            "example": {
              "error": {
                "message": "File size exceeds maximum allowed size",
                "type": "invalid_request_error",
                "code": "file_too_large"
              }
            }
          }
        }
      },
      "ValidationError": {
        "description": "Validation Error",
        "content": {
          "application/json": {
            "schema": {
              "$ref": "#/components/schemas/Error"
            },
            "example": {
              "error": {
                "message": "Invalid file format",
                "type": "invalid_request_error",
                "code": "invalid_file_format"
              }
            }
          }
        }
      },
      "RateLimitExceeded": {
        "description": "Rate Limit Exceeded",
        "content": {
          "application/json": {
            "schema": {
              "$ref": "#/components/schemas/Error"
            },
            "example": {
              "error": {
                "message": "Rate limit exceeded",
                "type": "rate_limit_error",
                "code": "rate_limit_exceeded"
              }
            }
          }
        }
      },
      "InternalServerError": {
        "description": "Internal Server Error",
        "content": {
          "application/json": {
            "schema": {
              "$ref": "#/components/schemas/Error"
            },
            "example": {
              "error": {
                "message": "An internal server error occurred",
                "type": "server_error",
                "code": "internal_error"
              }
            }
          }
        }
      },
      "NotFound": {
        "description": "Not Found",
        "content": {
          "application/json": {
            "schema": {
              "$ref": "#/components/schemas/Error"
            },
            "example": {
              "error": {
                "message": "Model not found",
                "type": "invalid_request_error",
                "code": "model_not_found"
              }
            }
          }
        }
      }
    }
  },
  "tags": [
    {
      "name": "Audio",
      "description": "Audio transcription and translation operations"
    },
    {
      "name": "Models",
      "description": "Model management operations"
    },
    {
      "name": "System",
      "description": "System health and status operations"
    },
    {
      "name": "Real-time",
      "description": "Real-time audio processing via WebSocket"
    }
  ]
}