# api/modules/transcription/llm_client.py

"""Pluggable LLM client for transcription summaries.

Phase 3: Full implementation with Anthropic, OpenAI, Ollama, OpenRouter, and Google providers.
"""

import os
import json
import logging
from typing import Optional, Dict, Any

import aiohttp

from modules.transcription.prompts import PROMPT_TEMPLATES

logger = logging.getLogger(__name__)

# Total time budget, in seconds, applied to each provider HTTP request.
LLM_TIMEOUT = 120

# Model name used for a provider when the caller does not supply one.
DEFAULT_MODELS: Dict[str, str] = {
    "anthropic": "claude-sonnet-4-6",
    "openai": "gpt-4o",
    "ollama": "llama3",
    "openrouter": "anthropic/claude-sonnet-4-6",
    "google": "gemini-2.0-flash",
}


class LLMCallResult:
    """Outcome of one LLM request: the generated text plus token accounting.

    Attributes:
        content: Text produced by the model.
        input_tokens: Token count for the prompt side, or None if not given.
        output_tokens: Token count for the generated side, or None if not given.
        raw_response: The decoded provider response, or None if not given.
    """

    def __init__(
        self,
        content: str,
        input_tokens: Optional[int] = None,
        output_tokens: Optional[int] = None,
        raw_response: Optional[Dict[str, Any]] = None,
    ):
        self.raw_response = raw_response
        self.output_tokens = output_tokens
        self.input_tokens = input_tokens
        self.content = content

    def to_dict(self) -> Dict[str, Any]:
        """Return content and token counts as a dict; raw_response is not included."""
        exported = ("content", "input_tokens", "output_tokens")
        return {field: getattr(self, field) for field in exported}


async def call_llm(
    provider: str,
    model: Optional[str] = None,
    api_key: str = "",
    system_prompt: str = "",
    user_message: str = "",
) -> LLMCallResult:
    """Call an LLM to generate a summary.

    Normalizes the provider name, resolves the model, and routes the request
    to the matching provider implementation.

    Args:
        provider: 'anthropic', 'openai', 'ollama', 'openrouter', 'google'
            (case and surrounding whitespace are ignored)
        model: Model name; falls back to the provider default when None,
            empty, or whitespace-only
        api_key: API key forwarded to the provider call
        system_prompt: System prompt text forwarded to the provider call
        user_message: User message content forwarded to the provider call

    Returns:
        LLMCallResult with generated summary text and token counts

    Raises:
        ValueError: If provider is not supported
        Exception: If the API call fails
    """
    provider = provider.lower().strip()

    dispatch = {
        "anthropic": call_anthropic,
        "openai": call_openai,
        "ollama": call_ollama,
        "openrouter": call_openrouter,
        "google": call_google,
    }

    handler = dispatch.get(provider)
    if handler is None:
        raise ValueError(
            f"Unsupported provider: {provider}. "
            f"Supported: {', '.join(dispatch.keys())}"
        )

    # A blank model name (e.g. an empty form field) is treated like "not
    # provided" so the provider default is used instead of sending "" upstream.
    if model is None or not model.strip():
        model = DEFAULT_MODELS.get(provider, "")

    logger.info("Calling LLM provider=%s model=%s", provider, model)
    result = await handler(
        api_key=api_key, model=model,
        system_prompt=system_prompt, user_message=user_message,
    )
    logger.info(
        "LLM call complete: provider=%s tokens_in=%s tokens_out=%s",
        provider, result.input_tokens, result.output_tokens,
    )
    return result


# ---------------------------------------------------------------------------
# Provider implementations
# ---------------------------------------------------------------------------

async def call_anthropic(
    api_key: str, model: str, system_prompt: str, user_message: str,
) -> LLMCallResult:
    """Call the Anthropic Messages API (POST /v1/messages).

    Args:
        api_key: Sent in the ``x-api-key`` header.
        model: Model name placed in the request payload.
        system_prompt: Sent as the top-level ``system`` field.
        user_message: Sent as the single ``user`` message.

    Returns:
        LLMCallResult whose content is every ``text`` block of the response
        joined with newlines, with token counts taken from ``usage``.

    Raises:
        Exception: If the API responds with a non-200 status.
    """
    url = "https://api.anthropic.com/v1/messages"
    headers = {
        "x-api-key": api_key,
        "anthropic-version": "2023-06-01",
        "content-type": "application/json",
    }
    payload = {
        "model": model,
        "max_tokens": 4096,
        "system": system_prompt,
        "messages": [{"role": "user", "content": user_message}],
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=payload,
                                timeout=aiohttp.ClientTimeout(total=LLM_TIMEOUT)) as resp:
            if resp.status != 200:
                body = await resp.text()
                logger.error("Anthropic API error (%s): %s", resp.status, body)
                raise Exception(f"Anthropic API error {resp.status}: {body}")

            data = await resp.json()

    # Keep only the "text" content blocks; other block types are skipped.
    content = "\n".join(
        block["text"]
        for block in data.get("content", [])
        if block.get("type") == "text"
    )

    # Token counts are read straight from the response's usage object;
    # a missing or null usage yields None for both counts.
    usage = data.get("usage") or {}

    return LLMCallResult(
        content=content,
        input_tokens=usage.get("input_tokens"),
        output_tokens=usage.get("output_tokens"),
        raw_response=data,
    )


async def call_openai(
    api_key: str, model: str, system_prompt: str, user_message: str,
) -> LLMCallResult:
    """Call the OpenAI Chat Completions API.

    Args:
        api_key: Sent as a Bearer token in the ``Authorization`` header.
        model: Model name placed in the request payload.
        system_prompt: Sent as the ``system`` message.
        user_message: Sent as the ``user`` message.

    Returns:
        LLMCallResult with the first choice's message content (an empty string
        when the response has no choices or the content is null) and the
        ``prompt_tokens`` / ``completion_tokens`` counts from ``usage``.

    Raises:
        Exception: If the API responds with a non-200 status.
    """
    url = "https://api.openai.com/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        "max_tokens": 4096,
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=payload,
                                timeout=aiohttp.ClientTimeout(total=LLM_TIMEOUT)) as resp:
            if resp.status != 200:
                body = await resp.text()
                logger.error("OpenAI API error (%s): %s", resp.status, body)
                raise Exception(f"OpenAI API error {resp.status}: {body}")

            data = await resp.json()

    # `or` fallbacks cover a missing key, an explicit null, and an empty
    # "choices" list, which would otherwise raise IndexError on [0].
    choices = data.get("choices") or [{}]
    message = choices[0].get("message") or {}
    # content may be null in the response; normalize to "" so
    # LLMCallResult.content is always a string.
    content = message.get("content") or ""

    usage = data.get("usage") or {}
    return LLMCallResult(
        content=content,
        input_tokens=usage.get("prompt_tokens"),
        output_tokens=usage.get("completion_tokens"),
        raw_response=data,
    )


async def call_ollama(
    api_key: str, model: str, system_prompt: str, user_message: str,
) -> LLMCallResult:
    """Call an Ollama instance through its ``/api/generate`` endpoint.

    The base URL comes from the ``OLLAMA_URL`` environment variable and
    defaults to ``http://localhost:11434``.

    Args:
        api_key: Optional; when non-empty it is sent as a Bearer token.
        model: Model name placed in the request payload.
        system_prompt: Prepended to the user message to form a single prompt.
        user_message: Appended after the system prompt, separated by a blank line.

    Returns:
        LLMCallResult with the ``response`` text and the ``prompt_eval_count`` /
        ``eval_count`` values as input/output token counts (None when absent).

    Raises:
        Exception: If the API responds with a non-200 status.
    """
    # Strip trailing slashes so a configured "http://host:11434/" does not
    # produce a "//api/generate" path.
    ollama_url = os.getenv("OLLAMA_URL", "http://localhost:11434").rstrip("/")
    url = f"{ollama_url}/api/generate"

    # The request carries one prompt string: system instructions first,
    # then the user message.
    full_prompt = f"{system_prompt}\n\n{user_message}"

    payload = {
        "model": model,
        "prompt": full_prompt,
        "stream": False,
    }

    headers = {"Content-Type": "application/json"}
    # Ollama may not need an API key; include if set
    if api_key:
        headers["Authorization"] = f"Bearer {api_key}"

    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=payload,
                                timeout=aiohttp.ClientTimeout(total=LLM_TIMEOUT)) as resp:
            if resp.status != 200:
                body = await resp.text()
                logger.error("Ollama API error (%s): %s", resp.status, body)
                raise Exception(f"Ollama API error {resp.status}: {body}")

            data = await resp.json()

    return LLMCallResult(
        content=data.get("response", ""),
        input_tokens=data.get("prompt_eval_count"),
        output_tokens=data.get("eval_count"),
        raw_response=data,
    )


async def call_openrouter(
    api_key: str, model: str, system_prompt: str, user_message: str,
) -> LLMCallResult:
    """Call the OpenRouter chat completions API (OpenAI-compatible payload).

    The ``HTTP-Referer`` header is taken from the ``APP_URL`` environment
    variable, with a placeholder default.

    Args:
        api_key: Sent as a Bearer token in the ``Authorization`` header.
        model: Model name placed in the request payload.
        system_prompt: Sent as the ``system`` message.
        user_message: Sent as the ``user`` message.

    Returns:
        LLMCallResult with the first choice's message content (an empty string
        when the response has no choices or the content is null) and the
        ``prompt_tokens`` / ``completion_tokens`` counts from ``usage``.

    Raises:
        Exception: If the API responds with a non-200 status.
    """
    url = "https://openrouter.ai/api/v1/chat/completions"
    headers = {
        "Authorization": f"Bearer {api_key}",
        "Content-Type": "application/json",
        "HTTP-Referer": os.getenv("APP_URL", "https://classroom-copilot.example.com"),
        "X-Title": "Classroom Copilot",
    }
    payload = {
        "model": model,
        "messages": [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_message},
        ],
        "max_tokens": 4096,
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=payload,
                                timeout=aiohttp.ClientTimeout(total=LLM_TIMEOUT)) as resp:
            if resp.status != 200:
                body = await resp.text()
                logger.error("OpenRouter API error (%s): %s", resp.status, body)
                raise Exception(f"OpenRouter API error {resp.status}: {body}")

            data = await resp.json()

    # `or` fallbacks cover a missing key, an explicit null, and an empty
    # "choices" list, which would otherwise raise IndexError on [0].
    choices = data.get("choices") or [{}]
    message = choices[0].get("message") or {}
    # Normalize a null content to "" so LLMCallResult.content is always a string.
    content = message.get("content") or ""

    usage = data.get("usage") or {}
    return LLMCallResult(
        content=content,
        input_tokens=usage.get("prompt_tokens"),
        output_tokens=usage.get("completion_tokens"),
        raw_response=data,
    )


async def call_google(
    api_key: str, model: str, system_prompt: str, user_message: str,
) -> LLMCallResult:
    """Call the Google Gemini API (``models/{model}:generateContent``).

    Args:
        api_key: Sent in the ``x-goog-api-key`` request header.
        model: Model name interpolated into the request URL.
        system_prompt: Sent as ``system_instruction``.
        user_message: Sent as the single ``user`` content entry.

    Returns:
        LLMCallResult whose content is the text of every part of the first
        candidate joined with newlines (empty string when there are no
        candidates), with ``promptTokenCount`` / ``candidatesTokenCount``
        from ``usageMetadata`` as token counts.

    Raises:
        Exception: If the API responds with a non-200 status.
    """
    url = f"https://generativelanguage.googleapis.com/v1beta/models/{model}:generateContent"

    payload = {
        "contents": [
            {
                "role": "user",
                "parts": [{"text": user_message}],
            }
        ],
        "system_instruction": {
            "parts": [{"text": system_prompt}],
        },
        "generationConfig": {
            "maxOutputTokens": 4096,
        },
    }

    # The key travels in a header rather than the URL query string so it does
    # not end up in URLs recorded by logs, proxies, or error messages.
    headers = {
        "Content-Type": "application/json",
        "x-goog-api-key": api_key,
    }

    async with aiohttp.ClientSession() as session:
        async with session.post(url, headers=headers, json=payload,
                                timeout=aiohttp.ClientTimeout(total=LLM_TIMEOUT)) as resp:
            if resp.status != 200:
                body = await resp.text()
                logger.error("Google Gemini API error (%s): %s", resp.status, body)
                raise Exception(f"Google Gemini API error {resp.status}: {body}")

            data = await resp.json()

    # Extract text from the first candidate; `or` fallbacks tolerate a
    # candidate whose content or parts are missing or null.
    candidates = data.get("candidates") or []
    if candidates:
        first_content = candidates[0].get("content") or {}
        content_parts = first_content.get("parts") or []
        content = "\n".join(p.get("text", "") for p in content_parts)
    else:
        content = ""

    # Token usage from usageMetadata
    usage = data.get("usageMetadata") or {}
    return LLMCallResult(
        content=content,
        input_tokens=usage.get("promptTokenCount"),
        output_tokens=usage.get("candidatesTokenCount"),
        raw_response=data,
    )


# ---------------------------------------------------------------------------
# Helper: build prompt from template
# ---------------------------------------------------------------------------

def build_prompt(summary_type: str, transcript: str) -> tuple[str, str]:
    """Render a prompt template and divide it into system and user parts.

    The template for ``summary_type`` is looked up in PROMPT_TEMPLATES (the
    'full_lesson' template is used for unknown types) and its ``{transcript}``
    placeholder is filled in. The rendered text is then cut at the first
    blank-line-preceded "Transcript:" heading: the text before it becomes the
    system prompt, and the heading plus everything after it becomes the user
    message. If the heading is absent, a generic system prompt is used and the
    entire rendered text is the user message.

    Args:
        summary_type: Key into PROMPT_TEMPLATES, e.g. 'full_lesson',
                      'questions_asked', 'teaching_style', 'key_moments',
                      'segment'
        transcript: Transcript text substituted into the template

    Returns:
        (system_prompt, user_message) tuple
    """
    fallback_template = PROMPT_TEMPLATES["full_lesson"]
    chosen_template = PROMPT_TEMPLATES.get(summary_type, fallback_template)
    rendered = chosen_template.format(transcript=transcript)

    # partition() cuts at the first occurrence; an empty separator slot
    # means the heading was not found.
    before, found_marker, after = rendered.partition("\n\nTranscript:\n")
    if not found_marker:
        return "You are an expert educational analyst.", rendered

    return before, "Transcript:\n" + after