api/routers/queue_management.py
2025-11-14 14:47:19 +00:00

314 lines
10 KiB
Python

"""
Queue Management API
Provides endpoints for monitoring and managing the document processing queue.
"""
from fastapi import APIRouter, HTTPException, Depends
from typing import Dict, Any, List, Optional
from modules.queue_system import get_queue, TaskPriority, ServiceType
from modules.task_processors import get_processor
from modules.auth.supabase_bearer import SupabaseBearer
import os
router = APIRouter()
auth = SupabaseBearer()
@router.get("/queue/stats")
def get_queue_stats(payload: Dict[str, Any] = Depends(auth)):
    """Return the full queue statistics snapshot for authenticated callers."""
    return get_queue().get_queue_stats()
@router.get("/queue/health")
def queue_health():
    """Check queue health status (unauthenticated monitoring probe)."""
    try:
        queue = get_queue()
        stats = queue.get_queue_stats()

        # Headline numbers driving the health verdict.
        processing = stats['total_processing']
        queued = sum(stats['queues'].values())
        dead_letters = stats['dead_letter_count']

        issues = []
        # A growing dead-letter list indicates repeatedly failing tasks.
        if dead_letters > 10:
            issues.append(f"High dead letter count: {dead_letters}")
        # Work is waiting but nothing is consuming it.
        if processing == 0 and queued > 0:
            issues.append("Tasks queued but no workers processing")
        status = "degraded" if issues else "healthy"

        # Verify Redis is reachable before reporting success.
        queue.redis_client.ping()

        return {
            "status": status,
            "total_processing": processing,
            "total_queued": queued,
            "dead_letter_count": dead_letters,
            "issues": issues,
            "timestamp": queue.redis_client.time()[0]
        }
    except Exception as e:
        # Deliberate catch-all: a health endpoint reports failure instead of
        # raising, so monitors always get a response.
        return {
            "status": "unhealthy",
            "error": str(e),
            "timestamp": None
        }
@router.post("/queue/workers/start")
def start_workers(
    worker_count: int = 1,
    services: Optional[List[str]] = None,
    payload: Dict[str, Any] = Depends(auth)
):
    """Start queue workers (admin only).

    Args:
        worker_count: Number of worker loops to launch.
        services: Optional list of service names (ServiceType values) the
            workers should handle; defaults to every service.
        payload: Decoded auth token; must carry role == 'service_role'.

    Raises:
        HTTPException: 403 for non-admin callers, 400 for an unknown service.
    """
    # Idiomatic admin check (was `not x == y`).
    if payload.get('role') != 'service_role':  # Admin only
        raise HTTPException(status_code=403, detail="Admin access required")
    processor = get_processor()

    # Map service names onto ServiceType enums, rejecting unknown values.
    if services:
        service_enums = []
        for service_name in services:
            try:
                service_enums.append(ServiceType(service_name))
            except ValueError:
                raise HTTPException(status_code=400, detail=f"Invalid service: {service_name}")
    else:
        service_enums = list(ServiceType)

    worker_ids = [processor.start_worker(services=service_enums) for _ in range(worker_count)]
    return {
        "message": f"Started {worker_count} workers",
        "worker_ids": worker_ids,
        "services": [s.value for s in service_enums]
    }
@router.post("/queue/workers/stop")
def stop_workers(timeout: int = 30, payload: Dict[str, Any] = Depends(auth)):
    """Stop all queue workers (admin only).

    Args:
        timeout: Passed through to processor.shutdown(); presumably the
            grace period in seconds — confirm against the processor API.
        payload: Decoded auth token; must carry role == 'service_role'.

    Raises:
        HTTPException: 403 for non-admin callers.
    """
    # Idiomatic admin check (was `not x == y`).
    if payload.get('role') != 'service_role':  # Admin only
        raise HTTPException(status_code=403, detail="Admin access required")
    get_processor().shutdown(timeout=timeout)
    return {"message": "Workers shutdown initiated"}
@router.get("/queue/tasks/{task_id}")
def get_task_status(task_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return the raw Redis hash for one task, or 404 if it does not exist."""
    queue = get_queue()
    # An empty hash from HGETALL means the task key is absent.
    task_data = queue.redis_client.hgetall(queue._get_task_key(task_id))
    if not task_data:
        raise HTTPException(status_code=404, detail="Task not found")
    return task_data
@router.get("/queue/tasks/by-file/{file_id}")
def list_tasks_by_file(file_id: str, limit: int = 200, payload: Dict[str, Any] = Depends(auth)):
    """List recent queue tasks for a given file_id (best-effort scan)."""
    import json

    queue = get_queue()
    tasks: List[Dict[str, Any]] = []
    cap = max(1, int(limit))  # loop-invariant cap, hoisted out of the scan
    try:
        for key in queue.redis_client.scan_iter(match="task:*"):
            data = queue.redis_client.hgetall(key)
            if not data or data.get('file_id') != file_id:
                continue
            # The payload field is a JSON string; tolerate bad/missing JSON.
            try:
                payload_obj = json.loads(data.get('payload') or '{}')
            except Exception:
                payload_obj = {}
            tasks.append({
                'id': data.get('id') or key.split(':', 1)[-1],
                'service': data.get('service'),
                'task_type': data.get('task_type'),
                'status': data.get('status', 'pending'),
                'priority': data.get('priority'),
                'created_at': float(data.get('created_at') or 0),
                'scheduled_at': float(data.get('scheduled_at') or 0),
                'depends_on': payload_obj.get('depends_on') or []
            })
            if len(tasks) >= cap:
                break
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed scanning tasks: {e}")
    # Newest first.
    tasks.sort(key=lambda t: t.get('created_at', 0), reverse=True)
    return {
        'file_id': file_id,
        'count': len(tasks),
        'tasks': tasks
    }
@router.get("/queue/tasks/{task_id}/dependencies")
def get_task_dependencies(task_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Inspect a task's dependency state (direct depends_on only)."""
    import json

    queue = get_queue()
    task_data = queue.redis_client.hgetall(queue._get_task_key(task_id))
    if not task_data:
        raise HTTPException(status_code=404, detail="Task not found")

    # The depends_on list lives inside the JSON-encoded payload field.
    try:
        payload_obj = json.loads(task_data.get('payload') or '{}')
    except Exception:
        payload_obj = {}
    deps = payload_obj.get('depends_on') or []
    if not isinstance(deps, list):
        deps = []

    details: List[Dict[str, Any]] = []
    missing: List[str] = []
    all_completed = True
    for dep_id in deps:
        if not dep_id:
            continue
        dep_data = queue.redis_client.hgetall(queue._get_task_key(dep_id))
        if not dep_data:
            # Dependency record expired or never existed.
            missing.append(dep_id)
            all_completed = False
            details.append({'task_id': dep_id, 'status': 'missing'})
            continue
        dep_status = dep_data.get('status', 'pending')
        if dep_status != 'completed':
            all_completed = False
        details.append({
            'task_id': dep_id,
            'status': dep_status,
            'service': dep_data.get('service'),
            'task_type': dep_data.get('task_type'),
            'created_at': dep_data.get('created_at'),
            'scheduled_at': dep_data.get('scheduled_at')
        })
    return {
        'task_id': task_id,
        'depends_on': deps,
        'all_completed': all_completed,
        'missing': missing,
        'details': details
    }
@router.delete("/queue/dead-letter/{task_id}")
def remove_dead_task(task_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Permanently remove a task from the dead letter queue (admin only).

    Raises:
        HTTPException: 403 for non-admin callers, 404 if the task id is not
            present in the dead-letter list.
    """
    # Idiomatic admin check (was `not x == y`).
    if payload.get('role') != 'service_role':  # Admin only
        raise HTTPException(status_code=403, detail="Admin access required")
    queue = get_queue()
    # Remove a single occurrence from the dead-letter list.
    removed = queue.redis_client.lrem(queue.dead_letter_key, 1, task_id)
    if removed == 0:
        raise HTTPException(status_code=404, detail="Task not found in dead letter queue")
    # Drop the task's backing hash so no orphaned data remains.
    queue.redis_client.delete(queue._get_task_key(task_id))
    return {"message": f"Removed task {task_id} from dead letter queue"}
@router.post("/queue/dead-letter/{task_id}/retry")
def retry_dead_task(task_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Requeue a dead-lettered task for another processing attempt (admin only).

    Resets the attempt counter, marks the task pending, and pushes it back
    onto its original priority queue.

    Raises:
        HTTPException: 403 for non-admin callers; 404 if the task hash is
            gone or the id is not in the dead-letter list.
    """
    # Idiomatic admin check (was `not x == y`).
    if payload.get('role') != 'service_role':  # Admin only
        raise HTTPException(status_code=403, detail="Admin access required")
    queue = get_queue()

    # The task hash must still exist to recover priority and payload.
    task_key = queue._get_task_key(task_id)
    task_data = queue.redis_client.hgetall(task_key)
    if not task_data:
        raise HTTPException(status_code=404, detail="Task not found")

    # Remove exactly one occurrence from the dead-letter list.
    removed = queue.redis_client.lrem(queue.dead_letter_key, 1, task_id)
    if removed == 0:
        raise HTTPException(status_code=404, detail="Task not found in dead letter queue")

    # Reset retry bookkeeping; scheduled_at uses Redis server time (seconds).
    queue.redis_client.hset(
        task_key,
        mapping={
            'attempts': 0,
            'status': 'pending',
            'scheduled_at': queue.redis_client.time()[0]
        }
    )

    # Re-queue on the task's original priority list.
    # NOTE(review): assumes the stored 'priority' field, as read back from
    # Redis, is a valid TaskPriority constructor argument — confirm the
    # value type (str vs int) matches the enum definition.
    priority = TaskPriority(task_data['priority'])
    queue.redis_client.lpush(queue.queue_keys[priority], task_id)
    return {"message": f"Task {task_id} requeued for retry"}
@router.get("/queue/metrics")
def get_queue_metrics(hours: int = 1, payload: Dict[str, Any] = Depends(auth)):
    """Aggregate per-minute queue counters over the last `hours` hours.

    Simplified metrics: scans counter keys shaped like
    ``metrics:action:service:priority:timestamp_minute`` and sums the counts
    per (action, service, priority) combination inside the time window.

    Args:
        hours: Size of the look-back window.

    Returns:
        Dict with the window size and a flat ``{action_service_priority: count}``
        mapping.
    """
    queue = get_queue()
    # Use Redis server time so the window matches the writers' clock.
    current_time = queue.redis_client.time()[0]
    # Hoisted loop-invariant: earliest minute bucket inside the window.
    cutoff_minute = (current_time - hours * 3600) // 60

    metrics: Dict[str, int] = {}
    pattern = f"{queue.metrics_key}:*"
    for key in queue.redis_client.scan_iter(match=pattern):
        # Key format: metrics:action:service:priority:timestamp_minute.
        # NOTE(review): parsing assumes metrics_key itself contains no ':'
        # and that the client decodes responses to str — confirm.
        parts = key.split(':')
        if len(parts) < 5:
            continue
        action, service, priority = parts[1], parts[2], parts[3]
        try:
            timestamp_minute = int(parts[4])
        except ValueError:
            # Skip malformed keys instead of failing the whole request
            # (the original int() would raise and return a 500).
            continue
        if timestamp_minute >= cutoff_minute:
            count = int(queue.redis_client.get(key) or 0)
            metric_key = f"{action}_{service}_{priority}"
            metrics[metric_key] = metrics.get(metric_key, 0) + count
    return {
        "time_range_hours": hours,
        "metrics": metrics
    }