chore: add .env and *.bak to .gitignore, remove archive/ folder
This commit is contained in:
parent
c92da048fd
commit
2436a00cac
3
.gitignore
vendored
3
.gitignore
vendored
@ -1,8 +1,11 @@
|
||||
__pycache__
|
||||
.pytest_cache
|
||||
|
||||
.env
|
||||
|
||||
.DS_Store
|
||||
|
||||
.archive/*
|
||||
|
||||
data/logs/*
|
||||
*.bak
|
||||
|
||||
@ -1,66 +0,0 @@
|
||||
# Auto-Processing Code Archive
|
||||
|
||||
This directory contains the complex auto-processing system that was previously used for automatic document processing after file upload.
|
||||
|
||||
## Archived Components
|
||||
|
||||
### Core Processing Files
|
||||
- `files_with_auto_processing.py` - Original files.py router with automatic processing
|
||||
- `pipeline_controller.py` - Complex multi-phase pipeline orchestration
|
||||
- `task_processors.py` - Document processing task handlers
|
||||
|
||||
### Advanced Queue Management (Created but not deployed)
|
||||
- `memory_aware_queue.py` - Memory-based intelligent queue management
|
||||
- `enhanced_upload_handler.py` - Advanced upload handler with queuing
|
||||
- `enhanced_upload.py` - API endpoints for advanced upload system
|
||||
|
||||
## What This System Did
|
||||
|
||||
### Automatic Processing Pipeline
|
||||
1. **File Upload** → Immediate processing trigger
|
||||
2. **PDF Conversion** (synchronous, blocking)
|
||||
3. **Phase 1**: Structure discovery (Tika, Page Images, Document Analysis, Split Map)
|
||||
4. **Phase 2**: Docling processing (NO_OCR → OCR → VLM pipelines)
|
||||
5. **Complex Dependencies**: Phase coordination, task sequencing
|
||||
6. **Redis Queue Management**: Service limits, rate limits, dependency tracking
|
||||
|
||||
### Features
|
||||
- Multi-phase processing pipelines
|
||||
- Complex task dependency management
|
||||
- Memory-aware queue limits
|
||||
- Multi-user capacity management
|
||||
- Real-time processing status
|
||||
- WebSocket status updates
|
||||
- Service-specific resource limits
|
||||
- Task recovery on restart
|
||||
|
||||
## Why Archived
|
||||
|
||||
The system was overly complex for the current needs:
|
||||
- **Complexity**: Multi-phase pipelines with complex dependencies
|
||||
- **Blocking Operations**: Synchronous PDF conversion causing timeouts
|
||||
- **Resource Management**: Over-engineered for single-user scenarios
|
||||
- **User Experience**: Users had to wait for processing to complete
|
||||
|
||||
## New Simplified Approach
|
||||
|
||||
The new system focuses on:
|
||||
- **Simple Upload**: Just store files and create database records
|
||||
- **No Auto-Processing**: Users manually trigger processing when needed
|
||||
- **Directory Support**: Upload entire folders with manifest tracking
|
||||
- **Immediate Response**: Users get instant confirmation without waiting
|
||||
|
||||
## If You Need to Restore
|
||||
|
||||
To restore the auto-processing functionality:
|
||||
1. Copy `files_with_auto_processing.py` back to `routers/database/files/files.py`
|
||||
2. Ensure `pipeline_controller.py` and `task_processors.py` are in `modules/`
|
||||
3. Update imports and dependencies
|
||||
4. Re-enable background processing in upload handlers
|
||||
|
||||
## Migration Notes
|
||||
|
||||
The database schema and Redis structure remain compatible. The new simplified system can coexist with the archived processing logic if needed.
|
||||
|
||||
Date Archived: $(date)
|
||||
Reason: Simplification for directory upload implementation
|
||||
@ -1,142 +0,0 @@
|
||||
"""
|
||||
Enhanced Upload Router with Memory-Aware Queuing
|
||||
===============================================
|
||||
|
||||
Provides intelligent upload endpoints with capacity checking,
|
||||
queue management, and real-time status updates.
|
||||
|
||||
Endpoints:
|
||||
- POST /upload/check-capacity - Pre-check if upload is possible
|
||||
- POST /upload/with-queue - Upload with intelligent queuing
|
||||
- GET /upload/status/{file_id} - Get processing status
|
||||
- GET /upload/queue-status - Get overall queue status
|
||||
- WebSocket /ws/upload-status - Real-time status updates
|
||||
"""
|
||||
|
||||
import os
|
||||
import uuid
|
||||
import json
|
||||
import logging
|
||||
from typing import Dict, List, Optional, Any
|
||||
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, WebSocket, WebSocketDisconnect
|
||||
from fastapi.responses import JSONResponse
|
||||
|
||||
from modules.auth.supabase_bearer import SupabaseBearer
|
||||
from modules.enhanced_upload_handler import get_upload_handler
|
||||
from modules.memory_aware_queue import get_memory_queue
|
||||
from modules.logger_tool import initialise_logger
|
||||
|
||||
router = APIRouter()
|
||||
auth = SupabaseBearer()
|
||||
|
||||
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
|
||||
|
||||
# WebSocket connection manager for real-time updates
|
||||
class ConnectionManager:
|
||||
def __init__(self):
|
||||
self.active_connections: Dict[str, List[WebSocket]] = {}
|
||||
|
||||
async def connect(self, websocket: WebSocket, file_id: str):
|
||||
await websocket.accept()
|
||||
if file_id not in self.active_connections:
|
||||
self.active_connections[file_id] = []
|
||||
self.active_connections[file_id].append(websocket)
|
||||
|
||||
def disconnect(self, websocket: WebSocket, file_id: str):
|
||||
if file_id in self.active_connections:
|
||||
self.active_connections[file_id].remove(websocket)
|
||||
if not self.active_connections[file_id]:
|
||||
del self.active_connections[file_id]
|
||||
|
||||
async def broadcast_to_file(self, file_id: str, message: dict):
|
||||
if file_id in self.active_connections:
|
||||
for connection in self.active_connections[file_id].copy():
|
||||
try:
|
||||
await connection.send_json(message)
|
||||
except:
|
||||
self.active_connections[file_id].remove(connection)
|
||||
|
||||
manager = ConnectionManager()
|
||||
|
||||
@router.post("/upload/check-capacity")
|
||||
async def check_upload_capacity(
|
||||
file_size: int = Form(...),
|
||||
mime_type: str = Form(...),
|
||||
payload: Dict[str, Any] = Depends(auth)
|
||||
):
|
||||
"""
|
||||
Check if user can upload a file of given size and type.
|
||||
|
||||
Returns capacity information and recommendations.
|
||||
"""
|
||||
try:
|
||||
user_id = payload.get('sub') or payload.get('user_id', 'anonymous')
|
||||
|
||||
# Determine environment
|
||||
environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
|
||||
upload_handler = get_upload_handler(environment)
|
||||
|
||||
eligible, message, details = upload_handler.check_upload_eligibility(
|
||||
user_id, file_size, mime_type
|
||||
)
|
||||
|
||||
response = {
|
||||
'eligible': eligible,
|
||||
'message': message,
|
||||
'details': details,
|
||||
'timestamp': time.time()
|
||||
}
|
||||
|
||||
status_code = 200 if eligible else 429 # Too Many Requests if not eligible
|
||||
|
||||
logger.info(f"📋 Capacity check for user {user_id}: {eligible} - {message}")
|
||||
|
||||
return JSONResponse(content=response, status_code=status_code)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Capacity check error: {e}")
|
||||
raise HTTPException(status_code=500, detail=str(e))
|
||||
|
||||
@router.post("/upload/with-queue")
|
||||
async def upload_with_queue(
|
||||
cabinet_id: str = Form(...),
|
||||
path: str = Form(...),
|
||||
scope: str = Form(...),
|
||||
priority: int = Form(1),
|
||||
file: UploadFile = File(...),
|
||||
payload: Dict[str, Any] = Depends(auth)
|
||||
):
|
||||
"""
|
||||
Upload file with intelligent queuing and capacity management.
|
||||
|
||||
Returns queue information and processing status.
|
||||
"""
|
||||
try:
|
||||
user_id = payload.get('sub') or payload.get('user_id', 'anonymous')
|
||||
|
||||
# Read file content
|
||||
file_bytes = await file.read()
|
||||
file_size = len(file_bytes)
|
||||
mime_type = file.content_type or 'application/octet-stream'
|
||||
filename = file.filename or path
|
||||
|
||||
logger.info(f"📤 Upload request: {filename} ({file_size} bytes) for user {user_id}")
|
||||
|
||||
# Determine environment
|
||||
environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
|
||||
upload_handler = get_upload_handler(environment)
|
||||
|
||||
# Check if upload queuing is enabled
|
||||
if os.getenv('UPLOAD_QUEUE_ENABLED', 'true').lower() == 'true':
|
||||
# Use new queue-based upload
|
||||
file_id = str(uuid.uuid4())
|
||||
|
||||
result = await upload_handler.handle_upload_with_queue(
|
||||
file_id=file_id,
|
||||
user_id=user_id,
|
||||
filename=filename,
|
||||
file_bytes=file_bytes,
|
||||
mime_type=mime_type,
|
||||
cabinet_id=cabinet_id,
|
||||
priority=priority
|
||||
)\n \n return result\n \n else:\n # Fall back to immediate processing (legacy mode)\n logger.warning(\"Using legacy immediate processing mode\")\n # TODO: Call original upload_file function\n raise HTTPException(status_code=501, detail=\"Legacy mode not implemented in this endpoint\")\n \n except HTTPException:\n raise\n except Exception as e:\n logger.error(f\"Upload error: {e}\")\n raise HTTPException(status_code=500, detail=str(e))\n\n@router.get(\"/upload/status/{file_id}\")\nasync def get_upload_status(\n file_id: str,\n payload: Dict[str, Any] = Depends(auth)\n):\n \"\"\"\n Get current processing status for an uploaded file.\n \"\"\"\n try:\n # Determine environment\n environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'\n upload_handler = get_upload_handler(environment)\n \n status = upload_handler.get_processing_status(file_id)\n \n if status.get('status') == 'not_found':\n raise HTTPException(status_code=404, detail=\"File not found\")\n \n return status\n \n except HTTPException:\n raise\n except Exception as e:\n logger.error(f\"Status check error for {file_id}: {e}\")\n raise HTTPException(status_code=500, detail=str(e))\n\n@router.get(\"/upload/queue-status\")\nasync def get_queue_status(\n payload: Dict[str, Any] = Depends(auth)\n):\n \"\"\"\n Get overall queue status and system capacity information.\n \"\"\"\n try:\n # Determine environment\n environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'\n memory_queue = get_memory_queue(environment)\n \n system_status = memory_queue.get_system_status()\n \n return {\n 'system_status': system_status,\n 'timestamp': time.time(),\n 'environment': environment\n }\n \n except Exception as e:\n logger.error(f\"Queue status error: {e}\")\n raise HTTPException(status_code=500, detail=str(e))\n\n@router.websocket(\"/ws/upload-status/{file_id}\")\nasync def websocket_upload_status(websocket: WebSocket, file_id: str):\n \"\"\"\n WebSocket endpoint for real-time upload status updates.\n \"\"\"\n await manager.connect(websocket, file_id)\n \n try:\n # Send initial status\n environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'\n upload_handler = get_upload_handler(environment)\n initial_status = upload_handler.get_processing_status(file_id)\n \n await websocket.send_json({\n 'type': 'status_update',\n 'data': initial_status\n })\n \n # Keep connection alive and listen for updates\n while True:\n # In a real implementation, you'd have a background task\n # that pushes updates when file status changes\n await asyncio.sleep(5)\n \n # Check for status updates\n current_status = upload_handler.get_processing_status(file_id)\n await websocket.send_json({\n 'type': 'status_update', \n 'data': current_status\n })\n \n except WebSocketDisconnect:\n manager.disconnect(websocket, file_id)\n except Exception as e:\n logger.error(f\"WebSocket error for {file_id}: {e}\")\n manager.disconnect(websocket, file_id)\n\n# Background task to process upload queue\n@router.on_event(\"startup\")\nasync def start_queue_processor():\n \"\"\"Start background queue processor.\"\"\"\n \n if os.getenv('UPLOAD_QUEUE_ENABLED', 'true').lower() != 'true':\n logger.info(\"📋 Upload queue disabled, skipping queue processor\")\n return\n \n import asyncio\n \n environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'\n upload_handler = get_upload_handler(environment)\n \n # Start background processor\n asyncio.create_task(upload_handler.process_queued_files(\"document_processor\"))\n \n logger.info(\"🚀 Upload queue processor started\")\n\nimport time\nimport asyncio"
|
||||
@ -1,362 +0,0 @@
|
||||
"""
|
||||
Enhanced Upload Handler with Memory-Aware Queuing
|
||||
=================================================
|
||||
|
||||
Replaces the immediate processing model with intelligent queue management.
|
||||
Provides user feedback about capacity, queue position, and processing status.
|
||||
|
||||
Features:
|
||||
- Pre-upload capacity checking
|
||||
- Memory-aware queuing with user quotas
|
||||
- Real-time status updates via WebSocket/SSE
|
||||
- Graceful degradation under load
|
||||
- Fair queuing across multiple users
|
||||
"""
|
||||
|
||||
import os
|
||||
import uuid
|
||||
import time
|
||||
import logging
|
||||
import asyncio
|
||||
from typing import Dict, List, Optional, Any, Tuple
|
||||
from fastapi import HTTPException, BackgroundTasks
|
||||
from dataclasses import asdict
|
||||
|
||||
from .memory_aware_queue import get_memory_queue, QueuedFile
|
||||
from .redis_manager import get_redis_manager
|
||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||
from modules.database.tools.storage.storage_admin import StorageAdmin
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class EnhancedUploadHandler:
|
||||
"""Enhanced upload handler with memory-aware queuing."""
|
||||
|
||||
def __init__(self, environment: str = "dev"):
|
||||
self.memory_queue = get_memory_queue(environment)
|
||||
self.redis_manager = get_redis_manager(environment)
|
||||
self.redis_client = self.redis_manager.client
|
||||
|
||||
# Processing status tracking
|
||||
self.processing_status_key = "file_processing_status"
|
||||
|
||||
def check_upload_eligibility(self, user_id: str, file_size: int,
|
||||
mime_type: str) -> Tuple[bool, str, Dict[str, Any]]:
|
||||
"""
|
||||
Check if user can upload a file right now.
|
||||
|
||||
Returns:
|
||||
(eligible, message, details)
|
||||
"""
|
||||
|
||||
# Check system capacity
|
||||
can_accept, message, queue_info = self.memory_queue.check_upload_capacity(
|
||||
user_id, file_size, mime_type
|
||||
)
|
||||
|
||||
if not can_accept:
|
||||
return False, message, {
|
||||
'reason': 'capacity_exceeded',
|
||||
'queue_info': queue_info,
|
||||
'recommendations': self._get_recommendations(queue_info)
|
||||
}
|
||||
|
||||
return True, message, {
|
||||
'status': 'ready_for_upload',
|
||||
'queue_info': queue_info,
|
||||
'processing_estimate': self._estimate_processing_time(file_size, mime_type)
|
||||
}
|
||||
|
||||
async def handle_upload_with_queue(self, file_id: str, user_id: str,
|
||||
filename: str, file_bytes: bytes,
|
||||
mime_type: str, cabinet_id: str,
|
||||
priority: int = 1) -> Dict[str, Any]:
|
||||
"""
|
||||
Handle file upload with intelligent queuing.
|
||||
|
||||
Steps:
|
||||
1. Store file immediately (cheap operation)
|
||||
2. Add to processing queue
|
||||
3. Return queue status to user
|
||||
4. Process asynchronously when capacity available
|
||||
"""
|
||||
|
||||
# Store file immediately (this is fast)
|
||||
storage = StorageAdmin()
|
||||
client = SupabaseServiceRoleClient()
|
||||
|
||||
# Create database record
|
||||
bucket = f"{cabinet_id}-files" # or your bucket naming convention
|
||||
storage_path = f"{cabinet_id}/{file_id}/{filename}"
|
||||
|
||||
try:
|
||||
# Store file
|
||||
storage.upload_file(bucket, storage_path, file_bytes, mime_type, upsert=True)
|
||||
|
||||
# Create file record
|
||||
insert_res = client.supabase.table('files').insert({
|
||||
'id': file_id,
|
||||
'name': filename,
|
||||
'cabinet_id': cabinet_id,
|
||||
'bucket': bucket,
|
||||
'path': storage_path,
|
||||
'mime_type': mime_type,
|
||||
'uploaded_by': user_id,
|
||||
'size_bytes': len(file_bytes),
|
||||
'source': 'classroomcopilot-web',
|
||||
'status': 'queued_for_processing' # New status
|
||||
}).execute()
|
||||
|
||||
if not insert_res.data:
|
||||
raise HTTPException(status_code=500, detail="Failed to create file record")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to store file {file_id}: {e}")
|
||||
raise HTTPException(status_code=500, detail=f"Storage failed: {str(e)}")
|
||||
|
||||
# Add to processing queue
|
||||
try:
|
||||
queue_result = self.memory_queue.enqueue_file(
|
||||
file_id=file_id,
|
||||
user_id=user_id,
|
||||
filename=filename,
|
||||
size_bytes=len(file_bytes),
|
||||
mime_type=mime_type,
|
||||
cabinet_id=cabinet_id,
|
||||
priority=priority
|
||||
)
|
||||
|
||||
# Update file status in database
|
||||
client.supabase.table('files').update({
|
||||
'status': 'queued_for_processing',
|
||||
'extra': {
|
||||
'queue_position': queue_result['queue_position'],
|
||||
'estimated_wait_seconds': queue_result['estimated_wait_seconds'],
|
||||
'memory_estimate_mb': queue_result['memory_estimate_mb']
|
||||
}
|
||||
}).eq('id', file_id).execute()
|
||||
|
||||
logger.info(f"📋 File {file_id} queued at position {queue_result['queue_position']}")
|
||||
|
||||
return {
|
||||
'status': 'upload_successful',
|
||||
'message': 'File uploaded and queued for processing',
|
||||
'file_id': file_id,
|
||||
'queue_info': queue_result,
|
||||
'next_steps': {
|
||||
'poll_status_endpoint': f'/database/files/{file_id}/processing-status',
|
||||
'websocket_updates': f'/ws/file-processing/{file_id}'
|
||||
}
|
||||
}
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to queue file {file_id}: {e}")
|
||||
# Clean up stored file
|
||||
try:
|
||||
storage.delete_file(bucket, storage_path)
|
||||
client.supabase.table('files').delete().eq('id', file_id).execute()
|
||||
except:
|
||||
pass
|
||||
raise HTTPException(status_code=500, detail=f"Queue failed: {str(e)}")
|
||||
|
||||
async def process_queued_files(self, service_name: str = "document_processor"):
|
||||
"""
|
||||
Background service to process queued files.
|
||||
This runs continuously as a background task.
|
||||
"""
|
||||
|
||||
logger.info(f"🚀 Started queue processor for {service_name}")
|
||||
|
||||
while True:
|
||||
try:
|
||||
# Get next file from queue
|
||||
queued_file = self.memory_queue.dequeue_next_file(service_name)
|
||||
|
||||
if not queued_file:
|
||||
# No files ready for processing
|
||||
await asyncio.sleep(5)
|
||||
continue
|
||||
|
||||
# Update file status
|
||||
await self._update_processing_status(queued_file.file_id, 'processing')
|
||||
|
||||
# Process the file
|
||||
try:
|
||||
await self._process_file(queued_file, service_name)
|
||||
await self._update_processing_status(queued_file.file_id, 'completed')
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to process file {queued_file.file_id}: {e}")
|
||||
await self._update_processing_status(queued_file.file_id, 'failed', str(e))
|
||||
|
||||
finally:
|
||||
# Always free memory
|
||||
self.memory_queue.complete_processing(
|
||||
service_name,
|
||||
queued_file.file_id,
|
||||
queued_file.memory_estimate_mb
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Queue processor error: {e}")
|
||||
await asyncio.sleep(10) # Back off on errors
|
||||
|
||||
async def _process_file(self, queued_file: QueuedFile, service_name: str):
|
||||
"""Process a single file from the queue."""
|
||||
|
||||
logger.info(f"🔄 Processing file {queued_file.file_id} in {service_name}")
|
||||
|
||||
# Import here to avoid circular imports
|
||||
from modules.pipeline_controller import get_pipeline_controller
|
||||
|
||||
client = SupabaseServiceRoleClient()
|
||||
controller = get_pipeline_controller()
|
||||
|
||||
# Get file record
|
||||
file_result = client.supabase.table('files').select('*').eq('id', queued_file.file_id).single().execute()
|
||||
file_row = file_result.data
|
||||
|
||||
if not file_row:
|
||||
raise Exception(f"File record not found: {queued_file.file_id}")
|
||||
|
||||
# Update status to processing
|
||||
client.supabase.table('files').update({
|
||||
'status': 'processing'
|
||||
}).eq('id', queued_file.file_id).execute()
|
||||
|
||||
# Convert to PDF if needed (this is where the bottleneck was before)
|
||||
processing_path = await self._handle_pdf_conversion(file_row)
|
||||
|
||||
# Enqueue Phase 1 tasks
|
||||
phase1_tasks = controller.enqueue_phase1_tasks(
|
||||
file_id=queued_file.file_id,
|
||||
file_row={**file_row, 'path': processing_path},
|
||||
processing_path=processing_path,
|
||||
processing_mime=file_row['mime_type']
|
||||
)
|
||||
|
||||
# Update database with task IDs
|
||||
client.supabase.table('files').update({
|
||||
'status': 'phase1_processing',
|
||||
'extra': {
|
||||
**file_row.get('extra', {}),
|
||||
'phase1_tasks': phase1_tasks,
|
||||
'processing_started_at': time.time()
|
||||
}
|
||||
}).eq('id', queued_file.file_id).execute()
|
||||
|
||||
logger.info(f"✅ File {queued_file.file_id} processing initiated")
|
||||
|
||||
async def _handle_pdf_conversion(self, file_row: Dict[str, Any]) -> str:
|
||||
"""Handle PDF conversion asynchronously."""
|
||||
|
||||
if file_row['mime_type'] == 'application/pdf':
|
||||
return file_row['path']
|
||||
|
||||
# TODO: Implement async PDF conversion
|
||||
# For now, return original path and handle conversion in pipeline
|
||||
logger.info(f"PDF conversion queued for file {file_row['id']}")
|
||||
return file_row['path']
|
||||
|
||||
async def _update_processing_status(self, file_id: str, status: str, error: str = None):
|
||||
"""Update file processing status."""
|
||||
|
||||
status_data = {
|
||||
'file_id': file_id,
|
||||
'status': status,
|
||||
'timestamp': time.time(),
|
||||
'error': error
|
||||
}
|
||||
|
||||
# Store in Redis for real-time updates
|
||||
status_key = f"{self.processing_status_key}:{file_id}"
|
||||
self.redis_client.setex(status_key, 86400, json.dumps(status_data)) # 24h expiry
|
||||
|
||||
# Update database
|
||||
client = SupabaseServiceRoleClient()
|
||||
client.supabase.table('files').update({
|
||||
'status': status,
|
||||
'error_message': error
|
||||
}).eq('id', file_id).execute()
|
||||
|
||||
logger.info(f"📊 Status update for {file_id}: {status}")
|
||||
|
||||
def get_processing_status(self, file_id: str) -> Dict[str, Any]:
|
||||
"""Get current processing status for a file."""
|
||||
|
||||
status_key = f"{self.processing_status_key}:{file_id}"
|
||||
status_json = self.redis_client.get(status_key)
|
||||
|
||||
if status_json:
|
||||
return json.loads(status_json)
|
||||
|
||||
# Fallback to database
|
||||
client = SupabaseServiceRoleClient()
|
||||
result = client.supabase.table('files').select('status, error_message, extra').eq('id', file_id).single().execute()
|
||||
|
||||
if result.data:
|
||||
return {
|
||||
'file_id': file_id,
|
||||
'status': result.data['status'],
|
||||
'error': result.data.get('error_message'),
|
||||
'extra': result.data.get('extra', {})
|
||||
}
|
||||
|
||||
return {'file_id': file_id, 'status': 'not_found'}
|
||||
|
||||
def _estimate_processing_time(self, file_size: int, mime_type: str) -> Dict[str, Any]:
|
||||
"""Estimate processing time for a file."""
|
||||
|
||||
# Base time estimates (in seconds)
|
||||
base_times = {
|
||||
'application/pdf': 60, # 1 minute per MB
|
||||
'application/msword': 120, # 2 minutes per MB
|
||||
'image/': 30 # 30 seconds per MB
|
||||
}
|
||||
|
||||
# Find matching mime type
|
||||
time_per_mb = 60 # default
|
||||
for mime_prefix, time_val in base_times.items():
|
||||
if mime_type.startswith(mime_prefix):
|
||||
time_per_mb = time_val
|
||||
break
|
||||
|
||||
file_size_mb = file_size / (1024 * 1024)
|
||||
estimated_seconds = int(file_size_mb * time_per_mb)
|
||||
|
||||
return {
|
||||
'estimated_seconds': estimated_seconds,
|
||||
'estimated_minutes': estimated_seconds / 60,
|
||||
'phases': {
|
||||
'pdf_conversion': estimated_seconds * 0.2,
|
||||
'metadata_extraction': estimated_seconds * 0.3,
|
||||
'docling_processing': estimated_seconds * 0.5
|
||||
}
|
||||
}
|
||||
|
||||
def _get_recommendations(self, queue_info: Dict[str, Any]) -> List[str]:
|
||||
"""Get recommendations for user when upload is rejected."""
|
||||
|
||||
recommendations = []
|
||||
|
||||
if queue_info.get('reason') == 'file_too_large':
|
||||
recommendations.append("Try compressing your file or splitting it into smaller parts")
|
||||
|
||||
if queue_info.get('utilization', 0) > 0.9:
|
||||
recommendations.append("System is currently overloaded. Try uploading during off-peak hours")
|
||||
recommendations.append("Consider uploading smaller files first")
|
||||
|
||||
if queue_info.get('user_current', 0) > 0:
|
||||
recommendations.append("Wait for your current uploads to complete before uploading more")
|
||||
|
||||
if not recommendations:
|
||||
recommendations.append("Please try again in a few minutes")
|
||||
|
||||
return recommendations
|
||||
|
||||
# Convenience functions
|
||||
def get_upload_handler(environment: str = "dev") -> EnhancedUploadHandler:
|
||||
"""Get enhanced upload handler instance."""
|
||||
return EnhancedUploadHandler(environment)
|
||||
|
||||
import json
|
||||
@ -1,997 +0,0 @@
|
||||
import os
|
||||
import io
|
||||
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, BackgroundTasks
|
||||
from typing import Any, Dict, Optional
|
||||
import uuid
|
||||
import re
|
||||
import requests
|
||||
import os
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str
|
||||
from modules.logger_tool import initialise_logger
|
||||
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
||||
from modules.database.supabase.utils.storage import StorageAdmin
|
||||
from modules.document_processor import DocumentProcessor
|
||||
from modules.queue_system import (
|
||||
enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
|
||||
enqueue_document_analysis_task, enqueue_page_images_task,
|
||||
TaskPriority, get_queue, QueueConnectionError
|
||||
)
|
||||
from fastapi.responses import Response
|
||||
from fastapi import Body
|
||||
|
||||
router = APIRouter()
|
||||
auth = SupabaseBearer()
|
||||
doc_processor = DocumentProcessor()
|
||||
|
||||
DEFAULT_BUCKET = os.getenv('DEFAULT_FILES_BUCKET', 'cc.users')
|
||||
|
||||
# Timeout configurations (in seconds)
|
||||
TIKA_TIMEOUT = int(os.getenv('TIKA_TIMEOUT', '300')) # 5 minutes default
|
||||
DOCLING_FRONTMATTER_TIMEOUT = int(os.getenv('DOCLING_FRONTMATTER_TIMEOUT', '1800')) # 30 minutes default
|
||||
DOCLING_NOOCR_TIMEOUT = int(os.getenv('DOCLING_NOOCR_TIMEOUT', '3600')) # 1 hour default
|
||||
|
||||
# (Legacy feature flags removed - using new three-phase system)
|
||||
|
||||
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
|
||||
|
||||
def _safe_filename(name: str) -> str:
|
||||
base = os.path.basename(name or 'file')
|
||||
return re.sub(r"[^A-Za-z0-9._-]+", "_", base)
|
||||
|
||||
def _choose_bucket(scope: str, user_id: str, school_id: Optional[str]) -> str:
|
||||
scope = (scope or 'teacher').lower()
|
||||
if scope == 'school' and school_id:
|
||||
return f"cc.institutes.{school_id}.private"
|
||||
# teacher / student fall back to users bucket for now
|
||||
return 'cc.users'
|
||||
|
||||
@router.post("/files/upload")
|
||||
async def upload_file(
|
||||
cabinet_id: str = Form(...),
|
||||
path: str = Form(...),
|
||||
scope: str = Form('teacher'),
|
||||
school_id: Optional[str] = Form(default=None),
|
||||
file: UploadFile = File(...),
|
||||
payload: Dict[str, Any] = Depends(auth),
|
||||
background_tasks: BackgroundTasks = None
|
||||
):
|
||||
user_id = payload.get('sub') or payload.get('user_id')
|
||||
if not user_id:
|
||||
raise HTTPException(status_code=401, detail="Invalid token payload")
|
||||
|
||||
client = SupabaseServiceRoleClient()
|
||||
storage = StorageAdmin()
|
||||
|
||||
# Determine target bucket by scope
|
||||
bucket = _choose_bucket(scope, user_id, school_id)
|
||||
|
||||
# Stage DB row to get file_id
|
||||
staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}"
|
||||
name = _safe_filename(path or file.filename)
|
||||
file_bytes = await file.read()
|
||||
insert_res = client.supabase.table('files').insert({
|
||||
'cabinet_id': cabinet_id,
|
||||
'name': name,
|
||||
'path': staged_path,
|
||||
'bucket': bucket,
|
||||
'mime_type': file.content_type,
|
||||
'uploaded_by': user_id,
|
||||
'size_bytes': len(file_bytes),
|
||||
'source': 'classroomcopilot-web'
|
||||
}).execute()
|
||||
if not insert_res.data:
|
||||
raise HTTPException(status_code=500, detail="Failed to create file record")
|
||||
file_row = insert_res.data[0]
|
||||
file_id = file_row['id']
|
||||
|
||||
# Final storage path: bucket/cabinet_id/file_id/file
|
||||
final_storage_path = f"{cabinet_id}/{file_id}/{name}"
|
||||
try:
|
||||
storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True)
|
||||
except Exception as e:
|
||||
# cleanup staged row
|
||||
client.supabase.table('files').delete().eq('id', file_id).execute()
|
||||
raise HTTPException(status_code=500, detail=f"Storage upload failed: {str(e)}")
|
||||
|
||||
# Update DB path to final
|
||||
update_res = client.supabase.table('files').update({
|
||||
'path': final_storage_path
|
||||
}).eq('id', file_id).execute()
|
||||
# Kick off initial artefacts generation in background (Tika + Docling frontmatter + no-OCR)
|
||||
try:
|
||||
if background_tasks is not None:
|
||||
logger.info(f"Scheduling initial artefacts generation for file_id={file_id}")
|
||||
background_tasks.add_task(generate_initial_artefacts, file_id, payload)
|
||||
else:
|
||||
logger.info(f"Running initial artefacts generation synchronously for file_id={file_id}")
|
||||
generate_initial_artefacts(file_id, payload)
|
||||
except Exception as e:
|
||||
logger.error(f"Failed to schedule initial artefacts for file_id={file_id}: {e}")
|
||||
|
||||
return update_res.data
|
||||
|
||||
@router.get("/files")
|
||||
def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
|
||||
client = SupabaseServiceRoleClient()
|
||||
res = client.supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute()
|
||||
return res.data
|
||||
|
||||
@router.post("/files/{file_id}/move")
|
||||
def move_file(file_id: str, body: Dict[str, Any], payload: Dict[str, Any] = Depends(auth)):
|
||||
client = SupabaseServiceRoleClient()
|
||||
updates = {}
|
||||
if 'cabinet_id' in body:
|
||||
updates['cabinet_id'] = body['cabinet_id']
|
||||
if 'path' in body:
|
||||
updates['path'] = body['path']
|
||||
if not updates:
|
||||
raise HTTPException(status_code=400, detail="No changes provided")
|
||||
res = client.supabase.table('files').update(updates).eq('id', file_id).execute()
|
||||
return res.data
|
||||
|
||||
@router.delete("/files/{file_id}")
|
||||
def delete_file(file_id: str, payload: Dict[str, Any] = Depends(auth)):
|
||||
client = SupabaseServiceRoleClient()
|
||||
res = client.supabase.table('files').delete().eq('id', file_id).execute()
|
||||
return res.data
|
||||
|
||||
@router.get("/files/{file_id}/artefacts")
|
||||
def list_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
|
||||
client = SupabaseServiceRoleClient()
|
||||
res = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).order('created_at', desc=True).execute()
|
||||
return res.data
|
||||
|
||||
@router.get("/files/{file_id}/viewer-artefacts")
|
||||
def list_viewer_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
|
||||
"""
|
||||
Get artefacts organized for UI viewer display, including frontmatter JSON,
|
||||
processing bundles, and analysis data with proper display metadata.
|
||||
"""
|
||||
client = SupabaseServiceRoleClient()
|
||||
|
||||
# Get all artefacts for the file
|
||||
res = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).order('created_at', desc=True).execute()
|
||||
all_artefacts = res.data or []
|
||||
|
||||
# Organize artefacts by category for UI display
|
||||
viewer_artefacts = {
|
||||
'document_analysis': [],
|
||||
'processing_bundles': [],
|
||||
'raw_data': []
|
||||
}
|
||||
|
||||
for artefact in all_artefacts:
|
||||
artefact_type = artefact.get('type', '')
|
||||
extra = artefact.get('extra', {})
|
||||
|
||||
# Enhanced artefact info for UI display
|
||||
artefact_info = {
|
||||
'id': artefact['id'],
|
||||
'type': artefact_type,
|
||||
'display_name': extra.get('display_name'),
|
||||
'bundle_label': extra.get('bundle_label'),
|
||||
'section_title': extra.get('section_title'),
|
||||
'page_range': extra.get('page_range'),
|
||||
'page_count': extra.get('page_count'),
|
||||
'pipeline': extra.get('pipeline'),
|
||||
'processing_mode': extra.get('processing_mode'),
|
||||
'ui_order': extra.get('ui_order', 999),
|
||||
'description': extra.get('description'),
|
||||
'viewer_type': extra.get('viewer_type', 'json'),
|
||||
'created_at': artefact['created_at'],
|
||||
'status': artefact.get('status', 'unknown')
|
||||
}
|
||||
|
||||
# Categorize artefacts for UI organization
|
||||
if artefact_type == 'docling_frontmatter_json':
|
||||
artefact_info.update({
|
||||
'display_name': artefact_info['display_name'] or 'Document Frontmatter',
|
||||
'bundle_label': artefact_info['bundle_label'] or 'Frontmatter Analysis',
|
||||
'description': artefact_info['description'] or 'OCR analysis of document structure and metadata',
|
||||
'ui_order': 1,
|
||||
'viewer_type': 'json'
|
||||
})
|
||||
viewer_artefacts['document_analysis'].append(artefact_info)
|
||||
|
||||
elif artefact_type == 'split_map_json':
|
||||
artefact_info.update({
|
||||
'display_name': 'Document Structure Map',
|
||||
'bundle_label': 'Split Map',
|
||||
'description': 'Document section boundaries and organization structure',
|
||||
'ui_order': 2,
|
||||
'viewer_type': 'json'
|
||||
})
|
||||
viewer_artefacts['document_analysis'].append(artefact_info)
|
||||
|
||||
elif artefact_type == 'tika_json':
|
||||
artefact_info.update({
|
||||
'display_name': 'Document Metadata',
|
||||
'bundle_label': 'Tika Analysis',
|
||||
'description': 'Raw document metadata and properties extracted by Apache Tika',
|
||||
'ui_order': 3,
|
||||
'viewer_type': 'json'
|
||||
})
|
||||
viewer_artefacts['raw_data'].append(artefact_info)
|
||||
|
||||
elif artefact_type in ['canonical_docling_json', 'docling_bundle_split', 'docling_bundle', 'docling_standard', 'docling_bundle_split_pages']:
|
||||
# Processing bundles (OCR, No-OCR, VLM) - use original_pipeline for proper differentiation
|
||||
pipeline_name = extra.get('original_pipeline', extra.get('pipeline', 'Unknown'))
|
||||
bundle_label = artefact_info['bundle_label'] or f"{pipeline_name.upper().replace('_', '-')} Bundle"
|
||||
display_name = artefact_info['display_name'] or f"{pipeline_name.upper().replace('_', '-')} Processing Result"
|
||||
|
||||
# Special handling for master manifests
|
||||
if artefact_type == 'docling_bundle_split_pages':
|
||||
display_name = f"{pipeline_name.upper().replace('_', '-')} Document Pages"
|
||||
bundle_label = f"{pipeline_name.upper().replace('_', '-')} Pages Bundle"
|
||||
artefact_info.update({
|
||||
'viewer_type': 'bundle_collection',
|
||||
'is_master_manifest': True,
|
||||
'ui_order': 10 # Show master manifests before individual pages
|
||||
})
|
||||
elif artefact_type == 'docling_standard':
|
||||
# Individual page bundles - lower UI priority
|
||||
artefact_info.update({
|
||||
'viewer_type': 'page_bundle',
|
||||
'is_individual_page': True,
|
||||
'ui_order': extra.get('split_order', 999) + 100 # Show after master manifests
|
||||
})
|
||||
|
||||
artefact_info.update({
|
||||
'display_name': display_name,
|
||||
'bundle_label': bundle_label,
|
||||
'description': f"Docling processing result using {pipeline_name.replace('_', '-')} pipeline",
|
||||
'pipeline_type': pipeline_name # Add explicit pipeline type for UI
|
||||
})
|
||||
viewer_artefacts['processing_bundles'].append(artefact_info)
|
||||
|
||||
elif artefact_type.startswith('docling_') and artefact_type.endswith('_json'):
|
||||
# Other docling JSON results
|
||||
pipeline_name = artefact_type.replace('docling_', '').replace('_json', '').upper()
|
||||
artefact_info.update({
|
||||
'display_name': f"{pipeline_name} Analysis",
|
||||
'bundle_label': f"{pipeline_name} Result",
|
||||
'description': f"Docling {pipeline_name.lower()} processing result",
|
||||
'viewer_type': 'json'
|
||||
})
|
||||
viewer_artefacts['processing_bundles'].append(artefact_info)
|
||||
|
||||
elif artefact_type == 'page_images':
|
||||
artefact_info.update({
|
||||
'display_name': 'Page Images',
|
||||
'bundle_label': 'Visual Pages',
|
||||
'description': 'Generated page images for document visualization',
|
||||
'viewer_type': 'images'
|
||||
})
|
||||
viewer_artefacts['raw_data'].append(artefact_info)
|
||||
|
||||
# Sort each category by ui_order
|
||||
for category in viewer_artefacts.values():
|
||||
category.sort(key=lambda x: (x['ui_order'], x['created_at']))
|
||||
|
||||
return {
|
||||
'file_id': file_id,
|
||||
'categories': viewer_artefacts,
|
||||
'total_artefacts': len(all_artefacts)
|
||||
}
|
||||
|
||||
@router.post("/files/{file_id}/artefacts/initial")
|
||||
def generate_initial_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
|
||||
"""
|
||||
Generate initial artefacts using the new three-phase pipeline architecture.
|
||||
|
||||
Phase 1: Document Structure Discovery & Analysis
|
||||
- Tika metadata extraction
|
||||
- Page images generation
|
||||
- Document structure analysis (LLM-enhanced)
|
||||
- Split map generation
|
||||
|
||||
Phase 2: Triggered automatically after Phase 1 completion
|
||||
"""
|
||||
logger.info(f"Three-phase pipeline: Starting Phase 1 for file_id={file_id}")
|
||||
|
||||
from modules.pipeline_controller import get_pipeline_controller
|
||||
|
||||
client = SupabaseServiceRoleClient()
|
||||
storage = StorageAdmin()
|
||||
controller = get_pipeline_controller()
|
||||
|
||||
# Load file row
|
||||
fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
|
||||
file_row = fr.data
|
||||
if not file_row:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
|
||||
bucket = file_row['bucket']
|
||||
storage_path = file_row['path']
|
||||
cabinet_id = file_row['cabinet_id']
|
||||
mime = file_row.get('mime_type') or 'application/octet-stream'
|
||||
filename = file_row.get('name', 'file')
|
||||
|
||||
# Step 1: Convert to PDF if not already a PDF (synchronous for now)
|
||||
processing_path = storage_path
|
||||
processing_mime = mime
|
||||
|
||||
if mime != 'application/pdf':
|
||||
logger.info(f"Converting non-PDF file to PDF: file_id={file_id} mime={mime}")
|
||||
try:
|
||||
file_bytes = storage.download_file(bucket, storage_path)
|
||||
|
||||
with tempfile.TemporaryDirectory() as temp_dir:
|
||||
# Save original file to temp location
|
||||
temp_input = Path(temp_dir) / filename
|
||||
with open(temp_input, 'wb') as f:
|
||||
f.write(file_bytes)
|
||||
|
||||
# Convert to PDF
|
||||
pdf_bytes = doc_processor.convert_to_pdf(temp_input)
|
||||
|
||||
# Store PDF as artefact
|
||||
pdf_artefact_id = str(uuid.uuid4())
|
||||
pdf_rel_path = f"{cabinet_id}/{file_id}/{pdf_artefact_id}/document.pdf"
|
||||
storage.upload_file(bucket, pdf_rel_path, pdf_bytes, 'application/pdf', upsert=True)
|
||||
|
||||
pdf_ar = client.supabase.table('document_artefacts').insert({
|
||||
'file_id': file_id,
|
||||
'type': 'document_pdf',
|
||||
'rel_path': pdf_rel_path,
|
||||
'extra': {'converted_from': mime, 'original_filename': filename},
|
||||
'status': 'completed'
|
||||
}).execute()
|
||||
|
||||
# Use converted PDF for subsequent processing
|
||||
processing_path = pdf_rel_path
|
||||
processing_mime = 'application/pdf'
|
||||
logger.info(f"PDF conversion: completed file_id={file_id} rel_path={pdf_rel_path}")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"PDF conversion: error processing file_id={file_id}: {e}")
|
||||
# Continue with original file if conversion fails
|
||||
else:
|
||||
logger.info(f"File is already PDF, skipping conversion: file_id={file_id}")
|
||||
|
||||
# Step 2: Enqueue Phase 1 tasks using the new pipeline controller
|
||||
user_id = payload.get('sub') or payload.get('user_id')
|
||||
priority = TaskPriority.HIGH if user_id else TaskPriority.NORMAL
|
||||
|
||||
try:
|
||||
# Update file row with processing path
|
||||
updated_file_row = {**file_row, 'path': processing_path, 'mime_type': processing_mime}
|
||||
|
||||
# Enqueue Phase 1 tasks
|
||||
phase1_tasks = controller.enqueue_phase1_tasks(
|
||||
file_id=file_id,
|
||||
file_row=updated_file_row,
|
||||
processing_path=processing_path,
|
||||
processing_mime=processing_mime,
|
||||
priority=priority
|
||||
)
|
||||
|
||||
total_tasks = sum(len(task_list) for task_list in phase1_tasks.values())
|
||||
|
||||
logger.info(f"Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks for file_id={file_id}")
|
||||
|
||||
|
||||
return {
|
||||
'message': f'Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks. Phase 2 will trigger automatically after completion.',
|
||||
'phase1_tasks': {k: v for k, v in phase1_tasks.items()},
|
||||
'file_id': file_id,
|
||||
'pipeline_mode': 'three_phase',
|
||||
'bundle_architecture_enabled': True
|
||||
}
|
||||
|
||||
except QueueConnectionError as e:
|
||||
logger.error(f"Queue system unavailable for file_id={file_id}: {e}")
|
||||
logger.error("Redis is not running. Please start the API server with './start.sh dev' to auto-start Redis.")
|
||||
return {
|
||||
'message': 'File uploaded successfully, but processing tasks could not be queued (Redis unavailable)',
|
||||
'file_id': file_id,
|
||||
'queue_status': 'unavailable',
|
||||
'error': 'Queue system unavailable. Please restart the API server with Redis enabled.'
|
||||
}
|
||||
except Exception as e:
|
||||
logger.error(f"Unexpected error enqueueing Phase 1 tasks for file_id={file_id}: {e}")
|
||||
return {
|
||||
'message': 'File uploaded successfully, but processing tasks failed to queue',
|
||||
'file_id': file_id,
|
||||
'queue_status': 'failed',
|
||||
'error': str(e)
|
||||
}
|
||||
|
||||
@router.get("/files/{file_id}/page-images/manifest")
|
||||
def get_page_images_manifest(file_id: str, payload: Dict[str, Any] = Depends(auth)):
|
||||
"""Return the page_images manifest JSON for a file via service-role access."""
|
||||
client = SupabaseServiceRoleClient()
|
||||
storage = StorageAdmin()
|
||||
|
||||
# Find file row to get bucket
|
||||
fr = client.supabase.table('files').select('id,bucket,cabinet_id').eq('id', file_id).single().execute()
|
||||
file_row = fr.data or {}
|
||||
if not file_row:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
bucket = file_row['bucket']
|
||||
cabinet_id = file_row['cabinet_id']
|
||||
|
||||
# Find page_images artefact
|
||||
arts = client.supabase.table('document_artefacts') \
|
||||
.select('id,type,rel_path,extra') \
|
||||
.eq('file_id', file_id).eq('type', 'page_images') \
|
||||
.order('created_at', desc=True).limit(1).execute().data or []
|
||||
if not arts:
|
||||
raise HTTPException(status_code=404, detail="page_images artefact not found")
|
||||
art = arts[0]
|
||||
|
||||
# Manifest path
|
||||
manifest_rel_path = (art.get('extra') or {}).get('manifest') or f"{art['rel_path'].rstrip('/')}/page_images.json"
|
||||
|
||||
try:
|
||||
raw = storage.download_file(bucket, manifest_rel_path)
|
||||
import json as _json
|
||||
manifest = _json.loads(raw.decode('utf-8'))
|
||||
# Ensure bucket and base prefix are present for the UI
|
||||
manifest.setdefault('bucket', bucket)
|
||||
manifest.setdefault('base_dir', art['rel_path'])
|
||||
return manifest
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}")
|
||||
|
||||
def json_dumps(obj: Any) -> str:
|
||||
try:
|
||||
import json
|
||||
return json.dumps(obj, ensure_ascii=False)
|
||||
except Exception:
|
||||
return "{}"
|
||||
|
||||
|
||||
@router.get("/files/{file_id}/artefacts/{artefact_id}/json")
|
||||
def get_artefact_json(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
|
||||
"""Return the JSON content of a document artefact using service-role storage access."""
|
||||
client = SupabaseServiceRoleClient()
|
||||
storage = StorageAdmin()
|
||||
# Look up artefact to get rel_path and validate it belongs to file
|
||||
ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path').eq('id', artefact_id).single().execute()
|
||||
artefact = ar.data
|
||||
if not artefact:
|
||||
raise HTTPException(status_code=404, detail="Artefact not found")
|
||||
if artefact.get('file_id') != file_id:
|
||||
raise HTTPException(status_code=400, detail="Artefact does not belong to file")
|
||||
|
||||
# Look up file to get bucket
|
||||
fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute()
|
||||
file_row = fr.data
|
||||
if not file_row:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
|
||||
bucket = file_row['bucket']
|
||||
rel_path = artefact['rel_path']
|
||||
try:
|
||||
raw = storage.download_file(bucket, rel_path)
|
||||
import json as _json
|
||||
data = _json.loads(raw.decode('utf-8'))
|
||||
return data
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"Failed to load artefact JSON: {str(e)}")
|
||||
|
||||
|
||||
@router.get("/files/{file_id}/artefacts/{artefact_id}/vlm-section-manifest")
|
||||
def get_vlm_section_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
|
||||
"""Return the VLM section page bundle manifest JSON for a VLM section bundle artefact."""
|
||||
client = SupabaseServiceRoleClient()
|
||||
storage = StorageAdmin()
|
||||
|
||||
ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path,type,extra').eq('id', artefact_id).single().execute().data
|
||||
if not ar:
|
||||
raise HTTPException(status_code=404, detail="Artefact not found")
|
||||
if ar.get('file_id') != file_id:
|
||||
raise HTTPException(status_code=400, detail="Artefact does not belong to file")
|
||||
if ar.get('type') != 'vlm_section_page_bundle':
|
||||
raise HTTPException(status_code=400, detail="Artefact is not a VLM section page bundle")
|
||||
|
||||
fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
|
||||
if not fr:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
bucket = fr['bucket']
|
||||
|
||||
# The rel_path directly points to the manifest JSON file
|
||||
manifest_rel_path = ar['rel_path']
|
||||
|
||||
try:
|
||||
raw = storage.download_file(bucket, manifest_rel_path)
|
||||
import json as _json
|
||||
data = _json.loads(raw.decode('utf-8'))
|
||||
# ensure bucket present for client use
|
||||
data.setdefault('bucket', bucket)
|
||||
return data
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"Failed to load VLM section manifest: {e}")
|
||||
|
||||
|
||||
@router.post("/files/{file_id}/artefacts/outline")
|
||||
def enqueue_outline_structure(file_id: str, payload: Dict[str, Any] = Depends(auth)):
|
||||
"""
|
||||
Manually enqueue the fast document outline (headings-only) analysis for an existing file.
|
||||
Returns the queued task id.
|
||||
"""
|
||||
client = SupabaseServiceRoleClient()
|
||||
|
||||
fr = client.supabase.table('files').select('id,bucket,cabinet_id,path,mime_type').eq('id', file_id).single().execute()
|
||||
file_row = fr.data
|
||||
if not file_row:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
|
||||
bucket = file_row['bucket']
|
||||
storage_path = file_row['path']
|
||||
cabinet_id = file_row['cabinet_id']
|
||||
mime = file_row.get('mime_type') or 'application/pdf'
|
||||
|
||||
# Prefer converted PDF artefact if available
|
||||
arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
|
||||
pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None)
|
||||
processing_path = pdf_art['rel_path'] if pdf_art else storage_path
|
||||
|
||||
try:
|
||||
task_id = enqueue_docling_task(
|
||||
file_id=file_id,
|
||||
task_type='document_structure_analysis',
|
||||
payload={
|
||||
'bucket': bucket,
|
||||
'file_path': processing_path,
|
||||
'cabinet_id': cabinet_id,
|
||||
'mime_type': mime,
|
||||
'config': {
|
||||
'target_type': 'inbody',
|
||||
'to_formats': 'json',
|
||||
'do_ocr': False,
|
||||
'force_ocr': False
|
||||
}
|
||||
},
|
||||
priority=TaskPriority.NORMAL,
|
||||
timeout=300
|
||||
)
|
||||
return { 'message': 'outline task enqueued', 'task_id': task_id, 'file_id': file_id }
|
||||
except QueueConnectionError as e:
|
||||
raise HTTPException(status_code=503, detail=f"Queue unavailable: {e}")
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"Failed to enqueue outline task: {e}")
|
||||
|
||||
@router.get("/files/proxy")
|
||||
def proxy_storage_file(bucket: str, path: str, payload: Dict[str, Any] = Depends(auth)):
|
||||
"""Proxy a storage file (service-role), useful for private image access in the UI."""
|
||||
storage = StorageAdmin()
|
||||
try:
|
||||
data = storage.download_file(bucket, path)
|
||||
media = 'application/octet-stream'
|
||||
lp = path.lower()
|
||||
if lp.endswith('.png'):
|
||||
media = 'image/png'
|
||||
elif lp.endswith('.webp'):
|
||||
media = 'image/webp'
|
||||
elif lp.endswith('.jpg') or lp.endswith('.jpeg'):
|
||||
media = 'image/jpeg'
|
||||
elif lp.endswith('.json'):
|
||||
media = 'application/json'
|
||||
return Response(content=data, media_type=media)
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}")
|
||||
|
||||
|
||||
# Signed proxy for iframe/img tags without Authorization header
|
||||
@router.get("/files/proxy_signed")
|
||||
def proxy_storage_file_signed(bucket: str, path: str, token: str):
|
||||
"""Proxy using a signed bearer token passed as query param 'token'."""
|
||||
try:
|
||||
payload = verify_supabase_jwt_str(token)
|
||||
if not payload:
|
||||
raise HTTPException(status_code=403, detail="Invalid token")
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=403, detail=f"Invalid token: {e}")
|
||||
|
||||
storage = StorageAdmin()
|
||||
try:
|
||||
data = storage.download_file(bucket, path)
|
||||
media = 'application/octet-stream'
|
||||
lp = path.lower()
|
||||
if lp.endswith('.png'):
|
||||
media = 'image/png'
|
||||
elif lp.endswith('.webp'):
|
||||
media = 'image/webp'
|
||||
elif lp.endswith('.jpg') or lp.endswith('.jpeg'):
|
||||
media = 'image/jpeg'
|
||||
elif lp.endswith('.json'):
|
||||
media = 'application/json'
|
||||
return Response(content=data, media_type=media)
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}")
|
||||
|
||||
# -------- Canonical bundle manifest ---------
|
||||
|
||||
@router.get("/files/{file_id}/artefacts/{artefact_id}/manifest")
|
||||
def get_canonical_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
|
||||
"""Return the manifest.json for a canonical_docling_bundle artefact."""
|
||||
client = SupabaseServiceRoleClient()
|
||||
storage = StorageAdmin()
|
||||
|
||||
ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path,extra').eq('id', artefact_id).single().execute().data
|
||||
if not ar:
|
||||
raise HTTPException(status_code=404, detail="Artefact not found")
|
||||
if ar.get('file_id') != file_id:
|
||||
raise HTTPException(status_code=400, detail="Artefact does not belong to file")
|
||||
extra = ar.get('extra') or {}
|
||||
manifest_rel_path = extra.get('manifest')
|
||||
if not manifest_rel_path:
|
||||
raise HTTPException(status_code=404, detail="Manifest path not recorded on artefact")
|
||||
|
||||
fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
|
||||
if not fr:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
bucket = fr['bucket']
|
||||
|
||||
try:
|
||||
raw = storage.download_file(bucket, manifest_rel_path)
|
||||
import json as _json
|
||||
data = _json.loads(raw.decode('utf-8'))
|
||||
# ensure bucket present for client use
|
||||
data.setdefault('bucket', bucket)
|
||||
return data
|
||||
except Exception as e:
|
||||
raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}")
|
||||
|
||||
# -------- Canonical Docling generation ---------
|
||||
|
||||
def _load_split_map(client: SupabaseServiceRoleClient, storage: StorageAdmin, bucket: str, file_id: str) -> Optional[Dict[str, Any]]:
|
||||
try:
|
||||
arts = client.supabase.table('document_artefacts') \
|
||||
.select('id,type,rel_path') \
|
||||
.eq('file_id', file_id).eq('type', 'split_map_json') \
|
||||
.order('created_at', desc=True).limit(1).execute().data or []
|
||||
if not arts:
|
||||
return None
|
||||
art = arts[0]
|
||||
raw = storage.download_file(bucket, art['rel_path'])
|
||||
import json as _json
|
||||
return _json.loads(raw.decode('utf-8'))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
|
||||
@router.post("/files/{file_id}/artefacts/canonical-docling")
|
||||
def enqueue_canonical_docling(
|
||||
file_id: str,
|
||||
body: Dict[str, Any] = Body(default={}),
|
||||
payload: Dict[str, Any] = Depends(auth)
|
||||
):
|
||||
"""Enqueue generation of canonical Docling JSON(s) for a file.
|
||||
|
||||
If a split_map is available and the document is large, this will enqueue
|
||||
multiple Docling jobs using page ranges per section. Otherwise a single
|
||||
job is created for the whole document.
|
||||
"""
|
||||
client = SupabaseServiceRoleClient()
|
||||
storage = StorageAdmin()
|
||||
|
||||
fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
|
||||
file_row = fr.data
|
||||
if not file_row:
|
||||
raise HTTPException(status_code=404, detail="File not found")
|
||||
|
||||
bucket = file_row['bucket']
|
||||
cabinet_id = file_row['cabinet_id']
|
||||
mime = file_row.get('mime_type') or 'application/pdf'
|
||||
storage_path = file_row['path']
|
||||
|
||||
# Prefer converted PDF if available
|
||||
try:
|
||||
arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
|
||||
a_pdf = next((a for a in arts if a.get('type') == 'document_pdf'), None)
|
||||
processing_path = a_pdf['rel_path'] if a_pdf else storage_path
|
||||
processing_mime = 'application/pdf' if a_pdf else mime
|
||||
except Exception:
|
||||
processing_path = storage_path
|
||||
processing_mime = mime
|
||||
|
||||
# Determine page_count (prefer Tika; fallback to PDF parser if needed)
|
||||
page_count = None
|
||||
try:
|
||||
arts_pc = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).execute().data or []
|
||||
a_tika_pc = next((a for a in arts_pc if a.get('type') == 'tika_json'), None)
|
||||
if a_tika_pc:
|
||||
raw = storage.download_file(bucket, a_tika_pc['rel_path'])
|
||||
import json as _json
|
||||
tj = _json.loads(raw.decode('utf-8'))
|
||||
for k in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount"):
|
||||
v = tj.get(k) or tj.get(k.lower())
|
||||
if v is not None:
|
||||
page_count = int(v)
|
||||
break
|
||||
except Exception as e:
|
||||
logger.debug(f"[canonical-docling] Tika page_count read failed: {e}")
|
||||
pass
|
||||
|
||||
# Fallback: compute page_count from PDF if Tika did not provide it
|
||||
if page_count is None:
|
||||
try:
|
||||
pdf_bytes = storage.download_file(bucket, processing_path)
|
||||
try:
|
||||
import fitz # PyMuPDF
|
||||
doc = fitz.open(stream=pdf_bytes, filetype='pdf')
|
||||
page_count = int(doc.page_count)
|
||||
doc.close()
|
||||
logger.info(f"[canonical-docling] page_count via PyMuPDF: {page_count}")
|
||||
except Exception:
|
||||
try:
|
||||
from PyPDF2 import PdfReader
|
||||
reader = PdfReader(io.BytesIO(pdf_bytes))
|
||||
page_count = int(len(reader.pages))
|
||||
logger.info(f"[canonical-docling] page_count via PyPDF2: {page_count}")
|
||||
except Exception:
|
||||
page_count = None
|
||||
except Exception:
|
||||
page_count = None
|
||||
else:
|
||||
logger.info(f"[canonical-docling] page_count via Tika: {page_count}")
|
||||
|
||||
# Optional custom range from caller
|
||||
custom_range = body.get('custom_range')
|
||||
custom_label = body.get('custom_label') or ''
|
||||
selected_section_id = body.get('selected_section_id')
|
||||
selected_section_title = body.get('selected_section_title')
|
||||
|
||||
# Load split map if requested and document is large enough
|
||||
use_split_requested = bool(body.get('use_split_map', True))
|
||||
split_threshold = int(body.get('threshold') or os.getenv('DOCLING_SPLIT_THRESHOLD', '50'))
|
||||
ranges = [] # list of (start,end)
|
||||
split_map = None
|
||||
sections = [] # list of dicts: {start,end,title}
|
||||
logger.info(f"[canonical-docling] use_split_map={use_split_requested} threshold={split_threshold} page_count={page_count}")
|
||||
# If custom range provided, honor it and bypass split map
|
||||
if isinstance(custom_range, list) and len(custom_range) >= 2:
|
||||
try:
|
||||
cs = int(custom_range[0]); ce = int(custom_range[1])
|
||||
if page_count is not None:
|
||||
cs = max(1, min(cs, page_count))
|
||||
ce = max(cs, min(ce, page_count))
|
||||
ranges = [(cs, ce)]
|
||||
sections = [{'start': cs, 'end': ce, 'title': custom_label or 'Custom range'}]
|
||||
use_split_requested = False
|
||||
logger.info(f"[canonical-docling] using custom_range start={cs} end={ce} label='{custom_label}'")
|
||||
except Exception as _e:
|
||||
logger.warning(f"[canonical-docling] invalid custom_range; falling back. err={_e}")
|
||||
|
||||
if not ranges and use_split_requested and (page_count is None or page_count >= split_threshold):
|
||||
split_map = _load_split_map(client, storage, bucket, file_id)
|
||||
entries = (split_map or {}).get('entries') if split_map else []
|
||||
logger.info(f"[canonical-docling] split_map loaded entries={len(entries) if isinstance(entries, list) else 0}")
|
||||
if split_map and isinstance(entries, list) and len(entries) > 0:
|
||||
# Normalize and sort entries by start_page to enforce correct order
|
||||
norm: list[dict] = []
|
||||
for e in entries:
|
||||
try:
|
||||
s = int(e.get('start_page', 1))
|
||||
t = int(e.get('end_page', s))
|
||||
if t < s:
|
||||
t = s
|
||||
title = e.get('title') or e.get('label') or ''
|
||||
norm.append({'start': s, 'end': t, 'title': title})
|
||||
except Exception:
|
||||
continue
|
||||
norm.sort(key=lambda x: x['start'])
|
||||
# Deduplicate identical or overlapping starts by keeping the earliest occurrence
|
||||
ordered: list[dict] = []
|
||||
last_end = 0
|
||||
for e in norm:
|
||||
s, t = int(e['start']), int(e['end'])
|
||||
if ordered and s <= last_end:
|
||||
# Clamp to prevent inversion and maintain order
|
||||
s = last_end + 1
|
||||
if s > (page_count or s):
|
||||
continue
|
||||
if t < s:
|
||||
t = s
|
||||
last_end = max(last_end, t)
|
||||
ordered.append({'start': s, 'end': t, 'title': e['title']})
|
||||
for e in ordered:
|
||||
ranges.append((e['start'], e['end']))
|
||||
sections.append(e)
|
||||
|
||||
# Fallback: if no split_map ranges... we shouldn't be here
|
||||
if not ranges:
|
||||
# If document is large, split into fixed windows to protect Docling server
|
||||
if page_count is not None and page_count >= split_threshold:
|
||||
chunk = int(os.getenv('DOCLING_FALLBACK_CHUNK_PAGES', '25'))
|
||||
chunk = max(5, min(100, chunk))
|
||||
for i in range(1, (page_count or 1) + 1, chunk):
|
||||
end = min(i + chunk - 1, page_count or i)
|
||||
ranges.append((i, end))
|
||||
sections.append({'start': i, 'end': end, 'title': f"Pages {i}-{end}"})
|
||||
logger.warning(f"[canonical-docling] using fallback chunking ranges={len(ranges)} chunk={chunk}")
|
||||
else:
|
||||
ranges = [(1, page_count or 9223372036854775807)]
|
||||
logger.warning(f"[canonical-docling] using single-range fallback (small doc)")
|
||||
|
||||
# Build config
|
||||
cfg = body.get('config', {})
|
||||
pipeline = cfg.get('pipeline', 'standard')
|
||||
config: Dict[str, Any] = {
|
||||
# target_type is computed in processor based on to_formats unless explicitly provided by user
|
||||
'to_formats': cfg.get('to_formats', 'json'),
|
||||
'do_ocr': bool(cfg.get('do_ocr', True)),
|
||||
'force_ocr': bool(cfg.get('force_ocr', False)),
|
||||
'image_export_mode': cfg.get('image_export_mode', 'embedded'),
|
||||
'ocr_engine': cfg.get('ocr_engine', 'easyocr'),
|
||||
'ocr_lang': cfg.get('ocr_lang', 'en'),
|
||||
'pdf_backend': cfg.get('pdf_backend', 'dlparse_v4'),
|
||||
'table_mode': cfg.get('table_mode', 'fast'),
|
||||
'pipeline': pipeline,
|
||||
'do_picture_classification': bool(cfg.get('do_picture_classification', False)),
|
||||
'do_picture_description': bool(cfg.get('do_picture_description', False)),
|
||||
}
|
||||
# If user explicitly set target_type, pass it through
|
||||
if 'target_type' in cfg:
|
||||
config['target_type'] = cfg['target_type']
|
||||
# Optional VLM settings (only include API fields if provided as JSON by caller)
|
||||
if config['do_picture_description']:
|
||||
pd_api = cfg.get('picture_description_api')
|
||||
if isinstance(pd_api, (dict, list)):
|
||||
config['picture_description_api'] = pd_api
|
||||
elif isinstance(pd_api, str) and pd_api.strip().startswith(('{', '[')):
|
||||
config['picture_description_api'] = pd_api
|
||||
if cfg.get('picture_description_prompt'):
|
||||
config['picture_description_prompt'] = cfg['picture_description_prompt']
|
||||
if pipeline == 'vlm':
|
||||
# Provider presets mapping
|
||||
provider = (cfg.get('vlm_provider') or '').strip().lower()
|
||||
provider_model = (cfg.get('vlm_provider_model') or '').strip()
|
||||
provider_base = (cfg.get('vlm_provider_base_url') or '').strip()
|
||||
if provider in ('ollama', 'openai') and provider_model:
|
||||
if provider == 'ollama':
|
||||
base_url = provider_base or os.getenv('OLLAMA_BASE_URL') or os.getenv('VLM_OLLAMA_BASE_URL')
|
||||
if base_url:
|
||||
endpoint = f"{base_url.rstrip('/')}/v1/chat/completions"
|
||||
# Use OpenAI provider schema against Ollama's OpenAI-compatible endpoint
|
||||
cfg_api = {
|
||||
'provider': 'openai',
|
||||
'url': endpoint,
|
||||
'model': provider_model,
|
||||
'response_format': 'markdown',
|
||||
'request_params': {'model': provider_model}
|
||||
}
|
||||
logger.info(f"[canonical-docling] VLM provider=ollama mapped to openai-compatible url={endpoint} model={provider_model}")
|
||||
config['vlm_pipeline_model_api'] = cfg_api
|
||||
# Also wire picture_description_api if picture description is enabled
|
||||
if config.get('do_picture_description'):
|
||||
config['picture_description_api'] = {
|
||||
'url': endpoint,
|
||||
'headers': {},
|
||||
'params': {'model': provider_model}
|
||||
}
|
||||
elif provider == 'openai':
|
||||
base_url = provider_base or os.getenv('OPENAI_BASE_URL') or 'https://api.openai.com/v1'
|
||||
api_key = os.getenv('OPENAI_API_KEY') or os.getenv('OPENAI_API_KEY_READONLY')
|
||||
# Do not inline key if not present; server may have default
|
||||
model_cfg: Dict[str, Any] = {
|
||||
'provider': 'openai',
|
||||
'url': f"{base_url.rstrip('/')}/chat/completions",
|
||||
'model': provider_model,
|
||||
'response_format': 'markdown',
|
||||
'request_params': {'model': provider_model}
|
||||
}
|
||||
if api_key:
|
||||
model_cfg['api_key'] = api_key
|
||||
# Also pass explicit Authorization header for servers that expect it
|
||||
model_cfg['headers'] = {
|
||||
'Authorization': f"Bearer {api_key}"
|
||||
}
|
||||
logger.info(f"[canonical-docling] VLM provider=openai url={model_cfg['url']} model={provider_model} api_key={'yes' if api_key else 'no'}")
|
||||
config['vlm_pipeline_model_api'] = model_cfg
|
||||
# Also wire picture_description_api if picture description is enabled
|
||||
if config.get('do_picture_description'):
|
||||
headers = {'Authorization': f"Bearer {api_key}"} if api_key else {}
|
||||
config['picture_description_api'] = {
|
||||
'url': f"{base_url.rstrip('/')}/chat/completions",
|
||||
'headers': headers,
|
||||
'params': {'model': provider_model}
|
||||
}
|
||||
else:
|
||||
# Pass through explicit API/local JSON if provided by caller
|
||||
vpa = cfg.get('vlm_pipeline_model_api')
|
||||
if isinstance(vpa, (dict, list)):
|
||||
config['vlm_pipeline_model_api'] = vpa
|
||||
elif isinstance(vpa, str) and vpa.strip().startswith(('{', '[')):
|
||||
config['vlm_pipeline_model_api'] = vpa
|
||||
|
||||
# Enqueue tasks for each range
|
||||
priority = TaskPriority.HIGH
|
||||
task_ids = []
|
||||
multi = len(ranges) > 1
|
||||
logger.info(f"[canonical-docling] final ranges={len(ranges)} multi={multi} pipeline={pipeline} producer={body.get('producer', 'manual')}")
|
||||
|
||||
# Create a group id for split bundles (used for UI grouping)
|
||||
# Use provided group_id if present (for two-pass auto system), otherwise generate new
|
||||
group_id = body.get('group_id') or (str(uuid.uuid4()) if multi else None)
|
||||
if multi and not sections:
|
||||
# Build sections from ranges if titles were not captured
|
||||
for (start, end) in ranges:
|
||||
sections.append({'start': int(start), 'end': int(end), 'title': ''})
|
||||
|
||||
idx = 0
|
||||
for (start, end) in ranges:
|
||||
# Locate title for this range if available
|
||||
title = ''
|
||||
if multi and sections and idx < len(sections):
|
||||
title = sections[idx].get('title') or ''
|
||||
idx += 1
|
||||
|
||||
cfg_range = dict(config)
|
||||
# Ensure 1-based inclusive range is passed through
|
||||
cfg_range['page_range'] = [max(1, int(start)), max(int(start), int(end))]
|
||||
extra = {
|
||||
'is_subdoc': multi,
|
||||
'page_range': [int(start), int(end)],
|
||||
'label': (title or f"subdoc p{int(start)}-{int(end)}") if multi else 'canonical'
|
||||
}
|
||||
# Attach selected section metadata if provided by caller
|
||||
if selected_section_id:
|
||||
extra['selected_section_id'] = selected_section_id
|
||||
if selected_section_title or custom_label:
|
||||
extra['selected_section_title'] = selected_section_title or custom_label
|
||||
# For split processing, force split bundle artefact type and add grouping/order metadata
|
||||
if multi:
|
||||
extra.update({
|
||||
# UI grouping metadata
|
||||
'split_order': idx,
|
||||
'split_heading': title,
|
||||
'split_total': len(ranges)
|
||||
})
|
||||
if group_id:
|
||||
extra['group_id'] = group_id
|
||||
extra['group_pack_type'] = 'docling_standard_auto_split'
|
||||
else:
|
||||
# Single-bundle case: allow caller to override type (defaults to canonical bundle)
|
||||
if 'artefact_type_override' in body and body.get('artefact_type_override'):
|
||||
extra['artefact_type_override'] = body.get('artefact_type_override')
|
||||
|
||||
# Mark producer and selection metadata
|
||||
extra['producer'] = body.get('producer') or ('auto_split' if (multi and body.get('use_split_map')) else 'manual')
|
||||
if selected_section_id:
|
||||
extra['selected_section_id'] = selected_section_id
|
||||
if selected_section_title or custom_label:
|
||||
extra['selected_section_title'] = selected_section_title or custom_label
|
||||
|
||||
# Enhanced logging for canonical operations
|
||||
if multi:
|
||||
logger.info(f"[canonical-docling] enqueue range idx={idx}/{len(ranges)} start={start} end={end} title='{title}' group_id={group_id} producer={extra.get('producer')} pipeline={pipeline}")
|
||||
else:
|
||||
logger.info(f"[canonical-docling] enqueue single range start={start} end={end} producer={extra.get('producer')} pipeline={pipeline}")
|
||||
tid = enqueue_docling_task(
|
||||
file_id=file_id,
|
||||
task_type='canonical_docling_subdoc_json' if multi else 'canonical_docling_json',
|
||||
payload={
|
||||
'bucket': bucket,
|
||||
'file_path': processing_path,
|
||||
'cabinet_id': cabinet_id,
|
||||
'mime_type': processing_mime,
|
||||
'config': cfg_range,
|
||||
'artefact_extra': extra,
|
||||
# Ensure canonical tasks respect upstream dependencies (e.g., Frontmatter)
|
||||
'depends_on': body.get('depends_on', []),
|
||||
# Pass through grouping info if provided by caller (kept for backward-compat)
|
||||
'group_pack_type': body.get('group_pack_type')
|
||||
},
|
||||
priority=priority,
|
||||
timeout=int(body.get('timeout', DOCLING_NOOCR_TIMEOUT))
|
||||
)
|
||||
task_ids.append(tid)
|
||||
|
||||
logger.info(f"[canonical-docling] completed enqueue file_id={file_id} tasks={len(task_ids)} ranges={len(ranges)} pipeline={pipeline} producer={body.get('producer','manual')} group_id={group_id if multi else 'single'}")
|
||||
|
||||
return {
|
||||
'message': f'enqueued {len(task_ids)} canonical docling job(s)',
|
||||
'task_ids': task_ids,
|
||||
'ranges': ranges,
|
||||
'used_split_map': bool(split_map),
|
||||
'group_id': group_id,
|
||||
'pipeline': pipeline,
|
||||
'producer': body.get('producer', 'manual')
|
||||
}
|
||||
|
||||
@ -1,411 +0,0 @@
|
||||
"""
|
||||
Memory-Aware Queue Management System
|
||||
====================================
|
||||
|
||||
Provides intelligent queue management based on memory usage and file sizes
|
||||
rather than simple task count limits. Supports multiple users with fair
|
||||
queuing and capacity management.
|
||||
|
||||
Features:
|
||||
- Memory-based queue limits (not just task count)
|
||||
- Fair queuing across multiple users
|
||||
- Upload capacity checking with user feedback
|
||||
- Graceful degradation under load
|
||||
- Service-specific memory tracking
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
import json
|
||||
import uuid
|
||||
import logging
|
||||
from typing import Dict, List, Optional, Any, Tuple
|
||||
from dataclasses import dataclass, asdict
|
||||
from enum import Enum
|
||||
import redis
|
||||
from .redis_manager import get_redis_manager
|
||||
import psutil
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
class QueueStatus(Enum):
|
||||
ACCEPTING = "accepting" # Normal operation
|
||||
BUSY = "busy" # High load, warn users
|
||||
OVERLOADED = "overloaded" # Reject new uploads
|
||||
MAINTENANCE = "maintenance" # Manual override
|
||||
|
||||
@dataclass
|
||||
class MemoryConfig:
|
||||
"""Memory configuration for queue management."""
|
||||
max_total_memory_mb: int = 2048 # 2GB total queue memory
|
||||
max_user_memory_mb: int = 512 # 512MB per user
|
||||
max_file_size_mb: int = 100 # 100MB max file size
|
||||
memory_warning_threshold: float = 0.8 # Warn at 80%
|
||||
memory_reject_threshold: float = 0.95 # Reject at 95%
|
||||
|
||||
@dataclass
|
||||
class QueuedFile:
|
||||
"""Represents a file waiting in the queue."""
|
||||
file_id: str
|
||||
user_id: str
|
||||
filename: str
|
||||
size_bytes: int
|
||||
mime_type: str
|
||||
cabinet_id: str
|
||||
priority: int = 1
|
||||
queued_at: float = 0
|
||||
estimated_processing_time: int = 300 # seconds
|
||||
memory_estimate_mb: float = 0
|
||||
|
||||
def __post_init__(self):
|
||||
if self.queued_at == 0:
|
||||
self.queued_at = time.time()
|
||||
|
||||
# Estimate memory usage (rough heuristic)
|
||||
self.memory_estimate_mb = self._estimate_memory_usage()
|
||||
|
||||
def _estimate_memory_usage(self) -> float:
|
||||
"""Estimate memory usage for this file during processing."""
|
||||
base_mb = self.size_bytes / (1024 * 1024)
|
||||
|
||||
# Processing multipliers based on operations
|
||||
if self.mime_type == 'application/pdf':
|
||||
# PDF: original + extracted text + images + thumbnails
|
||||
return base_mb * 3.5
|
||||
elif self.mime_type.startswith('image/'):
|
||||
# Images: original + resized variants + OCR text
|
||||
return base_mb * 2.5
|
||||
else:
|
||||
# Other docs: original + PDF conversion + processing
|
||||
return base_mb * 4.0
|
||||
|
||||
class MemoryAwareQueue:
|
||||
"""Memory-aware queue management system."""
|
||||
|
||||
def __init__(self, environment: str = "dev"):
|
||||
self.redis_manager = get_redis_manager(environment)
|
||||
self.redis_client = self.redis_manager.client
|
||||
self.config = self._load_config()
|
||||
|
||||
# Redis keys
|
||||
self.upload_queue_key = "upload_queue"
|
||||
self.processing_memory_key = "processing_memory"
|
||||
self.user_quota_key = "user_quotas"
|
||||
self.system_status_key = "system_status"
|
||||
|
||||
logger.info(f"🧠 Memory-aware queue initialized (max: {self.config.max_total_memory_mb}MB)")
|
||||
|
||||
def _load_config(self) -> MemoryConfig:
|
||||
"""Load memory configuration from environment."""
|
||||
return MemoryConfig(
|
||||
max_total_memory_mb=int(os.getenv('QUEUE_MAX_MEMORY_MB', '2048')),
|
||||
max_user_memory_mb=int(os.getenv('QUEUE_MAX_USER_MEMORY_MB', '512')),
|
||||
max_file_size_mb=int(os.getenv('MAX_FILE_SIZE_MB', '100')),
|
||||
memory_warning_threshold=float(os.getenv('MEMORY_WARNING_THRESHOLD', '0.8')),
|
||||
memory_reject_threshold=float(os.getenv('MEMORY_REJECT_THRESHOLD', '0.95'))
|
||||
)
|
||||
|
||||
def check_upload_capacity(self, user_id: str, file_size_bytes: int,
|
||||
mime_type: str) -> Tuple[bool, str, Dict[str, Any]]:
|
||||
"""
|
||||
Check if system can accept a new upload.
|
||||
|
||||
Returns:
|
||||
(can_accept, message, queue_info)
|
||||
"""
|
||||
|
||||
# Create temporary QueuedFile to estimate memory
|
||||
temp_file = QueuedFile(
|
||||
file_id="temp",
|
||||
user_id=user_id,
|
||||
filename="temp",
|
||||
size_bytes=file_size_bytes,
|
||||
mime_type=mime_type,
|
||||
cabinet_id="temp"
|
||||
)
|
||||
|
||||
file_memory_mb = temp_file.memory_estimate_mb
|
||||
|
||||
# Check file size limit
|
||||
if file_size_bytes > (self.config.max_file_size_mb * 1024 * 1024):
|
||||
return False, f"File too large (max: {self.config.max_file_size_mb}MB)", {}
|
||||
|
||||
# Get current memory usage
|
||||
current_memory = self._get_current_memory_usage()
|
||||
user_memory = self._get_user_memory_usage(user_id)
|
||||
|
||||
# Check user quota
|
||||
if user_memory + file_memory_mb > self.config.max_user_memory_mb:
|
||||
return False, f"User quota exceeded (limit: {self.config.max_user_memory_mb}MB)", {
|
||||
'user_current': user_memory,
|
||||
'user_limit': self.config.max_user_memory_mb
|
||||
}
|
||||
|
||||
# Check system capacity
|
||||
total_after = current_memory + file_memory_mb
|
||||
max_memory = self.config.max_total_memory_mb
|
||||
|
||||
if total_after > (max_memory * self.config.memory_reject_threshold):
|
||||
queue_info = self._get_queue_info()
|
||||
return False, "System overloaded. Please try again later.", {
|
||||
'current_memory': current_memory,
|
||||
'max_memory': max_memory,
|
||||
'utilization': current_memory / max_memory,
|
||||
'queue_position': queue_info['total_queued'] + 1
|
||||
}
|
||||
|
||||
# Calculate wait time estimate
|
||||
wait_estimate = self._estimate_wait_time(user_id)
|
||||
|
||||
status = "ready"
|
||||
message = "Upload accepted"
|
||||
|
||||
if total_after > (max_memory * self.config.memory_warning_threshold):
|
||||
status = "busy"
|
||||
message = f"System busy. Estimated wait: {wait_estimate // 60}m {wait_estimate % 60}s"
|
||||
|
||||
return True, message, {
|
||||
'status': status,
|
||||
'estimated_wait_seconds': wait_estimate,
|
||||
'memory_usage': {
|
||||
'current': current_memory,
|
||||
'after_upload': total_after,
|
||||
'limit': max_memory,
|
||||
'utilization': total_after / max_memory
|
||||
},
|
||||
'user_quota': {
|
||||
'used': user_memory,
|
||||
'after_upload': user_memory + file_memory_mb,
|
||||
'limit': self.config.max_user_memory_mb
|
||||
}
|
||||
}
|
||||
|
||||
def enqueue_file(self, file_id: str, user_id: str, filename: str,
|
||||
size_bytes: int, mime_type: str, cabinet_id: str,
|
||||
priority: int = 1) -> Dict[str, Any]:
|
||||
"""
|
||||
Add file to upload queue.
|
||||
|
||||
Returns:
|
||||
Queue information including position and estimated wait time
|
||||
"""
|
||||
|
||||
queued_file = QueuedFile(
|
||||
file_id=file_id,
|
||||
user_id=user_id,
|
||||
filename=filename,
|
||||
size_bytes=size_bytes,
|
||||
mime_type=mime_type,
|
||||
cabinet_id=cabinet_id,
|
||||
priority=priority
|
||||
)
|
||||
|
||||
# Serialize and add to Redis queue (priority queue: higher priority = lower score)
|
||||
score = time.time() - (priority * 1000000) # Priority affects score significantly
|
||||
|
||||
self.redis_client.zadd(
|
||||
self.upload_queue_key,
|
||||
{json.dumps(asdict(queued_file)): score}
|
||||
)
|
||||
|
||||
# Update user quota tracking
|
||||
self._update_user_quota(user_id, queued_file.memory_estimate_mb, increment=True)
|
||||
|
||||
# Get queue position and wait estimate
|
||||
position = self._get_queue_position(file_id)
|
||||
wait_estimate = self._estimate_wait_time(user_id)
|
||||
|
||||
logger.info(f"📋 Queued file {file_id} for user {user_id} (pos: {position}, wait: {wait_estimate}s)")
|
||||
|
||||
return {
|
||||
'queued': True,
|
||||
'file_id': file_id,
|
||||
'queue_position': position,
|
||||
'estimated_wait_seconds': wait_estimate,
|
||||
'memory_estimate_mb': queued_file.memory_estimate_mb
|
||||
}
|
||||
|
||||
def dequeue_next_file(self, service_name: str) -> Optional[QueuedFile]:
|
||||
"""
|
||||
Get next file from queue for processing.
|
||||
|
||||
Args:
|
||||
service_name: The service requesting work (for capacity management)
|
||||
"""
|
||||
|
||||
# Check if service has capacity
|
||||
service_memory = self._get_service_memory_usage(service_name)
|
||||
service_limit = self._get_service_memory_limit(service_name)
|
||||
|
||||
if service_memory >= service_limit:
|
||||
logger.debug(f"Service {service_name} at capacity ({service_memory}/{service_limit}MB)")
|
||||
return None
|
||||
|
||||
# Get next item from priority queue (lowest score first)
|
||||
items = self.redis_client.zrange(self.upload_queue_key, 0, 0, withscores=True)
|
||||
|
||||
if not items:
|
||||
return None
|
||||
|
||||
file_data_json, score = items[0]
|
||||
file_data = json.loads(file_data_json)
|
||||
queued_file = QueuedFile(**file_data)
|
||||
|
||||
# Check if this file would exceed service memory limit
|
||||
if service_memory + queued_file.memory_estimate_mb > service_limit:
|
||||
# Skip this file for now, try smaller ones later
|
||||
logger.debug(f"File {queued_file.file_id} too large for {service_name} capacity")
|
||||
return None
|
||||
|
||||
# Remove from queue
|
||||
self.redis_client.zrem(self.upload_queue_key, file_data_json)
|
||||
|
||||
# Update tracking
|
||||
self._update_user_quota(queued_file.user_id, queued_file.memory_estimate_mb, increment=False)
|
||||
self._update_service_memory(service_name, queued_file.memory_estimate_mb, increment=True)
|
||||
|
||||
logger.info(f"🎯 Dequeued file {queued_file.file_id} for {service_name} processing")
|
||||
|
||||
return queued_file
|
||||
|
||||
def complete_processing(self, service_name: str, file_id: str, memory_used_mb: float):
|
||||
"""Mark file processing as complete and free memory."""
|
||||
self._update_service_memory(service_name, memory_used_mb, increment=False)
|
||||
logger.info(f"✅ Completed processing {file_id} in {service_name} (freed {memory_used_mb}MB)")
|
||||
|
||||
def _get_current_memory_usage(self) -> float:
|
||||
"""Get current total memory usage across all services."""
|
||||
services = ['docling', 'tika', 'llm', 'document_analysis']
|
||||
total = 0
|
||||
|
||||
for service in services:
|
||||
service_key = f"{self.processing_memory_key}:{service}"
|
||||
memory = float(self.redis_client.get(service_key) or 0)
|
||||
total += memory
|
||||
|
||||
return total
|
||||
|
||||
def _get_user_memory_usage(self, user_id: str) -> float:
|
||||
"""Get current memory usage for a specific user."""
|
||||
user_key = f"{self.user_quota_key}:{user_id}"
|
||||
return float(self.redis_client.get(user_key) or 0)
|
||||
|
||||
def _get_service_memory_usage(self, service_name: str) -> float:
|
||||
"""Get current memory usage for a service."""
|
||||
service_key = f"{self.processing_memory_key}:{service_name}"
|
||||
return float(self.redis_client.get(service_key) or 0)
|
||||
|
||||
def _get_service_memory_limit(self, service_name: str) -> float:
|
||||
"""Get memory limit for a service."""
|
||||
# Service-specific memory limits as percentage of total
|
||||
limits = {
|
||||
'docling': 0.4, # 40% for Docling (memory-intensive)
|
||||
'tika': 0.2, # 20% for Tika
|
||||
'llm': 0.3, # 30% for LLM processing
|
||||
'document_analysis': 0.1 # 10% for document analysis
|
||||
}
|
||||
|
||||
percentage = limits.get(service_name, 0.1)
|
||||
return self.config.max_total_memory_mb * percentage
|
||||
|
||||
def _update_user_quota(self, user_id: str, memory_mb: float, increment: bool):
|
||||
"""Update user memory quota tracking."""
|
||||
user_key = f"{self.user_quota_key}:{user_id}"
|
||||
|
||||
if increment:
|
||||
self.redis_client.incrbyfloat(user_key, memory_mb)
|
||||
else:
|
||||
current = float(self.redis_client.get(user_key) or 0)
|
||||
new_value = max(0, current - memory_mb)
|
||||
self.redis_client.set(user_key, new_value)
|
||||
|
||||
# Set expiration for cleanup
|
||||
self.redis_client.expire(user_key, 86400) # 24 hours
|
||||
|
||||
def _update_service_memory(self, service_name: str, memory_mb: float, increment: bool):
|
||||
"""Update service memory usage tracking."""
|
||||
service_key = f"{self.processing_memory_key}:{service_name}"
|
||||
|
||||
if increment:
|
||||
self.redis_client.incrbyfloat(service_key, memory_mb)
|
||||
else:
|
||||
current = float(self.redis_client.get(service_key) or 0)
|
||||
new_value = max(0, current - memory_mb)
|
||||
self.redis_client.set(service_key, new_value)
|
||||
|
||||
# Set expiration for cleanup
|
||||
self.redis_client.expire(service_key, 3600) # 1 hour
|
||||
|
||||
def _get_queue_position(self, file_id: str) -> int:
|
||||
"""Get position of file in queue."""
|
||||
items = self.redis_client.zrange(self.upload_queue_key, 0, -1)
|
||||
for i, item in enumerate(items):
|
||||
file_data = json.loads(item)
|
||||
if file_data['file_id'] == file_id:
|
||||
return i + 1
|
||||
return 0
|
||||
|
||||
def _estimate_wait_time(self, user_id: str) -> int:
|
||||
"""Estimate wait time for user's next file."""
|
||||
# Simple estimation based on queue position and average processing time
|
||||
queue_size = self.redis_client.zcard(self.upload_queue_key)
|
||||
avg_processing_time = 300 # 5 minutes average
|
||||
|
||||
return int(queue_size * avg_processing_time * 0.5) # Assume parallel processing
|
||||
|
||||
def _get_queue_info(self) -> Dict[str, Any]:
|
||||
"""Get comprehensive queue information."""
|
||||
total_queued = self.redis_client.zcard(self.upload_queue_key)
|
||||
current_memory = self._get_current_memory_usage()
|
||||
max_memory = self.config.max_total_memory_mb
|
||||
|
||||
return {
|
||||
'total_queued': total_queued,
|
||||
'memory_usage': {
|
||||
'current_mb': current_memory,
|
||||
'max_mb': max_memory,
|
||||
'utilization': current_memory / max_memory if max_memory > 0 else 0
|
||||
},
|
||||
'status': self._determine_system_status(current_memory, max_memory)
|
||||
}
|
||||
|
||||
def _determine_system_status(self, current_memory: float, max_memory: float) -> str:
|
||||
"""Determine current system status based on memory usage."""
|
||||
utilization = current_memory / max_memory if max_memory > 0 else 0
|
||||
|
||||
if utilization >= self.config.memory_reject_threshold:
|
||||
return "overloaded"
|
||||
elif utilization >= self.config.memory_warning_threshold:
|
||||
return "busy"
|
||||
else:
|
||||
return "ready"
|
||||
|
||||
def get_system_status(self) -> Dict[str, Any]:
|
||||
"""Get comprehensive system status for monitoring."""
|
||||
queue_info = self._get_queue_info()
|
||||
|
||||
# Service-specific info
|
||||
services = {}
|
||||
for service_name in ['docling', 'tika', 'llm', 'document_analysis']:
|
||||
services[service_name] = {
|
||||
'memory_used_mb': self._get_service_memory_usage(service_name),
|
||||
'memory_limit_mb': self._get_service_memory_limit(service_name),
|
||||
'utilization': self._get_service_memory_usage(service_name) / self._get_service_memory_limit(service_name)
|
||||
}
|
||||
|
||||
return {
|
||||
'status': queue_info['status'],
|
||||
'queue': queue_info,
|
||||
'services': services,
|
||||
'config': asdict(self.config)
|
||||
}
|
||||
|
||||
# Convenience functions
|
||||
def get_memory_queue(environment: str = "dev") -> MemoryAwareQueue:
|
||||
"""Get memory-aware queue instance."""
|
||||
return MemoryAwareQueue(environment)
|
||||
|
||||
def check_upload_capacity(user_id: str, file_size: int, mime_type: str, environment: str = "dev") -> Tuple[bool, str, Dict]:
|
||||
"""Quick capacity check for upload."""
|
||||
queue = get_memory_queue(environment)
|
||||
return queue.check_upload_capacity(user_id, file_size, mime_type)
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
x
Reference in New Issue
Block a user