chore: add .env and *.bak to .gitignore, remove archive/ folder

This commit is contained in:
Classroom Copilot Dev 2026-02-23 21:14:43 +00:00
parent c92da048fd
commit 2436a00cac
8 changed files with 3 additions and 5825 deletions

3
.gitignore vendored
View File

@ -1,8 +1,11 @@
__pycache__
.pytest_cache
.env
.DS_Store
.archive/*
data/logs/*
*.bak

View File

@ -1,66 +0,0 @@
# Auto-Processing Code Archive
This directory contains the complex auto-processing system that was previously used for automatic document processing after file upload.
## Archived Components
### Core Processing Files
- `files_with_auto_processing.py` - Original files.py router with automatic processing
- `pipeline_controller.py` - Complex multi-phase pipeline orchestration
- `task_processors.py` - Document processing task handlers
### Advanced Queue Management (Created but not deployed)
- `memory_aware_queue.py` - Memory-based intelligent queue management
- `enhanced_upload_handler.py` - Advanced upload handler with queuing
- `enhanced_upload.py` - API endpoints for advanced upload system
## What This System Did
### Automatic Processing Pipeline
1. **File Upload** → Immediate processing trigger
2. **PDF Conversion** (synchronous, blocking)
3. **Phase 1**: Structure discovery (Tika, Page Images, Document Analysis, Split Map)
4. **Phase 2**: Docling processing (NO_OCR → OCR → VLM pipelines)
5. **Complex Dependencies**: Phase coordination, task sequencing
6. **Redis Queue Management**: Service limits, rate limits, dependency tracking
### Features
- Multi-phase processing pipelines
- Complex task dependency management
- Memory-aware queue limits
- Multi-user capacity management
- Real-time processing status
- WebSocket status updates
- Service-specific resource limits
- Task recovery on restart
## Why Archived
The system was overly complex for the current needs:
- **Complexity**: Multi-phase pipelines with complex dependencies
- **Blocking Operations**: Synchronous PDF conversion causing timeouts
- **Resource Management**: Over-engineered for single-user scenarios
- **User Experience**: Users had to wait for processing to complete
## New Simplified Approach
The new system focuses on:
- **Simple Upload**: Just store files and create database records
- **No Auto-Processing**: Users manually trigger processing when needed
- **Directory Support**: Upload entire folders with manifest tracking
- **Immediate Response**: Users get instant confirmation without waiting
## If You Need to Restore
To restore the auto-processing functionality:
1. Copy `files_with_auto_processing.py` back to `routers/database/files/files.py`
2. Ensure `pipeline_controller.py` and `task_processors.py` are in `modules/`
3. Update imports and dependencies
4. Re-enable background processing in upload handlers
## Migration Notes
The database schema and Redis structure remain compatible. The new simplified system can coexist with the archived processing logic if needed.
Date Archived: 2026-02-23 (taken from the commit metadata; the literal "$(date)" shell placeholder in the original template was never expanded)
Reason: Simplification for directory upload implementation

View File

@ -1,142 +0,0 @@
"""
Enhanced Upload Router with Memory-Aware Queuing
===============================================
Provides intelligent upload endpoints with capacity checking,
queue management, and real-time status updates.
Endpoints:
- POST /upload/check-capacity - Pre-check if upload is possible
- POST /upload/with-queue - Upload with intelligent queuing
- GET /upload/status/{file_id} - Get processing status
- GET /upload/queue-status - Get overall queue status
- WebSocket /ws/upload-status - Real-time status updates
"""
import os
import uuid
import json
import logging
from typing import Dict, List, Optional, Any
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, WebSocket, WebSocketDisconnect
from fastapi.responses import JSONResponse
from modules.auth.supabase_bearer import SupabaseBearer
from modules.enhanced_upload_handler import get_upload_handler
from modules.memory_aware_queue import get_memory_queue
from modules.logger_tool import initialise_logger
router = APIRouter()
auth = SupabaseBearer()
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
# WebSocket connection manager for real-time updates
class ConnectionManager:
    """Registry of open WebSocket connections keyed by file id.

    Used to push processing-status updates to every client watching a given
    file. The fastapi ``WebSocket`` type is referenced lazily (as a string
    annotation) so defining this class does not require evaluating the name.
    """

    def __init__(self):
        # file_id -> list of sockets currently subscribed to that file
        self.active_connections: Dict[str, List["WebSocket"]] = {}

    async def connect(self, websocket: "WebSocket", file_id: str):
        """Accept the socket and register it under *file_id*."""
        await websocket.accept()
        if file_id not in self.active_connections:
            self.active_connections[file_id] = []
        self.active_connections[file_id].append(websocket)

    def disconnect(self, websocket: "WebSocket", file_id: str):
        """Unregister the socket; drop the file entry once its list is empty."""
        if file_id in self.active_connections:
            self.active_connections[file_id].remove(websocket)
            if not self.active_connections[file_id]:
                del self.active_connections[file_id]

    async def broadcast_to_file(self, file_id: str, message: dict):
        """Send *message* to every socket watching *file_id*.

        Sockets whose send fails are pruned from the registry.
        """
        if file_id in self.active_connections:
            # Iterate over a copy: failed sends mutate the underlying list.
            for connection in self.active_connections[file_id].copy():
                try:
                    await connection.send_json(message)
                except Exception:
                    # was a bare `except:`, which also swallowed BaseExceptions
                    # such as asyncio.CancelledError — narrowed to Exception
                    self.active_connections[file_id].remove(connection)


manager = ConnectionManager()
@router.post("/upload/check-capacity")
async def check_upload_capacity(
    file_size: int = Form(...),
    mime_type: str = Form(...),
    payload: Dict[str, Any] = Depends(auth)
):
    """
    Check if user can upload a file of given size and type.
    Returns capacity information and recommendations.

    Responds 200 when the upload is eligible, 429 (Too Many Requests) when
    capacity is exhausted, and 500 on any unexpected error.
    """
    try:
        # `payload` is the decoded auth token; falls back to 'anonymous'
        # when neither claim is present.
        user_id = payload.get('sub') or payload.get('user_id', 'anonymous')
        # Determine environment
        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
        upload_handler = get_upload_handler(environment)
        # Delegates the actual capacity/quota decision to the handler.
        eligible, message, details = upload_handler.check_upload_eligibility(
            user_id, file_size, mime_type
        )
        response = {
            'eligible': eligible,
            'message': message,
            'details': details,
            # NOTE(review): `time` is not imported at the top of this module —
            # confirm it is bound before the first request reaches this handler.
            'timestamp': time.time()
        }
        status_code = 200 if eligible else 429  # Too Many Requests if not eligible
        logger.info(f"📋 Capacity check for user {user_id}: {eligible} - {message}")
        return JSONResponse(content=response, status_code=status_code)
    except Exception as e:
        logger.error(f"Capacity check error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
# `time` and `asyncio` were previously imported at the very bottom of this
# module; import them here, ahead of first use.
import time
import asyncio


@router.post("/upload/with-queue")
async def upload_with_queue(
    cabinet_id: str = Form(...),
    path: str = Form(...),
    scope: str = Form(...),
    priority: int = Form(1),
    file: UploadFile = File(...),
    payload: Dict[str, Any] = Depends(auth)
):
    """
    Upload file with intelligent queuing and capacity management.
    Returns queue information and processing status.

    When UPLOAD_QUEUE_ENABLED is off this responds 501 — the legacy
    immediate-processing path is not implemented in this endpoint.
    """
    try:
        user_id = payload.get('sub') or payload.get('user_id', 'anonymous')
        # Read file content
        file_bytes = await file.read()
        file_size = len(file_bytes)
        mime_type = file.content_type or 'application/octet-stream'
        filename = file.filename or path
        # fixed: log line previously rendered a lost placeholder instead of
        # the uploaded filename
        logger.info(f"📤 Upload request: {filename} ({file_size} bytes) for user {user_id}")
        # Determine environment
        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
        upload_handler = get_upload_handler(environment)
        # Check if upload queuing is enabled
        if os.getenv('UPLOAD_QUEUE_ENABLED', 'true').lower() == 'true':
            # Use new queue-based upload
            file_id = str(uuid.uuid4())
            result = await upload_handler.handle_upload_with_queue(
                file_id=file_id,
                user_id=user_id,
                filename=filename,
                file_bytes=file_bytes,
                mime_type=mime_type,
                cabinet_id=cabinet_id,
                priority=priority
            )
            return result
        else:
            # Fall back to immediate processing (legacy mode)
            logger.warning("Using legacy immediate processing mode")
            # TODO: Call original upload_file function
            raise HTTPException(status_code=501, detail="Legacy mode not implemented in this endpoint")
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Upload error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@router.get("/upload/status/{file_id}")
async def get_upload_status(
    file_id: str,
    payload: Dict[str, Any] = Depends(auth)
):
    """
    Get current processing status for an uploaded file.

    Raises 404 when the file id is unknown to the handler.
    """
    try:
        # Determine environment
        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
        upload_handler = get_upload_handler(environment)
        status = upload_handler.get_processing_status(file_id)
        if status.get('status') == 'not_found':
            raise HTTPException(status_code=404, detail="File not found")
        return status
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Status check error for {file_id}: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@router.get("/upload/queue-status")
async def get_queue_status(
    payload: Dict[str, Any] = Depends(auth)
):
    """
    Get overall queue status and system capacity information.
    """
    try:
        # Determine environment
        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
        memory_queue = get_memory_queue(environment)
        system_status = memory_queue.get_system_status()
        return {
            'system_status': system_status,
            'timestamp': time.time(),
            'environment': environment
        }
    except Exception as e:
        logger.error(f"Queue status error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@router.websocket("/ws/upload-status/{file_id}")
async def websocket_upload_status(websocket: WebSocket, file_id: str):
    """
    WebSocket endpoint for real-time upload status updates.

    Polls the handler every 5 seconds and pushes the current status until
    the client disconnects.
    """
    await manager.connect(websocket, file_id)
    try:
        # Send initial status
        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
        upload_handler = get_upload_handler(environment)
        initial_status = upload_handler.get_processing_status(file_id)
        await websocket.send_json({
            'type': 'status_update',
            'data': initial_status
        })
        # Keep connection alive and listen for updates
        while True:
            # In a real implementation, you'd have a background task
            # that pushes updates when file status changes
            await asyncio.sleep(5)
            # Check for status updates
            current_status = upload_handler.get_processing_status(file_id)
            await websocket.send_json({
                'type': 'status_update',
                'data': current_status
            })
    except WebSocketDisconnect:
        manager.disconnect(websocket, file_id)
    except Exception as e:
        logger.error(f"WebSocket error for {file_id}: {e}")
        manager.disconnect(websocket, file_id)


# Background task to process upload queue
@router.on_event("startup")
async def start_queue_processor():
    """Start background queue processor."""
    if os.getenv('UPLOAD_QUEUE_ENABLED', 'true').lower() != 'true':
        logger.info("📋 Upload queue disabled, skipping queue processor")
        return
    environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
    upload_handler = get_upload_handler(environment)
    # Start background processor (asyncio is imported at the top of this
    # section; the redundant local import was removed)
    asyncio.create_task(upload_handler.process_queued_files("document_processor"))
    logger.info("🚀 Upload queue processor started")

View File

@ -1,362 +0,0 @@
"""
Enhanced Upload Handler with Memory-Aware Queuing
=================================================
Replaces the immediate processing model with intelligent queue management.
Provides user feedback about capacity, queue position, and processing status.
Features:
- Pre-upload capacity checking
- Memory-aware queuing with user quotas
- Real-time status updates via WebSocket/SSE
- Graceful degradation under load
- Fair queuing across multiple users
"""
import os
import uuid
import time
import logging
import asyncio
from typing import Dict, List, Optional, Any, Tuple
from fastapi import HTTPException, BackgroundTasks
from dataclasses import asdict
from .memory_aware_queue import get_memory_queue, QueuedFile
from .redis_manager import get_redis_manager
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
from modules.database.tools.storage.storage_admin import StorageAdmin
logger = logging.getLogger(__name__)
class EnhancedUploadHandler:
    """Enhanced upload handler with memory-aware queuing.

    Coordinates three stores: Supabase storage (file bytes), the `files`
    table (record + status), and Redis (live processing status). Note:
    `json` is used below but is imported at the bottom of this module —
    fragile placement, though bound before any request runs.
    """

    def __init__(self, environment: str = "dev"):
        self.memory_queue = get_memory_queue(environment)
        self.redis_manager = get_redis_manager(environment)
        self.redis_client = self.redis_manager.client
        # Processing status tracking (Redis key prefix for per-file status)
        self.processing_status_key = "file_processing_status"

    def check_upload_eligibility(self, user_id: str, file_size: int,
                                 mime_type: str) -> Tuple[bool, str, Dict[str, Any]]:
        """
        Check if user can upload a file right now.
        Returns:
            (eligible, message, details)
        """
        # Check system capacity — the queue makes the actual quota decision
        can_accept, message, queue_info = self.memory_queue.check_upload_capacity(
            user_id, file_size, mime_type
        )
        if not can_accept:
            return False, message, {
                'reason': 'capacity_exceeded',
                'queue_info': queue_info,
                'recommendations': self._get_recommendations(queue_info)
            }
        return True, message, {
            'status': 'ready_for_upload',
            'queue_info': queue_info,
            'processing_estimate': self._estimate_processing_time(file_size, mime_type)
        }

    async def handle_upload_with_queue(self, file_id: str, user_id: str,
                                       filename: str, file_bytes: bytes,
                                       mime_type: str, cabinet_id: str,
                                       priority: int = 1) -> Dict[str, Any]:
        """
        Handle file upload with intelligent queuing.
        Steps:
        1. Store file immediately (cheap operation)
        2. Add to processing queue
        3. Return queue status to user
        4. Process asynchronously when capacity available

        Raises HTTPException(500) if storage, the DB insert, or enqueue fail;
        on enqueue failure the stored file and DB row are rolled back.
        """
        # Store file immediately (this is fast)
        storage = StorageAdmin()
        client = SupabaseServiceRoleClient()
        # Create database record
        bucket = f"{cabinet_id}-files"  # or your bucket naming convention
        # NOTE(review): "(unknown)" below looks like a lost template
        # placeholder — most likely the filename segment; confirm against
        # repository history before relying on this path layout.
        storage_path = f"{cabinet_id}/{file_id}/(unknown)"
        try:
            # Store file
            storage.upload_file(bucket, storage_path, file_bytes, mime_type, upsert=True)
            # Create file record
            insert_res = client.supabase.table('files').insert({
                'id': file_id,
                'name': filename,
                'cabinet_id': cabinet_id,
                'bucket': bucket,
                'path': storage_path,
                'mime_type': mime_type,
                'uploaded_by': user_id,
                'size_bytes': len(file_bytes),
                'source': 'classroomcopilot-web',
                'status': 'queued_for_processing'  # New status
            }).execute()
            if not insert_res.data:
                # NOTE(review): raised inside the try, so the generic handler
                # below re-wraps it as "Storage failed: ..." — confirm intended.
                raise HTTPException(status_code=500, detail="Failed to create file record")
        except Exception as e:
            logger.error(f"Failed to store file {file_id}: {e}")
            raise HTTPException(status_code=500, detail=f"Storage failed: {str(e)}")
        # Add to processing queue
        try:
            queue_result = self.memory_queue.enqueue_file(
                file_id=file_id,
                user_id=user_id,
                filename=filename,
                size_bytes=len(file_bytes),
                mime_type=mime_type,
                cabinet_id=cabinet_id,
                priority=priority
            )
            # Update file status in database
            client.supabase.table('files').update({
                'status': 'queued_for_processing',
                'extra': {
                    'queue_position': queue_result['queue_position'],
                    'estimated_wait_seconds': queue_result['estimated_wait_seconds'],
                    'memory_estimate_mb': queue_result['memory_estimate_mb']
                }
            }).eq('id', file_id).execute()
            logger.info(f"📋 File {file_id} queued at position {queue_result['queue_position']}")
            return {
                'status': 'upload_successful',
                'message': 'File uploaded and queued for processing',
                'file_id': file_id,
                'queue_info': queue_result,
                'next_steps': {
                    'poll_status_endpoint': f'/database/files/{file_id}/processing-status',
                    'websocket_updates': f'/ws/file-processing/{file_id}'
                }
            }
        except Exception as e:
            logger.error(f"Failed to queue file {file_id}: {e}")
            # Clean up stored file (best-effort rollback of storage + DB row)
            try:
                storage.delete_file(bucket, storage_path)
                client.supabase.table('files').delete().eq('id', file_id).execute()
            except:
                # NOTE(review): bare except also swallows BaseException;
                # deliberate best-effort cleanup, but consider `except Exception`
                pass
            raise HTTPException(status_code=500, detail=f"Queue failed: {str(e)}")

    async def process_queued_files(self, service_name: str = "document_processor"):
        """
        Background service to process queued files.
        This runs continuously as a background task.
        """
        logger.info(f"🚀 Started queue processor for {service_name}")
        while True:
            try:
                # Get next file from queue
                queued_file = self.memory_queue.dequeue_next_file(service_name)
                if not queued_file:
                    # No files ready for processing
                    await asyncio.sleep(5)
                    continue
                # Update file status
                await self._update_processing_status(queued_file.file_id, 'processing')
                # Process the file
                try:
                    await self._process_file(queued_file, service_name)
                    await self._update_processing_status(queued_file.file_id, 'completed')
                except Exception as e:
                    logger.error(f"Failed to process file {queued_file.file_id}: {e}")
                    await self._update_processing_status(queued_file.file_id, 'failed', str(e))
                finally:
                    # Always free memory reserved by the queue for this file
                    self.memory_queue.complete_processing(
                        service_name,
                        queued_file.file_id,
                        queued_file.memory_estimate_mb
                    )
            except Exception as e:
                logger.error(f"Queue processor error: {e}")
                await asyncio.sleep(10)  # Back off on errors

    async def _process_file(self, queued_file: QueuedFile, service_name: str):
        """Process a single file from the queue.

        Kicks off Phase 1 of the pipeline and records the task ids; raises on
        a missing file record so the caller marks the file 'failed'.
        """
        logger.info(f"🔄 Processing file {queued_file.file_id} in {service_name}")
        # Import here to avoid circular imports
        from modules.pipeline_controller import get_pipeline_controller
        client = SupabaseServiceRoleClient()
        controller = get_pipeline_controller()
        # Get file record
        file_result = client.supabase.table('files').select('*').eq('id', queued_file.file_id).single().execute()
        file_row = file_result.data
        if not file_row:
            raise Exception(f"File record not found: {queued_file.file_id}")
        # Update status to processing
        client.supabase.table('files').update({
            'status': 'processing'
        }).eq('id', queued_file.file_id).execute()
        # Convert to PDF if needed (this is where the bottleneck was before)
        processing_path = await self._handle_pdf_conversion(file_row)
        # Enqueue Phase 1 tasks
        phase1_tasks = controller.enqueue_phase1_tasks(
            file_id=queued_file.file_id,
            file_row={**file_row, 'path': processing_path},
            processing_path=processing_path,
            processing_mime=file_row['mime_type']
        )
        # Update database with task IDs
        client.supabase.table('files').update({
            'status': 'phase1_processing',
            'extra': {
                # NOTE(review): assumes 'extra' is a dict when present;
                # a NULL column value would make `.get('extra', {})` return None
                **file_row.get('extra', {}),
                'phase1_tasks': phase1_tasks,
                'processing_started_at': time.time()
            }
        }).eq('id', queued_file.file_id).execute()
        logger.info(f"✅ File {queued_file.file_id} processing initiated")

    async def _handle_pdf_conversion(self, file_row: Dict[str, Any]) -> str:
        """Handle PDF conversion asynchronously.

        Currently a stub: PDFs pass through; everything else returns its
        original path and conversion is deferred to the pipeline.
        """
        if file_row['mime_type'] == 'application/pdf':
            return file_row['path']
        # TODO: Implement async PDF conversion
        # For now, return original path and handle conversion in pipeline
        logger.info(f"PDF conversion queued for file {file_row['id']}")
        return file_row['path']

    async def _update_processing_status(self, file_id: str, status: str, error: Optional[str] = None):
        """Update file processing status in both Redis (live) and the DB."""
        status_data = {
            'file_id': file_id,
            'status': status,
            'timestamp': time.time(),
            'error': error
        }
        # Store in Redis for real-time updates
        status_key = f"{self.processing_status_key}:{file_id}"
        self.redis_client.setex(status_key, 86400, json.dumps(status_data))  # 24h expiry
        # Update database
        client = SupabaseServiceRoleClient()
        client.supabase.table('files').update({
            'status': status,
            'error_message': error
        }).eq('id', file_id).execute()
        logger.info(f"📊 Status update for {file_id}: {status}")

    def get_processing_status(self, file_id: str) -> Dict[str, Any]:
        """Get current processing status for a file.

        Prefers the Redis cache; falls back to the DB row; returns a
        'not_found' status dict when neither knows the file.
        """
        status_key = f"{self.processing_status_key}:{file_id}"
        status_json = self.redis_client.get(status_key)
        if status_json:
            return json.loads(status_json)
        # Fallback to database
        client = SupabaseServiceRoleClient()
        result = client.supabase.table('files').select('status, error_message, extra').eq('id', file_id).single().execute()
        if result.data:
            return {
                'file_id': file_id,
                'status': result.data['status'],
                'error': result.data.get('error_message'),
                'extra': result.data.get('extra', {})
            }
        return {'file_id': file_id, 'status': 'not_found'}

    def _estimate_processing_time(self, file_size: int, mime_type: str) -> Dict[str, Any]:
        """Estimate processing time for a file based on size and mime prefix."""
        # Base time estimates (in seconds)
        base_times = {
            'application/pdf': 60,  # 1 minute per MB
            'application/msword': 120,  # 2 minutes per MB
            'image/': 30  # 30 seconds per MB
        }
        # Find matching mime type (first prefix match wins)
        time_per_mb = 60  # default
        for mime_prefix, time_val in base_times.items():
            if mime_type.startswith(mime_prefix):
                time_per_mb = time_val
                break
        file_size_mb = file_size / (1024 * 1024)
        estimated_seconds = int(file_size_mb * time_per_mb)
        return {
            'estimated_seconds': estimated_seconds,
            'estimated_minutes': estimated_seconds / 60,
            # Fixed 20/30/50 split across the three phases
            'phases': {
                'pdf_conversion': estimated_seconds * 0.2,
                'metadata_extraction': estimated_seconds * 0.3,
                'docling_processing': estimated_seconds * 0.5
            }
        }

    def _get_recommendations(self, queue_info: Dict[str, Any]) -> List[str]:
        """Get recommendations for user when upload is rejected."""
        recommendations = []
        if queue_info.get('reason') == 'file_too_large':
            recommendations.append("Try compressing your file or splitting it into smaller parts")
        if queue_info.get('utilization', 0) > 0.9:
            recommendations.append("System is currently overloaded. Try uploading during off-peak hours")
            recommendations.append("Consider uploading smaller files first")
        if queue_info.get('user_current', 0) > 0:
            recommendations.append("Wait for your current uploads to complete before uploading more")
        if not recommendations:
            recommendations.append("Please try again in a few minutes")
        return recommendations
# Convenience functions
def get_upload_handler(environment: str = "dev") -> EnhancedUploadHandler:
    """Build an EnhancedUploadHandler for *environment*.

    A fresh instance is constructed on every call (no caching here).
    """
    handler = EnhancedUploadHandler(environment)
    return handler
import json

View File

@ -1,997 +0,0 @@
import os
import io
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, BackgroundTasks
from typing import Any, Dict, Optional
import uuid
import re
import requests
import os
import tempfile
from pathlib import Path
from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str
from modules.logger_tool import initialise_logger
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
from modules.database.supabase.utils.storage import StorageAdmin
from modules.document_processor import DocumentProcessor
from modules.queue_system import (
enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
enqueue_document_analysis_task, enqueue_page_images_task,
TaskPriority, get_queue, QueueConnectionError
)
from fastapi.responses import Response
from fastapi import Body
# Module-level singletons shared by every endpoint in this router.
router = APIRouter()
auth = SupabaseBearer()
doc_processor = DocumentProcessor()
# Default storage bucket when none is derived from scope.
DEFAULT_BUCKET = os.getenv('DEFAULT_FILES_BUCKET', 'cc.users')
# Timeout configurations (in seconds)
TIKA_TIMEOUT = int(os.getenv('TIKA_TIMEOUT', '300'))  # 5 minutes default
DOCLING_FRONTMATTER_TIMEOUT = int(os.getenv('DOCLING_FRONTMATTER_TIMEOUT', '1800'))  # 30 minutes default
DOCLING_NOOCR_TIMEOUT = int(os.getenv('DOCLING_NOOCR_TIMEOUT', '3600'))  # 1 hour default
# (Legacy feature flags removed - using new three-phase system)
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
def _safe_filename(name: str) -> str:
base = os.path.basename(name or 'file')
return re.sub(r"[^A-Za-z0-9._-]+", "_", base)
def _choose_bucket(scope: str, user_id: str, school_id: Optional[str]) -> str:
scope = (scope or 'teacher').lower()
if scope == 'school' and school_id:
return f"cc.institutes.{school_id}.private"
# teacher / student fall back to users bucket for now
return 'cc.users'
@router.post("/files/upload")
async def upload_file(
    cabinet_id: str = Form(...),
    path: str = Form(...),
    scope: str = Form('teacher'),
    school_id: Optional[str] = Form(default=None),
    file: UploadFile = File(...),
    payload: Dict[str, Any] = Depends(auth),
    background_tasks: BackgroundTasks = None
):
    """Upload a file into a cabinet and kick off initial artefact generation.

    Flow: stage a DB row (to obtain a file_id) → upload bytes to storage at
    the final path → point the row at that path → schedule artefact
    generation. Returns the update result rows from the final DB update.
    """
    user_id = payload.get('sub') or payload.get('user_id')
    if not user_id:
        raise HTTPException(status_code=401, detail="Invalid token payload")
    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()
    # Determine target bucket by scope
    bucket = _choose_bucket(scope, user_id, school_id)
    # Stage DB row to get file_id
    staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}"
    name = _safe_filename(path or file.filename)
    file_bytes = await file.read()
    insert_res = client.supabase.table('files').insert({
        'cabinet_id': cabinet_id,
        'name': name,
        'path': staged_path,
        'bucket': bucket,
        'mime_type': file.content_type,
        'uploaded_by': user_id,
        'size_bytes': len(file_bytes),
        'source': 'classroomcopilot-web'
    }).execute()
    if not insert_res.data:
        raise HTTPException(status_code=500, detail="Failed to create file record")
    file_row = insert_res.data[0]
    file_id = file_row['id']
    # Final storage path: bucket/cabinet_id/file_id/file
    final_storage_path = f"{cabinet_id}/{file_id}/{name}"
    try:
        storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True)
    except Exception as e:
        # cleanup staged row so a failed upload leaves no orphan record
        client.supabase.table('files').delete().eq('id', file_id).execute()
        raise HTTPException(status_code=500, detail=f"Storage upload failed: {str(e)}")
    # Update DB path to final
    update_res = client.supabase.table('files').update({
        'path': final_storage_path
    }).eq('id', file_id).execute()
    # Kick off initial artefacts generation in background (Tika + Docling frontmatter + no-OCR)
    # Failures here are logged, not surfaced — the upload itself already succeeded.
    try:
        if background_tasks is not None:
            logger.info(f"Scheduling initial artefacts generation for file_id={file_id}")
            background_tasks.add_task(generate_initial_artefacts, file_id, payload)
        else:
            # NOTE(review): presumably FastAPI injects BackgroundTasks so this
            # branch is rare — confirm the `= None` default is intentional.
            logger.info(f"Running initial artefacts generation synchronously for file_id={file_id}")
            generate_initial_artefacts(file_id, payload)
    except Exception as e:
        logger.error(f"Failed to schedule initial artefacts for file_id={file_id}: {e}")
    return update_res.data
@router.get("/files")
def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return every file record belonging to the given cabinet."""
    db = SupabaseServiceRoleClient()
    query = db.supabase.table('files').select('*').eq('cabinet_id', cabinet_id)
    return query.execute().data
@router.post("/files/{file_id}/move")
def move_file(file_id: str, body: Dict[str, Any], payload: Dict[str, Any] = Depends(auth)):
    """Re-home a file record: change its cabinet and/or logical path.

    Only 'cabinet_id' and 'path' keys in the body are honoured; responds
    400 when neither is supplied.
    """
    db = SupabaseServiceRoleClient()
    updates = {field: body[field] for field in ('cabinet_id', 'path') if field in body}
    if not updates:
        raise HTTPException(status_code=400, detail="No changes provided")
    outcome = db.supabase.table('files').update(updates).eq('id', file_id).execute()
    return outcome.data
@router.delete("/files/{file_id}")
def delete_file(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Delete a file's database record by id.

    Only the DB row is removed here; no storage call is made in this handler.
    """
    db = SupabaseServiceRoleClient()
    deletion = db.supabase.table('files').delete().eq('id', file_id)
    return deletion.execute().data
@router.get("/files/{file_id}/artefacts")
def list_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return every artefact row for a file, newest first."""
    db = SupabaseServiceRoleClient()
    query = db.supabase.table('document_artefacts').select('*').eq('file_id', file_id)
    return query.order('created_at', desc=True).execute().data
@router.get("/files/{file_id}/viewer-artefacts")
def list_viewer_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """
    Get artefacts organized for UI viewer display, including frontmatter JSON,
    processing bundles, and analysis data with proper display metadata.

    Returns a dict with 'categories' (document_analysis / processing_bundles /
    raw_data), each sorted by ui_order then created_at, plus a total count.
    """
    client = SupabaseServiceRoleClient()
    # Get all artefacts for the file
    res = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).order('created_at', desc=True).execute()
    all_artefacts = res.data or []
    # Organize artefacts by category for UI display
    viewer_artefacts = {
        'document_analysis': [],
        'processing_bundles': [],
        'raw_data': []
    }
    for artefact in all_artefacts:
        artefact_type = artefact.get('type', '')
        extra = artefact.get('extra', {})
        # Enhanced artefact info for UI display — defaults pulled from the
        # row's 'extra' payload, then overridden per type below.
        artefact_info = {
            'id': artefact['id'],
            'type': artefact_type,
            'display_name': extra.get('display_name'),
            'bundle_label': extra.get('bundle_label'),
            'section_title': extra.get('section_title'),
            'page_range': extra.get('page_range'),
            'page_count': extra.get('page_count'),
            'pipeline': extra.get('pipeline'),
            'processing_mode': extra.get('processing_mode'),
            'ui_order': extra.get('ui_order', 999),
            'description': extra.get('description'),
            'viewer_type': extra.get('viewer_type', 'json'),
            'created_at': artefact['created_at'],
            'status': artefact.get('status', 'unknown')
        }
        # Categorize artefacts for UI organization
        if artefact_type == 'docling_frontmatter_json':
            artefact_info.update({
                'display_name': artefact_info['display_name'] or 'Document Frontmatter',
                'bundle_label': artefact_info['bundle_label'] or 'Frontmatter Analysis',
                'description': artefact_info['description'] or 'OCR analysis of document structure and metadata',
                'ui_order': 1,
                'viewer_type': 'json'
            })
            viewer_artefacts['document_analysis'].append(artefact_info)
        elif artefact_type == 'split_map_json':
            artefact_info.update({
                'display_name': 'Document Structure Map',
                'bundle_label': 'Split Map',
                'description': 'Document section boundaries and organization structure',
                'ui_order': 2,
                'viewer_type': 'json'
            })
            viewer_artefacts['document_analysis'].append(artefact_info)
        elif artefact_type == 'tika_json':
            artefact_info.update({
                'display_name': 'Document Metadata',
                'bundle_label': 'Tika Analysis',
                'description': 'Raw document metadata and properties extracted by Apache Tika',
                'ui_order': 3,
                'viewer_type': 'json'
            })
            viewer_artefacts['raw_data'].append(artefact_info)
        elif artefact_type in ['canonical_docling_json', 'docling_bundle_split', 'docling_bundle', 'docling_standard', 'docling_bundle_split_pages']:
            # Processing bundles (OCR, No-OCR, VLM) - use original_pipeline for proper differentiation
            pipeline_name = extra.get('original_pipeline', extra.get('pipeline', 'Unknown'))
            bundle_label = artefact_info['bundle_label'] or f"{pipeline_name.upper().replace('_', '-')} Bundle"
            display_name = artefact_info['display_name'] or f"{pipeline_name.upper().replace('_', '-')} Processing Result"
            # Special handling for master manifests
            if artefact_type == 'docling_bundle_split_pages':
                display_name = f"{pipeline_name.upper().replace('_', '-')} Document Pages"
                bundle_label = f"{pipeline_name.upper().replace('_', '-')} Pages Bundle"
                artefact_info.update({
                    'viewer_type': 'bundle_collection',
                    'is_master_manifest': True,
                    'ui_order': 10  # Show master manifests before individual pages
                })
            elif artefact_type == 'docling_standard':
                # Individual page bundles - lower UI priority
                artefact_info.update({
                    'viewer_type': 'page_bundle',
                    'is_individual_page': True,
                    'ui_order': extra.get('split_order', 999) + 100  # Show after master manifests
                })
            artefact_info.update({
                'display_name': display_name,
                'bundle_label': bundle_label,
                'description': f"Docling processing result using {pipeline_name.replace('_', '-')} pipeline",
                'pipeline_type': pipeline_name  # Add explicit pipeline type for UI
            })
            viewer_artefacts['processing_bundles'].append(artefact_info)
        elif artefact_type.startswith('docling_') and artefact_type.endswith('_json'):
            # Other docling JSON results
            pipeline_name = artefact_type.replace('docling_', '').replace('_json', '').upper()
            artefact_info.update({
                'display_name': f"{pipeline_name} Analysis",
                'bundle_label': f"{pipeline_name} Result",
                'description': f"Docling {pipeline_name.lower()} processing result",
                'viewer_type': 'json'
            })
            viewer_artefacts['processing_bundles'].append(artefact_info)
        elif artefact_type == 'page_images':
            artefact_info.update({
                'display_name': 'Page Images',
                'bundle_label': 'Visual Pages',
                'description': 'Generated page images for document visualization',
                'viewer_type': 'images'
            })
            viewer_artefacts['raw_data'].append(artefact_info)
        # NOTE(review): artefact types matching none of the branches above are
        # silently dropped from the response — confirm that is intended.
    # Sort each category by ui_order (then created_at as tiebreaker)
    for category in viewer_artefacts.values():
        category.sort(key=lambda x: (x['ui_order'], x['created_at']))
    return {
        'file_id': file_id,
        'categories': viewer_artefacts,
        'total_artefacts': len(all_artefacts)
    }
@router.post("/files/{file_id}/artefacts/initial")
def generate_initial_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
"""
Generate initial artefacts using the new three-phase pipeline architecture.
Phase 1: Document Structure Discovery & Analysis
- Tika metadata extraction
- Page images generation
- Document structure analysis (LLM-enhanced)
- Split map generation
Phase 2: Triggered automatically after Phase 1 completion
"""
logger.info(f"Three-phase pipeline: Starting Phase 1 for file_id={file_id}")
from modules.pipeline_controller import get_pipeline_controller
client = SupabaseServiceRoleClient()
storage = StorageAdmin()
controller = get_pipeline_controller()
# Load file row
fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
file_row = fr.data
if not file_row:
raise HTTPException(status_code=404, detail="File not found")
bucket = file_row['bucket']
storage_path = file_row['path']
cabinet_id = file_row['cabinet_id']
mime = file_row.get('mime_type') or 'application/octet-stream'
filename = file_row.get('name', 'file')
# Step 1: Convert to PDF if not already a PDF (synchronous for now)
processing_path = storage_path
processing_mime = mime
if mime != 'application/pdf':
logger.info(f"Converting non-PDF file to PDF: file_id={file_id} mime={mime}")
try:
file_bytes = storage.download_file(bucket, storage_path)
with tempfile.TemporaryDirectory() as temp_dir:
# Save original file to temp location
temp_input = Path(temp_dir) / filename
with open(temp_input, 'wb') as f:
f.write(file_bytes)
# Convert to PDF
pdf_bytes = doc_processor.convert_to_pdf(temp_input)
# Store PDF as artefact
pdf_artefact_id = str(uuid.uuid4())
pdf_rel_path = f"{cabinet_id}/{file_id}/{pdf_artefact_id}/document.pdf"
storage.upload_file(bucket, pdf_rel_path, pdf_bytes, 'application/pdf', upsert=True)
pdf_ar = client.supabase.table('document_artefacts').insert({
'file_id': file_id,
'type': 'document_pdf',
'rel_path': pdf_rel_path,
'extra': {'converted_from': mime, 'original_filename': filename},
'status': 'completed'
}).execute()
# Use converted PDF for subsequent processing
processing_path = pdf_rel_path
processing_mime = 'application/pdf'
logger.info(f"PDF conversion: completed file_id={file_id} rel_path={pdf_rel_path}")
except Exception as e:
logger.error(f"PDF conversion: error processing file_id={file_id}: {e}")
# Continue with original file if conversion fails
else:
logger.info(f"File is already PDF, skipping conversion: file_id={file_id}")
# Step 2: Enqueue Phase 1 tasks using the new pipeline controller
user_id = payload.get('sub') or payload.get('user_id')
priority = TaskPriority.HIGH if user_id else TaskPriority.NORMAL
try:
# Update file row with processing path
updated_file_row = {**file_row, 'path': processing_path, 'mime_type': processing_mime}
# Enqueue Phase 1 tasks
phase1_tasks = controller.enqueue_phase1_tasks(
file_id=file_id,
file_row=updated_file_row,
processing_path=processing_path,
processing_mime=processing_mime,
priority=priority
)
total_tasks = sum(len(task_list) for task_list in phase1_tasks.values())
logger.info(f"Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks for file_id={file_id}")
return {
'message': f'Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks. Phase 2 will trigger automatically after completion.',
'phase1_tasks': {k: v for k, v in phase1_tasks.items()},
'file_id': file_id,
'pipeline_mode': 'three_phase',
'bundle_architecture_enabled': True
}
except QueueConnectionError as e:
logger.error(f"Queue system unavailable for file_id={file_id}: {e}")
logger.error("Redis is not running. Please start the API server with './start.sh dev' to auto-start Redis.")
return {
'message': 'File uploaded successfully, but processing tasks could not be queued (Redis unavailable)',
'file_id': file_id,
'queue_status': 'unavailable',
'error': 'Queue system unavailable. Please restart the API server with Redis enabled.'
}
except Exception as e:
logger.error(f"Unexpected error enqueueing Phase 1 tasks for file_id={file_id}: {e}")
return {
'message': 'File uploaded successfully, but processing tasks failed to queue',
'file_id': file_id,
'queue_status': 'failed',
'error': str(e)
}
@router.get("/files/{file_id}/page-images/manifest")
def get_page_images_manifest(file_id: str, payload: Dict[str, Any] = Depends(auth)):
"""Return the page_images manifest JSON for a file via service-role access."""
client = SupabaseServiceRoleClient()
storage = StorageAdmin()
# Find file row to get bucket
fr = client.supabase.table('files').select('id,bucket,cabinet_id').eq('id', file_id).single().execute()
file_row = fr.data or {}
if not file_row:
raise HTTPException(status_code=404, detail="File not found")
bucket = file_row['bucket']
cabinet_id = file_row['cabinet_id']
# Find page_images artefact
arts = client.supabase.table('document_artefacts') \
.select('id,type,rel_path,extra') \
.eq('file_id', file_id).eq('type', 'page_images') \
.order('created_at', desc=True).limit(1).execute().data or []
if not arts:
raise HTTPException(status_code=404, detail="page_images artefact not found")
art = arts[0]
# Manifest path
manifest_rel_path = (art.get('extra') or {}).get('manifest') or f"{art['rel_path'].rstrip('/')}/page_images.json"
try:
raw = storage.download_file(bucket, manifest_rel_path)
import json as _json
manifest = _json.loads(raw.decode('utf-8'))
# Ensure bucket and base prefix are present for the UI
manifest.setdefault('bucket', bucket)
manifest.setdefault('base_dir', art['rel_path'])
return manifest
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}")
def json_dumps(obj: Any) -> str:
    """Best-effort JSON serialization; returns "{}" when *obj* cannot be serialized."""
    try:
        import json
        serialized = json.dumps(obj, ensure_ascii=False)
    except Exception:
        # Any failure (unserializable object, etc.) degrades to an empty object.
        return "{}"
    else:
        return serialized
@router.get("/files/{file_id}/artefacts/{artefact_id}/json")
def get_artefact_json(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
"""Return the JSON content of a document artefact using service-role storage access."""
client = SupabaseServiceRoleClient()
storage = StorageAdmin()
# Look up artefact to get rel_path and validate it belongs to file
ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path').eq('id', artefact_id).single().execute()
artefact = ar.data
if not artefact:
raise HTTPException(status_code=404, detail="Artefact not found")
if artefact.get('file_id') != file_id:
raise HTTPException(status_code=400, detail="Artefact does not belong to file")
# Look up file to get bucket
fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute()
file_row = fr.data
if not file_row:
raise HTTPException(status_code=404, detail="File not found")
bucket = file_row['bucket']
rel_path = artefact['rel_path']
try:
raw = storage.download_file(bucket, rel_path)
import json as _json
data = _json.loads(raw.decode('utf-8'))
return data
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to load artefact JSON: {str(e)}")
@router.get("/files/{file_id}/artefacts/{artefact_id}/vlm-section-manifest")
def get_vlm_section_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
"""Return the VLM section page bundle manifest JSON for a VLM section bundle artefact."""
client = SupabaseServiceRoleClient()
storage = StorageAdmin()
ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path,type,extra').eq('id', artefact_id).single().execute().data
if not ar:
raise HTTPException(status_code=404, detail="Artefact not found")
if ar.get('file_id') != file_id:
raise HTTPException(status_code=400, detail="Artefact does not belong to file")
if ar.get('type') != 'vlm_section_page_bundle':
raise HTTPException(status_code=400, detail="Artefact is not a VLM section page bundle")
fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
if not fr:
raise HTTPException(status_code=404, detail="File not found")
bucket = fr['bucket']
# The rel_path directly points to the manifest JSON file
manifest_rel_path = ar['rel_path']
try:
raw = storage.download_file(bucket, manifest_rel_path)
import json as _json
data = _json.loads(raw.decode('utf-8'))
# ensure bucket present for client use
data.setdefault('bucket', bucket)
return data
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to load VLM section manifest: {e}")
@router.post("/files/{file_id}/artefacts/outline")
def enqueue_outline_structure(file_id: str, payload: Dict[str, Any] = Depends(auth)):
"""
Manually enqueue the fast document outline (headings-only) analysis for an existing file.
Returns the queued task id.
"""
client = SupabaseServiceRoleClient()
fr = client.supabase.table('files').select('id,bucket,cabinet_id,path,mime_type').eq('id', file_id).single().execute()
file_row = fr.data
if not file_row:
raise HTTPException(status_code=404, detail="File not found")
bucket = file_row['bucket']
storage_path = file_row['path']
cabinet_id = file_row['cabinet_id']
mime = file_row.get('mime_type') or 'application/pdf'
# Prefer converted PDF artefact if available
arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None)
processing_path = pdf_art['rel_path'] if pdf_art else storage_path
try:
task_id = enqueue_docling_task(
file_id=file_id,
task_type='document_structure_analysis',
payload={
'bucket': bucket,
'file_path': processing_path,
'cabinet_id': cabinet_id,
'mime_type': mime,
'config': {
'target_type': 'inbody',
'to_formats': 'json',
'do_ocr': False,
'force_ocr': False
}
},
priority=TaskPriority.NORMAL,
timeout=300
)
return { 'message': 'outline task enqueued', 'task_id': task_id, 'file_id': file_id }
except QueueConnectionError as e:
raise HTTPException(status_code=503, detail=f"Queue unavailable: {e}")
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to enqueue outline task: {e}")
@router.get("/files/proxy")
def proxy_storage_file(bucket: str, path: str, payload: Dict[str, Any] = Depends(auth)):
"""Proxy a storage file (service-role), useful for private image access in the UI."""
storage = StorageAdmin()
try:
data = storage.download_file(bucket, path)
media = 'application/octet-stream'
lp = path.lower()
if lp.endswith('.png'):
media = 'image/png'
elif lp.endswith('.webp'):
media = 'image/webp'
elif lp.endswith('.jpg') or lp.endswith('.jpeg'):
media = 'image/jpeg'
elif lp.endswith('.json'):
media = 'application/json'
return Response(content=data, media_type=media)
except Exception as e:
raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}")
# Signed proxy for iframe/img tags without Authorization header
@router.get("/files/proxy_signed")
def proxy_storage_file_signed(bucket: str, path: str, token: str):
    """Proxy using a signed bearer token passed as query param 'token'.

    Validates the token first (403 on failure), then streams the object with a
    content type sniffed from the file extension (404 if storage access fails).
    """
    try:
        payload = verify_supabase_jwt_str(token)
    except Exception as e:
        raise HTTPException(status_code=403, detail=f"Invalid token: {e}")
    # Checked OUTSIDE the try: previously this HTTPException was caught by the
    # handler above and re-wrapped, mangling the detail into
    # "Invalid token: 403: Invalid token".
    if not payload:
        raise HTTPException(status_code=403, detail="Invalid token")
    storage = StorageAdmin()
    try:
        data = storage.download_file(bucket, path)
        # Infer media type from the extension; default to a binary stream.
        media = 'application/octet-stream'
        lp = path.lower()
        if lp.endswith('.png'):
            media = 'image/png'
        elif lp.endswith('.webp'):
            media = 'image/webp'
        elif lp.endswith('.jpg') or lp.endswith('.jpeg'):
            media = 'image/jpeg'
        elif lp.endswith('.json'):
            media = 'application/json'
        return Response(content=data, media_type=media)
    except Exception as e:
        raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}")
# -------- Canonical bundle manifest ---------
@router.get("/files/{file_id}/artefacts/{artefact_id}/manifest")
def get_canonical_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return the manifest.json for a canonical_docling_bundle artefact."""
    db = SupabaseServiceRoleClient()
    store = StorageAdmin()
    artefact = db.supabase.table('document_artefacts').select('id,file_id,rel_path,extra').eq('id', artefact_id).single().execute().data
    if not artefact:
        raise HTTPException(status_code=404, detail="Artefact not found")
    if artefact.get('file_id') != file_id:
        raise HTTPException(status_code=400, detail="Artefact does not belong to file")
    # The manifest location is recorded on the artefact's extra payload.
    manifest_path = (artefact.get('extra') or {}).get('manifest')
    if not manifest_path:
        raise HTTPException(status_code=404, detail="Manifest path not recorded on artefact")
    file_record = db.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
    if not file_record:
        raise HTTPException(status_code=404, detail="File not found")
    bucket_name = file_record['bucket']
    try:
        import json as _json
        manifest = _json.loads(store.download_file(bucket_name, manifest_path).decode('utf-8'))
        # ensure bucket present for client use
        manifest.setdefault('bucket', bucket_name)
        return manifest
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}")
# -------- Canonical Docling generation ---------
def _load_split_map(client: SupabaseServiceRoleClient, storage: StorageAdmin, bucket: str, file_id: str) -> Optional[Dict[str, Any]]:
    """Load the newest split_map_json artefact for *file_id*, or None if absent/unreadable."""
    try:
        rows = (client.supabase.table('document_artefacts')
                .select('id,type,rel_path')
                .eq('file_id', file_id).eq('type', 'split_map_json')
                .order('created_at', desc=True).limit(1).execute().data) or []
        if not rows:
            return None
        import json as _json
        blob = storage.download_file(bucket, rows[0]['rel_path'])
        return _json.loads(blob.decode('utf-8'))
    except Exception:
        # Best-effort: a missing or corrupt split map simply disables splitting.
        return None
@router.post("/files/{file_id}/artefacts/canonical-docling")
def enqueue_canonical_docling(
file_id: str,
body: Dict[str, Any] = Body(default={}),
payload: Dict[str, Any] = Depends(auth)
):
"""Enqueue generation of canonical Docling JSON(s) for a file.
If a split_map is available and the document is large, this will enqueue
multiple Docling jobs using page ranges per section. Otherwise a single
job is created for the whole document.
"""
client = SupabaseServiceRoleClient()
storage = StorageAdmin()
fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
file_row = fr.data
if not file_row:
raise HTTPException(status_code=404, detail="File not found")
bucket = file_row['bucket']
cabinet_id = file_row['cabinet_id']
mime = file_row.get('mime_type') or 'application/pdf'
storage_path = file_row['path']
# Prefer converted PDF if available
try:
arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
a_pdf = next((a for a in arts if a.get('type') == 'document_pdf'), None)
processing_path = a_pdf['rel_path'] if a_pdf else storage_path
processing_mime = 'application/pdf' if a_pdf else mime
except Exception:
processing_path = storage_path
processing_mime = mime
# Determine page_count (prefer Tika; fallback to PDF parser if needed)
page_count = None
try:
arts_pc = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).execute().data or []
a_tika_pc = next((a for a in arts_pc if a.get('type') == 'tika_json'), None)
if a_tika_pc:
raw = storage.download_file(bucket, a_tika_pc['rel_path'])
import json as _json
tj = _json.loads(raw.decode('utf-8'))
for k in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount"):
v = tj.get(k) or tj.get(k.lower())
if v is not None:
page_count = int(v)
break
except Exception as e:
logger.debug(f"[canonical-docling] Tika page_count read failed: {e}")
pass
# Fallback: compute page_count from PDF if Tika did not provide it
if page_count is None:
try:
pdf_bytes = storage.download_file(bucket, processing_path)
try:
import fitz # PyMuPDF
doc = fitz.open(stream=pdf_bytes, filetype='pdf')
page_count = int(doc.page_count)
doc.close()
logger.info(f"[canonical-docling] page_count via PyMuPDF: {page_count}")
except Exception:
try:
from PyPDF2 import PdfReader
reader = PdfReader(io.BytesIO(pdf_bytes))
page_count = int(len(reader.pages))
logger.info(f"[canonical-docling] page_count via PyPDF2: {page_count}")
except Exception:
page_count = None
except Exception:
page_count = None
else:
logger.info(f"[canonical-docling] page_count via Tika: {page_count}")
# Optional custom range from caller
custom_range = body.get('custom_range')
custom_label = body.get('custom_label') or ''
selected_section_id = body.get('selected_section_id')
selected_section_title = body.get('selected_section_title')
# Load split map if requested and document is large enough
use_split_requested = bool(body.get('use_split_map', True))
split_threshold = int(body.get('threshold') or os.getenv('DOCLING_SPLIT_THRESHOLD', '50'))
ranges = [] # list of (start,end)
split_map = None
sections = [] # list of dicts: {start,end,title}
logger.info(f"[canonical-docling] use_split_map={use_split_requested} threshold={split_threshold} page_count={page_count}")
# If custom range provided, honor it and bypass split map
if isinstance(custom_range, list) and len(custom_range) >= 2:
try:
cs = int(custom_range[0]); ce = int(custom_range[1])
if page_count is not None:
cs = max(1, min(cs, page_count))
ce = max(cs, min(ce, page_count))
ranges = [(cs, ce)]
sections = [{'start': cs, 'end': ce, 'title': custom_label or 'Custom range'}]
use_split_requested = False
logger.info(f"[canonical-docling] using custom_range start={cs} end={ce} label='{custom_label}'")
except Exception as _e:
logger.warning(f"[canonical-docling] invalid custom_range; falling back. err={_e}")
if not ranges and use_split_requested and (page_count is None or page_count >= split_threshold):
split_map = _load_split_map(client, storage, bucket, file_id)
entries = (split_map or {}).get('entries') if split_map else []
logger.info(f"[canonical-docling] split_map loaded entries={len(entries) if isinstance(entries, list) else 0}")
if split_map and isinstance(entries, list) and len(entries) > 0:
# Normalize and sort entries by start_page to enforce correct order
norm: list[dict] = []
for e in entries:
try:
s = int(e.get('start_page', 1))
t = int(e.get('end_page', s))
if t < s:
t = s
title = e.get('title') or e.get('label') or ''
norm.append({'start': s, 'end': t, 'title': title})
except Exception:
continue
norm.sort(key=lambda x: x['start'])
# Deduplicate identical or overlapping starts by keeping the earliest occurrence
ordered: list[dict] = []
last_end = 0
for e in norm:
s, t = int(e['start']), int(e['end'])
if ordered and s <= last_end:
# Clamp to prevent inversion and maintain order
s = last_end + 1
if s > (page_count or s):
continue
if t < s:
t = s
last_end = max(last_end, t)
ordered.append({'start': s, 'end': t, 'title': e['title']})
for e in ordered:
ranges.append((e['start'], e['end']))
sections.append(e)
# Fallback: if no split_map ranges... we shouldn't be here
if not ranges:
# If document is large, split into fixed windows to protect Docling server
if page_count is not None and page_count >= split_threshold:
chunk = int(os.getenv('DOCLING_FALLBACK_CHUNK_PAGES', '25'))
chunk = max(5, min(100, chunk))
for i in range(1, (page_count or 1) + 1, chunk):
end = min(i + chunk - 1, page_count or i)
ranges.append((i, end))
sections.append({'start': i, 'end': end, 'title': f"Pages {i}-{end}"})
logger.warning(f"[canonical-docling] using fallback chunking ranges={len(ranges)} chunk={chunk}")
else:
ranges = [(1, page_count or 9223372036854775807)]
logger.warning(f"[canonical-docling] using single-range fallback (small doc)")
# Build config
cfg = body.get('config', {})
pipeline = cfg.get('pipeline', 'standard')
config: Dict[str, Any] = {
# target_type is computed in processor based on to_formats unless explicitly provided by user
'to_formats': cfg.get('to_formats', 'json'),
'do_ocr': bool(cfg.get('do_ocr', True)),
'force_ocr': bool(cfg.get('force_ocr', False)),
'image_export_mode': cfg.get('image_export_mode', 'embedded'),
'ocr_engine': cfg.get('ocr_engine', 'easyocr'),
'ocr_lang': cfg.get('ocr_lang', 'en'),
'pdf_backend': cfg.get('pdf_backend', 'dlparse_v4'),
'table_mode': cfg.get('table_mode', 'fast'),
'pipeline': pipeline,
'do_picture_classification': bool(cfg.get('do_picture_classification', False)),
'do_picture_description': bool(cfg.get('do_picture_description', False)),
}
# If user explicitly set target_type, pass it through
if 'target_type' in cfg:
config['target_type'] = cfg['target_type']
# Optional VLM settings (only include API fields if provided as JSON by caller)
if config['do_picture_description']:
pd_api = cfg.get('picture_description_api')
if isinstance(pd_api, (dict, list)):
config['picture_description_api'] = pd_api
elif isinstance(pd_api, str) and pd_api.strip().startswith(('{', '[')):
config['picture_description_api'] = pd_api
if cfg.get('picture_description_prompt'):
config['picture_description_prompt'] = cfg['picture_description_prompt']
if pipeline == 'vlm':
# Provider presets mapping
provider = (cfg.get('vlm_provider') or '').strip().lower()
provider_model = (cfg.get('vlm_provider_model') or '').strip()
provider_base = (cfg.get('vlm_provider_base_url') or '').strip()
if provider in ('ollama', 'openai') and provider_model:
if provider == 'ollama':
base_url = provider_base or os.getenv('OLLAMA_BASE_URL') or os.getenv('VLM_OLLAMA_BASE_URL')
if base_url:
endpoint = f"{base_url.rstrip('/')}/v1/chat/completions"
# Use OpenAI provider schema against Ollama's OpenAI-compatible endpoint
cfg_api = {
'provider': 'openai',
'url': endpoint,
'model': provider_model,
'response_format': 'markdown',
'request_params': {'model': provider_model}
}
logger.info(f"[canonical-docling] VLM provider=ollama mapped to openai-compatible url={endpoint} model={provider_model}")
config['vlm_pipeline_model_api'] = cfg_api
# Also wire picture_description_api if picture description is enabled
if config.get('do_picture_description'):
config['picture_description_api'] = {
'url': endpoint,
'headers': {},
'params': {'model': provider_model}
}
elif provider == 'openai':
base_url = provider_base or os.getenv('OPENAI_BASE_URL') or 'https://api.openai.com/v1'
api_key = os.getenv('OPENAI_API_KEY') or os.getenv('OPENAI_API_KEY_READONLY')
# Do not inline key if not present; server may have default
model_cfg: Dict[str, Any] = {
'provider': 'openai',
'url': f"{base_url.rstrip('/')}/chat/completions",
'model': provider_model,
'response_format': 'markdown',
'request_params': {'model': provider_model}
}
if api_key:
model_cfg['api_key'] = api_key
# Also pass explicit Authorization header for servers that expect it
model_cfg['headers'] = {
'Authorization': f"Bearer {api_key}"
}
logger.info(f"[canonical-docling] VLM provider=openai url={model_cfg['url']} model={provider_model} api_key={'yes' if api_key else 'no'}")
config['vlm_pipeline_model_api'] = model_cfg
# Also wire picture_description_api if picture description is enabled
if config.get('do_picture_description'):
headers = {'Authorization': f"Bearer {api_key}"} if api_key else {}
config['picture_description_api'] = {
'url': f"{base_url.rstrip('/')}/chat/completions",
'headers': headers,
'params': {'model': provider_model}
}
else:
# Pass through explicit API/local JSON if provided by caller
vpa = cfg.get('vlm_pipeline_model_api')
if isinstance(vpa, (dict, list)):
config['vlm_pipeline_model_api'] = vpa
elif isinstance(vpa, str) and vpa.strip().startswith(('{', '[')):
config['vlm_pipeline_model_api'] = vpa
# Enqueue tasks for each range
priority = TaskPriority.HIGH
task_ids = []
multi = len(ranges) > 1
logger.info(f"[canonical-docling] final ranges={len(ranges)} multi={multi} pipeline={pipeline} producer={body.get('producer', 'manual')}")
# Create a group id for split bundles (used for UI grouping)
# Use provided group_id if present (for two-pass auto system), otherwise generate new
group_id = body.get('group_id') or (str(uuid.uuid4()) if multi else None)
if multi and not sections:
# Build sections from ranges if titles were not captured
for (start, end) in ranges:
sections.append({'start': int(start), 'end': int(end), 'title': ''})
idx = 0
for (start, end) in ranges:
# Locate title for this range if available
title = ''
if multi and sections and idx < len(sections):
title = sections[idx].get('title') or ''
idx += 1
cfg_range = dict(config)
# Ensure 1-based inclusive range is passed through
cfg_range['page_range'] = [max(1, int(start)), max(int(start), int(end))]
extra = {
'is_subdoc': multi,
'page_range': [int(start), int(end)],
'label': (title or f"subdoc p{int(start)}-{int(end)}") if multi else 'canonical'
}
# Attach selected section metadata if provided by caller
if selected_section_id:
extra['selected_section_id'] = selected_section_id
if selected_section_title or custom_label:
extra['selected_section_title'] = selected_section_title or custom_label
# For split processing, force split bundle artefact type and add grouping/order metadata
if multi:
extra.update({
# UI grouping metadata
'split_order': idx,
'split_heading': title,
'split_total': len(ranges)
})
if group_id:
extra['group_id'] = group_id
extra['group_pack_type'] = 'docling_standard_auto_split'
else:
# Single-bundle case: allow caller to override type (defaults to canonical bundle)
if 'artefact_type_override' in body and body.get('artefact_type_override'):
extra['artefact_type_override'] = body.get('artefact_type_override')
# Mark producer and selection metadata
extra['producer'] = body.get('producer') or ('auto_split' if (multi and body.get('use_split_map')) else 'manual')
if selected_section_id:
extra['selected_section_id'] = selected_section_id
if selected_section_title or custom_label:
extra['selected_section_title'] = selected_section_title or custom_label
# Enhanced logging for canonical operations
if multi:
logger.info(f"[canonical-docling] enqueue range idx={idx}/{len(ranges)} start={start} end={end} title='{title}' group_id={group_id} producer={extra.get('producer')} pipeline={pipeline}")
else:
logger.info(f"[canonical-docling] enqueue single range start={start} end={end} producer={extra.get('producer')} pipeline={pipeline}")
tid = enqueue_docling_task(
file_id=file_id,
task_type='canonical_docling_subdoc_json' if multi else 'canonical_docling_json',
payload={
'bucket': bucket,
'file_path': processing_path,
'cabinet_id': cabinet_id,
'mime_type': processing_mime,
'config': cfg_range,
'artefact_extra': extra,
# Ensure canonical tasks respect upstream dependencies (e.g., Frontmatter)
'depends_on': body.get('depends_on', []),
# Pass through grouping info if provided by caller (kept for backward-compat)
'group_pack_type': body.get('group_pack_type')
},
priority=priority,
timeout=int(body.get('timeout', DOCLING_NOOCR_TIMEOUT))
)
task_ids.append(tid)
logger.info(f"[canonical-docling] completed enqueue file_id={file_id} tasks={len(task_ids)} ranges={len(ranges)} pipeline={pipeline} producer={body.get('producer','manual')} group_id={group_id if multi else 'single'}")
return {
'message': f'enqueued {len(task_ids)} canonical docling job(s)',
'task_ids': task_ids,
'ranges': ranges,
'used_split_map': bool(split_map),
'group_id': group_id,
'pipeline': pipeline,
'producer': body.get('producer', 'manual')
}

View File

@ -1,411 +0,0 @@
"""
Memory-Aware Queue Management System
====================================
Provides intelligent queue management based on memory usage and file sizes
rather than simple task count limits. Supports multiple users with fair
queuing and capacity management.
Features:
- Memory-based queue limits (not just task count)
- Fair queuing across multiple users
- Upload capacity checking with user feedback
- Graceful degradation under load
- Service-specific memory tracking
"""
import os
import time
import json
import uuid
import logging
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, asdict
from enum import Enum
import redis
from .redis_manager import get_redis_manager
import psutil
logger = logging.getLogger(__name__)
class QueueStatus(Enum):
    """Overall queue health states used to gate new uploads."""
    ACCEPTING = "accepting"  # Normal operation
    BUSY = "busy"  # High load, warn users
    OVERLOADED = "overloaded"  # Reject new uploads
    MAINTENANCE = "maintenance"  # Manual override
@dataclass
class MemoryConfig:
    """Memory configuration for queue management."""
    # Hard caps are in MB; thresholds are fractions of max_total_memory_mb.
    max_total_memory_mb: int = 2048  # 2GB total queue memory
    max_user_memory_mb: int = 512  # 512MB per user
    max_file_size_mb: int = 100  # 100MB max file size
    memory_warning_threshold: float = 0.8  # Warn at 80%
    memory_reject_threshold: float = 0.95  # Reject at 95%
@dataclass
class QueuedFile:
    """Represents a file waiting in the queue."""
    file_id: str
    user_id: str
    filename: str
    size_bytes: int
    mime_type: str
    cabinet_id: str
    priority: int = 1
    queued_at: float = 0
    estimated_processing_time: int = 300  # seconds
    memory_estimate_mb: float = 0

    def __post_init__(self):
        # Stamp the enqueue time unless the caller supplied one explicitly.
        if not self.queued_at:
            self.queued_at = time.time()
        # Derive the processing-memory estimate from size and mime type.
        self.memory_estimate_mb = self._estimate_memory_usage()

    def _estimate_memory_usage(self) -> float:
        """Rough heuristic for peak memory (MB) this file needs during processing."""
        size_mb = self.size_bytes / (1024 * 1024)
        # Multipliers reflect the artefacts produced per input type:
        if self.mime_type == 'application/pdf':
            factor = 3.5  # PDF: original + extracted text + images + thumbnails
        elif self.mime_type.startswith('image/'):
            factor = 2.5  # Images: original + resized variants + OCR text
        else:
            factor = 4.0  # Other docs: original + PDF conversion + processing
        return size_mb * factor
class MemoryAwareQueue:
"""Memory-aware queue management system."""
def __init__(self, environment: str = "dev"):
self.redis_manager = get_redis_manager(environment)
self.redis_client = self.redis_manager.client
self.config = self._load_config()
# Redis keys
self.upload_queue_key = "upload_queue"
self.processing_memory_key = "processing_memory"
self.user_quota_key = "user_quotas"
self.system_status_key = "system_status"
logger.info(f"🧠 Memory-aware queue initialized (max: {self.config.max_total_memory_mb}MB)")
def _load_config(self) -> MemoryConfig:
"""Load memory configuration from environment."""
return MemoryConfig(
max_total_memory_mb=int(os.getenv('QUEUE_MAX_MEMORY_MB', '2048')),
max_user_memory_mb=int(os.getenv('QUEUE_MAX_USER_MEMORY_MB', '512')),
max_file_size_mb=int(os.getenv('MAX_FILE_SIZE_MB', '100')),
memory_warning_threshold=float(os.getenv('MEMORY_WARNING_THRESHOLD', '0.8')),
memory_reject_threshold=float(os.getenv('MEMORY_REJECT_THRESHOLD', '0.95'))
)
def check_upload_capacity(self, user_id: str, file_size_bytes: int,
mime_type: str) -> Tuple[bool, str, Dict[str, Any]]:
"""
Check if system can accept a new upload.
Returns:
(can_accept, message, queue_info)
"""
# Create temporary QueuedFile to estimate memory
temp_file = QueuedFile(
file_id="temp",
user_id=user_id,
filename="temp",
size_bytes=file_size_bytes,
mime_type=mime_type,
cabinet_id="temp"
)
file_memory_mb = temp_file.memory_estimate_mb
# Check file size limit
if file_size_bytes > (self.config.max_file_size_mb * 1024 * 1024):
return False, f"File too large (max: {self.config.max_file_size_mb}MB)", {}
# Get current memory usage
current_memory = self._get_current_memory_usage()
user_memory = self._get_user_memory_usage(user_id)
# Check user quota
if user_memory + file_memory_mb > self.config.max_user_memory_mb:
return False, f"User quota exceeded (limit: {self.config.max_user_memory_mb}MB)", {
'user_current': user_memory,
'user_limit': self.config.max_user_memory_mb
}
# Check system capacity
total_after = current_memory + file_memory_mb
max_memory = self.config.max_total_memory_mb
if total_after > (max_memory * self.config.memory_reject_threshold):
queue_info = self._get_queue_info()
return False, "System overloaded. Please try again later.", {
'current_memory': current_memory,
'max_memory': max_memory,
'utilization': current_memory / max_memory,
'queue_position': queue_info['total_queued'] + 1
}
# Calculate wait time estimate
wait_estimate = self._estimate_wait_time(user_id)
status = "ready"
message = "Upload accepted"
if total_after > (max_memory * self.config.memory_warning_threshold):
status = "busy"
message = f"System busy. Estimated wait: {wait_estimate // 60}m {wait_estimate % 60}s"
return True, message, {
'status': status,
'estimated_wait_seconds': wait_estimate,
'memory_usage': {
'current': current_memory,
'after_upload': total_after,
'limit': max_memory,
'utilization': total_after / max_memory
},
'user_quota': {
'used': user_memory,
'after_upload': user_memory + file_memory_mb,
'limit': self.config.max_user_memory_mb
}
}
def enqueue_file(self, file_id: str, user_id: str, filename: str,
size_bytes: int, mime_type: str, cabinet_id: str,
priority: int = 1) -> Dict[str, Any]:
"""
Add file to upload queue.
Returns:
Queue information including position and estimated wait time
"""
queued_file = QueuedFile(
file_id=file_id,
user_id=user_id,
filename=filename,
size_bytes=size_bytes,
mime_type=mime_type,
cabinet_id=cabinet_id,
priority=priority
)
# Serialize and add to Redis queue (priority queue: higher priority = lower score)
score = time.time() - (priority * 1000000) # Priority affects score significantly
self.redis_client.zadd(
self.upload_queue_key,
{json.dumps(asdict(queued_file)): score}
)
# Update user quota tracking
self._update_user_quota(user_id, queued_file.memory_estimate_mb, increment=True)
# Get queue position and wait estimate
position = self._get_queue_position(file_id)
wait_estimate = self._estimate_wait_time(user_id)
logger.info(f"📋 Queued file {file_id} for user {user_id} (pos: {position}, wait: {wait_estimate}s)")
return {
'queued': True,
'file_id': file_id,
'queue_position': position,
'estimated_wait_seconds': wait_estimate,
'memory_estimate_mb': queued_file.memory_estimate_mb
}
def dequeue_next_file(self, service_name: str) -> Optional[QueuedFile]:
"""
Get next file from queue for processing.
Args:
service_name: The service requesting work (for capacity management)
"""
# Check if service has capacity
service_memory = self._get_service_memory_usage(service_name)
service_limit = self._get_service_memory_limit(service_name)
if service_memory >= service_limit:
logger.debug(f"Service {service_name} at capacity ({service_memory}/{service_limit}MB)")
return None
# Get next item from priority queue (lowest score first)
items = self.redis_client.zrange(self.upload_queue_key, 0, 0, withscores=True)
if not items:
return None
file_data_json, score = items[0]
file_data = json.loads(file_data_json)
queued_file = QueuedFile(**file_data)
# Check if this file would exceed service memory limit
if service_memory + queued_file.memory_estimate_mb > service_limit:
# Skip this file for now, try smaller ones later
logger.debug(f"File {queued_file.file_id} too large for {service_name} capacity")
return None
# Remove from queue
self.redis_client.zrem(self.upload_queue_key, file_data_json)
# Update tracking
self._update_user_quota(queued_file.user_id, queued_file.memory_estimate_mb, increment=False)
self._update_service_memory(service_name, queued_file.memory_estimate_mb, increment=True)
logger.info(f"🎯 Dequeued file {queued_file.file_id} for {service_name} processing")
return queued_file
def complete_processing(self, service_name: str, file_id: str, memory_used_mb: float):
"""Mark file processing as complete and free memory."""
self._update_service_memory(service_name, memory_used_mb, increment=False)
logger.info(f"✅ Completed processing {file_id} in {service_name} (freed {memory_used_mb}MB)")
def _get_current_memory_usage(self) -> float:
"""Get current total memory usage across all services."""
services = ['docling', 'tika', 'llm', 'document_analysis']
total = 0
for service in services:
service_key = f"{self.processing_memory_key}:{service}"
memory = float(self.redis_client.get(service_key) or 0)
total += memory
return total
def _get_user_memory_usage(self, user_id: str) -> float:
"""Get current memory usage for a specific user."""
user_key = f"{self.user_quota_key}:{user_id}"
return float(self.redis_client.get(user_key) or 0)
def _get_service_memory_usage(self, service_name: str) -> float:
"""Get current memory usage for a service."""
service_key = f"{self.processing_memory_key}:{service_name}"
return float(self.redis_client.get(service_key) or 0)
def _get_service_memory_limit(self, service_name: str) -> float:
"""Get memory limit for a service."""
# Service-specific memory limits as percentage of total
limits = {
'docling': 0.4, # 40% for Docling (memory-intensive)
'tika': 0.2, # 20% for Tika
'llm': 0.3, # 30% for LLM processing
'document_analysis': 0.1 # 10% for document analysis
}
percentage = limits.get(service_name, 0.1)
return self.config.max_total_memory_mb * percentage
def _update_user_quota(self, user_id: str, memory_mb: float, increment: bool):
"""Update user memory quota tracking."""
user_key = f"{self.user_quota_key}:{user_id}"
if increment:
self.redis_client.incrbyfloat(user_key, memory_mb)
else:
current = float(self.redis_client.get(user_key) or 0)
new_value = max(0, current - memory_mb)
self.redis_client.set(user_key, new_value)
# Set expiration for cleanup
self.redis_client.expire(user_key, 86400) # 24 hours
def _update_service_memory(self, service_name: str, memory_mb: float, increment: bool):
"""Update service memory usage tracking."""
service_key = f"{self.processing_memory_key}:{service_name}"
if increment:
self.redis_client.incrbyfloat(service_key, memory_mb)
else:
current = float(self.redis_client.get(service_key) or 0)
new_value = max(0, current - memory_mb)
self.redis_client.set(service_key, new_value)
# Set expiration for cleanup
self.redis_client.expire(service_key, 3600) # 1 hour
def _get_queue_position(self, file_id: str) -> int:
"""Get position of file in queue."""
items = self.redis_client.zrange(self.upload_queue_key, 0, -1)
for i, item in enumerate(items):
file_data = json.loads(item)
if file_data['file_id'] == file_id:
return i + 1
return 0
def _estimate_wait_time(self, user_id: str) -> int:
"""Estimate wait time for user's next file."""
# Simple estimation based on queue position and average processing time
queue_size = self.redis_client.zcard(self.upload_queue_key)
avg_processing_time = 300 # 5 minutes average
return int(queue_size * avg_processing_time * 0.5) # Assume parallel processing
def _get_queue_info(self) -> Dict[str, Any]:
"""Get comprehensive queue information."""
total_queued = self.redis_client.zcard(self.upload_queue_key)
current_memory = self._get_current_memory_usage()
max_memory = self.config.max_total_memory_mb
return {
'total_queued': total_queued,
'memory_usage': {
'current_mb': current_memory,
'max_mb': max_memory,
'utilization': current_memory / max_memory if max_memory > 0 else 0
},
'status': self._determine_system_status(current_memory, max_memory)
}
def _determine_system_status(self, current_memory: float, max_memory: float) -> str:
"""Determine current system status based on memory usage."""
utilization = current_memory / max_memory if max_memory > 0 else 0
if utilization >= self.config.memory_reject_threshold:
return "overloaded"
elif utilization >= self.config.memory_warning_threshold:
return "busy"
else:
return "ready"
def get_system_status(self) -> Dict[str, Any]:
"""Get comprehensive system status for monitoring."""
queue_info = self._get_queue_info()
# Service-specific info
services = {}
for service_name in ['docling', 'tika', 'llm', 'document_analysis']:
services[service_name] = {
'memory_used_mb': self._get_service_memory_usage(service_name),
'memory_limit_mb': self._get_service_memory_limit(service_name),
'utilization': self._get_service_memory_usage(service_name) / self._get_service_memory_limit(service_name)
}
return {
'status': queue_info['status'],
'queue': queue_info,
'services': services,
'config': asdict(self.config)
}
# Convenience functions
def get_memory_queue(environment: str = "dev") -> MemoryAwareQueue:
    """Factory returning a MemoryAwareQueue bound to *environment* (default "dev")."""
    return MemoryAwareQueue(environment)
def check_upload_capacity(user_id: str, file_size: int, mime_type: str, environment: str = "dev") -> Tuple[bool, str, Dict]:
    """Quick capacity check for an upload without holding a queue reference.

    Thin wrapper: builds a queue for *environment* and delegates to its
    check_upload_capacity method.
    """
    return get_memory_queue(environment).check_upload_capacity(user_id, file_size, mime_type)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff