diff --git a/.gitignore b/.gitignore index 554b91b..92f33d4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,11 @@ __pycache__ .pytest_cache +.env + .DS_Store .archive/* data/logs/* +*.bak diff --git a/archive/auto_processing/README.md b/archive/auto_processing/README.md deleted file mode 100644 index ab42fd5..0000000 --- a/archive/auto_processing/README.md +++ /dev/null @@ -1,66 +0,0 @@ -# Auto-Processing Code Archive - -This directory contains the complex auto-processing system that was previously used for automatic document processing after file upload. - -## Archived Components - -### Core Processing Files -- `files_with_auto_processing.py` - Original files.py router with automatic processing -- `pipeline_controller.py` - Complex multi-phase pipeline orchestration -- `task_processors.py` - Document processing task handlers - -### Advanced Queue Management (Created but not deployed) -- `memory_aware_queue.py` - Memory-based intelligent queue management -- `enhanced_upload_handler.py` - Advanced upload handler with queuing -- `enhanced_upload.py` - API endpoints for advanced upload system - -## What This System Did - -### Automatic Processing Pipeline -1. **File Upload** → Immediate processing trigger -2. **PDF Conversion** (synchronous, blocking) -3. **Phase 1**: Structure discovery (Tika, Page Images, Document Analysis, Split Map) -4. **Phase 2**: Docling processing (NO_OCR → OCR → VLM pipelines) -5. **Complex Dependencies**: Phase coordination, task sequencing -6. **Redis Queue Management**: Service limits, rate limits, dependency tracking - -### Features -- Multi-phase processing pipelines -- Complex task dependency management -- Memory-aware queue limits -- Multi-user capacity management -- Real-time processing status -- WebSocket status updates -- Service-specific resource limits -- Task recovery on restart - -## Why Archived - -The system was overly complex for the current needs: -- **Complexity**: Multi-phase pipelines with complex dependencies -- **Blocking Operations**: Synchronous PDF conversion causing timeouts -- **Resource Management**: Over-engineered for single-user scenarios -- **User Experience**: Users had to wait for processing to complete - -## New Simplified Approach - -The new system focuses on: -- **Simple Upload**: Just store files and create database records -- **No Auto-Processing**: Users manually trigger processing when needed -- **Directory Support**: Upload entire folders with manifest tracking -- **Immediate Response**: Users get instant confirmation without waiting - -## If You Need to Restore - -To restore the auto-processing functionality: -1. Copy `files_with_auto_processing.py` back to `routers/database/files/files.py` -2. Ensure `pipeline_controller.py` and `task_processors.py` are in `modules/` -3. Update imports and dependencies -4. Re-enable background processing in upload handlers - -## Migration Notes - -The database schema and Redis structure remain compatible. The new simplified system can coexist with the archived processing logic if needed. - -Date Archived: $(date) -Reason: Simplification for directory upload implementation diff --git a/archive/auto_processing/enhanced_upload.py b/archive/auto_processing/enhanced_upload.py deleted file mode 100644 index 3cd60ef..0000000 --- a/archive/auto_processing/enhanced_upload.py +++ /dev/null @@ -1,142 +0,0 @@ -""" -Enhanced Upload Router with Memory-Aware Queuing -=============================================== - -Provides intelligent upload endpoints with capacity checking, -queue management, and real-time status updates. - -Endpoints: -- POST /upload/check-capacity - Pre-check if upload is possible -- POST /upload/with-queue - Upload with intelligent queuing -- GET /upload/status/{file_id} - Get processing status -- GET /upload/queue-status - Get overall queue status -- WebSocket /ws/upload-status - Real-time status updates -""" - -import os -import uuid -import json -import logging -from typing import Dict, List, Optional, Any -from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, WebSocket, WebSocketDisconnect -from fastapi.responses import JSONResponse - -from modules.auth.supabase_bearer import SupabaseBearer -from modules.enhanced_upload_handler import get_upload_handler -from modules.memory_aware_queue import get_memory_queue -from modules.logger_tool import initialise_logger - -router = APIRouter() -auth = SupabaseBearer() - -logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True) - -# WebSocket connection manager for real-time updates -class ConnectionManager: - def __init__(self): - self.active_connections: Dict[str, List[WebSocket]] = {} - - async def connect(self, websocket: WebSocket, file_id: str): - await websocket.accept() - if file_id not in self.active_connections: - self.active_connections[file_id] = [] - self.active_connections[file_id].append(websocket) - - def disconnect(self, websocket: WebSocket, file_id: str): - if file_id in self.active_connections: - self.active_connections[file_id].remove(websocket) - if not self.active_connections[file_id]: - del self.active_connections[file_id] - - async def broadcast_to_file(self, file_id: str, message: dict): - if file_id in self.active_connections: - for connection in self.active_connections[file_id].copy(): - try: - await connection.send_json(message) - except: - self.active_connections[file_id].remove(connection) - -manager = ConnectionManager() - -@router.post("/upload/check-capacity") -async def check_upload_capacity( - file_size: int = Form(...), - mime_type: str = Form(...), - payload: Dict[str, Any] = Depends(auth) -): - """ - Check if user can upload a file of given size and type. - - Returns capacity information and recommendations. - """ - try: - user_id = payload.get('sub') or payload.get('user_id', 'anonymous') - - # Determine environment - environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod' - upload_handler = get_upload_handler(environment) - - eligible, message, details = upload_handler.check_upload_eligibility( - user_id, file_size, mime_type - ) - - response = { - 'eligible': eligible, - 'message': message, - 'details': details, - 'timestamp': time.time() - } - - status_code = 200 if eligible else 429 # Too Many Requests if not eligible - - logger.info(f"📋 Capacity check for user {user_id}: {eligible} - {message}") - - return JSONResponse(content=response, status_code=status_code) - - except Exception as e: - logger.error(f"Capacity check error: {e}") - raise HTTPException(status_code=500, detail=str(e)) - -@router.post("/upload/with-queue") -async def upload_with_queue( - cabinet_id: str = Form(...), - path: str = Form(...), - scope: str = Form(...), - priority: int = Form(1), - file: UploadFile = File(...), - payload: Dict[str, Any] = Depends(auth) -): - """ - Upload file with intelligent queuing and capacity management. - - Returns queue information and processing status. - """ - try: - user_id = payload.get('sub') or payload.get('user_id', 'anonymous') - - # Read file content - file_bytes = await file.read() - file_size = len(file_bytes) - mime_type = file.content_type or 'application/octet-stream' - filename = file.filename or path - - logger.info(f"📤 Upload request: {filename} ({file_size} bytes) for user {user_id}") - - # Determine environment - environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod' - upload_handler = get_upload_handler(environment) - - # Check if upload queuing is enabled - if os.getenv('UPLOAD_QUEUE_ENABLED', 'true').lower() == 'true': - # Use new queue-based upload - file_id = str(uuid.uuid4()) - - result = await upload_handler.handle_upload_with_queue( - file_id=file_id, - user_id=user_id, - filename=filename, - file_bytes=file_bytes, - mime_type=mime_type, - cabinet_id=cabinet_id, - priority=priority - )\n \n return result\n \n else:\n # Fall back to immediate processing (legacy mode)\n logger.warning(\"Using legacy immediate processing mode\")\n # TODO: Call original upload_file function\n raise HTTPException(status_code=501, detail=\"Legacy mode not implemented in this endpoint\")\n \n except HTTPException:\n raise\n except Exception as e:\n logger.error(f\"Upload error: {e}\")\n raise HTTPException(status_code=500, detail=str(e))\n\n@router.get(\"/upload/status/{file_id}\")\nasync def get_upload_status(\n file_id: str,\n payload: Dict[str, Any] = Depends(auth)\n):\n \"\"\"\n Get current processing status for an uploaded file.\n \"\"\"\n try:\n # Determine environment\n environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'\n upload_handler = get_upload_handler(environment)\n \n status = upload_handler.get_processing_status(file_id)\n \n if status.get('status') == 'not_found':\n raise HTTPException(status_code=404, detail=\"File not found\")\n \n return status\n \n except HTTPException:\n raise\n except Exception as e:\n logger.error(f\"Status check error for {file_id}: {e}\")\n raise HTTPException(status_code=500, detail=str(e))\n\n@router.get(\"/upload/queue-status\")\nasync def get_queue_status(\n payload: Dict[str, Any] = Depends(auth)\n):\n \"\"\"\n Get overall queue status and system capacity information.\n \"\"\"\n try:\n # Determine environment\n environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'\n memory_queue = get_memory_queue(environment)\n \n system_status = memory_queue.get_system_status()\n \n return {\n 'system_status': system_status,\n 'timestamp': time.time(),\n 'environment': environment\n }\n \n except Exception as e:\n logger.error(f\"Queue status error: {e}\")\n raise HTTPException(status_code=500, detail=str(e))\n\n@router.websocket(\"/ws/upload-status/{file_id}\")\nasync def websocket_upload_status(websocket: WebSocket, file_id: str):\n \"\"\"\n WebSocket endpoint for real-time upload status updates.\n \"\"\"\n await manager.connect(websocket, file_id)\n \n try:\n # Send initial status\n environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'\n upload_handler = get_upload_handler(environment)\n initial_status = upload_handler.get_processing_status(file_id)\n \n await websocket.send_json({\n 'type': 'status_update',\n 'data': initial_status\n })\n \n # Keep connection alive and listen for updates\n while True:\n # In a real implementation, you'd have a background task\n # that pushes updates when file status changes\n await asyncio.sleep(5)\n \n # Check for status updates\n current_status = upload_handler.get_processing_status(file_id)\n await websocket.send_json({\n 'type': 'status_update', \n 'data': current_status\n })\n \n except WebSocketDisconnect:\n manager.disconnect(websocket, file_id)\n except Exception as e:\n logger.error(f\"WebSocket error for {file_id}: {e}\")\n manager.disconnect(websocket, file_id)\n\n# Background task to process upload queue\n@router.on_event(\"startup\")\nasync def start_queue_processor():\n \"\"\"Start background queue processor.\"\"\"\n \n if os.getenv('UPLOAD_QUEUE_ENABLED', 'true').lower() != 'true':\n logger.info(\"📋 Upload queue disabled, skipping queue processor\")\n return\n \n import asyncio\n \n environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'\n upload_handler = get_upload_handler(environment)\n \n # Start background processor\n asyncio.create_task(upload_handler.process_queued_files(\"document_processor\"))\n \n logger.info(\"🚀 Upload queue processor started\")\n\nimport time\nimport asyncio" diff --git a/archive/auto_processing/enhanced_upload_handler.py b/archive/auto_processing/enhanced_upload_handler.py deleted file mode 100644 index d453639..0000000 --- a/archive/auto_processing/enhanced_upload_handler.py +++ /dev/null @@ -1,362 +0,0 @@ -""" -Enhanced Upload Handler with Memory-Aware Queuing -================================================= - -Replaces the immediate processing model with intelligent queue management. -Provides user feedback about capacity, queue position, and processing status. - -Features: -- Pre-upload capacity checking -- Memory-aware queuing with user quotas -- Real-time status updates via WebSocket/SSE -- Graceful degradation under load -- Fair queuing across multiple users -""" - -import os -import uuid -import time -import logging -import asyncio -from typing import Dict, List, Optional, Any, Tuple -from fastapi import HTTPException, BackgroundTasks -from dataclasses import asdict - -from .memory_aware_queue import get_memory_queue, QueuedFile -from .redis_manager import get_redis_manager -from modules.database.supabase.utils.client import SupabaseServiceRoleClient -from modules.database.tools.storage.storage_admin import StorageAdmin - -logger = logging.getLogger(__name__) - -class EnhancedUploadHandler: - """Enhanced upload handler with memory-aware queuing.""" - - def __init__(self, environment: str = "dev"): - self.memory_queue = get_memory_queue(environment) - self.redis_manager = get_redis_manager(environment) - self.redis_client = self.redis_manager.client - - # Processing status tracking - self.processing_status_key = "file_processing_status" - - def check_upload_eligibility(self, user_id: str, file_size: int, - mime_type: str) -> Tuple[bool, str, Dict[str, Any]]: - """ - Check if user can upload a file right now. - - Returns: - (eligible, message, details) - """ - - # Check system capacity - can_accept, message, queue_info = self.memory_queue.check_upload_capacity( - user_id, file_size, mime_type - ) - - if not can_accept: - return False, message, { - 'reason': 'capacity_exceeded', - 'queue_info': queue_info, - 'recommendations': self._get_recommendations(queue_info) - } - - return True, message, { - 'status': 'ready_for_upload', - 'queue_info': queue_info, - 'processing_estimate': self._estimate_processing_time(file_size, mime_type) - } - - async def handle_upload_with_queue(self, file_id: str, user_id: str, - filename: str, file_bytes: bytes, - mime_type: str, cabinet_id: str, - priority: int = 1) -> Dict[str, Any]: - """ - Handle file upload with intelligent queuing. - - Steps: - 1. Store file immediately (cheap operation) - 2. Add to processing queue - 3. Return queue status to user - 4. Process asynchronously when capacity available - """ - - # Store file immediately (this is fast) - storage = StorageAdmin() - client = SupabaseServiceRoleClient() - - # Create database record - bucket = f"{cabinet_id}-files" # or your bucket naming convention - storage_path = f"{cabinet_id}/{file_id}/{filename}" - - try: - # Store file - storage.upload_file(bucket, storage_path, file_bytes, mime_type, upsert=True) - - # Create file record - insert_res = client.supabase.table('files').insert({ - 'id': file_id, - 'name': filename, - 'cabinet_id': cabinet_id, - 'bucket': bucket, - 'path': storage_path, - 'mime_type': mime_type, - 'uploaded_by': user_id, - 'size_bytes': len(file_bytes), - 'source': 'classroomcopilot-web', - 'status': 'queued_for_processing' # New status - }).execute() - - if not insert_res.data: - raise HTTPException(status_code=500, detail="Failed to create file record") - - except Exception as e: - logger.error(f"Failed to store file {file_id}: {e}") - raise HTTPException(status_code=500, detail=f"Storage failed: {str(e)}") - - # Add to processing queue - try: - queue_result = self.memory_queue.enqueue_file( - file_id=file_id, - user_id=user_id, - filename=filename, - size_bytes=len(file_bytes), - mime_type=mime_type, - cabinet_id=cabinet_id, - priority=priority - ) - - # Update file status in database - client.supabase.table('files').update({ - 'status': 'queued_for_processing', - 'extra': { - 'queue_position': queue_result['queue_position'], - 'estimated_wait_seconds': queue_result['estimated_wait_seconds'], - 'memory_estimate_mb': queue_result['memory_estimate_mb'] - } - }).eq('id', file_id).execute() - - logger.info(f"📋 File {file_id} queued at position {queue_result['queue_position']}") - - return { - 'status': 'upload_successful', - 'message': 'File uploaded and queued for processing', - 'file_id': file_id, - 'queue_info': queue_result, - 'next_steps': { - 'poll_status_endpoint': f'/database/files/{file_id}/processing-status', - 'websocket_updates': f'/ws/file-processing/{file_id}' - } - } - - except Exception as e: - logger.error(f"Failed to queue file {file_id}: {e}") - # Clean up stored file - try: - storage.delete_file(bucket, storage_path) - client.supabase.table('files').delete().eq('id', file_id).execute() - except: - pass - raise HTTPException(status_code=500, detail=f"Queue failed: {str(e)}") - - async def process_queued_files(self, service_name: str = "document_processor"): - """ - Background service to process queued files. - This runs continuously as a background task. - """ - - logger.info(f"🚀 Started queue processor for {service_name}") - - while True: - try: - # Get next file from queue - queued_file = self.memory_queue.dequeue_next_file(service_name) - - if not queued_file: - # No files ready for processing - await asyncio.sleep(5) - continue - - # Update file status - await self._update_processing_status(queued_file.file_id, 'processing') - - # Process the file - try: - await self._process_file(queued_file, service_name) - await self._update_processing_status(queued_file.file_id, 'completed') - - except Exception as e: - logger.error(f"Failed to process file {queued_file.file_id}: {e}") - await self._update_processing_status(queued_file.file_id, 'failed', str(e)) - - finally: - # Always free memory - self.memory_queue.complete_processing( - service_name, - queued_file.file_id, - queued_file.memory_estimate_mb - ) - - except Exception as e: - logger.error(f"Queue processor error: {e}") - await asyncio.sleep(10) # Back off on errors - - async def _process_file(self, queued_file: QueuedFile, service_name: str): - """Process a single file from the queue.""" - - logger.info(f"🔄 Processing file {queued_file.file_id} in {service_name}") - - # Import here to avoid circular imports - from modules.pipeline_controller import get_pipeline_controller - - client = SupabaseServiceRoleClient() - controller = get_pipeline_controller() - - # Get file record - file_result = client.supabase.table('files').select('*').eq('id', queued_file.file_id).single().execute() - file_row = file_result.data - - if not file_row: - raise Exception(f"File record not found: {queued_file.file_id}") - - # Update status to processing - client.supabase.table('files').update({ - 'status': 'processing' - }).eq('id', queued_file.file_id).execute() - - # Convert to PDF if needed (this is where the bottleneck was before) - processing_path = await self._handle_pdf_conversion(file_row) - - # Enqueue Phase 1 tasks - phase1_tasks = controller.enqueue_phase1_tasks( - file_id=queued_file.file_id, - file_row={**file_row, 'path': processing_path}, - processing_path=processing_path, - processing_mime=file_row['mime_type'] - ) - - # Update database with task IDs - client.supabase.table('files').update({ - 'status': 'phase1_processing', - 'extra': { - **file_row.get('extra', {}), - 'phase1_tasks': phase1_tasks, - 'processing_started_at': time.time() - } - }).eq('id', queued_file.file_id).execute() - - logger.info(f"✅ File {queued_file.file_id} processing initiated") - - async def _handle_pdf_conversion(self, file_row: Dict[str, Any]) -> str: - """Handle PDF conversion asynchronously.""" - - if file_row['mime_type'] == 'application/pdf': - return file_row['path'] - - # TODO: Implement async PDF conversion - # For now, return original path and handle conversion in pipeline - logger.info(f"PDF conversion queued for file {file_row['id']}") - return file_row['path'] - - async def _update_processing_status(self, file_id: str, status: str, error: str = None): - """Update file processing status.""" - - status_data = { - 'file_id': file_id, - 'status': status, - 'timestamp': time.time(), - 'error': error - } - - # Store in Redis for real-time updates - status_key = f"{self.processing_status_key}:{file_id}" - self.redis_client.setex(status_key, 86400, json.dumps(status_data)) # 24h expiry - - # Update database - client = SupabaseServiceRoleClient() - client.supabase.table('files').update({ - 'status': status, - 'error_message': error - }).eq('id', file_id).execute() - - logger.info(f"📊 Status update for {file_id}: {status}") - - def get_processing_status(self, file_id: str) -> Dict[str, Any]: - """Get current processing status for a file.""" - - status_key = f"{self.processing_status_key}:{file_id}" - status_json = self.redis_client.get(status_key) - - if status_json: - return json.loads(status_json) - - # Fallback to database - client = SupabaseServiceRoleClient() - result = client.supabase.table('files').select('status, error_message, extra').eq('id', file_id).single().execute() - - if result.data: - return { - 'file_id': file_id, - 'status': result.data['status'], - 'error': result.data.get('error_message'), - 'extra': result.data.get('extra', {}) - } - - return {'file_id': file_id, 'status': 'not_found'} - - def _estimate_processing_time(self, file_size: int, mime_type: str) -> Dict[str, Any]: - """Estimate processing time for a file.""" - - # Base time estimates (in seconds) - base_times = { - 'application/pdf': 60, # 1 minute per MB - 'application/msword': 120, # 2 minutes per MB - 'image/': 30 # 30 seconds per MB - } - - # Find matching mime type - time_per_mb = 60 # default - for mime_prefix, time_val in base_times.items(): - if mime_type.startswith(mime_prefix): - time_per_mb = time_val - break - - file_size_mb = file_size / (1024 * 1024) - estimated_seconds = int(file_size_mb * time_per_mb) - - return { - 'estimated_seconds': estimated_seconds, - 'estimated_minutes': estimated_seconds / 60, - 'phases': { - 'pdf_conversion': estimated_seconds * 0.2, - 'metadata_extraction': estimated_seconds * 0.3, - 'docling_processing': estimated_seconds * 0.5 - } - } - - def _get_recommendations(self, queue_info: Dict[str, Any]) -> List[str]: - """Get recommendations for user when upload is rejected.""" - - recommendations = [] - - if queue_info.get('reason') == 'file_too_large': - recommendations.append("Try compressing your file or splitting it into smaller parts") - - if queue_info.get('utilization', 0) > 0.9: - recommendations.append("System is currently overloaded. Try uploading during off-peak hours") - recommendations.append("Consider uploading smaller files first") - - if queue_info.get('user_current', 0) > 0: - recommendations.append("Wait for your current uploads to complete before uploading more") - - if not recommendations: - recommendations.append("Please try again in a few minutes") - - return recommendations - -# Convenience functions -def get_upload_handler(environment: str = "dev") -> EnhancedUploadHandler: - """Get enhanced upload handler instance.""" - return EnhancedUploadHandler(environment) - -import json diff --git a/archive/auto_processing/files_with_auto_processing.py b/archive/auto_processing/files_with_auto_processing.py deleted file mode 100644 index 3188335..0000000 --- a/archive/auto_processing/files_with_auto_processing.py +++ /dev/null @@ -1,997 +0,0 @@ -import os -import io -from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, BackgroundTasks -from typing import Any, Dict, Optional -import uuid -import re -import requests -import os -import tempfile -from pathlib import Path -from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str -from modules.logger_tool import initialise_logger -from modules.database.supabase.utils.client import SupabaseServiceRoleClient -from modules.database.supabase.utils.storage import StorageAdmin -from modules.document_processor import DocumentProcessor -from modules.queue_system import ( - enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task, - enqueue_document_analysis_task, enqueue_page_images_task, - TaskPriority, get_queue, QueueConnectionError -) -from fastapi.responses import Response -from fastapi import Body - -router = APIRouter() -auth = SupabaseBearer() -doc_processor = DocumentProcessor() - -DEFAULT_BUCKET = os.getenv('DEFAULT_FILES_BUCKET', 'cc.users') - -# Timeout configurations (in seconds) -TIKA_TIMEOUT = int(os.getenv('TIKA_TIMEOUT', '300')) # 5 minutes default -DOCLING_FRONTMATTER_TIMEOUT = int(os.getenv('DOCLING_FRONTMATTER_TIMEOUT', '1800')) # 30 minutes default -DOCLING_NOOCR_TIMEOUT = int(os.getenv('DOCLING_NOOCR_TIMEOUT', '3600')) # 1 hour default - -# (Legacy feature flags removed - using new three-phase system) - -logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True) - -def _safe_filename(name: str) -> str: - base = os.path.basename(name or 'file') - return re.sub(r"[^A-Za-z0-9._-]+", "_", base) - -def _choose_bucket(scope: str, user_id: str, school_id: Optional[str]) -> str: - scope = (scope or 'teacher').lower() - if scope == 'school' and school_id: - return f"cc.institutes.{school_id}.private" - # teacher / student fall back to users bucket for now - return 'cc.users' - -@router.post("/files/upload") -async def upload_file( - cabinet_id: str = Form(...), - path: str = Form(...), - scope: str = Form('teacher'), - school_id: Optional[str] = Form(default=None), - file: UploadFile = File(...), - payload: Dict[str, Any] = Depends(auth), - background_tasks: BackgroundTasks = None -): - user_id = payload.get('sub') or payload.get('user_id') - if not user_id: - raise HTTPException(status_code=401, detail="Invalid token payload") - - client = SupabaseServiceRoleClient() - storage = StorageAdmin() - - # Determine target bucket by scope - bucket = _choose_bucket(scope, user_id, school_id) - - # Stage DB row to get file_id - staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}" - name = _safe_filename(path or file.filename) - file_bytes = await file.read() - insert_res = client.supabase.table('files').insert({ - 'cabinet_id': cabinet_id, - 'name': name, - 'path': staged_path, - 'bucket': bucket, - 'mime_type': file.content_type, - 'uploaded_by': user_id, - 'size_bytes': len(file_bytes), - 'source': 'classroomcopilot-web' - }).execute() - if not insert_res.data: - raise HTTPException(status_code=500, detail="Failed to create file record") - file_row = insert_res.data[0] - file_id = file_row['id'] - - # Final storage path: bucket/cabinet_id/file_id/file - final_storage_path = f"{cabinet_id}/{file_id}/{name}" - try: - storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True) - except Exception as e: - # cleanup staged row - client.supabase.table('files').delete().eq('id', file_id).execute() - raise HTTPException(status_code=500, detail=f"Storage upload failed: {str(e)}") - - # Update DB path to final - update_res = client.supabase.table('files').update({ - 'path': final_storage_path - }).eq('id', file_id).execute() - # Kick off initial artefacts generation in background (Tika + Docling frontmatter + no-OCR) - try: - if background_tasks is not None: - logger.info(f"Scheduling initial artefacts generation for file_id={file_id}") - background_tasks.add_task(generate_initial_artefacts, file_id, payload) - else: - logger.info(f"Running initial artefacts generation synchronously for file_id={file_id}") - generate_initial_artefacts(file_id, payload) - except Exception as e: - logger.error(f"Failed to schedule initial artefacts for file_id={file_id}: {e}") - - return update_res.data - -@router.get("/files") -def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)): - client = SupabaseServiceRoleClient() - res = client.supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute() - return res.data - -@router.post("/files/{file_id}/move") -def move_file(file_id: str, body: Dict[str, Any], payload: Dict[str, Any] = Depends(auth)): - client = SupabaseServiceRoleClient() - updates = {} - if 'cabinet_id' in body: - updates['cabinet_id'] = body['cabinet_id'] - if 'path' in body: - updates['path'] = body['path'] - if not updates: - raise HTTPException(status_code=400, detail="No changes provided") - res = client.supabase.table('files').update(updates).eq('id', file_id).execute() - return res.data - -@router.delete("/files/{file_id}") -def delete_file(file_id: str, payload: Dict[str, Any] = Depends(auth)): - client = SupabaseServiceRoleClient() - res = client.supabase.table('files').delete().eq('id', file_id).execute() - return res.data - -@router.get("/files/{file_id}/artefacts") -def list_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)): - client = SupabaseServiceRoleClient() - res = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).order('created_at', desc=True).execute() - return res.data - -@router.get("/files/{file_id}/viewer-artefacts") -def list_viewer_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)): - """ - Get artefacts organized for UI viewer display, including frontmatter JSON, - processing bundles, and analysis data with proper display metadata. - """ - client = SupabaseServiceRoleClient() - - # Get all artefacts for the file - res = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).order('created_at', desc=True).execute() - all_artefacts = res.data or [] - - # Organize artefacts by category for UI display - viewer_artefacts = { - 'document_analysis': [], - 'processing_bundles': [], - 'raw_data': [] - } - - for artefact in all_artefacts: - artefact_type = artefact.get('type', '') - extra = artefact.get('extra', {}) - - # Enhanced artefact info for UI display - artefact_info = { - 'id': artefact['id'], - 'type': artefact_type, - 'display_name': extra.get('display_name'), - 'bundle_label': extra.get('bundle_label'), - 'section_title': extra.get('section_title'), - 'page_range': extra.get('page_range'), - 'page_count': extra.get('page_count'), - 'pipeline': extra.get('pipeline'), - 'processing_mode': extra.get('processing_mode'), - 'ui_order': extra.get('ui_order', 999), - 'description': extra.get('description'), - 'viewer_type': extra.get('viewer_type', 'json'), - 'created_at': artefact['created_at'], - 'status': artefact.get('status', 'unknown') - } - - # Categorize artefacts for UI organization - if artefact_type == 'docling_frontmatter_json': - artefact_info.update({ - 'display_name': artefact_info['display_name'] or 'Document Frontmatter', - 'bundle_label': artefact_info['bundle_label'] or 'Frontmatter Analysis', - 'description': artefact_info['description'] or 'OCR analysis of document structure and metadata', - 'ui_order': 1, - 'viewer_type': 'json' - }) - viewer_artefacts['document_analysis'].append(artefact_info) - - elif artefact_type == 'split_map_json': - artefact_info.update({ - 'display_name': 'Document Structure Map', - 'bundle_label': 'Split Map', - 'description': 'Document section boundaries and organization structure', - 'ui_order': 2, - 'viewer_type': 'json' - }) - viewer_artefacts['document_analysis'].append(artefact_info) - - elif artefact_type == 'tika_json': - artefact_info.update({ - 'display_name': 'Document Metadata', - 'bundle_label': 'Tika Analysis', - 'description': 'Raw document metadata and properties extracted by Apache Tika', - 'ui_order': 3, - 'viewer_type': 'json' - }) - viewer_artefacts['raw_data'].append(artefact_info) - - elif artefact_type in ['canonical_docling_json', 'docling_bundle_split', 'docling_bundle', 'docling_standard', 'docling_bundle_split_pages']: - # Processing bundles (OCR, No-OCR, VLM) - use original_pipeline for proper differentiation - pipeline_name = extra.get('original_pipeline', extra.get('pipeline', 'Unknown')) - bundle_label = artefact_info['bundle_label'] or f"{pipeline_name.upper().replace('_', '-')} Bundle" - display_name = artefact_info['display_name'] or f"{pipeline_name.upper().replace('_', '-')} Processing Result" - - # Special handling for master manifests - if artefact_type == 'docling_bundle_split_pages': - display_name = f"{pipeline_name.upper().replace('_', '-')} Document Pages" - bundle_label = f"{pipeline_name.upper().replace('_', '-')} Pages Bundle" - artefact_info.update({ - 'viewer_type': 'bundle_collection', - 'is_master_manifest': True, - 'ui_order': 10 # Show master manifests before individual pages - }) - elif artefact_type == 'docling_standard': - # Individual page bundles - lower UI priority - artefact_info.update({ - 'viewer_type': 'page_bundle', - 'is_individual_page': True, - 'ui_order': extra.get('split_order', 999) + 100 # Show after master manifests - }) - - artefact_info.update({ - 'display_name': display_name, - 'bundle_label': bundle_label, - 'description': f"Docling processing result using {pipeline_name.replace('_', '-')} pipeline", - 'pipeline_type': pipeline_name # Add explicit pipeline type for UI - }) - viewer_artefacts['processing_bundles'].append(artefact_info) - - elif artefact_type.startswith('docling_') and artefact_type.endswith('_json'): - # Other docling JSON results - pipeline_name = artefact_type.replace('docling_', '').replace('_json', '').upper() - artefact_info.update({ - 'display_name': f"{pipeline_name} Analysis", - 'bundle_label': f"{pipeline_name} Result", - 'description': f"Docling {pipeline_name.lower()} processing result", - 'viewer_type': 'json' - }) - viewer_artefacts['processing_bundles'].append(artefact_info) - - elif artefact_type == 'page_images': - artefact_info.update({ - 'display_name': 'Page Images', - 'bundle_label': 'Visual Pages', - 'description': 'Generated page images for document visualization', - 'viewer_type': 'images' - }) - viewer_artefacts['raw_data'].append(artefact_info) - - # Sort each category by ui_order - for category in viewer_artefacts.values(): - category.sort(key=lambda x: (x['ui_order'], x['created_at'])) - - return { - 'file_id': file_id, - 'categories': viewer_artefacts, - 'total_artefacts': len(all_artefacts) - } - -@router.post("/files/{file_id}/artefacts/initial") -def generate_initial_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)): - """ - Generate initial artefacts using the new three-phase pipeline architecture. - - Phase 1: Document Structure Discovery & Analysis - - Tika metadata extraction - - Page images generation - - Document structure analysis (LLM-enhanced) - - Split map generation - - Phase 2: Triggered automatically after Phase 1 completion - """ - logger.info(f"Three-phase pipeline: Starting Phase 1 for file_id={file_id}") - - from modules.pipeline_controller import get_pipeline_controller - - client = SupabaseServiceRoleClient() - storage = StorageAdmin() - controller = get_pipeline_controller() - - # Load file row - fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute() - file_row = fr.data - if not file_row: - raise HTTPException(status_code=404, detail="File not found") - - bucket = file_row['bucket'] - storage_path = file_row['path'] - cabinet_id = file_row['cabinet_id'] - mime = file_row.get('mime_type') or 'application/octet-stream' - filename = file_row.get('name', 'file') - - # Step 1: Convert to PDF if not already a PDF (synchronous for now) - processing_path = storage_path - processing_mime = mime - - if mime != 'application/pdf': - logger.info(f"Converting non-PDF file to PDF: file_id={file_id} mime={mime}") - try: - file_bytes = storage.download_file(bucket, storage_path) - - with tempfile.TemporaryDirectory() as temp_dir: - # Save original file to temp location - temp_input = Path(temp_dir) / filename - with open(temp_input, 'wb') as f: - f.write(file_bytes) - - # Convert to PDF - pdf_bytes = doc_processor.convert_to_pdf(temp_input) - - # Store PDF as artefact - pdf_artefact_id = str(uuid.uuid4()) - pdf_rel_path = f"{cabinet_id}/{file_id}/{pdf_artefact_id}/document.pdf" - storage.upload_file(bucket, pdf_rel_path, pdf_bytes, 'application/pdf', upsert=True) - - pdf_ar = client.supabase.table('document_artefacts').insert({ - 'file_id': file_id, - 'type': 'document_pdf', - 'rel_path': pdf_rel_path, - 'extra': {'converted_from': mime, 'original_filename': filename}, - 'status': 'completed' - }).execute() - - # Use converted PDF for subsequent processing - processing_path = pdf_rel_path - processing_mime = 'application/pdf' - logger.info(f"PDF conversion: completed file_id={file_id} rel_path={pdf_rel_path}") - - except Exception as e: - logger.error(f"PDF conversion: error processing file_id={file_id}: {e}") - # Continue with original file if conversion fails - else: - logger.info(f"File is already PDF, skipping conversion: file_id={file_id}") - - # Step 2: Enqueue Phase 1 tasks using the new pipeline controller - user_id = payload.get('sub') or payload.get('user_id') - priority = TaskPriority.HIGH if user_id else TaskPriority.NORMAL - - try: - # Update file row with processing path - updated_file_row = {**file_row, 'path': processing_path, 'mime_type': processing_mime} - - # Enqueue Phase 1 tasks - phase1_tasks = controller.enqueue_phase1_tasks( - file_id=file_id, - file_row=updated_file_row, - processing_path=processing_path, - processing_mime=processing_mime, - priority=priority - ) - - total_tasks = sum(len(task_list) for task_list in phase1_tasks.values()) - - logger.info(f"Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks for file_id={file_id}") - - - return { - 'message': f'Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks. Phase 2 will trigger automatically after completion.', - 'phase1_tasks': {k: v for k, v in phase1_tasks.items()}, - 'file_id': file_id, - 'pipeline_mode': 'three_phase', - 'bundle_architecture_enabled': True - } - - except QueueConnectionError as e: - logger.error(f"Queue system unavailable for file_id={file_id}: {e}") - logger.error("Redis is not running. Please start the API server with './start.sh dev' to auto-start Redis.") - return { - 'message': 'File uploaded successfully, but processing tasks could not be queued (Redis unavailable)', - 'file_id': file_id, - 'queue_status': 'unavailable', - 'error': 'Queue system unavailable. Please restart the API server with Redis enabled.' - } - except Exception as e: - logger.error(f"Unexpected error enqueueing Phase 1 tasks for file_id={file_id}: {e}") - return { - 'message': 'File uploaded successfully, but processing tasks failed to queue', - 'file_id': file_id, - 'queue_status': 'failed', - 'error': str(e) - } - -@router.get("/files/{file_id}/page-images/manifest") -def get_page_images_manifest(file_id: str, payload: Dict[str, Any] = Depends(auth)): - """Return the page_images manifest JSON for a file via service-role access.""" - client = SupabaseServiceRoleClient() - storage = StorageAdmin() - - # Find file row to get bucket - fr = client.supabase.table('files').select('id,bucket,cabinet_id').eq('id', file_id).single().execute() - file_row = fr.data or {} - if not file_row: - raise HTTPException(status_code=404, detail="File not found") - bucket = file_row['bucket'] - cabinet_id = file_row['cabinet_id'] - - # Find page_images artefact - arts = client.supabase.table('document_artefacts') \ - .select('id,type,rel_path,extra') \ - .eq('file_id', file_id).eq('type', 'page_images') \ - .order('created_at', desc=True).limit(1).execute().data or [] - if not arts: - raise HTTPException(status_code=404, detail="page_images artefact not found") - art = arts[0] - - # Manifest path - manifest_rel_path = (art.get('extra') or {}).get('manifest') or f"{art['rel_path'].rstrip('/')}/page_images.json" - - try: - raw = storage.download_file(bucket, manifest_rel_path) - import json as _json - manifest = _json.loads(raw.decode('utf-8')) - # Ensure bucket and base prefix are present for the UI - manifest.setdefault('bucket', bucket) - manifest.setdefault('base_dir', art['rel_path']) - return manifest - except Exception as e: - raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}") - -def json_dumps(obj: Any) -> str: - try: - import json - return json.dumps(obj, ensure_ascii=False) - except Exception: - return "{}" - - -@router.get("/files/{file_id}/artefacts/{artefact_id}/json") -def get_artefact_json(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)): - """Return the JSON content of a document artefact using service-role storage access.""" - client = SupabaseServiceRoleClient() - storage = StorageAdmin() - # Look up artefact to get rel_path and validate it belongs to file - ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path').eq('id', artefact_id).single().execute() - artefact = ar.data - if not artefact: - raise HTTPException(status_code=404, detail="Artefact not found") - if artefact.get('file_id') != file_id: - raise HTTPException(status_code=400, detail="Artefact does not belong to file") - - # Look up file to get bucket - fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute() - file_row = fr.data - if not file_row: - raise HTTPException(status_code=404, detail="File not found") - - bucket = file_row['bucket'] - rel_path = artefact['rel_path'] - try: - raw = storage.download_file(bucket, rel_path) - import json as _json - data = _json.loads(raw.decode('utf-8')) - return data - except Exception as e: - raise HTTPException(status_code=500, detail=f"Failed to load artefact JSON: {str(e)}") - - -@router.get("/files/{file_id}/artefacts/{artefact_id}/vlm-section-manifest") -def get_vlm_section_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)): - """Return the VLM section page bundle manifest JSON for a VLM section bundle artefact.""" - client = SupabaseServiceRoleClient() - storage = StorageAdmin() - - ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path,type,extra').eq('id', artefact_id).single().execute().data - if not ar: - raise HTTPException(status_code=404, detail="Artefact not found") - if ar.get('file_id') != file_id: - raise HTTPException(status_code=400, detail="Artefact does not belong to file") - if ar.get('type') != 'vlm_section_page_bundle': - raise HTTPException(status_code=400, detail="Artefact is not a VLM section page bundle") - - fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data - if not fr: - raise HTTPException(status_code=404, detail="File not found") - bucket = fr['bucket'] - - # The rel_path directly points to the manifest JSON file - manifest_rel_path = ar['rel_path'] - - try: - raw = storage.download_file(bucket, manifest_rel_path) - import json as _json - data = _json.loads(raw.decode('utf-8')) - # ensure bucket present for client use - data.setdefault('bucket', bucket) - return data - except Exception as e: - raise HTTPException(status_code=500, detail=f"Failed to load VLM section manifest: {e}") - - -@router.post("/files/{file_id}/artefacts/outline") -def enqueue_outline_structure(file_id: str, payload: Dict[str, Any] = Depends(auth)): - """ - Manually enqueue the fast document outline (headings-only) analysis for an existing file. - Returns the queued task id. - """ - client = SupabaseServiceRoleClient() - - fr = client.supabase.table('files').select('id,bucket,cabinet_id,path,mime_type').eq('id', file_id).single().execute() - file_row = fr.data - if not file_row: - raise HTTPException(status_code=404, detail="File not found") - - bucket = file_row['bucket'] - storage_path = file_row['path'] - cabinet_id = file_row['cabinet_id'] - mime = file_row.get('mime_type') or 'application/pdf' - - # Prefer converted PDF artefact if available - arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or [] - pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None) - processing_path = pdf_art['rel_path'] if pdf_art else storage_path - - try: - task_id = enqueue_docling_task( - file_id=file_id, - task_type='document_structure_analysis', - payload={ - 'bucket': bucket, - 'file_path': processing_path, - 'cabinet_id': cabinet_id, - 'mime_type': mime, - 'config': { - 'target_type': 'inbody', - 'to_formats': 'json', - 'do_ocr': False, - 'force_ocr': False - } - }, - priority=TaskPriority.NORMAL, - timeout=300 - ) - return { 'message': 'outline task enqueued', 'task_id': task_id, 'file_id': file_id } - except QueueConnectionError as e: - raise HTTPException(status_code=503, detail=f"Queue unavailable: {e}") - except Exception as e: - raise HTTPException(status_code=500, detail=f"Failed to enqueue outline task: {e}") - -@router.get("/files/proxy") -def proxy_storage_file(bucket: str, path: str, payload: Dict[str, Any] = Depends(auth)): - """Proxy a storage file (service-role), useful for private image access in the UI.""" - storage = StorageAdmin() - try: - data = storage.download_file(bucket, path) - media = 'application/octet-stream' - lp = path.lower() - if lp.endswith('.png'): - media = 'image/png' - elif lp.endswith('.webp'): - media = 'image/webp' - elif lp.endswith('.jpg') or lp.endswith('.jpeg'): - media = 'image/jpeg' - elif lp.endswith('.json'): - media = 'application/json' - return Response(content=data, media_type=media) - except Exception as e: - raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}") - - -# Signed proxy for iframe/img tags without Authorization header -@router.get("/files/proxy_signed") -def proxy_storage_file_signed(bucket: str, path: str, token: str): - """Proxy using a signed bearer token passed as query param 'token'.""" - try: - payload = verify_supabase_jwt_str(token) - if not payload: - raise HTTPException(status_code=403, detail="Invalid token") - except Exception as e: - raise HTTPException(status_code=403, detail=f"Invalid token: {e}") - - storage = StorageAdmin() - try: - data = storage.download_file(bucket, path) - media = 'application/octet-stream' - lp = path.lower() - if lp.endswith('.png'): - media = 'image/png' - elif lp.endswith('.webp'): - media = 'image/webp' - elif lp.endswith('.jpg') or lp.endswith('.jpeg'): - media = 'image/jpeg' - elif lp.endswith('.json'): - media = 'application/json' - return Response(content=data, media_type=media) - except Exception as e: - raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}") - -# -------- Canonical bundle manifest --------- - -@router.get("/files/{file_id}/artefacts/{artefact_id}/manifest") -def get_canonical_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)): - """Return the manifest.json for a canonical_docling_bundle artefact.""" - client = SupabaseServiceRoleClient() - storage = StorageAdmin() - - ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path,extra').eq('id', artefact_id).single().execute().data - if not ar: - raise HTTPException(status_code=404, detail="Artefact not found") - if ar.get('file_id') != file_id: - raise HTTPException(status_code=400, detail="Artefact does not belong to file") - extra = ar.get('extra') or {} - manifest_rel_path = extra.get('manifest') - if not manifest_rel_path: - raise HTTPException(status_code=404, detail="Manifest path not recorded on artefact") - - fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data - if not fr: - raise HTTPException(status_code=404, detail="File not found") - bucket = fr['bucket'] - - try: - raw = storage.download_file(bucket, manifest_rel_path) - import json as _json - data = _json.loads(raw.decode('utf-8')) - # ensure bucket present for client use - data.setdefault('bucket', bucket) - return data - except Exception as e: - raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}") - -# -------- Canonical Docling generation --------- - -def _load_split_map(client: SupabaseServiceRoleClient, storage: StorageAdmin, bucket: str, file_id: str) -> Optional[Dict[str, Any]]: - try: - arts = client.supabase.table('document_artefacts') \ - .select('id,type,rel_path') \ - .eq('file_id', file_id).eq('type', 'split_map_json') \ - .order('created_at', desc=True).limit(1).execute().data or [] - if not arts: - return None - art = arts[0] - raw = storage.download_file(bucket, art['rel_path']) - import json as _json - return _json.loads(raw.decode('utf-8')) - except Exception: - return None - - -@router.post("/files/{file_id}/artefacts/canonical-docling") -def enqueue_canonical_docling( - file_id: str, - body: Dict[str, Any] = Body(default={}), - payload: Dict[str, Any] = Depends(auth) -): - """Enqueue generation of canonical Docling JSON(s) for a file. - - If a split_map is available and the document is large, this will enqueue - multiple Docling jobs using page ranges per section. Otherwise a single - job is created for the whole document. - """ - client = SupabaseServiceRoleClient() - storage = StorageAdmin() - - fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute() - file_row = fr.data - if not file_row: - raise HTTPException(status_code=404, detail="File not found") - - bucket = file_row['bucket'] - cabinet_id = file_row['cabinet_id'] - mime = file_row.get('mime_type') or 'application/pdf' - storage_path = file_row['path'] - - # Prefer converted PDF if available - try: - arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or [] - a_pdf = next((a for a in arts if a.get('type') == 'document_pdf'), None) - processing_path = a_pdf['rel_path'] if a_pdf else storage_path - processing_mime = 'application/pdf' if a_pdf else mime - except Exception: - processing_path = storage_path - processing_mime = mime - - # Determine page_count (prefer Tika; fallback to PDF parser if needed) - page_count = None - try: - arts_pc = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).execute().data or [] - a_tika_pc = next((a for a in arts_pc if a.get('type') == 'tika_json'), None) - if a_tika_pc: - raw = storage.download_file(bucket, a_tika_pc['rel_path']) - import json as _json - tj = _json.loads(raw.decode('utf-8')) - for k in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount"): - v = tj.get(k) or tj.get(k.lower()) - if v is not None: - page_count = int(v) - break - except Exception as e: - logger.debug(f"[canonical-docling] Tika page_count read failed: {e}") - pass - - # Fallback: compute page_count from PDF if Tika did not provide it - if page_count is None: - try: - pdf_bytes = storage.download_file(bucket, processing_path) - try: - import fitz # PyMuPDF - doc = fitz.open(stream=pdf_bytes, filetype='pdf') - page_count = int(doc.page_count) - doc.close() - logger.info(f"[canonical-docling] page_count via PyMuPDF: {page_count}") - except Exception: - try: - from PyPDF2 import PdfReader - reader = PdfReader(io.BytesIO(pdf_bytes)) - page_count = int(len(reader.pages)) - logger.info(f"[canonical-docling] page_count via PyPDF2: {page_count}") - except Exception: - page_count = None - except Exception: - page_count = None - else: - logger.info(f"[canonical-docling] page_count via Tika: {page_count}") - - # Optional custom range from caller - custom_range = body.get('custom_range') - custom_label = body.get('custom_label') or '' - selected_section_id = body.get('selected_section_id') - selected_section_title = body.get('selected_section_title') - - # Load split map if requested and document is large enough - use_split_requested = bool(body.get('use_split_map', True)) - split_threshold = int(body.get('threshold') or os.getenv('DOCLING_SPLIT_THRESHOLD', '50')) - ranges = [] # list of (start,end) - split_map = None - sections = [] # list of dicts: {start,end,title} - logger.info(f"[canonical-docling] use_split_map={use_split_requested} threshold={split_threshold} page_count={page_count}") - # If custom range provided, honor it and bypass split map - if isinstance(custom_range, list) and len(custom_range) >= 2: - try: - cs = int(custom_range[0]); ce = int(custom_range[1]) - if page_count is not None: - cs = max(1, min(cs, page_count)) - ce = max(cs, min(ce, page_count)) - ranges = [(cs, ce)] - sections = [{'start': cs, 'end': ce, 'title': custom_label or 'Custom range'}] - use_split_requested = False - logger.info(f"[canonical-docling] using custom_range start={cs} end={ce} label='{custom_label}'") - except Exception as _e: - logger.warning(f"[canonical-docling] invalid custom_range; falling back. err={_e}") - - if not ranges and use_split_requested and (page_count is None or page_count >= split_threshold): - split_map = _load_split_map(client, storage, bucket, file_id) - entries = (split_map or {}).get('entries') if split_map else [] - logger.info(f"[canonical-docling] split_map loaded entries={len(entries) if isinstance(entries, list) else 0}") - if split_map and isinstance(entries, list) and len(entries) > 0: - # Normalize and sort entries by start_page to enforce correct order - norm: list[dict] = [] - for e in entries: - try: - s = int(e.get('start_page', 1)) - t = int(e.get('end_page', s)) - if t < s: - t = s - title = e.get('title') or e.get('label') or '' - norm.append({'start': s, 'end': t, 'title': title}) - except Exception: - continue - norm.sort(key=lambda x: x['start']) - # Deduplicate identical or overlapping starts by keeping the earliest occurrence - ordered: list[dict] = [] - last_end = 0 - for e in norm: - s, t = int(e['start']), int(e['end']) - if ordered and s <= last_end: - # Clamp to prevent inversion and maintain order - s = last_end + 1 - if s > (page_count or s): - continue - if t < s: - t = s - last_end = max(last_end, t) - ordered.append({'start': s, 'end': t, 'title': e['title']}) - for e in ordered: - ranges.append((e['start'], e['end'])) - sections.append(e) - - # Fallback: if no split_map ranges... we shouldn't be here - if not ranges: - # If document is large, split into fixed windows to protect Docling server - if page_count is not None and page_count >= split_threshold: - chunk = int(os.getenv('DOCLING_FALLBACK_CHUNK_PAGES', '25')) - chunk = max(5, min(100, chunk)) - for i in range(1, (page_count or 1) + 1, chunk): - end = min(i + chunk - 1, page_count or i) - ranges.append((i, end)) - sections.append({'start': i, 'end': end, 'title': f"Pages {i}-{end}"}) - logger.warning(f"[canonical-docling] using fallback chunking ranges={len(ranges)} chunk={chunk}") - else: - ranges = [(1, page_count or 9223372036854775807)] - logger.warning(f"[canonical-docling] using single-range fallback (small doc)") - - # Build config - cfg = body.get('config', {}) - pipeline = cfg.get('pipeline', 'standard') - config: Dict[str, Any] = { - # target_type is computed in processor based on to_formats unless explicitly provided by user - 'to_formats': cfg.get('to_formats', 'json'), - 'do_ocr': bool(cfg.get('do_ocr', True)), - 'force_ocr': bool(cfg.get('force_ocr', False)), - 'image_export_mode': cfg.get('image_export_mode', 'embedded'), - 'ocr_engine': cfg.get('ocr_engine', 'easyocr'), - 'ocr_lang': cfg.get('ocr_lang', 'en'), - 'pdf_backend': cfg.get('pdf_backend', 'dlparse_v4'), - 'table_mode': cfg.get('table_mode', 'fast'), - 'pipeline': pipeline, - 'do_picture_classification': bool(cfg.get('do_picture_classification', False)), - 'do_picture_description': bool(cfg.get('do_picture_description', False)), - } - # If user explicitly set target_type, pass it through - if 'target_type' in cfg: - config['target_type'] = cfg['target_type'] - # Optional VLM settings (only include API fields if provided as JSON by caller) - if config['do_picture_description']: - pd_api = cfg.get('picture_description_api') - if isinstance(pd_api, (dict, list)): - config['picture_description_api'] = pd_api - elif isinstance(pd_api, str) and pd_api.strip().startswith(('{', '[')): - config['picture_description_api'] = pd_api - if cfg.get('picture_description_prompt'): - config['picture_description_prompt'] = cfg['picture_description_prompt'] - if pipeline == 'vlm': - # Provider presets mapping - provider = (cfg.get('vlm_provider') or '').strip().lower() - provider_model = (cfg.get('vlm_provider_model') or '').strip() - provider_base = (cfg.get('vlm_provider_base_url') or '').strip() - if provider in ('ollama', 'openai') and provider_model: - if provider == 'ollama': - base_url = provider_base or os.getenv('OLLAMA_BASE_URL') or os.getenv('VLM_OLLAMA_BASE_URL') - if base_url: - endpoint = f"{base_url.rstrip('/')}/v1/chat/completions" - # Use OpenAI provider schema against Ollama's OpenAI-compatible endpoint - cfg_api = { - 'provider': 'openai', - 'url': endpoint, - 'model': provider_model, - 'response_format': 'markdown', - 'request_params': {'model': provider_model} - } - logger.info(f"[canonical-docling] VLM provider=ollama mapped to openai-compatible url={endpoint} model={provider_model}") - config['vlm_pipeline_model_api'] = cfg_api - # Also wire picture_description_api if picture description is enabled - if config.get('do_picture_description'): - config['picture_description_api'] = { - 'url': endpoint, - 'headers': {}, - 'params': {'model': provider_model} - } - elif provider == 'openai': - base_url = provider_base or os.getenv('OPENAI_BASE_URL') or 'https://api.openai.com/v1' - api_key = os.getenv('OPENAI_API_KEY') or os.getenv('OPENAI_API_KEY_READONLY') - # Do not inline key if not present; server may have default - model_cfg: Dict[str, Any] = { - 'provider': 'openai', - 'url': f"{base_url.rstrip('/')}/chat/completions", - 'model': provider_model, - 'response_format': 'markdown', - 'request_params': {'model': provider_model} - } - if api_key: - model_cfg['api_key'] = api_key - # Also pass explicit Authorization header for servers that expect it - model_cfg['headers'] = { - 'Authorization': f"Bearer {api_key}" - } - logger.info(f"[canonical-docling] VLM provider=openai url={model_cfg['url']} model={provider_model} api_key={'yes' if api_key else 'no'}") - config['vlm_pipeline_model_api'] = model_cfg - # Also wire picture_description_api if picture description is enabled - if config.get('do_picture_description'): - headers = {'Authorization': f"Bearer {api_key}"} if api_key else {} - config['picture_description_api'] = { - 'url': f"{base_url.rstrip('/')}/chat/completions", - 'headers': headers, - 'params': {'model': provider_model} - } - else: - # Pass through explicit API/local JSON if provided by caller - vpa = cfg.get('vlm_pipeline_model_api') - if isinstance(vpa, (dict, list)): - config['vlm_pipeline_model_api'] = vpa - elif isinstance(vpa, str) and vpa.strip().startswith(('{', '[')): - config['vlm_pipeline_model_api'] = vpa - - # Enqueue tasks for each range - priority = TaskPriority.HIGH - task_ids = [] - multi = len(ranges) > 1 - logger.info(f"[canonical-docling] final ranges={len(ranges)} multi={multi} pipeline={pipeline} producer={body.get('producer', 'manual')}") - - # Create a group id for split bundles (used for UI grouping) - # Use provided group_id if present (for two-pass auto system), otherwise generate new - group_id = body.get('group_id') or (str(uuid.uuid4()) if multi else None) - if multi and not sections: - # Build sections from ranges if titles were not captured - for (start, end) in ranges: - sections.append({'start': int(start), 'end': int(end), 'title': ''}) - - idx = 0 - for (start, end) in ranges: - # Locate title for this range if available - title = '' - if multi and sections and idx < len(sections): - title = sections[idx].get('title') or '' - idx += 1 - - cfg_range = dict(config) - # Ensure 1-based inclusive range is passed through - cfg_range['page_range'] = [max(1, int(start)), max(int(start), int(end))] - extra = { - 'is_subdoc': multi, - 'page_range': [int(start), int(end)], - 'label': (title or f"subdoc p{int(start)}-{int(end)}") if multi else 'canonical' - } - # Attach selected section metadata if provided by caller - if selected_section_id: - extra['selected_section_id'] = selected_section_id - if selected_section_title or custom_label: - extra['selected_section_title'] = selected_section_title or custom_label - # For split processing, force split bundle artefact type and add grouping/order metadata - if multi: - extra.update({ - # UI grouping metadata - 'split_order': idx, - 'split_heading': title, - 'split_total': len(ranges) - }) - if group_id: - extra['group_id'] = group_id - extra['group_pack_type'] = 'docling_standard_auto_split' - else: - # Single-bundle case: allow caller to override type (defaults to canonical bundle) - if 'artefact_type_override' in body and body.get('artefact_type_override'): - extra['artefact_type_override'] = body.get('artefact_type_override') - - # Mark producer and selection metadata - extra['producer'] = body.get('producer') or ('auto_split' if (multi and body.get('use_split_map')) else 'manual') - if selected_section_id: - extra['selected_section_id'] = selected_section_id - if selected_section_title or custom_label: - extra['selected_section_title'] = selected_section_title or custom_label - - # Enhanced logging for canonical operations - if multi: - logger.info(f"[canonical-docling] enqueue range idx={idx}/{len(ranges)} start={start} end={end} title='{title}' group_id={group_id} producer={extra.get('producer')} pipeline={pipeline}") - else: - logger.info(f"[canonical-docling] enqueue single range start={start} end={end} producer={extra.get('producer')} pipeline={pipeline}") - tid = enqueue_docling_task( - file_id=file_id, - task_type='canonical_docling_subdoc_json' if multi else 'canonical_docling_json', - payload={ - 'bucket': bucket, - 'file_path': processing_path, - 'cabinet_id': cabinet_id, - 'mime_type': processing_mime, - 'config': cfg_range, - 'artefact_extra': extra, - # Ensure canonical tasks respect upstream dependencies (e.g., Frontmatter) - 'depends_on': body.get('depends_on', []), - # Pass through grouping info if provided by caller (kept for backward-compat) - 'group_pack_type': body.get('group_pack_type') - }, - priority=priority, - timeout=int(body.get('timeout', DOCLING_NOOCR_TIMEOUT)) - ) - task_ids.append(tid) - - logger.info(f"[canonical-docling] completed enqueue file_id={file_id} tasks={len(task_ids)} ranges={len(ranges)} pipeline={pipeline} producer={body.get('producer','manual')} group_id={group_id if multi else 'single'}") - - return { - 'message': f'enqueued {len(task_ids)} canonical docling job(s)', - 'task_ids': task_ids, - 'ranges': ranges, - 'used_split_map': bool(split_map), - 'group_id': group_id, - 'pipeline': pipeline, - 'producer': body.get('producer', 'manual') - } - diff --git a/archive/auto_processing/memory_aware_queue.py b/archive/auto_processing/memory_aware_queue.py deleted file mode 100644 index d673433..0000000 --- a/archive/auto_processing/memory_aware_queue.py +++ /dev/null @@ -1,411 +0,0 @@ -""" -Memory-Aware Queue Management System -==================================== - -Provides intelligent queue management based on memory usage and file sizes -rather than simple task count limits. Supports multiple users with fair -queuing and capacity management. - -Features: -- Memory-based queue limits (not just task count) -- Fair queuing across multiple users -- Upload capacity checking with user feedback -- Graceful degradation under load -- Service-specific memory tracking -""" - -import os -import time -import json -import uuid -import logging -from typing import Dict, List, Optional, Any, Tuple -from dataclasses import dataclass, asdict -from enum import Enum -import redis -from .redis_manager import get_redis_manager -import psutil - -logger = logging.getLogger(__name__) - -class QueueStatus(Enum): - ACCEPTING = "accepting" # Normal operation - BUSY = "busy" # High load, warn users - OVERLOADED = "overloaded" # Reject new uploads - MAINTENANCE = "maintenance" # Manual override - -@dataclass -class MemoryConfig: - """Memory configuration for queue management.""" - max_total_memory_mb: int = 2048 # 2GB total queue memory - max_user_memory_mb: int = 512 # 512MB per user - max_file_size_mb: int = 100 # 100MB max file size - memory_warning_threshold: float = 0.8 # Warn at 80% - memory_reject_threshold: float = 0.95 # Reject at 95% - -@dataclass -class QueuedFile: - """Represents a file waiting in the queue.""" - file_id: str - user_id: str - filename: str - size_bytes: int - mime_type: str - cabinet_id: str - priority: int = 1 - queued_at: float = 0 - estimated_processing_time: int = 300 # seconds - memory_estimate_mb: float = 0 - - def __post_init__(self): - if self.queued_at == 0: - self.queued_at = time.time() - - # Estimate memory usage (rough heuristic) - self.memory_estimate_mb = self._estimate_memory_usage() - - def _estimate_memory_usage(self) -> float: - """Estimate memory usage for this file during processing.""" - base_mb = self.size_bytes / (1024 * 1024) - - # Processing multipliers based on operations - if self.mime_type == 'application/pdf': - # PDF: original + extracted text + images + thumbnails - return base_mb * 3.5 - elif self.mime_type.startswith('image/'): - # Images: original + resized variants + OCR text - return base_mb * 2.5 - else: - # Other docs: original + PDF conversion + processing - return base_mb * 4.0 - -class MemoryAwareQueue: - """Memory-aware queue management system.""" - - def __init__(self, environment: str = "dev"): - self.redis_manager = get_redis_manager(environment) - self.redis_client = self.redis_manager.client - self.config = self._load_config() - - # Redis keys - self.upload_queue_key = "upload_queue" - self.processing_memory_key = "processing_memory" - self.user_quota_key = "user_quotas" - self.system_status_key = "system_status" - - logger.info(f"🧠 Memory-aware queue initialized (max: {self.config.max_total_memory_mb}MB)") - - def _load_config(self) -> MemoryConfig: - """Load memory configuration from environment.""" - return MemoryConfig( - max_total_memory_mb=int(os.getenv('QUEUE_MAX_MEMORY_MB', '2048')), - max_user_memory_mb=int(os.getenv('QUEUE_MAX_USER_MEMORY_MB', '512')), - max_file_size_mb=int(os.getenv('MAX_FILE_SIZE_MB', '100')), - memory_warning_threshold=float(os.getenv('MEMORY_WARNING_THRESHOLD', '0.8')), - memory_reject_threshold=float(os.getenv('MEMORY_REJECT_THRESHOLD', '0.95')) - ) - - def check_upload_capacity(self, user_id: str, file_size_bytes: int, - mime_type: str) -> Tuple[bool, str, Dict[str, Any]]: - """ - Check if system can accept a new upload. - - Returns: - (can_accept, message, queue_info) - """ - - # Create temporary QueuedFile to estimate memory - temp_file = QueuedFile( - file_id="temp", - user_id=user_id, - filename="temp", - size_bytes=file_size_bytes, - mime_type=mime_type, - cabinet_id="temp" - ) - - file_memory_mb = temp_file.memory_estimate_mb - - # Check file size limit - if file_size_bytes > (self.config.max_file_size_mb * 1024 * 1024): - return False, f"File too large (max: {self.config.max_file_size_mb}MB)", {} - - # Get current memory usage - current_memory = self._get_current_memory_usage() - user_memory = self._get_user_memory_usage(user_id) - - # Check user quota - if user_memory + file_memory_mb > self.config.max_user_memory_mb: - return False, f"User quota exceeded (limit: {self.config.max_user_memory_mb}MB)", { - 'user_current': user_memory, - 'user_limit': self.config.max_user_memory_mb - } - - # Check system capacity - total_after = current_memory + file_memory_mb - max_memory = self.config.max_total_memory_mb - - if total_after > (max_memory * self.config.memory_reject_threshold): - queue_info = self._get_queue_info() - return False, "System overloaded. Please try again later.", { - 'current_memory': current_memory, - 'max_memory': max_memory, - 'utilization': current_memory / max_memory, - 'queue_position': queue_info['total_queued'] + 1 - } - - # Calculate wait time estimate - wait_estimate = self._estimate_wait_time(user_id) - - status = "ready" - message = "Upload accepted" - - if total_after > (max_memory * self.config.memory_warning_threshold): - status = "busy" - message = f"System busy. Estimated wait: {wait_estimate // 60}m {wait_estimate % 60}s" - - return True, message, { - 'status': status, - 'estimated_wait_seconds': wait_estimate, - 'memory_usage': { - 'current': current_memory, - 'after_upload': total_after, - 'limit': max_memory, - 'utilization': total_after / max_memory - }, - 'user_quota': { - 'used': user_memory, - 'after_upload': user_memory + file_memory_mb, - 'limit': self.config.max_user_memory_mb - } - } - - def enqueue_file(self, file_id: str, user_id: str, filename: str, - size_bytes: int, mime_type: str, cabinet_id: str, - priority: int = 1) -> Dict[str, Any]: - """ - Add file to upload queue. - - Returns: - Queue information including position and estimated wait time - """ - - queued_file = QueuedFile( - file_id=file_id, - user_id=user_id, - filename=filename, - size_bytes=size_bytes, - mime_type=mime_type, - cabinet_id=cabinet_id, - priority=priority - ) - - # Serialize and add to Redis queue (priority queue: higher priority = lower score) - score = time.time() - (priority * 1000000) # Priority affects score significantly - - self.redis_client.zadd( - self.upload_queue_key, - {json.dumps(asdict(queued_file)): score} - ) - - # Update user quota tracking - self._update_user_quota(user_id, queued_file.memory_estimate_mb, increment=True) - - # Get queue position and wait estimate - position = self._get_queue_position(file_id) - wait_estimate = self._estimate_wait_time(user_id) - - logger.info(f"📋 Queued file {file_id} for user {user_id} (pos: {position}, wait: {wait_estimate}s)") - - return { - 'queued': True, - 'file_id': file_id, - 'queue_position': position, - 'estimated_wait_seconds': wait_estimate, - 'memory_estimate_mb': queued_file.memory_estimate_mb - } - - def dequeue_next_file(self, service_name: str) -> Optional[QueuedFile]: - """ - Get next file from queue for processing. - - Args: - service_name: The service requesting work (for capacity management) - """ - - # Check if service has capacity - service_memory = self._get_service_memory_usage(service_name) - service_limit = self._get_service_memory_limit(service_name) - - if service_memory >= service_limit: - logger.debug(f"Service {service_name} at capacity ({service_memory}/{service_limit}MB)") - return None - - # Get next item from priority queue (lowest score first) - items = self.redis_client.zrange(self.upload_queue_key, 0, 0, withscores=True) - - if not items: - return None - - file_data_json, score = items[0] - file_data = json.loads(file_data_json) - queued_file = QueuedFile(**file_data) - - # Check if this file would exceed service memory limit - if service_memory + queued_file.memory_estimate_mb > service_limit: - # Skip this file for now, try smaller ones later - logger.debug(f"File {queued_file.file_id} too large for {service_name} capacity") - return None - - # Remove from queue - self.redis_client.zrem(self.upload_queue_key, file_data_json) - - # Update tracking - self._update_user_quota(queued_file.user_id, queued_file.memory_estimate_mb, increment=False) - self._update_service_memory(service_name, queued_file.memory_estimate_mb, increment=True) - - logger.info(f"🎯 Dequeued file {queued_file.file_id} for {service_name} processing") - - return queued_file - - def complete_processing(self, service_name: str, file_id: str, memory_used_mb: float): - """Mark file processing as complete and free memory.""" - self._update_service_memory(service_name, memory_used_mb, increment=False) - logger.info(f"✅ Completed processing {file_id} in {service_name} (freed {memory_used_mb}MB)") - - def _get_current_memory_usage(self) -> float: - """Get current total memory usage across all services.""" - services = ['docling', 'tika', 'llm', 'document_analysis'] - total = 0 - - for service in services: - service_key = f"{self.processing_memory_key}:{service}" - memory = float(self.redis_client.get(service_key) or 0) - total += memory - - return total - - def _get_user_memory_usage(self, user_id: str) -> float: - """Get current memory usage for a specific user.""" - user_key = f"{self.user_quota_key}:{user_id}" - return float(self.redis_client.get(user_key) or 0) - - def _get_service_memory_usage(self, service_name: str) -> float: - """Get current memory usage for a service.""" - service_key = f"{self.processing_memory_key}:{service_name}" - return float(self.redis_client.get(service_key) or 0) - - def _get_service_memory_limit(self, service_name: str) -> float: - """Get memory limit for a service.""" - # Service-specific memory limits as percentage of total - limits = { - 'docling': 0.4, # 40% for Docling (memory-intensive) - 'tika': 0.2, # 20% for Tika - 'llm': 0.3, # 30% for LLM processing - 'document_analysis': 0.1 # 10% for document analysis - } - - percentage = limits.get(service_name, 0.1) - return self.config.max_total_memory_mb * percentage - - def _update_user_quota(self, user_id: str, memory_mb: float, increment: bool): - """Update user memory quota tracking.""" - user_key = f"{self.user_quota_key}:{user_id}" - - if increment: - self.redis_client.incrbyfloat(user_key, memory_mb) - else: - current = float(self.redis_client.get(user_key) or 0) - new_value = max(0, current - memory_mb) - self.redis_client.set(user_key, new_value) - - # Set expiration for cleanup - self.redis_client.expire(user_key, 86400) # 24 hours - - def _update_service_memory(self, service_name: str, memory_mb: float, increment: bool): - """Update service memory usage tracking.""" - service_key = f"{self.processing_memory_key}:{service_name}" - - if increment: - self.redis_client.incrbyfloat(service_key, memory_mb) - else: - current = float(self.redis_client.get(service_key) or 0) - new_value = max(0, current - memory_mb) - self.redis_client.set(service_key, new_value) - - # Set expiration for cleanup - self.redis_client.expire(service_key, 3600) # 1 hour - - def _get_queue_position(self, file_id: str) -> int: - """Get position of file in queue.""" - items = self.redis_client.zrange(self.upload_queue_key, 0, -1) - for i, item in enumerate(items): - file_data = json.loads(item) - if file_data['file_id'] == file_id: - return i + 1 - return 0 - - def _estimate_wait_time(self, user_id: str) -> int: - """Estimate wait time for user's next file.""" - # Simple estimation based on queue position and average processing time - queue_size = self.redis_client.zcard(self.upload_queue_key) - avg_processing_time = 300 # 5 minutes average - - return int(queue_size * avg_processing_time * 0.5) # Assume parallel processing - - def _get_queue_info(self) -> Dict[str, Any]: - """Get comprehensive queue information.""" - total_queued = self.redis_client.zcard(self.upload_queue_key) - current_memory = self._get_current_memory_usage() - max_memory = self.config.max_total_memory_mb - - return { - 'total_queued': total_queued, - 'memory_usage': { - 'current_mb': current_memory, - 'max_mb': max_memory, - 'utilization': current_memory / max_memory if max_memory > 0 else 0 - }, - 'status': self._determine_system_status(current_memory, max_memory) - } - - def _determine_system_status(self, current_memory: float, max_memory: float) -> str: - """Determine current system status based on memory usage.""" - utilization = current_memory / max_memory if max_memory > 0 else 0 - - if utilization >= self.config.memory_reject_threshold: - return "overloaded" - elif utilization >= self.config.memory_warning_threshold: - return "busy" - else: - return "ready" - - def get_system_status(self) -> Dict[str, Any]: - """Get comprehensive system status for monitoring.""" - queue_info = self._get_queue_info() - - # Service-specific info - services = {} - for service_name in ['docling', 'tika', 'llm', 'document_analysis']: - services[service_name] = { - 'memory_used_mb': self._get_service_memory_usage(service_name), - 'memory_limit_mb': self._get_service_memory_limit(service_name), - 'utilization': self._get_service_memory_usage(service_name) / self._get_service_memory_limit(service_name) - } - - return { - 'status': queue_info['status'], - 'queue': queue_info, - 'services': services, - 'config': asdict(self.config) - } - -# Convenience functions -def get_memory_queue(environment: str = "dev") -> MemoryAwareQueue: - """Get memory-aware queue instance.""" - return MemoryAwareQueue(environment) - -def check_upload_capacity(user_id: str, file_size: int, mime_type: str, environment: str = "dev") -> Tuple[bool, str, Dict]: - """Quick capacity check for upload.""" - queue = get_memory_queue(environment) - return queue.check_upload_capacity(user_id, file_size, mime_type) diff --git a/archive/auto_processing/pipeline_controller.py b/archive/auto_processing/pipeline_controller.py deleted file mode 100644 index adb0395..0000000 --- a/archive/auto_processing/pipeline_controller.py +++ /dev/null @@ -1,1316 +0,0 @@ -""" -Pipeline Controller for Three-Phase Document Processing Architecture - -This module coordinates the three phases of document processing: -- Phase 1: Document Structure Discovery & Analysis -- Phase 2: Parallel Content Processing Pipelines -- Phase 3: Enhanced Frontend Viewing (handled by frontend) - -Features: -- Environment variable controlled auto-processing -- Phase 1 completion detection -- Automatic Phase 2 triggering -- Intelligent retry and coordination logic -""" - -import json -import os -import uuid -import time -from typing import Dict, Any, List, Optional, Set -from pathlib import Path - -from modules.logger_tool import initialise_logger -from modules.database.supabase.utils.client import SupabaseServiceRoleClient -from modules.database.supabase.utils.storage import StorageAdmin -from modules.queue_system import ( - enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task, - enqueue_document_analysis_task, enqueue_page_images_task, - TaskPriority, get_queue -) -from modules.bundle_metadata import ( - create_standard_metadata, BundleMetadata, PipelineType, ProcessingMode, BundleType -) - -logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True) - -class DocumentPipelineController: - """ - Coordinates the three-phase document processing pipeline. - """ - - def __init__(self): - self.client = SupabaseServiceRoleClient() - self.storage = StorageAdmin() - - # Phase 1 environment variables - self.auto_tika = os.getenv('AUTO_TIKA_PROCESSING', 'true').lower() == 'true' - self.auto_page_images = os.getenv('AUTO_PAGE_IMAGES', 'true').lower() == 'true' - self.auto_document_analysis = os.getenv('AUTO_DOCUMENT_ANALYSIS', 'true').lower() == 'true' - self.auto_split_map = os.getenv('AUTO_SPLIT_MAP_GENERATION', 'true').lower() == 'true' - - # Phase 2 environment variables - self.auto_docling_ocr = os.getenv('AUTO_DOCLING_OCR', 'true').lower() == 'true' - self.auto_docling_no_ocr = os.getenv('AUTO_DOCLING_NO_OCR', 'true').lower() == 'true' - self.auto_docling_vlm = os.getenv('AUTO_DOCLING_VLM', 'false').lower() == 'true' - - # Processing granularity - self.docling_ocr_by_page = os.getenv('DOCLING_OCR_BY_PAGE', 'false').lower() == 'true' - self.docling_no_ocr_by_page = os.getenv('DOCLING_NO_OCR_BY_PAGE', 'false').lower() == 'true' - self.docling_vlm_by_page = os.getenv('DOCLING_VLM_BY_PAGE', 'true').lower() == 'true' - - # Grouping strategy - self.docling_use_split_map = os.getenv('DOCLING_USE_SPLIT_MAP', 'true').lower() == 'true' - self.docling_split_threshold = int(os.getenv('DOCLING_SPLIT_THRESHOLD', '50')) - - logger.info("Pipeline controller initialized with new bundle architecture") - - def enqueue_phase1_tasks(self, file_id: str, file_row: Dict[str, Any], - processing_path: str, processing_mime: str, - priority: TaskPriority = TaskPriority.HIGH) -> Dict[str, List[str]]: - """ - Enqueue Phase 1 tasks: Structure Discovery & Analysis - - Returns: - Dictionary mapping task types to task IDs - """ - logger.info(f"Phase 1: Starting structure discovery for file {file_id}") - - task_ids = {} - bucket = file_row['bucket'] - cabinet_id = file_row['cabinet_id'] - - # Step 1: Tika Processing (metadata extraction) - if self.auto_tika: - tika_url = os.getenv('TIKA_URL') - if tika_url: - tika_task_id = enqueue_tika_task( - file_id=file_id, - payload={ - 'bucket': bucket, - 'file_path': processing_path, - 'cabinet_id': cabinet_id, - 'mime_type': processing_mime - }, - priority=priority - ) - task_ids['tika'] = [tika_task_id] - logger.info(f"Phase 1: Enqueued Tika task {tika_task_id}") - else: - logger.warning("Phase 1: Tika enabled but TIKA_URL not configured") - - # Step 2: Frontmatter processing (lightweight document overview) - docling_url = os.getenv('DOCLING_URL') or os.getenv('NEOFS_DOCLING_URL') - if docling_url: - try: - front_pages = int(os.getenv('DOCLING_FRONTPAGES', '3')) - except Exception: - front_pages = 3 - - # Create enhanced metadata for frontmatter JSON display in UI - frontmatter_metadata = { - 'display_name': f'Document Frontmatter (p1-{front_pages})', - 'bundle_label': 'Frontmatter Analysis', - 'section_title': 'Document Frontmatter', - 'page_range': [1, front_pages], - 'page_count': front_pages, - 'bundle_type': 'frontmatter_json', - 'processing_mode': 'frontmatter_analysis', - 'pipeline': 'frontmatter_ocr', - 'is_frontmatter': True, - 'ui_category': 'document_analysis', - 'ui_order': 1, # Show first in UI - 'description': f'OCR analysis of first {front_pages} pages for document structure and metadata', - 'viewer_type': 'json' - } - - frontmatter_task_id = enqueue_docling_task( - file_id=file_id, - task_type='docling_frontmatter_json', - payload={ - 'bucket': bucket, - 'file_path': processing_path, - 'cabinet_id': cabinet_id, - 'mime_type': processing_mime, - 'config': { - 'do_ocr': True, - 'force_ocr': False, - 'image_export_mode': 'embedded', - 'ocr_engine': 'easyocr', - 'ocr_lang': 'en', - 'pdf_backend': 'dlparse_v4', - 'table_mode': 'fast', - 'target_type': 'inbody', - 'to_formats': 'json', - 'page_range': [1, front_pages] - }, - 'artefact_extra': frontmatter_metadata, - 'depends_on': task_ids.get('tika', []) - }, - priority=priority, - timeout=int(os.getenv('DOCLING_FRONTMATTER_TIMEOUT', '1800')) - ) - task_ids['frontmatter'] = [frontmatter_task_id] - logger.info(f"Phase 1: Enqueued frontmatter task {frontmatter_task_id}") - - # Step 3: Document Structure Analysis (LLM-enhanced hierarchy) - if self.auto_document_analysis: - analysis_task_id = enqueue_docling_task( - file_id=file_id, - task_type='document_structure_analysis', - payload={ - 'bucket': bucket, - 'file_path': processing_path, - 'cabinet_id': cabinet_id, - 'mime_type': processing_mime, - 'config': { - 'target_type': 'inbody', - 'to_formats': 'json', - 'do_ocr': False, - 'force_ocr': False - }, - 'depends_on': task_ids.get('frontmatter', []) - }, - priority=priority, - timeout=int(os.getenv('DOCUMENT_ANALYSIS_TIMEOUT', '300')) - ) - task_ids['document_analysis'] = [analysis_task_id] - logger.info(f"Phase 1: Enqueued document analysis task {analysis_task_id}") - - # Step 4: Split Map Generation (definitive section boundaries) - if self.auto_split_map: - split_map_task_id = enqueue_split_map_task( - file_id=file_id, - payload={ - 'depends_on': task_ids.get('frontmatter', []) + task_ids.get('document_analysis', []) - }, - priority=TaskPriority.NORMAL - ) - task_ids['split_map'] = [split_map_task_id] - logger.info(f"Phase 1: Enqueued split map task {split_map_task_id}") - - # Step 5: Page Images Generation (for frontend viewing) - if self.auto_page_images: - page_images_task_id = enqueue_docling_task( - file_id=file_id, - task_type='generate_page_images', - payload={ - 'bucket': bucket, - 'file_path': processing_path, - 'cabinet_id': cabinet_id, - 'mime_type': processing_mime, - 'config': {}, - 'depends_on': task_ids.get('document_analysis', []) - }, - priority=TaskPriority.NORMAL, - timeout=int(os.getenv('PAGE_IMAGES_TIMEOUT', '1800')) - ) - task_ids['page_images'] = [page_images_task_id] - logger.info(f"Phase 1: Enqueued page images task {page_images_task_id}") - - # Bundle tasks are now directly enqueued by split_map task completion - - total_tasks = sum(len(task_list) for task_list in task_ids.values()) - logger.info(f"Phase 1: Enqueued {total_tasks} tasks for file {file_id}: {list(task_ids.keys())}") - - return task_ids - - def check_phase1_completion(self, file_id: str) -> Dict[str, Any]: - """ - Check if Phase 1 is complete for a given file. - - Returns: - Dictionary with completion status and details - """ - logger.debug(f"Checking Phase 1 completion for file {file_id}") - - # Get all artefacts for the file - artefacts_result = self.client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute() - artefacts = artefacts_result.data or [] - - # Check for required Phase 1 artefacts - phase1_checks = { - 'tika_metadata': False, - 'frontmatter': False, - 'document_analysis': False, - 'split_map': False, - 'page_images': False - } - - for artefact in artefacts: - if artefact['status'] == 'completed': - artefact_type = artefact['type'] - if artefact_type == 'tika_json': - phase1_checks['tika_metadata'] = True - elif artefact_type == 'docling_frontmatter_json': - phase1_checks['frontmatter'] = True - elif artefact_type == 'document_outline_hierarchy': - phase1_checks['document_analysis'] = True - elif artefact_type == 'split_map_json': - phase1_checks['split_map'] = True - elif artefact_type == 'page_images': - phase1_checks['page_images'] = True - - # Determine completion based on enabled features - required_checks = [] - if self.auto_tika: - required_checks.append('tika_metadata') - required_checks.append('frontmatter') # Always required for basic processing - if self.auto_document_analysis: - required_checks.append('document_analysis') - if self.auto_split_map: - required_checks.append('split_map') - if self.auto_page_images: - required_checks.append('page_images') - - completed_required = [check for check in required_checks if phase1_checks[check]] - is_complete = len(completed_required) == len(required_checks) - - return { - 'file_id': file_id, - 'is_complete': is_complete, - 'completed_components': completed_required, - 'required_components': required_checks, - 'all_checks': phase1_checks, - 'completion_percentage': (len(completed_required) / max(len(required_checks), 1)) * 100 - } - - def enqueue_sequential_docling_pipelines(self, file_id: str, file_data: Dict[str, Any]) -> Dict[str, Any]: - """ - Enqueue sequential docling pipelines with dependencies: no_ocr → ocr → vlm - - Each pipeline depends on ALL tasks from the previous pipeline completing. - This replaces the complex Phase 2 coordinator with simple task dependencies. - - Args: - file_id: The file ID to process - file_data: File processing information (bucket, path, etc.) - - Returns: - Dictionary with enqueued pipeline information - """ - logger.info(f"Enqueueing sequential docling pipelines for file {file_id}") - - bucket = file_data['bucket'] - file_path = file_data['file_path'] - cabinet_id = file_data['cabinet_id'] - mime_type = file_data['mime_type'] - - # Base configuration shared by all pipelines (pipeline-specific options added per pipeline) - base_config = { - 'to_formats': ['json', 'html', 'text', 'md', 'doctags'], - 'image_export_mode': 'referenced', - 'target_type': 'zip', - 'pdf_backend': os.getenv('DOCLING_PDF_BACKEND', 'dlparse_v4'), - 'include_images': os.getenv('DOCLING_INCLUDE_IMAGES', 'true').lower() == 'true', - 'images_scale': float(os.getenv('DOCLING_IMAGES_SCALE', '2.0')), - 'ocr_engine': os.getenv('OCR_ENGINE', 'easyocr'), - 'ocr_lang': os.getenv('OCR_LANG', 'en'), - 'picture_description_area_threshold': float(os.getenv('DOCLING_PICTURE_DESCRIPTION_AREA_THRESHOLD', '0.05')) - } - - # Determine the pipeline execution order: no_ocr → ocr → vlm - pipeline_order = [] - if self.auto_docling_no_ocr: - pipeline_order.append('no_ocr') - if self.auto_docling_ocr: - pipeline_order.append('ocr') - if self.auto_docling_vlm: - pipeline_order.append('vlm') - - if not pipeline_order: - logger.info(f"No docling pipelines enabled for file {file_id}") - return { - 'file_id': file_id, - 'enqueued_pipelines': {}, - 'total_tasks': 0, - 'sequential_order': [], - 'message': 'No docling pipelines enabled' - } - - logger.info(f"Sequential pipeline order for file {file_id}: {pipeline_order}") - - # Enqueue all pipelines with proper dependencies - enqueued_pipelines = {} - all_task_ids = {} - - for i, pipeline_type in enumerate(pipeline_order): - # Determine dependencies: depend on ALL tasks from previous pipeline - depends_on = [] - if i > 0: - previous_pipeline = pipeline_order[i - 1] - depends_on = all_task_ids.get(previous_pipeline, []) - logger.info(f"Pipeline {pipeline_type} will depend on {len(depends_on)} tasks from {previous_pipeline}: {depends_on[:3]}..." if len(depends_on) > 3 else f"Pipeline {pipeline_type} will depend on {len(depends_on)} tasks from {previous_pipeline}: {depends_on}") - else: - logger.info(f"Pipeline {pipeline_type} has no dependencies (first pipeline)") - - # Create pipeline tasks - pipeline_result = self._enqueue_single_pipeline_with_deps( - file_id, pipeline_type, base_config, bucket, file_path, cabinet_id, mime_type, depends_on - ) - - if pipeline_result: - enqueued_pipelines[pipeline_type] = pipeline_result - all_task_ids[pipeline_type] = pipeline_result['task_ids'] - logger.info(f"Enqueued {pipeline_type} pipeline with {len(pipeline_result['task_ids'])} tasks") - - total_tasks = sum(len(p.get('task_ids', [])) for p in enqueued_pipelines.values()) - logger.info(f"Successfully enqueued {len(pipeline_order)} sequential pipelines with {total_tasks} total tasks for file {file_id}") - - return { - 'file_id': file_id, - 'enqueued_pipelines': enqueued_pipelines, - 'total_tasks': total_tasks, - 'sequential_order': pipeline_order - } - - def _determine_processing_mode(self, file_id: str, pipeline_type: str) -> tuple[str, dict]: - """ - Determine how to process document based on settings and characteristics. - - Implements corrected decision logic: - 1. Priority 1: Respect explicit BY_PAGE preference - 2. Priority 2: Check size threshold for auto-processing - 3. Priority 3: Use split map for large documents - 4. Priority 4: Fallback chunking - - Returns: - Tuple of (processing_mode, processing_data) - """ - # Check BY_PAGE flags first (highest priority) - by_page = self._get_by_page_setting(pipeline_type) - if by_page: - logger.info(f"BY_PAGE enabled for {pipeline_type} - creating page-based bundles regardless of document size") - return "split_by_pages", self._get_page_ranges(file_id) - - # Get document characteristics - page_count = self._get_page_count(file_id) - - # Apply size threshold logic - if page_count < self.docling_split_threshold: - logger.info(f"Document has {page_count} pages (< {self.docling_split_threshold} threshold) - creating single bundle") - return "whole_document", {} - - # Check for split map availability - split_map = self._load_split_map_if_needed(file_id) - if split_map and self.docling_use_split_map: - logger.info(f"Document has {page_count} pages (>= {self.docling_split_threshold} threshold) with split map - creating section-based bundles") - return "split_by_sections", split_map - else: - logger.error(f"Document has {page_count} pages (>= {self.docling_split_threshold} threshold) without split map - ERROR") - return "error" - - def _get_by_page_setting(self, pipeline_type: str) -> bool: - """Get BY_PAGE setting for the specified pipeline type.""" - if pipeline_type == 'no_ocr': - return self.docling_no_ocr_by_page - elif pipeline_type == 'ocr': - return self.docling_ocr_by_page - elif pipeline_type == 'vlm': - return self.docling_vlm_by_page - return False - - def _get_pipeline_specific_config(self, pipeline_type: str) -> Dict[str, Any]: - """Get pipeline-specific configuration options from environment variables.""" - if pipeline_type == 'no_ocr': - return { - 'table_mode': os.getenv('DOCLING_NO_OCR_TABLE_MODE', 'fast'), - 'table_cell_matching': os.getenv('DOCLING_NO_OCR_TABLE_CELL_MATCHING', 'false').lower() == 'true', - 'do_formula_enrichment': os.getenv('DOCLING_NO_OCR_DO_FORMULA_ENRICHMENT', 'false').lower() == 'true', - 'do_code_enrichment': os.getenv('DOCLING_NO_OCR_DO_CODE_ENRICHMENT', 'false').lower() == 'true', - 'do_table_structure': os.getenv('DOCLING_NO_OCR_DO_TABLE_STRUCTURE', 'true').lower() == 'true', - 'do_picture_classification': os.getenv('DOCLING_NO_OCR_DO_PICTURE_CLASSIFICATION', 'false').lower() == 'true', - 'do_picture_description': os.getenv('DOCLING_NO_OCR_DO_PICTURE_DESCRIPTION', 'false').lower() == 'true' - } - elif pipeline_type == 'ocr': - return { - 'table_mode': os.getenv('DOCLING_OCR_TABLE_MODE', 'accurate'), - 'table_cell_matching': os.getenv('DOCLING_OCR_TABLE_CELL_MATCHING', 'true').lower() == 'true', - 'do_formula_enrichment': os.getenv('DOCLING_OCR_DO_FORMULA_ENRICHMENT', 'true').lower() == 'true', - 'do_code_enrichment': os.getenv('DOCLING_OCR_DO_CODE_ENRICHMENT', 'true').lower() == 'true', - 'do_table_structure': os.getenv('DOCLING_OCR_DO_TABLE_STRUCTURE', 'true').lower() == 'true', - 'do_picture_classification': os.getenv('DOCLING_OCR_DO_PICTURE_CLASSIFICATION', 'false').lower() == 'true', - 'do_picture_description': os.getenv('DOCLING_OCR_DO_PICTURE_DESCRIPTION', 'false').lower() == 'true' - } - elif pipeline_type == 'vlm': - return { - 'table_mode': os.getenv('DOCLING_VLM_TABLE_MODE', 'accurate'), - 'table_cell_matching': os.getenv('DOCLING_VLM_TABLE_CELL_MATCHING', 'true').lower() == 'true', - 'do_formula_enrichment': os.getenv('DOCLING_VLM_DO_FORMULA_ENRICHMENT', 'false').lower() == 'true', - 'do_code_enrichment': os.getenv('DOCLING_VLM_DO_CODE_ENRICHMENT', 'false').lower() == 'true', - 'do_table_structure': os.getenv('DOCLING_VLM_DO_TABLE_STRUCTURE', 'true').lower() == 'true', - 'do_picture_classification': os.getenv('DOCLING_VLM_DO_PICTURE_CLASSIFICATION', 'true').lower() == 'true', - 'do_picture_description': os.getenv('DOCLING_VLM_DO_PICTURE_DESCRIPTION', 'true').lower() == 'true' - } - else: - # Default config for unknown pipeline types - return { - 'table_mode': 'fast', - 'table_cell_matching': False, - 'do_formula_enrichment': False, - 'do_code_enrichment': False, - 'do_table_structure': True, - 'do_picture_classification': False, - 'do_picture_description': False - } - - def _get_page_count(self, file_id: str) -> int: - """Get page count for the file from existing artefacts (first Tika).""" - logger.info(f"🔍 PAGE COUNT: Starting page count detection for file {file_id}") - - try: - # Try to get page count from existing artefacts, excluding frontmatter (partial document) - artefacts = self.client.supabase.table('document_artefacts').select('type,extra').eq('file_id', file_id).execute() - artefact_types = [art.get('type', 'unknown') for art in artefacts.data or []] - logger.info(f"🔍 PAGE COUNT: Found {len(artefacts.data or [])} artefacts for file {file_id}: {artefact_types}") - - for art in artefacts.data or []: - art_type = art.get('type', 'unknown') - extra = art.get('extra', {}) - logger.info(f"🔍 PAGE COUNT: Checking artefact type '{art_type}' for file {file_id}") - - # Skip frontmatter artefacts as they only contain partial page counts - if art_type == 'docling_frontmatter_json': - logger.info(f"🔍 PAGE COUNT: Skipping frontmatter artefact (partial page count) for file {file_id}") - continue - - # Also skip docling_json artefacts that are from frontmatter processing - if art_type == 'docling_json' and extra.get('is_frontmatter', False): - logger.info(f"🔍 PAGE COUNT: Skipping frontmatter-derived docling_json artefact (partial page count) for file {file_id}") - continue - - # Also skip docling_json artefacts that have frontmatter-related pipeline info - if art_type == 'docling_json' and extra.get('pipeline') == 'frontmatter_ocr': - logger.info(f"🔍 PAGE COUNT: Skipping frontmatter pipeline docling_json artefact (partial page count) for file {file_id}") - continue - - if 'page_count' in extra: - page_count = int(extra['page_count']) - logger.info(f"✅ PAGE COUNT: Found page count {page_count} from {art_type} artefact for file {file_id}") - return page_count - else: - logger.info(f"🔍 PAGE COUNT: No page_count in {art_type} artefact for file {file_id}") - - logger.info(f"🔍 PAGE COUNT: No artefacts with page_count found, trying Tika JSON parsing for file {file_id}") - - # Try to get page count from Tika JSON (most reliable source) - tika_arts = self.client.supabase.table('document_artefacts') \ - .select('rel_path') \ - .eq('file_id', file_id) \ - .eq('type', 'tika_json') \ - .execute() - - if tika_arts.data: - logger.info(f"🔍 PAGE COUNT: Found Tika JSON artefact, parsing content for file {file_id}") - file_info = self.client.supabase.table('files').select('bucket').eq('id', file_id).single().execute() - if file_info.data: - tika_data = self.storage.download_file(file_info.data['bucket'], tika_arts.data[0]['rel_path']) - import json - tika_json = json.loads(tika_data.decode('utf-8')) - - # Check common Tika page count keys in top level and metadata - logger.info(f"🔍 PAGE COUNT: Checking Tika JSON keys for page count in file {file_id}") - - # First check metadata section (most common location) - metadata = tika_json.get('metadata', {}) - for key in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount", "pdf:pagecount", "meta:page-count", "pdfa:PDFVersion"): - # Check both exact key and lowercase version in metadata - value = metadata.get(key) or metadata.get(key.lower()) - if value is not None: - try: - page_count = int(value) - if page_count > 0: - logger.info(f"✅ PAGE COUNT: Found page count {page_count} from Tika metadata key '{key}' for file {file_id}") - return page_count - except Exception as parse_error: - logger.info(f"🔍 PAGE COUNT: Could not parse value '{value}' from Tika metadata key '{key}': {parse_error}") - continue - - # Also check top level (fallback) - for key in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount", "pdf:pagecount"): - value = tika_json.get(key) or tika_json.get(key.lower()) - if value is not None: - try: - page_count = int(value) - if page_count > 0: - logger.info(f"✅ PAGE COUNT: Found page count {page_count} from Tika JSON top-level key '{key}' for file {file_id}") - return page_count - except Exception as parse_error: - logger.info(f"🔍 PAGE COUNT: Could not parse value '{value}' from Tika top-level key '{key}': {parse_error}") - continue - - # Debug: Show available keys to help diagnose issues - logger.info(f"🔍 PAGE COUNT: Available Tika JSON top-level keys: {list(tika_json.keys())}") - if 'metadata' in tika_json: - logger.info(f"🔍 PAGE COUNT: Available Tika metadata keys: {list(metadata.keys())}") - - logger.warning(f"🔍 PAGE COUNT: No valid page count keys found in Tika JSON for file {file_id}") - else: - logger.warning(f"🔍 PAGE COUNT: Could not get file info for Tika JSON parsing for file {file_id}") - else: - logger.warning(f"🔍 PAGE COUNT: No Tika JSON artefact found for file {file_id}") - - # Final fallback - try to get it directly from PDF using PyMuPDF - logger.warning(f"🔍 PAGE COUNT: Trying direct PDF parsing as final fallback for file {file_id}") - return self._get_page_count_direct_pdf(file_id) - - except Exception as e: - logger.error(f"❌ PAGE COUNT: Error getting page count for file {file_id}: {e}, defaulting to {self.docling_split_threshold + 1}") - return self.docling_split_threshold + 1 - - def _get_page_count_direct_pdf(self, file_id: str) -> int: - """Final fallback: Get page count directly from PDF using PyMuPDF.""" - try: - # Get file info from database - file_info = self.client.supabase.table('files').select('bucket,path,cabinet_id').eq('id', file_id).single().execute() - if not file_info.data: - logger.warning(f"🔍 PAGE COUNT: Could not find file info for {file_id}, defaulting to threshold + 1") - return self.docling_split_threshold + 1 - - file_row = file_info.data - bucket = file_row['bucket'] - file_path = file_row['path'] - - # Download and read PDF directly with PyMuPDF - logger.info(f"🔍 PAGE COUNT: Reading PDF directly from storage for file {file_id}") - pdf_bytes = self.storage.download_file(bucket, file_path) - - import fitz # PyMuPDF - doc = fitz.open(stream=pdf_bytes, filetype="pdf") - page_count = len(doc) - doc.close() - - logger.info(f"✅ PAGE COUNT: Direct PDF reading found {page_count} pages for file {file_id}") - return page_count - - except Exception as e: - logger.error(f"❌ PAGE COUNT: Direct PDF reading failed for file {file_id}: {e}, defaulting to {self.docling_split_threshold + 1}") - return self.docling_split_threshold + 1 - - def _get_page_ranges(self, file_id: str) -> dict: - """Get page ranges for page-based processing.""" - page_count = self._get_page_count(file_id) - return { - 'pages': list(range(1, page_count + 1)), - 'total_pages': page_count - } - - def _load_split_map_if_needed(self, file_id: str) -> Optional[Dict[str, Any]]: - """Load split map if needed for processing decisions.""" - try: - file_info = self.client.supabase.table('files').select('bucket').eq('id', file_id).single().execute() - if not file_info.data: - return None - return self._load_split_map(file_info.data['bucket'], file_id) - except Exception: - return None - - def _create_chunked_ranges(self, page_count: int) -> dict: - """Create chunked page ranges for large documents without split maps.""" - chunk_size = max(10, self.docling_split_threshold // 4) # 1/4 of threshold, min 10 pages - chunks = [] - - for start_page in range(1, page_count + 1, chunk_size): - end_page = min(start_page + chunk_size - 1, page_count) - chunks.append({ - 'start': start_page, - 'end': end_page, - 'title': f'Pages {start_page}-{end_page}' - }) - - return { - 'entries': chunks, - 'total_chunks': len(chunks) - } - - def _enqueue_single_pipeline_with_deps(self, file_id: str, pipeline_type: str, base_config: Dict[str, Any], - bucket: str, file_path: str, cabinet_id: str, mime_type: str, - depends_on: List[str]) -> Optional[Dict[str, Any]]: - """Enqueue a single pipeline with dependencies on previous pipeline tasks.""" - - group_id = str(uuid.uuid4()) - - # Get pipeline-specific configuration options - pipeline_specific_config = self._get_pipeline_specific_config(pipeline_type) - - if pipeline_type == 'no_ocr': - config = { - **base_config, - **pipeline_specific_config, - 'do_ocr': False, - 'force_ocr': False, - 'pipeline': 'standard' - } - logger.info(f"NO_OCR pipeline config: table_mode={config['table_mode']}, " - f"formula_enrichment={config['do_formula_enrichment']}, " - f"code_enrichment={config['do_code_enrichment']}") - elif pipeline_type == 'ocr': - config = { - **base_config, - **pipeline_specific_config, - 'do_ocr': True, - 'force_ocr': False, - 'pipeline': 'standard' - } - logger.info(f"OCR pipeline config: table_mode={config['table_mode']}, " - f"formula_enrichment={config['do_formula_enrichment']}, " - f"code_enrichment={config['do_code_enrichment']}") - elif pipeline_type == 'vlm': - config = { - **base_config, - **pipeline_specific_config, - 'do_ocr': False, - 'force_ocr': False, - 'pipeline': 'vlm', - 'vlm_pipeline_model': os.getenv('DOCLING_VLM_MODEL', 'smoldocling') - } - logger.info(f"VLM pipeline config: table_mode={config['table_mode']}, " - f"picture_classification={config['do_picture_classification']}, " - f"picture_description={config['do_picture_description']}") - else: - logger.error(f"Unknown pipeline type: {pipeline_type}") - return None - - # Determine processing mode using corrected logic - processing_mode, processing_data = self._determine_processing_mode(file_id, pipeline_type) - - # Enqueue single bundle task with dependencies - task_id = self._enqueue_bundle_task_with_deps( - file_id, pipeline_type, group_id, config, processing_mode, processing_data, - bucket, file_path, cabinet_id, mime_type, depends_on - ) - - return { - 'group_id': group_id, - 'task_ids': [task_id] if task_id else [], - 'task_count': 1 if task_id else 0, - 'processing_mode': processing_mode, - 'processing_data': processing_data - } - - def _enqueue_bundle_task_with_deps(self, file_id: str, pipeline_type: str, group_id: str, - config: Dict[str, Any], processing_mode: str, processing_data: dict, - bucket: str, file_path: str, cabinet_id: str, mime_type: str, - depends_on: List[str]) -> Optional[str]: - """ - Enqueue a single bundle task that handles processing internally based on mode. - - This replaces the old approach of creating multiple individual tasks. - """ - from modules.queue_system import enqueue_docling_task, TaskPriority - from modules.bundle_metadata import create_standard_metadata - - # Map processing modes to bundle types and task types - if processing_mode == "whole_document": - task_type = 'docling_bundle' - bundle_type = 'whole_document' - else: - task_type = 'docling_bundle_split' - bundle_type = processing_mode - - # Create bundle metadata with correct processing mode mapping - if processing_mode == "whole_document": - bundle_processing_mode = "whole_document" - elif processing_mode.startswith("split_by_"): - # For split modes, map to the appropriate bundle metadata mode - if processing_mode == "split_by_pages": - bundle_processing_mode = "pages" - elif processing_mode == "split_by_sections": - bundle_processing_mode = "sections" - elif processing_mode == "split_by_chunks": - bundle_processing_mode = "chunks" - else: - bundle_processing_mode = processing_mode.replace('split_by_', '') - else: - bundle_processing_mode = processing_mode - - bundle_metadata = create_standard_metadata( - file_id=file_id, - pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), - processing_mode=bundle_processing_mode, - config=config, - group_id=group_id, - producer="auto_phase2" - ) - - # Create task payload with new bundle architecture - payload = { - 'bucket': bucket, - 'file_path': file_path, - 'cabinet_id': cabinet_id, - 'mime_type': mime_type, - 'config': config, - 'processing_mode': processing_mode, - 'processing_data': processing_data, - 'bundle_metadata': bundle_metadata.to_artefact_extra(), - 'depends_on': depends_on - } - - # Determine timeout based on processing complexity - if processing_mode == "whole_document": - timeout = 7200 # 2 hours for whole document - elif processing_mode == "split_by_pages": - # Estimate based on page count - page_count = processing_data.get('total_pages', 50) - timeout = min(14400, max(3600, page_count * 60)) # 1-4 hours based on pages - else: - # Section or chunk based processing - section_count = len(processing_data.get('entries', [])) - timeout = min(10800, max(3600, section_count * 300)) # 1-3 hours based on sections - - logger.info(f"Enqueuing {task_type} task for {pipeline_type} pipeline: {processing_mode} (timeout: {timeout}s)") - - try: - task_id = enqueue_docling_task( - file_id=file_id, - task_type=task_type, - payload=payload, - priority=TaskPriority.NORMAL, - timeout=timeout - ) - - logger.info(f"Successfully enqueued {task_type} task {task_id} for {pipeline_type} pipeline") - return task_id - - except Exception as e: - logger.error(f"Failed to enqueue bundle task for {pipeline_type} pipeline: {e}") - return None - - def trigger_phase2_pipelines(self, file_id: str, file_data: Dict[str, Any]) -> Dict[str, Any]: - """ - Trigger Phase 2 sequential processing pipelines based on environment configuration. - - Pipelines run in order: no_ocr → ocr → vlm (depending on what's enabled). - Only the first pipeline starts immediately; others are triggered when the previous completes. - - Args: - file_id: The file ID to process - file_data: File processing information (bucket, path, etc.) - - Returns: - Dictionary with triggered pipeline information - """ - logger.info(f"Phase 2: Starting sequential content processing for file {file_id}") - - triggered_pipelines = {} - bucket = file_data['bucket'] - file_path = file_data['file_path'] - cabinet_id = file_data['cabinet_id'] - mime_type = file_data['mime_type'] - - # Base configuration for all pipelines (DEPRECATED METHOD - use enqueue_sequential_docling_pipelines) - base_config = { - 'to_formats': ['json', 'html', 'text', 'md', 'doctags'], - 'image_export_mode': 'referenced', - 'target_type': 'zip', - 'pdf_backend': os.getenv('DOCLING_PDF_BACKEND', 'dlparse_v4'), - 'include_images': os.getenv('DOCLING_INCLUDE_IMAGES', 'true').lower() == 'true', - 'images_scale': float(os.getenv('DOCLING_IMAGES_SCALE', '2.0')), - 'ocr_engine': os.getenv('OCR_ENGINE', 'easyocr'), - 'ocr_lang': os.getenv('OCR_LANG', 'en'), - 'picture_description_area_threshold': float(os.getenv('DOCLING_PICTURE_DESCRIPTION_AREA_THRESHOLD', '0.05')) - } - - # Determine the pipeline execution order: no_ocr → ocr → vlm - pipeline_order = [] - if self.auto_docling_no_ocr: - pipeline_order.append('no_ocr') - if self.auto_docling_ocr: - pipeline_order.append('ocr') - if self.auto_docling_vlm: - pipeline_order.append('vlm') - - if not pipeline_order: - logger.info(f"Phase 2: No pipelines enabled for file {file_id}") - return { - 'file_id': file_id, - 'triggered_pipelines': {}, - 'total_tasks': 0, - 'sequential_order': [], - 'message': 'No Phase 2 pipelines enabled' - } - - logger.info(f"Phase 2: Sequential pipeline order for file {file_id}: {pipeline_order}") - logger.warning(f"trigger_phase2_pipelines is deprecated - use enqueue_sequential_docling_pipelines for new implementations") - - # For backward compatibility, delegate to the new method - return self.enqueue_sequential_docling_pipelines(file_id, file_data) - - def _start_single_pipeline(self, file_id: str, pipeline_type: str, base_config: Dict[str, Any], - bucket: str, file_path: str, cabinet_id: str, mime_type: str) -> Optional[Dict[str, Any]]: - """Start a single pipeline of the specified type.""" - - if pipeline_type == 'no_ocr': - group_id = str(uuid.uuid4()) - config = { - **base_config, - 'do_ocr': False, - 'force_ocr': False, - 'pipeline': 'standard' - } - tasks = self._enqueue_pipeline( - file_id, 'no_ocr', group_id, config, - bucket, file_path, cabinet_id, mime_type, - by_page=self.docling_no_ocr_by_page - ) - return { - 'group_id': group_id, - 'task_count': len(tasks), - 'by_page': self.docling_no_ocr_by_page - } - - elif pipeline_type == 'ocr': - group_id = str(uuid.uuid4()) - config = { - **base_config, - 'do_ocr': True, - 'ocr_engine': os.getenv('OCR_ENGINE', 'easyocr'), - 'force_ocr': False, - 'pipeline': 'standard' - } - tasks = self._enqueue_pipeline( - file_id, 'ocr', group_id, config, - bucket, file_path, cabinet_id, mime_type, - by_page=self.docling_ocr_by_page - ) - return { - 'group_id': group_id, - 'task_count': len(tasks), - 'by_page': self.docling_ocr_by_page - } - - elif pipeline_type == 'vlm': - group_id = str(uuid.uuid4()) - config = { - **base_config, - 'do_ocr': False, - 'force_ocr': False, - 'pipeline': 'vlm', - 'vlm_pipeline_model': os.getenv('DOCLING_VLM_MODEL', 'smoldocling') - } - tasks = self._enqueue_pipeline( - file_id, 'vlm', group_id, config, - bucket, file_path, cabinet_id, mime_type, - by_page=self.docling_vlm_by_page - ) - return { - 'group_id': group_id, - 'task_count': len(tasks), - 'by_page': self.docling_vlm_by_page - } - - else: - logger.error(f"Unknown pipeline type: {pipeline_type}") - return None - -# continue_sequential_pipeline method removed - task dependencies now handle sequential execution - - def _load_split_map(self, bucket: str, file_id: str) -> Optional[Dict[str, Any]]: - """Load split map data for a file.""" - try: - arts = self.client.supabase.table('document_artefacts') \ - .select('id,type,rel_path') \ - .eq('file_id', file_id).eq('type', 'split_map_json') \ - .order('created_at', desc=True).limit(1).execute().data or [] - if not arts: - return None - art = arts[0] - raw = self.storage.download_file(bucket, art['rel_path']) - import json as _json - return _json.loads(raw.decode('utf-8')) - except Exception: - return None - - def _enqueue_pipeline(self, file_id: str, pipeline_type: str, group_id: str, config: Dict[str, Any], - bucket: str, file_path: str, cabinet_id: str, mime_type: str, - by_page: bool = False) -> List[str]: - """Enqueue tasks for a specific pipeline (OCR/No-OCR/VLM)""" - - task_ids = [] - - if by_page: - # Process each page individually, then group by split map sections - logger.info(f"Phase 2: Enqueueing {pipeline_type} pipeline by page for file {file_id}") - - split_map = self._load_split_map(bucket, file_id) - if split_map: - entries = split_map.get('entries', []) - for section_idx, entry in enumerate(entries, 1): - start_page = int(entry.get('start_page', 1)) - end_page = int(entry.get('end_page', start_page)) - section_title = entry.get('title', f'Section {section_idx}') - - if pipeline_type == 'vlm': - # VLM uses specialized page processing - section_task_id = enqueue_docling_task( - file_id=file_id, - task_type='vlm_section_page_bundle', - payload={ - 'section_idx': section_idx, - 'start_page': start_page, - 'end_page': end_page, - 'section_title': section_title, - 'vlm_group_id': group_id, - 'vlm_model': config.get('vlm_pipeline_model', 'smoldocling'), - 'base_config': config, - 'total_sections': len(entries), - 'producer': 'auto_phase2' - }, - priority=TaskPriority.NORMAL, - timeout=3600 - ) - task_ids.append(section_task_id) - else: - # OCR/No-OCR by page processing (process each page in section individually) - for page_num in range(start_page, end_page + 1): - page_config = { - **config, - 'page_range': [page_num, page_num] - } - - # Create standardized bundle metadata - page_metadata = create_standard_metadata( - file_id=file_id, - pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), - processing_mode="individual_pages", - config=page_config, - group_id=group_id, - split_order=section_idx, - split_total=len(entries), - split_heading=section_title, - page_range=[page_num, page_num], - producer="auto_phase2" - ) - - # Add legacy fields for backward compatibility - artefact_extra = page_metadata.to_artefact_extra() - artefact_extra.update({ - 'section_idx': section_idx, - 'section_title': section_title, - 'page_number': page_num, - }) - - page_task_id = enqueue_docling_task( - file_id=file_id, - task_type='canonical_docling_json', - payload={ - 'bucket': bucket, - 'file_path': file_path, - 'cabinet_id': cabinet_id, - 'mime_type': mime_type, - 'config': page_config, - 'artefact_extra': artefact_extra - }, - priority=TaskPriority.NORMAL, - timeout=1800 - ) - task_ids.append(page_task_id) - else: - logger.warning(f"Phase 2: No split map found for by-page processing of file {file_id}") - return [] - - elif self.docling_use_split_map: - # Process by split map sections - logger.info(f"Phase 2: Enqueueing {pipeline_type} pipeline by split map sections for file {file_id}") - - split_map = self._load_split_map(bucket, file_id) - if split_map: - entries = split_map.get('entries', []) - - # Normalize and sort entries by start_page - normalized_entries = [] - for entry in entries: - try: - start_page = int(entry.get('start_page', 1)) - end_page = int(entry.get('end_page', start_page)) - title = entry.get('title') or entry.get('label') or '' - if end_page < start_page: - end_page = start_page - normalized_entries.append({ - 'start': start_page, - 'end': end_page, - 'title': title - }) - except Exception: - continue - - normalized_entries.sort(key=lambda x: x['start']) - - # Create tasks for each section - for i, entry in enumerate(normalized_entries, 1): - section_config = { - **config, - 'page_range': [entry['start'], entry['end']] - } - - # Create standardized bundle metadata for section - section_metadata = create_standard_metadata( - file_id=file_id, - pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), - processing_mode="split_sections", - config=section_config, - group_id=group_id, - split_order=i, - split_total=len(normalized_entries), - split_heading=entry['title'] or f'Section {i}', - page_range=[entry['start'], entry['end']], - producer="auto_phase2" - ) - - section_task_id = enqueue_docling_task( - file_id=file_id, - task_type='canonical_docling_json', - payload={ - 'bucket': bucket, - 'file_path': file_path, - 'cabinet_id': cabinet_id, - 'mime_type': mime_type, - 'config': section_config, - 'artefact_extra': section_metadata.to_artefact_extra() - }, - priority=TaskPriority.NORMAL, - timeout=3600 - ) - task_ids.append(section_task_id) - else: - logger.warning(f"Phase 2: No split map found for section-based processing of file {file_id}") - return [] - - else: - # Process whole document - logger.info(f"Phase 2: Enqueueing {pipeline_type} pipeline for whole document {file_id}") - - # Create standardized bundle metadata for whole document - whole_doc_metadata = create_standard_metadata( - file_id=file_id, - pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), - processing_mode="whole_document", - config=config, - group_id=group_id, - producer="auto_phase2" - ) - - task_id = enqueue_docling_task( - file_id=file_id, - task_type='canonical_docling_json', - payload={ - 'bucket': bucket, - 'file_path': file_path, - 'cabinet_id': cabinet_id, - 'mime_type': mime_type, - 'config': config, - 'artefact_extra': whole_doc_metadata.to_artefact_extra() - }, - priority=TaskPriority.NORMAL, - timeout=7200 - ) - task_ids.append(task_id) - - logger.info(f"Phase 2: Enqueued {len(task_ids)} tasks for {pipeline_type} pipeline") - return task_ids - - def _enqueue_pipeline_with_deps(self, file_id: str, pipeline_type: str, group_id: str, config: Dict[str, Any], - bucket: str, file_path: str, cabinet_id: str, mime_type: str, - by_page: bool = False, depends_on: List[str] = None) -> List[str]: - """Enqueue tasks for a specific pipeline with dependencies""" - - if depends_on is None: - depends_on = [] - - task_ids = [] - - if by_page: - # Process each page individually, then group by split map sections - logger.info(f"Enqueueing {pipeline_type} pipeline by page for file {file_id} with {len(depends_on)} dependencies") - - split_map = self._load_split_map(bucket, file_id) - if split_map: - entries = split_map.get('entries', []) - for section_idx, entry in enumerate(entries, 1): - start_page = int(entry.get('start_page', 1)) - end_page = int(entry.get('end_page', start_page)) - section_title = entry.get('title', f'Section {section_idx}') - - if pipeline_type == 'vlm': - # VLM uses specialized page processing - section_task_id = enqueue_docling_task( - file_id=file_id, - task_type='vlm_section_page_bundle', - payload={ - 'section_idx': section_idx, - 'start_page': start_page, - 'end_page': end_page, - 'section_title': section_title, - 'vlm_group_id': group_id, - 'vlm_model': config.get('vlm_pipeline_model', 'smoldocling'), - 'base_config': config, - 'total_sections': len(entries), - 'producer': 'auto_phase2', - 'depends_on': depends_on - }, - priority=TaskPriority.NORMAL, - timeout=3600 - ) - task_ids.append(section_task_id) - else: - # OCR/No-OCR by page processing (process each page in section individually) - for page_num in range(start_page, end_page + 1): - page_config = { - **config, - 'page_range': [page_num, page_num] - } - - # Create standardized bundle metadata - page_metadata = create_standard_metadata( - file_id=file_id, - pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), - processing_mode="individual_pages", - config=page_config, - group_id=group_id, - split_order=section_idx, - split_total=len(entries), - split_heading=section_title, - page_range=[page_num, page_num], - producer="auto_phase2" - ) - - # Add legacy fields for backward compatibility - artefact_extra = page_metadata.to_artefact_extra() - artefact_extra.update({ - 'section_idx': section_idx, - 'section_title': section_title, - 'page_number': page_num, - }) - - page_task_id = enqueue_docling_task( - file_id=file_id, - task_type='canonical_docling_json', - payload={ - 'bucket': bucket, - 'file_path': file_path, - 'cabinet_id': cabinet_id, - 'mime_type': mime_type, - 'config': page_config, - 'artefact_extra': artefact_extra, - 'depends_on': depends_on - }, - priority=TaskPriority.NORMAL, - timeout=1800 - ) - task_ids.append(page_task_id) - else: - logger.warning(f"No split map found for by-page processing of file {file_id}") - return [] - - elif self.docling_use_split_map: - # Process by split map sections - logger.info(f"Enqueueing {pipeline_type} pipeline by split map sections for file {file_id} with {len(depends_on)} dependencies") - - split_map = self._load_split_map(bucket, file_id) - if split_map: - entries = split_map.get('entries', []) - - # Normalize and sort entries by start_page - normalized_entries = [] - for entry in entries: - try: - start_page = int(entry.get('start_page', 1)) - end_page = int(entry.get('end_page', start_page)) - title = entry.get('title') or entry.get('label') or '' - if end_page < start_page: - end_page = start_page - normalized_entries.append({ - 'start': start_page, - 'end': end_page, - 'title': title - }) - except Exception: - continue - - normalized_entries.sort(key=lambda x: x['start']) - - # Create tasks for each section - for i, entry in enumerate(normalized_entries, 1): - section_config = { - **config, - 'page_range': [entry['start'], entry['end']] - } - - # Create standardized bundle metadata for section - section_metadata = create_standard_metadata( - file_id=file_id, - pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), - processing_mode="split_sections", - config=section_config, - group_id=group_id, - split_order=i, - split_total=len(normalized_entries), - split_heading=entry['title'] or f'Section {i}', - page_range=[entry['start'], entry['end']], - producer="auto_phase2" - ) - - section_task_id = enqueue_docling_task( - file_id=file_id, - task_type='canonical_docling_json', - payload={ - 'bucket': bucket, - 'file_path': file_path, - 'cabinet_id': cabinet_id, - 'mime_type': mime_type, - 'config': section_config, - 'artefact_extra': section_metadata.to_artefact_extra(), - 'depends_on': depends_on - }, - priority=TaskPriority.NORMAL, - timeout=3600 - ) - task_ids.append(section_task_id) - else: - logger.warning(f"No split map found for section-based processing of file {file_id}") - return [] - - else: - # Process whole document - logger.info(f"Enqueueing {pipeline_type} pipeline for whole document {file_id} with {len(depends_on)} dependencies") - - # Create standardized bundle metadata for whole document - whole_doc_metadata = create_standard_metadata( - file_id=file_id, - pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), - processing_mode="whole_document", - config=config, - group_id=group_id, - producer="auto_phase2" - ) - - task_id = enqueue_docling_task( - file_id=file_id, - task_type='canonical_docling_json', - payload={ - 'bucket': bucket, - 'file_path': file_path, - 'cabinet_id': cabinet_id, - 'mime_type': mime_type, - 'config': config, - 'artefact_extra': whole_doc_metadata.to_artefact_extra(), - 'depends_on': depends_on - }, - priority=TaskPriority.NORMAL, - timeout=7200 - ) - task_ids.append(task_id) - - logger.info(f"Enqueued {len(task_ids)} tasks for {pipeline_type} pipeline with dependencies") - return task_ids - - -# Global pipeline controller instance -_controller_instance = None - -def get_pipeline_controller() -> DocumentPipelineController: - """Get the global pipeline controller instance.""" - global _controller_instance - if _controller_instance is None: - _controller_instance = DocumentPipelineController() - return _controller_instance diff --git a/archive/auto_processing/task_processors.py b/archive/auto_processing/task_processors.py deleted file mode 100644 index 58ec924..0000000 --- a/archive/auto_processing/task_processors.py +++ /dev/null @@ -1,2531 +0,0 @@ -""" -Task Processors for Document Processing Queue - -This module contains the actual processing implementations for different -types of queued tasks (Tika, Docling, LLM, Split Map). -""" - -import json -import zipfile -import io -import mimetypes -import requests -import tempfile -import uuid -from pathlib import Path -from typing import Dict, Any, Optional -import os - -from modules.queue_system import DocumentProcessingQueue, QueueTask, ServiceType -from modules.database.supabase.utils.client import SupabaseServiceRoleClient -from modules.database.supabase.utils.storage import StorageAdmin -from modules.document_processor import DocumentProcessor -from modules.logger_tool import initialise_logger - -logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True) - -class DocumentTaskProcessor(DocumentProcessingQueue): - """ - Extended queue with actual task processing implementations. - """ - - def __init__(self, redis_url: str = None): - super().__init__(redis_url) - self.client = SupabaseServiceRoleClient() - self.storage = StorageAdmin() - self.doc_processor = DocumentProcessor() - - # Service URLs - self.tika_url = os.getenv('TIKA_URL') - self.docling_url = os.getenv('DOCLING_URL') or os.getenv('NEOFS_DOCLING_URL') - self.llm_url = os.getenv('LLM_URL') # Local LLM endpoint - - logger.info("Task processor initialized with service URLs") - - def _process_task(self, task: QueueTask): - """Process a task based on its service type.""" - try: - # DEBUG: Log entry into processing - logger.info(f"🚀 PROCESS DEBUG: Starting _process_task for {task.id}") - - # Audit dependency info (if any) - try: - deps = [] - if isinstance(task.payload, dict): - deps = task.payload.get('depends_on') or [] - if deps: - logger.info(f"Processing task {task.id}: {task.service.value}/{task.task_type} deps={deps}") - else: - logger.info(f"Processing task {task.id}: {task.service.value}/{task.task_type}") - except Exception: - logger.info(f"Processing task {task.id}: {task.service.value}/{task.task_type}") - - # DEBUG: Log service routing - logger.info(f"🚀 PROCESS DEBUG: Routing task {task.id} to service {task.service}") - - if task.service == ServiceType.TIKA: - result = self._process_tika_task(task) - elif task.service == ServiceType.DOCLING: - result = self._process_docling_task(task) - elif task.service == ServiceType.LLM: - result = self._process_llm_task(task) - elif task.service == ServiceType.SPLIT_MAP: - result = self._process_split_map_task(task) - elif task.service == ServiceType.DOCUMENT_ANALYSIS: - result = self.process_document_analysis_task(task) - elif task.service == ServiceType.PAGE_IMAGES: - result = self.process_page_images_task(task) - else: - raise ValueError(f"Unknown service type: {task.service}") - - # DEBUG: Log successful completion - logger.info(f"✅ PROCESS DEBUG: Task {task.id} completed successfully, calling complete_task") - self.complete_task(task, result) - logger.info(f"✅ PROCESS DEBUG: Task {task.id} completion confirmed") - - except Exception as e: - # DEBUG: Log detailed failure info - logger.error(f"🚨 PROCESS DEBUG: Task {task.id} processing failed: {e}") - logger.error(f"🚨 PROCESS DEBUG: Exception type: {type(e)}") - import traceback - logger.error(f"🚨 PROCESS DEBUG: Full traceback:\n{traceback.format_exc()}") - logger.info(f"🚨 PROCESS DEBUG: Calling fail_task for {task.id}") - self.fail_task(task, str(e)) - logger.info(f"🚨 PROCESS DEBUG: fail_task completed for {task.id}") - - def _process_tika_task(self, task: QueueTask) -> Dict[str, Any]: - """Process Tika metadata extraction task.""" - if not self.tika_url: - raise ValueError("TIKA_URL not configured") - - payload = task.payload - file_id = task.file_id - bucket = payload['bucket'] - file_path = payload['file_path'] - cabinet_id = payload['cabinet_id'] - mime_type = payload.get('mime_type', 'application/octet-stream') - - # Download file - logger.debug(f"Downloading file for Tika processing: {bucket}/{file_path}") - file_bytes = self.storage.download_file(bucket, file_path) - - # Call Tika - headers = {'Accept': 'application/json', 'Content-Type': mime_type} - timeout = task.timeout - - response = requests.put( - f"{self.tika_url.rstrip('/')}/meta", - data=file_bytes, - headers=headers, - timeout=timeout - ) - response.raise_for_status() - - tika_json = response.json() - - # Store result as artefact - artefact_id = str(uuid.uuid4()) - rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/tika.json" - - self.storage.upload_file( - bucket, - rel_path, - json.dumps(tika_json, ensure_ascii=False).encode('utf-8'), - 'application/json', - upsert=True - ) - - # Create artefact record with enhanced UI metadata - artefact_data = { - 'id': artefact_id, - 'file_id': file_id, - 'type': 'tika_json', - 'rel_path': rel_path, - 'extra': { - 'processing_time': response.elapsed.total_seconds(), - 'display_name': 'Document Metadata', - 'bundle_label': 'Tika Analysis', - 'section_title': 'Document Metadata', - 'bundle_type': 'tika_json', - 'processing_mode': 'metadata_extraction', - 'pipeline': 'tika_analysis', - 'is_metadata': True, - 'ui_category': 'raw_data', - 'ui_order': 3, - 'description': 'Raw document metadata and properties extracted by Apache Tika', - 'viewer_type': 'json' - }, - 'status': 'completed' - } - - self.client.supabase.table('document_artefacts').insert(artefact_data).execute() - - logger.info(f"Tika processing completed for file {file_id}") - return { - 'artefact_id': artefact_id, - 'rel_path': rel_path, - 'processing_time': response.elapsed.total_seconds() - } - - def _process_docling_task(self, task: QueueTask) -> Dict[str, Any]: - """Process Docling document analysis task. - - Also allows routing of related task types so that page images and - enhanced structure analysis can run under the stable docling service - umbrella when SERVICE dispatch for new types is problematic. - """ - # Soft-route additional task types through this handler - if task.task_type in ("document_structure_analysis", "document_analysis"): - return self.process_document_analysis_task(task) - if task.task_type in ("generate_page_images", "page_images"): - return self.process_page_images_task(task) - if task.task_type in ("vlm_section_page_bundle",): - return self.process_vlm_section_page_bundle_task(task) - if task.task_type in ("vlm_section_bundle_collector",): - return self.process_vlm_section_bundle_collector_task(task) - # New unified bundle architecture handlers - if task.task_type in ("docling_bundle",): - return self.process_docling_bundle_task(task) - if task.task_type in ("docling_bundle_split",): - return self.process_docling_bundle_split_task(task) -# phase2_coordinator task type removed - pipelines now enqueued directly from split_map task - if not self.docling_url: - raise ValueError("DOCLING_URL not configured") - - payload = task.payload - file_id = task.file_id - bucket = payload['bucket'] - file_path = payload['file_path'] - cabinet_id = payload['cabinet_id'] - task_config = payload.get('config', {}) - - # Download file - logger.debug(f"Downloading file for Docling processing: {bucket}/{file_path}") - file_bytes = self.storage.download_file(bucket, file_path) - - # Prepare Docling request - docling_api_key = os.getenv('DOCLING_API_KEY') - # Accept any content type so zip/binary responses are allowed - headers = {'Accept': '*/*'} - if docling_api_key: - headers['X-Api-Key'] = docling_api_key - - # Determine to_formats. For canonical docling we will request a ZIP bundle. - to_formats_val = task_config.get('to_formats', 'json') - to_formats_list = to_formats_val if isinstance(to_formats_val, list) else [to_formats_val] - is_canonical = str(task.task_type).startswith('canonical_docling') - target_type = task_config.get('target_type', 'zip' if is_canonical else 'inbody') - - # Build form data from config (override for canonical) - form_data = [ - ('target_type', target_type), - ('do_ocr', str(task_config.get('do_ocr', False)).lower()), - ('force_ocr', str(task_config.get('force_ocr', False)).lower()), - ('image_export_mode', 'referenced' if is_canonical else task_config.get('image_export_mode', 'embedded')), - ('ocr_engine', task_config.get('ocr_engine', 'easyocr')), - ('ocr_lang', task_config.get('ocr_lang', 'en')), - ('pdf_backend', task_config.get('pdf_backend', 'dlparse_v4')), - ('table_mode', task_config.get('table_mode', 'fast')), - ('do_formula_enrichment', str(task_config.get('do_formula_enrichment', False)).lower()), - ('do_code_enrichment', str(task_config.get('do_code_enrichment', False)).lower()), - ('pipeline', task_config.get('pipeline', 'standard')) - ] - # Optional extra flags forwarded when present - if 'table_cell_matching' in task_config: - form_data.append(('table_cell_matching', str(task_config.get('table_cell_matching')).lower())) - if 'do_picture_classification' in task_config: - form_data.append(('do_picture_classification', str(task_config.get('do_picture_classification')).lower())) - if 'do_picture_description' in task_config: - form_data.append(('do_picture_description', str(task_config.get('do_picture_description')).lower())) - if task_config.get('picture_description_prompt'): - form_data.append(('picture_description_prompt', task_config.get('picture_description_prompt'))) - # picture_description_api and vlm_pipeline_model_api must be JSON per Docling OpenAPI - if task_config.get('picture_description_api') is not None: - v = task_config.get('picture_description_api') - if isinstance(v, (dict, list)): - form_data.append(('picture_description_api', json.dumps(v))) - elif isinstance(v, str) and v.strip().startswith(('{', '[')): - form_data.append(('picture_description_api', v)) - # else: omit to avoid validation error - if task_config.get('vlm_pipeline_model'): - form_data.append(('vlm_pipeline_model', task_config.get('vlm_pipeline_model'))) - if task_config.get('vlm_pipeline_model_api') is not None: - v = task_config.get('vlm_pipeline_model_api') - if isinstance(v, (dict, list)): - form_data.append(('vlm_pipeline_model_api', json.dumps(v))) - elif isinstance(v, str) and v.strip().startswith(('{', '[')): - form_data.append(('vlm_pipeline_model_api', v)) - # else: omit - if is_canonical and ('md' in to_formats_list): - form_data.append(('md_page_break_placeholder', task_config.get('md_page_break_placeholder', '\n\n\n\n'))) - # Append to_formats as repeated fields (filter unsupported split pages) - to_formats_list = [f for f in to_formats_list if f != 'html_split_page'] - for fmt in to_formats_list: - form_data.append(('to_formats', fmt)) - - # Handle page range with clamping and min/max correction - page_range = task_config.get('page_range', [1, 999999]) - if isinstance(page_range, list) and len(page_range) >= 2: - def _to_int_safe(v, default): - try: - return int(v) - except Exception: - return default - start_pg = _to_int_safe(page_range[0], 1) - end_pg = _to_int_safe(page_range[1], 999999) - if start_pg < 1: - start_pg = 1 - if end_pg < start_pg: - end_pg = start_pg - # Clamp for frontmatter-like tasks to actual page count if possible - if task.task_type in ('docling_frontmatter_json', 'document_structure_analysis'): - try: - import fitz # PyMuPDF - doc = fitz.open(stream=file_bytes, filetype='pdf') - pc = int(doc.page_count) - doc.close() - if pc > 0: - end_pg = min(end_pg, pc) - start_pg = max(1, min(start_pg, pc)) - if end_pg < start_pg: - end_pg = start_pg - except Exception: - pass - form_data.append(('page_range', str(start_pg))) - form_data.append(('page_range', str(end_pg))) - - files = [('files', ('file', file_bytes, payload.get('mime_type', 'application/pdf')))] - - # Make request - response = requests.post( - f"{self.docling_url.rstrip('/')}/v1/convert/file", - files=files, - data=form_data, - headers=headers, - timeout=task.timeout - ) - response.raise_for_status() - - content_type = (response.headers.get('Content-Type') or '').lower() - is_zip_resp = ('zip' in content_type) or (response.content[:2] == b'PK') - - if is_zip_resp and is_canonical: - # Unpack zip, store all files and a manifest - artefact_id = str(uuid.uuid4()) - base_dir = f"{cabinet_id}/{file_id}/{artefact_id}" - archive_path = f"{base_dir}/bundle.zip" - # Save original archive - self.storage.upload_file(bucket, archive_path, response.content, 'application/zip', upsert=True) - - zf = zipfile.ZipFile(io.BytesIO(response.content)) - entries = [] - md_full_path = None - html_full_path = None - text_full_path = None - json_full_path = None - images_list = [] - md_data_bytes: bytes | None = None - for zi in zf.infolist(): - if zi.is_dir(): - continue - name = zi.filename.lstrip('/').replace('..', '') - data = zf.read(zi) - ctype = mimetypes.guess_type(name)[0] or 'application/octet-stream' - rel = f"{base_dir}/{name}" - self.storage.upload_file(bucket, rel, data, ctype, upsert=True) - entries.append({ - 'name': name, - 'path': rel, - 'size': zi.file_size, - 'content_type': ctype - }) - # Detect known outputs - lower = name.lower() - if lower.endswith('.md') and md_full_path is None: - md_full_path = rel - md_data_bytes = data - elif lower.endswith('.html') and html_full_path is None: - html_full_path = rel - elif lower.endswith('.txt') and text_full_path is None: - text_full_path = rel - elif lower.endswith('.json') and json_full_path is None: - json_full_path = rel - if ctype.startswith('image/'): - images_list.append({'name': name, 'path': rel, 'content_type': ctype, 'size': zi.file_size}) - - manifest = { - 'file_id': file_id, - 'artefact_id': artefact_id, - 'to_formats': to_formats_list, - 'image_export_mode': 'referenced', - 'entries': entries, - 'archive_path': archive_path, - 'markdown_full': md_full_path, - 'html_full': html_full_path, - 'text_full': text_full_path, - 'json_full': json_full_path, - 'images': images_list, - 'bucket': bucket - } - # Create markdown pages by splitting on placeholder if available - if md_data_bytes is not None: - try: - md_text = md_data_bytes.decode('utf-8', errors='replace') - sep = task_config.get('md_page_break_placeholder', '\n\n\n\n') - parts = md_text.split(sep) - if len(parts) > 1: - pages_dir = f"{base_dir}/md_pages" - pages = [] - for i, part in enumerate(parts, start=1): - pth = f"{pages_dir}/page-{i:04d}.md" - self.storage.upload_file(bucket, pth, part.encode('utf-8'), 'text/markdown', upsert=True) - pages.append({'page': i, 'path': pth}) - manifest['markdown_pages'] = pages - except Exception as e: - logger.warning(f"Failed creating markdown_pages for file {file_id}: {e}") - manifest_path = f"{base_dir}/manifest.json" - self.storage.upload_file(bucket, manifest_path, json.dumps(manifest, ensure_ascii=False).encode('utf-8'), 'application/json', upsert=True) - - # Create artefact row pointing to directory with manifest, including grouping extras for split packs - artefact_extra = payload.get('artefact_extra') if isinstance(payload, dict) else None - # Determine artefact type by pipeline (standard vs vlm) - pipeline_mode = (task_config.get('pipeline') or 'standard').lower() - artefact_type_final = 'docling_vlm' if pipeline_mode == 'vlm' else 'docling_standard' - group_pack_type = payload.get('group_pack_type') if isinstance(payload, dict) else None - # propagate group_id if provided (set by caller for multi-part packs) - group_id = (artefact_extra or {}).get('group_id') - # Compute a settings fingerprint for grouping (exclude page_range) - try: - import hashlib, json as _json - cfg_for_hash = dict(task_config) - cfg_for_hash.pop('page_range', None) - settings_fingerprint = hashlib.sha1(_json.dumps(cfg_for_hash, sort_keys=True, ensure_ascii=False).encode('utf-8')).hexdigest() - except Exception: - settings_fingerprint = None - - self.client.supabase.table('document_artefacts').insert({ - 'id': artefact_id, - 'file_id': file_id, - 'type': artefact_type_final, - 'rel_path': base_dir, - 'extra': { - 'manifest': manifest_path, - 'processing_time': response.elapsed.total_seconds(), - 'config': task_config, - 'group_pack_type': group_pack_type or (artefact_extra or {}).get('group_pack_type'), - 'group_id': group_id, - 'pipeline': pipeline_mode, - 'settings_fingerprint': settings_fingerprint, - **(artefact_extra or {}) - }, - 'status': 'completed' - }).execute() - - logger.info(f"Canonical docling bundle stored for file {file_id} with {len(entries)} files") - return { - 'artefact_id': artefact_id, - 'files_count': len(entries) - } - - if 'application/json' in content_type or content_type.endswith('+json'): - docling_json = response.json() - artefact_id = str(uuid.uuid4()) - artefact_type = task.task_type - rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/{artefact_type.replace('_json', '.json')}" - self.storage.upload_file( - bucket, - rel_path, - json.dumps(docling_json, ensure_ascii=False).encode('utf-8'), - 'application/json', - upsert=True - ) - artefact_data = { - 'id': artefact_id, - 'file_id': file_id, - 'type': artefact_type, - 'rel_path': rel_path, - 'extra': { - 'processing_time': response.elapsed.total_seconds(), - 'config': task_config, - **({} if 'artefact_extra' not in payload else payload['artefact_extra']) - }, - 'status': 'completed' - } - self.client.supabase.table('document_artefacts').insert(artefact_data).execute() - else: - # Fallback: store raw output if server didn't return JSON (unexpected for inbody) - artefact_id = str(uuid.uuid4()) - ext = ('html' if 'html' in content_type else ('md' if 'markdown' in content_type else ('txt' if 'text/plain' in content_type else 'bin'))) - artefact_type = f'docling_output_{ext}' - rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/docling_output.{ext}" - self.storage.upload_file( - bucket, - rel_path, - response.content, - 'application/zip' if ext == 'zip' else (content_type or 'application/octet-stream'), - upsert=True - ) - artefact_data = { - 'id': artefact_id, - 'file_id': file_id, - 'type': artefact_type, - 'rel_path': rel_path, - 'extra': { - 'processing_time': response.elapsed.total_seconds(), - 'config': task_config, - 'to_formats': to_formats_list, - 'content_type': content_type, - **({} if 'artefact_extra' not in payload else payload['artefact_extra']) - }, - 'status': 'completed' - } - self.client.supabase.table('document_artefacts').insert(artefact_data).execute() - - # When we get canonical Docling JSON, also split out component contents into separate artefacts - try: - if 'application/json' in content_type or content_type.endswith('+json'): - self._store_docling_component_artefacts( - file_id=file_id, - cabinet_id=cabinet_id, - bucket=bucket, - docling_json=docling_json, - task_config=task_config, - artefact_extra=payload.get('artefact_extra') if isinstance(payload, dict) else None - ) - except Exception as split_e: - logger.warning(f"Storing component artefacts failed for file {file_id}: {split_e}") - - # Handle optional frontpage image extraction - if task.task_type == 'docling_frontmatter_json': - try: - self._extract_frontpage_image(docling_json, file_id, cabinet_id, bucket) - except Exception as e: - logger.warning(f"Frontpage image extraction failed for file {file_id}: {e}") - - logger.info(f"Docling processing completed for file {file_id}") - - # Pipeline dependencies now handle sequential execution automatically - - return { - 'artefact_id': artefact_id, - 'rel_path': rel_path, - 'processing_time': response.elapsed.total_seconds() - } - - def _extract_frontpage_image(self, docling_json: Dict[str, Any], file_id: str, - cabinet_id: str, bucket: str): - """Extract and store frontpage image from Docling JSON.""" - import base64 - - # Look for frontpage image in various locations - cover_b64 = None - for key in ['frontpage', 'cover']: - if key in docling_json and 'image_base64' in docling_json[key]: - cover_b64 = docling_json[key]['image_base64'] - break - - if not cover_b64: - return - - # Decode and store image - artefact_id = str(uuid.uuid4()) - img_bytes = base64.b64decode(cover_b64) - rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/frontpage.png" - - self.storage.upload_file(bucket, rel_path, img_bytes, 'image/png', upsert=True) - - # Create artefact record - artefact_data = { - 'id': artefact_id, - 'file_id': file_id, - 'type': 'docling_frontpage_image', - 'rel_path': rel_path, - 'extra': {'extracted_from': 'docling_frontmatter'}, - 'status': 'completed' - } - - self.client.supabase.table('document_artefacts').insert(artefact_data).execute() - logger.debug(f"Frontpage image extracted for file {file_id}") - - def _store_docling_component_artefacts(self, *, file_id: str, cabinet_id: str, bucket: str, docling_json: Dict[str, Any], task_config: Dict[str, Any], artefact_extra: Optional[Dict[str, Any]] = None) -> None: - """Create artefacts for component contents from a canonical Docling JSON. - - Stores md_content, html_content, text_content, doctags_content and json_content - if present, as separate artefacts and files alongside the canonical JSON. - """ - doc = docling_json.get('document') or {} - components = [ - ('md_content', 'docling_md', 'docling.md', 'text/markdown', lambda v: v if isinstance(v, str) else ''), - ('html_content', 'docling_html', 'docling.html', 'text/html', lambda v: v if isinstance(v, str) else ''), - ('text_content', 'docling_text', 'docling.txt', 'text/plain', lambda v: v if isinstance(v, str) else ''), - ('doctags_content', 'docling_doctags', 'docling.doctags.xml', 'application/xml', lambda v: v if isinstance(v, str) else ''), - ('json_content', 'docling_json', 'docling.json', 'application/json', lambda v: json.dumps(v or {}, ensure_ascii=False)), - ] - - for key, art_type, filename, mime, to_bytes in components: - if key not in doc or doc.get(key) in (None, ''): - continue - try: - artefact_id = str(uuid.uuid4()) - rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/{filename}" - data_bytes = to_bytes(doc.get(key)) - if isinstance(data_bytes, str): - data_bytes = data_bytes.encode('utf-8') - self.storage.upload_file(bucket, rel_path, data_bytes, mime, upsert=True) - extra = {'source': 'canonical_docling_json', 'component_key': key, 'config': task_config} - if artefact_extra: - extra.update(artefact_extra) - self.client.supabase.table('document_artefacts').insert({ - 'id': artefact_id, - 'file_id': file_id, - 'type': art_type, - 'rel_path': rel_path, - 'extra': extra, - 'status': 'completed' - }).execute() - except Exception as e: - logger.warning(f"Failed to store component '{key}' for file {file_id}: {e}") - - def _process_llm_task(self, task: QueueTask) -> Dict[str, Any]: - """Process LLM analysis task (document classification, etc.).""" - if not self.llm_url: - raise ValueError("LLM_URL not configured") - - payload = task.payload - file_id = task.file_id - prompt = payload['prompt'] - context = payload.get('context', '') - model = payload.get('model', 'default') - - # Prepare LLM request - llm_request = { - 'model': model, - 'prompt': prompt, - 'context': context, - 'max_tokens': payload.get('max_tokens', 1000), - 'temperature': payload.get('temperature', 0.1) - } - - # Call local LLM - response = requests.post( - f"{self.llm_url.rstrip('/')}/generate", - json=llm_request, - headers={'Content-Type': 'application/json'}, - timeout=task.timeout - ) - response.raise_for_status() - - llm_result = response.json() - - # Store result (optional - depends on use case) - if payload.get('store_result', False): - bucket = payload['bucket'] - cabinet_id = payload['cabinet_id'] - - artefact_id = str(uuid.uuid4()) - rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/llm_{task.task_type}.json" - - self.storage.upload_file( - bucket, - rel_path, - json.dumps(llm_result, ensure_ascii=False).encode('utf-8'), - 'application/json', - upsert=True - ) - - # Create artefact record - artefact_data = { - 'id': artefact_id, - 'file_id': file_id, - 'type': f'llm_{task.task_type}', - 'rel_path': rel_path, - 'extra': { - 'model': model, - 'task_type': task.task_type - }, - 'status': 'completed' - } - - self.client.supabase.table('document_artefacts').insert(artefact_data).execute() - - logger.info(f"LLM processing completed for file {file_id}") - return llm_result - - def _process_split_map_task(self, task: QueueTask) -> Dict[str, Any]: - """Process split map generation task.""" - from routers.database.files.split_map import create_split_map_for_file - from routers.database.files.files import enqueue_canonical_docling - - file_id = task.file_id - - # Generate split map - split_map = create_split_map_for_file(file_id) - - logger.info(f"Split map generation completed for file {file_id}") - - # NEW BUNDLE ARCHITECTURE: Direct pipeline enqueueing - # Split map completion now directly triggers bundle task creation - logger.info(f"NEW ARCHITECTURE: Enqueueing sequential docling bundle pipelines for file {file_id}") - - try: - # Get file information for pipeline enqueueing - file_result = self.client.supabase.table('files').select('*').eq('id', file_id).single().execute() - if not file_result.data: - logger.error(f"Could not find file {file_id} for pipeline enqueueing") - return { - 'method': split_map['method'], - 'confidence': split_map['confidence'], - 'entries_count': len(split_map['entries']), - 'pipeline_error': 'File not found for pipeline enqueueing' - } - - file_row = file_result.data - bucket = file_row['bucket'] - cabinet_id = file_row['cabinet_id'] - storage_path = file_row['path'] - original_mime = file_row.get('mime_type', 'application/pdf') - - # Prefer converted PDF if available (matches existing pattern) - try: - arts = self.client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or [] - pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None) - processing_path = pdf_art['rel_path'] if pdf_art else storage_path - processing_mime = 'application/pdf' if pdf_art else original_mime - except Exception: - processing_path = storage_path - processing_mime = original_mime - - # Prepare file data for pipeline controller - file_data = { - 'bucket': bucket, - 'file_path': processing_path, - 'cabinet_id': cabinet_id, - 'mime_type': processing_mime - } - - # Import and use pipeline controller to enqueue sequential pipelines - from modules.pipeline_controller import get_pipeline_controller - controller = get_pipeline_controller() - - pipeline_result = controller.enqueue_sequential_docling_pipelines(file_id, file_data) - - logger.info(f"Successfully enqueued {pipeline_result['total_tasks']} tasks across " - f"{len(pipeline_result['enqueued_pipelines'])} pipelines for file {file_id}") - logger.info(f"Pipeline execution order: {pipeline_result['sequential_order']}") - - return { - 'method': split_map['method'], - 'confidence': split_map['confidence'], - 'entries_count': len(split_map['entries']), - 'enqueued_pipelines': pipeline_result['enqueued_pipelines'], - 'total_pipeline_tasks': pipeline_result['total_tasks'], - 'pipeline_order': pipeline_result['sequential_order'] - } - - except Exception as e: - logger.error(f"Failed to enqueue sequential pipelines for file {file_id}: {e}") - return { - 'method': split_map['method'], - 'confidence': split_map['confidence'], - 'entries_count': len(split_map['entries']), - 'pipeline_error': str(e) - } - - # Split map processing completed successfully - - return { - 'method': split_map['method'], - 'confidence': split_map['confidence'], - 'entries_count': len(split_map['entries']) - } - - def _enqueue_vlm_page_processing(self, file_id: str, threshold: int, vlm_group_id: str, vlm_model: str, base_config: dict): - """Enqueue VLM processing for individual pages within split map sections.""" - from routers.database.files.files import _load_split_map - from modules.database.supabase.utils.client import SupabaseServiceRoleClient - from modules.database.supabase.utils.storage import StorageAdmin - - try: - client = SupabaseServiceRoleClient() - storage = StorageAdmin() - - # Get file info - fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute() - if not fr.data: - logger.error(f"File {file_id} not found for VLM page processing") - return - - file_row = fr.data - bucket = file_row['bucket'] - - # Load split map - split_map = _load_split_map(client, storage, bucket, file_id) - if not split_map: - logger.warning(f"No split map found for VLM page processing file {file_id}") - return - - entries = split_map.get('entries', []) - if not entries: - logger.warning(f"Empty split map entries for VLM page processing file {file_id}") - return - - logger.info(f"[auto-canonical] VLM page processing: found {len(entries)} sections for file {file_id}") - - # Process each section with page-by-page VLM - for section_idx, entry in enumerate(entries, 1): - try: - start_page = int(entry.get('start_page', 1)) - end_page = int(entry.get('end_page', start_page)) - section_title = entry.get('title', f'Section {section_idx}') - - logger.info(f"[auto-canonical] VLM page processing section {section_idx}: '{section_title}' pages {start_page}-{end_page}") - - # Create section-level bundle manifest task - self._enqueue_vlm_section_page_bundle( - file_id, section_idx, start_page, end_page, section_title, - vlm_group_id, vlm_model, base_config, len(entries) - ) - - except Exception as section_e: - logger.warning(f"Failed to process VLM section {section_idx} for file {file_id}: {section_e}") - continue - - except Exception as e: - logger.error(f"VLM page processing setup failed for file {file_id}: {e}") - - def _enqueue_vlm_section_page_bundle(self, file_id: str, section_idx: int, start_page: int, end_page: int, - section_title: str, vlm_group_id: str, vlm_model: str, - base_config: dict, total_sections: int): - """Enqueue VLM processing for individual pages within a section, then bundle them.""" - from modules.queue_system import enqueue_docling_task, TaskPriority - - try: - # Create a unique task to handle page-by-page processing for this section - section_task_id = enqueue_docling_task( - file_id=file_id, - task_type='vlm_section_page_bundle', - payload={ - 'section_idx': section_idx, - 'start_page': start_page, - 'end_page': end_page, - 'section_title': section_title, - 'vlm_group_id': vlm_group_id, - 'vlm_model': vlm_model, - 'base_config': base_config, - 'total_sections': total_sections, - 'producer': 'auto_split' - }, - priority=TaskPriority.NORMAL, - timeout=3600 # 1 hour for page-by-page processing - ) - - logger.info(f"[auto-canonical] VLM section page bundle task {section_task_id} for section {section_idx} of file {file_id}") - - except Exception as e: - logger.error(f"Failed to enqueue VLM section page bundle for section {section_idx} file {file_id}: {e}") - - def process_document_analysis_task(self, task: QueueTask) -> Dict[str, Any]: - """Process document structure analysis task""" - file_id = task.file_id - payload = task.payload - - logger.info(f"Processing document analysis task for file {file_id}") - - try: - # Load file from storage - bucket = payload['bucket'] - file_path = payload['file_path'] - cabinet_id = payload['cabinet_id'] - - file_bytes = self.storage.download_file(bucket, file_path) - - # Load existing artefacts if available - client = SupabaseServiceRoleClient() - artefacts = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute() - - tika_json = None - docling_json = None - - for art in artefacts.data: - if art['type'] == 'tika_json' and art['status'] == 'completed': - try: - tika_data = self.storage.download_file(bucket, art['rel_path']) - tika_json = json.loads(tika_data.decode('utf-8')) - except Exception as e: - logger.warning(f"Failed to load Tika JSON for analysis: {e}") - - elif art['type'] in ['docling_frontmatter_json', 'docling_noocr_json'] and art['status'] == 'completed': - try: - docling_data = self.storage.download_file(bucket, art['rel_path']) - docling_json = json.loads(docling_data.decode('utf-8')) - break # Use first available Docling result - except Exception as e: - logger.warning(f"Failed to load Docling JSON for analysis: {e}") - - # Import here to avoid circular imports - from modules.document_analysis import create_document_outline_hierarchy_artefact - - # Create document analysis - analysis_data = create_document_outline_hierarchy_artefact( - file_id=file_id, - pdf_bytes=file_bytes, - tika_json=tika_json, - docling_json=docling_json - ) - - # Store analysis as artefact (insert row first, then upload file) - artefact_id = analysis_data.get('artefact_id') or str(uuid.uuid4()) - analysis_data['artefact_id'] = artefact_id - rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/document_outline_hierarchy.json" - # Insert row first to avoid orphaned files if DB insert fails - # Insert artefact record with processing status - sections_count = len(analysis_data.get('sections', []) or []) - metadata = analysis_data.get('metadata') or {} - analysis_methods = metadata.get('analysis_methods') - self.client.supabase.table('document_artefacts').insert({ - 'id': artefact_id, - 'file_id': file_id, - 'type': 'document_outline_hierarchy', - 'rel_path': rel_path, - 'extra': { - 'sections_count': sections_count, - 'analysis_methods': analysis_methods - }, - 'status': 'processing' - }).execute() - - # Now upload the file - analysis_json = json.dumps(analysis_data, ensure_ascii=False) - self.storage.upload_file(bucket, rel_path, analysis_json.encode('utf-8'), 'application/json', upsert=True) - - # Mark artefact as completed - self.client.supabase.table('document_artefacts').update({ - 'status': 'completed' - }).eq('id', artefact_id).execute() - - logger.info(f"Document analysis completed for file {file_id} (sections={sections_count})") - return { - 'sections_count': sections_count - } - - except Exception as e: - logger.error(f"Document analysis failed for file {file_id}: {e}") - raise - - def process_page_images_task(self, task: QueueTask) -> Dict[str, Any]: - """Process page images generation task""" - file_id = task.file_id - payload = task.payload - - logger.info(f"Processing page images task for file {file_id}") - - try: - # Load file from storage - bucket = payload['bucket'] - file_path = payload['file_path'] - cabinet_id = payload['cabinet_id'] - - file_bytes = self.storage.download_file(bucket, file_path) - - # Import here to avoid circular imports - from modules.page_image_generator import create_page_images_artefact - - # Generate page images - images_data = create_page_images_artefact( - file_id=file_id, - cabinet_id=cabinet_id, - pdf_bytes=file_bytes - ) - - artefact_id = images_data['artefact_id'] - # Include bucket in manifest for client-side signed URL generation - images_data['bucket'] = bucket - - # Upload all page images to storage - for page_info in images_data['page_images']: - # Upload full image - full_path = page_info['full_image_path'] - full_data = page_info.pop('full_image_data') # Remove from JSON - self.storage.upload_file(bucket, full_path, full_data, 'image/png', upsert=True) - - # Upload thumbnail - thumb_path = page_info['thumbnail_path'] - thumb_data = page_info.pop('thumbnail_data') # Remove from JSON - self.storage.upload_file(bucket, thumb_path, thumb_data, 'image/webp', upsert=True) - - # Store images metadata manifest under the artefact directory - artefact_dir = f"{cabinet_id}/{file_id}/{artefact_id}" - manifest_rel_path = f"{artefact_dir}/page_images.json" - images_json = json.dumps(images_data, ensure_ascii=False) - self.storage.upload_file(bucket, manifest_rel_path, images_json.encode('utf-8'), 'application/json', upsert=True) - - # Insert artefact record - client = SupabaseServiceRoleClient() - client.supabase.table('document_artefacts').insert({ - 'id': artefact_id, - 'file_id': file_id, - 'type': 'page_images', - # Store the directory prefix as rel_path for hybrid approach - 'rel_path': artefact_dir, - 'extra': { - 'page_count': images_data['page_count'], - 'total_full_images': images_data['storage_info']['total_full_images'], - 'total_thumbnails': images_data['storage_info']['total_thumbnails'], - 'estimated_storage_mb': images_data['storage_info']['estimated_storage_mb'], - 'manifest': manifest_rel_path - }, - 'status': 'completed' - }).execute() - - logger.info(f"Page images generation completed for file {file_id}") - return { - 'page_count': images_data['page_count'], - 'estimated_storage_mb': images_data['storage_info']['estimated_storage_mb'] - } - - except Exception as e: - logger.error(f"Page images generation failed for file {file_id}: {e}") - raise - - def process_comparison_analysis_task(self, task: QueueTask) -> Dict[str, Any]: - """Process comparison analysis between no-OCR and OCR docling results.""" - file_id = task.file_id - payload = task.payload - - logger.info(f"Processing comparison analysis task for file {file_id}") - - try: - no_ocr_group_id = payload.get('no_ocr_group_id') - ocr_group_id = payload.get('ocr_group_id') - comparison_type = payload.get('comparison_type', 'noocr_vs_ocr') - initial_delay = payload.get('initial_delay_seconds', 0) - - # If this is the first execution and we have an initial delay, sleep briefly - if initial_delay > 0: - import time - logger.info(f"Comparison analysis: applying initial delay of {min(initial_delay, 60)} seconds for file {file_id}") - time.sleep(min(initial_delay, 60)) # Max 1 minute delay per attempt - logger.info(f"Comparison analysis: delay complete for file {file_id}") - - if not no_ocr_group_id or not ocr_group_id: - raise ValueError("Missing group_id parameters for comparison") - - client = SupabaseServiceRoleClient() - - # Find file info - fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute() - if not fr.data: - raise ValueError(f"File {file_id} not found") - - file_row = fr.data - bucket = file_row['bucket'] - cabinet_id = file_row['cabinet_id'] - - # Find artefacts for both groups - artefacts = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute() - arts = artefacts.data or [] - - # Filter artefacts by group_id and type, including status - no_ocr_arts = [a for a in arts if ((a.get('extra') or {}).get('group_id') == no_ocr_group_id and - a.get('type') == 'docling_standard' and - a.get('status') == 'completed')] - ocr_arts = [a for a in arts if ((a.get('extra') or {}).get('group_id') == ocr_group_id and - a.get('type') == 'docling_standard' and - a.get('status') == 'completed')] - - # Also check pending/processing artefacts to understand timing better - no_ocr_pending = [a for a in arts if ((a.get('extra') or {}).get('group_id') == no_ocr_group_id and - a.get('type') == 'docling_standard' and - a.get('status') in ('processing', 'pending'))] - ocr_pending = [a for a in arts if ((a.get('extra') or {}).get('group_id') == ocr_group_id and - a.get('type') == 'docling_standard' and - a.get('status') in ('processing', 'pending'))] - - # Determine expected total parts from split_total metadata (if available) - expected_parts = None - if no_ocr_arts: - expected_parts = (no_ocr_arts[0].get('extra') or {}).get('split_total') - elif ocr_arts: - expected_parts = (ocr_arts[0].get('extra') or {}).get('split_total') - elif no_ocr_pending: - expected_parts = (no_ocr_pending[0].get('extra') or {}).get('split_total') - elif ocr_pending: - expected_parts = (ocr_pending[0].get('extra') or {}).get('split_total') - - logger.info(f"Comparison analysis: found {len(no_ocr_arts)} completed no-OCR artefacts ({len(no_ocr_pending)} pending), {len(ocr_arts)} completed OCR artefacts ({len(ocr_pending)} pending), expected_parts={expected_parts}") - - # Enhanced validation with progress-aware retry logic - if expected_parts is not None: - # We know how many parts to expect, so wait for all of them - total_no_ocr = len(no_ocr_arts) + len(no_ocr_pending) - total_ocr = len(ocr_arts) + len(ocr_pending) - - # Calculate completion percentages - no_ocr_completion = len(no_ocr_arts) / expected_parts * 100 - ocr_completion = len(ocr_arts) / expected_parts * 100 - - # Check if we're making progress (store in task metadata for persistence) - progress_key = f"comparison_progress_{file_id}" - current_progress = { - 'no_ocr_completed': len(no_ocr_arts), - 'ocr_completed': len(ocr_arts), - 'no_ocr_pending': len(no_ocr_pending), - 'ocr_pending': len(ocr_pending) - } - - # Get previous progress from payload (injected by retry mechanism) - previous_progress = payload.get('previous_progress', {'no_ocr_completed': 0, 'ocr_completed': 0}) - progress_made = (current_progress['no_ocr_completed'] > previous_progress['no_ocr_completed'] or - current_progress['ocr_completed'] > previous_progress['ocr_completed']) - - if len(no_ocr_arts) < expected_parts or len(ocr_arts) < expected_parts: - if len(no_ocr_pending) > 0 or len(ocr_pending) > 0: - # Still processing - this is expected, always retry - error_msg = f"PROGRESS_RETRY: expected={expected_parts}, no_ocr={len(no_ocr_arts)}/{expected_parts} ({no_ocr_completion:.1f}%), ocr={len(ocr_arts)}/{expected_parts} ({ocr_completion:.1f}%), pending: no_ocr={len(no_ocr_pending)}, ocr={len(ocr_pending)}" - progress_retry_error = ValueError(error_msg) - progress_retry_error.current_progress = current_progress - progress_retry_error.is_progress_retry = True - raise progress_retry_error - elif progress_made: - # No pending but made progress since last check - likely brief gap between completions - error_msg = f"PROGRESS_MADE_RETRY: expected={expected_parts}, no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)}, progress since last check" - progress_retry_error = ValueError(error_msg) - progress_retry_error.current_progress = current_progress - progress_retry_error.is_progress_retry = True - raise progress_retry_error - else: - # No progress and no pending - likely stalled, but still retry with backoff - error_msg = f"STALLED_RETRY: expected={expected_parts}, no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - no pending tasks but will retry" - stalled_retry_error = ValueError(error_msg) - stalled_retry_error.current_progress = current_progress - stalled_retry_error.is_stalled_retry = True - raise stalled_retry_error - - # Also verify both groups have the same number of completed parts - if len(no_ocr_arts) != len(ocr_arts): - error_msg = f"ALIGNMENT_RETRY: no_ocr completed={len(no_ocr_arts)}, ocr completed={len(ocr_arts)} (expected {expected_parts} each) - waiting for alignment" - alignment_retry_error = ValueError(error_msg) - alignment_retry_error.current_progress = current_progress - alignment_retry_error.is_alignment_retry = True - raise alignment_retry_error - - else: - # Fallback to original logic when split_total not available - if not no_ocr_arts or not ocr_arts: - # More detailed retry logic with pending artefact awareness - if len(no_ocr_arts) == 0 and len(ocr_arts) == 0: - if len(no_ocr_pending) > 0 or len(ocr_pending) > 0: - raise ValueError(f"Batches still processing: no_ocr completed={len(no_ocr_arts)} pending={len(no_ocr_pending)}, ocr completed={len(ocr_arts)} pending={len(ocr_pending)} - will retry") - else: - raise ValueError(f"No artefacts found for either group: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - may need more time") - elif len(ocr_arts) == 0: - if len(ocr_pending) > 0: - raise ValueError(f"OCR batch still processing: no_ocr completed={len(no_ocr_arts)}, ocr completed={len(ocr_arts)} pending={len(ocr_pending)} - will retry") - else: - raise ValueError(f"OCR batch appears incomplete: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - will retry") - elif len(no_ocr_arts) == 0: - if len(no_ocr_pending) > 0: - raise ValueError(f"No-OCR batch still processing: no_ocr completed={len(no_ocr_arts)} pending={len(no_ocr_pending)}, ocr completed={len(ocr_arts)} - will retry") - else: - raise ValueError(f"No-OCR batch appears incomplete: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - will retry") - else: - raise ValueError(f"Unexpected missing artefacts: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)}") - - # For fallback case, ensure both groups have same count - if len(no_ocr_arts) != len(ocr_arts): - raise ValueError(f"Mismatched group sizes: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - will retry") - - # Sort both groups by split_order for aligned comparison - no_ocr_arts.sort(key=lambda x: ((x.get('extra') or {}).get('split_order') or 0)) - ocr_arts.sort(key=lambda x: ((x.get('extra') or {}).get('split_order') or 0)) - - # Log final validation before proceeding - no_ocr_orders = [((a.get('extra') or {}).get('split_order') or 0) for a in no_ocr_arts] - ocr_orders = [((a.get('extra') or {}).get('split_order') or 0) for a in ocr_arts] - logger.info(f"Proceeding with comparison: no_ocr split_orders={no_ocr_orders}, ocr split_orders={ocr_orders}, expected_parts={expected_parts}") - - # Create comparison results - comparison_results = self._compare_docling_groups( - file_id, bucket, cabinet_id, no_ocr_arts, ocr_arts, comparison_type, - no_ocr_group_id, ocr_group_id, payload - ) - - return comparison_results - - except Exception as e: - logger.error(f"Comparison analysis failed for file {file_id}: {e}") - raise - - def _compare_docling_groups(self, file_id: str, bucket: str, cabinet_id: str, - no_ocr_arts: list, ocr_arts: list, comparison_type: str, - no_ocr_group_id: str, ocr_group_id: str, payload: Dict[str, Any]) -> Dict[str, Any]: - """Compare two groups of docling artefacts and generate analysis.""" - import subprocess - import tempfile - import json - import uuid - - logger.info(f"Starting detailed comparison for file {file_id}: {len(no_ocr_arts)} vs {len(ocr_arts)} artefacts") - - artefact_id = str(uuid.uuid4()) - comparison_dir = f"{cabinet_id}/{file_id}/{artefact_id}" - results = [] - overall_stats = { - 'total_comparisons': min(len(no_ocr_arts), len(ocr_arts)), - 'successful_comparisons': 0, - 'failed_comparisons': 0, - 'differences_found': 0, - 'identical_count': 0 - } - - try: - with tempfile.TemporaryDirectory() as temp_dir: - for i in range(min(len(no_ocr_arts), len(ocr_arts))): - no_ocr_art = no_ocr_arts[i] - ocr_art = ocr_arts[i] - - try: - # Download manifest JSONs for both artefacts - no_ocr_manifest_path = ((no_ocr_art.get('extra') or {}).get('manifest')) - ocr_manifest_path = ((ocr_art.get('extra') or {}).get('manifest')) - - if not no_ocr_manifest_path or not ocr_manifest_path: - logger.warning(f"Missing manifest paths for comparison {i+1}") - continue - - no_ocr_manifest_data = self.storage.download_file(bucket, no_ocr_manifest_path) - ocr_manifest_data = self.storage.download_file(bucket, ocr_manifest_path) - - no_ocr_manifest = json.loads(no_ocr_manifest_data.decode('utf-8')) - ocr_manifest = json.loads(ocr_manifest_data.decode('utf-8')) - - # Compare JSON content if available - no_ocr_json_path = no_ocr_manifest.get('json_full') - ocr_json_path = ocr_manifest.get('json_full') - - if no_ocr_json_path and ocr_json_path: - comparison_result = self._compare_json_content( - bucket, no_ocr_json_path, ocr_json_path, temp_dir, i + 1 - ) - - comparison_result['no_ocr_artefact_id'] = no_ocr_art['id'] - comparison_result['ocr_artefact_id'] = ocr_art['id'] - comparison_result['split_order'] = (no_ocr_art.get('extra') or {}).get('split_order', i + 1) - comparison_result['split_heading'] = (no_ocr_art.get('extra') or {}).get('split_heading', f'Part {i+1}') - - results.append(comparison_result) - - overall_stats['successful_comparisons'] += 1 - if comparison_result['has_differences']: - overall_stats['differences_found'] += 1 - else: - overall_stats['identical_count'] += 1 - - else: - logger.warning(f"Missing JSON content paths for comparison {i+1}") - overall_stats['failed_comparisons'] += 1 - - except Exception as part_e: - logger.warning(f"Failed to compare part {i+1}: {part_e}") - overall_stats['failed_comparisons'] += 1 - continue - - # Create comprehensive comparison report - comparison_report = { - 'file_id': file_id, - 'comparison_type': comparison_type, - 'timestamp': json.dumps({"created_at": "now()"}, default=str), - 'overall_statistics': overall_stats, - 'detailed_results': results, - 'summary': { - 'total_parts_compared': overall_stats['successful_comparisons'], - 'identical_parts': overall_stats['identical_count'], - 'different_parts': overall_stats['differences_found'], - 'accuracy_percentage': (overall_stats['identical_count'] / max(overall_stats['successful_comparisons'], 1)) * 100 - } - } - - # Store comparison report as artefact - report_path = f"{comparison_dir}/comparison_report.json" - report_json = json.dumps(comparison_report, ensure_ascii=False, indent=2) - - self.storage.upload_file(bucket, report_path, report_json.encode('utf-8'), 'application/json', upsert=True) - - # Create artefact record - client = SupabaseServiceRoleClient() - client.supabase.table('document_artefacts').insert({ - 'id': artefact_id, - 'file_id': file_id, - 'type': 'docling_comparison_analysis', - 'rel_path': report_path, - 'extra': { - 'comparison_type': comparison_type, - 'no_ocr_group_id': no_ocr_group_id, - 'ocr_group_id': ocr_group_id, - 'producer': payload.get('producer', 'auto_split'), - 'total_comparisons': overall_stats['total_comparisons'], - 'successful_comparisons': overall_stats['successful_comparisons'], - 'differences_found': overall_stats['differences_found'], - 'accuracy_percentage': comparison_report['summary']['accuracy_percentage'] - }, - 'status': 'completed' - }).execute() - - logger.info(f"Comparison analysis completed for file {file_id}: {overall_stats['successful_comparisons']} comparisons, {overall_stats['differences_found']} differences found") - - # Trigger VLM processing after comparison completes (if enabled) - self._trigger_vlm_after_comparison(file_id, payload) - - return { - 'artefact_id': artefact_id, - 'comparisons_completed': overall_stats['successful_comparisons'], - 'differences_found': overall_stats['differences_found'], - 'accuracy_percentage': comparison_report['summary']['accuracy_percentage'] - } - - except Exception as e: - logger.error(f"Failed to create comparison analysis for file {file_id}: {e}") - raise - - def _compare_json_content(self, bucket: str, no_ocr_path: str, ocr_path: str, - temp_dir: str, part_number: int) -> Dict[str, Any]: - """Compare JSON content using jq and diff as suggested in web search results.""" - import subprocess - import os - from pathlib import Path - - try: - # Download both JSON files - no_ocr_data = self.storage.download_file(bucket, no_ocr_path) - ocr_data = self.storage.download_file(bucket, ocr_path) - - # Save to temp files - no_ocr_file = Path(temp_dir) / f'no_ocr_part_{part_number}.json' - ocr_file = Path(temp_dir) / f'ocr_part_{part_number}.json' - - with open(no_ocr_file, 'wb') as f: - f.write(no_ocr_data) - with open(ocr_file, 'wb') as f: - f.write(ocr_data) - - # Use jq to sort and format both files for comparison (as suggested in web search results) - sorted_no_ocr = Path(temp_dir) / f'sorted_no_ocr_part_{part_number}.json' - sorted_ocr = Path(temp_dir) / f'sorted_ocr_part_{part_number}.json' - - # Sort both files using jq - subprocess.run(['jq', '--sort-keys', '.', str(no_ocr_file)], - stdout=open(sorted_no_ocr, 'w'), stderr=subprocess.DEVNULL, check=True) - subprocess.run(['jq', '--sort-keys', '.', str(ocr_file)], - stdout=open(sorted_ocr, 'w'), stderr=subprocess.DEVNULL, check=True) - - # Compare using diff - diff_output = Path(temp_dir) / f'diff_part_{part_number}.txt' - diff_result = subprocess.run( - ['diff', '-u', str(sorted_no_ocr), str(sorted_ocr)], - stdout=open(diff_output, 'w'), - stderr=subprocess.DEVNULL, - text=True - ) - - # Read diff output - with open(diff_output, 'r') as f: - diff_content = f.read() - - # Analyze differences - has_differences = diff_result.returncode != 0 - diff_lines = len([l for l in diff_content.split('\n') if l.startswith(('+', '-')) and not l.startswith(('+++', '---'))]) - - return { - 'part_number': part_number, - 'has_differences': has_differences, - 'diff_lines_count': diff_lines, - 'diff_content_preview': diff_content[:1000] if diff_content else '', # First 1000 chars - 'no_ocr_size': len(no_ocr_data), - 'ocr_size': len(ocr_data), - 'size_difference': abs(len(ocr_data) - len(no_ocr_data)) - } - - except subprocess.CalledProcessError as e: - logger.warning(f"jq/diff command failed for part {part_number}: {e}") - return { - 'part_number': part_number, - 'has_differences': True, - 'error': f"Comparison tools failed: {str(e)}", - 'diff_lines_count': -1 - } - except Exception as e: - logger.warning(f"JSON comparison failed for part {part_number}: {e}") - return { - 'part_number': part_number, - 'has_differences': True, - 'error': f"Comparison failed: {str(e)}", - 'diff_lines_count': -1 - } - - def process_vlm_section_page_bundle_task(self, task: QueueTask) -> Dict[str, Any]: - """Process VLM section page bundle task - create individual page bundles and combine them.""" - file_id = task.file_id - payload = task.payload - - logger.info(f"Processing VLM section page bundle task for file {file_id}") - - try: - section_idx = payload.get('section_idx') - start_page = payload.get('start_page') - end_page = payload.get('end_page') - section_title = payload.get('section_title', f'Section {section_idx}') - vlm_group_id = payload.get('vlm_group_id') - vlm_model = payload.get('vlm_model', 'smoldocling') - base_config = payload.get('base_config', {}) - total_sections = payload.get('total_sections', 1) - - client = SupabaseServiceRoleClient() - - # Get file info - fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute() - if not fr.data: - raise ValueError(f"File {file_id} not found") - - file_row = fr.data - bucket = file_row['bucket'] - cabinet_id = file_row['cabinet_id'] - - # Find processing path (prefer converted PDF) - arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or [] - pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None) - processing_path = pdf_art['rel_path'] if pdf_art else file_row['path'] - processing_mime = 'application/pdf' - - logger.info(f"VLM section bundle: processing section {section_idx} '{section_title}' pages {start_page}-{end_page} for file {file_id}") - - # Create individual page processing tasks - page_task_ids = [] - for page_num in range(start_page, end_page + 1): - try: - page_config = { - **base_config, - 'do_ocr': False, - 'force_ocr': False, - 'pipeline': 'vlm', - 'vlm_pipeline_model': vlm_model, - 'page_range': [page_num, page_num], - 'target_type': 'zip', - 'image_export_mode': 'referenced', - # Add required VLM parameters that may be missing - 'do_picture_classification': False, - 'do_picture_description': False - } - - logger.debug(f"VLM page {page_num} config: pipeline={page_config.get('pipeline')}, model={page_config.get('vlm_pipeline_model')}, range={page_config.get('page_range')}") - - from modules.queue_system import enqueue_docling_task, TaskPriority - - page_task_id = enqueue_docling_task( - file_id=file_id, - task_type='canonical_docling_json', - payload={ - 'bucket': bucket, - 'file_path': processing_path, - 'cabinet_id': cabinet_id, - 'mime_type': processing_mime, - 'config': page_config, - 'artefact_extra': { - 'is_subdoc': True, - 'page_range': [page_num, page_num], - 'label': f'{section_title} - Page {page_num}', - 'vlm_section_idx': section_idx, - 'vlm_section_title': section_title, - 'vlm_page_number': page_num, - 'vlm_section_start': start_page, - 'vlm_section_end': end_page, - 'producer': 'auto_split_vlm_page' - } - }, - priority=TaskPriority.NORMAL, - timeout=1800 - ) - - page_task_ids.append((page_num, page_task_id)) - logger.debug(f"Enqueued VLM page task {page_task_id} for page {page_num} of section {section_idx}") - - except Exception as page_e: - logger.warning(f"Failed to enqueue VLM page {page_num} for section {section_idx} file {file_id}: {page_e}") - continue - - if not page_task_ids: - raise ValueError(f"No page tasks could be enqueued for section {section_idx}") - - # Wait for all page tasks to complete and then create section bundle - logger.info(f"Enqueued {len(page_task_ids)} VLM page tasks for section {section_idx}, now waiting for completion...") - - # Create a follow-up task to bundle the completed page results - from modules.queue_system import enqueue_docling_task, TaskPriority - import time - - # Wait a bit for page tasks to start, then create bundle task - time.sleep(10) - - bundle_task_id = enqueue_docling_task( - file_id=file_id, - task_type='vlm_section_bundle_collector', - payload={ - 'section_idx': section_idx, - 'start_page': start_page, - 'end_page': end_page, - 'section_title': section_title, - 'vlm_group_id': vlm_group_id, - 'vlm_model': vlm_model, - 'total_sections': total_sections, - 'producer': 'auto_split', - 'page_task_ids': [tid for _, tid in page_task_ids], - 'expected_pages': list(range(start_page, end_page + 1)) - }, - priority=TaskPriority.LOW, # Run after page tasks - timeout=3600 - ) - - logger.info(f"Created VLM section bundle collector task {bundle_task_id} for section {section_idx}") - - return { - 'section_idx': section_idx, - 'page_tasks_created': len(page_task_ids), - 'bundle_task_id': bundle_task_id, - 'pages_range': f"{start_page}-{end_page}" - } - - except Exception as e: - logger.error(f"VLM section page bundle task failed for file {file_id}: {e}") - raise - - def process_vlm_section_bundle_collector_task(self, task: QueueTask) -> Dict[str, Any]: - """Collect completed VLM page results and create section-level bundle manifest.""" - file_id = task.file_id - payload = task.payload - - logger.info(f"Processing VLM section bundle collector for file {file_id}") - - try: - section_idx = payload.get('section_idx') - start_page = payload.get('start_page') - end_page = payload.get('end_page') - section_title = payload.get('section_title', f'Section {section_idx}') - vlm_group_id = payload.get('vlm_group_id') - vlm_model = payload.get('vlm_model', 'smoldocling') - total_sections = payload.get('total_sections', 1) - expected_pages = payload.get('expected_pages', []) - - client = SupabaseServiceRoleClient() - - # Get file info - fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute() - if not fr.data: - raise ValueError(f"File {file_id} not found") - - file_row = fr.data - bucket = file_row['bucket'] - cabinet_id = file_row['cabinet_id'] - - # Find all completed VLM page artefacts for this section - artefacts = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute() - arts = artefacts.data or [] - - # Filter for this section's VLM page artefacts - section_page_arts = [] - for art in arts: - extra = art.get('extra', {}) - if (extra.get('vlm_section_idx') == section_idx and - extra.get('producer') == 'auto_split_vlm_page' and - art.get('type') == 'docling_vlm' and - art.get('status') == 'completed'): - section_page_arts.append(art) - - # Check if we have all expected pages - found_pages = [art.get('extra', {}).get('vlm_page_number') for art in section_page_arts] - found_pages = [p for p in found_pages if p is not None] - missing_pages = [p for p in expected_pages if p not in found_pages] - - logger.info(f"VLM section {section_idx} bundle collector: found {len(section_page_arts)} page artefacts, expected {len(expected_pages)} pages") - - if logger.isEnabledFor(10): # DEBUG level - found_pages_debug = [art.get('extra', {}).get('vlm_page_number') for art in section_page_arts] - logger.debug(f"VLM section {section_idx}: found pages {found_pages_debug}, expected pages {expected_pages}") - - if missing_pages: - # Not all pages are ready, retry later - logger.info(f"VLM section {section_idx} bundle collector: missing pages {missing_pages}, found pages {found_pages} - will retry later") - raise ValueError(f"VLM section {section_idx} missing pages: {missing_pages} (found: {found_pages}) - will retry") - - # Sort page artefacts by page number - section_page_arts.sort(key=lambda x: x.get('extra', {}).get('vlm_page_number', 0)) - - logger.info(f"VLM section {section_idx} bundle: creating manifest for {len(section_page_arts)} pages") - - # Create section bundle manifest - section_artefact_id = str(uuid.uuid4()) - section_manifest_path = f"{cabinet_id}/{file_id}/{section_artefact_id}/vlm_section_{section_idx}_manifest.json" - - page_bundles = [] - for page_art in section_page_arts: - extra = page_art.get('extra', {}) - page_num = extra.get('vlm_page_number') - page_manifest_path = extra.get('manifest') - - page_bundles.append({ - 'page_number': page_num, - 'artefact_id': page_art['id'], - 'manifest_path': page_manifest_path, - 'rel_path': page_art['rel_path'], - 'label': extra.get('label', f'Page {page_num}') - }) - - section_manifest = { - 'file_id': file_id, - 'section_idx': section_idx, - 'section_title': section_title, - 'start_page': start_page, - 'end_page': end_page, - 'vlm_model': vlm_model, - 'total_pages': len(page_bundles), - 'page_bundles': page_bundles, - 'created_at': 'now()', - 'type': 'vlm_section_page_bundle' - } - - # Store section manifest - import json - manifest_json = json.dumps(section_manifest, ensure_ascii=False, indent=2) - self.storage.upload_file(bucket, section_manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True) - - # Create section bundle artefact - client.supabase.table('document_artefacts').insert({ - 'id': section_artefact_id, - 'file_id': file_id, - 'type': 'vlm_section_page_bundle', - 'rel_path': section_manifest_path, - 'extra': { - 'section_idx': section_idx, - 'section_title': section_title, - 'start_page': start_page, - 'end_page': end_page, - 'vlm_model': vlm_model, - 'total_pages': len(page_bundles), - 'group_id': vlm_group_id, - 'split_order': section_idx, - 'split_heading': section_title, - 'split_total': total_sections, - 'pipeline': 'vlm', - 'producer': 'auto_split', - 'group_pack_type': 'vlm_page_bundle_auto_split' - }, - 'status': 'completed' - }).execute() - - logger.info(f"VLM section bundle collector completed for section {section_idx} of file {file_id}: created manifest with {len(page_bundles)} page bundles") - - return { - 'section_artefact_id': section_artefact_id, - 'section_idx': section_idx, - 'pages_bundled': len(page_bundles), - 'manifest_path': section_manifest_path - } - - except Exception as e: - logger.error(f"VLM section bundle collector failed for file {file_id}: {e}") - raise - - def _trigger_vlm_after_comparison(self, file_id: str, comparison_payload: Dict[str, Any]): - """Trigger VLM processing after comparison analysis completes.""" - try: - # Check if VLM should be triggered - if not comparison_payload.get('trigger_vlm_after_comparison'): - logger.debug(f"VLM post-comparison trigger not enabled for file {file_id}") - return - - vlm_config = comparison_payload.get('vlm_config', {}) - if not vlm_config.get('enabled'): - logger.debug(f"VLM not enabled for file {file_id}") - return - - logger.info(f"[auto-canonical] Triggering VLM processing after comparison for file {file_id}") - - # Extract VLM configuration - split_by_page = vlm_config.get('split_by_page', False) - vlm_model = vlm_config.get('model', 'smoldocling') - threshold = vlm_config.get('threshold', 50) - base_config = vlm_config.get('base_config', {}) - - # Generate new group_id for VLM processing - import uuid - vlm_group_id = str(uuid.uuid4()) - - if split_by_page: - # Page-by-page processing within sections - logger.info(f"[auto-canonical] vlm page-by-page processing for file {file_id} (post-comparison)") - self._enqueue_vlm_page_processing( - file_id, threshold, vlm_group_id, vlm_model, base_config - ) - else: - # Standard section-level VLM processing - from routers.database.files.files import enqueue_canonical_docling - - body_vlm = { - 'use_split_map': True, - 'threshold': threshold, - 'producer': 'auto_split', - 'group_id': vlm_group_id, - 'config': { - **base_config, - 'do_ocr': False, # VLM doesn't need OCR - 'force_ocr': False, - 'pipeline': 'vlm', - 'vlm_pipeline_model': vlm_model - } - } - logger.info(f"[auto-canonical] vlm section batch group_id={vlm_group_id} for file {file_id} (post-comparison)") - enqueue_canonical_docling(file_id=file_id, body=body_vlm) - - except Exception as e: - logger.warning(f"Failed to trigger VLM processing after comparison for file {file_id}: {e}") - - def process_docling_bundle_task(self, task: QueueTask) -> Dict[str, Any]: - """ - Process single docling bundle task (whole document processing). - - This creates a coherent single bundle with all formats using direct processing. - NO temporary tasks or old logic reuse - this is the new architecture. - """ - file_id = task.file_id - payload = task.payload - - logger.info(f"🎯 NEW ARCHITECTURE: Processing docling bundle task for file {file_id} (whole document)") - - try: - # Extract bundle configuration - config = payload.get('config', {}) - bundle_metadata = payload.get('bundle_metadata', {}) - - # Ensure bundle processing configuration - config['target_type'] = 'zip' - config['to_formats'] = ['json', 'html', 'text', 'md', 'doctags'] - - # Call the actual docling processing directly - NO temp tasks! - result = self._process_docling_bundle_direct(task, config, bundle_metadata) - - logger.info(f"✅ NEW ARCHITECTURE: Successfully processed docling bundle for file {file_id}") - return result - - except Exception as e: - logger.error(f"❌ NEW ARCHITECTURE: Docling bundle processing failed for file {file_id}: {e}") - raise - - def _process_docling_bundle_direct(self, task: QueueTask, config: Dict[str, Any], bundle_metadata: Dict[str, Any]) -> Dict[str, Any]: - """ - Direct docling bundle processing - NEW ARCHITECTURE approach. - - This processes the docling request directly without creating temporary tasks, - ensuring clean Redis state and proper bundle metadata handling. - """ - file_id = task.file_id - payload = task.payload - - logger.info(f"🔧 DIRECT PROCESSING: Starting docling bundle processing for file {file_id}") - - if not self.docling_url: - raise ValueError("DOCLING_URL not configured") - - # Extract payload data - bucket = payload['bucket'] - file_path = payload['file_path'] - cabinet_id = payload['cabinet_id'] - - # Download file - logger.debug(f"📥 DIRECT PROCESSING: Downloading file for bundle processing: {bucket}/{file_path}") - file_bytes = self.storage.download_file(bucket, file_path) - - # Prepare Docling request with bundle-specific config - docling_api_key = os.getenv('DOCLING_API_KEY') - headers = {'Accept': '*/*'} - if docling_api_key: - headers['X-Api-Key'] = docling_api_key - - # Build form data for bundle processing - USE CONFIG FROM PIPELINE_CONTROLLER (no hardcoded defaults!) - # The config passed from pipeline_controller already has environment variables loaded - form_data = [ - ('target_type', 'zip'), # Always zip for bundles - ('do_ocr', str(config.get('do_ocr', False)).lower()), - ('force_ocr', str(config.get('force_ocr', False)).lower()), - ('image_export_mode', 'referenced'), # Bundle standard - ('ocr_engine', config.get('ocr_engine', 'easyocr')), - ('pdf_backend', config.get('pdf_backend', 'dlparse_v4')), - ('table_mode', config.get('table_mode', 'fast')), # Use config from pipeline_controller (env vars) - ('table_cell_matching', str(config.get('table_cell_matching', True)).lower()), # Use config from pipeline_controller (env: true) - ('pipeline', config.get('pipeline', 'standard')), - ('do_formula_enrichment', str(config.get('do_formula_enrichment', True)).lower()), # Use config from pipeline_controller (env: true) - ('do_code_enrichment', str(config.get('do_code_enrichment', True)).lower()), # Use config from pipeline_controller (env: true) - ('do_table_structure', str(config.get('do_table_structure', True)).lower()), - ('include_images', str(config.get('include_images', True)).lower()), - ('images_scale', str(config.get('images_scale', 2.0))), - ('do_picture_classification', str(config.get('do_picture_classification', False)).lower()), - ('do_picture_description', str(config.get('do_picture_description', False)).lower()), - ('document_timeout', str(config.get('document_timeout', task.timeout))) - ] - - # Handle OCR languages as array (API expects multiple form fields) - ocr_lang = config.get('ocr_lang') - if ocr_lang: - if isinstance(ocr_lang, list): - for lang in ocr_lang: - form_data.append(('ocr_lang', str(lang))) - else: - form_data.append(('ocr_lang', str(ocr_lang))) - - # Handle VLM pipeline options (CRITICAL for VLM processing) - if config.get('vlm_pipeline_model'): - form_data.append(('vlm_pipeline_model', config.get('vlm_pipeline_model'))) - - # VLM model local/API options must be JSON per Docling OpenAPI spec - if config.get('vlm_pipeline_model_local'): - vlm_local = config.get('vlm_pipeline_model_local') - if isinstance(vlm_local, (dict, list)): - form_data.append(('vlm_pipeline_model_local', json.dumps(vlm_local))) - elif isinstance(vlm_local, str) and vlm_local.strip().startswith(('{', '[')): - form_data.append(('vlm_pipeline_model_local', vlm_local)) - # else: omit to avoid validation error - - if config.get('vlm_pipeline_model_api'): - vlm_api = config.get('vlm_pipeline_model_api') - if isinstance(vlm_api, (dict, list)): - form_data.append(('vlm_pipeline_model_api', json.dumps(vlm_api))) - elif isinstance(vlm_api, str) and vlm_api.strip().startswith(('{', '[')): - form_data.append(('vlm_pipeline_model_api', vlm_api)) - # else: omit - - # Picture description options must be JSON per Docling OpenAPI spec - if config.get('picture_description_local'): - pic_local = config.get('picture_description_local') - if isinstance(pic_local, (dict, list)): - form_data.append(('picture_description_local', json.dumps(pic_local))) - elif isinstance(pic_local, str) and pic_local.strip().startswith(('{', '[')): - form_data.append(('picture_description_local', pic_local)) - - if config.get('picture_description_api'): - pic_api = config.get('picture_description_api') - if isinstance(pic_api, (dict, list)): - form_data.append(('picture_description_api', json.dumps(pic_api))) - elif isinstance(pic_api, str) and pic_api.strip().startswith(('{', '[')): - form_data.append(('picture_description_api', pic_api)) - if 'picture_description_area_threshold' in config: - form_data.append(('picture_description_area_threshold', str(config.get('picture_description_area_threshold')))) - - # Handle markdown page break placeholder - if 'md_page_break_placeholder' in config: - form_data.append(('md_page_break_placeholder', config.get('md_page_break_placeholder'))) - - # Add formats - always all formats for bundles - for fmt in ['json', 'html', 'text', 'md', 'doctags']: - form_data.append(('to_formats', fmt)) - - # Handle page range properly - get actual PDF page count like frontmatter does - page_range = config.get('page_range', [1, 999999]) - if isinstance(page_range, list) and len(page_range) >= 2: - def _to_int_safe(v, default): - try: - return int(v) - except Exception: - return default - start_pg = _to_int_safe(page_range[0], 1) - end_pg = _to_int_safe(page_range[1], 999999) - if start_pg < 1: - start_pg = 1 - if end_pg < start_pg: - end_pg = start_pg - - # CRITICAL: Get actual PDF page count to prevent massive range - try: - import fitz # PyMuPDF - doc = fitz.open(stream=file_bytes, filetype='pdf') - pc = int(doc.page_count) - doc.close() - if pc > 0: - end_pg = min(end_pg, pc) # Clamp to actual page count! - start_pg = max(1, min(start_pg, pc)) - if end_pg < start_pg: - end_pg = start_pg - logger.info(f"📄 DIRECT PROCESSING: PDF has {pc} pages, using range {start_pg}-{end_pg}") - except Exception as e: - logger.warning(f"Could not determine PDF page count: {e}, using defaults") - - form_data.append(('page_range', str(start_pg))) - form_data.append(('page_range', str(end_pg))) - else: - # Fallback to single page if no range specified - form_data.append(('page_range', '1')) - form_data.append(('page_range', '1')) - - files = [('files', ('file', file_bytes, payload.get('mime_type', 'application/pdf')))] - - # DEBUG: Log the actual config being sent to Docling - config_debug = {key: value for key, value in form_data if key in ['table_mode', 'table_cell_matching', 'do_formula_enrichment', 'do_code_enrichment', 'do_ocr', 'pipeline']} - logger.info(f"🔧 DIRECT PROCESSING: Docling config being sent: {config_debug}") - - # Make the HTTP request - logger.info(f"🌐 DIRECT PROCESSING: Making HTTP request to Docling for file {file_id}") - try: - import time - start_time = time.time() - - response = requests.post( - f"{self.docling_url.rstrip('/')}/v1/convert/file", - files=files, - data=form_data, - headers=headers, - timeout=task.timeout - ) - response.raise_for_status() - - elapsed = time.time() - start_time - logger.info(f"⚡ DIRECT PROCESSING: Docling request completed in {elapsed:.2f}s for file {file_id}") - - except Exception as e: - logger.error(f"🌐 DIRECT PROCESSING: HTTP request failed for file {file_id}: {e}") - raise - - # Process response - should be ZIP for bundle - content_type = (response.headers.get('Content-Type') or '').lower() - is_zip_resp = ('zip' in content_type) or (response.content[:2] == b'PK') - - if not is_zip_resp: - raise ValueError(f"Expected ZIP response for bundle, got: {content_type}") - - # Process ZIP bundle and create artefacts - logger.info(f"📦 DIRECT PROCESSING: Processing ZIP bundle for file {file_id}") - result = self._process_docling_zip_bundle( - file_id=file_id, - bucket=bucket, - cabinet_id=cabinet_id, - zip_content=response.content, - bundle_metadata=bundle_metadata, - task_config=config - ) - - logger.info(f"✅ DIRECT PROCESSING: Bundle processing completed for file {file_id}") - return result - - def _create_bundle_display_metadata(self, bundle_type: str, title: str, index: int = None, - total: int = None, page_range: list = None) -> dict: - """ - Create consistent display metadata for bundle organization. - - This ensures all bundles have proper titles, ordering, and display names - for frontend organization and user-friendly presentation. - """ - metadata = { - 'title': title, - 'bundle_type': bundle_type - } - - if index is not None: - metadata['split_order'] = index - - if total is not None: - metadata['split_total'] = total - - if page_range: - metadata['page_range'] = page_range - metadata['page_count'] = page_range[1] - page_range[0] + 1 - - # Create display names based on bundle type - if bundle_type == 'page': - metadata['display_name'] = f"Page {page_range[0]}" if page_range else f"Page {index}" - metadata['bundle_label'] = f"Page {page_range[0]} Bundle" - metadata['sort_key'] = page_range[0] if page_range else index - elif bundle_type == 'section': - page_str = f" (p{page_range[0]}-{page_range[1]})" if page_range else "" - metadata['display_name'] = f"{index:02d}. {title}{page_str}" - metadata['bundle_label'] = f"{title} Bundle" - metadata['sort_key'] = index - elif bundle_type == 'chunk': - page_str = f" (p{page_range[0]}-{page_range[1]})" if page_range else "" - metadata['display_name'] = f"{index:02d}. {title}{page_str}" - metadata['bundle_label'] = f"{title} Bundle" - metadata['sort_key'] = index - else: - metadata['display_name'] = title - metadata['bundle_label'] = f"{title} Bundle" - metadata['sort_key'] = index or 0 - - return metadata - - def _process_docling_zip_bundle(self, file_id: str, bucket: str, cabinet_id: str, - zip_content: bytes, bundle_metadata: Dict[str, Any], - task_config: Dict[str, Any]) -> Dict[str, Any]: - """ - Process ZIP bundle response and create artefacts with proper bundle metadata. - - This is the NEW ARCHITECTURE approach for handling docling ZIP responses. - """ - import zipfile - import io - import uuid - import json - import time - - logger.info(f"📦 ZIP PROCESSING: Starting bundle extraction for file {file_id}") - - # Create bundle artefact structure - artefact_id = str(uuid.uuid4()) - base_dir = f"{cabinet_id}/{file_id}/{artefact_id}" - archive_path = f"{base_dir}/bundle.zip" - - # Save original archive - self.storage.upload_file(bucket, archive_path, zip_content, 'application/zip', upsert=True) - - # Extract ZIP contents - zf = zipfile.ZipFile(io.BytesIO(zip_content)) - entries = [] - file_paths = {} - - for entry in zf.filelist: - if entry.is_dir(): - continue - - entry_content = zf.read(entry) - entry_filename = entry.filename - rel_path = f"{base_dir}/{entry_filename}" - - # Determine MIME type - if entry_filename.endswith('.json'): - mime = 'application/json' - file_paths['json'] = rel_path - elif entry_filename.endswith('.html'): - mime = 'text/html' - file_paths['html'] = rel_path - elif entry_filename.endswith('.md'): - mime = 'text/markdown' - file_paths['md'] = rel_path - elif entry_filename.endswith('.txt'): - mime = 'text/plain' - file_paths['text'] = rel_path - elif entry_filename.endswith('.doctags'): - mime = 'application/json' - file_paths['doctags'] = rel_path - else: - mime = 'application/octet-stream' - - # Upload file - self.storage.upload_file(bucket, rel_path, entry_content, mime, upsert=True) - - entries.append({ - 'filename': entry_filename, - 'rel_path': rel_path, - 'mime_type': mime, - 'size': len(entry_content) - }) - - logger.debug(f"📄 ZIP PROCESSING: Extracted {entry_filename} -> {rel_path}") - - zf.close() - - # Create bundle manifest - manifest = { - 'bundle_id': artefact_id, - 'file_id': file_id, - 'bundle_type': 'docling_bundle', - 'processing_mode': 'whole_document', - 'created_at': time.time(), - 'archive_path': archive_path, - 'entries': entries, - 'file_paths': file_paths, - 'metadata': bundle_metadata, - 'config': task_config - } - - manifest_path = f"{base_dir}/manifest.json" - manifest_content = json.dumps(manifest, indent=2).encode('utf-8') - self.storage.upload_file(bucket, manifest_path, manifest_content, 'application/json', upsert=True) - - # Create database artefact with bundle metadata - artefact_extra = { - **bundle_metadata, - 'manifest': manifest_path, - 'archive_path': archive_path, - 'file_paths': file_paths, - 'entry_count': len(entries), - 'group_pack_type': 'whole' # Add proper pack type for whole document bundles - } - - self.client.supabase.table('document_artefacts').insert({ - 'id': artefact_id, - 'file_id': file_id, - 'page_number': 0, # Whole document - 'type': 'docling_bundle', - 'rel_path': base_dir, - 'size_tag': json.dumps(task_config), - 'language': 'en', - 'chunk_index': None, - 'extra': artefact_extra - }).execute() - - logger.info(f"✅ ZIP PROCESSING: Created bundle artefact {artefact_id} with {len(entries)} files for file {file_id}") - - return { - 'artefact_id': artefact_id, - 'rel_path': base_dir, - 'manifest_path': manifest_path, - 'archive_path': archive_path, - 'file_paths': file_paths, - 'entry_count': len(entries), - 'bundle_metadata': bundle_metadata - } - - def process_docling_bundle_split_task(self, task: QueueTask) -> Dict[str, Any]: - """ - Process split docling bundle task (multi-unit processing). - - This creates multiple sub-bundles and a master manifest based on processing mode. - """ - file_id = task.file_id - payload = task.payload - - logger.info(f"Processing docling bundle split task for file {file_id}") - - try: - processing_mode = payload.get('processing_mode', 'split_by_sections') - processing_data = payload.get('processing_data', {}) - config = payload.get('config', {}) - bundle_metadata = payload.get('bundle_metadata', {}) - - logger.info(f"Split bundle processing mode: {processing_mode}") - - if processing_mode == 'split_by_pages': - return self._process_split_by_pages(task, processing_data, config, bundle_metadata) - elif processing_mode == 'split_by_sections': - return self._process_split_by_sections(task, processing_data, config, bundle_metadata) - elif processing_mode == 'split_by_chunks': - return self._process_split_by_chunks(task, processing_data, config, bundle_metadata) - else: - raise ValueError(f"Unknown processing mode: {processing_mode}") - - except Exception as e: - logger.error(f"Docling bundle split processing failed for file {file_id}: {e}") - raise - - def _process_split_by_pages(self, task: QueueTask, processing_data: dict, - config: dict, bundle_metadata: dict) -> Dict[str, Any]: - """Process document by individual pages and create page bundles.""" - file_id = task.file_id - payload = task.payload - bucket = payload['bucket'] - file_path = payload['file_path'] - cabinet_id = payload['cabinet_id'] - mime_type = payload['mime_type'] - - pages = processing_data.get('pages', []) - logger.info(f"Processing {len(pages)} individual pages for file {file_id}") - - # Create master bundle directory - master_bundle_id = str(uuid.uuid4()) - master_dir = f"{cabinet_id}/{file_id}/{master_bundle_id}" - page_bundles = [] - - # Process each page as a separate bundle - for idx, page_num in enumerate(pages, 1): - try: - page_config = { - **config, - 'page_range': [page_num, page_num], - 'target_type': 'zip', - 'to_formats': ['json', 'html', 'text', 'md', 'doctags'] - } - - # Create descriptive page title and enhanced metadata - page_title = f"Page {page_num}" - page_display_name = f"Page {page_num}" - - # Create individual page task with enhanced labeling - page_task = QueueTask( - id=f"{task.id}_page_{page_num}", - file_id=file_id, - service=task.service, - task_type='canonical_docling_json', - payload={ - **payload, - 'config': page_config, - 'artefact_extra': { - 'page_number': page_num, - 'page_title': page_title, - 'display_name': page_display_name, - 'split_order': idx, # Sequential order within this bundle - 'split_total': len(pages), - 'split_heading': page_title, - 'section_title': page_title, # For consistency - 'is_page_bundle': True, - 'master_bundle_id': master_bundle_id, - 'bundle_label': f"Page {page_num} Bundle", - **bundle_metadata - } - }, - priority=task.priority, - timeout=1800, - created_at=task.created_at - ) - - # Process page bundle - page_result = self._process_docling_task(page_task) - page_bundles.append({ - 'page_number': page_num, - 'page_title': page_title, - 'display_name': page_display_name, - 'split_order': idx, - 'artefact_id': page_result.get('artefact_id'), - 'rel_path': page_result.get('rel_path') - }) - - except Exception as e: - logger.warning(f"Failed to process page {page_num} for file {file_id}: {e}") - continue - - # Sort page bundles by page number for consistent ordering - page_bundles.sort(key=lambda x: x['page_number']) - - # Create enhanced master manifest with proper organization metadata - master_manifest = { - 'file_id': file_id, - 'bundle_type': 'docling_bundle_split', - 'split_mode': 'split_by_pages', - 'total_pages': len(pages), - 'successful_pages': len(page_bundles), - 'page_bundles': page_bundles, - 'created_at': 'now()', - 'display_name': f"Document Pages ({len(page_bundles)} pages)", - 'organization': { - 'type': 'pages', - 'sort_field': 'page_number', - 'sort_order': 'asc', - 'grouping': 'individual_pages' - }, - **bundle_metadata - } - - # Store master manifest - manifest_path = f"{master_dir}/master_manifest.json" - manifest_json = json.dumps(master_manifest, ensure_ascii=False, indent=2) - self.storage.upload_file(bucket, manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True) - - # Create master bundle artefact - self.client.supabase.table('document_artefacts').insert({ - 'id': master_bundle_id, - 'file_id': file_id, - 'type': 'docling_bundle_split_pages', - 'rel_path': master_dir, - 'extra': { - 'manifest': manifest_path, - 'split_mode': 'split_by_pages', - 'total_pages': len(pages), - 'successful_pages': len(page_bundles), - 'group_pack_type': 'split_pages', # Add proper pack type for split page bundles - **bundle_metadata - }, - 'status': 'completed' - }).execute() - - logger.info(f"Created page-based split bundle for file {file_id}: {len(page_bundles)} pages") - return { - 'master_bundle_id': master_bundle_id, - 'pages_processed': len(page_bundles), - 'total_pages': len(pages) - } - - def _process_split_by_sections(self, task: QueueTask, processing_data: dict, - config: dict, bundle_metadata: dict) -> Dict[str, Any]: - """Process document by sections and create section bundles.""" - file_id = task.file_id - payload = task.payload - bucket = payload['bucket'] - cabinet_id = payload['cabinet_id'] - - entries = processing_data.get('entries', []) - logger.info(f"Processing {len(entries)} sections for file {file_id}") - - # Create master bundle directory - master_bundle_id = str(uuid.uuid4()) - master_dir = f"{cabinet_id}/{file_id}/{master_bundle_id}" - section_bundles = [] - - # Process each section as a separate bundle - logger.info(f"Processing {len(entries)} sections for file {file_id}") - for i, entry in enumerate(entries, 1): - try: - start_page = entry.get('start_page', 1) - end_page = entry.get('end_page', start_page) - # Enhanced section title handling with fallbacks and smart naming - raw_title = entry.get('title') or entry.get('label') or entry.get('heading') - section_title = raw_title.strip() if raw_title else f'Section {i}' - - # Create enhanced display names for better organization - page_range_str = f"p{start_page}" if start_page == end_page else f"p{start_page}-{end_page}" - display_name = f"{i:02d}. {section_title}" if raw_title else f"{i:02d}. Section {i} ({page_range_str})" - bundle_label = f"{section_title} Bundle" - - # Validate page ranges - if start_page < 1: - raise ValueError(f"Invalid start_page: {start_page} (must be >= 1)") - if end_page < start_page: - raise ValueError(f"Invalid page range: {start_page}-{end_page} (end < start)") - if start_page > 999 or end_page > 999: - raise ValueError(f"Suspicious page range: {start_page}-{end_page} (too high, possible corruption)") - - logger.info(f"Processing section {i}/{len(entries)}: '{display_name}' (pages {start_page}-{end_page})") - - section_config = { - **config, - 'page_range': [start_page, end_page], - 'target_type': 'zip', - 'to_formats': ['json', 'html', 'text', 'md', 'doctags'] - } - - # Create section task with enhanced metadata and labeling - section_task = QueueTask( - id=f"{task.id}_section_{i}", - file_id=file_id, - service=task.service, - task_type='canonical_docling_json', - payload={ - **payload, - 'config': section_config, - 'artefact_extra': { - 'section_number': i, - 'section_title': section_title, - 'display_name': display_name, - 'bundle_label': bundle_label, - 'start_page': start_page, - 'end_page': end_page, - 'page_range': [start_page, end_page], - 'page_count': end_page - start_page + 1, - 'split_order': i, # Preserved ordering from split map - 'split_total': len(entries), - 'split_heading': section_title, - 'is_section_bundle': True, - 'master_bundle_id': master_bundle_id, - **bundle_metadata - } - }, - priority=task.priority, - timeout=3600, - created_at=task.created_at - ) - - # Process section bundle - section_result = self._process_docling_task(section_task) - section_bundles.append({ - 'section_number': i, - 'section_title': section_title, - 'display_name': display_name, - 'bundle_label': bundle_label, - 'page_range': [start_page, end_page], - 'page_count': end_page - start_page + 1, - 'split_order': i, - 'artefact_id': section_result.get('artefact_id'), - 'rel_path': section_result.get('rel_path') - }) - - except Exception as e: - logger.error(f"FATAL: Failed to process section {i} for file {file_id}: {e}") - logger.error(f"Section details: title='{section_title}', pages={start_page}-{end_page}") - # Don't continue - fail the entire task if any section fails - raise Exception(f"Section processing failed for section {i} ('{section_title}', pages {start_page}-{end_page}): {e}") - - # Sort section bundles by split_order for consistent ordering - section_bundles.sort(key=lambda x: x['split_order']) - - # Create enhanced master manifest with proper organization metadata - master_manifest = { - 'file_id': file_id, - 'bundle_type': 'docling_bundle_split', - 'split_mode': 'split_by_sections', - 'total_sections': len(entries), - 'successful_sections': len(section_bundles), - 'section_bundles': section_bundles, - 'created_at': 'now()', - 'display_name': f"Document Sections ({len(section_bundles)} sections)", - 'organization': { - 'type': 'sections', - 'sort_field': 'split_order', - 'sort_order': 'asc', - 'grouping': 'split_map_sections', - 'has_titles': True, - 'ordering_preserved': True - }, - **bundle_metadata - } - - # Store master manifest - manifest_path = f"{master_dir}/master_manifest.json" - manifest_json = json.dumps(master_manifest, ensure_ascii=False, indent=2) - self.storage.upload_file(bucket, manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True) - - # Create master bundle artefact - self.client.supabase.table('document_artefacts').insert({ - 'id': master_bundle_id, - 'file_id': file_id, - 'type': 'docling_bundle_split_sections', - 'rel_path': master_dir, - 'extra': { - 'manifest': manifest_path, - 'split_mode': 'split_by_sections', - 'total_sections': len(entries), - 'successful_sections': len(section_bundles), - 'group_pack_type': 'split_sections', # Add proper pack type for split section bundles - **bundle_metadata - }, - 'status': 'completed' - }).execute() - - logger.info(f"Created section-based split bundle for file {file_id}: {len(section_bundles)} sections") - return { - 'master_bundle_id': master_bundle_id, - 'sections_processed': len(section_bundles), - 'total_sections': len(entries) - } - - def _process_split_by_chunks(self, task: QueueTask, processing_data: dict, - config: dict, bundle_metadata: dict) -> Dict[str, Any]: - """Process document by chunks and create chunk bundles.""" - # Very similar to _process_split_by_sections but with chunk-specific labeling - file_id = task.file_id - payload = task.payload - bucket = payload['bucket'] - cabinet_id = payload['cabinet_id'] - - chunks = processing_data.get('entries', []) - logger.info(f"Processing {len(chunks)} chunks for file {file_id}") - - # Create master bundle directory - master_bundle_id = str(uuid.uuid4()) - master_dir = f"{cabinet_id}/{file_id}/{master_bundle_id}" - chunk_bundles = [] - - # Process each chunk as a separate bundle - for i, chunk in enumerate(chunks, 1): - try: - start_page = chunk['start'] - end_page = chunk['end'] - # Enhanced chunk title handling - raw_title = chunk.get('title', f'Chunk {i}') - chunk_title = raw_title.strip() if raw_title else f'Chunk {i}' - - # Create enhanced display names for chunks - page_range_str = f"p{start_page}" if start_page == end_page else f"p{start_page}-{end_page}" - display_name = f"{i:02d}. {chunk_title} ({page_range_str})" - bundle_label = f"{chunk_title} Bundle" - - chunk_config = { - **config, - 'page_range': [start_page, end_page], - 'target_type': 'zip', - 'to_formats': ['json', 'html', 'text', 'md', 'doctags'] - } - - # Create chunk task with enhanced labeling - chunk_task = QueueTask( - id=f"{task.id}_chunk_{i}", - file_id=file_id, - service=task.service, - task_type='canonical_docling_json', - payload={ - **payload, - 'config': chunk_config, - 'artefact_extra': { - 'chunk_number': i, - 'chunk_title': chunk_title, - 'display_name': display_name, - 'bundle_label': bundle_label, - 'start_page': start_page, - 'end_page': end_page, - 'page_range': [start_page, end_page], - 'page_count': end_page - start_page + 1, - 'split_order': i, - 'split_total': len(chunks), - 'split_heading': chunk_title, - 'is_chunk_bundle': True, - 'master_bundle_id': master_bundle_id, - **bundle_metadata - } - }, - priority=task.priority, - timeout=3600, - created_at=task.created_at - ) - - # Process chunk bundle - chunk_result = self._process_docling_task(chunk_task) - chunk_bundles.append({ - 'chunk_number': i, - 'chunk_title': chunk_title, - 'display_name': display_name, - 'bundle_label': bundle_label, - 'page_range': [start_page, end_page], - 'page_count': end_page - start_page + 1, - 'split_order': i, - 'artefact_id': chunk_result.get('artefact_id'), - 'rel_path': chunk_result.get('rel_path') - }) - - except Exception as e: - logger.warning(f"Failed to process chunk {i} for file {file_id}: {e}") - continue - - # Create master manifest - master_manifest = { - 'file_id': file_id, - 'bundle_type': 'docling_bundle_split', - 'split_mode': 'split_by_chunks', - 'total_chunks': len(chunks), - 'successful_chunks': len(chunk_bundles), - 'chunk_bundles': chunk_bundles, - 'created_at': 'now()', - **bundle_metadata - } - - # Store master manifest - manifest_path = f"{master_dir}/master_manifest.json" - manifest_json = json.dumps(master_manifest, ensure_ascii=False, indent=2) - self.storage.upload_file(bucket, manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True) - - # Create master bundle artefact - self.client.supabase.table('document_artefacts').insert({ - 'id': master_bundle_id, - 'file_id': file_id, - 'type': 'docling_bundle_split_chunks', - 'rel_path': master_dir, - 'extra': { - 'manifest': manifest_path, - 'split_mode': 'split_by_chunks', - 'total_chunks': len(chunks), - 'successful_chunks': len(chunk_bundles), - 'group_pack_type': 'split_chunks', # Add proper pack type for split chunk bundles - **bundle_metadata - }, - 'status': 'completed' - }).execute() - - logger.info(f"Created chunk-based split bundle for file {file_id}: {len(chunk_bundles)} chunks") - return { - 'master_bundle_id': master_bundle_id, - 'chunks_processed': len(chunk_bundles), - 'total_chunks': len(chunks) - } - -# process_phase2_coordinator_task method removed - pipelines now enqueued directly from split_map task - -# _check_pipeline_group_completion method removed - task dependencies now handle sequential execution - -# Global processor instance -_processor_instance = None - -def get_processor() -> DocumentTaskProcessor: - """Get the global task processor instance.""" - global _processor_instance - if _processor_instance is None: - _processor_instance = DocumentTaskProcessor() - return _processor_instance