chore: add .env and *.bak to .gitignore, remove archive/ folder

This commit is contained in:
Classroom Copilot Dev 2026-02-23 21:14:43 +00:00
parent c92da048fd
commit 2436a00cac
8 changed files with 3 additions and 5825 deletions

3
.gitignore vendored
View File

@ -1,8 +1,11 @@
__pycache__
.pytest_cache
.env
.DS_Store
.archive/*
data/logs/*
*.bak

View File

@ -1,66 +0,0 @@
# Auto-Processing Code Archive
This directory contains the complex auto-processing system that was previously used for automatic document processing after file upload.
## Archived Components
### Core Processing Files
- `files_with_auto_processing.py` - Original files.py router with automatic processing
- `pipeline_controller.py` - Complex multi-phase pipeline orchestration
- `task_processors.py` - Document processing task handlers
### Advanced Queue Management (Created but not deployed)
- `memory_aware_queue.py` - Memory-based intelligent queue management
- `enhanced_upload_handler.py` - Advanced upload handler with queuing
- `enhanced_upload.py` - API endpoints for advanced upload system
## What This System Did
### Automatic Processing Pipeline
1. **File Upload** → Immediate processing trigger
2. **PDF Conversion** (synchronous, blocking)
3. **Phase 1**: Structure discovery (Tika, Page Images, Document Analysis, Split Map)
4. **Phase 2**: Docling processing (NO_OCR → OCR → VLM pipelines)
5. **Complex Dependencies**: Phase coordination, task sequencing
6. **Redis Queue Management**: Service limits, rate limits, dependency tracking
### Features
- Multi-phase processing pipelines
- Complex task dependency management
- Memory-aware queue limits
- Multi-user capacity management
- Real-time processing status
- WebSocket status updates
- Service-specific resource limits
- Task recovery on restart
## Why Archived
The system was overly complex for the current needs:
- **Complexity**: Multi-phase pipelines with complex dependencies
- **Blocking Operations**: Synchronous PDF conversion causing timeouts
- **Resource Management**: Over-engineered for single-user scenarios
- **User Experience**: Users had to wait for processing to complete
## New Simplified Approach
The new system focuses on:
- **Simple Upload**: Just store files and create database records
- **No Auto-Processing**: Users manually trigger processing when needed
- **Directory Support**: Upload entire folders with manifest tracking
- **Immediate Response**: Users get instant confirmation without waiting
## If You Need to Restore
To restore the auto-processing functionality:
1. Copy `files_with_auto_processing.py` back to `routers/database/files/files.py`
2. Ensure `pipeline_controller.py` and `task_processors.py` are in `modules/`
3. Update imports and dependencies
4. Re-enable background processing in upload handlers
## Migration Notes
The database schema and Redis structure remain compatible. The new simplified system can coexist with the archived processing logic if needed.
Date Archived: 2026-02-23 (taken from the commit metadata; the literal "$(date)" shell placeholder in the original template was never expanded)
Reason: Simplification for directory upload implementation

View File

@ -1,142 +0,0 @@
"""
Enhanced Upload Router with Memory-Aware Queuing
===============================================
Provides intelligent upload endpoints with capacity checking,
queue management, and real-time status updates.
Endpoints:
- POST /upload/check-capacity - Pre-check if upload is possible
- POST /upload/with-queue - Upload with intelligent queuing
- GET /upload/status/{file_id} - Get processing status
- GET /upload/queue-status - Get overall queue status
- WebSocket /ws/upload-status - Real-time status updates
"""
import os
import uuid
import json
import logging
from typing import Dict, List, Optional, Any
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, WebSocket, WebSocketDisconnect
from fastapi.responses import JSONResponse
from modules.auth.supabase_bearer import SupabaseBearer
from modules.enhanced_upload_handler import get_upload_handler
from modules.memory_aware_queue import get_memory_queue
from modules.logger_tool import initialise_logger
router = APIRouter()
auth = SupabaseBearer()
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
# WebSocket connection manager for real-time updates
class ConnectionManager:
    """Registry of open WebSocket connections keyed by file id.

    Used to push processing-status updates to every client watching a given
    file. The fastapi ``WebSocket`` type is referenced lazily (as a string
    annotation) so defining this class does not require evaluating the name.
    """

    def __init__(self):
        # file_id -> list of sockets currently subscribed to that file
        self.active_connections: Dict[str, List["WebSocket"]] = {}

    async def connect(self, websocket: "WebSocket", file_id: str):
        """Accept the socket and register it under *file_id*."""
        await websocket.accept()
        if file_id not in self.active_connections:
            self.active_connections[file_id] = []
        self.active_connections[file_id].append(websocket)

    def disconnect(self, websocket: "WebSocket", file_id: str):
        """Unregister the socket; drop the file entry once its list is empty."""
        if file_id in self.active_connections:
            self.active_connections[file_id].remove(websocket)
            if not self.active_connections[file_id]:
                del self.active_connections[file_id]

    async def broadcast_to_file(self, file_id: str, message: dict):
        """Send *message* to every socket watching *file_id*.

        Sockets whose send fails are pruned from the registry.
        """
        if file_id in self.active_connections:
            # Iterate over a copy: failed sends mutate the underlying list.
            for connection in self.active_connections[file_id].copy():
                try:
                    await connection.send_json(message)
                except Exception:
                    # was a bare `except:`, which also swallowed BaseExceptions
                    # such as asyncio.CancelledError — narrowed to Exception
                    self.active_connections[file_id].remove(connection)


manager = ConnectionManager()
@router.post("/upload/check-capacity")
async def check_upload_capacity(
    file_size: int = Form(...),
    mime_type: str = Form(...),
    payload: Dict[str, Any] = Depends(auth)
):
    """
    Check if user can upload a file of given size and type.
    Returns capacity information and recommendations.

    Responds 200 when the upload is eligible, 429 (Too Many Requests) when
    capacity is exhausted, and 500 on any unexpected error.
    """
    try:
        # `payload` is the decoded auth token; falls back to 'anonymous'
        # when neither claim is present.
        user_id = payload.get('sub') or payload.get('user_id', 'anonymous')
        # Determine environment
        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
        upload_handler = get_upload_handler(environment)
        # Delegates the actual capacity/quota decision to the handler.
        eligible, message, details = upload_handler.check_upload_eligibility(
            user_id, file_size, mime_type
        )
        response = {
            'eligible': eligible,
            'message': message,
            'details': details,
            # NOTE(review): `time` is not imported at the top of this module —
            # confirm it is bound before the first request reaches this handler.
            'timestamp': time.time()
        }
        status_code = 200 if eligible else 429  # Too Many Requests if not eligible
        logger.info(f"📋 Capacity check for user {user_id}: {eligible} - {message}")
        return JSONResponse(content=response, status_code=status_code)
    except Exception as e:
        logger.error(f"Capacity check error: {e}")
        raise HTTPException(status_code=500, detail=str(e))
# `time` and `asyncio` were previously imported at the very bottom of this
# module; import them here, ahead of first use.
import time
import asyncio


@router.post("/upload/with-queue")
async def upload_with_queue(
    cabinet_id: str = Form(...),
    path: str = Form(...),
    scope: str = Form(...),
    priority: int = Form(1),
    file: UploadFile = File(...),
    payload: Dict[str, Any] = Depends(auth)
):
    """
    Upload file with intelligent queuing and capacity management.
    Returns queue information and processing status.

    When UPLOAD_QUEUE_ENABLED is off this responds 501 — the legacy
    immediate-processing path is not implemented in this endpoint.
    """
    try:
        user_id = payload.get('sub') or payload.get('user_id', 'anonymous')
        # Read file content
        file_bytes = await file.read()
        file_size = len(file_bytes)
        mime_type = file.content_type or 'application/octet-stream'
        filename = file.filename or path
        # fixed: log line previously rendered a lost placeholder instead of
        # the uploaded filename
        logger.info(f"📤 Upload request: {filename} ({file_size} bytes) for user {user_id}")
        # Determine environment
        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
        upload_handler = get_upload_handler(environment)
        # Check if upload queuing is enabled
        if os.getenv('UPLOAD_QUEUE_ENABLED', 'true').lower() == 'true':
            # Use new queue-based upload
            file_id = str(uuid.uuid4())
            result = await upload_handler.handle_upload_with_queue(
                file_id=file_id,
                user_id=user_id,
                filename=filename,
                file_bytes=file_bytes,
                mime_type=mime_type,
                cabinet_id=cabinet_id,
                priority=priority
            )
            return result
        else:
            # Fall back to immediate processing (legacy mode)
            logger.warning("Using legacy immediate processing mode")
            # TODO: Call original upload_file function
            raise HTTPException(status_code=501, detail="Legacy mode not implemented in this endpoint")
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Upload error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@router.get("/upload/status/{file_id}")
async def get_upload_status(
    file_id: str,
    payload: Dict[str, Any] = Depends(auth)
):
    """
    Get current processing status for an uploaded file.

    Raises 404 when the file id is unknown to the handler.
    """
    try:
        # Determine environment
        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
        upload_handler = get_upload_handler(environment)
        status = upload_handler.get_processing_status(file_id)
        if status.get('status') == 'not_found':
            raise HTTPException(status_code=404, detail="File not found")
        return status
    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Status check error for {file_id}: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@router.get("/upload/queue-status")
async def get_queue_status(
    payload: Dict[str, Any] = Depends(auth)
):
    """
    Get overall queue status and system capacity information.
    """
    try:
        # Determine environment
        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
        memory_queue = get_memory_queue(environment)
        system_status = memory_queue.get_system_status()
        return {
            'system_status': system_status,
            'timestamp': time.time(),
            'environment': environment
        }
    except Exception as e:
        logger.error(f"Queue status error: {e}")
        raise HTTPException(status_code=500, detail=str(e))


@router.websocket("/ws/upload-status/{file_id}")
async def websocket_upload_status(websocket: WebSocket, file_id: str):
    """
    WebSocket endpoint for real-time upload status updates.

    Polls the handler every 5 seconds and pushes the current status until
    the client disconnects.
    """
    await manager.connect(websocket, file_id)
    try:
        # Send initial status
        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
        upload_handler = get_upload_handler(environment)
        initial_status = upload_handler.get_processing_status(file_id)
        await websocket.send_json({
            'type': 'status_update',
            'data': initial_status
        })
        # Keep connection alive and listen for updates
        while True:
            # In a real implementation, you'd have a background task
            # that pushes updates when file status changes
            await asyncio.sleep(5)
            # Check for status updates
            current_status = upload_handler.get_processing_status(file_id)
            await websocket.send_json({
                'type': 'status_update',
                'data': current_status
            })
    except WebSocketDisconnect:
        manager.disconnect(websocket, file_id)
    except Exception as e:
        logger.error(f"WebSocket error for {file_id}: {e}")
        manager.disconnect(websocket, file_id)


# Background task to process upload queue
@router.on_event("startup")
async def start_queue_processor():
    """Start background queue processor."""
    if os.getenv('UPLOAD_QUEUE_ENABLED', 'true').lower() != 'true':
        logger.info("📋 Upload queue disabled, skipping queue processor")
        return
    environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
    upload_handler = get_upload_handler(environment)
    # Start background processor (asyncio is imported at the top of this
    # section; the redundant local import was removed)
    asyncio.create_task(upload_handler.process_queued_files("document_processor"))
    logger.info("🚀 Upload queue processor started")

View File

@ -1,362 +0,0 @@
"""
Enhanced Upload Handler with Memory-Aware Queuing
=================================================
Replaces the immediate processing model with intelligent queue management.
Provides user feedback about capacity, queue position, and processing status.
Features:
- Pre-upload capacity checking
- Memory-aware queuing with user quotas
- Real-time status updates via WebSocket/SSE
- Graceful degradation under load
- Fair queuing across multiple users
"""
import os
import uuid
import time
import logging
import asyncio
from typing import Dict, List, Optional, Any, Tuple
from fastapi import HTTPException, BackgroundTasks
from dataclasses import asdict
from .memory_aware_queue import get_memory_queue, QueuedFile
from .redis_manager import get_redis_manager
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
from modules.database.tools.storage.storage_admin import StorageAdmin
logger = logging.getLogger(__name__)
class EnhancedUploadHandler:
    """Enhanced upload handler with memory-aware queuing.

    Coordinates three stores: Supabase storage (file bytes), the `files`
    table (record + status), and Redis (live processing status). Note:
    `json` is used below but is imported at the bottom of this module —
    fragile placement, though bound before any request runs.
    """

    def __init__(self, environment: str = "dev"):
        self.memory_queue = get_memory_queue(environment)
        self.redis_manager = get_redis_manager(environment)
        self.redis_client = self.redis_manager.client
        # Processing status tracking (Redis key prefix for per-file status)
        self.processing_status_key = "file_processing_status"

    def check_upload_eligibility(self, user_id: str, file_size: int,
                                 mime_type: str) -> Tuple[bool, str, Dict[str, Any]]:
        """
        Check if user can upload a file right now.
        Returns:
            (eligible, message, details)
        """
        # Check system capacity — the queue makes the actual quota decision
        can_accept, message, queue_info = self.memory_queue.check_upload_capacity(
            user_id, file_size, mime_type
        )
        if not can_accept:
            return False, message, {
                'reason': 'capacity_exceeded',
                'queue_info': queue_info,
                'recommendations': self._get_recommendations(queue_info)
            }
        return True, message, {
            'status': 'ready_for_upload',
            'queue_info': queue_info,
            'processing_estimate': self._estimate_processing_time(file_size, mime_type)
        }

    async def handle_upload_with_queue(self, file_id: str, user_id: str,
                                       filename: str, file_bytes: bytes,
                                       mime_type: str, cabinet_id: str,
                                       priority: int = 1) -> Dict[str, Any]:
        """
        Handle file upload with intelligent queuing.
        Steps:
        1. Store file immediately (cheap operation)
        2. Add to processing queue
        3. Return queue status to user
        4. Process asynchronously when capacity available

        Raises HTTPException(500) if storage, the DB insert, or enqueue fail;
        on enqueue failure the stored file and DB row are rolled back.
        """
        # Store file immediately (this is fast)
        storage = StorageAdmin()
        client = SupabaseServiceRoleClient()
        # Create database record
        bucket = f"{cabinet_id}-files"  # or your bucket naming convention
        # NOTE(review): "(unknown)" below looks like a lost template
        # placeholder — most likely the filename segment; confirm against
        # repository history before relying on this path layout.
        storage_path = f"{cabinet_id}/{file_id}/(unknown)"
        try:
            # Store file
            storage.upload_file(bucket, storage_path, file_bytes, mime_type, upsert=True)
            # Create file record
            insert_res = client.supabase.table('files').insert({
                'id': file_id,
                'name': filename,
                'cabinet_id': cabinet_id,
                'bucket': bucket,
                'path': storage_path,
                'mime_type': mime_type,
                'uploaded_by': user_id,
                'size_bytes': len(file_bytes),
                'source': 'classroomcopilot-web',
                'status': 'queued_for_processing'  # New status
            }).execute()
            if not insert_res.data:
                # NOTE(review): raised inside the try, so the generic handler
                # below re-wraps it as "Storage failed: ..." — confirm intended.
                raise HTTPException(status_code=500, detail="Failed to create file record")
        except Exception as e:
            logger.error(f"Failed to store file {file_id}: {e}")
            raise HTTPException(status_code=500, detail=f"Storage failed: {str(e)}")
        # Add to processing queue
        try:
            queue_result = self.memory_queue.enqueue_file(
                file_id=file_id,
                user_id=user_id,
                filename=filename,
                size_bytes=len(file_bytes),
                mime_type=mime_type,
                cabinet_id=cabinet_id,
                priority=priority
            )
            # Update file status in database
            client.supabase.table('files').update({
                'status': 'queued_for_processing',
                'extra': {
                    'queue_position': queue_result['queue_position'],
                    'estimated_wait_seconds': queue_result['estimated_wait_seconds'],
                    'memory_estimate_mb': queue_result['memory_estimate_mb']
                }
            }).eq('id', file_id).execute()
            logger.info(f"📋 File {file_id} queued at position {queue_result['queue_position']}")
            return {
                'status': 'upload_successful',
                'message': 'File uploaded and queued for processing',
                'file_id': file_id,
                'queue_info': queue_result,
                'next_steps': {
                    'poll_status_endpoint': f'/database/files/{file_id}/processing-status',
                    'websocket_updates': f'/ws/file-processing/{file_id}'
                }
            }
        except Exception as e:
            logger.error(f"Failed to queue file {file_id}: {e}")
            # Clean up stored file (best-effort rollback of storage + DB row)
            try:
                storage.delete_file(bucket, storage_path)
                client.supabase.table('files').delete().eq('id', file_id).execute()
            except:
                # NOTE(review): bare except also swallows BaseException;
                # deliberate best-effort cleanup, but consider `except Exception`
                pass
            raise HTTPException(status_code=500, detail=f"Queue failed: {str(e)}")

    async def process_queued_files(self, service_name: str = "document_processor"):
        """
        Background service to process queued files.
        This runs continuously as a background task.
        """
        logger.info(f"🚀 Started queue processor for {service_name}")
        while True:
            try:
                # Get next file from queue
                queued_file = self.memory_queue.dequeue_next_file(service_name)
                if not queued_file:
                    # No files ready for processing
                    await asyncio.sleep(5)
                    continue
                # Update file status
                await self._update_processing_status(queued_file.file_id, 'processing')
                # Process the file
                try:
                    await self._process_file(queued_file, service_name)
                    await self._update_processing_status(queued_file.file_id, 'completed')
                except Exception as e:
                    logger.error(f"Failed to process file {queued_file.file_id}: {e}")
                    await self._update_processing_status(queued_file.file_id, 'failed', str(e))
                finally:
                    # Always free memory reserved by the queue for this file
                    self.memory_queue.complete_processing(
                        service_name,
                        queued_file.file_id,
                        queued_file.memory_estimate_mb
                    )
            except Exception as e:
                logger.error(f"Queue processor error: {e}")
                await asyncio.sleep(10)  # Back off on errors

    async def _process_file(self, queued_file: QueuedFile, service_name: str):
        """Process a single file from the queue.

        Kicks off Phase 1 of the pipeline and records the task ids; raises on
        a missing file record so the caller marks the file 'failed'.
        """
        logger.info(f"🔄 Processing file {queued_file.file_id} in {service_name}")
        # Import here to avoid circular imports
        from modules.pipeline_controller import get_pipeline_controller
        client = SupabaseServiceRoleClient()
        controller = get_pipeline_controller()
        # Get file record
        file_result = client.supabase.table('files').select('*').eq('id', queued_file.file_id).single().execute()
        file_row = file_result.data
        if not file_row:
            raise Exception(f"File record not found: {queued_file.file_id}")
        # Update status to processing
        client.supabase.table('files').update({
            'status': 'processing'
        }).eq('id', queued_file.file_id).execute()
        # Convert to PDF if needed (this is where the bottleneck was before)
        processing_path = await self._handle_pdf_conversion(file_row)
        # Enqueue Phase 1 tasks
        phase1_tasks = controller.enqueue_phase1_tasks(
            file_id=queued_file.file_id,
            file_row={**file_row, 'path': processing_path},
            processing_path=processing_path,
            processing_mime=file_row['mime_type']
        )
        # Update database with task IDs
        client.supabase.table('files').update({
            'status': 'phase1_processing',
            'extra': {
                # NOTE(review): assumes 'extra' is a dict when present;
                # a NULL column value would make `.get('extra', {})` return None
                **file_row.get('extra', {}),
                'phase1_tasks': phase1_tasks,
                'processing_started_at': time.time()
            }
        }).eq('id', queued_file.file_id).execute()
        logger.info(f"✅ File {queued_file.file_id} processing initiated")

    async def _handle_pdf_conversion(self, file_row: Dict[str, Any]) -> str:
        """Handle PDF conversion asynchronously.

        Currently a stub: PDFs pass through; everything else returns its
        original path and conversion is deferred to the pipeline.
        """
        if file_row['mime_type'] == 'application/pdf':
            return file_row['path']
        # TODO: Implement async PDF conversion
        # For now, return original path and handle conversion in pipeline
        logger.info(f"PDF conversion queued for file {file_row['id']}")
        return file_row['path']

    async def _update_processing_status(self, file_id: str, status: str, error: Optional[str] = None):
        """Update file processing status in both Redis (live) and the DB."""
        status_data = {
            'file_id': file_id,
            'status': status,
            'timestamp': time.time(),
            'error': error
        }
        # Store in Redis for real-time updates
        status_key = f"{self.processing_status_key}:{file_id}"
        self.redis_client.setex(status_key, 86400, json.dumps(status_data))  # 24h expiry
        # Update database
        client = SupabaseServiceRoleClient()
        client.supabase.table('files').update({
            'status': status,
            'error_message': error
        }).eq('id', file_id).execute()
        logger.info(f"📊 Status update for {file_id}: {status}")

    def get_processing_status(self, file_id: str) -> Dict[str, Any]:
        """Get current processing status for a file.

        Prefers the Redis cache; falls back to the DB row; returns a
        'not_found' status dict when neither knows the file.
        """
        status_key = f"{self.processing_status_key}:{file_id}"
        status_json = self.redis_client.get(status_key)
        if status_json:
            return json.loads(status_json)
        # Fallback to database
        client = SupabaseServiceRoleClient()
        result = client.supabase.table('files').select('status, error_message, extra').eq('id', file_id).single().execute()
        if result.data:
            return {
                'file_id': file_id,
                'status': result.data['status'],
                'error': result.data.get('error_message'),
                'extra': result.data.get('extra', {})
            }
        return {'file_id': file_id, 'status': 'not_found'}

    def _estimate_processing_time(self, file_size: int, mime_type: str) -> Dict[str, Any]:
        """Estimate processing time for a file based on size and mime prefix."""
        # Base time estimates (in seconds)
        base_times = {
            'application/pdf': 60,  # 1 minute per MB
            'application/msword': 120,  # 2 minutes per MB
            'image/': 30  # 30 seconds per MB
        }
        # Find matching mime type (first prefix match wins)
        time_per_mb = 60  # default
        for mime_prefix, time_val in base_times.items():
            if mime_type.startswith(mime_prefix):
                time_per_mb = time_val
                break
        file_size_mb = file_size / (1024 * 1024)
        estimated_seconds = int(file_size_mb * time_per_mb)
        return {
            'estimated_seconds': estimated_seconds,
            'estimated_minutes': estimated_seconds / 60,
            # Fixed 20/30/50 split across the three phases
            'phases': {
                'pdf_conversion': estimated_seconds * 0.2,
                'metadata_extraction': estimated_seconds * 0.3,
                'docling_processing': estimated_seconds * 0.5
            }
        }

    def _get_recommendations(self, queue_info: Dict[str, Any]) -> List[str]:
        """Get recommendations for user when upload is rejected."""
        recommendations = []
        if queue_info.get('reason') == 'file_too_large':
            recommendations.append("Try compressing your file or splitting it into smaller parts")
        if queue_info.get('utilization', 0) > 0.9:
            recommendations.append("System is currently overloaded. Try uploading during off-peak hours")
            recommendations.append("Consider uploading smaller files first")
        if queue_info.get('user_current', 0) > 0:
            recommendations.append("Wait for your current uploads to complete before uploading more")
        if not recommendations:
            recommendations.append("Please try again in a few minutes")
        return recommendations
# Convenience functions
def get_upload_handler(environment: str = "dev") -> EnhancedUploadHandler:
    """Build an EnhancedUploadHandler for *environment*.

    A fresh instance is constructed on every call (no caching here).
    """
    handler = EnhancedUploadHandler(environment)
    return handler
import json

View File

@ -1,997 +0,0 @@
import os
import io
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, BackgroundTasks
from typing import Any, Dict, Optional
import uuid
import re
import requests
import os
import tempfile
from pathlib import Path
from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str
from modules.logger_tool import initialise_logger
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
from modules.database.supabase.utils.storage import StorageAdmin
from modules.document_processor import DocumentProcessor
from modules.queue_system import (
enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
enqueue_document_analysis_task, enqueue_page_images_task,
TaskPriority, get_queue, QueueConnectionError
)
from fastapi.responses import Response
from fastapi import Body
# Module-level singletons shared by every endpoint in this router.
router = APIRouter()
auth = SupabaseBearer()
doc_processor = DocumentProcessor()
# Default storage bucket when none is derived from scope.
DEFAULT_BUCKET = os.getenv('DEFAULT_FILES_BUCKET', 'cc.users')
# Timeout configurations (in seconds)
TIKA_TIMEOUT = int(os.getenv('TIKA_TIMEOUT', '300'))  # 5 minutes default
DOCLING_FRONTMATTER_TIMEOUT = int(os.getenv('DOCLING_FRONTMATTER_TIMEOUT', '1800'))  # 30 minutes default
DOCLING_NOOCR_TIMEOUT = int(os.getenv('DOCLING_NOOCR_TIMEOUT', '3600'))  # 1 hour default
# (Legacy feature flags removed - using new three-phase system)
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
def _safe_filename(name: str) -> str:
base = os.path.basename(name or 'file')
return re.sub(r"[^A-Za-z0-9._-]+", "_", base)
def _choose_bucket(scope: str, user_id: str, school_id: Optional[str]) -> str:
scope = (scope or 'teacher').lower()
if scope == 'school' and school_id:
return f"cc.institutes.{school_id}.private"
# teacher / student fall back to users bucket for now
return 'cc.users'
@router.post("/files/upload")
async def upload_file(
    cabinet_id: str = Form(...),
    path: str = Form(...),
    scope: str = Form('teacher'),
    school_id: Optional[str] = Form(default=None),
    file: UploadFile = File(...),
    payload: Dict[str, Any] = Depends(auth),
    background_tasks: BackgroundTasks = None
):
    """Upload a file into a cabinet and kick off initial artefact generation.

    Flow: stage a DB row (to obtain a file_id) → upload bytes to storage at
    the final path → point the row at that path → schedule artefact
    generation. Returns the update result rows from the final DB update.
    """
    user_id = payload.get('sub') or payload.get('user_id')
    if not user_id:
        raise HTTPException(status_code=401, detail="Invalid token payload")
    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()
    # Determine target bucket by scope
    bucket = _choose_bucket(scope, user_id, school_id)
    # Stage DB row to get file_id
    staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}"
    name = _safe_filename(path or file.filename)
    file_bytes = await file.read()
    insert_res = client.supabase.table('files').insert({
        'cabinet_id': cabinet_id,
        'name': name,
        'path': staged_path,
        'bucket': bucket,
        'mime_type': file.content_type,
        'uploaded_by': user_id,
        'size_bytes': len(file_bytes),
        'source': 'classroomcopilot-web'
    }).execute()
    if not insert_res.data:
        raise HTTPException(status_code=500, detail="Failed to create file record")
    file_row = insert_res.data[0]
    file_id = file_row['id']
    # Final storage path: bucket/cabinet_id/file_id/file
    final_storage_path = f"{cabinet_id}/{file_id}/{name}"
    try:
        storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True)
    except Exception as e:
        # cleanup staged row so a failed upload leaves no orphan record
        client.supabase.table('files').delete().eq('id', file_id).execute()
        raise HTTPException(status_code=500, detail=f"Storage upload failed: {str(e)}")
    # Update DB path to final
    update_res = client.supabase.table('files').update({
        'path': final_storage_path
    }).eq('id', file_id).execute()
    # Kick off initial artefacts generation in background (Tika + Docling frontmatter + no-OCR)
    # Failures here are logged, not surfaced — the upload itself already succeeded.
    try:
        if background_tasks is not None:
            logger.info(f"Scheduling initial artefacts generation for file_id={file_id}")
            background_tasks.add_task(generate_initial_artefacts, file_id, payload)
        else:
            # NOTE(review): presumably FastAPI injects BackgroundTasks so this
            # branch is rare — confirm the `= None` default is intentional.
            logger.info(f"Running initial artefacts generation synchronously for file_id={file_id}")
            generate_initial_artefacts(file_id, payload)
    except Exception as e:
        logger.error(f"Failed to schedule initial artefacts for file_id={file_id}: {e}")
    return update_res.data
@router.get("/files")
def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return every file record belonging to the given cabinet."""
    db = SupabaseServiceRoleClient()
    query = db.supabase.table('files').select('*').eq('cabinet_id', cabinet_id)
    return query.execute().data
@router.post("/files/{file_id}/move")
def move_file(file_id: str, body: Dict[str, Any], payload: Dict[str, Any] = Depends(auth)):
    """Re-home a file record: change its cabinet and/or logical path.

    Only 'cabinet_id' and 'path' keys in the body are honoured; responds
    400 when neither is supplied.
    """
    db = SupabaseServiceRoleClient()
    updates = {field: body[field] for field in ('cabinet_id', 'path') if field in body}
    if not updates:
        raise HTTPException(status_code=400, detail="No changes provided")
    outcome = db.supabase.table('files').update(updates).eq('id', file_id).execute()
    return outcome.data
@router.delete("/files/{file_id}")
def delete_file(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Delete a file's database record by id.

    Only the DB row is removed here; no storage call is made in this handler.
    """
    db = SupabaseServiceRoleClient()
    deletion = db.supabase.table('files').delete().eq('id', file_id)
    return deletion.execute().data
@router.get("/files/{file_id}/artefacts")
def list_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return every artefact row for a file, newest first."""
    db = SupabaseServiceRoleClient()
    query = db.supabase.table('document_artefacts').select('*').eq('file_id', file_id)
    return query.order('created_at', desc=True).execute().data
@router.get("/files/{file_id}/viewer-artefacts")
def list_viewer_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """
    Get artefacts organized for UI viewer display, including frontmatter JSON,
    processing bundles, and analysis data with proper display metadata.

    Returns a dict with 'categories' (document_analysis / processing_bundles /
    raw_data), each sorted by ui_order then created_at, plus a total count.
    """
    client = SupabaseServiceRoleClient()
    # Get all artefacts for the file
    res = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).order('created_at', desc=True).execute()
    all_artefacts = res.data or []
    # Organize artefacts by category for UI display
    viewer_artefacts = {
        'document_analysis': [],
        'processing_bundles': [],
        'raw_data': []
    }
    for artefact in all_artefacts:
        artefact_type = artefact.get('type', '')
        extra = artefact.get('extra', {})
        # Enhanced artefact info for UI display — defaults pulled from the
        # row's 'extra' payload, then overridden per type below.
        artefact_info = {
            'id': artefact['id'],
            'type': artefact_type,
            'display_name': extra.get('display_name'),
            'bundle_label': extra.get('bundle_label'),
            'section_title': extra.get('section_title'),
            'page_range': extra.get('page_range'),
            'page_count': extra.get('page_count'),
            'pipeline': extra.get('pipeline'),
            'processing_mode': extra.get('processing_mode'),
            'ui_order': extra.get('ui_order', 999),
            'description': extra.get('description'),
            'viewer_type': extra.get('viewer_type', 'json'),
            'created_at': artefact['created_at'],
            'status': artefact.get('status', 'unknown')
        }
        # Categorize artefacts for UI organization
        if artefact_type == 'docling_frontmatter_json':
            artefact_info.update({
                'display_name': artefact_info['display_name'] or 'Document Frontmatter',
                'bundle_label': artefact_info['bundle_label'] or 'Frontmatter Analysis',
                'description': artefact_info['description'] or 'OCR analysis of document structure and metadata',
                'ui_order': 1,
                'viewer_type': 'json'
            })
            viewer_artefacts['document_analysis'].append(artefact_info)
        elif artefact_type == 'split_map_json':
            artefact_info.update({
                'display_name': 'Document Structure Map',
                'bundle_label': 'Split Map',
                'description': 'Document section boundaries and organization structure',
                'ui_order': 2,
                'viewer_type': 'json'
            })
            viewer_artefacts['document_analysis'].append(artefact_info)
        elif artefact_type == 'tika_json':
            artefact_info.update({
                'display_name': 'Document Metadata',
                'bundle_label': 'Tika Analysis',
                'description': 'Raw document metadata and properties extracted by Apache Tika',
                'ui_order': 3,
                'viewer_type': 'json'
            })
            viewer_artefacts['raw_data'].append(artefact_info)
        elif artefact_type in ['canonical_docling_json', 'docling_bundle_split', 'docling_bundle', 'docling_standard', 'docling_bundle_split_pages']:
            # Processing bundles (OCR, No-OCR, VLM) - use original_pipeline for proper differentiation
            pipeline_name = extra.get('original_pipeline', extra.get('pipeline', 'Unknown'))
            bundle_label = artefact_info['bundle_label'] or f"{pipeline_name.upper().replace('_', '-')} Bundle"
            display_name = artefact_info['display_name'] or f"{pipeline_name.upper().replace('_', '-')} Processing Result"
            # Special handling for master manifests
            if artefact_type == 'docling_bundle_split_pages':
                display_name = f"{pipeline_name.upper().replace('_', '-')} Document Pages"
                bundle_label = f"{pipeline_name.upper().replace('_', '-')} Pages Bundle"
                artefact_info.update({
                    'viewer_type': 'bundle_collection',
                    'is_master_manifest': True,
                    'ui_order': 10  # Show master manifests before individual pages
                })
            elif artefact_type == 'docling_standard':
                # Individual page bundles - lower UI priority
                artefact_info.update({
                    'viewer_type': 'page_bundle',
                    'is_individual_page': True,
                    'ui_order': extra.get('split_order', 999) + 100  # Show after master manifests
                })
            artefact_info.update({
                'display_name': display_name,
                'bundle_label': bundle_label,
                'description': f"Docling processing result using {pipeline_name.replace('_', '-')} pipeline",
                'pipeline_type': pipeline_name  # Add explicit pipeline type for UI
            })
            viewer_artefacts['processing_bundles'].append(artefact_info)
        elif artefact_type.startswith('docling_') and artefact_type.endswith('_json'):
            # Other docling JSON results
            pipeline_name = artefact_type.replace('docling_', '').replace('_json', '').upper()
            artefact_info.update({
                'display_name': f"{pipeline_name} Analysis",
                'bundle_label': f"{pipeline_name} Result",
                'description': f"Docling {pipeline_name.lower()} processing result",
                'viewer_type': 'json'
            })
            viewer_artefacts['processing_bundles'].append(artefact_info)
        elif artefact_type == 'page_images':
            artefact_info.update({
                'display_name': 'Page Images',
                'bundle_label': 'Visual Pages',
                'description': 'Generated page images for document visualization',
                'viewer_type': 'images'
            })
            viewer_artefacts['raw_data'].append(artefact_info)
        # NOTE(review): artefact types matching none of the branches above are
        # silently dropped from the response — confirm that is intended.
    # Sort each category by ui_order (then created_at as tiebreaker)
    for category in viewer_artefacts.values():
        category.sort(key=lambda x: (x['ui_order'], x['created_at']))
    return {
        'file_id': file_id,
        'categories': viewer_artefacts,
        'total_artefacts': len(all_artefacts)
    }
@router.post("/files/{file_id}/artefacts/initial")
def generate_initial_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
"""
Generate initial artefacts using the new three-phase pipeline architecture.
Phase 1: Document Structure Discovery & Analysis
- Tika metadata extraction
- Page images generation
- Document structure analysis (LLM-enhanced)
- Split map generation
Phase 2: Triggered automatically after Phase 1 completion
"""
logger.info(f"Three-phase pipeline: Starting Phase 1 for file_id={file_id}")
from modules.pipeline_controller import get_pipeline_controller
client = SupabaseServiceRoleClient()
storage = StorageAdmin()
controller = get_pipeline_controller()
# Load file row
fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
file_row = fr.data
if not file_row:
raise HTTPException(status_code=404, detail="File not found")
bucket = file_row['bucket']
storage_path = file_row['path']
cabinet_id = file_row['cabinet_id']
mime = file_row.get('mime_type') or 'application/octet-stream'
filename = file_row.get('name', 'file')
# Step 1: Convert to PDF if not already a PDF (synchronous for now)
processing_path = storage_path
processing_mime = mime
if mime != 'application/pdf':
logger.info(f"Converting non-PDF file to PDF: file_id={file_id} mime={mime}")
try:
file_bytes = storage.download_file(bucket, storage_path)
with tempfile.TemporaryDirectory() as temp_dir:
# Save original file to temp location
temp_input = Path(temp_dir) / filename
with open(temp_input, 'wb') as f:
f.write(file_bytes)
# Convert to PDF
pdf_bytes = doc_processor.convert_to_pdf(temp_input)
# Store PDF as artefact
pdf_artefact_id = str(uuid.uuid4())
pdf_rel_path = f"{cabinet_id}/{file_id}/{pdf_artefact_id}/document.pdf"
storage.upload_file(bucket, pdf_rel_path, pdf_bytes, 'application/pdf', upsert=True)
pdf_ar = client.supabase.table('document_artefacts').insert({
'file_id': file_id,
'type': 'document_pdf',
'rel_path': pdf_rel_path,
'extra': {'converted_from': mime, 'original_filename': filename},
'status': 'completed'
}).execute()
# Use converted PDF for subsequent processing
processing_path = pdf_rel_path
processing_mime = 'application/pdf'
logger.info(f"PDF conversion: completed file_id={file_id} rel_path={pdf_rel_path}")
except Exception as e:
logger.error(f"PDF conversion: error processing file_id={file_id}: {e}")
# Continue with original file if conversion fails
else:
logger.info(f"File is already PDF, skipping conversion: file_id={file_id}")
# Step 2: Enqueue Phase 1 tasks using the new pipeline controller
user_id = payload.get('sub') or payload.get('user_id')
priority = TaskPriority.HIGH if user_id else TaskPriority.NORMAL
try:
# Update file row with processing path
updated_file_row = {**file_row, 'path': processing_path, 'mime_type': processing_mime}
# Enqueue Phase 1 tasks
phase1_tasks = controller.enqueue_phase1_tasks(
file_id=file_id,
file_row=updated_file_row,
processing_path=processing_path,
processing_mime=processing_mime,
priority=priority
)
total_tasks = sum(len(task_list) for task_list in phase1_tasks.values())
logger.info(f"Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks for file_id={file_id}")
return {
'message': f'Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks. Phase 2 will trigger automatically after completion.',
'phase1_tasks': {k: v for k, v in phase1_tasks.items()},
'file_id': file_id,
'pipeline_mode': 'three_phase',
'bundle_architecture_enabled': True
}
except QueueConnectionError as e:
logger.error(f"Queue system unavailable for file_id={file_id}: {e}")
logger.error("Redis is not running. Please start the API server with './start.sh dev' to auto-start Redis.")
return {
'message': 'File uploaded successfully, but processing tasks could not be queued (Redis unavailable)',
'file_id': file_id,
'queue_status': 'unavailable',
'error': 'Queue system unavailable. Please restart the API server with Redis enabled.'
}
except Exception as e:
logger.error(f"Unexpected error enqueueing Phase 1 tasks for file_id={file_id}: {e}")
return {
'message': 'File uploaded successfully, but processing tasks failed to queue',
'file_id': file_id,
'queue_status': 'failed',
'error': str(e)
}
@router.get("/files/{file_id}/page-images/manifest")
def get_page_images_manifest(file_id: str, payload: Dict[str, Any] = Depends(auth)):
"""Return the page_images manifest JSON for a file via service-role access."""
client = SupabaseServiceRoleClient()
storage = StorageAdmin()
# Find file row to get bucket
fr = client.supabase.table('files').select('id,bucket,cabinet_id').eq('id', file_id).single().execute()
file_row = fr.data or {}
if not file_row:
raise HTTPException(status_code=404, detail="File not found")
bucket = file_row['bucket']
cabinet_id = file_row['cabinet_id']
# Find page_images artefact
arts = client.supabase.table('document_artefacts') \
.select('id,type,rel_path,extra') \
.eq('file_id', file_id).eq('type', 'page_images') \
.order('created_at', desc=True).limit(1).execute().data or []
if not arts:
raise HTTPException(status_code=404, detail="page_images artefact not found")
art = arts[0]
# Manifest path
manifest_rel_path = (art.get('extra') or {}).get('manifest') or f"{art['rel_path'].rstrip('/')}/page_images.json"
try:
raw = storage.download_file(bucket, manifest_rel_path)
import json as _json
manifest = _json.loads(raw.decode('utf-8'))
# Ensure bucket and base prefix are present for the UI
manifest.setdefault('bucket', bucket)
manifest.setdefault('base_dir', art['rel_path'])
return manifest
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}")
def json_dumps(obj: Any) -> str:
    """Best-effort JSON serialization; returns "{}" when *obj* cannot be serialized."""
    try:
        import json
        serialized = json.dumps(obj, ensure_ascii=False)
    except Exception:
        # Any failure (unserializable object, etc.) degrades to an empty object.
        return "{}"
    else:
        return serialized
@router.get("/files/{file_id}/artefacts/{artefact_id}/json")
def get_artefact_json(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
"""Return the JSON content of a document artefact using service-role storage access."""
client = SupabaseServiceRoleClient()
storage = StorageAdmin()
# Look up artefact to get rel_path and validate it belongs to file
ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path').eq('id', artefact_id).single().execute()
artefact = ar.data
if not artefact:
raise HTTPException(status_code=404, detail="Artefact not found")
if artefact.get('file_id') != file_id:
raise HTTPException(status_code=400, detail="Artefact does not belong to file")
# Look up file to get bucket
fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute()
file_row = fr.data
if not file_row:
raise HTTPException(status_code=404, detail="File not found")
bucket = file_row['bucket']
rel_path = artefact['rel_path']
try:
raw = storage.download_file(bucket, rel_path)
import json as _json
data = _json.loads(raw.decode('utf-8'))
return data
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to load artefact JSON: {str(e)}")
@router.get("/files/{file_id}/artefacts/{artefact_id}/vlm-section-manifest")
def get_vlm_section_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
"""Return the VLM section page bundle manifest JSON for a VLM section bundle artefact."""
client = SupabaseServiceRoleClient()
storage = StorageAdmin()
ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path,type,extra').eq('id', artefact_id).single().execute().data
if not ar:
raise HTTPException(status_code=404, detail="Artefact not found")
if ar.get('file_id') != file_id:
raise HTTPException(status_code=400, detail="Artefact does not belong to file")
if ar.get('type') != 'vlm_section_page_bundle':
raise HTTPException(status_code=400, detail="Artefact is not a VLM section page bundle")
fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
if not fr:
raise HTTPException(status_code=404, detail="File not found")
bucket = fr['bucket']
# The rel_path directly points to the manifest JSON file
manifest_rel_path = ar['rel_path']
try:
raw = storage.download_file(bucket, manifest_rel_path)
import json as _json
data = _json.loads(raw.decode('utf-8'))
# ensure bucket present for client use
data.setdefault('bucket', bucket)
return data
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to load VLM section manifest: {e}")
@router.post("/files/{file_id}/artefacts/outline")
def enqueue_outline_structure(file_id: str, payload: Dict[str, Any] = Depends(auth)):
"""
Manually enqueue the fast document outline (headings-only) analysis for an existing file.
Returns the queued task id.
"""
client = SupabaseServiceRoleClient()
fr = client.supabase.table('files').select('id,bucket,cabinet_id,path,mime_type').eq('id', file_id).single().execute()
file_row = fr.data
if not file_row:
raise HTTPException(status_code=404, detail="File not found")
bucket = file_row['bucket']
storage_path = file_row['path']
cabinet_id = file_row['cabinet_id']
mime = file_row.get('mime_type') or 'application/pdf'
# Prefer converted PDF artefact if available
arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None)
processing_path = pdf_art['rel_path'] if pdf_art else storage_path
try:
task_id = enqueue_docling_task(
file_id=file_id,
task_type='document_structure_analysis',
payload={
'bucket': bucket,
'file_path': processing_path,
'cabinet_id': cabinet_id,
'mime_type': mime,
'config': {
'target_type': 'inbody',
'to_formats': 'json',
'do_ocr': False,
'force_ocr': False
}
},
priority=TaskPriority.NORMAL,
timeout=300
)
return { 'message': 'outline task enqueued', 'task_id': task_id, 'file_id': file_id }
except QueueConnectionError as e:
raise HTTPException(status_code=503, detail=f"Queue unavailable: {e}")
except Exception as e:
raise HTTPException(status_code=500, detail=f"Failed to enqueue outline task: {e}")
@router.get("/files/proxy")
def proxy_storage_file(bucket: str, path: str, payload: Dict[str, Any] = Depends(auth)):
"""Proxy a storage file (service-role), useful for private image access in the UI."""
storage = StorageAdmin()
try:
data = storage.download_file(bucket, path)
media = 'application/octet-stream'
lp = path.lower()
if lp.endswith('.png'):
media = 'image/png'
elif lp.endswith('.webp'):
media = 'image/webp'
elif lp.endswith('.jpg') or lp.endswith('.jpeg'):
media = 'image/jpeg'
elif lp.endswith('.json'):
media = 'application/json'
return Response(content=data, media_type=media)
except Exception as e:
raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}")
# Signed proxy for iframe/img tags without Authorization header
@router.get("/files/proxy_signed")
def proxy_storage_file_signed(bucket: str, path: str, token: str):
    """Proxy using a signed bearer token passed as query param 'token'.

    Validates the token first (403 on failure), then streams the object with a
    content type sniffed from the file extension (404 if storage access fails).
    """
    try:
        payload = verify_supabase_jwt_str(token)
    except Exception as e:
        raise HTTPException(status_code=403, detail=f"Invalid token: {e}")
    # Checked OUTSIDE the try: previously this HTTPException was caught by the
    # handler above and re-wrapped, mangling the detail into
    # "Invalid token: 403: Invalid token".
    if not payload:
        raise HTTPException(status_code=403, detail="Invalid token")
    storage = StorageAdmin()
    try:
        data = storage.download_file(bucket, path)
        # Infer media type from the extension; default to a binary stream.
        media = 'application/octet-stream'
        lp = path.lower()
        if lp.endswith('.png'):
            media = 'image/png'
        elif lp.endswith('.webp'):
            media = 'image/webp'
        elif lp.endswith('.jpg') or lp.endswith('.jpeg'):
            media = 'image/jpeg'
        elif lp.endswith('.json'):
            media = 'application/json'
        return Response(content=data, media_type=media)
    except Exception as e:
        raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}")
# -------- Canonical bundle manifest ---------
@router.get("/files/{file_id}/artefacts/{artefact_id}/manifest")
def get_canonical_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return the manifest.json for a canonical_docling_bundle artefact."""
    db = SupabaseServiceRoleClient()
    store = StorageAdmin()
    artefact = db.supabase.table('document_artefacts').select('id,file_id,rel_path,extra').eq('id', artefact_id).single().execute().data
    if not artefact:
        raise HTTPException(status_code=404, detail="Artefact not found")
    if artefact.get('file_id') != file_id:
        raise HTTPException(status_code=400, detail="Artefact does not belong to file")
    # The manifest location is recorded on the artefact's extra payload.
    manifest_path = (artefact.get('extra') or {}).get('manifest')
    if not manifest_path:
        raise HTTPException(status_code=404, detail="Manifest path not recorded on artefact")
    file_record = db.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
    if not file_record:
        raise HTTPException(status_code=404, detail="File not found")
    bucket_name = file_record['bucket']
    try:
        import json as _json
        manifest = _json.loads(store.download_file(bucket_name, manifest_path).decode('utf-8'))
        # ensure bucket present for client use
        manifest.setdefault('bucket', bucket_name)
        return manifest
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}")
# -------- Canonical Docling generation ---------
def _load_split_map(client: SupabaseServiceRoleClient, storage: StorageAdmin, bucket: str, file_id: str) -> Optional[Dict[str, Any]]:
    """Load the newest split_map_json artefact for *file_id*, or None if absent/unreadable."""
    try:
        rows = (client.supabase.table('document_artefacts')
                .select('id,type,rel_path')
                .eq('file_id', file_id).eq('type', 'split_map_json')
                .order('created_at', desc=True).limit(1).execute().data) or []
        if not rows:
            return None
        import json as _json
        blob = storage.download_file(bucket, rows[0]['rel_path'])
        return _json.loads(blob.decode('utf-8'))
    except Exception:
        # Best-effort: a missing or corrupt split map simply disables splitting.
        return None
@router.post("/files/{file_id}/artefacts/canonical-docling")
def enqueue_canonical_docling(
file_id: str,
body: Dict[str, Any] = Body(default={}),
payload: Dict[str, Any] = Depends(auth)
):
"""Enqueue generation of canonical Docling JSON(s) for a file.
If a split_map is available and the document is large, this will enqueue
multiple Docling jobs using page ranges per section. Otherwise a single
job is created for the whole document.
"""
client = SupabaseServiceRoleClient()
storage = StorageAdmin()
fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
file_row = fr.data
if not file_row:
raise HTTPException(status_code=404, detail="File not found")
bucket = file_row['bucket']
cabinet_id = file_row['cabinet_id']
mime = file_row.get('mime_type') or 'application/pdf'
storage_path = file_row['path']
# Prefer converted PDF if available
try:
arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
a_pdf = next((a for a in arts if a.get('type') == 'document_pdf'), None)
processing_path = a_pdf['rel_path'] if a_pdf else storage_path
processing_mime = 'application/pdf' if a_pdf else mime
except Exception:
processing_path = storage_path
processing_mime = mime
# Determine page_count (prefer Tika; fallback to PDF parser if needed)
page_count = None
try:
arts_pc = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).execute().data or []
a_tika_pc = next((a for a in arts_pc if a.get('type') == 'tika_json'), None)
if a_tika_pc:
raw = storage.download_file(bucket, a_tika_pc['rel_path'])
import json as _json
tj = _json.loads(raw.decode('utf-8'))
for k in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount"):
v = tj.get(k) or tj.get(k.lower())
if v is not None:
page_count = int(v)
break
except Exception as e:
logger.debug(f"[canonical-docling] Tika page_count read failed: {e}")
pass
# Fallback: compute page_count from PDF if Tika did not provide it
if page_count is None:
try:
pdf_bytes = storage.download_file(bucket, processing_path)
try:
import fitz # PyMuPDF
doc = fitz.open(stream=pdf_bytes, filetype='pdf')
page_count = int(doc.page_count)
doc.close()
logger.info(f"[canonical-docling] page_count via PyMuPDF: {page_count}")
except Exception:
try:
from PyPDF2 import PdfReader
reader = PdfReader(io.BytesIO(pdf_bytes))
page_count = int(len(reader.pages))
logger.info(f"[canonical-docling] page_count via PyPDF2: {page_count}")
except Exception:
page_count = None
except Exception:
page_count = None
else:
logger.info(f"[canonical-docling] page_count via Tika: {page_count}")
# Optional custom range from caller
custom_range = body.get('custom_range')
custom_label = body.get('custom_label') or ''
selected_section_id = body.get('selected_section_id')
selected_section_title = body.get('selected_section_title')
# Load split map if requested and document is large enough
use_split_requested = bool(body.get('use_split_map', True))
split_threshold = int(body.get('threshold') or os.getenv('DOCLING_SPLIT_THRESHOLD', '50'))
ranges = [] # list of (start,end)
split_map = None
sections = [] # list of dicts: {start,end,title}
logger.info(f"[canonical-docling] use_split_map={use_split_requested} threshold={split_threshold} page_count={page_count}")
# If custom range provided, honor it and bypass split map
if isinstance(custom_range, list) and len(custom_range) >= 2:
try:
cs = int(custom_range[0]); ce = int(custom_range[1])
if page_count is not None:
cs = max(1, min(cs, page_count))
ce = max(cs, min(ce, page_count))
ranges = [(cs, ce)]
sections = [{'start': cs, 'end': ce, 'title': custom_label or 'Custom range'}]
use_split_requested = False
logger.info(f"[canonical-docling] using custom_range start={cs} end={ce} label='{custom_label}'")
except Exception as _e:
logger.warning(f"[canonical-docling] invalid custom_range; falling back. err={_e}")
if not ranges and use_split_requested and (page_count is None or page_count >= split_threshold):
split_map = _load_split_map(client, storage, bucket, file_id)
entries = (split_map or {}).get('entries') if split_map else []
logger.info(f"[canonical-docling] split_map loaded entries={len(entries) if isinstance(entries, list) else 0}")
if split_map and isinstance(entries, list) and len(entries) > 0:
# Normalize and sort entries by start_page to enforce correct order
norm: list[dict] = []
for e in entries:
try:
s = int(e.get('start_page', 1))
t = int(e.get('end_page', s))
if t < s:
t = s
title = e.get('title') or e.get('label') or ''
norm.append({'start': s, 'end': t, 'title': title})
except Exception:
continue
norm.sort(key=lambda x: x['start'])
# Deduplicate identical or overlapping starts by keeping the earliest occurrence
ordered: list[dict] = []
last_end = 0
for e in norm:
s, t = int(e['start']), int(e['end'])
if ordered and s <= last_end:
# Clamp to prevent inversion and maintain order
s = last_end + 1
if s > (page_count or s):
continue
if t < s:
t = s
last_end = max(last_end, t)
ordered.append({'start': s, 'end': t, 'title': e['title']})
for e in ordered:
ranges.append((e['start'], e['end']))
sections.append(e)
# Fallback: if no split_map ranges... we shouldn't be here
if not ranges:
# If document is large, split into fixed windows to protect Docling server
if page_count is not None and page_count >= split_threshold:
chunk = int(os.getenv('DOCLING_FALLBACK_CHUNK_PAGES', '25'))
chunk = max(5, min(100, chunk))
for i in range(1, (page_count or 1) + 1, chunk):
end = min(i + chunk - 1, page_count or i)
ranges.append((i, end))
sections.append({'start': i, 'end': end, 'title': f"Pages {i}-{end}"})
logger.warning(f"[canonical-docling] using fallback chunking ranges={len(ranges)} chunk={chunk}")
else:
ranges = [(1, page_count or 9223372036854775807)]
logger.warning(f"[canonical-docling] using single-range fallback (small doc)")
# Build config
cfg = body.get('config', {})
pipeline = cfg.get('pipeline', 'standard')
config: Dict[str, Any] = {
# target_type is computed in processor based on to_formats unless explicitly provided by user
'to_formats': cfg.get('to_formats', 'json'),
'do_ocr': bool(cfg.get('do_ocr', True)),
'force_ocr': bool(cfg.get('force_ocr', False)),
'image_export_mode': cfg.get('image_export_mode', 'embedded'),
'ocr_engine': cfg.get('ocr_engine', 'easyocr'),
'ocr_lang': cfg.get('ocr_lang', 'en'),
'pdf_backend': cfg.get('pdf_backend', 'dlparse_v4'),
'table_mode': cfg.get('table_mode', 'fast'),
'pipeline': pipeline,
'do_picture_classification': bool(cfg.get('do_picture_classification', False)),
'do_picture_description': bool(cfg.get('do_picture_description', False)),
}
# If user explicitly set target_type, pass it through
if 'target_type' in cfg:
config['target_type'] = cfg['target_type']
# Optional VLM settings (only include API fields if provided as JSON by caller)
if config['do_picture_description']:
pd_api = cfg.get('picture_description_api')
if isinstance(pd_api, (dict, list)):
config['picture_description_api'] = pd_api
elif isinstance(pd_api, str) and pd_api.strip().startswith(('{', '[')):
config['picture_description_api'] = pd_api
if cfg.get('picture_description_prompt'):
config['picture_description_prompt'] = cfg['picture_description_prompt']
if pipeline == 'vlm':
# Provider presets mapping
provider = (cfg.get('vlm_provider') or '').strip().lower()
provider_model = (cfg.get('vlm_provider_model') or '').strip()
provider_base = (cfg.get('vlm_provider_base_url') or '').strip()
if provider in ('ollama', 'openai') and provider_model:
if provider == 'ollama':
base_url = provider_base or os.getenv('OLLAMA_BASE_URL') or os.getenv('VLM_OLLAMA_BASE_URL')
if base_url:
endpoint = f"{base_url.rstrip('/')}/v1/chat/completions"
# Use OpenAI provider schema against Ollama's OpenAI-compatible endpoint
cfg_api = {
'provider': 'openai',
'url': endpoint,
'model': provider_model,
'response_format': 'markdown',
'request_params': {'model': provider_model}
}
logger.info(f"[canonical-docling] VLM provider=ollama mapped to openai-compatible url={endpoint} model={provider_model}")
config['vlm_pipeline_model_api'] = cfg_api
# Also wire picture_description_api if picture description is enabled
if config.get('do_picture_description'):
config['picture_description_api'] = {
'url': endpoint,
'headers': {},
'params': {'model': provider_model}
}
elif provider == 'openai':
base_url = provider_base or os.getenv('OPENAI_BASE_URL') or 'https://api.openai.com/v1'
api_key = os.getenv('OPENAI_API_KEY') or os.getenv('OPENAI_API_KEY_READONLY')
# Do not inline key if not present; server may have default
model_cfg: Dict[str, Any] = {
'provider': 'openai',
'url': f"{base_url.rstrip('/')}/chat/completions",
'model': provider_model,
'response_format': 'markdown',
'request_params': {'model': provider_model}
}
if api_key:
model_cfg['api_key'] = api_key
# Also pass explicit Authorization header for servers that expect it
model_cfg['headers'] = {
'Authorization': f"Bearer {api_key}"
}
logger.info(f"[canonical-docling] VLM provider=openai url={model_cfg['url']} model={provider_model} api_key={'yes' if api_key else 'no'}")
config['vlm_pipeline_model_api'] = model_cfg
# Also wire picture_description_api if picture description is enabled
if config.get('do_picture_description'):
headers = {'Authorization': f"Bearer {api_key}"} if api_key else {}
config['picture_description_api'] = {
'url': f"{base_url.rstrip('/')}/chat/completions",
'headers': headers,
'params': {'model': provider_model}
}
else:
# Pass through explicit API/local JSON if provided by caller
vpa = cfg.get('vlm_pipeline_model_api')
if isinstance(vpa, (dict, list)):
config['vlm_pipeline_model_api'] = vpa
elif isinstance(vpa, str) and vpa.strip().startswith(('{', '[')):
config['vlm_pipeline_model_api'] = vpa
# Enqueue tasks for each range
priority = TaskPriority.HIGH
task_ids = []
multi = len(ranges) > 1
logger.info(f"[canonical-docling] final ranges={len(ranges)} multi={multi} pipeline={pipeline} producer={body.get('producer', 'manual')}")
# Create a group id for split bundles (used for UI grouping)
# Use provided group_id if present (for two-pass auto system), otherwise generate new
group_id = body.get('group_id') or (str(uuid.uuid4()) if multi else None)
if multi and not sections:
# Build sections from ranges if titles were not captured
for (start, end) in ranges:
sections.append({'start': int(start), 'end': int(end), 'title': ''})
idx = 0
for (start, end) in ranges:
# Locate title for this range if available
title = ''
if multi and sections and idx < len(sections):
title = sections[idx].get('title') or ''
idx += 1
cfg_range = dict(config)
# Ensure 1-based inclusive range is passed through
cfg_range['page_range'] = [max(1, int(start)), max(int(start), int(end))]
extra = {
'is_subdoc': multi,
'page_range': [int(start), int(end)],
'label': (title or f"subdoc p{int(start)}-{int(end)}") if multi else 'canonical'
}
# Attach selected section metadata if provided by caller
if selected_section_id:
extra['selected_section_id'] = selected_section_id
if selected_section_title or custom_label:
extra['selected_section_title'] = selected_section_title or custom_label
# For split processing, force split bundle artefact type and add grouping/order metadata
if multi:
extra.update({
# UI grouping metadata
'split_order': idx,
'split_heading': title,
'split_total': len(ranges)
})
if group_id:
extra['group_id'] = group_id
extra['group_pack_type'] = 'docling_standard_auto_split'
else:
# Single-bundle case: allow caller to override type (defaults to canonical bundle)
if 'artefact_type_override' in body and body.get('artefact_type_override'):
extra['artefact_type_override'] = body.get('artefact_type_override')
# Mark producer and selection metadata
extra['producer'] = body.get('producer') or ('auto_split' if (multi and body.get('use_split_map')) else 'manual')
if selected_section_id:
extra['selected_section_id'] = selected_section_id
if selected_section_title or custom_label:
extra['selected_section_title'] = selected_section_title or custom_label
# Enhanced logging for canonical operations
if multi:
logger.info(f"[canonical-docling] enqueue range idx={idx}/{len(ranges)} start={start} end={end} title='{title}' group_id={group_id} producer={extra.get('producer')} pipeline={pipeline}")
else:
logger.info(f"[canonical-docling] enqueue single range start={start} end={end} producer={extra.get('producer')} pipeline={pipeline}")
tid = enqueue_docling_task(
file_id=file_id,
task_type='canonical_docling_subdoc_json' if multi else 'canonical_docling_json',
payload={
'bucket': bucket,
'file_path': processing_path,
'cabinet_id': cabinet_id,
'mime_type': processing_mime,
'config': cfg_range,
'artefact_extra': extra,
# Ensure canonical tasks respect upstream dependencies (e.g., Frontmatter)
'depends_on': body.get('depends_on', []),
# Pass through grouping info if provided by caller (kept for backward-compat)
'group_pack_type': body.get('group_pack_type')
},
priority=priority,
timeout=int(body.get('timeout', DOCLING_NOOCR_TIMEOUT))
)
task_ids.append(tid)
logger.info(f"[canonical-docling] completed enqueue file_id={file_id} tasks={len(task_ids)} ranges={len(ranges)} pipeline={pipeline} producer={body.get('producer','manual')} group_id={group_id if multi else 'single'}")
return {
'message': f'enqueued {len(task_ids)} canonical docling job(s)',
'task_ids': task_ids,
'ranges': ranges,
'used_split_map': bool(split_map),
'group_id': group_id,
'pipeline': pipeline,
'producer': body.get('producer', 'manual')
}

View File

@ -1,411 +0,0 @@
"""
Memory-Aware Queue Management System
====================================
Provides intelligent queue management based on memory usage and file sizes
rather than simple task count limits. Supports multiple users with fair
queuing and capacity management.
Features:
- Memory-based queue limits (not just task count)
- Fair queuing across multiple users
- Upload capacity checking with user feedback
- Graceful degradation under load
- Service-specific memory tracking
"""
import os
import time
import json
import uuid
import logging
from typing import Dict, List, Optional, Any, Tuple
from dataclasses import dataclass, asdict
from enum import Enum
import redis
from .redis_manager import get_redis_manager
import psutil
logger = logging.getLogger(__name__)
class QueueStatus(Enum):
    """Overall queue health states used to gate new uploads."""
    ACCEPTING = "accepting"  # Normal operation
    BUSY = "busy"  # High load, warn users
    OVERLOADED = "overloaded"  # Reject new uploads
    MAINTENANCE = "maintenance"  # Manual override
@dataclass
class MemoryConfig:
    """Memory configuration for queue management."""
    # Hard caps are in MB; thresholds are fractions of max_total_memory_mb.
    max_total_memory_mb: int = 2048  # 2GB total queue memory
    max_user_memory_mb: int = 512  # 512MB per user
    max_file_size_mb: int = 100  # 100MB max file size
    memory_warning_threshold: float = 0.8  # Warn at 80%
    memory_reject_threshold: float = 0.95  # Reject at 95%
@dataclass
class QueuedFile:
    """Represents a file waiting in the queue."""
    file_id: str
    user_id: str
    filename: str
    size_bytes: int
    mime_type: str
    cabinet_id: str
    priority: int = 1
    queued_at: float = 0
    estimated_processing_time: int = 300  # seconds
    memory_estimate_mb: float = 0

    def __post_init__(self):
        # Stamp the enqueue time unless the caller supplied one explicitly.
        if not self.queued_at:
            self.queued_at = time.time()
        # Derive the processing-memory estimate from size and mime type.
        self.memory_estimate_mb = self._estimate_memory_usage()

    def _estimate_memory_usage(self) -> float:
        """Rough heuristic for peak memory (MB) this file needs during processing."""
        size_mb = self.size_bytes / (1024 * 1024)
        # Multipliers reflect the artefacts produced per input type:
        if self.mime_type == 'application/pdf':
            factor = 3.5  # PDF: original + extracted text + images + thumbnails
        elif self.mime_type.startswith('image/'):
            factor = 2.5  # Images: original + resized variants + OCR text
        else:
            factor = 4.0  # Other docs: original + PDF conversion + processing
        return size_mb * factor
class MemoryAwareQueue:
"""Memory-aware queue management system."""
def __init__(self, environment: str = "dev"):
self.redis_manager = get_redis_manager(environment)
self.redis_client = self.redis_manager.client
self.config = self._load_config()
# Redis keys
self.upload_queue_key = "upload_queue"
self.processing_memory_key = "processing_memory"
self.user_quota_key = "user_quotas"
self.system_status_key = "system_status"
logger.info(f"🧠 Memory-aware queue initialized (max: {self.config.max_total_memory_mb}MB)")
def _load_config(self) -> MemoryConfig:
"""Load memory configuration from environment."""
return MemoryConfig(
max_total_memory_mb=int(os.getenv('QUEUE_MAX_MEMORY_MB', '2048')),
max_user_memory_mb=int(os.getenv('QUEUE_MAX_USER_MEMORY_MB', '512')),
max_file_size_mb=int(os.getenv('MAX_FILE_SIZE_MB', '100')),
memory_warning_threshold=float(os.getenv('MEMORY_WARNING_THRESHOLD', '0.8')),
memory_reject_threshold=float(os.getenv('MEMORY_REJECT_THRESHOLD', '0.95'))
)
def check_upload_capacity(self, user_id: str, file_size_bytes: int,
mime_type: str) -> Tuple[bool, str, Dict[str, Any]]:
"""
Check if system can accept a new upload.
Returns:
(can_accept, message, queue_info)
"""
# Create temporary QueuedFile to estimate memory
temp_file = QueuedFile(
file_id="temp",
user_id=user_id,
filename="temp",
size_bytes=file_size_bytes,
mime_type=mime_type,
cabinet_id="temp"
)
file_memory_mb = temp_file.memory_estimate_mb
# Check file size limit
if file_size_bytes > (self.config.max_file_size_mb * 1024 * 1024):
return False, f"File too large (max: {self.config.max_file_size_mb}MB)", {}
# Get current memory usage
current_memory = self._get_current_memory_usage()
user_memory = self._get_user_memory_usage(user_id)
# Check user quota
if user_memory + file_memory_mb > self.config.max_user_memory_mb:
return False, f"User quota exceeded (limit: {self.config.max_user_memory_mb}MB)", {
'user_current': user_memory,
'user_limit': self.config.max_user_memory_mb
}
# Check system capacity
total_after = current_memory + file_memory_mb
max_memory = self.config.max_total_memory_mb
if total_after > (max_memory * self.config.memory_reject_threshold):
queue_info = self._get_queue_info()
return False, "System overloaded. Please try again later.", {
'current_memory': current_memory,
'max_memory': max_memory,
'utilization': current_memory / max_memory,
'queue_position': queue_info['total_queued'] + 1
}
# Calculate wait time estimate
wait_estimate = self._estimate_wait_time(user_id)
status = "ready"
message = "Upload accepted"
if total_after > (max_memory * self.config.memory_warning_threshold):
status = "busy"
message = f"System busy. Estimated wait: {wait_estimate // 60}m {wait_estimate % 60}s"
return True, message, {
'status': status,
'estimated_wait_seconds': wait_estimate,
'memory_usage': {
'current': current_memory,
'after_upload': total_after,
'limit': max_memory,
'utilization': total_after / max_memory
},
'user_quota': {
'used': user_memory,
'after_upload': user_memory + file_memory_mb,
'limit': self.config.max_user_memory_mb
}
}
def enqueue_file(self, file_id: str, user_id: str, filename: str,
size_bytes: int, mime_type: str, cabinet_id: str,
priority: int = 1) -> Dict[str, Any]:
"""
Add file to upload queue.
Returns:
Queue information including position and estimated wait time
"""
queued_file = QueuedFile(
file_id=file_id,
user_id=user_id,
filename=filename,
size_bytes=size_bytes,
mime_type=mime_type,
cabinet_id=cabinet_id,
priority=priority
)
# Serialize and add to Redis queue (priority queue: higher priority = lower score)
score = time.time() - (priority * 1000000) # Priority affects score significantly
self.redis_client.zadd(
self.upload_queue_key,
{json.dumps(asdict(queued_file)): score}
)
# Update user quota tracking
self._update_user_quota(user_id, queued_file.memory_estimate_mb, increment=True)
# Get queue position and wait estimate
position = self._get_queue_position(file_id)
wait_estimate = self._estimate_wait_time(user_id)
logger.info(f"📋 Queued file {file_id} for user {user_id} (pos: {position}, wait: {wait_estimate}s)")
return {
'queued': True,
'file_id': file_id,
'queue_position': position,
'estimated_wait_seconds': wait_estimate,
'memory_estimate_mb': queued_file.memory_estimate_mb
}
def dequeue_next_file(self, service_name: str) -> Optional[QueuedFile]:
"""
Get next file from queue for processing.
Args:
service_name: The service requesting work (for capacity management)
"""
# Check if service has capacity
service_memory = self._get_service_memory_usage(service_name)
service_limit = self._get_service_memory_limit(service_name)
if service_memory >= service_limit:
logger.debug(f"Service {service_name} at capacity ({service_memory}/{service_limit}MB)")
return None
# Get next item from priority queue (lowest score first)
items = self.redis_client.zrange(self.upload_queue_key, 0, 0, withscores=True)
if not items:
return None
file_data_json, score = items[0]
file_data = json.loads(file_data_json)
queued_file = QueuedFile(**file_data)
# Check if this file would exceed service memory limit
if service_memory + queued_file.memory_estimate_mb > service_limit:
# Skip this file for now, try smaller ones later
logger.debug(f"File {queued_file.file_id} too large for {service_name} capacity")
return None
# Remove from queue
self.redis_client.zrem(self.upload_queue_key, file_data_json)
# Update tracking
self._update_user_quota(queued_file.user_id, queued_file.memory_estimate_mb, increment=False)
self._update_service_memory(service_name, queued_file.memory_estimate_mb, increment=True)
logger.info(f"🎯 Dequeued file {queued_file.file_id} for {service_name} processing")
return queued_file
def complete_processing(self, service_name: str, file_id: str, memory_used_mb: float):
"""Mark file processing as complete and free memory."""
self._update_service_memory(service_name, memory_used_mb, increment=False)
logger.info(f"✅ Completed processing {file_id} in {service_name} (freed {memory_used_mb}MB)")
def _get_current_memory_usage(self) -> float:
"""Get current total memory usage across all services."""
services = ['docling', 'tika', 'llm', 'document_analysis']
total = 0
for service in services:
service_key = f"{self.processing_memory_key}:{service}"
memory = float(self.redis_client.get(service_key) or 0)
total += memory
return total
def _get_user_memory_usage(self, user_id: str) -> float:
"""Get current memory usage for a specific user."""
user_key = f"{self.user_quota_key}:{user_id}"
return float(self.redis_client.get(user_key) or 0)
def _get_service_memory_usage(self, service_name: str) -> float:
"""Get current memory usage for a service."""
service_key = f"{self.processing_memory_key}:{service_name}"
return float(self.redis_client.get(service_key) or 0)
def _get_service_memory_limit(self, service_name: str) -> float:
"""Get memory limit for a service."""
# Service-specific memory limits as percentage of total
limits = {
'docling': 0.4, # 40% for Docling (memory-intensive)
'tika': 0.2, # 20% for Tika
'llm': 0.3, # 30% for LLM processing
'document_analysis': 0.1 # 10% for document analysis
}
percentage = limits.get(service_name, 0.1)
return self.config.max_total_memory_mb * percentage
def _update_user_quota(self, user_id: str, memory_mb: float, increment: bool):
"""Update user memory quota tracking."""
user_key = f"{self.user_quota_key}:{user_id}"
if increment:
self.redis_client.incrbyfloat(user_key, memory_mb)
else:
current = float(self.redis_client.get(user_key) or 0)
new_value = max(0, current - memory_mb)
self.redis_client.set(user_key, new_value)
# Set expiration for cleanup
self.redis_client.expire(user_key, 86400) # 24 hours
def _update_service_memory(self, service_name: str, memory_mb: float, increment: bool):
"""Update service memory usage tracking."""
service_key = f"{self.processing_memory_key}:{service_name}"
if increment:
self.redis_client.incrbyfloat(service_key, memory_mb)
else:
current = float(self.redis_client.get(service_key) or 0)
new_value = max(0, current - memory_mb)
self.redis_client.set(service_key, new_value)
# Set expiration for cleanup
self.redis_client.expire(service_key, 3600) # 1 hour
def _get_queue_position(self, file_id: str) -> int:
"""Get position of file in queue."""
items = self.redis_client.zrange(self.upload_queue_key, 0, -1)
for i, item in enumerate(items):
file_data = json.loads(item)
if file_data['file_id'] == file_id:
return i + 1
return 0
def _estimate_wait_time(self, user_id: str) -> int:
"""Estimate wait time for user's next file."""
# Simple estimation based on queue position and average processing time
queue_size = self.redis_client.zcard(self.upload_queue_key)
avg_processing_time = 300 # 5 minutes average
return int(queue_size * avg_processing_time * 0.5) # Assume parallel processing
def _get_queue_info(self) -> Dict[str, Any]:
"""Get comprehensive queue information."""
total_queued = self.redis_client.zcard(self.upload_queue_key)
current_memory = self._get_current_memory_usage()
max_memory = self.config.max_total_memory_mb
return {
'total_queued': total_queued,
'memory_usage': {
'current_mb': current_memory,
'max_mb': max_memory,
'utilization': current_memory / max_memory if max_memory > 0 else 0
},
'status': self._determine_system_status(current_memory, max_memory)
}
def _determine_system_status(self, current_memory: float, max_memory: float) -> str:
"""Determine current system status based on memory usage."""
utilization = current_memory / max_memory if max_memory > 0 else 0
if utilization >= self.config.memory_reject_threshold:
return "overloaded"
elif utilization >= self.config.memory_warning_threshold:
return "busy"
else:
return "ready"
def get_system_status(self) -> Dict[str, Any]:
"""Get comprehensive system status for monitoring."""
queue_info = self._get_queue_info()
# Service-specific info
services = {}
for service_name in ['docling', 'tika', 'llm', 'document_analysis']:
services[service_name] = {
'memory_used_mb': self._get_service_memory_usage(service_name),
'memory_limit_mb': self._get_service_memory_limit(service_name),
'utilization': self._get_service_memory_usage(service_name) / self._get_service_memory_limit(service_name)
}
return {
'status': queue_info['status'],
'queue': queue_info,
'services': services,
'config': asdict(self.config)
}
# Convenience functions
def get_memory_queue(environment: str = "dev") -> MemoryAwareQueue:
    """Factory returning a MemoryAwareQueue bound to *environment* (default "dev")."""
    return MemoryAwareQueue(environment)
def check_upload_capacity(user_id: str, file_size: int, mime_type: str, environment: str = "dev") -> Tuple[bool, str, Dict]:
    """Quick capacity check for an upload without holding a queue reference.

    Thin wrapper: builds a queue for *environment* and delegates to its
    check_upload_capacity method.
    """
    return get_memory_queue(environment).check_upload_capacity(user_id, file_size, mime_type)

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff