diff --git a/.gitignore b/.gitignore
index 554b91b..92f33d4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,11 @@
 __pycache__
 .pytest_cache
 
+.env
+
 .DS_Store
 
 .archive/*
 
 data/logs/*
+*.bak
diff --git a/archive/auto_processing/README.md b/archive/auto_processing/README.md
deleted file mode 100644
index ab42fd5..0000000
--- a/archive/auto_processing/README.md
+++ /dev/null
@@ -1,66 +0,0 @@
-# Auto-Processing Code Archive
-
-This directory contains the complex auto-processing system that was previously used for automatic document processing after file upload.
-
-## Archived Components
-
-### Core Processing Files
-- `files_with_auto_processing.py` - Original files.py router with automatic processing
-- `pipeline_controller.py` - Complex multi-phase pipeline orchestration  
-- `task_processors.py` - Document processing task handlers
-
-### Advanced Queue Management (Created but not deployed)
-- `memory_aware_queue.py` - Memory-based intelligent queue management
-- `enhanced_upload_handler.py` - Advanced upload handler with queuing
-- `enhanced_upload.py` - API endpoints for advanced upload system
-
-## What This System Did
-
-### Automatic Processing Pipeline
-1. **File Upload** → Immediate processing trigger
-2. **PDF Conversion** (synchronous, blocking)
-3. **Phase 1**: Structure discovery (Tika, Page Images, Document Analysis, Split Map)
-4. **Phase 2**: Docling processing (NO_OCR → OCR → VLM pipelines)
-5. **Complex Dependencies**: Phase coordination, task sequencing
-6. **Redis Queue Management**: Service limits, rate limits, dependency tracking
-
-### Features
-- Multi-phase processing pipelines
-- Complex task dependency management
-- Memory-aware queue limits
-- Multi-user capacity management
-- Real-time processing status
-- WebSocket status updates
-- Service-specific resource limits
-- Task recovery on restart
-
-## Why Archived
-
-The system was overly complex for the current needs:
-- **Complexity**: Multi-phase pipelines with complex dependencies
-- **Blocking Operations**: Synchronous PDF conversion causing timeouts
-- **Resource Management**: Over-engineered for single-user scenarios
-- **User Experience**: Users had to wait for processing to complete
-
-## New Simplified Approach
-
-The new system focuses on:
-- **Simple Upload**: Just store files and create database records
-- **No Auto-Processing**: Users manually trigger processing when needed
-- **Directory Support**: Upload entire folders with manifest tracking
-- **Immediate Response**: Users get instant confirmation without waiting
-
-## If You Need to Restore
-
-To restore the auto-processing functionality:
-1. Copy `files_with_auto_processing.py` back to `routers/database/files/files.py`
-2. Ensure `pipeline_controller.py` and `task_processors.py` are in `modules/`
-3. Update imports and dependencies
-4. Re-enable background processing in upload handlers
-
-## Migration Notes
-
-The database schema and Redis structure remain compatible. The new simplified system can coexist with the archived processing logic if needed.
-
-Date Archived: $(date)
-Reason: Simplification for directory upload implementation
diff --git a/archive/auto_processing/enhanced_upload.py b/archive/auto_processing/enhanced_upload.py
deleted file mode 100644
index 3cd60ef..0000000
--- a/archive/auto_processing/enhanced_upload.py
+++ /dev/null
@@ -1,142 +0,0 @@
-"""
-Enhanced Upload Router with Memory-Aware Queuing
-===============================================
-
-Provides intelligent upload endpoints with capacity checking,
-queue management, and real-time status updates.
-
-Endpoints:
-- POST /upload/check-capacity    - Pre-check if upload is possible
-- POST /upload/with-queue        - Upload with intelligent queuing
-- GET  /upload/status/{file_id}  - Get processing status
-- GET  /upload/queue-status      - Get overall queue status
-- WebSocket /ws/upload-status    - Real-time status updates
-"""
-
-import os
-import uuid
-import json
-import logging
-from typing import Dict, List, Optional, Any
-from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, WebSocket, WebSocketDisconnect
-from fastapi.responses import JSONResponse
-
-from modules.auth.supabase_bearer import SupabaseBearer
-from modules.enhanced_upload_handler import get_upload_handler
-from modules.memory_aware_queue import get_memory_queue
-from modules.logger_tool import initialise_logger
-
-router = APIRouter()
-auth = SupabaseBearer()
-
-logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
-
-# WebSocket connection manager for real-time updates
-class ConnectionManager:
-    def __init__(self):
-        self.active_connections: Dict[str, List[WebSocket]] = {}
-    
-    async def connect(self, websocket: WebSocket, file_id: str):
-        await websocket.accept()
-        if file_id not in self.active_connections:
-            self.active_connections[file_id] = []
-        self.active_connections[file_id].append(websocket)
-    
-    def disconnect(self, websocket: WebSocket, file_id: str):
-        if file_id in self.active_connections:
-            self.active_connections[file_id].remove(websocket)
-            if not self.active_connections[file_id]:
-                del self.active_connections[file_id]
-    
-    async def broadcast_to_file(self, file_id: str, message: dict):
-        if file_id in self.active_connections:
-            for connection in self.active_connections[file_id].copy():
-                try:
-                    await connection.send_json(message)
-                except:
-                    self.active_connections[file_id].remove(connection)
-
-manager = ConnectionManager()
-
-@router.post("/upload/check-capacity")
-async def check_upload_capacity(
-    file_size: int = Form(...),
-    mime_type: str = Form(...),
-    payload: Dict[str, Any] = Depends(auth)
-):
-    """
-    Check if user can upload a file of given size and type.
-    
-    Returns capacity information and recommendations.
-    """
-    try:
-        user_id = payload.get('sub') or payload.get('user_id', 'anonymous')
-        
-        # Determine environment
-        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
-        upload_handler = get_upload_handler(environment)
-        
-        eligible, message, details = upload_handler.check_upload_eligibility(
-            user_id, file_size, mime_type
-        )
-        
-        response = {
-            'eligible': eligible,
-            'message': message,
-            'details': details,
-            'timestamp': time.time()
-        }
-        
-        status_code = 200 if eligible else 429  # Too Many Requests if not eligible
-        
-        logger.info(f"📋 Capacity check for user {user_id}: {eligible} - {message}")
-        
-        return JSONResponse(content=response, status_code=status_code)
-        
-    except Exception as e:
-        logger.error(f"Capacity check error: {e}")
-        raise HTTPException(status_code=500, detail=str(e))
-
-@router.post("/upload/with-queue")
-async def upload_with_queue(
-    cabinet_id: str = Form(...),
-    path: str = Form(...),
-    scope: str = Form(...),
-    priority: int = Form(1),
-    file: UploadFile = File(...),
-    payload: Dict[str, Any] = Depends(auth)
-):
-    """
-    Upload file with intelligent queuing and capacity management.
-    
-    Returns queue information and processing status.
-    """
-    try:
-        user_id = payload.get('sub') or payload.get('user_id', 'anonymous')
-        
-        # Read file content
-        file_bytes = await file.read()
-        file_size = len(file_bytes)
-        mime_type = file.content_type or 'application/octet-stream'
-        filename = file.filename or path
-        
-        logger.info(f"📤 Upload request: {filename} ({file_size} bytes) for user {user_id}")
-        
-        # Determine environment
-        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'
-        upload_handler = get_upload_handler(environment)
-        
-        # Check if upload queuing is enabled
-        if os.getenv('UPLOAD_QUEUE_ENABLED', 'true').lower() == 'true':
-            # Use new queue-based upload
-            file_id = str(uuid.uuid4())
-            
-            result = await upload_handler.handle_upload_with_queue(
-                file_id=file_id,
-                user_id=user_id,
-                filename=filename,
-                file_bytes=file_bytes,
-                mime_type=mime_type,
-                cabinet_id=cabinet_id,
-                priority=priority
-            )\n            \n            return result\n        \n        else:\n            # Fall back to immediate processing (legacy mode)\n            logger.warning(\"Using legacy immediate processing mode\")\n            # TODO: Call original upload_file function\n            raise HTTPException(status_code=501, detail=\"Legacy mode not implemented in this endpoint\")\n        \n    except HTTPException:\n        raise\n    except Exception as e:\n        logger.error(f\"Upload error: {e}\")\n        raise HTTPException(status_code=500, detail=str(e))\n\n@router.get(\"/upload/status/{file_id}\")\nasync def get_upload_status(\n    file_id: str,\n    payload: Dict[str, Any] = Depends(auth)\n):\n    \"\"\"\n    Get current processing status for an uploaded file.\n    \"\"\"\n    try:\n        # Determine environment\n        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'\n        upload_handler = get_upload_handler(environment)\n        \n        status = upload_handler.get_processing_status(file_id)\n        \n        if status.get('status') == 'not_found':\n            raise HTTPException(status_code=404, detail=\"File not found\")\n        \n        return status\n        \n    except HTTPException:\n        raise\n    except Exception as e:\n        logger.error(f\"Status check error for {file_id}: {e}\")\n        raise HTTPException(status_code=500, detail=str(e))\n\n@router.get(\"/upload/queue-status\")\nasync def get_queue_status(\n    payload: Dict[str, Any] = Depends(auth)\n):\n    \"\"\"\n    Get overall queue status and system capacity information.\n    \"\"\"\n    try:\n        # Determine environment\n        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'\n        memory_queue = get_memory_queue(environment)\n        \n        system_status = memory_queue.get_system_status()\n        \n        return {\n            'system_status': system_status,\n            'timestamp': time.time(),\n            'environment': environment\n        }\n        \n    except Exception as e:\n        logger.error(f\"Queue status error: {e}\")\n        raise HTTPException(status_code=500, detail=str(e))\n\n@router.websocket(\"/ws/upload-status/{file_id}\")\nasync def websocket_upload_status(websocket: WebSocket, file_id: str):\n    \"\"\"\n    WebSocket endpoint for real-time upload status updates.\n    \"\"\"\n    await manager.connect(websocket, file_id)\n    \n    try:\n        # Send initial status\n        environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'\n        upload_handler = get_upload_handler(environment)\n        initial_status = upload_handler.get_processing_status(file_id)\n        \n        await websocket.send_json({\n            'type': 'status_update',\n            'data': initial_status\n        })\n        \n        # Keep connection alive and listen for updates\n        while True:\n            # In a real implementation, you'd have a background task\n            # that pushes updates when file status changes\n            await asyncio.sleep(5)\n            \n            # Check for status updates\n            current_status = upload_handler.get_processing_status(file_id)\n            await websocket.send_json({\n                'type': 'status_update', \n                'data': current_status\n            })\n            \n    except WebSocketDisconnect:\n        manager.disconnect(websocket, file_id)\n    except Exception as e:\n        logger.error(f\"WebSocket error for {file_id}: {e}\")\n        manager.disconnect(websocket, file_id)\n\n# Background task to process upload queue\n@router.on_event(\"startup\")\nasync def start_queue_processor():\n    \"\"\"Start background queue processor.\"\"\"\n    \n    if os.getenv('UPLOAD_QUEUE_ENABLED', 'true').lower() != 'true':\n        logger.info(\"📋 Upload queue disabled, skipping queue processor\")\n        return\n    \n    import asyncio\n    \n    environment = 'dev' if os.getenv('BACKEND_DEV_MODE', 'true').lower() == 'true' else 'prod'\n    upload_handler = get_upload_handler(environment)\n    \n    # Start background processor\n    asyncio.create_task(upload_handler.process_queued_files(\"document_processor\"))\n    \n    logger.info(\"🚀 Upload queue processor started\")\n\nimport time\nimport asyncio"
diff --git a/archive/auto_processing/enhanced_upload_handler.py b/archive/auto_processing/enhanced_upload_handler.py
deleted file mode 100644
index d453639..0000000
--- a/archive/auto_processing/enhanced_upload_handler.py
+++ /dev/null
@@ -1,362 +0,0 @@
-"""
-Enhanced Upload Handler with Memory-Aware Queuing
-=================================================
-
-Replaces the immediate processing model with intelligent queue management.
-Provides user feedback about capacity, queue position, and processing status.
-
-Features:
-- Pre-upload capacity checking
-- Memory-aware queuing with user quotas
-- Real-time status updates via WebSocket/SSE
-- Graceful degradation under load
-- Fair queuing across multiple users
-"""
-
-import os
-import uuid
-import time
-import logging
-import asyncio
-from typing import Dict, List, Optional, Any, Tuple
-from fastapi import HTTPException, BackgroundTasks
-from dataclasses import asdict
-
-from .memory_aware_queue import get_memory_queue, QueuedFile
-from .redis_manager import get_redis_manager
-from modules.database.supabase.utils.client import SupabaseServiceRoleClient
-from modules.database.tools.storage.storage_admin import StorageAdmin
-
-logger = logging.getLogger(__name__)
-
-class EnhancedUploadHandler:
-    """Enhanced upload handler with memory-aware queuing."""
-    
-    def __init__(self, environment: str = "dev"):
-        self.memory_queue = get_memory_queue(environment)
-        self.redis_manager = get_redis_manager(environment)
-        self.redis_client = self.redis_manager.client
-        
-        # Processing status tracking
-        self.processing_status_key = "file_processing_status"
-        
-    def check_upload_eligibility(self, user_id: str, file_size: int, 
-                               mime_type: str) -> Tuple[bool, str, Dict[str, Any]]:
-        """
-        Check if user can upload a file right now.
-        
-        Returns:
-            (eligible, message, details)
-        """
-        
-        # Check system capacity
-        can_accept, message, queue_info = self.memory_queue.check_upload_capacity(
-            user_id, file_size, mime_type
-        )
-        
-        if not can_accept:
-            return False, message, {
-                'reason': 'capacity_exceeded',
-                'queue_info': queue_info,
-                'recommendations': self._get_recommendations(queue_info)
-            }
-        
-        return True, message, {
-            'status': 'ready_for_upload',
-            'queue_info': queue_info,
-            'processing_estimate': self._estimate_processing_time(file_size, mime_type)
-        }
-    
-    async def handle_upload_with_queue(self, file_id: str, user_id: str, 
-                                     filename: str, file_bytes: bytes,
-                                     mime_type: str, cabinet_id: str,
-                                     priority: int = 1) -> Dict[str, Any]:
-        """
-        Handle file upload with intelligent queuing.
-        
-        Steps:
-        1. Store file immediately (cheap operation)
-        2. Add to processing queue
-        3. Return queue status to user
-        4. Process asynchronously when capacity available
-        """
-        
-        # Store file immediately (this is fast)
-        storage = StorageAdmin()
-        client = SupabaseServiceRoleClient()
-        
-        # Create database record
-        bucket = f"{cabinet_id}-files"  # or your bucket naming convention
-        storage_path = f"{cabinet_id}/{file_id}/{filename}"
-        
-        try:
-            # Store file
-            storage.upload_file(bucket, storage_path, file_bytes, mime_type, upsert=True)
-            
-            # Create file record
-            insert_res = client.supabase.table('files').insert({
-                'id': file_id,
-                'name': filename,
-                'cabinet_id': cabinet_id,
-                'bucket': bucket,
-                'path': storage_path,
-                'mime_type': mime_type,
-                'uploaded_by': user_id,
-                'size_bytes': len(file_bytes),
-                'source': 'classroomcopilot-web',
-                'status': 'queued_for_processing'  # New status
-            }).execute()
-            
-            if not insert_res.data:
-                raise HTTPException(status_code=500, detail="Failed to create file record")
-            
-        except Exception as e:
-            logger.error(f"Failed to store file {file_id}: {e}")
-            raise HTTPException(status_code=500, detail=f"Storage failed: {str(e)}")
-        
-        # Add to processing queue
-        try:
-            queue_result = self.memory_queue.enqueue_file(
-                file_id=file_id,
-                user_id=user_id,
-                filename=filename,
-                size_bytes=len(file_bytes),
-                mime_type=mime_type,
-                cabinet_id=cabinet_id,
-                priority=priority
-            )
-            
-            # Update file status in database
-            client.supabase.table('files').update({
-                'status': 'queued_for_processing',
-                'extra': {
-                    'queue_position': queue_result['queue_position'],
-                    'estimated_wait_seconds': queue_result['estimated_wait_seconds'],
-                    'memory_estimate_mb': queue_result['memory_estimate_mb']
-                }
-            }).eq('id', file_id).execute()
-            
-            logger.info(f"📋 File {file_id} queued at position {queue_result['queue_position']}")
-            
-            return {
-                'status': 'upload_successful',
-                'message': 'File uploaded and queued for processing',
-                'file_id': file_id,
-                'queue_info': queue_result,
-                'next_steps': {
-                    'poll_status_endpoint': f'/database/files/{file_id}/processing-status',
-                    'websocket_updates': f'/ws/file-processing/{file_id}'
-                }
-            }
-            
-        except Exception as e:
-            logger.error(f"Failed to queue file {file_id}: {e}")
-            # Clean up stored file
-            try:
-                storage.delete_file(bucket, storage_path)
-                client.supabase.table('files').delete().eq('id', file_id).execute()
-            except:
-                pass
-            raise HTTPException(status_code=500, detail=f"Queue failed: {str(e)}")
-    
-    async def process_queued_files(self, service_name: str = "document_processor"):
-        """
-        Background service to process queued files.
-        This runs continuously as a background task.
-        """
-        
-        logger.info(f"🚀 Started queue processor for {service_name}")
-        
-        while True:
-            try:
-                # Get next file from queue
-                queued_file = self.memory_queue.dequeue_next_file(service_name)
-                
-                if not queued_file:
-                    # No files ready for processing
-                    await asyncio.sleep(5)
-                    continue
-                
-                # Update file status
-                await self._update_processing_status(queued_file.file_id, 'processing')
-                
-                # Process the file
-                try:
-                    await self._process_file(queued_file, service_name)
-                    await self._update_processing_status(queued_file.file_id, 'completed')
-                    
-                except Exception as e:
-                    logger.error(f"Failed to process file {queued_file.file_id}: {e}")
-                    await self._update_processing_status(queued_file.file_id, 'failed', str(e))
-                
-                finally:
-                    # Always free memory
-                    self.memory_queue.complete_processing(
-                        service_name, 
-                        queued_file.file_id, 
-                        queued_file.memory_estimate_mb
-                    )
-                
-            except Exception as e:
-                logger.error(f"Queue processor error: {e}")
-                await asyncio.sleep(10)  # Back off on errors
-    
-    async def _process_file(self, queued_file: QueuedFile, service_name: str):
-        """Process a single file from the queue."""
-        
-        logger.info(f"🔄 Processing file {queued_file.file_id} in {service_name}")
-        
-        # Import here to avoid circular imports
-        from modules.pipeline_controller import get_pipeline_controller
-        
-        client = SupabaseServiceRoleClient()
-        controller = get_pipeline_controller()
-        
-        # Get file record
-        file_result = client.supabase.table('files').select('*').eq('id', queued_file.file_id).single().execute()
-        file_row = file_result.data
-        
-        if not file_row:
-            raise Exception(f"File record not found: {queued_file.file_id}")
-        
-        # Update status to processing
-        client.supabase.table('files').update({
-            'status': 'processing'
-        }).eq('id', queued_file.file_id).execute()
-        
-        # Convert to PDF if needed (this is where the bottleneck was before)
-        processing_path = await self._handle_pdf_conversion(file_row)
-        
-        # Enqueue Phase 1 tasks
-        phase1_tasks = controller.enqueue_phase1_tasks(
-            file_id=queued_file.file_id,
-            file_row={**file_row, 'path': processing_path},
-            processing_path=processing_path,
-            processing_mime=file_row['mime_type']
-        )
-        
-        # Update database with task IDs
-        client.supabase.table('files').update({
-            'status': 'phase1_processing',
-            'extra': {
-                **file_row.get('extra', {}),
-                'phase1_tasks': phase1_tasks,
-                'processing_started_at': time.time()
-            }
-        }).eq('id', queued_file.file_id).execute()
-        
-        logger.info(f"✅ File {queued_file.file_id} processing initiated")
-    
-    async def _handle_pdf_conversion(self, file_row: Dict[str, Any]) -> str:
-        """Handle PDF conversion asynchronously."""
-        
-        if file_row['mime_type'] == 'application/pdf':
-            return file_row['path']
-        
-        # TODO: Implement async PDF conversion
-        # For now, return original path and handle conversion in pipeline
-        logger.info(f"PDF conversion queued for file {file_row['id']}")
-        return file_row['path']
-    
-    async def _update_processing_status(self, file_id: str, status: str, error: str = None):
-        """Update file processing status."""
-        
-        status_data = {
-            'file_id': file_id,
-            'status': status,
-            'timestamp': time.time(),
-            'error': error
-        }
-        
-        # Store in Redis for real-time updates
-        status_key = f"{self.processing_status_key}:{file_id}"
-        self.redis_client.setex(status_key, 86400, json.dumps(status_data))  # 24h expiry
-        
-        # Update database
-        client = SupabaseServiceRoleClient()
-        client.supabase.table('files').update({
-            'status': status,
-            'error_message': error
-        }).eq('id', file_id).execute()
-        
-        logger.info(f"📊 Status update for {file_id}: {status}")
-    
-    def get_processing_status(self, file_id: str) -> Dict[str, Any]:
-        """Get current processing status for a file."""
-        
-        status_key = f"{self.processing_status_key}:{file_id}"
-        status_json = self.redis_client.get(status_key)
-        
-        if status_json:
-            return json.loads(status_json)
-        
-        # Fallback to database
-        client = SupabaseServiceRoleClient()
-        result = client.supabase.table('files').select('status, error_message, extra').eq('id', file_id).single().execute()
-        
-        if result.data:
-            return {
-                'file_id': file_id,
-                'status': result.data['status'],
-                'error': result.data.get('error_message'),
-                'extra': result.data.get('extra', {})
-            }
-        
-        return {'file_id': file_id, 'status': 'not_found'}
-    
-    def _estimate_processing_time(self, file_size: int, mime_type: str) -> Dict[str, Any]:
-        """Estimate processing time for a file."""
-        
-        # Base time estimates (in seconds)
-        base_times = {
-            'application/pdf': 60,      # 1 minute per MB
-            'application/msword': 120,  # 2 minutes per MB
-            'image/': 30               # 30 seconds per MB
-        }
-        
-        # Find matching mime type
-        time_per_mb = 60  # default
-        for mime_prefix, time_val in base_times.items():
-            if mime_type.startswith(mime_prefix):
-                time_per_mb = time_val
-                break
-        
-        file_size_mb = file_size / (1024 * 1024)
-        estimated_seconds = int(file_size_mb * time_per_mb)
-        
-        return {
-            'estimated_seconds': estimated_seconds,
-            'estimated_minutes': estimated_seconds / 60,
-            'phases': {
-                'pdf_conversion': estimated_seconds * 0.2,
-                'metadata_extraction': estimated_seconds * 0.3,
-                'docling_processing': estimated_seconds * 0.5
-            }
-        }
-    
-    def _get_recommendations(self, queue_info: Dict[str, Any]) -> List[str]:
-        """Get recommendations for user when upload is rejected."""
-        
-        recommendations = []
-        
-        if queue_info.get('reason') == 'file_too_large':
-            recommendations.append("Try compressing your file or splitting it into smaller parts")
-        
-        if queue_info.get('utilization', 0) > 0.9:
-            recommendations.append("System is currently overloaded. Try uploading during off-peak hours")
-            recommendations.append("Consider uploading smaller files first")
-        
-        if queue_info.get('user_current', 0) > 0:
-            recommendations.append("Wait for your current uploads to complete before uploading more")
-        
-        if not recommendations:
-            recommendations.append("Please try again in a few minutes")
-        
-        return recommendations
-
-# Convenience functions
-def get_upload_handler(environment: str = "dev") -> EnhancedUploadHandler:
-    """Get enhanced upload handler instance."""
-    return EnhancedUploadHandler(environment)
-
-import json
diff --git a/archive/auto_processing/files_with_auto_processing.py b/archive/auto_processing/files_with_auto_processing.py
deleted file mode 100644
index 3188335..0000000
--- a/archive/auto_processing/files_with_auto_processing.py
+++ /dev/null
@@ -1,997 +0,0 @@
-import os
-import io
-from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, BackgroundTasks
-from typing import Any, Dict, Optional
-import uuid
-import re
-import requests
-import os
-import tempfile
-from pathlib import Path
-from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str
-from modules.logger_tool import initialise_logger
-from modules.database.supabase.utils.client import SupabaseServiceRoleClient
-from modules.database.supabase.utils.storage import StorageAdmin
-from modules.document_processor import DocumentProcessor
-from modules.queue_system import (
-    enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
-    enqueue_document_analysis_task, enqueue_page_images_task,
-    TaskPriority, get_queue, QueueConnectionError
-)
-from fastapi.responses import Response
-from fastapi import Body
-
-router = APIRouter()
-auth = SupabaseBearer()
-doc_processor = DocumentProcessor()
-
-DEFAULT_BUCKET = os.getenv('DEFAULT_FILES_BUCKET', 'cc.users')
-
-# Timeout configurations (in seconds)
-TIKA_TIMEOUT = int(os.getenv('TIKA_TIMEOUT', '300'))  # 5 minutes default
-DOCLING_FRONTMATTER_TIMEOUT = int(os.getenv('DOCLING_FRONTMATTER_TIMEOUT', '1800'))  # 30 minutes default
-DOCLING_NOOCR_TIMEOUT = int(os.getenv('DOCLING_NOOCR_TIMEOUT', '3600'))  # 1 hour default
-
-# (Legacy feature flags removed - using new three-phase system)
-
-logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
-
-def _safe_filename(name: str) -> str:
-    base = os.path.basename(name or 'file')
-    return re.sub(r"[^A-Za-z0-9._-]+", "_", base)
-
-def _choose_bucket(scope: str, user_id: str, school_id: Optional[str]) -> str:
-    scope = (scope or 'teacher').lower()
-    if scope == 'school' and school_id:
-        return f"cc.institutes.{school_id}.private"
-    # teacher / student fall back to users bucket for now
-    return 'cc.users'
-
-@router.post("/files/upload")
-async def upload_file(
-    cabinet_id: str = Form(...),
-    path: str = Form(...),
-    scope: str = Form('teacher'),
-    school_id: Optional[str] = Form(default=None),
-    file: UploadFile = File(...),
-    payload: Dict[str, Any] = Depends(auth),
-    background_tasks: BackgroundTasks = None
-):
-    user_id = payload.get('sub') or payload.get('user_id')
-    if not user_id:
-        raise HTTPException(status_code=401, detail="Invalid token payload")
-
-    client = SupabaseServiceRoleClient()
-    storage = StorageAdmin()
-
-    # Determine target bucket by scope
-    bucket = _choose_bucket(scope, user_id, school_id)
-
-    # Stage DB row to get file_id
-    staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}"
-    name = _safe_filename(path or file.filename)
-    file_bytes = await file.read()
-    insert_res = client.supabase.table('files').insert({
-        'cabinet_id': cabinet_id,
-        'name': name,
-        'path': staged_path,
-        'bucket': bucket,
-        'mime_type': file.content_type,
-        'uploaded_by': user_id,
-        'size_bytes': len(file_bytes),
-        'source': 'classroomcopilot-web'
-    }).execute()
-    if not insert_res.data:
-        raise HTTPException(status_code=500, detail="Failed to create file record")
-    file_row = insert_res.data[0]
-    file_id = file_row['id']
-
-    # Final storage path: bucket/cabinet_id/file_id/file
-    final_storage_path = f"{cabinet_id}/{file_id}/{name}"
-    try:
-        storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True)
-    except Exception as e:
-        # cleanup staged row
-        client.supabase.table('files').delete().eq('id', file_id).execute()
-        raise HTTPException(status_code=500, detail=f"Storage upload failed: {str(e)}")
-
-    # Update DB path to final
-    update_res = client.supabase.table('files').update({
-        'path': final_storage_path
-    }).eq('id', file_id).execute()
-    # Kick off initial artefacts generation in background (Tika + Docling frontmatter + no-OCR)
-    try:
-        if background_tasks is not None:
-            logger.info(f"Scheduling initial artefacts generation for file_id={file_id}")
-            background_tasks.add_task(generate_initial_artefacts, file_id, payload)
-        else:
-            logger.info(f"Running initial artefacts generation synchronously for file_id={file_id}")
-            generate_initial_artefacts(file_id, payload)
-    except Exception as e:
-        logger.error(f"Failed to schedule initial artefacts for file_id={file_id}: {e}")
-
-    return update_res.data
-
-@router.get("/files")
-def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
-    client = SupabaseServiceRoleClient()
-    res = client.supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute()
-    return res.data
-
-@router.post("/files/{file_id}/move")
-def move_file(file_id: str, body: Dict[str, Any], payload: Dict[str, Any] = Depends(auth)):
-    client = SupabaseServiceRoleClient()
-    updates = {}
-    if 'cabinet_id' in body:
-        updates['cabinet_id'] = body['cabinet_id']
-    if 'path' in body:
-        updates['path'] = body['path']
-    if not updates:
-        raise HTTPException(status_code=400, detail="No changes provided")
-    res = client.supabase.table('files').update(updates).eq('id', file_id).execute()
-    return res.data
-
-@router.delete("/files/{file_id}")
-def delete_file(file_id: str, payload: Dict[str, Any] = Depends(auth)):
-    client = SupabaseServiceRoleClient()
-    res = client.supabase.table('files').delete().eq('id', file_id).execute()
-    return res.data
-
-@router.get("/files/{file_id}/artefacts")
-def list_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
-    client = SupabaseServiceRoleClient()
-    res = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).order('created_at', desc=True).execute()
-    return res.data
-
-@router.get("/files/{file_id}/viewer-artefacts")
-def list_viewer_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
-    """
-    Get artefacts organized for UI viewer display, including frontmatter JSON,
-    processing bundles, and analysis data with proper display metadata.
-    """
-    client = SupabaseServiceRoleClient()
-    
-    # Get all artefacts for the file
-    res = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).order('created_at', desc=True).execute()
-    all_artefacts = res.data or []
-    
-    # Organize artefacts by category for UI display
-    viewer_artefacts = {
-        'document_analysis': [],
-        'processing_bundles': [],
-        'raw_data': []
-    }
-    
-    for artefact in all_artefacts:
-        artefact_type = artefact.get('type', '')
-        extra = artefact.get('extra', {})
-        
-        # Enhanced artefact info for UI display
-        artefact_info = {
-            'id': artefact['id'],
-            'type': artefact_type,
-            'display_name': extra.get('display_name'),
-            'bundle_label': extra.get('bundle_label'),
-            'section_title': extra.get('section_title'),
-            'page_range': extra.get('page_range'),
-            'page_count': extra.get('page_count'),
-            'pipeline': extra.get('pipeline'),
-            'processing_mode': extra.get('processing_mode'),
-            'ui_order': extra.get('ui_order', 999),
-            'description': extra.get('description'),
-            'viewer_type': extra.get('viewer_type', 'json'),
-            'created_at': artefact['created_at'],
-            'status': artefact.get('status', 'unknown')
-        }
-        
-        # Categorize artefacts for UI organization
-        if artefact_type == 'docling_frontmatter_json':
-            artefact_info.update({
-                'display_name': artefact_info['display_name'] or 'Document Frontmatter',
-                'bundle_label': artefact_info['bundle_label'] or 'Frontmatter Analysis',
-                'description': artefact_info['description'] or 'OCR analysis of document structure and metadata',
-                'ui_order': 1,
-                'viewer_type': 'json'
-            })
-            viewer_artefacts['document_analysis'].append(artefact_info)
-            
-        elif artefact_type == 'split_map_json':
-            artefact_info.update({
-                'display_name': 'Document Structure Map',
-                'bundle_label': 'Split Map',
-                'description': 'Document section boundaries and organization structure',
-                'ui_order': 2,
-                'viewer_type': 'json'
-            })
-            viewer_artefacts['document_analysis'].append(artefact_info)
-            
-        elif artefact_type == 'tika_json':
-            artefact_info.update({
-                'display_name': 'Document Metadata',
-                'bundle_label': 'Tika Analysis',
-                'description': 'Raw document metadata and properties extracted by Apache Tika',
-                'ui_order': 3,
-                'viewer_type': 'json'
-            })
-            viewer_artefacts['raw_data'].append(artefact_info)
-            
-        elif artefact_type in ['canonical_docling_json', 'docling_bundle_split', 'docling_bundle', 'docling_standard', 'docling_bundle_split_pages']:
-            # Processing bundles (OCR, No-OCR, VLM) - use original_pipeline for proper differentiation
-            pipeline_name = extra.get('original_pipeline', extra.get('pipeline', 'Unknown'))
-            bundle_label = artefact_info['bundle_label'] or f"{pipeline_name.upper().replace('_', '-')} Bundle"
-            display_name = artefact_info['display_name'] or f"{pipeline_name.upper().replace('_', '-')} Processing Result"
-            
-            # Special handling for master manifests
-            if artefact_type == 'docling_bundle_split_pages':
-                display_name = f"{pipeline_name.upper().replace('_', '-')} Document Pages"
-                bundle_label = f"{pipeline_name.upper().replace('_', '-')} Pages Bundle"
-                artefact_info.update({
-                    'viewer_type': 'bundle_collection',
-                    'is_master_manifest': True,
-                    'ui_order': 10  # Show master manifests before individual pages
-                })
-            elif artefact_type == 'docling_standard':
-                # Individual page bundles - lower UI priority  
-                artefact_info.update({
-                    'viewer_type': 'page_bundle',
-                    'is_individual_page': True,
-                    'ui_order': extra.get('split_order', 999) + 100  # Show after master manifests
-                })
-            
-            artefact_info.update({
-                'display_name': display_name,
-                'bundle_label': bundle_label,
-                'description': f"Docling processing result using {pipeline_name.replace('_', '-')} pipeline",
-                'pipeline_type': pipeline_name  # Add explicit pipeline type for UI
-            })
-            viewer_artefacts['processing_bundles'].append(artefact_info)
-            
-        elif artefact_type.startswith('docling_') and artefact_type.endswith('_json'):
-            # Other docling JSON results
-            pipeline_name = artefact_type.replace('docling_', '').replace('_json', '').upper()
-            artefact_info.update({
-                'display_name': f"{pipeline_name} Analysis",
-                'bundle_label': f"{pipeline_name} Result",
-                'description': f"Docling {pipeline_name.lower()} processing result",
-                'viewer_type': 'json'
-            })
-            viewer_artefacts['processing_bundles'].append(artefact_info)
-            
-        elif artefact_type == 'page_images':
-            artefact_info.update({
-                'display_name': 'Page Images',
-                'bundle_label': 'Visual Pages',
-                'description': 'Generated page images for document visualization',
-                'viewer_type': 'images'
-            })
-            viewer_artefacts['raw_data'].append(artefact_info)
-    
-    # Sort each category by ui_order
-    for category in viewer_artefacts.values():
-        category.sort(key=lambda x: (x['ui_order'], x['created_at']))
-    
-    return {
-        'file_id': file_id,
-        'categories': viewer_artefacts,
-        'total_artefacts': len(all_artefacts)
-    }
-
-@router.post("/files/{file_id}/artefacts/initial")
-def generate_initial_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
-    """
-    Generate initial artefacts using the new three-phase pipeline architecture.
-    
-    Phase 1: Document Structure Discovery & Analysis
-    - Tika metadata extraction
-    - Page images generation  
-    - Document structure analysis (LLM-enhanced)
-    - Split map generation
-    
-    Phase 2: Triggered automatically after Phase 1 completion
-    """
-    logger.info(f"Three-phase pipeline: Starting Phase 1 for file_id={file_id}")
-    
-    from modules.pipeline_controller import get_pipeline_controller
-    
-    client = SupabaseServiceRoleClient()
-    storage = StorageAdmin()
-    controller = get_pipeline_controller()
-
-    # Load file row
-    fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
-    file_row = fr.data
-    if not file_row:
-        raise HTTPException(status_code=404, detail="File not found")
-
-    bucket = file_row['bucket']
-    storage_path = file_row['path']
-    cabinet_id = file_row['cabinet_id']
-    mime = file_row.get('mime_type') or 'application/octet-stream'
-    filename = file_row.get('name', 'file')
-
-    # Step 1: Convert to PDF if not already a PDF (synchronous for now)
-    processing_path = storage_path
-    processing_mime = mime
-    
-    if mime != 'application/pdf':
-        logger.info(f"Converting non-PDF file to PDF: file_id={file_id} mime={mime}")
-        try:
-            file_bytes = storage.download_file(bucket, storage_path)
-            
-            with tempfile.TemporaryDirectory() as temp_dir:
-                # Save original file to temp location
-                temp_input = Path(temp_dir) / filename
-                with open(temp_input, 'wb') as f:
-                    f.write(file_bytes)
-                
-                # Convert to PDF
-                pdf_bytes = doc_processor.convert_to_pdf(temp_input)
-                
-                # Store PDF as artefact
-                pdf_artefact_id = str(uuid.uuid4())
-                pdf_rel_path = f"{cabinet_id}/{file_id}/{pdf_artefact_id}/document.pdf"
-                storage.upload_file(bucket, pdf_rel_path, pdf_bytes, 'application/pdf', upsert=True)
-                
-                pdf_ar = client.supabase.table('document_artefacts').insert({
-                    'file_id': file_id,
-                    'type': 'document_pdf',
-                    'rel_path': pdf_rel_path,
-                    'extra': {'converted_from': mime, 'original_filename': filename},
-                    'status': 'completed'
-                }).execute()
-                
-                # Use converted PDF for subsequent processing
-                processing_path = pdf_rel_path
-                processing_mime = 'application/pdf'
-                logger.info(f"PDF conversion: completed file_id={file_id} rel_path={pdf_rel_path}")
-                
-        except Exception as e:
-            logger.error(f"PDF conversion: error processing file_id={file_id}: {e}")
-            # Continue with original file if conversion fails
-    else:
-        logger.info(f"File is already PDF, skipping conversion: file_id={file_id}")
-
-    # Step 2: Enqueue Phase 1 tasks using the new pipeline controller
-    user_id = payload.get('sub') or payload.get('user_id')
-    priority = TaskPriority.HIGH if user_id else TaskPriority.NORMAL
-
-    try:
-        # Update file row with processing path
-        updated_file_row = {**file_row, 'path': processing_path, 'mime_type': processing_mime}
-        
-        # Enqueue Phase 1 tasks
-        phase1_tasks = controller.enqueue_phase1_tasks(
-            file_id=file_id,
-            file_row=updated_file_row,
-            processing_path=processing_path,
-            processing_mime=processing_mime,
-            priority=priority
-        )
-        
-        total_tasks = sum(len(task_list) for task_list in phase1_tasks.values())
-        
-        logger.info(f"Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks for file_id={file_id}")
-        
-
-        return {
-            'message': f'Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks. Phase 2 will trigger automatically after completion.',
-            'phase1_tasks': {k: v for k, v in phase1_tasks.items()},
-            'file_id': file_id,
-            'pipeline_mode': 'three_phase',
-            'bundle_architecture_enabled': True
-        }
-
-    except QueueConnectionError as e:
-        logger.error(f"Queue system unavailable for file_id={file_id}: {e}")
-        logger.error("Redis is not running. Please start the API server with './start.sh dev' to auto-start Redis.")
-        return {
-            'message': 'File uploaded successfully, but processing tasks could not be queued (Redis unavailable)',
-            'file_id': file_id,
-            'queue_status': 'unavailable',
-            'error': 'Queue system unavailable. Please restart the API server with Redis enabled.'
-        }
-    except Exception as e:
-        logger.error(f"Unexpected error enqueueing Phase 1 tasks for file_id={file_id}: {e}")
-        return {
-            'message': 'File uploaded successfully, but processing tasks failed to queue',
-            'file_id': file_id,
-            'queue_status': 'failed',
-            'error': str(e)
-        }
-
-@router.get("/files/{file_id}/page-images/manifest")
-def get_page_images_manifest(file_id: str, payload: Dict[str, Any] = Depends(auth)):
-    """Return the page_images manifest JSON for a file via service-role access."""
-    client = SupabaseServiceRoleClient()
-    storage = StorageAdmin()
-
-    # Find file row to get bucket
-    fr = client.supabase.table('files').select('id,bucket,cabinet_id').eq('id', file_id).single().execute()
-    file_row = fr.data or {}
-    if not file_row:
-        raise HTTPException(status_code=404, detail="File not found")
-    bucket = file_row['bucket']
-    cabinet_id = file_row['cabinet_id']
-
-    # Find page_images artefact
-    arts = client.supabase.table('document_artefacts') \
-        .select('id,type,rel_path,extra') \
-        .eq('file_id', file_id).eq('type', 'page_images') \
-        .order('created_at', desc=True).limit(1).execute().data or []
-    if not arts:
-        raise HTTPException(status_code=404, detail="page_images artefact not found")
-    art = arts[0]
-
-    # Manifest path
-    manifest_rel_path = (art.get('extra') or {}).get('manifest') or f"{art['rel_path'].rstrip('/')}/page_images.json"
-
-    try:
-        raw = storage.download_file(bucket, manifest_rel_path)
-        import json as _json
-        manifest = _json.loads(raw.decode('utf-8'))
-        # Ensure bucket and base prefix are present for the UI
-        manifest.setdefault('bucket', bucket)
-        manifest.setdefault('base_dir', art['rel_path'])
-        return manifest
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}")
-
-def json_dumps(obj: Any) -> str:
-    try:
-        import json
-        return json.dumps(obj, ensure_ascii=False)
-    except Exception:
-        return "{}"
-
-
-@router.get("/files/{file_id}/artefacts/{artefact_id}/json")
-def get_artefact_json(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
-    """Return the JSON content of a document artefact using service-role storage access."""
-    client = SupabaseServiceRoleClient()
-    storage = StorageAdmin()
-    # Look up artefact to get rel_path and validate it belongs to file
-    ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path').eq('id', artefact_id).single().execute()
-    artefact = ar.data
-    if not artefact:
-        raise HTTPException(status_code=404, detail="Artefact not found")
-    if artefact.get('file_id') != file_id:
-        raise HTTPException(status_code=400, detail="Artefact does not belong to file")
-
-    # Look up file to get bucket
-    fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute()
-    file_row = fr.data
-    if not file_row:
-        raise HTTPException(status_code=404, detail="File not found")
-
-    bucket = file_row['bucket']
-    rel_path = artefact['rel_path']
-    try:
-        raw = storage.download_file(bucket, rel_path)
-        import json as _json
-        data = _json.loads(raw.decode('utf-8'))
-        return data
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to load artefact JSON: {str(e)}")
-
-
-@router.get("/files/{file_id}/artefacts/{artefact_id}/vlm-section-manifest")
-def get_vlm_section_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
-    """Return the VLM section page bundle manifest JSON for a VLM section bundle artefact."""
-    client = SupabaseServiceRoleClient()
-    storage = StorageAdmin()
-
-    ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path,type,extra').eq('id', artefact_id).single().execute().data
-    if not ar:
-        raise HTTPException(status_code=404, detail="Artefact not found")
-    if ar.get('file_id') != file_id:
-        raise HTTPException(status_code=400, detail="Artefact does not belong to file")
-    if ar.get('type') != 'vlm_section_page_bundle':
-        raise HTTPException(status_code=400, detail="Artefact is not a VLM section page bundle")
-
-    fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
-    if not fr:
-        raise HTTPException(status_code=404, detail="File not found")
-    bucket = fr['bucket']
-
-    # The rel_path directly points to the manifest JSON file
-    manifest_rel_path = ar['rel_path']
-
-    try:
-        raw = storage.download_file(bucket, manifest_rel_path)
-        import json as _json
-        data = _json.loads(raw.decode('utf-8'))
-        # ensure bucket present for client use
-        data.setdefault('bucket', bucket)
-        return data
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to load VLM section manifest: {e}")
-
-
-@router.post("/files/{file_id}/artefacts/outline")
-def enqueue_outline_structure(file_id: str, payload: Dict[str, Any] = Depends(auth)):
-    """
-    Manually enqueue the fast document outline (headings-only) analysis for an existing file.
-    Returns the queued task id.
-    """
-    client = SupabaseServiceRoleClient()
-
-    fr = client.supabase.table('files').select('id,bucket,cabinet_id,path,mime_type').eq('id', file_id).single().execute()
-    file_row = fr.data
-    if not file_row:
-        raise HTTPException(status_code=404, detail="File not found")
-
-    bucket = file_row['bucket']
-    storage_path = file_row['path']
-    cabinet_id = file_row['cabinet_id']
-    mime = file_row.get('mime_type') or 'application/pdf'
-
-    # Prefer converted PDF artefact if available
-    arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
-    pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None)
-    processing_path = pdf_art['rel_path'] if pdf_art else storage_path
-
-    try:
-        task_id = enqueue_docling_task(
-            file_id=file_id,
-            task_type='document_structure_analysis',
-            payload={
-                'bucket': bucket,
-                'file_path': processing_path,
-                'cabinet_id': cabinet_id,
-                'mime_type': mime,
-                'config': {
-                    'target_type': 'inbody',
-                    'to_formats': 'json',
-                    'do_ocr': False,
-                    'force_ocr': False
-                }
-            },
-            priority=TaskPriority.NORMAL,
-            timeout=300
-        )
-        return { 'message': 'outline task enqueued', 'task_id': task_id, 'file_id': file_id }
-    except QueueConnectionError as e:
-        raise HTTPException(status_code=503, detail=f"Queue unavailable: {e}")
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to enqueue outline task: {e}")
-
-@router.get("/files/proxy")
-def proxy_storage_file(bucket: str, path: str, payload: Dict[str, Any] = Depends(auth)):
-    """Proxy a storage file (service-role), useful for private image access in the UI."""
-    storage = StorageAdmin()
-    try:
-        data = storage.download_file(bucket, path)
-        media = 'application/octet-stream'
-        lp = path.lower()
-        if lp.endswith('.png'):
-            media = 'image/png'
-        elif lp.endswith('.webp'):
-            media = 'image/webp'
-        elif lp.endswith('.jpg') or lp.endswith('.jpeg'):
-            media = 'image/jpeg'
-        elif lp.endswith('.json'):
-            media = 'application/json'
-        return Response(content=data, media_type=media)
-    except Exception as e:
-        raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}")
-
-
-# Signed proxy for iframe/img tags without Authorization header
-@router.get("/files/proxy_signed")
-def proxy_storage_file_signed(bucket: str, path: str, token: str):
-    """Proxy using a signed bearer token passed as query param 'token'."""
-    try:
-        payload = verify_supabase_jwt_str(token)
-        if not payload:
-            raise HTTPException(status_code=403, detail="Invalid token")
-    except Exception as e:
-        raise HTTPException(status_code=403, detail=f"Invalid token: {e}")
-
-    storage = StorageAdmin()
-    try:
-        data = storage.download_file(bucket, path)
-        media = 'application/octet-stream'
-        lp = path.lower()
-        if lp.endswith('.png'):
-            media = 'image/png'
-        elif lp.endswith('.webp'):
-            media = 'image/webp'
-        elif lp.endswith('.jpg') or lp.endswith('.jpeg'):
-            media = 'image/jpeg'
-        elif lp.endswith('.json'):
-            media = 'application/json'
-        return Response(content=data, media_type=media)
-    except Exception as e:
-        raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}")
-
-# -------- Canonical bundle manifest ---------
-
-@router.get("/files/{file_id}/artefacts/{artefact_id}/manifest")
-def get_canonical_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
-    """Return the manifest.json for a canonical_docling_bundle artefact."""
-    client = SupabaseServiceRoleClient()
-    storage = StorageAdmin()
-
-    ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path,extra').eq('id', artefact_id).single().execute().data
-    if not ar:
-        raise HTTPException(status_code=404, detail="Artefact not found")
-    if ar.get('file_id') != file_id:
-        raise HTTPException(status_code=400, detail="Artefact does not belong to file")
-    extra = ar.get('extra') or {}
-    manifest_rel_path = extra.get('manifest')
-    if not manifest_rel_path:
-        raise HTTPException(status_code=404, detail="Manifest path not recorded on artefact")
-
-    fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
-    if not fr:
-        raise HTTPException(status_code=404, detail="File not found")
-    bucket = fr['bucket']
-
-    try:
-        raw = storage.download_file(bucket, manifest_rel_path)
-        import json as _json
-        data = _json.loads(raw.decode('utf-8'))
-        # ensure bucket present for client use
-        data.setdefault('bucket', bucket)
-        return data
-    except Exception as e:
-        raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}")
-
-# -------- Canonical Docling generation ---------
-
-def _load_split_map(client: SupabaseServiceRoleClient, storage: StorageAdmin, bucket: str, file_id: str) -> Optional[Dict[str, Any]]:
-    try:
-        arts = client.supabase.table('document_artefacts') \
-            .select('id,type,rel_path') \
-            .eq('file_id', file_id).eq('type', 'split_map_json') \
-            .order('created_at', desc=True).limit(1).execute().data or []
-        if not arts:
-            return None
-        art = arts[0]
-        raw = storage.download_file(bucket, art['rel_path'])
-        import json as _json
-        return _json.loads(raw.decode('utf-8'))
-    except Exception:
-        return None
-
-
-@router.post("/files/{file_id}/artefacts/canonical-docling")
-def enqueue_canonical_docling(
-    file_id: str,
-    body: Dict[str, Any] = Body(default={}),
-    payload: Dict[str, Any] = Depends(auth)
-):
-    """Enqueue generation of canonical Docling JSON(s) for a file.
-
-    If a split_map is available and the document is large, this will enqueue
-    multiple Docling jobs using page ranges per section. Otherwise a single
-    job is created for the whole document.
-    """
-    client = SupabaseServiceRoleClient()
-    storage = StorageAdmin()
-
-    fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
-    file_row = fr.data
-    if not file_row:
-        raise HTTPException(status_code=404, detail="File not found")
-
-    bucket = file_row['bucket']
-    cabinet_id = file_row['cabinet_id']
-    mime = file_row.get('mime_type') or 'application/pdf'
-    storage_path = file_row['path']
-
-    # Prefer converted PDF if available
-    try:
-        arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
-        a_pdf = next((a for a in arts if a.get('type') == 'document_pdf'), None)
-        processing_path = a_pdf['rel_path'] if a_pdf else storage_path
-        processing_mime = 'application/pdf' if a_pdf else mime
-    except Exception:
-        processing_path = storage_path
-        processing_mime = mime
-
-    # Determine page_count (prefer Tika; fallback to PDF parser if needed)
-    page_count = None
-    try:
-        arts_pc = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).execute().data or []
-        a_tika_pc = next((a for a in arts_pc if a.get('type') == 'tika_json'), None)
-        if a_tika_pc:
-            raw = storage.download_file(bucket, a_tika_pc['rel_path'])
-            import json as _json
-            tj = _json.loads(raw.decode('utf-8'))
-            for k in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount"):
-                v = tj.get(k) or tj.get(k.lower())
-                if v is not None:
-                    page_count = int(v)
-                    break
-    except Exception as e:
-        logger.debug(f"[canonical-docling] Tika page_count read failed: {e}")
-        pass
-
-    # Fallback: compute page_count from PDF if Tika did not provide it
-    if page_count is None:
-        try:
-            pdf_bytes = storage.download_file(bucket, processing_path)
-            try:
-                import fitz  # PyMuPDF
-                doc = fitz.open(stream=pdf_bytes, filetype='pdf')
-                page_count = int(doc.page_count)
-                doc.close()
-                logger.info(f"[canonical-docling] page_count via PyMuPDF: {page_count}")
-            except Exception:
-                try:
-                    from PyPDF2 import PdfReader
-                    reader = PdfReader(io.BytesIO(pdf_bytes))
-                    page_count = int(len(reader.pages))
-                    logger.info(f"[canonical-docling] page_count via PyPDF2: {page_count}")
-                except Exception:
-                    page_count = None
-        except Exception:
-            page_count = None
-    else:
-        logger.info(f"[canonical-docling] page_count via Tika: {page_count}")
-
-    # Optional custom range from caller
-    custom_range = body.get('custom_range')
-    custom_label = body.get('custom_label') or ''
-    selected_section_id = body.get('selected_section_id')
-    selected_section_title = body.get('selected_section_title')
-
-    # Load split map if requested and document is large enough
-    use_split_requested = bool(body.get('use_split_map', True))
-    split_threshold = int(body.get('threshold') or os.getenv('DOCLING_SPLIT_THRESHOLD', '50'))
-    ranges = []  # list of (start,end)
-    split_map = None
-    sections = []  # list of dicts: {start,end,title}
-    logger.info(f"[canonical-docling] use_split_map={use_split_requested} threshold={split_threshold} page_count={page_count}")
-    # If custom range provided, honor it and bypass split map
-    if isinstance(custom_range, list) and len(custom_range) >= 2:
-        try:
-            cs = int(custom_range[0]); ce = int(custom_range[1])
-            if page_count is not None:
-                cs = max(1, min(cs, page_count))
-                ce = max(cs, min(ce, page_count))
-            ranges = [(cs, ce)]
-            sections = [{'start': cs, 'end': ce, 'title': custom_label or 'Custom range'}]
-            use_split_requested = False
-            logger.info(f"[canonical-docling] using custom_range start={cs} end={ce} label='{custom_label}'")
-        except Exception as _e:
-            logger.warning(f"[canonical-docling] invalid custom_range; falling back. err={_e}")
-
-    if not ranges and use_split_requested and (page_count is None or page_count >= split_threshold):
-        split_map = _load_split_map(client, storage, bucket, file_id)
-        entries = (split_map or {}).get('entries') if split_map else []
-        logger.info(f"[canonical-docling] split_map loaded entries={len(entries) if isinstance(entries, list) else 0}")
-        if split_map and isinstance(entries, list) and len(entries) > 0:
-            # Normalize and sort entries by start_page to enforce correct order
-            norm: list[dict] = []
-            for e in entries:
-                try:
-                    s = int(e.get('start_page', 1))
-                    t = int(e.get('end_page', s))
-                    if t < s:
-                        t = s
-                    title = e.get('title') or e.get('label') or ''
-                    norm.append({'start': s, 'end': t, 'title': title})
-                except Exception:
-                    continue
-            norm.sort(key=lambda x: x['start'])
-            # Deduplicate identical or overlapping starts by keeping the earliest occurrence
-            ordered: list[dict] = []
-            last_end = 0
-            for e in norm:
-                s, t = int(e['start']), int(e['end'])
-                if ordered and s <= last_end:
-                    # Clamp to prevent inversion and maintain order
-                    s = last_end + 1
-                    if s > (page_count or s):
-                        continue
-                    if t < s:
-                        t = s
-                last_end = max(last_end, t)
-                ordered.append({'start': s, 'end': t, 'title': e['title']})
-            for e in ordered:
-                ranges.append((e['start'], e['end']))
-                sections.append(e)
-
-    # Fallback: if no split_map ranges... we shouldn't be here
-    if not ranges:
-        # If document is large, split into fixed windows to protect Docling server
-        if page_count is not None and page_count >= split_threshold:
-            chunk = int(os.getenv('DOCLING_FALLBACK_CHUNK_PAGES', '25'))
-            chunk = max(5, min(100, chunk))
-            for i in range(1, (page_count or 1) + 1, chunk):
-                end = min(i + chunk - 1, page_count or i)
-                ranges.append((i, end))
-                sections.append({'start': i, 'end': end, 'title': f"Pages {i}-{end}"})
-            logger.warning(f"[canonical-docling] using fallback chunking ranges={len(ranges)} chunk={chunk}")
-        else:
-            ranges = [(1, page_count or 9223372036854775807)]
-            logger.warning(f"[canonical-docling] using single-range fallback (small doc)")
-
-    # Build config
-    cfg = body.get('config', {})
-    pipeline = cfg.get('pipeline', 'standard')
-    config: Dict[str, Any] = {
-        # target_type is computed in processor based on to_formats unless explicitly provided by user
-        'to_formats': cfg.get('to_formats', 'json'),
-        'do_ocr': bool(cfg.get('do_ocr', True)),
-        'force_ocr': bool(cfg.get('force_ocr', False)),
-        'image_export_mode': cfg.get('image_export_mode', 'embedded'),
-        'ocr_engine': cfg.get('ocr_engine', 'easyocr'),
-        'ocr_lang': cfg.get('ocr_lang', 'en'),
-        'pdf_backend': cfg.get('pdf_backend', 'dlparse_v4'),
-        'table_mode': cfg.get('table_mode', 'fast'),
-        'pipeline': pipeline,
-        'do_picture_classification': bool(cfg.get('do_picture_classification', False)),
-        'do_picture_description': bool(cfg.get('do_picture_description', False)),
-    }
-    # If user explicitly set target_type, pass it through
-    if 'target_type' in cfg:
-        config['target_type'] = cfg['target_type']
-    # Optional VLM settings (only include API fields if provided as JSON by caller)
-    if config['do_picture_description']:
-        pd_api = cfg.get('picture_description_api')
-        if isinstance(pd_api, (dict, list)):
-            config['picture_description_api'] = pd_api
-        elif isinstance(pd_api, str) and pd_api.strip().startswith(('{', '[')):
-            config['picture_description_api'] = pd_api
-        if cfg.get('picture_description_prompt'):
-            config['picture_description_prompt'] = cfg['picture_description_prompt']
-    if pipeline == 'vlm':
-        # Provider presets mapping
-        provider = (cfg.get('vlm_provider') or '').strip().lower()
-        provider_model = (cfg.get('vlm_provider_model') or '').strip()
-        provider_base = (cfg.get('vlm_provider_base_url') or '').strip()
-        if provider in ('ollama', 'openai') and provider_model:
-            if provider == 'ollama':
-                base_url = provider_base or os.getenv('OLLAMA_BASE_URL') or os.getenv('VLM_OLLAMA_BASE_URL')
-                if base_url:
-                    endpoint = f"{base_url.rstrip('/')}/v1/chat/completions"
-                    # Use OpenAI provider schema against Ollama's OpenAI-compatible endpoint
-                    cfg_api = {
-                        'provider': 'openai',
-                        'url': endpoint,
-                        'model': provider_model,
-                        'response_format': 'markdown',
-                        'request_params': {'model': provider_model}
-                    }
-                    logger.info(f"[canonical-docling] VLM provider=ollama mapped to openai-compatible url={endpoint} model={provider_model}")
-                    config['vlm_pipeline_model_api'] = cfg_api
-                    # Also wire picture_description_api if picture description is enabled
-                    if config.get('do_picture_description'):
-                        config['picture_description_api'] = {
-                            'url': endpoint,
-                            'headers': {},
-                            'params': {'model': provider_model}
-                        }
-            elif provider == 'openai':
-                base_url = provider_base or os.getenv('OPENAI_BASE_URL') or 'https://api.openai.com/v1'
-                api_key = os.getenv('OPENAI_API_KEY') or os.getenv('OPENAI_API_KEY_READONLY')
-                # Do not inline key if not present; server may have default
-                model_cfg: Dict[str, Any] = {
-                    'provider': 'openai',
-                    'url': f"{base_url.rstrip('/')}/chat/completions",
-                    'model': provider_model,
-                    'response_format': 'markdown',
-                    'request_params': {'model': provider_model}
-                }
-                if api_key:
-                    model_cfg['api_key'] = api_key
-                    # Also pass explicit Authorization header for servers that expect it
-                    model_cfg['headers'] = {
-                        'Authorization': f"Bearer {api_key}"
-                    }
-                logger.info(f"[canonical-docling] VLM provider=openai url={model_cfg['url']} model={provider_model} api_key={'yes' if api_key else 'no'}")
-                config['vlm_pipeline_model_api'] = model_cfg
-                # Also wire picture_description_api if picture description is enabled
-                if config.get('do_picture_description'):
-                    headers = {'Authorization': f"Bearer {api_key}"} if api_key else {}
-                    config['picture_description_api'] = {
-                        'url': f"{base_url.rstrip('/')}/chat/completions",
-                        'headers': headers,
-                        'params': {'model': provider_model}
-                    }
-        else:
-            # Pass through explicit API/local JSON if provided by caller
-            vpa = cfg.get('vlm_pipeline_model_api')
-            if isinstance(vpa, (dict, list)):
-                config['vlm_pipeline_model_api'] = vpa
-            elif isinstance(vpa, str) and vpa.strip().startswith(('{', '[')):
-                config['vlm_pipeline_model_api'] = vpa
-
-    # Enqueue tasks for each range
-    priority = TaskPriority.HIGH
-    task_ids = []
-    multi = len(ranges) > 1
-    logger.info(f"[canonical-docling] final ranges={len(ranges)} multi={multi} pipeline={pipeline} producer={body.get('producer', 'manual')}")
-    
-    # Create a group id for split bundles (used for UI grouping)
-    # Use provided group_id if present (for two-pass auto system), otherwise generate new
-    group_id = body.get('group_id') or (str(uuid.uuid4()) if multi else None)
-    if multi and not sections:
-        # Build sections from ranges if titles were not captured
-        for (start, end) in ranges:
-            sections.append({'start': int(start), 'end': int(end), 'title': ''})
-
-    idx = 0
-    for (start, end) in ranges:
-        # Locate title for this range if available
-        title = ''
-        if multi and sections and idx < len(sections):
-            title = sections[idx].get('title') or ''
-        idx += 1
-
-        cfg_range = dict(config)
-        # Ensure 1-based inclusive range is passed through
-        cfg_range['page_range'] = [max(1, int(start)), max(int(start), int(end))]
-        extra = {
-            'is_subdoc': multi,
-            'page_range': [int(start), int(end)],
-            'label': (title or f"subdoc p{int(start)}-{int(end)}") if multi else 'canonical'
-        }
-        # Attach selected section metadata if provided by caller
-        if selected_section_id:
-            extra['selected_section_id'] = selected_section_id
-        if selected_section_title or custom_label:
-            extra['selected_section_title'] = selected_section_title or custom_label
-        # For split processing, force split bundle artefact type and add grouping/order metadata
-        if multi:
-            extra.update({
-                # UI grouping metadata
-                'split_order': idx,
-                'split_heading': title,
-                'split_total': len(ranges)
-            })
-            if group_id:
-                extra['group_id'] = group_id
-                extra['group_pack_type'] = 'docling_standard_auto_split'
-        else:
-            # Single-bundle case: allow caller to override type (defaults to canonical bundle)
-            if 'artefact_type_override' in body and body.get('artefact_type_override'):
-                extra['artefact_type_override'] = body.get('artefact_type_override')
-
-        # Mark producer and selection metadata
-        extra['producer'] = body.get('producer') or ('auto_split' if (multi and body.get('use_split_map')) else 'manual')
-        if selected_section_id:
-            extra['selected_section_id'] = selected_section_id
-        if selected_section_title or custom_label:
-            extra['selected_section_title'] = selected_section_title or custom_label
-
-        # Enhanced logging for canonical operations
-        if multi:
-            logger.info(f"[canonical-docling] enqueue range idx={idx}/{len(ranges)} start={start} end={end} title='{title}' group_id={group_id} producer={extra.get('producer')} pipeline={pipeline}")
-        else:
-            logger.info(f"[canonical-docling] enqueue single range start={start} end={end} producer={extra.get('producer')} pipeline={pipeline}")
-        tid = enqueue_docling_task(
-            file_id=file_id,
-            task_type='canonical_docling_subdoc_json' if multi else 'canonical_docling_json',
-            payload={
-                'bucket': bucket,
-                'file_path': processing_path,
-                'cabinet_id': cabinet_id,
-                'mime_type': processing_mime,
-                'config': cfg_range,
-                'artefact_extra': extra,
-                # Ensure canonical tasks respect upstream dependencies (e.g., Frontmatter)
-                'depends_on': body.get('depends_on', []),
-                # Pass through grouping info if provided by caller (kept for backward-compat)
-                'group_pack_type': body.get('group_pack_type')
-            },
-            priority=priority,
-            timeout=int(body.get('timeout', DOCLING_NOOCR_TIMEOUT))
-        )
-        task_ids.append(tid)
-
-    logger.info(f"[canonical-docling] completed enqueue file_id={file_id} tasks={len(task_ids)} ranges={len(ranges)} pipeline={pipeline} producer={body.get('producer','manual')} group_id={group_id if multi else 'single'}")
-    
-    return {
-        'message': f'enqueued {len(task_ids)} canonical docling job(s)',
-        'task_ids': task_ids,
-        'ranges': ranges,
-        'used_split_map': bool(split_map),
-        'group_id': group_id,
-        'pipeline': pipeline,
-        'producer': body.get('producer', 'manual')
-    }
-
diff --git a/archive/auto_processing/memory_aware_queue.py b/archive/auto_processing/memory_aware_queue.py
deleted file mode 100644
index d673433..0000000
--- a/archive/auto_processing/memory_aware_queue.py
+++ /dev/null
@@ -1,411 +0,0 @@
-"""
-Memory-Aware Queue Management System
-====================================
-
-Provides intelligent queue management based on memory usage and file sizes
-rather than simple task count limits. Supports multiple users with fair
-queuing and capacity management.
-
-Features:
-- Memory-based queue limits (not just task count)
-- Fair queuing across multiple users
-- Upload capacity checking with user feedback
-- Graceful degradation under load
-- Service-specific memory tracking
-"""
-
-import os
-import time
-import json
-import uuid
-import logging
-from typing import Dict, List, Optional, Any, Tuple
-from dataclasses import dataclass, asdict
-from enum import Enum
-import redis
-from .redis_manager import get_redis_manager
-import psutil
-
-logger = logging.getLogger(__name__)
-
-class QueueStatus(Enum):
-    ACCEPTING = "accepting"           # Normal operation
-    BUSY = "busy"                    # High load, warn users
-    OVERLOADED = "overloaded"        # Reject new uploads
-    MAINTENANCE = "maintenance"       # Manual override
-
-@dataclass
-class MemoryConfig:
-    """Memory configuration for queue management."""
-    max_total_memory_mb: int = 2048      # 2GB total queue memory
-    max_user_memory_mb: int = 512        # 512MB per user
-    max_file_size_mb: int = 100          # 100MB max file size
-    memory_warning_threshold: float = 0.8  # Warn at 80%
-    memory_reject_threshold: float = 0.95   # Reject at 95%
-
-@dataclass
-class QueuedFile:
-    """Represents a file waiting in the queue."""
-    file_id: str
-    user_id: str
-    filename: str
-    size_bytes: int
-    mime_type: str
-    cabinet_id: str
-    priority: int = 1
-    queued_at: float = 0
-    estimated_processing_time: int = 300  # seconds
-    memory_estimate_mb: float = 0
-    
-    def __post_init__(self):
-        if self.queued_at == 0:
-            self.queued_at = time.time()
-        
-        # Estimate memory usage (rough heuristic)
-        self.memory_estimate_mb = self._estimate_memory_usage()
-    
-    def _estimate_memory_usage(self) -> float:
-        """Estimate memory usage for this file during processing."""
-        base_mb = self.size_bytes / (1024 * 1024)
-        
-        # Processing multipliers based on operations
-        if self.mime_type == 'application/pdf':
-            # PDF: original + extracted text + images + thumbnails
-            return base_mb * 3.5
-        elif self.mime_type.startswith('image/'):
-            # Images: original + resized variants + OCR text
-            return base_mb * 2.5
-        else:
-            # Other docs: original + PDF conversion + processing
-            return base_mb * 4.0
-
-class MemoryAwareQueue:
-    """Memory-aware queue management system."""
-    
-    def __init__(self, environment: str = "dev"):
-        self.redis_manager = get_redis_manager(environment)
-        self.redis_client = self.redis_manager.client
-        self.config = self._load_config()
-        
-        # Redis keys
-        self.upload_queue_key = "upload_queue"
-        self.processing_memory_key = "processing_memory"
-        self.user_quota_key = "user_quotas"
-        self.system_status_key = "system_status"
-        
-        logger.info(f"🧠 Memory-aware queue initialized (max: {self.config.max_total_memory_mb}MB)")
-    
-    def _load_config(self) -> MemoryConfig:
-        """Load memory configuration from environment."""
-        return MemoryConfig(
-            max_total_memory_mb=int(os.getenv('QUEUE_MAX_MEMORY_MB', '2048')),
-            max_user_memory_mb=int(os.getenv('QUEUE_MAX_USER_MEMORY_MB', '512')),
-            max_file_size_mb=int(os.getenv('MAX_FILE_SIZE_MB', '100')),
-            memory_warning_threshold=float(os.getenv('MEMORY_WARNING_THRESHOLD', '0.8')),
-            memory_reject_threshold=float(os.getenv('MEMORY_REJECT_THRESHOLD', '0.95'))
-        )
-    
-    def check_upload_capacity(self, user_id: str, file_size_bytes: int, 
-                            mime_type: str) -> Tuple[bool, str, Dict[str, Any]]:
-        """
-        Check if system can accept a new upload.
-        
-        Returns:
-            (can_accept, message, queue_info)
-        """
-        
-        # Create temporary QueuedFile to estimate memory
-        temp_file = QueuedFile(
-            file_id="temp",
-            user_id=user_id,
-            filename="temp",
-            size_bytes=file_size_bytes,
-            mime_type=mime_type,
-            cabinet_id="temp"
-        )
-        
-        file_memory_mb = temp_file.memory_estimate_mb
-        
-        # Check file size limit
-        if file_size_bytes > (self.config.max_file_size_mb * 1024 * 1024):
-            return False, f"File too large (max: {self.config.max_file_size_mb}MB)", {}
-        
-        # Get current memory usage
-        current_memory = self._get_current_memory_usage()
-        user_memory = self._get_user_memory_usage(user_id)
-        
-        # Check user quota
-        if user_memory + file_memory_mb > self.config.max_user_memory_mb:
-            return False, f"User quota exceeded (limit: {self.config.max_user_memory_mb}MB)", {
-                'user_current': user_memory,
-                'user_limit': self.config.max_user_memory_mb
-            }
-        
-        # Check system capacity
-        total_after = current_memory + file_memory_mb
-        max_memory = self.config.max_total_memory_mb
-        
-        if total_after > (max_memory * self.config.memory_reject_threshold):
-            queue_info = self._get_queue_info()
-            return False, "System overloaded. Please try again later.", {
-                'current_memory': current_memory,
-                'max_memory': max_memory,
-                'utilization': current_memory / max_memory,
-                'queue_position': queue_info['total_queued'] + 1
-            }
-        
-        # Calculate wait time estimate
-        wait_estimate = self._estimate_wait_time(user_id)
-        
-        status = "ready"
-        message = "Upload accepted"
-        
-        if total_after > (max_memory * self.config.memory_warning_threshold):
-            status = "busy"
-            message = f"System busy. Estimated wait: {wait_estimate // 60}m {wait_estimate % 60}s"
-        
-        return True, message, {
-            'status': status,
-            'estimated_wait_seconds': wait_estimate,
-            'memory_usage': {
-                'current': current_memory,
-                'after_upload': total_after,
-                'limit': max_memory,
-                'utilization': total_after / max_memory
-            },
-            'user_quota': {
-                'used': user_memory,
-                'after_upload': user_memory + file_memory_mb,
-                'limit': self.config.max_user_memory_mb
-            }
-        }
-    
-    def enqueue_file(self, file_id: str, user_id: str, filename: str, 
-                    size_bytes: int, mime_type: str, cabinet_id: str, 
-                    priority: int = 1) -> Dict[str, Any]:
-        """
-        Add file to upload queue.
-        
-        Returns:
-            Queue information including position and estimated wait time
-        """
-        
-        queued_file = QueuedFile(
-            file_id=file_id,
-            user_id=user_id,
-            filename=filename,
-            size_bytes=size_bytes,
-            mime_type=mime_type,
-            cabinet_id=cabinet_id,
-            priority=priority
-        )
-        
-        # Serialize and add to Redis queue (priority queue: higher priority = lower score)
-        score = time.time() - (priority * 1000000)  # Priority affects score significantly
-        
-        self.redis_client.zadd(
-            self.upload_queue_key, 
-            {json.dumps(asdict(queued_file)): score}
-        )
-        
-        # Update user quota tracking
-        self._update_user_quota(user_id, queued_file.memory_estimate_mb, increment=True)
-        
-        # Get queue position and wait estimate
-        position = self._get_queue_position(file_id)
-        wait_estimate = self._estimate_wait_time(user_id)
-        
-        logger.info(f"📋 Queued file {file_id} for user {user_id} (pos: {position}, wait: {wait_estimate}s)")
-        
-        return {
-            'queued': True,
-            'file_id': file_id,
-            'queue_position': position,
-            'estimated_wait_seconds': wait_estimate,
-            'memory_estimate_mb': queued_file.memory_estimate_mb
-        }
-    
-    def dequeue_next_file(self, service_name: str) -> Optional[QueuedFile]:
-        """
-        Get next file from queue for processing.
-        
-        Args:
-            service_name: The service requesting work (for capacity management)
-        """
-        
-        # Check if service has capacity
-        service_memory = self._get_service_memory_usage(service_name)
-        service_limit = self._get_service_memory_limit(service_name)
-        
-        if service_memory >= service_limit:
-            logger.debug(f"Service {service_name} at capacity ({service_memory}/{service_limit}MB)")
-            return None
-        
-        # Get next item from priority queue (lowest score first)
-        items = self.redis_client.zrange(self.upload_queue_key, 0, 0, withscores=True)
-        
-        if not items:
-            return None
-        
-        file_data_json, score = items[0]
-        file_data = json.loads(file_data_json)
-        queued_file = QueuedFile(**file_data)
-        
-        # Check if this file would exceed service memory limit
-        if service_memory + queued_file.memory_estimate_mb > service_limit:
-            # Skip this file for now, try smaller ones later
-            logger.debug(f"File {queued_file.file_id} too large for {service_name} capacity")
-            return None
-        
-        # Remove from queue
-        self.redis_client.zrem(self.upload_queue_key, file_data_json)
-        
-        # Update tracking
-        self._update_user_quota(queued_file.user_id, queued_file.memory_estimate_mb, increment=False)
-        self._update_service_memory(service_name, queued_file.memory_estimate_mb, increment=True)
-        
-        logger.info(f"🎯 Dequeued file {queued_file.file_id} for {service_name} processing")
-        
-        return queued_file
-    
-    def complete_processing(self, service_name: str, file_id: str, memory_used_mb: float):
-        """Mark file processing as complete and free memory."""
-        self._update_service_memory(service_name, memory_used_mb, increment=False)
-        logger.info(f"✅ Completed processing {file_id} in {service_name} (freed {memory_used_mb}MB)")
-    
-    def _get_current_memory_usage(self) -> float:
-        """Get current total memory usage across all services."""
-        services = ['docling', 'tika', 'llm', 'document_analysis']
-        total = 0
-        
-        for service in services:
-            service_key = f"{self.processing_memory_key}:{service}"
-            memory = float(self.redis_client.get(service_key) or 0)
-            total += memory
-        
-        return total
-    
-    def _get_user_memory_usage(self, user_id: str) -> float:
-        """Get current memory usage for a specific user."""
-        user_key = f"{self.user_quota_key}:{user_id}"
-        return float(self.redis_client.get(user_key) or 0)
-    
-    def _get_service_memory_usage(self, service_name: str) -> float:
-        """Get current memory usage for a service."""
-        service_key = f"{self.processing_memory_key}:{service_name}"
-        return float(self.redis_client.get(service_key) or 0)
-    
-    def _get_service_memory_limit(self, service_name: str) -> float:
-        """Get memory limit for a service."""
-        # Service-specific memory limits as percentage of total
-        limits = {
-            'docling': 0.4,      # 40% for Docling (memory-intensive)
-            'tika': 0.2,         # 20% for Tika
-            'llm': 0.3,          # 30% for LLM processing
-            'document_analysis': 0.1  # 10% for document analysis
-        }
-        
-        percentage = limits.get(service_name, 0.1)
-        return self.config.max_total_memory_mb * percentage
-    
-    def _update_user_quota(self, user_id: str, memory_mb: float, increment: bool):
-        """Update user memory quota tracking."""
-        user_key = f"{self.user_quota_key}:{user_id}"
-        
-        if increment:
-            self.redis_client.incrbyfloat(user_key, memory_mb)
-        else:
-            current = float(self.redis_client.get(user_key) or 0)
-            new_value = max(0, current - memory_mb)
-            self.redis_client.set(user_key, new_value)
-        
-        # Set expiration for cleanup
-        self.redis_client.expire(user_key, 86400)  # 24 hours
-    
-    def _update_service_memory(self, service_name: str, memory_mb: float, increment: bool):
-        """Update service memory usage tracking."""
-        service_key = f"{self.processing_memory_key}:{service_name}"
-        
-        if increment:
-            self.redis_client.incrbyfloat(service_key, memory_mb)
-        else:
-            current = float(self.redis_client.get(service_key) or 0)
-            new_value = max(0, current - memory_mb)
-            self.redis_client.set(service_key, new_value)
-        
-        # Set expiration for cleanup
-        self.redis_client.expire(service_key, 3600)  # 1 hour
-    
-    def _get_queue_position(self, file_id: str) -> int:
-        """Get position of file in queue."""
-        items = self.redis_client.zrange(self.upload_queue_key, 0, -1)
-        for i, item in enumerate(items):
-            file_data = json.loads(item)
-            if file_data['file_id'] == file_id:
-                return i + 1
-        return 0
-    
-    def _estimate_wait_time(self, user_id: str) -> int:
-        """Estimate wait time for user's next file."""
-        # Simple estimation based on queue position and average processing time
-        queue_size = self.redis_client.zcard(self.upload_queue_key)
-        avg_processing_time = 300  # 5 minutes average
-        
-        return int(queue_size * avg_processing_time * 0.5)  # Assume parallel processing
-    
-    def _get_queue_info(self) -> Dict[str, Any]:
-        """Get comprehensive queue information."""
-        total_queued = self.redis_client.zcard(self.upload_queue_key)
-        current_memory = self._get_current_memory_usage()
-        max_memory = self.config.max_total_memory_mb
-        
-        return {
-            'total_queued': total_queued,
-            'memory_usage': {
-                'current_mb': current_memory,
-                'max_mb': max_memory,
-                'utilization': current_memory / max_memory if max_memory > 0 else 0
-            },
-            'status': self._determine_system_status(current_memory, max_memory)
-        }
-    
-    def _determine_system_status(self, current_memory: float, max_memory: float) -> str:
-        """Determine current system status based on memory usage."""
-        utilization = current_memory / max_memory if max_memory > 0 else 0
-        
-        if utilization >= self.config.memory_reject_threshold:
-            return "overloaded"
-        elif utilization >= self.config.memory_warning_threshold:
-            return "busy"
-        else:
-            return "ready"
-    
-    def get_system_status(self) -> Dict[str, Any]:
-        """Get comprehensive system status for monitoring."""
-        queue_info = self._get_queue_info()
-        
-        # Service-specific info
-        services = {}
-        for service_name in ['docling', 'tika', 'llm', 'document_analysis']:
-            services[service_name] = {
-                'memory_used_mb': self._get_service_memory_usage(service_name),
-                'memory_limit_mb': self._get_service_memory_limit(service_name),
-                'utilization': self._get_service_memory_usage(service_name) / self._get_service_memory_limit(service_name)
-            }
-        
-        return {
-            'status': queue_info['status'],
-            'queue': queue_info,
-            'services': services,
-            'config': asdict(self.config)
-        }
-
-# Convenience functions
-def get_memory_queue(environment: str = "dev") -> MemoryAwareQueue:
-    """Get memory-aware queue instance."""
-    return MemoryAwareQueue(environment)
-
-def check_upload_capacity(user_id: str, file_size: int, mime_type: str, environment: str = "dev") -> Tuple[bool, str, Dict]:
-    """Quick capacity check for upload."""
-    queue = get_memory_queue(environment)
-    return queue.check_upload_capacity(user_id, file_size, mime_type)
diff --git a/archive/auto_processing/pipeline_controller.py b/archive/auto_processing/pipeline_controller.py
deleted file mode 100644
index adb0395..0000000
--- a/archive/auto_processing/pipeline_controller.py
+++ /dev/null
@@ -1,1316 +0,0 @@
-"""
-Pipeline Controller for Three-Phase Document Processing Architecture
-
-This module coordinates the three phases of document processing:
-- Phase 1: Document Structure Discovery & Analysis  
-- Phase 2: Parallel Content Processing Pipelines
-- Phase 3: Enhanced Frontend Viewing (handled by frontend)
-
-Features:
-- Environment variable controlled auto-processing
-- Phase 1 completion detection  
-- Automatic Phase 2 triggering
-- Intelligent retry and coordination logic
-"""
-
-import json
-import os
-import uuid
-import time
-from typing import Dict, Any, List, Optional, Set
-from pathlib import Path
-
-from modules.logger_tool import initialise_logger
-from modules.database.supabase.utils.client import SupabaseServiceRoleClient
-from modules.database.supabase.utils.storage import StorageAdmin
-from modules.queue_system import (
-    enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
-    enqueue_document_analysis_task, enqueue_page_images_task,
-    TaskPriority, get_queue
-)
-from modules.bundle_metadata import (
-    create_standard_metadata, BundleMetadata, PipelineType, ProcessingMode, BundleType
-)
-
-logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
-
-class DocumentPipelineController:
-    """
-    Coordinates the three-phase document processing pipeline.
-    """
-    
-    def __init__(self):
-        self.client = SupabaseServiceRoleClient()
-        self.storage = StorageAdmin()
-        
-        # Phase 1 environment variables
-        self.auto_tika = os.getenv('AUTO_TIKA_PROCESSING', 'true').lower() == 'true'
-        self.auto_page_images = os.getenv('AUTO_PAGE_IMAGES', 'true').lower() == 'true'
-        self.auto_document_analysis = os.getenv('AUTO_DOCUMENT_ANALYSIS', 'true').lower() == 'true'
-        self.auto_split_map = os.getenv('AUTO_SPLIT_MAP_GENERATION', 'true').lower() == 'true'
-        
-        # Phase 2 environment variables
-        self.auto_docling_ocr = os.getenv('AUTO_DOCLING_OCR', 'true').lower() == 'true'
-        self.auto_docling_no_ocr = os.getenv('AUTO_DOCLING_NO_OCR', 'true').lower() == 'true'
-        self.auto_docling_vlm = os.getenv('AUTO_DOCLING_VLM', 'false').lower() == 'true'
-        
-        # Processing granularity
-        self.docling_ocr_by_page = os.getenv('DOCLING_OCR_BY_PAGE', 'false').lower() == 'true'
-        self.docling_no_ocr_by_page = os.getenv('DOCLING_NO_OCR_BY_PAGE', 'false').lower() == 'true'
-        self.docling_vlm_by_page = os.getenv('DOCLING_VLM_BY_PAGE', 'true').lower() == 'true'
-        
-        # Grouping strategy
-        self.docling_use_split_map = os.getenv('DOCLING_USE_SPLIT_MAP', 'true').lower() == 'true'
-        self.docling_split_threshold = int(os.getenv('DOCLING_SPLIT_THRESHOLD', '50'))
-        
-        logger.info("Pipeline controller initialized with new bundle architecture")
-
-    def enqueue_phase1_tasks(self, file_id: str, file_row: Dict[str, Any], 
-                           processing_path: str, processing_mime: str, 
-                           priority: TaskPriority = TaskPriority.HIGH) -> Dict[str, List[str]]:
-        """
-        Enqueue Phase 1 tasks: Structure Discovery & Analysis
-        
-        Returns:
-            Dictionary mapping task types to task IDs
-        """
-        logger.info(f"Phase 1: Starting structure discovery for file {file_id}")
-        
-        task_ids = {}
-        bucket = file_row['bucket']
-        cabinet_id = file_row['cabinet_id']
-        
-        # Step 1: Tika Processing (metadata extraction)
-        if self.auto_tika:
-            tika_url = os.getenv('TIKA_URL')
-            if tika_url:
-                tika_task_id = enqueue_tika_task(
-                    file_id=file_id,
-                    payload={
-                        'bucket': bucket,
-                        'file_path': processing_path,
-                        'cabinet_id': cabinet_id,
-                        'mime_type': processing_mime
-                    },
-                    priority=priority
-                )
-                task_ids['tika'] = [tika_task_id]
-                logger.info(f"Phase 1: Enqueued Tika task {tika_task_id}")
-            else:
-                logger.warning("Phase 1: Tika enabled but TIKA_URL not configured")
-        
-        # Step 2: Frontmatter processing (lightweight document overview)
-        docling_url = os.getenv('DOCLING_URL') or os.getenv('NEOFS_DOCLING_URL')
-        if docling_url:
-            try:
-                front_pages = int(os.getenv('DOCLING_FRONTPAGES', '3'))
-            except Exception:
-                front_pages = 3
-                
-            # Create enhanced metadata for frontmatter JSON display in UI
-            frontmatter_metadata = {
-                'display_name': f'Document Frontmatter (p1-{front_pages})',
-                'bundle_label': 'Frontmatter Analysis',
-                'section_title': 'Document Frontmatter',
-                'page_range': [1, front_pages],
-                'page_count': front_pages,
-                'bundle_type': 'frontmatter_json',
-                'processing_mode': 'frontmatter_analysis',
-                'pipeline': 'frontmatter_ocr',
-                'is_frontmatter': True,
-                'ui_category': 'document_analysis',
-                'ui_order': 1,  # Show first in UI
-                'description': f'OCR analysis of first {front_pages} pages for document structure and metadata',
-                'viewer_type': 'json'
-            }
-            
-            frontmatter_task_id = enqueue_docling_task(
-                file_id=file_id,
-                task_type='docling_frontmatter_json',
-                payload={
-                    'bucket': bucket,
-                    'file_path': processing_path,
-                    'cabinet_id': cabinet_id,
-                    'mime_type': processing_mime,
-                    'config': {
-                        'do_ocr': True,
-                        'force_ocr': False,
-                        'image_export_mode': 'embedded',
-                        'ocr_engine': 'easyocr',
-                        'ocr_lang': 'en',
-                        'pdf_backend': 'dlparse_v4',
-                        'table_mode': 'fast',
-                        'target_type': 'inbody',
-                        'to_formats': 'json',
-                        'page_range': [1, front_pages]
-                    },
-                    'artefact_extra': frontmatter_metadata,
-                    'depends_on': task_ids.get('tika', [])
-                },
-                priority=priority,
-                timeout=int(os.getenv('DOCLING_FRONTMATTER_TIMEOUT', '1800'))
-            )
-            task_ids['frontmatter'] = [frontmatter_task_id]
-            logger.info(f"Phase 1: Enqueued frontmatter task {frontmatter_task_id}")
-        
-        # Step 3: Document Structure Analysis (LLM-enhanced hierarchy)
-        if self.auto_document_analysis:
-            analysis_task_id = enqueue_docling_task(
-                file_id=file_id,
-                task_type='document_structure_analysis',
-                payload={
-                    'bucket': bucket,
-                    'file_path': processing_path,
-                    'cabinet_id': cabinet_id,
-                    'mime_type': processing_mime,
-                    'config': {
-                        'target_type': 'inbody',
-                        'to_formats': 'json',
-                        'do_ocr': False,
-                        'force_ocr': False
-                    },
-                    'depends_on': task_ids.get('frontmatter', [])
-                },
-                priority=priority,
-                timeout=int(os.getenv('DOCUMENT_ANALYSIS_TIMEOUT', '300'))
-            )
-            task_ids['document_analysis'] = [analysis_task_id]
-            logger.info(f"Phase 1: Enqueued document analysis task {analysis_task_id}")
-        
-        # Step 4: Split Map Generation (definitive section boundaries)
-        if self.auto_split_map:
-            split_map_task_id = enqueue_split_map_task(
-                file_id=file_id,
-                payload={
-                    'depends_on': task_ids.get('frontmatter', []) + task_ids.get('document_analysis', [])
-                },
-                priority=TaskPriority.NORMAL
-            )
-            task_ids['split_map'] = [split_map_task_id]
-            logger.info(f"Phase 1: Enqueued split map task {split_map_task_id}")
-        
-        # Step 5: Page Images Generation (for frontend viewing)
-        if self.auto_page_images:
-            page_images_task_id = enqueue_docling_task(
-                file_id=file_id,
-                task_type='generate_page_images',
-                payload={
-                    'bucket': bucket,
-                    'file_path': processing_path,
-                    'cabinet_id': cabinet_id,
-                    'mime_type': processing_mime,
-                    'config': {},
-                    'depends_on': task_ids.get('document_analysis', [])
-                },
-                priority=TaskPriority.NORMAL,
-                timeout=int(os.getenv('PAGE_IMAGES_TIMEOUT', '1800'))
-            )
-            task_ids['page_images'] = [page_images_task_id]
-            logger.info(f"Phase 1: Enqueued page images task {page_images_task_id}")
-        
-        # Bundle tasks are now directly enqueued by split_map task completion
-        
-        total_tasks = sum(len(task_list) for task_list in task_ids.values())
-        logger.info(f"Phase 1: Enqueued {total_tasks} tasks for file {file_id}: {list(task_ids.keys())}")
-        
-        return task_ids
-
-    def check_phase1_completion(self, file_id: str) -> Dict[str, Any]:
-        """
-        Check if Phase 1 is complete for a given file.
-        
-        Returns:
-            Dictionary with completion status and details
-        """
-        logger.debug(f"Checking Phase 1 completion for file {file_id}")
-        
-        # Get all artefacts for the file
-        artefacts_result = self.client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute()
-        artefacts = artefacts_result.data or []
-        
-        # Check for required Phase 1 artefacts
-        phase1_checks = {
-            'tika_metadata': False,
-            'frontmatter': False, 
-            'document_analysis': False,
-            'split_map': False,
-            'page_images': False
-        }
-        
-        for artefact in artefacts:
-            if artefact['status'] == 'completed':
-                artefact_type = artefact['type']
-                if artefact_type == 'tika_json':
-                    phase1_checks['tika_metadata'] = True
-                elif artefact_type == 'docling_frontmatter_json':
-                    phase1_checks['frontmatter'] = True
-                elif artefact_type == 'document_outline_hierarchy':
-                    phase1_checks['document_analysis'] = True
-                elif artefact_type == 'split_map_json':
-                    phase1_checks['split_map'] = True
-                elif artefact_type == 'page_images':
-                    phase1_checks['page_images'] = True
-        
-        # Determine completion based on enabled features
-        required_checks = []
-        if self.auto_tika:
-            required_checks.append('tika_metadata')
-        required_checks.append('frontmatter')  # Always required for basic processing
-        if self.auto_document_analysis:
-            required_checks.append('document_analysis')
-        if self.auto_split_map:
-            required_checks.append('split_map')
-        if self.auto_page_images:
-            required_checks.append('page_images')
-        
-        completed_required = [check for check in required_checks if phase1_checks[check]]
-        is_complete = len(completed_required) == len(required_checks)
-        
-        return {
-            'file_id': file_id,
-            'is_complete': is_complete,
-            'completed_components': completed_required,
-            'required_components': required_checks,
-            'all_checks': phase1_checks,
-            'completion_percentage': (len(completed_required) / max(len(required_checks), 1)) * 100
-        }
-
-    def enqueue_sequential_docling_pipelines(self, file_id: str, file_data: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Enqueue sequential docling pipelines with dependencies: no_ocr → ocr → vlm
-        
-        Each pipeline depends on ALL tasks from the previous pipeline completing.
-        This replaces the complex Phase 2 coordinator with simple task dependencies.
-        
-        Args:
-            file_id: The file ID to process
-            file_data: File processing information (bucket, path, etc.)
-            
-        Returns:
-            Dictionary with enqueued pipeline information
-        """
-        logger.info(f"Enqueueing sequential docling pipelines for file {file_id}")
-        
-        bucket = file_data['bucket']
-        file_path = file_data['file_path']
-        cabinet_id = file_data['cabinet_id']
-        mime_type = file_data['mime_type']
-        
-        # Base configuration shared by all pipelines (pipeline-specific options added per pipeline)
-        base_config = {
-            'to_formats': ['json', 'html', 'text', 'md', 'doctags'],
-            'image_export_mode': 'referenced',
-            'target_type': 'zip',
-            'pdf_backend': os.getenv('DOCLING_PDF_BACKEND', 'dlparse_v4'),
-            'include_images': os.getenv('DOCLING_INCLUDE_IMAGES', 'true').lower() == 'true',
-            'images_scale': float(os.getenv('DOCLING_IMAGES_SCALE', '2.0')),
-            'ocr_engine': os.getenv('OCR_ENGINE', 'easyocr'),
-            'ocr_lang': os.getenv('OCR_LANG', 'en'),
-            'picture_description_area_threshold': float(os.getenv('DOCLING_PICTURE_DESCRIPTION_AREA_THRESHOLD', '0.05'))
-        }
-        
-        # Determine the pipeline execution order: no_ocr → ocr → vlm
-        pipeline_order = []
-        if self.auto_docling_no_ocr:
-            pipeline_order.append('no_ocr')
-        if self.auto_docling_ocr:
-            pipeline_order.append('ocr') 
-        if self.auto_docling_vlm:
-            pipeline_order.append('vlm')
-        
-        if not pipeline_order:
-            logger.info(f"No docling pipelines enabled for file {file_id}")
-            return {
-                'file_id': file_id,
-                'enqueued_pipelines': {},
-                'total_tasks': 0,
-                'sequential_order': [],
-                'message': 'No docling pipelines enabled'
-            }
-        
-        logger.info(f"Sequential pipeline order for file {file_id}: {pipeline_order}")
-        
-        # Enqueue all pipelines with proper dependencies
-        enqueued_pipelines = {}
-        all_task_ids = {}
-        
-        for i, pipeline_type in enumerate(pipeline_order):
-            # Determine dependencies: depend on ALL tasks from previous pipeline
-            depends_on = []
-            if i > 0:
-                previous_pipeline = pipeline_order[i - 1]
-                depends_on = all_task_ids.get(previous_pipeline, [])
-                logger.info(f"Pipeline {pipeline_type} will depend on {len(depends_on)} tasks from {previous_pipeline}: {depends_on[:3]}..." if len(depends_on) > 3 else f"Pipeline {pipeline_type} will depend on {len(depends_on)} tasks from {previous_pipeline}: {depends_on}")
-            else:
-                logger.info(f"Pipeline {pipeline_type} has no dependencies (first pipeline)")
-            
-            # Create pipeline tasks
-            pipeline_result = self._enqueue_single_pipeline_with_deps(
-                file_id, pipeline_type, base_config, bucket, file_path, cabinet_id, mime_type, depends_on
-            )
-            
-            if pipeline_result:
-                enqueued_pipelines[pipeline_type] = pipeline_result
-                all_task_ids[pipeline_type] = pipeline_result['task_ids']
-                logger.info(f"Enqueued {pipeline_type} pipeline with {len(pipeline_result['task_ids'])} tasks")
-            
-        total_tasks = sum(len(p.get('task_ids', [])) for p in enqueued_pipelines.values())
-        logger.info(f"Successfully enqueued {len(pipeline_order)} sequential pipelines with {total_tasks} total tasks for file {file_id}")
-        
-        return {
-            'file_id': file_id,
-            'enqueued_pipelines': enqueued_pipelines,
-            'total_tasks': total_tasks,
-            'sequential_order': pipeline_order
-        }
-
-    def _determine_processing_mode(self, file_id: str, pipeline_type: str) -> tuple[str, dict]:
-        """
-        Determine how to process document based on settings and characteristics.
-        
-        Implements corrected decision logic:
-        1. Priority 1: Respect explicit BY_PAGE preference
-        2. Priority 2: Check size threshold for auto-processing  
-        3. Priority 3: Use split map for large documents
-        4. Priority 4: Fallback chunking
-        
-        Returns:
-            Tuple of (processing_mode, processing_data)
-        """
-        # Check BY_PAGE flags first (highest priority)
-        by_page = self._get_by_page_setting(pipeline_type)
-        if by_page:
-            logger.info(f"BY_PAGE enabled for {pipeline_type} - creating page-based bundles regardless of document size")
-            return "split_by_pages", self._get_page_ranges(file_id)
-        
-        # Get document characteristics
-        page_count = self._get_page_count(file_id)
-        
-        # Apply size threshold logic
-        if page_count < self.docling_split_threshold:
-            logger.info(f"Document has {page_count} pages (< {self.docling_split_threshold} threshold) - creating single bundle")
-            return "whole_document", {}
-        
-        # Check for split map availability
-        split_map = self._load_split_map_if_needed(file_id)
-        if split_map and self.docling_use_split_map:
-            logger.info(f"Document has {page_count} pages (>= {self.docling_split_threshold} threshold) with split map - creating section-based bundles")
-            return "split_by_sections", split_map
-        else:
-            logger.error(f"Document has {page_count} pages (>= {self.docling_split_threshold} threshold) without split map - ERROR")
-            return "error"
-
-    def _get_by_page_setting(self, pipeline_type: str) -> bool:
-        """Get BY_PAGE setting for the specified pipeline type."""
-        if pipeline_type == 'no_ocr':
-            return self.docling_no_ocr_by_page
-        elif pipeline_type == 'ocr':
-            return self.docling_ocr_by_page
-        elif pipeline_type == 'vlm':
-            return self.docling_vlm_by_page
-        return False
-
-    def _get_pipeline_specific_config(self, pipeline_type: str) -> Dict[str, Any]:
-        """Get pipeline-specific configuration options from environment variables."""
-        if pipeline_type == 'no_ocr':
-            return {
-                'table_mode': os.getenv('DOCLING_NO_OCR_TABLE_MODE', 'fast'),
-                'table_cell_matching': os.getenv('DOCLING_NO_OCR_TABLE_CELL_MATCHING', 'false').lower() == 'true',
-                'do_formula_enrichment': os.getenv('DOCLING_NO_OCR_DO_FORMULA_ENRICHMENT', 'false').lower() == 'true',
-                'do_code_enrichment': os.getenv('DOCLING_NO_OCR_DO_CODE_ENRICHMENT', 'false').lower() == 'true',
-                'do_table_structure': os.getenv('DOCLING_NO_OCR_DO_TABLE_STRUCTURE', 'true').lower() == 'true',
-                'do_picture_classification': os.getenv('DOCLING_NO_OCR_DO_PICTURE_CLASSIFICATION', 'false').lower() == 'true',
-                'do_picture_description': os.getenv('DOCLING_NO_OCR_DO_PICTURE_DESCRIPTION', 'false').lower() == 'true'
-            }
-        elif pipeline_type == 'ocr':
-            return {
-                'table_mode': os.getenv('DOCLING_OCR_TABLE_MODE', 'accurate'),
-                'table_cell_matching': os.getenv('DOCLING_OCR_TABLE_CELL_MATCHING', 'true').lower() == 'true',
-                'do_formula_enrichment': os.getenv('DOCLING_OCR_DO_FORMULA_ENRICHMENT', 'true').lower() == 'true',
-                'do_code_enrichment': os.getenv('DOCLING_OCR_DO_CODE_ENRICHMENT', 'true').lower() == 'true',
-                'do_table_structure': os.getenv('DOCLING_OCR_DO_TABLE_STRUCTURE', 'true').lower() == 'true',
-                'do_picture_classification': os.getenv('DOCLING_OCR_DO_PICTURE_CLASSIFICATION', 'false').lower() == 'true',
-                'do_picture_description': os.getenv('DOCLING_OCR_DO_PICTURE_DESCRIPTION', 'false').lower() == 'true'
-            }
-        elif pipeline_type == 'vlm':
-            return {
-                'table_mode': os.getenv('DOCLING_VLM_TABLE_MODE', 'accurate'),
-                'table_cell_matching': os.getenv('DOCLING_VLM_TABLE_CELL_MATCHING', 'true').lower() == 'true',
-                'do_formula_enrichment': os.getenv('DOCLING_VLM_DO_FORMULA_ENRICHMENT', 'false').lower() == 'true',
-                'do_code_enrichment': os.getenv('DOCLING_VLM_DO_CODE_ENRICHMENT', 'false').lower() == 'true',
-                'do_table_structure': os.getenv('DOCLING_VLM_DO_TABLE_STRUCTURE', 'true').lower() == 'true',
-                'do_picture_classification': os.getenv('DOCLING_VLM_DO_PICTURE_CLASSIFICATION', 'true').lower() == 'true',
-                'do_picture_description': os.getenv('DOCLING_VLM_DO_PICTURE_DESCRIPTION', 'true').lower() == 'true'
-            }
-        else:
-            # Default config for unknown pipeline types
-            return {
-                'table_mode': 'fast',
-                'table_cell_matching': False,
-                'do_formula_enrichment': False,
-                'do_code_enrichment': False,
-                'do_table_structure': True,
-                'do_picture_classification': False,
-                'do_picture_description': False
-            }
-
-    def _get_page_count(self, file_id: str) -> int:
-        """Get page count for the file from existing artefacts (first Tika)."""
-        logger.info(f"🔍 PAGE COUNT: Starting page count detection for file {file_id}")
-        
-        try:
-            # Try to get page count from existing artefacts, excluding frontmatter (partial document)
-            artefacts = self.client.supabase.table('document_artefacts').select('type,extra').eq('file_id', file_id).execute()
-            artefact_types = [art.get('type', 'unknown') for art in artefacts.data or []]
-            logger.info(f"🔍 PAGE COUNT: Found {len(artefacts.data or [])} artefacts for file {file_id}: {artefact_types}")
-            
-            for art in artefacts.data or []:
-                art_type = art.get('type', 'unknown')
-                extra = art.get('extra', {})
-                logger.info(f"🔍 PAGE COUNT: Checking artefact type '{art_type}' for file {file_id}")
-                
-                # Skip frontmatter artefacts as they only contain partial page counts
-                if art_type == 'docling_frontmatter_json':
-                    logger.info(f"🔍 PAGE COUNT: Skipping frontmatter artefact (partial page count) for file {file_id}")
-                    continue
-                
-                # Also skip docling_json artefacts that are from frontmatter processing
-                if art_type == 'docling_json' and extra.get('is_frontmatter', False):
-                    logger.info(f"🔍 PAGE COUNT: Skipping frontmatter-derived docling_json artefact (partial page count) for file {file_id}")
-                    continue
-                
-                # Also skip docling_json artefacts that have frontmatter-related pipeline info
-                if art_type == 'docling_json' and extra.get('pipeline') == 'frontmatter_ocr':
-                    logger.info(f"🔍 PAGE COUNT: Skipping frontmatter pipeline docling_json artefact (partial page count) for file {file_id}")
-                    continue
-                    
-                if 'page_count' in extra:
-                    page_count = int(extra['page_count'])
-                    logger.info(f"✅ PAGE COUNT: Found page count {page_count} from {art_type} artefact for file {file_id}")
-                    return page_count
-                else:
-                    logger.info(f"🔍 PAGE COUNT: No page_count in {art_type} artefact for file {file_id}")
-            
-            logger.info(f"🔍 PAGE COUNT: No artefacts with page_count found, trying Tika JSON parsing for file {file_id}")
-            
-            # Try to get page count from Tika JSON (most reliable source)
-            tika_arts = self.client.supabase.table('document_artefacts') \
-                .select('rel_path') \
-                .eq('file_id', file_id) \
-                .eq('type', 'tika_json') \
-                .execute()
-            
-            if tika_arts.data:
-                logger.info(f"🔍 PAGE COUNT: Found Tika JSON artefact, parsing content for file {file_id}")
-                file_info = self.client.supabase.table('files').select('bucket').eq('id', file_id).single().execute()
-                if file_info.data:
-                    tika_data = self.storage.download_file(file_info.data['bucket'], tika_arts.data[0]['rel_path'])
-                    import json
-                    tika_json = json.loads(tika_data.decode('utf-8'))
-                    
-                    # Check common Tika page count keys in top level and metadata
-                    logger.info(f"🔍 PAGE COUNT: Checking Tika JSON keys for page count in file {file_id}")
-                    
-                    # First check metadata section (most common location)
-                    metadata = tika_json.get('metadata', {})
-                    for key in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount", "pdf:pagecount", "meta:page-count", "pdfa:PDFVersion"):
-                        # Check both exact key and lowercase version in metadata
-                        value = metadata.get(key) or metadata.get(key.lower())
-                        if value is not None:
-                            try:
-                                page_count = int(value)
-                                if page_count > 0:
-                                    logger.info(f"✅ PAGE COUNT: Found page count {page_count} from Tika metadata key '{key}' for file {file_id}")
-                                    return page_count
-                            except Exception as parse_error:
-                                logger.info(f"🔍 PAGE COUNT: Could not parse value '{value}' from Tika metadata key '{key}': {parse_error}")
-                                continue
-                    
-                    # Also check top level (fallback)
-                    for key in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount", "pdf:pagecount"):
-                        value = tika_json.get(key) or tika_json.get(key.lower())
-                        if value is not None:
-                            try:
-                                page_count = int(value)
-                                if page_count > 0:
-                                    logger.info(f"✅ PAGE COUNT: Found page count {page_count} from Tika JSON top-level key '{key}' for file {file_id}")
-                                    return page_count
-                            except Exception as parse_error:
-                                logger.info(f"🔍 PAGE COUNT: Could not parse value '{value}' from Tika top-level key '{key}': {parse_error}")
-                                continue
-                    
-                    # Debug: Show available keys to help diagnose issues
-                    logger.info(f"🔍 PAGE COUNT: Available Tika JSON top-level keys: {list(tika_json.keys())}")
-                    if 'metadata' in tika_json:
-                        logger.info(f"🔍 PAGE COUNT: Available Tika metadata keys: {list(metadata.keys())}")
-                    
-                    logger.warning(f"🔍 PAGE COUNT: No valid page count keys found in Tika JSON for file {file_id}")
-                else:
-                    logger.warning(f"🔍 PAGE COUNT: Could not get file info for Tika JSON parsing for file {file_id}")
-            else:
-                logger.warning(f"🔍 PAGE COUNT: No Tika JSON artefact found for file {file_id}")
-            
-            # Final fallback - try to get it directly from PDF using PyMuPDF
-            logger.warning(f"🔍 PAGE COUNT: Trying direct PDF parsing as final fallback for file {file_id}")
-            return self._get_page_count_direct_pdf(file_id)
-            
-        except Exception as e:
-            logger.error(f"❌ PAGE COUNT: Error getting page count for file {file_id}: {e}, defaulting to {self.docling_split_threshold + 1}")
-            return self.docling_split_threshold + 1
-
-    def _get_page_count_direct_pdf(self, file_id: str) -> int:
-        """Final fallback: Get page count directly from PDF using PyMuPDF."""
-        try:
-            # Get file info from database
-            file_info = self.client.supabase.table('files').select('bucket,path,cabinet_id').eq('id', file_id).single().execute()
-            if not file_info.data:
-                logger.warning(f"🔍 PAGE COUNT: Could not find file info for {file_id}, defaulting to threshold + 1")
-                return self.docling_split_threshold + 1
-            
-            file_row = file_info.data
-            bucket = file_row['bucket']
-            file_path = file_row['path']
-            
-            # Download and read PDF directly with PyMuPDF
-            logger.info(f"🔍 PAGE COUNT: Reading PDF directly from storage for file {file_id}")
-            pdf_bytes = self.storage.download_file(bucket, file_path)
-            
-            import fitz  # PyMuPDF
-            doc = fitz.open(stream=pdf_bytes, filetype="pdf")
-            page_count = len(doc)
-            doc.close()
-            
-            logger.info(f"✅ PAGE COUNT: Direct PDF reading found {page_count} pages for file {file_id}")
-            return page_count
-            
-        except Exception as e:
-            logger.error(f"❌ PAGE COUNT: Direct PDF reading failed for file {file_id}: {e}, defaulting to {self.docling_split_threshold + 1}")
-            return self.docling_split_threshold + 1
-
-    def _get_page_ranges(self, file_id: str) -> dict:
-        """Get page ranges for page-based processing."""
-        page_count = self._get_page_count(file_id)
-        return {
-            'pages': list(range(1, page_count + 1)),
-            'total_pages': page_count
-        }
-
-    def _load_split_map_if_needed(self, file_id: str) -> Optional[Dict[str, Any]]:
-        """Load split map if needed for processing decisions."""
-        try:
-            file_info = self.client.supabase.table('files').select('bucket').eq('id', file_id).single().execute()
-            if not file_info.data:
-                return None
-            return self._load_split_map(file_info.data['bucket'], file_id)
-        except Exception:
-            return None
-
-    def _create_chunked_ranges(self, page_count: int) -> dict:
-        """Create chunked page ranges for large documents without split maps."""
-        chunk_size = max(10, self.docling_split_threshold // 4)  # 1/4 of threshold, min 10 pages
-        chunks = []
-        
-        for start_page in range(1, page_count + 1, chunk_size):
-            end_page = min(start_page + chunk_size - 1, page_count)
-            chunks.append({
-                'start': start_page,
-                'end': end_page,
-                'title': f'Pages {start_page}-{end_page}'
-            })
-        
-        return {
-            'entries': chunks,
-            'total_chunks': len(chunks)
-        }
-
-    def _enqueue_single_pipeline_with_deps(self, file_id: str, pipeline_type: str, base_config: Dict[str, Any],
-                                          bucket: str, file_path: str, cabinet_id: str, mime_type: str,
-                                          depends_on: List[str]) -> Optional[Dict[str, Any]]:
-        """Enqueue a single pipeline with dependencies on previous pipeline tasks."""
-        
-        group_id = str(uuid.uuid4())
-        
-        # Get pipeline-specific configuration options
-        pipeline_specific_config = self._get_pipeline_specific_config(pipeline_type)
-        
-        if pipeline_type == 'no_ocr':
-            config = {
-                **base_config,
-                **pipeline_specific_config,
-                'do_ocr': False,
-                'force_ocr': False,
-                'pipeline': 'standard'
-            }
-            logger.info(f"NO_OCR pipeline config: table_mode={config['table_mode']}, "
-                       f"formula_enrichment={config['do_formula_enrichment']}, "
-                       f"code_enrichment={config['do_code_enrichment']}")
-        elif pipeline_type == 'ocr':
-            config = {
-                **base_config,
-                **pipeline_specific_config,
-                'do_ocr': True,
-                'force_ocr': False,
-                'pipeline': 'standard'
-            }
-            logger.info(f"OCR pipeline config: table_mode={config['table_mode']}, "
-                       f"formula_enrichment={config['do_formula_enrichment']}, "
-                       f"code_enrichment={config['do_code_enrichment']}")
-        elif pipeline_type == 'vlm':
-            config = {
-                **base_config,
-                **pipeline_specific_config,
-                'do_ocr': False,
-                'force_ocr': False,
-                'pipeline': 'vlm',
-                'vlm_pipeline_model': os.getenv('DOCLING_VLM_MODEL', 'smoldocling')
-            }
-            logger.info(f"VLM pipeline config: table_mode={config['table_mode']}, "
-                       f"picture_classification={config['do_picture_classification']}, "
-                       f"picture_description={config['do_picture_description']}")
-        else:
-            logger.error(f"Unknown pipeline type: {pipeline_type}")
-            return None
-        
-        # Determine processing mode using corrected logic
-        processing_mode, processing_data = self._determine_processing_mode(file_id, pipeline_type)
-        
-        # Enqueue single bundle task with dependencies
-        task_id = self._enqueue_bundle_task_with_deps(
-            file_id, pipeline_type, group_id, config, processing_mode, processing_data,
-            bucket, file_path, cabinet_id, mime_type, depends_on
-        )
-        
-        return {
-            'group_id': group_id,
-            'task_ids': [task_id] if task_id else [],
-            'task_count': 1 if task_id else 0,
-            'processing_mode': processing_mode,
-            'processing_data': processing_data
-        }
-
-    def _enqueue_bundle_task_with_deps(self, file_id: str, pipeline_type: str, group_id: str,
-                                     config: Dict[str, Any], processing_mode: str, processing_data: dict,
-                                     bucket: str, file_path: str, cabinet_id: str, mime_type: str,
-                                     depends_on: List[str]) -> Optional[str]:
-        """
-        Enqueue a single bundle task that handles processing internally based on mode.
-        
-        This replaces the old approach of creating multiple individual tasks.
-        """
-        from modules.queue_system import enqueue_docling_task, TaskPriority
-        from modules.bundle_metadata import create_standard_metadata
-        
-        # Map processing modes to bundle types and task types
-        if processing_mode == "whole_document":
-            task_type = 'docling_bundle'
-            bundle_type = 'whole_document'
-        else:
-            task_type = 'docling_bundle_split'
-            bundle_type = processing_mode
-        
-        # Create bundle metadata with correct processing mode mapping
-        if processing_mode == "whole_document":
-            bundle_processing_mode = "whole_document"
-        elif processing_mode.startswith("split_by_"):
-            # For split modes, map to the appropriate bundle metadata mode
-            if processing_mode == "split_by_pages":
-                bundle_processing_mode = "pages"
-            elif processing_mode == "split_by_sections":
-                bundle_processing_mode = "sections" 
-            elif processing_mode == "split_by_chunks":
-                bundle_processing_mode = "chunks"
-            else:
-                bundle_processing_mode = processing_mode.replace('split_by_', '')
-        else:
-            bundle_processing_mode = processing_mode
-            
-        bundle_metadata = create_standard_metadata(
-            file_id=file_id,
-            pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
-            processing_mode=bundle_processing_mode,
-            config=config,
-            group_id=group_id,
-            producer="auto_phase2"
-        )
-        
-        # Create task payload with new bundle architecture
-        payload = {
-            'bucket': bucket,
-            'file_path': file_path,
-            'cabinet_id': cabinet_id,
-            'mime_type': mime_type,
-            'config': config,
-            'processing_mode': processing_mode,
-            'processing_data': processing_data,
-            'bundle_metadata': bundle_metadata.to_artefact_extra(),
-            'depends_on': depends_on
-        }
-        
-        # Determine timeout based on processing complexity
-        if processing_mode == "whole_document":
-            timeout = 7200  # 2 hours for whole document
-        elif processing_mode == "split_by_pages":
-            # Estimate based on page count
-            page_count = processing_data.get('total_pages', 50)
-            timeout = min(14400, max(3600, page_count * 60))  # 1-4 hours based on pages
-        else:
-            # Section or chunk based processing
-            section_count = len(processing_data.get('entries', []))
-            timeout = min(10800, max(3600, section_count * 300))  # 1-3 hours based on sections
-        
-        logger.info(f"Enqueuing {task_type} task for {pipeline_type} pipeline: {processing_mode} (timeout: {timeout}s)")
-        
-        try:
-            task_id = enqueue_docling_task(
-                file_id=file_id,
-                task_type=task_type,
-                payload=payload,
-                priority=TaskPriority.NORMAL,
-                timeout=timeout
-            )
-            
-            logger.info(f"Successfully enqueued {task_type} task {task_id} for {pipeline_type} pipeline")
-            return task_id
-            
-        except Exception as e:
-            logger.error(f"Failed to enqueue bundle task for {pipeline_type} pipeline: {e}")
-            return None
-
-    def trigger_phase2_pipelines(self, file_id: str, file_data: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Trigger Phase 2 sequential processing pipelines based on environment configuration.
-        
-        Pipelines run in order: no_ocr → ocr → vlm (depending on what's enabled).
-        Only the first pipeline starts immediately; others are triggered when the previous completes.
-        
-        Args:
-            file_id: The file ID to process
-            file_data: File processing information (bucket, path, etc.)
-            
-        Returns:
-            Dictionary with triggered pipeline information
-        """
-        logger.info(f"Phase 2: Starting sequential content processing for file {file_id}")
-        
-        triggered_pipelines = {}
-        bucket = file_data['bucket']
-        file_path = file_data['file_path']
-        cabinet_id = file_data['cabinet_id']
-        mime_type = file_data['mime_type']
-        
-        # Base configuration for all pipelines (DEPRECATED METHOD - use enqueue_sequential_docling_pipelines)
-        base_config = {
-            'to_formats': ['json', 'html', 'text', 'md', 'doctags'],
-            'image_export_mode': 'referenced',
-            'target_type': 'zip',
-            'pdf_backend': os.getenv('DOCLING_PDF_BACKEND', 'dlparse_v4'),
-            'include_images': os.getenv('DOCLING_INCLUDE_IMAGES', 'true').lower() == 'true',
-            'images_scale': float(os.getenv('DOCLING_IMAGES_SCALE', '2.0')),
-            'ocr_engine': os.getenv('OCR_ENGINE', 'easyocr'),
-            'ocr_lang': os.getenv('OCR_LANG', 'en'),
-            'picture_description_area_threshold': float(os.getenv('DOCLING_PICTURE_DESCRIPTION_AREA_THRESHOLD', '0.05'))
-        }
-        
-        # Determine the pipeline execution order: no_ocr → ocr → vlm
-        pipeline_order = []
-        if self.auto_docling_no_ocr:
-            pipeline_order.append('no_ocr')
-        if self.auto_docling_ocr:
-            pipeline_order.append('ocr') 
-        if self.auto_docling_vlm:
-            pipeline_order.append('vlm')
-        
-        if not pipeline_order:
-            logger.info(f"Phase 2: No pipelines enabled for file {file_id}")
-            return {
-                'file_id': file_id,
-                'triggered_pipelines': {},
-                'total_tasks': 0,
-                'sequential_order': [],
-                'message': 'No Phase 2 pipelines enabled'
-            }
-        
-        logger.info(f"Phase 2: Sequential pipeline order for file {file_id}: {pipeline_order}")
-        logger.warning(f"trigger_phase2_pipelines is deprecated - use enqueue_sequential_docling_pipelines for new implementations")
-        
-        # For backward compatibility, delegate to the new method
-        return self.enqueue_sequential_docling_pipelines(file_id, file_data)
-
-    def _start_single_pipeline(self, file_id: str, pipeline_type: str, base_config: Dict[str, Any],
-                               bucket: str, file_path: str, cabinet_id: str, mime_type: str) -> Optional[Dict[str, Any]]:
-        """Start a single pipeline of the specified type."""
-        
-        if pipeline_type == 'no_ocr':
-            group_id = str(uuid.uuid4())
-            config = {
-                **base_config,
-                'do_ocr': False,
-                'force_ocr': False,
-                'pipeline': 'standard'
-            }
-            tasks = self._enqueue_pipeline(
-                file_id, 'no_ocr', group_id, config,
-                bucket, file_path, cabinet_id, mime_type,
-                by_page=self.docling_no_ocr_by_page
-            )
-            return {
-                'group_id': group_id,
-                'task_count': len(tasks),
-                'by_page': self.docling_no_ocr_by_page
-            }
-            
-        elif pipeline_type == 'ocr':
-            group_id = str(uuid.uuid4())
-            config = {
-                **base_config,
-                'do_ocr': True,
-                'ocr_engine': os.getenv('OCR_ENGINE', 'easyocr'),
-                'force_ocr': False,
-                'pipeline': 'standard'
-            }
-            tasks = self._enqueue_pipeline(
-                file_id, 'ocr', group_id, config, 
-                bucket, file_path, cabinet_id, mime_type,
-                by_page=self.docling_ocr_by_page
-            )
-            return {
-                'group_id': group_id,
-                'task_count': len(tasks),
-                'by_page': self.docling_ocr_by_page
-            }
-            
-        elif pipeline_type == 'vlm':
-            group_id = str(uuid.uuid4())
-            config = {
-                **base_config,
-                'do_ocr': False,
-                'force_ocr': False,
-                'pipeline': 'vlm',
-                'vlm_pipeline_model': os.getenv('DOCLING_VLM_MODEL', 'smoldocling')
-            }
-            tasks = self._enqueue_pipeline(
-                file_id, 'vlm', group_id, config,
-                bucket, file_path, cabinet_id, mime_type,
-                by_page=self.docling_vlm_by_page
-            )
-            return {
-                'group_id': group_id,
-                'task_count': len(tasks),
-                'by_page': self.docling_vlm_by_page
-            }
-        
-        else:
-            logger.error(f"Unknown pipeline type: {pipeline_type}")
-            return None
-
-# continue_sequential_pipeline method removed - task dependencies now handle sequential execution
-
-    def _load_split_map(self, bucket: str, file_id: str) -> Optional[Dict[str, Any]]:
-        """Load split map data for a file."""
-        try:
-            arts = self.client.supabase.table('document_artefacts') \
-                .select('id,type,rel_path') \
-                .eq('file_id', file_id).eq('type', 'split_map_json') \
-                .order('created_at', desc=True).limit(1).execute().data or []
-            if not arts:
-                return None
-            art = arts[0]
-            raw = self.storage.download_file(bucket, art['rel_path'])
-            import json as _json
-            return _json.loads(raw.decode('utf-8'))
-        except Exception:
-            return None
-
-    def _enqueue_pipeline(self, file_id: str, pipeline_type: str, group_id: str, config: Dict[str, Any],
-                         bucket: str, file_path: str, cabinet_id: str, mime_type: str,
-                         by_page: bool = False) -> List[str]:
-        """Enqueue tasks for a specific pipeline (OCR/No-OCR/VLM)"""
-        
-        task_ids = []
-        
-        if by_page:
-            # Process each page individually, then group by split map sections
-            logger.info(f"Phase 2: Enqueueing {pipeline_type} pipeline by page for file {file_id}")
-            
-            split_map = self._load_split_map(bucket, file_id)
-            if split_map:
-                entries = split_map.get('entries', [])
-                for section_idx, entry in enumerate(entries, 1):
-                    start_page = int(entry.get('start_page', 1))
-                    end_page = int(entry.get('end_page', start_page))
-                    section_title = entry.get('title', f'Section {section_idx}')
-                    
-                    if pipeline_type == 'vlm':
-                        # VLM uses specialized page processing
-                        section_task_id = enqueue_docling_task(
-                            file_id=file_id,
-                            task_type='vlm_section_page_bundle',
-                            payload={
-                                'section_idx': section_idx,
-                                'start_page': start_page,
-                                'end_page': end_page,
-                                'section_title': section_title,
-                                'vlm_group_id': group_id,
-                                'vlm_model': config.get('vlm_pipeline_model', 'smoldocling'),
-                                'base_config': config,
-                                'total_sections': len(entries),
-                                'producer': 'auto_phase2'
-                            },
-                            priority=TaskPriority.NORMAL,
-                            timeout=3600
-                        )
-                        task_ids.append(section_task_id)
-                    else:
-                        # OCR/No-OCR by page processing (process each page in section individually)
-                        for page_num in range(start_page, end_page + 1):
-                            page_config = {
-                                **config,
-                                'page_range': [page_num, page_num]
-                            }
-                            
-                            # Create standardized bundle metadata
-                            page_metadata = create_standard_metadata(
-                                file_id=file_id,
-                                pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
-                                processing_mode="individual_pages",
-                                config=page_config,
-                                group_id=group_id,
-                                split_order=section_idx,
-                                split_total=len(entries),
-                                split_heading=section_title,
-                                page_range=[page_num, page_num],
-                                producer="auto_phase2"
-                            )
-                            
-                            # Add legacy fields for backward compatibility
-                            artefact_extra = page_metadata.to_artefact_extra()
-                            artefact_extra.update({
-                                'section_idx': section_idx,
-                                'section_title': section_title,
-                                'page_number': page_num,
-                            })
-                            
-                            page_task_id = enqueue_docling_task(
-                                file_id=file_id,
-                                task_type='canonical_docling_json',
-                                payload={
-                                    'bucket': bucket,
-                                    'file_path': file_path,
-                                    'cabinet_id': cabinet_id,
-                                    'mime_type': mime_type,
-                                    'config': page_config,
-                                    'artefact_extra': artefact_extra
-                                },
-                                priority=TaskPriority.NORMAL,
-                                timeout=1800
-                            )
-                            task_ids.append(page_task_id)
-            else:
-                logger.warning(f"Phase 2: No split map found for by-page processing of file {file_id}")
-                return []
-        
-        elif self.docling_use_split_map:
-            # Process by split map sections
-            logger.info(f"Phase 2: Enqueueing {pipeline_type} pipeline by split map sections for file {file_id}")
-            
-            split_map = self._load_split_map(bucket, file_id)
-            if split_map:
-                entries = split_map.get('entries', [])
-                
-                # Normalize and sort entries by start_page
-                normalized_entries = []
-                for entry in entries:
-                    try:
-                        start_page = int(entry.get('start_page', 1))
-                        end_page = int(entry.get('end_page', start_page))
-                        title = entry.get('title') or entry.get('label') or ''
-                        if end_page < start_page:
-                            end_page = start_page
-                        normalized_entries.append({
-                            'start': start_page,
-                            'end': end_page,
-                            'title': title
-                        })
-                    except Exception:
-                        continue
-                
-                normalized_entries.sort(key=lambda x: x['start'])
-                
-                # Create tasks for each section
-                for i, entry in enumerate(normalized_entries, 1):
-                    section_config = {
-                        **config,
-                        'page_range': [entry['start'], entry['end']]
-                    }
-                    
-                    # Create standardized bundle metadata for section
-                    section_metadata = create_standard_metadata(
-                        file_id=file_id,
-                        pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
-                        processing_mode="split_sections",
-                        config=section_config,
-                        group_id=group_id,
-                        split_order=i,
-                        split_total=len(normalized_entries),
-                        split_heading=entry['title'] or f'Section {i}',
-                        page_range=[entry['start'], entry['end']],
-                        producer="auto_phase2"
-                    )
-                    
-                    section_task_id = enqueue_docling_task(
-                        file_id=file_id,
-                        task_type='canonical_docling_json',
-                        payload={
-                            'bucket': bucket,
-                            'file_path': file_path,
-                            'cabinet_id': cabinet_id,
-                            'mime_type': mime_type,
-                            'config': section_config,
-                            'artefact_extra': section_metadata.to_artefact_extra()
-                        },
-                        priority=TaskPriority.NORMAL,
-                        timeout=3600
-                    )
-                    task_ids.append(section_task_id)
-            else:
-                logger.warning(f"Phase 2: No split map found for section-based processing of file {file_id}")
-                return []
-        
-        else:
-            # Process whole document
-            logger.info(f"Phase 2: Enqueueing {pipeline_type} pipeline for whole document {file_id}")
-            
-            # Create standardized bundle metadata for whole document
-            whole_doc_metadata = create_standard_metadata(
-                file_id=file_id,
-                pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
-                processing_mode="whole_document",
-                config=config,
-                group_id=group_id,
-                producer="auto_phase2"
-            )
-            
-            task_id = enqueue_docling_task(
-                file_id=file_id,
-                task_type='canonical_docling_json',
-                payload={
-                    'bucket': bucket,
-                    'file_path': file_path,
-                    'cabinet_id': cabinet_id,
-                    'mime_type': mime_type,
-                    'config': config,
-                    'artefact_extra': whole_doc_metadata.to_artefact_extra()
-                },
-                priority=TaskPriority.NORMAL,
-                timeout=7200
-            )
-            task_ids.append(task_id)
-        
-        logger.info(f"Phase 2: Enqueued {len(task_ids)} tasks for {pipeline_type} pipeline")
-        return task_ids
-
-    def _enqueue_pipeline_with_deps(self, file_id: str, pipeline_type: str, group_id: str, config: Dict[str, Any],
-                                   bucket: str, file_path: str, cabinet_id: str, mime_type: str,
-                                   by_page: bool = False, depends_on: List[str] = None) -> List[str]:
-        """Enqueue tasks for a specific pipeline with dependencies"""
-        
-        if depends_on is None:
-            depends_on = []
-        
-        task_ids = []
-        
-        if by_page:
-            # Process each page individually, then group by split map sections
-            logger.info(f"Enqueueing {pipeline_type} pipeline by page for file {file_id} with {len(depends_on)} dependencies")
-            
-            split_map = self._load_split_map(bucket, file_id)
-            if split_map:
-                entries = split_map.get('entries', [])
-                for section_idx, entry in enumerate(entries, 1):
-                    start_page = int(entry.get('start_page', 1))
-                    end_page = int(entry.get('end_page', start_page))
-                    section_title = entry.get('title', f'Section {section_idx}')
-                    
-                    if pipeline_type == 'vlm':
-                        # VLM uses specialized page processing
-                        section_task_id = enqueue_docling_task(
-                            file_id=file_id,
-                            task_type='vlm_section_page_bundle',
-                            payload={
-                                'section_idx': section_idx,
-                                'start_page': start_page,
-                                'end_page': end_page,
-                                'section_title': section_title,
-                                'vlm_group_id': group_id,
-                                'vlm_model': config.get('vlm_pipeline_model', 'smoldocling'),
-                                'base_config': config,
-                                'total_sections': len(entries),
-                                'producer': 'auto_phase2',
-                                'depends_on': depends_on
-                            },
-                            priority=TaskPriority.NORMAL,
-                            timeout=3600
-                        )
-                        task_ids.append(section_task_id)
-                    else:
-                        # OCR/No-OCR by page processing (process each page in section individually)
-                        for page_num in range(start_page, end_page + 1):
-                            page_config = {
-                                **config,
-                                'page_range': [page_num, page_num]
-                            }
-                            
-                            # Create standardized bundle metadata
-                            page_metadata = create_standard_metadata(
-                                file_id=file_id,
-                                pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
-                                processing_mode="individual_pages",
-                                config=page_config,
-                                group_id=group_id,
-                                split_order=section_idx,
-                                split_total=len(entries),
-                                split_heading=section_title,
-                                page_range=[page_num, page_num],
-                                producer="auto_phase2"
-                            )
-                            
-                            # Add legacy fields for backward compatibility
-                            artefact_extra = page_metadata.to_artefact_extra()
-                            artefact_extra.update({
-                                'section_idx': section_idx,
-                                'section_title': section_title,
-                                'page_number': page_num,
-                            })
-                            
-                            page_task_id = enqueue_docling_task(
-                                file_id=file_id,
-                                task_type='canonical_docling_json',
-                                payload={
-                                    'bucket': bucket,
-                                    'file_path': file_path,
-                                    'cabinet_id': cabinet_id,
-                                    'mime_type': mime_type,
-                                    'config': page_config,
-                                    'artefact_extra': artefact_extra,
-                                    'depends_on': depends_on
-                                },
-                                priority=TaskPriority.NORMAL,
-                                timeout=1800
-                            )
-                            task_ids.append(page_task_id)
-            else:
-                logger.warning(f"No split map found for by-page processing of file {file_id}")
-                return []
-        
-        elif self.docling_use_split_map:
-            # Process by split map sections
-            logger.info(f"Enqueueing {pipeline_type} pipeline by split map sections for file {file_id} with {len(depends_on)} dependencies")
-            
-            split_map = self._load_split_map(bucket, file_id)
-            if split_map:
-                entries = split_map.get('entries', [])
-                
-                # Normalize and sort entries by start_page
-                normalized_entries = []
-                for entry in entries:
-                    try:
-                        start_page = int(entry.get('start_page', 1))
-                        end_page = int(entry.get('end_page', start_page))
-                        title = entry.get('title') or entry.get('label') or ''
-                        if end_page < start_page:
-                            end_page = start_page
-                        normalized_entries.append({
-                            'start': start_page,
-                            'end': end_page,
-                            'title': title
-                        })
-                    except Exception:
-                        continue
-                
-                normalized_entries.sort(key=lambda x: x['start'])
-                
-                # Create tasks for each section
-                for i, entry in enumerate(normalized_entries, 1):
-                    section_config = {
-                        **config,
-                        'page_range': [entry['start'], entry['end']]
-                    }
-                    
-                    # Create standardized bundle metadata for section
-                    section_metadata = create_standard_metadata(
-                        file_id=file_id,
-                        pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
-                        processing_mode="split_sections",
-                        config=section_config,
-                        group_id=group_id,
-                        split_order=i,
-                        split_total=len(normalized_entries),
-                        split_heading=entry['title'] or f'Section {i}',
-                        page_range=[entry['start'], entry['end']],
-                        producer="auto_phase2"
-                    )
-                    
-                    section_task_id = enqueue_docling_task(
-                        file_id=file_id,
-                        task_type='canonical_docling_json',
-                        payload={
-                            'bucket': bucket,
-                            'file_path': file_path,
-                            'cabinet_id': cabinet_id,
-                            'mime_type': mime_type,
-                            'config': section_config,
-                            'artefact_extra': section_metadata.to_artefact_extra(),
-                            'depends_on': depends_on
-                        },
-                        priority=TaskPriority.NORMAL,
-                        timeout=3600
-                    )
-                    task_ids.append(section_task_id)
-            else:
-                logger.warning(f"No split map found for section-based processing of file {file_id}")
-                return []
-        
-        else:
-            # Process whole document
-            logger.info(f"Enqueueing {pipeline_type} pipeline for whole document {file_id} with {len(depends_on)} dependencies")
-            
-            # Create standardized bundle metadata for whole document
-            whole_doc_metadata = create_standard_metadata(
-                file_id=file_id,
-                pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
-                processing_mode="whole_document",
-                config=config,
-                group_id=group_id,
-                producer="auto_phase2"
-            )
-            
-            task_id = enqueue_docling_task(
-                file_id=file_id,
-                task_type='canonical_docling_json',
-                payload={
-                    'bucket': bucket,
-                    'file_path': file_path,
-                    'cabinet_id': cabinet_id,
-                    'mime_type': mime_type,
-                    'config': config,
-                    'artefact_extra': whole_doc_metadata.to_artefact_extra(),
-                    'depends_on': depends_on
-                },
-                priority=TaskPriority.NORMAL,
-                timeout=7200
-            )
-            task_ids.append(task_id)
-        
-        logger.info(f"Enqueued {len(task_ids)} tasks for {pipeline_type} pipeline with dependencies")
-        return task_ids
-
-
-# Global pipeline controller instance
-_controller_instance = None
-
-def get_pipeline_controller() -> DocumentPipelineController:
-    """Get the global pipeline controller instance."""
-    global _controller_instance
-    if _controller_instance is None:
-        _controller_instance = DocumentPipelineController()
-    return _controller_instance
diff --git a/archive/auto_processing/task_processors.py b/archive/auto_processing/task_processors.py
deleted file mode 100644
index 58ec924..0000000
--- a/archive/auto_processing/task_processors.py
+++ /dev/null
@@ -1,2531 +0,0 @@
-"""
-Task Processors for Document Processing Queue
-
-This module contains the actual processing implementations for different
-types of queued tasks (Tika, Docling, LLM, Split Map).
-"""
-
-import json
-import zipfile
-import io
-import mimetypes
-import requests
-import tempfile
-import uuid
-from pathlib import Path
-from typing import Dict, Any, Optional
-import os
-
-from modules.queue_system import DocumentProcessingQueue, QueueTask, ServiceType
-from modules.database.supabase.utils.client import SupabaseServiceRoleClient
-from modules.database.supabase.utils.storage import StorageAdmin
-from modules.document_processor import DocumentProcessor
-from modules.logger_tool import initialise_logger
-
-logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
-
-class DocumentTaskProcessor(DocumentProcessingQueue):
-    """
-    Extended queue with actual task processing implementations.
-    """
-    
-    def __init__(self, redis_url: str = None):
-        super().__init__(redis_url)
-        self.client = SupabaseServiceRoleClient()
-        self.storage = StorageAdmin()
-        self.doc_processor = DocumentProcessor()
-        
-        # Service URLs
-        self.tika_url = os.getenv('TIKA_URL')
-        self.docling_url = os.getenv('DOCLING_URL') or os.getenv('NEOFS_DOCLING_URL')
-        self.llm_url = os.getenv('LLM_URL')  # Local LLM endpoint
-        
-        logger.info("Task processor initialized with service URLs")
-
-    def _process_task(self, task: QueueTask):
-        """Process a task based on its service type."""
-        try:
-            # DEBUG: Log entry into processing
-            logger.info(f"🚀 PROCESS DEBUG: Starting _process_task for {task.id}")
-            
-            # Audit dependency info (if any)
-            try:
-                deps = []
-                if isinstance(task.payload, dict):
-                    deps = task.payload.get('depends_on') or []
-                if deps:
-                    logger.info(f"Processing task {task.id}: {task.service.value}/{task.task_type} deps={deps}")
-                else:
-                    logger.info(f"Processing task {task.id}: {task.service.value}/{task.task_type}")
-            except Exception:
-                logger.info(f"Processing task {task.id}: {task.service.value}/{task.task_type}")
-            
-            # DEBUG: Log service routing
-            logger.info(f"🚀 PROCESS DEBUG: Routing task {task.id} to service {task.service}")
-            
-            if task.service == ServiceType.TIKA:
-                result = self._process_tika_task(task)
-            elif task.service == ServiceType.DOCLING:
-                result = self._process_docling_task(task)
-            elif task.service == ServiceType.LLM:
-                result = self._process_llm_task(task)
-            elif task.service == ServiceType.SPLIT_MAP:
-                result = self._process_split_map_task(task)
-            elif task.service == ServiceType.DOCUMENT_ANALYSIS:
-                result = self.process_document_analysis_task(task)
-            elif task.service == ServiceType.PAGE_IMAGES:
-                result = self.process_page_images_task(task)
-            else:
-                raise ValueError(f"Unknown service type: {task.service}")
-            
-            # DEBUG: Log successful completion
-            logger.info(f"✅ PROCESS DEBUG: Task {task.id} completed successfully, calling complete_task")
-            self.complete_task(task, result)
-            logger.info(f"✅ PROCESS DEBUG: Task {task.id} completion confirmed")
-            
-        except Exception as e:
-            # DEBUG: Log detailed failure info
-            logger.error(f"🚨 PROCESS DEBUG: Task {task.id} processing failed: {e}")
-            logger.error(f"🚨 PROCESS DEBUG: Exception type: {type(e)}")
-            import traceback
-            logger.error(f"🚨 PROCESS DEBUG: Full traceback:\n{traceback.format_exc()}")
-            logger.info(f"🚨 PROCESS DEBUG: Calling fail_task for {task.id}")
-            self.fail_task(task, str(e))
-            logger.info(f"🚨 PROCESS DEBUG: fail_task completed for {task.id}")
-
-    def _process_tika_task(self, task: QueueTask) -> Dict[str, Any]:
-        """Process Tika metadata extraction task."""
-        if not self.tika_url:
-            raise ValueError("TIKA_URL not configured")
-        
-        payload = task.payload
-        file_id = task.file_id
-        bucket = payload['bucket']
-        file_path = payload['file_path']
-        cabinet_id = payload['cabinet_id']
-        mime_type = payload.get('mime_type', 'application/octet-stream')
-        
-        # Download file
-        logger.debug(f"Downloading file for Tika processing: {bucket}/{file_path}")
-        file_bytes = self.storage.download_file(bucket, file_path)
-        
-        # Call Tika
-        headers = {'Accept': 'application/json', 'Content-Type': mime_type}
-        timeout = task.timeout
-        
-        response = requests.put(
-            f"{self.tika_url.rstrip('/')}/meta",
-            data=file_bytes,
-            headers=headers,
-            timeout=timeout
-        )
-        response.raise_for_status()
-        
-        tika_json = response.json()
-        
-        # Store result as artefact
-        artefact_id = str(uuid.uuid4())
-        rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/tika.json"
-        
-        self.storage.upload_file(
-            bucket, 
-            rel_path, 
-            json.dumps(tika_json, ensure_ascii=False).encode('utf-8'),
-            'application/json',
-            upsert=True
-        )
-        
-        # Create artefact record with enhanced UI metadata
-        artefact_data = {
-            'id': artefact_id,
-            'file_id': file_id,
-            'type': 'tika_json',
-            'rel_path': rel_path,
-            'extra': {
-                'processing_time': response.elapsed.total_seconds(),
-                'display_name': 'Document Metadata',
-                'bundle_label': 'Tika Analysis',
-                'section_title': 'Document Metadata',
-                'bundle_type': 'tika_json',
-                'processing_mode': 'metadata_extraction',
-                'pipeline': 'tika_analysis',
-                'is_metadata': True,
-                'ui_category': 'raw_data',
-                'ui_order': 3,
-                'description': 'Raw document metadata and properties extracted by Apache Tika',
-                'viewer_type': 'json'
-            },
-            'status': 'completed'
-        }
-        
-        self.client.supabase.table('document_artefacts').insert(artefact_data).execute()
-        
-        logger.info(f"Tika processing completed for file {file_id}")
-        return {
-            'artefact_id': artefact_id,
-            'rel_path': rel_path,
-            'processing_time': response.elapsed.total_seconds()
-        }
-
-    def _process_docling_task(self, task: QueueTask) -> Dict[str, Any]:
-        """Process Docling document analysis task.
-
-        Also allows routing of related task types so that page images and
-        enhanced structure analysis can run under the stable docling service
-        umbrella when SERVICE dispatch for new types is problematic.
-        """
-        # Soft-route additional task types through this handler
-        if task.task_type in ("document_structure_analysis", "document_analysis"):
-            return self.process_document_analysis_task(task)
-        if task.task_type in ("generate_page_images", "page_images"):
-            return self.process_page_images_task(task)
-        if task.task_type in ("vlm_section_page_bundle",):
-            return self.process_vlm_section_page_bundle_task(task)
-        if task.task_type in ("vlm_section_bundle_collector",):
-            return self.process_vlm_section_bundle_collector_task(task)
-        # New unified bundle architecture handlers
-        if task.task_type in ("docling_bundle",):
-            return self.process_docling_bundle_task(task)
-        if task.task_type in ("docling_bundle_split",):
-            return self.process_docling_bundle_split_task(task)
-# phase2_coordinator task type removed - pipelines now enqueued directly from split_map task
-        if not self.docling_url:
-            raise ValueError("DOCLING_URL not configured")
-        
-        payload = task.payload
-        file_id = task.file_id
-        bucket = payload['bucket']
-        file_path = payload['file_path']
-        cabinet_id = payload['cabinet_id']
-        task_config = payload.get('config', {})
-        
-        # Download file
-        logger.debug(f"Downloading file for Docling processing: {bucket}/{file_path}")
-        file_bytes = self.storage.download_file(bucket, file_path)
-        
-        # Prepare Docling request
-        docling_api_key = os.getenv('DOCLING_API_KEY')
-        # Accept any content type so zip/binary responses are allowed
-        headers = {'Accept': '*/*'}
-        if docling_api_key:
-            headers['X-Api-Key'] = docling_api_key
-        
-        # Determine to_formats. For canonical docling we will request a ZIP bundle.
-        to_formats_val = task_config.get('to_formats', 'json')
-        to_formats_list = to_formats_val if isinstance(to_formats_val, list) else [to_formats_val]
-        is_canonical = str(task.task_type).startswith('canonical_docling')
-        target_type = task_config.get('target_type', 'zip' if is_canonical else 'inbody')
-
-        # Build form data from config (override for canonical)
-        form_data = [
-            ('target_type', target_type),
-            ('do_ocr', str(task_config.get('do_ocr', False)).lower()),
-            ('force_ocr', str(task_config.get('force_ocr', False)).lower()),
-            ('image_export_mode', 'referenced' if is_canonical else task_config.get('image_export_mode', 'embedded')),
-            ('ocr_engine', task_config.get('ocr_engine', 'easyocr')),
-            ('ocr_lang', task_config.get('ocr_lang', 'en')),
-            ('pdf_backend', task_config.get('pdf_backend', 'dlparse_v4')),
-            ('table_mode', task_config.get('table_mode', 'fast')),
-            ('do_formula_enrichment', str(task_config.get('do_formula_enrichment', False)).lower()),
-            ('do_code_enrichment', str(task_config.get('do_code_enrichment', False)).lower()),
-            ('pipeline', task_config.get('pipeline', 'standard'))
-        ]
-        # Optional extra flags forwarded when present
-        if 'table_cell_matching' in task_config:
-            form_data.append(('table_cell_matching', str(task_config.get('table_cell_matching')).lower()))
-        if 'do_picture_classification' in task_config:
-            form_data.append(('do_picture_classification', str(task_config.get('do_picture_classification')).lower()))
-        if 'do_picture_description' in task_config:
-            form_data.append(('do_picture_description', str(task_config.get('do_picture_description')).lower()))
-        if task_config.get('picture_description_prompt'):
-            form_data.append(('picture_description_prompt', task_config.get('picture_description_prompt')))
-        # picture_description_api and vlm_pipeline_model_api must be JSON per Docling OpenAPI
-        if task_config.get('picture_description_api') is not None:
-            v = task_config.get('picture_description_api')
-            if isinstance(v, (dict, list)):
-                form_data.append(('picture_description_api', json.dumps(v)))
-            elif isinstance(v, str) and v.strip().startswith(('{', '[')):
-                form_data.append(('picture_description_api', v))
-            # else: omit to avoid validation error
-        if task_config.get('vlm_pipeline_model'):
-            form_data.append(('vlm_pipeline_model', task_config.get('vlm_pipeline_model')))
-        if task_config.get('vlm_pipeline_model_api') is not None:
-            v = task_config.get('vlm_pipeline_model_api')
-            if isinstance(v, (dict, list)):
-                form_data.append(('vlm_pipeline_model_api', json.dumps(v)))
-            elif isinstance(v, str) and v.strip().startswith(('{', '[')):
-                form_data.append(('vlm_pipeline_model_api', v))
-            # else: omit
-        if is_canonical and ('md' in to_formats_list):
-            form_data.append(('md_page_break_placeholder', task_config.get('md_page_break_placeholder', '\n\n<!-- page-break -->\n\n')))
-        # Append to_formats as repeated fields (filter unsupported split pages)
-        to_formats_list = [f for f in to_formats_list if f != 'html_split_page']
-        for fmt in to_formats_list:
-            form_data.append(('to_formats', fmt))
-        
-        # Handle page range with clamping and min/max correction
-        page_range = task_config.get('page_range', [1, 999999])
-        if isinstance(page_range, list) and len(page_range) >= 2:
-            def _to_int_safe(v, default):
-                try:
-                    return int(v)
-                except Exception:
-                    return default
-            start_pg = _to_int_safe(page_range[0], 1)
-            end_pg = _to_int_safe(page_range[1], 999999)
-            if start_pg < 1:
-                start_pg = 1
-            if end_pg < start_pg:
-                end_pg = start_pg
-            # Clamp for frontmatter-like tasks to actual page count if possible
-            if task.task_type in ('docling_frontmatter_json', 'document_structure_analysis'):
-                try:
-                    import fitz  # PyMuPDF
-                    doc = fitz.open(stream=file_bytes, filetype='pdf')
-                    pc = int(doc.page_count)
-                    doc.close()
-                    if pc > 0:
-                        end_pg = min(end_pg, pc)
-                        start_pg = max(1, min(start_pg, pc))
-                        if end_pg < start_pg:
-                            end_pg = start_pg
-                except Exception:
-                    pass
-            form_data.append(('page_range', str(start_pg)))
-            form_data.append(('page_range', str(end_pg)))
-        
-        files = [('files', ('file', file_bytes, payload.get('mime_type', 'application/pdf')))]
-        
-        # Make request
-        response = requests.post(
-            f"{self.docling_url.rstrip('/')}/v1/convert/file",
-            files=files,
-            data=form_data,
-            headers=headers,
-            timeout=task.timeout
-        )
-        response.raise_for_status()
-
-        content_type = (response.headers.get('Content-Type') or '').lower()
-        is_zip_resp = ('zip' in content_type) or (response.content[:2] == b'PK')
-
-        if is_zip_resp and is_canonical:
-            # Unpack zip, store all files and a manifest
-            artefact_id = str(uuid.uuid4())
-            base_dir = f"{cabinet_id}/{file_id}/{artefact_id}"
-            archive_path = f"{base_dir}/bundle.zip"
-            # Save original archive
-            self.storage.upload_file(bucket, archive_path, response.content, 'application/zip', upsert=True)
-
-            zf = zipfile.ZipFile(io.BytesIO(response.content))
-            entries = []
-            md_full_path = None
-            html_full_path = None
-            text_full_path = None
-            json_full_path = None
-            images_list = []
-            md_data_bytes: bytes | None = None
-            for zi in zf.infolist():
-                if zi.is_dir():
-                    continue
-                name = zi.filename.lstrip('/').replace('..', '')
-                data = zf.read(zi)
-                ctype = mimetypes.guess_type(name)[0] or 'application/octet-stream'
-                rel = f"{base_dir}/{name}"
-                self.storage.upload_file(bucket, rel, data, ctype, upsert=True)
-                entries.append({
-                    'name': name,
-                    'path': rel,
-                    'size': zi.file_size,
-                    'content_type': ctype
-                })
-                # Detect known outputs
-                lower = name.lower()
-                if lower.endswith('.md') and md_full_path is None:
-                    md_full_path = rel
-                    md_data_bytes = data
-                elif lower.endswith('.html') and html_full_path is None:
-                    html_full_path = rel
-                elif lower.endswith('.txt') and text_full_path is None:
-                    text_full_path = rel
-                elif lower.endswith('.json') and json_full_path is None:
-                    json_full_path = rel
-                if ctype.startswith('image/'):
-                    images_list.append({'name': name, 'path': rel, 'content_type': ctype, 'size': zi.file_size})
-
-            manifest = {
-                'file_id': file_id,
-                'artefact_id': artefact_id,
-                'to_formats': to_formats_list,
-                'image_export_mode': 'referenced',
-                'entries': entries,
-                'archive_path': archive_path,
-                'markdown_full': md_full_path,
-                'html_full': html_full_path,
-                'text_full': text_full_path,
-                'json_full': json_full_path,
-                'images': images_list,
-                'bucket': bucket
-            }
-            # Create markdown pages by splitting on placeholder if available
-            if md_data_bytes is not None:
-                try:
-                    md_text = md_data_bytes.decode('utf-8', errors='replace')
-                    sep = task_config.get('md_page_break_placeholder', '\n\n<!-- page-break -->\n\n')
-                    parts = md_text.split(sep)
-                    if len(parts) > 1:
-                        pages_dir = f"{base_dir}/md_pages"
-                        pages = []
-                        for i, part in enumerate(parts, start=1):
-                            pth = f"{pages_dir}/page-{i:04d}.md"
-                            self.storage.upload_file(bucket, pth, part.encode('utf-8'), 'text/markdown', upsert=True)
-                            pages.append({'page': i, 'path': pth})
-                        manifest['markdown_pages'] = pages
-                except Exception as e:
-                    logger.warning(f"Failed creating markdown_pages for file {file_id}: {e}")
-            manifest_path = f"{base_dir}/manifest.json"
-            self.storage.upload_file(bucket, manifest_path, json.dumps(manifest, ensure_ascii=False).encode('utf-8'), 'application/json', upsert=True)
-
-            # Create artefact row pointing to directory with manifest, including grouping extras for split packs
-            artefact_extra = payload.get('artefact_extra') if isinstance(payload, dict) else None
-            # Determine artefact type by pipeline (standard vs vlm)
-            pipeline_mode = (task_config.get('pipeline') or 'standard').lower()
-            artefact_type_final = 'docling_vlm' if pipeline_mode == 'vlm' else 'docling_standard'
-            group_pack_type = payload.get('group_pack_type') if isinstance(payload, dict) else None
-            # propagate group_id if provided (set by caller for multi-part packs)
-            group_id = (artefact_extra or {}).get('group_id')
-            # Compute a settings fingerprint for grouping (exclude page_range)
-            try:
-                import hashlib, json as _json
-                cfg_for_hash = dict(task_config)
-                cfg_for_hash.pop('page_range', None)
-                settings_fingerprint = hashlib.sha1(_json.dumps(cfg_for_hash, sort_keys=True, ensure_ascii=False).encode('utf-8')).hexdigest()
-            except Exception:
-                settings_fingerprint = None
-
-            self.client.supabase.table('document_artefacts').insert({
-                'id': artefact_id,
-                'file_id': file_id,
-                'type': artefact_type_final,
-                'rel_path': base_dir,
-                'extra': {
-                    'manifest': manifest_path,
-                    'processing_time': response.elapsed.total_seconds(),
-                    'config': task_config,
-                    'group_pack_type': group_pack_type or (artefact_extra or {}).get('group_pack_type'),
-                    'group_id': group_id,
-                    'pipeline': pipeline_mode,
-                    'settings_fingerprint': settings_fingerprint,
-                    **(artefact_extra or {})
-                },
-                'status': 'completed'
-            }).execute()
-
-            logger.info(f"Canonical docling bundle stored for file {file_id} with {len(entries)} files")
-            return {
-                'artefact_id': artefact_id,
-                'files_count': len(entries)
-            }
-
-        if 'application/json' in content_type or content_type.endswith('+json'):
-            docling_json = response.json()
-            artefact_id = str(uuid.uuid4())
-            artefact_type = task.task_type
-            rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/{artefact_type.replace('_json', '.json')}"
-            self.storage.upload_file(
-                bucket,
-                rel_path,
-                json.dumps(docling_json, ensure_ascii=False).encode('utf-8'),
-                'application/json',
-                upsert=True
-            )
-            artefact_data = {
-                'id': artefact_id,
-                'file_id': file_id,
-                'type': artefact_type,
-                'rel_path': rel_path,
-                'extra': {
-                    'processing_time': response.elapsed.total_seconds(),
-                    'config': task_config,
-                    **({} if 'artefact_extra' not in payload else payload['artefact_extra'])
-                },
-                'status': 'completed'
-            }
-            self.client.supabase.table('document_artefacts').insert(artefact_data).execute()
-        else:
-            # Fallback: store raw output if server didn't return JSON (unexpected for inbody)
-            artefact_id = str(uuid.uuid4())
-            ext = ('html' if 'html' in content_type else ('md' if 'markdown' in content_type else ('txt' if 'text/plain' in content_type else 'bin')))
-            artefact_type = f'docling_output_{ext}'
-            rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/docling_output.{ext}"
-            self.storage.upload_file(
-                bucket,
-                rel_path,
-                response.content,
-                'application/zip' if ext == 'zip' else (content_type or 'application/octet-stream'),
-                upsert=True
-            )
-            artefact_data = {
-                'id': artefact_id,
-                'file_id': file_id,
-                'type': artefact_type,
-                'rel_path': rel_path,
-                'extra': {
-                    'processing_time': response.elapsed.total_seconds(),
-                    'config': task_config,
-                    'to_formats': to_formats_list,
-                    'content_type': content_type,
-                    **({} if 'artefact_extra' not in payload else payload['artefact_extra'])
-                },
-                'status': 'completed'
-            }
-            self.client.supabase.table('document_artefacts').insert(artefact_data).execute()
-
-        # When we get canonical Docling JSON, also split out component contents into separate artefacts
-        try:
-            if 'application/json' in content_type or content_type.endswith('+json'):
-                self._store_docling_component_artefacts(
-                    file_id=file_id,
-                    cabinet_id=cabinet_id,
-                    bucket=bucket,
-                    docling_json=docling_json,
-                    task_config=task_config,
-                    artefact_extra=payload.get('artefact_extra') if isinstance(payload, dict) else None
-                )
-        except Exception as split_e:
-            logger.warning(f"Storing component artefacts failed for file {file_id}: {split_e}")
-        
-        # Handle optional frontpage image extraction
-        if task.task_type == 'docling_frontmatter_json':
-            try:
-                self._extract_frontpage_image(docling_json, file_id, cabinet_id, bucket)
-            except Exception as e:
-                logger.warning(f"Frontpage image extraction failed for file {file_id}: {e}")
-        
-        logger.info(f"Docling processing completed for file {file_id}")
-        
-        # Pipeline dependencies now handle sequential execution automatically
-        
-        return {
-            'artefact_id': artefact_id,
-            'rel_path': rel_path,
-            'processing_time': response.elapsed.total_seconds()
-        }
-
-    def _extract_frontpage_image(self, docling_json: Dict[str, Any], file_id: str, 
-                                cabinet_id: str, bucket: str):
-        """Extract and store frontpage image from Docling JSON."""
-        import base64
-        
-        # Look for frontpage image in various locations
-        cover_b64 = None
-        for key in ['frontpage', 'cover']:
-            if key in docling_json and 'image_base64' in docling_json[key]:
-                cover_b64 = docling_json[key]['image_base64']
-                break
-        
-        if not cover_b64:
-            return
-        
-        # Decode and store image
-        artefact_id = str(uuid.uuid4())
-        img_bytes = base64.b64decode(cover_b64)
-        rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/frontpage.png"
-        
-        self.storage.upload_file(bucket, rel_path, img_bytes, 'image/png', upsert=True)
-        
-        # Create artefact record
-        artefact_data = {
-            'id': artefact_id,
-            'file_id': file_id,
-            'type': 'docling_frontpage_image',
-            'rel_path': rel_path,
-            'extra': {'extracted_from': 'docling_frontmatter'},
-            'status': 'completed'
-        }
-        
-        self.client.supabase.table('document_artefacts').insert(artefact_data).execute()
-        logger.debug(f"Frontpage image extracted for file {file_id}")
-
-    def _store_docling_component_artefacts(self, *, file_id: str, cabinet_id: str, bucket: str, docling_json: Dict[str, Any], task_config: Dict[str, Any], artefact_extra: Optional[Dict[str, Any]] = None) -> None:
-        """Create artefacts for component contents from a canonical Docling JSON.
-
-        Stores md_content, html_content, text_content, doctags_content and json_content
-        if present, as separate artefacts and files alongside the canonical JSON.
-        """
-        doc = docling_json.get('document') or {}
-        components = [
-            ('md_content', 'docling_md', 'docling.md', 'text/markdown', lambda v: v if isinstance(v, str) else ''),
-            ('html_content', 'docling_html', 'docling.html', 'text/html', lambda v: v if isinstance(v, str) else ''),
-            ('text_content', 'docling_text', 'docling.txt', 'text/plain', lambda v: v if isinstance(v, str) else ''),
-            ('doctags_content', 'docling_doctags', 'docling.doctags.xml', 'application/xml', lambda v: v if isinstance(v, str) else ''),
-            ('json_content', 'docling_json', 'docling.json', 'application/json', lambda v: json.dumps(v or {}, ensure_ascii=False)),
-        ]
-
-        for key, art_type, filename, mime, to_bytes in components:
-            if key not in doc or doc.get(key) in (None, ''):
-                continue
-            try:
-                artefact_id = str(uuid.uuid4())
-                rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/{filename}"
-                data_bytes = to_bytes(doc.get(key))
-                if isinstance(data_bytes, str):
-                    data_bytes = data_bytes.encode('utf-8')
-                self.storage.upload_file(bucket, rel_path, data_bytes, mime, upsert=True)
-                extra = {'source': 'canonical_docling_json', 'component_key': key, 'config': task_config}
-                if artefact_extra:
-                    extra.update(artefact_extra)
-                self.client.supabase.table('document_artefacts').insert({
-                    'id': artefact_id,
-                    'file_id': file_id,
-                    'type': art_type,
-                    'rel_path': rel_path,
-                    'extra': extra,
-                    'status': 'completed'
-                }).execute()
-            except Exception as e:
-                logger.warning(f"Failed to store component '{key}' for file {file_id}: {e}")
-
-    def _process_llm_task(self, task: QueueTask) -> Dict[str, Any]:
-        """Process LLM analysis task (document classification, etc.)."""
-        if not self.llm_url:
-            raise ValueError("LLM_URL not configured")
-        
-        payload = task.payload
-        file_id = task.file_id
-        prompt = payload['prompt']
-        context = payload.get('context', '')
-        model = payload.get('model', 'default')
-        
-        # Prepare LLM request
-        llm_request = {
-            'model': model,
-            'prompt': prompt,
-            'context': context,
-            'max_tokens': payload.get('max_tokens', 1000),
-            'temperature': payload.get('temperature', 0.1)
-        }
-        
-        # Call local LLM
-        response = requests.post(
-            f"{self.llm_url.rstrip('/')}/generate",
-            json=llm_request,
-            headers={'Content-Type': 'application/json'},
-            timeout=task.timeout
-        )
-        response.raise_for_status()
-        
-        llm_result = response.json()
-        
-        # Store result (optional - depends on use case)
-        if payload.get('store_result', False):
-            bucket = payload['bucket']
-            cabinet_id = payload['cabinet_id']
-            
-            artefact_id = str(uuid.uuid4())
-            rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/llm_{task.task_type}.json"
-            
-            self.storage.upload_file(
-                bucket,
-                rel_path,
-                json.dumps(llm_result, ensure_ascii=False).encode('utf-8'),
-                'application/json',
-                upsert=True
-            )
-            
-            # Create artefact record
-            artefact_data = {
-                'id': artefact_id,
-                'file_id': file_id,
-                'type': f'llm_{task.task_type}',
-                'rel_path': rel_path,
-                'extra': {
-                    'model': model,
-                    'task_type': task.task_type
-                },
-                'status': 'completed'
-            }
-            
-            self.client.supabase.table('document_artefacts').insert(artefact_data).execute()
-        
-        logger.info(f"LLM processing completed for file {file_id}")
-        return llm_result
-
-    def _process_split_map_task(self, task: QueueTask) -> Dict[str, Any]:
-        """Process split map generation task."""
-        from routers.database.files.split_map import create_split_map_for_file
-        from routers.database.files.files import enqueue_canonical_docling
-        
-        file_id = task.file_id
-        
-        # Generate split map
-        split_map = create_split_map_for_file(file_id)
-        
-        logger.info(f"Split map generation completed for file {file_id}")
-        
-        # NEW BUNDLE ARCHITECTURE: Direct pipeline enqueueing
-        # Split map completion now directly triggers bundle task creation
-        logger.info(f"NEW ARCHITECTURE: Enqueueing sequential docling bundle pipelines for file {file_id}")
-        
-        try:
-            # Get file information for pipeline enqueueing
-            file_result = self.client.supabase.table('files').select('*').eq('id', file_id).single().execute()
-            if not file_result.data:
-                logger.error(f"Could not find file {file_id} for pipeline enqueueing")
-                return {
-                    'method': split_map['method'],
-                    'confidence': split_map['confidence'],
-                    'entries_count': len(split_map['entries']),
-                    'pipeline_error': 'File not found for pipeline enqueueing'
-                }
-            
-            file_row = file_result.data
-            bucket = file_row['bucket']
-            cabinet_id = file_row['cabinet_id']
-            storage_path = file_row['path']
-            original_mime = file_row.get('mime_type', 'application/pdf')
-            
-            # Prefer converted PDF if available (matches existing pattern)
-            try:
-                arts = self.client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
-                pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None)
-                processing_path = pdf_art['rel_path'] if pdf_art else storage_path
-                processing_mime = 'application/pdf' if pdf_art else original_mime
-            except Exception:
-                processing_path = storage_path
-                processing_mime = original_mime
-            
-            # Prepare file data for pipeline controller
-            file_data = {
-                'bucket': bucket,
-                'file_path': processing_path,
-                'cabinet_id': cabinet_id,
-                'mime_type': processing_mime
-            }
-            
-            # Import and use pipeline controller to enqueue sequential pipelines
-            from modules.pipeline_controller import get_pipeline_controller
-            controller = get_pipeline_controller()
-            
-            pipeline_result = controller.enqueue_sequential_docling_pipelines(file_id, file_data)
-            
-            logger.info(f"Successfully enqueued {pipeline_result['total_tasks']} tasks across "
-                      f"{len(pipeline_result['enqueued_pipelines'])} pipelines for file {file_id}")
-            logger.info(f"Pipeline execution order: {pipeline_result['sequential_order']}")
-            
-            return {
-                'method': split_map['method'],
-                'confidence': split_map['confidence'],
-                'entries_count': len(split_map['entries']),
-                'enqueued_pipelines': pipeline_result['enqueued_pipelines'],
-                'total_pipeline_tasks': pipeline_result['total_tasks'],
-                'pipeline_order': pipeline_result['sequential_order']
-            }
-            
-        except Exception as e:
-            logger.error(f"Failed to enqueue sequential pipelines for file {file_id}: {e}")
-            return {
-                'method': split_map['method'],
-                'confidence': split_map['confidence'],
-                'entries_count': len(split_map['entries']),
-                'pipeline_error': str(e)
-            }
-        
-        # Split map processing completed successfully
-        
-        return {
-            'method': split_map['method'],
-            'confidence': split_map['confidence'],
-            'entries_count': len(split_map['entries'])
-        }
-
-    def _enqueue_vlm_page_processing(self, file_id: str, threshold: int, vlm_group_id: str, vlm_model: str, base_config: dict):
-        """Enqueue VLM processing for individual pages within split map sections."""
-        from routers.database.files.files import _load_split_map
-        from modules.database.supabase.utils.client import SupabaseServiceRoleClient
-        from modules.database.supabase.utils.storage import StorageAdmin
-        
-        try:
-            client = SupabaseServiceRoleClient()
-            storage = StorageAdmin()
-            
-            # Get file info
-            fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
-            if not fr.data:
-                logger.error(f"File {file_id} not found for VLM page processing")
-                return
-                
-            file_row = fr.data
-            bucket = file_row['bucket']
-            
-            # Load split map
-            split_map = _load_split_map(client, storage, bucket, file_id)
-            if not split_map:
-                logger.warning(f"No split map found for VLM page processing file {file_id}")
-                return
-                
-            entries = split_map.get('entries', [])
-            if not entries:
-                logger.warning(f"Empty split map entries for VLM page processing file {file_id}")
-                return
-                
-            logger.info(f"[auto-canonical] VLM page processing: found {len(entries)} sections for file {file_id}")
-            
-            # Process each section with page-by-page VLM
-            for section_idx, entry in enumerate(entries, 1):
-                try:
-                    start_page = int(entry.get('start_page', 1))
-                    end_page = int(entry.get('end_page', start_page))
-                    section_title = entry.get('title', f'Section {section_idx}')
-                    
-                    logger.info(f"[auto-canonical] VLM page processing section {section_idx}: '{section_title}' pages {start_page}-{end_page}")
-                    
-                    # Create section-level bundle manifest task
-                    self._enqueue_vlm_section_page_bundle(
-                        file_id, section_idx, start_page, end_page, section_title,
-                        vlm_group_id, vlm_model, base_config, len(entries)
-                    )
-                    
-                except Exception as section_e:
-                    logger.warning(f"Failed to process VLM section {section_idx} for file {file_id}: {section_e}")
-                    continue
-                    
-        except Exception as e:
-            logger.error(f"VLM page processing setup failed for file {file_id}: {e}")
-
-    def _enqueue_vlm_section_page_bundle(self, file_id: str, section_idx: int, start_page: int, end_page: int, 
-                                        section_title: str, vlm_group_id: str, vlm_model: str, 
-                                        base_config: dict, total_sections: int):
-        """Enqueue VLM processing for individual pages within a section, then bundle them."""
-        from modules.queue_system import enqueue_docling_task, TaskPriority
-        
-        try:
-            # Create a unique task to handle page-by-page processing for this section
-            section_task_id = enqueue_docling_task(
-                file_id=file_id,
-                task_type='vlm_section_page_bundle',
-                payload={
-                    'section_idx': section_idx,
-                    'start_page': start_page,
-                    'end_page': end_page,
-                    'section_title': section_title,
-                    'vlm_group_id': vlm_group_id,
-                    'vlm_model': vlm_model,
-                    'base_config': base_config,
-                    'total_sections': total_sections,
-                    'producer': 'auto_split'
-                },
-                priority=TaskPriority.NORMAL,
-                timeout=3600  # 1 hour for page-by-page processing
-            )
-            
-            logger.info(f"[auto-canonical] VLM section page bundle task {section_task_id} for section {section_idx} of file {file_id}")
-            
-        except Exception as e:
-            logger.error(f"Failed to enqueue VLM section page bundle for section {section_idx} file {file_id}: {e}")
-    
-    def process_document_analysis_task(self, task: QueueTask) -> Dict[str, Any]:
-        """Process document structure analysis task"""
-        file_id = task.file_id
-        payload = task.payload
-        
-        logger.info(f"Processing document analysis task for file {file_id}")
-        
-        try:
-            # Load file from storage
-            bucket = payload['bucket']
-            file_path = payload['file_path']
-            cabinet_id = payload['cabinet_id']
-            
-            file_bytes = self.storage.download_file(bucket, file_path)
-            
-            # Load existing artefacts if available
-            client = SupabaseServiceRoleClient()
-            artefacts = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute()
-            
-            tika_json = None
-            docling_json = None
-            
-            for art in artefacts.data:
-                if art['type'] == 'tika_json' and art['status'] == 'completed':
-                    try:
-                        tika_data = self.storage.download_file(bucket, art['rel_path'])
-                        tika_json = json.loads(tika_data.decode('utf-8'))
-                    except Exception as e:
-                        logger.warning(f"Failed to load Tika JSON for analysis: {e}")
-                
-                elif art['type'] in ['docling_frontmatter_json', 'docling_noocr_json'] and art['status'] == 'completed':
-                    try:
-                        docling_data = self.storage.download_file(bucket, art['rel_path'])
-                        docling_json = json.loads(docling_data.decode('utf-8'))
-                        break  # Use first available Docling result
-                    except Exception as e:
-                        logger.warning(f"Failed to load Docling JSON for analysis: {e}")
-            
-            # Import here to avoid circular imports
-            from modules.document_analysis import create_document_outline_hierarchy_artefact
-            
-            # Create document analysis
-            analysis_data = create_document_outline_hierarchy_artefact(
-                file_id=file_id,
-                pdf_bytes=file_bytes,
-                tika_json=tika_json,
-                docling_json=docling_json
-            )
-            
-            # Store analysis as artefact (insert row first, then upload file)
-            artefact_id = analysis_data.get('artefact_id') or str(uuid.uuid4())
-            analysis_data['artefact_id'] = artefact_id
-            rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/document_outline_hierarchy.json"
-            # Insert row first to avoid orphaned files if DB insert fails
-            # Insert artefact record with processing status
-            sections_count = len(analysis_data.get('sections', []) or [])
-            metadata = analysis_data.get('metadata') or {}
-            analysis_methods = metadata.get('analysis_methods')
-            self.client.supabase.table('document_artefacts').insert({
-                'id': artefact_id,
-                'file_id': file_id,
-                'type': 'document_outline_hierarchy',
-                'rel_path': rel_path,
-                'extra': {
-                    'sections_count': sections_count,
-                    'analysis_methods': analysis_methods
-                },
-                'status': 'processing'
-            }).execute()
-
-            # Now upload the file
-            analysis_json = json.dumps(analysis_data, ensure_ascii=False)
-            self.storage.upload_file(bucket, rel_path, analysis_json.encode('utf-8'), 'application/json', upsert=True)
-
-            # Mark artefact as completed
-            self.client.supabase.table('document_artefacts').update({
-                'status': 'completed'
-            }).eq('id', artefact_id).execute()
-
-            logger.info(f"Document analysis completed for file {file_id} (sections={sections_count})")
-            return {
-                'sections_count': sections_count
-            }
-            
-        except Exception as e:
-            logger.error(f"Document analysis failed for file {file_id}: {e}")
-            raise
-    
-    def process_page_images_task(self, task: QueueTask) -> Dict[str, Any]:
-        """Process page images generation task"""
-        file_id = task.file_id
-        payload = task.payload
-        
-        logger.info(f"Processing page images task for file {file_id}")
-        
-        try:
-            # Load file from storage
-            bucket = payload['bucket']
-            file_path = payload['file_path']
-            cabinet_id = payload['cabinet_id']
-            
-            file_bytes = self.storage.download_file(bucket, file_path)
-            
-            # Import here to avoid circular imports
-            from modules.page_image_generator import create_page_images_artefact
-            
-            # Generate page images
-            images_data = create_page_images_artefact(
-                file_id=file_id,
-                cabinet_id=cabinet_id,
-                pdf_bytes=file_bytes
-            )
-            
-            artefact_id = images_data['artefact_id']
-            # Include bucket in manifest for client-side signed URL generation
-            images_data['bucket'] = bucket
-            
-            # Upload all page images to storage
-            for page_info in images_data['page_images']:
-                # Upload full image
-                full_path = page_info['full_image_path']
-                full_data = page_info.pop('full_image_data')  # Remove from JSON
-                self.storage.upload_file(bucket, full_path, full_data, 'image/png', upsert=True)
-                
-                # Upload thumbnail
-                thumb_path = page_info['thumbnail_path']
-                thumb_data = page_info.pop('thumbnail_data')  # Remove from JSON
-                self.storage.upload_file(bucket, thumb_path, thumb_data, 'image/webp', upsert=True)
-            
-            # Store images metadata manifest under the artefact directory
-            artefact_dir = f"{cabinet_id}/{file_id}/{artefact_id}"
-            manifest_rel_path = f"{artefact_dir}/page_images.json"
-            images_json = json.dumps(images_data, ensure_ascii=False)
-            self.storage.upload_file(bucket, manifest_rel_path, images_json.encode('utf-8'), 'application/json', upsert=True)
-            
-            # Insert artefact record
-            client = SupabaseServiceRoleClient()
-            client.supabase.table('document_artefacts').insert({
-                'id': artefact_id,
-                'file_id': file_id,
-                'type': 'page_images',
-                # Store the directory prefix as rel_path for hybrid approach
-                'rel_path': artefact_dir,
-                'extra': {
-                    'page_count': images_data['page_count'],
-                    'total_full_images': images_data['storage_info']['total_full_images'],
-                    'total_thumbnails': images_data['storage_info']['total_thumbnails'],
-                    'estimated_storage_mb': images_data['storage_info']['estimated_storage_mb'],
-                    'manifest': manifest_rel_path
-                },
-                'status': 'completed'
-            }).execute()
-            
-            logger.info(f"Page images generation completed for file {file_id}")
-            return {
-                'page_count': images_data['page_count'],
-                'estimated_storage_mb': images_data['storage_info']['estimated_storage_mb']
-            }
-            
-        except Exception as e:
-            logger.error(f"Page images generation failed for file {file_id}: {e}")
-            raise
-
-    def process_comparison_analysis_task(self, task: QueueTask) -> Dict[str, Any]:
-        """Process comparison analysis between no-OCR and OCR docling results."""
-        file_id = task.file_id
-        payload = task.payload
-        
-        logger.info(f"Processing comparison analysis task for file {file_id}")
-        
-        try:
-            no_ocr_group_id = payload.get('no_ocr_group_id')
-            ocr_group_id = payload.get('ocr_group_id')
-            comparison_type = payload.get('comparison_type', 'noocr_vs_ocr')
-            initial_delay = payload.get('initial_delay_seconds', 0)
-            
-            # If this is the first execution and we have an initial delay, sleep briefly
-            if initial_delay > 0:
-                import time
-                logger.info(f"Comparison analysis: applying initial delay of {min(initial_delay, 60)} seconds for file {file_id}")
-                time.sleep(min(initial_delay, 60))  # Max 1 minute delay per attempt
-                logger.info(f"Comparison analysis: delay complete for file {file_id}")
-            
-            if not no_ocr_group_id or not ocr_group_id:
-                raise ValueError("Missing group_id parameters for comparison")
-            
-            client = SupabaseServiceRoleClient()
-            
-            # Find file info
-            fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
-            if not fr.data:
-                raise ValueError(f"File {file_id} not found")
-            
-            file_row = fr.data
-            bucket = file_row['bucket']
-            cabinet_id = file_row['cabinet_id']
-            
-            # Find artefacts for both groups
-            artefacts = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute()
-            arts = artefacts.data or []
-            
-            # Filter artefacts by group_id and type, including status
-            no_ocr_arts = [a for a in arts if ((a.get('extra') or {}).get('group_id') == no_ocr_group_id and 
-                                             a.get('type') == 'docling_standard' and
-                                             a.get('status') == 'completed')]
-            ocr_arts = [a for a in arts if ((a.get('extra') or {}).get('group_id') == ocr_group_id and 
-                                          a.get('type') == 'docling_standard' and
-                                          a.get('status') == 'completed')]
-            
-            # Also check pending/processing artefacts to understand timing better
-            no_ocr_pending = [a for a in arts if ((a.get('extra') or {}).get('group_id') == no_ocr_group_id and 
-                                                a.get('type') == 'docling_standard' and
-                                                a.get('status') in ('processing', 'pending'))]
-            ocr_pending = [a for a in arts if ((a.get('extra') or {}).get('group_id') == ocr_group_id and 
-                                             a.get('type') == 'docling_standard' and
-                                             a.get('status') in ('processing', 'pending'))]
-            
-            # Determine expected total parts from split_total metadata (if available)
-            expected_parts = None
-            if no_ocr_arts:
-                expected_parts = (no_ocr_arts[0].get('extra') or {}).get('split_total')
-            elif ocr_arts:
-                expected_parts = (ocr_arts[0].get('extra') or {}).get('split_total')
-            elif no_ocr_pending:
-                expected_parts = (no_ocr_pending[0].get('extra') or {}).get('split_total')
-            elif ocr_pending:
-                expected_parts = (ocr_pending[0].get('extra') or {}).get('split_total')
-            
-            logger.info(f"Comparison analysis: found {len(no_ocr_arts)} completed no-OCR artefacts ({len(no_ocr_pending)} pending), {len(ocr_arts)} completed OCR artefacts ({len(ocr_pending)} pending), expected_parts={expected_parts}")
-            
-            # Enhanced validation with progress-aware retry logic
-            if expected_parts is not None:
-                # We know how many parts to expect, so wait for all of them
-                total_no_ocr = len(no_ocr_arts) + len(no_ocr_pending)
-                total_ocr = len(ocr_arts) + len(ocr_pending)
-                
-                # Calculate completion percentages
-                no_ocr_completion = len(no_ocr_arts) / expected_parts * 100
-                ocr_completion = len(ocr_arts) / expected_parts * 100
-                
-                # Check if we're making progress (store in task metadata for persistence)
-                progress_key = f"comparison_progress_{file_id}"
-                current_progress = {
-                    'no_ocr_completed': len(no_ocr_arts),
-                    'ocr_completed': len(ocr_arts),
-                    'no_ocr_pending': len(no_ocr_pending),
-                    'ocr_pending': len(ocr_pending)
-                }
-                
-                # Get previous progress from payload (injected by retry mechanism)
-                previous_progress = payload.get('previous_progress', {'no_ocr_completed': 0, 'ocr_completed': 0})
-                progress_made = (current_progress['no_ocr_completed'] > previous_progress['no_ocr_completed'] or 
-                               current_progress['ocr_completed'] > previous_progress['ocr_completed'])
-                
-                if len(no_ocr_arts) < expected_parts or len(ocr_arts) < expected_parts:
-                    if len(no_ocr_pending) > 0 or len(ocr_pending) > 0:
-                        # Still processing - this is expected, always retry
-                        error_msg = f"PROGRESS_RETRY: expected={expected_parts}, no_ocr={len(no_ocr_arts)}/{expected_parts} ({no_ocr_completion:.1f}%), ocr={len(ocr_arts)}/{expected_parts} ({ocr_completion:.1f}%), pending: no_ocr={len(no_ocr_pending)}, ocr={len(ocr_pending)}"
-                        progress_retry_error = ValueError(error_msg)
-                        progress_retry_error.current_progress = current_progress
-                        progress_retry_error.is_progress_retry = True
-                        raise progress_retry_error
-                    elif progress_made:
-                        # No pending but made progress since last check - likely brief gap between completions
-                        error_msg = f"PROGRESS_MADE_RETRY: expected={expected_parts}, no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)}, progress since last check"
-                        progress_retry_error = ValueError(error_msg)
-                        progress_retry_error.current_progress = current_progress
-                        progress_retry_error.is_progress_retry = True
-                        raise progress_retry_error
-                    else:
-                        # No progress and no pending - likely stalled, but still retry with backoff
-                        error_msg = f"STALLED_RETRY: expected={expected_parts}, no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - no pending tasks but will retry"
-                        stalled_retry_error = ValueError(error_msg)
-                        stalled_retry_error.current_progress = current_progress
-                        stalled_retry_error.is_stalled_retry = True
-                        raise stalled_retry_error
-                        
-                # Also verify both groups have the same number of completed parts
-                if len(no_ocr_arts) != len(ocr_arts):
-                    error_msg = f"ALIGNMENT_RETRY: no_ocr completed={len(no_ocr_arts)}, ocr completed={len(ocr_arts)} (expected {expected_parts} each) - waiting for alignment"
-                    alignment_retry_error = ValueError(error_msg)
-                    alignment_retry_error.current_progress = current_progress
-                    alignment_retry_error.is_alignment_retry = True
-                    raise alignment_retry_error
-                    
-            else:
-                # Fallback to original logic when split_total not available
-                if not no_ocr_arts or not ocr_arts:
-                    # More detailed retry logic with pending artefact awareness
-                    if len(no_ocr_arts) == 0 and len(ocr_arts) == 0:
-                        if len(no_ocr_pending) > 0 or len(ocr_pending) > 0:
-                            raise ValueError(f"Batches still processing: no_ocr completed={len(no_ocr_arts)} pending={len(no_ocr_pending)}, ocr completed={len(ocr_arts)} pending={len(ocr_pending)} - will retry")
-                        else:
-                            raise ValueError(f"No artefacts found for either group: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - may need more time")
-                    elif len(ocr_arts) == 0:
-                        if len(ocr_pending) > 0:
-                            raise ValueError(f"OCR batch still processing: no_ocr completed={len(no_ocr_arts)}, ocr completed={len(ocr_arts)} pending={len(ocr_pending)} - will retry")
-                        else:
-                            raise ValueError(f"OCR batch appears incomplete: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - will retry")
-                    elif len(no_ocr_arts) == 0:
-                        if len(no_ocr_pending) > 0:
-                            raise ValueError(f"No-OCR batch still processing: no_ocr completed={len(no_ocr_arts)} pending={len(no_ocr_pending)}, ocr completed={len(ocr_arts)} - will retry")
-                        else:
-                            raise ValueError(f"No-OCR batch appears incomplete: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - will retry")
-                    else:
-                        raise ValueError(f"Unexpected missing artefacts: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)}")
-                        
-                # For fallback case, ensure both groups have same count
-                if len(no_ocr_arts) != len(ocr_arts):
-                    raise ValueError(f"Mismatched group sizes: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - will retry")
-            
-            # Sort both groups by split_order for aligned comparison
-            no_ocr_arts.sort(key=lambda x: ((x.get('extra') or {}).get('split_order') or 0))
-            ocr_arts.sort(key=lambda x: ((x.get('extra') or {}).get('split_order') or 0))
-            
-            # Log final validation before proceeding
-            no_ocr_orders = [((a.get('extra') or {}).get('split_order') or 0) for a in no_ocr_arts]
-            ocr_orders = [((a.get('extra') or {}).get('split_order') or 0) for a in ocr_arts]
-            logger.info(f"Proceeding with comparison: no_ocr split_orders={no_ocr_orders}, ocr split_orders={ocr_orders}, expected_parts={expected_parts}")
-            
-            # Create comparison results
-            comparison_results = self._compare_docling_groups(
-                file_id, bucket, cabinet_id, no_ocr_arts, ocr_arts, comparison_type, 
-                no_ocr_group_id, ocr_group_id, payload
-            )
-            
-            return comparison_results
-            
-        except Exception as e:
-            logger.error(f"Comparison analysis failed for file {file_id}: {e}")
-            raise
-
-    def _compare_docling_groups(self, file_id: str, bucket: str, cabinet_id: str, 
-                               no_ocr_arts: list, ocr_arts: list, comparison_type: str,
-                               no_ocr_group_id: str, ocr_group_id: str, payload: Dict[str, Any]) -> Dict[str, Any]:
-        """Compare two groups of docling artefacts and generate analysis."""
-        import subprocess
-        import tempfile
-        import json
-        import uuid
-        
-        logger.info(f"Starting detailed comparison for file {file_id}: {len(no_ocr_arts)} vs {len(ocr_arts)} artefacts")
-        
-        artefact_id = str(uuid.uuid4())
-        comparison_dir = f"{cabinet_id}/{file_id}/{artefact_id}"
-        results = []
-        overall_stats = {
-            'total_comparisons': min(len(no_ocr_arts), len(ocr_arts)),
-            'successful_comparisons': 0,
-            'failed_comparisons': 0,
-            'differences_found': 0,
-            'identical_count': 0
-        }
-        
-        try:
-            with tempfile.TemporaryDirectory() as temp_dir:
-                for i in range(min(len(no_ocr_arts), len(ocr_arts))):
-                    no_ocr_art = no_ocr_arts[i]
-                    ocr_art = ocr_arts[i]
-                    
-                    try:
-                        # Download manifest JSONs for both artefacts
-                        no_ocr_manifest_path = ((no_ocr_art.get('extra') or {}).get('manifest'))
-                        ocr_manifest_path = ((ocr_art.get('extra') or {}).get('manifest'))
-                        
-                        if not no_ocr_manifest_path or not ocr_manifest_path:
-                            logger.warning(f"Missing manifest paths for comparison {i+1}")
-                            continue
-                            
-                        no_ocr_manifest_data = self.storage.download_file(bucket, no_ocr_manifest_path)
-                        ocr_manifest_data = self.storage.download_file(bucket, ocr_manifest_path)
-                        
-                        no_ocr_manifest = json.loads(no_ocr_manifest_data.decode('utf-8'))
-                        ocr_manifest = json.loads(ocr_manifest_data.decode('utf-8'))
-                        
-                        # Compare JSON content if available
-                        no_ocr_json_path = no_ocr_manifest.get('json_full')
-                        ocr_json_path = ocr_manifest.get('json_full') 
-                        
-                        if no_ocr_json_path and ocr_json_path:
-                            comparison_result = self._compare_json_content(
-                                bucket, no_ocr_json_path, ocr_json_path, temp_dir, i + 1
-                            )
-                            
-                            comparison_result['no_ocr_artefact_id'] = no_ocr_art['id']
-                            comparison_result['ocr_artefact_id'] = ocr_art['id']
-                            comparison_result['split_order'] = (no_ocr_art.get('extra') or {}).get('split_order', i + 1)
-                            comparison_result['split_heading'] = (no_ocr_art.get('extra') or {}).get('split_heading', f'Part {i+1}')
-                            
-                            results.append(comparison_result)
-                            
-                            overall_stats['successful_comparisons'] += 1
-                            if comparison_result['has_differences']:
-                                overall_stats['differences_found'] += 1
-                            else:
-                                overall_stats['identical_count'] += 1
-                                
-                        else:
-                            logger.warning(f"Missing JSON content paths for comparison {i+1}")
-                            overall_stats['failed_comparisons'] += 1
-                            
-                    except Exception as part_e:
-                        logger.warning(f"Failed to compare part {i+1}: {part_e}")
-                        overall_stats['failed_comparisons'] += 1
-                        continue
-            
-            # Create comprehensive comparison report
-            comparison_report = {
-                'file_id': file_id,
-                'comparison_type': comparison_type,
-                'timestamp': json.dumps({"created_at": "now()"}, default=str),
-                'overall_statistics': overall_stats,
-                'detailed_results': results,
-                'summary': {
-                    'total_parts_compared': overall_stats['successful_comparisons'],
-                    'identical_parts': overall_stats['identical_count'],
-                    'different_parts': overall_stats['differences_found'],
-                    'accuracy_percentage': (overall_stats['identical_count'] / max(overall_stats['successful_comparisons'], 1)) * 100
-                }
-            }
-            
-            # Store comparison report as artefact
-            report_path = f"{comparison_dir}/comparison_report.json"
-            report_json = json.dumps(comparison_report, ensure_ascii=False, indent=2)
-            
-            self.storage.upload_file(bucket, report_path, report_json.encode('utf-8'), 'application/json', upsert=True)
-            
-            # Create artefact record
-            client = SupabaseServiceRoleClient()
-            client.supabase.table('document_artefacts').insert({
-                'id': artefact_id,
-                'file_id': file_id,
-                'type': 'docling_comparison_analysis',
-                'rel_path': report_path,
-                'extra': {
-                    'comparison_type': comparison_type,
-                    'no_ocr_group_id': no_ocr_group_id,
-                    'ocr_group_id': ocr_group_id,
-                    'producer': payload.get('producer', 'auto_split'),
-                    'total_comparisons': overall_stats['total_comparisons'],
-                    'successful_comparisons': overall_stats['successful_comparisons'],
-                    'differences_found': overall_stats['differences_found'],
-                    'accuracy_percentage': comparison_report['summary']['accuracy_percentage']
-                },
-                'status': 'completed'
-            }).execute()
-            
-            logger.info(f"Comparison analysis completed for file {file_id}: {overall_stats['successful_comparisons']} comparisons, {overall_stats['differences_found']} differences found")
-            
-            # Trigger VLM processing after comparison completes (if enabled)
-            self._trigger_vlm_after_comparison(file_id, payload)
-            
-            return {
-                'artefact_id': artefact_id,
-                'comparisons_completed': overall_stats['successful_comparisons'],
-                'differences_found': overall_stats['differences_found'],
-                'accuracy_percentage': comparison_report['summary']['accuracy_percentage']
-            }
-            
-        except Exception as e:
-            logger.error(f"Failed to create comparison analysis for file {file_id}: {e}")
-            raise
-
-    def _compare_json_content(self, bucket: str, no_ocr_path: str, ocr_path: str, 
-                             temp_dir: str, part_number: int) -> Dict[str, Any]:
-        """Compare JSON content using jq and diff as suggested in web search results."""
-        import subprocess
-        import os
-        from pathlib import Path
-        
-        try:
-            # Download both JSON files
-            no_ocr_data = self.storage.download_file(bucket, no_ocr_path)
-            ocr_data = self.storage.download_file(bucket, ocr_path)
-            
-            # Save to temp files
-            no_ocr_file = Path(temp_dir) / f'no_ocr_part_{part_number}.json'
-            ocr_file = Path(temp_dir) / f'ocr_part_{part_number}.json'
-            
-            with open(no_ocr_file, 'wb') as f:
-                f.write(no_ocr_data)
-            with open(ocr_file, 'wb') as f:
-                f.write(ocr_data)
-            
-            # Use jq to sort and format both files for comparison (as suggested in web search results)
-            sorted_no_ocr = Path(temp_dir) / f'sorted_no_ocr_part_{part_number}.json'
-            sorted_ocr = Path(temp_dir) / f'sorted_ocr_part_{part_number}.json'
-            
-            # Sort both files using jq
-            subprocess.run(['jq', '--sort-keys', '.', str(no_ocr_file)], 
-                          stdout=open(sorted_no_ocr, 'w'), stderr=subprocess.DEVNULL, check=True)
-            subprocess.run(['jq', '--sort-keys', '.', str(ocr_file)], 
-                          stdout=open(sorted_ocr, 'w'), stderr=subprocess.DEVNULL, check=True)
-            
-            # Compare using diff
-            diff_output = Path(temp_dir) / f'diff_part_{part_number}.txt'
-            diff_result = subprocess.run(
-                ['diff', '-u', str(sorted_no_ocr), str(sorted_ocr)],
-                stdout=open(diff_output, 'w'),
-                stderr=subprocess.DEVNULL,
-                text=True
-            )
-            
-            # Read diff output
-            with open(diff_output, 'r') as f:
-                diff_content = f.read()
-            
-            # Analyze differences
-            has_differences = diff_result.returncode != 0
-            diff_lines = len([l for l in diff_content.split('\n') if l.startswith(('+', '-')) and not l.startswith(('+++', '---'))])
-            
-            return {
-                'part_number': part_number,
-                'has_differences': has_differences,
-                'diff_lines_count': diff_lines,
-                'diff_content_preview': diff_content[:1000] if diff_content else '',  # First 1000 chars
-                'no_ocr_size': len(no_ocr_data),
-                'ocr_size': len(ocr_data),
-                'size_difference': abs(len(ocr_data) - len(no_ocr_data))
-            }
-            
-        except subprocess.CalledProcessError as e:
-            logger.warning(f"jq/diff command failed for part {part_number}: {e}")
-            return {
-                'part_number': part_number,
-                'has_differences': True,
-                'error': f"Comparison tools failed: {str(e)}",
-                'diff_lines_count': -1
-            }
-        except Exception as e:
-            logger.warning(f"JSON comparison failed for part {part_number}: {e}")
-            return {
-                'part_number': part_number,
-                'has_differences': True,
-                'error': f"Comparison failed: {str(e)}",
-                'diff_lines_count': -1
-            }
-
-    def process_vlm_section_page_bundle_task(self, task: QueueTask) -> Dict[str, Any]:
-        """Process VLM section page bundle task - create individual page bundles and combine them."""
-        file_id = task.file_id
-        payload = task.payload
-        
-        logger.info(f"Processing VLM section page bundle task for file {file_id}")
-        
-        try:
-            section_idx = payload.get('section_idx')
-            start_page = payload.get('start_page')
-            end_page = payload.get('end_page')
-            section_title = payload.get('section_title', f'Section {section_idx}')
-            vlm_group_id = payload.get('vlm_group_id')
-            vlm_model = payload.get('vlm_model', 'smoldocling')
-            base_config = payload.get('base_config', {})
-            total_sections = payload.get('total_sections', 1)
-            
-            client = SupabaseServiceRoleClient()
-            
-            # Get file info
-            fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
-            if not fr.data:
-                raise ValueError(f"File {file_id} not found")
-                
-            file_row = fr.data
-            bucket = file_row['bucket']
-            cabinet_id = file_row['cabinet_id']
-            
-            # Find processing path (prefer converted PDF)
-            arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
-            pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None)
-            processing_path = pdf_art['rel_path'] if pdf_art else file_row['path']
-            processing_mime = 'application/pdf'
-            
-            logger.info(f"VLM section bundle: processing section {section_idx} '{section_title}' pages {start_page}-{end_page} for file {file_id}")
-            
-            # Create individual page processing tasks
-            page_task_ids = []
-            for page_num in range(start_page, end_page + 1):
-                try:
-                    page_config = {
-                        **base_config,
-                        'do_ocr': False,
-                        'force_ocr': False,
-                        'pipeline': 'vlm',
-                        'vlm_pipeline_model': vlm_model,
-                        'page_range': [page_num, page_num],
-                        'target_type': 'zip',
-                        'image_export_mode': 'referenced',
-                        # Add required VLM parameters that may be missing
-                        'do_picture_classification': False,
-                        'do_picture_description': False
-                    }
-                    
-                    logger.debug(f"VLM page {page_num} config: pipeline={page_config.get('pipeline')}, model={page_config.get('vlm_pipeline_model')}, range={page_config.get('page_range')}")
-                    
-                    from modules.queue_system import enqueue_docling_task, TaskPriority
-                    
-                    page_task_id = enqueue_docling_task(
-                        file_id=file_id,
-                        task_type='canonical_docling_json',
-                        payload={
-                            'bucket': bucket,
-                            'file_path': processing_path,
-                            'cabinet_id': cabinet_id,
-                            'mime_type': processing_mime,
-                            'config': page_config,
-                            'artefact_extra': {
-                                'is_subdoc': True,
-                                'page_range': [page_num, page_num],
-                                'label': f'{section_title} - Page {page_num}',
-                                'vlm_section_idx': section_idx,
-                                'vlm_section_title': section_title,
-                                'vlm_page_number': page_num,
-                                'vlm_section_start': start_page,
-                                'vlm_section_end': end_page,
-                                'producer': 'auto_split_vlm_page'
-                            }
-                        },
-                        priority=TaskPriority.NORMAL,
-                        timeout=1800
-                    )
-                    
-                    page_task_ids.append((page_num, page_task_id))
-                    logger.debug(f"Enqueued VLM page task {page_task_id} for page {page_num} of section {section_idx}")
-                    
-                except Exception as page_e:
-                    logger.warning(f"Failed to enqueue VLM page {page_num} for section {section_idx} file {file_id}: {page_e}")
-                    continue
-            
-            if not page_task_ids:
-                raise ValueError(f"No page tasks could be enqueued for section {section_idx}")
-            
-            # Wait for all page tasks to complete and then create section bundle
-            logger.info(f"Enqueued {len(page_task_ids)} VLM page tasks for section {section_idx}, now waiting for completion...")
-            
-            # Create a follow-up task to bundle the completed page results
-            from modules.queue_system import enqueue_docling_task, TaskPriority
-            import time
-            
-            # Wait a bit for page tasks to start, then create bundle task
-            time.sleep(10)
-            
-            bundle_task_id = enqueue_docling_task(
-                file_id=file_id,
-                task_type='vlm_section_bundle_collector',
-                payload={
-                    'section_idx': section_idx,
-                    'start_page': start_page,
-                    'end_page': end_page,
-                    'section_title': section_title,
-                    'vlm_group_id': vlm_group_id,
-                    'vlm_model': vlm_model,
-                    'total_sections': total_sections,
-                    'producer': 'auto_split',
-                    'page_task_ids': [tid for _, tid in page_task_ids],
-                    'expected_pages': list(range(start_page, end_page + 1))
-                },
-                priority=TaskPriority.LOW,  # Run after page tasks
-                timeout=3600
-            )
-            
-            logger.info(f"Created VLM section bundle collector task {bundle_task_id} for section {section_idx}")
-            
-            return {
-                'section_idx': section_idx,
-                'page_tasks_created': len(page_task_ids),
-                'bundle_task_id': bundle_task_id,
-                'pages_range': f"{start_page}-{end_page}"
-            }
-            
-        except Exception as e:
-            logger.error(f"VLM section page bundle task failed for file {file_id}: {e}")
-            raise
-
-    def process_vlm_section_bundle_collector_task(self, task: QueueTask) -> Dict[str, Any]:
-        """Collect completed VLM page results and create section-level bundle manifest."""
-        file_id = task.file_id
-        payload = task.payload
-        
-        logger.info(f"Processing VLM section bundle collector for file {file_id}")
-        
-        try:
-            section_idx = payload.get('section_idx')
-            start_page = payload.get('start_page')
-            end_page = payload.get('end_page')
-            section_title = payload.get('section_title', f'Section {section_idx}')
-            vlm_group_id = payload.get('vlm_group_id')
-            vlm_model = payload.get('vlm_model', 'smoldocling')
-            total_sections = payload.get('total_sections', 1)
-            expected_pages = payload.get('expected_pages', [])
-            
-            client = SupabaseServiceRoleClient()
-            
-            # Get file info
-            fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
-            if not fr.data:
-                raise ValueError(f"File {file_id} not found")
-                
-            file_row = fr.data
-            bucket = file_row['bucket']
-            cabinet_id = file_row['cabinet_id']
-            
-            # Find all completed VLM page artefacts for this section
-            artefacts = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute()
-            arts = artefacts.data or []
-            
-            # Filter for this section's VLM page artefacts
-            section_page_arts = []
-            for art in arts:
-                extra = art.get('extra', {})
-                if (extra.get('vlm_section_idx') == section_idx and 
-                    extra.get('producer') == 'auto_split_vlm_page' and
-                    art.get('type') == 'docling_vlm' and
-                    art.get('status') == 'completed'):
-                    section_page_arts.append(art)
-            
-            # Check if we have all expected pages
-            found_pages = [art.get('extra', {}).get('vlm_page_number') for art in section_page_arts]
-            found_pages = [p for p in found_pages if p is not None]
-            missing_pages = [p for p in expected_pages if p not in found_pages]
-            
-            logger.info(f"VLM section {section_idx} bundle collector: found {len(section_page_arts)} page artefacts, expected {len(expected_pages)} pages")
-            
-            if logger.isEnabledFor(10):  # DEBUG level
-                found_pages_debug = [art.get('extra', {}).get('vlm_page_number') for art in section_page_arts]
-                logger.debug(f"VLM section {section_idx}: found pages {found_pages_debug}, expected pages {expected_pages}")
-            
-            if missing_pages:
-                # Not all pages are ready, retry later
-                logger.info(f"VLM section {section_idx} bundle collector: missing pages {missing_pages}, found pages {found_pages} - will retry later")
-                raise ValueError(f"VLM section {section_idx} missing pages: {missing_pages} (found: {found_pages}) - will retry")
-                
-            # Sort page artefacts by page number
-            section_page_arts.sort(key=lambda x: x.get('extra', {}).get('vlm_page_number', 0))
-            
-            logger.info(f"VLM section {section_idx} bundle: creating manifest for {len(section_page_arts)} pages")
-            
-            # Create section bundle manifest
-            section_artefact_id = str(uuid.uuid4())
-            section_manifest_path = f"{cabinet_id}/{file_id}/{section_artefact_id}/vlm_section_{section_idx}_manifest.json"
-            
-            page_bundles = []
-            for page_art in section_page_arts:
-                extra = page_art.get('extra', {})
-                page_num = extra.get('vlm_page_number')
-                page_manifest_path = extra.get('manifest')
-                
-                page_bundles.append({
-                    'page_number': page_num,
-                    'artefact_id': page_art['id'],
-                    'manifest_path': page_manifest_path,
-                    'rel_path': page_art['rel_path'],
-                    'label': extra.get('label', f'Page {page_num}')
-                })
-            
-            section_manifest = {
-                'file_id': file_id,
-                'section_idx': section_idx,
-                'section_title': section_title,
-                'start_page': start_page,
-                'end_page': end_page,
-                'vlm_model': vlm_model,
-                'total_pages': len(page_bundles),
-                'page_bundles': page_bundles,
-                'created_at': 'now()',
-                'type': 'vlm_section_page_bundle'
-            }
-            
-            # Store section manifest
-            import json
-            manifest_json = json.dumps(section_manifest, ensure_ascii=False, indent=2)
-            self.storage.upload_file(bucket, section_manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True)
-            
-            # Create section bundle artefact
-            client.supabase.table('document_artefacts').insert({
-                'id': section_artefact_id,
-                'file_id': file_id,
-                'type': 'vlm_section_page_bundle',
-                'rel_path': section_manifest_path,
-                'extra': {
-                    'section_idx': section_idx,
-                    'section_title': section_title,
-                    'start_page': start_page,
-                    'end_page': end_page,
-                    'vlm_model': vlm_model,
-                    'total_pages': len(page_bundles),
-                    'group_id': vlm_group_id,
-                    'split_order': section_idx,
-                    'split_heading': section_title,
-                    'split_total': total_sections,
-                    'pipeline': 'vlm',
-                    'producer': 'auto_split',
-                    'group_pack_type': 'vlm_page_bundle_auto_split'
-                },
-                'status': 'completed'
-            }).execute()
-            
-            logger.info(f"VLM section bundle collector completed for section {section_idx} of file {file_id}: created manifest with {len(page_bundles)} page bundles")
-            
-            return {
-                'section_artefact_id': section_artefact_id,
-                'section_idx': section_idx,
-                'pages_bundled': len(page_bundles),
-                'manifest_path': section_manifest_path
-            }
-            
-        except Exception as e:
-            logger.error(f"VLM section bundle collector failed for file {file_id}: {e}")
-            raise
-
-    def _trigger_vlm_after_comparison(self, file_id: str, comparison_payload: Dict[str, Any]):
-        """Trigger VLM processing after comparison analysis completes."""
-        try:
-            # Check if VLM should be triggered
-            if not comparison_payload.get('trigger_vlm_after_comparison'):
-                logger.debug(f"VLM post-comparison trigger not enabled for file {file_id}")
-                return
-                
-            vlm_config = comparison_payload.get('vlm_config', {})
-            if not vlm_config.get('enabled'):
-                logger.debug(f"VLM not enabled for file {file_id}")
-                return
-                
-            logger.info(f"[auto-canonical] Triggering VLM processing after comparison for file {file_id}")
-            
-            # Extract VLM configuration
-            split_by_page = vlm_config.get('split_by_page', False)
-            vlm_model = vlm_config.get('model', 'smoldocling')
-            threshold = vlm_config.get('threshold', 50)
-            base_config = vlm_config.get('base_config', {})
-            
-            # Generate new group_id for VLM processing
-            import uuid
-            vlm_group_id = str(uuid.uuid4())
-            
-            if split_by_page:
-                # Page-by-page processing within sections
-                logger.info(f"[auto-canonical] vlm page-by-page processing for file {file_id} (post-comparison)")
-                self._enqueue_vlm_page_processing(
-                    file_id, threshold, vlm_group_id, vlm_model, base_config
-                )
-            else:
-                # Standard section-level VLM processing
-                from routers.database.files.files import enqueue_canonical_docling
-                
-                body_vlm = {
-                    'use_split_map': True,
-                    'threshold': threshold,
-                    'producer': 'auto_split',
-                    'group_id': vlm_group_id,
-                    'config': {
-                        **base_config,
-                        'do_ocr': False,  # VLM doesn't need OCR
-                        'force_ocr': False,
-                        'pipeline': 'vlm',
-                        'vlm_pipeline_model': vlm_model
-                    }
-                }
-                logger.info(f"[auto-canonical] vlm section batch group_id={vlm_group_id} for file {file_id} (post-comparison)")
-                enqueue_canonical_docling(file_id=file_id, body=body_vlm)
-                
-        except Exception as e:
-            logger.warning(f"Failed to trigger VLM processing after comparison for file {file_id}: {e}")
-
-    def process_docling_bundle_task(self, task: QueueTask) -> Dict[str, Any]:
-        """
-        Process single docling bundle task (whole document processing).
-        
-        This creates a coherent single bundle with all formats using direct processing.
-        NO temporary tasks or old logic reuse - this is the new architecture.
-        """
-        file_id = task.file_id
-        payload = task.payload
-        
-        logger.info(f"🎯 NEW ARCHITECTURE: Processing docling bundle task for file {file_id} (whole document)")
-        
-        try:
-            # Extract bundle configuration
-            config = payload.get('config', {})
-            bundle_metadata = payload.get('bundle_metadata', {})
-            
-            # Ensure bundle processing configuration
-            config['target_type'] = 'zip'
-            config['to_formats'] = ['json', 'html', 'text', 'md', 'doctags']
-            
-            # Call the actual docling processing directly - NO temp tasks!
-            result = self._process_docling_bundle_direct(task, config, bundle_metadata)
-            
-            logger.info(f"✅ NEW ARCHITECTURE: Successfully processed docling bundle for file {file_id}")
-            return result
-            
-        except Exception as e:
-            logger.error(f"❌ NEW ARCHITECTURE: Docling bundle processing failed for file {file_id}: {e}")
-            raise
-
-    def _process_docling_bundle_direct(self, task: QueueTask, config: Dict[str, Any], bundle_metadata: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Direct docling bundle processing - NEW ARCHITECTURE approach.
-        
-        This processes the docling request directly without creating temporary tasks,
-        ensuring clean Redis state and proper bundle metadata handling.
-        """
-        file_id = task.file_id
-        payload = task.payload
-        
-        logger.info(f"🔧 DIRECT PROCESSING: Starting docling bundle processing for file {file_id}")
-        
-        if not self.docling_url:
-            raise ValueError("DOCLING_URL not configured")
-        
-        # Extract payload data
-        bucket = payload['bucket']
-        file_path = payload['file_path']
-        cabinet_id = payload['cabinet_id']
-        
-        # Download file
-        logger.debug(f"📥 DIRECT PROCESSING: Downloading file for bundle processing: {bucket}/{file_path}")
-        file_bytes = self.storage.download_file(bucket, file_path)
-        
-        # Prepare Docling request with bundle-specific config
-        docling_api_key = os.getenv('DOCLING_API_KEY')
-        headers = {'Accept': '*/*'}
-        if docling_api_key:
-            headers['X-Api-Key'] = docling_api_key
-        
-        # Build form data for bundle processing - USE CONFIG FROM PIPELINE_CONTROLLER (no hardcoded defaults!)
-        # The config passed from pipeline_controller already has environment variables loaded
-        form_data = [
-            ('target_type', 'zip'),  # Always zip for bundles
-            ('do_ocr', str(config.get('do_ocr', False)).lower()),
-            ('force_ocr', str(config.get('force_ocr', False)).lower()),
-            ('image_export_mode', 'referenced'),  # Bundle standard
-            ('ocr_engine', config.get('ocr_engine', 'easyocr')),
-            ('pdf_backend', config.get('pdf_backend', 'dlparse_v4')),
-            ('table_mode', config.get('table_mode', 'fast')),  # Use config from pipeline_controller (env vars)
-            ('table_cell_matching', str(config.get('table_cell_matching', True)).lower()),  # Use config from pipeline_controller (env: true)
-            ('pipeline', config.get('pipeline', 'standard')),
-            ('do_formula_enrichment', str(config.get('do_formula_enrichment', True)).lower()),  # Use config from pipeline_controller (env: true)
-            ('do_code_enrichment', str(config.get('do_code_enrichment', True)).lower()),  # Use config from pipeline_controller (env: true)
-            ('do_table_structure', str(config.get('do_table_structure', True)).lower()),
-            ('include_images', str(config.get('include_images', True)).lower()),
-            ('images_scale', str(config.get('images_scale', 2.0))),
-            ('do_picture_classification', str(config.get('do_picture_classification', False)).lower()),
-            ('do_picture_description', str(config.get('do_picture_description', False)).lower()),
-            ('document_timeout', str(config.get('document_timeout', task.timeout)))
-        ]
-        
-        # Handle OCR languages as array (API expects multiple form fields)
-        ocr_lang = config.get('ocr_lang')
-        if ocr_lang:
-            if isinstance(ocr_lang, list):
-                for lang in ocr_lang:
-                    form_data.append(('ocr_lang', str(lang)))
-            else:
-                form_data.append(('ocr_lang', str(ocr_lang)))
-        
-        # Handle VLM pipeline options (CRITICAL for VLM processing)
-        if config.get('vlm_pipeline_model'):
-            form_data.append(('vlm_pipeline_model', config.get('vlm_pipeline_model')))
-        
-        # VLM model local/API options must be JSON per Docling OpenAPI spec
-        if config.get('vlm_pipeline_model_local'):
-            vlm_local = config.get('vlm_pipeline_model_local')
-            if isinstance(vlm_local, (dict, list)):
-                form_data.append(('vlm_pipeline_model_local', json.dumps(vlm_local)))
-            elif isinstance(vlm_local, str) and vlm_local.strip().startswith(('{', '[')):
-                form_data.append(('vlm_pipeline_model_local', vlm_local))
-            # else: omit to avoid validation error
-        
-        if config.get('vlm_pipeline_model_api'):
-            vlm_api = config.get('vlm_pipeline_model_api')
-            if isinstance(vlm_api, (dict, list)):
-                form_data.append(('vlm_pipeline_model_api', json.dumps(vlm_api)))
-            elif isinstance(vlm_api, str) and vlm_api.strip().startswith(('{', '[')):
-                form_data.append(('vlm_pipeline_model_api', vlm_api))
-            # else: omit
-        
-        # Picture description options must be JSON per Docling OpenAPI spec
-        if config.get('picture_description_local'):
-            pic_local = config.get('picture_description_local')
-            if isinstance(pic_local, (dict, list)):
-                form_data.append(('picture_description_local', json.dumps(pic_local)))
-            elif isinstance(pic_local, str) and pic_local.strip().startswith(('{', '[')):
-                form_data.append(('picture_description_local', pic_local))
-        
-        if config.get('picture_description_api'):
-            pic_api = config.get('picture_description_api')
-            if isinstance(pic_api, (dict, list)):
-                form_data.append(('picture_description_api', json.dumps(pic_api)))
-            elif isinstance(pic_api, str) and pic_api.strip().startswith(('{', '[')):
-                form_data.append(('picture_description_api', pic_api))
-        if 'picture_description_area_threshold' in config:
-            form_data.append(('picture_description_area_threshold', str(config.get('picture_description_area_threshold'))))
-        
-        # Handle markdown page break placeholder 
-        if 'md_page_break_placeholder' in config:
-            form_data.append(('md_page_break_placeholder', config.get('md_page_break_placeholder')))
-        
-        # Add formats - always all formats for bundles
-        for fmt in ['json', 'html', 'text', 'md', 'doctags']:
-            form_data.append(('to_formats', fmt))
-        
-        # Handle page range properly - get actual PDF page count like frontmatter does
-        page_range = config.get('page_range', [1, 999999])
-        if isinstance(page_range, list) and len(page_range) >= 2:
-            def _to_int_safe(v, default):
-                try:
-                    return int(v)
-                except Exception:
-                    return default
-            start_pg = _to_int_safe(page_range[0], 1)
-            end_pg = _to_int_safe(page_range[1], 999999)
-            if start_pg < 1:
-                start_pg = 1
-            if end_pg < start_pg:
-                end_pg = start_pg
-            
-            # CRITICAL: Get actual PDF page count to prevent massive range
-            try:
-                import fitz  # PyMuPDF
-                doc = fitz.open(stream=file_bytes, filetype='pdf')
-                pc = int(doc.page_count)
-                doc.close()
-                if pc > 0:
-                    end_pg = min(end_pg, pc)  # Clamp to actual page count!
-                    start_pg = max(1, min(start_pg, pc))
-                    if end_pg < start_pg:
-                        end_pg = start_pg
-                logger.info(f"📄 DIRECT PROCESSING: PDF has {pc} pages, using range {start_pg}-{end_pg}")
-            except Exception as e:
-                logger.warning(f"Could not determine PDF page count: {e}, using defaults")
-                
-            form_data.append(('page_range', str(start_pg)))
-            form_data.append(('page_range', str(end_pg)))
-        else:
-            # Fallback to single page if no range specified
-            form_data.append(('page_range', '1'))
-            form_data.append(('page_range', '1'))
-        
-        files = [('files', ('file', file_bytes, payload.get('mime_type', 'application/pdf')))]
-        
-        # DEBUG: Log the actual config being sent to Docling
-        config_debug = {key: value for key, value in form_data if key in ['table_mode', 'table_cell_matching', 'do_formula_enrichment', 'do_code_enrichment', 'do_ocr', 'pipeline']}
-        logger.info(f"🔧 DIRECT PROCESSING: Docling config being sent: {config_debug}")
-        
-        # Make the HTTP request
-        logger.info(f"🌐 DIRECT PROCESSING: Making HTTP request to Docling for file {file_id}")
-        try:
-            import time
-            start_time = time.time()
-            
-            response = requests.post(
-                f"{self.docling_url.rstrip('/')}/v1/convert/file",
-                files=files,
-                data=form_data,
-                headers=headers,
-                timeout=task.timeout
-            )
-            response.raise_for_status()
-            
-            elapsed = time.time() - start_time
-            logger.info(f"⚡ DIRECT PROCESSING: Docling request completed in {elapsed:.2f}s for file {file_id}")
-            
-        except Exception as e:
-            logger.error(f"🌐 DIRECT PROCESSING: HTTP request failed for file {file_id}: {e}")
-            raise
-        
-        # Process response - should be ZIP for bundle
-        content_type = (response.headers.get('Content-Type') or '').lower()
-        is_zip_resp = ('zip' in content_type) or (response.content[:2] == b'PK')
-        
-        if not is_zip_resp:
-            raise ValueError(f"Expected ZIP response for bundle, got: {content_type}")
-        
-        # Process ZIP bundle and create artefacts
-        logger.info(f"📦 DIRECT PROCESSING: Processing ZIP bundle for file {file_id}")
-        result = self._process_docling_zip_bundle(
-            file_id=file_id,
-            bucket=bucket,
-            cabinet_id=cabinet_id,
-            zip_content=response.content,
-            bundle_metadata=bundle_metadata,
-            task_config=config
-        )
-        
-        logger.info(f"✅ DIRECT PROCESSING: Bundle processing completed for file {file_id}")
-        return result
-
-    def _create_bundle_display_metadata(self, bundle_type: str, title: str, index: int = None, 
-                                       total: int = None, page_range: list = None) -> dict:
-        """
-        Create consistent display metadata for bundle organization.
-        
-        This ensures all bundles have proper titles, ordering, and display names
-        for frontend organization and user-friendly presentation.
-        """
-        metadata = {
-            'title': title,
-            'bundle_type': bundle_type
-        }
-        
-        if index is not None:
-            metadata['split_order'] = index
-            
-        if total is not None:
-            metadata['split_total'] = total
-            
-        if page_range:
-            metadata['page_range'] = page_range
-            metadata['page_count'] = page_range[1] - page_range[0] + 1
-            
-        # Create display names based on bundle type
-        if bundle_type == 'page':
-            metadata['display_name'] = f"Page {page_range[0]}" if page_range else f"Page {index}"
-            metadata['bundle_label'] = f"Page {page_range[0]} Bundle"
-            metadata['sort_key'] = page_range[0] if page_range else index
-        elif bundle_type == 'section':
-            page_str = f" (p{page_range[0]}-{page_range[1]})" if page_range else ""
-            metadata['display_name'] = f"{index:02d}. {title}{page_str}"
-            metadata['bundle_label'] = f"{title} Bundle"
-            metadata['sort_key'] = index
-        elif bundle_type == 'chunk':
-            page_str = f" (p{page_range[0]}-{page_range[1]})" if page_range else ""
-            metadata['display_name'] = f"{index:02d}. {title}{page_str}"
-            metadata['bundle_label'] = f"{title} Bundle"
-            metadata['sort_key'] = index
-        else:
-            metadata['display_name'] = title
-            metadata['bundle_label'] = f"{title} Bundle"
-            metadata['sort_key'] = index or 0
-            
-        return metadata
-
-    def _process_docling_zip_bundle(self, file_id: str, bucket: str, cabinet_id: str, 
-                                   zip_content: bytes, bundle_metadata: Dict[str, Any], 
-                                   task_config: Dict[str, Any]) -> Dict[str, Any]:
-        """
-        Process ZIP bundle response and create artefacts with proper bundle metadata.
-        
-        This is the NEW ARCHITECTURE approach for handling docling ZIP responses.
-        """
-        import zipfile
-        import io
-        import uuid
-        import json
-        import time
-        
-        logger.info(f"📦 ZIP PROCESSING: Starting bundle extraction for file {file_id}")
-        
-        # Create bundle artefact structure
-        artefact_id = str(uuid.uuid4())
-        base_dir = f"{cabinet_id}/{file_id}/{artefact_id}"
-        archive_path = f"{base_dir}/bundle.zip"
-        
-        # Save original archive
-        self.storage.upload_file(bucket, archive_path, zip_content, 'application/zip', upsert=True)
-        
-        # Extract ZIP contents
-        zf = zipfile.ZipFile(io.BytesIO(zip_content))
-        entries = []
-        file_paths = {}
-        
-        for entry in zf.filelist:
-            if entry.is_dir():
-                continue
-                
-            entry_content = zf.read(entry)
-            entry_filename = entry.filename
-            rel_path = f"{base_dir}/{entry_filename}"
-            
-            # Determine MIME type
-            if entry_filename.endswith('.json'):
-                mime = 'application/json'
-                file_paths['json'] = rel_path
-            elif entry_filename.endswith('.html'):
-                mime = 'text/html'
-                file_paths['html'] = rel_path
-            elif entry_filename.endswith('.md'):
-                mime = 'text/markdown'
-                file_paths['md'] = rel_path
-            elif entry_filename.endswith('.txt'):
-                mime = 'text/plain'
-                file_paths['text'] = rel_path
-            elif entry_filename.endswith('.doctags'):
-                mime = 'application/json'
-                file_paths['doctags'] = rel_path
-            else:
-                mime = 'application/octet-stream'
-            
-            # Upload file
-            self.storage.upload_file(bucket, rel_path, entry_content, mime, upsert=True)
-            
-            entries.append({
-                'filename': entry_filename,
-                'rel_path': rel_path,
-                'mime_type': mime,
-                'size': len(entry_content)
-            })
-            
-            logger.debug(f"📄 ZIP PROCESSING: Extracted {entry_filename} -> {rel_path}")
-        
-        zf.close()
-        
-        # Create bundle manifest
-        manifest = {
-            'bundle_id': artefact_id,
-            'file_id': file_id,
-            'bundle_type': 'docling_bundle',
-            'processing_mode': 'whole_document',
-            'created_at': time.time(),
-            'archive_path': archive_path,
-            'entries': entries,
-            'file_paths': file_paths,
-            'metadata': bundle_metadata,
-            'config': task_config
-        }
-        
-        manifest_path = f"{base_dir}/manifest.json"
-        manifest_content = json.dumps(manifest, indent=2).encode('utf-8')
-        self.storage.upload_file(bucket, manifest_path, manifest_content, 'application/json', upsert=True)
-        
-        # Create database artefact with bundle metadata
-        artefact_extra = {
-            **bundle_metadata,
-            'manifest': manifest_path,
-            'archive_path': archive_path,
-            'file_paths': file_paths,
-            'entry_count': len(entries),
-            'group_pack_type': 'whole'  # Add proper pack type for whole document bundles
-        }
-        
-        self.client.supabase.table('document_artefacts').insert({
-            'id': artefact_id,
-            'file_id': file_id,
-            'page_number': 0,  # Whole document
-            'type': 'docling_bundle',
-            'rel_path': base_dir,
-            'size_tag': json.dumps(task_config),
-            'language': 'en',
-            'chunk_index': None,
-            'extra': artefact_extra
-        }).execute()
-        
-        logger.info(f"✅ ZIP PROCESSING: Created bundle artefact {artefact_id} with {len(entries)} files for file {file_id}")
-        
-        return {
-            'artefact_id': artefact_id,
-            'rel_path': base_dir,
-            'manifest_path': manifest_path,
-            'archive_path': archive_path,
-            'file_paths': file_paths,
-            'entry_count': len(entries),
-            'bundle_metadata': bundle_metadata
-        }
-
-    def process_docling_bundle_split_task(self, task: QueueTask) -> Dict[str, Any]:
-        """
-        Process split docling bundle task (multi-unit processing).
-        
-        This creates multiple sub-bundles and a master manifest based on processing mode.
-        """
-        file_id = task.file_id
-        payload = task.payload
-        
-        logger.info(f"Processing docling bundle split task for file {file_id}")
-        
-        try:
-            processing_mode = payload.get('processing_mode', 'split_by_sections')
-            processing_data = payload.get('processing_data', {})
-            config = payload.get('config', {})
-            bundle_metadata = payload.get('bundle_metadata', {})
-            
-            logger.info(f"Split bundle processing mode: {processing_mode}")
-            
-            if processing_mode == 'split_by_pages':
-                return self._process_split_by_pages(task, processing_data, config, bundle_metadata)
-            elif processing_mode == 'split_by_sections':
-                return self._process_split_by_sections(task, processing_data, config, bundle_metadata)  
-            elif processing_mode == 'split_by_chunks':
-                return self._process_split_by_chunks(task, processing_data, config, bundle_metadata)
-            else:
-                raise ValueError(f"Unknown processing mode: {processing_mode}")
-                
-        except Exception as e:
-            logger.error(f"Docling bundle split processing failed for file {file_id}: {e}")
-            raise
-
-    def _process_split_by_pages(self, task: QueueTask, processing_data: dict, 
-                               config: dict, bundle_metadata: dict) -> Dict[str, Any]:
-        """Process document by individual pages and create page bundles."""
-        file_id = task.file_id
-        payload = task.payload
-        bucket = payload['bucket']
-        file_path = payload['file_path']
-        cabinet_id = payload['cabinet_id']
-        mime_type = payload['mime_type']
-        
-        pages = processing_data.get('pages', [])
-        logger.info(f"Processing {len(pages)} individual pages for file {file_id}")
-        
-        # Create master bundle directory
-        master_bundle_id = str(uuid.uuid4())
-        master_dir = f"{cabinet_id}/{file_id}/{master_bundle_id}"
-        page_bundles = []
-        
-        # Process each page as a separate bundle
-        for idx, page_num in enumerate(pages, 1):
-            try:
-                page_config = {
-                    **config,
-                    'page_range': [page_num, page_num],
-                    'target_type': 'zip',
-                    'to_formats': ['json', 'html', 'text', 'md', 'doctags']
-                }
-                
-                # Create descriptive page title and enhanced metadata
-                page_title = f"Page {page_num}"
-                page_display_name = f"Page {page_num}"
-                
-                # Create individual page task with enhanced labeling
-                page_task = QueueTask(
-                    id=f"{task.id}_page_{page_num}",
-                    file_id=file_id,
-                    service=task.service,
-                    task_type='canonical_docling_json',
-                    payload={
-                        **payload,
-                        'config': page_config,
-                        'artefact_extra': {
-                            'page_number': page_num,
-                            'page_title': page_title,
-                            'display_name': page_display_name,
-                            'split_order': idx,  # Sequential order within this bundle
-                            'split_total': len(pages),
-                            'split_heading': page_title,
-                            'section_title': page_title,  # For consistency
-                            'is_page_bundle': True,
-                            'master_bundle_id': master_bundle_id,
-                            'bundle_label': f"Page {page_num} Bundle",
-                            **bundle_metadata
-                        }
-                    },
-                    priority=task.priority,
-                    timeout=1800,
-                    created_at=task.created_at
-                )
-                
-                # Process page bundle
-                page_result = self._process_docling_task(page_task)
-                page_bundles.append({
-                    'page_number': page_num,
-                    'page_title': page_title,
-                    'display_name': page_display_name,
-                    'split_order': idx,
-                    'artefact_id': page_result.get('artefact_id'),
-                    'rel_path': page_result.get('rel_path')
-                })
-                
-            except Exception as e:
-                logger.warning(f"Failed to process page {page_num} for file {file_id}: {e}")
-                continue
-        
-        # Sort page bundles by page number for consistent ordering
-        page_bundles.sort(key=lambda x: x['page_number'])
-        
-        # Create enhanced master manifest with proper organization metadata
-        master_manifest = {
-            'file_id': file_id,
-            'bundle_type': 'docling_bundle_split',
-            'split_mode': 'split_by_pages',
-            'total_pages': len(pages),
-            'successful_pages': len(page_bundles),
-            'page_bundles': page_bundles,
-            'created_at': 'now()',
-            'display_name': f"Document Pages ({len(page_bundles)} pages)",
-            'organization': {
-                'type': 'pages',
-                'sort_field': 'page_number',
-                'sort_order': 'asc',
-                'grouping': 'individual_pages'
-            },
-            **bundle_metadata
-        }
-        
-        # Store master manifest
-        manifest_path = f"{master_dir}/master_manifest.json"
-        manifest_json = json.dumps(master_manifest, ensure_ascii=False, indent=2)
-        self.storage.upload_file(bucket, manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True)
-        
-        # Create master bundle artefact
-        self.client.supabase.table('document_artefacts').insert({
-            'id': master_bundle_id,
-            'file_id': file_id,
-            'type': 'docling_bundle_split_pages',
-            'rel_path': master_dir,
-            'extra': {
-                'manifest': manifest_path,
-                'split_mode': 'split_by_pages',
-                'total_pages': len(pages),
-                'successful_pages': len(page_bundles),
-                'group_pack_type': 'split_pages',  # Add proper pack type for split page bundles
-                **bundle_metadata
-            },
-            'status': 'completed'
-        }).execute()
-        
-        logger.info(f"Created page-based split bundle for file {file_id}: {len(page_bundles)} pages")
-        return {
-            'master_bundle_id': master_bundle_id,
-            'pages_processed': len(page_bundles),
-            'total_pages': len(pages)
-        }
-
-    def _process_split_by_sections(self, task: QueueTask, processing_data: dict,
-                                  config: dict, bundle_metadata: dict) -> Dict[str, Any]:
-        """Process document by sections and create section bundles."""
-        file_id = task.file_id
-        payload = task.payload
-        bucket = payload['bucket']
-        cabinet_id = payload['cabinet_id']
-        
-        entries = processing_data.get('entries', [])
-        logger.info(f"Processing {len(entries)} sections for file {file_id}")
-        
-        # Create master bundle directory
-        master_bundle_id = str(uuid.uuid4())
-        master_dir = f"{cabinet_id}/{file_id}/{master_bundle_id}"
-        section_bundles = []
-        
-        # Process each section as a separate bundle  
-        logger.info(f"Processing {len(entries)} sections for file {file_id}")
-        for i, entry in enumerate(entries, 1):
-            try:
-                start_page = entry.get('start_page', 1)
-                end_page = entry.get('end_page', start_page)
-                # Enhanced section title handling with fallbacks and smart naming
-                raw_title = entry.get('title') or entry.get('label') or entry.get('heading')
-                section_title = raw_title.strip() if raw_title else f'Section {i}'
-                
-                # Create enhanced display names for better organization
-                page_range_str = f"p{start_page}" if start_page == end_page else f"p{start_page}-{end_page}"
-                display_name = f"{i:02d}. {section_title}" if raw_title else f"{i:02d}. Section {i} ({page_range_str})"
-                bundle_label = f"{section_title} Bundle"
-                
-                # Validate page ranges
-                if start_page < 1:
-                    raise ValueError(f"Invalid start_page: {start_page} (must be >= 1)")
-                if end_page < start_page:
-                    raise ValueError(f"Invalid page range: {start_page}-{end_page} (end < start)")
-                if start_page > 999 or end_page > 999:
-                    raise ValueError(f"Suspicious page range: {start_page}-{end_page} (too high, possible corruption)")
-                
-                logger.info(f"Processing section {i}/{len(entries)}: '{display_name}' (pages {start_page}-{end_page})")
-                
-                section_config = {
-                    **config,
-                    'page_range': [start_page, end_page],
-                    'target_type': 'zip',
-                    'to_formats': ['json', 'html', 'text', 'md', 'doctags']
-                }
-                
-                # Create section task with enhanced metadata and labeling
-                section_task = QueueTask(
-                    id=f"{task.id}_section_{i}",
-                    file_id=file_id,
-                    service=task.service,
-                    task_type='canonical_docling_json',
-                    payload={
-                        **payload,
-                        'config': section_config,
-                        'artefact_extra': {
-                            'section_number': i,
-                            'section_title': section_title,
-                            'display_name': display_name,
-                            'bundle_label': bundle_label,
-                            'start_page': start_page,
-                            'end_page': end_page,
-                            'page_range': [start_page, end_page],
-                            'page_count': end_page - start_page + 1,
-                            'split_order': i,  # Preserved ordering from split map
-                            'split_total': len(entries),
-                            'split_heading': section_title,
-                            'is_section_bundle': True,
-                            'master_bundle_id': master_bundle_id,
-                            **bundle_metadata
-                        }
-                    },
-                    priority=task.priority,
-                    timeout=3600,
-                    created_at=task.created_at
-                )
-                
-                # Process section bundle
-                section_result = self._process_docling_task(section_task)
-                section_bundles.append({
-                    'section_number': i,
-                    'section_title': section_title,
-                    'display_name': display_name,
-                    'bundle_label': bundle_label,
-                    'page_range': [start_page, end_page],
-                    'page_count': end_page - start_page + 1,
-                    'split_order': i,
-                    'artefact_id': section_result.get('artefact_id'),
-                    'rel_path': section_result.get('rel_path')
-                })
-                
-            except Exception as e:
-                logger.error(f"FATAL: Failed to process section {i} for file {file_id}: {e}")
-                logger.error(f"Section details: title='{section_title}', pages={start_page}-{end_page}")
-                # Don't continue - fail the entire task if any section fails
-                raise Exception(f"Section processing failed for section {i} ('{section_title}', pages {start_page}-{end_page}): {e}")
-        
-        # Sort section bundles by split_order for consistent ordering
-        section_bundles.sort(key=lambda x: x['split_order'])
-        
-        # Create enhanced master manifest with proper organization metadata
-        master_manifest = {
-            'file_id': file_id,
-            'bundle_type': 'docling_bundle_split',
-            'split_mode': 'split_by_sections',
-            'total_sections': len(entries),
-            'successful_sections': len(section_bundles),
-            'section_bundles': section_bundles,
-            'created_at': 'now()',
-            'display_name': f"Document Sections ({len(section_bundles)} sections)",
-            'organization': {
-                'type': 'sections',
-                'sort_field': 'split_order',
-                'sort_order': 'asc',
-                'grouping': 'split_map_sections',
-                'has_titles': True,
-                'ordering_preserved': True
-            },
-            **bundle_metadata
-        }
-        
-        # Store master manifest
-        manifest_path = f"{master_dir}/master_manifest.json"
-        manifest_json = json.dumps(master_manifest, ensure_ascii=False, indent=2)
-        self.storage.upload_file(bucket, manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True)
-        
-        # Create master bundle artefact
-        self.client.supabase.table('document_artefacts').insert({
-            'id': master_bundle_id,
-            'file_id': file_id,
-            'type': 'docling_bundle_split_sections',
-            'rel_path': master_dir,
-            'extra': {
-                'manifest': manifest_path,
-                'split_mode': 'split_by_sections',
-                'total_sections': len(entries),
-                'successful_sections': len(section_bundles),
-                'group_pack_type': 'split_sections',  # Add proper pack type for split section bundles
-                **bundle_metadata
-            },
-            'status': 'completed'
-        }).execute()
-        
-        logger.info(f"Created section-based split bundle for file {file_id}: {len(section_bundles)} sections")
-        return {
-            'master_bundle_id': master_bundle_id,
-            'sections_processed': len(section_bundles),
-            'total_sections': len(entries)
-        }
-
-    def _process_split_by_chunks(self, task: QueueTask, processing_data: dict,
-                                config: dict, bundle_metadata: dict) -> Dict[str, Any]:
-        """Process document by chunks and create chunk bundles."""
-        # Very similar to _process_split_by_sections but with chunk-specific labeling
-        file_id = task.file_id
-        payload = task.payload
-        bucket = payload['bucket']
-        cabinet_id = payload['cabinet_id']
-        
-        chunks = processing_data.get('entries', [])
-        logger.info(f"Processing {len(chunks)} chunks for file {file_id}")
-        
-        # Create master bundle directory
-        master_bundle_id = str(uuid.uuid4())
-        master_dir = f"{cabinet_id}/{file_id}/{master_bundle_id}"
-        chunk_bundles = []
-        
-        # Process each chunk as a separate bundle
-        for i, chunk in enumerate(chunks, 1):
-            try:
-                start_page = chunk['start']
-                end_page = chunk['end']
-                # Enhanced chunk title handling
-                raw_title = chunk.get('title', f'Chunk {i}')
-                chunk_title = raw_title.strip() if raw_title else f'Chunk {i}'
-                
-                # Create enhanced display names for chunks
-                page_range_str = f"p{start_page}" if start_page == end_page else f"p{start_page}-{end_page}"
-                display_name = f"{i:02d}. {chunk_title} ({page_range_str})"
-                bundle_label = f"{chunk_title} Bundle"
-                
-                chunk_config = {
-                    **config,
-                    'page_range': [start_page, end_page],
-                    'target_type': 'zip',
-                    'to_formats': ['json', 'html', 'text', 'md', 'doctags']
-                }
-                
-                # Create chunk task with enhanced labeling
-                chunk_task = QueueTask(
-                    id=f"{task.id}_chunk_{i}",
-                    file_id=file_id,
-                    service=task.service,
-                    task_type='canonical_docling_json',
-                    payload={
-                        **payload,
-                        'config': chunk_config,
-                        'artefact_extra': {
-                            'chunk_number': i,
-                            'chunk_title': chunk_title,
-                            'display_name': display_name,
-                            'bundle_label': bundle_label,
-                            'start_page': start_page,
-                            'end_page': end_page,
-                            'page_range': [start_page, end_page],
-                            'page_count': end_page - start_page + 1,
-                            'split_order': i,
-                            'split_total': len(chunks),
-                            'split_heading': chunk_title,
-                            'is_chunk_bundle': True,
-                            'master_bundle_id': master_bundle_id,
-                            **bundle_metadata
-                        }
-                    },
-                    priority=task.priority,
-                    timeout=3600,
-                    created_at=task.created_at
-                )
-                
-                # Process chunk bundle
-                chunk_result = self._process_docling_task(chunk_task)
-                chunk_bundles.append({
-                    'chunk_number': i,
-                    'chunk_title': chunk_title,
-                    'display_name': display_name,
-                    'bundle_label': bundle_label,
-                    'page_range': [start_page, end_page],
-                    'page_count': end_page - start_page + 1,
-                    'split_order': i,
-                    'artefact_id': chunk_result.get('artefact_id'),
-                    'rel_path': chunk_result.get('rel_path')
-                })
-                
-            except Exception as e:
-                logger.warning(f"Failed to process chunk {i} for file {file_id}: {e}")
-                continue
-        
-        # Create master manifest
-        master_manifest = {
-            'file_id': file_id,
-            'bundle_type': 'docling_bundle_split',
-            'split_mode': 'split_by_chunks',
-            'total_chunks': len(chunks),
-            'successful_chunks': len(chunk_bundles),
-            'chunk_bundles': chunk_bundles,
-            'created_at': 'now()',
-            **bundle_metadata
-        }
-        
-        # Store master manifest
-        manifest_path = f"{master_dir}/master_manifest.json"
-        manifest_json = json.dumps(master_manifest, ensure_ascii=False, indent=2)
-        self.storage.upload_file(bucket, manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True)
-        
-        # Create master bundle artefact
-        self.client.supabase.table('document_artefacts').insert({
-            'id': master_bundle_id,
-            'file_id': file_id,
-            'type': 'docling_bundle_split_chunks',
-            'rel_path': master_dir,
-            'extra': {
-                'manifest': manifest_path,
-                'split_mode': 'split_by_chunks',
-                'total_chunks': len(chunks),
-                'successful_chunks': len(chunk_bundles),
-                'group_pack_type': 'split_chunks',  # Add proper pack type for split chunk bundles
-                **bundle_metadata
-            },
-            'status': 'completed'
-        }).execute()
-        
-        logger.info(f"Created chunk-based split bundle for file {file_id}: {len(chunk_bundles)} chunks")
-        return {
-            'master_bundle_id': master_bundle_id,
-            'chunks_processed': len(chunk_bundles),
-            'total_chunks': len(chunks)
-        }
-
-# process_phase2_coordinator_task method removed - pipelines now enqueued directly from split_map task
-
-# _check_pipeline_group_completion method removed - task dependencies now handle sequential execution
-
-# Global processor instance
-_processor_instance = None
-
-def get_processor() -> DocumentTaskProcessor:
-    """Get the global task processor instance."""
-    global _processor_instance
-    if _processor_instance is None:
-        _processor_instance = DocumentTaskProcessor()
-    return _processor_instance