# (source-listing metadata: 998 lines, 44 KiB, Python)
import os
|
|
import io
|
|
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, BackgroundTasks
|
|
from typing import Any, Dict, Optional
|
|
import uuid
|
|
import re
|
|
import requests
|
|
import os
|
|
import tempfile
|
|
from pathlib import Path
|
|
from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str
|
|
from modules.logger_tool import initialise_logger
|
|
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
|
from modules.database.supabase.utils.storage import StorageAdmin
|
|
from modules.document_processor import DocumentProcessor
|
|
from modules.queue_system import (
|
|
enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
|
|
enqueue_document_analysis_task, enqueue_page_images_task,
|
|
TaskPriority, get_queue, QueueConnectionError
|
|
)
|
|
from fastapi.responses import Response
|
|
from fastapi import Body
|
|
|
|
router = APIRouter()
|
|
auth = SupabaseBearer()
|
|
doc_processor = DocumentProcessor()
|
|
|
|
DEFAULT_BUCKET = os.getenv('DEFAULT_FILES_BUCKET', 'cc.users')
|
|
|
|
# Timeout configurations (in seconds)
|
|
TIKA_TIMEOUT = int(os.getenv('TIKA_TIMEOUT', '300')) # 5 minutes default
|
|
DOCLING_FRONTMATTER_TIMEOUT = int(os.getenv('DOCLING_FRONTMATTER_TIMEOUT', '1800')) # 30 minutes default
|
|
DOCLING_NOOCR_TIMEOUT = int(os.getenv('DOCLING_NOOCR_TIMEOUT', '3600')) # 1 hour default
|
|
|
|
# (Legacy feature flags removed - using new three-phase system)
|
|
|
|
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
|
|
|
|
def _safe_filename(name: str) -> str:
|
|
base = os.path.basename(name or 'file')
|
|
return re.sub(r"[^A-Za-z0-9._-]+", "_", base)
|
|
|
|
def _choose_bucket(scope: str, user_id: str, school_id: Optional[str]) -> str:
|
|
scope = (scope or 'teacher').lower()
|
|
if scope == 'school' and school_id:
|
|
return f"cc.institutes.{school_id}.private"
|
|
# teacher / student fall back to users bucket for now
|
|
return 'cc.users'
|
|
|
|
@router.post("/files/upload")
async def upload_file(
    cabinet_id: str = Form(...),
    path: str = Form(...),
    scope: str = Form('teacher'),
    school_id: Optional[str] = Form(default=None),
    file: UploadFile = File(...),
    payload: Dict[str, Any] = Depends(auth),
    background_tasks: BackgroundTasks = None
):
    """Upload a file into a cabinet and kick off initial artefact generation.

    Flow: stage a DB row (to mint a file_id), upload the bytes to storage
    under ``cabinet_id/file_id/name``, point the row at the final path, then
    schedule artefact generation (background task when available, synchronous
    otherwise). Returns the updated file row(s) from the final DB update.
    """
    user_id = payload.get('sub') or payload.get('user_id')
    if not user_id:
        raise HTTPException(status_code=401, detail="Invalid token payload")

    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()

    # Determine target bucket by scope
    bucket = _choose_bucket(scope, user_id, school_id)

    # Stage DB row to get file_id; the path is a temporary staging location
    # until the storage upload succeeds.
    staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}"
    name = _safe_filename(path or file.filename)
    # NOTE(review): entire file is buffered in memory — acceptable for modest
    # uploads; confirm expected max upload size.
    file_bytes = await file.read()
    insert_res = client.supabase.table('files').insert({
        'cabinet_id': cabinet_id,
        'name': name,
        'path': staged_path,
        'bucket': bucket,
        'mime_type': file.content_type,
        'uploaded_by': user_id,
        'size_bytes': len(file_bytes),
        'source': 'classroomcopilot-web'
    }).execute()
    if not insert_res.data:
        raise HTTPException(status_code=500, detail="Failed to create file record")
    file_row = insert_res.data[0]
    file_id = file_row['id']

    # Final storage path: bucket/cabinet_id/file_id/file
    final_storage_path = f"{cabinet_id}/{file_id}/{name}"
    try:
        storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True)
    except Exception as e:
        # cleanup staged row so a failed upload leaves no orphan record
        client.supabase.table('files').delete().eq('id', file_id).execute()
        raise HTTPException(status_code=500, detail=f"Storage upload failed: {str(e)}")

    # Update DB path to final
    update_res = client.supabase.table('files').update({
        'path': final_storage_path
    }).eq('id', file_id).execute()
    # Kick off initial artefacts generation in background (Tika + Docling frontmatter + no-OCR)
    try:
        if background_tasks is not None:
            logger.info(f"Scheduling initial artefacts generation for file_id={file_id}")
            background_tasks.add_task(generate_initial_artefacts, file_id, payload)
        else:
            logger.info(f"Running initial artefacts generation synchronously for file_id={file_id}")
            generate_initial_artefacts(file_id, payload)
    except Exception as e:
        # Best-effort: the upload itself already succeeded, so only log here.
        logger.error(f"Failed to schedule initial artefacts for file_id={file_id}: {e}")

    return update_res.data
|
|
|
|
@router.get("/files")
def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return every file row belonging to the given cabinet."""
    supabase = SupabaseServiceRoleClient().supabase
    result = supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute()
    return result.data
|
|
|
|
@router.post("/files/{file_id}/move")
def move_file(file_id: str, body: Dict[str, Any], payload: Dict[str, Any] = Depends(auth)):
    """Move/rename a file: update its cabinet and/or path from the request body.

    Raises 400 when the body carries neither 'cabinet_id' nor 'path'.
    """
    client = SupabaseServiceRoleClient()
    updates = {field: body[field] for field in ('cabinet_id', 'path') if field in body}
    if not updates:
        raise HTTPException(status_code=400, detail="No changes provided")
    result = client.supabase.table('files').update(updates).eq('id', file_id).execute()
    return result.data
|
|
|
|
@router.delete("/files/{file_id}")
def delete_file(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Delete the file row by id and return the deleted row data.

    NOTE(review): only the DB row is removed here; the storage object is
    not deleted — presumably handled elsewhere (trigger/cleanup job); verify.
    """
    service = SupabaseServiceRoleClient()
    deleted = service.supabase.table('files').delete().eq('id', file_id).execute()
    return deleted.data
|
|
|
|
@router.get("/files/{file_id}/artefacts")
def list_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """List all document artefacts for a file, newest first."""
    supabase = SupabaseServiceRoleClient().supabase
    rows = (
        supabase.table('document_artefacts')
        .select('*')
        .eq('file_id', file_id)
        .order('created_at', desc=True)
        .execute()
    )
    return rows.data
|
|
|
|
@router.get("/files/{file_id}/viewer-artefacts")
def list_viewer_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """
    Get artefacts organized for UI viewer display, including frontmatter JSON,
    processing bundles, and analysis data with proper display metadata.

    Returns a dict with 'file_id', 'categories' (lists for document_analysis,
    processing_bundles and raw_data, each sorted by ui_order then created_at)
    and 'total_artefacts'.
    """
    client = SupabaseServiceRoleClient()

    # Get all artefacts for the file
    res = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).order('created_at', desc=True).execute()
    all_artefacts = res.data or []

    # Organize artefacts by category for UI display
    viewer_artefacts = {
        'document_analysis': [],
        'processing_bundles': [],
        'raw_data': []
    }

    for artefact in all_artefacts:
        # BUG FIX: .get('key', default) still returns None when the DB column
        # is NULL (key present, value None). Use `or` fallbacks like the other
        # endpoints in this file do, so .startswith()/.get() below cannot crash.
        artefact_type = artefact.get('type') or ''
        extra = artefact.get('extra') or {}

        # A NULL ui_order must not defeat the default: the sort key at the end
        # compares these values, so keep them numeric (0 remains valid).
        raw_ui_order = extra.get('ui_order')
        ui_order = 999 if raw_ui_order is None else raw_ui_order

        # Enhanced artefact info for UI display
        artefact_info = {
            'id': artefact['id'],
            'type': artefact_type,
            'display_name': extra.get('display_name'),
            'bundle_label': extra.get('bundle_label'),
            'section_title': extra.get('section_title'),
            'page_range': extra.get('page_range'),
            'page_count': extra.get('page_count'),
            'pipeline': extra.get('pipeline'),
            'processing_mode': extra.get('processing_mode'),
            'ui_order': ui_order,
            'description': extra.get('description'),
            'viewer_type': extra.get('viewer_type', 'json'),
            'created_at': artefact['created_at'],
            'status': artefact.get('status', 'unknown')
        }

        # Categorize artefacts for UI organization
        if artefact_type == 'docling_frontmatter_json':
            artefact_info.update({
                'display_name': artefact_info['display_name'] or 'Document Frontmatter',
                'bundle_label': artefact_info['bundle_label'] or 'Frontmatter Analysis',
                'description': artefact_info['description'] or 'OCR analysis of document structure and metadata',
                'ui_order': 1,
                'viewer_type': 'json'
            })
            viewer_artefacts['document_analysis'].append(artefact_info)

        elif artefact_type == 'split_map_json':
            artefact_info.update({
                'display_name': 'Document Structure Map',
                'bundle_label': 'Split Map',
                'description': 'Document section boundaries and organization structure',
                'ui_order': 2,
                'viewer_type': 'json'
            })
            viewer_artefacts['document_analysis'].append(artefact_info)

        elif artefact_type == 'tika_json':
            artefact_info.update({
                'display_name': 'Document Metadata',
                'bundle_label': 'Tika Analysis',
                'description': 'Raw document metadata and properties extracted by Apache Tika',
                'ui_order': 3,
                'viewer_type': 'json'
            })
            viewer_artefacts['raw_data'].append(artefact_info)

        elif artefact_type in ['canonical_docling_json', 'docling_bundle_split', 'docling_bundle', 'docling_standard', 'docling_bundle_split_pages']:
            # Processing bundles (OCR, No-OCR, VLM) - use original_pipeline for proper differentiation.
            # Null-safe: a NULL original_pipeline/pipeline column must fall through to 'Unknown'.
            pipeline_name = extra.get('original_pipeline') or extra.get('pipeline') or 'Unknown'
            pipeline_tag = pipeline_name.upper().replace('_', '-')
            bundle_label = artefact_info['bundle_label'] or f"{pipeline_tag} Bundle"
            display_name = artefact_info['display_name'] or f"{pipeline_tag} Processing Result"

            # Special handling for master manifests
            if artefact_type == 'docling_bundle_split_pages':
                display_name = f"{pipeline_tag} Document Pages"
                bundle_label = f"{pipeline_tag} Pages Bundle"
                artefact_info.update({
                    'viewer_type': 'bundle_collection',
                    'is_master_manifest': True,
                    'ui_order': 10  # Show master manifests before individual pages
                })
            elif artefact_type == 'docling_standard':
                # Individual page bundles - lower UI priority (null-safe split_order)
                raw_split_order = extra.get('split_order')
                split_order = 999 if raw_split_order is None else raw_split_order
                artefact_info.update({
                    'viewer_type': 'page_bundle',
                    'is_individual_page': True,
                    'ui_order': split_order + 100  # Show after master manifests
                })

            artefact_info.update({
                'display_name': display_name,
                'bundle_label': bundle_label,
                'description': f"Docling processing result using {pipeline_name.replace('_', '-')} pipeline",
                'pipeline_type': pipeline_name  # Add explicit pipeline type for UI
            })
            viewer_artefacts['processing_bundles'].append(artefact_info)

        elif artefact_type.startswith('docling_') and artefact_type.endswith('_json'):
            # Other docling JSON results
            pipeline_name = artefact_type.replace('docling_', '').replace('_json', '').upper()
            artefact_info.update({
                'display_name': f"{pipeline_name} Analysis",
                'bundle_label': f"{pipeline_name} Result",
                'description': f"Docling {pipeline_name.lower()} processing result",
                'viewer_type': 'json'
            })
            viewer_artefacts['processing_bundles'].append(artefact_info)

        elif artefact_type == 'page_images':
            artefact_info.update({
                'display_name': 'Page Images',
                'bundle_label': 'Visual Pages',
                'description': 'Generated page images for document visualization',
                'viewer_type': 'images'
            })
            viewer_artefacts['raw_data'].append(artefact_info)

    # Sort each category by ui_order
    for category in viewer_artefacts.values():
        category.sort(key=lambda x: (x['ui_order'], x['created_at']))

    return {
        'file_id': file_id,
        'categories': viewer_artefacts,
        'total_artefacts': len(all_artefacts)
    }
|
|
|
|
@router.post("/files/{file_id}/artefacts/initial")
def generate_initial_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """
    Generate initial artefacts using the new three-phase pipeline architecture.

    Phase 1: Document Structure Discovery & Analysis
    - Tika metadata extraction
    - Page images generation
    - Document structure analysis (LLM-enhanced)
    - Split map generation

    Phase 2: Triggered automatically after Phase 1 completion
    """
    logger.info(f"Three-phase pipeline: Starting Phase 1 for file_id={file_id}")

    # NOTE(review): function-local import — presumably to avoid a circular
    # import at module load time; confirm before hoisting to the top of the file.
    from modules.pipeline_controller import get_pipeline_controller

    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()
    controller = get_pipeline_controller()

    # Load file row
    fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
    file_row = fr.data
    if not file_row:
        raise HTTPException(status_code=404, detail="File not found")

    bucket = file_row['bucket']
    storage_path = file_row['path']
    cabinet_id = file_row['cabinet_id']
    mime = file_row.get('mime_type') or 'application/octet-stream'
    filename = file_row.get('name', 'file')

    # Step 1: Convert to PDF if not already a PDF (synchronous for now)
    processing_path = storage_path
    processing_mime = mime

    if mime != 'application/pdf':
        logger.info(f"Converting non-PDF file to PDF: file_id={file_id} mime={mime}")
        try:
            file_bytes = storage.download_file(bucket, storage_path)

            with tempfile.TemporaryDirectory() as temp_dir:
                # Save original file to temp location (converter works on disk paths)
                temp_input = Path(temp_dir) / filename
                with open(temp_input, 'wb') as f:
                    f.write(file_bytes)

                # Convert to PDF
                pdf_bytes = doc_processor.convert_to_pdf(temp_input)

                # Store PDF as artefact
                pdf_artefact_id = str(uuid.uuid4())
                pdf_rel_path = f"{cabinet_id}/{file_id}/{pdf_artefact_id}/document.pdf"
                storage.upload_file(bucket, pdf_rel_path, pdf_bytes, 'application/pdf', upsert=True)

                pdf_ar = client.supabase.table('document_artefacts').insert({
                    'file_id': file_id,
                    'type': 'document_pdf',
                    'rel_path': pdf_rel_path,
                    'extra': {'converted_from': mime, 'original_filename': filename},
                    'status': 'completed'
                }).execute()

                # Use converted PDF for subsequent processing
                processing_path = pdf_rel_path
                processing_mime = 'application/pdf'
                logger.info(f"PDF conversion: completed file_id={file_id} rel_path={pdf_rel_path}")

        except Exception as e:
            logger.error(f"PDF conversion: error processing file_id={file_id}: {e}")
            # Continue with original file if conversion fails
    else:
        logger.info(f"File is already PDF, skipping conversion: file_id={file_id}")

    # Step 2: Enqueue Phase 1 tasks using the new pipeline controller
    user_id = payload.get('sub') or payload.get('user_id')
    # Authenticated requests get higher queue priority.
    priority = TaskPriority.HIGH if user_id else TaskPriority.NORMAL

    try:
        # Update file row with processing path (converted PDF if one was made)
        updated_file_row = {**file_row, 'path': processing_path, 'mime_type': processing_mime}

        # Enqueue Phase 1 tasks
        phase1_tasks = controller.enqueue_phase1_tasks(
            file_id=file_id,
            file_row=updated_file_row,
            processing_path=processing_path,
            processing_mime=processing_mime,
            priority=priority
        )

        total_tasks = sum(len(task_list) for task_list in phase1_tasks.values())

        logger.info(f"Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks for file_id={file_id}")

        return {
            'message': f'Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks. Phase 2 will trigger automatically after completion.',
            'phase1_tasks': {k: v for k, v in phase1_tasks.items()},
            'file_id': file_id,
            'pipeline_mode': 'three_phase',
            'bundle_architecture_enabled': True
        }

    except QueueConnectionError as e:
        # A down queue is not fatal: the upload already succeeded, so report
        # the status in the response body instead of raising.
        logger.error(f"Queue system unavailable for file_id={file_id}: {e}")
        logger.error("Redis is not running. Please start the API server with './start.sh dev' to auto-start Redis.")
        return {
            'message': 'File uploaded successfully, but processing tasks could not be queued (Redis unavailable)',
            'file_id': file_id,
            'queue_status': 'unavailable',
            'error': 'Queue system unavailable. Please restart the API server with Redis enabled.'
        }
    except Exception as e:
        logger.error(f"Unexpected error enqueueing Phase 1 tasks for file_id={file_id}: {e}")
        return {
            'message': 'File uploaded successfully, but processing tasks failed to queue',
            'file_id': file_id,
            'queue_status': 'failed',
            'error': str(e)
        }
|
|
|
|
@router.get("/files/{file_id}/page-images/manifest")
def get_page_images_manifest(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return the page_images manifest JSON for a file via service-role access."""
    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()

    # Find file row to get bucket
    file_row = client.supabase.table('files').select('id,bucket,cabinet_id').eq('id', file_id).single().execute().data or {}
    if not file_row:
        raise HTTPException(status_code=404, detail="File not found")
    bucket = file_row['bucket']
    cabinet_id = file_row['cabinet_id']

    # Newest page_images artefact wins
    rows = (
        client.supabase.table('document_artefacts')
        .select('id,type,rel_path,extra')
        .eq('file_id', file_id).eq('type', 'page_images')
        .order('created_at', desc=True).limit(1).execute().data
    ) or []
    if not rows:
        raise HTTPException(status_code=404, detail="page_images artefact not found")
    artefact = rows[0]

    # Manifest path: explicit 'manifest' entry in extra, else the conventional location
    manifest_rel_path = (artefact.get('extra') or {}).get('manifest') or f"{artefact['rel_path'].rstrip('/')}/page_images.json"

    try:
        import json
        manifest = json.loads(storage.download_file(bucket, manifest_rel_path).decode('utf-8'))
        # Ensure bucket and base prefix are present for the UI
        manifest.setdefault('bucket', bucket)
        manifest.setdefault('base_dir', artefact['rel_path'])
        return manifest
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}")
|
|
|
|
def json_dumps(obj: Any) -> str:
    """Serialize *obj* to a JSON string with non-ASCII characters preserved.

    Best-effort: returns "{}" when the object is not JSON-serializable,
    matching the original contract, but no longer swallows unrelated
    programming errors (the old broad `except Exception` even hid the
    import itself).
    """
    import json
    try:
        return json.dumps(obj, ensure_ascii=False)
    except (TypeError, ValueError):
        # Only serialization failures fall back; anything else should surface.
        return "{}"
|
|
|
|
|
|
@router.get("/files/{file_id}/artefacts/{artefact_id}/json")
def get_artefact_json(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return the JSON content of a document artefact using service-role storage access."""
    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()

    # Validate the artefact exists and belongs to the requested file
    artefact = client.supabase.table('document_artefacts').select('id,file_id,rel_path').eq('id', artefact_id).single().execute().data
    if not artefact:
        raise HTTPException(status_code=404, detail="Artefact not found")
    if artefact.get('file_id') != file_id:
        raise HTTPException(status_code=400, detail="Artefact does not belong to file")

    # Resolve the owning file's bucket
    file_row = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
    if not file_row:
        raise HTTPException(status_code=404, detail="File not found")

    try:
        import json
        blob = storage.download_file(file_row['bucket'], artefact['rel_path'])
        return json.loads(blob.decode('utf-8'))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to load artefact JSON: {str(e)}")
|
|
|
|
|
|
@router.get("/files/{file_id}/artefacts/{artefact_id}/vlm-section-manifest")
def get_vlm_section_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return the VLM section page bundle manifest JSON for a VLM section bundle artefact."""
    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()

    artefact = client.supabase.table('document_artefacts').select('id,file_id,rel_path,type,extra').eq('id', artefact_id).single().execute().data
    if not artefact:
        raise HTTPException(status_code=404, detail="Artefact not found")
    if artefact.get('file_id') != file_id:
        raise HTTPException(status_code=400, detail="Artefact does not belong to file")
    if artefact.get('type') != 'vlm_section_page_bundle':
        raise HTTPException(status_code=400, detail="Artefact is not a VLM section page bundle")

    file_row = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
    if not file_row:
        raise HTTPException(status_code=404, detail="File not found")
    bucket = file_row['bucket']

    try:
        import json
        # rel_path points directly at the manifest JSON file
        manifest = json.loads(storage.download_file(bucket, artefact['rel_path']).decode('utf-8'))
        manifest.setdefault('bucket', bucket)  # ensure bucket present for client use
        return manifest
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to load VLM section manifest: {e}")
|
|
|
|
|
|
@router.post("/files/{file_id}/artefacts/outline")
def enqueue_outline_structure(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """
    Manually enqueue the fast document outline (headings-only) analysis for an existing file.
    Returns the queued task id.
    """
    client = SupabaseServiceRoleClient()

    fr = client.supabase.table('files').select('id,bucket,cabinet_id,path,mime_type').eq('id', file_id).single().execute()
    file_row = fr.data
    if not file_row:
        raise HTTPException(status_code=404, detail="File not found")

    bucket = file_row['bucket']
    storage_path = file_row['path']
    cabinet_id = file_row['cabinet_id']
    mime = file_row.get('mime_type') or 'application/pdf'

    # Prefer converted PDF artefact if available (newest first, so the most
    # recent conversion wins)
    arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
    pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None)
    processing_path = pdf_art['rel_path'] if pdf_art else storage_path

    try:
        # OCR disabled keeps this pass fast; timeout fixed at 5 minutes.
        task_id = enqueue_docling_task(
            file_id=file_id,
            task_type='document_structure_analysis',
            payload={
                'bucket': bucket,
                'file_path': processing_path,
                'cabinet_id': cabinet_id,
                'mime_type': mime,
                'config': {
                    'target_type': 'inbody',
                    'to_formats': 'json',
                    'do_ocr': False,
                    'force_ocr': False
                }
            },
            priority=TaskPriority.NORMAL,
            timeout=300
        )
        return { 'message': 'outline task enqueued', 'task_id': task_id, 'file_id': file_id }
    except QueueConnectionError as e:
        raise HTTPException(status_code=503, detail=f"Queue unavailable: {e}")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to enqueue outline task: {e}")
|
|
|
|
@router.get("/files/proxy")
def proxy_storage_file(bucket: str, path: str, payload: Dict[str, Any] = Depends(auth)):
    """Proxy a storage file (service-role), useful for private image access in the UI."""
    media_by_ext = {
        '.png': 'image/png',
        '.webp': 'image/webp',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.json': 'application/json',
    }
    storage = StorageAdmin()
    try:
        data = storage.download_file(bucket, path)
        _, ext = os.path.splitext(path.lower())
        media = media_by_ext.get(ext, 'application/octet-stream')
        return Response(content=data, media_type=media)
    except Exception as e:
        raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}")
|
|
|
|
|
|
# Signed proxy for iframe/img tags without Authorization header
|
|
@router.get("/files/proxy_signed")
def proxy_storage_file_signed(bucket: str, path: str, token: str):
    """Proxy using a signed bearer token passed as query param 'token'.

    Allows <img>/<iframe> access where an Authorization header cannot be set.
    Raises 403 for an invalid or unverifiable token, 404 when the object is
    missing or inaccessible.
    """
    try:
        payload = verify_supabase_jwt_str(token)
    except Exception as e:
        raise HTTPException(status_code=403, detail=f"Invalid token: {e}")
    if not payload:
        # BUG FIX: this raise previously lived inside the try block, so the
        # broad `except Exception` caught and re-wrapped it, producing a
        # garbled detail ("Invalid token: 403: Invalid token").
        raise HTTPException(status_code=403, detail="Invalid token")

    storage = StorageAdmin()
    try:
        data = storage.download_file(bucket, path)
        media = 'application/octet-stream'
        lp = path.lower()
        if lp.endswith('.png'):
            media = 'image/png'
        elif lp.endswith('.webp'):
            media = 'image/webp'
        elif lp.endswith('.jpg') or lp.endswith('.jpeg'):
            media = 'image/jpeg'
        elif lp.endswith('.json'):
            media = 'application/json'
        return Response(content=data, media_type=media)
    except Exception as e:
        raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}")
|
|
|
|
# -------- Canonical bundle manifest ---------
|
|
|
|
@router.get("/files/{file_id}/artefacts/{artefact_id}/manifest")
def get_canonical_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return the manifest.json for a canonical_docling_bundle artefact."""
    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()

    artefact = client.supabase.table('document_artefacts').select('id,file_id,rel_path,extra').eq('id', artefact_id).single().execute().data
    if not artefact:
        raise HTTPException(status_code=404, detail="Artefact not found")
    if artefact.get('file_id') != file_id:
        raise HTTPException(status_code=400, detail="Artefact does not belong to file")
    manifest_rel_path = (artefact.get('extra') or {}).get('manifest')
    if not manifest_rel_path:
        raise HTTPException(status_code=404, detail="Manifest path not recorded on artefact")

    file_row = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
    if not file_row:
        raise HTTPException(status_code=404, detail="File not found")
    bucket = file_row['bucket']

    try:
        import json
        manifest = json.loads(storage.download_file(bucket, manifest_rel_path).decode('utf-8'))
        manifest.setdefault('bucket', bucket)  # ensure bucket present for client use
        return manifest
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}")
|
|
|
|
# -------- Canonical Docling generation ---------
|
|
|
|
def _load_split_map(client: SupabaseServiceRoleClient, storage: StorageAdmin, bucket: str, file_id: str) -> Optional[Dict[str, Any]]:
    """Best-effort fetch of the newest split_map_json artefact for *file_id*.

    Returns the parsed JSON dict, or None when no split map exists or any
    step (query, download, parse) fails.
    """
    try:
        rows = (
            client.supabase.table('document_artefacts')
            .select('id,type,rel_path')
            .eq('file_id', file_id).eq('type', 'split_map_json')
            .order('created_at', desc=True).limit(1).execute().data
        ) or []
        if not rows:
            return None
        import json
        blob = storage.download_file(bucket, rows[0]['rel_path'])
        return json.loads(blob.decode('utf-8'))
    except Exception:
        # Deliberately silent: callers treat a missing/broken split map as "no split map".
        return None
|
|
|
|
|
|
@router.post("/files/{file_id}/artefacts/canonical-docling")
|
|
def enqueue_canonical_docling(
|
|
file_id: str,
|
|
body: Dict[str, Any] = Body(default={}),
|
|
payload: Dict[str, Any] = Depends(auth)
|
|
):
|
|
"""Enqueue generation of canonical Docling JSON(s) for a file.
|
|
|
|
If a split_map is available and the document is large, this will enqueue
|
|
multiple Docling jobs using page ranges per section. Otherwise a single
|
|
job is created for the whole document.
|
|
"""
|
|
client = SupabaseServiceRoleClient()
|
|
storage = StorageAdmin()
|
|
|
|
fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
|
|
file_row = fr.data
|
|
if not file_row:
|
|
raise HTTPException(status_code=404, detail="File not found")
|
|
|
|
bucket = file_row['bucket']
|
|
cabinet_id = file_row['cabinet_id']
|
|
mime = file_row.get('mime_type') or 'application/pdf'
|
|
storage_path = file_row['path']
|
|
|
|
# Prefer converted PDF if available
|
|
try:
|
|
arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
|
|
a_pdf = next((a for a in arts if a.get('type') == 'document_pdf'), None)
|
|
processing_path = a_pdf['rel_path'] if a_pdf else storage_path
|
|
processing_mime = 'application/pdf' if a_pdf else mime
|
|
except Exception:
|
|
processing_path = storage_path
|
|
processing_mime = mime
|
|
|
|
# Determine page_count (prefer Tika; fallback to PDF parser if needed)
|
|
page_count = None
|
|
try:
|
|
arts_pc = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).execute().data or []
|
|
a_tika_pc = next((a for a in arts_pc if a.get('type') == 'tika_json'), None)
|
|
if a_tika_pc:
|
|
raw = storage.download_file(bucket, a_tika_pc['rel_path'])
|
|
import json as _json
|
|
tj = _json.loads(raw.decode('utf-8'))
|
|
for k in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount"):
|
|
v = tj.get(k) or tj.get(k.lower())
|
|
if v is not None:
|
|
page_count = int(v)
|
|
break
|
|
except Exception as e:
|
|
logger.debug(f"[canonical-docling] Tika page_count read failed: {e}")
|
|
pass
|
|
|
|
# Fallback: compute page_count from PDF if Tika did not provide it
|
|
if page_count is None:
|
|
try:
|
|
pdf_bytes = storage.download_file(bucket, processing_path)
|
|
try:
|
|
import fitz # PyMuPDF
|
|
doc = fitz.open(stream=pdf_bytes, filetype='pdf')
|
|
page_count = int(doc.page_count)
|
|
doc.close()
|
|
logger.info(f"[canonical-docling] page_count via PyMuPDF: {page_count}")
|
|
except Exception:
|
|
try:
|
|
from PyPDF2 import PdfReader
|
|
reader = PdfReader(io.BytesIO(pdf_bytes))
|
|
page_count = int(len(reader.pages))
|
|
logger.info(f"[canonical-docling] page_count via PyPDF2: {page_count}")
|
|
except Exception:
|
|
page_count = None
|
|
except Exception:
|
|
page_count = None
|
|
else:
|
|
logger.info(f"[canonical-docling] page_count via Tika: {page_count}")
|
|
|
|
# Optional custom range from caller
|
|
custom_range = body.get('custom_range')
|
|
custom_label = body.get('custom_label') or ''
|
|
selected_section_id = body.get('selected_section_id')
|
|
selected_section_title = body.get('selected_section_title')
|
|
|
|
# Load split map if requested and document is large enough
|
|
use_split_requested = bool(body.get('use_split_map', True))
|
|
split_threshold = int(body.get('threshold') or os.getenv('DOCLING_SPLIT_THRESHOLD', '50'))
|
|
ranges = [] # list of (start,end)
|
|
split_map = None
|
|
sections = [] # list of dicts: {start,end,title}
|
|
logger.info(f"[canonical-docling] use_split_map={use_split_requested} threshold={split_threshold} page_count={page_count}")
|
|
# If custom range provided, honor it and bypass split map
|
|
if isinstance(custom_range, list) and len(custom_range) >= 2:
|
|
try:
|
|
cs = int(custom_range[0]); ce = int(custom_range[1])
|
|
if page_count is not None:
|
|
cs = max(1, min(cs, page_count))
|
|
ce = max(cs, min(ce, page_count))
|
|
ranges = [(cs, ce)]
|
|
sections = [{'start': cs, 'end': ce, 'title': custom_label or 'Custom range'}]
|
|
use_split_requested = False
|
|
logger.info(f"[canonical-docling] using custom_range start={cs} end={ce} label='{custom_label}'")
|
|
except Exception as _e:
|
|
logger.warning(f"[canonical-docling] invalid custom_range; falling back. err={_e}")
|
|
|
|
if not ranges and use_split_requested and (page_count is None or page_count >= split_threshold):
|
|
split_map = _load_split_map(client, storage, bucket, file_id)
|
|
entries = (split_map or {}).get('entries') if split_map else []
|
|
logger.info(f"[canonical-docling] split_map loaded entries={len(entries) if isinstance(entries, list) else 0}")
|
|
if split_map and isinstance(entries, list) and len(entries) > 0:
|
|
# Normalize and sort entries by start_page to enforce correct order
|
|
norm: list[dict] = []
|
|
for e in entries:
|
|
try:
|
|
s = int(e.get('start_page', 1))
|
|
t = int(e.get('end_page', s))
|
|
if t < s:
|
|
t = s
|
|
title = e.get('title') or e.get('label') or ''
|
|
norm.append({'start': s, 'end': t, 'title': title})
|
|
except Exception:
|
|
continue
|
|
norm.sort(key=lambda x: x['start'])
|
|
# Deduplicate identical or overlapping starts by keeping the earliest occurrence
|
|
ordered: list[dict] = []
|
|
last_end = 0
|
|
for e in norm:
|
|
s, t = int(e['start']), int(e['end'])
|
|
if ordered and s <= last_end:
|
|
# Clamp to prevent inversion and maintain order
|
|
s = last_end + 1
|
|
if s > (page_count or s):
|
|
continue
|
|
if t < s:
|
|
t = s
|
|
last_end = max(last_end, t)
|
|
ordered.append({'start': s, 'end': t, 'title': e['title']})
|
|
for e in ordered:
|
|
ranges.append((e['start'], e['end']))
|
|
sections.append(e)
|
|
|
|
# Fallback: if no split_map ranges... we shouldn't be here
|
|
if not ranges:
|
|
# If document is large, split into fixed windows to protect Docling server
|
|
if page_count is not None and page_count >= split_threshold:
|
|
chunk = int(os.getenv('DOCLING_FALLBACK_CHUNK_PAGES', '25'))
|
|
chunk = max(5, min(100, chunk))
|
|
for i in range(1, (page_count or 1) + 1, chunk):
|
|
end = min(i + chunk - 1, page_count or i)
|
|
ranges.append((i, end))
|
|
sections.append({'start': i, 'end': end, 'title': f"Pages {i}-{end}"})
|
|
logger.warning(f"[canonical-docling] using fallback chunking ranges={len(ranges)} chunk={chunk}")
|
|
else:
|
|
ranges = [(1, page_count or 9223372036854775807)]
|
|
logger.warning(f"[canonical-docling] using single-range fallback (small doc)")
|
|
|
|
# Build config
|
|
cfg = body.get('config', {})
|
|
pipeline = cfg.get('pipeline', 'standard')
|
|
config: Dict[str, Any] = {
|
|
# target_type is computed in processor based on to_formats unless explicitly provided by user
|
|
'to_formats': cfg.get('to_formats', 'json'),
|
|
'do_ocr': bool(cfg.get('do_ocr', True)),
|
|
'force_ocr': bool(cfg.get('force_ocr', False)),
|
|
'image_export_mode': cfg.get('image_export_mode', 'embedded'),
|
|
'ocr_engine': cfg.get('ocr_engine', 'easyocr'),
|
|
'ocr_lang': cfg.get('ocr_lang', 'en'),
|
|
'pdf_backend': cfg.get('pdf_backend', 'dlparse_v4'),
|
|
'table_mode': cfg.get('table_mode', 'fast'),
|
|
'pipeline': pipeline,
|
|
'do_picture_classification': bool(cfg.get('do_picture_classification', False)),
|
|
'do_picture_description': bool(cfg.get('do_picture_description', False)),
|
|
}
|
|
# If user explicitly set target_type, pass it through
|
|
if 'target_type' in cfg:
|
|
config['target_type'] = cfg['target_type']
|
|
# Optional VLM settings (only include API fields if provided as JSON by caller)
|
|
if config['do_picture_description']:
|
|
pd_api = cfg.get('picture_description_api')
|
|
if isinstance(pd_api, (dict, list)):
|
|
config['picture_description_api'] = pd_api
|
|
elif isinstance(pd_api, str) and pd_api.strip().startswith(('{', '[')):
|
|
config['picture_description_api'] = pd_api
|
|
if cfg.get('picture_description_prompt'):
|
|
config['picture_description_prompt'] = cfg['picture_description_prompt']
|
|
if pipeline == 'vlm':
|
|
# Provider presets mapping
|
|
provider = (cfg.get('vlm_provider') or '').strip().lower()
|
|
provider_model = (cfg.get('vlm_provider_model') or '').strip()
|
|
provider_base = (cfg.get('vlm_provider_base_url') or '').strip()
|
|
if provider in ('ollama', 'openai') and provider_model:
|
|
if provider == 'ollama':
|
|
base_url = provider_base or os.getenv('OLLAMA_BASE_URL') or os.getenv('VLM_OLLAMA_BASE_URL')
|
|
if base_url:
|
|
endpoint = f"{base_url.rstrip('/')}/v1/chat/completions"
|
|
# Use OpenAI provider schema against Ollama's OpenAI-compatible endpoint
|
|
cfg_api = {
|
|
'provider': 'openai',
|
|
'url': endpoint,
|
|
'model': provider_model,
|
|
'response_format': 'markdown',
|
|
'request_params': {'model': provider_model}
|
|
}
|
|
logger.info(f"[canonical-docling] VLM provider=ollama mapped to openai-compatible url={endpoint} model={provider_model}")
|
|
config['vlm_pipeline_model_api'] = cfg_api
|
|
# Also wire picture_description_api if picture description is enabled
|
|
if config.get('do_picture_description'):
|
|
config['picture_description_api'] = {
|
|
'url': endpoint,
|
|
'headers': {},
|
|
'params': {'model': provider_model}
|
|
}
|
|
elif provider == 'openai':
|
|
base_url = provider_base or os.getenv('OPENAI_BASE_URL') or 'https://api.openai.com/v1'
|
|
api_key = os.getenv('OPENAI_API_KEY') or os.getenv('OPENAI_API_KEY_READONLY')
|
|
# Do not inline key if not present; server may have default
|
|
model_cfg: Dict[str, Any] = {
|
|
'provider': 'openai',
|
|
'url': f"{base_url.rstrip('/')}/chat/completions",
|
|
'model': provider_model,
|
|
'response_format': 'markdown',
|
|
'request_params': {'model': provider_model}
|
|
}
|
|
if api_key:
|
|
model_cfg['api_key'] = api_key
|
|
# Also pass explicit Authorization header for servers that expect it
|
|
model_cfg['headers'] = {
|
|
'Authorization': f"Bearer {api_key}"
|
|
}
|
|
logger.info(f"[canonical-docling] VLM provider=openai url={model_cfg['url']} model={provider_model} api_key={'yes' if api_key else 'no'}")
|
|
config['vlm_pipeline_model_api'] = model_cfg
|
|
# Also wire picture_description_api if picture description is enabled
|
|
if config.get('do_picture_description'):
|
|
headers = {'Authorization': f"Bearer {api_key}"} if api_key else {}
|
|
config['picture_description_api'] = {
|
|
'url': f"{base_url.rstrip('/')}/chat/completions",
|
|
'headers': headers,
|
|
'params': {'model': provider_model}
|
|
}
|
|
else:
|
|
# Pass through explicit API/local JSON if provided by caller
|
|
vpa = cfg.get('vlm_pipeline_model_api')
|
|
if isinstance(vpa, (dict, list)):
|
|
config['vlm_pipeline_model_api'] = vpa
|
|
elif isinstance(vpa, str) and vpa.strip().startswith(('{', '[')):
|
|
config['vlm_pipeline_model_api'] = vpa
|
|
|
|
# Enqueue tasks for each range
|
|
priority = TaskPriority.HIGH
|
|
task_ids = []
|
|
multi = len(ranges) > 1
|
|
logger.info(f"[canonical-docling] final ranges={len(ranges)} multi={multi} pipeline={pipeline} producer={body.get('producer', 'manual')}")
|
|
|
|
# Create a group id for split bundles (used for UI grouping)
|
|
# Use provided group_id if present (for two-pass auto system), otherwise generate new
|
|
group_id = body.get('group_id') or (str(uuid.uuid4()) if multi else None)
|
|
if multi and not sections:
|
|
# Build sections from ranges if titles were not captured
|
|
for (start, end) in ranges:
|
|
sections.append({'start': int(start), 'end': int(end), 'title': ''})
|
|
|
|
idx = 0
|
|
for (start, end) in ranges:
|
|
# Locate title for this range if available
|
|
title = ''
|
|
if multi and sections and idx < len(sections):
|
|
title = sections[idx].get('title') or ''
|
|
idx += 1
|
|
|
|
cfg_range = dict(config)
|
|
# Ensure 1-based inclusive range is passed through
|
|
cfg_range['page_range'] = [max(1, int(start)), max(int(start), int(end))]
|
|
extra = {
|
|
'is_subdoc': multi,
|
|
'page_range': [int(start), int(end)],
|
|
'label': (title or f"subdoc p{int(start)}-{int(end)}") if multi else 'canonical'
|
|
}
|
|
# Attach selected section metadata if provided by caller
|
|
if selected_section_id:
|
|
extra['selected_section_id'] = selected_section_id
|
|
if selected_section_title or custom_label:
|
|
extra['selected_section_title'] = selected_section_title or custom_label
|
|
# For split processing, force split bundle artefact type and add grouping/order metadata
|
|
if multi:
|
|
extra.update({
|
|
# UI grouping metadata
|
|
'split_order': idx,
|
|
'split_heading': title,
|
|
'split_total': len(ranges)
|
|
})
|
|
if group_id:
|
|
extra['group_id'] = group_id
|
|
extra['group_pack_type'] = 'docling_standard_auto_split'
|
|
else:
|
|
# Single-bundle case: allow caller to override type (defaults to canonical bundle)
|
|
if 'artefact_type_override' in body and body.get('artefact_type_override'):
|
|
extra['artefact_type_override'] = body.get('artefact_type_override')
|
|
|
|
# Mark producer and selection metadata
|
|
extra['producer'] = body.get('producer') or ('auto_split' if (multi and body.get('use_split_map')) else 'manual')
|
|
if selected_section_id:
|
|
extra['selected_section_id'] = selected_section_id
|
|
if selected_section_title or custom_label:
|
|
extra['selected_section_title'] = selected_section_title or custom_label
|
|
|
|
# Enhanced logging for canonical operations
|
|
if multi:
|
|
logger.info(f"[canonical-docling] enqueue range idx={idx}/{len(ranges)} start={start} end={end} title='{title}' group_id={group_id} producer={extra.get('producer')} pipeline={pipeline}")
|
|
else:
|
|
logger.info(f"[canonical-docling] enqueue single range start={start} end={end} producer={extra.get('producer')} pipeline={pipeline}")
|
|
tid = enqueue_docling_task(
|
|
file_id=file_id,
|
|
task_type='canonical_docling_subdoc_json' if multi else 'canonical_docling_json',
|
|
payload={
|
|
'bucket': bucket,
|
|
'file_path': processing_path,
|
|
'cabinet_id': cabinet_id,
|
|
'mime_type': processing_mime,
|
|
'config': cfg_range,
|
|
'artefact_extra': extra,
|
|
# Ensure canonical tasks respect upstream dependencies (e.g., Frontmatter)
|
|
'depends_on': body.get('depends_on', []),
|
|
# Pass through grouping info if provided by caller (kept for backward-compat)
|
|
'group_pack_type': body.get('group_pack_type')
|
|
},
|
|
priority=priority,
|
|
timeout=int(body.get('timeout', DOCLING_NOOCR_TIMEOUT))
|
|
)
|
|
task_ids.append(tid)
|
|
|
|
logger.info(f"[canonical-docling] completed enqueue file_id={file_id} tasks={len(task_ids)} ranges={len(ranges)} pipeline={pipeline} producer={body.get('producer','manual')} group_id={group_id if multi else 'single'}")
|
|
|
|
return {
|
|
'message': f'enqueued {len(task_ids)} canonical docling job(s)',
|
|
'task_ids': task_ids,
|
|
'ranges': ranges,
|
|
'used_split_map': bool(split_map),
|
|
'group_id': group_id,
|
|
'pipeline': pipeline,
|
|
'producer': body.get('producer', 'manual')
|
|
}
|
|
|