api/archive/auto_processing/task_processors.py
2025-11-14 14:47:19 +00:00

2532 lines
120 KiB
Python

"""
Task Processors for Document Processing Queue
This module contains the actual processing implementations for different
types of queued tasks (Tika, Docling, LLM, Split Map).
"""
import json
import zipfile
import io
import mimetypes
import requests
import tempfile
import uuid
from pathlib import Path
from typing import Dict, Any, Optional
import os
from modules.queue_system import DocumentProcessingQueue, QueueTask, ServiceType
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
from modules.database.supabase.utils.storage import StorageAdmin
from modules.document_processor import DocumentProcessor
from modules.logger_tool import initialise_logger
# Module-wide logger; level and path come from the environment (LOG_LEVEL, LOG_PATH).
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
class DocumentTaskProcessor(DocumentProcessingQueue):
"""
Extended queue with actual task processing implementations.
"""
def __init__(self, redis_url: str = None):
    """Initialise the processor: queue base class, backend clients, service URLs.

    Args:
        redis_url: Optional Redis connection URL forwarded to the queue base.
    """
    super().__init__(redis_url)
    # Backend clients shared by all task handlers.
    self.client = SupabaseServiceRoleClient()
    self.storage = StorageAdmin()
    self.doc_processor = DocumentProcessor()
    # External service endpoints, resolved from the environment.
    env = os.getenv
    self.tika_url = env('TIKA_URL')
    self.docling_url = env('DOCLING_URL') or env('NEOFS_DOCLING_URL')
    self.llm_url = env('LLM_URL')  # local LLM endpoint
    logger.info("Task processor initialized with service URLs")
def _process_task(self, task: QueueTask):
    """Dispatch a dequeued task to the handler for its service type.

    On success the task is completed via ``complete_task``; any exception
    (including an unknown service type) is logged with a full traceback and
    routed to ``fail_task``.
    """
    try:
        logger.info(f"🚀 PROCESS DEBUG: Starting _process_task for {task.id}")
        # Audit dependency info (best effort; logging must never break processing).
        try:
            audit_payload = task.payload if isinstance(task.payload, dict) else {}
            deps = audit_payload.get('depends_on') or []
            if deps:
                logger.info(f"Processing task {task.id}: {task.service.value}/{task.task_type} deps={deps}")
            else:
                logger.info(f"Processing task {task.id}: {task.service.value}/{task.task_type}")
        except Exception:
            logger.info(f"Processing task {task.id}: {task.service.value}/{task.task_type}")
        logger.info(f"🚀 PROCESS DEBUG: Routing task {task.id} to service {task.service}")
        # Service-type → handler dispatch table.
        handlers = {
            ServiceType.TIKA: self._process_tika_task,
            ServiceType.DOCLING: self._process_docling_task,
            ServiceType.LLM: self._process_llm_task,
            ServiceType.SPLIT_MAP: self._process_split_map_task,
            ServiceType.DOCUMENT_ANALYSIS: self.process_document_analysis_task,
            ServiceType.PAGE_IMAGES: self.process_page_images_task,
        }
        handler = handlers.get(task.service)
        if handler is None:
            raise ValueError(f"Unknown service type: {task.service}")
        result = handler(task)
        logger.info(f"✅ PROCESS DEBUG: Task {task.id} completed successfully, calling complete_task")
        self.complete_task(task, result)
        logger.info(f"✅ PROCESS DEBUG: Task {task.id} completion confirmed")
    except Exception as e:
        logger.error(f"🚨 PROCESS DEBUG: Task {task.id} processing failed: {e}")
        logger.error(f"🚨 PROCESS DEBUG: Exception type: {type(e)}")
        import traceback
        logger.error(f"🚨 PROCESS DEBUG: Full traceback:\n{traceback.format_exc()}")
        logger.info(f"🚨 PROCESS DEBUG: Calling fail_task for {task.id}")
        self.fail_task(task, str(e))
        logger.info(f"🚨 PROCESS DEBUG: fail_task completed for {task.id}")
def _process_tika_task(self, task: QueueTask) -> Dict[str, Any]:
    """Extract document metadata via Apache Tika and persist it as an artefact.

    Downloads the file named in ``task.payload``, PUTs the raw bytes to the
    Tika ``/meta`` endpoint, stores the JSON response in storage, and records
    a ``tika_json`` artefact row with UI presentation metadata.

    Returns:
        Dict with ``artefact_id``, ``rel_path`` and ``processing_time``.
    Raises:
        ValueError: if TIKA_URL is not configured.
    """
    if not self.tika_url:
        raise ValueError("TIKA_URL not configured")
    payload = task.payload
    file_id = task.file_id
    bucket, file_path = payload['bucket'], payload['file_path']
    cabinet_id = payload['cabinet_id']
    mime_type = payload.get('mime_type', 'application/octet-stream')
    # Fetch the source bytes from storage.
    logger.debug(f"Downloading file for Tika processing: {bucket}/{file_path}")
    file_bytes = self.storage.download_file(bucket, file_path)
    # Send the raw bytes to Tika's metadata endpoint.
    response = requests.put(
        f"{self.tika_url.rstrip('/')}/meta",
        data=file_bytes,
        headers={'Accept': 'application/json', 'Content-Type': mime_type},
        timeout=task.timeout
    )
    response.raise_for_status()
    tika_json = response.json()
    # Persist the result JSON under a fresh artefact directory.
    artefact_id = str(uuid.uuid4())
    rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/tika.json"
    self.storage.upload_file(
        bucket,
        rel_path,
        json.dumps(tika_json, ensure_ascii=False).encode('utf-8'),
        'application/json',
        upsert=True
    )
    elapsed = response.elapsed.total_seconds()
    # Record the artefact row, including metadata the UI uses for display.
    self.client.supabase.table('document_artefacts').insert({
        'id': artefact_id,
        'file_id': file_id,
        'type': 'tika_json',
        'rel_path': rel_path,
        'extra': {
            'processing_time': elapsed,
            'display_name': 'Document Metadata',
            'bundle_label': 'Tika Analysis',
            'section_title': 'Document Metadata',
            'bundle_type': 'tika_json',
            'processing_mode': 'metadata_extraction',
            'pipeline': 'tika_analysis',
            'is_metadata': True,
            'ui_category': 'raw_data',
            'ui_order': 3,
            'description': 'Raw document metadata and properties extracted by Apache Tika',
            'viewer_type': 'json'
        },
        'status': 'completed'
    }).execute()
    logger.info(f"Tika processing completed for file {file_id}")
    return {
        'artefact_id': artefact_id,
        'rel_path': rel_path,
        'processing_time': elapsed
    }
def _process_docling_task(self, task: QueueTask) -> Dict[str, Any]:
    """Process Docling document analysis task.

    Also allows routing of related task types so that page images and
    enhanced structure analysis can run under the stable docling service
    umbrella when SERVICE dispatch for new types is problematic.

    Expects ``task.payload`` to carry 'bucket', 'file_path' and 'cabinet_id',
    with optional 'config' (Docling conversion options) and 'artefact_extra'.
    Depending on the response, stores either a full zip bundle (canonical
    tasks), a single JSON artefact, or a raw-output fallback artefact.

    Returns:
        Dict with 'artefact_id' plus either 'files_count' (canonical zip
        bundles) or 'rel_path'/'processing_time' (other responses).
    Raises:
        ValueError: if DOCLING_URL is not configured.
    """
    # Soft-route additional task types through this handler
    if task.task_type in ("document_structure_analysis", "document_analysis"):
        return self.process_document_analysis_task(task)
    if task.task_type in ("generate_page_images", "page_images"):
        return self.process_page_images_task(task)
    if task.task_type in ("vlm_section_page_bundle",):
        return self.process_vlm_section_page_bundle_task(task)
    if task.task_type in ("vlm_section_bundle_collector",):
        return self.process_vlm_section_bundle_collector_task(task)
    # New unified bundle architecture handlers
    if task.task_type in ("docling_bundle",):
        return self.process_docling_bundle_task(task)
    if task.task_type in ("docling_bundle_split",):
        return self.process_docling_bundle_split_task(task)
    # phase2_coordinator task type removed - pipelines now enqueued directly from split_map task
    if not self.docling_url:
        raise ValueError("DOCLING_URL not configured")
    payload = task.payload
    file_id = task.file_id
    bucket = payload['bucket']
    file_path = payload['file_path']
    cabinet_id = payload['cabinet_id']
    task_config = payload.get('config', {})
    # Download file
    logger.debug(f"Downloading file for Docling processing: {bucket}/{file_path}")
    file_bytes = self.storage.download_file(bucket, file_path)
    # Prepare Docling request
    docling_api_key = os.getenv('DOCLING_API_KEY')
    # Accept any content type so zip/binary responses are allowed
    headers = {'Accept': '*/*'}
    if docling_api_key:
        headers['X-Api-Key'] = docling_api_key
    # Determine to_formats. For canonical docling we will request a ZIP bundle.
    to_formats_val = task_config.get('to_formats', 'json')
    to_formats_list = to_formats_val if isinstance(to_formats_val, list) else [to_formats_val]
    # 'canonical_docling*' task types get a zip bundle; everything else in-body.
    is_canonical = str(task.task_type).startswith('canonical_docling')
    target_type = task_config.get('target_type', 'zip' if is_canonical else 'inbody')
    # Build form data from config (override for canonical)
    form_data = [
        ('target_type', target_type),
        ('do_ocr', str(task_config.get('do_ocr', False)).lower()),
        ('force_ocr', str(task_config.get('force_ocr', False)).lower()),
        ('image_export_mode', 'referenced' if is_canonical else task_config.get('image_export_mode', 'embedded')),
        ('ocr_engine', task_config.get('ocr_engine', 'easyocr')),
        ('ocr_lang', task_config.get('ocr_lang', 'en')),
        ('pdf_backend', task_config.get('pdf_backend', 'dlparse_v4')),
        ('table_mode', task_config.get('table_mode', 'fast')),
        ('do_formula_enrichment', str(task_config.get('do_formula_enrichment', False)).lower()),
        ('do_code_enrichment', str(task_config.get('do_code_enrichment', False)).lower()),
        ('pipeline', task_config.get('pipeline', 'standard'))
    ]
    # Optional extra flags forwarded only when present in the config
    if 'table_cell_matching' in task_config:
        form_data.append(('table_cell_matching', str(task_config.get('table_cell_matching')).lower()))
    if 'do_picture_classification' in task_config:
        form_data.append(('do_picture_classification', str(task_config.get('do_picture_classification')).lower()))
    if 'do_picture_description' in task_config:
        form_data.append(('do_picture_description', str(task_config.get('do_picture_description')).lower()))
    if task_config.get('picture_description_prompt'):
        form_data.append(('picture_description_prompt', task_config.get('picture_description_prompt')))
    # picture_description_api and vlm_pipeline_model_api must be JSON per Docling OpenAPI
    if task_config.get('picture_description_api') is not None:
        v = task_config.get('picture_description_api')
        if isinstance(v, (dict, list)):
            form_data.append(('picture_description_api', json.dumps(v)))
        elif isinstance(v, str) and v.strip().startswith(('{', '[')):
            # Already a JSON-looking string; forward as-is.
            form_data.append(('picture_description_api', v))
        # else: omit to avoid validation error
    if task_config.get('vlm_pipeline_model'):
        form_data.append(('vlm_pipeline_model', task_config.get('vlm_pipeline_model')))
    if task_config.get('vlm_pipeline_model_api') is not None:
        v = task_config.get('vlm_pipeline_model_api')
        if isinstance(v, (dict, list)):
            form_data.append(('vlm_pipeline_model_api', json.dumps(v)))
        elif isinstance(v, str) and v.strip().startswith(('{', '[')):
            form_data.append(('vlm_pipeline_model_api', v))
        # else: omit
    if is_canonical and ('md' in to_formats_list):
        # Page-break placeholder lets the markdown be split back into pages below.
        form_data.append(('md_page_break_placeholder', task_config.get('md_page_break_placeholder', '\n\n<!-- page-break -->\n\n')))
    # Append to_formats as repeated fields (filter unsupported split pages)
    to_formats_list = [f for f in to_formats_list if f != 'html_split_page']
    for fmt in to_formats_list:
        form_data.append(('to_formats', fmt))
    # Handle page range with clamping and min/max correction
    page_range = task_config.get('page_range', [1, 999999])
    if isinstance(page_range, list) and len(page_range) >= 2:
        def _to_int_safe(v, default):
            # Coerce to int, falling back to the default on any failure.
            try:
                return int(v)
            except Exception:
                return default
        start_pg = _to_int_safe(page_range[0], 1)
        end_pg = _to_int_safe(page_range[1], 999999)
        if start_pg < 1:
            start_pg = 1
        if end_pg < start_pg:
            end_pg = start_pg
        # Clamp for frontmatter-like tasks to actual page count if possible
        if task.task_type in ('docling_frontmatter_json', 'document_structure_analysis'):
            try:
                import fitz  # PyMuPDF
                doc = fitz.open(stream=file_bytes, filetype='pdf')
                pc = int(doc.page_count)
                doc.close()
                if pc > 0:
                    end_pg = min(end_pg, pc)
                    start_pg = max(1, min(start_pg, pc))
                    if end_pg < start_pg:
                        end_pg = start_pg
            except Exception:
                # Best effort only — fall back to the uncorrected range.
                pass
        form_data.append(('page_range', str(start_pg)))
        form_data.append(('page_range', str(end_pg)))
    files = [('files', ('file', file_bytes, payload.get('mime_type', 'application/pdf')))]
    # Make request
    response = requests.post(
        f"{self.docling_url.rstrip('/')}/v1/convert/file",
        files=files,
        data=form_data,
        headers=headers,
        timeout=task.timeout
    )
    response.raise_for_status()
    content_type = (response.headers.get('Content-Type') or '').lower()
    # 'PK' magic bytes also identify a zip when the header is missing/wrong.
    is_zip_resp = ('zip' in content_type) or (response.content[:2] == b'PK')
    if is_zip_resp and is_canonical:
        # Unpack zip, store all files and a manifest
        artefact_id = str(uuid.uuid4())
        base_dir = f"{cabinet_id}/{file_id}/{artefact_id}"
        archive_path = f"{base_dir}/bundle.zip"
        # Save original archive
        self.storage.upload_file(bucket, archive_path, response.content, 'application/zip', upsert=True)
        zf = zipfile.ZipFile(io.BytesIO(response.content))
        entries = []
        md_full_path = None
        html_full_path = None
        text_full_path = None
        json_full_path = None
        images_list = []
        md_data_bytes: bytes | None = None
        for zi in zf.infolist():
            if zi.is_dir():
                continue
            # Sanitise entry names to avoid path traversal in storage keys.
            name = zi.filename.lstrip('/').replace('..', '')
            data = zf.read(zi)
            ctype = mimetypes.guess_type(name)[0] or 'application/octet-stream'
            rel = f"{base_dir}/{name}"
            self.storage.upload_file(bucket, rel, data, ctype, upsert=True)
            entries.append({
                'name': name,
                'path': rel,
                'size': zi.file_size,
                'content_type': ctype
            })
            # Detect known outputs (first match of each kind wins)
            lower = name.lower()
            if lower.endswith('.md') and md_full_path is None:
                md_full_path = rel
                md_data_bytes = data
            elif lower.endswith('.html') and html_full_path is None:
                html_full_path = rel
            elif lower.endswith('.txt') and text_full_path is None:
                text_full_path = rel
            elif lower.endswith('.json') and json_full_path is None:
                json_full_path = rel
            if ctype.startswith('image/'):
                images_list.append({'name': name, 'path': rel, 'content_type': ctype, 'size': zi.file_size})
        # Manifest describes every stored entry plus the detected primary outputs.
        manifest = {
            'file_id': file_id,
            'artefact_id': artefact_id,
            'to_formats': to_formats_list,
            'image_export_mode': 'referenced',
            'entries': entries,
            'archive_path': archive_path,
            'markdown_full': md_full_path,
            'html_full': html_full_path,
            'text_full': text_full_path,
            'json_full': json_full_path,
            'images': images_list,
            'bucket': bucket
        }
        # Create markdown pages by splitting on placeholder if available
        if md_data_bytes is not None:
            try:
                md_text = md_data_bytes.decode('utf-8', errors='replace')
                sep = task_config.get('md_page_break_placeholder', '\n\n<!-- page-break -->\n\n')
                parts = md_text.split(sep)
                if len(parts) > 1:
                    pages_dir = f"{base_dir}/md_pages"
                    pages = []
                    for i, part in enumerate(parts, start=1):
                        pth = f"{pages_dir}/page-{i:04d}.md"
                        self.storage.upload_file(bucket, pth, part.encode('utf-8'), 'text/markdown', upsert=True)
                        pages.append({'page': i, 'path': pth})
                    manifest['markdown_pages'] = pages
            except Exception as e:
                logger.warning(f"Failed creating markdown_pages for file {file_id}: {e}")
        manifest_path = f"{base_dir}/manifest.json"
        self.storage.upload_file(bucket, manifest_path, json.dumps(manifest, ensure_ascii=False).encode('utf-8'), 'application/json', upsert=True)
        # Create artefact row pointing to directory with manifest, including grouping extras for split packs
        artefact_extra = payload.get('artefact_extra') if isinstance(payload, dict) else None
        # Determine artefact type by pipeline (standard vs vlm)
        pipeline_mode = (task_config.get('pipeline') or 'standard').lower()
        artefact_type_final = 'docling_vlm' if pipeline_mode == 'vlm' else 'docling_standard'
        group_pack_type = payload.get('group_pack_type') if isinstance(payload, dict) else None
        # propagate group_id if provided (set by caller for multi-part packs)
        group_id = (artefact_extra or {}).get('group_id')
        # Compute a settings fingerprint for grouping (exclude page_range)
        try:
            import hashlib, json as _json
            cfg_for_hash = dict(task_config)
            cfg_for_hash.pop('page_range', None)
            settings_fingerprint = hashlib.sha1(_json.dumps(cfg_for_hash, sort_keys=True, ensure_ascii=False).encode('utf-8')).hexdigest()
        except Exception:
            settings_fingerprint = None
        self.client.supabase.table('document_artefacts').insert({
            'id': artefact_id,
            'file_id': file_id,
            'type': artefact_type_final,
            'rel_path': base_dir,
            'extra': {
                'manifest': manifest_path,
                'processing_time': response.elapsed.total_seconds(),
                'config': task_config,
                'group_pack_type': group_pack_type or (artefact_extra or {}).get('group_pack_type'),
                'group_id': group_id,
                'pipeline': pipeline_mode,
                'settings_fingerprint': settings_fingerprint,
                **(artefact_extra or {})
            },
            'status': 'completed'
        }).execute()
        logger.info(f"Canonical docling bundle stored for file {file_id} with {len(entries)} files")
        return {
            'artefact_id': artefact_id,
            'files_count': len(entries)
        }
    if 'application/json' in content_type or content_type.endswith('+json'):
        # In-body JSON response: store the canonical JSON as a single artefact.
        docling_json = response.json()
        artefact_id = str(uuid.uuid4())
        artefact_type = task.task_type
        rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/{artefact_type.replace('_json', '.json')}"
        self.storage.upload_file(
            bucket,
            rel_path,
            json.dumps(docling_json, ensure_ascii=False).encode('utf-8'),
            'application/json',
            upsert=True
        )
        artefact_data = {
            'id': artefact_id,
            'file_id': file_id,
            'type': artefact_type,
            'rel_path': rel_path,
            'extra': {
                'processing_time': response.elapsed.total_seconds(),
                'config': task_config,
                **({} if 'artefact_extra' not in payload else payload['artefact_extra'])
            },
            'status': 'completed'
        }
        self.client.supabase.table('document_artefacts').insert(artefact_data).execute()
    else:
        # Fallback: store raw output if server didn't return JSON (unexpected for inbody)
        artefact_id = str(uuid.uuid4())
        ext = ('html' if 'html' in content_type else ('md' if 'markdown' in content_type else ('txt' if 'text/plain' in content_type else 'bin')))
        artefact_type = f'docling_output_{ext}'
        rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/docling_output.{ext}"
        self.storage.upload_file(
            bucket,
            rel_path,
            response.content,
            # NOTE(review): ext can never be 'zip' given the choices above, so this
            # condition always falls through to the response content type.
            'application/zip' if ext == 'zip' else (content_type or 'application/octet-stream'),
            upsert=True
        )
        artefact_data = {
            'id': artefact_id,
            'file_id': file_id,
            'type': artefact_type,
            'rel_path': rel_path,
            'extra': {
                'processing_time': response.elapsed.total_seconds(),
                'config': task_config,
                'to_formats': to_formats_list,
                'content_type': content_type,
                **({} if 'artefact_extra' not in payload else payload['artefact_extra'])
            },
            'status': 'completed'
        }
        self.client.supabase.table('document_artefacts').insert(artefact_data).execute()
    # When we get canonical Docling JSON, also split out component contents into separate artefacts
    try:
        if 'application/json' in content_type or content_type.endswith('+json'):
            self._store_docling_component_artefacts(
                file_id=file_id,
                cabinet_id=cabinet_id,
                bucket=bucket,
                docling_json=docling_json,
                task_config=task_config,
                artefact_extra=payload.get('artefact_extra') if isinstance(payload, dict) else None
            )
    except Exception as split_e:
        logger.warning(f"Storing component artefacts failed for file {file_id}: {split_e}")
    # Handle optional frontpage image extraction
    # NOTE(review): docling_json is only bound on the JSON path; if a
    # frontmatter task got a non-JSON response, the NameError raised here is
    # swallowed by this try/except.
    if task.task_type == 'docling_frontmatter_json':
        try:
            self._extract_frontpage_image(docling_json, file_id, cabinet_id, bucket)
        except Exception as e:
            logger.warning(f"Frontpage image extraction failed for file {file_id}: {e}")
    logger.info(f"Docling processing completed for file {file_id}")
    # Pipeline dependencies now handle sequential execution automatically
    return {
        'artefact_id': artefact_id,
        'rel_path': rel_path,
        'processing_time': response.elapsed.total_seconds()
    }
def _extract_frontpage_image(self, docling_json: Dict[str, Any], file_id: str,
                             cabinet_id: str, bucket: str):
    """Extract and store the frontpage image from a Docling JSON payload.

    Looks for a base64-encoded image under the 'frontpage' or 'cover' keys,
    decodes it, uploads it as PNG, and records a 'docling_frontpage_image'
    artefact. Silently returns when no usable image is found.

    Args:
        docling_json: Parsed Docling response to search.
        file_id: Owning file id.
        cabinet_id: Cabinet the file belongs to (storage path prefix).
        bucket: Storage bucket name.
    """
    import base64
    # Look for frontpage image in various locations.
    cover_b64 = None
    for key in ('frontpage', 'cover'):
        section = docling_json.get(key)
        # Guard against non-dict values: the previous `'image_base64' in value`
        # check did substring matching on strings and raised TypeError on
        # other non-container types.
        if isinstance(section, dict) and section.get('image_base64'):
            cover_b64 = section['image_base64']
            break
    if not cover_b64:
        return
    # Decode and store the image.
    artefact_id = str(uuid.uuid4())
    img_bytes = base64.b64decode(cover_b64)
    rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/frontpage.png"
    self.storage.upload_file(bucket, rel_path, img_bytes, 'image/png', upsert=True)
    # Create artefact record.
    artefact_data = {
        'id': artefact_id,
        'file_id': file_id,
        'type': 'docling_frontpage_image',
        'rel_path': rel_path,
        'extra': {'extracted_from': 'docling_frontmatter'},
        'status': 'completed'
    }
    self.client.supabase.table('document_artefacts').insert(artefact_data).execute()
    logger.debug(f"Frontpage image extracted for file {file_id}")
def _store_docling_component_artefacts(self, *, file_id: str, cabinet_id: str, bucket: str, docling_json: Dict[str, Any], task_config: Dict[str, Any], artefact_extra: Optional[Dict[str, Any]] = None) -> None:
    """Create artefacts for component contents from a canonical Docling JSON.

    Stores md_content, html_content, text_content, doctags_content and
    json_content if present, each as its own file plus artefact row alongside
    the canonical JSON. Per-component failures are logged and skipped so one
    bad component does not block the others.

    Args:
        file_id: Owning file id.
        cabinet_id: Cabinet the file belongs to (storage path prefix).
        bucket: Storage bucket name.
        docling_json: Canonical Docling response; components are read from
            its 'document' object.
        task_config: Docling config used, recorded in each artefact's extra.
        artefact_extra: Optional extra fields merged into each artefact row.
    """
    doc = docling_json.get('document') or {}
    # (payload key, artefact type, stored filename, mime type, serializer)
    components = [
        ('md_content', 'docling_md', 'docling.md', 'text/markdown', lambda v: v if isinstance(v, str) else ''),
        ('html_content', 'docling_html', 'docling.html', 'text/html', lambda v: v if isinstance(v, str) else ''),
        ('text_content', 'docling_text', 'docling.txt', 'text/plain', lambda v: v if isinstance(v, str) else ''),
        ('doctags_content', 'docling_doctags', 'docling.doctags.xml', 'application/xml', lambda v: v if isinstance(v, str) else ''),
        ('json_content', 'docling_json', 'docling.json', 'application/json', lambda v: json.dumps(v or {}, ensure_ascii=False)),
    ]
    for key, art_type, filename, mime, to_bytes in components:
        if key not in doc or doc.get(key) in (None, ''):
            continue
        try:
            artefact_id = str(uuid.uuid4())
            # BUG FIX: the per-component filename was previously dropped from
            # the storage path (a literal placeholder was stored instead); use
            # it so each component lands at a correctly named object.
            rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/{filename}"
            data_bytes = to_bytes(doc.get(key))
            if isinstance(data_bytes, str):
                data_bytes = data_bytes.encode('utf-8')
            self.storage.upload_file(bucket, rel_path, data_bytes, mime, upsert=True)
            extra = {'source': 'canonical_docling_json', 'component_key': key, 'config': task_config}
            if artefact_extra:
                extra.update(artefact_extra)
            self.client.supabase.table('document_artefacts').insert({
                'id': artefact_id,
                'file_id': file_id,
                'type': art_type,
                'rel_path': rel_path,
                'extra': extra,
                'status': 'completed'
            }).execute()
        except Exception as e:
            logger.warning(f"Failed to store component '{key}' for file {file_id}: {e}")
def _process_llm_task(self, task: QueueTask) -> Dict[str, Any]:
    """Run an LLM analysis task (document classification, etc.) and return the result.

    POSTs the prompt/context from ``task.payload`` to the local LLM endpoint;
    when the payload sets 'store_result', the raw response is also persisted
    as an ``llm_<task_type>`` artefact.

    Returns:
        The JSON body returned by the LLM service.
    Raises:
        ValueError: if LLM_URL is not configured.
    """
    if not self.llm_url:
        raise ValueError("LLM_URL not configured")
    payload = task.payload
    file_id = task.file_id
    model = payload.get('model', 'default')
    # Build the generation request from the task payload.
    request_body = {
        'model': model,
        'prompt': payload['prompt'],
        'context': payload.get('context', ''),
        'max_tokens': payload.get('max_tokens', 1000),
        'temperature': payload.get('temperature', 0.1)
    }
    response = requests.post(
        f"{self.llm_url.rstrip('/')}/generate",
        json=request_body,
        headers={'Content-Type': 'application/json'},
        timeout=task.timeout
    )
    response.raise_for_status()
    llm_result = response.json()
    # Optionally persist the raw result as an artefact.
    if payload.get('store_result', False):
        bucket = payload['bucket']
        cabinet_id = payload['cabinet_id']
        artefact_id = str(uuid.uuid4())
        rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/llm_{task.task_type}.json"
        self.storage.upload_file(
            bucket,
            rel_path,
            json.dumps(llm_result, ensure_ascii=False).encode('utf-8'),
            'application/json',
            upsert=True
        )
        self.client.supabase.table('document_artefacts').insert({
            'id': artefact_id,
            'file_id': file_id,
            'type': f'llm_{task.task_type}',
            'rel_path': rel_path,
            'extra': {
                'model': model,
                'task_type': task.task_type
            },
            'status': 'completed'
        }).execute()
    logger.info(f"LLM processing completed for file {file_id}")
    return llm_result
def _process_split_map_task(self, task: QueueTask) -> Dict[str, Any]:
    """Generate the split map for a file, then enqueue docling bundle pipelines.

    Returns a summary dict with the split map's 'method', 'confidence' and
    'entries_count'; on success it also carries the enqueued pipeline info,
    and on failure a 'pipeline_error' message (the task itself still
    completes — pipeline enqueueing is best-effort).
    """
    # Local import to avoid circular imports. The previously unused
    # enqueue_canonical_docling import has been removed.
    from routers.database.files.split_map import create_split_map_for_file
    file_id = task.file_id
    # Generate split map
    split_map = create_split_map_for_file(file_id)
    logger.info(f"Split map generation completed for file {file_id}")
    # Base summary returned in every outcome (previously duplicated three times).
    summary = {
        'method': split_map['method'],
        'confidence': split_map['confidence'],
        'entries_count': len(split_map['entries'])
    }
    # NEW BUNDLE ARCHITECTURE: split map completion directly triggers bundle task creation.
    logger.info(f"NEW ARCHITECTURE: Enqueueing sequential docling bundle pipelines for file {file_id}")
    try:
        # Get file information for pipeline enqueueing
        file_result = self.client.supabase.table('files').select('*').eq('id', file_id).single().execute()
        if not file_result.data:
            logger.error(f"Could not find file {file_id} for pipeline enqueueing")
            return {**summary, 'pipeline_error': 'File not found for pipeline enqueueing'}
        file_row = file_result.data
        bucket = file_row['bucket']
        cabinet_id = file_row['cabinet_id']
        storage_path = file_row['path']
        original_mime = file_row.get('mime_type', 'application/pdf')
        # Prefer a converted PDF artefact over the original upload when available.
        try:
            arts = self.client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
            pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None)
            processing_path = pdf_art['rel_path'] if pdf_art else storage_path
            processing_mime = 'application/pdf' if pdf_art else original_mime
        except Exception:
            processing_path = storage_path
            processing_mime = original_mime
        # Prepare file data for pipeline controller
        file_data = {
            'bucket': bucket,
            'file_path': processing_path,
            'cabinet_id': cabinet_id,
            'mime_type': processing_mime
        }
        # Import and use pipeline controller to enqueue sequential pipelines
        from modules.pipeline_controller import get_pipeline_controller
        controller = get_pipeline_controller()
        pipeline_result = controller.enqueue_sequential_docling_pipelines(file_id, file_data)
        logger.info(f"Successfully enqueued {pipeline_result['total_tasks']} tasks across "
                    f"{len(pipeline_result['enqueued_pipelines'])} pipelines for file {file_id}")
        logger.info(f"Pipeline execution order: {pipeline_result['sequential_order']}")
        return {
            **summary,
            'enqueued_pipelines': pipeline_result['enqueued_pipelines'],
            'total_pipeline_tasks': pipeline_result['total_tasks'],
            'pipeline_order': pipeline_result['sequential_order']
        }
    except Exception as e:
        logger.error(f"Failed to enqueue sequential pipelines for file {file_id}: {e}")
        return {**summary, 'pipeline_error': str(e)}
def _enqueue_vlm_page_processing(self, file_id: str, threshold: int, vlm_group_id: str, vlm_model: str, base_config: dict):
    """Fan out VLM processing over the split-map sections of a file.

    Loads the file's split map and enqueues one section-level page-bundle
    task per entry. All failures are logged; nothing is raised.
    """
    from routers.database.files.files import _load_split_map
    from modules.database.supabase.utils.client import SupabaseServiceRoleClient
    from modules.database.supabase.utils.storage import StorageAdmin
    try:
        db = SupabaseServiceRoleClient()
        store = StorageAdmin()
        # Resolve the file row; nothing to do if it no longer exists.
        file_res = db.supabase.table('files').select('*').eq('id', file_id).single().execute()
        if not file_res.data:
            logger.error(f"File {file_id} not found for VLM page processing")
            return
        bucket = file_res.data['bucket']
        # The split map defines the section boundaries we fan out over.
        split_map = _load_split_map(db, store, bucket, file_id)
        if not split_map:
            logger.warning(f"No split map found for VLM page processing file {file_id}")
            return
        entries = split_map.get('entries', [])
        if not entries:
            logger.warning(f"Empty split map entries for VLM page processing file {file_id}")
            return
        logger.info(f"[auto-canonical] VLM page processing: found {len(entries)} sections for file {file_id}")
        # One bundle task per section; a failing section is skipped, not fatal.
        for section_idx, section in enumerate(entries, 1):
            try:
                start_page = int(section.get('start_page', 1))
                end_page = int(section.get('end_page', start_page))
                section_title = section.get('title', f'Section {section_idx}')
                logger.info(f"[auto-canonical] VLM page processing section {section_idx}: '{section_title}' pages {start_page}-{end_page}")
                self._enqueue_vlm_section_page_bundle(
                    file_id, section_idx, start_page, end_page, section_title,
                    vlm_group_id, vlm_model, base_config, len(entries)
                )
            except Exception as section_e:
                logger.warning(f"Failed to process VLM section {section_idx} for file {file_id}: {section_e}")
                continue
    except Exception as e:
        logger.error(f"VLM page processing setup failed for file {file_id}: {e}")
def _enqueue_vlm_section_page_bundle(self, file_id: str, section_idx: int, start_page: int, end_page: int,
                                     section_title: str, vlm_group_id: str, vlm_model: str,
                                     base_config: dict, total_sections: int):
    """Enqueue a 'vlm_section_page_bundle' task covering one split-map section.

    Enqueue failures are logged, never raised, so one section cannot abort
    the fan-out for the rest of the file.
    """
    from modules.queue_system import enqueue_docling_task, TaskPriority
    # Everything the downstream page-bundle handler needs, in one payload.
    bundle_payload = {
        'section_idx': section_idx,
        'start_page': start_page,
        'end_page': end_page,
        'section_title': section_title,
        'vlm_group_id': vlm_group_id,
        'vlm_model': vlm_model,
        'base_config': base_config,
        'total_sections': total_sections,
        'producer': 'auto_split'
    }
    try:
        section_task_id = enqueue_docling_task(
            file_id=file_id,
            task_type='vlm_section_page_bundle',
            payload=bundle_payload,
            priority=TaskPriority.NORMAL,
            timeout=3600  # 1 hour for page-by-page processing
        )
        logger.info(f"[auto-canonical] VLM section page bundle task {section_task_id} for section {section_idx} of file {file_id}")
    except Exception as e:
        logger.error(f"Failed to enqueue VLM section page bundle for section {section_idx} file {file_id}: {e}")
def process_document_analysis_task(self, task: QueueTask) -> Dict[str, Any]:
    """Process document structure analysis task.

    Builds a document outline hierarchy from the PDF bytes, optionally
    enriched with previously produced Tika and Docling JSON artefacts, and
    stores the result as a 'document_outline_hierarchy' artefact.

    Returns:
        Dict with 'sections_count' for the produced outline.
    Raises:
        Re-raises any analysis/storage failure after logging it.
    """
    file_id = task.file_id
    payload = task.payload
    logger.info(f"Processing document analysis task for file {file_id}")
    try:
        # Load file from storage
        bucket = payload['bucket']
        file_path = payload['file_path']
        cabinet_id = payload['cabinet_id']
        file_bytes = self.storage.download_file(bucket, file_path)
        # Load existing artefacts if available (both inputs are optional)
        client = SupabaseServiceRoleClient()
        artefacts = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute()
        tika_json = None
        docling_json = None
        for art in artefacts.data:
            if art['type'] == 'tika_json' and art['status'] == 'completed':
                try:
                    tika_data = self.storage.download_file(bucket, art['rel_path'])
                    tika_json = json.loads(tika_data.decode('utf-8'))
                except Exception as e:
                    logger.warning(f"Failed to load Tika JSON for analysis: {e}")
            elif art['type'] in ['docling_frontmatter_json', 'docling_noocr_json'] and art['status'] == 'completed':
                try:
                    docling_data = self.storage.download_file(bucket, art['rel_path'])
                    docling_json = json.loads(docling_data.decode('utf-8'))
                    break  # Use first available Docling result
                except Exception as e:
                    logger.warning(f"Failed to load Docling JSON for analysis: {e}")
        # Import here to avoid circular imports
        from modules.document_analysis import create_document_outline_hierarchy_artefact
        # Create document analysis
        analysis_data = create_document_outline_hierarchy_artefact(
            file_id=file_id,
            pdf_bytes=file_bytes,
            tika_json=tika_json,
            docling_json=docling_json
        )
        # Store analysis as artefact (insert row first, then upload file)
        artefact_id = analysis_data.get('artefact_id') or str(uuid.uuid4())
        analysis_data['artefact_id'] = artefact_id
        rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/document_outline_hierarchy.json"
        # Insert row first to avoid orphaned files if DB insert fails
        # Insert artefact record with processing status
        sections_count = len(analysis_data.get('sections', []) or [])
        metadata = analysis_data.get('metadata') or {}
        analysis_methods = metadata.get('analysis_methods')
        self.client.supabase.table('document_artefacts').insert({
            'id': artefact_id,
            'file_id': file_id,
            'type': 'document_outline_hierarchy',
            'rel_path': rel_path,
            'extra': {
                'sections_count': sections_count,
                'analysis_methods': analysis_methods
            },
            'status': 'processing'
        }).execute()
        # Now upload the file
        analysis_json = json.dumps(analysis_data, ensure_ascii=False)
        self.storage.upload_file(bucket, rel_path, analysis_json.encode('utf-8'), 'application/json', upsert=True)
        # Mark artefact as completed
        self.client.supabase.table('document_artefacts').update({
            'status': 'completed'
        }).eq('id', artefact_id).execute()
        logger.info(f"Document analysis completed for file {file_id} (sections={sections_count})")
        return {
            'sections_count': sections_count
        }
    except Exception as e:
        logger.error(f"Document analysis failed for file {file_id}: {e}")
        raise
def process_page_images_task(self, task: QueueTask) -> Dict[str, Any]:
    """Generate per-page images (full PNG + WebP thumbnail) for a document.

    Downloads the source file, renders page images via
    ``create_page_images_artefact``, uploads every image and a JSON
    manifest to storage, then records a completed ``page_images``
    artefact row.

    Args:
        task: Queue task whose payload carries ``bucket``, ``file_path``
            and ``cabinet_id``.

    Returns:
        Dict with ``page_count`` and ``estimated_storage_mb``.

    Raises:
        Exception: Any download/render/upload/DB failure is logged and
            re-raised so the queue can retry.
    """
    file_id = task.file_id
    payload = task.payload
    logger.info(f"Processing page images task for file {file_id}")
    try:
        # Load file from storage
        bucket = payload['bucket']
        file_path = payload['file_path']
        cabinet_id = payload['cabinet_id']
        file_bytes = self.storage.download_file(bucket, file_path)
        # Import here to avoid circular imports
        from modules.page_image_generator import create_page_images_artefact
        # Generate page images
        images_data = create_page_images_artefact(
            file_id=file_id,
            cabinet_id=cabinet_id,
            pdf_bytes=file_bytes
        )
        artefact_id = images_data['artefact_id']
        # Include bucket in manifest for client-side signed URL generation
        images_data['bucket'] = bucket
        # Upload all page images to storage. The raw bytes are popped off
        # each page entry so they are not serialized into the manifest.
        for page_info in images_data['page_images']:
            # Upload full image
            full_path = page_info['full_image_path']
            full_data = page_info.pop('full_image_data')  # Remove from JSON
            self.storage.upload_file(bucket, full_path, full_data, 'image/png', upsert=True)
            # Upload thumbnail
            thumb_path = page_info['thumbnail_path']
            thumb_data = page_info.pop('thumbnail_data')  # Remove from JSON
            self.storage.upload_file(bucket, thumb_path, thumb_data, 'image/webp', upsert=True)
        # Store images metadata manifest under the artefact directory
        artefact_dir = f"{cabinet_id}/{file_id}/{artefact_id}"
        manifest_rel_path = f"{artefact_dir}/page_images.json"
        images_json = json.dumps(images_data, ensure_ascii=False)
        self.storage.upload_file(bucket, manifest_rel_path, images_json.encode('utf-8'), 'application/json', upsert=True)
        # Insert artefact record — reuse the shared service-role client
        # rather than constructing a new one per task.
        self.client.supabase.table('document_artefacts').insert({
            'id': artefact_id,
            'file_id': file_id,
            'type': 'page_images',
            # Store the directory prefix as rel_path for hybrid approach
            'rel_path': artefact_dir,
            'extra': {
                'page_count': images_data['page_count'],
                'total_full_images': images_data['storage_info']['total_full_images'],
                'total_thumbnails': images_data['storage_info']['total_thumbnails'],
                'estimated_storage_mb': images_data['storage_info']['estimated_storage_mb'],
                'manifest': manifest_rel_path
            },
            'status': 'completed'
        }).execute()
        logger.info(f"Page images generation completed for file {file_id}")
        return {
            'page_count': images_data['page_count'],
            'estimated_storage_mb': images_data['storage_info']['estimated_storage_mb']
        }
    except Exception as e:
        logger.error(f"Page images generation failed for file {file_id}: {e}")
        raise
def process_comparison_analysis_task(self, task: QueueTask) -> Dict[str, Any]:
    """Process comparison analysis between no-OCR and OCR docling results.

    Validates that both docling artefact groups (identified by
    ``no_ocr_group_id`` / ``ocr_group_id`` in the payload) are fully
    completed, then delegates the actual diffing to
    ``_compare_docling_groups``. When parts are still missing this
    method raises ``ValueError`` instances carrying extra attributes
    (``current_progress``, ``is_progress_retry`` etc.) that the retry
    mechanism inspects to decide how to back off — do not reorder the
    validation branches below, their precedence is deliberate.

    Returns:
        Dict from ``_compare_docling_groups`` with the report artefact id
        and summary statistics.

    Raises:
        ValueError: When group ids are missing, the file row is absent,
            or artefacts are incomplete/misaligned (retryable cases).
    """
    file_id = task.file_id
    payload = task.payload
    logger.info(f"Processing comparison analysis task for file {file_id}")
    try:
        no_ocr_group_id = payload.get('no_ocr_group_id')
        ocr_group_id = payload.get('ocr_group_id')
        comparison_type = payload.get('comparison_type', 'noocr_vs_ocr')
        initial_delay = payload.get('initial_delay_seconds', 0)
        # If this is the first execution and we have an initial delay, sleep briefly
        if initial_delay > 0:
            import time
            logger.info(f"Comparison analysis: applying initial delay of {min(initial_delay, 60)} seconds for file {file_id}")
            time.sleep(min(initial_delay, 60))  # Max 1 minute delay per attempt
            logger.info(f"Comparison analysis: delay complete for file {file_id}")
        if not no_ocr_group_id or not ocr_group_id:
            raise ValueError("Missing group_id parameters for comparison")
        client = SupabaseServiceRoleClient()
        # Find file info
        fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
        if not fr.data:
            raise ValueError(f"File {file_id} not found")
        file_row = fr.data
        bucket = file_row['bucket']
        cabinet_id = file_row['cabinet_id']
        # Find artefacts for both groups
        artefacts = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute()
        arts = artefacts.data or []
        # Filter artefacts by group_id and type, including status
        no_ocr_arts = [a for a in arts if ((a.get('extra') or {}).get('group_id') == no_ocr_group_id and
                                           a.get('type') == 'docling_standard' and
                                           a.get('status') == 'completed')]
        ocr_arts = [a for a in arts if ((a.get('extra') or {}).get('group_id') == ocr_group_id and
                                        a.get('type') == 'docling_standard' and
                                        a.get('status') == 'completed')]
        # Also check pending/processing artefacts to understand timing better
        no_ocr_pending = [a for a in arts if ((a.get('extra') or {}).get('group_id') == no_ocr_group_id and
                                              a.get('type') == 'docling_standard' and
                                              a.get('status') in ('processing', 'pending'))]
        ocr_pending = [a for a in arts if ((a.get('extra') or {}).get('group_id') == ocr_group_id and
                                           a.get('type') == 'docling_standard' and
                                           a.get('status') in ('processing', 'pending'))]
        # Determine expected total parts from split_total metadata (if available).
        # Checked in priority order: completed artefacts first, then pending.
        expected_parts = None
        if no_ocr_arts:
            expected_parts = (no_ocr_arts[0].get('extra') or {}).get('split_total')
        elif ocr_arts:
            expected_parts = (ocr_arts[0].get('extra') or {}).get('split_total')
        elif no_ocr_pending:
            expected_parts = (no_ocr_pending[0].get('extra') or {}).get('split_total')
        elif ocr_pending:
            expected_parts = (ocr_pending[0].get('extra') or {}).get('split_total')
        logger.info(f"Comparison analysis: found {len(no_ocr_arts)} completed no-OCR artefacts ({len(no_ocr_pending)} pending), {len(ocr_arts)} completed OCR artefacts ({len(ocr_pending)} pending), expected_parts={expected_parts}")
        # Enhanced validation with progress-aware retry logic
        if expected_parts is not None:
            # We know how many parts to expect, so wait for all of them
            total_no_ocr = len(no_ocr_arts) + len(no_ocr_pending)
            total_ocr = len(ocr_arts) + len(ocr_pending)
            # Calculate completion percentages
            no_ocr_completion = len(no_ocr_arts) / expected_parts * 100
            ocr_completion = len(ocr_arts) / expected_parts * 100
            # Check if we're making progress (store in task metadata for persistence)
            progress_key = f"comparison_progress_{file_id}"
            current_progress = {
                'no_ocr_completed': len(no_ocr_arts),
                'ocr_completed': len(ocr_arts),
                'no_ocr_pending': len(no_ocr_pending),
                'ocr_pending': len(ocr_pending)
            }
            # Get previous progress from payload (injected by retry mechanism)
            previous_progress = payload.get('previous_progress', {'no_ocr_completed': 0, 'ocr_completed': 0})
            progress_made = (current_progress['no_ocr_completed'] > previous_progress['no_ocr_completed'] or
                             current_progress['ocr_completed'] > previous_progress['ocr_completed'])
            if len(no_ocr_arts) < expected_parts or len(ocr_arts) < expected_parts:
                if len(no_ocr_pending) > 0 or len(ocr_pending) > 0:
                    # Still processing - this is expected, always retry
                    error_msg = f"PROGRESS_RETRY: expected={expected_parts}, no_ocr={len(no_ocr_arts)}/{expected_parts} ({no_ocr_completion:.1f}%), ocr={len(ocr_arts)}/{expected_parts} ({ocr_completion:.1f}%), pending: no_ocr={len(no_ocr_pending)}, ocr={len(ocr_pending)}"
                    progress_retry_error = ValueError(error_msg)
                    # Attributes below are read by the retry mechanism, not by Python itself.
                    progress_retry_error.current_progress = current_progress
                    progress_retry_error.is_progress_retry = True
                    raise progress_retry_error
                elif progress_made:
                    # No pending but made progress since last check - likely brief gap between completions
                    error_msg = f"PROGRESS_MADE_RETRY: expected={expected_parts}, no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)}, progress since last check"
                    progress_retry_error = ValueError(error_msg)
                    progress_retry_error.current_progress = current_progress
                    progress_retry_error.is_progress_retry = True
                    raise progress_retry_error
                else:
                    # No progress and no pending - likely stalled, but still retry with backoff
                    error_msg = f"STALLED_RETRY: expected={expected_parts}, no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - no pending tasks but will retry"
                    stalled_retry_error = ValueError(error_msg)
                    stalled_retry_error.current_progress = current_progress
                    stalled_retry_error.is_stalled_retry = True
                    raise stalled_retry_error
            # Also verify both groups have the same number of completed parts
            if len(no_ocr_arts) != len(ocr_arts):
                error_msg = f"ALIGNMENT_RETRY: no_ocr completed={len(no_ocr_arts)}, ocr completed={len(ocr_arts)} (expected {expected_parts} each) - waiting for alignment"
                alignment_retry_error = ValueError(error_msg)
                alignment_retry_error.current_progress = current_progress
                alignment_retry_error.is_alignment_retry = True
                raise alignment_retry_error
        else:
            # Fallback to original logic when split_total not available
            if not no_ocr_arts or not ocr_arts:
                # More detailed retry logic with pending artefact awareness
                if len(no_ocr_arts) == 0 and len(ocr_arts) == 0:
                    if len(no_ocr_pending) > 0 or len(ocr_pending) > 0:
                        raise ValueError(f"Batches still processing: no_ocr completed={len(no_ocr_arts)} pending={len(no_ocr_pending)}, ocr completed={len(ocr_arts)} pending={len(ocr_pending)} - will retry")
                    else:
                        raise ValueError(f"No artefacts found for either group: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - may need more time")
                elif len(ocr_arts) == 0:
                    if len(ocr_pending) > 0:
                        raise ValueError(f"OCR batch still processing: no_ocr completed={len(no_ocr_arts)}, ocr completed={len(ocr_arts)} pending={len(ocr_pending)} - will retry")
                    else:
                        raise ValueError(f"OCR batch appears incomplete: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - will retry")
                elif len(no_ocr_arts) == 0:
                    if len(no_ocr_pending) > 0:
                        raise ValueError(f"No-OCR batch still processing: no_ocr completed={len(no_ocr_arts)} pending={len(no_ocr_pending)}, ocr completed={len(ocr_arts)} - will retry")
                    else:
                        raise ValueError(f"No-OCR batch appears incomplete: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - will retry")
                else:
                    raise ValueError(f"Unexpected missing artefacts: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)}")
            # For fallback case, ensure both groups have same count
            if len(no_ocr_arts) != len(ocr_arts):
                raise ValueError(f"Mismatched group sizes: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - will retry")
        # Sort both groups by split_order for aligned comparison
        no_ocr_arts.sort(key=lambda x: ((x.get('extra') or {}).get('split_order') or 0))
        ocr_arts.sort(key=lambda x: ((x.get('extra') or {}).get('split_order') or 0))
        # Log final validation before proceeding
        no_ocr_orders = [((a.get('extra') or {}).get('split_order') or 0) for a in no_ocr_arts]
        ocr_orders = [((a.get('extra') or {}).get('split_order') or 0) for a in ocr_arts]
        logger.info(f"Proceeding with comparison: no_ocr split_orders={no_ocr_orders}, ocr split_orders={ocr_orders}, expected_parts={expected_parts}")
        # Create comparison results
        comparison_results = self._compare_docling_groups(
            file_id, bucket, cabinet_id, no_ocr_arts, ocr_arts, comparison_type,
            no_ocr_group_id, ocr_group_id, payload
        )
        return comparison_results
    except Exception as e:
        logger.error(f"Comparison analysis failed for file {file_id}: {e}")
        raise
def _compare_docling_groups(self, file_id: str, bucket: str, cabinet_id: str,
                            no_ocr_arts: list, ocr_arts: list, comparison_type: str,
                            no_ocr_group_id: str, ocr_group_id: str, payload: Dict[str, Any]) -> Dict[str, Any]:
    """Compare two groups of docling artefacts and generate analysis.

    Pairs the i-th no-OCR artefact with the i-th OCR artefact (callers
    sort both lists by split_order beforehand), diffs each pair's
    canonical JSON via ``_compare_json_content``, uploads a combined
    report, and records a ``docling_comparison_analysis`` artefact row.

    Returns:
        Dict with the report ``artefact_id`` and summary statistics.

    Raises:
        Exception: Propagates report-level failures; per-part failures
            are logged and counted in the statistics instead.
    """
    from datetime import datetime, timezone

    logger.info(f"Starting detailed comparison for file {file_id}: {len(no_ocr_arts)} vs {len(ocr_arts)} artefacts")
    artefact_id = str(uuid.uuid4())
    comparison_dir = f"{cabinet_id}/{file_id}/{artefact_id}"
    results = []
    overall_stats = {
        'total_comparisons': min(len(no_ocr_arts), len(ocr_arts)),
        'successful_comparisons': 0,
        'failed_comparisons': 0,
        'differences_found': 0,
        'identical_count': 0
    }
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            for i in range(min(len(no_ocr_arts), len(ocr_arts))):
                no_ocr_art = no_ocr_arts[i]
                ocr_art = ocr_arts[i]
                try:
                    # Download manifest JSONs for both artefacts
                    no_ocr_manifest_path = ((no_ocr_art.get('extra') or {}).get('manifest'))
                    ocr_manifest_path = ((ocr_art.get('extra') or {}).get('manifest'))
                    if not no_ocr_manifest_path or not ocr_manifest_path:
                        logger.warning(f"Missing manifest paths for comparison {i+1}")
                        continue
                    no_ocr_manifest_data = self.storage.download_file(bucket, no_ocr_manifest_path)
                    ocr_manifest_data = self.storage.download_file(bucket, ocr_manifest_path)
                    no_ocr_manifest = json.loads(no_ocr_manifest_data.decode('utf-8'))
                    ocr_manifest = json.loads(ocr_manifest_data.decode('utf-8'))
                    # Compare JSON content if available
                    no_ocr_json_path = no_ocr_manifest.get('json_full')
                    ocr_json_path = ocr_manifest.get('json_full')
                    if no_ocr_json_path and ocr_json_path:
                        comparison_result = self._compare_json_content(
                            bucket, no_ocr_json_path, ocr_json_path, temp_dir, i + 1
                        )
                        comparison_result['no_ocr_artefact_id'] = no_ocr_art['id']
                        comparison_result['ocr_artefact_id'] = ocr_art['id']
                        comparison_result['split_order'] = (no_ocr_art.get('extra') or {}).get('split_order', i + 1)
                        comparison_result['split_heading'] = (no_ocr_art.get('extra') or {}).get('split_heading', f'Part {i+1}')
                        results.append(comparison_result)
                        overall_stats['successful_comparisons'] += 1
                        if comparison_result['has_differences']:
                            overall_stats['differences_found'] += 1
                        else:
                            overall_stats['identical_count'] += 1
                    else:
                        logger.warning(f"Missing JSON content paths for comparison {i+1}")
                        overall_stats['failed_comparisons'] += 1
                except Exception as part_e:
                    logger.warning(f"Failed to compare part {i+1}: {part_e}")
                    overall_stats['failed_comparisons'] += 1
                    continue
        # Create comprehensive comparison report
        comparison_report = {
            'file_id': file_id,
            'comparison_type': comparison_type,
            # Real UTC timestamp. The previous code stored the literal
            # string '{"created_at": "now()"}' via json.dumps, which was
            # never evaluated by any database.
            'timestamp': datetime.now(timezone.utc).isoformat(),
            'overall_statistics': overall_stats,
            'detailed_results': results,
            'summary': {
                'total_parts_compared': overall_stats['successful_comparisons'],
                'identical_parts': overall_stats['identical_count'],
                'different_parts': overall_stats['differences_found'],
                'accuracy_percentage': (overall_stats['identical_count'] / max(overall_stats['successful_comparisons'], 1)) * 100
            }
        }
        # Store comparison report as artefact
        report_path = f"{comparison_dir}/comparison_report.json"
        report_json = json.dumps(comparison_report, ensure_ascii=False, indent=2)
        self.storage.upload_file(bucket, report_path, report_json.encode('utf-8'), 'application/json', upsert=True)
        # Create artefact record (reuse the shared service-role client)
        self.client.supabase.table('document_artefacts').insert({
            'id': artefact_id,
            'file_id': file_id,
            'type': 'docling_comparison_analysis',
            'rel_path': report_path,
            'extra': {
                'comparison_type': comparison_type,
                'no_ocr_group_id': no_ocr_group_id,
                'ocr_group_id': ocr_group_id,
                'producer': payload.get('producer', 'auto_split'),
                'total_comparisons': overall_stats['total_comparisons'],
                'successful_comparisons': overall_stats['successful_comparisons'],
                'differences_found': overall_stats['differences_found'],
                'accuracy_percentage': comparison_report['summary']['accuracy_percentage']
            },
            'status': 'completed'
        }).execute()
        logger.info(f"Comparison analysis completed for file {file_id}: {overall_stats['successful_comparisons']} comparisons, {overall_stats['differences_found']} differences found")
        # Trigger VLM processing after comparison completes (if enabled)
        self._trigger_vlm_after_comparison(file_id, payload)
        return {
            'artefact_id': artefact_id,
            'comparisons_completed': overall_stats['successful_comparisons'],
            'differences_found': overall_stats['differences_found'],
            'accuracy_percentage': comparison_report['summary']['accuracy_percentage']
        }
    except Exception as e:
        logger.error(f"Failed to create comparison analysis for file {file_id}: {e}")
        raise
def _compare_json_content(self, bucket: str, no_ocr_path: str, ocr_path: str,
                          temp_dir: str, part_number: int) -> Dict[str, Any]:
    """Compare two stored JSON documents using ``jq`` and ``diff``.

    Both files are downloaded, key-sorted with ``jq --sort-keys`` so that
    semantically-equal JSON diffs clean, then compared with ``diff -u``.

    Returns:
        Dict describing the comparison: ``has_differences``,
        ``diff_lines_count``, a 1000-char diff preview and size info.
        On tool/download failure, a dict with ``has_differences=True``,
        an ``error`` message and ``diff_lines_count=-1`` (never raises).
    """
    import subprocess
    try:
        # Download both JSON files
        no_ocr_data = self.storage.download_file(bucket, no_ocr_path)
        ocr_data = self.storage.download_file(bucket, ocr_path)
        # Save to temp files
        no_ocr_file = Path(temp_dir) / f'no_ocr_part_{part_number}.json'
        ocr_file = Path(temp_dir) / f'ocr_part_{part_number}.json'
        no_ocr_file.write_bytes(no_ocr_data)
        ocr_file.write_bytes(ocr_data)
        sorted_no_ocr = Path(temp_dir) / f'sorted_no_ocr_part_{part_number}.json'
        sorted_ocr = Path(temp_dir) / f'sorted_ocr_part_{part_number}.json'
        # Sort both files using jq. Output handles are managed with
        # context managers so they are always closed (the previous
        # bare open(...) calls leaked file descriptors).
        with open(sorted_no_ocr, 'w') as out:
            subprocess.run(['jq', '--sort-keys', '.', str(no_ocr_file)],
                           stdout=out, stderr=subprocess.DEVNULL, check=True)
        with open(sorted_ocr, 'w') as out:
            subprocess.run(['jq', '--sort-keys', '.', str(ocr_file)],
                           stdout=out, stderr=subprocess.DEVNULL, check=True)
        # Compare using diff (returncode 0 = identical, 1 = different)
        diff_output = Path(temp_dir) / f'diff_part_{part_number}.txt'
        with open(diff_output, 'w') as out:
            diff_result = subprocess.run(
                ['diff', '-u', str(sorted_no_ocr), str(sorted_ocr)],
                stdout=out,
                stderr=subprocess.DEVNULL,
                text=True
            )
        # Read diff output
        diff_content = diff_output.read_text()
        # Analyze differences: count changed lines, excluding the
        # '+++' / '---' file headers of the unified diff format.
        has_differences = diff_result.returncode != 0
        diff_lines = len([l for l in diff_content.split('\n') if l.startswith(('+', '-')) and not l.startswith(('+++', '---'))])
        return {
            'part_number': part_number,
            'has_differences': has_differences,
            'diff_lines_count': diff_lines,
            'diff_content_preview': diff_content[:1000] if diff_content else '',  # First 1000 chars
            'no_ocr_size': len(no_ocr_data),
            'ocr_size': len(ocr_data),
            'size_difference': abs(len(ocr_data) - len(no_ocr_data))
        }
    except subprocess.CalledProcessError as e:
        logger.warning(f"jq/diff command failed for part {part_number}: {e}")
        return {
            'part_number': part_number,
            'has_differences': True,
            'error': f"Comparison tools failed: {str(e)}",
            'diff_lines_count': -1
        }
    except Exception as e:
        logger.warning(f"JSON comparison failed for part {part_number}: {e}")
        return {
            'part_number': part_number,
            'has_differences': True,
            'error': f"Comparison failed: {str(e)}",
            'diff_lines_count': -1
        }
def process_vlm_section_page_bundle_task(self, task: QueueTask) -> Dict[str, Any]:
    """Fan out per-page VLM docling tasks for one document section.

    For each page in ``[start_page, end_page]`` a ``canonical_docling_json``
    task is enqueued with a VLM pipeline config, then a follow-up
    ``vlm_section_bundle_collector`` task is scheduled to combine the
    page results into a section-level bundle.

    Returns:
        Dict with the section index, number of page tasks created, the
        collector task id and the page range string.

    Raises:
        ValueError: If no page task could be enqueued at all.
    """
    # Import locally (avoids circular imports) but only once per call —
    # previously this was re-imported on every loop iteration.
    from modules.queue_system import enqueue_docling_task, TaskPriority
    import time

    file_id = task.file_id
    payload = task.payload
    logger.info(f"Processing VLM section page bundle task for file {file_id}")
    try:
        section_idx = payload.get('section_idx')
        start_page = payload.get('start_page')
        end_page = payload.get('end_page')
        section_title = payload.get('section_title', f'Section {section_idx}')
        vlm_group_id = payload.get('vlm_group_id')
        vlm_model = payload.get('vlm_model', 'smoldocling')
        base_config = payload.get('base_config', {})
        total_sections = payload.get('total_sections', 1)
        client = SupabaseServiceRoleClient()
        # Get file info
        fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
        if not fr.data:
            raise ValueError(f"File {file_id} not found")
        file_row = fr.data
        bucket = file_row['bucket']
        cabinet_id = file_row['cabinet_id']
        # Find processing path (prefer converted PDF)
        arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
        pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None)
        processing_path = pdf_art['rel_path'] if pdf_art else file_row['path']
        processing_mime = 'application/pdf'
        logger.info(f"VLM section bundle: processing section {section_idx} '{section_title}' pages {start_page}-{end_page} for file {file_id}")
        # Create individual page processing tasks
        page_task_ids = []
        for page_num in range(start_page, end_page + 1):
            try:
                page_config = {
                    **base_config,
                    'do_ocr': False,
                    'force_ocr': False,
                    'pipeline': 'vlm',
                    'vlm_pipeline_model': vlm_model,
                    'page_range': [page_num, page_num],
                    'target_type': 'zip',
                    'image_export_mode': 'referenced',
                    # Add required VLM parameters that may be missing
                    'do_picture_classification': False,
                    'do_picture_description': False
                }
                logger.debug(f"VLM page {page_num} config: pipeline={page_config.get('pipeline')}, model={page_config.get('vlm_pipeline_model')}, range={page_config.get('page_range')}")
                page_task_id = enqueue_docling_task(
                    file_id=file_id,
                    task_type='canonical_docling_json',
                    payload={
                        'bucket': bucket,
                        'file_path': processing_path,
                        'cabinet_id': cabinet_id,
                        'mime_type': processing_mime,
                        'config': page_config,
                        'artefact_extra': {
                            'is_subdoc': True,
                            'page_range': [page_num, page_num],
                            'label': f'{section_title} - Page {page_num}',
                            'vlm_section_idx': section_idx,
                            'vlm_section_title': section_title,
                            'vlm_page_number': page_num,
                            'vlm_section_start': start_page,
                            'vlm_section_end': end_page,
                            'producer': 'auto_split_vlm_page'
                        }
                    },
                    priority=TaskPriority.NORMAL,
                    timeout=1800
                )
                page_task_ids.append((page_num, page_task_id))
                logger.debug(f"Enqueued VLM page task {page_task_id} for page {page_num} of section {section_idx}")
            except Exception as page_e:
                logger.warning(f"Failed to enqueue VLM page {page_num} for section {section_idx} file {file_id}: {page_e}")
                continue
        if not page_task_ids:
            raise ValueError(f"No page tasks could be enqueued for section {section_idx}")
        # Wait for all page tasks to complete and then create section bundle
        logger.info(f"Enqueued {len(page_task_ids)} VLM page tasks for section {section_idx}, now waiting for completion...")
        # Wait a bit for page tasks to start, then create the collector
        # task that bundles the completed page results.
        time.sleep(10)
        bundle_task_id = enqueue_docling_task(
            file_id=file_id,
            task_type='vlm_section_bundle_collector',
            payload={
                'section_idx': section_idx,
                'start_page': start_page,
                'end_page': end_page,
                'section_title': section_title,
                'vlm_group_id': vlm_group_id,
                'vlm_model': vlm_model,
                'total_sections': total_sections,
                'producer': 'auto_split',
                'page_task_ids': [tid for _, tid in page_task_ids],
                'expected_pages': list(range(start_page, end_page + 1))
            },
            priority=TaskPriority.LOW,  # Run after page tasks
            timeout=3600
        )
        logger.info(f"Created VLM section bundle collector task {bundle_task_id} for section {section_idx}")
        return {
            'section_idx': section_idx,
            'page_tasks_created': len(page_task_ids),
            'bundle_task_id': bundle_task_id,
            'pages_range': f"{start_page}-{end_page}"
        }
    except Exception as e:
        logger.error(f"VLM section page bundle task failed for file {file_id}: {e}")
        raise
def process_vlm_section_bundle_collector_task(self, task: QueueTask) -> Dict[str, Any]:
    """Collect completed VLM page results and create a section-level bundle.

    Waits (via retryable ``ValueError``) until every expected page of the
    section has a completed ``docling_vlm`` artefact, then writes a
    section manifest to storage and records a ``vlm_section_page_bundle``
    artefact row.

    Returns:
        Dict with the section artefact id, section index, number of
        pages bundled and the manifest path.

    Raises:
        ValueError: If the file row is missing, or if pages are still
            missing (retryable — the queue re-runs the task).
    """
    from datetime import datetime, timezone

    file_id = task.file_id
    payload = task.payload
    logger.info(f"Processing VLM section bundle collector for file {file_id}")
    try:
        section_idx = payload.get('section_idx')
        start_page = payload.get('start_page')
        end_page = payload.get('end_page')
        section_title = payload.get('section_title', f'Section {section_idx}')
        vlm_group_id = payload.get('vlm_group_id')
        vlm_model = payload.get('vlm_model', 'smoldocling')
        total_sections = payload.get('total_sections', 1)
        expected_pages = payload.get('expected_pages', [])
        client = SupabaseServiceRoleClient()
        # Get file info
        fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
        if not fr.data:
            raise ValueError(f"File {file_id} not found")
        file_row = fr.data
        bucket = file_row['bucket']
        cabinet_id = file_row['cabinet_id']
        # Find all completed VLM page artefacts for this section
        artefacts = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute()
        arts = artefacts.data or []
        # Filter for this section's VLM page artefacts
        section_page_arts = []
        for art in arts:
            extra = art.get('extra', {})
            if (extra.get('vlm_section_idx') == section_idx and
                    extra.get('producer') == 'auto_split_vlm_page' and
                    art.get('type') == 'docling_vlm' and
                    art.get('status') == 'completed'):
                section_page_arts.append(art)
        # Check if we have all expected pages
        found_pages = [art.get('extra', {}).get('vlm_page_number') for art in section_page_arts]
        found_pages = [p for p in found_pages if p is not None]
        missing_pages = [p for p in expected_pages if p not in found_pages]
        logger.info(f"VLM section {section_idx} bundle collector: found {len(section_page_arts)} page artefacts, expected {len(expected_pages)} pages")
        if logger.isEnabledFor(10):  # 10 == logging.DEBUG
            found_pages_debug = [art.get('extra', {}).get('vlm_page_number') for art in section_page_arts]
            logger.debug(f"VLM section {section_idx}: found pages {found_pages_debug}, expected pages {expected_pages}")
        if missing_pages:
            # Not all pages are ready, retry later
            logger.info(f"VLM section {section_idx} bundle collector: missing pages {missing_pages}, found pages {found_pages} - will retry later")
            raise ValueError(f"VLM section {section_idx} missing pages: {missing_pages} (found: {found_pages}) - will retry")
        # Sort page artefacts by page number
        section_page_arts.sort(key=lambda x: x.get('extra', {}).get('vlm_page_number', 0))
        logger.info(f"VLM section {section_idx} bundle: creating manifest for {len(section_page_arts)} pages")
        # Create section bundle manifest
        section_artefact_id = str(uuid.uuid4())
        section_manifest_path = f"{cabinet_id}/{file_id}/{section_artefact_id}/vlm_section_{section_idx}_manifest.json"
        page_bundles = []
        for page_art in section_page_arts:
            extra = page_art.get('extra', {})
            page_num = extra.get('vlm_page_number')
            page_manifest_path = extra.get('manifest')
            page_bundles.append({
                'page_number': page_num,
                'artefact_id': page_art['id'],
                'manifest_path': page_manifest_path,
                'rel_path': page_art['rel_path'],
                'label': extra.get('label', f'Page {page_num}')
            })
        section_manifest = {
            'file_id': file_id,
            'section_idx': section_idx,
            'section_title': section_title,
            'start_page': start_page,
            'end_page': end_page,
            'vlm_model': vlm_model,
            'total_pages': len(page_bundles),
            'page_bundles': page_bundles,
            # Real UTC timestamp; the previous code stored the literal
            # string 'now()' which is never evaluated in a JSON manifest.
            'created_at': datetime.now(timezone.utc).isoformat(),
            'type': 'vlm_section_page_bundle'
        }
        # Store section manifest (json is already imported at module level)
        manifest_json = json.dumps(section_manifest, ensure_ascii=False, indent=2)
        self.storage.upload_file(bucket, section_manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True)
        # Create section bundle artefact
        client.supabase.table('document_artefacts').insert({
            'id': section_artefact_id,
            'file_id': file_id,
            'type': 'vlm_section_page_bundle',
            'rel_path': section_manifest_path,
            'extra': {
                'section_idx': section_idx,
                'section_title': section_title,
                'start_page': start_page,
                'end_page': end_page,
                'vlm_model': vlm_model,
                'total_pages': len(page_bundles),
                'group_id': vlm_group_id,
                'split_order': section_idx,
                'split_heading': section_title,
                'split_total': total_sections,
                'pipeline': 'vlm',
                'producer': 'auto_split',
                'group_pack_type': 'vlm_page_bundle_auto_split'
            },
            'status': 'completed'
        }).execute()
        logger.info(f"VLM section bundle collector completed for section {section_idx} of file {file_id}: created manifest with {len(page_bundles)} page bundles")
        return {
            'section_artefact_id': section_artefact_id,
            'section_idx': section_idx,
            'pages_bundled': len(page_bundles),
            'manifest_path': section_manifest_path
        }
    except Exception as e:
        logger.error(f"VLM section bundle collector failed for file {file_id}: {e}")
        raise
def _trigger_vlm_after_comparison(self, file_id: str, comparison_payload: Dict[str, Any]):
    """Trigger VLM processing after comparison analysis completes.

    Best-effort: only runs when the comparison payload carries
    ``trigger_vlm_after_comparison`` and an enabled ``vlm_config``.
    Depending on ``split_by_page`` it either fans out page-level VLM
    tasks or enqueues a section-level canonical docling batch. Any
    failure is logged and swallowed so it never fails the comparison.

    Args:
        file_id: Id of the file whose comparison just completed.
        comparison_payload: Payload of the comparison task; may contain
            ``vlm_config`` with ``split_by_page``, ``model``,
            ``threshold`` and ``base_config`` keys.
    """
    try:
        # Check if VLM should be triggered
        if not comparison_payload.get('trigger_vlm_after_comparison'):
            logger.debug(f"VLM post-comparison trigger not enabled for file {file_id}")
            return
        vlm_config = comparison_payload.get('vlm_config', {})
        if not vlm_config.get('enabled'):
            logger.debug(f"VLM not enabled for file {file_id}")
            return
        logger.info(f"[auto-canonical] Triggering VLM processing after comparison for file {file_id}")
        # Extract VLM configuration
        split_by_page = vlm_config.get('split_by_page', False)
        vlm_model = vlm_config.get('model', 'smoldocling')
        threshold = vlm_config.get('threshold', 50)
        base_config = vlm_config.get('base_config', {})
        # Generate new group_id for VLM processing (uuid is imported at
        # module level; no local import needed).
        vlm_group_id = str(uuid.uuid4())
        if split_by_page:
            # Page-by-page processing within sections
            logger.info(f"[auto-canonical] vlm page-by-page processing for file {file_id} (post-comparison)")
            self._enqueue_vlm_page_processing(
                file_id, threshold, vlm_group_id, vlm_model, base_config
            )
        else:
            # Standard section-level VLM processing
            from routers.database.files.files import enqueue_canonical_docling
            body_vlm = {
                'use_split_map': True,
                'threshold': threshold,
                'producer': 'auto_split',
                'group_id': vlm_group_id,
                'config': {
                    **base_config,
                    'do_ocr': False,  # VLM doesn't need OCR
                    'force_ocr': False,
                    'pipeline': 'vlm',
                    'vlm_pipeline_model': vlm_model
                }
            }
            logger.info(f"[auto-canonical] vlm section batch group_id={vlm_group_id} for file {file_id} (post-comparison)")
            enqueue_canonical_docling(file_id=file_id, body=body_vlm)
    except Exception as e:
        logger.warning(f"Failed to trigger VLM processing after comparison for file {file_id}: {e}")
def process_docling_bundle_task(self, task: QueueTask) -> Dict[str, Any]:
    """
    Process single docling bundle task (whole document processing).
    This creates a coherent single bundle with all formats using direct processing.
    NO temporary tasks or old logic reuse - this is the new architecture.
    """
    file_id = task.file_id
    logger.info(f"🎯 NEW ARCHITECTURE: Processing docling bundle task for file {file_id} (whole document)")
    try:
        # Pull the bundle settings out of the payload.
        task_payload = task.payload
        docling_config = task_payload.get('config', {})
        meta = task_payload.get('bundle_metadata', {})
        # Force the bundle processing configuration: zip output with
        # the full set of export formats.
        docling_config['target_type'] = 'zip'
        docling_config['to_formats'] = ['json', 'html', 'text', 'md', 'doctags']
        # Call the actual docling processing directly - NO temp tasks!
        outcome = self._process_docling_bundle_direct(task, docling_config, meta)
    except Exception as e:
        logger.error(f"❌ NEW ARCHITECTURE: Docling bundle processing failed for file {file_id}: {e}")
        raise
    logger.info(f"✅ NEW ARCHITECTURE: Successfully processed docling bundle for file {file_id}")
    return outcome
def _process_docling_bundle_direct(self, task: QueueTask, config: Dict[str, Any], bundle_metadata: Dict[str, Any]) -> Dict[str, Any]:
    """
    Direct docling bundle processing - NEW ARCHITECTURE approach.
    This processes the docling request directly without creating temporary tasks,
    ensuring clean Redis state and proper bundle metadata handling.

    Args:
        task: Queue task; its payload supplies 'bucket', 'file_path',
            'cabinet_id' and optionally 'mime_type'; ``task.timeout`` bounds
            both the Docling document timeout default and the HTTP request.
        config: Docling conversion options, forwarded as multipart form fields.
        bundle_metadata: Caller metadata propagated into the stored bundle.

    Returns:
        Result dict from ``_process_docling_zip_bundle`` (artefact id, paths,
        entry count, bundle metadata).

    Raises:
        ValueError: If DOCLING_URL is not configured, or the service response
            is not a ZIP archive.
        requests.HTTPError: If the Docling HTTP call returns an error status.
    """
    file_id = task.file_id
    payload = task.payload
    logger.info(f"🔧 DIRECT PROCESSING: Starting docling bundle processing for file {file_id}")
    if not self.docling_url:
        raise ValueError("DOCLING_URL not configured")
    # Extract payload data
    bucket = payload['bucket']
    file_path = payload['file_path']
    cabinet_id = payload['cabinet_id']
    # Download file
    logger.debug(f"📥 DIRECT PROCESSING: Downloading file for bundle processing: {bucket}/{file_path}")
    file_bytes = self.storage.download_file(bucket, file_path)
    # Prepare Docling request with bundle-specific config
    docling_api_key = os.getenv('DOCLING_API_KEY')
    headers = {'Accept': '*/*'}
    if docling_api_key:
        headers['X-Api-Key'] = docling_api_key
    # Build form data for bundle processing - USE CONFIG FROM PIPELINE_CONTROLLER (no hardcoded defaults!)
    # The config passed from pipeline_controller already has environment variables loaded.
    # NOTE: a list of tuples (not a dict) is used so repeated keys
    # (ocr_lang, to_formats, page_range) become multiple form fields.
    form_data = [
        ('target_type', 'zip'),  # Always zip for bundles
        ('do_ocr', str(config.get('do_ocr', False)).lower()),
        ('force_ocr', str(config.get('force_ocr', False)).lower()),
        ('image_export_mode', 'referenced'),  # Bundle standard
        ('ocr_engine', config.get('ocr_engine', 'easyocr')),
        ('pdf_backend', config.get('pdf_backend', 'dlparse_v4')),
        ('table_mode', config.get('table_mode', 'fast')),  # Use config from pipeline_controller (env vars)
        ('table_cell_matching', str(config.get('table_cell_matching', True)).lower()),  # Use config from pipeline_controller (env: true)
        ('pipeline', config.get('pipeline', 'standard')),
        ('do_formula_enrichment', str(config.get('do_formula_enrichment', True)).lower()),  # Use config from pipeline_controller (env: true)
        ('do_code_enrichment', str(config.get('do_code_enrichment', True)).lower()),  # Use config from pipeline_controller (env: true)
        ('do_table_structure', str(config.get('do_table_structure', True)).lower()),
        ('include_images', str(config.get('include_images', True)).lower()),
        ('images_scale', str(config.get('images_scale', 2.0))),
        ('do_picture_classification', str(config.get('do_picture_classification', False)).lower()),
        ('do_picture_description', str(config.get('do_picture_description', False)).lower()),
        ('document_timeout', str(config.get('document_timeout', task.timeout)))
    ]
    # Handle OCR languages as array (API expects multiple form fields)
    ocr_lang = config.get('ocr_lang')
    if ocr_lang:
        if isinstance(ocr_lang, list):
            for lang in ocr_lang:
                form_data.append(('ocr_lang', str(lang)))
        else:
            form_data.append(('ocr_lang', str(ocr_lang)))
    # Handle VLM pipeline options (CRITICAL for VLM processing)
    if config.get('vlm_pipeline_model'):
        form_data.append(('vlm_pipeline_model', config.get('vlm_pipeline_model')))
    # VLM model local/API options must be JSON per Docling OpenAPI spec
    if config.get('vlm_pipeline_model_local'):
        vlm_local = config.get('vlm_pipeline_model_local')
        if isinstance(vlm_local, (dict, list)):
            form_data.append(('vlm_pipeline_model_local', json.dumps(vlm_local)))
        elif isinstance(vlm_local, str) and vlm_local.strip().startswith(('{', '[')):
            # Already a JSON-looking string; pass it through untouched.
            form_data.append(('vlm_pipeline_model_local', vlm_local))
        # else: omit to avoid validation error
    if config.get('vlm_pipeline_model_api'):
        vlm_api = config.get('vlm_pipeline_model_api')
        if isinstance(vlm_api, (dict, list)):
            form_data.append(('vlm_pipeline_model_api', json.dumps(vlm_api)))
        elif isinstance(vlm_api, str) and vlm_api.strip().startswith(('{', '[')):
            form_data.append(('vlm_pipeline_model_api', vlm_api))
        # else: omit
    # Picture description options must be JSON per Docling OpenAPI spec
    if config.get('picture_description_local'):
        pic_local = config.get('picture_description_local')
        if isinstance(pic_local, (dict, list)):
            form_data.append(('picture_description_local', json.dumps(pic_local)))
        elif isinstance(pic_local, str) and pic_local.strip().startswith(('{', '[')):
            form_data.append(('picture_description_local', pic_local))
    if config.get('picture_description_api'):
        pic_api = config.get('picture_description_api')
        if isinstance(pic_api, (dict, list)):
            form_data.append(('picture_description_api', json.dumps(pic_api)))
        elif isinstance(pic_api, str) and pic_api.strip().startswith(('{', '[')):
            form_data.append(('picture_description_api', pic_api))
    if 'picture_description_area_threshold' in config:
        form_data.append(('picture_description_area_threshold', str(config.get('picture_description_area_threshold'))))
    # Handle markdown page break placeholder
    if 'md_page_break_placeholder' in config:
        form_data.append(('md_page_break_placeholder', config.get('md_page_break_placeholder')))
    # Add formats - always all formats for bundles
    for fmt in ['json', 'html', 'text', 'md', 'doctags']:
        form_data.append(('to_formats', fmt))
    # Handle page range properly - get actual PDF page count like frontmatter does
    page_range = config.get('page_range', [1, 999999])
    if isinstance(page_range, list) and len(page_range) >= 2:
        def _to_int_safe(v, default):
            # Coerce to int, falling back to `default` on any failure.
            try:
                return int(v)
            except Exception:
                return default
        start_pg = _to_int_safe(page_range[0], 1)
        end_pg = _to_int_safe(page_range[1], 999999)
        if start_pg < 1:
            start_pg = 1
        if end_pg < start_pg:
            end_pg = start_pg
        # CRITICAL: Get actual PDF page count to prevent massive range
        try:
            import fitz  # PyMuPDF; optional - failures fall back to the raw range
            doc = fitz.open(stream=file_bytes, filetype='pdf')
            pc = int(doc.page_count)
            doc.close()
            if pc > 0:
                end_pg = min(end_pg, pc)  # Clamp to actual page count!
                start_pg = max(1, min(start_pg, pc))
                if end_pg < start_pg:
                    end_pg = start_pg
                logger.info(f"📄 DIRECT PROCESSING: PDF has {pc} pages, using range {start_pg}-{end_pg}")
        except Exception as e:
            logger.warning(f"Could not determine PDF page count: {e}, using defaults")
        # Docling expects the range as two repeated 'page_range' fields.
        form_data.append(('page_range', str(start_pg)))
        form_data.append(('page_range', str(end_pg)))
    else:
        # Fallback to single page if no range specified
        form_data.append(('page_range', '1'))
        form_data.append(('page_range', '1'))
    files = [('files', ('file', file_bytes, payload.get('mime_type', 'application/pdf')))]
    # DEBUG: Log the actual config being sent to Docling
    config_debug = {key: value for key, value in form_data if key in ['table_mode', 'table_cell_matching', 'do_formula_enrichment', 'do_code_enrichment', 'do_ocr', 'pipeline']}
    logger.info(f"🔧 DIRECT PROCESSING: Docling config being sent: {config_debug}")
    # Make the HTTP request
    logger.info(f"🌐 DIRECT PROCESSING: Making HTTP request to Docling for file {file_id}")
    try:
        import time  # local import: 'time' is not imported at module level
        start_time = time.time()
        response = requests.post(
            f"{self.docling_url.rstrip('/')}/v1/convert/file",
            files=files,
            data=form_data,
            headers=headers,
            timeout=task.timeout
        )
        response.raise_for_status()
        elapsed = time.time() - start_time
        logger.info(f"⚡ DIRECT PROCESSING: Docling request completed in {elapsed:.2f}s for file {file_id}")
    except Exception as e:
        logger.error(f"🌐 DIRECT PROCESSING: HTTP request failed for file {file_id}: {e}")
        raise
    # Process response - should be ZIP for bundle
    # Accept either a zip Content-Type or the ZIP magic bytes ('PK').
    content_type = (response.headers.get('Content-Type') or '').lower()
    is_zip_resp = ('zip' in content_type) or (response.content[:2] == b'PK')
    if not is_zip_resp:
        raise ValueError(f"Expected ZIP response for bundle, got: {content_type}")
    # Process ZIP bundle and create artefacts
    logger.info(f"📦 DIRECT PROCESSING: Processing ZIP bundle for file {file_id}")
    result = self._process_docling_zip_bundle(
        file_id=file_id,
        bucket=bucket,
        cabinet_id=cabinet_id,
        zip_content=response.content,
        bundle_metadata=bundle_metadata,
        task_config=config
    )
    logger.info(f"✅ DIRECT PROCESSING: Bundle processing completed for file {file_id}")
    return result
def _create_bundle_display_metadata(self, bundle_type: str, title: str, index: int = None,
total: int = None, page_range: list = None) -> dict:
"""
Create consistent display metadata for bundle organization.
This ensures all bundles have proper titles, ordering, and display names
for frontend organization and user-friendly presentation.
"""
metadata = {
'title': title,
'bundle_type': bundle_type
}
if index is not None:
metadata['split_order'] = index
if total is not None:
metadata['split_total'] = total
if page_range:
metadata['page_range'] = page_range
metadata['page_count'] = page_range[1] - page_range[0] + 1
# Create display names based on bundle type
if bundle_type == 'page':
metadata['display_name'] = f"Page {page_range[0]}" if page_range else f"Page {index}"
metadata['bundle_label'] = f"Page {page_range[0]} Bundle"
metadata['sort_key'] = page_range[0] if page_range else index
elif bundle_type == 'section':
page_str = f" (p{page_range[0]}-{page_range[1]})" if page_range else ""
metadata['display_name'] = f"{index:02d}. {title}{page_str}"
metadata['bundle_label'] = f"{title} Bundle"
metadata['sort_key'] = index
elif bundle_type == 'chunk':
page_str = f" (p{page_range[0]}-{page_range[1]})" if page_range else ""
metadata['display_name'] = f"{index:02d}. {title}{page_str}"
metadata['bundle_label'] = f"{title} Bundle"
metadata['sort_key'] = index
else:
metadata['display_name'] = title
metadata['bundle_label'] = f"{title} Bundle"
metadata['sort_key'] = index or 0
return metadata
def _process_docling_zip_bundle(self, file_id: str, bucket: str, cabinet_id: str,
                                zip_content: bytes, bundle_metadata: Dict[str, Any],
                                task_config: Dict[str, Any]) -> Dict[str, Any]:
    """
    Process ZIP bundle response and create artefacts with proper bundle metadata.
    This is the NEW ARCHITECTURE approach for handling docling ZIP responses.

    Steps:
      1. Upload the raw ZIP archive to storage.
      2. Extract each entry, infer its MIME type from the extension, and
         upload it under the bundle directory.
      3. Write a manifest JSON describing the bundle contents.
      4. Insert a ``document_artefacts`` row referencing the bundle.

    Args:
        file_id: Owning file id.
        bucket: Storage bucket name.
        cabinet_id: Cabinet id used as path prefix.
        zip_content: Raw ZIP bytes returned by Docling.
        bundle_metadata: Caller metadata merged into manifest and artefact.
        task_config: Docling config, stored for traceability.

    Returns:
        Dict with artefact_id, rel_path, manifest/archive paths, per-format
        file paths, entry count and the bundle metadata.

    Fix: the ZipFile handle was never closed if an upload raised mid-loop;
    a ``with`` block now guarantees cleanup. Redundant local re-imports of
    module-level zipfile/io/uuid/json were dropped.
    """
    import time  # local import: 'time' is not imported at module level

    logger.info(f"📦 ZIP PROCESSING: Starting bundle extraction for file {file_id}")
    # Create bundle artefact structure
    artefact_id = str(uuid.uuid4())
    base_dir = f"{cabinet_id}/{file_id}/{artefact_id}"
    archive_path = f"{base_dir}/bundle.zip"
    # Save original archive
    self.storage.upload_file(bucket, archive_path, zip_content, 'application/zip', upsert=True)
    # Extract ZIP contents (context manager ensures the archive is closed
    # even when an upload below raises)
    entries = []
    file_paths = {}
    with zipfile.ZipFile(io.BytesIO(zip_content)) as zf:
        for entry in zf.filelist:
            if entry.is_dir():
                continue
            entry_content = zf.read(entry)
            entry_filename = entry.filename
            rel_path = f"{base_dir}/{entry_filename}"
            # Determine MIME type and record the canonical path per format
            if entry_filename.endswith('.json'):
                mime = 'application/json'
                file_paths['json'] = rel_path
            elif entry_filename.endswith('.html'):
                mime = 'text/html'
                file_paths['html'] = rel_path
            elif entry_filename.endswith('.md'):
                mime = 'text/markdown'
                file_paths['md'] = rel_path
            elif entry_filename.endswith('.txt'):
                mime = 'text/plain'
                file_paths['text'] = rel_path
            elif entry_filename.endswith('.doctags'):
                mime = 'application/json'
                file_paths['doctags'] = rel_path
            else:
                mime = 'application/octet-stream'
            # Upload file
            self.storage.upload_file(bucket, rel_path, entry_content, mime, upsert=True)
            entries.append({
                'filename': entry_filename,
                'rel_path': rel_path,
                'mime_type': mime,
                'size': len(entry_content)
            })
            logger.debug(f"📄 ZIP PROCESSING: Extracted {entry_filename} -> {rel_path}")
    # Create bundle manifest
    manifest = {
        'bundle_id': artefact_id,
        'file_id': file_id,
        'bundle_type': 'docling_bundle',
        'processing_mode': 'whole_document',
        'created_at': time.time(),
        'archive_path': archive_path,
        'entries': entries,
        'file_paths': file_paths,
        'metadata': bundle_metadata,
        'config': task_config
    }
    manifest_path = f"{base_dir}/manifest.json"
    manifest_content = json.dumps(manifest, indent=2).encode('utf-8')
    self.storage.upload_file(bucket, manifest_path, manifest_content, 'application/json', upsert=True)
    # Create database artefact with bundle metadata
    artefact_extra = {
        **bundle_metadata,
        'manifest': manifest_path,
        'archive_path': archive_path,
        'file_paths': file_paths,
        'entry_count': len(entries),
        'group_pack_type': 'whole'  # Add proper pack type for whole document bundles
    }
    self.client.supabase.table('document_artefacts').insert({
        'id': artefact_id,
        'file_id': file_id,
        'page_number': 0,  # Whole document
        'type': 'docling_bundle',
        'rel_path': base_dir,
        'size_tag': json.dumps(task_config),
        'language': 'en',
        'chunk_index': None,
        'extra': artefact_extra
    }).execute()
    logger.info(f"✅ ZIP PROCESSING: Created bundle artefact {artefact_id} with {len(entries)} files for file {file_id}")
    return {
        'artefact_id': artefact_id,
        'rel_path': base_dir,
        'manifest_path': manifest_path,
        'archive_path': archive_path,
        'file_paths': file_paths,
        'entry_count': len(entries),
        'bundle_metadata': bundle_metadata
    }
def process_docling_bundle_split_task(self, task: QueueTask) -> Dict[str, Any]:
    """
    Process split docling bundle task (multi-unit processing).

    Dispatches to the page/section/chunk splitter according to the
    payload's 'processing_mode'; each splitter creates multiple
    sub-bundles plus a master manifest.

    Raises:
        ValueError: If the processing mode is not recognized.
        Exception: Re-raised after logging if the splitter fails.
    """
    fid = task.file_id
    body = task.payload
    logger.info(f"Processing docling bundle split task for file {fid}")
    try:
        mode = body.get('processing_mode', 'split_by_sections')
        data = body.get('processing_data', {})
        cfg = body.get('config', {})
        meta = body.get('bundle_metadata', {})
        logger.info(f"Split bundle processing mode: {mode}")
        # Dispatch table instead of an if/elif chain.
        handlers = {
            'split_by_pages': self._process_split_by_pages,
            'split_by_sections': self._process_split_by_sections,
            'split_by_chunks': self._process_split_by_chunks,
        }
        handler = handlers.get(mode)
        if handler is None:
            raise ValueError(f"Unknown processing mode: {mode}")
        return handler(task, data, cfg, meta)
    except Exception as e:
        logger.error(f"Docling bundle split processing failed for file {fid}: {e}")
        raise
def _process_split_by_pages(self, task: QueueTask, processing_data: dict,
config: dict, bundle_metadata: dict) -> Dict[str, Any]:
"""Process document by individual pages and create page bundles."""
file_id = task.file_id
payload = task.payload
bucket = payload['bucket']
file_path = payload['file_path']
cabinet_id = payload['cabinet_id']
mime_type = payload['mime_type']
pages = processing_data.get('pages', [])
logger.info(f"Processing {len(pages)} individual pages for file {file_id}")
# Create master bundle directory
master_bundle_id = str(uuid.uuid4())
master_dir = f"{cabinet_id}/{file_id}/{master_bundle_id}"
page_bundles = []
# Process each page as a separate bundle
for idx, page_num in enumerate(pages, 1):
try:
page_config = {
**config,
'page_range': [page_num, page_num],
'target_type': 'zip',
'to_formats': ['json', 'html', 'text', 'md', 'doctags']
}
# Create descriptive page title and enhanced metadata
page_title = f"Page {page_num}"
page_display_name = f"Page {page_num}"
# Create individual page task with enhanced labeling
page_task = QueueTask(
id=f"{task.id}_page_{page_num}",
file_id=file_id,
service=task.service,
task_type='canonical_docling_json',
payload={
**payload,
'config': page_config,
'artefact_extra': {
'page_number': page_num,
'page_title': page_title,
'display_name': page_display_name,
'split_order': idx, # Sequential order within this bundle
'split_total': len(pages),
'split_heading': page_title,
'section_title': page_title, # For consistency
'is_page_bundle': True,
'master_bundle_id': master_bundle_id,
'bundle_label': f"Page {page_num} Bundle",
**bundle_metadata
}
},
priority=task.priority,
timeout=1800,
created_at=task.created_at
)
# Process page bundle
page_result = self._process_docling_task(page_task)
page_bundles.append({
'page_number': page_num,
'page_title': page_title,
'display_name': page_display_name,
'split_order': idx,
'artefact_id': page_result.get('artefact_id'),
'rel_path': page_result.get('rel_path')
})
except Exception as e:
logger.warning(f"Failed to process page {page_num} for file {file_id}: {e}")
continue
# Sort page bundles by page number for consistent ordering
page_bundles.sort(key=lambda x: x['page_number'])
# Create enhanced master manifest with proper organization metadata
master_manifest = {
'file_id': file_id,
'bundle_type': 'docling_bundle_split',
'split_mode': 'split_by_pages',
'total_pages': len(pages),
'successful_pages': len(page_bundles),
'page_bundles': page_bundles,
'created_at': 'now()',
'display_name': f"Document Pages ({len(page_bundles)} pages)",
'organization': {
'type': 'pages',
'sort_field': 'page_number',
'sort_order': 'asc',
'grouping': 'individual_pages'
},
**bundle_metadata
}
# Store master manifest
manifest_path = f"{master_dir}/master_manifest.json"
manifest_json = json.dumps(master_manifest, ensure_ascii=False, indent=2)
self.storage.upload_file(bucket, manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True)
# Create master bundle artefact
self.client.supabase.table('document_artefacts').insert({
'id': master_bundle_id,
'file_id': file_id,
'type': 'docling_bundle_split_pages',
'rel_path': master_dir,
'extra': {
'manifest': manifest_path,
'split_mode': 'split_by_pages',
'total_pages': len(pages),
'successful_pages': len(page_bundles),
'group_pack_type': 'split_pages', # Add proper pack type for split page bundles
**bundle_metadata
},
'status': 'completed'
}).execute()
logger.info(f"Created page-based split bundle for file {file_id}: {len(page_bundles)} pages")
return {
'master_bundle_id': master_bundle_id,
'pages_processed': len(page_bundles),
'total_pages': len(pages)
}
def _process_split_by_sections(self, task: QueueTask, processing_data: dict,
config: dict, bundle_metadata: dict) -> Dict[str, Any]:
"""Process document by sections and create section bundles."""
file_id = task.file_id
payload = task.payload
bucket = payload['bucket']
cabinet_id = payload['cabinet_id']
entries = processing_data.get('entries', [])
logger.info(f"Processing {len(entries)} sections for file {file_id}")
# Create master bundle directory
master_bundle_id = str(uuid.uuid4())
master_dir = f"{cabinet_id}/{file_id}/{master_bundle_id}"
section_bundles = []
# Process each section as a separate bundle
logger.info(f"Processing {len(entries)} sections for file {file_id}")
for i, entry in enumerate(entries, 1):
try:
start_page = entry.get('start_page', 1)
end_page = entry.get('end_page', start_page)
# Enhanced section title handling with fallbacks and smart naming
raw_title = entry.get('title') or entry.get('label') or entry.get('heading')
section_title = raw_title.strip() if raw_title else f'Section {i}'
# Create enhanced display names for better organization
page_range_str = f"p{start_page}" if start_page == end_page else f"p{start_page}-{end_page}"
display_name = f"{i:02d}. {section_title}" if raw_title else f"{i:02d}. Section {i} ({page_range_str})"
bundle_label = f"{section_title} Bundle"
# Validate page ranges
if start_page < 1:
raise ValueError(f"Invalid start_page: {start_page} (must be >= 1)")
if end_page < start_page:
raise ValueError(f"Invalid page range: {start_page}-{end_page} (end < start)")
if start_page > 999 or end_page > 999:
raise ValueError(f"Suspicious page range: {start_page}-{end_page} (too high, possible corruption)")
logger.info(f"Processing section {i}/{len(entries)}: '{display_name}' (pages {start_page}-{end_page})")
section_config = {
**config,
'page_range': [start_page, end_page],
'target_type': 'zip',
'to_formats': ['json', 'html', 'text', 'md', 'doctags']
}
# Create section task with enhanced metadata and labeling
section_task = QueueTask(
id=f"{task.id}_section_{i}",
file_id=file_id,
service=task.service,
task_type='canonical_docling_json',
payload={
**payload,
'config': section_config,
'artefact_extra': {
'section_number': i,
'section_title': section_title,
'display_name': display_name,
'bundle_label': bundle_label,
'start_page': start_page,
'end_page': end_page,
'page_range': [start_page, end_page],
'page_count': end_page - start_page + 1,
'split_order': i, # Preserved ordering from split map
'split_total': len(entries),
'split_heading': section_title,
'is_section_bundle': True,
'master_bundle_id': master_bundle_id,
**bundle_metadata
}
},
priority=task.priority,
timeout=3600,
created_at=task.created_at
)
# Process section bundle
section_result = self._process_docling_task(section_task)
section_bundles.append({
'section_number': i,
'section_title': section_title,
'display_name': display_name,
'bundle_label': bundle_label,
'page_range': [start_page, end_page],
'page_count': end_page - start_page + 1,
'split_order': i,
'artefact_id': section_result.get('artefact_id'),
'rel_path': section_result.get('rel_path')
})
except Exception as e:
logger.error(f"FATAL: Failed to process section {i} for file {file_id}: {e}")
logger.error(f"Section details: title='{section_title}', pages={start_page}-{end_page}")
# Don't continue - fail the entire task if any section fails
raise Exception(f"Section processing failed for section {i} ('{section_title}', pages {start_page}-{end_page}): {e}")
# Sort section bundles by split_order for consistent ordering
section_bundles.sort(key=lambda x: x['split_order'])
# Create enhanced master manifest with proper organization metadata
master_manifest = {
'file_id': file_id,
'bundle_type': 'docling_bundle_split',
'split_mode': 'split_by_sections',
'total_sections': len(entries),
'successful_sections': len(section_bundles),
'section_bundles': section_bundles,
'created_at': 'now()',
'display_name': f"Document Sections ({len(section_bundles)} sections)",
'organization': {
'type': 'sections',
'sort_field': 'split_order',
'sort_order': 'asc',
'grouping': 'split_map_sections',
'has_titles': True,
'ordering_preserved': True
},
**bundle_metadata
}
# Store master manifest
manifest_path = f"{master_dir}/master_manifest.json"
manifest_json = json.dumps(master_manifest, ensure_ascii=False, indent=2)
self.storage.upload_file(bucket, manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True)
# Create master bundle artefact
self.client.supabase.table('document_artefacts').insert({
'id': master_bundle_id,
'file_id': file_id,
'type': 'docling_bundle_split_sections',
'rel_path': master_dir,
'extra': {
'manifest': manifest_path,
'split_mode': 'split_by_sections',
'total_sections': len(entries),
'successful_sections': len(section_bundles),
'group_pack_type': 'split_sections', # Add proper pack type for split section bundles
**bundle_metadata
},
'status': 'completed'
}).execute()
logger.info(f"Created section-based split bundle for file {file_id}: {len(section_bundles)} sections")
return {
'master_bundle_id': master_bundle_id,
'sections_processed': len(section_bundles),
'total_sections': len(entries)
}
def _process_split_by_chunks(self, task: QueueTask, processing_data: dict,
config: dict, bundle_metadata: dict) -> Dict[str, Any]:
"""Process document by chunks and create chunk bundles."""
# Very similar to _process_split_by_sections but with chunk-specific labeling
file_id = task.file_id
payload = task.payload
bucket = payload['bucket']
cabinet_id = payload['cabinet_id']
chunks = processing_data.get('entries', [])
logger.info(f"Processing {len(chunks)} chunks for file {file_id}")
# Create master bundle directory
master_bundle_id = str(uuid.uuid4())
master_dir = f"{cabinet_id}/{file_id}/{master_bundle_id}"
chunk_bundles = []
# Process each chunk as a separate bundle
for i, chunk in enumerate(chunks, 1):
try:
start_page = chunk['start']
end_page = chunk['end']
# Enhanced chunk title handling
raw_title = chunk.get('title', f'Chunk {i}')
chunk_title = raw_title.strip() if raw_title else f'Chunk {i}'
# Create enhanced display names for chunks
page_range_str = f"p{start_page}" if start_page == end_page else f"p{start_page}-{end_page}"
display_name = f"{i:02d}. {chunk_title} ({page_range_str})"
bundle_label = f"{chunk_title} Bundle"
chunk_config = {
**config,
'page_range': [start_page, end_page],
'target_type': 'zip',
'to_formats': ['json', 'html', 'text', 'md', 'doctags']
}
# Create chunk task with enhanced labeling
chunk_task = QueueTask(
id=f"{task.id}_chunk_{i}",
file_id=file_id,
service=task.service,
task_type='canonical_docling_json',
payload={
**payload,
'config': chunk_config,
'artefact_extra': {
'chunk_number': i,
'chunk_title': chunk_title,
'display_name': display_name,
'bundle_label': bundle_label,
'start_page': start_page,
'end_page': end_page,
'page_range': [start_page, end_page],
'page_count': end_page - start_page + 1,
'split_order': i,
'split_total': len(chunks),
'split_heading': chunk_title,
'is_chunk_bundle': True,
'master_bundle_id': master_bundle_id,
**bundle_metadata
}
},
priority=task.priority,
timeout=3600,
created_at=task.created_at
)
# Process chunk bundle
chunk_result = self._process_docling_task(chunk_task)
chunk_bundles.append({
'chunk_number': i,
'chunk_title': chunk_title,
'display_name': display_name,
'bundle_label': bundle_label,
'page_range': [start_page, end_page],
'page_count': end_page - start_page + 1,
'split_order': i,
'artefact_id': chunk_result.get('artefact_id'),
'rel_path': chunk_result.get('rel_path')
})
except Exception as e:
logger.warning(f"Failed to process chunk {i} for file {file_id}: {e}")
continue
# Create master manifest
master_manifest = {
'file_id': file_id,
'bundle_type': 'docling_bundle_split',
'split_mode': 'split_by_chunks',
'total_chunks': len(chunks),
'successful_chunks': len(chunk_bundles),
'chunk_bundles': chunk_bundles,
'created_at': 'now()',
**bundle_metadata
}
# Store master manifest
manifest_path = f"{master_dir}/master_manifest.json"
manifest_json = json.dumps(master_manifest, ensure_ascii=False, indent=2)
self.storage.upload_file(bucket, manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True)
# Create master bundle artefact
self.client.supabase.table('document_artefacts').insert({
'id': master_bundle_id,
'file_id': file_id,
'type': 'docling_bundle_split_chunks',
'rel_path': master_dir,
'extra': {
'manifest': manifest_path,
'split_mode': 'split_by_chunks',
'total_chunks': len(chunks),
'successful_chunks': len(chunk_bundles),
'group_pack_type': 'split_chunks', # Add proper pack type for split chunk bundles
**bundle_metadata
},
'status': 'completed'
}).execute()
logger.info(f"Created chunk-based split bundle for file {file_id}: {len(chunk_bundles)} chunks")
return {
'master_bundle_id': master_bundle_id,
'chunks_processed': len(chunk_bundles),
'total_chunks': len(chunks)
}
# process_phase2_coordinator_task method removed - pipelines now enqueued directly from split_map task
# _check_pipeline_group_completion method removed - task dependencies now handle sequential execution
# Global processor instance (module-private singleton; created lazily)
_processor_instance = None

def get_processor() -> DocumentTaskProcessor:
    """Return the shared DocumentTaskProcessor, constructing it on first call."""
    global _processor_instance
    if _processor_instance is not None:
        return _processor_instance
    _processor_instance = DocumentTaskProcessor()
    return _processor_instance