api/archive/auto_processing/pipeline_controller.py
2025-11-14 14:47:19 +00:00

1317 lines
62 KiB
Python

"""
Pipeline Controller for Three-Phase Document Processing Architecture
This module coordinates the three phases of document processing:
- Phase 1: Document Structure Discovery & Analysis
- Phase 2: Parallel Content Processing Pipelines
- Phase 3: Enhanced Frontend Viewing (handled by frontend)
Features:
- Environment variable controlled auto-processing
- Phase 1 completion detection
- Automatic Phase 2 triggering
- Intelligent retry and coordination logic
"""
import json
import os
import uuid
import time
from typing import Dict, Any, List, Optional, Set
from pathlib import Path
from modules.logger_tool import initialise_logger
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
from modules.database.supabase.utils.storage import StorageAdmin
from modules.queue_system import (
enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
enqueue_document_analysis_task, enqueue_page_images_task,
TaskPriority, get_queue
)
from modules.bundle_metadata import (
create_standard_metadata, BundleMetadata, PipelineType, ProcessingMode, BundleType
)
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
class DocumentPipelineController:
"""
Coordinates the three-phase document processing pipeline.
"""
def __init__(self):
    """Initialise storage/database clients and load auto-processing flags from the environment."""
    self.client = SupabaseServiceRoleClient()
    self.storage = StorageAdmin()

    def flag(name: str, default: str) -> bool:
        # Environment toggles are the strings 'true'/'false' (case-insensitive).
        return os.getenv(name, default).lower() == 'true'

    # Phase 1: structure discovery toggles
    self.auto_tika = flag('AUTO_TIKA_PROCESSING', 'true')
    self.auto_page_images = flag('AUTO_PAGE_IMAGES', 'true')
    self.auto_document_analysis = flag('AUTO_DOCUMENT_ANALYSIS', 'true')
    self.auto_split_map = flag('AUTO_SPLIT_MAP_GENERATION', 'true')
    # Phase 2: content-processing pipeline toggles
    self.auto_docling_ocr = flag('AUTO_DOCLING_OCR', 'true')
    self.auto_docling_no_ocr = flag('AUTO_DOCLING_NO_OCR', 'true')
    self.auto_docling_vlm = flag('AUTO_DOCLING_VLM', 'false')
    # Per-pipeline page-granularity switches
    self.docling_ocr_by_page = flag('DOCLING_OCR_BY_PAGE', 'false')
    self.docling_no_ocr_by_page = flag('DOCLING_NO_OCR_BY_PAGE', 'false')
    self.docling_vlm_by_page = flag('DOCLING_VLM_BY_PAGE', 'true')
    # Section-grouping strategy for large documents
    self.docling_use_split_map = flag('DOCLING_USE_SPLIT_MAP', 'true')
    self.docling_split_threshold = int(os.getenv('DOCLING_SPLIT_THRESHOLD', '50'))
    logger.info("Pipeline controller initialized with new bundle architecture")
def enqueue_phase1_tasks(self, file_id: str, file_row: Dict[str, Any],
                         processing_path: str, processing_mime: str,
                         priority: TaskPriority = TaskPriority.HIGH) -> Dict[str, List[str]]:
    """
    Enqueue Phase 1 tasks: Structure Discovery & Analysis

    Chains up to five task types via ``depends_on`` task-ID lists:
    Tika metadata -> frontmatter OCR -> structure analysis -> split map,
    with page-image generation depending on structure analysis. Each step
    is gated by the corresponding AUTO_* flag loaded in __init__ (the
    frontmatter step is gated only by a docling URL being configured).

    Args:
        file_id: ID of the file being processed.
        file_row: Database row for the file (reads 'bucket' and 'cabinet_id').
        processing_path: Storage path of the document to process.
        processing_mime: MIME type passed to the processing services.
        priority: Queue priority applied to the primary (non-deferred) tasks.

    Returns:
        Dictionary mapping task types to task IDs
    """
    logger.info(f"Phase 1: Starting structure discovery for file {file_id}")
    task_ids = {}
    bucket = file_row['bucket']
    cabinet_id = file_row['cabinet_id']
    # Step 1: Tika Processing (metadata extraction)
    if self.auto_tika:
        tika_url = os.getenv('TIKA_URL')
        if tika_url:
            tika_task_id = enqueue_tika_task(
                file_id=file_id,
                payload={
                    'bucket': bucket,
                    'file_path': processing_path,
                    'cabinet_id': cabinet_id,
                    'mime_type': processing_mime
                },
                priority=priority
            )
            task_ids['tika'] = [tika_task_id]
            logger.info(f"Phase 1: Enqueued Tika task {tika_task_id}")
        else:
            # Enabled but unconfigured: skip rather than fail the whole phase.
            logger.warning("Phase 1: Tika enabled but TIKA_URL not configured")
    # Step 2: Frontmatter processing (lightweight document overview)
    # NOTE(review): unlike the other steps this has no AUTO_* gate — it runs
    # whenever a docling service URL is configured. Confirm this is intended.
    docling_url = os.getenv('DOCLING_URL') or os.getenv('NEOFS_DOCLING_URL')
    if docling_url:
        try:
            front_pages = int(os.getenv('DOCLING_FRONTPAGES', '3'))
        except Exception:
            # Malformed env value: fall back to the first 3 pages.
            front_pages = 3
        # Create enhanced metadata for frontmatter JSON display in UI
        frontmatter_metadata = {
            'display_name': f'Document Frontmatter (p1-{front_pages})',
            'bundle_label': 'Frontmatter Analysis',
            'section_title': 'Document Frontmatter',
            'page_range': [1, front_pages],
            'page_count': front_pages,
            'bundle_type': 'frontmatter_json',
            'processing_mode': 'frontmatter_analysis',
            'pipeline': 'frontmatter_ocr',
            'is_frontmatter': True,
            'ui_category': 'document_analysis',
            'ui_order': 1,  # Show first in UI
            'description': f'OCR analysis of first {front_pages} pages for document structure and metadata',
            'viewer_type': 'json'
        }
        frontmatter_task_id = enqueue_docling_task(
            file_id=file_id,
            task_type='docling_frontmatter_json',
            payload={
                'bucket': bucket,
                'file_path': processing_path,
                'cabinet_id': cabinet_id,
                'mime_type': processing_mime,
                'config': {
                    'do_ocr': True,
                    'force_ocr': False,
                    'image_export_mode': 'embedded',
                    'ocr_engine': 'easyocr',
                    'ocr_lang': 'en',
                    'pdf_backend': 'dlparse_v4',
                    'table_mode': 'fast',
                    'target_type': 'inbody',
                    'to_formats': 'json',
                    # Only the leading pages: frontmatter is a partial scan.
                    'page_range': [1, front_pages]
                },
                'artefact_extra': frontmatter_metadata,
                # Wait for Tika (if enqueued) so its metadata is available first.
                'depends_on': task_ids.get('tika', [])
            },
            priority=priority,
            timeout=int(os.getenv('DOCLING_FRONTMATTER_TIMEOUT', '1800'))
        )
        task_ids['frontmatter'] = [frontmatter_task_id]
        logger.info(f"Phase 1: Enqueued frontmatter task {frontmatter_task_id}")
    # Step 3: Document Structure Analysis (LLM-enhanced hierarchy)
    if self.auto_document_analysis:
        analysis_task_id = enqueue_docling_task(
            file_id=file_id,
            task_type='document_structure_analysis',
            payload={
                'bucket': bucket,
                'file_path': processing_path,
                'cabinet_id': cabinet_id,
                'mime_type': processing_mime,
                'config': {
                    'target_type': 'inbody',
                    'to_formats': 'json',
                    'do_ocr': False,
                    'force_ocr': False
                },
                'depends_on': task_ids.get('frontmatter', [])
            },
            priority=priority,
            timeout=int(os.getenv('DOCUMENT_ANALYSIS_TIMEOUT', '300'))
        )
        task_ids['document_analysis'] = [analysis_task_id]
        logger.info(f"Phase 1: Enqueued document analysis task {analysis_task_id}")
    # Step 4: Split Map Generation (definitive section boundaries)
    if self.auto_split_map:
        split_map_task_id = enqueue_split_map_task(
            file_id=file_id,
            payload={
                # Needs both frontmatter and analysis outputs (when enabled).
                'depends_on': task_ids.get('frontmatter', []) + task_ids.get('document_analysis', [])
            },
            priority=TaskPriority.NORMAL
        )
        task_ids['split_map'] = [split_map_task_id]
        logger.info(f"Phase 1: Enqueued split map task {split_map_task_id}")
    # Step 5: Page Images Generation (for frontend viewing)
    if self.auto_page_images:
        page_images_task_id = enqueue_docling_task(
            file_id=file_id,
            task_type='generate_page_images',
            payload={
                'bucket': bucket,
                'file_path': processing_path,
                'cabinet_id': cabinet_id,
                'mime_type': processing_mime,
                'config': {},
                'depends_on': task_ids.get('document_analysis', [])
            },
            priority=TaskPriority.NORMAL,
            timeout=int(os.getenv('PAGE_IMAGES_TIMEOUT', '1800'))
        )
        task_ids['page_images'] = [page_images_task_id]
        logger.info(f"Phase 1: Enqueued page images task {page_images_task_id}")
    # Bundle tasks are now directly enqueued by split_map task completion
    total_tasks = sum(len(task_list) for task_list in task_ids.values())
    logger.info(f"Phase 1: Enqueued {total_tasks} tasks for file {file_id}: {list(task_ids.keys())}")
    return task_ids
def check_phase1_completion(self, file_id: str) -> Dict[str, Any]:
    """
    Check if Phase 1 is complete for a given file.

    A component counts as complete when a matching artefact row exists with
    status 'completed'. Only components enabled via environment flags are
    required (frontmatter is always required).

    Returns:
        Dictionary with completion status and details
    """
    logger.debug(f"Checking Phase 1 completion for file {file_id}")
    # Fetch every artefact row recorded for this file.
    query = self.client.supabase.table('document_artefacts').select('*').eq('file_id', file_id)
    rows = query.execute().data or []
    # Artefact type -> Phase 1 component it satisfies.
    type_to_component = {
        'tika_json': 'tika_metadata',
        'docling_frontmatter_json': 'frontmatter',
        'document_outline_hierarchy': 'document_analysis',
        'split_map_json': 'split_map',
        'page_images': 'page_images',
    }
    phase1_checks = {
        'tika_metadata': False,
        'frontmatter': False,
        'document_analysis': False,
        'split_map': False,
        'page_images': False,
    }
    for row in rows:
        if row['status'] != 'completed':
            continue
        component = type_to_component.get(row['type'])
        if component is not None:
            phase1_checks[component] = True
    # Build the list of required components from the enabled features.
    required_checks = []
    if self.auto_tika:
        required_checks.append('tika_metadata')
    required_checks.append('frontmatter')  # Always required for basic processing
    if self.auto_document_analysis:
        required_checks.append('document_analysis')
    if self.auto_split_map:
        required_checks.append('split_map')
    if self.auto_page_images:
        required_checks.append('page_images')
    completed_required = [name for name in required_checks if phase1_checks[name]]
    return {
        'file_id': file_id,
        'is_complete': len(completed_required) == len(required_checks),
        'completed_components': completed_required,
        'required_components': required_checks,
        'all_checks': phase1_checks,
        # max(..., 1) guards against division by zero if nothing is required.
        'completion_percentage': (len(completed_required) / max(len(required_checks), 1)) * 100
    }
def enqueue_sequential_docling_pipelines(self, file_id: str, file_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Enqueue sequential docling pipelines with dependencies: no_ocr → ocr → vlm
    Each pipeline depends on ALL tasks from the previous pipeline completing.
    This replaces the complex Phase 2 coordinator with simple task dependencies.
    Args:
        file_id: The file ID to process
        file_data: File processing information (bucket, path, etc.)
            Must contain 'bucket', 'file_path', 'cabinet_id', 'mime_type';
            a KeyError here indicates a malformed payload.
    Returns:
        Dictionary with enqueued pipeline information
    """
    logger.info(f"Enqueueing sequential docling pipelines for file {file_id}")
    bucket = file_data['bucket']
    file_path = file_data['file_path']
    cabinet_id = file_data['cabinet_id']
    mime_type = file_data['mime_type']
    # Base configuration shared by all pipelines (pipeline-specific options added per pipeline)
    base_config = {
        'to_formats': ['json', 'html', 'text', 'md', 'doctags'],
        'image_export_mode': 'referenced',
        'target_type': 'zip',
        'pdf_backend': os.getenv('DOCLING_PDF_BACKEND', 'dlparse_v4'),
        'include_images': os.getenv('DOCLING_INCLUDE_IMAGES', 'true').lower() == 'true',
        'images_scale': float(os.getenv('DOCLING_IMAGES_SCALE', '2.0')),
        'ocr_engine': os.getenv('OCR_ENGINE', 'easyocr'),
        'ocr_lang': os.getenv('OCR_LANG', 'en'),
        'picture_description_area_threshold': float(os.getenv('DOCLING_PICTURE_DESCRIPTION_AREA_THRESHOLD', '0.05'))
    }
    # Determine the pipeline execution order: no_ocr → ocr → vlm
    pipeline_order = []
    if self.auto_docling_no_ocr:
        pipeline_order.append('no_ocr')
    if self.auto_docling_ocr:
        pipeline_order.append('ocr')
    if self.auto_docling_vlm:
        pipeline_order.append('vlm')
    if not pipeline_order:
        logger.info(f"No docling pipelines enabled for file {file_id}")
        return {
            'file_id': file_id,
            'enqueued_pipelines': {},
            'total_tasks': 0,
            'sequential_order': [],
            'message': 'No docling pipelines enabled'
        }
    logger.info(f"Sequential pipeline order for file {file_id}: {pipeline_order}")
    # Enqueue all pipelines with proper dependencies
    enqueued_pipelines = {}
    all_task_ids = {}  # pipeline type -> task IDs, consumed by the next pipeline's depends_on
    for i, pipeline_type in enumerate(pipeline_order):
        # Determine dependencies: depend on ALL tasks from previous pipeline
        depends_on = []
        if i > 0:
            previous_pipeline = pipeline_order[i - 1]
            depends_on = all_task_ids.get(previous_pipeline, [])
            # Truncate the ID list in the log when it is long.
            logger.info(f"Pipeline {pipeline_type} will depend on {len(depends_on)} tasks from {previous_pipeline}: {depends_on[:3]}..." if len(depends_on) > 3 else f"Pipeline {pipeline_type} will depend on {len(depends_on)} tasks from {previous_pipeline}: {depends_on}")
        else:
            logger.info(f"Pipeline {pipeline_type} has no dependencies (first pipeline)")
        # Create pipeline tasks
        pipeline_result = self._enqueue_single_pipeline_with_deps(
            file_id, pipeline_type, base_config, bucket, file_path, cabinet_id, mime_type, depends_on
        )
        # A None result (e.g. unknown pipeline type) is skipped; subsequent
        # pipelines then fall back to an empty depends_on list.
        if pipeline_result:
            enqueued_pipelines[pipeline_type] = pipeline_result
            all_task_ids[pipeline_type] = pipeline_result['task_ids']
            logger.info(f"Enqueued {pipeline_type} pipeline with {len(pipeline_result['task_ids'])} tasks")
    total_tasks = sum(len(p.get('task_ids', [])) for p in enqueued_pipelines.values())
    logger.info(f"Successfully enqueued {len(pipeline_order)} sequential pipelines with {total_tasks} total tasks for file {file_id}")
    return {
        'file_id': file_id,
        'enqueued_pipelines': enqueued_pipelines,
        'total_tasks': total_tasks,
        'sequential_order': pipeline_order
    }
def _determine_processing_mode(self, file_id: str, pipeline_type: str) -> tuple[str, dict]:
    """
    Determine how to process document based on settings and characteristics.

    Implements corrected decision logic:
    1. Priority 1: Respect explicit BY_PAGE preference
    2. Priority 2: Check size threshold for auto-processing
    3. Priority 3: Use split map for large documents
    4. Priority 4: Error — large document without a split map

    Args:
        file_id: The file to inspect.
        pipeline_type: 'no_ocr' | 'ocr' | 'vlm' (selects the BY_PAGE flag).

    Returns:
        Tuple of (processing_mode, processing_data). processing_mode is one
        of "split_by_pages", "whole_document", "split_by_sections", or
        "error" (with empty data) when no valid strategy exists.
    """
    # Check BY_PAGE flags first (highest priority)
    by_page = self._get_by_page_setting(pipeline_type)
    if by_page:
        logger.info(f"BY_PAGE enabled for {pipeline_type} - creating page-based bundles regardless of document size")
        return "split_by_pages", self._get_page_ranges(file_id)
    # Get document characteristics
    page_count = self._get_page_count(file_id)
    # Apply size threshold logic
    if page_count < self.docling_split_threshold:
        logger.info(f"Document has {page_count} pages (< {self.docling_split_threshold} threshold) - creating single bundle")
        return "whole_document", {}
    # Check for split map availability
    split_map = self._load_split_map_if_needed(file_id)
    if split_map and self.docling_use_split_map:
        logger.info(f"Document has {page_count} pages (>= {self.docling_split_threshold} threshold) with split map - creating section-based bundles")
        return "split_by_sections", split_map
    else:
        logger.error(f"Document has {page_count} pages (>= {self.docling_split_threshold} threshold) without split map - ERROR")
        # BUG FIX: previously returned the bare string "error", which broke
        # tuple unpacking at call sites (ValueError: too many values to
        # unpack). Return the documented (mode, data) pair instead.
        return "error", {}
def _get_by_page_setting(self, pipeline_type: str) -> bool:
    """Return the BY_PAGE flag configured for the given pipeline type (False for unknown types)."""
    by_page_flags = {
        'no_ocr': self.docling_no_ocr_by_page,
        'ocr': self.docling_ocr_by_page,
        'vlm': self.docling_vlm_by_page,
    }
    return by_page_flags.get(pipeline_type, False)
def _get_pipeline_specific_config(self, pipeline_type: str) -> Dict[str, Any]:
    """Get pipeline-specific configuration options from environment variables.

    Each known pipeline type reads the same seven options from env vars
    prefixed with its own namespace (DOCLING_NO_OCR_*, DOCLING_OCR_*,
    DOCLING_VLM_*); only the per-pipeline defaults differ. Unknown types
    get a conservative static configuration.
    """
    # (env prefix, table_mode default, then boolean defaults for:
    #  cell_matching, formula, code, table_structure, pic_class, pic_desc)
    pipeline_table = {
        'no_ocr': ('DOCLING_NO_OCR', 'fast', 'false', 'false', 'false', 'true', 'false', 'false'),
        'ocr': ('DOCLING_OCR', 'accurate', 'true', 'true', 'true', 'true', 'false', 'false'),
        'vlm': ('DOCLING_VLM', 'accurate', 'true', 'false', 'false', 'true', 'true', 'true'),
    }
    if pipeline_type not in pipeline_table:
        # Default config for unknown pipeline types
        return {
            'table_mode': 'fast',
            'table_cell_matching': False,
            'do_formula_enrichment': False,
            'do_code_enrichment': False,
            'do_table_structure': True,
            'do_picture_classification': False,
            'do_picture_description': False
        }
    prefix, mode_default, cell_d, formula_d, code_d, struct_d, cls_d, desc_d = pipeline_table[pipeline_type]

    def env_flag(suffix: str, default: str) -> bool:
        return os.getenv(f'{prefix}_{suffix}', default).lower() == 'true'

    return {
        'table_mode': os.getenv(f'{prefix}_TABLE_MODE', mode_default),
        'table_cell_matching': env_flag('TABLE_CELL_MATCHING', cell_d),
        'do_formula_enrichment': env_flag('DO_FORMULA_ENRICHMENT', formula_d),
        'do_code_enrichment': env_flag('DO_CODE_ENRICHMENT', code_d),
        'do_table_structure': env_flag('DO_TABLE_STRUCTURE', struct_d),
        'do_picture_classification': env_flag('DO_PICTURE_CLASSIFICATION', cls_d),
        'do_picture_description': env_flag('DO_PICTURE_DESCRIPTION', desc_d)
    }
def _get_page_count(self, file_id: str) -> int:
    """Get page count for the file from existing artefacts (first Tika).

    Resolution order:
      1. A ``page_count`` value in the ``extra`` of any non-frontmatter artefact.
      2. Page-count keys parsed from the stored Tika JSON artefact.
      3. Direct PDF inspection via PyMuPDF (_get_page_count_direct_pdf).
    On any unexpected error, returns ``docling_split_threshold + 1`` so the
    document is treated as "large" (the safer assumption for splitting).
    """
    logger.info(f"🔍 PAGE COUNT: Starting page count detection for file {file_id}")
    try:
        # Try to get page count from existing artefacts, excluding frontmatter (partial document)
        artefacts = self.client.supabase.table('document_artefacts').select('type,extra').eq('file_id', file_id).execute()
        artefact_types = [art.get('type', 'unknown') for art in artefacts.data or []]
        logger.info(f"🔍 PAGE COUNT: Found {len(artefacts.data or [])} artefacts for file {file_id}: {artefact_types}")
        for art in artefacts.data or []:
            art_type = art.get('type', 'unknown')
            extra = art.get('extra', {})
            logger.info(f"🔍 PAGE COUNT: Checking artefact type '{art_type}' for file {file_id}")
            # Skip frontmatter artefacts as they only contain partial page counts
            if art_type == 'docling_frontmatter_json':
                logger.info(f"🔍 PAGE COUNT: Skipping frontmatter artefact (partial page count) for file {file_id}")
                continue
            # Also skip docling_json artefacts that are from frontmatter processing
            if art_type == 'docling_json' and extra.get('is_frontmatter', False):
                logger.info(f"🔍 PAGE COUNT: Skipping frontmatter-derived docling_json artefact (partial page count) for file {file_id}")
                continue
            # Also skip docling_json artefacts that have frontmatter-related pipeline info
            if art_type == 'docling_json' and extra.get('pipeline') == 'frontmatter_ocr':
                logger.info(f"🔍 PAGE COUNT: Skipping frontmatter pipeline docling_json artefact (partial page count) for file {file_id}")
                continue
            if 'page_count' in extra:
                # First artefact with a recorded page_count wins.
                page_count = int(extra['page_count'])
                logger.info(f"✅ PAGE COUNT: Found page count {page_count} from {art_type} artefact for file {file_id}")
                return page_count
            else:
                logger.info(f"🔍 PAGE COUNT: No page_count in {art_type} artefact for file {file_id}")
        logger.info(f"🔍 PAGE COUNT: No artefacts with page_count found, trying Tika JSON parsing for file {file_id}")
        # Try to get page count from Tika JSON (most reliable source)
        tika_arts = self.client.supabase.table('document_artefacts') \
            .select('rel_path') \
            .eq('file_id', file_id) \
            .eq('type', 'tika_json') \
            .execute()
        if tika_arts.data:
            logger.info(f"🔍 PAGE COUNT: Found Tika JSON artefact, parsing content for file {file_id}")
            file_info = self.client.supabase.table('files').select('bucket').eq('id', file_id).single().execute()
            if file_info.data:
                tika_data = self.storage.download_file(file_info.data['bucket'], tika_arts.data[0]['rel_path'])
                import json  # NOTE(review): redundant — json is already imported at module level
                tika_json = json.loads(tika_data.decode('utf-8'))
                # Check common Tika page count keys in top level and metadata
                logger.info(f"🔍 PAGE COUNT: Checking Tika JSON keys for page count in file {file_id}")
                # First check metadata section (most common location)
                metadata = tika_json.get('metadata', {})
                for key in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount", "pdf:pagecount", "meta:page-count", "pdfa:PDFVersion"):
                    # Check both exact key and lowercase version in metadata
                    value = metadata.get(key) or metadata.get(key.lower())
                    if value is not None:
                        try:
                            page_count = int(value)
                            # Reject zero/negative values and keep scanning keys.
                            if page_count > 0:
                                logger.info(f"✅ PAGE COUNT: Found page count {page_count} from Tika metadata key '{key}' for file {file_id}")
                                return page_count
                        except Exception as parse_error:
                            logger.info(f"🔍 PAGE COUNT: Could not parse value '{value}' from Tika metadata key '{key}': {parse_error}")
                            continue
                # Also check top level (fallback)
                for key in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount", "pdf:pagecount"):
                    value = tika_json.get(key) or tika_json.get(key.lower())
                    if value is not None:
                        try:
                            page_count = int(value)
                            if page_count > 0:
                                logger.info(f"✅ PAGE COUNT: Found page count {page_count} from Tika JSON top-level key '{key}' for file {file_id}")
                                return page_count
                        except Exception as parse_error:
                            logger.info(f"🔍 PAGE COUNT: Could not parse value '{value}' from Tika top-level key '{key}': {parse_error}")
                            continue
                # Debug: Show available keys to help diagnose issues
                logger.info(f"🔍 PAGE COUNT: Available Tika JSON top-level keys: {list(tika_json.keys())}")
                if 'metadata' in tika_json:
                    logger.info(f"🔍 PAGE COUNT: Available Tika metadata keys: {list(metadata.keys())}")
                logger.warning(f"🔍 PAGE COUNT: No valid page count keys found in Tika JSON for file {file_id}")
            else:
                logger.warning(f"🔍 PAGE COUNT: Could not get file info for Tika JSON parsing for file {file_id}")
        else:
            logger.warning(f"🔍 PAGE COUNT: No Tika JSON artefact found for file {file_id}")
        # Final fallback - try to get it directly from PDF using PyMuPDF
        logger.warning(f"🔍 PAGE COUNT: Trying direct PDF parsing as final fallback for file {file_id}")
        return self._get_page_count_direct_pdf(file_id)
    except Exception as e:
        # Defaulting above the threshold forces the split-map path downstream.
        logger.error(f"❌ PAGE COUNT: Error getting page count for file {file_id}: {e}, defaulting to {self.docling_split_threshold + 1}")
        return self.docling_split_threshold + 1
def _get_page_count_direct_pdf(self, file_id: str) -> int:
    """Final fallback: Get page count directly from PDF using PyMuPDF.

    Downloads the original file from storage and counts its pages; on any
    failure, returns ``docling_split_threshold + 1`` so the document is
    treated as large.
    """
    try:
        # Look up the file's storage location in the database.
        lookup = self.client.supabase.table('files').select('bucket,path,cabinet_id').eq('id', file_id).single().execute()
        if not lookup.data:
            logger.warning(f"🔍 PAGE COUNT: Could not find file info for {file_id}, defaulting to threshold + 1")
            return self.docling_split_threshold + 1
        row = lookup.data
        # Download and read PDF directly with PyMuPDF
        logger.info(f"🔍 PAGE COUNT: Reading PDF directly from storage for file {file_id}")
        pdf_bytes = self.storage.download_file(row['bucket'], row['path'])
        import fitz  # PyMuPDF
        document = fitz.open(stream=pdf_bytes, filetype="pdf")
        total_pages = len(document)
        document.close()
        logger.info(f"✅ PAGE COUNT: Direct PDF reading found {total_pages} pages for file {file_id}")
        return total_pages
    except Exception as e:
        logger.error(f"❌ PAGE COUNT: Direct PDF reading failed for file {file_id}: {e}, defaulting to {self.docling_split_threshold + 1}")
        return self.docling_split_threshold + 1
def _get_page_ranges(self, file_id: str) -> dict:
    """Build the page-range payload used for page-based bundle processing."""
    total = self._get_page_count(file_id)
    # One entry per page, 1-indexed and inclusive of the last page.
    return {
        'pages': [page for page in range(1, total + 1)],
        'total_pages': total
    }
def _load_split_map_if_needed(self, file_id: str) -> Optional[Dict[str, Any]]:
    """Resolve the file's bucket and load its split map; None on any failure."""
    try:
        lookup = self.client.supabase.table('files').select('bucket').eq('id', file_id).single().execute()
        return self._load_split_map(lookup.data['bucket'], file_id) if lookup.data else None
    except Exception:
        # Best-effort lookup: callers treat a missing split map as "not available".
        return None
def _create_chunked_ranges(self, page_count: int) -> dict:
    """Create chunked page ranges for large documents without split maps.

    Chunk size is one quarter of the split threshold, but never below 10 pages.
    """
    span = max(10, self.docling_split_threshold // 4)
    chunks = [
        {
            'start': first,
            'end': min(first + span - 1, page_count),
            'title': f'Pages {first}-{min(first + span - 1, page_count)}'
        }
        for first in range(1, page_count + 1, span)
    ]
    return {
        'entries': chunks,
        'total_chunks': len(chunks)
    }
def _enqueue_single_pipeline_with_deps(self, file_id: str, pipeline_type: str, base_config: Dict[str, Any],
                                       bucket: str, file_path: str, cabinet_id: str, mime_type: str,
                                       depends_on: List[str]) -> Optional[Dict[str, Any]]:
    """Enqueue a single pipeline with dependencies on previous pipeline tasks.

    Merges the shared base config with per-pipeline env options, determines
    the processing mode, and enqueues one bundle task.

    Args:
        file_id: File to process.
        pipeline_type: 'no_ocr' | 'ocr' | 'vlm'.
        base_config: Shared docling options (merged under pipeline options).
        bucket/file_path/cabinet_id/mime_type: Storage location of the document.
        depends_on: Task IDs from the previous pipeline that must finish first.

    Returns:
        Summary dict (group_id, task_ids, task_count, processing mode/data),
        or None when the pipeline type is unknown or no valid processing
        mode exists.
    """
    group_id = str(uuid.uuid4())
    # Get pipeline-specific configuration options
    pipeline_specific_config = self._get_pipeline_specific_config(pipeline_type)
    if pipeline_type == 'no_ocr':
        config = {
            **base_config,
            **pipeline_specific_config,
            'do_ocr': False,
            'force_ocr': False,
            'pipeline': 'standard'
        }
        logger.info(f"NO_OCR pipeline config: table_mode={config['table_mode']}, "
                    f"formula_enrichment={config['do_formula_enrichment']}, "
                    f"code_enrichment={config['do_code_enrichment']}")
    elif pipeline_type == 'ocr':
        config = {
            **base_config,
            **pipeline_specific_config,
            'do_ocr': True,
            'force_ocr': False,
            'pipeline': 'standard'
        }
        logger.info(f"OCR pipeline config: table_mode={config['table_mode']}, "
                    f"formula_enrichment={config['do_formula_enrichment']}, "
                    f"code_enrichment={config['do_code_enrichment']}")
    elif pipeline_type == 'vlm':
        config = {
            **base_config,
            **pipeline_specific_config,
            'do_ocr': False,
            'force_ocr': False,
            'pipeline': 'vlm',
            'vlm_pipeline_model': os.getenv('DOCLING_VLM_MODEL', 'smoldocling')
        }
        logger.info(f"VLM pipeline config: table_mode={config['table_mode']}, "
                    f"picture_classification={config['do_picture_classification']}, "
                    f"picture_description={config['do_picture_description']}")
    else:
        logger.error(f"Unknown pipeline type: {pipeline_type}")
        return None
    # Determine processing mode using corrected logic
    processing_mode, processing_data = self._determine_processing_mode(file_id, pipeline_type)
    # ROBUSTNESS: a large document with no split map yields mode "error";
    # bail out instead of enqueueing a bundle task that cannot be processed.
    if processing_mode == "error":
        logger.error(f"No valid processing mode for {pipeline_type} pipeline on file {file_id} - skipping enqueue")
        return None
    # Enqueue single bundle task with dependencies
    task_id = self._enqueue_bundle_task_with_deps(
        file_id, pipeline_type, group_id, config, processing_mode, processing_data,
        bucket, file_path, cabinet_id, mime_type, depends_on
    )
    return {
        'group_id': group_id,
        'task_ids': [task_id] if task_id else [],
        'task_count': 1 if task_id else 0,
        'processing_mode': processing_mode,
        'processing_data': processing_data
    }
def _enqueue_bundle_task_with_deps(self, file_id: str, pipeline_type: str, group_id: str,
                                   config: Dict[str, Any], processing_mode: str, processing_data: dict,
                                   bucket: str, file_path: str, cabinet_id: str, mime_type: str,
                                   depends_on: List[str]) -> Optional[str]:
    """
    Enqueue a single bundle task that handles processing internally based on mode.

    This replaces the old approach of creating multiple individual tasks.

    Args:
        file_id: File to process.
        pipeline_type: 'no_ocr' | 'ocr' | 'vlm' (used for metadata/logging).
        group_id: Correlation ID shared by all tasks of one pipeline run.
        config: Fully-merged docling configuration for this pipeline.
        processing_mode: 'whole_document' or a 'split_by_*' mode.
        processing_data: Mode-specific data (page ranges, split-map entries, ...).
        bucket/file_path/cabinet_id/mime_type: Storage location of the document.
        depends_on: Task IDs that must complete before this task runs.

    Returns:
        The enqueued task ID, or None if enqueueing failed.
    """
    # NOTE: enqueue_docling_task, TaskPriority and create_standard_metadata
    # are imported at module level; the redundant function-local re-imports
    # were removed. The unused local `bundle_type` was removed as well.
    # Map processing modes to task types
    if processing_mode == "whole_document":
        task_type = 'docling_bundle'
    else:
        task_type = 'docling_bundle_split'
    # Translate the controller's processing mode into the bundle-metadata vocabulary.
    if processing_mode == "whole_document":
        bundle_processing_mode = "whole_document"
    elif processing_mode.startswith("split_by_"):
        split_mode_map = {
            "split_by_pages": "pages",
            "split_by_sections": "sections",
            "split_by_chunks": "chunks",
        }
        # Unlisted split modes fall back to stripping the prefix.
        bundle_processing_mode = split_mode_map.get(processing_mode, processing_mode.replace('split_by_', ''))
    else:
        bundle_processing_mode = processing_mode
    bundle_metadata = create_standard_metadata(
        file_id=file_id,
        pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
        processing_mode=bundle_processing_mode,
        config=config,
        group_id=group_id,
        producer="auto_phase2"
    )
    # Create task payload with new bundle architecture
    payload = {
        'bucket': bucket,
        'file_path': file_path,
        'cabinet_id': cabinet_id,
        'mime_type': mime_type,
        'config': config,
        'processing_mode': processing_mode,
        'processing_data': processing_data,
        'bundle_metadata': bundle_metadata.to_artefact_extra(),
        'depends_on': depends_on
    }
    # Determine timeout based on processing complexity
    if processing_mode == "whole_document":
        timeout = 7200  # 2 hours for whole document
    elif processing_mode == "split_by_pages":
        # Estimate based on page count
        page_count = processing_data.get('total_pages', 50)
        timeout = min(14400, max(3600, page_count * 60))  # 1-4 hours based on pages
    else:
        # Section or chunk based processing
        section_count = len(processing_data.get('entries', []))
        timeout = min(10800, max(3600, section_count * 300))  # 1-3 hours based on sections
    logger.info(f"Enqueuing {task_type} task for {pipeline_type} pipeline: {processing_mode} (timeout: {timeout}s)")
    try:
        task_id = enqueue_docling_task(
            file_id=file_id,
            task_type=task_type,
            payload=payload,
            priority=TaskPriority.NORMAL,
            timeout=timeout
        )
        logger.info(f"Successfully enqueued {task_type} task {task_id} for {pipeline_type} pipeline")
        return task_id
    except Exception as e:
        # Callers treat a None task ID as "no task enqueued" for this pipeline.
        logger.error(f"Failed to enqueue bundle task for {pipeline_type} pipeline: {e}")
        return None
def trigger_phase2_pipelines(self, file_id: str, file_data: Dict[str, Any]) -> Dict[str, Any]:
    """
    DEPRECATED: Trigger Phase 2 sequential processing pipelines.

    Retained for backward compatibility; delegates to
    enqueue_sequential_docling_pipelines, which implements the same
    no_ocr → ocr → vlm ordering via task dependencies.

    Args:
        file_id: The file ID to process
        file_data: File processing information (bucket, path, etc.)
    Returns:
        Dictionary with triggered pipeline information
    """
    logger.info(f"Phase 2: Starting sequential content processing for file {file_id}")
    # Preserve the original early-return when nothing is enabled, including
    # its distinct 'triggered_pipelines' response shape.
    pipeline_order = []
    if self.auto_docling_no_ocr:
        pipeline_order.append('no_ocr')
    if self.auto_docling_ocr:
        pipeline_order.append('ocr')
    if self.auto_docling_vlm:
        pipeline_order.append('vlm')
    if not pipeline_order:
        logger.info(f"Phase 2: No pipelines enabled for file {file_id}")
        return {
            'file_id': file_id,
            'triggered_pipelines': {},
            'total_tasks': 0,
            'sequential_order': [],
            'message': 'No Phase 2 pipelines enabled'
        }
    logger.info(f"Phase 2: Sequential pipeline order for file {file_id}: {pipeline_order}")
    logger.warning("trigger_phase2_pipelines is deprecated - use enqueue_sequential_docling_pipelines for new implementations")
    # DEAD CODE REMOVED: this method previously built an unused base_config
    # dict and a triggered_pipelines dict that were discarded before
    # delegating; the delegate builds its own identical base_config.
    return self.enqueue_sequential_docling_pipelines(file_id, file_data)
def _start_single_pipeline(self, file_id: str, pipeline_type: str, base_config: Dict[str, Any],
                           bucket: str, file_path: str, cabinet_id: str, mime_type: str) -> Optional[Dict[str, Any]]:
    """Start a single pipeline of the specified type.

    Returns a summary dict (group id, task count, by-page flag), or None
    when the pipeline type is not recognised.
    """
    # Select the per-type config overlay and page-granularity flag, then
    # enqueue through the shared path below.
    if pipeline_type == 'no_ocr':
        overrides = {
            'do_ocr': False,
            'force_ocr': False,
            'pipeline': 'standard'
        }
        by_page = self.docling_no_ocr_by_page
    elif pipeline_type == 'ocr':
        overrides = {
            'do_ocr': True,
            'ocr_engine': os.getenv('OCR_ENGINE', 'easyocr'),
            'force_ocr': False,
            'pipeline': 'standard'
        }
        by_page = self.docling_ocr_by_page
    elif pipeline_type == 'vlm':
        overrides = {
            'do_ocr': False,
            'force_ocr': False,
            'pipeline': 'vlm',
            'vlm_pipeline_model': os.getenv('DOCLING_VLM_MODEL', 'smoldocling')
        }
        by_page = self.docling_vlm_by_page
    else:
        logger.error(f"Unknown pipeline type: {pipeline_type}")
        return None
    group_id = str(uuid.uuid4())
    tasks = self._enqueue_pipeline(
        file_id, pipeline_type, group_id, {**base_config, **overrides},
        bucket, file_path, cabinet_id, mime_type,
        by_page=by_page
    )
    return {
        'group_id': group_id,
        'task_count': len(tasks),
        'by_page': by_page
    }
# continue_sequential_pipeline method removed - task dependencies now handle sequential execution
def _load_split_map(self, bucket: str, file_id: str) -> Optional[Dict[str, Any]]:
"""Load split map data for a file."""
try:
arts = self.client.supabase.table('document_artefacts') \
.select('id,type,rel_path') \
.eq('file_id', file_id).eq('type', 'split_map_json') \
.order('created_at', desc=True).limit(1).execute().data or []
if not arts:
return None
art = arts[0]
raw = self.storage.download_file(bucket, art['rel_path'])
import json as _json
return _json.loads(raw.decode('utf-8'))
except Exception:
return None
    def _enqueue_pipeline(self, file_id: str, pipeline_type: str, group_id: str, config: Dict[str, Any],
                          bucket: str, file_path: str, cabinet_id: str, mime_type: str,
                          by_page: bool = False) -> List[str]:
        """Enqueue tasks for a specific pipeline (OCR/No-OCR/VLM).

        Fan-out strategy, in priority order:
          1. ``by_page=True`` -- one task per page (VLM: one task per split-map
             section via 'vlm_section_page_bundle'), driven by the split map;
             returns [] if no split map exists.
          2. ``self.docling_use_split_map`` -- one 'canonical_docling_json' task
             per split-map section; returns [] if no split map exists.
          3. Otherwise -- a single task covering the whole document.

        Args:
            file_id: Target file identifier.
            pipeline_type: 'ocr', 'no_ocr' or 'vlm'; selects task shape/metadata.
            group_id: Shared group id stamped into every task's bundle metadata.
            config: Docling config; its 'do_ocr' flag determines the recorded
                pipeline name for non-VLM tasks.
            bucket: Storage bucket of the source document.
            file_path: Path of the source document within the bucket.
            cabinet_id: Owning cabinet identifier, passed through in payloads.
            mime_type: Source document MIME type, passed through in payloads.
            by_page: Process each page individually instead of whole sections.

        Returns:
            List of enqueued task ids (possibly empty).
        """
        task_ids = []
        if by_page:
            # Process each page individually, then group by split map sections
            logger.info(f"Phase 2: Enqueueing {pipeline_type} pipeline by page for file {file_id}")
            split_map = self._load_split_map(bucket, file_id)
            if split_map:
                entries = split_map.get('entries', [])
                # 1-based section indices, matching the split_order convention below.
                for section_idx, entry in enumerate(entries, 1):
                    start_page = int(entry.get('start_page', 1))
                    end_page = int(entry.get('end_page', start_page))
                    section_title = entry.get('title', f'Section {section_idx}')
                    if pipeline_type == 'vlm':
                        # VLM uses specialized page processing: one bundle task per
                        # section rather than one task per page.
                        section_task_id = enqueue_docling_task(
                            file_id=file_id,
                            task_type='vlm_section_page_bundle',
                            payload={
                                'section_idx': section_idx,
                                'start_page': start_page,
                                'end_page': end_page,
                                'section_title': section_title,
                                'vlm_group_id': group_id,
                                'vlm_model': config.get('vlm_pipeline_model', 'smoldocling'),
                                'base_config': config,
                                'total_sections': len(entries),
                                'producer': 'auto_phase2'
                            },
                            priority=TaskPriority.NORMAL,
                            timeout=3600
                        )
                        task_ids.append(section_task_id)
                    else:
                        # OCR/No-OCR by page processing (process each page in section individually)
                        for page_num in range(start_page, end_page + 1):
                            # Restrict the docling run to a single page.
                            page_config = {
                                **config,
                                'page_range': [page_num, page_num]
                            }
                            # Create standardized bundle metadata
                            page_metadata = create_standard_metadata(
                                file_id=file_id,
                                pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
                                processing_mode="individual_pages",
                                config=page_config,
                                group_id=group_id,
                                split_order=section_idx,
                                split_total=len(entries),
                                split_heading=section_title,
                                page_range=[page_num, page_num],
                                producer="auto_phase2"
                            )
                            # Add legacy fields for backward compatibility
                            artefact_extra = page_metadata.to_artefact_extra()
                            artefact_extra.update({
                                'section_idx': section_idx,
                                'section_title': section_title,
                                'page_number': page_num,
                            })
                            page_task_id = enqueue_docling_task(
                                file_id=file_id,
                                task_type='canonical_docling_json',
                                payload={
                                    'bucket': bucket,
                                    'file_path': file_path,
                                    'cabinet_id': cabinet_id,
                                    'mime_type': mime_type,
                                    'config': page_config,
                                    'artefact_extra': artefact_extra
                                },
                                priority=TaskPriority.NORMAL,
                                # Shorter timeout: a single page only.
                                timeout=1800
                            )
                            task_ids.append(page_task_id)
            else:
                logger.warning(f"Phase 2: No split map found for by-page processing of file {file_id}")
                return []
        elif self.docling_use_split_map:
            # Process by split map sections
            logger.info(f"Phase 2: Enqueueing {pipeline_type} pipeline by split map sections for file {file_id}")
            split_map = self._load_split_map(bucket, file_id)
            if split_map:
                entries = split_map.get('entries', [])
                # Normalize and sort entries by start_page; malformed entries are
                # skipped rather than failing the whole fan-out.
                normalized_entries = []
                for entry in entries:
                    try:
                        start_page = int(entry.get('start_page', 1))
                        end_page = int(entry.get('end_page', start_page))
                        title = entry.get('title') or entry.get('label') or ''
                        # Clamp inverted ranges to a single page.
                        if end_page < start_page:
                            end_page = start_page
                        normalized_entries.append({
                            'start': start_page,
                            'end': end_page,
                            'title': title
                        })
                    except Exception:
                        continue
                normalized_entries.sort(key=lambda x: x['start'])
                # Create tasks for each section (1-based split_order).
                for i, entry in enumerate(normalized_entries, 1):
                    section_config = {
                        **config,
                        'page_range': [entry['start'], entry['end']]
                    }
                    # Create standardized bundle metadata for section
                    section_metadata = create_standard_metadata(
                        file_id=file_id,
                        pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
                        processing_mode="split_sections",
                        config=section_config,
                        group_id=group_id,
                        split_order=i,
                        split_total=len(normalized_entries),
                        split_heading=entry['title'] or f'Section {i}',
                        page_range=[entry['start'], entry['end']],
                        producer="auto_phase2"
                    )
                    section_task_id = enqueue_docling_task(
                        file_id=file_id,
                        task_type='canonical_docling_json',
                        payload={
                            'bucket': bucket,
                            'file_path': file_path,
                            'cabinet_id': cabinet_id,
                            'mime_type': mime_type,
                            'config': section_config,
                            'artefact_extra': section_metadata.to_artefact_extra()
                        },
                        priority=TaskPriority.NORMAL,
                        timeout=3600
                    )
                    task_ids.append(section_task_id)
            else:
                logger.warning(f"Phase 2: No split map found for section-based processing of file {file_id}")
                return []
        else:
            # Process whole document
            logger.info(f"Phase 2: Enqueueing {pipeline_type} pipeline for whole document {file_id}")
            # Create standardized bundle metadata for whole document
            whole_doc_metadata = create_standard_metadata(
                file_id=file_id,
                pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
                processing_mode="whole_document",
                config=config,
                group_id=group_id,
                producer="auto_phase2"
            )
            task_id = enqueue_docling_task(
                file_id=file_id,
                task_type='canonical_docling_json',
                payload={
                    'bucket': bucket,
                    'file_path': file_path,
                    'cabinet_id': cabinet_id,
                    'mime_type': mime_type,
                    'config': config,
                    'artefact_extra': whole_doc_metadata.to_artefact_extra()
                },
                priority=TaskPriority.NORMAL,
                # Longest timeout: full-document conversion.
                timeout=7200
            )
            task_ids.append(task_id)
        logger.info(f"Phase 2: Enqueued {len(task_ids)} tasks for {pipeline_type} pipeline")
        return task_ids
    def _enqueue_pipeline_with_deps(self, file_id: str, pipeline_type: str, group_id: str, config: Dict[str, Any],
                                    bucket: str, file_path: str, cabinet_id: str, mime_type: str,
                                    by_page: bool = False, depends_on: List[str] = None) -> List[str]:
        """Enqueue tasks for a specific pipeline with dependencies.

        NOTE: intentionally mirrors ``_enqueue_pipeline``; the only difference is
        that every enqueued task's payload carries a 'depends_on' list of task ids
        (presumably gating execution in the queue system, per the sequential
        pipeline design -- confirm against the queue consumer). Keep the two
        methods in sync when changing fan-out logic.

        Args:
            file_id: Target file identifier.
            pipeline_type: 'ocr', 'no_ocr' or 'vlm'; selects task shape/metadata.
            group_id: Shared group id stamped into every task's bundle metadata.
            config: Docling config; its 'do_ocr' flag determines the recorded
                pipeline name for non-VLM tasks.
            bucket: Storage bucket of the source document.
            file_path: Path of the source document within the bucket.
            cabinet_id: Owning cabinet identifier, passed through in payloads.
            mime_type: Source document MIME type, passed through in payloads.
            by_page: Process each page individually instead of whole sections.
            depends_on: Task ids this pipeline's tasks should wait on; defaults
                to an empty list (None-default avoids the mutable-default trap).

        Returns:
            List of enqueued task ids (possibly empty).
        """
        if depends_on is None:
            depends_on = []
        task_ids = []
        if by_page:
            # Process each page individually, then group by split map sections
            logger.info(f"Enqueueing {pipeline_type} pipeline by page for file {file_id} with {len(depends_on)} dependencies")
            split_map = self._load_split_map(bucket, file_id)
            if split_map:
                entries = split_map.get('entries', [])
                # 1-based section indices, matching the split_order convention below.
                for section_idx, entry in enumerate(entries, 1):
                    start_page = int(entry.get('start_page', 1))
                    end_page = int(entry.get('end_page', start_page))
                    section_title = entry.get('title', f'Section {section_idx}')
                    if pipeline_type == 'vlm':
                        # VLM uses specialized page processing: one bundle task per
                        # section rather than one task per page.
                        section_task_id = enqueue_docling_task(
                            file_id=file_id,
                            task_type='vlm_section_page_bundle',
                            payload={
                                'section_idx': section_idx,
                                'start_page': start_page,
                                'end_page': end_page,
                                'section_title': section_title,
                                'vlm_group_id': group_id,
                                'vlm_model': config.get('vlm_pipeline_model', 'smoldocling'),
                                'base_config': config,
                                'total_sections': len(entries),
                                'producer': 'auto_phase2',
                                'depends_on': depends_on
                            },
                            priority=TaskPriority.NORMAL,
                            timeout=3600
                        )
                        task_ids.append(section_task_id)
                    else:
                        # OCR/No-OCR by page processing (process each page in section individually)
                        for page_num in range(start_page, end_page + 1):
                            # Restrict the docling run to a single page.
                            page_config = {
                                **config,
                                'page_range': [page_num, page_num]
                            }
                            # Create standardized bundle metadata
                            page_metadata = create_standard_metadata(
                                file_id=file_id,
                                pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
                                processing_mode="individual_pages",
                                config=page_config,
                                group_id=group_id,
                                split_order=section_idx,
                                split_total=len(entries),
                                split_heading=section_title,
                                page_range=[page_num, page_num],
                                producer="auto_phase2"
                            )
                            # Add legacy fields for backward compatibility
                            artefact_extra = page_metadata.to_artefact_extra()
                            artefact_extra.update({
                                'section_idx': section_idx,
                                'section_title': section_title,
                                'page_number': page_num,
                            })
                            page_task_id = enqueue_docling_task(
                                file_id=file_id,
                                task_type='canonical_docling_json',
                                payload={
                                    'bucket': bucket,
                                    'file_path': file_path,
                                    'cabinet_id': cabinet_id,
                                    'mime_type': mime_type,
                                    'config': page_config,
                                    'artefact_extra': artefact_extra,
                                    'depends_on': depends_on
                                },
                                priority=TaskPriority.NORMAL,
                                # Shorter timeout: a single page only.
                                timeout=1800
                            )
                            task_ids.append(page_task_id)
            else:
                logger.warning(f"No split map found for by-page processing of file {file_id}")
                return []
        elif self.docling_use_split_map:
            # Process by split map sections
            logger.info(f"Enqueueing {pipeline_type} pipeline by split map sections for file {file_id} with {len(depends_on)} dependencies")
            split_map = self._load_split_map(bucket, file_id)
            if split_map:
                entries = split_map.get('entries', [])
                # Normalize and sort entries by start_page; malformed entries are
                # skipped rather than failing the whole fan-out.
                normalized_entries = []
                for entry in entries:
                    try:
                        start_page = int(entry.get('start_page', 1))
                        end_page = int(entry.get('end_page', start_page))
                        title = entry.get('title') or entry.get('label') or ''
                        # Clamp inverted ranges to a single page.
                        if end_page < start_page:
                            end_page = start_page
                        normalized_entries.append({
                            'start': start_page,
                            'end': end_page,
                            'title': title
                        })
                    except Exception:
                        continue
                normalized_entries.sort(key=lambda x: x['start'])
                # Create tasks for each section (1-based split_order).
                for i, entry in enumerate(normalized_entries, 1):
                    section_config = {
                        **config,
                        'page_range': [entry['start'], entry['end']]
                    }
                    # Create standardized bundle metadata for section
                    section_metadata = create_standard_metadata(
                        file_id=file_id,
                        pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
                        processing_mode="split_sections",
                        config=section_config,
                        group_id=group_id,
                        split_order=i,
                        split_total=len(normalized_entries),
                        split_heading=entry['title'] or f'Section {i}',
                        page_range=[entry['start'], entry['end']],
                        producer="auto_phase2"
                    )
                    section_task_id = enqueue_docling_task(
                        file_id=file_id,
                        task_type='canonical_docling_json',
                        payload={
                            'bucket': bucket,
                            'file_path': file_path,
                            'cabinet_id': cabinet_id,
                            'mime_type': mime_type,
                            'config': section_config,
                            'artefact_extra': section_metadata.to_artefact_extra(),
                            'depends_on': depends_on
                        },
                        priority=TaskPriority.NORMAL,
                        timeout=3600
                    )
                    task_ids.append(section_task_id)
            else:
                logger.warning(f"No split map found for section-based processing of file {file_id}")
                return []
        else:
            # Process whole document
            logger.info(f"Enqueueing {pipeline_type} pipeline for whole document {file_id} with {len(depends_on)} dependencies")
            # Create standardized bundle metadata for whole document
            whole_doc_metadata = create_standard_metadata(
                file_id=file_id,
                pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"),
                processing_mode="whole_document",
                config=config,
                group_id=group_id,
                producer="auto_phase2"
            )
            task_id = enqueue_docling_task(
                file_id=file_id,
                task_type='canonical_docling_json',
                payload={
                    'bucket': bucket,
                    'file_path': file_path,
                    'cabinet_id': cabinet_id,
                    'mime_type': mime_type,
                    'config': config,
                    'artefact_extra': whole_doc_metadata.to_artefact_extra(),
                    'depends_on': depends_on
                },
                priority=TaskPriority.NORMAL,
                # Longest timeout: full-document conversion.
                timeout=7200
            )
            task_ids.append(task_id)
        logger.info(f"Enqueued {len(task_ids)} tasks for {pipeline_type} pipeline with dependencies")
        return task_ids
# Global pipeline controller instance
_controller_instance = None


def get_pipeline_controller() -> DocumentPipelineController:
    """Return the shared DocumentPipelineController, constructing it on first use."""
    global _controller_instance
    if _controller_instance is not None:
        return _controller_instance
    _controller_instance = DocumentPipelineController()
    return _controller_instance