""" Pipeline Controller for Three-Phase Document Processing Architecture This module coordinates the three phases of document processing: - Phase 1: Document Structure Discovery & Analysis - Phase 2: Parallel Content Processing Pipelines - Phase 3: Enhanced Frontend Viewing (handled by frontend) Features: - Environment variable controlled auto-processing - Phase 1 completion detection - Automatic Phase 2 triggering - Intelligent retry and coordination logic """ import json import os import uuid import time from typing import Dict, Any, List, Optional, Set from pathlib import Path from modules.logger_tool import initialise_logger from modules.database.supabase.utils.client import SupabaseServiceRoleClient from modules.database.supabase.utils.storage import StorageAdmin from modules.queue_system import ( enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task, enqueue_document_analysis_task, enqueue_page_images_task, TaskPriority, get_queue ) from modules.bundle_metadata import ( create_standard_metadata, BundleMetadata, PipelineType, ProcessingMode, BundleType ) logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True) class DocumentPipelineController: """ Coordinates the three-phase document processing pipeline. """ def __init__(self): self.client = SupabaseServiceRoleClient() self.storage = StorageAdmin() # Phase 1 environment variables self.auto_tika = os.getenv('AUTO_TIKA_PROCESSING', 'true').lower() == 'true' self.auto_page_images = os.getenv('AUTO_PAGE_IMAGES', 'true').lower() == 'true' self.auto_document_analysis = os.getenv('AUTO_DOCUMENT_ANALYSIS', 'true').lower() == 'true' self.auto_split_map = os.getenv('AUTO_SPLIT_MAP_GENERATION', 'true').lower() == 'true' # Phase 2 environment variables self.auto_docling_ocr = os.getenv('AUTO_DOCLING_OCR', 'true').lower() == 'true' self.auto_docling_no_ocr = os.getenv('AUTO_DOCLING_NO_OCR', 'true').lower() == 'true' self.auto_docling_vlm = os.getenv('AUTO_DOCLING_VLM', 'false').lower() == 'true' # Processing granularity self.docling_ocr_by_page = os.getenv('DOCLING_OCR_BY_PAGE', 'false').lower() == 'true' self.docling_no_ocr_by_page = os.getenv('DOCLING_NO_OCR_BY_PAGE', 'false').lower() == 'true' self.docling_vlm_by_page = os.getenv('DOCLING_VLM_BY_PAGE', 'true').lower() == 'true' # Grouping strategy self.docling_use_split_map = os.getenv('DOCLING_USE_SPLIT_MAP', 'true').lower() == 'true' self.docling_split_threshold = int(os.getenv('DOCLING_SPLIT_THRESHOLD', '50')) logger.info("Pipeline controller initialized with new bundle architecture") def enqueue_phase1_tasks(self, file_id: str, file_row: Dict[str, Any], processing_path: str, processing_mime: str, priority: TaskPriority = TaskPriority.HIGH) -> Dict[str, List[str]]: """ Enqueue Phase 1 tasks: Structure Discovery & Analysis Returns: Dictionary mapping task types to task IDs """ logger.info(f"Phase 1: Starting structure discovery for file {file_id}") task_ids = {} bucket = file_row['bucket'] cabinet_id = file_row['cabinet_id'] # Step 1: Tika Processing (metadata extraction) if self.auto_tika: tika_url = os.getenv('TIKA_URL') if tika_url: tika_task_id = enqueue_tika_task( file_id=file_id, payload={ 'bucket': bucket, 'file_path': processing_path, 'cabinet_id': cabinet_id, 'mime_type': processing_mime }, priority=priority ) task_ids['tika'] = [tika_task_id] logger.info(f"Phase 1: Enqueued Tika task {tika_task_id}") else: logger.warning("Phase 1: Tika enabled but TIKA_URL not configured") # Step 2: Frontmatter processing (lightweight document overview) docling_url = os.getenv('DOCLING_URL') or os.getenv('NEOFS_DOCLING_URL') if docling_url: try: front_pages = int(os.getenv('DOCLING_FRONTPAGES', '3')) except Exception: front_pages = 3 # Create enhanced metadata for frontmatter JSON display in UI frontmatter_metadata = { 'display_name': f'Document Frontmatter (p1-{front_pages})', 'bundle_label': 'Frontmatter Analysis', 'section_title': 'Document Frontmatter', 'page_range': [1, front_pages], 'page_count': front_pages, 'bundle_type': 'frontmatter_json', 'processing_mode': 'frontmatter_analysis', 'pipeline': 'frontmatter_ocr', 'is_frontmatter': True, 'ui_category': 'document_analysis', 'ui_order': 1, # Show first in UI 'description': f'OCR analysis of first {front_pages} pages for document structure and metadata', 'viewer_type': 'json' } frontmatter_task_id = enqueue_docling_task( file_id=file_id, task_type='docling_frontmatter_json', payload={ 'bucket': bucket, 'file_path': processing_path, 'cabinet_id': cabinet_id, 'mime_type': processing_mime, 'config': { 'do_ocr': True, 'force_ocr': False, 'image_export_mode': 'embedded', 'ocr_engine': 'easyocr', 'ocr_lang': 'en', 'pdf_backend': 'dlparse_v4', 'table_mode': 'fast', 'target_type': 'inbody', 'to_formats': 'json', 'page_range': [1, front_pages] }, 'artefact_extra': frontmatter_metadata, 'depends_on': task_ids.get('tika', []) }, priority=priority, timeout=int(os.getenv('DOCLING_FRONTMATTER_TIMEOUT', '1800')) ) task_ids['frontmatter'] = [frontmatter_task_id] logger.info(f"Phase 1: Enqueued frontmatter task {frontmatter_task_id}") # Step 3: Document Structure Analysis (LLM-enhanced hierarchy) if self.auto_document_analysis: analysis_task_id = enqueue_docling_task( file_id=file_id, task_type='document_structure_analysis', payload={ 'bucket': bucket, 'file_path': processing_path, 'cabinet_id': cabinet_id, 'mime_type': processing_mime, 'config': { 'target_type': 'inbody', 'to_formats': 'json', 'do_ocr': False, 'force_ocr': False }, 'depends_on': task_ids.get('frontmatter', []) }, priority=priority, timeout=int(os.getenv('DOCUMENT_ANALYSIS_TIMEOUT', '300')) ) task_ids['document_analysis'] = [analysis_task_id] logger.info(f"Phase 1: Enqueued document analysis task {analysis_task_id}") # Step 4: Split Map Generation (definitive section boundaries) if self.auto_split_map: split_map_task_id = enqueue_split_map_task( file_id=file_id, payload={ 'depends_on': task_ids.get('frontmatter', []) + task_ids.get('document_analysis', []) }, priority=TaskPriority.NORMAL ) task_ids['split_map'] = [split_map_task_id] logger.info(f"Phase 1: Enqueued split map task {split_map_task_id}") # Step 5: Page Images Generation (for frontend viewing) if self.auto_page_images: page_images_task_id = enqueue_docling_task( file_id=file_id, task_type='generate_page_images', payload={ 'bucket': bucket, 'file_path': processing_path, 'cabinet_id': cabinet_id, 'mime_type': processing_mime, 'config': {}, 'depends_on': task_ids.get('document_analysis', []) }, priority=TaskPriority.NORMAL, timeout=int(os.getenv('PAGE_IMAGES_TIMEOUT', '1800')) ) task_ids['page_images'] = [page_images_task_id] logger.info(f"Phase 1: Enqueued page images task {page_images_task_id}") # Bundle tasks are now directly enqueued by split_map task completion total_tasks = sum(len(task_list) for task_list in task_ids.values()) logger.info(f"Phase 1: Enqueued {total_tasks} tasks for file {file_id}: {list(task_ids.keys())}") return task_ids def check_phase1_completion(self, file_id: str) -> Dict[str, Any]: """ Check if Phase 1 is complete for a given file. Returns: Dictionary with completion status and details """ logger.debug(f"Checking Phase 1 completion for file {file_id}") # Get all artefacts for the file artefacts_result = self.client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute() artefacts = artefacts_result.data or [] # Check for required Phase 1 artefacts phase1_checks = { 'tika_metadata': False, 'frontmatter': False, 'document_analysis': False, 'split_map': False, 'page_images': False } for artefact in artefacts: if artefact['status'] == 'completed': artefact_type = artefact['type'] if artefact_type == 'tika_json': phase1_checks['tika_metadata'] = True elif artefact_type == 'docling_frontmatter_json': phase1_checks['frontmatter'] = True elif artefact_type == 'document_outline_hierarchy': phase1_checks['document_analysis'] = True elif artefact_type == 'split_map_json': phase1_checks['split_map'] = True elif artefact_type == 'page_images': phase1_checks['page_images'] = True # Determine completion based on enabled features required_checks = [] if self.auto_tika: required_checks.append('tika_metadata') required_checks.append('frontmatter') # Always required for basic processing if self.auto_document_analysis: required_checks.append('document_analysis') if self.auto_split_map: required_checks.append('split_map') if self.auto_page_images: required_checks.append('page_images') completed_required = [check for check in required_checks if phase1_checks[check]] is_complete = len(completed_required) == len(required_checks) return { 'file_id': file_id, 'is_complete': is_complete, 'completed_components': completed_required, 'required_components': required_checks, 'all_checks': phase1_checks, 'completion_percentage': (len(completed_required) / max(len(required_checks), 1)) * 100 } def enqueue_sequential_docling_pipelines(self, file_id: str, file_data: Dict[str, Any]) -> Dict[str, Any]: """ Enqueue sequential docling pipelines with dependencies: no_ocr → ocr → vlm Each pipeline depends on ALL tasks from the previous pipeline completing. This replaces the complex Phase 2 coordinator with simple task dependencies. Args: file_id: The file ID to process file_data: File processing information (bucket, path, etc.) Returns: Dictionary with enqueued pipeline information """ logger.info(f"Enqueueing sequential docling pipelines for file {file_id}") bucket = file_data['bucket'] file_path = file_data['file_path'] cabinet_id = file_data['cabinet_id'] mime_type = file_data['mime_type'] # Base configuration shared by all pipelines (pipeline-specific options added per pipeline) base_config = { 'to_formats': ['json', 'html', 'text', 'md', 'doctags'], 'image_export_mode': 'referenced', 'target_type': 'zip', 'pdf_backend': os.getenv('DOCLING_PDF_BACKEND', 'dlparse_v4'), 'include_images': os.getenv('DOCLING_INCLUDE_IMAGES', 'true').lower() == 'true', 'images_scale': float(os.getenv('DOCLING_IMAGES_SCALE', '2.0')), 'ocr_engine': os.getenv('OCR_ENGINE', 'easyocr'), 'ocr_lang': os.getenv('OCR_LANG', 'en'), 'picture_description_area_threshold': float(os.getenv('DOCLING_PICTURE_DESCRIPTION_AREA_THRESHOLD', '0.05')) } # Determine the pipeline execution order: no_ocr → ocr → vlm pipeline_order = [] if self.auto_docling_no_ocr: pipeline_order.append('no_ocr') if self.auto_docling_ocr: pipeline_order.append('ocr') if self.auto_docling_vlm: pipeline_order.append('vlm') if not pipeline_order: logger.info(f"No docling pipelines enabled for file {file_id}") return { 'file_id': file_id, 'enqueued_pipelines': {}, 'total_tasks': 0, 'sequential_order': [], 'message': 'No docling pipelines enabled' } logger.info(f"Sequential pipeline order for file {file_id}: {pipeline_order}") # Enqueue all pipelines with proper dependencies enqueued_pipelines = {} all_task_ids = {} for i, pipeline_type in enumerate(pipeline_order): # Determine dependencies: depend on ALL tasks from previous pipeline depends_on = [] if i > 0: previous_pipeline = pipeline_order[i - 1] depends_on = all_task_ids.get(previous_pipeline, []) logger.info(f"Pipeline {pipeline_type} will depend on {len(depends_on)} tasks from {previous_pipeline}: {depends_on[:3]}..." if len(depends_on) > 3 else f"Pipeline {pipeline_type} will depend on {len(depends_on)} tasks from {previous_pipeline}: {depends_on}") else: logger.info(f"Pipeline {pipeline_type} has no dependencies (first pipeline)") # Create pipeline tasks pipeline_result = self._enqueue_single_pipeline_with_deps( file_id, pipeline_type, base_config, bucket, file_path, cabinet_id, mime_type, depends_on ) if pipeline_result: enqueued_pipelines[pipeline_type] = pipeline_result all_task_ids[pipeline_type] = pipeline_result['task_ids'] logger.info(f"Enqueued {pipeline_type} pipeline with {len(pipeline_result['task_ids'])} tasks") total_tasks = sum(len(p.get('task_ids', [])) for p in enqueued_pipelines.values()) logger.info(f"Successfully enqueued {len(pipeline_order)} sequential pipelines with {total_tasks} total tasks for file {file_id}") return { 'file_id': file_id, 'enqueued_pipelines': enqueued_pipelines, 'total_tasks': total_tasks, 'sequential_order': pipeline_order } def _determine_processing_mode(self, file_id: str, pipeline_type: str) -> tuple[str, dict]: """ Determine how to process document based on settings and characteristics. Implements corrected decision logic: 1. Priority 1: Respect explicit BY_PAGE preference 2. Priority 2: Check size threshold for auto-processing 3. Priority 3: Use split map for large documents 4. Priority 4: Fallback chunking Returns: Tuple of (processing_mode, processing_data) """ # Check BY_PAGE flags first (highest priority) by_page = self._get_by_page_setting(pipeline_type) if by_page: logger.info(f"BY_PAGE enabled for {pipeline_type} - creating page-based bundles regardless of document size") return "split_by_pages", self._get_page_ranges(file_id) # Get document characteristics page_count = self._get_page_count(file_id) # Apply size threshold logic if page_count < self.docling_split_threshold: logger.info(f"Document has {page_count} pages (< {self.docling_split_threshold} threshold) - creating single bundle") return "whole_document", {} # Check for split map availability split_map = self._load_split_map_if_needed(file_id) if split_map and self.docling_use_split_map: logger.info(f"Document has {page_count} pages (>= {self.docling_split_threshold} threshold) with split map - creating section-based bundles") return "split_by_sections", split_map else: logger.error(f"Document has {page_count} pages (>= {self.docling_split_threshold} threshold) without split map - ERROR") return "error" def _get_by_page_setting(self, pipeline_type: str) -> bool: """Get BY_PAGE setting for the specified pipeline type.""" if pipeline_type == 'no_ocr': return self.docling_no_ocr_by_page elif pipeline_type == 'ocr': return self.docling_ocr_by_page elif pipeline_type == 'vlm': return self.docling_vlm_by_page return False def _get_pipeline_specific_config(self, pipeline_type: str) -> Dict[str, Any]: """Get pipeline-specific configuration options from environment variables.""" if pipeline_type == 'no_ocr': return { 'table_mode': os.getenv('DOCLING_NO_OCR_TABLE_MODE', 'fast'), 'table_cell_matching': os.getenv('DOCLING_NO_OCR_TABLE_CELL_MATCHING', 'false').lower() == 'true', 'do_formula_enrichment': os.getenv('DOCLING_NO_OCR_DO_FORMULA_ENRICHMENT', 'false').lower() == 'true', 'do_code_enrichment': os.getenv('DOCLING_NO_OCR_DO_CODE_ENRICHMENT', 'false').lower() == 'true', 'do_table_structure': os.getenv('DOCLING_NO_OCR_DO_TABLE_STRUCTURE', 'true').lower() == 'true', 'do_picture_classification': os.getenv('DOCLING_NO_OCR_DO_PICTURE_CLASSIFICATION', 'false').lower() == 'true', 'do_picture_description': os.getenv('DOCLING_NO_OCR_DO_PICTURE_DESCRIPTION', 'false').lower() == 'true' } elif pipeline_type == 'ocr': return { 'table_mode': os.getenv('DOCLING_OCR_TABLE_MODE', 'accurate'), 'table_cell_matching': os.getenv('DOCLING_OCR_TABLE_CELL_MATCHING', 'true').lower() == 'true', 'do_formula_enrichment': os.getenv('DOCLING_OCR_DO_FORMULA_ENRICHMENT', 'true').lower() == 'true', 'do_code_enrichment': os.getenv('DOCLING_OCR_DO_CODE_ENRICHMENT', 'true').lower() == 'true', 'do_table_structure': os.getenv('DOCLING_OCR_DO_TABLE_STRUCTURE', 'true').lower() == 'true', 'do_picture_classification': os.getenv('DOCLING_OCR_DO_PICTURE_CLASSIFICATION', 'false').lower() == 'true', 'do_picture_description': os.getenv('DOCLING_OCR_DO_PICTURE_DESCRIPTION', 'false').lower() == 'true' } elif pipeline_type == 'vlm': return { 'table_mode': os.getenv('DOCLING_VLM_TABLE_MODE', 'accurate'), 'table_cell_matching': os.getenv('DOCLING_VLM_TABLE_CELL_MATCHING', 'true').lower() == 'true', 'do_formula_enrichment': os.getenv('DOCLING_VLM_DO_FORMULA_ENRICHMENT', 'false').lower() == 'true', 'do_code_enrichment': os.getenv('DOCLING_VLM_DO_CODE_ENRICHMENT', 'false').lower() == 'true', 'do_table_structure': os.getenv('DOCLING_VLM_DO_TABLE_STRUCTURE', 'true').lower() == 'true', 'do_picture_classification': os.getenv('DOCLING_VLM_DO_PICTURE_CLASSIFICATION', 'true').lower() == 'true', 'do_picture_description': os.getenv('DOCLING_VLM_DO_PICTURE_DESCRIPTION', 'true').lower() == 'true' } else: # Default config for unknown pipeline types return { 'table_mode': 'fast', 'table_cell_matching': False, 'do_formula_enrichment': False, 'do_code_enrichment': False, 'do_table_structure': True, 'do_picture_classification': False, 'do_picture_description': False } def _get_page_count(self, file_id: str) -> int: """Get page count for the file from existing artefacts (first Tika).""" logger.info(f"🔍 PAGE COUNT: Starting page count detection for file {file_id}") try: # Try to get page count from existing artefacts, excluding frontmatter (partial document) artefacts = self.client.supabase.table('document_artefacts').select('type,extra').eq('file_id', file_id).execute() artefact_types = [art.get('type', 'unknown') for art in artefacts.data or []] logger.info(f"🔍 PAGE COUNT: Found {len(artefacts.data or [])} artefacts for file {file_id}: {artefact_types}") for art in artefacts.data or []: art_type = art.get('type', 'unknown') extra = art.get('extra', {}) logger.info(f"🔍 PAGE COUNT: Checking artefact type '{art_type}' for file {file_id}") # Skip frontmatter artefacts as they only contain partial page counts if art_type == 'docling_frontmatter_json': logger.info(f"🔍 PAGE COUNT: Skipping frontmatter artefact (partial page count) for file {file_id}") continue # Also skip docling_json artefacts that are from frontmatter processing if art_type == 'docling_json' and extra.get('is_frontmatter', False): logger.info(f"🔍 PAGE COUNT: Skipping frontmatter-derived docling_json artefact (partial page count) for file {file_id}") continue # Also skip docling_json artefacts that have frontmatter-related pipeline info if art_type == 'docling_json' and extra.get('pipeline') == 'frontmatter_ocr': logger.info(f"🔍 PAGE COUNT: Skipping frontmatter pipeline docling_json artefact (partial page count) for file {file_id}") continue if 'page_count' in extra: page_count = int(extra['page_count']) logger.info(f"✅ PAGE COUNT: Found page count {page_count} from {art_type} artefact for file {file_id}") return page_count else: logger.info(f"🔍 PAGE COUNT: No page_count in {art_type} artefact for file {file_id}") logger.info(f"🔍 PAGE COUNT: No artefacts with page_count found, trying Tika JSON parsing for file {file_id}") # Try to get page count from Tika JSON (most reliable source) tika_arts = self.client.supabase.table('document_artefacts') \ .select('rel_path') \ .eq('file_id', file_id) \ .eq('type', 'tika_json') \ .execute() if tika_arts.data: logger.info(f"🔍 PAGE COUNT: Found Tika JSON artefact, parsing content for file {file_id}") file_info = self.client.supabase.table('files').select('bucket').eq('id', file_id).single().execute() if file_info.data: tika_data = self.storage.download_file(file_info.data['bucket'], tika_arts.data[0]['rel_path']) import json tika_json = json.loads(tika_data.decode('utf-8')) # Check common Tika page count keys in top level and metadata logger.info(f"🔍 PAGE COUNT: Checking Tika JSON keys for page count in file {file_id}") # First check metadata section (most common location) metadata = tika_json.get('metadata', {}) for key in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount", "pdf:pagecount", "meta:page-count", "pdfa:PDFVersion"): # Check both exact key and lowercase version in metadata value = metadata.get(key) or metadata.get(key.lower()) if value is not None: try: page_count = int(value) if page_count > 0: logger.info(f"✅ PAGE COUNT: Found page count {page_count} from Tika metadata key '{key}' for file {file_id}") return page_count except Exception as parse_error: logger.info(f"🔍 PAGE COUNT: Could not parse value '{value}' from Tika metadata key '{key}': {parse_error}") continue # Also check top level (fallback) for key in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount", "pdf:pagecount"): value = tika_json.get(key) or tika_json.get(key.lower()) if value is not None: try: page_count = int(value) if page_count > 0: logger.info(f"✅ PAGE COUNT: Found page count {page_count} from Tika JSON top-level key '{key}' for file {file_id}") return page_count except Exception as parse_error: logger.info(f"🔍 PAGE COUNT: Could not parse value '{value}' from Tika top-level key '{key}': {parse_error}") continue # Debug: Show available keys to help diagnose issues logger.info(f"🔍 PAGE COUNT: Available Tika JSON top-level keys: {list(tika_json.keys())}") if 'metadata' in tika_json: logger.info(f"🔍 PAGE COUNT: Available Tika metadata keys: {list(metadata.keys())}") logger.warning(f"🔍 PAGE COUNT: No valid page count keys found in Tika JSON for file {file_id}") else: logger.warning(f"🔍 PAGE COUNT: Could not get file info for Tika JSON parsing for file {file_id}") else: logger.warning(f"🔍 PAGE COUNT: No Tika JSON artefact found for file {file_id}") # Final fallback - try to get it directly from PDF using PyMuPDF logger.warning(f"🔍 PAGE COUNT: Trying direct PDF parsing as final fallback for file {file_id}") return self._get_page_count_direct_pdf(file_id) except Exception as e: logger.error(f"❌ PAGE COUNT: Error getting page count for file {file_id}: {e}, defaulting to {self.docling_split_threshold + 1}") return self.docling_split_threshold + 1 def _get_page_count_direct_pdf(self, file_id: str) -> int: """Final fallback: Get page count directly from PDF using PyMuPDF.""" try: # Get file info from database file_info = self.client.supabase.table('files').select('bucket,path,cabinet_id').eq('id', file_id).single().execute() if not file_info.data: logger.warning(f"🔍 PAGE COUNT: Could not find file info for {file_id}, defaulting to threshold + 1") return self.docling_split_threshold + 1 file_row = file_info.data bucket = file_row['bucket'] file_path = file_row['path'] # Download and read PDF directly with PyMuPDF logger.info(f"🔍 PAGE COUNT: Reading PDF directly from storage for file {file_id}") pdf_bytes = self.storage.download_file(bucket, file_path) import fitz # PyMuPDF doc = fitz.open(stream=pdf_bytes, filetype="pdf") page_count = len(doc) doc.close() logger.info(f"✅ PAGE COUNT: Direct PDF reading found {page_count} pages for file {file_id}") return page_count except Exception as e: logger.error(f"❌ PAGE COUNT: Direct PDF reading failed for file {file_id}: {e}, defaulting to {self.docling_split_threshold + 1}") return self.docling_split_threshold + 1 def _get_page_ranges(self, file_id: str) -> dict: """Get page ranges for page-based processing.""" page_count = self._get_page_count(file_id) return { 'pages': list(range(1, page_count + 1)), 'total_pages': page_count } def _load_split_map_if_needed(self, file_id: str) -> Optional[Dict[str, Any]]: """Load split map if needed for processing decisions.""" try: file_info = self.client.supabase.table('files').select('bucket').eq('id', file_id).single().execute() if not file_info.data: return None return self._load_split_map(file_info.data['bucket'], file_id) except Exception: return None def _create_chunked_ranges(self, page_count: int) -> dict: """Create chunked page ranges for large documents without split maps.""" chunk_size = max(10, self.docling_split_threshold // 4) # 1/4 of threshold, min 10 pages chunks = [] for start_page in range(1, page_count + 1, chunk_size): end_page = min(start_page + chunk_size - 1, page_count) chunks.append({ 'start': start_page, 'end': end_page, 'title': f'Pages {start_page}-{end_page}' }) return { 'entries': chunks, 'total_chunks': len(chunks) } def _enqueue_single_pipeline_with_deps(self, file_id: str, pipeline_type: str, base_config: Dict[str, Any], bucket: str, file_path: str, cabinet_id: str, mime_type: str, depends_on: List[str]) -> Optional[Dict[str, Any]]: """Enqueue a single pipeline with dependencies on previous pipeline tasks.""" group_id = str(uuid.uuid4()) # Get pipeline-specific configuration options pipeline_specific_config = self._get_pipeline_specific_config(pipeline_type) if pipeline_type == 'no_ocr': config = { **base_config, **pipeline_specific_config, 'do_ocr': False, 'force_ocr': False, 'pipeline': 'standard' } logger.info(f"NO_OCR pipeline config: table_mode={config['table_mode']}, " f"formula_enrichment={config['do_formula_enrichment']}, " f"code_enrichment={config['do_code_enrichment']}") elif pipeline_type == 'ocr': config = { **base_config, **pipeline_specific_config, 'do_ocr': True, 'force_ocr': False, 'pipeline': 'standard' } logger.info(f"OCR pipeline config: table_mode={config['table_mode']}, " f"formula_enrichment={config['do_formula_enrichment']}, " f"code_enrichment={config['do_code_enrichment']}") elif pipeline_type == 'vlm': config = { **base_config, **pipeline_specific_config, 'do_ocr': False, 'force_ocr': False, 'pipeline': 'vlm', 'vlm_pipeline_model': os.getenv('DOCLING_VLM_MODEL', 'smoldocling') } logger.info(f"VLM pipeline config: table_mode={config['table_mode']}, " f"picture_classification={config['do_picture_classification']}, " f"picture_description={config['do_picture_description']}") else: logger.error(f"Unknown pipeline type: {pipeline_type}") return None # Determine processing mode using corrected logic processing_mode, processing_data = self._determine_processing_mode(file_id, pipeline_type) # Enqueue single bundle task with dependencies task_id = self._enqueue_bundle_task_with_deps( file_id, pipeline_type, group_id, config, processing_mode, processing_data, bucket, file_path, cabinet_id, mime_type, depends_on ) return { 'group_id': group_id, 'task_ids': [task_id] if task_id else [], 'task_count': 1 if task_id else 0, 'processing_mode': processing_mode, 'processing_data': processing_data } def _enqueue_bundle_task_with_deps(self, file_id: str, pipeline_type: str, group_id: str, config: Dict[str, Any], processing_mode: str, processing_data: dict, bucket: str, file_path: str, cabinet_id: str, mime_type: str, depends_on: List[str]) -> Optional[str]: """ Enqueue a single bundle task that handles processing internally based on mode. This replaces the old approach of creating multiple individual tasks. """ from modules.queue_system import enqueue_docling_task, TaskPriority from modules.bundle_metadata import create_standard_metadata # Map processing modes to bundle types and task types if processing_mode == "whole_document": task_type = 'docling_bundle' bundle_type = 'whole_document' else: task_type = 'docling_bundle_split' bundle_type = processing_mode # Create bundle metadata with correct processing mode mapping if processing_mode == "whole_document": bundle_processing_mode = "whole_document" elif processing_mode.startswith("split_by_"): # For split modes, map to the appropriate bundle metadata mode if processing_mode == "split_by_pages": bundle_processing_mode = "pages" elif processing_mode == "split_by_sections": bundle_processing_mode = "sections" elif processing_mode == "split_by_chunks": bundle_processing_mode = "chunks" else: bundle_processing_mode = processing_mode.replace('split_by_', '') else: bundle_processing_mode = processing_mode bundle_metadata = create_standard_metadata( file_id=file_id, pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), processing_mode=bundle_processing_mode, config=config, group_id=group_id, producer="auto_phase2" ) # Create task payload with new bundle architecture payload = { 'bucket': bucket, 'file_path': file_path, 'cabinet_id': cabinet_id, 'mime_type': mime_type, 'config': config, 'processing_mode': processing_mode, 'processing_data': processing_data, 'bundle_metadata': bundle_metadata.to_artefact_extra(), 'depends_on': depends_on } # Determine timeout based on processing complexity if processing_mode == "whole_document": timeout = 7200 # 2 hours for whole document elif processing_mode == "split_by_pages": # Estimate based on page count page_count = processing_data.get('total_pages', 50) timeout = min(14400, max(3600, page_count * 60)) # 1-4 hours based on pages else: # Section or chunk based processing section_count = len(processing_data.get('entries', [])) timeout = min(10800, max(3600, section_count * 300)) # 1-3 hours based on sections logger.info(f"Enqueuing {task_type} task for {pipeline_type} pipeline: {processing_mode} (timeout: {timeout}s)") try: task_id = enqueue_docling_task( file_id=file_id, task_type=task_type, payload=payload, priority=TaskPriority.NORMAL, timeout=timeout ) logger.info(f"Successfully enqueued {task_type} task {task_id} for {pipeline_type} pipeline") return task_id except Exception as e: logger.error(f"Failed to enqueue bundle task for {pipeline_type} pipeline: {e}") return None def trigger_phase2_pipelines(self, file_id: str, file_data: Dict[str, Any]) -> Dict[str, Any]: """ Trigger Phase 2 sequential processing pipelines based on environment configuration. Pipelines run in order: no_ocr → ocr → vlm (depending on what's enabled). Only the first pipeline starts immediately; others are triggered when the previous completes. Args: file_id: The file ID to process file_data: File processing information (bucket, path, etc.) Returns: Dictionary with triggered pipeline information """ logger.info(f"Phase 2: Starting sequential content processing for file {file_id}") triggered_pipelines = {} bucket = file_data['bucket'] file_path = file_data['file_path'] cabinet_id = file_data['cabinet_id'] mime_type = file_data['mime_type'] # Base configuration for all pipelines (DEPRECATED METHOD - use enqueue_sequential_docling_pipelines) base_config = { 'to_formats': ['json', 'html', 'text', 'md', 'doctags'], 'image_export_mode': 'referenced', 'target_type': 'zip', 'pdf_backend': os.getenv('DOCLING_PDF_BACKEND', 'dlparse_v4'), 'include_images': os.getenv('DOCLING_INCLUDE_IMAGES', 'true').lower() == 'true', 'images_scale': float(os.getenv('DOCLING_IMAGES_SCALE', '2.0')), 'ocr_engine': os.getenv('OCR_ENGINE', 'easyocr'), 'ocr_lang': os.getenv('OCR_LANG', 'en'), 'picture_description_area_threshold': float(os.getenv('DOCLING_PICTURE_DESCRIPTION_AREA_THRESHOLD', '0.05')) } # Determine the pipeline execution order: no_ocr → ocr → vlm pipeline_order = [] if self.auto_docling_no_ocr: pipeline_order.append('no_ocr') if self.auto_docling_ocr: pipeline_order.append('ocr') if self.auto_docling_vlm: pipeline_order.append('vlm') if not pipeline_order: logger.info(f"Phase 2: No pipelines enabled for file {file_id}") return { 'file_id': file_id, 'triggered_pipelines': {}, 'total_tasks': 0, 'sequential_order': [], 'message': 'No Phase 2 pipelines enabled' } logger.info(f"Phase 2: Sequential pipeline order for file {file_id}: {pipeline_order}") logger.warning(f"trigger_phase2_pipelines is deprecated - use enqueue_sequential_docling_pipelines for new implementations") # For backward compatibility, delegate to the new method return self.enqueue_sequential_docling_pipelines(file_id, file_data) def _start_single_pipeline(self, file_id: str, pipeline_type: str, base_config: Dict[str, Any], bucket: str, file_path: str, cabinet_id: str, mime_type: str) -> Optional[Dict[str, Any]]: """Start a single pipeline of the specified type.""" if pipeline_type == 'no_ocr': group_id = str(uuid.uuid4()) config = { **base_config, 'do_ocr': False, 'force_ocr': False, 'pipeline': 'standard' } tasks = self._enqueue_pipeline( file_id, 'no_ocr', group_id, config, bucket, file_path, cabinet_id, mime_type, by_page=self.docling_no_ocr_by_page ) return { 'group_id': group_id, 'task_count': len(tasks), 'by_page': self.docling_no_ocr_by_page } elif pipeline_type == 'ocr': group_id = str(uuid.uuid4()) config = { **base_config, 'do_ocr': True, 'ocr_engine': os.getenv('OCR_ENGINE', 'easyocr'), 'force_ocr': False, 'pipeline': 'standard' } tasks = self._enqueue_pipeline( file_id, 'ocr', group_id, config, bucket, file_path, cabinet_id, mime_type, by_page=self.docling_ocr_by_page ) return { 'group_id': group_id, 'task_count': len(tasks), 'by_page': self.docling_ocr_by_page } elif pipeline_type == 'vlm': group_id = str(uuid.uuid4()) config = { **base_config, 'do_ocr': False, 'force_ocr': False, 'pipeline': 'vlm', 'vlm_pipeline_model': os.getenv('DOCLING_VLM_MODEL', 'smoldocling') } tasks = self._enqueue_pipeline( file_id, 'vlm', group_id, config, bucket, file_path, cabinet_id, mime_type, by_page=self.docling_vlm_by_page ) return { 'group_id': group_id, 'task_count': len(tasks), 'by_page': self.docling_vlm_by_page } else: logger.error(f"Unknown pipeline type: {pipeline_type}") return None # continue_sequential_pipeline method removed - task dependencies now handle sequential execution def _load_split_map(self, bucket: str, file_id: str) -> Optional[Dict[str, Any]]: """Load split map data for a file.""" try: arts = self.client.supabase.table('document_artefacts') \ .select('id,type,rel_path') \ .eq('file_id', file_id).eq('type', 'split_map_json') \ .order('created_at', desc=True).limit(1).execute().data or [] if not arts: return None art = arts[0] raw = self.storage.download_file(bucket, art['rel_path']) import json as _json return _json.loads(raw.decode('utf-8')) except Exception: return None def _enqueue_pipeline(self, file_id: str, pipeline_type: str, group_id: str, config: Dict[str, Any], bucket: str, file_path: str, cabinet_id: str, mime_type: str, by_page: bool = False) -> List[str]: """Enqueue tasks for a specific pipeline (OCR/No-OCR/VLM)""" task_ids = [] if by_page: # Process each page individually, then group by split map sections logger.info(f"Phase 2: Enqueueing {pipeline_type} pipeline by page for file {file_id}") split_map = self._load_split_map(bucket, file_id) if split_map: entries = split_map.get('entries', []) for section_idx, entry in enumerate(entries, 1): start_page = int(entry.get('start_page', 1)) end_page = int(entry.get('end_page', start_page)) section_title = entry.get('title', f'Section {section_idx}') if pipeline_type == 'vlm': # VLM uses specialized page processing section_task_id = enqueue_docling_task( file_id=file_id, task_type='vlm_section_page_bundle', payload={ 'section_idx': section_idx, 'start_page': start_page, 'end_page': end_page, 'section_title': section_title, 'vlm_group_id': group_id, 'vlm_model': config.get('vlm_pipeline_model', 'smoldocling'), 'base_config': config, 'total_sections': len(entries), 'producer': 'auto_phase2' }, priority=TaskPriority.NORMAL, timeout=3600 ) task_ids.append(section_task_id) else: # OCR/No-OCR by page processing (process each page in section individually) for page_num in range(start_page, end_page + 1): page_config = { **config, 'page_range': [page_num, page_num] } # Create standardized bundle metadata page_metadata = create_standard_metadata( file_id=file_id, pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), processing_mode="individual_pages", config=page_config, group_id=group_id, split_order=section_idx, split_total=len(entries), split_heading=section_title, page_range=[page_num, page_num], producer="auto_phase2" ) # Add legacy fields for backward compatibility artefact_extra = page_metadata.to_artefact_extra() artefact_extra.update({ 'section_idx': section_idx, 'section_title': section_title, 'page_number': page_num, }) page_task_id = enqueue_docling_task( file_id=file_id, task_type='canonical_docling_json', payload={ 'bucket': bucket, 'file_path': file_path, 'cabinet_id': cabinet_id, 'mime_type': mime_type, 'config': page_config, 'artefact_extra': artefact_extra }, priority=TaskPriority.NORMAL, timeout=1800 ) task_ids.append(page_task_id) else: logger.warning(f"Phase 2: No split map found for by-page processing of file {file_id}") return [] elif self.docling_use_split_map: # Process by split map sections logger.info(f"Phase 2: Enqueueing {pipeline_type} pipeline by split map sections for file {file_id}") split_map = self._load_split_map(bucket, file_id) if split_map: entries = split_map.get('entries', []) # Normalize and sort entries by start_page normalized_entries = [] for entry in entries: try: start_page = int(entry.get('start_page', 1)) end_page = int(entry.get('end_page', start_page)) title = entry.get('title') or entry.get('label') or '' if end_page < start_page: end_page = start_page normalized_entries.append({ 'start': start_page, 'end': end_page, 'title': title }) except Exception: continue normalized_entries.sort(key=lambda x: x['start']) # Create tasks for each section for i, entry in enumerate(normalized_entries, 1): section_config = { **config, 'page_range': [entry['start'], entry['end']] } # Create standardized bundle metadata for section section_metadata = create_standard_metadata( file_id=file_id, pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), processing_mode="split_sections", config=section_config, group_id=group_id, split_order=i, split_total=len(normalized_entries), split_heading=entry['title'] or f'Section {i}', page_range=[entry['start'], entry['end']], producer="auto_phase2" ) section_task_id = enqueue_docling_task( file_id=file_id, task_type='canonical_docling_json', payload={ 'bucket': bucket, 'file_path': file_path, 'cabinet_id': cabinet_id, 'mime_type': mime_type, 'config': section_config, 'artefact_extra': section_metadata.to_artefact_extra() }, priority=TaskPriority.NORMAL, timeout=3600 ) task_ids.append(section_task_id) else: logger.warning(f"Phase 2: No split map found for section-based processing of file {file_id}") return [] else: # Process whole document logger.info(f"Phase 2: Enqueueing {pipeline_type} pipeline for whole document {file_id}") # Create standardized bundle metadata for whole document whole_doc_metadata = create_standard_metadata( file_id=file_id, pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), processing_mode="whole_document", config=config, group_id=group_id, producer="auto_phase2" ) task_id = enqueue_docling_task( file_id=file_id, task_type='canonical_docling_json', payload={ 'bucket': bucket, 'file_path': file_path, 'cabinet_id': cabinet_id, 'mime_type': mime_type, 'config': config, 'artefact_extra': whole_doc_metadata.to_artefact_extra() }, priority=TaskPriority.NORMAL, timeout=7200 ) task_ids.append(task_id) logger.info(f"Phase 2: Enqueued {len(task_ids)} tasks for {pipeline_type} pipeline") return task_ids def _enqueue_pipeline_with_deps(self, file_id: str, pipeline_type: str, group_id: str, config: Dict[str, Any], bucket: str, file_path: str, cabinet_id: str, mime_type: str, by_page: bool = False, depends_on: List[str] = None) -> List[str]: """Enqueue tasks for a specific pipeline with dependencies""" if depends_on is None: depends_on = [] task_ids = [] if by_page: # Process each page individually, then group by split map sections logger.info(f"Enqueueing {pipeline_type} pipeline by page for file {file_id} with {len(depends_on)} dependencies") split_map = self._load_split_map(bucket, file_id) if split_map: entries = split_map.get('entries', []) for section_idx, entry in enumerate(entries, 1): start_page = int(entry.get('start_page', 1)) end_page = int(entry.get('end_page', start_page)) section_title = entry.get('title', f'Section {section_idx}') if pipeline_type == 'vlm': # VLM uses specialized page processing section_task_id = enqueue_docling_task( file_id=file_id, task_type='vlm_section_page_bundle', payload={ 'section_idx': section_idx, 'start_page': start_page, 'end_page': end_page, 'section_title': section_title, 'vlm_group_id': group_id, 'vlm_model': config.get('vlm_pipeline_model', 'smoldocling'), 'base_config': config, 'total_sections': len(entries), 'producer': 'auto_phase2', 'depends_on': depends_on }, priority=TaskPriority.NORMAL, timeout=3600 ) task_ids.append(section_task_id) else: # OCR/No-OCR by page processing (process each page in section individually) for page_num in range(start_page, end_page + 1): page_config = { **config, 'page_range': [page_num, page_num] } # Create standardized bundle metadata page_metadata = create_standard_metadata( file_id=file_id, pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), processing_mode="individual_pages", config=page_config, group_id=group_id, split_order=section_idx, split_total=len(entries), split_heading=section_title, page_range=[page_num, page_num], producer="auto_phase2" ) # Add legacy fields for backward compatibility artefact_extra = page_metadata.to_artefact_extra() artefact_extra.update({ 'section_idx': section_idx, 'section_title': section_title, 'page_number': page_num, }) page_task_id = enqueue_docling_task( file_id=file_id, task_type='canonical_docling_json', payload={ 'bucket': bucket, 'file_path': file_path, 'cabinet_id': cabinet_id, 'mime_type': mime_type, 'config': page_config, 'artefact_extra': artefact_extra, 'depends_on': depends_on }, priority=TaskPriority.NORMAL, timeout=1800 ) task_ids.append(page_task_id) else: logger.warning(f"No split map found for by-page processing of file {file_id}") return [] elif self.docling_use_split_map: # Process by split map sections logger.info(f"Enqueueing {pipeline_type} pipeline by split map sections for file {file_id} with {len(depends_on)} dependencies") split_map = self._load_split_map(bucket, file_id) if split_map: entries = split_map.get('entries', []) # Normalize and sort entries by start_page normalized_entries = [] for entry in entries: try: start_page = int(entry.get('start_page', 1)) end_page = int(entry.get('end_page', start_page)) title = entry.get('title') or entry.get('label') or '' if end_page < start_page: end_page = start_page normalized_entries.append({ 'start': start_page, 'end': end_page, 'title': title }) except Exception: continue normalized_entries.sort(key=lambda x: x['start']) # Create tasks for each section for i, entry in enumerate(normalized_entries, 1): section_config = { **config, 'page_range': [entry['start'], entry['end']] } # Create standardized bundle metadata for section section_metadata = create_standard_metadata( file_id=file_id, pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), processing_mode="split_sections", config=section_config, group_id=group_id, split_order=i, split_total=len(normalized_entries), split_heading=entry['title'] or f'Section {i}', page_range=[entry['start'], entry['end']], producer="auto_phase2" ) section_task_id = enqueue_docling_task( file_id=file_id, task_type='canonical_docling_json', payload={ 'bucket': bucket, 'file_path': file_path, 'cabinet_id': cabinet_id, 'mime_type': mime_type, 'config': section_config, 'artefact_extra': section_metadata.to_artefact_extra(), 'depends_on': depends_on }, priority=TaskPriority.NORMAL, timeout=3600 ) task_ids.append(section_task_id) else: logger.warning(f"No split map found for section-based processing of file {file_id}") return [] else: # Process whole document logger.info(f"Enqueueing {pipeline_type} pipeline for whole document {file_id} with {len(depends_on)} dependencies") # Create standardized bundle metadata for whole document whole_doc_metadata = create_standard_metadata( file_id=file_id, pipeline="vlm" if pipeline_type == "vlm" else ("ocr" if config.get('do_ocr') else "no_ocr"), processing_mode="whole_document", config=config, group_id=group_id, producer="auto_phase2" ) task_id = enqueue_docling_task( file_id=file_id, task_type='canonical_docling_json', payload={ 'bucket': bucket, 'file_path': file_path, 'cabinet_id': cabinet_id, 'mime_type': mime_type, 'config': config, 'artefact_extra': whole_doc_metadata.to_artefact_extra(), 'depends_on': depends_on }, priority=TaskPriority.NORMAL, timeout=7200 ) task_ids.append(task_id) logger.info(f"Enqueued {len(task_ids)} tasks for {pipeline_type} pipeline with dependencies") return task_ids # Global pipeline controller instance _controller_instance = None def get_pipeline_controller() -> DocumentPipelineController: """Get the global pipeline controller instance.""" global _controller_instance if _controller_instance is None: _controller_instance = DocumentPipelineController() return _controller_instance