""" Task Processors for Document Processing Queue This module contains the actual processing implementations for different types of queued tasks (Tika, Docling, LLM, Split Map). """ import json import zipfile import io import mimetypes import requests import tempfile import uuid from pathlib import Path from typing import Dict, Any, Optional import os from modules.queue_system import DocumentProcessingQueue, QueueTask, ServiceType from modules.database.supabase.utils.client import SupabaseServiceRoleClient from modules.database.supabase.utils.storage import StorageAdmin from modules.document_processor import DocumentProcessor from modules.logger_tool import initialise_logger logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True) class DocumentTaskProcessor(DocumentProcessingQueue): """ Extended queue with actual task processing implementations. """ def __init__(self, redis_url: str = None): super().__init__(redis_url) self.client = SupabaseServiceRoleClient() self.storage = StorageAdmin() self.doc_processor = DocumentProcessor() # Service URLs self.tika_url = os.getenv('TIKA_URL') self.docling_url = os.getenv('DOCLING_URL') or os.getenv('NEOFS_DOCLING_URL') self.llm_url = os.getenv('LLM_URL') # Local LLM endpoint logger.info("Task processor initialized with service URLs") def _process_task(self, task: QueueTask): """Process a task based on its service type.""" try: # DEBUG: Log entry into processing logger.info(f"🚀 PROCESS DEBUG: Starting _process_task for {task.id}") # Audit dependency info (if any) try: deps = [] if isinstance(task.payload, dict): deps = task.payload.get('depends_on') or [] if deps: logger.info(f"Processing task {task.id}: {task.service.value}/{task.task_type} deps={deps}") else: logger.info(f"Processing task {task.id}: {task.service.value}/{task.task_type}") except Exception: logger.info(f"Processing task {task.id}: {task.service.value}/{task.task_type}") # DEBUG: Log service routing logger.info(f"🚀 PROCESS DEBUG: Routing task {task.id} to service {task.service}") if task.service == ServiceType.TIKA: result = self._process_tika_task(task) elif task.service == ServiceType.DOCLING: result = self._process_docling_task(task) elif task.service == ServiceType.LLM: result = self._process_llm_task(task) elif task.service == ServiceType.SPLIT_MAP: result = self._process_split_map_task(task) elif task.service == ServiceType.DOCUMENT_ANALYSIS: result = self.process_document_analysis_task(task) elif task.service == ServiceType.PAGE_IMAGES: result = self.process_page_images_task(task) else: raise ValueError(f"Unknown service type: {task.service}") # DEBUG: Log successful completion logger.info(f"✅ PROCESS DEBUG: Task {task.id} completed successfully, calling complete_task") self.complete_task(task, result) logger.info(f"✅ PROCESS DEBUG: Task {task.id} completion confirmed") except Exception as e: # DEBUG: Log detailed failure info logger.error(f"🚨 PROCESS DEBUG: Task {task.id} processing failed: {e}") logger.error(f"🚨 PROCESS DEBUG: Exception type: {type(e)}") import traceback logger.error(f"🚨 PROCESS DEBUG: Full traceback:\n{traceback.format_exc()}") logger.info(f"🚨 PROCESS DEBUG: Calling fail_task for {task.id}") self.fail_task(task, str(e)) logger.info(f"🚨 PROCESS DEBUG: fail_task completed for {task.id}") def _process_tika_task(self, task: QueueTask) -> Dict[str, Any]: """Process Tika metadata extraction task.""" if not self.tika_url: raise ValueError("TIKA_URL not configured") payload = task.payload file_id = task.file_id bucket = payload['bucket'] file_path = payload['file_path'] cabinet_id = payload['cabinet_id'] mime_type = payload.get('mime_type', 'application/octet-stream') # Download file logger.debug(f"Downloading file for Tika processing: {bucket}/{file_path}") file_bytes = self.storage.download_file(bucket, file_path) # Call Tika headers = {'Accept': 'application/json', 'Content-Type': mime_type} timeout = task.timeout response = requests.put( f"{self.tika_url.rstrip('/')}/meta", data=file_bytes, headers=headers, timeout=timeout ) response.raise_for_status() tika_json = response.json() # Store result as artefact artefact_id = str(uuid.uuid4()) rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/tika.json" self.storage.upload_file( bucket, rel_path, json.dumps(tika_json, ensure_ascii=False).encode('utf-8'), 'application/json', upsert=True ) # Create artefact record with enhanced UI metadata artefact_data = { 'id': artefact_id, 'file_id': file_id, 'type': 'tika_json', 'rel_path': rel_path, 'extra': { 'processing_time': response.elapsed.total_seconds(), 'display_name': 'Document Metadata', 'bundle_label': 'Tika Analysis', 'section_title': 'Document Metadata', 'bundle_type': 'tika_json', 'processing_mode': 'metadata_extraction', 'pipeline': 'tika_analysis', 'is_metadata': True, 'ui_category': 'raw_data', 'ui_order': 3, 'description': 'Raw document metadata and properties extracted by Apache Tika', 'viewer_type': 'json' }, 'status': 'completed' } self.client.supabase.table('document_artefacts').insert(artefact_data).execute() logger.info(f"Tika processing completed for file {file_id}") return { 'artefact_id': artefact_id, 'rel_path': rel_path, 'processing_time': response.elapsed.total_seconds() } def _process_docling_task(self, task: QueueTask) -> Dict[str, Any]: """Process Docling document analysis task. Also allows routing of related task types so that page images and enhanced structure analysis can run under the stable docling service umbrella when SERVICE dispatch for new types is problematic. """ # Soft-route additional task types through this handler if task.task_type in ("document_structure_analysis", "document_analysis"): return self.process_document_analysis_task(task) if task.task_type in ("generate_page_images", "page_images"): return self.process_page_images_task(task) if task.task_type in ("vlm_section_page_bundle",): return self.process_vlm_section_page_bundle_task(task) if task.task_type in ("vlm_section_bundle_collector",): return self.process_vlm_section_bundle_collector_task(task) # New unified bundle architecture handlers if task.task_type in ("docling_bundle",): return self.process_docling_bundle_task(task) if task.task_type in ("docling_bundle_split",): return self.process_docling_bundle_split_task(task) # phase2_coordinator task type removed - pipelines now enqueued directly from split_map task if not self.docling_url: raise ValueError("DOCLING_URL not configured") payload = task.payload file_id = task.file_id bucket = payload['bucket'] file_path = payload['file_path'] cabinet_id = payload['cabinet_id'] task_config = payload.get('config', {}) # Download file logger.debug(f"Downloading file for Docling processing: {bucket}/{file_path}") file_bytes = self.storage.download_file(bucket, file_path) # Prepare Docling request docling_api_key = os.getenv('DOCLING_API_KEY') # Accept any content type so zip/binary responses are allowed headers = {'Accept': '*/*'} if docling_api_key: headers['X-Api-Key'] = docling_api_key # Determine to_formats. For canonical docling we will request a ZIP bundle. to_formats_val = task_config.get('to_formats', 'json') to_formats_list = to_formats_val if isinstance(to_formats_val, list) else [to_formats_val] is_canonical = str(task.task_type).startswith('canonical_docling') target_type = task_config.get('target_type', 'zip' if is_canonical else 'inbody') # Build form data from config (override for canonical) form_data = [ ('target_type', target_type), ('do_ocr', str(task_config.get('do_ocr', False)).lower()), ('force_ocr', str(task_config.get('force_ocr', False)).lower()), ('image_export_mode', 'referenced' if is_canonical else task_config.get('image_export_mode', 'embedded')), ('ocr_engine', task_config.get('ocr_engine', 'easyocr')), ('ocr_lang', task_config.get('ocr_lang', 'en')), ('pdf_backend', task_config.get('pdf_backend', 'dlparse_v4')), ('table_mode', task_config.get('table_mode', 'fast')), ('do_formula_enrichment', str(task_config.get('do_formula_enrichment', False)).lower()), ('do_code_enrichment', str(task_config.get('do_code_enrichment', False)).lower()), ('pipeline', task_config.get('pipeline', 'standard')) ] # Optional extra flags forwarded when present if 'table_cell_matching' in task_config: form_data.append(('table_cell_matching', str(task_config.get('table_cell_matching')).lower())) if 'do_picture_classification' in task_config: form_data.append(('do_picture_classification', str(task_config.get('do_picture_classification')).lower())) if 'do_picture_description' in task_config: form_data.append(('do_picture_description', str(task_config.get('do_picture_description')).lower())) if task_config.get('picture_description_prompt'): form_data.append(('picture_description_prompt', task_config.get('picture_description_prompt'))) # picture_description_api and vlm_pipeline_model_api must be JSON per Docling OpenAPI if task_config.get('picture_description_api') is not None: v = task_config.get('picture_description_api') if isinstance(v, (dict, list)): form_data.append(('picture_description_api', json.dumps(v))) elif isinstance(v, str) and v.strip().startswith(('{', '[')): form_data.append(('picture_description_api', v)) # else: omit to avoid validation error if task_config.get('vlm_pipeline_model'): form_data.append(('vlm_pipeline_model', task_config.get('vlm_pipeline_model'))) if task_config.get('vlm_pipeline_model_api') is not None: v = task_config.get('vlm_pipeline_model_api') if isinstance(v, (dict, list)): form_data.append(('vlm_pipeline_model_api', json.dumps(v))) elif isinstance(v, str) and v.strip().startswith(('{', '[')): form_data.append(('vlm_pipeline_model_api', v)) # else: omit if is_canonical and ('md' in to_formats_list): form_data.append(('md_page_break_placeholder', task_config.get('md_page_break_placeholder', '\n\n\n\n'))) # Append to_formats as repeated fields (filter unsupported split pages) to_formats_list = [f for f in to_formats_list if f != 'html_split_page'] for fmt in to_formats_list: form_data.append(('to_formats', fmt)) # Handle page range with clamping and min/max correction page_range = task_config.get('page_range', [1, 999999]) if isinstance(page_range, list) and len(page_range) >= 2: def _to_int_safe(v, default): try: return int(v) except Exception: return default start_pg = _to_int_safe(page_range[0], 1) end_pg = _to_int_safe(page_range[1], 999999) if start_pg < 1: start_pg = 1 if end_pg < start_pg: end_pg = start_pg # Clamp for frontmatter-like tasks to actual page count if possible if task.task_type in ('docling_frontmatter_json', 'document_structure_analysis'): try: import fitz # PyMuPDF doc = fitz.open(stream=file_bytes, filetype='pdf') pc = int(doc.page_count) doc.close() if pc > 0: end_pg = min(end_pg, pc) start_pg = max(1, min(start_pg, pc)) if end_pg < start_pg: end_pg = start_pg except Exception: pass form_data.append(('page_range', str(start_pg))) form_data.append(('page_range', str(end_pg))) files = [('files', ('file', file_bytes, payload.get('mime_type', 'application/pdf')))] # Make request response = requests.post( f"{self.docling_url.rstrip('/')}/v1/convert/file", files=files, data=form_data, headers=headers, timeout=task.timeout ) response.raise_for_status() content_type = (response.headers.get('Content-Type') or '').lower() is_zip_resp = ('zip' in content_type) or (response.content[:2] == b'PK') if is_zip_resp and is_canonical: # Unpack zip, store all files and a manifest artefact_id = str(uuid.uuid4()) base_dir = f"{cabinet_id}/{file_id}/{artefact_id}" archive_path = f"{base_dir}/bundle.zip" # Save original archive self.storage.upload_file(bucket, archive_path, response.content, 'application/zip', upsert=True) zf = zipfile.ZipFile(io.BytesIO(response.content)) entries = [] md_full_path = None html_full_path = None text_full_path = None json_full_path = None images_list = [] md_data_bytes: bytes | None = None for zi in zf.infolist(): if zi.is_dir(): continue name = zi.filename.lstrip('/').replace('..', '') data = zf.read(zi) ctype = mimetypes.guess_type(name)[0] or 'application/octet-stream' rel = f"{base_dir}/{name}" self.storage.upload_file(bucket, rel, data, ctype, upsert=True) entries.append({ 'name': name, 'path': rel, 'size': zi.file_size, 'content_type': ctype }) # Detect known outputs lower = name.lower() if lower.endswith('.md') and md_full_path is None: md_full_path = rel md_data_bytes = data elif lower.endswith('.html') and html_full_path is None: html_full_path = rel elif lower.endswith('.txt') and text_full_path is None: text_full_path = rel elif lower.endswith('.json') and json_full_path is None: json_full_path = rel if ctype.startswith('image/'): images_list.append({'name': name, 'path': rel, 'content_type': ctype, 'size': zi.file_size}) manifest = { 'file_id': file_id, 'artefact_id': artefact_id, 'to_formats': to_formats_list, 'image_export_mode': 'referenced', 'entries': entries, 'archive_path': archive_path, 'markdown_full': md_full_path, 'html_full': html_full_path, 'text_full': text_full_path, 'json_full': json_full_path, 'images': images_list, 'bucket': bucket } # Create markdown pages by splitting on placeholder if available if md_data_bytes is not None: try: md_text = md_data_bytes.decode('utf-8', errors='replace') sep = task_config.get('md_page_break_placeholder', '\n\n\n\n') parts = md_text.split(sep) if len(parts) > 1: pages_dir = f"{base_dir}/md_pages" pages = [] for i, part in enumerate(parts, start=1): pth = f"{pages_dir}/page-{i:04d}.md" self.storage.upload_file(bucket, pth, part.encode('utf-8'), 'text/markdown', upsert=True) pages.append({'page': i, 'path': pth}) manifest['markdown_pages'] = pages except Exception as e: logger.warning(f"Failed creating markdown_pages for file {file_id}: {e}") manifest_path = f"{base_dir}/manifest.json" self.storage.upload_file(bucket, manifest_path, json.dumps(manifest, ensure_ascii=False).encode('utf-8'), 'application/json', upsert=True) # Create artefact row pointing to directory with manifest, including grouping extras for split packs artefact_extra = payload.get('artefact_extra') if isinstance(payload, dict) else None # Determine artefact type by pipeline (standard vs vlm) pipeline_mode = (task_config.get('pipeline') or 'standard').lower() artefact_type_final = 'docling_vlm' if pipeline_mode == 'vlm' else 'docling_standard' group_pack_type = payload.get('group_pack_type') if isinstance(payload, dict) else None # propagate group_id if provided (set by caller for multi-part packs) group_id = (artefact_extra or {}).get('group_id') # Compute a settings fingerprint for grouping (exclude page_range) try: import hashlib, json as _json cfg_for_hash = dict(task_config) cfg_for_hash.pop('page_range', None) settings_fingerprint = hashlib.sha1(_json.dumps(cfg_for_hash, sort_keys=True, ensure_ascii=False).encode('utf-8')).hexdigest() except Exception: settings_fingerprint = None self.client.supabase.table('document_artefacts').insert({ 'id': artefact_id, 'file_id': file_id, 'type': artefact_type_final, 'rel_path': base_dir, 'extra': { 'manifest': manifest_path, 'processing_time': response.elapsed.total_seconds(), 'config': task_config, 'group_pack_type': group_pack_type or (artefact_extra or {}).get('group_pack_type'), 'group_id': group_id, 'pipeline': pipeline_mode, 'settings_fingerprint': settings_fingerprint, **(artefact_extra or {}) }, 'status': 'completed' }).execute() logger.info(f"Canonical docling bundle stored for file {file_id} with {len(entries)} files") return { 'artefact_id': artefact_id, 'files_count': len(entries) } if 'application/json' in content_type or content_type.endswith('+json'): docling_json = response.json() artefact_id = str(uuid.uuid4()) artefact_type = task.task_type rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/{artefact_type.replace('_json', '.json')}" self.storage.upload_file( bucket, rel_path, json.dumps(docling_json, ensure_ascii=False).encode('utf-8'), 'application/json', upsert=True ) artefact_data = { 'id': artefact_id, 'file_id': file_id, 'type': artefact_type, 'rel_path': rel_path, 'extra': { 'processing_time': response.elapsed.total_seconds(), 'config': task_config, **({} if 'artefact_extra' not in payload else payload['artefact_extra']) }, 'status': 'completed' } self.client.supabase.table('document_artefacts').insert(artefact_data).execute() else: # Fallback: store raw output if server didn't return JSON (unexpected for inbody) artefact_id = str(uuid.uuid4()) ext = ('html' if 'html' in content_type else ('md' if 'markdown' in content_type else ('txt' if 'text/plain' in content_type else 'bin'))) artefact_type = f'docling_output_{ext}' rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/docling_output.{ext}" self.storage.upload_file( bucket, rel_path, response.content, 'application/zip' if ext == 'zip' else (content_type or 'application/octet-stream'), upsert=True ) artefact_data = { 'id': artefact_id, 'file_id': file_id, 'type': artefact_type, 'rel_path': rel_path, 'extra': { 'processing_time': response.elapsed.total_seconds(), 'config': task_config, 'to_formats': to_formats_list, 'content_type': content_type, **({} if 'artefact_extra' not in payload else payload['artefact_extra']) }, 'status': 'completed' } self.client.supabase.table('document_artefacts').insert(artefact_data).execute() # When we get canonical Docling JSON, also split out component contents into separate artefacts try: if 'application/json' in content_type or content_type.endswith('+json'): self._store_docling_component_artefacts( file_id=file_id, cabinet_id=cabinet_id, bucket=bucket, docling_json=docling_json, task_config=task_config, artefact_extra=payload.get('artefact_extra') if isinstance(payload, dict) else None ) except Exception as split_e: logger.warning(f"Storing component artefacts failed for file {file_id}: {split_e}") # Handle optional frontpage image extraction if task.task_type == 'docling_frontmatter_json': try: self._extract_frontpage_image(docling_json, file_id, cabinet_id, bucket) except Exception as e: logger.warning(f"Frontpage image extraction failed for file {file_id}: {e}") logger.info(f"Docling processing completed for file {file_id}") # Pipeline dependencies now handle sequential execution automatically return { 'artefact_id': artefact_id, 'rel_path': rel_path, 'processing_time': response.elapsed.total_seconds() } def _extract_frontpage_image(self, docling_json: Dict[str, Any], file_id: str, cabinet_id: str, bucket: str): """Extract and store frontpage image from Docling JSON.""" import base64 # Look for frontpage image in various locations cover_b64 = None for key in ['frontpage', 'cover']: if key in docling_json and 'image_base64' in docling_json[key]: cover_b64 = docling_json[key]['image_base64'] break if not cover_b64: return # Decode and store image artefact_id = str(uuid.uuid4()) img_bytes = base64.b64decode(cover_b64) rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/frontpage.png" self.storage.upload_file(bucket, rel_path, img_bytes, 'image/png', upsert=True) # Create artefact record artefact_data = { 'id': artefact_id, 'file_id': file_id, 'type': 'docling_frontpage_image', 'rel_path': rel_path, 'extra': {'extracted_from': 'docling_frontmatter'}, 'status': 'completed' } self.client.supabase.table('document_artefacts').insert(artefact_data).execute() logger.debug(f"Frontpage image extracted for file {file_id}") def _store_docling_component_artefacts(self, *, file_id: str, cabinet_id: str, bucket: str, docling_json: Dict[str, Any], task_config: Dict[str, Any], artefact_extra: Optional[Dict[str, Any]] = None) -> None: """Create artefacts for component contents from a canonical Docling JSON. Stores md_content, html_content, text_content, doctags_content and json_content if present, as separate artefacts and files alongside the canonical JSON. """ doc = docling_json.get('document') or {} components = [ ('md_content', 'docling_md', 'docling.md', 'text/markdown', lambda v: v if isinstance(v, str) else ''), ('html_content', 'docling_html', 'docling.html', 'text/html', lambda v: v if isinstance(v, str) else ''), ('text_content', 'docling_text', 'docling.txt', 'text/plain', lambda v: v if isinstance(v, str) else ''), ('doctags_content', 'docling_doctags', 'docling.doctags.xml', 'application/xml', lambda v: v if isinstance(v, str) else ''), ('json_content', 'docling_json', 'docling.json', 'application/json', lambda v: json.dumps(v or {}, ensure_ascii=False)), ] for key, art_type, filename, mime, to_bytes in components: if key not in doc or doc.get(key) in (None, ''): continue try: artefact_id = str(uuid.uuid4()) rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/{filename}" data_bytes = to_bytes(doc.get(key)) if isinstance(data_bytes, str): data_bytes = data_bytes.encode('utf-8') self.storage.upload_file(bucket, rel_path, data_bytes, mime, upsert=True) extra = {'source': 'canonical_docling_json', 'component_key': key, 'config': task_config} if artefact_extra: extra.update(artefact_extra) self.client.supabase.table('document_artefacts').insert({ 'id': artefact_id, 'file_id': file_id, 'type': art_type, 'rel_path': rel_path, 'extra': extra, 'status': 'completed' }).execute() except Exception as e: logger.warning(f"Failed to store component '{key}' for file {file_id}: {e}") def _process_llm_task(self, task: QueueTask) -> Dict[str, Any]: """Process LLM analysis task (document classification, etc.).""" if not self.llm_url: raise ValueError("LLM_URL not configured") payload = task.payload file_id = task.file_id prompt = payload['prompt'] context = payload.get('context', '') model = payload.get('model', 'default') # Prepare LLM request llm_request = { 'model': model, 'prompt': prompt, 'context': context, 'max_tokens': payload.get('max_tokens', 1000), 'temperature': payload.get('temperature', 0.1) } # Call local LLM response = requests.post( f"{self.llm_url.rstrip('/')}/generate", json=llm_request, headers={'Content-Type': 'application/json'}, timeout=task.timeout ) response.raise_for_status() llm_result = response.json() # Store result (optional - depends on use case) if payload.get('store_result', False): bucket = payload['bucket'] cabinet_id = payload['cabinet_id'] artefact_id = str(uuid.uuid4()) rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/llm_{task.task_type}.json" self.storage.upload_file( bucket, rel_path, json.dumps(llm_result, ensure_ascii=False).encode('utf-8'), 'application/json', upsert=True ) # Create artefact record artefact_data = { 'id': artefact_id, 'file_id': file_id, 'type': f'llm_{task.task_type}', 'rel_path': rel_path, 'extra': { 'model': model, 'task_type': task.task_type }, 'status': 'completed' } self.client.supabase.table('document_artefacts').insert(artefact_data).execute() logger.info(f"LLM processing completed for file {file_id}") return llm_result def _process_split_map_task(self, task: QueueTask) -> Dict[str, Any]: """Process split map generation task.""" from routers.database.files.split_map import create_split_map_for_file from routers.database.files.files import enqueue_canonical_docling file_id = task.file_id # Generate split map split_map = create_split_map_for_file(file_id) logger.info(f"Split map generation completed for file {file_id}") # NEW BUNDLE ARCHITECTURE: Direct pipeline enqueueing # Split map completion now directly triggers bundle task creation logger.info(f"NEW ARCHITECTURE: Enqueueing sequential docling bundle pipelines for file {file_id}") try: # Get file information for pipeline enqueueing file_result = self.client.supabase.table('files').select('*').eq('id', file_id).single().execute() if not file_result.data: logger.error(f"Could not find file {file_id} for pipeline enqueueing") return { 'method': split_map['method'], 'confidence': split_map['confidence'], 'entries_count': len(split_map['entries']), 'pipeline_error': 'File not found for pipeline enqueueing' } file_row = file_result.data bucket = file_row['bucket'] cabinet_id = file_row['cabinet_id'] storage_path = file_row['path'] original_mime = file_row.get('mime_type', 'application/pdf') # Prefer converted PDF if available (matches existing pattern) try: arts = self.client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or [] pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None) processing_path = pdf_art['rel_path'] if pdf_art else storage_path processing_mime = 'application/pdf' if pdf_art else original_mime except Exception: processing_path = storage_path processing_mime = original_mime # Prepare file data for pipeline controller file_data = { 'bucket': bucket, 'file_path': processing_path, 'cabinet_id': cabinet_id, 'mime_type': processing_mime } # Import and use pipeline controller to enqueue sequential pipelines from modules.pipeline_controller import get_pipeline_controller controller = get_pipeline_controller() pipeline_result = controller.enqueue_sequential_docling_pipelines(file_id, file_data) logger.info(f"Successfully enqueued {pipeline_result['total_tasks']} tasks across " f"{len(pipeline_result['enqueued_pipelines'])} pipelines for file {file_id}") logger.info(f"Pipeline execution order: {pipeline_result['sequential_order']}") return { 'method': split_map['method'], 'confidence': split_map['confidence'], 'entries_count': len(split_map['entries']), 'enqueued_pipelines': pipeline_result['enqueued_pipelines'], 'total_pipeline_tasks': pipeline_result['total_tasks'], 'pipeline_order': pipeline_result['sequential_order'] } except Exception as e: logger.error(f"Failed to enqueue sequential pipelines for file {file_id}: {e}") return { 'method': split_map['method'], 'confidence': split_map['confidence'], 'entries_count': len(split_map['entries']), 'pipeline_error': str(e) } # Split map processing completed successfully return { 'method': split_map['method'], 'confidence': split_map['confidence'], 'entries_count': len(split_map['entries']) } def _enqueue_vlm_page_processing(self, file_id: str, threshold: int, vlm_group_id: str, vlm_model: str, base_config: dict): """Enqueue VLM processing for individual pages within split map sections.""" from routers.database.files.files import _load_split_map from modules.database.supabase.utils.client import SupabaseServiceRoleClient from modules.database.supabase.utils.storage import StorageAdmin try: client = SupabaseServiceRoleClient() storage = StorageAdmin() # Get file info fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute() if not fr.data: logger.error(f"File {file_id} not found for VLM page processing") return file_row = fr.data bucket = file_row['bucket'] # Load split map split_map = _load_split_map(client, storage, bucket, file_id) if not split_map: logger.warning(f"No split map found for VLM page processing file {file_id}") return entries = split_map.get('entries', []) if not entries: logger.warning(f"Empty split map entries for VLM page processing file {file_id}") return logger.info(f"[auto-canonical] VLM page processing: found {len(entries)} sections for file {file_id}") # Process each section with page-by-page VLM for section_idx, entry in enumerate(entries, 1): try: start_page = int(entry.get('start_page', 1)) end_page = int(entry.get('end_page', start_page)) section_title = entry.get('title', f'Section {section_idx}') logger.info(f"[auto-canonical] VLM page processing section {section_idx}: '{section_title}' pages {start_page}-{end_page}") # Create section-level bundle manifest task self._enqueue_vlm_section_page_bundle( file_id, section_idx, start_page, end_page, section_title, vlm_group_id, vlm_model, base_config, len(entries) ) except Exception as section_e: logger.warning(f"Failed to process VLM section {section_idx} for file {file_id}: {section_e}") continue except Exception as e: logger.error(f"VLM page processing setup failed for file {file_id}: {e}") def _enqueue_vlm_section_page_bundle(self, file_id: str, section_idx: int, start_page: int, end_page: int, section_title: str, vlm_group_id: str, vlm_model: str, base_config: dict, total_sections: int): """Enqueue VLM processing for individual pages within a section, then bundle them.""" from modules.queue_system import enqueue_docling_task, TaskPriority try: # Create a unique task to handle page-by-page processing for this section section_task_id = enqueue_docling_task( file_id=file_id, task_type='vlm_section_page_bundle', payload={ 'section_idx': section_idx, 'start_page': start_page, 'end_page': end_page, 'section_title': section_title, 'vlm_group_id': vlm_group_id, 'vlm_model': vlm_model, 'base_config': base_config, 'total_sections': total_sections, 'producer': 'auto_split' }, priority=TaskPriority.NORMAL, timeout=3600 # 1 hour for page-by-page processing ) logger.info(f"[auto-canonical] VLM section page bundle task {section_task_id} for section {section_idx} of file {file_id}") except Exception as e: logger.error(f"Failed to enqueue VLM section page bundle for section {section_idx} file {file_id}: {e}") def process_document_analysis_task(self, task: QueueTask) -> Dict[str, Any]: """Process document structure analysis task""" file_id = task.file_id payload = task.payload logger.info(f"Processing document analysis task for file {file_id}") try: # Load file from storage bucket = payload['bucket'] file_path = payload['file_path'] cabinet_id = payload['cabinet_id'] file_bytes = self.storage.download_file(bucket, file_path) # Load existing artefacts if available client = SupabaseServiceRoleClient() artefacts = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute() tika_json = None docling_json = None for art in artefacts.data: if art['type'] == 'tika_json' and art['status'] == 'completed': try: tika_data = self.storage.download_file(bucket, art['rel_path']) tika_json = json.loads(tika_data.decode('utf-8')) except Exception as e: logger.warning(f"Failed to load Tika JSON for analysis: {e}") elif art['type'] in ['docling_frontmatter_json', 'docling_noocr_json'] and art['status'] == 'completed': try: docling_data = self.storage.download_file(bucket, art['rel_path']) docling_json = json.loads(docling_data.decode('utf-8')) break # Use first available Docling result except Exception as e: logger.warning(f"Failed to load Docling JSON for analysis: {e}") # Import here to avoid circular imports from modules.document_analysis import create_document_outline_hierarchy_artefact # Create document analysis analysis_data = create_document_outline_hierarchy_artefact( file_id=file_id, pdf_bytes=file_bytes, tika_json=tika_json, docling_json=docling_json ) # Store analysis as artefact (insert row first, then upload file) artefact_id = analysis_data.get('artefact_id') or str(uuid.uuid4()) analysis_data['artefact_id'] = artefact_id rel_path = f"{cabinet_id}/{file_id}/{artefact_id}/document_outline_hierarchy.json" # Insert row first to avoid orphaned files if DB insert fails # Insert artefact record with processing status sections_count = len(analysis_data.get('sections', []) or []) metadata = analysis_data.get('metadata') or {} analysis_methods = metadata.get('analysis_methods') self.client.supabase.table('document_artefacts').insert({ 'id': artefact_id, 'file_id': file_id, 'type': 'document_outline_hierarchy', 'rel_path': rel_path, 'extra': { 'sections_count': sections_count, 'analysis_methods': analysis_methods }, 'status': 'processing' }).execute() # Now upload the file analysis_json = json.dumps(analysis_data, ensure_ascii=False) self.storage.upload_file(bucket, rel_path, analysis_json.encode('utf-8'), 'application/json', upsert=True) # Mark artefact as completed self.client.supabase.table('document_artefacts').update({ 'status': 'completed' }).eq('id', artefact_id).execute() logger.info(f"Document analysis completed for file {file_id} (sections={sections_count})") return { 'sections_count': sections_count } except Exception as e: logger.error(f"Document analysis failed for file {file_id}: {e}") raise def process_page_images_task(self, task: QueueTask) -> Dict[str, Any]: """Process page images generation task""" file_id = task.file_id payload = task.payload logger.info(f"Processing page images task for file {file_id}") try: # Load file from storage bucket = payload['bucket'] file_path = payload['file_path'] cabinet_id = payload['cabinet_id'] file_bytes = self.storage.download_file(bucket, file_path) # Import here to avoid circular imports from modules.page_image_generator import create_page_images_artefact # Generate page images images_data = create_page_images_artefact( file_id=file_id, cabinet_id=cabinet_id, pdf_bytes=file_bytes ) artefact_id = images_data['artefact_id'] # Include bucket in manifest for client-side signed URL generation images_data['bucket'] = bucket # Upload all page images to storage for page_info in images_data['page_images']: # Upload full image full_path = page_info['full_image_path'] full_data = page_info.pop('full_image_data') # Remove from JSON self.storage.upload_file(bucket, full_path, full_data, 'image/png', upsert=True) # Upload thumbnail thumb_path = page_info['thumbnail_path'] thumb_data = page_info.pop('thumbnail_data') # Remove from JSON self.storage.upload_file(bucket, thumb_path, thumb_data, 'image/webp', upsert=True) # Store images metadata manifest under the artefact directory artefact_dir = f"{cabinet_id}/{file_id}/{artefact_id}" manifest_rel_path = f"{artefact_dir}/page_images.json" images_json = json.dumps(images_data, ensure_ascii=False) self.storage.upload_file(bucket, manifest_rel_path, images_json.encode('utf-8'), 'application/json', upsert=True) # Insert artefact record client = SupabaseServiceRoleClient() client.supabase.table('document_artefacts').insert({ 'id': artefact_id, 'file_id': file_id, 'type': 'page_images', # Store the directory prefix as rel_path for hybrid approach 'rel_path': artefact_dir, 'extra': { 'page_count': images_data['page_count'], 'total_full_images': images_data['storage_info']['total_full_images'], 'total_thumbnails': images_data['storage_info']['total_thumbnails'], 'estimated_storage_mb': images_data['storage_info']['estimated_storage_mb'], 'manifest': manifest_rel_path }, 'status': 'completed' }).execute() logger.info(f"Page images generation completed for file {file_id}") return { 'page_count': images_data['page_count'], 'estimated_storage_mb': images_data['storage_info']['estimated_storage_mb'] } except Exception as e: logger.error(f"Page images generation failed for file {file_id}: {e}") raise def process_comparison_analysis_task(self, task: QueueTask) -> Dict[str, Any]: """Process comparison analysis between no-OCR and OCR docling results.""" file_id = task.file_id payload = task.payload logger.info(f"Processing comparison analysis task for file {file_id}") try: no_ocr_group_id = payload.get('no_ocr_group_id') ocr_group_id = payload.get('ocr_group_id') comparison_type = payload.get('comparison_type', 'noocr_vs_ocr') initial_delay = payload.get('initial_delay_seconds', 0) # If this is the first execution and we have an initial delay, sleep briefly if initial_delay > 0: import time logger.info(f"Comparison analysis: applying initial delay of {min(initial_delay, 60)} seconds for file {file_id}") time.sleep(min(initial_delay, 60)) # Max 1 minute delay per attempt logger.info(f"Comparison analysis: delay complete for file {file_id}") if not no_ocr_group_id or not ocr_group_id: raise ValueError("Missing group_id parameters for comparison") client = SupabaseServiceRoleClient() # Find file info fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute() if not fr.data: raise ValueError(f"File {file_id} not found") file_row = fr.data bucket = file_row['bucket'] cabinet_id = file_row['cabinet_id'] # Find artefacts for both groups artefacts = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute() arts = artefacts.data or [] # Filter artefacts by group_id and type, including status no_ocr_arts = [a for a in arts if ((a.get('extra') or {}).get('group_id') == no_ocr_group_id and a.get('type') == 'docling_standard' and a.get('status') == 'completed')] ocr_arts = [a for a in arts if ((a.get('extra') or {}).get('group_id') == ocr_group_id and a.get('type') == 'docling_standard' and a.get('status') == 'completed')] # Also check pending/processing artefacts to understand timing better no_ocr_pending = [a for a in arts if ((a.get('extra') or {}).get('group_id') == no_ocr_group_id and a.get('type') == 'docling_standard' and a.get('status') in ('processing', 'pending'))] ocr_pending = [a for a in arts if ((a.get('extra') or {}).get('group_id') == ocr_group_id and a.get('type') == 'docling_standard' and a.get('status') in ('processing', 'pending'))] # Determine expected total parts from split_total metadata (if available) expected_parts = None if no_ocr_arts: expected_parts = (no_ocr_arts[0].get('extra') or {}).get('split_total') elif ocr_arts: expected_parts = (ocr_arts[0].get('extra') or {}).get('split_total') elif no_ocr_pending: expected_parts = (no_ocr_pending[0].get('extra') or {}).get('split_total') elif ocr_pending: expected_parts = (ocr_pending[0].get('extra') or {}).get('split_total') logger.info(f"Comparison analysis: found {len(no_ocr_arts)} completed no-OCR artefacts ({len(no_ocr_pending)} pending), {len(ocr_arts)} completed OCR artefacts ({len(ocr_pending)} pending), expected_parts={expected_parts}") # Enhanced validation with progress-aware retry logic if expected_parts is not None: # We know how many parts to expect, so wait for all of them total_no_ocr = len(no_ocr_arts) + len(no_ocr_pending) total_ocr = len(ocr_arts) + len(ocr_pending) # Calculate completion percentages no_ocr_completion = len(no_ocr_arts) / expected_parts * 100 ocr_completion = len(ocr_arts) / expected_parts * 100 # Check if we're making progress (store in task metadata for persistence) progress_key = f"comparison_progress_{file_id}" current_progress = { 'no_ocr_completed': len(no_ocr_arts), 'ocr_completed': len(ocr_arts), 'no_ocr_pending': len(no_ocr_pending), 'ocr_pending': len(ocr_pending) } # Get previous progress from payload (injected by retry mechanism) previous_progress = payload.get('previous_progress', {'no_ocr_completed': 0, 'ocr_completed': 0}) progress_made = (current_progress['no_ocr_completed'] > previous_progress['no_ocr_completed'] or current_progress['ocr_completed'] > previous_progress['ocr_completed']) if len(no_ocr_arts) < expected_parts or len(ocr_arts) < expected_parts: if len(no_ocr_pending) > 0 or len(ocr_pending) > 0: # Still processing - this is expected, always retry error_msg = f"PROGRESS_RETRY: expected={expected_parts}, no_ocr={len(no_ocr_arts)}/{expected_parts} ({no_ocr_completion:.1f}%), ocr={len(ocr_arts)}/{expected_parts} ({ocr_completion:.1f}%), pending: no_ocr={len(no_ocr_pending)}, ocr={len(ocr_pending)}" progress_retry_error = ValueError(error_msg) progress_retry_error.current_progress = current_progress progress_retry_error.is_progress_retry = True raise progress_retry_error elif progress_made: # No pending but made progress since last check - likely brief gap between completions error_msg = f"PROGRESS_MADE_RETRY: expected={expected_parts}, no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)}, progress since last check" progress_retry_error = ValueError(error_msg) progress_retry_error.current_progress = current_progress progress_retry_error.is_progress_retry = True raise progress_retry_error else: # No progress and no pending - likely stalled, but still retry with backoff error_msg = f"STALLED_RETRY: expected={expected_parts}, no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - no pending tasks but will retry" stalled_retry_error = ValueError(error_msg) stalled_retry_error.current_progress = current_progress stalled_retry_error.is_stalled_retry = True raise stalled_retry_error # Also verify both groups have the same number of completed parts if len(no_ocr_arts) != len(ocr_arts): error_msg = f"ALIGNMENT_RETRY: no_ocr completed={len(no_ocr_arts)}, ocr completed={len(ocr_arts)} (expected {expected_parts} each) - waiting for alignment" alignment_retry_error = ValueError(error_msg) alignment_retry_error.current_progress = current_progress alignment_retry_error.is_alignment_retry = True raise alignment_retry_error else: # Fallback to original logic when split_total not available if not no_ocr_arts or not ocr_arts: # More detailed retry logic with pending artefact awareness if len(no_ocr_arts) == 0 and len(ocr_arts) == 0: if len(no_ocr_pending) > 0 or len(ocr_pending) > 0: raise ValueError(f"Batches still processing: no_ocr completed={len(no_ocr_arts)} pending={len(no_ocr_pending)}, ocr completed={len(ocr_arts)} pending={len(ocr_pending)} - will retry") else: raise ValueError(f"No artefacts found for either group: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - may need more time") elif len(ocr_arts) == 0: if len(ocr_pending) > 0: raise ValueError(f"OCR batch still processing: no_ocr completed={len(no_ocr_arts)}, ocr completed={len(ocr_arts)} pending={len(ocr_pending)} - will retry") else: raise ValueError(f"OCR batch appears incomplete: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - will retry") elif len(no_ocr_arts) == 0: if len(no_ocr_pending) > 0: raise ValueError(f"No-OCR batch still processing: no_ocr completed={len(no_ocr_arts)} pending={len(no_ocr_pending)}, ocr completed={len(ocr_arts)} - will retry") else: raise ValueError(f"No-OCR batch appears incomplete: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - will retry") else: raise ValueError(f"Unexpected missing artefacts: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)}") # For fallback case, ensure both groups have same count if len(no_ocr_arts) != len(ocr_arts): raise ValueError(f"Mismatched group sizes: no_ocr={len(no_ocr_arts)}, ocr={len(ocr_arts)} - will retry") # Sort both groups by split_order for aligned comparison no_ocr_arts.sort(key=lambda x: ((x.get('extra') or {}).get('split_order') or 0)) ocr_arts.sort(key=lambda x: ((x.get('extra') or {}).get('split_order') or 0)) # Log final validation before proceeding no_ocr_orders = [((a.get('extra') or {}).get('split_order') or 0) for a in no_ocr_arts] ocr_orders = [((a.get('extra') or {}).get('split_order') or 0) for a in ocr_arts] logger.info(f"Proceeding with comparison: no_ocr split_orders={no_ocr_orders}, ocr split_orders={ocr_orders}, expected_parts={expected_parts}") # Create comparison results comparison_results = self._compare_docling_groups( file_id, bucket, cabinet_id, no_ocr_arts, ocr_arts, comparison_type, no_ocr_group_id, ocr_group_id, payload ) return comparison_results except Exception as e: logger.error(f"Comparison analysis failed for file {file_id}: {e}") raise def _compare_docling_groups(self, file_id: str, bucket: str, cabinet_id: str, no_ocr_arts: list, ocr_arts: list, comparison_type: str, no_ocr_group_id: str, ocr_group_id: str, payload: Dict[str, Any]) -> Dict[str, Any]: """Compare two groups of docling artefacts and generate analysis.""" import subprocess import tempfile import json import uuid logger.info(f"Starting detailed comparison for file {file_id}: {len(no_ocr_arts)} vs {len(ocr_arts)} artefacts") artefact_id = str(uuid.uuid4()) comparison_dir = f"{cabinet_id}/{file_id}/{artefact_id}" results = [] overall_stats = { 'total_comparisons': min(len(no_ocr_arts), len(ocr_arts)), 'successful_comparisons': 0, 'failed_comparisons': 0, 'differences_found': 0, 'identical_count': 0 } try: with tempfile.TemporaryDirectory() as temp_dir: for i in range(min(len(no_ocr_arts), len(ocr_arts))): no_ocr_art = no_ocr_arts[i] ocr_art = ocr_arts[i] try: # Download manifest JSONs for both artefacts no_ocr_manifest_path = ((no_ocr_art.get('extra') or {}).get('manifest')) ocr_manifest_path = ((ocr_art.get('extra') or {}).get('manifest')) if not no_ocr_manifest_path or not ocr_manifest_path: logger.warning(f"Missing manifest paths for comparison {i+1}") continue no_ocr_manifest_data = self.storage.download_file(bucket, no_ocr_manifest_path) ocr_manifest_data = self.storage.download_file(bucket, ocr_manifest_path) no_ocr_manifest = json.loads(no_ocr_manifest_data.decode('utf-8')) ocr_manifest = json.loads(ocr_manifest_data.decode('utf-8')) # Compare JSON content if available no_ocr_json_path = no_ocr_manifest.get('json_full') ocr_json_path = ocr_manifest.get('json_full') if no_ocr_json_path and ocr_json_path: comparison_result = self._compare_json_content( bucket, no_ocr_json_path, ocr_json_path, temp_dir, i + 1 ) comparison_result['no_ocr_artefact_id'] = no_ocr_art['id'] comparison_result['ocr_artefact_id'] = ocr_art['id'] comparison_result['split_order'] = (no_ocr_art.get('extra') or {}).get('split_order', i + 1) comparison_result['split_heading'] = (no_ocr_art.get('extra') or {}).get('split_heading', f'Part {i+1}') results.append(comparison_result) overall_stats['successful_comparisons'] += 1 if comparison_result['has_differences']: overall_stats['differences_found'] += 1 else: overall_stats['identical_count'] += 1 else: logger.warning(f"Missing JSON content paths for comparison {i+1}") overall_stats['failed_comparisons'] += 1 except Exception as part_e: logger.warning(f"Failed to compare part {i+1}: {part_e}") overall_stats['failed_comparisons'] += 1 continue # Create comprehensive comparison report comparison_report = { 'file_id': file_id, 'comparison_type': comparison_type, 'timestamp': json.dumps({"created_at": "now()"}, default=str), 'overall_statistics': overall_stats, 'detailed_results': results, 'summary': { 'total_parts_compared': overall_stats['successful_comparisons'], 'identical_parts': overall_stats['identical_count'], 'different_parts': overall_stats['differences_found'], 'accuracy_percentage': (overall_stats['identical_count'] / max(overall_stats['successful_comparisons'], 1)) * 100 } } # Store comparison report as artefact report_path = f"{comparison_dir}/comparison_report.json" report_json = json.dumps(comparison_report, ensure_ascii=False, indent=2) self.storage.upload_file(bucket, report_path, report_json.encode('utf-8'), 'application/json', upsert=True) # Create artefact record client = SupabaseServiceRoleClient() client.supabase.table('document_artefacts').insert({ 'id': artefact_id, 'file_id': file_id, 'type': 'docling_comparison_analysis', 'rel_path': report_path, 'extra': { 'comparison_type': comparison_type, 'no_ocr_group_id': no_ocr_group_id, 'ocr_group_id': ocr_group_id, 'producer': payload.get('producer', 'auto_split'), 'total_comparisons': overall_stats['total_comparisons'], 'successful_comparisons': overall_stats['successful_comparisons'], 'differences_found': overall_stats['differences_found'], 'accuracy_percentage': comparison_report['summary']['accuracy_percentage'] }, 'status': 'completed' }).execute() logger.info(f"Comparison analysis completed for file {file_id}: {overall_stats['successful_comparisons']} comparisons, {overall_stats['differences_found']} differences found") # Trigger VLM processing after comparison completes (if enabled) self._trigger_vlm_after_comparison(file_id, payload) return { 'artefact_id': artefact_id, 'comparisons_completed': overall_stats['successful_comparisons'], 'differences_found': overall_stats['differences_found'], 'accuracy_percentage': comparison_report['summary']['accuracy_percentage'] } except Exception as e: logger.error(f"Failed to create comparison analysis for file {file_id}: {e}") raise def _compare_json_content(self, bucket: str, no_ocr_path: str, ocr_path: str, temp_dir: str, part_number: int) -> Dict[str, Any]: """Compare JSON content using jq and diff as suggested in web search results.""" import subprocess import os from pathlib import Path try: # Download both JSON files no_ocr_data = self.storage.download_file(bucket, no_ocr_path) ocr_data = self.storage.download_file(bucket, ocr_path) # Save to temp files no_ocr_file = Path(temp_dir) / f'no_ocr_part_{part_number}.json' ocr_file = Path(temp_dir) / f'ocr_part_{part_number}.json' with open(no_ocr_file, 'wb') as f: f.write(no_ocr_data) with open(ocr_file, 'wb') as f: f.write(ocr_data) # Use jq to sort and format both files for comparison (as suggested in web search results) sorted_no_ocr = Path(temp_dir) / f'sorted_no_ocr_part_{part_number}.json' sorted_ocr = Path(temp_dir) / f'sorted_ocr_part_{part_number}.json' # Sort both files using jq subprocess.run(['jq', '--sort-keys', '.', str(no_ocr_file)], stdout=open(sorted_no_ocr, 'w'), stderr=subprocess.DEVNULL, check=True) subprocess.run(['jq', '--sort-keys', '.', str(ocr_file)], stdout=open(sorted_ocr, 'w'), stderr=subprocess.DEVNULL, check=True) # Compare using diff diff_output = Path(temp_dir) / f'diff_part_{part_number}.txt' diff_result = subprocess.run( ['diff', '-u', str(sorted_no_ocr), str(sorted_ocr)], stdout=open(diff_output, 'w'), stderr=subprocess.DEVNULL, text=True ) # Read diff output with open(diff_output, 'r') as f: diff_content = f.read() # Analyze differences has_differences = diff_result.returncode != 0 diff_lines = len([l for l in diff_content.split('\n') if l.startswith(('+', '-')) and not l.startswith(('+++', '---'))]) return { 'part_number': part_number, 'has_differences': has_differences, 'diff_lines_count': diff_lines, 'diff_content_preview': diff_content[:1000] if diff_content else '', # First 1000 chars 'no_ocr_size': len(no_ocr_data), 'ocr_size': len(ocr_data), 'size_difference': abs(len(ocr_data) - len(no_ocr_data)) } except subprocess.CalledProcessError as e: logger.warning(f"jq/diff command failed for part {part_number}: {e}") return { 'part_number': part_number, 'has_differences': True, 'error': f"Comparison tools failed: {str(e)}", 'diff_lines_count': -1 } except Exception as e: logger.warning(f"JSON comparison failed for part {part_number}: {e}") return { 'part_number': part_number, 'has_differences': True, 'error': f"Comparison failed: {str(e)}", 'diff_lines_count': -1 } def process_vlm_section_page_bundle_task(self, task: QueueTask) -> Dict[str, Any]: """Process VLM section page bundle task - create individual page bundles and combine them.""" file_id = task.file_id payload = task.payload logger.info(f"Processing VLM section page bundle task for file {file_id}") try: section_idx = payload.get('section_idx') start_page = payload.get('start_page') end_page = payload.get('end_page') section_title = payload.get('section_title', f'Section {section_idx}') vlm_group_id = payload.get('vlm_group_id') vlm_model = payload.get('vlm_model', 'smoldocling') base_config = payload.get('base_config', {}) total_sections = payload.get('total_sections', 1) client = SupabaseServiceRoleClient() # Get file info fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute() if not fr.data: raise ValueError(f"File {file_id} not found") file_row = fr.data bucket = file_row['bucket'] cabinet_id = file_row['cabinet_id'] # Find processing path (prefer converted PDF) arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or [] pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None) processing_path = pdf_art['rel_path'] if pdf_art else file_row['path'] processing_mime = 'application/pdf' logger.info(f"VLM section bundle: processing section {section_idx} '{section_title}' pages {start_page}-{end_page} for file {file_id}") # Create individual page processing tasks page_task_ids = [] for page_num in range(start_page, end_page + 1): try: page_config = { **base_config, 'do_ocr': False, 'force_ocr': False, 'pipeline': 'vlm', 'vlm_pipeline_model': vlm_model, 'page_range': [page_num, page_num], 'target_type': 'zip', 'image_export_mode': 'referenced', # Add required VLM parameters that may be missing 'do_picture_classification': False, 'do_picture_description': False } logger.debug(f"VLM page {page_num} config: pipeline={page_config.get('pipeline')}, model={page_config.get('vlm_pipeline_model')}, range={page_config.get('page_range')}") from modules.queue_system import enqueue_docling_task, TaskPriority page_task_id = enqueue_docling_task( file_id=file_id, task_type='canonical_docling_json', payload={ 'bucket': bucket, 'file_path': processing_path, 'cabinet_id': cabinet_id, 'mime_type': processing_mime, 'config': page_config, 'artefact_extra': { 'is_subdoc': True, 'page_range': [page_num, page_num], 'label': f'{section_title} - Page {page_num}', 'vlm_section_idx': section_idx, 'vlm_section_title': section_title, 'vlm_page_number': page_num, 'vlm_section_start': start_page, 'vlm_section_end': end_page, 'producer': 'auto_split_vlm_page' } }, priority=TaskPriority.NORMAL, timeout=1800 ) page_task_ids.append((page_num, page_task_id)) logger.debug(f"Enqueued VLM page task {page_task_id} for page {page_num} of section {section_idx}") except Exception as page_e: logger.warning(f"Failed to enqueue VLM page {page_num} for section {section_idx} file {file_id}: {page_e}") continue if not page_task_ids: raise ValueError(f"No page tasks could be enqueued for section {section_idx}") # Wait for all page tasks to complete and then create section bundle logger.info(f"Enqueued {len(page_task_ids)} VLM page tasks for section {section_idx}, now waiting for completion...") # Create a follow-up task to bundle the completed page results from modules.queue_system import enqueue_docling_task, TaskPriority import time # Wait a bit for page tasks to start, then create bundle task time.sleep(10) bundle_task_id = enqueue_docling_task( file_id=file_id, task_type='vlm_section_bundle_collector', payload={ 'section_idx': section_idx, 'start_page': start_page, 'end_page': end_page, 'section_title': section_title, 'vlm_group_id': vlm_group_id, 'vlm_model': vlm_model, 'total_sections': total_sections, 'producer': 'auto_split', 'page_task_ids': [tid for _, tid in page_task_ids], 'expected_pages': list(range(start_page, end_page + 1)) }, priority=TaskPriority.LOW, # Run after page tasks timeout=3600 ) logger.info(f"Created VLM section bundle collector task {bundle_task_id} for section {section_idx}") return { 'section_idx': section_idx, 'page_tasks_created': len(page_task_ids), 'bundle_task_id': bundle_task_id, 'pages_range': f"{start_page}-{end_page}" } except Exception as e: logger.error(f"VLM section page bundle task failed for file {file_id}: {e}") raise def process_vlm_section_bundle_collector_task(self, task: QueueTask) -> Dict[str, Any]: """Collect completed VLM page results and create section-level bundle manifest.""" file_id = task.file_id payload = task.payload logger.info(f"Processing VLM section bundle collector for file {file_id}") try: section_idx = payload.get('section_idx') start_page = payload.get('start_page') end_page = payload.get('end_page') section_title = payload.get('section_title', f'Section {section_idx}') vlm_group_id = payload.get('vlm_group_id') vlm_model = payload.get('vlm_model', 'smoldocling') total_sections = payload.get('total_sections', 1) expected_pages = payload.get('expected_pages', []) client = SupabaseServiceRoleClient() # Get file info fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute() if not fr.data: raise ValueError(f"File {file_id} not found") file_row = fr.data bucket = file_row['bucket'] cabinet_id = file_row['cabinet_id'] # Find all completed VLM page artefacts for this section artefacts = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).execute() arts = artefacts.data or [] # Filter for this section's VLM page artefacts section_page_arts = [] for art in arts: extra = art.get('extra', {}) if (extra.get('vlm_section_idx') == section_idx and extra.get('producer') == 'auto_split_vlm_page' and art.get('type') == 'docling_vlm' and art.get('status') == 'completed'): section_page_arts.append(art) # Check if we have all expected pages found_pages = [art.get('extra', {}).get('vlm_page_number') for art in section_page_arts] found_pages = [p for p in found_pages if p is not None] missing_pages = [p for p in expected_pages if p not in found_pages] logger.info(f"VLM section {section_idx} bundle collector: found {len(section_page_arts)} page artefacts, expected {len(expected_pages)} pages") if logger.isEnabledFor(10): # DEBUG level found_pages_debug = [art.get('extra', {}).get('vlm_page_number') for art in section_page_arts] logger.debug(f"VLM section {section_idx}: found pages {found_pages_debug}, expected pages {expected_pages}") if missing_pages: # Not all pages are ready, retry later logger.info(f"VLM section {section_idx} bundle collector: missing pages {missing_pages}, found pages {found_pages} - will retry later") raise ValueError(f"VLM section {section_idx} missing pages: {missing_pages} (found: {found_pages}) - will retry") # Sort page artefacts by page number section_page_arts.sort(key=lambda x: x.get('extra', {}).get('vlm_page_number', 0)) logger.info(f"VLM section {section_idx} bundle: creating manifest for {len(section_page_arts)} pages") # Create section bundle manifest section_artefact_id = str(uuid.uuid4()) section_manifest_path = f"{cabinet_id}/{file_id}/{section_artefact_id}/vlm_section_{section_idx}_manifest.json" page_bundles = [] for page_art in section_page_arts: extra = page_art.get('extra', {}) page_num = extra.get('vlm_page_number') page_manifest_path = extra.get('manifest') page_bundles.append({ 'page_number': page_num, 'artefact_id': page_art['id'], 'manifest_path': page_manifest_path, 'rel_path': page_art['rel_path'], 'label': extra.get('label', f'Page {page_num}') }) section_manifest = { 'file_id': file_id, 'section_idx': section_idx, 'section_title': section_title, 'start_page': start_page, 'end_page': end_page, 'vlm_model': vlm_model, 'total_pages': len(page_bundles), 'page_bundles': page_bundles, 'created_at': 'now()', 'type': 'vlm_section_page_bundle' } # Store section manifest import json manifest_json = json.dumps(section_manifest, ensure_ascii=False, indent=2) self.storage.upload_file(bucket, section_manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True) # Create section bundle artefact client.supabase.table('document_artefacts').insert({ 'id': section_artefact_id, 'file_id': file_id, 'type': 'vlm_section_page_bundle', 'rel_path': section_manifest_path, 'extra': { 'section_idx': section_idx, 'section_title': section_title, 'start_page': start_page, 'end_page': end_page, 'vlm_model': vlm_model, 'total_pages': len(page_bundles), 'group_id': vlm_group_id, 'split_order': section_idx, 'split_heading': section_title, 'split_total': total_sections, 'pipeline': 'vlm', 'producer': 'auto_split', 'group_pack_type': 'vlm_page_bundle_auto_split' }, 'status': 'completed' }).execute() logger.info(f"VLM section bundle collector completed for section {section_idx} of file {file_id}: created manifest with {len(page_bundles)} page bundles") return { 'section_artefact_id': section_artefact_id, 'section_idx': section_idx, 'pages_bundled': len(page_bundles), 'manifest_path': section_manifest_path } except Exception as e: logger.error(f"VLM section bundle collector failed for file {file_id}: {e}") raise def _trigger_vlm_after_comparison(self, file_id: str, comparison_payload: Dict[str, Any]): """Trigger VLM processing after comparison analysis completes.""" try: # Check if VLM should be triggered if not comparison_payload.get('trigger_vlm_after_comparison'): logger.debug(f"VLM post-comparison trigger not enabled for file {file_id}") return vlm_config = comparison_payload.get('vlm_config', {}) if not vlm_config.get('enabled'): logger.debug(f"VLM not enabled for file {file_id}") return logger.info(f"[auto-canonical] Triggering VLM processing after comparison for file {file_id}") # Extract VLM configuration split_by_page = vlm_config.get('split_by_page', False) vlm_model = vlm_config.get('model', 'smoldocling') threshold = vlm_config.get('threshold', 50) base_config = vlm_config.get('base_config', {}) # Generate new group_id for VLM processing import uuid vlm_group_id = str(uuid.uuid4()) if split_by_page: # Page-by-page processing within sections logger.info(f"[auto-canonical] vlm page-by-page processing for file {file_id} (post-comparison)") self._enqueue_vlm_page_processing( file_id, threshold, vlm_group_id, vlm_model, base_config ) else: # Standard section-level VLM processing from routers.database.files.files import enqueue_canonical_docling body_vlm = { 'use_split_map': True, 'threshold': threshold, 'producer': 'auto_split', 'group_id': vlm_group_id, 'config': { **base_config, 'do_ocr': False, # VLM doesn't need OCR 'force_ocr': False, 'pipeline': 'vlm', 'vlm_pipeline_model': vlm_model } } logger.info(f"[auto-canonical] vlm section batch group_id={vlm_group_id} for file {file_id} (post-comparison)") enqueue_canonical_docling(file_id=file_id, body=body_vlm) except Exception as e: logger.warning(f"Failed to trigger VLM processing after comparison for file {file_id}: {e}") def process_docling_bundle_task(self, task: QueueTask) -> Dict[str, Any]: """ Process single docling bundle task (whole document processing). This creates a coherent single bundle with all formats using direct processing. NO temporary tasks or old logic reuse - this is the new architecture. """ file_id = task.file_id payload = task.payload logger.info(f"🎯 NEW ARCHITECTURE: Processing docling bundle task for file {file_id} (whole document)") try: # Extract bundle configuration config = payload.get('config', {}) bundle_metadata = payload.get('bundle_metadata', {}) # Ensure bundle processing configuration config['target_type'] = 'zip' config['to_formats'] = ['json', 'html', 'text', 'md', 'doctags'] # Call the actual docling processing directly - NO temp tasks! result = self._process_docling_bundle_direct(task, config, bundle_metadata) logger.info(f"✅ NEW ARCHITECTURE: Successfully processed docling bundle for file {file_id}") return result except Exception as e: logger.error(f"❌ NEW ARCHITECTURE: Docling bundle processing failed for file {file_id}: {e}") raise def _process_docling_bundle_direct(self, task: QueueTask, config: Dict[str, Any], bundle_metadata: Dict[str, Any]) -> Dict[str, Any]: """ Direct docling bundle processing - NEW ARCHITECTURE approach. This processes the docling request directly without creating temporary tasks, ensuring clean Redis state and proper bundle metadata handling. """ file_id = task.file_id payload = task.payload logger.info(f"🔧 DIRECT PROCESSING: Starting docling bundle processing for file {file_id}") if not self.docling_url: raise ValueError("DOCLING_URL not configured") # Extract payload data bucket = payload['bucket'] file_path = payload['file_path'] cabinet_id = payload['cabinet_id'] # Download file logger.debug(f"📥 DIRECT PROCESSING: Downloading file for bundle processing: {bucket}/{file_path}") file_bytes = self.storage.download_file(bucket, file_path) # Prepare Docling request with bundle-specific config docling_api_key = os.getenv('DOCLING_API_KEY') headers = {'Accept': '*/*'} if docling_api_key: headers['X-Api-Key'] = docling_api_key # Build form data for bundle processing - USE CONFIG FROM PIPELINE_CONTROLLER (no hardcoded defaults!) # The config passed from pipeline_controller already has environment variables loaded form_data = [ ('target_type', 'zip'), # Always zip for bundles ('do_ocr', str(config.get('do_ocr', False)).lower()), ('force_ocr', str(config.get('force_ocr', False)).lower()), ('image_export_mode', 'referenced'), # Bundle standard ('ocr_engine', config.get('ocr_engine', 'easyocr')), ('pdf_backend', config.get('pdf_backend', 'dlparse_v4')), ('table_mode', config.get('table_mode', 'fast')), # Use config from pipeline_controller (env vars) ('table_cell_matching', str(config.get('table_cell_matching', True)).lower()), # Use config from pipeline_controller (env: true) ('pipeline', config.get('pipeline', 'standard')), ('do_formula_enrichment', str(config.get('do_formula_enrichment', True)).lower()), # Use config from pipeline_controller (env: true) ('do_code_enrichment', str(config.get('do_code_enrichment', True)).lower()), # Use config from pipeline_controller (env: true) ('do_table_structure', str(config.get('do_table_structure', True)).lower()), ('include_images', str(config.get('include_images', True)).lower()), ('images_scale', str(config.get('images_scale', 2.0))), ('do_picture_classification', str(config.get('do_picture_classification', False)).lower()), ('do_picture_description', str(config.get('do_picture_description', False)).lower()), ('document_timeout', str(config.get('document_timeout', task.timeout))) ] # Handle OCR languages as array (API expects multiple form fields) ocr_lang = config.get('ocr_lang') if ocr_lang: if isinstance(ocr_lang, list): for lang in ocr_lang: form_data.append(('ocr_lang', str(lang))) else: form_data.append(('ocr_lang', str(ocr_lang))) # Handle VLM pipeline options (CRITICAL for VLM processing) if config.get('vlm_pipeline_model'): form_data.append(('vlm_pipeline_model', config.get('vlm_pipeline_model'))) # VLM model local/API options must be JSON per Docling OpenAPI spec if config.get('vlm_pipeline_model_local'): vlm_local = config.get('vlm_pipeline_model_local') if isinstance(vlm_local, (dict, list)): form_data.append(('vlm_pipeline_model_local', json.dumps(vlm_local))) elif isinstance(vlm_local, str) and vlm_local.strip().startswith(('{', '[')): form_data.append(('vlm_pipeline_model_local', vlm_local)) # else: omit to avoid validation error if config.get('vlm_pipeline_model_api'): vlm_api = config.get('vlm_pipeline_model_api') if isinstance(vlm_api, (dict, list)): form_data.append(('vlm_pipeline_model_api', json.dumps(vlm_api))) elif isinstance(vlm_api, str) and vlm_api.strip().startswith(('{', '[')): form_data.append(('vlm_pipeline_model_api', vlm_api)) # else: omit # Picture description options must be JSON per Docling OpenAPI spec if config.get('picture_description_local'): pic_local = config.get('picture_description_local') if isinstance(pic_local, (dict, list)): form_data.append(('picture_description_local', json.dumps(pic_local))) elif isinstance(pic_local, str) and pic_local.strip().startswith(('{', '[')): form_data.append(('picture_description_local', pic_local)) if config.get('picture_description_api'): pic_api = config.get('picture_description_api') if isinstance(pic_api, (dict, list)): form_data.append(('picture_description_api', json.dumps(pic_api))) elif isinstance(pic_api, str) and pic_api.strip().startswith(('{', '[')): form_data.append(('picture_description_api', pic_api)) if 'picture_description_area_threshold' in config: form_data.append(('picture_description_area_threshold', str(config.get('picture_description_area_threshold')))) # Handle markdown page break placeholder if 'md_page_break_placeholder' in config: form_data.append(('md_page_break_placeholder', config.get('md_page_break_placeholder'))) # Add formats - always all formats for bundles for fmt in ['json', 'html', 'text', 'md', 'doctags']: form_data.append(('to_formats', fmt)) # Handle page range properly - get actual PDF page count like frontmatter does page_range = config.get('page_range', [1, 999999]) if isinstance(page_range, list) and len(page_range) >= 2: def _to_int_safe(v, default): try: return int(v) except Exception: return default start_pg = _to_int_safe(page_range[0], 1) end_pg = _to_int_safe(page_range[1], 999999) if start_pg < 1: start_pg = 1 if end_pg < start_pg: end_pg = start_pg # CRITICAL: Get actual PDF page count to prevent massive range try: import fitz # PyMuPDF doc = fitz.open(stream=file_bytes, filetype='pdf') pc = int(doc.page_count) doc.close() if pc > 0: end_pg = min(end_pg, pc) # Clamp to actual page count! start_pg = max(1, min(start_pg, pc)) if end_pg < start_pg: end_pg = start_pg logger.info(f"📄 DIRECT PROCESSING: PDF has {pc} pages, using range {start_pg}-{end_pg}") except Exception as e: logger.warning(f"Could not determine PDF page count: {e}, using defaults") form_data.append(('page_range', str(start_pg))) form_data.append(('page_range', str(end_pg))) else: # Fallback to single page if no range specified form_data.append(('page_range', '1')) form_data.append(('page_range', '1')) files = [('files', ('file', file_bytes, payload.get('mime_type', 'application/pdf')))] # DEBUG: Log the actual config being sent to Docling config_debug = {key: value for key, value in form_data if key in ['table_mode', 'table_cell_matching', 'do_formula_enrichment', 'do_code_enrichment', 'do_ocr', 'pipeline']} logger.info(f"🔧 DIRECT PROCESSING: Docling config being sent: {config_debug}") # Make the HTTP request logger.info(f"🌐 DIRECT PROCESSING: Making HTTP request to Docling for file {file_id}") try: import time start_time = time.time() response = requests.post( f"{self.docling_url.rstrip('/')}/v1/convert/file", files=files, data=form_data, headers=headers, timeout=task.timeout ) response.raise_for_status() elapsed = time.time() - start_time logger.info(f"⚡ DIRECT PROCESSING: Docling request completed in {elapsed:.2f}s for file {file_id}") except Exception as e: logger.error(f"🌐 DIRECT PROCESSING: HTTP request failed for file {file_id}: {e}") raise # Process response - should be ZIP for bundle content_type = (response.headers.get('Content-Type') or '').lower() is_zip_resp = ('zip' in content_type) or (response.content[:2] == b'PK') if not is_zip_resp: raise ValueError(f"Expected ZIP response for bundle, got: {content_type}") # Process ZIP bundle and create artefacts logger.info(f"📦 DIRECT PROCESSING: Processing ZIP bundle for file {file_id}") result = self._process_docling_zip_bundle( file_id=file_id, bucket=bucket, cabinet_id=cabinet_id, zip_content=response.content, bundle_metadata=bundle_metadata, task_config=config ) logger.info(f"✅ DIRECT PROCESSING: Bundle processing completed for file {file_id}") return result def _create_bundle_display_metadata(self, bundle_type: str, title: str, index: int = None, total: int = None, page_range: list = None) -> dict: """ Create consistent display metadata for bundle organization. This ensures all bundles have proper titles, ordering, and display names for frontend organization and user-friendly presentation. """ metadata = { 'title': title, 'bundle_type': bundle_type } if index is not None: metadata['split_order'] = index if total is not None: metadata['split_total'] = total if page_range: metadata['page_range'] = page_range metadata['page_count'] = page_range[1] - page_range[0] + 1 # Create display names based on bundle type if bundle_type == 'page': metadata['display_name'] = f"Page {page_range[0]}" if page_range else f"Page {index}" metadata['bundle_label'] = f"Page {page_range[0]} Bundle" metadata['sort_key'] = page_range[0] if page_range else index elif bundle_type == 'section': page_str = f" (p{page_range[0]}-{page_range[1]})" if page_range else "" metadata['display_name'] = f"{index:02d}. {title}{page_str}" metadata['bundle_label'] = f"{title} Bundle" metadata['sort_key'] = index elif bundle_type == 'chunk': page_str = f" (p{page_range[0]}-{page_range[1]})" if page_range else "" metadata['display_name'] = f"{index:02d}. {title}{page_str}" metadata['bundle_label'] = f"{title} Bundle" metadata['sort_key'] = index else: metadata['display_name'] = title metadata['bundle_label'] = f"{title} Bundle" metadata['sort_key'] = index or 0 return metadata def _process_docling_zip_bundle(self, file_id: str, bucket: str, cabinet_id: str, zip_content: bytes, bundle_metadata: Dict[str, Any], task_config: Dict[str, Any]) -> Dict[str, Any]: """ Process ZIP bundle response and create artefacts with proper bundle metadata. This is the NEW ARCHITECTURE approach for handling docling ZIP responses. """ import zipfile import io import uuid import json import time logger.info(f"📦 ZIP PROCESSING: Starting bundle extraction for file {file_id}") # Create bundle artefact structure artefact_id = str(uuid.uuid4()) base_dir = f"{cabinet_id}/{file_id}/{artefact_id}" archive_path = f"{base_dir}/bundle.zip" # Save original archive self.storage.upload_file(bucket, archive_path, zip_content, 'application/zip', upsert=True) # Extract ZIP contents zf = zipfile.ZipFile(io.BytesIO(zip_content)) entries = [] file_paths = {} for entry in zf.filelist: if entry.is_dir(): continue entry_content = zf.read(entry) entry_filename = entry.filename rel_path = f"{base_dir}/{entry_filename}" # Determine MIME type if entry_filename.endswith('.json'): mime = 'application/json' file_paths['json'] = rel_path elif entry_filename.endswith('.html'): mime = 'text/html' file_paths['html'] = rel_path elif entry_filename.endswith('.md'): mime = 'text/markdown' file_paths['md'] = rel_path elif entry_filename.endswith('.txt'): mime = 'text/plain' file_paths['text'] = rel_path elif entry_filename.endswith('.doctags'): mime = 'application/json' file_paths['doctags'] = rel_path else: mime = 'application/octet-stream' # Upload file self.storage.upload_file(bucket, rel_path, entry_content, mime, upsert=True) entries.append({ 'filename': entry_filename, 'rel_path': rel_path, 'mime_type': mime, 'size': len(entry_content) }) logger.debug(f"📄 ZIP PROCESSING: Extracted {entry_filename} -> {rel_path}") zf.close() # Create bundle manifest manifest = { 'bundle_id': artefact_id, 'file_id': file_id, 'bundle_type': 'docling_bundle', 'processing_mode': 'whole_document', 'created_at': time.time(), 'archive_path': archive_path, 'entries': entries, 'file_paths': file_paths, 'metadata': bundle_metadata, 'config': task_config } manifest_path = f"{base_dir}/manifest.json" manifest_content = json.dumps(manifest, indent=2).encode('utf-8') self.storage.upload_file(bucket, manifest_path, manifest_content, 'application/json', upsert=True) # Create database artefact with bundle metadata artefact_extra = { **bundle_metadata, 'manifest': manifest_path, 'archive_path': archive_path, 'file_paths': file_paths, 'entry_count': len(entries), 'group_pack_type': 'whole' # Add proper pack type for whole document bundles } self.client.supabase.table('document_artefacts').insert({ 'id': artefact_id, 'file_id': file_id, 'page_number': 0, # Whole document 'type': 'docling_bundle', 'rel_path': base_dir, 'size_tag': json.dumps(task_config), 'language': 'en', 'chunk_index': None, 'extra': artefact_extra }).execute() logger.info(f"✅ ZIP PROCESSING: Created bundle artefact {artefact_id} with {len(entries)} files for file {file_id}") return { 'artefact_id': artefact_id, 'rel_path': base_dir, 'manifest_path': manifest_path, 'archive_path': archive_path, 'file_paths': file_paths, 'entry_count': len(entries), 'bundle_metadata': bundle_metadata } def process_docling_bundle_split_task(self, task: QueueTask) -> Dict[str, Any]: """ Process split docling bundle task (multi-unit processing). This creates multiple sub-bundles and a master manifest based on processing mode. """ file_id = task.file_id payload = task.payload logger.info(f"Processing docling bundle split task for file {file_id}") try: processing_mode = payload.get('processing_mode', 'split_by_sections') processing_data = payload.get('processing_data', {}) config = payload.get('config', {}) bundle_metadata = payload.get('bundle_metadata', {}) logger.info(f"Split bundle processing mode: {processing_mode}") if processing_mode == 'split_by_pages': return self._process_split_by_pages(task, processing_data, config, bundle_metadata) elif processing_mode == 'split_by_sections': return self._process_split_by_sections(task, processing_data, config, bundle_metadata) elif processing_mode == 'split_by_chunks': return self._process_split_by_chunks(task, processing_data, config, bundle_metadata) else: raise ValueError(f"Unknown processing mode: {processing_mode}") except Exception as e: logger.error(f"Docling bundle split processing failed for file {file_id}: {e}") raise def _process_split_by_pages(self, task: QueueTask, processing_data: dict, config: dict, bundle_metadata: dict) -> Dict[str, Any]: """Process document by individual pages and create page bundles.""" file_id = task.file_id payload = task.payload bucket = payload['bucket'] file_path = payload['file_path'] cabinet_id = payload['cabinet_id'] mime_type = payload['mime_type'] pages = processing_data.get('pages', []) logger.info(f"Processing {len(pages)} individual pages for file {file_id}") # Create master bundle directory master_bundle_id = str(uuid.uuid4()) master_dir = f"{cabinet_id}/{file_id}/{master_bundle_id}" page_bundles = [] # Process each page as a separate bundle for idx, page_num in enumerate(pages, 1): try: page_config = { **config, 'page_range': [page_num, page_num], 'target_type': 'zip', 'to_formats': ['json', 'html', 'text', 'md', 'doctags'] } # Create descriptive page title and enhanced metadata page_title = f"Page {page_num}" page_display_name = f"Page {page_num}" # Create individual page task with enhanced labeling page_task = QueueTask( id=f"{task.id}_page_{page_num}", file_id=file_id, service=task.service, task_type='canonical_docling_json', payload={ **payload, 'config': page_config, 'artefact_extra': { 'page_number': page_num, 'page_title': page_title, 'display_name': page_display_name, 'split_order': idx, # Sequential order within this bundle 'split_total': len(pages), 'split_heading': page_title, 'section_title': page_title, # For consistency 'is_page_bundle': True, 'master_bundle_id': master_bundle_id, 'bundle_label': f"Page {page_num} Bundle", **bundle_metadata } }, priority=task.priority, timeout=1800, created_at=task.created_at ) # Process page bundle page_result = self._process_docling_task(page_task) page_bundles.append({ 'page_number': page_num, 'page_title': page_title, 'display_name': page_display_name, 'split_order': idx, 'artefact_id': page_result.get('artefact_id'), 'rel_path': page_result.get('rel_path') }) except Exception as e: logger.warning(f"Failed to process page {page_num} for file {file_id}: {e}") continue # Sort page bundles by page number for consistent ordering page_bundles.sort(key=lambda x: x['page_number']) # Create enhanced master manifest with proper organization metadata master_manifest = { 'file_id': file_id, 'bundle_type': 'docling_bundle_split', 'split_mode': 'split_by_pages', 'total_pages': len(pages), 'successful_pages': len(page_bundles), 'page_bundles': page_bundles, 'created_at': 'now()', 'display_name': f"Document Pages ({len(page_bundles)} pages)", 'organization': { 'type': 'pages', 'sort_field': 'page_number', 'sort_order': 'asc', 'grouping': 'individual_pages' }, **bundle_metadata } # Store master manifest manifest_path = f"{master_dir}/master_manifest.json" manifest_json = json.dumps(master_manifest, ensure_ascii=False, indent=2) self.storage.upload_file(bucket, manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True) # Create master bundle artefact self.client.supabase.table('document_artefacts').insert({ 'id': master_bundle_id, 'file_id': file_id, 'type': 'docling_bundle_split_pages', 'rel_path': master_dir, 'extra': { 'manifest': manifest_path, 'split_mode': 'split_by_pages', 'total_pages': len(pages), 'successful_pages': len(page_bundles), 'group_pack_type': 'split_pages', # Add proper pack type for split page bundles **bundle_metadata }, 'status': 'completed' }).execute() logger.info(f"Created page-based split bundle for file {file_id}: {len(page_bundles)} pages") return { 'master_bundle_id': master_bundle_id, 'pages_processed': len(page_bundles), 'total_pages': len(pages) } def _process_split_by_sections(self, task: QueueTask, processing_data: dict, config: dict, bundle_metadata: dict) -> Dict[str, Any]: """Process document by sections and create section bundles.""" file_id = task.file_id payload = task.payload bucket = payload['bucket'] cabinet_id = payload['cabinet_id'] entries = processing_data.get('entries', []) logger.info(f"Processing {len(entries)} sections for file {file_id}") # Create master bundle directory master_bundle_id = str(uuid.uuid4()) master_dir = f"{cabinet_id}/{file_id}/{master_bundle_id}" section_bundles = [] # Process each section as a separate bundle logger.info(f"Processing {len(entries)} sections for file {file_id}") for i, entry in enumerate(entries, 1): try: start_page = entry.get('start_page', 1) end_page = entry.get('end_page', start_page) # Enhanced section title handling with fallbacks and smart naming raw_title = entry.get('title') or entry.get('label') or entry.get('heading') section_title = raw_title.strip() if raw_title else f'Section {i}' # Create enhanced display names for better organization page_range_str = f"p{start_page}" if start_page == end_page else f"p{start_page}-{end_page}" display_name = f"{i:02d}. {section_title}" if raw_title else f"{i:02d}. Section {i} ({page_range_str})" bundle_label = f"{section_title} Bundle" # Validate page ranges if start_page < 1: raise ValueError(f"Invalid start_page: {start_page} (must be >= 1)") if end_page < start_page: raise ValueError(f"Invalid page range: {start_page}-{end_page} (end < start)") if start_page > 999 or end_page > 999: raise ValueError(f"Suspicious page range: {start_page}-{end_page} (too high, possible corruption)") logger.info(f"Processing section {i}/{len(entries)}: '{display_name}' (pages {start_page}-{end_page})") section_config = { **config, 'page_range': [start_page, end_page], 'target_type': 'zip', 'to_formats': ['json', 'html', 'text', 'md', 'doctags'] } # Create section task with enhanced metadata and labeling section_task = QueueTask( id=f"{task.id}_section_{i}", file_id=file_id, service=task.service, task_type='canonical_docling_json', payload={ **payload, 'config': section_config, 'artefact_extra': { 'section_number': i, 'section_title': section_title, 'display_name': display_name, 'bundle_label': bundle_label, 'start_page': start_page, 'end_page': end_page, 'page_range': [start_page, end_page], 'page_count': end_page - start_page + 1, 'split_order': i, # Preserved ordering from split map 'split_total': len(entries), 'split_heading': section_title, 'is_section_bundle': True, 'master_bundle_id': master_bundle_id, **bundle_metadata } }, priority=task.priority, timeout=3600, created_at=task.created_at ) # Process section bundle section_result = self._process_docling_task(section_task) section_bundles.append({ 'section_number': i, 'section_title': section_title, 'display_name': display_name, 'bundle_label': bundle_label, 'page_range': [start_page, end_page], 'page_count': end_page - start_page + 1, 'split_order': i, 'artefact_id': section_result.get('artefact_id'), 'rel_path': section_result.get('rel_path') }) except Exception as e: logger.error(f"FATAL: Failed to process section {i} for file {file_id}: {e}") logger.error(f"Section details: title='{section_title}', pages={start_page}-{end_page}") # Don't continue - fail the entire task if any section fails raise Exception(f"Section processing failed for section {i} ('{section_title}', pages {start_page}-{end_page}): {e}") # Sort section bundles by split_order for consistent ordering section_bundles.sort(key=lambda x: x['split_order']) # Create enhanced master manifest with proper organization metadata master_manifest = { 'file_id': file_id, 'bundle_type': 'docling_bundle_split', 'split_mode': 'split_by_sections', 'total_sections': len(entries), 'successful_sections': len(section_bundles), 'section_bundles': section_bundles, 'created_at': 'now()', 'display_name': f"Document Sections ({len(section_bundles)} sections)", 'organization': { 'type': 'sections', 'sort_field': 'split_order', 'sort_order': 'asc', 'grouping': 'split_map_sections', 'has_titles': True, 'ordering_preserved': True }, **bundle_metadata } # Store master manifest manifest_path = f"{master_dir}/master_manifest.json" manifest_json = json.dumps(master_manifest, ensure_ascii=False, indent=2) self.storage.upload_file(bucket, manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True) # Create master bundle artefact self.client.supabase.table('document_artefacts').insert({ 'id': master_bundle_id, 'file_id': file_id, 'type': 'docling_bundle_split_sections', 'rel_path': master_dir, 'extra': { 'manifest': manifest_path, 'split_mode': 'split_by_sections', 'total_sections': len(entries), 'successful_sections': len(section_bundles), 'group_pack_type': 'split_sections', # Add proper pack type for split section bundles **bundle_metadata }, 'status': 'completed' }).execute() logger.info(f"Created section-based split bundle for file {file_id}: {len(section_bundles)} sections") return { 'master_bundle_id': master_bundle_id, 'sections_processed': len(section_bundles), 'total_sections': len(entries) } def _process_split_by_chunks(self, task: QueueTask, processing_data: dict, config: dict, bundle_metadata: dict) -> Dict[str, Any]: """Process document by chunks and create chunk bundles.""" # Very similar to _process_split_by_sections but with chunk-specific labeling file_id = task.file_id payload = task.payload bucket = payload['bucket'] cabinet_id = payload['cabinet_id'] chunks = processing_data.get('entries', []) logger.info(f"Processing {len(chunks)} chunks for file {file_id}") # Create master bundle directory master_bundle_id = str(uuid.uuid4()) master_dir = f"{cabinet_id}/{file_id}/{master_bundle_id}" chunk_bundles = [] # Process each chunk as a separate bundle for i, chunk in enumerate(chunks, 1): try: start_page = chunk['start'] end_page = chunk['end'] # Enhanced chunk title handling raw_title = chunk.get('title', f'Chunk {i}') chunk_title = raw_title.strip() if raw_title else f'Chunk {i}' # Create enhanced display names for chunks page_range_str = f"p{start_page}" if start_page == end_page else f"p{start_page}-{end_page}" display_name = f"{i:02d}. {chunk_title} ({page_range_str})" bundle_label = f"{chunk_title} Bundle" chunk_config = { **config, 'page_range': [start_page, end_page], 'target_type': 'zip', 'to_formats': ['json', 'html', 'text', 'md', 'doctags'] } # Create chunk task with enhanced labeling chunk_task = QueueTask( id=f"{task.id}_chunk_{i}", file_id=file_id, service=task.service, task_type='canonical_docling_json', payload={ **payload, 'config': chunk_config, 'artefact_extra': { 'chunk_number': i, 'chunk_title': chunk_title, 'display_name': display_name, 'bundle_label': bundle_label, 'start_page': start_page, 'end_page': end_page, 'page_range': [start_page, end_page], 'page_count': end_page - start_page + 1, 'split_order': i, 'split_total': len(chunks), 'split_heading': chunk_title, 'is_chunk_bundle': True, 'master_bundle_id': master_bundle_id, **bundle_metadata } }, priority=task.priority, timeout=3600, created_at=task.created_at ) # Process chunk bundle chunk_result = self._process_docling_task(chunk_task) chunk_bundles.append({ 'chunk_number': i, 'chunk_title': chunk_title, 'display_name': display_name, 'bundle_label': bundle_label, 'page_range': [start_page, end_page], 'page_count': end_page - start_page + 1, 'split_order': i, 'artefact_id': chunk_result.get('artefact_id'), 'rel_path': chunk_result.get('rel_path') }) except Exception as e: logger.warning(f"Failed to process chunk {i} for file {file_id}: {e}") continue # Create master manifest master_manifest = { 'file_id': file_id, 'bundle_type': 'docling_bundle_split', 'split_mode': 'split_by_chunks', 'total_chunks': len(chunks), 'successful_chunks': len(chunk_bundles), 'chunk_bundles': chunk_bundles, 'created_at': 'now()', **bundle_metadata } # Store master manifest manifest_path = f"{master_dir}/master_manifest.json" manifest_json = json.dumps(master_manifest, ensure_ascii=False, indent=2) self.storage.upload_file(bucket, manifest_path, manifest_json.encode('utf-8'), 'application/json', upsert=True) # Create master bundle artefact self.client.supabase.table('document_artefacts').insert({ 'id': master_bundle_id, 'file_id': file_id, 'type': 'docling_bundle_split_chunks', 'rel_path': master_dir, 'extra': { 'manifest': manifest_path, 'split_mode': 'split_by_chunks', 'total_chunks': len(chunks), 'successful_chunks': len(chunk_bundles), 'group_pack_type': 'split_chunks', # Add proper pack type for split chunk bundles **bundle_metadata }, 'status': 'completed' }).execute() logger.info(f"Created chunk-based split bundle for file {file_id}: {len(chunk_bundles)} chunks") return { 'master_bundle_id': master_bundle_id, 'chunks_processed': len(chunk_bundles), 'total_chunks': len(chunks) } # process_phase2_coordinator_task method removed - pipelines now enqueued directly from split_map task # _check_pipeline_group_completion method removed - task dependencies now handle sequential execution # Global processor instance _processor_instance = None def get_processor() -> DocumentTaskProcessor: """Get the global task processor instance.""" global _processor_instance if _processor_instance is None: _processor_instance = DocumentTaskProcessor() return _processor_instance