import os import io from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, BackgroundTasks from typing import Any, Dict, Optional import uuid import re import requests import os import tempfile from pathlib import Path from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str from modules.logger_tool import initialise_logger from modules.database.supabase.utils.client import SupabaseServiceRoleClient from modules.database.supabase.utils.storage import StorageAdmin from modules.document_processor import DocumentProcessor from modules.queue_system import ( enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task, enqueue_document_analysis_task, enqueue_page_images_task, TaskPriority, get_queue, QueueConnectionError ) from fastapi.responses import Response from fastapi import Body router = APIRouter() auth = SupabaseBearer() doc_processor = DocumentProcessor() DEFAULT_BUCKET = os.getenv('DEFAULT_FILES_BUCKET', 'cc.users') # Timeout configurations (in seconds) TIKA_TIMEOUT = int(os.getenv('TIKA_TIMEOUT', '300')) # 5 minutes default DOCLING_FRONTMATTER_TIMEOUT = int(os.getenv('DOCLING_FRONTMATTER_TIMEOUT', '1800')) # 30 minutes default DOCLING_NOOCR_TIMEOUT = int(os.getenv('DOCLING_NOOCR_TIMEOUT', '3600')) # 1 hour default # (Legacy feature flags removed - using new three-phase system) logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True) def _safe_filename(name: str) -> str: base = os.path.basename(name or 'file') return re.sub(r"[^A-Za-z0-9._-]+", "_", base) def _choose_bucket(scope: str, user_id: str, school_id: Optional[str]) -> str: scope = (scope or 'teacher').lower() if scope == 'school' and school_id: return f"cc.institutes.{school_id}.private" # teacher / student fall back to users bucket for now return 'cc.users' @router.post("/files/upload") async def upload_file( cabinet_id: str = Form(...), path: str = Form(...), scope: str = Form('teacher'), school_id: Optional[str] = Form(default=None), file: UploadFile = File(...), payload: Dict[str, Any] = Depends(auth), background_tasks: BackgroundTasks = None ): user_id = payload.get('sub') or payload.get('user_id') if not user_id: raise HTTPException(status_code=401, detail="Invalid token payload") client = SupabaseServiceRoleClient() storage = StorageAdmin() # Determine target bucket by scope bucket = _choose_bucket(scope, user_id, school_id) # Stage DB row to get file_id staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}" name = _safe_filename(path or file.filename) file_bytes = await file.read() insert_res = client.supabase.table('files').insert({ 'cabinet_id': cabinet_id, 'name': name, 'path': staged_path, 'bucket': bucket, 'mime_type': file.content_type, 'uploaded_by': user_id, 'size_bytes': len(file_bytes), 'source': 'classroomcopilot-web' }).execute() if not insert_res.data: raise HTTPException(status_code=500, detail="Failed to create file record") file_row = insert_res.data[0] file_id = file_row['id'] # Final storage path: bucket/cabinet_id/file_id/file final_storage_path = f"{cabinet_id}/{file_id}/{name}" try: storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True) except Exception as e: # cleanup staged row client.supabase.table('files').delete().eq('id', file_id).execute() raise HTTPException(status_code=500, detail=f"Storage upload failed: {str(e)}") # Update DB path to final update_res = client.supabase.table('files').update({ 'path': final_storage_path }).eq('id', file_id).execute() # Kick off initial artefacts generation in background (Tika + Docling frontmatter + no-OCR) try: if background_tasks is not None: logger.info(f"Scheduling initial artefacts generation for file_id={file_id}") background_tasks.add_task(generate_initial_artefacts, file_id, payload) else: logger.info(f"Running initial artefacts generation synchronously for file_id={file_id}") generate_initial_artefacts(file_id, payload) except Exception as e: logger.error(f"Failed to schedule initial artefacts for file_id={file_id}: {e}") return update_res.data @router.get("/files") def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)): client = SupabaseServiceRoleClient() res = client.supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute() return res.data @router.post("/files/{file_id}/move") def move_file(file_id: str, body: Dict[str, Any], payload: Dict[str, Any] = Depends(auth)): client = SupabaseServiceRoleClient() updates = {} if 'cabinet_id' in body: updates['cabinet_id'] = body['cabinet_id'] if 'path' in body: updates['path'] = body['path'] if not updates: raise HTTPException(status_code=400, detail="No changes provided") res = client.supabase.table('files').update(updates).eq('id', file_id).execute() return res.data @router.delete("/files/{file_id}") def delete_file(file_id: str, payload: Dict[str, Any] = Depends(auth)): client = SupabaseServiceRoleClient() res = client.supabase.table('files').delete().eq('id', file_id).execute() return res.data @router.get("/files/{file_id}/artefacts") def list_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)): client = SupabaseServiceRoleClient() res = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).order('created_at', desc=True).execute() return res.data @router.get("/files/{file_id}/viewer-artefacts") def list_viewer_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)): """ Get artefacts organized for UI viewer display, including frontmatter JSON, processing bundles, and analysis data with proper display metadata. """ client = SupabaseServiceRoleClient() # Get all artefacts for the file res = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).order('created_at', desc=True).execute() all_artefacts = res.data or [] # Organize artefacts by category for UI display viewer_artefacts = { 'document_analysis': [], 'processing_bundles': [], 'raw_data': [] } for artefact in all_artefacts: artefact_type = artefact.get('type', '') extra = artefact.get('extra', {}) # Enhanced artefact info for UI display artefact_info = { 'id': artefact['id'], 'type': artefact_type, 'display_name': extra.get('display_name'), 'bundle_label': extra.get('bundle_label'), 'section_title': extra.get('section_title'), 'page_range': extra.get('page_range'), 'page_count': extra.get('page_count'), 'pipeline': extra.get('pipeline'), 'processing_mode': extra.get('processing_mode'), 'ui_order': extra.get('ui_order', 999), 'description': extra.get('description'), 'viewer_type': extra.get('viewer_type', 'json'), 'created_at': artefact['created_at'], 'status': artefact.get('status', 'unknown') } # Categorize artefacts for UI organization if artefact_type == 'docling_frontmatter_json': artefact_info.update({ 'display_name': artefact_info['display_name'] or 'Document Frontmatter', 'bundle_label': artefact_info['bundle_label'] or 'Frontmatter Analysis', 'description': artefact_info['description'] or 'OCR analysis of document structure and metadata', 'ui_order': 1, 'viewer_type': 'json' }) viewer_artefacts['document_analysis'].append(artefact_info) elif artefact_type == 'split_map_json': artefact_info.update({ 'display_name': 'Document Structure Map', 'bundle_label': 'Split Map', 'description': 'Document section boundaries and organization structure', 'ui_order': 2, 'viewer_type': 'json' }) viewer_artefacts['document_analysis'].append(artefact_info) elif artefact_type == 'tika_json': artefact_info.update({ 'display_name': 'Document Metadata', 'bundle_label': 'Tika Analysis', 'description': 'Raw document metadata and properties extracted by Apache Tika', 'ui_order': 3, 'viewer_type': 'json' }) viewer_artefacts['raw_data'].append(artefact_info) elif artefact_type in ['canonical_docling_json', 'docling_bundle_split', 'docling_bundle', 'docling_standard', 'docling_bundle_split_pages']: # Processing bundles (OCR, No-OCR, VLM) - use original_pipeline for proper differentiation pipeline_name = extra.get('original_pipeline', extra.get('pipeline', 'Unknown')) bundle_label = artefact_info['bundle_label'] or f"{pipeline_name.upper().replace('_', '-')} Bundle" display_name = artefact_info['display_name'] or f"{pipeline_name.upper().replace('_', '-')} Processing Result" # Special handling for master manifests if artefact_type == 'docling_bundle_split_pages': display_name = f"{pipeline_name.upper().replace('_', '-')} Document Pages" bundle_label = f"{pipeline_name.upper().replace('_', '-')} Pages Bundle" artefact_info.update({ 'viewer_type': 'bundle_collection', 'is_master_manifest': True, 'ui_order': 10 # Show master manifests before individual pages }) elif artefact_type == 'docling_standard': # Individual page bundles - lower UI priority artefact_info.update({ 'viewer_type': 'page_bundle', 'is_individual_page': True, 'ui_order': extra.get('split_order', 999) + 100 # Show after master manifests }) artefact_info.update({ 'display_name': display_name, 'bundle_label': bundle_label, 'description': f"Docling processing result using {pipeline_name.replace('_', '-')} pipeline", 'pipeline_type': pipeline_name # Add explicit pipeline type for UI }) viewer_artefacts['processing_bundles'].append(artefact_info) elif artefact_type.startswith('docling_') and artefact_type.endswith('_json'): # Other docling JSON results pipeline_name = artefact_type.replace('docling_', '').replace('_json', '').upper() artefact_info.update({ 'display_name': f"{pipeline_name} Analysis", 'bundle_label': f"{pipeline_name} Result", 'description': f"Docling {pipeline_name.lower()} processing result", 'viewer_type': 'json' }) viewer_artefacts['processing_bundles'].append(artefact_info) elif artefact_type == 'page_images': artefact_info.update({ 'display_name': 'Page Images', 'bundle_label': 'Visual Pages', 'description': 'Generated page images for document visualization', 'viewer_type': 'images' }) viewer_artefacts['raw_data'].append(artefact_info) # Sort each category by ui_order for category in viewer_artefacts.values(): category.sort(key=lambda x: (x['ui_order'], x['created_at'])) return { 'file_id': file_id, 'categories': viewer_artefacts, 'total_artefacts': len(all_artefacts) } @router.post("/files/{file_id}/artefacts/initial") def generate_initial_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)): """ Generate initial artefacts using the new three-phase pipeline architecture. Phase 1: Document Structure Discovery & Analysis - Tika metadata extraction - Page images generation - Document structure analysis (LLM-enhanced) - Split map generation Phase 2: Triggered automatically after Phase 1 completion """ logger.info(f"Three-phase pipeline: Starting Phase 1 for file_id={file_id}") from modules.pipeline_controller import get_pipeline_controller client = SupabaseServiceRoleClient() storage = StorageAdmin() controller = get_pipeline_controller() # Load file row fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute() file_row = fr.data if not file_row: raise HTTPException(status_code=404, detail="File not found") bucket = file_row['bucket'] storage_path = file_row['path'] cabinet_id = file_row['cabinet_id'] mime = file_row.get('mime_type') or 'application/octet-stream' filename = file_row.get('name', 'file') # Step 1: Convert to PDF if not already a PDF (synchronous for now) processing_path = storage_path processing_mime = mime if mime != 'application/pdf': logger.info(f"Converting non-PDF file to PDF: file_id={file_id} mime={mime}") try: file_bytes = storage.download_file(bucket, storage_path) with tempfile.TemporaryDirectory() as temp_dir: # Save original file to temp location temp_input = Path(temp_dir) / filename with open(temp_input, 'wb') as f: f.write(file_bytes) # Convert to PDF pdf_bytes = doc_processor.convert_to_pdf(temp_input) # Store PDF as artefact pdf_artefact_id = str(uuid.uuid4()) pdf_rel_path = f"{cabinet_id}/{file_id}/{pdf_artefact_id}/document.pdf" storage.upload_file(bucket, pdf_rel_path, pdf_bytes, 'application/pdf', upsert=True) pdf_ar = client.supabase.table('document_artefacts').insert({ 'file_id': file_id, 'type': 'document_pdf', 'rel_path': pdf_rel_path, 'extra': {'converted_from': mime, 'original_filename': filename}, 'status': 'completed' }).execute() # Use converted PDF for subsequent processing processing_path = pdf_rel_path processing_mime = 'application/pdf' logger.info(f"PDF conversion: completed file_id={file_id} rel_path={pdf_rel_path}") except Exception as e: logger.error(f"PDF conversion: error processing file_id={file_id}: {e}") # Continue with original file if conversion fails else: logger.info(f"File is already PDF, skipping conversion: file_id={file_id}") # Step 2: Enqueue Phase 1 tasks using the new pipeline controller user_id = payload.get('sub') or payload.get('user_id') priority = TaskPriority.HIGH if user_id else TaskPriority.NORMAL try: # Update file row with processing path updated_file_row = {**file_row, 'path': processing_path, 'mime_type': processing_mime} # Enqueue Phase 1 tasks phase1_tasks = controller.enqueue_phase1_tasks( file_id=file_id, file_row=updated_file_row, processing_path=processing_path, processing_mime=processing_mime, priority=priority ) total_tasks = sum(len(task_list) for task_list in phase1_tasks.values()) logger.info(f"Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks for file_id={file_id}") return { 'message': f'Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks. Phase 2 will trigger automatically after completion.', 'phase1_tasks': {k: v for k, v in phase1_tasks.items()}, 'file_id': file_id, 'pipeline_mode': 'three_phase', 'bundle_architecture_enabled': True } except QueueConnectionError as e: logger.error(f"Queue system unavailable for file_id={file_id}: {e}") logger.error("Redis is not running. Please start the API server with './start.sh dev' to auto-start Redis.") return { 'message': 'File uploaded successfully, but processing tasks could not be queued (Redis unavailable)', 'file_id': file_id, 'queue_status': 'unavailable', 'error': 'Queue system unavailable. Please restart the API server with Redis enabled.' } except Exception as e: logger.error(f"Unexpected error enqueueing Phase 1 tasks for file_id={file_id}: {e}") return { 'message': 'File uploaded successfully, but processing tasks failed to queue', 'file_id': file_id, 'queue_status': 'failed', 'error': str(e) } @router.get("/files/{file_id}/page-images/manifest") def get_page_images_manifest(file_id: str, payload: Dict[str, Any] = Depends(auth)): """Return the page_images manifest JSON for a file via service-role access.""" client = SupabaseServiceRoleClient() storage = StorageAdmin() # Find file row to get bucket fr = client.supabase.table('files').select('id,bucket,cabinet_id').eq('id', file_id).single().execute() file_row = fr.data or {} if not file_row: raise HTTPException(status_code=404, detail="File not found") bucket = file_row['bucket'] cabinet_id = file_row['cabinet_id'] # Find page_images artefact arts = client.supabase.table('document_artefacts') \ .select('id,type,rel_path,extra') \ .eq('file_id', file_id).eq('type', 'page_images') \ .order('created_at', desc=True).limit(1).execute().data or [] if not arts: raise HTTPException(status_code=404, detail="page_images artefact not found") art = arts[0] # Manifest path manifest_rel_path = (art.get('extra') or {}).get('manifest') or f"{art['rel_path'].rstrip('/')}/page_images.json" try: raw = storage.download_file(bucket, manifest_rel_path) import json as _json manifest = _json.loads(raw.decode('utf-8')) # Ensure bucket and base prefix are present for the UI manifest.setdefault('bucket', bucket) manifest.setdefault('base_dir', art['rel_path']) return manifest except Exception as e: raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}") def json_dumps(obj: Any) -> str: try: import json return json.dumps(obj, ensure_ascii=False) except Exception: return "{}" @router.get("/files/{file_id}/artefacts/{artefact_id}/json") def get_artefact_json(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)): """Return the JSON content of a document artefact using service-role storage access.""" client = SupabaseServiceRoleClient() storage = StorageAdmin() # Look up artefact to get rel_path and validate it belongs to file ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path').eq('id', artefact_id).single().execute() artefact = ar.data if not artefact: raise HTTPException(status_code=404, detail="Artefact not found") if artefact.get('file_id') != file_id: raise HTTPException(status_code=400, detail="Artefact does not belong to file") # Look up file to get bucket fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute() file_row = fr.data if not file_row: raise HTTPException(status_code=404, detail="File not found") bucket = file_row['bucket'] rel_path = artefact['rel_path'] try: raw = storage.download_file(bucket, rel_path) import json as _json data = _json.loads(raw.decode('utf-8')) return data except Exception as e: raise HTTPException(status_code=500, detail=f"Failed to load artefact JSON: {str(e)}") @router.get("/files/{file_id}/artefacts/{artefact_id}/vlm-section-manifest") def get_vlm_section_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)): """Return the VLM section page bundle manifest JSON for a VLM section bundle artefact.""" client = SupabaseServiceRoleClient() storage = StorageAdmin() ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path,type,extra').eq('id', artefact_id).single().execute().data if not ar: raise HTTPException(status_code=404, detail="Artefact not found") if ar.get('file_id') != file_id: raise HTTPException(status_code=400, detail="Artefact does not belong to file") if ar.get('type') != 'vlm_section_page_bundle': raise HTTPException(status_code=400, detail="Artefact is not a VLM section page bundle") fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data if not fr: raise HTTPException(status_code=404, detail="File not found") bucket = fr['bucket'] # The rel_path directly points to the manifest JSON file manifest_rel_path = ar['rel_path'] try: raw = storage.download_file(bucket, manifest_rel_path) import json as _json data = _json.loads(raw.decode('utf-8')) # ensure bucket present for client use data.setdefault('bucket', bucket) return data except Exception as e: raise HTTPException(status_code=500, detail=f"Failed to load VLM section manifest: {e}") @router.post("/files/{file_id}/artefacts/outline") def enqueue_outline_structure(file_id: str, payload: Dict[str, Any] = Depends(auth)): """ Manually enqueue the fast document outline (headings-only) analysis for an existing file. Returns the queued task id. """ client = SupabaseServiceRoleClient() fr = client.supabase.table('files').select('id,bucket,cabinet_id,path,mime_type').eq('id', file_id).single().execute() file_row = fr.data if not file_row: raise HTTPException(status_code=404, detail="File not found") bucket = file_row['bucket'] storage_path = file_row['path'] cabinet_id = file_row['cabinet_id'] mime = file_row.get('mime_type') or 'application/pdf' # Prefer converted PDF artefact if available arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or [] pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None) processing_path = pdf_art['rel_path'] if pdf_art else storage_path try: task_id = enqueue_docling_task( file_id=file_id, task_type='document_structure_analysis', payload={ 'bucket': bucket, 'file_path': processing_path, 'cabinet_id': cabinet_id, 'mime_type': mime, 'config': { 'target_type': 'inbody', 'to_formats': 'json', 'do_ocr': False, 'force_ocr': False } }, priority=TaskPriority.NORMAL, timeout=300 ) return { 'message': 'outline task enqueued', 'task_id': task_id, 'file_id': file_id } except QueueConnectionError as e: raise HTTPException(status_code=503, detail=f"Queue unavailable: {e}") except Exception as e: raise HTTPException(status_code=500, detail=f"Failed to enqueue outline task: {e}") @router.get("/files/proxy") def proxy_storage_file(bucket: str, path: str, payload: Dict[str, Any] = Depends(auth)): """Proxy a storage file (service-role), useful for private image access in the UI.""" storage = StorageAdmin() try: data = storage.download_file(bucket, path) media = 'application/octet-stream' lp = path.lower() if lp.endswith('.png'): media = 'image/png' elif lp.endswith('.webp'): media = 'image/webp' elif lp.endswith('.jpg') or lp.endswith('.jpeg'): media = 'image/jpeg' elif lp.endswith('.json'): media = 'application/json' return Response(content=data, media_type=media) except Exception as e: raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}") # Signed proxy for iframe/img tags without Authorization header @router.get("/files/proxy_signed") def proxy_storage_file_signed(bucket: str, path: str, token: str): """Proxy using a signed bearer token passed as query param 'token'.""" try: payload = verify_supabase_jwt_str(token) if not payload: raise HTTPException(status_code=403, detail="Invalid token") except Exception as e: raise HTTPException(status_code=403, detail=f"Invalid token: {e}") storage = StorageAdmin() try: data = storage.download_file(bucket, path) media = 'application/octet-stream' lp = path.lower() if lp.endswith('.png'): media = 'image/png' elif lp.endswith('.webp'): media = 'image/webp' elif lp.endswith('.jpg') or lp.endswith('.jpeg'): media = 'image/jpeg' elif lp.endswith('.json'): media = 'application/json' return Response(content=data, media_type=media) except Exception as e: raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}") # -------- Canonical bundle manifest --------- @router.get("/files/{file_id}/artefacts/{artefact_id}/manifest") def get_canonical_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)): """Return the manifest.json for a canonical_docling_bundle artefact.""" client = SupabaseServiceRoleClient() storage = StorageAdmin() ar = client.supabase.table('document_artefacts').select('id,file_id,rel_path,extra').eq('id', artefact_id).single().execute().data if not ar: raise HTTPException(status_code=404, detail="Artefact not found") if ar.get('file_id') != file_id: raise HTTPException(status_code=400, detail="Artefact does not belong to file") extra = ar.get('extra') or {} manifest_rel_path = extra.get('manifest') if not manifest_rel_path: raise HTTPException(status_code=404, detail="Manifest path not recorded on artefact") fr = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data if not fr: raise HTTPException(status_code=404, detail="File not found") bucket = fr['bucket'] try: raw = storage.download_file(bucket, manifest_rel_path) import json as _json data = _json.loads(raw.decode('utf-8')) # ensure bucket present for client use data.setdefault('bucket', bucket) return data except Exception as e: raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}") # -------- Canonical Docling generation --------- def _load_split_map(client: SupabaseServiceRoleClient, storage: StorageAdmin, bucket: str, file_id: str) -> Optional[Dict[str, Any]]: try: arts = client.supabase.table('document_artefacts') \ .select('id,type,rel_path') \ .eq('file_id', file_id).eq('type', 'split_map_json') \ .order('created_at', desc=True).limit(1).execute().data or [] if not arts: return None art = arts[0] raw = storage.download_file(bucket, art['rel_path']) import json as _json return _json.loads(raw.decode('utf-8')) except Exception: return None @router.post("/files/{file_id}/artefacts/canonical-docling") def enqueue_canonical_docling( file_id: str, body: Dict[str, Any] = Body(default={}), payload: Dict[str, Any] = Depends(auth) ): """Enqueue generation of canonical Docling JSON(s) for a file. If a split_map is available and the document is large, this will enqueue multiple Docling jobs using page ranges per section. Otherwise a single job is created for the whole document. """ client = SupabaseServiceRoleClient() storage = StorageAdmin() fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute() file_row = fr.data if not file_row: raise HTTPException(status_code=404, detail="File not found") bucket = file_row['bucket'] cabinet_id = file_row['cabinet_id'] mime = file_row.get('mime_type') or 'application/pdf' storage_path = file_row['path'] # Prefer converted PDF if available try: arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or [] a_pdf = next((a for a in arts if a.get('type') == 'document_pdf'), None) processing_path = a_pdf['rel_path'] if a_pdf else storage_path processing_mime = 'application/pdf' if a_pdf else mime except Exception: processing_path = storage_path processing_mime = mime # Determine page_count (prefer Tika; fallback to PDF parser if needed) page_count = None try: arts_pc = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).execute().data or [] a_tika_pc = next((a for a in arts_pc if a.get('type') == 'tika_json'), None) if a_tika_pc: raw = storage.download_file(bucket, a_tika_pc['rel_path']) import json as _json tj = _json.loads(raw.decode('utf-8')) for k in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount"): v = tj.get(k) or tj.get(k.lower()) if v is not None: page_count = int(v) break except Exception as e: logger.debug(f"[canonical-docling] Tika page_count read failed: {e}") pass # Fallback: compute page_count from PDF if Tika did not provide it if page_count is None: try: pdf_bytes = storage.download_file(bucket, processing_path) try: import fitz # PyMuPDF doc = fitz.open(stream=pdf_bytes, filetype='pdf') page_count = int(doc.page_count) doc.close() logger.info(f"[canonical-docling] page_count via PyMuPDF: {page_count}") except Exception: try: from PyPDF2 import PdfReader reader = PdfReader(io.BytesIO(pdf_bytes)) page_count = int(len(reader.pages)) logger.info(f"[canonical-docling] page_count via PyPDF2: {page_count}") except Exception: page_count = None except Exception: page_count = None else: logger.info(f"[canonical-docling] page_count via Tika: {page_count}") # Optional custom range from caller custom_range = body.get('custom_range') custom_label = body.get('custom_label') or '' selected_section_id = body.get('selected_section_id') selected_section_title = body.get('selected_section_title') # Load split map if requested and document is large enough use_split_requested = bool(body.get('use_split_map', True)) split_threshold = int(body.get('threshold') or os.getenv('DOCLING_SPLIT_THRESHOLD', '50')) ranges = [] # list of (start,end) split_map = None sections = [] # list of dicts: {start,end,title} logger.info(f"[canonical-docling] use_split_map={use_split_requested} threshold={split_threshold} page_count={page_count}") # If custom range provided, honor it and bypass split map if isinstance(custom_range, list) and len(custom_range) >= 2: try: cs = int(custom_range[0]); ce = int(custom_range[1]) if page_count is not None: cs = max(1, min(cs, page_count)) ce = max(cs, min(ce, page_count)) ranges = [(cs, ce)] sections = [{'start': cs, 'end': ce, 'title': custom_label or 'Custom range'}] use_split_requested = False logger.info(f"[canonical-docling] using custom_range start={cs} end={ce} label='{custom_label}'") except Exception as _e: logger.warning(f"[canonical-docling] invalid custom_range; falling back. err={_e}") if not ranges and use_split_requested and (page_count is None or page_count >= split_threshold): split_map = _load_split_map(client, storage, bucket, file_id) entries = (split_map or {}).get('entries') if split_map else [] logger.info(f"[canonical-docling] split_map loaded entries={len(entries) if isinstance(entries, list) else 0}") if split_map and isinstance(entries, list) and len(entries) > 0: # Normalize and sort entries by start_page to enforce correct order norm: list[dict] = [] for e in entries: try: s = int(e.get('start_page', 1)) t = int(e.get('end_page', s)) if t < s: t = s title = e.get('title') or e.get('label') or '' norm.append({'start': s, 'end': t, 'title': title}) except Exception: continue norm.sort(key=lambda x: x['start']) # Deduplicate identical or overlapping starts by keeping the earliest occurrence ordered: list[dict] = [] last_end = 0 for e in norm: s, t = int(e['start']), int(e['end']) if ordered and s <= last_end: # Clamp to prevent inversion and maintain order s = last_end + 1 if s > (page_count or s): continue if t < s: t = s last_end = max(last_end, t) ordered.append({'start': s, 'end': t, 'title': e['title']}) for e in ordered: ranges.append((e['start'], e['end'])) sections.append(e) # Fallback: if no split_map ranges... we shouldn't be here if not ranges: # If document is large, split into fixed windows to protect Docling server if page_count is not None and page_count >= split_threshold: chunk = int(os.getenv('DOCLING_FALLBACK_CHUNK_PAGES', '25')) chunk = max(5, min(100, chunk)) for i in range(1, (page_count or 1) + 1, chunk): end = min(i + chunk - 1, page_count or i) ranges.append((i, end)) sections.append({'start': i, 'end': end, 'title': f"Pages {i}-{end}"}) logger.warning(f"[canonical-docling] using fallback chunking ranges={len(ranges)} chunk={chunk}") else: ranges = [(1, page_count or 9223372036854775807)] logger.warning(f"[canonical-docling] using single-range fallback (small doc)") # Build config cfg = body.get('config', {}) pipeline = cfg.get('pipeline', 'standard') config: Dict[str, Any] = { # target_type is computed in processor based on to_formats unless explicitly provided by user 'to_formats': cfg.get('to_formats', 'json'), 'do_ocr': bool(cfg.get('do_ocr', True)), 'force_ocr': bool(cfg.get('force_ocr', False)), 'image_export_mode': cfg.get('image_export_mode', 'embedded'), 'ocr_engine': cfg.get('ocr_engine', 'easyocr'), 'ocr_lang': cfg.get('ocr_lang', 'en'), 'pdf_backend': cfg.get('pdf_backend', 'dlparse_v4'), 'table_mode': cfg.get('table_mode', 'fast'), 'pipeline': pipeline, 'do_picture_classification': bool(cfg.get('do_picture_classification', False)), 'do_picture_description': bool(cfg.get('do_picture_description', False)), } # If user explicitly set target_type, pass it through if 'target_type' in cfg: config['target_type'] = cfg['target_type'] # Optional VLM settings (only include API fields if provided as JSON by caller) if config['do_picture_description']: pd_api = cfg.get('picture_description_api') if isinstance(pd_api, (dict, list)): config['picture_description_api'] = pd_api elif isinstance(pd_api, str) and pd_api.strip().startswith(('{', '[')): config['picture_description_api'] = pd_api if cfg.get('picture_description_prompt'): config['picture_description_prompt'] = cfg['picture_description_prompt'] if pipeline == 'vlm': # Provider presets mapping provider = (cfg.get('vlm_provider') or '').strip().lower() provider_model = (cfg.get('vlm_provider_model') or '').strip() provider_base = (cfg.get('vlm_provider_base_url') or '').strip() if provider in ('ollama', 'openai') and provider_model: if provider == 'ollama': base_url = provider_base or os.getenv('OLLAMA_BASE_URL') or os.getenv('VLM_OLLAMA_BASE_URL') if base_url: endpoint = f"{base_url.rstrip('/')}/v1/chat/completions" # Use OpenAI provider schema against Ollama's OpenAI-compatible endpoint cfg_api = { 'provider': 'openai', 'url': endpoint, 'model': provider_model, 'response_format': 'markdown', 'request_params': {'model': provider_model} } logger.info(f"[canonical-docling] VLM provider=ollama mapped to openai-compatible url={endpoint} model={provider_model}") config['vlm_pipeline_model_api'] = cfg_api # Also wire picture_description_api if picture description is enabled if config.get('do_picture_description'): config['picture_description_api'] = { 'url': endpoint, 'headers': {}, 'params': {'model': provider_model} } elif provider == 'openai': base_url = provider_base or os.getenv('OPENAI_BASE_URL') or 'https://api.openai.com/v1' api_key = os.getenv('OPENAI_API_KEY') or os.getenv('OPENAI_API_KEY_READONLY') # Do not inline key if not present; server may have default model_cfg: Dict[str, Any] = { 'provider': 'openai', 'url': f"{base_url.rstrip('/')}/chat/completions", 'model': provider_model, 'response_format': 'markdown', 'request_params': {'model': provider_model} } if api_key: model_cfg['api_key'] = api_key # Also pass explicit Authorization header for servers that expect it model_cfg['headers'] = { 'Authorization': f"Bearer {api_key}" } logger.info(f"[canonical-docling] VLM provider=openai url={model_cfg['url']} model={provider_model} api_key={'yes' if api_key else 'no'}") config['vlm_pipeline_model_api'] = model_cfg # Also wire picture_description_api if picture description is enabled if config.get('do_picture_description'): headers = {'Authorization': f"Bearer {api_key}"} if api_key else {} config['picture_description_api'] = { 'url': f"{base_url.rstrip('/')}/chat/completions", 'headers': headers, 'params': {'model': provider_model} } else: # Pass through explicit API/local JSON if provided by caller vpa = cfg.get('vlm_pipeline_model_api') if isinstance(vpa, (dict, list)): config['vlm_pipeline_model_api'] = vpa elif isinstance(vpa, str) and vpa.strip().startswith(('{', '[')): config['vlm_pipeline_model_api'] = vpa # Enqueue tasks for each range priority = TaskPriority.HIGH task_ids = [] multi = len(ranges) > 1 logger.info(f"[canonical-docling] final ranges={len(ranges)} multi={multi} pipeline={pipeline} producer={body.get('producer', 'manual')}") # Create a group id for split bundles (used for UI grouping) # Use provided group_id if present (for two-pass auto system), otherwise generate new group_id = body.get('group_id') or (str(uuid.uuid4()) if multi else None) if multi and not sections: # Build sections from ranges if titles were not captured for (start, end) in ranges: sections.append({'start': int(start), 'end': int(end), 'title': ''}) idx = 0 for (start, end) in ranges: # Locate title for this range if available title = '' if multi and sections and idx < len(sections): title = sections[idx].get('title') or '' idx += 1 cfg_range = dict(config) # Ensure 1-based inclusive range is passed through cfg_range['page_range'] = [max(1, int(start)), max(int(start), int(end))] extra = { 'is_subdoc': multi, 'page_range': [int(start), int(end)], 'label': (title or f"subdoc p{int(start)}-{int(end)}") if multi else 'canonical' } # Attach selected section metadata if provided by caller if selected_section_id: extra['selected_section_id'] = selected_section_id if selected_section_title or custom_label: extra['selected_section_title'] = selected_section_title or custom_label # For split processing, force split bundle artefact type and add grouping/order metadata if multi: extra.update({ # UI grouping metadata 'split_order': idx, 'split_heading': title, 'split_total': len(ranges) }) if group_id: extra['group_id'] = group_id extra['group_pack_type'] = 'docling_standard_auto_split' else: # Single-bundle case: allow caller to override type (defaults to canonical bundle) if 'artefact_type_override' in body and body.get('artefact_type_override'): extra['artefact_type_override'] = body.get('artefact_type_override') # Mark producer and selection metadata extra['producer'] = body.get('producer') or ('auto_split' if (multi and body.get('use_split_map')) else 'manual') if selected_section_id: extra['selected_section_id'] = selected_section_id if selected_section_title or custom_label: extra['selected_section_title'] = selected_section_title or custom_label # Enhanced logging for canonical operations if multi: logger.info(f"[canonical-docling] enqueue range idx={idx}/{len(ranges)} start={start} end={end} title='{title}' group_id={group_id} producer={extra.get('producer')} pipeline={pipeline}") else: logger.info(f"[canonical-docling] enqueue single range start={start} end={end} producer={extra.get('producer')} pipeline={pipeline}") tid = enqueue_docling_task( file_id=file_id, task_type='canonical_docling_subdoc_json' if multi else 'canonical_docling_json', payload={ 'bucket': bucket, 'file_path': processing_path, 'cabinet_id': cabinet_id, 'mime_type': processing_mime, 'config': cfg_range, 'artefact_extra': extra, # Ensure canonical tasks respect upstream dependencies (e.g., Frontmatter) 'depends_on': body.get('depends_on', []), # Pass through grouping info if provided by caller (kept for backward-compat) 'group_pack_type': body.get('group_pack_type') }, priority=priority, timeout=int(body.get('timeout', DOCLING_NOOCR_TIMEOUT)) ) task_ids.append(tid) logger.info(f"[canonical-docling] completed enqueue file_id={file_id} tasks={len(task_ids)} ranges={len(ranges)} pipeline={pipeline} producer={body.get('producer','manual')} group_id={group_id if multi else 'single'}") return { 'message': f'enqueued {len(task_ids)} canonical docling job(s)', 'task_ids': task_ids, 'ranges': ranges, 'used_split_map': bool(split_map), 'group_id': group_id, 'pipeline': pipeline, 'producer': body.get('producer', 'manual') }