# (source-listing metadata: 998 lines, 44 KiB, Python)
import os
|
|
import io
|
|
from fastapi import APIRouter, Depends, HTTPException, UploadFile, File, Form, BackgroundTasks
|
|
from typing import Any, Dict, Optional
|
|
import uuid
|
|
import re
|
|
import requests
|
|
import os
|
|
import tempfile
|
|
from pathlib import Path
|
|
from modules.auth.supabase_bearer import SupabaseBearer, verify_supabase_jwt_str
|
|
from modules.logger_tool import initialise_logger
|
|
from modules.database.supabase.utils.client import SupabaseServiceRoleClient
|
|
from modules.database.supabase.utils.storage import StorageAdmin
|
|
from modules.document_processor import DocumentProcessor
|
|
from modules.queue_system import (
|
|
enqueue_tika_task, enqueue_docling_task, enqueue_split_map_task,
|
|
enqueue_document_analysis_task, enqueue_page_images_task,
|
|
TaskPriority, get_queue, QueueConnectionError
|
|
)
|
|
from fastapi.responses import Response
|
|
from fastapi import Body
|
|
|
|
router = APIRouter()
|
|
auth = SupabaseBearer()
|
|
doc_processor = DocumentProcessor()
|
|
|
|
DEFAULT_BUCKET = os.getenv('DEFAULT_FILES_BUCKET', 'cc.users')
|
|
|
|
# Timeout configurations (in seconds)
|
|
TIKA_TIMEOUT = int(os.getenv('TIKA_TIMEOUT', '300')) # 5 minutes default
|
|
DOCLING_FRONTMATTER_TIMEOUT = int(os.getenv('DOCLING_FRONTMATTER_TIMEOUT', '1800')) # 30 minutes default
|
|
DOCLING_NOOCR_TIMEOUT = int(os.getenv('DOCLING_NOOCR_TIMEOUT', '3600')) # 1 hour default
|
|
|
|
# (Legacy feature flags removed - using new three-phase system)
|
|
|
|
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
|
|
|
|
def _safe_filename(name: str) -> str:
|
|
base = os.path.basename(name or 'file')
|
|
return re.sub(r"[^A-Za-z0-9._-]+", "_", base)
|
|
|
|
def _choose_bucket(scope: str, user_id: str, school_id: Optional[str]) -> str:
|
|
scope = (scope or 'teacher').lower()
|
|
if scope == 'school' and school_id:
|
|
return f"cc.institutes.{school_id}.private"
|
|
# teacher / student fall back to users bucket for now
|
|
return 'cc.users'
|
|
|
|
@router.post("/files/upload")
async def upload_file(
    cabinet_id: str = Form(...),
    path: str = Form(...),
    scope: str = Form('teacher'),
    school_id: Optional[str] = Form(default=None),
    file: UploadFile = File(...),
    payload: Dict[str, Any] = Depends(auth),
    background_tasks: BackgroundTasks = None
):
    """Upload a file into a cabinet and kick off initial artefact generation.

    Flow: stage a DB row (to mint a file_id), upload the bytes to storage
    under ``cabinet_id/file_id/name``, point the row at the final path, then
    schedule artefact generation (background task when available, synchronous
    otherwise). Returns the updated file row(s) from the final DB update.
    """
    user_id = payload.get('sub') or payload.get('user_id')
    if not user_id:
        raise HTTPException(status_code=401, detail="Invalid token payload")

    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()

    # Determine target bucket by scope
    bucket = _choose_bucket(scope, user_id, school_id)

    # Stage DB row to get file_id; the path is a temporary staging location
    # until the storage upload succeeds.
    staged_path = f"{cabinet_id}/staging/{uuid.uuid4()}"
    name = _safe_filename(path or file.filename)
    # NOTE(review): entire file is buffered in memory — acceptable for modest
    # uploads; confirm expected max upload size.
    file_bytes = await file.read()
    insert_res = client.supabase.table('files').insert({
        'cabinet_id': cabinet_id,
        'name': name,
        'path': staged_path,
        'bucket': bucket,
        'mime_type': file.content_type,
        'uploaded_by': user_id,
        'size_bytes': len(file_bytes),
        'source': 'classroomcopilot-web'
    }).execute()
    if not insert_res.data:
        raise HTTPException(status_code=500, detail="Failed to create file record")
    file_row = insert_res.data[0]
    file_id = file_row['id']

    # Final storage path: bucket/cabinet_id/file_id/file
    final_storage_path = f"{cabinet_id}/{file_id}/{name}"
    try:
        storage.upload_file(bucket, final_storage_path, file_bytes, file.content_type or 'application/octet-stream', upsert=True)
    except Exception as e:
        # cleanup staged row so a failed upload leaves no orphan record
        client.supabase.table('files').delete().eq('id', file_id).execute()
        raise HTTPException(status_code=500, detail=f"Storage upload failed: {str(e)}")

    # Update DB path to final
    update_res = client.supabase.table('files').update({
        'path': final_storage_path
    }).eq('id', file_id).execute()
    # Kick off initial artefacts generation in background (Tika + Docling frontmatter + no-OCR)
    try:
        if background_tasks is not None:
            logger.info(f"Scheduling initial artefacts generation for file_id={file_id}")
            background_tasks.add_task(generate_initial_artefacts, file_id, payload)
        else:
            logger.info(f"Running initial artefacts generation synchronously for file_id={file_id}")
            generate_initial_artefacts(file_id, payload)
    except Exception as e:
        # Best-effort: the upload itself already succeeded, so only log here.
        logger.error(f"Failed to schedule initial artefacts for file_id={file_id}: {e}")

    return update_res.data
|
|
|
|
@router.get("/files")
def list_files(cabinet_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return every file row belonging to the given cabinet."""
    supabase = SupabaseServiceRoleClient().supabase
    result = supabase.table('files').select('*').eq('cabinet_id', cabinet_id).execute()
    return result.data
|
|
|
|
@router.post("/files/{file_id}/move")
def move_file(file_id: str, body: Dict[str, Any], payload: Dict[str, Any] = Depends(auth)):
    """Move/rename a file: update its cabinet and/or path from the request body.

    Raises 400 when the body carries neither 'cabinet_id' nor 'path'.
    """
    client = SupabaseServiceRoleClient()
    updates = {field: body[field] for field in ('cabinet_id', 'path') if field in body}
    if not updates:
        raise HTTPException(status_code=400, detail="No changes provided")
    result = client.supabase.table('files').update(updates).eq('id', file_id).execute()
    return result.data
|
|
|
|
@router.delete("/files/{file_id}")
def delete_file(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Delete the file row by id and return the deleted row data.

    NOTE(review): only the DB row is removed here; the storage object is
    not deleted — presumably handled elsewhere (trigger/cleanup job); verify.
    """
    service = SupabaseServiceRoleClient()
    deleted = service.supabase.table('files').delete().eq('id', file_id).execute()
    return deleted.data
|
|
|
|
@router.get("/files/{file_id}/artefacts")
def list_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """List all document artefacts for a file, newest first."""
    supabase = SupabaseServiceRoleClient().supabase
    rows = (
        supabase.table('document_artefacts')
        .select('*')
        .eq('file_id', file_id)
        .order('created_at', desc=True)
        .execute()
    )
    return rows.data
|
|
|
|
@router.get("/files/{file_id}/viewer-artefacts")
def list_viewer_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """
    Get artefacts organized for UI viewer display, including frontmatter JSON,
    processing bundles, and analysis data with proper display metadata.

    Returns a dict with 'file_id', 'categories' (lists for document_analysis,
    processing_bundles and raw_data, each sorted by ui_order then created_at)
    and 'total_artefacts'.
    """
    client = SupabaseServiceRoleClient()

    # Get all artefacts for the file
    res = client.supabase.table('document_artefacts').select('*').eq('file_id', file_id).order('created_at', desc=True).execute()
    all_artefacts = res.data or []

    # Organize artefacts by category for UI display
    viewer_artefacts = {
        'document_analysis': [],
        'processing_bundles': [],
        'raw_data': []
    }

    for artefact in all_artefacts:
        # BUG FIX: .get('key', default) still returns None when the DB column
        # is NULL (key present, value None). Use `or` fallbacks like the other
        # endpoints in this file do, so .startswith()/.get() below cannot crash.
        artefact_type = artefact.get('type') or ''
        extra = artefact.get('extra') or {}

        # A NULL ui_order must not defeat the default: the sort key at the end
        # compares these values, so keep them numeric (0 remains valid).
        raw_ui_order = extra.get('ui_order')
        ui_order = 999 if raw_ui_order is None else raw_ui_order

        # Enhanced artefact info for UI display
        artefact_info = {
            'id': artefact['id'],
            'type': artefact_type,
            'display_name': extra.get('display_name'),
            'bundle_label': extra.get('bundle_label'),
            'section_title': extra.get('section_title'),
            'page_range': extra.get('page_range'),
            'page_count': extra.get('page_count'),
            'pipeline': extra.get('pipeline'),
            'processing_mode': extra.get('processing_mode'),
            'ui_order': ui_order,
            'description': extra.get('description'),
            'viewer_type': extra.get('viewer_type', 'json'),
            'created_at': artefact['created_at'],
            'status': artefact.get('status', 'unknown')
        }

        # Categorize artefacts for UI organization
        if artefact_type == 'docling_frontmatter_json':
            artefact_info.update({
                'display_name': artefact_info['display_name'] or 'Document Frontmatter',
                'bundle_label': artefact_info['bundle_label'] or 'Frontmatter Analysis',
                'description': artefact_info['description'] or 'OCR analysis of document structure and metadata',
                'ui_order': 1,
                'viewer_type': 'json'
            })
            viewer_artefacts['document_analysis'].append(artefact_info)

        elif artefact_type == 'split_map_json':
            artefact_info.update({
                'display_name': 'Document Structure Map',
                'bundle_label': 'Split Map',
                'description': 'Document section boundaries and organization structure',
                'ui_order': 2,
                'viewer_type': 'json'
            })
            viewer_artefacts['document_analysis'].append(artefact_info)

        elif artefact_type == 'tika_json':
            artefact_info.update({
                'display_name': 'Document Metadata',
                'bundle_label': 'Tika Analysis',
                'description': 'Raw document metadata and properties extracted by Apache Tika',
                'ui_order': 3,
                'viewer_type': 'json'
            })
            viewer_artefacts['raw_data'].append(artefact_info)

        elif artefact_type in ['canonical_docling_json', 'docling_bundle_split', 'docling_bundle', 'docling_standard', 'docling_bundle_split_pages']:
            # Processing bundles (OCR, No-OCR, VLM) - use original_pipeline for proper differentiation.
            # Null-safe: a NULL original_pipeline/pipeline column must fall through to 'Unknown'.
            pipeline_name = extra.get('original_pipeline') or extra.get('pipeline') or 'Unknown'
            pipeline_tag = pipeline_name.upper().replace('_', '-')
            bundle_label = artefact_info['bundle_label'] or f"{pipeline_tag} Bundle"
            display_name = artefact_info['display_name'] or f"{pipeline_tag} Processing Result"

            # Special handling for master manifests
            if artefact_type == 'docling_bundle_split_pages':
                display_name = f"{pipeline_tag} Document Pages"
                bundle_label = f"{pipeline_tag} Pages Bundle"
                artefact_info.update({
                    'viewer_type': 'bundle_collection',
                    'is_master_manifest': True,
                    'ui_order': 10  # Show master manifests before individual pages
                })
            elif artefact_type == 'docling_standard':
                # Individual page bundles - lower UI priority (null-safe split_order)
                raw_split_order = extra.get('split_order')
                split_order = 999 if raw_split_order is None else raw_split_order
                artefact_info.update({
                    'viewer_type': 'page_bundle',
                    'is_individual_page': True,
                    'ui_order': split_order + 100  # Show after master manifests
                })

            artefact_info.update({
                'display_name': display_name,
                'bundle_label': bundle_label,
                'description': f"Docling processing result using {pipeline_name.replace('_', '-')} pipeline",
                'pipeline_type': pipeline_name  # Add explicit pipeline type for UI
            })
            viewer_artefacts['processing_bundles'].append(artefact_info)

        elif artefact_type.startswith('docling_') and artefact_type.endswith('_json'):
            # Other docling JSON results
            pipeline_name = artefact_type.replace('docling_', '').replace('_json', '').upper()
            artefact_info.update({
                'display_name': f"{pipeline_name} Analysis",
                'bundle_label': f"{pipeline_name} Result",
                'description': f"Docling {pipeline_name.lower()} processing result",
                'viewer_type': 'json'
            })
            viewer_artefacts['processing_bundles'].append(artefact_info)

        elif artefact_type == 'page_images':
            artefact_info.update({
                'display_name': 'Page Images',
                'bundle_label': 'Visual Pages',
                'description': 'Generated page images for document visualization',
                'viewer_type': 'images'
            })
            viewer_artefacts['raw_data'].append(artefact_info)

    # Sort each category by ui_order
    for category in viewer_artefacts.values():
        category.sort(key=lambda x: (x['ui_order'], x['created_at']))

    return {
        'file_id': file_id,
        'categories': viewer_artefacts,
        'total_artefacts': len(all_artefacts)
    }
|
|
|
|
@router.post("/files/{file_id}/artefacts/initial")
def generate_initial_artefacts(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """
    Generate initial artefacts using the new three-phase pipeline architecture.

    Phase 1: Document Structure Discovery & Analysis
    - Tika metadata extraction
    - Page images generation
    - Document structure analysis (LLM-enhanced)
    - Split map generation

    Phase 2: Triggered automatically after Phase 1 completion
    """
    logger.info(f"Three-phase pipeline: Starting Phase 1 for file_id={file_id}")

    # NOTE(review): function-local import — presumably to avoid a circular
    # import at module load time; confirm before hoisting to the top of the file.
    from modules.pipeline_controller import get_pipeline_controller

    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()
    controller = get_pipeline_controller()

    # Load file row
    fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
    file_row = fr.data
    if not file_row:
        raise HTTPException(status_code=404, detail="File not found")

    bucket = file_row['bucket']
    storage_path = file_row['path']
    cabinet_id = file_row['cabinet_id']
    mime = file_row.get('mime_type') or 'application/octet-stream'
    filename = file_row.get('name', 'file')

    # Step 1: Convert to PDF if not already a PDF (synchronous for now)
    processing_path = storage_path
    processing_mime = mime

    if mime != 'application/pdf':
        logger.info(f"Converting non-PDF file to PDF: file_id={file_id} mime={mime}")
        try:
            file_bytes = storage.download_file(bucket, storage_path)

            with tempfile.TemporaryDirectory() as temp_dir:
                # Save original file to temp location (converter works on disk paths)
                temp_input = Path(temp_dir) / filename
                with open(temp_input, 'wb') as f:
                    f.write(file_bytes)

                # Convert to PDF
                pdf_bytes = doc_processor.convert_to_pdf(temp_input)

                # Store PDF as artefact
                pdf_artefact_id = str(uuid.uuid4())
                pdf_rel_path = f"{cabinet_id}/{file_id}/{pdf_artefact_id}/document.pdf"
                storage.upload_file(bucket, pdf_rel_path, pdf_bytes, 'application/pdf', upsert=True)

                pdf_ar = client.supabase.table('document_artefacts').insert({
                    'file_id': file_id,
                    'type': 'document_pdf',
                    'rel_path': pdf_rel_path,
                    'extra': {'converted_from': mime, 'original_filename': filename},
                    'status': 'completed'
                }).execute()

                # Use converted PDF for subsequent processing
                processing_path = pdf_rel_path
                processing_mime = 'application/pdf'
                logger.info(f"PDF conversion: completed file_id={file_id} rel_path={pdf_rel_path}")

        except Exception as e:
            logger.error(f"PDF conversion: error processing file_id={file_id}: {e}")
            # Continue with original file if conversion fails
    else:
        logger.info(f"File is already PDF, skipping conversion: file_id={file_id}")

    # Step 2: Enqueue Phase 1 tasks using the new pipeline controller
    user_id = payload.get('sub') or payload.get('user_id')
    # Authenticated requests get higher queue priority.
    priority = TaskPriority.HIGH if user_id else TaskPriority.NORMAL

    try:
        # Update file row with processing path (converted PDF if one was made)
        updated_file_row = {**file_row, 'path': processing_path, 'mime_type': processing_mime}

        # Enqueue Phase 1 tasks
        phase1_tasks = controller.enqueue_phase1_tasks(
            file_id=file_id,
            file_row=updated_file_row,
            processing_path=processing_path,
            processing_mime=processing_mime,
            priority=priority
        )

        total_tasks = sum(len(task_list) for task_list in phase1_tasks.values())

        logger.info(f"Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks for file_id={file_id}")

        return {
            'message': f'Three-phase pipeline: Enqueued {total_tasks} Phase 1 tasks. Phase 2 will trigger automatically after completion.',
            'phase1_tasks': {k: v for k, v in phase1_tasks.items()},
            'file_id': file_id,
            'pipeline_mode': 'three_phase',
            'bundle_architecture_enabled': True
        }

    except QueueConnectionError as e:
        # A down queue is not fatal: the upload already succeeded, so report
        # the status in the response body instead of raising.
        logger.error(f"Queue system unavailable for file_id={file_id}: {e}")
        logger.error("Redis is not running. Please start the API server with './start.sh dev' to auto-start Redis.")
        return {
            'message': 'File uploaded successfully, but processing tasks could not be queued (Redis unavailable)',
            'file_id': file_id,
            'queue_status': 'unavailable',
            'error': 'Queue system unavailable. Please restart the API server with Redis enabled.'
        }
    except Exception as e:
        logger.error(f"Unexpected error enqueueing Phase 1 tasks for file_id={file_id}: {e}")
        return {
            'message': 'File uploaded successfully, but processing tasks failed to queue',
            'file_id': file_id,
            'queue_status': 'failed',
            'error': str(e)
        }
|
|
|
|
@router.get("/files/{file_id}/page-images/manifest")
def get_page_images_manifest(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return the page_images manifest JSON for a file via service-role access."""
    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()

    # Find file row to get bucket
    file_row = client.supabase.table('files').select('id,bucket,cabinet_id').eq('id', file_id).single().execute().data or {}
    if not file_row:
        raise HTTPException(status_code=404, detail="File not found")
    bucket = file_row['bucket']
    cabinet_id = file_row['cabinet_id']

    # Newest page_images artefact wins
    rows = (
        client.supabase.table('document_artefacts')
        .select('id,type,rel_path,extra')
        .eq('file_id', file_id).eq('type', 'page_images')
        .order('created_at', desc=True).limit(1).execute().data
    ) or []
    if not rows:
        raise HTTPException(status_code=404, detail="page_images artefact not found")
    artefact = rows[0]

    # Manifest path: explicit 'manifest' entry in extra, else the conventional location
    manifest_rel_path = (artefact.get('extra') or {}).get('manifest') or f"{artefact['rel_path'].rstrip('/')}/page_images.json"

    try:
        import json
        manifest = json.loads(storage.download_file(bucket, manifest_rel_path).decode('utf-8'))
        # Ensure bucket and base prefix are present for the UI
        manifest.setdefault('bucket', bucket)
        manifest.setdefault('base_dir', artefact['rel_path'])
        return manifest
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}")
|
|
|
|
def json_dumps(obj: Any) -> str:
    """Serialize *obj* to a JSON string with non-ASCII characters preserved.

    Best-effort: returns "{}" when the object is not JSON-serializable,
    matching the original contract, but no longer swallows unrelated
    programming errors (the old broad `except Exception` even hid the
    import itself).
    """
    import json
    try:
        return json.dumps(obj, ensure_ascii=False)
    except (TypeError, ValueError):
        # Only serialization failures fall back; anything else should surface.
        return "{}"
|
|
|
|
|
|
@router.get("/files/{file_id}/artefacts/{artefact_id}/json")
def get_artefact_json(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return the JSON content of a document artefact using service-role storage access."""
    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()

    # Validate the artefact exists and belongs to the requested file
    artefact = client.supabase.table('document_artefacts').select('id,file_id,rel_path').eq('id', artefact_id).single().execute().data
    if not artefact:
        raise HTTPException(status_code=404, detail="Artefact not found")
    if artefact.get('file_id') != file_id:
        raise HTTPException(status_code=400, detail="Artefact does not belong to file")

    # Resolve the owning file's bucket
    file_row = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
    if not file_row:
        raise HTTPException(status_code=404, detail="File not found")

    try:
        import json
        blob = storage.download_file(file_row['bucket'], artefact['rel_path'])
        return json.loads(blob.decode('utf-8'))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to load artefact JSON: {str(e)}")
|
|
|
|
|
|
@router.get("/files/{file_id}/artefacts/{artefact_id}/vlm-section-manifest")
def get_vlm_section_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return the VLM section page bundle manifest JSON for a VLM section bundle artefact."""
    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()

    artefact = client.supabase.table('document_artefacts').select('id,file_id,rel_path,type,extra').eq('id', artefact_id).single().execute().data
    if not artefact:
        raise HTTPException(status_code=404, detail="Artefact not found")
    if artefact.get('file_id') != file_id:
        raise HTTPException(status_code=400, detail="Artefact does not belong to file")
    if artefact.get('type') != 'vlm_section_page_bundle':
        raise HTTPException(status_code=400, detail="Artefact is not a VLM section page bundle")

    file_row = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
    if not file_row:
        raise HTTPException(status_code=404, detail="File not found")
    bucket = file_row['bucket']

    try:
        import json
        # rel_path points directly at the manifest JSON file
        manifest = json.loads(storage.download_file(bucket, artefact['rel_path']).decode('utf-8'))
        manifest.setdefault('bucket', bucket)  # ensure bucket present for client use
        return manifest
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to load VLM section manifest: {e}")
|
|
|
|
|
|
@router.post("/files/{file_id}/artefacts/outline")
def enqueue_outline_structure(file_id: str, payload: Dict[str, Any] = Depends(auth)):
    """
    Manually enqueue the fast document outline (headings-only) analysis for an existing file.
    Returns the queued task id.
    """
    client = SupabaseServiceRoleClient()

    fr = client.supabase.table('files').select('id,bucket,cabinet_id,path,mime_type').eq('id', file_id).single().execute()
    file_row = fr.data
    if not file_row:
        raise HTTPException(status_code=404, detail="File not found")

    bucket = file_row['bucket']
    storage_path = file_row['path']
    cabinet_id = file_row['cabinet_id']
    mime = file_row.get('mime_type') or 'application/pdf'

    # Prefer converted PDF artefact if available (newest first, so the most
    # recent conversion wins)
    arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
    pdf_art = next((a for a in arts if a.get('type') == 'document_pdf'), None)
    processing_path = pdf_art['rel_path'] if pdf_art else storage_path

    try:
        # OCR disabled keeps this pass fast; timeout fixed at 5 minutes.
        task_id = enqueue_docling_task(
            file_id=file_id,
            task_type='document_structure_analysis',
            payload={
                'bucket': bucket,
                'file_path': processing_path,
                'cabinet_id': cabinet_id,
                'mime_type': mime,
                'config': {
                    'target_type': 'inbody',
                    'to_formats': 'json',
                    'do_ocr': False,
                    'force_ocr': False
                }
            },
            priority=TaskPriority.NORMAL,
            timeout=300
        )
        return { 'message': 'outline task enqueued', 'task_id': task_id, 'file_id': file_id }
    except QueueConnectionError as e:
        raise HTTPException(status_code=503, detail=f"Queue unavailable: {e}")
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to enqueue outline task: {e}")
|
|
|
|
@router.get("/files/proxy")
def proxy_storage_file(bucket: str, path: str, payload: Dict[str, Any] = Depends(auth)):
    """Proxy a storage file (service-role), useful for private image access in the UI."""
    media_by_ext = {
        '.png': 'image/png',
        '.webp': 'image/webp',
        '.jpg': 'image/jpeg',
        '.jpeg': 'image/jpeg',
        '.json': 'application/json',
    }
    storage = StorageAdmin()
    try:
        data = storage.download_file(bucket, path)
        _, ext = os.path.splitext(path.lower())
        media = media_by_ext.get(ext, 'application/octet-stream')
        return Response(content=data, media_type=media)
    except Exception as e:
        raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}")
|
|
|
|
|
|
# Signed proxy for iframe/img tags without Authorization header
|
|
@router.get("/files/proxy_signed")
def proxy_storage_file_signed(bucket: str, path: str, token: str):
    """Proxy using a signed bearer token passed as query param 'token'.

    Allows <img>/<iframe> access where an Authorization header cannot be set.
    Raises 403 for an invalid or unverifiable token, 404 when the object is
    missing or inaccessible.
    """
    try:
        payload = verify_supabase_jwt_str(token)
    except Exception as e:
        raise HTTPException(status_code=403, detail=f"Invalid token: {e}")
    if not payload:
        # BUG FIX: this raise previously lived inside the try block, so the
        # broad `except Exception` caught and re-wrapped it, producing a
        # garbled detail ("Invalid token: 403: Invalid token").
        raise HTTPException(status_code=403, detail="Invalid token")

    storage = StorageAdmin()
    try:
        data = storage.download_file(bucket, path)
        media = 'application/octet-stream'
        lp = path.lower()
        if lp.endswith('.png'):
            media = 'image/png'
        elif lp.endswith('.webp'):
            media = 'image/webp'
        elif lp.endswith('.jpg') or lp.endswith('.jpeg'):
            media = 'image/jpeg'
        elif lp.endswith('.json'):
            media = 'application/json'
        return Response(content=data, media_type=media)
    except Exception as e:
        raise HTTPException(status_code=404, detail=f"File not found or inaccessible: {e}")
|
|
|
|
# -------- Canonical bundle manifest ---------
|
|
|
|
@router.get("/files/{file_id}/artefacts/{artefact_id}/manifest")
def get_canonical_manifest(file_id: str, artefact_id: str, payload: Dict[str, Any] = Depends(auth)):
    """Return the manifest.json for a canonical_docling_bundle artefact."""
    client = SupabaseServiceRoleClient()
    storage = StorageAdmin()

    artefact = client.supabase.table('document_artefacts').select('id,file_id,rel_path,extra').eq('id', artefact_id).single().execute().data
    if not artefact:
        raise HTTPException(status_code=404, detail="Artefact not found")
    if artefact.get('file_id') != file_id:
        raise HTTPException(status_code=400, detail="Artefact does not belong to file")
    manifest_rel_path = (artefact.get('extra') or {}).get('manifest')
    if not manifest_rel_path:
        raise HTTPException(status_code=404, detail="Manifest path not recorded on artefact")

    file_row = client.supabase.table('files').select('bucket').eq('id', file_id).single().execute().data
    if not file_row:
        raise HTTPException(status_code=404, detail="File not found")
    bucket = file_row['bucket']

    try:
        import json
        manifest = json.loads(storage.download_file(bucket, manifest_rel_path).decode('utf-8'))
        manifest.setdefault('bucket', bucket)  # ensure bucket present for client use
        return manifest
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Failed to load manifest: {e}")
|
|
|
|
# -------- Canonical Docling generation ---------
|
|
|
|
def _load_split_map(client: SupabaseServiceRoleClient, storage: StorageAdmin, bucket: str, file_id: str) -> Optional[Dict[str, Any]]:
    """Best-effort fetch of the newest split_map_json artefact for *file_id*.

    Returns the parsed JSON dict, or None when no split map exists or any
    step (query, download, parse) fails.
    """
    try:
        rows = (
            client.supabase.table('document_artefacts')
            .select('id,type,rel_path')
            .eq('file_id', file_id).eq('type', 'split_map_json')
            .order('created_at', desc=True).limit(1).execute().data
        ) or []
        if not rows:
            return None
        import json
        blob = storage.download_file(bucket, rows[0]['rel_path'])
        return json.loads(blob.decode('utf-8'))
    except Exception:
        # Deliberately silent: callers treat a missing/broken split map as "no split map".
        return None
|
|
|
|
|
|
@router.post("/files/{file_id}/artefacts/canonical-docling")
|
|
def enqueue_canonical_docling(
|
|
file_id: str,
|
|
body: Dict[str, Any] = Body(default={}),
|
|
payload: Dict[str, Any] = Depends(auth)
|
|
):
|
|
"""Enqueue generation of canonical Docling JSON(s) for a file.
|
|
|
|
If a split_map is available and the document is large, this will enqueue
|
|
multiple Docling jobs using page ranges per section. Otherwise a single
|
|
job is created for the whole document.
|
|
"""
|
|
client = SupabaseServiceRoleClient()
|
|
storage = StorageAdmin()
|
|
|
|
fr = client.supabase.table('files').select('*').eq('id', file_id).single().execute()
|
|
file_row = fr.data
|
|
if not file_row:
|
|
raise HTTPException(status_code=404, detail="File not found")
|
|
|
|
bucket = file_row['bucket']
|
|
cabinet_id = file_row['cabinet_id']
|
|
mime = file_row.get('mime_type') or 'application/pdf'
|
|
storage_path = file_row['path']
|
|
|
|
# Prefer converted PDF if available
|
|
try:
|
|
arts = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).order('created_at', desc=True).execute().data or []
|
|
a_pdf = next((a for a in arts if a.get('type') == 'document_pdf'), None)
|
|
processing_path = a_pdf['rel_path'] if a_pdf else storage_path
|
|
processing_mime = 'application/pdf' if a_pdf else mime
|
|
except Exception:
|
|
processing_path = storage_path
|
|
processing_mime = mime
|
|
|
|
# Determine page_count (prefer Tika; fallback to PDF parser if needed)
|
|
page_count = None
|
|
try:
|
|
arts_pc = client.supabase.table('document_artefacts').select('type,rel_path').eq('file_id', file_id).execute().data or []
|
|
a_tika_pc = next((a for a in arts_pc if a.get('type') == 'tika_json'), None)
|
|
if a_tika_pc:
|
|
raw = storage.download_file(bucket, a_tika_pc['rel_path'])
|
|
import json as _json
|
|
tj = _json.loads(raw.decode('utf-8'))
|
|
for k in ("xmpTPg:NPages", "Page-Count", "pdf:PageCount"):
|
|
v = tj.get(k) or tj.get(k.lower())
|
|
if v is not None:
|
|
page_count = int(v)
|
|
break
|
|
except Exception as e:
|
|
logger.debug(f"[canonical-docling] Tika page_count read failed: {e}")
|
|
pass
|
|
|
|
# Fallback: compute page_count from PDF if Tika did not provide it
|
|
if page_count is None:
|
|
try:
|
|
pdf_bytes = storage.download_file(bucket, processing_path)
|
|
try:
|
|
import fitz # PyMuPDF
|
|
doc = fitz.open(stream=pdf_bytes, filetype='pdf')
|
|
page_count = int(doc.page_count)
|
|
doc.close()
|
|
logger.info(f"[canonical-docling] page_count via PyMuPDF: {page_count}")
|
|
except Exception:
|
|
try:
|
|
from PyPDF2 import PdfReader
|
|
reader = PdfReader(io.BytesIO(pdf_bytes))
|
|
page_count = int(len(reader.pages))
|
|
logger.info(f"[canonical-docling] page_count via PyPDF2: {page_count}")
|
|
except Exception:
|
|
page_count = None
|
|
except Exception:
|
|
page_count = None
|
|
else:
|
|
logger.info(f"[canonical-docling] page_count via Tika: {page_count}")
|
|
|
|
# Optional custom range from caller
|
|
custom_range = body.get('custom_range')
|
|
custom_label = body.get('custom_label') or ''
|
|
selected_section_id = body.get('selected_section_id')
|
|
selected_section_title = body.get('selected_section_title')
|
|
|
|
# Load split map if requested and document is large enough
|
|
use_split_requested = bool(body.get('use_split_map', True))
|
|
split_threshold = int(body.get('threshold') or os.getenv('DOCLING_SPLIT_THRESHOLD', '50'))
|
|
ranges = [] # list of (start,end)
|
|
split_map = None
|
|
sections = [] # list of dicts: {start,end,title}
|
|
logger.info(f"[canonical-docling] use_split_map={use_split_requested} threshold={split_threshold} page_count={page_count}")
|
|
# If custom range provided, honor it and bypass split map
|
|
if isinstance(custom_range, list) and len(custom_range) >= 2:
|
|
try:
|
|
cs = int(custom_range[0]); ce = int(custom_range[1])
|
|
if page_count is not None:
|
|
cs = max(1, min(cs, page_count))
|
|
ce = max(cs, min(ce, page_count))
|
|
ranges = [(cs, ce)]
|
|
sections = [{'start': cs, 'end': ce, 'title': custom_label or 'Custom range'}]
|
|
use_split_requested = False
|
|
logger.info(f"[canonical-docling] using custom_range start={cs} end={ce} label='{custom_label}'")
|
|
except Exception as _e:
|
|
logger.warning(f"[canonical-docling] invalid custom_range; falling back. err={_e}")
|
|
|
|
if not ranges and use_split_requested and (page_count is None or page_count >= split_threshold):
|
|
split_map = _load_split_map(client, storage, bucket, file_id)
|
|
entries = (split_map or {}).get('entries') if split_map else []
|
|
logger.info(f"[canonical-docling] split_map loaded entries={len(entries) if isinstance(entries, list) else 0}")
|
|
if split_map and isinstance(entries, list) and len(entries) > 0:
|
|
# Normalize and sort entries by start_page to enforce correct order
|
|
norm: list[dict] = []
|
|
for e in entries:
|
|
try:
|
|
s = int(e.get('start_page', 1))
|
|
t = int(e.get('end_page', s))
|
|
if t < s:
|
|
t = s
|
|
title = e.get('title') or e.get('label') or ''
|
|
norm.append({'start': s, 'end': t, 'title': title})
|
|
except Exception:
|
|
continue
|
|
norm.sort(key=lambda x: x['start'])
|
|
# Deduplicate identical or overlapping starts by keeping the earliest occurrence
|
|
ordered: list[dict] = []
|
|
last_end = 0
|
|
for e in norm:
|
|
s, t = int(e['start']), int(e['end'])
|
|
if ordered and s <= last_end:
|
|
# Clamp to prevent inversion and maintain order
|
|
s = last_end + 1
|
|
if s > (page_count or s):
|
|
continue
|
|
if t < s:
|
|
t = s
|
|
last_end = max(last_end, t)
|
|
ordered.append({'start': s, 'end': t, 'title': e['title']})
|
|
for e in ordered:
|
|
ranges.append((e['start'], e['end']))
|
|
sections.append(e)
|
|
|
|
# Fallback: if no split_map ranges... we shouldn't be here
|
|
if not ranges:
|
|
# If document is large, split into fixed windows to protect Docling server
|
|
if page_count is not None and page_count >= split_threshold:
|
|
chunk = int(os.getenv('DOCLING_FALLBACK_CHUNK_PAGES', '25'))
|
|
chunk = max(5, min(100, chunk))
|
|
for i in range(1, (page_count or 1) + 1, chunk):
|
|
end = min(i + chunk - 1, page_count or i)
|
|
ranges.append((i, end))
|
|
sections.append({'start': i, 'end': end, 'title': f"Pages {i}-{end}"})
|
|
logger.warning(f"[canonical-docling] using fallback chunking ranges={len(ranges)} chunk={chunk}")
|
|
else:
|
|
ranges = [(1, page_count or 9223372036854775807)]
|
|
logger.warning(f"[canonical-docling] using single-range fallback (small doc)")
|
|
|
|
# Build config
|
|
cfg = body.get('config', {})
|
|
pipeline = cfg.get('pipeline', 'standard')
|
|
config: Dict[str, Any] = {
|
|
# target_type is computed in processor based on to_formats unless explicitly provided by user
|
|
'to_formats': cfg.get('to_formats', 'json'),
|
|
'do_ocr': bool(cfg.get('do_ocr', True)),
|
|
'force_ocr': bool(cfg.get('force_ocr', False)),
|
|
'image_export_mode': cfg.get('image_export_mode', 'embedded'),
|
|
'ocr_engine': cfg.get('ocr_engine', 'easyocr'),
|
|
'ocr_lang': cfg.get('ocr_lang', 'en'),
|
|
'pdf_backend': cfg.get('pdf_backend', 'dlparse_v4'),
|
|
'table_mode': cfg.get('table_mode', 'fast'),
|
|
'pipeline': pipeline,
|
|
'do_picture_classification': bool(cfg.get('do_picture_classification', False)),
|
|
'do_picture_description': bool(cfg.get('do_picture_description', False)),
|
|
}
|
|
# If user explicitly set target_type, pass it through
|
|
if 'target_type' in cfg:
|
|
config['target_type'] = cfg['target_type']
|
|
# Optional VLM settings (only include API fields if provided as JSON by caller)
|
|
if config['do_picture_description']:
|
|
pd_api = cfg.get('picture_description_api')
|
|
if isinstance(pd_api, (dict, list)):
|
|
config['picture_description_api'] = pd_api
|
|
elif isinstance(pd_api, str) and pd_api.strip().startswith(('{', '[')):
|
|
config['picture_description_api'] = pd_api
|
|
if cfg.get('picture_description_prompt'):
|
|
config['picture_description_prompt'] = cfg['picture_description_prompt']
|
|
if pipeline == 'vlm':
|
|
# Provider presets mapping
|
|
provider = (cfg.get('vlm_provider') or '').strip().lower()
|
|
provider_model = (cfg.get('vlm_provider_model') or '').strip()
|
|
provider_base = (cfg.get('vlm_provider_base_url') or '').strip()
|
|
if provider in ('ollama', 'openai') and provider_model:
|
|
if provider == 'ollama':
|
|
base_url = provider_base or os.getenv('OLLAMA_BASE_URL') or os.getenv('VLM_OLLAMA_BASE_URL')
|
|
if base_url:
|
|
endpoint = f"{base_url.rstrip('/')}/v1/chat/completions"
|
|
# Use OpenAI provider schema against Ollama's OpenAI-compatible endpoint
|
|
cfg_api = {
|
|
'provider': 'openai',
|
|
'url': endpoint,
|
|
'model': provider_model,
|
|
'response_format': 'markdown',
|
|
'request_params': {'model': provider_model}
|
|
}
|
|
logger.info(f"[canonical-docling] VLM provider=ollama mapped to openai-compatible url={endpoint} model={provider_model}")
|
|
config['vlm_pipeline_model_api'] = cfg_api
|
|
# Also wire picture_description_api if picture description is enabled
|
|
if config.get('do_picture_description'):
|
|
config['picture_description_api'] = {
|
|
'url': endpoint,
|
|
'headers': {},
|
|
'params': {'model': provider_model}
|
|
}
|
|
elif provider == 'openai':
|
|
base_url = provider_base or os.getenv('OPENAI_BASE_URL') or 'https://api.openai.com/v1'
|
|
api_key = os.getenv('OPENAI_API_KEY') or os.getenv('OPENAI_API_KEY_READONLY')
|
|
# Do not inline key if not present; server may have default
|
|
model_cfg: Dict[str, Any] = {
|
|
'provider': 'openai',
|
|
'url': f"{base_url.rstrip('/')}/chat/completions",
|
|
'model': provider_model,
|
|
'response_format': 'markdown',
|
|
'request_params': {'model': provider_model}
|
|
}
|
|
if api_key:
|
|
model_cfg['api_key'] = api_key
|
|
# Also pass explicit Authorization header for servers that expect it
|
|
model_cfg['headers'] = {
|
|
'Authorization': f"Bearer {api_key}"
|
|
}
|
|
logger.info(f"[canonical-docling] VLM provider=openai url={model_cfg['url']} model={provider_model} api_key={'yes' if api_key else 'no'}")
|
|
config['vlm_pipeline_model_api'] = model_cfg
|
|
# Also wire picture_description_api if picture description is enabled
|
|
if config.get('do_picture_description'):
|
|
headers = {'Authorization': f"Bearer {api_key}"} if api_key else {}
|
|
config['picture_description_api'] = {
|
|
'url': f"{base_url.rstrip('/')}/chat/completions",
|
|
'headers': headers,
|
|
'params': {'model': provider_model}
|
|
}
|
|
else:
|
|
# Pass through explicit API/local JSON if provided by caller
|
|
vpa = cfg.get('vlm_pipeline_model_api')
|
|
if isinstance(vpa, (dict, list)):
|
|
config['vlm_pipeline_model_api'] = vpa
|
|
elif isinstance(vpa, str) and vpa.strip().startswith(('{', '[')):
|
|
config['vlm_pipeline_model_api'] = vpa
|
|
|
|
# Enqueue tasks for each range
|
|
priority = TaskPriority.HIGH
|
|
task_ids = []
|
|
multi = len(ranges) > 1
|
|
logger.info(f"[canonical-docling] final ranges={len(ranges)} multi={multi} pipeline={pipeline} producer={body.get('producer', 'manual')}")
|
|
|
|
# Create a group id for split bundles (used for UI grouping)
|
|
# Use provided group_id if present (for two-pass auto system), otherwise generate new
|
|
group_id = body.get('group_id') or (str(uuid.uuid4()) if multi else None)
|
|
if multi and not sections:
|
|
# Build sections from ranges if titles were not captured
|
|
for (start, end) in ranges:
|
|
sections.append({'start': int(start), 'end': int(end), 'title': ''})
|
|
|
|
idx = 0
|
|
for (start, end) in ranges:
|
|
# Locate title for this range if available
|
|
title = ''
|
|
if multi and sections and idx < len(sections):
|
|
title = sections[idx].get('title') or ''
|
|
idx += 1
|
|
|
|
cfg_range = dict(config)
|
|
# Ensure 1-based inclusive range is passed through
|
|
cfg_range['page_range'] = [max(1, int(start)), max(int(start), int(end))]
|
|
extra = {
|
|
'is_subdoc': multi,
|
|
'page_range': [int(start), int(end)],
|
|
'label': (title or f"subdoc p{int(start)}-{int(end)}") if multi else 'canonical'
|
|
}
|
|
# Attach selected section metadata if provided by caller
|
|
if selected_section_id:
|
|
extra['selected_section_id'] = selected_section_id
|
|
if selected_section_title or custom_label:
|
|
extra['selected_section_title'] = selected_section_title or custom_label
|
|
# For split processing, force split bundle artefact type and add grouping/order metadata
|
|
if multi:
|
|
extra.update({
|
|
# UI grouping metadata
|
|
'split_order': idx,
|
|
'split_heading': title,
|
|
'split_total': len(ranges)
|
|
})
|
|
if group_id:
|
|
extra['group_id'] = group_id
|
|
extra['group_pack_type'] = 'docling_standard_auto_split'
|
|
else:
|
|
# Single-bundle case: allow caller to override type (defaults to canonical bundle)
|
|
if 'artefact_type_override' in body and body.get('artefact_type_override'):
|
|
extra['artefact_type_override'] = body.get('artefact_type_override')
|
|
|
|
# Mark producer and selection metadata
|
|
extra['producer'] = body.get('producer') or ('auto_split' if (multi and body.get('use_split_map')) else 'manual')
|
|
if selected_section_id:
|
|
extra['selected_section_id'] = selected_section_id
|
|
if selected_section_title or custom_label:
|
|
extra['selected_section_title'] = selected_section_title or custom_label
|
|
|
|
# Enhanced logging for canonical operations
|
|
if multi:
|
|
logger.info(f"[canonical-docling] enqueue range idx={idx}/{len(ranges)} start={start} end={end} title='{title}' group_id={group_id} producer={extra.get('producer')} pipeline={pipeline}")
|
|
else:
|
|
logger.info(f"[canonical-docling] enqueue single range start={start} end={end} producer={extra.get('producer')} pipeline={pipeline}")
|
|
tid = enqueue_docling_task(
|
|
file_id=file_id,
|
|
task_type='canonical_docling_subdoc_json' if multi else 'canonical_docling_json',
|
|
payload={
|
|
'bucket': bucket,
|
|
'file_path': processing_path,
|
|
'cabinet_id': cabinet_id,
|
|
'mime_type': processing_mime,
|
|
'config': cfg_range,
|
|
'artefact_extra': extra,
|
|
# Ensure canonical tasks respect upstream dependencies (e.g., Frontmatter)
|
|
'depends_on': body.get('depends_on', []),
|
|
# Pass through grouping info if provided by caller (kept for backward-compat)
|
|
'group_pack_type': body.get('group_pack_type')
|
|
},
|
|
priority=priority,
|
|
timeout=int(body.get('timeout', DOCLING_NOOCR_TIMEOUT))
|
|
)
|
|
task_ids.append(tid)
|
|
|
|
logger.info(f"[canonical-docling] completed enqueue file_id={file_id} tasks={len(task_ids)} ranges={len(ranges)} pipeline={pipeline} producer={body.get('producer','manual')} group_id={group_id if multi else 'single'}")
|
|
|
|
return {
|
|
'message': f'enqueued {len(task_ids)} canonical docling job(s)',
|
|
'task_ids': task_ids,
|
|
'ranges': ranges,
|
|
'used_split_map': bool(split_map),
|
|
'group_id': group_id,
|
|
'pipeline': pipeline,
|
|
'producer': body.get('producer', 'manual')
|
|
}
|
|
|