api/routers/dev/document_conversion.py
2025-07-11 13:52:19 +00:00

139 lines
4.9 KiB
Python

from fastapi import APIRouter, UploadFile, File, HTTPException
from typing import List, Optional, Dict
from pathlib import Path
import shutil
import tempfile
from pydantic import BaseModel
from modules.document_processor import DocumentProcessor
import os
class BatchConvertRequest(BaseModel):
    # Request body accepted by the recursive batch-conversion endpoint.

    # Root directory that is searched (recursively) for documents.
    directory: str

    # Optional destination root for the generated PDFs. When omitted, the
    # endpoint writes each PDF next to its source file.
    output_dir: Optional[str] = None
# Router on which the conversion endpoints defined in this module are registered.
router = APIRouter()
# Single module-level processor instance used by every endpoint below.
doc_processor = DocumentProcessor()
@router.post("/convert-to-pdf")
async def convert_to_pdf(
    files: List[UploadFile] = File(...),
    output_format: str = "pdf"
):
    """
    Convert uploaded documents to PDF format.

    Each upload is written into a temporary directory (removed when the
    request finishes) and handed to ``doc_processor.convert_to_pdf``. A
    failure while saving or converting one file is recorded in that file's
    result entry and does not stop the remaining files from being processed.

    Args:
        files: Uploaded documents to convert.
        output_format: Accepted for API compatibility; this endpoint does not
            currently read it.

    Returns:
        A list with one dict per upload, in upload order. Successful entries
        contain ``filename``, ``converted_content`` and ``status="success"``;
        failed entries contain ``filename``, ``error`` and ``status="error"``.
    """
    results = []
    with tempfile.TemporaryDirectory() as temp_dir:
        temp_root = Path(temp_dir)
        for file in files:
            try:
                # The filename is client-controlled. Keep only its final path
                # component so values such as "../../x" or "/etc/x" cannot
                # place the temporary file outside temp_dir. Backslashes are
                # normalised first so Windows-style paths are reduced too.
                raw_name = file.filename or ""
                safe_name = Path(raw_name.replace("\\", "/")).name
                if safe_name in ("", ".", ".."):
                    raise ValueError(
                        f"Invalid upload filename: {file.filename!r}"
                    )

                # Save uploaded file to temp directory
                temp_file = temp_root / safe_name
                with temp_file.open("wb") as buffer:
                    shutil.copyfileobj(file.file, buffer)

                # Process the document
                pdf_content = doc_processor.convert_to_pdf(temp_file)
                # NOTE(review): converted_content is returned exactly as the
                # processor produced it. If that is raw PDF bytes, confirm the
                # response can actually be JSON-encoded (consider base64).
                results.append({
                    "filename": file.filename,
                    "converted_content": pdf_content,
                    "status": "success"
                })
            except Exception as e:
                # Per-file boundary: report the problem for this upload and
                # carry on with the rest of the batch.
                results.append({
                    "filename": file.filename,
                    "error": str(e),
                    "status": "error"
                })
    return results
@router.post("/batch-convert")
async def batch_convert(
    directory: str,
    output_format: str = "pdf"
):
    """
    Convert all documents in a directory to PDF format.

    Delegates to ``doc_processor.batch_convert_directory`` and returns its
    result unchanged.

    Args:
        directory: Path of the directory handed to the processor.
        output_format: Accepted by the endpoint but not read by it.

    Raises:
        HTTPException: With status 500 and the error text as ``detail`` when
            the processor raises any exception.
    """
    try:
        return doc_processor.batch_convert_directory(directory)
    except Exception as exc:
        detail = str(exc)
        raise HTTPException(status_code=500, detail=detail)
@router.post("/batch-convert-recursive")
async def batch_convert_recursive(request_data: BatchConvertRequest):
    """
    Convert all documents in a directory and its subdirectories to PDF.

    Files are discovered with a recursive glob for every key of
    ``doc_processor.supported_extensions``. Each file is converted
    independently; a failure is recorded in the results and does not stop the
    remaining files.

    Args:
        request_data: ``directory`` is the root to scan. ``output_dir``, when
            given, is created if necessary and receives the PDFs with the
            source directory structure preserved; otherwise each PDF is
            written next to its source file.

    Returns:
        A dict with ``total_files``, ``successful``, ``failed`` and
        ``results`` (one entry per processed file with ``source_file``,
        ``status`` and either ``output_file`` or ``error``).

    Raises:
        HTTPException: 404 when ``directory`` is not an existing directory;
            500 for any unexpected error outside the per-file handling.
    """
    directory_path = Path(request_data.directory)
    # Validate before entering the try block below: HTTPException derives
    # from Exception, so raising the 404 inside that block would be caught by
    # the generic handler and reach the client as a 500 instead.
    if not directory_path.is_dir():
        raise HTTPException(status_code=404, detail=f"Directory not found: {request_data.directory}")

    try:
        output_path = None
        if request_data.output_dir:
            output_path = Path(request_data.output_dir)
            output_path.mkdir(parents=True, exist_ok=True)

        results = []
        supported_extensions = doc_processor.supported_extensions.keys()

        # Debug: Print processing info
        print(f"Processing directory: {directory_path}")
        print(f"Output directory: {output_path}")
        print(f"Supported extensions: {list(supported_extensions)}")

        # Count files before processing
        all_files = []
        for ext in supported_extensions:
            # Tolerate keys written with or without a leading dot so the
            # pattern is always "*.ext" and never "*..ext".
            suffix = str(ext).lstrip(".")
            all_files.extend(directory_path.rglob(f"*.{suffix}"))
        print(f"Found {len(all_files)} files to process")

        # Recursively find all documents
        for file_path in all_files:
            try:
                print(f"Processing: {file_path}")
                # Convert the document
                pdf_content = doc_processor.convert_to_pdf(file_path)

                # Determine output path
                if output_path:
                    # Preserve directory structure in output_dir
                    rel_path = file_path.relative_to(directory_path)
                    out_path = output_path / rel_path.with_suffix('.pdf')
                    out_path.parent.mkdir(parents=True, exist_ok=True)
                else:
                    out_path = file_path.with_suffix('.pdf')

                # Save the PDF
                with open(out_path, 'wb') as f:
                    f.write(pdf_content)

                results.append({
                    "source_file": str(file_path),
                    "output_file": str(out_path),
                    "status": "success"
                })
                print(f"Successfully converted: {file_path} -> {out_path}")
            except Exception as e:
                # Per-file boundary: record the failure and keep going.
                print(f"Error converting {file_path}: {str(e)}")
                results.append({
                    "source_file": str(file_path),
                    "status": "error",
                    "error": str(e)
                })

        response_data = {
            "total_files": len(results),
            "successful": sum(1 for r in results if r["status"] == "success"),
            "failed": sum(1 for r in results if r["status"] == "error"),
            "results": results
        }
        print(f"Conversion complete: {response_data['successful']} successful, {response_data['failed']} failed")
        return response_data
    except Exception as e:
        print(f"Error in batch conversion: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e)) from e