api/modules/document_processor.py
2025-07-11 13:52:19 +00:00

94 lines
3.3 KiB
Python

from pathlib import Path
import subprocess
import tempfile
import os
from typing import Dict, List, Optional
class DocumentProcessor:
    """Convert office-style documents to PDF by shelling out to LibreOffice.

    Files that are already PDFs are passed through unchanged; every other
    supported extension is converted with a headless LibreOffice process.
    """

    # Class-level defaults so that instances created without running this
    # __init__ (e.g. a subclass that overrides it) still have usable settings.
    libreoffice_path: str = 'libreoffice'
    timeout: Optional[float] = None

    def __init__(self, libreoffice_path: str = 'libreoffice',
                 timeout: Optional[float] = None):
        """
        Set up the converter.

        Args:
            libreoffice_path: Name or path of the LibreOffice executable to
                run. Defaults to ``'libreoffice'`` looked up on ``PATH``.
            timeout: Maximum number of seconds a single conversion may take,
                or ``None`` (the default) to wait indefinitely.
        """
        self.libreoffice_path = libreoffice_path
        self.timeout = timeout
        # Lower-case extension (without the dot) -> backend that handles it.
        self.supported_extensions = {
            'doc': 'libreoffice',
            'docx': 'libreoffice',
            'odt': 'libreoffice',
            'rtf': 'libreoffice',
            'txt': 'libreoffice',
            'html': 'libreoffice',
            'htm': 'libreoffice',
            'xls': 'libreoffice',
            'xlsx': 'libreoffice',
            'ppt': 'libreoffice',
            'pptx': 'libreoffice',
            'pdf': 'pdf',
        }

    def convert_to_pdf(self, input_file: Path) -> bytes:
        """
        Convert a document to PDF format

        Args:
            input_file: Path of the document to convert. A plain string path
                is accepted as well as a ``Path``.

        Returns:
            The PDF content as bytes. A file whose extension is already
            ``pdf`` is read and returned as-is.

        Raises:
            FileNotFoundError: If ``input_file`` does not exist.
            ValueError: If the file extension is not in
                ``supported_extensions``.
            RuntimeError: If LibreOffice cannot be started, exits with an
                error, times out, or produces no output file.
        """
        source = Path(input_file)
        if not source.exists():
            raise FileNotFoundError(f"Input file not found: {source}")

        input_extension = source.suffix.lower()[1:]  # Remove the dot
        if input_extension not in self.supported_extensions:
            raise ValueError(f"Unsupported file extension: {input_extension}")

        if input_extension == 'pdf':
            # If it's already a PDF, just read and return it
            return source.read_bytes()

        # Use LibreOffice for conversion; the temporary directory keeps the
        # generated file away from the source directory and is removed on exit.
        with tempfile.TemporaryDirectory() as temp_dir:
            output_file = Path(temp_dir) / f"{source.stem}.pdf"
            cmd = [
                self.libreoffice_path,
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', str(temp_dir),
                # Absolute path: a relative name starting with '-' would
                # otherwise be parsed by LibreOffice as a command-line option.
                str(source.absolute()),
            ]
            try:
                subprocess.run(
                    cmd,
                    check=True,
                    capture_output=True,
                    timeout=self.timeout,
                )
            except subprocess.CalledProcessError as e:
                # errors='replace' so undecodable output cannot mask the
                # real failure with a UnicodeDecodeError.
                details = (e.stderr or b'').decode(errors='replace')
                raise RuntimeError(f"Conversion failed: {details}") from e
            except subprocess.TimeoutExpired as e:
                raise RuntimeError(
                    f"Conversion failed: timed out after {self.timeout} seconds"
                ) from e
            except FileNotFoundError as e:
                # Raised when the executable itself is missing; report it as
                # a conversion failure rather than as a missing input file.
                raise RuntimeError(
                    "Conversion failed: executable not found: "
                    f"{self.libreoffice_path}"
                ) from e

            if not output_file.exists():
                raise RuntimeError("Conversion failed: Output file not created")

            # Read the PDF before the temporary directory is cleaned up
            return output_file.read_bytes()

    def batch_convert_directory(self, directory: str) -> List[Dict]:
        """
        Convert all documents in a directory to PDF format

        Each supported file directly inside ``directory`` (no recursion) is
        converted and written next to its source as ``<stem>.pdf``. Failures
        are recorded per file and do not stop the batch.

        NOTE: an existing ``<stem>.pdf`` is overwritten when another file
        with the same stem (e.g. ``<stem>.docx``) is converted.

        Args:
            directory: Path of the directory to process.

        Returns:
            One dict per processed file, in sorted path order. Successful
            entries have ``source_file``, ``output_file`` and
            ``status == "success"``; failed entries have ``source_file``,
            ``status == "error"`` and ``error`` (the exception message).

        Raises:
            FileNotFoundError: If ``directory`` does not exist.
        """
        directory_path = Path(directory)
        if not directory_path.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")

        # Snapshot and sort the listing up front: the loop writes new files
        # into this same directory, and a fixed order keeps results stable.
        candidates = sorted(
            entry for entry in directory_path.glob('*')
            if entry.is_file()
            and entry.suffix.lower()[1:] in self.supported_extensions
        )

        results = []
        for file_path in candidates:
            try:
                pdf_content = self.convert_to_pdf(file_path)
                output_file = file_path.with_suffix('.pdf')
                # When the source already is the target PDF, skip the write:
                # reopening it with 'wb' would truncate the source and lose
                # it if the write failed part-way.
                already_in_place = (
                    output_file.exists() and output_file.samefile(file_path)
                )
                if not already_in_place:
                    output_file.write_bytes(pdf_content)
                results.append({
                    "source_file": str(file_path),
                    "output_file": str(output_file),
                    "status": "success"
                })
            except Exception as e:
                # Per-file boundary: record the failure and keep going.
                results.append({
                    "source_file": str(file_path),
                    "status": "error",
                    "error": str(e)
                })
        return results