import os
import subprocess
import tempfile
from pathlib import Path
from typing import Dict, List, Optional


class DocumentProcessor:
    """Convert office-style documents to PDF by shelling out to LibreOffice.

    Attributes:
        supported_extensions: Maps each accepted lower-case extension
            (without the dot) to the backend that handles it. ``'pdf'`` is
            passed through unchanged; every other entry goes to LibreOffice.
        libreoffice_binary: Name or path of the executable that is invoked
            for conversions.
    """

    def __init__(self, libreoffice_binary: str = 'libreoffice'):
        """Set up the extension table.

        Args:
            libreoffice_binary: Executable used for conversions. Defaults to
                ``'libreoffice'``; pass another name or an absolute path when
                the executable is called something else on the host.
        """
        self.libreoffice_binary = libreoffice_binary
        self.supported_extensions = {
            'doc': 'libreoffice',
            'docx': 'libreoffice',
            'odt': 'libreoffice',
            'rtf': 'libreoffice',
            'txt': 'libreoffice',
            'html': 'libreoffice',
            'htm': 'libreoffice',
            'xls': 'libreoffice',
            'xlsx': 'libreoffice',
            'ppt': 'libreoffice',
            'pptx': 'libreoffice',
            'pdf': 'pdf',
        }

    def convert_to_pdf(self, input_file: Path) -> bytes:
        """Convert a document to PDF and return the PDF bytes.

        Args:
            input_file: Path of the document to convert. A plain string is
                accepted as well.

        Returns:
            The PDF content. A file whose extension is already ``pdf`` is
            read and returned as-is.

        Raises:
            FileNotFoundError: If ``input_file`` does not exist.
            ValueError: If the extension is not in ``supported_extensions``.
            RuntimeError: If the converter cannot be started, exits with a
                non-zero status, or does not produce the expected output file.
        """
        input_file = Path(input_file)
        if not input_file.exists():
            raise FileNotFoundError(f"Input file not found: {input_file}")

        input_extension = input_file.suffix.lower()[1:]  # Remove the dot
        if input_extension not in self.supported_extensions:
            raise ValueError(f"Unsupported file extension: {input_extension}")

        if input_extension == 'pdf':
            # Already a PDF: nothing to convert.
            return input_file.read_bytes()

        with tempfile.TemporaryDirectory() as temp_dir:
            # The command writes "<stem>.pdf" into --outdir.
            output_file = Path(temp_dir) / f"{input_file.stem}.pdf"

            cmd = [
                self.libreoffice_binary,
                '--headless',
                '--convert-to', 'pdf',
                '--outdir', str(temp_dir),
                # Absolute path so a relative name starting with '-' cannot
                # be parsed as a command-line option. absolute() (not
                # resolve()) keeps symlink names, so the stem still matches
                # output_file above.
                str(input_file.absolute()),
            ]

            try:
                subprocess.run(cmd, check=True, capture_output=True)
            except subprocess.CalledProcessError as e:
                # errors="replace": undecodable converter output must not
                # hide the real failure behind a UnicodeDecodeError.
                stderr = (e.stderr or b"").decode(errors="replace")
                raise RuntimeError(f"Conversion failed: {stderr}") from e
            except OSError as e:
                # Executable missing or not runnable. Reported as a
                # conversion failure so it is not mistaken for the
                # "input file not found" FileNotFoundError raised above.
                raise RuntimeError(
                    f"Conversion failed: could not run "
                    f"{self.libreoffice_binary!r}: {e}"
                ) from e

            if not output_file.exists():
                raise RuntimeError("Conversion failed: Output file not created")

            # Read before the temporary directory is removed.
            return output_file.read_bytes()

    def batch_convert_directory(self, directory: str) -> List[Dict]:
        """Convert every supported document directly inside ``directory``.

        Each converted PDF is written next to its source with the suffix
        replaced by ``.pdf``. Files that are already PDFs are reported as
        successful without being read or rewritten. Subdirectories are not
        descended into.

        Args:
            directory: Path of the directory to scan.

        Returns:
            One dict per supported file, ordered by path. Successful entries
            carry ``source_file``, ``output_file`` and ``status == "success"``;
            failed entries carry ``source_file``, ``status == "error"`` and
            ``error`` (the exception message).

        Raises:
            FileNotFoundError: If ``directory`` does not exist.
        """
        directory_path = Path(directory)
        if not directory_path.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")

        # Snapshot and sort the candidates before writing anything into the
        # directory, so the result order is deterministic and PDFs produced
        # by this run are never picked up as new inputs.
        candidates = sorted(
            path
            for path in directory_path.glob('*')
            if path.is_file()
            and path.suffix.lower()[1:] in self.supported_extensions
        )

        results = []
        for file_path in candidates:
            try:
                if file_path.suffix.lower() == '.pdf':
                    # Already a PDF: rewriting it over itself is pointless,
                    # and for an upper-case ".PDF" it would create a
                    # duplicate file on case-sensitive filesystems.
                    output_file = file_path
                else:
                    pdf_content = self.convert_to_pdf(file_path)
                    output_file = file_path.with_suffix('.pdf')
                    output_file.write_bytes(pdf_content)
                results.append({
                    "source_file": str(file_path),
                    "output_file": str(output_file),
                    "status": "success",
                })
            except Exception as e:
                # Batch boundary: record the failure and keep going so one
                # bad file does not abort the whole run.
                results.append({
                    "source_file": str(file_path),
                    "status": "error",
                    "error": str(e),
                })

        return results