107 lines
4.3 KiB
Python
107 lines
4.3 KiB
Python
import os
import shutil
import subprocess
import tempfile
from pathlib import Path
from typing import Dict, List, Optional


class DocumentProcessor:
    """Convert office documents to PDF by running LibreOffice headlessly.

    ``supported_extensions`` maps a lower-case file extension (without the
    dot) to the backend that handles it: ``'libreoffice'`` for formats that
    are converted through ``soffice``, and ``'pdf'`` for files that are
    already PDFs and are returned unchanged.
    """

    # Location of the LibreOffice binary inside the macOS application bundle.
    DEFAULT_SOFFICE_PATH = '/Applications/LibreOffice.app/Contents/MacOS/soffice'

    # Seconds a single LibreOffice run may take before it is aborted.
    CONVERSION_TIMEOUT = 120

    # Explicit executable override; None means "discover at conversion time".
    # Kept as a class attribute so instances created without running
    # __init__ still work.
    soffice_path: Optional[str] = None

    def __init__(self, soffice_path: Optional[str] = None):
        """Set up the extension table and, optionally, the executable path.

        Args:
            soffice_path: Path of the LibreOffice ``soffice`` executable.
                When omitted, the macOS bundle path is used if it exists,
                otherwise ``soffice``/``libreoffice`` is looked up on PATH.
        """
        self.supported_extensions = {
            'doc': 'libreoffice',
            'docx': 'libreoffice',
            'odt': 'libreoffice',
            'rtf': 'libreoffice',
            'txt': 'libreoffice',
            'html': 'libreoffice',
            'htm': 'libreoffice',
            'xls': 'libreoffice',
            'xlsx': 'libreoffice',
            'ppt': 'libreoffice',
            'pptx': 'libreoffice',
            'pdf': 'pdf',
        }
        if soffice_path is not None:
            self.soffice_path = soffice_path

    def _resolve_soffice(self) -> str:
        """Return the LibreOffice executable to run for a conversion."""
        if self.soffice_path:
            return self.soffice_path
        # Prefer the macOS bundle whenever it is present so that existing
        # macOS setups keep using exactly the same binary as before.
        if os.path.exists(self.DEFAULT_SOFFICE_PATH):
            return self.DEFAULT_SOFFICE_PATH
        for name in ('soffice', 'libreoffice'):
            found = shutil.which(name)
            if found:
                return found
        # Nothing found: fall back to the historical default so the failure
        # message names a concrete path.
        return self.DEFAULT_SOFFICE_PATH

    def convert_to_pdf(self, input_file: Path) -> bytes:
        """Convert a document to PDF and return the PDF bytes.

        Args:
            input_file: Path of the document to convert. A plain string is
                accepted as well and is coerced to ``Path``.

        Returns:
            The content of the resulting PDF. A file whose extension is
            ``pdf`` is read and returned as-is without running LibreOffice.

        Raises:
            FileNotFoundError: ``input_file`` does not exist.
            ValueError: the extension is not in ``supported_extensions``.
            RuntimeError: LibreOffice could not be started, exited with an
                error, timed out, or did not produce the expected output.
        """
        input_file = Path(input_file)
        if not input_file.exists():
            raise FileNotFoundError(f"Input file not found: {input_file}")

        input_extension = input_file.suffix.lower()[1:]  # Remove the dot
        if input_extension not in self.supported_extensions:
            raise ValueError(f"Unsupported file extension: {input_extension}")

        if input_extension == 'pdf':
            # Already a PDF: nothing to convert.
            return input_file.read_bytes()

        soffice = self._resolve_soffice()
        timeout = self.CONVERSION_TIMEOUT

        # One scratch directory receives the output; a second one serves as
        # a throw-away LibreOffice user profile for this run.
        with tempfile.TemporaryDirectory() as temp_dir:
            with tempfile.TemporaryDirectory() as profile_dir:
                output_file = Path(temp_dir) / f"{input_file.stem}.pdf"

                cmd = [
                    soffice,
                    '--headless',
                    '--invisible',
                    '--nodefault',
                    '--nolockcheck',
                    '--nologo',
                    '--norestore',
                    # as_uri() yields a correctly escaped file:// URL even
                    # when the temp path contains spaces or other specials.
                    f'-env:UserInstallation={Path(profile_dir).as_uri()}',
                    '--convert-to', 'pdf',
                    '--outdir', str(temp_dir),
                    # Absolute path so a relative name starting with '-' is
                    # never parsed as an option. absolute() (not resolve())
                    # keeps a symlink's own name, which the expected output
                    # file name above is derived from.
                    str(input_file.absolute()),
                ]

                try:
                    subprocess.run(
                        cmd, check=True, capture_output=True, timeout=timeout
                    )
                except subprocess.CalledProcessError as e:
                    # errors='replace': tool output is not guaranteed UTF-8.
                    stderr_msg = (
                        e.stderr.decode(errors='replace')
                        if e.stderr else "No stderr output"
                    )
                    stdout_msg = (
                        e.stdout.decode(errors='replace')
                        if e.stdout else "No stdout output"
                    )
                    raise RuntimeError(
                        f"Conversion failed: {stderr_msg}. Stdout: {stdout_msg}"
                    ) from e
                except subprocess.TimeoutExpired as e:
                    raise RuntimeError(
                        f"Conversion failed: Process timed out after {timeout} seconds"
                    ) from e
                except OSError as e:
                    # Executable missing or not runnable. Reported as a
                    # conversion failure so it cannot be mistaken for a
                    # missing *input* file.
                    raise RuntimeError(
                        f"Conversion failed: could not run LibreOffice at {soffice}: {e}"
                    ) from e

                if not output_file.exists():
                    # List all files in temp_dir for debugging
                    files_in_dir = list(Path(temp_dir).glob('*'))
                    raise RuntimeError(
                        "Conversion failed: Output file not created. "
                        f"Files in output dir: {files_in_dir}"
                    )

                # Read before the temporary directories are removed.
                return output_file.read_bytes()

    def batch_convert_directory(self, directory: str) -> List[Dict]:
        """Convert every supported document directly inside a directory.

        Each PDF is written next to its source file, using the source name
        with the suffix replaced by ``.pdf`` (an existing file of that name
        is overwritten). Sub-directories are not traversed.

        Args:
            directory: Path of the directory to process.

        Returns:
            One dict per processed file, ordered by path. Successful entries
            have ``source_file``, ``output_file`` and ``status == "success"``;
            failed entries have ``source_file``, ``status == "error"`` and
            ``error`` (the exception message).

        Raises:
            FileNotFoundError: ``directory`` does not exist.
        """
        directory_path = Path(directory)
        if not directory_path.exists():
            raise FileNotFoundError(f"Directory not found: {directory}")

        # Take a sorted snapshot of the inputs before writing anything, so
        # the PDFs created below cannot affect which files are processed and
        # the result order is deterministic.
        candidates = sorted(
            path for path in directory_path.glob('*')
            if path.is_file()
            and path.suffix.lower()[1:] in self.supported_extensions
        )

        results = []
        for file_path in candidates:
            try:
                pdf_content = self.convert_to_pdf(file_path)
                output_file = file_path.with_suffix('.pdf')
                # A .pdf input maps onto itself; rewriting it would only
                # truncate and re-create the original file.
                if output_file != file_path:
                    output_file.write_bytes(pdf_content)
            except Exception as e:
                # Batch boundary: record the failure and keep going.
                results.append({
                    "source_file": str(file_path),
                    "status": "error",
                    "error": str(e),
                })
            else:
                results.append({
                    "source_file": str(file_path),
                    "output_file": str(output_file),
                    "status": "success",
                })

        return results