# api/modules/pdf_utils.py
# 2025-07-11 13:52:19 +00:00

# 43 lines
# 1.3 KiB
# Python

from pathlib import Path
import PyPDF2
from typing import Optional
class PDFUtils:
    """Namespace of static helpers for reading text and metadata from PDFs."""

    @staticmethod
    def extract_text_from_pdf(pdf_file: Path) -> str:
        """Extract the text content of every page of a PDF file.

        Args:
            pdf_file: Location of the PDF. A plain string path is accepted
                as well and is converted to a ``Path``.

        Returns:
            The text of all pages in document order, each page's text
            followed by a newline. A page that yields no text contributes
            only the newline.

        Raises:
            FileNotFoundError: If ``pdf_file`` does not exist.
        """
        pdf_path = Path(pdf_file)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_file}")

        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Extract while the file handle is still open; ``or ""`` guards
            # against a page whose extraction yields None instead of a str.
            page_texts = [
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            ]
        return "".join(page_texts)

    @staticmethod
    def get_pdf_metadata(pdf_file: Path) -> dict:
        """Get the document metadata and page count of a PDF file.

        Args:
            pdf_file: Location of the PDF. A plain string path is accepted
                as well and is converted to a ``Path``.

        Returns:
            A dictionary holding every metadata entry reported by the reader
            whose value is truthy, with the value converted to ``str``, plus
            a ``'num_pages'`` entry holding the page count as an ``int``.

        Raises:
            FileNotFoundError: If ``pdf_file`` does not exist.
        """
        pdf_path = Path(pdf_file)
        if not pdf_path.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_file}")

        with open(pdf_path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # The reader may report no metadata at all; treat that as empty.
            metadata = pdf_reader.metadata or {}
            # Keep only entries that carry a value, normalised to strings.
            result = {
                key: str(value)
                for key, value in metadata.items()
                if value
            }
            # Count pages while the underlying file handle is still open.
            result['num_pages'] = len(pdf_reader.pages)
        return result