from pathlib import Path
import PyPDF2
from typing import Optional, Union


class PDFUtils:
    """Static helpers for reading text and metadata out of PDF files."""

    @staticmethod
    def _existing_pdf_path(pdf_file: Union[str, Path]) -> Path:
        """
        Return ``pdf_file`` as a ``Path`` after checking that it exists.

        Args:
            pdf_file: Location of the PDF, as a ``Path`` or a plain string.

        Returns:
            The same location as a ``Path`` object.

        Raises:
            FileNotFoundError: If nothing exists at ``pdf_file``.
        """
        path = Path(pdf_file)
        if not path.exists():
            # Format the caller's original argument so the message is
            # unchanged for callers that already pass a Path.
            raise FileNotFoundError(f"PDF file not found: {pdf_file}")
        return path

    @staticmethod
    def extract_text_from_pdf(pdf_file: Union[str, Path]) -> str:
        """
        Extract text content from a PDF file

        The text of every page is followed by a single newline, so the
        result for a document with no pages is the empty string.

        Args:
            pdf_file: Location of the PDF, as a ``Path`` or a plain string.

        Returns:
            The concatenated text of all pages, in page order.

        Raises:
            FileNotFoundError: If nothing exists at ``pdf_file``.
        """
        path = PDFUtils._existing_pdf_path(pdf_file)

        # Extraction must happen while the file is still open: the reader
        # is built on top of the open file handle.
        with open(path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Join once instead of growing a string page by page.
            # ``or ""`` is defensive: a page that yields no text object is
            # treated as empty instead of raising TypeError.
            return "".join(
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            )

    @staticmethod
    def get_pdf_metadata(pdf_file: Union[str, Path]) -> dict:
        """
        Get metadata from a PDF file

        Args:
            pdf_file: Location of the PDF, as a ``Path`` or a plain string.

        Returns:
            A dictionary holding every truthy metadata entry of the
            document converted with ``str()``, plus the key
            ``'num_pages'`` mapped to the page count as an ``int``.

        Raises:
            FileNotFoundError: If nothing exists at ``pdf_file``.
        """
        path = PDFUtils._existing_pdf_path(pdf_file)

        with open(path, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # The reader may report no metadata at all; fall back to an
            # empty mapping so the comprehension below still works.
            metadata = pdf_reader.metadata or {}

            # Convert metadata to a plain dictionary of strings, skipping
            # entries whose value is empty/falsy.
            result = {
                key: str(value)
                for key, value in metadata.items()
                if value
            }

            # Add additional information
            result['num_pages'] = len(pdf_reader.pages)
            return result