43 lines
1.3 KiB
Python
43 lines
1.3 KiB
Python
from pathlib import Path
|
|
import PyPDF2
|
|
from typing import Optional
|
|
|
|
class PDFUtils:
    """Stateless helpers for reading text and metadata from PDF files via PyPDF2."""

    @staticmethod
    def extract_text_from_pdf(pdf_file: Path) -> str:
        """
        Extract text content from a PDF file.

        Args:
            pdf_file: Location of the PDF. A plain string path is tolerated
                and converted to ``Path`` before use.

        Returns:
            The extracted text of every page in document order, each page
            followed by a newline. Empty string when the PDF has no pages.

        Raises:
            FileNotFoundError: If ``pdf_file`` does not exist.
        """
        # Coerce so callers passing a str path get the same behavior
        # (and the same FileNotFoundError) as callers passing a Path.
        pdf_file = Path(pdf_file)
        if not pdf_file.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_file}")

        with open(pdf_file, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # Pages are read while the file is still open. The `or ""` is
            # defensive: if a page were to yield None instead of a string,
            # the concatenation would otherwise raise TypeError.
            return "".join(
                (page.extract_text() or "") + "\n"
                for page in pdf_reader.pages
            )

    @staticmethod
    def get_pdf_metadata(pdf_file: Path) -> dict:
        """
        Get metadata from a PDF file.

        Args:
            pdf_file: Location of the PDF. A plain string path is tolerated
                and converted to ``Path`` before use.

        Returns:
            A dict holding every truthy metadata entry of the document with
            its value converted to ``str``, plus a ``'num_pages'`` key with
            the page count as an ``int``.

        Raises:
            FileNotFoundError: If ``pdf_file`` does not exist.
        """
        pdf_file = Path(pdf_file)
        if not pdf_file.exists():
            raise FileNotFoundError(f"PDF file not found: {pdf_file}")

        with open(pdf_file, 'rb') as file:
            pdf_reader = PyPDF2.PdfReader(file)
            # The reader may report no metadata at all; treat that as empty.
            metadata = pdf_reader.metadata or {}

            # Keep only entries that carry a value, normalised to strings.
            result = {
                key: str(value)
                for key, value in metadata.items()
                if value
            }

            # Add additional information; counted while the file is open.
            result['num_pages'] = len(pdf_reader.pages)

        return result