api/modules/page_image_generator.py
2025-11-14 14:47:19 +00:00

224 lines
8.3 KiB
Python

"""
Page Image Generation Module
This module generates full-resolution page images and thumbnails from PDF documents
for use in the document viewer UI.
"""
import io
import uuid
from typing import Dict, List, Any, Tuple
from pathlib import Path
import fitz # PyMuPDF
from PIL import Image
from modules.logger_tool import initialise_logger
import os
# Module-level logger used by every function and method in this file.
# The level and log path are read from the LOG_LEVEL and LOG_PATH environment
# variables (either may be unset, in which case None is passed through).
# NOTE(review): the meaning of the trailing 'default' and True arguments is
# defined by modules.logger_tool.initialise_logger - confirm there.
logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)
class PageImageGenerator:
    """Generates page images and thumbnails from PDF documents.

    Every page is rendered twice with PyMuPDF: once at ``full_image_dpi`` as a
    PNG, and once at ``thumbnail_dpi`` which is then shrunk with Pillow to fit
    inside ``thumbnail_max_width`` x ``thumbnail_max_height`` and encoded as
    WebP. The image bytes are returned to the caller; nothing is written to
    storage by this class.
    """

    def __init__(self):
        # Image generation settings
        self.full_image_dpi = 200  # High quality for full images
        self.thumbnail_dpi = 100  # Lower quality for thumbnails
        self.thumbnail_max_width = 300
        self.thumbnail_max_height = 400
        # The two format values are reported in the artefact's
        # "generation_settings"; the encoders below use PNG / WebP directly.
        self.image_format = "PNG"
        self.thumbnail_format = "WEBP"  # More efficient for thumbnails

    def generate_page_images(self, file_id: str, cabinet_id: str, pdf_bytes: bytes) -> Dict[str, Any]:
        """
        Generate full page images and thumbnails for a PDF document

        Args:
            file_id: File ID
            cabinet_id: Cabinet ID for storage path
            pdf_bytes: PDF file content

        Returns:
            Page images artefact data. Each entry of ``page_images`` carries
            the raw image bytes (``full_image_data`` / ``thumbnail_data``)
            next to the storage paths they are intended for.
        """
        logger.info(f"Starting page image generation for file_id={file_id}")
        artefact_id = str(uuid.uuid4())
        storage_prefix = f"{cabinet_id}/{file_id}/{artefact_id}"
        page_images = []

        # The context manager closes the document even when rendering a page
        # raises, so a failed PDF no longer leaks the open document.
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            page_count = len(doc)
            for page_number, page in enumerate(doc, start=1):
                logger.debug(f"Processing page {page_number}/{page_count} for file_id={file_id}")

                # Generate full resolution image
                full_image_data, full_dimensions = self._generate_full_image(page, page_number)
                # Generate thumbnail
                thumbnail_data, thumbnail_dimensions = self._generate_thumbnail(page, page_number)

                page_images.append({
                    "page": page_number,
                    "full_image_path": f"{storage_prefix}/page_{page_number:03d}_full.png",
                    "full_image_data": full_image_data,  # Will be uploaded separately
                    "full_dimensions": full_dimensions,
                    "thumbnail_path": f"{storage_prefix}/page_{page_number:03d}_thumb.webp",
                    "thumbnail_data": thumbnail_data,  # Will be uploaded separately
                    "thumbnail_dimensions": thumbnail_dimensions,
                    "rotation": page.rotation,
                    "has_text": bool(page.get_text().strip()),
                    "has_images": len(page.get_images()) > 0,
                    "has_drawings": len(page.get_drawings()) > 0,
                })

        artefact_data = {
            "version": 1,
            "file_id": file_id,
            "artefact_id": artefact_id,
            "artefact_type": "page_images",
            "generation_timestamp": self._get_timestamp(),
            "page_count": page_count,
            "page_images": page_images,
            "generation_settings": {
                "full_image_dpi": self.full_image_dpi,
                "thumbnail_dpi": self.thumbnail_dpi,
                "thumbnail_max_width": self.thumbnail_max_width,
                "thumbnail_max_height": self.thumbnail_max_height,
                "image_format": self.image_format,
                "thumbnail_format": self.thumbnail_format,
            },
            "storage_info": {
                "total_full_images": page_count,
                "total_thumbnails": page_count,
                "estimated_storage_mb": self._estimate_storage_size(page_images),
            },
        }
        logger.info(f"Generated {page_count} page images for file_id={file_id}")
        return artefact_data

    def _generate_full_image(self, page: "fitz.Page", page_number: int) -> Tuple[bytes, Dict[str, int]]:
        """Generate full resolution page image

        Args:
            page: Page to render
            page_number: 1-based page number (currently unused here)

        Returns:
            Tuple of (PNG bytes, {"width": ..., "height": ...}) in pixels
        """
        # Scale factor relative to the 72 units per inch used by the page
        zoom = self.full_image_dpi / 72
        # Render page to pixmap
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        # Convert to PNG bytes
        img_data = pix.tobytes("png")
        dimensions = {
            "width": pix.width,
            "height": pix.height,
        }
        del pix  # Free memory
        return img_data, dimensions

    def _generate_thumbnail(self, page: "fitz.Page", page_number: int) -> Tuple[bytes, Dict[str, int]]:
        """Generate thumbnail image

        Args:
            page: Page to render
            page_number: 1-based page number (currently unused here)

        Returns:
            Tuple of (WebP bytes, {"width": ..., "height": ...}) in pixels,
            where the dimensions are those of the resized thumbnail
        """
        # Scale factor relative to the 72 units per inch used by the page
        zoom = self.thumbnail_dpi / 72
        # Render page to pixmap
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
        # Hand the render over to PIL as PNG bytes for resizing
        img_data = pix.tobytes("png")
        del pix  # Free memory before the PIL copy is created

        # The context manager closes the PIL image even if resizing or
        # encoding fails.
        with Image.open(io.BytesIO(img_data)) as pil_image:
            # Resize in place to fit the bounding box while maintaining aspect ratio
            pil_image.thumbnail(
                (self.thumbnail_max_width, self.thumbnail_max_height),
                Image.Resampling.LANCZOS,
            )
            # Convert to WebP for better compression
            thumbnail_buffer = io.BytesIO()
            pil_image.save(thumbnail_buffer, format="WEBP", quality=85, optimize=True)
            dimensions = {
                "width": pil_image.width,
                "height": pil_image.height,
            }
        return thumbnail_buffer.getvalue(), dimensions

    def _estimate_storage_size(self, page_images: List[Dict]) -> float:
        """Estimate total storage size in MB

        This is a heuristic based on pixel counts, not on the actual length of
        the encoded image bytes.

        Args:
            page_images: Page entries holding "full_dimensions" and
                "thumbnail_dimensions" dicts with "width" and "height"

        Returns:
            Estimated size in MB, rounded to two decimals (0.0 for no pages)
        """
        total_bytes = 0
        for page_info in page_images:
            # Estimate full image size (PNG is roughly 3-4 bytes per pixel)
            full_dims = page_info["full_dimensions"]
            full_size = full_dims["width"] * full_dims["height"] * 3.5
            # Estimate thumbnail size (WebP is much more efficient)
            thumb_dims = page_info["thumbnail_dimensions"]
            thumb_size = thumb_dims["width"] * thumb_dims["height"] * 0.5
            total_bytes += full_size + thumb_size
        return round(total_bytes / (1024 * 1024), 2)  # Convert to MB

    def _get_timestamp(self) -> str:
        """Get current UTC timestamp in ISO format with a trailing "Z" """
        import datetime

        # datetime.utcnow() is deprecated; take an aware UTC time and drop the
        # tzinfo so isoformat() does not emit "+00:00" before the "Z" suffix.
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        return now_utc.replace(tzinfo=None).isoformat() + "Z"

    def generate_single_page_image(self, pdf_bytes: bytes, page_number: int,
                                   image_type: str = "full") -> Tuple[bytes, Dict[str, int]]:
        """
        Generate a single page image (for on-demand generation)

        Args:
            pdf_bytes: PDF file content
            page_number: Page number (1-based)
            image_type: "full" or "thumbnail"; any value other than
                "thumbnail" produces the full image

        Returns:
            Tuple of (image_bytes, dimensions)

        Raises:
            ValueError: If page_number is outside 1..page count
        """
        with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
            # Read the page count while the document is still open; it is
            # needed for the error message below.
            page_count = len(doc)
            if page_number < 1 or page_number > page_count:
                raise ValueError(f"Page number {page_number} is out of range (1-{page_count})")

            page = doc[page_number - 1]  # Convert to 0-based index
            if image_type == "thumbnail":
                return self._generate_thumbnail(page, page_number)
            return self._generate_full_image(page, page_number)
def create_page_images_artefact(file_id: str, cabinet_id: str, pdf_bytes: bytes) -> Dict[str, Any]:
    """
    Build the page images artefact for a PDF with a fresh PageImageGenerator

    Convenience wrapper: instantiates a generator with its default settings
    and delegates to its generate_page_images method.

    Args:
        file_id: File ID
        cabinet_id: Cabinet ID
        pdf_bytes: PDF file content

    Returns:
        Page images artefact data
    """
    return PageImageGenerator().generate_page_images(file_id, cabinet_id, pdf_bytes)