"""
Page Image Generation Module

This module generates full-resolution page images and thumbnails from PDF
documents for use in the document viewer UI.
"""

import datetime
import io
import os
import uuid
from pathlib import Path
from typing import Any, Dict, List, Tuple

import fitz  # PyMuPDF
from PIL import Image

from modules.logger_tool import initialise_logger

logger = initialise_logger(__name__, os.getenv("LOG_LEVEL"), os.getenv("LOG_PATH"), 'default', True)


class PageImageGenerator:
    """Generates page images and thumbnails from PDF documents."""

    def __init__(self):
        # Image generation settings
        self.full_image_dpi = 200  # High quality for full images
        self.thumbnail_dpi = 100  # Lower quality for thumbnails
        self.thumbnail_max_width = 300
        self.thumbnail_max_height = 400
        self.image_format = "PNG"
        self.thumbnail_format = "WEBP"  # More efficient for thumbnails

    def generate_page_images(self, file_id: str, cabinet_id: str, pdf_bytes: bytes) -> Dict[str, Any]:
        """
        Generate full page images and thumbnails for a PDF document.

        Every page is rendered twice (full resolution and thumbnail). The
        encoded bytes are returned inside the artefact under
        ``full_image_data`` / ``thumbnail_data`` together with the storage
        paths they are meant to be uploaded to; this method does not upload
        anything itself.

        Args:
            file_id: File ID
            cabinet_id: Cabinet ID for storage path
            pdf_bytes: PDF file content

        Returns:
            Page images artefact data
        """
        logger.info(f"Starting page image generation for file_id={file_id}")

        page_images = []
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            page_count = len(doc)
            artefact_id = str(uuid.uuid4())
            storage_prefix = f"{cabinet_id}/{file_id}/{artefact_id}"
            # File extensions follow the configured formats so that paths and
            # encoded data cannot drift apart ("png" / "webp" by default).
            full_ext = self.image_format.lower()
            thumb_ext = self.thumbnail_format.lower()

            for page_number, page in enumerate(doc, start=1):
                logger.debug(f"Processing page {page_number}/{page_count} for file_id={file_id}")

                # Generate full resolution image
                full_image_data, full_dimensions = self._generate_full_image(page, page_number)
                full_image_path = f"{storage_prefix}/page_{page_number:03d}_full.{full_ext}"

                # Generate thumbnail
                thumbnail_data, thumbnail_dimensions = self._generate_thumbnail(page, page_number)
                thumbnail_path = f"{storage_prefix}/page_{page_number:03d}_thumb.{thumb_ext}"

                page_images.append({
                    "page": page_number,
                    "full_image_path": full_image_path,
                    "full_image_data": full_image_data,  # Will be uploaded separately
                    "full_dimensions": full_dimensions,
                    "thumbnail_path": thumbnail_path,
                    "thumbnail_data": thumbnail_data,  # Will be uploaded separately
                    "thumbnail_dimensions": thumbnail_dimensions,
                    "rotation": page.rotation,
                    "has_text": bool(page.get_text().strip()),
                    "has_images": len(page.get_images()) > 0,
                    "has_drawings": len(page.get_drawings()) > 0,
                })
        finally:
            # Release the document even when rendering a page fails.
            doc.close()

        artefact_data = {
            "version": 1,
            "file_id": file_id,
            "artefact_id": artefact_id,
            "artefact_type": "page_images",
            "generation_timestamp": self._get_timestamp(),
            "page_count": page_count,
            "page_images": page_images,
            "generation_settings": {
                "full_image_dpi": self.full_image_dpi,
                "thumbnail_dpi": self.thumbnail_dpi,
                "thumbnail_max_width": self.thumbnail_max_width,
                "thumbnail_max_height": self.thumbnail_max_height,
                "image_format": self.image_format,
                "thumbnail_format": self.thumbnail_format,
            },
            "storage_info": {
                "total_full_images": page_count,
                "total_thumbnails": page_count,
                "estimated_storage_mb": self._estimate_storage_size(page_images),
            },
        }

        logger.info(f"Generated {page_count} page images for file_id={file_id}")
        return artefact_data

    def _generate_full_image(self, page: fitz.Page, page_number: int) -> Tuple[bytes, Dict[str, int]]:
        """
        Generate full resolution page image.

        Args:
            page: Page to render
            page_number: Page number (1-based); not used by the rendering

        Returns:
            Tuple of (image_bytes, {"width": ..., "height": ...}) in pixels
        """
        # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor
        zoom = self.full_image_dpi / 72
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

        # Encode with the configured format ("png" by default)
        img_data = pix.tobytes(self.image_format.lower())
        dimensions = {
            "width": pix.width,
            "height": pix.height,
        }
        return img_data, dimensions

    def _generate_thumbnail(self, page: fitz.Page, page_number: int) -> Tuple[bytes, Dict[str, int]]:
        """
        Generate thumbnail image.

        The page is rendered at the thumbnail DPI and then shrunk to fit
        within thumbnail_max_width x thumbnail_max_height while keeping the
        aspect ratio.

        Args:
            page: Page to render
            page_number: Page number (1-based); not used by the rendering

        Returns:
            Tuple of (image_bytes, {"width": ..., "height": ...}) in pixels
        """
        # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor
        zoom = self.thumbnail_dpi / 72
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

        # Hand the rendering over to PIL through a lossless PNG buffer
        png_data = pix.tobytes("png")
        pix = None  # Free memory before resizing

        thumbnail_buffer = io.BytesIO()
        with Image.open(io.BytesIO(png_data)) as pil_image:
            # thumbnail() resizes in place and never enlarges the image
            pil_image.thumbnail(
                (self.thumbnail_max_width, self.thumbnail_max_height),
                Image.Resampling.LANCZOS,
            )
            # Encode with the configured format (WebP by default)
            pil_image.save(thumbnail_buffer, format=self.thumbnail_format, quality=85, optimize=True)
            dimensions = {
                "width": pil_image.width,
                "height": pil_image.height,
            }

        return thumbnail_buffer.getvalue(), dimensions

    def _estimate_storage_size(self, page_images: List[Dict]) -> float:
        """
        Estimate total storage size in MB.

        This is a heuristic based on pixel counts, not on the size of the
        encoded data: 3.5 bytes per pixel for full images and 0.5 bytes per
        pixel for thumbnails.

        Args:
            page_images: Page entries carrying "full_dimensions" and
                "thumbnail_dimensions"

        Returns:
            Estimated size in MB, rounded to two decimals
        """
        total_bytes = 0
        for page_info in page_images:
            full_dims = page_info["full_dimensions"]
            thumb_dims = page_info["thumbnail_dimensions"]
            full_size = full_dims["width"] * full_dims["height"] * 3.5
            thumb_size = thumb_dims["width"] * thumb_dims["height"] * 0.5
            total_bytes += full_size + thumb_size

        return round(total_bytes / (1024 * 1024), 2)  # Convert to MB

    def _get_timestamp(self) -> str:
        """Get current UTC timestamp in ISO format with a trailing "Z"."""
        # datetime.utcnow() is deprecated; take an aware UTC time and drop the
        # offset so the output keeps the "<naive ISO timestamp>Z" shape.
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        return now_utc.replace(tzinfo=None).isoformat() + "Z"

    def generate_single_page_image(self, pdf_bytes: bytes, page_number: int,
                                   image_type: str = "full") -> Tuple[bytes, Dict[str, int]]:
        """
        Generate a single page image (for on-demand generation).

        Args:
            pdf_bytes: PDF file content
            page_number: Page number (1-based)
            image_type: "thumbnail" for a thumbnail; any other value
                produces the full image

        Returns:
            Tuple of (image_bytes, dimensions)

        Raises:
            ValueError: If page_number is outside 1..page count
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            # Read the page count while the document is still open; it cannot
            # be queried after close(), which used to break the error message.
            page_count = len(doc)
            if not 1 <= page_number <= page_count:
                raise ValueError(f"Page number {page_number} is out of range (1-{page_count})")

            page = doc[page_number - 1]  # Convert to 0-based index
            if image_type == "thumbnail":
                return self._generate_thumbnail(page, page_number)
            return self._generate_full_image(page, page_number)
        finally:
            # Release the document on success and on every error path.
            doc.close()


def create_page_images_artefact(file_id: str, cabinet_id: str, pdf_bytes: bytes) -> Dict[str, Any]:
    """
    Create page images artefact for a PDF document.

    Convenience wrapper that builds a PageImageGenerator with its default
    settings and runs generate_page_images.

    Args:
        file_id: File ID
        cabinet_id: Cabinet ID
        pdf_bytes: PDF file content

    Returns:
        Page images artefact data
    """
    generator = PageImageGenerator()
    return generator.generate_page_images(file_id, cabinet_id, pdf_bytes)