224 lines
8.3 KiB
Python
224 lines
8.3 KiB
Python
"""
Page Image Generation Module

This module generates full-resolution page images and thumbnails from PDF documents
for use in the document viewer UI.
"""
|
|
|
|
import io
|
|
import uuid
|
|
from typing import Dict, List, Any, Tuple
|
|
from pathlib import Path
|
|
import fitz # PyMuPDF
|
|
from PIL import Image
|
|
from modules.logger_tool import initialise_logger
|
|
import os
|
|
|
|
# Module-level logger; level and log path are taken from the environment
# (LOG_LEVEL / LOG_PATH), which may be unset (os.getenv then yields None).
logger = initialise_logger(
    __name__,
    os.getenv("LOG_LEVEL"),
    os.getenv("LOG_PATH"),
    'default',
    True,
)
|
|
|
|
class PageImageGenerator:
    """Generates page images and thumbnails from PDF documents.

    Pages are rendered with PyMuPDF (``fitz``); thumbnails are additionally
    downscaled and re-encoded with Pillow. The encoded image bytes are
    returned to the caller inside the result structures; this class does not
    write them anywhere itself.
    """

    def __init__(self):
        # Image generation settings
        self.full_image_dpi = 200  # High quality for full images
        self.thumbnail_dpi = 100  # Lower quality for thumbnails
        self.thumbnail_max_width = 300
        self.thumbnail_max_height = 400
        self.image_format = "PNG"
        self.thumbnail_format = "WEBP"  # More efficient for thumbnails

    def generate_page_images(self, file_id: str, cabinet_id: str, pdf_bytes: bytes) -> Dict[str, Any]:
        """
        Generate full page images and thumbnails for a PDF document

        Args:
            file_id: File ID
            cabinet_id: Cabinet ID for storage path
            pdf_bytes: PDF file content

        Returns:
            Page images artefact data: a dict with identification fields
            (``file_id``, a freshly generated ``artefact_id``), a
            ``page_images`` list with one entry per page (paths, encoded
            image bytes, dimensions and page flags), the
            ``generation_settings`` used and a ``storage_info`` summary.
        """
        logger.info(f"Starting page image generation for file_id={file_id}")

        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            page_count = len(doc)
            artefact_id = str(uuid.uuid4())
            # All images of one artefact share this storage path prefix.
            path_prefix = f"{cabinet_id}/{file_id}/{artefact_id}"

            page_images = []
            for page_index in range(page_count):
                page_number = page_index + 1  # Pages are reported 1-based
                logger.debug(f"Processing page {page_number}/{page_count} for file_id={file_id}")
                page_images.append(
                    self._build_page_info(doc[page_index], page_number, path_prefix)
                )
        finally:
            # Always release the document, even when rendering a page fails.
            doc.close()

        artefact_data = {
            "version": 1,
            "file_id": file_id,
            "artefact_id": artefact_id,
            "artefact_type": "page_images",
            "generation_timestamp": self._get_timestamp(),
            "page_count": page_count,
            "page_images": page_images,
            "generation_settings": {
                "full_image_dpi": self.full_image_dpi,
                "thumbnail_dpi": self.thumbnail_dpi,
                "thumbnail_max_width": self.thumbnail_max_width,
                "thumbnail_max_height": self.thumbnail_max_height,
                "image_format": self.image_format,
                "thumbnail_format": self.thumbnail_format,
            },
            "storage_info": {
                "total_full_images": page_count,
                "total_thumbnails": page_count,
                "estimated_storage_mb": self._estimate_storage_size(page_images),
            },
        }

        logger.info(f"Generated {page_count} page images for file_id={file_id}")
        return artefact_data

    def _build_page_info(self, page: "fitz.Page", page_number: int, path_prefix: str) -> Dict[str, Any]:
        """Render one page (full image + thumbnail) and collect its metadata."""
        full_image_data, full_dimensions = self._generate_full_image(page, page_number)
        thumbnail_data, thumbnail_dimensions = self._generate_thumbnail(page, page_number)

        # File extensions follow the configured formats ("png" / "webp" by
        # default) so the paths agree with the reported generation settings.
        full_ext = self.image_format.lower()
        thumb_ext = self.thumbnail_format.lower()

        return {
            "page": page_number,
            "full_image_path": f"{path_prefix}/page_{page_number:03d}_full.{full_ext}",
            "full_image_data": full_image_data,  # Will be uploaded separately
            "full_dimensions": full_dimensions,
            "thumbnail_path": f"{path_prefix}/page_{page_number:03d}_thumb.{thumb_ext}",
            "thumbnail_data": thumbnail_data,  # Will be uploaded separately
            "thumbnail_dimensions": thumbnail_dimensions,
            "rotation": page.rotation,
            "has_text": bool(page.get_text().strip()),
            "has_images": len(page.get_images()) > 0,
            "has_drawings": len(page.get_drawings()) > 0,
        }

    def _generate_full_image(self, page: "fitz.Page", page_number: int) -> Tuple[bytes, Dict[str, int]]:
        """Generate full resolution page image.

        Args:
            page: Page to render
            page_number: 1-based page number (not used by the rendering
                itself; kept so both generators share one signature)

        Returns:
            Tuple of (encoded image bytes, {"width", "height"} in pixels)
        """
        # PDF user space is 72 units per inch, so dpi / 72 is the zoom factor.
        zoom = self.full_image_dpi / 72
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

        img_data = pix.tobytes(self.image_format.lower())
        dimensions = {"width": pix.width, "height": pix.height}
        return img_data, dimensions

    def _generate_thumbnail(self, page: "fitz.Page", page_number: int) -> Tuple[bytes, Dict[str, int]]:
        """Generate thumbnail image.

        The page is rendered at ``thumbnail_dpi`` and then shrunk (aspect
        ratio preserved) to fit within ``thumbnail_max_width`` x
        ``thumbnail_max_height`` before being encoded as ``thumbnail_format``.

        Args:
            page: Page to render
            page_number: 1-based page number (not used by the rendering itself)

        Returns:
            Tuple of (encoded thumbnail bytes, {"width", "height"} in pixels)
        """
        zoom = self.thumbnail_dpi / 72
        pix = page.get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)

        # Hand the render over to Pillow via PNG bytes, then drop the pixmap
        # so it is not kept alive during resizing and encoding.
        png_bytes = pix.tobytes("png")
        del pix

        max_size = (self.thumbnail_max_width, self.thumbnail_max_height)
        # The context manager closes the image even if resizing/saving fails.
        with Image.open(io.BytesIO(png_bytes)) as pil_image:
            # thumbnail() resizes in place and never enlarges the image.
            pil_image.thumbnail(max_size, Image.Resampling.LANCZOS)

            thumbnail_buffer = io.BytesIO()
            pil_image.save(thumbnail_buffer, format=self.thumbnail_format, quality=85, optimize=True)

            dimensions = {"width": pil_image.width, "height": pil_image.height}

        return thumbnail_buffer.getvalue(), dimensions

    def _estimate_storage_size(self, page_images: List[Dict]) -> float:
        """Estimate total storage size in MB.

        This is a rough pixel-count heuristic (3.5 bytes per full-image
        pixel, 0.5 bytes per thumbnail pixel); it does not measure the
        actual encoded byte lengths.

        Args:
            page_images: Page entries carrying "full_dimensions" and
                "thumbnail_dimensions" dicts with "width" and "height"

        Returns:
            Estimated size in MB, rounded to two decimals (0 for no pages)
        """
        total_bytes = 0

        for page_info in page_images:
            full_dims = page_info["full_dimensions"]
            thumb_dims = page_info["thumbnail_dimensions"]

            # Estimate full image size (PNG is roughly 3-4 bytes per pixel)
            full_size = full_dims["width"] * full_dims["height"] * 3.5
            # Estimate thumbnail size (WebP is much more efficient)
            thumb_size = thumb_dims["width"] * thumb_dims["height"] * 0.5

            total_bytes += full_size + thumb_size

        return round(total_bytes / (1024 * 1024), 2)  # Convert to MB

    def _get_timestamp(self) -> str:
        """Get current UTC timestamp in ISO format with a trailing "Z"."""
        import datetime

        # utcnow() is deprecated; take an aware UTC time and drop the tzinfo
        # so isoformat() emits no "+00:00" offset before the "Z" suffix.
        now_utc = datetime.datetime.now(datetime.timezone.utc)
        return now_utc.replace(tzinfo=None).isoformat() + "Z"

    def generate_single_page_image(self, pdf_bytes: bytes, page_number: int,
                                   image_type: str = "full") -> Tuple[bytes, Dict[str, int]]:
        """
        Generate a single page image (for on-demand generation)

        Args:
            pdf_bytes: PDF file content
            page_number: Page number (1-based)
            image_type: "thumbnail" for a thumbnail; any other value
                produces the full-resolution image

        Returns:
            Tuple of (image_bytes, dimensions)

        Raises:
            ValueError: If page_number is outside 1..page count
        """
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            # Read the page count while the document is still open; the error
            # message below needs it and must not query a closed document.
            page_count = len(doc)
            if page_number < 1 or page_number > page_count:
                raise ValueError(f"Page number {page_number} is out of range (1-{page_count})")

            page = doc[page_number - 1]  # Convert to 0-based index

            if image_type == "thumbnail":
                return self._generate_thumbnail(page, page_number)
            return self._generate_full_image(page, page_number)
        finally:
            # Runs on success, on the range error and on rendering failures.
            doc.close()
|
|
|
|
def create_page_images_artefact(file_id: str, cabinet_id: str, pdf_bytes: bytes) -> Dict[str, Any]:
    """
    Build the page images artefact for one PDF document.

    Convenience wrapper that creates a new PageImageGenerator and delegates
    to its generate_page_images method.

    Args:
        file_id: File ID
        cabinet_id: Cabinet ID
        pdf_bytes: PDF file content

    Returns:
        Page images artefact data, exactly as returned by
        PageImageGenerator.generate_page_images
    """
    return PageImageGenerator().generate_page_images(file_id, cabinet_id, pdf_bytes)
|