#!/usr/bin/env python3
"""
Simple test script for split_map.py functionality.
Run this to verify the split map generator works correctly.

Each check prints its own pass/fail line; nothing is asserted, so the script
always runs to completion. Checks whose helpers cannot be imported from
``routers.database.files.split_map`` report that and are skipped.
"""

import json
import tempfile
import uuid
from pathlib import Path


# Mock implementations for testing without full environment
class MockStorage:
    """In-memory stand-in for a storage backend, keyed by ``"bucket/path"``."""

    def __init__(self):
        # Maps "bucket/path" -> stored payload.
        self.files = {}

    def download_file(self, bucket, path):
        """Return the stored payload, or ``b"{}"`` when nothing was uploaded."""
        return self.files.get(f"{bucket}/{path}", b"{}")

    def upload_file(self, bucket, path, data, mime, upsert=True):
        """Store ``data`` under ``bucket/path``; ``mime`` and ``upsert`` are ignored."""
        self.files[f"{bucket}/{path}"] = data


class MockClient:
    """Chainable stand-in for a query-builder style database client.

    ``table()`` records which table is being addressed; the filter/modifier
    methods are no-ops that return ``self`` so calls can be chained, and
    ``execute()`` returns an object exposing a ``data`` attribute.
    """

    def __init__(self):
        self.files = []
        # Rows passed to insert() while 'document_artefacts' is selected.
        self.artefacts = []
        # Initialised here so execute()/insert() are safe to call before
        # table(); previously that raised AttributeError.
        self.current_table = None

    @property
    def supabase(self):
        """Return the client itself, so ``client.supabase.table(...)`` works."""
        return self

    def table(self, name):
        """Select the table that later ``insert()``/``execute()`` calls act on."""
        self.current_table = name
        return self

    def select(self, cols):
        return self

    def eq(self, col, val):
        return self

    def single(self):
        return self

    def order(self, col, desc=False):
        return self

    @staticmethod
    def _result(data):
        """Build a throwaway result object carrying ``data``."""
        return type('Result', (), {'data': data})()

    def execute(self):
        """Return a canned result for the currently selected table.

        'files' yields a fixed file record, 'document_artefacts' yields the
        list of inserted rows, and anything else (including no table selected
        yet) yields an empty list.
        """
        if self.current_table == 'files':
            return self._result({
                'id': 'test-file-id',
                'bucket': 'test-bucket',
                'cabinet_id': 'test-cabinet',
                'name': 'test.pdf',
                'path': 'test/path.pdf',
                'mime_type': 'application/pdf',
            })
        if self.current_table == 'document_artefacts':
            return self._result(self.artefacts)
        return self._result([])

    def insert(self, data):
        """Record ``data`` when 'document_artefacts' is selected; otherwise a no-op."""
        if self.current_table == 'document_artefacts':
            self.artefacts.append(data)
        return self


def test_outline_extraction():
    """Test PDF outline extraction (requires PyMuPDF)."""
    print("Testing PDF outline extraction...")

    try:
        from routers.database.files.split_map import _try_outline

        # Create a simple PDF with bookmarks (this would need actual PDF bytes in real usage)
        # For now, just test the import and function signature
        result = _try_outline(b"dummy pdf bytes")
        print(f"Outline extraction result: {result}")
    except ImportError:
        print("PyMuPDF not available - outline extraction will be skipped")
    except Exception as e:
        # Best-effort smoke test: report the problem and let the other checks run.
        print(f"Outline extraction test failed: {e}")


def test_heading_extraction():
    """Test heading extraction from Docling JSON."""
    print("\nTesting heading extraction...")

    try:
        from routers.database.files.split_map import _try_headings
    except ImportError as e:
        # Same tolerance as the outline check: a missing module skips this
        # check instead of aborting the whole run.
        print(f"split_map helpers not available - heading extraction test skipped: {e}")
        return

    # Mock Docling JSON with headings
    docling_json = {
        "blocks": [
            {
                "role": "heading",
                "text": "Chapter 1: Introduction",
                "page": 1,
                "type": "h1",
            },
            {
                "role": "paragraph",
                "text": "Some content...",
                "page": 1,
            },
            {
                "role": "heading",
                "text": "Chapter 2: Methods",
                "page": 15,
                "type": "h1",
            },
            {
                "role": "heading",
                "text": "Chapter 3: Results",
                "page": 30,
                "type": "h1",
            },
        ]
    }

    result = _try_headings(docling_json)
    print(f"Headings extraction result: {result}")

    expected = [
        ("Chapter 1: Introduction", 1, 1),
        ("Chapter 2: Methods", 15, 1),
        ("Chapter 3: Results", 30, 1),
    ]
    if result == expected:
        print("✓ Heading extraction test passed")
    else:
        print(f"✗ Heading extraction test failed. Expected: {expected}, Got: {result}")


def test_toc_extraction():
    """Test TOC extraction from Tika text."""
    print("\nTesting TOC extraction...")

    try:
        from routers.database.files.split_map import _try_toc_text
    except ImportError as e:
        print(f"split_map helpers not available - TOC extraction test skipped: {e}")
        return

    # Mock Tika text with TOC (one dotted-leader entry per line)
    tika_text = """
    Table of Contents
    Introduction .................. 1
    Chapter 1: Getting Started .... 5
    Chapter 2: Advanced Topics .... 15
    Chapter 3: Examples ........... 25
    Chapter 4: Conclusion ......... 35
    Appendix A .................... 40
    Index ......................... 45
    Some other content follows...
    """

    result = _try_toc_text(tika_text)
    print(f"TOC extraction result: {result}")

    if result and len(result) >= 5:
        print("✓ TOC extraction test passed")
    else:
        print("✗ TOC extraction test failed")


def test_entries_building():
    """Test building entries from start points."""
    print("\nTesting entries building...")

    try:
        from routers.database.files.split_map import _entries_from_starts, _entries_from_pairs
    except ImportError as e:
        print(f"split_map helpers not available - entry building test skipped: {e}")
        return

    # Test starts format
    starts = [("Chapter 1", 1, 1), ("Chapter 2", 15, 1), ("Chapter 3", 30, 1)]
    entries1 = _entries_from_starts(starts, 50, "headings")
    print(f"Entries from starts: {len(entries1)} entries")

    # Test pairs format
    pairs = [("Introduction", 1), ("Methods", 15), ("Results", 30)]
    entries2 = _entries_from_pairs(pairs, 50, "outline")
    print(f"Entries from pairs: {len(entries2)} entries")

    # Verify structure (only the first entry built from starts is inspected)
    required_keys = ['id', 'title', 'start_page', 'end_page', 'confidence']
    if entries1 and all(key in entries1[0] for key in required_keys):
        print("✓ Entry building test passed")
    else:
        print("✗ Entry building test failed")


def test_tiny_section_merging():
    """Test merging of tiny sections."""
    print("\nTesting tiny section merging...")

    try:
        from routers.database.files.split_map import _entries_from_pairs
    except ImportError as e:
        print(f"split_map helpers not available - tiny section merging test skipped: {e}")
        return

    # Create pairs with tiny sections (1-2 pages)
    pairs = [("Chapter 1", 1), ("Short Section", 2), ("Another Short", 3), ("Chapter 2", 15)]
    entries = _entries_from_pairs(pairs, 50, "test")

    print(f"Original pairs: {len(pairs)}, Final entries: {len(entries)}")

    # Should merge tiny sections
    if len(entries) < len(pairs):
        print("✓ Tiny section merging test passed")
    else:
        print("✗ Tiny section merging test failed")


def main():
    """Run all tests."""
    print("Running split_map.py tests...\n")

    test_outline_extraction()
    test_heading_extraction()
    test_toc_extraction()
    test_entries_building()
    test_tiny_section_merging()

    print("\nTests completed!")


if __name__ == "__main__":
    main()