# api/test_split_map.py

#!/usr/bin/env python3
"""
Simple test script for split_map.py functionality.
Run this to verify the split map generator works correctly.
"""

import json
import tempfile
import uuid
from pathlib import Path

# Mock implementations for testing without full environment
class MockStorage:
    """In-memory stand-in for a bucket-based file store.

    Payloads live in the ``files`` dict under ``"<bucket>/<path>"`` keys.
    """

    def __init__(self):
        self.files = {}

    @staticmethod
    def _object_key(bucket, path):
        """Return the dict key under which ``bucket``/``path`` is stored."""
        return f"{bucket}/{path}"

    def download_file(self, bucket, path):
        """Return the stored payload, or ``b"{}"`` if nothing was uploaded."""
        key = self._object_key(bucket, path)
        if key in self.files:
            return self.files[key]
        return b"{}"

    def upload_file(self, bucket, path, data, mime, upsert=True):
        """Store ``data``; ``mime`` and ``upsert`` are accepted but ignored."""
        key = self._object_key(bucket, path)
        self.files[key] = data

class MockClient:
    """Minimal chainable fake of a Supabase-style client.

    Every query-builder method returns ``self`` so calls can be chained,
    e.g. ``client.supabase.table(...).select(...).eq(...).execute()``.
    Only the most recently selected table name is tracked; column lists,
    filters and ordering arguments are accepted and ignored.
    """

    def __init__(self):
        self.files = []
        self.artefacts = []
        # Set by table(). Initialised here so execute()/insert() can be
        # called on a fresh client without raising AttributeError.
        self.current_table = None

    @staticmethod
    def _result(data):
        """Wrap ``data`` in a throwaway object exposing it as ``.data``."""
        return type('Result', (), {'data': data})()

    @property
    def supabase(self):
        """Return ``self``; the mock doubles as its own ``supabase`` handle."""
        return self

    def table(self, name):
        """Remember ``name`` as the table targeted by later calls."""
        self.current_table = name
        return self

    def select(self, cols):
        """No-op query step; ``cols`` is ignored."""
        return self

    def eq(self, col, val):
        """No-op filter step; ``col`` and ``val`` are ignored."""
        return self

    def single(self):
        """No-op query step."""
        return self

    def order(self, col, desc=False):
        """No-op ordering step; ``col`` and ``desc`` are ignored."""
        return self

    def execute(self):
        """Return a result object whose ``data`` depends on the current table.

        * ``'files'``: a fixed dict describing one test PDF (fresh per call).
        * ``'document_artefacts'``: the list of rows inserted so far.
        * anything else, including no table selected: an empty list.
        """
        if self.current_table == 'files':
            return self._result({
                'id': 'test-file-id',
                'bucket': 'test-bucket',
                'cabinet_id': 'test-cabinet',
                'name': 'test.pdf',
                'path': 'test/path.pdf',
                'mime_type': 'application/pdf'
            })
        if self.current_table == 'document_artefacts':
            return self._result(self.artefacts)
        return self._result([])

    def insert(self, data):
        """Record ``data`` when targeting ``'document_artefacts'``; else no-op."""
        if self.current_table == 'document_artefacts':
            self.artefacts.append(data)
        return self

def test_outline_extraction():
    """Smoke-test ``_try_outline`` with placeholder bytes (requires PyMuPDF)."""
    print("Testing PDF outline extraction...")

    try:
        from routers.database.files.split_map import _try_outline

        # No real bookmarked PDF is built here; the dummy bytes only exercise
        # the import and the call signature.
        outline = _try_outline(b"dummy pdf bytes")
        print(f"Outline extraction result: {outline}")
    except Exception as err:
        # An ImportError is reported as a skip, anything else as a failure;
        # neither is re-raised, so the remaining checks still run.
        if isinstance(err, ImportError):
            print("PyMuPDF not available - outline extraction will be skipped")
        else:
            print(f"Outline extraction test failed: {err}")

def test_heading_extraction():
    """Check that ``_try_headings`` recovers headings from Docling-style JSON."""
    print("\nTesting heading extraction...")

    from routers.database.files.split_map import _try_headings

    def heading(text, page):
        # One "h1" heading block in the shape used by the fixture below.
        return {"role": "heading", "text": text, "page": page, "type": "h1"}

    # (title, page) for each heading in the mock document.
    chapters = [
        ("Chapter 1: Introduction", 1),
        ("Chapter 2: Methods", 15),
        ("Chapter 3: Results", 30),
    ]
    first, *later = (heading(title, page) for title, page in chapters)
    filler = {"role": "paragraph", "text": "Some content...", "page": 1}

    # A paragraph sits between the first and second heading.
    docling_json = {"blocks": [first, filler, *later]}

    result = _try_headings(docling_json)
    print(f"Headings extraction result: {result}")

    # Each expected tuple is (title, page, 1); the trailing 1 is presumably
    # the heading level. Verify against split_map.
    expected = [(title, page, 1) for title, page in chapters]
    matched = result == expected
    if not matched:
        print(f"✗ Heading extraction test failed. Expected: {expected}, Got: {result}")
        return
    print("✓ Heading extraction test passed")

def test_toc_extraction():
    """Check that ``_try_toc_text`` finds entries in a dotted-leader TOC."""
    print("\nTesting TOC extraction...")

    from routers.database.files.split_map import _try_toc_text

    # Plain-text sample: a "Table of Contents" heading, seven
    # "title .... page" lines, then unrelated trailing text.
    sample_text = """
Table of Contents

Introduction .................. 1
Chapter 1: Getting Started .... 5
Chapter 2: Advanced Topics .... 15
Chapter 3: Examples ........... 25
Chapter 4: Conclusion ......... 35
Appendix A .................... 40
Index ......................... 45

Some other content follows...
"""

    toc_entries = _try_toc_text(sample_text)
    print(f"TOC extraction result: {toc_entries}")

    # Pass needs a truthy result with at least five entries.
    enough_entries = bool(toc_entries) and len(toc_entries) >= 5
    if enough_entries:
        verdict = "✓ TOC extraction test passed"
    else:
        verdict = "✗ TOC extraction test failed"
    print(verdict)

def test_entries_building():
    """Build entries from both input shapes and check the entry structure."""
    print("\nTesting entries building...")

    from routers.database.files.split_map import _entries_from_starts, _entries_from_pairs

    pages = (1, 15, 30)
    doc_pages = 50  # presumably the document's total page count; verify

    # "Starts" shape: (title, page, 1) triples.
    starts = [(f"Chapter {n}", page, 1) for n, page in enumerate(pages, start=1)]
    entries1 = _entries_from_starts(starts, doc_pages, "headings")
    print(f"Entries from starts: {len(entries1)} entries")

    # "Pairs" shape: (title, page) tuples.
    pairs = list(zip(("Introduction", "Methods", "Results"), pages))
    entries2 = _entries_from_pairs(pairs, doc_pages, "outline")
    print(f"Entries from pairs: {len(entries2)} entries")

    # Only the first entry built from `starts` is structure-checked;
    # `entries2` is reported above but not inspected further.
    required_keys = ('id', 'title', 'start_page', 'end_page', 'confidence')
    well_formed = False
    if entries1:
        head = entries1[0]
        well_formed = all(key in head for key in required_keys)

    if well_formed:
        print("✓ Entry building test passed")
    else:
        print("✗ Entry building test failed")

def test_tiny_section_merging():
    """Check that ``_entries_from_pairs`` yields fewer entries than pairs."""
    print("\nTesting tiny section merging...")

    from routers.database.files.split_map import _entries_from_pairs

    # Starts at pages 1, 2 and 3 leave one-page gaps before the next
    # section begins at page 15.
    pairs = [
        ("Chapter 1", 1),
        ("Short Section", 2),
        ("Another Short", 3),
        ("Chapter 2", 15),
    ]
    entries = _entries_from_pairs(pairs, 50, "test")

    pair_count = len(pairs)
    entry_count = len(entries)
    print(f"Original pairs: {pair_count}, Final entries: {entry_count}")

    # Merging is inferred from the entry count dropping below the input count.
    merged = entry_count < pair_count
    print(
        "✓ Tiny section merging test passed"
        if merged
        else "✗ Tiny section merging test failed"
    )

def main():
    """Run every check in this script in order, between two banner lines."""
    print("Running split_map.py tests...\n")

    checks = (
        test_outline_extraction,
        test_heading_extraction,
        test_toc_extraction,
        test_entries_building,
        test_tiny_section_merging,
    )
    for check in checks:
        check()

    print("\nTests completed!")

# Run the checks only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()