api/test_split_map.py
2025-11-14 14:47:19 +00:00

211 lines
6.2 KiB
Python

#!/usr/bin/env python3
"""
Simple test script for split_map.py functionality.
Run this to verify the split map generator works correctly.
"""
import json
import tempfile
import uuid
from pathlib import Path
# Mock implementations for testing without full environment
class MockStorage:
    """In-memory stand-in for a storage backend.

    Payloads are kept in ``self.files`` under the key ``"<bucket>/<path>"``.
    """

    def __init__(self):
        # Maps "<bucket>/<path>" to whatever payload was uploaded.
        self.files = {}

    def download_file(self, bucket, path):
        """Return the payload stored for bucket/path, or b"{}" if none exists."""
        key = f"{bucket}/{path}"
        if key in self.files:
            return self.files[key]
        return b"{}"

    def upload_file(self, bucket, path, data, mime, upsert=True):
        """Store ``data`` under bucket/path, replacing any previous payload.

        ``mime`` and ``upsert`` are accepted for signature compatibility but
        are not used by this mock.
        """
        key = f"{bucket}/{path}"
        self.files.update({key: data})
class MockClient:
    """Minimal chainable stand-in for a Supabase-style query client.

    Every builder method (``table``, ``select``, ``eq``, ``single``,
    ``order``, ``insert``) returns the client itself so calls can be chained;
    ``execute`` returns an object exposing a ``data`` attribute whose content
    depends on the most recently selected table.
    """

    def __init__(self):
        self.files = []
        self.artefacts = []
        # Initialised here so execute()/insert() are safe even when called
        # before any table() call (previously that raised AttributeError).
        self.current_table = None

    @property
    def supabase(self):
        """Return the client itself, so ``client.supabase.table(...)`` works."""
        return self

    def table(self, name):
        """Remember ``name`` as the table that later calls operate on."""
        self.current_table = name
        return self

    def select(self, cols):
        """No-op filter step; returns the client for chaining."""
        return self

    def eq(self, col, val):
        """No-op filter step; returns the client for chaining."""
        return self

    def single(self):
        """No-op modifier; returns the client for chaining."""
        return self

    def order(self, col, desc=False):
        """No-op ordering step; returns the client for chaining."""
        return self

    def execute(self):
        """Return a result object whose ``data`` matches the current table.

        * ``'files'`` -> a fixed file-record dict.
        * ``'document_artefacts'`` -> the list of inserted artefacts.
        * anything else (including no table selected) -> an empty list.
        """
        if self.current_table == 'files':
            return self._result({
                'id': 'test-file-id',
                'bucket': 'test-bucket',
                'cabinet_id': 'test-cabinet',
                'name': 'test.pdf',
                'path': 'test/path.pdf',
                'mime_type': 'application/pdf',
            })
        if self.current_table == 'document_artefacts':
            return self._result(self.artefacts)
        return self._result([])

    def insert(self, data):
        """Record ``data`` when the current table is ``'document_artefacts'``.

        Inserts against any other table are silently ignored.
        """
        if self.current_table == 'document_artefacts':
            self.artefacts.append(data)
        return self

    @staticmethod
    def _result(data):
        """Build a throwaway ``Result`` instance carrying ``data``."""
        return type('Result', (), {'data': data})()
def test_outline_extraction():
    """Smoke-test PDF outline extraction (requires PyMuPDF).

    This never raises: an ImportError is reported as PyMuPDF being
    unavailable, and any other exception is printed as a failure message.
    """
    print("Testing PDF outline extraction...")
    try:
        from routers.database.files.split_map import _try_outline as extract_outline

        # Placeholder bytes rather than a real PDF with bookmarks: this only
        # exercises the import and the call path.
        outline = extract_outline(b"dummy pdf bytes")
        print(f"Outline extraction result: {outline}")
    except ImportError:
        print("PyMuPDF not available - outline extraction will be skipped")
    except Exception as exc:
        print(f"Outline extraction test failed: {exc}")
def test_heading_extraction():
    """Check heading extraction from a Docling-style JSON document.

    Feeds ``_try_headings`` a document with three ``h1`` heading blocks and
    one paragraph block, and expects one ``(text, page, 1)`` tuple per
    heading (the trailing ``1`` is presumably the heading level - confirm
    against ``_try_headings``).

    Raises:
        AssertionError: if the extracted headings differ from the expected
            list, so the failure is visible to pytest and to the exit status
            instead of only being printed.
    """
    print("\nTesting heading extraction...")
    from routers.database.files.split_map import _try_headings

    # Mock Docling JSON with headings
    docling_json = {
        "blocks": [
            {
                "role": "heading",
                "text": "Chapter 1: Introduction",
                "page": 1,
                "type": "h1",
            },
            {
                "role": "paragraph",
                "text": "Some content...",
                "page": 1,
            },
            {
                "role": "heading",
                "text": "Chapter 2: Methods",
                "page": 15,
                "type": "h1",
            },
            {
                "role": "heading",
                "text": "Chapter 3: Results",
                "page": 30,
                "type": "h1",
            },
        ]
    }
    result = _try_headings(docling_json)
    print(f"Headings extraction result: {result}")

    expected = [
        ("Chapter 1: Introduction", 1, 1),
        ("Chapter 2: Methods", 15, 1),
        ("Chapter 3: Results", 30, 1),
    ]
    if result == expected:
        print("✓ Heading extraction test passed")
        return

    message = f"✗ Heading extraction test failed. Expected: {expected}, Got: {result}"
    print(message)
    # Raised explicitly (not via `assert`) so it still fires under `python -O`.
    raise AssertionError(message)
def test_toc_extraction():
    """Check table-of-contents extraction from Tika-style plain text.

    Passes when ``_try_toc_text`` returns a truthy result with at least five
    entries for a sample containing seven dotted-leader TOC lines.

    Raises:
        AssertionError: if fewer than five entries (or nothing) come back, so
            the failure is visible to pytest and to the exit status instead
            of only being printed.
    """
    print("\nTesting TOC extraction...")
    from routers.database.files.split_map import _try_toc_text

    # Mock Tika text with TOC
    tika_text = """
Table of Contents
Introduction .................. 1
Chapter 1: Getting Started .... 5
Chapter 2: Advanced Topics .... 15
Chapter 3: Examples ........... 25
Chapter 4: Conclusion ......... 35
Appendix A .................... 40
Index ......................... 45
Some other content follows...
"""
    result = _try_toc_text(tika_text)
    print(f"TOC extraction result: {result}")

    if result and len(result) >= 5:
        print("✓ TOC extraction test passed")
        return

    message = "✗ TOC extraction test failed"
    print(message)
    # Raised explicitly (not via `assert`) so it still fires under `python -O`.
    raise AssertionError(f"{message}: got {result!r}")
def test_entries_building():
    """Check that split-map entries can be built from start points.

    Builds entries from ``(title, page, level)``-style triples via
    ``_entries_from_starts`` and from ``(title, page)`` pairs via
    ``_entries_from_pairs``, then verifies that the first entry produced from
    the triples carries the keys ``id``, ``title``, ``start_page``,
    ``end_page`` and ``confidence``. The pair-based entries are only counted
    and printed, not structurally verified.

    Raises:
        AssertionError: if no entries are produced from the triples or the
            first one lacks a required key, so the failure is visible to
            pytest and to the exit status instead of only being printed.
    """
    print("\nTesting entries building...")
    from routers.database.files.split_map import _entries_from_starts, _entries_from_pairs

    # Test starts format
    starts = [("Chapter 1", 1, 1), ("Chapter 2", 15, 1), ("Chapter 3", 30, 1)]
    entries1 = _entries_from_starts(starts, 50, "headings")
    print(f"Entries from starts: {len(entries1)} entries")

    # Test pairs format
    pairs = [("Introduction", 1), ("Methods", 15), ("Results", 30)]
    entries2 = _entries_from_pairs(pairs, 50, "outline")
    print(f"Entries from pairs: {len(entries2)} entries")

    # Verify structure
    required_keys = ['id', 'title', 'start_page', 'end_page', 'confidence']
    if entries1 and all(key in entries1[0] for key in required_keys):
        print("✓ Entry building test passed")
        return

    message = "✗ Entry building test failed"
    print(message)
    # Raised explicitly (not via `assert`) so it still fires under `python -O`.
    raise AssertionError(f"{message}: entries from starts were {entries1!r}")
def test_tiny_section_merging():
    """Check that very short sections are merged when building entries.

    Passes four ``(title, page)`` pairs to ``_entries_from_pairs``, two of
    which start only one page after the previous one, and expects fewer
    entries back than pairs went in.

    Raises:
        AssertionError: if the number of entries is not smaller than the
            number of pairs, so the failure is visible to pytest and to the
            exit status instead of only being printed.
    """
    print("\nTesting tiny section merging...")
    from routers.database.files.split_map import _entries_from_pairs

    # Create pairs with tiny sections (1-2 pages)
    pairs = [("Chapter 1", 1), ("Short Section", 2), ("Another Short", 3), ("Chapter 2", 15)]
    entries = _entries_from_pairs(pairs, 50, "test")
    print(f"Original pairs: {len(pairs)}, Final entries: {len(entries)}")

    # Should merge tiny sections
    if len(entries) < len(pairs):
        print("✓ Tiny section merging test passed")
        return

    message = "✗ Tiny section merging test failed"
    print(message)
    # Raised explicitly (not via `assert`) so it still fires under `python -O`.
    raise AssertionError(
        f"{message}: {len(pairs)} pairs produced {len(entries)} entries"
    )
def main():
    """Run every test function in order and report the ones that raised.

    An exception from one test (including a failed import of the module
    under test) no longer aborts the remaining tests: it is printed, the run
    continues, and the process exits with status 1 once all tests have run.

    Raises:
        SystemExit: with status 1 when at least one test function raised.
    """
    print("Running split_map.py tests...\n")
    tests = (
        test_outline_extraction,
        test_heading_extraction,
        test_toc_extraction,
        test_entries_building,
        test_tiny_section_merging,
    )

    raised = []
    for test in tests:
        try:
            test()
        except Exception as exc:
            # Top-level boundary: record the failure and keep running the
            # remaining tests rather than stopping at the first error.
            raised.append(test.__name__)
            print(f"✗ {test.__name__} raised {type(exc).__name__}: {exc}")

    print("\nTests completed!")
    if raised:
        print(f"{len(raised)} of {len(tests)} test function(s) raised: {', '.join(raised)}")
        raise SystemExit(1)
# Run the test suite only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()