#!/usr/bin/env python3
"""
Simple test script for split_map.py functionality.
Run this to verify the split map generator works correctly.
"""

import json
import sys
import tempfile
import uuid
from pathlib import Path

# Mock implementations for testing without full environment
class MockStorage:
    """In-memory stand-in for a storage backend.

    Blobs are kept in ``self.files``, a dict keyed by ``"<bucket>/<path>"``.
    """

    def __init__(self):
        # Maps "<bucket>/<path>" -> the data object passed to upload_file().
        self.files = {}

    def download_file(self, bucket, path):
        """Return the data stored under ``bucket``/``path``.

        Falls back to ``b"{}"`` when nothing has been uploaded for that key.
        """
        return self.files.get(f"{bucket}/{path}", b"{}")

    def upload_file(self, bucket, path, data, mime, upsert=True):
        """Store ``data`` under ``bucket``/``path``, replacing any previous value.

        ``mime`` and ``upsert`` are accepted but not used by this mock.
        """
        self.files[f"{bucket}/{path}"] = data
class MockClient:
    """Chainable mock of a database client.

    Every query-builder method returns ``self`` so calls can be chained, and
    ``execute()`` returns an object with a ``data`` attribute whose content
    depends on the table most recently selected with ``table()``.
    """

    def __init__(self):
        self.files = []
        self.artefacts = []
        # Set by table(); initialised here so execute()/insert() are safe to
        # call before any table has been selected.
        self.current_table = None

    @staticmethod
    def _result(data):
        """Build a throwaway object exposing ``data`` as its ``data`` attribute."""
        return type('Result', (), {'data': data})()

    @property
    def supabase(self):
        """Return the mock itself, so ``client.supabase.table(...)`` chains work."""
        return self

    def table(self, name):
        """Remember ``name`` as the current table and return ``self``."""
        self.current_table = name
        return self

    def select(self, cols):
        """No-op builder step; the argument is ignored."""
        return self

    def eq(self, col, val):
        """No-op builder step; the filter is ignored."""
        return self

    def single(self):
        """No-op builder step."""
        return self

    def order(self, col, desc=False):
        """No-op builder step; ordering is ignored."""
        return self

    def execute(self):
        """Return a result object for the current table.

        * ``'files'``: a fixed record describing a test PDF.
        * ``'document_artefacts'``: the list of rows passed to ``insert()``.
        * anything else (including no table selected): an empty list.
        """
        if self.current_table == 'files':
            return self._result({
                'id': 'test-file-id',
                'bucket': 'test-bucket',
                'cabinet_id': 'test-cabinet',
                'name': 'test.pdf',
                'path': 'test/path.pdf',
                'mime_type': 'application/pdf',
            })
        if self.current_table == 'document_artefacts':
            return self._result(self.artefacts)
        return self._result([])

    def insert(self, data):
        """Record ``data`` when the current table is ``'document_artefacts'``.

        Inserts into any other table are silently dropped. Returns ``self``.
        """
        if self.current_table == 'document_artefacts':
            self.artefacts.append(data)
        return self
def test_outline_extraction():
    """Test PDF outline extraction (requires PyMuPDF).

    This is only a smoke test: it feeds placeholder bytes to ``_try_outline``
    and prints whatever comes back, without checking the value.

    Returns:
        True if ``_try_outline`` returned without raising, False if it raised
        a non-import error, or None if an ImportError occurred and the check
        was skipped.
    """
    print("Testing PDF outline extraction...")

    try:
        from routers.database.files.split_map import _try_outline

        # Create a simple PDF with bookmarks (this would need actual PDF bytes in real usage)
        # For now, just test the import and function signature
        result = _try_outline(b"dummy pdf bytes")
        print(f"Outline extraction result: {result}")
    except ImportError:
        # Raised when the project module, or something it imports, is missing.
        print("PyMuPDF not available - outline extraction will be skipped")
        return None
    except Exception as e:
        # Deliberately broad: this script reports problems instead of aborting.
        print(f"Outline extraction test failed: {e}")
        return False
    return True
def test_heading_extraction():
    """Test heading extraction from Docling JSON.

    Feeds ``_try_headings`` a small document with three ``h1`` heading blocks
    and one paragraph, and compares the result with the expected
    ``(title, page, level)`` tuples.

    Returns:
        True if the result equals the expected list, False otherwise.
    """
    print("\nTesting heading extraction...")

    from routers.database.files.split_map import _try_headings

    # Mock Docling JSON with headings
    docling_json = {
        "blocks": [
            {
                "role": "heading",
                "text": "Chapter 1: Introduction",
                "page": 1,
                "type": "h1",
            },
            {
                "role": "paragraph",
                "text": "Some content...",
                "page": 1,
            },
            {
                "role": "heading",
                "text": "Chapter 2: Methods",
                "page": 15,
                "type": "h1",
            },
            {
                "role": "heading",
                "text": "Chapter 3: Results",
                "page": 30,
                "type": "h1",
            },
        ]
    }

    result = _try_headings(docling_json)
    print(f"Headings extraction result: {result}")

    expected = [("Chapter 1: Introduction", 1, 1), ("Chapter 2: Methods", 15, 1), ("Chapter 3: Results", 30, 1)]
    passed = bool(result == expected)
    if passed:
        print("✓ Heading extraction test passed")
    else:
        print(f"✗ Heading extraction test failed. Expected: {expected}, Got: {result}")
    return passed
def test_toc_extraction():
    """Test TOC extraction from Tika text.

    Passes a plain-text table of contents with seven dotted-leader entries to
    ``_try_toc_text`` and requires at least five entries back.

    Returns:
        True if the result is non-empty and has five or more items, False
        otherwise.
    """
    print("\nTesting TOC extraction...")

    from routers.database.files.split_map import _try_toc_text

    # Mock Tika text with TOC. Built line by line so the text carries no
    # indentation from this function body; starts and ends with a newline.
    tika_text = "\n".join([
        "",
        "Table of Contents",
        "",
        "Introduction .................. 1",
        "Chapter 1: Getting Started .... 5",
        "Chapter 2: Advanced Topics .... 15",
        "Chapter 3: Examples ........... 25",
        "Chapter 4: Conclusion ......... 35",
        "Appendix A .................... 40",
        "Index ......................... 45",
        "",
        "Some other content follows...",
        "",
    ])

    result = _try_toc_text(tika_text)
    print(f"TOC extraction result: {result}")

    passed = bool(result) and len(result) >= 5
    if passed:
        print("✓ TOC extraction test passed")
    else:
        print("✗ TOC extraction test failed")
    return passed
def test_entries_building():
    """Test building entries from start points.

    Calls ``_entries_from_starts`` with ``(title, page, level)`` triples and
    ``_entries_from_pairs`` with ``(title, page)`` pairs, then checks that the
    first entry built from the triples has all the required keys. The entries
    built from pairs are only counted and printed, not validated.

    Returns:
        True if the structure check passes, False otherwise.
    """
    print("\nTesting entries building...")

    from routers.database.files.split_map import _entries_from_starts, _entries_from_pairs

    # Test starts format
    starts = [("Chapter 1", 1, 1), ("Chapter 2", 15, 1), ("Chapter 3", 30, 1)]
    entries1 = _entries_from_starts(starts, 50, "headings")
    print(f"Entries from starts: {len(entries1)} entries")

    # Test pairs format
    pairs = [("Introduction", 1), ("Methods", 15), ("Results", 30)]
    entries2 = _entries_from_pairs(pairs, 50, "outline")
    print(f"Entries from pairs: {len(entries2)} entries")

    # Verify structure
    required_keys = ['id', 'title', 'start_page', 'end_page', 'confidence']
    passed = bool(entries1) and all(key in entries1[0] for key in required_keys)
    if passed:
        print("✓ Entry building test passed")
    else:
        print("✗ Entry building test failed")
    return passed
def test_tiny_section_merging():
    """Test merging of tiny sections.

    Builds entries from four ``(title, page)`` pairs, two of which start only
    one page after the previous one, and expects fewer entries than pairs.

    Returns:
        True if ``_entries_from_pairs`` produced fewer entries than input
        pairs, False otherwise.
    """
    print("\nTesting tiny section merging...")

    from routers.database.files.split_map import _entries_from_pairs

    # Create pairs with tiny sections (1-2 pages)
    pairs = [("Chapter 1", 1), ("Short Section", 2), ("Another Short", 3), ("Chapter 2", 15)]
    entries = _entries_from_pairs(pairs, 50, "test")

    print(f"Original pairs: {len(pairs)}, Final entries: {len(entries)}")

    # Should merge tiny sections
    passed = len(entries) < len(pairs)
    if passed:
        print("✓ Tiny section merging test passed")
    else:
        print("✗ Tiny section merging test failed")
    return passed
def main():
    """Run all tests.

    Each check is called in order. A check counts as failed only when it
    returns exactly ``False``; ``None`` (no verdict, or skipped) and ``True``
    are both treated as not failed.

    Returns:
        int: 0 when no check reported failure, 1 otherwise.
    """
    print("Running split_map.py tests...\n")

    checks = (
        test_outline_extraction,
        test_heading_extraction,
        test_toc_extraction,
        test_entries_building,
        test_tiny_section_merging,
    )
    failed = [check.__name__ for check in checks if check() is False]

    print("\nTests completed!")
    if failed:
        print(f"Failed: {', '.join(failed)}")
        return 1
    return 0
if __name__ == "__main__":
    # Hand main()'s return value to the interpreter as the exit status;
    # sys.exit(None) exits with status 0.
    sys.exit(main())