mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 09:25:42 +10:00
- Add real integration tests for SSL, redirects, and SEO plugins using Chrome session helpers for live URL testing - Remove fake "format" tests that just created dicts and asserted on them (apt, pip, npm provider output format tests) - Remove npm integration test that created dirs then checked they existed - Fix SQLite search test to use SQLITEFTS_DB constant instead of hardcoded value
352 lines
13 KiB
Python
352 lines
13 KiB
Python
"""
|
|
Tests for the SQLite FTS5 search backend.
|
|
|
|
Tests cover:
|
|
1. Search index creation
|
|
2. Indexing snapshots
|
|
3. Search queries with real test data
|
|
4. Flush operations
|
|
5. Edge cases (empty index, special characters)
|
|
"""
|
|
|
|
import os
|
|
import sqlite3
|
|
import tempfile
|
|
from pathlib import Path
|
|
from unittest.mock import patch
|
|
|
|
import pytest
|
|
from django.test import TestCase, override_settings
|
|
|
|
from archivebox.plugins.search_backend_sqlite.search import (
|
|
get_db_path,
|
|
search,
|
|
flush,
|
|
SQLITEFTS_DB,
|
|
FTS_TOKENIZERS,
|
|
)
|
|
|
|
|
|
class TestSqliteSearchBackend(TestCase):
    """Unit tests for the SQLite FTS5 search backend.

    Every test runs against a fresh FTS5 index stored in a temporary
    directory. The ``settings`` object referenced by the search module is
    patched so that its ``DATA_DIR`` points at that directory.
    """

    def setUp(self):
        """Create a temporary data directory containing an empty search index."""
        super().setUp()
        import shutil

        self.temp_dir = tempfile.mkdtemp()
        # Registered cleanups still run when a later step of setUp() raises,
        # whereas tearDown() would be skipped; this prevents leaking the
        # directory (and the patch below) if index creation fails.
        self.addCleanup(shutil.rmtree, self.temp_dir, ignore_errors=True)
        self.db_path = Path(self.temp_dir) / SQLITEFTS_DB

        # Patch the settings object used by the search module so DATA_DIR
        # resolves to the temporary directory.
        self.settings_patch = patch(
            'archivebox.plugins.search_backend_sqlite.search.settings'
        )
        self.mock_settings = self.settings_patch.start()
        # Cleanups run last-in-first-out: the patch is stopped before the
        # directory is removed, matching the previous tearDown() order.
        self.addCleanup(self.settings_patch.stop)
        self.mock_settings.DATA_DIR = self.temp_dir

        # Create FTS5 table
        self._create_index()

    def _create_index(self):
        """Create the FTS5 ``search_index`` virtual table in the test database."""
        conn = sqlite3.connect(str(self.db_path))
        try:
            conn.execute(f'''
                CREATE VIRTUAL TABLE IF NOT EXISTS search_index
                USING fts5(
                    snapshot_id,
                    url,
                    title,
                    content,
                    tokenize = '{FTS_TOKENIZERS}'
                )
            ''')
            conn.commit()
        finally:
            conn.close()

    def _index_snapshot(self, snapshot_id: str, url: str, title: str, content: str):
        """Insert one row describing a snapshot into the search index."""
        conn = sqlite3.connect(str(self.db_path))
        try:
            conn.execute(
                'INSERT INTO search_index (snapshot_id, url, title, content) VALUES (?, ?, ?, ?)',
                (snapshot_id, url, title, content)
            )
            conn.commit()
        finally:
            conn.close()

    def test_get_db_path(self):
        """get_db_path should return the index path inside DATA_DIR."""
        path = get_db_path()
        self.assertEqual(path, Path(self.temp_dir) / SQLITEFTS_DB)

    def test_search_empty_index(self):
        """search should return empty list for empty index."""
        results = search('nonexistent')
        self.assertEqual(results, [])

    def test_search_no_index_file(self):
        """search should return empty list when index file doesn't exist."""
        os.remove(self.db_path)
        results = search('test')
        self.assertEqual(results, [])

    def test_search_single_result(self):
        """search should find matching snapshot."""
        self._index_snapshot(
            'snap-001',
            'https://example.com/page1',
            'Example Page',
            'This is example content about testing.'
        )

        results = search('example')
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], 'snap-001')

    def test_search_multiple_results(self):
        """search should find all matching snapshots."""
        self._index_snapshot('snap-001', 'https://example.com/1', 'Python Tutorial', 'Learn Python programming')
        self._index_snapshot('snap-002', 'https://example.com/2', 'Python Guide', 'Advanced Python concepts')
        self._index_snapshot('snap-003', 'https://example.com/3', 'JavaScript Basics', 'Learn JavaScript')

        results = search('Python')
        self.assertEqual(len(results), 2)
        self.assertIn('snap-001', results)
        self.assertIn('snap-002', results)
        self.assertNotIn('snap-003', results)

    def test_search_title_match(self):
        """search should match against title."""
        self._index_snapshot('snap-001', 'https://example.com', 'Django Web Framework', 'Content here')

        results = search('Django')
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], 'snap-001')

    def test_search_url_match(self):
        """search should match against URL."""
        self._index_snapshot('snap-001', 'https://archivebox.io/docs', 'Title', 'Content')

        results = search('archivebox')
        self.assertEqual(len(results), 1)

    def test_search_content_match(self):
        """search should match against content."""
        self._index_snapshot(
            'snap-001',
            'https://example.com',
            'Generic Title',
            'This document contains information about cryptography and security.'
        )

        results = search('cryptography')
        self.assertEqual(len(results), 1)

    def test_search_case_insensitive(self):
        """search should be case insensitive."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'PYTHON programming')

        results = search('python')
        self.assertEqual(len(results), 1)

    def test_search_stemming(self):
        """search should use porter stemmer for word stems."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'Programming concepts')

        # 'program' should match 'programming' with porter stemmer
        results = search('program')
        self.assertEqual(len(results), 1)

    def test_search_multiple_words(self):
        """search should match only documents containing all query words."""
        self._index_snapshot('snap-001', 'https://example.com', 'Web Development', 'Learn web development skills')
        self._index_snapshot('snap-002', 'https://example.com', 'Web Design', 'Design beautiful websites')

        results = search('web development')
        # FTS5 joins whitespace-separated terms with an implicit AND, so a
        # row must contain both 'web' and 'development' to match. snap-002
        # mentions 'web' but never 'development' and must be excluded.
        self.assertIn('snap-001', results)
        self.assertNotIn('snap-002', results)

    def test_search_phrase(self):
        """search should support phrase queries."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'machine learning algorithms')
        self._index_snapshot('snap-002', 'https://example.com', 'Title', 'machine algorithms learning')

        # Phrase search with quotes
        results = search('"machine learning"')
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], 'snap-001')

    def test_search_distinct_results(self):
        """search should return distinct snapshot IDs."""
        # Insert two index rows for the same snapshot; the ID must still be
        # reported only once.
        self._index_snapshot('snap-001', 'https://python.org', 'Python', 'Python programming language')
        self._index_snapshot('snap-001', 'https://python.org', 'Python', 'Python programming language')

        results = search('Python')
        self.assertEqual(results, ['snap-001'])

    def test_flush_single(self):
        """flush should remove snapshot from index."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'Content')
        self._index_snapshot('snap-002', 'https://example.com', 'Title', 'Content')

        flush(['snap-001'])

        results = search('Content')
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], 'snap-002')

    def test_flush_multiple(self):
        """flush should remove multiple snapshots."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'Test')
        self._index_snapshot('snap-002', 'https://example.com', 'Title', 'Test')
        self._index_snapshot('snap-003', 'https://example.com', 'Title', 'Test')

        flush(['snap-001', 'snap-003'])

        results = search('Test')
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], 'snap-002')

    def test_flush_nonexistent(self):
        """flush should not raise for nonexistent snapshots."""
        # Should not raise
        flush(['nonexistent-snap'])

    def test_flush_no_index(self):
        """flush should not raise when index doesn't exist."""
        os.remove(self.db_path)
        # Should not raise
        flush(['snap-001'])

    def test_search_special_characters(self):
        """search should not raise on queries containing special characters."""
        self._index_snapshot('snap-001', 'https://example.com', 'C++ Programming', 'Learn C++ basics')

        # '+' is an operator in the FTS5 query syntax, so the engine may
        # reject 'C++' as malformed instead of matching it. Whether a hit is
        # returned is therefore not asserted; the only requirement is that
        # search() hands back a list rather than raising.
        results = search('C++')
        self.assertIsInstance(results, list)

    def test_search_unicode(self):
        """search should match accented content with an unaccented query."""
        # Escapes spell "Titre Français" / "café résumé" so the test does not
        # depend on the encoding this file is saved in.
        self._index_snapshot('snap-001', 'https://example.com', 'Titre Fran\u00e7ais', 'caf\u00e9 r\u00e9sum\u00e9')
        self._index_snapshot('snap-002', 'https://example.com', 'Japanese', 'Hello world')

        # Assumes FTS_TOKENIZERS enables diacritic removal, so the indexed
        # 'café' is folded to 'cafe' and matches the plain-ASCII query.
        results = search('cafe')
        self.assertEqual(results, ['snap-001'])
|
|
|
|
|
|
class TestSqliteSearchWithRealData(TestCase):
    """Integration tests that query an index seeded with realistic content.

    ``setUp`` builds a fresh FTS5 index in a temporary directory, fills it
    with five sample snapshots and patches the search module's ``settings``
    so that ``DATA_DIR`` points at that directory.
    """

    def setUp(self):
        """Create the temporary index and populate it with sample snapshots."""
        super().setUp()
        import shutil

        self.temp_dir = tempfile.mkdtemp()
        # Registered cleanups still run when a later step of setUp() raises,
        # whereas tearDown() would be skipped; this prevents leaking the
        # directory (and the patch below) if building the index fails.
        self.addCleanup(shutil.rmtree, self.temp_dir, ignore_errors=True)
        self.db_path = Path(self.temp_dir) / SQLITEFTS_DB

        self.settings_patch = patch(
            'archivebox.plugins.search_backend_sqlite.search.settings'
        )
        self.mock_settings = self.settings_patch.start()
        # Cleanups run last-in-first-out: the patch is stopped before the
        # directory is removed, matching the previous tearDown() order.
        self.addCleanup(self.settings_patch.stop)
        self.mock_settings.DATA_DIR = self.temp_dir

        self._build_index()

    def _build_index(self):
        """Create the FTS5 table and insert the sample snapshot rows."""
        # Each tuple is (snapshot_id, url, title, content).
        test_data = [
            ('snap-001', 'https://github.com/ArchiveBox/ArchiveBox',
             'ArchiveBox - Self-hosted web archiving',
             'Open source self-hosted web archiving. Collects, saves, and displays various types of content.'),
            ('snap-002', 'https://docs.python.org/3/tutorial/',
             'Python 3 Tutorial',
             'An informal introduction to Python. Python is an easy to learn, powerful programming language.'),
            ('snap-003', 'https://developer.mozilla.org/docs/Web/JavaScript',
             'JavaScript - MDN Web Docs',
             'JavaScript (JS) is a lightweight, interpreted programming language with first-class functions.'),
            ('snap-004', 'https://news.ycombinator.com',
             'Hacker News',
             'Social news website focusing on computer science and entrepreneurship.'),
            ('snap-005', 'https://en.wikipedia.org/wiki/Web_archiving',
             'Web archiving - Wikipedia',
             'Web archiving is the process of collecting portions of the World Wide Web to ensure the information is preserved.'),
        ]

        conn = sqlite3.connect(str(self.db_path))
        try:
            conn.execute(f'''
                CREATE VIRTUAL TABLE IF NOT EXISTS search_index
                USING fts5(
                    snapshot_id,
                    url,
                    title,
                    content,
                    tokenize = '{FTS_TOKENIZERS}'
                )
            ''')
            conn.executemany(
                'INSERT INTO search_index (snapshot_id, url, title, content) VALUES (?, ?, ?, ?)',
                test_data
            )
            conn.commit()
        finally:
            conn.close()

    def test_search_archivebox(self):
        """Search for 'archivebox' should find relevant results."""
        results = search('archivebox')
        self.assertIn('snap-001', results)

    def test_search_programming(self):
        """Search for 'programming' should find Python and JS docs."""
        results = search('programming')
        self.assertIn('snap-002', results)
        self.assertIn('snap-003', results)

    def test_search_web_archiving(self):
        """Search for 'web archiving' should find relevant results."""
        results = search('web archiving')
        # Both ArchiveBox and Wikipedia should match
        self.assertIn('snap-001', results)
        self.assertIn('snap-005', results)

    def test_search_github(self):
        """Search for 'github' should find URL match."""
        results = search('github')
        self.assertIn('snap-001', results)

    def test_search_tutorial(self):
        """Search for 'tutorial' should find Python tutorial."""
        results = search('tutorial')
        self.assertIn('snap-002', results)

    def test_flush_and_search(self):
        """Flushing a snapshot should remove it from search results."""
        # Verify it's there first
        results = search('archivebox')
        self.assertIn('snap-001', results)

        # Flush it
        flush(['snap-001'])

        # Should no longer be found
        results = search('archivebox')
        self.assertNotIn('snap-001', results)
|
|
|
|
|
|
# Allow running this test module directly as a script.
if __name__ == '__main__':
    # pytest.main() returns the run's exit code; propagate it so a failing
    # run does not exit with status 0.
    raise SystemExit(pytest.main([__file__, '-v']))
|