Files
Claude 8a0acdebcd Add SSL, redirects, SEO plugin tests and fix fake test issues
- Add real integration tests for SSL, redirects, and SEO plugins
  using Chrome session helpers for live URL testing
- Remove fake "format" tests that just created dicts and asserted on them
  (apt, pip, npm provider output format tests)
- Remove npm integration test that created dirs then checked they existed
- Fix SQLite search test to use SQLITEFTS_DB constant instead of hardcoded value
2025-12-31 12:00:00 +00:00

352 lines
13 KiB
Python

"""
Tests for the SQLite FTS5 search backend.
Tests cover:
1. Search index creation
2. Indexing snapshots
3. Search queries with real test data
4. Flush operations
5. Edge cases (empty index, special characters)
"""
import os
import sqlite3
import tempfile
from pathlib import Path
from unittest.mock import patch
import pytest
from django.test import TestCase, override_settings
from archivebox.plugins.search_backend_sqlite.search import (
get_db_path,
search,
flush,
SQLITEFTS_DB,
FTS_TOKENIZERS,
)
class TestSqliteSearchBackend(TestCase):
    """Test SQLite FTS5 search backend.

    Every test runs against its own temporary data directory that contains an
    empty ``search_index`` FTS5 table.  The ``settings`` object referenced by
    the backend module is patched so that its ``DATA_DIR`` is that directory.
    """

    def setUp(self):
        """Create a temporary data directory with an empty search index.

        Cleanup is registered with ``addCleanup`` instead of a ``tearDown``
        override: unittest skips ``tearDown`` when ``setUp`` raises, which
        would otherwise leak the started patch and the temp directory if
        index creation failed.
        """
        import shutil

        self.temp_dir = tempfile.mkdtemp()
        # Cleanups run in LIFO order, so the patch (registered below) is
        # stopped before the directory is removed.
        self.addCleanup(shutil.rmtree, self.temp_dir, ignore_errors=True)
        self.db_path = Path(self.temp_dir) / SQLITEFTS_DB

        # Patch DATA_DIR on the settings object the backend module uses.
        self.settings_patch = patch(
            'archivebox.plugins.search_backend_sqlite.search.settings'
        )
        self.mock_settings = self.settings_patch.start()
        self.addCleanup(self.settings_patch.stop)
        self.mock_settings.DATA_DIR = self.temp_dir

        # Create FTS5 table
        self._create_index()

    def _create_index(self):
        """Create the FTS5 search index table in the temporary database."""
        conn = sqlite3.connect(str(self.db_path))
        try:
            conn.execute(f'''
                CREATE VIRTUAL TABLE IF NOT EXISTS search_index
                USING fts5(
                    snapshot_id,
                    url,
                    title,
                    content,
                    tokenize = '{FTS_TOKENIZERS}'
                )
            ''')
            conn.commit()
        finally:
            conn.close()

    def _index_snapshot(self, snapshot_id: str, url: str, title: str, content: str):
        """Insert one row for a snapshot directly into the index table."""
        conn = sqlite3.connect(str(self.db_path))
        try:
            conn.execute(
                'INSERT INTO search_index (snapshot_id, url, title, content) VALUES (?, ?, ?, ?)',
                (snapshot_id, url, title, content)
            )
            conn.commit()
        finally:
            conn.close()

    def test_get_db_path(self):
        """get_db_path should return correct path."""
        path = get_db_path()
        self.assertEqual(path, Path(self.temp_dir) / SQLITEFTS_DB)

    def test_search_empty_index(self):
        """search should return empty list for empty index."""
        results = search('nonexistent')
        self.assertEqual(results, [])

    def test_search_no_index_file(self):
        """search should return empty list when index file doesn't exist."""
        os.remove(self.db_path)
        results = search('test')
        self.assertEqual(results, [])

    def test_search_single_result(self):
        """search should find matching snapshot."""
        self._index_snapshot(
            'snap-001',
            'https://example.com/page1',
            'Example Page',
            'This is example content about testing.'
        )
        results = search('example')
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], 'snap-001')

    def test_search_multiple_results(self):
        """search should find all matching snapshots."""
        self._index_snapshot('snap-001', 'https://example.com/1', 'Python Tutorial', 'Learn Python programming')
        self._index_snapshot('snap-002', 'https://example.com/2', 'Python Guide', 'Advanced Python concepts')
        self._index_snapshot('snap-003', 'https://example.com/3', 'JavaScript Basics', 'Learn JavaScript')
        results = search('Python')
        self.assertEqual(len(results), 2)
        self.assertIn('snap-001', results)
        self.assertIn('snap-002', results)
        self.assertNotIn('snap-003', results)

    def test_search_title_match(self):
        """search should match against title."""
        self._index_snapshot('snap-001', 'https://example.com', 'Django Web Framework', 'Content here')
        results = search('Django')
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], 'snap-001')

    def test_search_url_match(self):
        """search should match against URL."""
        self._index_snapshot('snap-001', 'https://archivebox.io/docs', 'Title', 'Content')
        results = search('archivebox')
        self.assertEqual(len(results), 1)

    def test_search_content_match(self):
        """search should match against content."""
        self._index_snapshot(
            'snap-001',
            'https://example.com',
            'Generic Title',
            'This document contains information about cryptography and security.'
        )
        results = search('cryptography')
        self.assertEqual(len(results), 1)

    def test_search_case_insensitive(self):
        """search should be case insensitive."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'PYTHON programming')
        results = search('python')
        self.assertEqual(len(results), 1)

    def test_search_stemming(self):
        """search should use porter stemmer for word stems."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'Programming concepts')
        # 'program' should match 'programming' with porter stemmer
        results = search('program')
        self.assertEqual(len(results), 1)

    def test_search_multiple_words(self):
        """search should match documents with all words."""
        self._index_snapshot('snap-001', 'https://example.com', 'Web Development', 'Learn web development skills')
        self._index_snapshot('snap-002', 'https://example.com', 'Web Design', 'Design beautiful websites')
        results = search('web development')
        # Whitespace-separated terms in an FTS5 query are an implicit AND, so
        # only the row containing both 'web' and 'development' should match.
        # NOTE(review): assumes search() hands the query to MATCH unchanged,
        # as test_search_phrase already relies on.
        self.assertIn('snap-001', results)
        self.assertNotIn('snap-002', results)

    def test_search_phrase(self):
        """search should support phrase queries."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'machine learning algorithms')
        self._index_snapshot('snap-002', 'https://example.com', 'Title', 'machine algorithms learning')
        # Phrase search with quotes
        results = search('"machine learning"')
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], 'snap-001')

    def test_search_distinct_results(self):
        """search should return distinct snapshot IDs."""
        # Index the same snapshot twice so the index really holds duplicate
        # rows for one ID; a single row could never expose missing
        # de-duplication.
        row = ('snap-001', 'https://python.org', 'Python', 'Python programming language')
        self._index_snapshot(*row)
        self._index_snapshot(*row)
        results = search('Python')
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], 'snap-001')

    def test_flush_single(self):
        """flush should remove snapshot from index."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'Content')
        self._index_snapshot('snap-002', 'https://example.com', 'Title', 'Content')
        flush(['snap-001'])
        results = search('Content')
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], 'snap-002')

    def test_flush_multiple(self):
        """flush should remove multiple snapshots."""
        self._index_snapshot('snap-001', 'https://example.com', 'Title', 'Test')
        self._index_snapshot('snap-002', 'https://example.com', 'Title', 'Test')
        self._index_snapshot('snap-003', 'https://example.com', 'Title', 'Test')
        flush(['snap-001', 'snap-003'])
        results = search('Test')
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], 'snap-002')

    def test_flush_nonexistent(self):
        """flush should not raise for nonexistent snapshots."""
        # Should not raise
        flush(['nonexistent-snap'])

    def test_flush_no_index(self):
        """flush should not raise when index doesn't exist."""
        os.remove(self.db_path)
        # Should not raise
        flush(['snap-001'])

    def test_search_special_characters(self):
        """search should handle special characters in queries."""
        self._index_snapshot('snap-001', 'https://example.com', 'C++ Programming', 'Learn C++ basics')
        # '+' is an operator in FTS5 query syntax, so SQLite may reject this
        # query outright rather than match anything.
        results = search('C++')
        # May or may not match depending on tokenizer config
        # At minimum, should not raise
        self.assertIsInstance(results, list)

    def test_search_unicode(self):
        """search should handle unicode content."""
        # Escapes keep this file ASCII-only while the indexed text still
        # carries real accented and CJK characters.
        self._index_snapshot('snap-001', 'https://example.com', 'Titre Fran\u00e7ais', 'caf\u00e9 r\u00e9sum\u00e9')
        self._index_snapshot(
            'snap-002', 'https://example.com', 'Japanese',
            '\u3053\u3093\u306b\u3061\u306f\u4e16\u754c'
        )
        # With remove_diacritics, unaccented 'cafe' should match the accented word.
        # NOTE(review): assumes FTS_TOKENIZERS folds diacritics - confirm.
        results = search('cafe')
        self.assertEqual(len(results), 1)
        self.assertEqual(results[0], 'snap-001')
class TestSqliteSearchWithRealData(TestCase):
    """Integration tests with realistic archived content.

    Each test starts from a fresh temporary index pre-populated with the rows
    in ``_SNAPSHOT_ROWS``.
    """

    # (snapshot_id, url, title, content) rows loaded into the index before each test.
    _SNAPSHOT_ROWS = [
        ('snap-001', 'https://github.com/ArchiveBox/ArchiveBox',
         'ArchiveBox - Self-hosted web archiving',
         'Open source self-hosted web archiving. Collects, saves, and displays various types of content.'),
        ('snap-002', 'https://docs.python.org/3/tutorial/',
         'Python 3 Tutorial',
         'An informal introduction to Python. Python is an easy to learn, powerful programming language.'),
        ('snap-003', 'https://developer.mozilla.org/docs/Web/JavaScript',
         'JavaScript - MDN Web Docs',
         'JavaScript (JS) is a lightweight, interpreted programming language with first-class functions.'),
        ('snap-004', 'https://news.ycombinator.com',
         'Hacker News',
         'Social news website focusing on computer science and entrepreneurship.'),
        ('snap-005', 'https://en.wikipedia.org/wiki/Web_archiving',
         'Web archiving - Wikipedia',
         'Web archiving is the process of collecting portions of the World Wide Web to ensure the information is preserved.'),
    ]

    def setUp(self):
        """Create index with realistic test data.

        Cleanup is registered with ``addCleanup`` instead of a ``tearDown``
        override: unittest skips ``tearDown`` when ``setUp`` raises, which
        would otherwise leak the started patch and the temp directory if
        building the index failed.
        """
        import shutil

        self.temp_dir = tempfile.mkdtemp()
        # Cleanups run in LIFO order, so the patch (registered below) is
        # stopped before the directory is removed.
        self.addCleanup(shutil.rmtree, self.temp_dir, ignore_errors=True)
        self.db_path = Path(self.temp_dir) / SQLITEFTS_DB

        self.settings_patch = patch(
            'archivebox.plugins.search_backend_sqlite.search.settings'
        )
        self.mock_settings = self.settings_patch.start()
        self.addCleanup(self.settings_patch.stop)
        self.mock_settings.DATA_DIR = self.temp_dir

        self._build_index()

    def _build_index(self):
        """Create the FTS5 table and insert every row from ``_SNAPSHOT_ROWS``."""
        conn = sqlite3.connect(str(self.db_path))
        try:
            conn.execute(f'''
                CREATE VIRTUAL TABLE IF NOT EXISTS search_index
                USING fts5(
                    snapshot_id,
                    url,
                    title,
                    content,
                    tokenize = '{FTS_TOKENIZERS}'
                )
            ''')
            conn.executemany(
                'INSERT INTO search_index (snapshot_id, url, title, content) VALUES (?, ?, ?, ?)',
                self._SNAPSHOT_ROWS
            )
            conn.commit()
        finally:
            conn.close()

    def test_search_archivebox(self):
        """Search for 'archivebox' should find relevant results."""
        results = search('archivebox')
        self.assertIn('snap-001', results)

    def test_search_programming(self):
        """Search for 'programming' should find Python and JS docs."""
        results = search('programming')
        self.assertIn('snap-002', results)
        self.assertIn('snap-003', results)

    def test_search_web_archiving(self):
        """Search for 'web archiving' should find relevant results."""
        results = search('web archiving')
        # Both ArchiveBox and Wikipedia should match
        self.assertIn('snap-001', results)
        self.assertIn('snap-005', results)

    def test_search_github(self):
        """Search for 'github' should find URL match."""
        results = search('github')
        self.assertIn('snap-001', results)

    def test_search_tutorial(self):
        """Search for 'tutorial' should find Python tutorial."""
        results = search('tutorial')
        self.assertIn('snap-002', results)

    def test_flush_and_search(self):
        """Flushing a snapshot should remove it from search results."""
        # Verify it's there first
        results = search('archivebox')
        self.assertIn('snap-001', results)
        # Flush it
        flush(['snap-001'])
        # Should no longer be found
        results = search('archivebox')
        self.assertNotIn('snap-001', results)
if __name__ == '__main__':
    # pytest.main() returns the run's exit code; propagate it so running this
    # file directly exits non-zero when tests fail.
    raise SystemExit(pytest.main([__file__, '-v']))