Files
ArchiveBox/archivebox/search/__init__.py
Nick Sweeting f0aa19fa7d wip
2025-12-28 17:51:54 -08:00

121 lines
3.2 KiB
Python

"""
Search module for ArchiveBox.
Search indexing is handled by search backend hooks in plugins:
archivebox/plugins/search_backend_*/on_Snapshot__*_index_*.py
This module provides the query interface that dynamically discovers
search backend plugins using the hooks system.
Search backends must provide a search.py module with:
- search(query: str) -> List[str] (returns snapshot IDs)
- flush(snapshot_ids: Iterable[str]) -> None
"""
__package__ = 'archivebox.search'
from typing import TYPE_CHECKING, Any, Optional
from django.db.models import QuerySet
from archivebox.misc.util import enforce_types
from archivebox.misc.logging import stderr
from archivebox.config.common import SEARCH_BACKEND_CONFIG
if TYPE_CHECKING:
from archivebox.core.models import Snapshot
# Cache discovered backends to avoid repeated filesystem scans
_search_backends_cache: Optional[dict] = None
def get_available_backends() -> dict:
"""
Discover all available search backend plugins.
Uses the hooks system to find plugins with search.py modules.
Results are cached after first call.
"""
global _search_backends_cache
if _search_backends_cache is None:
from archivebox.hooks import get_search_backends
_search_backends_cache = get_search_backends()
return _search_backends_cache
def get_backend() -> Any:
"""
Get the configured search backend module.
Discovers available backends via the hooks system and returns
the one matching SEARCH_BACKEND_ENGINE configuration.
Falls back to 'ripgrep' if configured backend is not found.
"""
backend_name = SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE
backends = get_available_backends()
if backend_name in backends:
return backends[backend_name]
# Fallback to ripgrep if available (no index needed)
if 'ripgrep' in backends:
return backends['ripgrep']
# No backends found
available = list(backends.keys())
raise RuntimeError(
f'Search backend "{backend_name}" not found. '
f'Available backends: {available or "none"}'
)
@enforce_types
def query_search_index(query: str) -> QuerySet:
"""
Search for snapshots matching the query.
Returns a QuerySet of Snapshot objects matching the search.
"""
from archivebox.core.models import Snapshot
if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
return Snapshot.objects.none()
backend = get_backend()
try:
snapshot_pks = backend.search(query)
except Exception as err:
stderr()
stderr(
f'[X] The search backend threw an exception={err}:',
color='red',
)
raise
else:
return Snapshot.objects.filter(pk__in=snapshot_pks)
@enforce_types
def flush_search_index(snapshots: QuerySet) -> None:
"""
Remove snapshots from the search index.
"""
if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND or not snapshots:
return
backend = get_backend()
snapshot_pks = [str(pk) for pk in snapshots.values_list('pk', flat=True)]
try:
backend.flush(snapshot_pks)
except Exception as err:
stderr()
stderr(
f'[X] The search backend threw an exception={err}:',
color='red',
)