mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 01:15:57 +10:00
121 lines
3.2 KiB
Python
121 lines
3.2 KiB
Python
"""
Search module for ArchiveBox.

Search indexing is handled by search backend hooks in plugins:
    archivebox/plugins/search_backend_*/on_Snapshot__*_index_*.py

This module provides the query interface that dynamically discovers
search backend plugins using the hooks system.

Search backends must provide a search.py module with:
    - search(query: str) -> List[str] (returns snapshot IDs)
    - flush(snapshot_ids: Iterable[str]) -> None
"""
|
|
|
|
# Declare the package name explicitly so this module always identifies
# itself as part of `archivebox.search`.
__package__ = 'archivebox.search'
|
|
|
|
from typing import TYPE_CHECKING, Any, Optional
|
|
|
|
from django.db.models import QuerySet
|
|
|
|
from archivebox.misc.util import enforce_types
|
|
from archivebox.misc.logging import stderr
|
|
from archivebox.config.common import SEARCH_BACKEND_CONFIG
|
|
|
|
if TYPE_CHECKING:
|
|
from archivebox.core.models import Snapshot
|
|
|
|
|
|
# Module-level memo of the discovered search backends. It stays None until
# the first lookup so that backend discovery only has to run once.
_search_backends_cache: Optional[dict] = None


def get_available_backends() -> dict:
    """
    Return the search backend plugins that are available.

    Discovery is delegated to the hooks system (get_search_backends).
    The result of the first discovery is stored in the module-level
    _search_backends_cache and handed back on later calls without
    running discovery again.
    """
    global _search_backends_cache

    # Fast path: discovery already ran, reuse the memoized result.
    if _search_backends_cache is not None:
        return _search_backends_cache

    # Imported lazily, only when discovery actually has to run.
    from archivebox.hooks import get_search_backends

    discovered = get_search_backends()
    _search_backends_cache = discovered
    return discovered
|
|
|
|
|
|
def get_backend() -> Any:
    """
    Return the search backend module selected by the configuration.

    Looks up SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE among the backends
    reported by get_available_backends(). When that name is not available,
    the 'ripgrep' backend is returned instead, if it is present.

    Raises:
        RuntimeError: if neither the configured backend nor 'ripgrep'
            is among the available backends.
    """
    backend_name = SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE
    discovered = get_available_backends()

    # Preference order: the configured engine first, then ripgrep as the
    # fallback (it needs no index).
    for candidate in (backend_name, 'ripgrep'):
        if candidate in discovered:
            return discovered[candidate]

    # Neither candidate exists: report what was actually discovered.
    available = list(discovered.keys())
    raise RuntimeError(
        f'Search backend "{backend_name}" not found. '
        f'Available backends: {available or "none"}'
    )
|
|
|
|
|
|
@enforce_types
def query_search_index(query: str) -> QuerySet:
    """
    Look up the snapshots that match a search query.

    Returns Snapshot.objects.none() when
    SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND is off. Otherwise returns
    a QuerySet of Snapshot objects filtered to the primary keys returned
    by the active backend's search().

    An exception raised by the backend is reported on stderr and then
    re-raised to the caller.
    """
    from archivebox.core.models import Snapshot

    if not SEARCH_BACKEND_CONFIG.USE_SEARCHING_BACKEND:
        return Snapshot.objects.none()

    search_backend = get_backend()
    try:
        matching_pks = search_backend.search(query)
    except Exception as err:
        stderr()
        stderr(
            f'[X] The search backend threw an exception={err}:',
            color='red',
        )
        raise

    # Only reached when the search succeeded; the handler above re-raises.
    return Snapshot.objects.filter(pk__in=matching_pks)
|
|
|
|
|
|
@enforce_types
def flush_search_index(snapshots: QuerySet) -> None:
    """
    Remove snapshots from the search index.

    Does nothing when SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND is off or
    when there are no snapshots to remove.

    Flushing is best-effort: an exception raised by the backend's flush()
    is reported on stderr and is not propagated to the caller.

    Args:
        snapshots: QuerySet of the snapshots whose entries should be
            removed; their primary keys are passed to the backend as
            strings.
    """
    if not SEARCH_BACKEND_CONFIG.USE_INDEXING_BACKEND:
        return

    # Falsy values that are not QuerySets (e.g. None) mean "nothing to do".
    # QuerySets are deliberately excluded from this truthiness test:
    # bool(queryset) evaluates it and loads every full row.
    if not isinstance(snapshots, QuerySet) and not snapshots:
        return

    # Fetch only the primary keys, in a single query, and use the resulting
    # list for the emptiness check as well.
    snapshot_pks = [str(pk) for pk in snapshots.values_list('pk', flat=True)]
    if not snapshot_pks:
        return

    backend = get_backend()
    try:
        backend.flush(snapshot_pks)
    except Exception as err:
        # Best-effort: log the failure but do not interrupt the caller.
        stderr()
        stderr(
            f'[X] The search backend threw an exception={err}:',
            color='red',
        )