Add ripgrep rg search backend and set as default

This commit is contained in:
JDC
2020-11-22 20:56:24 -05:00
committed by Nick Sweeting
parent 8484bdb973
commit 95382b3812
4 changed files with 56 additions and 15 deletions

View File

@@ -142,7 +142,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
'SEARCH_BACKEND_CONFIG' : {
'USE_INDEXING_BACKEND': {'type': bool, 'default': True},
'USE_SEARCHING_BACKEND': {'type': bool, 'default': True},
'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'sonic'},
'SEARCH_BACKEND_ENGINE': {'type': str, 'default': 'ripgrep'},
'SEARCH_BACKEND_HOST_NAME': {'type': str, 'default': 'localhost'},
'SEARCH_BACKEND_PORT': {'type': int, 'default': 1491},
'SEARCH_BACKEND_PASSWORD': {'type': str, 'default': 'SecretPassword'},

View File

@@ -0,0 +1,43 @@
import re
from subprocess import run, PIPE, DEVNULL
from typing import List, Generator
from archivebox.config import setup_django, ARCHIVE_DIR, ARCHIVE_DIR_NAME
from archivebox.util import enforce_types
# Short rg flags passed as a single bundle: -i (case-insensitive), -l (print
# only the paths of files that match), -t (restrict to a file type). The type
# name for -t is the next argument on the command line, DEFAULT_EXTENSIONS.
DEFAULT_ARGUMENTS = '-ilt' # Case insensitive, matching files, types
# File type handed to rg right after DEFAULT_ARGUMENTS.
DEFAULT_EXTENSIONS = 'html'
# Flag that introduces the search pattern on the rg command line.
REGEX_ARGUMENT = '-e'
# Captures a "<digits>.<digits>" path component enclosed by slashes.
TIMESTAMP_REGEX = r'\/([\d]+\.[\d]+)\/'
# Compiled once at import time; search() applies it to every path rg reports.
ts_regex = re.compile(TIMESTAMP_REGEX)
@enforce_types
def index(snapshot_id: str, texts: List[str]):
    """Accept an indexing request and do nothing.

    This module's search() greps the archive directory directly, so there
    is no index to update here.

    Args:
        snapshot_id: Identifier of the snapshot; ignored.
        texts: Text fragments for the snapshot; ignored.

    Returns:
        None.
    """
    return None
@enforce_types
def flush(snapshot_ids: Generator[str, None, None]):
    """Accept a flush request and do nothing.

    Nothing is stored by index() in this module, so there is nothing to
    remove. The generator is not consumed.

    Args:
        snapshot_ids: Identifiers of the snapshots to flush; ignored.

    Returns:
        None.
    """
    return None
@enforce_types
def search(text: str) -> List[str]:
    """Search archived HTML files with ripgrep and return matching snapshot ids.

    Runs ``rg`` over ARCHIVE_DIR with ``text`` as the pattern, extracts the
    first "<digits>.<digits>" path component from every matching file path,
    and looks up the Snapshot rows whose ``timestamp`` is one of those values.

    Args:
        text: Pattern handed to ``rg`` via ``-e``.

    Returns:
        Primary keys (as strings) of the snapshots that own a matching file.

    Raises:
        Exception: If no ``rg`` executable can be found on PATH.
        subprocess.TimeoutExpired: If ``rg`` runs longer than 60 seconds.
    """
    # Function-scope import: the stdlib lookup replaces spawning an external
    # `which` process, which is not available on every platform.
    from shutil import which

    if which('rg') is None:
        raise Exception("rg binary not found, install ripgrep to use this backend")

    # Django must be configured before the model can be imported.
    setup_django(check_db=True)
    from core.models import Snapshot

    rg_cmd = [
        'rg',
        DEFAULT_ARGUMENTS,
        DEFAULT_EXTENSIONS,
        REGEX_ARGUMENT,
        text,
        str(ARCHIVE_DIR),
    ]
    rg = run(rg_cmd, stdout=PIPE, stderr=PIPE, timeout=60)

    archive_dir_name = str(ARCHIVE_DIR_NAME)
    timestamps = set()
    for raw_path in rg.stdout.splitlines():
        # Only the numeric timestamp component is needed, so a lossy decode is
        # safe and keeps one oddly-encoded file name from aborting the search.
        path = raw_path.decode(errors='replace').replace(archive_dir_name, '')
        if match := ts_regex.search(path):
            timestamps.add(match.group(1))

    matching_pks = Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)
    return [str(pk) for pk in matching_pks]