mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 07:17:52 +10:00
wip major changes
This commit is contained in:
24
archivebox/plugins/search_backend_sqlite/config.json
Normal file
24
archivebox/plugins/search_backend_sqlite/config.json
Normal file
@@ -0,0 +1,24 @@
|
||||
{
|
||||
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||
"type": "object",
|
||||
"additionalProperties": false,
|
||||
"properties": {
|
||||
"SQLITEFTS_DB": {
|
||||
"type": "string",
|
||||
"default": "search.sqlite3",
|
||||
"description": "SQLite FTS database filename"
|
||||
},
|
||||
"FTS_SEPARATE_DATABASE": {
|
||||
"type": "boolean",
|
||||
"default": true,
|
||||
"x-aliases": ["SQLITEFTS_SEPARATE_DATABASE"],
|
||||
"description": "Use separate database file for FTS index"
|
||||
},
|
||||
"FTS_TOKENIZERS": {
|
||||
"type": "string",
|
||||
"default": "porter unicode61 remove_diacritics 2",
|
||||
"x-aliases": ["SQLITEFTS_TOKENIZERS"],
|
||||
"description": "FTS5 tokenizer configuration"
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -0,0 +1,215 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
SQLite FTS5 search backend - indexes snapshot content for full-text search.
|
||||
|
||||
This hook runs after all extractors and indexes text content in SQLite FTS5.
|
||||
Only runs if SEARCH_BACKEND_ENGINE=sqlite.
|
||||
|
||||
Usage: on_Snapshot__90_index_sqlite.py --url=<url> --snapshot-id=<uuid>
|
||||
|
||||
Environment variables:
|
||||
SEARCH_BACKEND_ENGINE: Must be 'sqlite' for this hook to run
|
||||
USE_INDEXING_BACKEND: Enable search indexing (default: true)
|
||||
SQLITEFTS_DB: Database filename (default: search.sqlite3)
|
||||
FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2)
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
import rich_click as click
|
||||
|
||||
|
||||
# Extractor metadata
EXTRACTOR_NAME = 'index_sqlite'   # name reported in the RESULT_JSON summary
OUTPUT_DIR = 'search_index'       # reported as OUTPUT= when indexing succeeds

# Text file patterns to index, in priority order.
# Each entry is (extractor output dir, filename or glob pattern relative to it).
# Clean plain-text outputs (readability/mercury) come first; raw HTML dumps
# (singlefile/dom/wget) are fallbacks and get tag-stripped before indexing.
INDEXABLE_FILES = [
    ('readability', 'content.txt'),
    ('readability', 'content.html'),
    ('mercury', 'content.txt'),
    ('mercury', 'content.html'),
    ('htmltotext', 'output.txt'),
    ('singlefile', 'singlefile.html'),
    ('dom', 'output.html'),
    ('wget', '**/*.html'),
    ('wget', '**/*.htm'),
    ('title', 'title.txt'),
]
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Read an environment variable, falling back to *default*, with surrounding whitespace removed."""
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
|
||||
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Interpret an environment variable as a boolean.

    Recognizes the usual truthy/falsy spellings (case-insensitive, whitespace
    ignored); any other value — including an unset variable — yields *default*.
    """
    text = os.environ.get(name, '').strip().lower()
    if text in ('true', '1', 'yes', 'on'):
        return True
    elif text in ('false', '0', 'no', 'off'):
        return False
    return default
|
||||
|
||||
|
||||
def strip_html_tags(html: str) -> str:
    """Remove HTML tags and common entities, keeping plain text content.

    Strips <script>/<style> blocks entirely (their contents are code, not
    indexable text), replaces remaining tags with spaces so adjacent words
    don't fuse, decodes the handful of entities that matter for search,
    and collapses runs of whitespace.
    """
    # Drop script/style bodies first so JS/CSS is never indexed as content.
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
    # Replace tags with a space (not '') so '<p>a</p><p>b</p>' -> 'a b'.
    html = re.sub(r'<[^>]+>', ' ', html)
    # Decode the common entities. The original replacements had been mangled
    # into no-ops ('.replace("&", "&")' etc.); restore the intended entity
    # names. '&amp;' is decoded LAST so '&amp;lt;' does not double-unescape
    # into '<'.
    html = html.replace('&nbsp;', ' ')
    html = html.replace('&lt;', '<').replace('&gt;', '>')
    html = html.replace('&quot;', '"')
    html = html.replace('&amp;', '&')
    # Collapse all whitespace (incl. newlines) into single spaces.
    html = re.sub(r'\s+', ' ', html)
    return html.strip()
|
||||
|
||||
|
||||
def find_indexable_content() -> list[tuple[str, str]]:
    """Collect (source-label, text) pairs from extractor output files.

    Walks INDEXABLE_FILES in priority order under the current working
    directory (the snapshot dir). HTML/HTM files are tag-stripped down to
    plain text; empty or unreadable files are silently skipped.
    """
    collected: list[tuple[str, str]] = []
    base = Path.cwd()

    for extractor, pattern in INDEXABLE_FILES:
        subdir = base / extractor
        if not subdir.exists():
            continue

        # Glob patterns (wget dumps) expand to many files; plain names to one.
        if '*' in pattern:
            candidates = list(subdir.glob(pattern))
        else:
            candidate = subdir / pattern
            candidates = [candidate] if candidate.exists() else []

        for path in candidates:
            if not (path.is_file() and path.stat().st_size > 0):
                continue
            try:
                text = path.read_text(encoding='utf-8', errors='ignore')
            except Exception:
                continue  # unreadable file: skip, best-effort indexing
            if not text.strip():
                continue
            if path.suffix in ('.html', '.htm'):
                text = strip_html_tags(text)
            collected.append((f'{extractor}/{path.name}', text))

    return collected
|
||||
|
||||
|
||||
def get_db_path() -> Path:
    """Resolve the search index database location.

    DATA_DIR defaults to two levels above the snapshot dir this hook runs
    in (i.e. the collection root).
    """
    fallback_data_dir = str(Path.cwd().parent.parent)
    data_dir = os.environ.get('DATA_DIR', fallback_data_dir).strip()
    db_name = os.environ.get('SQLITEFTS_DB', 'search.sqlite3').strip()
    return Path(data_dir) / db_name
|
||||
|
||||
|
||||
def index_in_sqlite(snapshot_id: str, texts: list[str]) -> None:
    """Index texts in SQLite FTS5.

    Replaces any existing index row for *snapshot_id* with one row whose
    content is all *texts* concatenated. Creates the FTS5 table on first use.
    """
    db_path = get_db_path()
    tokenizers = get_env('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2')
    conn = sqlite3.connect(str(db_path))

    try:
        # Create FTS5 table if needed.
        # NOTE(review): the tokenizer string is interpolated directly into the
        # DDL (FTS5 tokenize options cannot be bound as parameters); a quote
        # character in FTS_TOKENIZERS would break the statement. The value
        # comes from operator config, but it is worth validating upstream.
        conn.execute(f'''
            CREATE VIRTUAL TABLE IF NOT EXISTS search_index
            USING fts5(snapshot_id, content, tokenize='{tokenizers}')
        ''')

        # Remove existing entries so re-archiving a snapshot replaces its row
        # instead of accumulating duplicates.
        conn.execute('DELETE FROM search_index WHERE snapshot_id = ?', (snapshot_id,))

        # Insert new content: all sources joined into one searchable document.
        content = '\n\n'.join(texts)
        conn.execute(
            'INSERT INTO search_index (snapshot_id, content) VALUES (?, ?)',
            (snapshot_id, content)
        )
        conn.commit()
    finally:
        conn.close()
|
||||
|
||||
|
||||
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Index snapshot content in SQLite FTS5.

    Runs as a hook in the snapshot's directory. Prints a machine-readable
    summary (START_TS/END_TS/STATUS/RESULT_JSON lines) on stdout and exits
    0 on success or configuration-based skip, 1 on failure.
    """

    start_ts = datetime.now(timezone.utc)
    output = None            # set to OUTPUT_DIR only on successful indexing
    status = 'failed'        # pessimistic default; overwritten on success/skip
    error = ''
    indexed_sources = []

    try:
        # Check if this backend is enabled (permanent skips - don't retry)
        backend = get_env('SEARCH_BACKEND_ENGINE', 'sqlite')
        if backend != 'sqlite':
            print(f'Skipping SQLite indexing (SEARCH_BACKEND_ENGINE={backend})')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - different backend selected
        if not get_env_bool('USE_INDEXING_BACKEND', True):
            print('Skipping indexing (USE_INDEXING_BACKEND=False)')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - indexing disabled
        else:
            contents = find_indexable_content()
            indexed_sources = [source for source, _ in contents]

            if not contents:
                # Nothing to index is a skip, not a failure: extractors may
                # legitimately have produced no text output for this URL.
                status = 'skipped'
                print('No indexable content found')
            else:
                texts = [content for _, content in contents]
                index_in_sqlite(snapshot_id, texts)
                status = 'succeeded'
                output = OUTPUT_DIR
                print(f'SQLite FTS indexed {len(texts)} documents')
                print(f'Sources: {", ".join(indexed_sources)}')

    except Exception as e:
        # Any unexpected error marks the hook failed; details printed to
        # stderr below alongside the summary lines.
        error = f'{type(e).__name__}: {e}'
        status = 'failed'

    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()

    # Machine-readable summary lines parsed by the hook runner.
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')

    if error:
        print(f'ERROR={error}', file=sys.stderr)

    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'indexed_sources': indexed_sources,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')

    # NOTE(review): a 'skipped' status from the no-content path reaches this
    # line and exits 1, unlike the env-based skips above which exit 0 —
    # confirm that asymmetry is intended.
    sys.exit(0 if status == 'succeeded' else 1)
|
||||
|
||||
|
||||
# Script entry point: this file is executed directly as a hook, not imported.
if __name__ == '__main__':
    main()
|
||||
65
archivebox/plugins/search_backend_sqlite/search.py
Normal file
65
archivebox/plugins/search_backend_sqlite/search.py
Normal file
@@ -0,0 +1,65 @@
|
||||
"""
|
||||
SQLite FTS5 search backend - search and flush operations.
|
||||
|
||||
This module provides the search interface for the SQLite FTS backend.
|
||||
|
||||
Environment variables:
|
||||
SQLITEFTS_DB: Database filename (default: search.sqlite3)
|
||||
FTS_SEPARATE_DATABASE: Use separate database file (default: true)
|
||||
FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2)
|
||||
"""
|
||||
|
||||
import os
|
||||
import sqlite3
|
||||
from pathlib import Path
|
||||
from typing import List, Iterable
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
|
||||
# Config with old var names for backwards compatibility
# Database filename, resolved relative to DATA_DIR (default: search.sqlite3).
SQLITEFTS_DB = os.environ.get('SQLITEFTS_DB', 'search.sqlite3').strip()
# Whether the FTS index lives in its own database file rather than the main DB.
FTS_SEPARATE_DATABASE = os.environ.get('FTS_SEPARATE_DATABASE', 'true').lower() in ('true', '1', 'yes')
# FTS5 tokenizer configuration string, passed verbatim to the fts5() DDL.
FTS_TOKENIZERS = os.environ.get('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2').strip()
|
||||
|
||||
|
||||
def get_db_path() -> Path:
    """Return the path of the FTS index database inside the data directory."""
    data_dir = Path(settings.DATA_DIR)
    return data_dir / SQLITEFTS_DB
|
||||
|
||||
|
||||
def search(query: str) -> List[str]:
    """Return snapshot_ids whose indexed content matches *query*.

    The query uses SQLite FTS5 MATCH syntax. Returns an empty list when the
    database file or the search_index table does not exist yet.
    """
    db_path = get_db_path()
    if not db_path.exists():
        return []

    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            'SELECT DISTINCT snapshot_id FROM search_index WHERE search_index MATCH ?',
            (query,)
        ).fetchall()
    except sqlite3.OperationalError:
        # Table doesn't exist yet: nothing has been indexed.
        rows = []
    finally:
        conn.close()
    return [snapshot_id for (snapshot_id,) in rows]
|
||||
|
||||
|
||||
def flush(snapshot_ids: Iterable[str]) -> None:
    """Remove the given snapshots from the FTS index.

    No-op when the index database file or the search_index table does not
    exist yet. Commits once after all deletes.
    """
    db_path = get_db_path()
    if not db_path.exists():
        return

    conn = sqlite3.connect(str(db_path))
    try:
        # executemany batches all deletes in one C-level loop instead of a
        # Python-level execute() call per snapshot id.
        conn.executemany(
            'DELETE FROM search_index WHERE snapshot_id = ?',
            ((snapshot_id,) for snapshot_id in snapshot_ids),
        )
        conn.commit()
    except sqlite3.OperationalError:
        pass  # Table doesn't exist yet -- nothing to flush
    finally:
        conn.close()
|
||||
Reference in New Issue
Block a user