wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -0,0 +1,24 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SQLITEFTS_DB": {
"type": "string",
"default": "search.sqlite3",
"description": "SQLite FTS database filename"
},
"FTS_SEPARATE_DATABASE": {
"type": "boolean",
"default": true,
"x-aliases": ["SQLITEFTS_SEPARATE_DATABASE"],
"description": "Use separate database file for FTS index"
},
"FTS_TOKENIZERS": {
"type": "string",
"default": "porter unicode61 remove_diacritics 2",
"x-aliases": ["SQLITEFTS_TOKENIZERS"],
"description": "FTS5 tokenizer configuration"
}
}
}

View File

@@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""
SQLite FTS5 search backend - indexes snapshot content for full-text search.
This hook runs after all extractors and indexes text content in SQLite FTS5.
Only runs if SEARCH_BACKEND_ENGINE=sqlite.
Usage: on_Snapshot__90_index_sqlite.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SEARCH_BACKEND_ENGINE: Must be 'sqlite' for this hook to run
USE_INDEXING_BACKEND: Enable search indexing (default: true)
SQLITEFTS_DB: Database filename (default: search.sqlite3)
FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2)
"""
import json
import os
import re
import sqlite3
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata: the name reported in RESULT_JSON and the directory name
# reported via OUTPUT= on success.
EXTRACTOR_NAME = 'index_sqlite'
OUTPUT_DIR = 'search_index'
# (extractor_dir, file_pattern) pairs to scan for indexable text, in priority
# order. Patterns containing '*' are globbed relative to the extractor's
# output directory; all others are treated as literal filenames.
INDEXABLE_FILES = [
    ('readability', 'content.txt'),
    ('readability', 'content.html'),
    ('mercury', 'content.txt'),
    ('mercury', 'content.html'),
    ('htmltotext', 'output.txt'),
    ('singlefile', 'singlefile.html'),
    ('dom', 'output.html'),
    ('wget', '**/*.html'),
    ('wget', '**/*.htm'),
    ('title', 'title.txt'),
]
def get_env(name: str, default: str = '') -> str:
    """Return the environment variable *name* with surrounding whitespace
    stripped, or *default* (also stripped) when the variable is unset."""
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse the environment variable *name* as a boolean.

    Accepts true/1/yes/on and false/0/no/off (case-insensitive, whitespace
    ignored); any other value — including an unset variable — yields *default*.
    """
    raw = os.environ.get(name, '').strip().lower()
    if raw in ('true', '1', 'yes', 'on'):
        return True
    return False if raw in ('false', '0', 'no', 'off') else default
def strip_html_tags(html: str) -> str:
    """Remove HTML tags and decode a few common entities, keeping text content.

    <script> and <style> elements are dropped entirely (their contents are
    noise for a search index); every other tag is replaced with a space so
    adjacent words don't fuse. Runs of whitespace collapse to single spaces.

    Note: only handles the handful of entities listed here; other entities
    (e.g. &#39;) pass through unchanged.
    """
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<[^>]+>', ' ', html)
    html = html.replace('&nbsp;', ' ')
    html = html.replace('&lt;', '<').replace('&gt;', '>')
    html = html.replace('&quot;', '"')
    # '&amp;' must be decoded LAST: decoding it first turns '&amp;lt;' into
    # '&lt;', which the earlier replace would then wrongly decode to '<'
    # (a double-unescape bug).
    html = html.replace('&amp;', '&')
    html = re.sub(r'\s+', ' ', html)
    return html.strip()
def find_indexable_content() -> list[tuple[str, str]]:
    """Collect ('extractor/filename', text) pairs from extractor outputs.

    Walks INDEXABLE_FILES relative to the current snapshot directory in
    priority order. HTML/HTM files are reduced to plain text; empty,
    missing, or unreadable files are silently skipped.
    """
    collected: list[tuple[str, str]] = []
    snapshot_dir = Path.cwd()
    for extractor, pattern in INDEXABLE_FILES:
        source_dir = snapshot_dir / extractor
        if not source_dir.exists():
            continue
        # Literal filenames are checked directly; '*' patterns are globbed.
        if '*' in pattern:
            candidates = list(source_dir.glob(pattern))
        else:
            literal = source_dir / pattern
            candidates = [literal] if literal.exists() else []
        for candidate in candidates:
            if not candidate.is_file() or candidate.stat().st_size == 0:
                continue
            try:
                text = candidate.read_text(encoding='utf-8', errors='ignore')
                if not text.strip():
                    continue
                if candidate.suffix in ('.html', '.htm'):
                    text = strip_html_tags(text)
                collected.append((f'{extractor}/{candidate.name}', text))
            except Exception:
                # Best-effort: an unreadable file never aborts indexing.
                continue
    return collected
def get_db_path() -> Path:
    """Resolve the path of the search-index database.

    DATA_DIR defaults to two directories above the snapshot cwd (presumably
    the collection root — confirm against the hook runner's layout); the
    filename comes from SQLITEFTS_DB.
    """
    fallback_data_dir = str(Path.cwd().parent.parent)
    data_dir = Path(get_env('DATA_DIR', fallback_data_dir))
    return data_dir / get_env('SQLITEFTS_DB', 'search.sqlite3')
def index_in_sqlite(snapshot_id: str, texts: list[str]) -> None:
    """Replace the FTS5 index entry for *snapshot_id* with the given texts.

    Creates the `search_index` FTS5 table on first use; all texts are joined
    into one document per snapshot so re-indexing is idempotent.

    Raises:
        ValueError: if FTS_TOKENIZERS contains characters outside the safe set.
        sqlite3.Error: on database failures.
    """
    db_path = get_db_path()
    tokenizers = get_env('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2')
    # The tokenize= option is DDL and cannot be bound as a '?' parameter, so
    # the value must be interpolated into the SQL string. Whitelist the
    # characters FTS5 tokenizer specs actually use (words, digits, spaces) to
    # block SQL injection through the environment variable.
    if not re.fullmatch(r'[A-Za-z0-9_ ]+', tokenizers):
        raise ValueError(f'Unsafe FTS_TOKENIZERS value: {tokenizers!r}')
    conn = sqlite3.connect(str(db_path))
    try:
        # Create FTS5 table if needed
        conn.execute(f'''
            CREATE VIRTUAL TABLE IF NOT EXISTS search_index
            USING fts5(snapshot_id, content, tokenize='{tokenizers}')
        ''')
        # Remove existing entries so re-runs replace rather than duplicate
        conn.execute('DELETE FROM search_index WHERE snapshot_id = ?', (snapshot_id,))
        # Insert new content as a single joined document
        conn.execute(
            'INSERT INTO search_index (snapshot_id, content) VALUES (?, ?)',
            (snapshot_id, '\n\n'.join(texts)),
        )
        conn.commit()
    finally:
        conn.close()
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Index snapshot content in SQLite FTS5.

    Emits START_TS/END_TS/DURATION/STATUS/RESULT_JSON footer lines on stdout
    for the hook runner. Exits 0 on success and on permanent env-based skips;
    exits 1 on failure — and also on the "no indexable content" skip
    (NOTE(review): presumably so the runner can retry later — confirm).
    """
    start_ts = datetime.now(timezone.utc)
    output = None  # set to OUTPUT_DIR only when indexing succeeds
    status = 'failed'  # pessimistic default, overwritten on skip/success
    error = ''
    indexed_sources: list[str] = []
    try:
        # Check if this backend is enabled (permanent skips - don't retry)
        backend = get_env('SEARCH_BACKEND_ENGINE', 'sqlite')
        if backend != 'sqlite':
            # Another search backend is configured: emit the full footer here
            # and exit 0 immediately so this is never treated as a failure.
            print(f'Skipping SQLite indexing (SEARCH_BACKEND_ENGINE={backend})')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - different backend selected
        if not get_env_bool('USE_INDEXING_BACKEND', True):
            print('Skipping indexing (USE_INDEXING_BACKEND=False)')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - indexing disabled
        else:
            # Gather text produced by earlier extractor hooks and index it.
            contents = find_indexable_content()
            indexed_sources = [source for source, _ in contents]
            if not contents:
                status = 'skipped'
                print('No indexable content found')
            else:
                texts = [content for _, content in contents]
                index_in_sqlite(snapshot_id, texts)
                status = 'succeeded'
                output = OUTPUT_DIR
                print(f'SQLite FTS indexed {len(texts)} documents')
                print(f'Sources: {", ".join(indexed_sources)}')
    except Exception as e:
        # Never raise out of the hook: failures are reported via STATUS/ERROR.
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    # Machine-readable footer parsed by the hook runner; line order matters.
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'indexed_sources': indexed_sources,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
# Script entry point: click parses --url/--snapshot-id and invokes main().
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,65 @@
"""
SQLite FTS5 search backend - search and flush operations.
This module provides the search interface for the SQLite FTS backend.
Environment variables:
SQLITEFTS_DB: Database filename (default: search.sqlite3)
FTS_SEPARATE_DATABASE: Use separate database file (default: true)
FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2)
"""
import os
import sqlite3
from pathlib import Path
from typing import List, Iterable
from django.conf import settings
# Backend configuration, read once from the environment at import time.
# Only the new FTS_* / SQLITEFTS_DB names are read here; the SQLITEFTS_*
# legacy aliases are presumably resolved upstream via the config schema's
# x-aliases before reaching the environment — confirm against the loader.
SQLITEFTS_DB = os.environ.get('SQLITEFTS_DB', 'search.sqlite3').strip()
FTS_SEPARATE_DATABASE = os.environ.get('FTS_SEPARATE_DATABASE', 'true').lower() in ('true', '1', 'yes')
FTS_TOKENIZERS = os.environ.get('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2').strip()
def get_db_path() -> Path:
    """Return the absolute path of the FTS index database inside DATA_DIR."""
    data_root = Path(settings.DATA_DIR)
    return data_root / SQLITEFTS_DB
def search(query: str) -> List[str]:
    """Return the snapshot_ids whose indexed content matches the FTS5 *query*.

    Returns an empty list when the index database or the FTS table does not
    exist yet (nothing has been indexed).
    """
    db_path = get_db_path()
    if not db_path.exists():
        return []
    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            'SELECT DISTINCT snapshot_id FROM search_index WHERE search_index MATCH ?',
            (query,),
        ).fetchall()
    except sqlite3.OperationalError:
        # The FTS table hasn't been created yet.
        rows = []
    finally:
        conn.close()
    return [snapshot_id for (snapshot_id,) in rows]
def flush(snapshot_ids: Iterable[str]) -> None:
    """Delete the given snapshots from the FTS index, if the index exists.

    A missing database file or missing table is treated as "nothing to do".
    """
    db_path = get_db_path()
    if not db_path.exists():
        return
    conn = sqlite3.connect(str(db_path))
    try:
        conn.executemany(
            'DELETE FROM search_index WHERE snapshot_id = ?',
            ((snapshot_id,) for snapshot_id in snapshot_ids),
        )
        conn.commit()
    except sqlite3.OperationalError:
        # Table was never created; nothing to flush.
        pass
    finally:
        conn.close()