wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -0,0 +1,24 @@
{
"$schema": "http://json-schema.org/draft-07/schema#",
"type": "object",
"additionalProperties": false,
"properties": {
"SQLITEFTS_DB": {
"type": "string",
"default": "search.sqlite3",
"description": "SQLite FTS database filename"
},
"FTS_SEPARATE_DATABASE": {
"type": "boolean",
"default": true,
"x-aliases": ["SQLITEFTS_SEPARATE_DATABASE"],
"description": "Use separate database file for FTS index"
},
"FTS_TOKENIZERS": {
"type": "string",
"default": "porter unicode61 remove_diacritics 2",
"x-aliases": ["SQLITEFTS_TOKENIZERS"],
"description": "FTS5 tokenizer configuration"
}
}
}

View File

@@ -0,0 +1,215 @@
#!/usr/bin/env python3
"""
SQLite FTS5 search backend - indexes snapshot content for full-text search.
This hook runs after all extractors and indexes text content in SQLite FTS5.
Only runs if SEARCH_BACKEND_ENGINE=sqlite.
Usage: on_Snapshot__90_index_sqlite.py --url=<url> --snapshot-id=<uuid>
Environment variables:
SEARCH_BACKEND_ENGINE: Must be 'sqlite' for this hook to run
USE_INDEXING_BACKEND: Enable search indexing (default: true)
SQLITEFTS_DB: Database filename (default: search.sqlite3)
FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2)
"""
import json
import os
import re
import sqlite3
import sys
from datetime import datetime, timezone
from pathlib import Path
import rich_click as click
# Extractor metadata: the name reported in RESULT_JSON and the directory name
# reported via OUTPUT= on success.
EXTRACTOR_NAME = 'index_sqlite'
OUTPUT_DIR = 'search_index'
# (extractor_dir, file_pattern) pairs to scan for indexable text, in priority
# order. Patterns containing '*' are globbed relative to the extractor's
# output directory; all others are treated as literal filenames.
INDEXABLE_FILES = [
    ('readability', 'content.txt'),
    ('readability', 'content.html'),
    ('mercury', 'content.txt'),
    ('mercury', 'content.html'),
    ('htmltotext', 'output.txt'),
    ('singlefile', 'singlefile.html'),
    ('dom', 'output.html'),
    ('wget', '**/*.html'),
    ('wget', '**/*.htm'),
    ('title', 'title.txt'),
]
def get_env(name: str, default: str = '') -> str:
    """Return the environment variable *name* with surrounding whitespace
    stripped, or *default* (also stripped) when the variable is unset."""
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse the environment variable *name* as a boolean.

    Accepts true/1/yes/on and false/0/no/off (case-insensitive, whitespace
    ignored); any other value — including an unset variable — yields *default*.
    """
    raw = os.environ.get(name, '').strip().lower()
    if raw in ('true', '1', 'yes', 'on'):
        return True
    return False if raw in ('false', '0', 'no', 'off') else default
def strip_html_tags(html: str) -> str:
    """Remove HTML tags and decode a few common entities, keeping text content.

    <script> and <style> elements are dropped entirely (their contents are
    noise for a search index); every other tag is replaced with a space so
    adjacent words don't fuse. Runs of whitespace collapse to single spaces.

    Note: only handles the handful of entities listed here; other entities
    (e.g. &#39;) pass through unchanged.
    """
    html = re.sub(r'<script[^>]*>.*?</script>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<style[^>]*>.*?</style>', '', html, flags=re.DOTALL | re.IGNORECASE)
    html = re.sub(r'<[^>]+>', ' ', html)
    html = html.replace('&nbsp;', ' ')
    html = html.replace('&lt;', '<').replace('&gt;', '>')
    html = html.replace('&quot;', '"')
    # '&amp;' must be decoded LAST: decoding it first turns '&amp;lt;' into
    # '&lt;', which the earlier replace would then wrongly decode to '<'
    # (a double-unescape bug).
    html = html.replace('&amp;', '&')
    html = re.sub(r'\s+', ' ', html)
    return html.strip()
def find_indexable_content() -> list[tuple[str, str]]:
    """Collect ('extractor/filename', text) pairs from extractor outputs.

    Walks INDEXABLE_FILES relative to the current snapshot directory in
    priority order. HTML/HTM files are reduced to plain text; empty,
    missing, or unreadable files are silently skipped.
    """
    collected: list[tuple[str, str]] = []
    snapshot_dir = Path.cwd()
    for extractor, pattern in INDEXABLE_FILES:
        source_dir = snapshot_dir / extractor
        if not source_dir.exists():
            continue
        # Literal filenames are checked directly; '*' patterns are globbed.
        if '*' in pattern:
            candidates = list(source_dir.glob(pattern))
        else:
            literal = source_dir / pattern
            candidates = [literal] if literal.exists() else []
        for candidate in candidates:
            if not candidate.is_file() or candidate.stat().st_size == 0:
                continue
            try:
                text = candidate.read_text(encoding='utf-8', errors='ignore')
                if not text.strip():
                    continue
                if candidate.suffix in ('.html', '.htm'):
                    text = strip_html_tags(text)
                collected.append((f'{extractor}/{candidate.name}', text))
            except Exception:
                # Best-effort: an unreadable file never aborts indexing.
                continue
    return collected
def get_db_path() -> Path:
    """Resolve the path of the search-index database.

    DATA_DIR defaults to two directories above the snapshot cwd (presumably
    the collection root — confirm against the hook runner's layout); the
    filename comes from SQLITEFTS_DB.
    """
    fallback_data_dir = str(Path.cwd().parent.parent)
    data_dir = Path(get_env('DATA_DIR', fallback_data_dir))
    return data_dir / get_env('SQLITEFTS_DB', 'search.sqlite3')
def index_in_sqlite(snapshot_id: str, texts: list[str]) -> None:
    """Replace the FTS5 index entry for *snapshot_id* with the given texts.

    Creates the `search_index` FTS5 table on first use; all texts are joined
    into one document per snapshot so re-indexing is idempotent.

    Raises:
        ValueError: if FTS_TOKENIZERS contains characters outside the safe set.
        sqlite3.Error: on database failures.
    """
    db_path = get_db_path()
    tokenizers = get_env('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2')
    # The tokenize= option is DDL and cannot be bound as a '?' parameter, so
    # the value must be interpolated into the SQL string. Whitelist the
    # characters FTS5 tokenizer specs actually use (words, digits, spaces) to
    # block SQL injection through the environment variable.
    if not re.fullmatch(r'[A-Za-z0-9_ ]+', tokenizers):
        raise ValueError(f'Unsafe FTS_TOKENIZERS value: {tokenizers!r}')
    conn = sqlite3.connect(str(db_path))
    try:
        # Create FTS5 table if needed
        conn.execute(f'''
            CREATE VIRTUAL TABLE IF NOT EXISTS search_index
            USING fts5(snapshot_id, content, tokenize='{tokenizers}')
        ''')
        # Remove existing entries so re-runs replace rather than duplicate
        conn.execute('DELETE FROM search_index WHERE snapshot_id = ?', (snapshot_id,))
        # Insert new content as a single joined document
        conn.execute(
            'INSERT INTO search_index (snapshot_id, content) VALUES (?, ?)',
            (snapshot_id, '\n\n'.join(texts)),
        )
        conn.commit()
    finally:
        conn.close()
@click.command()
@click.option('--url', required=True, help='URL that was archived')
@click.option('--snapshot-id', required=True, help='Snapshot UUID')
def main(url: str, snapshot_id: str):
    """Index snapshot content in SQLite FTS5.

    Emits START_TS/END_TS/DURATION/STATUS/RESULT_JSON footer lines on stdout
    for the hook runner. Exits 0 on success and on permanent env-based skips;
    exits 1 on failure — and also on the "no indexable content" skip
    (NOTE(review): presumably so the runner can retry later — confirm).
    """
    start_ts = datetime.now(timezone.utc)
    output = None  # set to OUTPUT_DIR only when indexing succeeds
    status = 'failed'  # pessimistic default, overwritten on skip/success
    error = ''
    indexed_sources: list[str] = []
    try:
        # Check if this backend is enabled (permanent skips - don't retry)
        backend = get_env('SEARCH_BACKEND_ENGINE', 'sqlite')
        if backend != 'sqlite':
            # Another search backend is configured: emit the full footer here
            # and exit 0 immediately so this is never treated as a failure.
            print(f'Skipping SQLite indexing (SEARCH_BACKEND_ENGINE={backend})')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - different backend selected
        if not get_env_bool('USE_INDEXING_BACKEND', True):
            print('Skipping indexing (USE_INDEXING_BACKEND=False)')
            print(f'START_TS={start_ts.isoformat()}')
            print(f'END_TS={datetime.now(timezone.utc).isoformat()}')
            print(f'STATUS=skipped')
            print(f'RESULT_JSON={json.dumps({"extractor": EXTRACTOR_NAME, "status": "skipped", "url": url, "snapshot_id": snapshot_id})}')
            sys.exit(0)  # Permanent skip - indexing disabled
        else:
            # Gather text produced by earlier extractor hooks and index it.
            contents = find_indexable_content()
            indexed_sources = [source for source, _ in contents]
            if not contents:
                status = 'skipped'
                print('No indexable content found')
            else:
                texts = [content for _, content in contents]
                index_in_sqlite(snapshot_id, texts)
                status = 'succeeded'
                output = OUTPUT_DIR
                print(f'SQLite FTS indexed {len(texts)} documents')
                print(f'Sources: {", ".join(indexed_sources)}')
    except Exception as e:
        # Never raise out of the hook: failures are reported via STATUS/ERROR.
        error = f'{type(e).__name__}: {e}'
        status = 'failed'
    end_ts = datetime.now(timezone.utc)
    duration = (end_ts - start_ts).total_seconds()
    # Machine-readable footer parsed by the hook runner; line order matters.
    print(f'START_TS={start_ts.isoformat()}')
    print(f'END_TS={end_ts.isoformat()}')
    print(f'DURATION={duration:.2f}')
    if output:
        print(f'OUTPUT={output}')
    print(f'STATUS={status}')
    if error:
        print(f'ERROR={error}', file=sys.stderr)
    result_json = {
        'extractor': EXTRACTOR_NAME,
        'url': url,
        'snapshot_id': snapshot_id,
        'status': status,
        'start_ts': start_ts.isoformat(),
        'end_ts': end_ts.isoformat(),
        'duration': round(duration, 2),
        'output': output,
        'indexed_sources': indexed_sources,
        'error': error or None,
    }
    print(f'RESULT_JSON={json.dumps(result_json)}')
    sys.exit(0 if status == 'succeeded' else 1)
# Script entry point: click parses --url/--snapshot-id and invokes main().
if __name__ == '__main__':
    main()

View File

@@ -0,0 +1,65 @@
"""
SQLite FTS5 search backend - search and flush operations.
This module provides the search interface for the SQLite FTS backend.
Environment variables:
SQLITEFTS_DB: Database filename (default: search.sqlite3)
FTS_SEPARATE_DATABASE: Use separate database file (default: true)
FTS_TOKENIZERS: FTS5 tokenizer config (default: porter unicode61 remove_diacritics 2)
"""
import os
import sqlite3
from pathlib import Path
from typing import List, Iterable
from django.conf import settings
# Backend configuration, read once from the environment at import time.
# Only the new FTS_* / SQLITEFTS_DB names are read here; the SQLITEFTS_*
# legacy aliases are presumably resolved upstream via the config schema's
# x-aliases before reaching the environment — confirm against the loader.
SQLITEFTS_DB = os.environ.get('SQLITEFTS_DB', 'search.sqlite3').strip()
FTS_SEPARATE_DATABASE = os.environ.get('FTS_SEPARATE_DATABASE', 'true').lower() in ('true', '1', 'yes')
FTS_TOKENIZERS = os.environ.get('FTS_TOKENIZERS', 'porter unicode61 remove_diacritics 2').strip()
def get_db_path() -> Path:
    """Return the absolute path of the FTS index database inside DATA_DIR."""
    data_root = Path(settings.DATA_DIR)
    return data_root / SQLITEFTS_DB
def search(query: str) -> List[str]:
    """Return the snapshot_ids whose indexed content matches the FTS5 *query*.

    Returns an empty list when the index database or the FTS table does not
    exist yet (nothing has been indexed).
    """
    db_path = get_db_path()
    if not db_path.exists():
        return []
    conn = sqlite3.connect(str(db_path))
    try:
        rows = conn.execute(
            'SELECT DISTINCT snapshot_id FROM search_index WHERE search_index MATCH ?',
            (query,),
        ).fetchall()
    except sqlite3.OperationalError:
        # The FTS table hasn't been created yet.
        rows = []
    finally:
        conn.close()
    return [snapshot_id for (snapshot_id,) in rows]
def flush(snapshot_ids: Iterable[str]) -> None:
    """Delete the given snapshots from the FTS index, if the index exists.

    A missing database file or missing table is treated as "nothing to do".
    """
    db_path = get_db_path()
    if not db_path.exists():
        return
    conn = sqlite3.connect(str(db_path))
    try:
        conn.executemany(
            'DELETE FROM search_index WHERE snapshot_id = ?',
            ((snapshot_id,) for snapshot_id in snapshot_ids),
        )
        conn.commit()
    except sqlite3.OperationalError:
        # Table was never created; nothing to flush.
        pass
    finally:
        conn.close()