ArchiveBox/archivebox/cli/archivebox_update.py

#!/usr/bin/env python3

# Pin the package name explicitly.
# NOTE(review): presumably so package-relative resolution still works when this
# file is executed directly as a script (see the __main__ guard) — confirm.
__package__ = "archivebox.cli"

import os
import time

from typing import TYPE_CHECKING, Any
from collections.abc import Callable, Iterable
from pathlib import Path

import rich_click as click
from django.core.exceptions import ObjectDoesNotExist
from django.db.models import Q, QuerySet

from archivebox.misc.util import enforce_types, docstring

if TYPE_CHECKING:
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl


def _field_lookup(lookup: str) -> Callable[[str], Q]:
    """Return a builder that produces ``Q(<lookup>=pattern)`` for a given pattern."""
    return lambda pattern: Q(**{lookup: pattern})


def _domain_lookup(pattern: str) -> Q:
    """Match URLs that start with ``pattern`` under the http, https or ftp scheme."""
    prefixes = (f"http://{pattern}", f"https://{pattern}", f"ftp://{pattern}")
    query = Q()
    for prefix in prefixes:
        query |= Q(url__istartswith=prefix)
    return query


# Maps each supported filter type name to a callable that turns one
# user-supplied pattern into a Django Q object.
LINK_FILTERS: dict[str, Callable[[str], Q]] = {
    "exact": _field_lookup("url"),
    "substring": _field_lookup("url__icontains"),
    "regex": _field_lookup("url__iregex"),
    "domain": _domain_lookup,
    "tag": _field_lookup("tags__name"),
    "timestamp": _field_lookup("timestamp"),
}


def _apply_pattern_filters(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    filter_patterns: list[str],
    filter_type: str,
) -> QuerySet["Snapshot", "Snapshot"]:
    """
    Narrow ``snapshots`` to the rows matching ANY of ``filter_patterns``.

    Args:
        snapshots: Base queryset to filter.
        filter_patterns: Patterns to match; the per-pattern queries are OR-ed together.
            An empty list applies an empty ``Q()`` and therefore filters nothing out.
        filter_type: Key into ``LINK_FILTERS`` selecting how each pattern is matched.

    Returns:
        The filtered queryset.

    Raises:
        SystemExit: With exit code 2 when ``filter_type`` is not a key of ``LINK_FILTERS``.
    """
    import sys

    filter_builder = LINK_FILTERS.get(filter_type)
    if filter_builder is None:
        # Say why we are exiting instead of dying silently with a bare exit code.
        valid_types = ", ".join(sorted(LINK_FILTERS))
        print(
            f"[!] Unknown filter type {filter_type!r} (expected one of: {valid_types})",
            file=sys.stderr,
        )
        raise SystemExit(2)

    query = Q()
    for pattern in filter_patterns:
        query |= filter_builder(pattern)
    matched = snapshots.filter(query)

    # The "tag" filter looks up ``tags__name``, which joins the multi-valued tags
    # relation. OR-ing several tag patterns then yields one row per matching tag,
    # so a snapshot carrying two of the requested tags would be returned (and later
    # processed and counted) twice. distinct() collapses those duplicate rows.
    if filter_type == "tag" and filter_patterns:
        matched = matched.distinct()

    return matched


def _get_snapshot_crawl(snapshot: "Snapshot") -> "Crawl | None":
    try:
        return snapshot.crawl
    except ObjectDoesNotExist:
        return None


def _get_search_indexing_plugins() -> list[str]:
    """
    Return the sorted names of discovered plugins usable for search indexing.

    A plugin qualifies when its name starts with ``search_backend_``, the rest of
    the name is one of the backends reported by ``get_search_backends()``, and at
    least one of its hooks has a name containing ``Snapshot`` and
    (case-insensitively) ``index``.
    """
    from abx_dl.models import discover_plugins
    from archivebox.hooks import get_search_backends

    prefix = "search_backend_"
    enabled_backends = set(get_search_backends())

    def _has_snapshot_index_hook(plugin) -> bool:
        for hook in plugin.hooks:
            if "Snapshot" in hook.name and "index" in hook.name.lower():
                return True
        return False

    selected: list[str] = []
    for name, plugin in discover_plugins().items():
        if not name.startswith(prefix):
            continue
        if name.removeprefix(prefix) not in enabled_backends:
            continue
        if _has_snapshot_index_hook(plugin):
            selected.append(name)

    selected.sort()
    return selected


def _build_filtered_snapshots_queryset(
    *,
    filter_patterns: Iterable[str],
    filter_type: str,
    before: float | None,
    after: float | None,
    resume: str | None = None,
):
    """
    Build the Snapshot queryset selected by the CLI filter options.

    Pattern filters are applied first (when any are given), then the optional
    ``before`` / ``after`` bounds on ``bookmarked_at`` (given as POSIX timestamps)
    and the optional ``resume`` bound on ``timestamp``. Falsy option values are
    ignored. The result has ``crawl`` select-related and is ordered by
    ``bookmarked_at`` descending.
    """
    from archivebox.core.models import Snapshot
    from datetime import datetime

    queryset = Snapshot.objects.all()

    if filter_patterns:
        queryset = _apply_pattern_filters(queryset, list(filter_patterns), filter_type)

    # Collect the optional bounds in application order, then apply each one
    # as its own .filter() call.
    bounds: dict[str, Any] = {}
    if before:
        bounds["bookmarked_at__lt"] = datetime.fromtimestamp(before)
    if after:
        bounds["bookmarked_at__gt"] = datetime.fromtimestamp(after)
    if resume:
        bounds["timestamp__lte"] = resume

    for lookup, value in bounds.items():
        queryset = queryset.filter(**{lookup: value})

    queryset = queryset.select_related("crawl")
    return queryset.order_by("-bookmarked_at")


def reindex_snapshots(
    snapshots: QuerySet["Snapshot", "Snapshot"],
    *,
    search_plugins: list[str],
    batch_size: int,
) -> dict[str, int]:
    """
    Queue and run the given search plugins for every snapshot in ``snapshots``.

    Snapshots without a crawl are counted as processed but otherwise skipped.
    Snapshots whose output directory exists are reconciled with their index.json
    first. For each (snapshot, plugin) pair the most recently created existing
    ArchiveResult (if any) is reset for retry and an ArchiveResult record is
    queued; all queued records are then passed to ``run_plugins`` in one call.

    Returns:
        Counters keyed ``processed``, ``reconciled``, ``queued`` and ``reindexed``.

    Raises:
        SystemExit: With the exit code of ``run_plugins`` when it is non-zero.
    """
    from archivebox.cli.archivebox_extract import run_plugins

    counters = {"processed": 0, "reconciled": 0, "queued": 0, "reindexed": 0}
    pending: list[dict[str, str]] = []

    total = snapshots.count()
    print(f"[*] Reindexing {total} snapshots with search plugins: {', '.join(search_plugins)}")

    for snapshot in snapshots.iterator(chunk_size=batch_size):
        counters["processed"] += 1

        if _get_snapshot_crawl(snapshot) is None:
            continue

        snapshot_dir = Path(snapshot.output_dir)
        if snapshot_dir.exists() and snapshot_dir.is_dir():
            snapshot.reconcile_with_index_json()
            counters["reconciled"] += 1

        for plugin_name in search_plugins:
            previous_results = snapshot.archiveresult_set.filter(plugin=plugin_name)
            latest_result = previous_results.order_by("-created_at").first()
            if latest_result:
                latest_result.reset_for_retry()

            record = {
                "type": "ArchiveResult",
                "snapshot_id": str(snapshot.id),
                "plugin": plugin_name,
            }
            pending.append(record)
            counters["queued"] += 1

    if pending:
        exit_code = run_plugins(
            args=(),
            records=pending,
            wait=True,
            emit_results=False,
        )
        if exit_code != 0:
            raise SystemExit(exit_code)
        counters["reindexed"] = len(pending)

    return counters


@enforce_types
def update(
    filter_patterns: Iterable[str] = (),
    filter_type: str = "exact",
    before: float | None = None,
    after: float | None = None,
    resume: str | None = None,
    batch_size: int = 100,
    continuous: bool = False,
    index_only: bool = False,
) -> None:
    """
    Update snapshots: migrate old dirs, reconcile DB, and re-queue for archiving.

    Three-phase operation (without filters):
    - Phase 1: Drain old archive/ dirs by moving to new fs location (0.8.x → 0.9.x)
    - Phase 2: O(n) scan over entire DB from most recent to least recent
    - No orphan scans needed (trust 1:1 mapping between DB and filesystem after phase 1)

    With filters: Only phase 2 (DB query), no filesystem operations.
    Without filters: All phases (full update).
    """

    from rich import print
    from archivebox.config.django import setup_django

    setup_django()

    from django.core.management import call_command

    # Bring the DB schema up to date before touching any rows; a failure here
    # is reported but does not abort the update.
    print("[*] Checking for pending migrations...")
    try:
        call_command("migrate", "--no-input", verbosity=0)
    except Exception as err:
        print(f"[!] Warning: Migration check failed: {err}")

    # These inputs never change between passes, so decide the mode once.
    has_filters = bool(filter_patterns or before or after)
    drain_banner = "[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)..."

    while True:
        if index_only:
            search_plugins = _get_search_indexing_plugins()
            if not search_plugins:
                print("[*] No search indexing plugins are available, nothing to backfill.")
                break

            # Unfiltered index-only runs still drain old archive/ dirs first.
            if not has_filters:
                print(drain_banner)
                drain_old_archive_dirs(
                    resume_from=resume,
                    batch_size=batch_size,
                )

            selected = _build_filtered_snapshots_queryset(
                filter_patterns=filter_patterns,
                filter_type=filter_type,
                before=before,
                after=after,
                resume=resume,
            )
            index_stats = reindex_snapshots(
                selected,
                search_plugins=search_plugins,
                batch_size=batch_size,
            )
            print_index_stats(index_stats)
        elif has_filters:
            # Filtered mode: work from the DB query only.
            print("[*] Processing filtered snapshots from database...")
            filtered_stats = process_filtered_snapshots(
                filter_patterns=filter_patterns,
                filter_type=filter_type,
                before=before,
                after=after,
                resume=resume,
                batch_size=batch_size,
            )
            print_stats(filtered_stats)
        else:
            # Full mode: drain old dirs, then walk the whole DB.
            print(drain_banner)
            phase1_stats = drain_old_archive_dirs(
                resume_from=resume,
                batch_size=batch_size,
            )

            print("[*] Phase 2: Processing all database snapshots (most recent first)...")
            phase2_stats = process_all_db_snapshots(batch_size=batch_size, resume=resume)

            # A deduplication phase (Phase 3) is currently disabled.

            print_combined_stats({"phase1": phase1_stats, "phase2": phase2_stats})

        if continuous:
            print("[yellow]Sleeping 60s before next pass...[/yellow]")
            time.sleep(60)
            # The resume point only applies to the first pass.
            resume = None
            continue

        break


def drain_old_archive_dirs(resume_from: str | None = None, batch_size: int = 100) -> dict[str, int]:
    """
    Drain old archive/ directories (0.8.x → 0.9.x migration).

    Only processes real directories (skips symlinks - those are already migrated).
    For each old dir found in archive/ (newest mtime first):
      1. Load the DB snapshot for the directory, or create and save a new one.
         Directories that cannot be turned into a snapshot are moved to invalid/.
      2. Ensure the snapshot has a crawl, creating one if it is missing.
      3. If the snapshot still needs fs migration, copy its files to the 0.9.0
         storage path, set fs_version to "0.9.0" in the DB, then hand the old
         directory to the snapshot's cleanup helper.

    After this drains, archive/ should only contain symlinks and we can trust
    1:1 mapping between DB and filesystem.

    Args:
        resume_from: When set, directories whose name compares greater than this
            value (plain string comparison) are skipped.
        batch_size: ``transaction.commit()`` is called every ``batch_size``
            processed directories.

    Returns:
        Counters keyed ``processed``, ``migrated``, ``skipped`` and ``invalid``.
        All zero if the archive directory does not exist.
    """
    from archivebox.core.models import Snapshot
    from archivebox.config import CONSTANTS
    from django.db import transaction

    stats = {"processed": 0, "migrated": 0, "skipped": 0, "invalid": 0}

    archive_dir = CONSTANTS.ARCHIVE_DIR
    if not archive_dir.exists():
        return stats

    print("[DEBUG Phase1] Scanning for old directories in archive/...")

    # Scan for real directories only (skip symlinks - they're already migrated)
    all_entries = list(os.scandir(archive_dir))
    print(f"[DEBUG Phase1] Total entries in archive/: {len(all_entries)}")
    entries = [
        (e.stat().st_mtime, e.path)
        for e in all_entries
        if e.is_dir(follow_symlinks=False)  # Skip symlinks
    ]
    # Tuples sort by mtime first (path breaks ties), so this is newest first
    entries.sort(reverse=True)  # Newest first
    print(f"[DEBUG Phase1] Real directories (not symlinks): {len(entries)}")
    print(f"[*] Found {len(entries)} old directories to drain")

    for mtime, entry_path in entries:
        entry_path = Path(entry_path)

        # Resume from timestamp if specified
        # (skipped entries are not counted in stats["processed"])
        if resume_from and entry_path.name > resume_from:
            continue

        stats["processed"] += 1

        # Try to load existing snapshot from DB
        snapshot = Snapshot.load_from_directory(entry_path)

        if not snapshot:
            # Not in DB - create new snapshot record
            snapshot = Snapshot.create_from_directory(entry_path)
            if not snapshot:
                # Invalid directory - move to invalid/
                Snapshot.move_directory_to_invalid(entry_path)
                stats["invalid"] += 1
                print(f"    [{stats['processed']}] Invalid: {entry_path.name}")
                continue

            # A successfully saved orphan import counts as "migrated"; a failed
            # save counts as "skipped". Either way the directory is done.
            try:
                snapshot.save()
                stats["migrated"] += 1
                print(f"    [{stats['processed']}] Imported orphaned snapshot: {entry_path.name}")
            except Exception as e:
                stats["skipped"] += 1
                print(f"    [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
            continue

        # Ensure snapshot has a valid crawl (migration 0024 may have failed)
        has_valid_crawl = _get_snapshot_crawl(snapshot) is not None

        if not has_valid_crawl:
            # Create a new crawl (created_by will default to system user)
            from archivebox.crawls.models import Crawl

            crawl = Crawl.objects.create(urls=snapshot.url)
            # Use queryset update to avoid triggering save() hooks
            from archivebox.core.models import Snapshot as SnapshotModel

            SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl)
            # Mirror the DB change on the in-memory instance
            snapshot.crawl = crawl
            print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}")

        # Check if needs migration (0.8.x → 0.9.x)
        print(
            f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}",
        )
        if snapshot.fs_migration_needed:
            try:
                # Calculate paths using actual directory (entry_path), not snapshot.timestamp
                # because snapshot.timestamp might be truncated
                old_dir = entry_path
                new_dir = snapshot.get_storage_path_for_version("0.9.0")
                print(f"[DEBUG Phase1] Migrating {old_dir.name} → {new_dir}")

                # Manually migrate files
                # (only when the destination does not exist yet; an existing
                # destination is left untouched and no files are copied)
                if not new_dir.exists() and old_dir.exists():
                    new_dir.mkdir(parents=True, exist_ok=True)
                    import shutil

                    file_count = 0
                    # Copy regular files only, preserving relative paths;
                    # parent directories are created on demand
                    for old_file in old_dir.rglob("*"):
                        if old_file.is_file():
                            rel_path = old_file.relative_to(old_dir)
                            new_file = new_dir / rel_path
                            if not new_file.exists():
                                new_file.parent.mkdir(parents=True, exist_ok=True)
                                shutil.copy2(old_file, new_file)
                                file_count += 1
                    print(f"[DEBUG Phase1] Copied {file_count} files")

                # Update only fs_version field using queryset update (bypasses validation)
                from archivebox.core.models import Snapshot as SnapshotModel

                SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0")

                # Commit the transaction
                # NOTE(review): no transaction is opened explicitly in this function;
                # presumably this is effectively a no-op under autocommit — confirm
                transaction.commit()

                # Cleanup: delete old dir and create symlink
                # (runs after the DB update above)
                if old_dir.exists() and old_dir != new_dir:
                    snapshot._cleanup_old_migration_dir(old_dir, new_dir)

                stats["migrated"] += 1
                print(f"    [{stats['processed']}] Migrated: {entry_path.name}")
            except Exception as e:
                # Best-effort: a failure on one directory is reported and counted
                # as skipped, and the loop moves on to the next directory
                stats["skipped"] += 1
                print(f"    [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
        else:
            # Already on the current fs layout - nothing to migrate
            stats["skipped"] += 1

        # Periodic commit every batch_size processed directories
        if stats["processed"] % batch_size == 0:
            transaction.commit()

    transaction.commit()
    return stats


def process_all_db_snapshots(batch_size: int = 100, resume: str | None = None) -> dict[str, int]:
    """
    Walk every snapshot in the DB, newest bookmark first, and re-queue it.

    Per snapshot (snapshots without a crawl are counted but otherwise skipped):
      1. Reconcile with index.json, but only when the output directory exists.
      2. Reset ``current_step`` to 0 when it is not an int.
      3. If fs migration is still flagged, set fs_version to "0.9.0" directly in
         the DB so the following save() does not see a pending migration.
      4. Mark the snapshot QUEUED with ``retry_at`` set to now, and save it.

    Args:
        batch_size: Iterator chunk size; also the interval (in processed
            snapshots) at which progress is committed and printed.
        resume: When set, only snapshots with ``timestamp <= resume`` are processed.

    Returns:
        Counters keyed ``processed``, ``reconciled`` and ``queued``.
    """
    from archivebox.core.models import Snapshot
    from django.db import transaction
    from django.utils import timezone

    counters = {"processed": 0, "reconciled": 0, "queued": 0}

    candidates = Snapshot.objects.all()
    if resume:
        candidates = candidates.filter(timestamp__lte=resume)
    total = candidates.count()
    print(f"[*] Processing {total} snapshots from database (most recent first)...")

    # Most recently bookmarked snapshots are handled first.
    newest_first = candidates.select_related("crawl").order_by("-bookmarked_at")
    for snapshot in newest_first.iterator(chunk_size=batch_size):
        counters["processed"] += 1

        # Snapshots whose crawl reference is missing cannot be queued.
        if _get_snapshot_crawl(snapshot) is None:
            continue

        try:
            print(
                f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}",
            )

            snapshot_dir = Path(snapshot.output_dir)
            has_directory = snapshot_dir.exists() and snapshot_dir.is_dir()

            # Reconcile only when there is something on disk, so no empty
            # directories get created for orphans.
            if has_directory:
                snapshot.reconcile_with_index_json()

            # Old migrations may have left a non-integer step behind.
            if not isinstance(snapshot.current_step, int):
                snapshot.current_step = 0

            if snapshot.fs_migration_needed:
                if has_directory:
                    print(f"[DEBUG Phase2] WARNING: Snapshot {str(snapshot.id)[:8]} has directory but still needs migration")
                else:
                    print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation")
                # Queryset update sets fs_version without going through save() hooks.
                Snapshot.objects.filter(pk=snapshot.pk).update(fs_version="0.9.0")
                snapshot.fs_version = "0.9.0"

            # Hand the snapshot back to the state machine.
            snapshot.status = Snapshot.StatusChoices.QUEUED
            snapshot.retry_at = timezone.now()
            snapshot.save()

            if has_directory:
                counters["reconciled"] += 1
            counters["queued"] += 1
        except Exception as err:
            # One bad snapshot must not stop the whole scan.
            print(f"    [!] Skipping snapshot {snapshot.id}: {err}")
        else:
            if counters["processed"] % batch_size == 0:
                transaction.commit()
                print(f"    [{counters['processed']}/{total}] Processed...")

    transaction.commit()
    return counters


def process_filtered_snapshots(
    filter_patterns: Iterable[str],
    filter_type: str,
    before: float | None,
    after: float | None,
    resume: str | None,
    batch_size: int,
) -> dict[str, int]:
    """
    Process snapshots matching filters (DB query only).

    Each matching snapshot that has a crawl is reconciled with its index.json,
    has a non-integer ``current_step`` reset to 0, and is then marked QUEUED with
    ``retry_at`` set to now and saved. Snapshots that raise during processing are
    reported and skipped.

    Args:
        filter_patterns: Patterns forwarded to the filtered-queryset builder.
        filter_type: How the patterns are matched (a ``LINK_FILTERS`` key).
        before: Optional upper bound (POSIX timestamp) on ``bookmarked_at``.
        after: Optional lower bound (POSIX timestamp) on ``bookmarked_at``.
        resume: Optional upper bound on the snapshot ``timestamp``.
        batch_size: Iterator chunk size; also the interval (in processed
            snapshots) at which progress is committed and printed.

    Returns:
        Counters keyed ``processed``, ``reconciled`` and ``queued``.
    """
    # At module level Snapshot is imported only under TYPE_CHECKING, so it does
    # not exist at runtime. Without this local import, the StatusChoices lookup
    # below raised NameError for every snapshot, which the broad except then
    # swallowed, so nothing was ever queued.
    from archivebox.core.models import Snapshot
    from django.db import transaction
    from django.utils import timezone

    stats = {"processed": 0, "reconciled": 0, "queued": 0}

    snapshots = _build_filtered_snapshots_queryset(
        filter_patterns=filter_patterns,
        filter_type=filter_type,
        before=before,
        after=after,
        resume=resume,
    )

    total = snapshots.count()
    print(f"[*] Found {total} matching snapshots")

    for snapshot in snapshots.select_related("crawl").iterator(chunk_size=batch_size):
        stats["processed"] += 1

        # Skip snapshots with missing crawl references
        if _get_snapshot_crawl(snapshot) is None:
            continue

        try:
            # Reconcile index.json with DB
            snapshot.reconcile_with_index_json()

            # Clean up invalid field values from old migrations
            if not isinstance(snapshot.current_step, int):
                snapshot.current_step = 0

            # Queue for archiving
            snapshot.status = Snapshot.StatusChoices.QUEUED
            snapshot.retry_at = timezone.now()
            snapshot.save()

            stats["reconciled"] += 1
            stats["queued"] += 1
        except Exception as e:
            # Best-effort: report the failure and move on to the next snapshot
            print(f"    [!] Skipping snapshot {snapshot.id}: {e}")
            continue

        if stats["processed"] % batch_size == 0:
            transaction.commit()
            print(f"    [{stats['processed']}/{total}] Processed...")

    transaction.commit()
    return stats


def print_stats(stats: dict):
    """Print the summary for a filtered update pass (processed / reconciled / queued)."""
    from rich import print as rich_print

    report = f"""
[green]Update Complete[/green]
  Processed:   {stats["processed"]}
  Reconciled:  {stats["reconciled"]}
  Queued:      {stats["queued"]}
"""
    rich_print(report)


def print_combined_stats(stats_combined: dict):
    """
    Print the summary for a full update run.

    ``stats_combined`` must contain ``phase1`` and ``phase2`` mappings; any
    counter missing from either mapping is shown as 0.
    """
    from rich import print as rich_print

    s1, s2 = stats_combined["phase1"], stats_combined["phase2"]

    report = f"""
[green]Archive Update Complete[/green]

Phase 1 (Drain Old Dirs):
  Checked:     {s1.get("processed", 0)}
  Migrated:    {s1.get("migrated", 0)}
  Skipped:     {s1.get("skipped", 0)}
  Invalid:     {s1.get("invalid", 0)}

Phase 2 (Process DB):
  Processed:   {s2.get("processed", 0)}
  Reconciled:  {s2.get("reconciled", 0)}
  Queued:      {s2.get("queued", 0)}
"""
    rich_print(report)


def print_index_stats(stats: dict[str, Any]) -> None:
    """Print the summary for an index-only run (processed / reconciled / queued / reindexed)."""
    from rich import print as rich_print

    report = f"""
[green]Search Reindex Complete[/green]
  Processed:   {stats["processed"]}
  Reconciled:  {stats["reconciled"]}
  Queued:      {stats["queued"]}
  Reindexed:   {stats["reindexed"]}
"""
    rich_print(report)


# CLI entry point: each option/argument below is passed through to update()
# as the keyword argument of the same name (dashes become underscores).
@click.command()
@click.option("--resume", type=str, help="Resume from timestamp")
@click.option("--before", type=float, help="Only snapshots before timestamp")
@click.option("--after", type=float, help="Only snapshots after timestamp")
@click.option("--filter-type", "-t", type=click.Choice(["exact", "substring", "regex", "domain", "tag", "timestamp"]), default="exact")
@click.option("--batch-size", type=int, default=100, help="Commit every N snapshots")
@click.option("--continuous", is_flag=True, help="Run continuously as background worker")
@click.option("--index-only", is_flag=True, help="Backfill available search indexes from existing archived content")
# Zero or more positional patterns, collected into a tuple
@click.argument("filter_patterns", nargs=-1)
# NOTE(review): presumably reuses update()'s docstring as this command's help text — confirm
@docstring(update.__doc__)
def main(**kwargs):
    # Forward the parsed CLI values unchanged
    update(**kwargs)


# Run the click command when this file is executed directly as a script.
if __name__ == "__main__":
    main()