ArchiveBox/archivebox/cli/archivebox_extract.py

#!/usr/bin/env python3

"""
archivebox extract [snapshot_ids...] [--plugins=NAMES]

Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.

Input formats:
    - Snapshot UUIDs (one per line)
    - JSONL: {"type": "Snapshot", "id": "...", "url": "..."}
    - JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."}

Output (JSONL):
    {"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."}

Examples:
    # Extract specific snapshot
    archivebox extract 01234567-89ab-cdef-0123-456789abcdef

    # Pipe from snapshot command
    archivebox snapshot https://example.com | archivebox extract

    # Run specific plugins only
    archivebox extract --plugins=screenshot,singlefile 01234567-89ab-cdef-0123-456789abcdef

    # Chain commands
    archivebox crawl https://example.com | archivebox snapshot | archivebox extract
"""

__package__ = 'archivebox.cli'
__command__ = 'archivebox extract'

import sys
from collections import defaultdict

import rich_click as click


def process_archiveresult_by_id(archiveresult_id: str) -> int:
    """
    Re-run extraction for a single ArchiveResult by ID.

    ArchiveResults are projected status rows, not queued work items. Re-running
    a single result means resetting that row and queueing its parent snapshot
    through the shared crawl runner with the corresponding plugin selected.
    """
    from rich import print as rprint
    from django.utils import timezone
    from archivebox.core.models import ArchiveResult
    from archivebox.services.runner import run_crawl

    try:
        archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
    except ArchiveResult.DoesNotExist:
        rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)

    try:
        archiveresult.reset_for_retry()
        snapshot = archiveresult.snapshot
        snapshot.status = snapshot.StatusChoices.QUEUED
        snapshot.retry_at = timezone.now()
        snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])

        crawl = snapshot.crawl
        if crawl.status != crawl.StatusChoices.STARTED:
            crawl.status = crawl.StatusChoices.QUEUED
        crawl.retry_at = timezone.now()
        crawl.save(update_fields=['status', 'retry_at', 'modified_at'])

        run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
        archiveresult.refresh_from_db()

        if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
            print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
            return 0
        elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
            print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]')
            return 0
        elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
            print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
            return 1
        else:
            # Still in progress or backoff - not a failure
            print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]')
            return 0

    except Exception as e:
        print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
        return 1


def run_plugins(
    args: tuple,
    records: list[dict] | None = None,
    plugins: str = '',
    wait: bool = True,
) -> int:
    """
    Run plugins on Snapshots from input.

    Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL.

    Exit codes:
        0: Success
        1: Failure
    """
    from rich import print as rprint
    from django.utils import timezone

    from archivebox.misc.jsonl import (
        read_args_or_stdin, write_record,
        TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
    )
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.services.runner import run_crawl

    is_tty = sys.stdout.isatty()

    # Parse comma-separated plugins list once (reused in creation and filtering)
    plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []

    # Parse stdin/args exactly once per CLI invocation.
    # `main()` may already have consumed stdin to distinguish Snapshot input from
    # ArchiveResult IDs; if so, it must pass the parsed records through here
    # instead of asking this helper to reread an already-drained pipe.
    if records is None:
        records = list(read_args_or_stdin(args))

    if not records:
        rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
        return 1

    # Gather snapshot IDs and optional plugin constraints to process
    snapshot_ids = set()
    requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
    for record in records:
        record_type = record.get('type')

        if record_type == TYPE_SNAPSHOT:
            snapshot_id = record.get('id')
            if snapshot_id:
                snapshot_ids.add(snapshot_id)
            elif record.get('url'):
                # Look up by URL (get most recent if multiple exist)
                snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
                if snap:
                    snapshot_ids.add(str(snap.id))
                else:
                    rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)

        elif record_type == TYPE_ARCHIVERESULT:
            snapshot_id = record.get('snapshot_id')
            if snapshot_id:
                snapshot_ids.add(snapshot_id)
                plugin_name = record.get('plugin')
                if plugin_name and not plugins_list:
                    requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))

        elif 'id' in record:
            # Assume it's a snapshot ID
            snapshot_ids.add(record['id'])

    if not snapshot_ids:
        rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
        return 1

    # Get snapshots and ensure they have pending ArchiveResults
    processed_count = 0
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
            continue

        for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()):
            existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first()
            if existing_result and existing_result.status in [
                ArchiveResult.StatusChoices.FAILED,
                ArchiveResult.StatusChoices.SKIPPED,
                ArchiveResult.StatusChoices.NORESULTS,
                ArchiveResult.StatusChoices.BACKOFF,
            ]:
                existing_result.reset_for_retry()

        # Reset snapshot status to allow processing
        if snapshot.status == Snapshot.StatusChoices.SEALED:
            snapshot.status = Snapshot.StatusChoices.STARTED
            snapshot.retry_at = timezone.now()
            snapshot.save()

        processed_count += 1

    if processed_count == 0:
        rprint('[red]No snapshots to process[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)

    # Run orchestrator if --wait (default)
    if wait:
        rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
        snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set)
        for snapshot_id in snapshot_ids:
            try:
                snapshot = Snapshot.objects.only('id', 'crawl_id').get(id=snapshot_id)
            except Snapshot.DoesNotExist:
                continue
            snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))

        for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
            selected_plugins = plugins_list or sorted({
                plugin
                for snapshot_id in crawl_snapshot_ids
                for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())
            }) or None
            run_crawl(
                crawl_id,
                snapshot_ids=sorted(crawl_snapshot_ids),
                selected_plugins=selected_plugins,
            )

    # Output results as JSONL (when piped) or human-readable (when TTY)
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
            results = snapshot.archiveresult_set.all()
            if plugins_list:
                results = results.filter(plugin__in=plugins_list)

            for result in results:
                if is_tty:
                    status_color = {
                        'succeeded': 'green',
                        'failed': 'red',
                        'skipped': 'yellow',
                    }.get(result.status, 'dim')
                    rprint(f'  [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr)
                else:
                    write_record(result.to_json())
        except Snapshot.DoesNotExist:
            continue

    return 0


def is_archiveresult_id(value: str) -> bool:
    """Check if value looks like an ArchiveResult UUID."""
    import re
    uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
    if not uuid_pattern.match(value):
        return False
    # Verify it's actually an ArchiveResult (not a Snapshot or other object)
    from archivebox.core.models import ArchiveResult
    return ArchiveResult.objects.filter(id=value).exists()


@click.command()
@click.option('--plugins', '--plugin', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
def main(plugins: str, wait: bool, args: tuple):
    """Run plugins on Snapshots, or process existing ArchiveResults by ID"""
    from archivebox.misc.jsonl import read_args_or_stdin

    # Read all input
    records = list(read_args_or_stdin(args))

    if not records:
        from rich import print as rprint
        rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        sys.exit(1)

    # Check if input looks like existing ArchiveResult IDs to process
    all_are_archiveresult_ids = all(
        is_archiveresult_id(r.get('id') or r.get('url', ''))
        for r in records
    )

    if all_are_archiveresult_ids:
        # Process existing ArchiveResults by ID
        from rich import print as rprint

        exit_code = 0
        for record in records:
            archiveresult_id = record.get('id') or record.get('url')
            if not isinstance(archiveresult_id, str):
                rprint(f'[red]Invalid ArchiveResult input: {record}[/red]', file=sys.stderr)
                exit_code = 1
                continue
            result = process_archiveresult_by_id(archiveresult_id)
            if result != 0:
                exit_code = result
        sys.exit(exit_code)
    else:
        # Default behavior: run plugins on Snapshots from input
        sys.exit(run_plugins(args, records=records, plugins=plugins, wait=wait))


if __name__ == '__main__':
    main()