ArchiveBox/archivebox/cli/archivebox_extract.py

#!/usr/bin/env python3

"""
archivebox extract [snapshot_ids...] [--plugins=NAMES]

Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.

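Input formats:
    - Snapshot UUIDs (one per line)
    - ArchiveResult UUIDs (when every input is the ID of an existing
      ArchiveResult, those results are processed directly instead)
    - JSONL: {"type": "Snapshot", "id": "...", "url": "..."}
    - JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."}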

Output (JSONL):
    {"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."}

Examples:
    # Extract specific snapshot
    archivebox extract 01234567-89ab-cdef-0123-456789abcdef

    # Pipe from snapshot command
    archivebox snapshot https://example.com | archivebox extract

    # Run specific plugins only
    archivebox extract --plugins=screenshot,singlefile 01234567-89ab-cdef-0123-456789abcdef

    # Chain commands
    archivebox crawl https://example.com | archivebox snapshot | archivebox extract
"""

# Explicit package name for this module (presumably so package-relative
# resolution still works when the file is executed directly — confirm).
__package__ = 'archivebox.cli'
# Human-readable command name; its consumer is not visible in this file.
__command__ = 'archivebox extract'

import sys

import rich_click as click


def process_archiveresult_by_id(archiveresult_id: str) -> int:
    """
    Run extraction for a single ArchiveResult by ID (used by workers).

    Triggers the ArchiveResult's state machine tick() to run the extractor
    plugin, but only after claiming ownership via retry_at. This keeps direct
    CLI execution aligned with the worker lifecycle and prevents duplicate hook
    runs if another process already owns the same ArchiveResult.

    Args:
        archiveresult_id: Primary key of the ArchiveResult to process.

    Returns:
        0 if the extraction succeeded, is still queued/in progress/backing
        off, or is already claimed by another process.
        1 if the ArchiveResult does not exist, the extraction failed, or an
        exception was raised while ticking.
    """
    from rich import print as rprint
    from rich.markup import escape
    from archivebox.core.models import ArchiveResult

    try:
        archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
    except ArchiveResult.DoesNotExist:
        rprint(f'[red]ArchiveResult {escape(str(archiveresult_id))} not found[/red]', file=sys.stderr)
        return 1

    # All dynamic text is passed through escape() so that brackets in URLs,
    # plugin output or exception messages are printed literally instead of
    # being parsed as rich markup (an unmatched "[/tag]" would raise).
    rprint(
        f'[blue]Extracting {escape(str(archiveresult.plugin))} '
        f'for {escape(str(archiveresult.snapshot.url))}[/blue]',
        file=sys.stderr,
    )

    try:
        # Claim-before-tick is the required calling pattern for direct
        # state-machine drivers. If another worker already owns this row,
        # report that and exit without running duplicate extractor side effects.
        if not archiveresult.tick_claimed(lock_seconds=120):
            rprint(f'[yellow]Extraction already claimed by another process: {escape(str(archiveresult.plugin))}[/yellow]')
            return 0

        # NOTE(review): this reads the in-memory status right after the tick;
        # it assumes tick_claimed() leaves this instance up to date. If the
        # state machine saves through a different instance, a
        # refresh_from_db() is needed here — confirm.
        status = archiveresult.status
        if status == ArchiveResult.StatusChoices.SUCCEEDED:
            rprint(f'[green]Extraction succeeded: {escape(str(archiveresult.output_str))}[/green]')
            return 0
        if status == ArchiveResult.StatusChoices.FAILED:
            rprint(f'[red]Extraction failed: {escape(str(archiveresult.output_str))}[/red]', file=sys.stderr)
            return 1

        # Still in progress or backoff - not a failure
        rprint(f'[yellow]Extraction status: {escape(str(status))}[/yellow]')
        return 0

    except Exception as e:
        # Top-level CLI boundary: report any extractor/state-machine error and
        # convert it into a non-zero exit code instead of a traceback.
        rprint(f'[red]Extraction error: {type(e).__name__}: {escape(str(e))}[/red]', file=sys.stderr)
        return 1


def run_plugins(
    args: tuple,
    plugins: str = '',
    wait: bool = True,
    records: 'list | None' = None,
) -> int:
    """
    Run plugins on Snapshots from input.

    Reads Snapshot IDs or JSONL from args/stdin (unless already-parsed
    ``records`` are supplied), queues ArchiveResults for the selected
    plugins, optionally runs the orchestrator, then outputs the results as
    JSONL (stdout is piped) or as a human-readable list on stderr (stdout is
    a TTY).

    Args:
        args: CLI arguments handed to read_args_or_stdin() when ``records``
            is None.
        plugins: Comma-separated plugin names. When empty, all pending
            ArchiveResults are created via
            Snapshot.create_pending_archiveresults().
        wait: When True, run the Orchestrator until it is idle before
            reporting results.
        records: Optional input records that the caller has already parsed.
            A caller that has consumed stdin should pass them here, because
            piped stdin cannot be read a second time.

    Exit codes:
        0: Success
        1: Failure
    """
    from rich import print as rprint
    from rich.markup import escape
    from django.core.exceptions import ValidationError
    from django.utils import timezone

    from archivebox.misc.jsonl import (
        read_args_or_stdin, write_record,
        TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
    )
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.workers.orchestrator import Orchestrator

    is_tty = sys.stdout.isatty()

    # Parse comma-separated plugins list once (reused in creation and filtering)
    plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []

    # Collect all input records (only read args/stdin if the caller has not
    # already done so).
    if records is None:
        records = list(read_args_or_stdin(args))

    if not records:
        rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
        return 1

    # Gather snapshot IDs to process. A dict is used as an insertion-ordered
    # set so snapshots are queued and reported in input order.
    snapshot_ids: dict = {}
    for record in records:
        record_type = record.get('type')
        snapshot_id = None

        if record_type == TYPE_SNAPSHOT:
            snapshot_id = record.get('id')
            if not snapshot_id and record.get('url'):
                # Look up by URL (get most recent if multiple exist)
                snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
                if snap:
                    snapshot_id = snap.id
                else:
                    rprint(f'[yellow]Snapshot not found for URL: {escape(str(record["url"]))}[/yellow]', file=sys.stderr)

        elif record_type == TYPE_ARCHIVERESULT:
            snapshot_id = record.get('snapshot_id')

        elif 'id' in record:
            # Assume it's a snapshot ID
            snapshot_id = record['id']

        if snapshot_id:
            snapshot_ids[str(snapshot_id)] = None

    if not snapshot_ids:
        rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
        return 1

    retryable_statuses = (
        ArchiveResult.StatusChoices.FAILED,
        ArchiveResult.StatusChoices.SKIPPED,
    )

    # Get snapshots and ensure they have pending ArchiveResults
    queued_ids = []
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except (Snapshot.DoesNotExist, ValidationError, ValueError):
            # ValidationError/ValueError cover malformed IDs (Django raises
            # ValidationError for a non-UUID value on a UUID primary key);
            # treat them like a missing snapshot instead of crashing the run.
            rprint(f'[yellow]Snapshot {escape(snapshot_id)} not found[/yellow]', file=sys.stderr)
            continue

        # Create pending ArchiveResults if needed
        if plugins_list:
            # Only create for specific plugins
            for plugin_name in plugins_list:
                result, created = ArchiveResult.objects.get_or_create(
                    snapshot=snapshot,
                    plugin=plugin_name,
                    defaults={
                        'status': ArchiveResult.StatusChoices.QUEUED,
                        'retry_at': timezone.now(),
                    }
                )
                if not created and result.status in retryable_statuses:
                    # Reset for retry
                    result.status = ArchiveResult.StatusChoices.QUEUED
                    result.retry_at = timezone.now()
                    result.save()
        else:
            # Create all pending plugins
            snapshot.create_pending_archiveresults()

        # Reset snapshot status to allow processing
        if snapshot.status == Snapshot.StatusChoices.SEALED:
            snapshot.status = Snapshot.StatusChoices.STARTED
            snapshot.retry_at = timezone.now()
            snapshot.save()

        queued_ids.append(snapshot_id)

    processed_count = len(queued_ids)
    if processed_count == 0:
        rprint('[red]No snapshots to process[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)

    # Run orchestrator if --wait (default)
    if wait:
        rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.runloop()

    status_colors = {
        'succeeded': 'green',
        'failed': 'red',
        'skipped': 'yellow',
    }

    # Output results as JSONL (when piped) or human-readable (when TTY)
    for snapshot_id in queued_ids:
        try:
            # Re-fetch so the report reflects whatever the orchestrator wrote.
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            # Snapshot disappeared while plugins were running; nothing to report.
            continue

        results = snapshot.archiveresult_set.all()
        if plugins_list:
            results = results.filter(plugin__in=plugins_list)

        for result in results:
            if is_tty:
                status_color = status_colors.get(result.status, 'dim')
                # Escape dynamic text so brackets in plugin output are shown
                # literally rather than parsed as rich markup.
                rprint(
                    f'  [{status_color}]{result.status}[/{status_color}] '
                    f'{escape(str(result.plugin))} → {escape(str(result.output_str or ""))}',
                    file=sys.stderr,
                )
            else:
                write_record(result.to_json())

    return 0


def is_archiveresult_id(value: str) -> bool:
    """
    Check if value is the UUID of an existing ArchiveResult.

    Returns False without touching the database when value is not a string
    in canonical 8-4-4-4-12 hex UUID form; otherwise returns whether an
    ArchiveResult row with that ID exists.
    """
    import re

    if not isinstance(value, str):
        return False
    # fullmatch (unlike match with a trailing "$") also rejects a value that
    # ends in a newline, so only a clean UUID reaches the database lookup.
    uuid_pattern = r'[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}'
    if not re.fullmatch(uuid_pattern, value, re.I):
        return False
    # Verify it's actually an ArchiveResult (not a Snapshot or other object)
    from archivebox.core.models import ArchiveResult
    return ArchiveResult.objects.filter(id=value).exists()


@click.command()
@click.option('--plugins', '--plugin', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
def main(plugins: str, wait: bool, args: tuple):
    """Run plugins on Snapshots, or process existing ArchiveResults by ID"""
    # Dispatch: if every input is the ID of an existing ArchiveResult, each
    # one is processed directly; otherwise the input is treated as Snapshots
    # and handed to run_plugins(). The process always ends via sys.exit()
    # with 0 on success or a non-zero code on failure.
    from archivebox.misc.jsonl import read_args_or_stdin

    # Read all input (this consumes stdin when no args were given)
    records = list(read_args_or_stdin(args))

    if not records:
        from rich import print as rprint
        rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        sys.exit(1)

    # Plain-ID inputs are looked up under 'id' first, then 'url'.
    candidate_ids = [r.get('id') or r.get('url', '') for r in records]

    # Check if input looks like existing ArchiveResult IDs to process
    # (generator keeps the short-circuit: stop querying at the first miss).
    if all(is_archiveresult_id(candidate) for candidate in candidate_ids):
        # Process existing ArchiveResults by ID; remember the last failure code
        exit_code = 0
        for archiveresult_id in candidate_ids:
            result = process_archiveresult_by_id(archiveresult_id)
            if result != 0:
                exit_code = result
        sys.exit(exit_code)

    # Default behavior: run plugins on Snapshots from input. Pass the records
    # parsed above, because stdin has already been consumed and a second read
    # inside run_plugins() would see no piped input.
    sys.exit(run_plugins(args, plugins=plugins, wait=wait, records=records))


# Allow running this file directly as a script; main() is the click command
# defined above and is invoked here with no explicit arguments.
if __name__ == '__main__':
    main()