#!/usr/bin/env python3

"""
archivebox extract [snapshot_ids...] [--plugins=NAMES]

Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.

Input formats:
- Snapshot UUIDs (one per line)
- JSONL: {"type": "Snapshot", "id": "...", "url": "..."}
- JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."}

Output (JSONL):
{"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."}

Examples:
    # Extract specific snapshot
    archivebox extract 01234567-89ab-cdef-0123-456789abcdef

    # Pipe from snapshot command
    archivebox snapshot https://example.com | archivebox extract

    # Run specific plugins only
    archivebox extract --plugins=screenshot,singlefile 01234567-89ab-cdef-0123-456789abcdef

    # Chain commands
    archivebox crawl https://example.com | archivebox snapshot | archivebox extract
"""

__package__ = "archivebox.cli"
__command__ = "archivebox extract"

import sys
from collections import defaultdict

import rich_click as click


def process_archiveresult_by_id(archiveresult_id: str) -> int:
    """
    Re-run extraction for a single ArchiveResult by ID.

    ArchiveResults are projected status rows, not queued work items. Re-running a
    single result means resetting that row and queueing its parent snapshot
    through the shared crawl runner with the corresponding plugin selected.

    Returns:
        0 on success, no-results, or still-in-progress; 1 on failure or if the
        ArchiveResult does not exist.
    """
    from rich import print as rprint
    from django.utils import timezone

    from archivebox.core.models import ArchiveResult
    from archivebox.services.runner import run_crawl

    try:
        archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
    except ArchiveResult.DoesNotExist:
        rprint(f"[red]ArchiveResult {archiveresult_id} not found[/red]", file=sys.stderr)
        return 1

    rprint(f"[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]", file=sys.stderr)

    try:
        # Reset the projected status row, then queue its parent snapshot so the
        # crawl runner picks it up again.
        archiveresult.reset_for_retry()

        snapshot = archiveresult.snapshot
        snapshot.status = snapshot.StatusChoices.QUEUED
        snapshot.retry_at = timezone.now()
        snapshot.save(update_fields=["status", "retry_at", "modified_at"])

        # Re-queue the crawl too, unless it is already actively running.
        crawl = snapshot.crawl
        if crawl.status != crawl.StatusChoices.STARTED:
            crawl.status = crawl.StatusChoices.QUEUED
            crawl.retry_at = timezone.now()
            crawl.save(update_fields=["status", "retry_at", "modified_at"])

        run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])

        archiveresult.refresh_from_db()

        # BUGFIX: these four messages previously used the builtin print() with
        # rich markup, which would emit literal "[green]...[/green]" tags to the
        # terminal; use rprint (rich.print) like every other message here.
        if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
            rprint(f"[green]Extraction succeeded: {archiveresult.output_str}[/green]")
            return 0
        elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
            rprint(f"[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]")
            return 0
        elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
            rprint(f"[red]Extraction failed: {archiveresult.output_str}[/red]", file=sys.stderr)
            return 1
        else:
            # Still in progress or backoff - not a failure
            rprint(f"[yellow]Extraction status: {archiveresult.status}[/yellow]")
            return 0
    except Exception as e:
        rprint(f"[red]Extraction error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
        return 1


def run_plugins(
    args: tuple,
    records: list[dict] | None = None,
    plugins: str = "",
    wait: bool = True,
    emit_results: bool = True,
) -> int:
    """
    Run plugins on Snapshots from input.

    Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL.

    Args:
        args: positional CLI arguments (snapshot IDs / URLs / JSONL lines).
        records: pre-parsed input records; pass these when stdin has already
            been consumed by the caller (see note below).
        plugins: comma-separated plugin names to restrict the run to.
        wait: when True (default), run the crawl runner synchronously.
        emit_results: when True (default), print per-result output at the end.

    Exit codes:
        0: Success
        1: Failure
    """
    from rich import print as rprint
    from django.utils import timezone

    from archivebox.misc.jsonl import (
        read_args_or_stdin,
        write_record,
        TYPE_SNAPSHOT,
        TYPE_ARCHIVERESULT,
    )
    from archivebox.core.models import Snapshot
    from archivebox.services.runner import run_crawl

    is_tty = sys.stdout.isatty()

    # Parse comma-separated plugins list once (reused in creation and filtering)
    plugins_list = [p.strip() for p in plugins.split(",") if p.strip()] if plugins else []

    # Parse stdin/args exactly once per CLI invocation.
    # `main()` may already have consumed stdin to distinguish Snapshot input from
    # ArchiveResult IDs; if so, it must pass the parsed records through here
    # instead of asking this helper to reread an already-drained pipe.
    if records is None:
        records = list(read_args_or_stdin(args))

    if not records:
        rprint("[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]", file=sys.stderr)
        return 1

    # Gather snapshot IDs and optional plugin constraints to process
    snapshot_ids = set()
    requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
    for record in records:
        record_type = record.get("type")
        if record_type == TYPE_SNAPSHOT:
            snapshot_id = record.get("id")
            if snapshot_id:
                snapshot_ids.add(snapshot_id)
            elif record.get("url"):
                # Look up by URL (get most recent if multiple exist)
                snap = Snapshot.objects.filter(url=record["url"]).order_by("-created_at").first()
                if snap:
                    snapshot_ids.add(str(snap.id))
                else:
                    rprint(f"[yellow]Snapshot not found for URL: {record['url']}[/yellow]", file=sys.stderr)
        elif record_type == TYPE_ARCHIVERESULT:
            snapshot_id = record.get("snapshot_id")
            if snapshot_id:
                snapshot_ids.add(snapshot_id)
                plugin_name = record.get("plugin")
                # Per-snapshot plugin hints only apply when --plugins was not
                # given explicitly; an explicit flag overrides record hints.
                if plugin_name and not plugins_list:
                    requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))
        elif "id" in record:
            # Assume it's a snapshot ID
            snapshot_ids.add(record["id"])

    if not snapshot_ids:
        rprint("[red]No valid snapshot IDs found in input[/red]", file=sys.stderr)
        return 1

    # Get snapshots and ensure they have pending ArchiveResults
    processed_count = 0
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            rprint(f"[yellow]Snapshot {snapshot_id} not found[/yellow]", file=sys.stderr)
            continue

        requested_plugin_names = set(plugins_list) | requested_plugins_by_snapshot.get(str(snapshot.id), set())
        for plugin_name in requested_plugin_names:
            existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first()
            if existing_result:
                existing_result.reset_for_retry()

        # Reset snapshot status to allow processing
        if snapshot.status == Snapshot.StatusChoices.SEALED:
            snapshot.status = Snapshot.StatusChoices.STARTED
            snapshot.retry_at = timezone.now()
            snapshot.save()

        processed_count += 1

    if processed_count == 0:
        rprint("[red]No snapshots to process[/red]", file=sys.stderr)
        return 1

    rprint(f"[blue]Queued {processed_count} snapshots for extraction[/blue]", file=sys.stderr)

    # Run orchestrator if --wait (default)
    if wait:
        rprint("[blue]Running plugins...[/blue]", file=sys.stderr)

        # Group snapshots by their parent crawl so each crawl runs once.
        snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set)
        for snapshot_id in snapshot_ids:
            try:
                snapshot = Snapshot.objects.only("id", "crawl_id").get(id=snapshot_id)
            except Snapshot.DoesNotExist:
                continue
            snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))

        for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
            # Priority: explicit --plugins flag, then per-record hints, then all.
            selected_plugins = (
                plugins_list
                or sorted(
                    {plugin for snapshot_id in crawl_snapshot_ids for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())},
                )
                or None
            )
            run_crawl(
                crawl_id,
                snapshot_ids=sorted(crawl_snapshot_ids),
                selected_plugins=selected_plugins,
            )

    if not emit_results:
        return 0

    # Output results as JSONL (when piped) or human-readable (when TTY)
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
            results = snapshot.archiveresult_set.all()
            if plugins_list:
                results = results.filter(plugin__in=plugins_list)
            for result in results:
                if is_tty:
                    status_color = {
                        "succeeded": "green",
                        "failed": "red",
                        "skipped": "yellow",
                    }.get(result.status, "dim")
                    rprint(
                        f"  [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ''}",
                        file=sys.stderr,
                    )
                else:
                    write_record(result.to_json())
        except Snapshot.DoesNotExist:
            continue

    return 0


def is_archiveresult_id(value: str) -> bool:
    """Check if value looks like an ArchiveResult UUID."""
    import re

    uuid_pattern = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.I)
    if not uuid_pattern.match(value):
        return False

    # Verify it's actually an ArchiveResult (not a Snapshot or other object)
    from archivebox.core.models import ArchiveResult

    return ArchiveResult.objects.filter(id=value).exists()


@click.command()
@click.option("--plugins", "--plugin", "-p", default="", help="Comma-separated list of plugins to run (e.g., screenshot,singlefile)")
@click.option("--wait/--no-wait", default=True, help="Wait for plugins to complete (default: wait)")
@click.argument("args", nargs=-1)
def main(plugins: str, wait: bool, args: tuple):
    """Run plugins on Snapshots, or process existing ArchiveResults by ID"""
    from archivebox.misc.jsonl import read_args_or_stdin

    # Read all input (consumes stdin; pass `records` downstream, never reread)
    records = list(read_args_or_stdin(args))

    if not records:
        from rich import print as rprint

        rprint("[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]", file=sys.stderr)
        sys.exit(1)

    # Check if input looks like existing ArchiveResult IDs to process
    all_are_archiveresult_ids = all(is_archiveresult_id(r.get("id") or r.get("url", "")) for r in records)

    if all_are_archiveresult_ids:
        # Process existing ArchiveResults by ID
        from rich import print as rprint

        exit_code = 0
        for record in records:
            archiveresult_id = record.get("id") or record.get("url")
            if not isinstance(archiveresult_id, str):
                rprint(f"[red]Invalid ArchiveResult input: {record}[/red]", file=sys.stderr)
                exit_code = 1
                continue
            result = process_archiveresult_by_id(archiveresult_id)
            if result != 0:
                exit_code = result
        sys.exit(exit_code)
    else:
        # Default behavior: run plugins on Snapshots from input
        sys.exit(run_plugins(args, records=records, plugins=plugins, wait=wait))


if __name__ == "__main__":
    main()