#!/usr/bin/env python3
# mirror of https://github.com/ArchiveBox/ArchiveBox.git
# synced 2026-04-05 23:37:58 +10:00
"""
|
|
archivebox extract [snapshot_ids...] [--plugins=NAMES]
|
|
|
|
Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.
|
|
|
|
Input formats:
|
|
- Snapshot UUIDs (one per line)
|
|
- JSONL: {"type": "Snapshot", "id": "...", "url": "..."}
|
|
- JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."}
|
|
|
|
Output (JSONL):
|
|
{"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."}
|
|
|
|
Examples:
|
|
# Extract specific snapshot
|
|
archivebox extract 01234567-89ab-cdef-0123-456789abcdef
|
|
|
|
# Pipe from snapshot command
|
|
archivebox snapshot https://example.com | archivebox extract
|
|
|
|
# Run specific plugins only
|
|
archivebox extract --plugins=screenshot,singlefile 01234567-89ab-cdef-0123-456789abcdef
|
|
|
|
# Chain commands
|
|
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
|
"""

__package__ = "archivebox.cli"
__command__ = "archivebox extract"

import sys

from collections import defaultdict

import rich_click as click

def process_archiveresult_by_id(archiveresult_id: str) -> int:
|
|
"""
|
|
Re-run extraction for a single ArchiveResult by ID.
|
|
|
|
ArchiveResults are projected status rows, not queued work items. Re-running
|
|
a single result means resetting that row and queueing its parent snapshot
|
|
through the shared crawl runner with the corresponding plugin selected.
|
|
"""
|
|
from rich import print as rprint
|
|
from django.utils import timezone
|
|
from archivebox.core.models import ArchiveResult
|
|
from archivebox.services.runner import run_crawl
|
|
|
|
try:
|
|
archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
|
|
except ArchiveResult.DoesNotExist:
|
|
rprint(f"[red]ArchiveResult {archiveresult_id} not found[/red]", file=sys.stderr)
|
|
return 1
|
|
|
|
rprint(f"[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]", file=sys.stderr)
|
|
|
|
try:
|
|
archiveresult.reset_for_retry()
|
|
snapshot = archiveresult.snapshot
|
|
snapshot.status = snapshot.StatusChoices.QUEUED
|
|
snapshot.retry_at = timezone.now()
|
|
snapshot.save(update_fields=["status", "retry_at", "modified_at"])
|
|
|
|
crawl = snapshot.crawl
|
|
if crawl.status != crawl.StatusChoices.STARTED:
|
|
crawl.status = crawl.StatusChoices.QUEUED
|
|
crawl.retry_at = timezone.now()
|
|
crawl.save(update_fields=["status", "retry_at", "modified_at"])
|
|
|
|
run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
|
|
archiveresult.refresh_from_db()
|
|
|
|
if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
|
|
print(f"[green]Extraction succeeded: {archiveresult.output_str}[/green]")
|
|
return 0
|
|
elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
|
|
print(f"[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]")
|
|
return 0
|
|
elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
|
|
print(f"[red]Extraction failed: {archiveresult.output_str}[/red]", file=sys.stderr)
|
|
return 1
|
|
else:
|
|
# Still in progress or backoff - not a failure
|
|
print(f"[yellow]Extraction status: {archiveresult.status}[/yellow]")
|
|
return 0
|
|
|
|
except Exception as e:
|
|
print(f"[red]Extraction error: {type(e).__name__}: {e}[/red]", file=sys.stderr)
|
|
return 1
|
|
|
|
|
|
def run_plugins(
|
|
args: tuple,
|
|
records: list[dict] | None = None,
|
|
plugins: str = "",
|
|
wait: bool = True,
|
|
emit_results: bool = True,
|
|
) -> int:
|
|
"""
|
|
Run plugins on Snapshots from input.
|
|
|
|
Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL.
|
|
|
|
Exit codes:
|
|
0: Success
|
|
1: Failure
|
|
"""
|
|
from rich import print as rprint
|
|
from django.utils import timezone
|
|
|
|
from archivebox.misc.jsonl import (
|
|
read_args_or_stdin,
|
|
write_record,
|
|
TYPE_SNAPSHOT,
|
|
TYPE_ARCHIVERESULT,
|
|
)
|
|
from archivebox.core.models import Snapshot
|
|
from archivebox.services.runner import run_crawl
|
|
|
|
is_tty = sys.stdout.isatty()
|
|
|
|
# Parse comma-separated plugins list once (reused in creation and filtering)
|
|
plugins_list = [p.strip() for p in plugins.split(",") if p.strip()] if plugins else []
|
|
|
|
# Parse stdin/args exactly once per CLI invocation.
|
|
# `main()` may already have consumed stdin to distinguish Snapshot input from
|
|
# ArchiveResult IDs; if so, it must pass the parsed records through here
|
|
# instead of asking this helper to reread an already-drained pipe.
|
|
if records is None:
|
|
records = list(read_args_or_stdin(args))
|
|
|
|
if not records:
|
|
rprint("[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]", file=sys.stderr)
|
|
return 1
|
|
|
|
# Gather snapshot IDs and optional plugin constraints to process
|
|
snapshot_ids = set()
|
|
requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
|
|
for record in records:
|
|
record_type = record.get("type")
|
|
|
|
if record_type == TYPE_SNAPSHOT:
|
|
snapshot_id = record.get("id")
|
|
if snapshot_id:
|
|
snapshot_ids.add(snapshot_id)
|
|
elif record.get("url"):
|
|
# Look up by URL (get most recent if multiple exist)
|
|
snap = Snapshot.objects.filter(url=record["url"]).order_by("-created_at").first()
|
|
if snap:
|
|
snapshot_ids.add(str(snap.id))
|
|
else:
|
|
rprint(f"[yellow]Snapshot not found for URL: {record['url']}[/yellow]", file=sys.stderr)
|
|
|
|
elif record_type == TYPE_ARCHIVERESULT:
|
|
snapshot_id = record.get("snapshot_id")
|
|
if snapshot_id:
|
|
snapshot_ids.add(snapshot_id)
|
|
plugin_name = record.get("plugin")
|
|
if plugin_name and not plugins_list:
|
|
requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))
|
|
|
|
elif "id" in record:
|
|
# Assume it's a snapshot ID
|
|
snapshot_ids.add(record["id"])
|
|
|
|
if not snapshot_ids:
|
|
rprint("[red]No valid snapshot IDs found in input[/red]", file=sys.stderr)
|
|
return 1
|
|
|
|
# Get snapshots and ensure they have pending ArchiveResults
|
|
processed_count = 0
|
|
for snapshot_id in snapshot_ids:
|
|
try:
|
|
snapshot = Snapshot.objects.get(id=snapshot_id)
|
|
except Snapshot.DoesNotExist:
|
|
rprint(f"[yellow]Snapshot {snapshot_id} not found[/yellow]", file=sys.stderr)
|
|
continue
|
|
|
|
requested_plugin_names = set(plugins_list) | requested_plugins_by_snapshot.get(str(snapshot.id), set())
|
|
for plugin_name in requested_plugin_names:
|
|
existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by("-created_at").first()
|
|
if existing_result:
|
|
existing_result.reset_for_retry()
|
|
|
|
# Reset snapshot status to allow processing
|
|
if snapshot.status == Snapshot.StatusChoices.SEALED:
|
|
snapshot.status = Snapshot.StatusChoices.STARTED
|
|
snapshot.retry_at = timezone.now()
|
|
snapshot.save()
|
|
|
|
processed_count += 1
|
|
|
|
if processed_count == 0:
|
|
rprint("[red]No snapshots to process[/red]", file=sys.stderr)
|
|
return 1
|
|
|
|
rprint(f"[blue]Queued {processed_count} snapshots for extraction[/blue]", file=sys.stderr)
|
|
|
|
# Run orchestrator if --wait (default)
|
|
if wait:
|
|
rprint("[blue]Running plugins...[/blue]", file=sys.stderr)
|
|
snapshot_ids_by_crawl: dict[str, set[str]] = defaultdict(set)
|
|
for snapshot_id in snapshot_ids:
|
|
try:
|
|
snapshot = Snapshot.objects.only("id", "crawl_id").get(id=snapshot_id)
|
|
except Snapshot.DoesNotExist:
|
|
continue
|
|
snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))
|
|
|
|
for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
|
|
selected_plugins = (
|
|
plugins_list
|
|
or sorted(
|
|
{plugin for snapshot_id in crawl_snapshot_ids for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())},
|
|
)
|
|
or None
|
|
)
|
|
run_crawl(
|
|
crawl_id,
|
|
snapshot_ids=sorted(crawl_snapshot_ids),
|
|
selected_plugins=selected_plugins,
|
|
)
|
|
|
|
if not emit_results:
|
|
return 0
|
|
|
|
# Output results as JSONL (when piped) or human-readable (when TTY)
|
|
for snapshot_id in snapshot_ids:
|
|
try:
|
|
snapshot = Snapshot.objects.get(id=snapshot_id)
|
|
results = snapshot.archiveresult_set.all()
|
|
if plugins_list:
|
|
results = results.filter(plugin__in=plugins_list)
|
|
|
|
for result in results:
|
|
if is_tty:
|
|
status_color = {
|
|
"succeeded": "green",
|
|
"failed": "red",
|
|
"skipped": "yellow",
|
|
}.get(result.status, "dim")
|
|
rprint(
|
|
f" [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ''}",
|
|
file=sys.stderr,
|
|
)
|
|
else:
|
|
write_record(result.to_json())
|
|
except Snapshot.DoesNotExist:
|
|
continue
|
|
|
|
return 0
|
|
|
|
|
|
def is_archiveresult_id(value: str) -> bool:
|
|
"""Check if value looks like an ArchiveResult UUID."""
|
|
import re
|
|
|
|
uuid_pattern = re.compile(r"^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$", re.I)
|
|
if not uuid_pattern.match(value):
|
|
return False
|
|
# Verify it's actually an ArchiveResult (not a Snapshot or other object)
|
|
from archivebox.core.models import ArchiveResult
|
|
|
|
return ArchiveResult.objects.filter(id=value).exists()
|
|
|
|
|
|
@click.command()
|
|
@click.option("--plugins", "--plugin", "-p", default="", help="Comma-separated list of plugins to run (e.g., screenshot,singlefile)")
|
|
@click.option("--wait/--no-wait", default=True, help="Wait for plugins to complete (default: wait)")
|
|
@click.argument("args", nargs=-1)
|
|
def main(plugins: str, wait: bool, args: tuple):
|
|
"""Run plugins on Snapshots, or process existing ArchiveResults by ID"""
|
|
from archivebox.misc.jsonl import read_args_or_stdin
|
|
|
|
# Read all input
|
|
records = list(read_args_or_stdin(args))
|
|
|
|
if not records:
|
|
from rich import print as rprint
|
|
|
|
rprint("[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]", file=sys.stderr)
|
|
sys.exit(1)
|
|
|
|
# Check if input looks like existing ArchiveResult IDs to process
|
|
all_are_archiveresult_ids = all(is_archiveresult_id(r.get("id") or r.get("url", "")) for r in records)
|
|
|
|
if all_are_archiveresult_ids:
|
|
# Process existing ArchiveResults by ID
|
|
from rich import print as rprint
|
|
|
|
exit_code = 0
|
|
for record in records:
|
|
archiveresult_id = record.get("id") or record.get("url")
|
|
if not isinstance(archiveresult_id, str):
|
|
rprint(f"[red]Invalid ArchiveResult input: {record}[/red]", file=sys.stderr)
|
|
exit_code = 1
|
|
continue
|
|
result = process_archiveresult_by_id(archiveresult_id)
|
|
if result != 0:
|
|
exit_code = result
|
|
sys.exit(exit_code)
|
|
else:
|
|
# Default behavior: run plugins on Snapshots from input
|
|
sys.exit(run_plugins(args, records=records, plugins=plugins, wait=wait))
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|