Files
ArchiveBox/archivebox/cli/archivebox_archiveresult.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

388 lines
12 KiB
Python

#!/usr/bin/env python3
"""
archivebox archiveresult <action> [args...] [--filters]

Manage ArchiveResult records (plugin extraction results).

Actions:
    create  - Create ArchiveResults for Snapshots (queue extractions)
    list    - List ArchiveResults as JSONL (with optional filters)
    update  - Update ArchiveResults from stdin JSONL
    delete  - Delete ArchiveResults from stdin JSONL

Examples:
    # Create ArchiveResults for snapshots (queue for extraction)
    archivebox snapshot list --status=queued | archivebox archiveresult create
    archivebox archiveresult create --plugin=screenshot --snapshot-id=<uuid>

    # List with filters
    archivebox archiveresult list --status=failed
    archivebox archiveresult list --plugin=screenshot --status=succeeded

    # Update (reset failed extractions to queued)
    archivebox archiveresult list --status=failed | archivebox archiveresult update --status=queued

    # Delete
    archivebox archiveresult list --plugin=singlefile | archivebox archiveresult delete --yes

    # Re-run failed extractions
    archivebox archiveresult list --status=failed | archivebox run
"""

# Dotted name of the package this module belongs to.
__package__ = "archivebox.cli"
# Full command line prefix implemented by this module.
# NOTE(review): presumably read by the CLI loader/help machinery - confirm against callers.
__command__ = "archivebox archiveresult"
import sys
import rich_click as click
from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = "", status: str = "queued") -> dict:
    """
    Assemble the dict describing a single ArchiveResult request record.

    Args:
        snapshot_id: Id of the snapshot the request belongs to; it is coerced
            with ``str()`` so non-string ids are accepted.
        plugin: Plugin name stored under the ``plugin`` key.
        hook_name: Hook name stored under ``hook_name`` (empty by default).
        status: Status stored under ``status`` (``"queued"`` by default).

    Returns:
        A dict with the keys ``type``, ``snapshot_id``, ``plugin``,
        ``hook_name`` and ``status``, in that order.
    """
    request: dict = {"type": "ArchiveResult"}
    request["snapshot_id"] = str(snapshot_id)
    # Keyword order is preserved, so the key order of the record stays stable.
    request.update(plugin=plugin, hook_name=hook_name, status=status)
    return request
# =============================================================================
# CREATE
# =============================================================================
def create_archiveresults(
    snapshot_id: str | None = None,
    plugin: str | None = None,
    status: str = "queued",
) -> int:
    """
    Create ArchiveResult request records for Snapshots.

    Snapshots come either from ``snapshot_id`` (looked up directly) or from
    JSONL records read on stdin. For every snapshot, ArchiveResult request
    records are emitted as JSONL on stdout.

    Stdin record handling:
        - Snapshot records are passed through, and queued for processing when
          they carry an ``id``.
        - ArchiveResult records and all other typed records are passed through
          unchanged.
        - Untyped records with an ``id`` are treated as Snapshot ids (and are
          not passed through); untyped records without an ``id`` are ignored.

    If ``plugin`` is given, one request per snapshot is emitted for that
    plugin. Otherwise one request is emitted per hook returned by
    ``discover_hooks("Snapshot", ...)`` for the snapshot's config.

    Records are only written when stdout is not a TTY; the summary count on
    stderr is printed in both cases.

    Args:
        snapshot_id: Optional snapshot id; when given, stdin is not read.
        plugin: Optional plugin name to restrict the emitted requests to.
        status: Status placed on every emitted request (default ``"queued"``).

    Returns:
        Exit code: 0 on success (including "only pass-through records"),
        1 when the snapshot is not found, stdin is empty, or stdin contains
        neither snapshot ids nor pass-through records.
    """
    from rich.markup import escape

    from archivebox.config.configset import get_config
    from archivebox.hooks import discover_hooks
    from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
    from archivebox.core.models import Snapshot

    is_tty = sys.stdout.isatty()
    pass_through_records: list[dict] = []

    if snapshot_id:
        # A snapshot id given on the command line takes precedence over stdin.
        try:
            snapshots = [Snapshot.objects.get(id=snapshot_id)]
        except Snapshot.DoesNotExist:
            # Escape the user-supplied value so "[...]" in it is not parsed as Rich markup.
            rprint(f"[red]Snapshot not found: {escape(str(snapshot_id))}[/red]", file=sys.stderr)
            return 1
    else:
        records = list(read_stdin())
        if not records:
            rprint("[yellow]No Snapshot records provided via stdin[/yellow]", file=sys.stderr)
            return 1

        # Separate snapshot ids to process from records that are only forwarded.
        snapshot_ids = []
        for record in records:
            record_type = record.get("type", "")
            record_id = record.get("id")
            if record_type == TYPE_SNAPSHOT:
                # The Snapshot record itself is always forwarded downstream.
                pass_through_records.append(record)
                if record_id:
                    snapshot_ids.append(record_id)
            elif record_type == TYPE_ARCHIVERESULT:
                # ArchiveResult records are forwarded whether or not they have an id
                # (records without an id could be created here in the future).
                pass_through_records.append(record)
            elif record_type:
                # Other typed records (Crawl, Tag, etc): pass through.
                pass_through_records.append(record)
            elif record_id:
                # Untyped record with an id - assume it is a snapshot id.
                snapshot_ids.append(record_id)

        # Output pass-through records first so they precede the new requests.
        if not is_tty:
            for record in pass_through_records:
                write_record(record)

        if not snapshot_ids:
            if pass_through_records:
                rprint(f"[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]", file=sys.stderr)
                return 0
            rprint("[yellow]No valid Snapshot IDs in input[/yellow]", file=sys.stderr)
            return 1

        snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids))

    if not snapshots:
        rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
        return 0 if pass_through_records else 1

    created_count = 0
    for snapshot in snapshots:
        if plugin:
            # Explicit plugin: exactly one request per snapshot.
            if not is_tty:
                write_record(build_archiveresult_request(snapshot.id, plugin, status=status))
            created_count += 1
        else:
            # No plugin given: one request per discovered Snapshot hook. The plugin
            # name is the hook file's parent directory name.
            config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
            hooks = discover_hooks("Snapshot", config=config)
            for hook_path in hooks:
                hook_name = hook_path.name
                plugin_name = hook_path.parent.name
                if not is_tty:
                    write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status))
                created_count += 1

    rprint(f"[green]Created {created_count} archive result request records[/green]", file=sys.stderr)
    return 0
# =============================================================================
# LIST
# =============================================================================
def list_archiveresults(
    status: str | None = None,
    plugin: str | None = None,
    snapshot_id: str | None = None,
    limit: int | None = None,
) -> int:
    """
    List ArchiveResults with optional filters, newest ``start_ts`` first.

    When stdout is a TTY, each result is printed as a coloured one-line
    summary (status, plugin, id, first 40 characters of the snapshot URL).
    Otherwise each result is written as a JSONL record via ``to_json()``.
    The number of listed results is reported on stderr.

    Args:
        status: Value passed to ``apply_filters`` under the ``status`` key.
        plugin: Value passed to ``apply_filters`` under the ``plugin`` key.
        snapshot_id: Value passed to ``apply_filters`` under ``snapshot_id``.
        limit: Passed to ``apply_filters`` as ``limit``.
            NOTE(review): filters left as None are presumably skipped by
            ``apply_filters`` - confirm against its implementation.

    Returns:
        Exit code 0 (even if no results).
    """
    from rich.markup import escape

    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import ArchiveResult

    is_tty = sys.stdout.isatty()

    queryset = ArchiveResult.objects.all().order_by("-start_ts")
    if is_tty:
        # The TTY view reads result.snapshot.url for every row; join the snapshot
        # up front instead of issuing one extra query per result.
        queryset = queryset.select_related("snapshot")

    # Apply filters
    filter_kwargs = {
        "status": status,
        "plugin": plugin,
        "snapshot_id": snapshot_id,
    }
    queryset = apply_filters(queryset, filter_kwargs, limit=limit)

    # Built once rather than on every loop iteration; unknown statuses render dim.
    status_colors = {
        "queued": "yellow",
        "started": "blue",
        "succeeded": "green",
        "failed": "red",
        "skipped": "dim",
        "noresults": "dim",
        "backoff": "magenta",
    }

    count = 0
    for result in queryset:
        if is_tty:
            status_color = status_colors.get(result.status, "dim")
            # URLs may contain "[...]" sequences; escape them so Rich prints them
            # literally instead of interpreting them as markup tags.
            url = escape(result.snapshot.url[:40])
            rprint(
                f"[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {url}",
            )
        else:
            write_record(result.to_json())
        count += 1

    rprint(f"[dim]Listed {count} archive results[/dim]", file=sys.stderr)
    return 0
# =============================================================================
# UPDATE
# =============================================================================
def update_archiveresults(
    status: str | None = None,
) -> int:
    """
    Apply CLI-provided updates to the ArchiveResults named on stdin.

    Only the ``id`` of each stdin JSONL record is used: the matching
    ArchiveResult is loaded, its ``status`` is overwritten when ``status`` is
    given, and it is saved. Records without an ``id`` are skipped and ids with
    no matching ArchiveResult produce a warning on stderr. When stdout is not
    a TTY, every saved result is written back out as JSONL.

    Args:
        status: New status to set; when falsy the result is saved unchanged.

    Returns:
        Exit code: 0 once the input was processed, 1 when stdin had no records.
    """
    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.core.models import ArchiveResult

    stdout_is_tty = sys.stdout.isatty()

    stdin_records = list(read_stdin())
    if not stdin_records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    updated_count = 0
    for entry in stdin_records:
        result_id = entry.get("id")
        if result_id:
            try:
                archiveresult = ArchiveResult.objects.get(id=result_id)
                # Only fields supplied through CLI flags are changed.
                if status:
                    archiveresult.status = status
                archiveresult.save()
                updated_count += 1
                if not stdout_is_tty:
                    write_record(archiveresult.to_json())
            except ArchiveResult.DoesNotExist:
                rprint(f"[yellow]ArchiveResult not found: {result_id}[/yellow]", file=sys.stderr)

    rprint(f"[green]Updated {updated_count} archive results[/green]", file=sys.stderr)
    return 0
# =============================================================================
# DELETE
# =============================================================================
def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete the ArchiveResults whose ids appear in stdin JSONL records.

    Only the ``id`` of each stdin record is used. With ``dry_run`` the number
    of matching results and a preview of up to 10 of them is printed to stderr
    and nothing is deleted. Otherwise ``yes`` must be set for the deletion to
    happen.

    Args:
        yes: Confirm the deletion.
        dry_run: Only report what would be deleted; checked before ``yes``.

    Returns:
        Exit code: 0 on success, on a dry run, or when no stored result
        matches; 1 when stdin has no records, no record carries an id, or
        ``yes`` is missing.
    """
    from rich.markup import escape

    from archivebox.misc.jsonl import read_stdin
    from archivebox.core.models import ArchiveResult

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    result_ids = [record_id for record in records if (record_id := record.get("id"))]
    if not result_ids:
        rprint("[yellow]No valid archive result IDs in input[/yellow]", file=sys.stderr)
        return 1

    results = ArchiveResult.objects.filter(id__in=result_ids)
    count = results.count()
    if count == 0:
        rprint("[yellow]No matching archive results found[/yellow]", file=sys.stderr)
        return 0

    if dry_run:
        rprint(f"[yellow]Would delete {count} archive results (dry run)[/yellow]", file=sys.stderr)
        # Join the snapshot so the preview does not issue one query per row.
        for result in results.select_related("snapshot")[:10]:
            # Escape the URL so "[...]" in it is printed literally, not parsed as markup.
            rprint(f" [dim]{result.id}[/dim] {result.plugin} {escape(result.snapshot.url[:40])}", file=sys.stderr)
        if count > 10:
            rprint(f" ... and {count - 10} more", file=sys.stderr)
        return 0

    if not yes:
        rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
        return 1

    # QuerySet.delete() returns (total, per-model counts). The total also
    # includes rows removed by cascading deletes, so report only the
    # ArchiveResult rows to keep the message accurate.
    _, deleted_by_model = results.delete()
    deleted_count = deleted_by_model.get(ArchiveResult._meta.label, 0)
    rprint(f"[green]Deleted {deleted_count} archive results[/green]", file=sys.stderr)
    return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
    """Manage ArchiveResult records (plugin extraction results)."""
    # Group entry point only: the create/list/update/delete subcommands
    # registered on it do the work, so the docstring is the whole body.
@main.command("create")
@click.option("--snapshot-id", help="Snapshot ID to create results for")
@click.option("--plugin", "-p", help="Plugin name (e.g., screenshot, singlefile)")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
def create_cmd(snapshot_id: str | None, plugin: str | None, status: str):
    """Create ArchiveResults for Snapshots from stdin JSONL."""
    # Forward the parsed options and use the returned int as the process exit code.
    exit_code = create_archiveresults(
        snapshot_id=snapshot_id,
        plugin=plugin,
        status=status,
    )
    sys.exit(exit_code)
@main.command("list")
@click.option("--status", "-s", help="Filter by status (queued, started, backoff, succeeded, failed, skipped, noresults)")
@click.option("--plugin", "-p", help="Filter by plugin name")
@click.option("--snapshot-id", help="Filter by snapshot ID")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(
    status: str | None,
    plugin: str | None,
    snapshot_id: str | None,
    limit: int | None,
):
    """List ArchiveResults as JSONL."""
    # The --status help lists every status that list_archiveresults knows how
    # to colour, so the help text and the listing stay in agreement.
    # Forward the parsed options and use the returned int as the process exit code.
    sys.exit(
        list_archiveresults(
            status=status,
            plugin=plugin,
            snapshot_id=snapshot_id,
            limit=limit,
        ),
    )
@main.command("update")
@click.option("--status", "-s", help="Set status")
def update_cmd(status: str | None):
    """Update ArchiveResults from stdin JSONL."""
    # Forward the parsed option and use the returned int as the process exit code.
    exit_code = update_archiveresults(status=status)
    sys.exit(exit_code)
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
    """Delete ArchiveResults from stdin JSONL."""
    # Forward both flags and use the returned int as the process exit code.
    exit_code = delete_archiveresults(yes=yes, dry_run=dry_run)
    sys.exit(exit_code)
# Run the click command group when this file is executed directly as a script.
if __name__ == "__main__":
    main()