Files
ArchiveBox/archivebox/cli/archivebox_snapshot.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

498 lines
16 KiB
Python

#!/usr/bin/env python3
"""
archivebox snapshot <action> [args...] [--filters]
Manage Snapshot records.
Actions:
create - Create Snapshots from URLs or Crawl JSONL
list - List Snapshots as JSONL (with optional filters)
update - Update Snapshots from stdin JSONL
delete - Delete Snapshots from stdin JSONL
Examples:
# Create
archivebox snapshot create https://example.com --tag=news
archivebox crawl create https://example.com | archivebox snapshot create
# List with filters
archivebox snapshot list --status=queued
archivebox snapshot list --url__icontains=example.com
# Update
archivebox snapshot list --tag=old | archivebox snapshot update --tag=new
# Delete
archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
"""
__package__ = "archivebox.cli"
__command__ = "archivebox snapshot"
import sys
from collections.abc import Iterable
import rich_click as click
from rich import print as rprint
from django.db.models import Q, Sum
from django.db.models.functions import Coalesce
from archivebox.cli.cli_utils import apply_filters
# =============================================================================
# CREATE
# =============================================================================
def create_snapshots(
    urls: Iterable[str],
    tag: str = "",
    status: str = "queued",
    depth: int = 0,
    created_by_id: int | None = None,
) -> int:
    """
    Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).

    Pass-through: records that are not Crawl/Snapshot/URL are output unchanged
    so this command can sit in the middle of a JSONL pipeline.

    Args:
        urls: URLs given as CLI arguments; when empty, records come from stdin.
        tag: Comma-separated tags merged into each created Snapshot.
        status: Initial status for created Snapshots (default: "queued").
        depth: Crawl depth used for records that do not carry their own.
        created_by_id: Owner user PK; falls back to the system user.

    Exit codes:
        0: Success (snapshots created, or only pass-through records seen)
        1: Failure (no input, or no snapshots could be created)
    """
    from archivebox.misc.jsonl import (
        read_args_or_stdin,
        write_record,
        TYPE_SNAPSHOT,
        TYPE_CRAWL,
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    created_by_id = created_by_id or get_or_create_system_user_pk()
    # Suppress JSONL output when attached to a terminal (human-friendly mode).
    is_tty = sys.stdout.isatty()

    # Collect all input records
    records = list(read_args_or_stdin(urls))
    if not records:
        rprint("[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
        return 1

    # Process each record - handle Crawls and plain URLs/Snapshots
    created_snapshots = []
    pass_through_count = 0
    for record in records:
        record_type = record.get("type", "")
        try:
            if record_type == TYPE_CRAWL:
                # Pass through the Crawl record itself first so downstream
                # commands in the pipe still see it.
                if not is_tty:
                    write_record(record)
                # Input is a Crawl - get or create it, then create Snapshots for its URLs
                crawl = None
                crawl_id = record.get("id")
                if crawl_id:
                    try:
                        crawl = Crawl.objects.get(id=crawl_id)
                    except Crawl.DoesNotExist:
                        crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
                else:
                    crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
                if not crawl:
                    continue
                # Create one snapshot per URL in the crawl, merging the
                # crawl's own tags with any --tag passed on the CLI.
                for url in crawl.get_urls_list():
                    merged_tags = crawl.tags_str
                    if tag:
                        merged_tags = f"{merged_tags},{tag}" if merged_tags else tag
                    snapshot_record = {
                        "url": url,
                        "tags": merged_tags,
                        "crawl_id": str(crawl.id),
                        "depth": depth,
                        "status": status,
                    }
                    snapshot = Snapshot.from_json(snapshot_record, overrides={"created_by_id": created_by_id})
                    if snapshot:
                        created_snapshots.append(snapshot)
                        if not is_tty:
                            write_record(snapshot.to_json())
            elif record_type == TYPE_SNAPSHOT or record.get("url"):
                # Input is a Snapshot record or a plain URL.
                # FIX: merge --tag with any tags already on the record.
                # Previously --tag was silently dropped when the record had
                # existing tags, inconsistent with the Crawl branch above.
                if tag:
                    existing_tags = record.get("tags") or ""
                    record["tags"] = f"{existing_tags},{tag}" if existing_tags else tag
                if status:
                    record["status"] = status
                record["depth"] = record.get("depth", depth)
                snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
                if snapshot:
                    created_snapshots.append(snapshot)
                    if not is_tty:
                        write_record(snapshot.to_json())
            else:
                # Pass-through: output records we don't handle, unchanged.
                if not is_tty:
                    write_record(record)
                pass_through_count += 1
        except Exception as e:
            # Best-effort: report the failure and keep processing the rest.
            rprint(f"[red]Error creating snapshot: {e}[/red]", file=sys.stderr)
            continue

    if not created_snapshots:
        if pass_through_count > 0:
            rprint(f"[dim]Passed through {pass_through_count} records, no new snapshots[/dim]", file=sys.stderr)
            return 0
        rprint("[red]No snapshots created[/red]", file=sys.stderr)
        return 1

    rprint(f"[green]Created {len(created_snapshots)} snapshots[/green]", file=sys.stderr)
    if is_tty:
        for snapshot in created_snapshots:
            rprint(f" [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr)
    return 0
# =============================================================================
# LIST
# =============================================================================
def list_snapshots(
    status: str | None = None,
    url__icontains: str | None = None,
    url__istartswith: str | None = None,
    tag: str | None = None,
    crawl_id: str | None = None,
    limit: int | None = None,
    sort: str | None = None,
    csv: str | None = None,
    with_headers: bool = False,
    search: str | None = None,
    query: str | None = None,
) -> int:
    """
    List Snapshots as JSONL (or CSV / pretty TTY output) with optional filters.

    Args:
        status: Exact status filter (e.g. queued, started, sealed).
        url__icontains: Case-insensitive "URL contains" filter.
        url__istartswith: Case-insensitive "URL starts with" filter.
        tag: Case-insensitive exact tag-name filter (M2M relation).
        crawl_id: Restrict to snapshots belonging to one crawl.
        limit: Truncate the result set (applied after sorting).
        sort: Django order_by field name; overrides any search ordering.
        csv: Comma-separated column names; switches output to CSV.
        with_headers: Emit a CSV header row (only valid with csv).
        search: Search mode name ("meta", "content"/"contents", "deep");
            empty falls back to the configured default mode.
        query: Free-text search query; empty/blank means no search.

    Exit codes:
        0: Success (even if no results)
        2: Invalid flag combination (--with-headers without --csv,
           or --csv with no usable column names)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import Snapshot
    from archivebox.search import (
        get_default_search_mode,
        get_search_mode,
        prioritize_metadata_matches,
        query_search_index,
    )
    if with_headers and not csv:
        rprint("[red]--with-headers requires --csv[/red]", file=sys.stderr)
        return 2
    # Pretty TTY output only when stdout is a terminal AND not in CSV mode.
    is_tty = sys.stdout.isatty() and not csv
    # Annotate total archived output size per snapshot; Coalesce -> 0 when
    # a snapshot has no archiveresults yet.
    queryset = Snapshot.objects.annotate(output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0)).order_by("-created_at")
    # Apply filters
    filter_kwargs = {
        "status": status,
        "url__icontains": url__icontains,
        "url__istartswith": url__istartswith,
        "crawl_id": crawl_id,
    }
    queryset = apply_filters(queryset, filter_kwargs)
    # Tag filter requires special handling (M2M)
    if tag:
        queryset = queryset.filter(tags__name__iexact=tag)
    query = (query or "").strip()
    if query:
        # Metadata matches: query against title/url/timestamp/tag names.
        metadata_qs = queryset.filter(
            Q(title__icontains=query) | Q(url__icontains=query) | Q(timestamp__icontains=query) | Q(tags__name__icontains=query),
        )
        requested_search_mode = (search or "").strip().lower()
        # "content" is accepted as an alias for "contents".
        if requested_search_mode == "content":
            requested_search_mode = "contents"
        search_mode = get_default_search_mode() if not requested_search_mode else get_search_mode(requested_search_mode)
        if search_mode == "meta":
            queryset = metadata_qs
        else:
            try:
                deep_qsearch = None
                if search_mode == "deep":
                    # "deep" combines the contents index with the deep index;
                    # both result sets are passed to the prioritizer below.
                    qsearch = query_search_index(query, search_mode="contents")
                    deep_qsearch = query_search_index(query, search_mode="deep")
                else:
                    qsearch = query_search_index(query, search_mode=search_mode)
                # Rank metadata matches above pure search-index hits; keep the
                # default -created_at ordering unless the user supplied --sort
                # (which is applied after this block).
                queryset = prioritize_metadata_matches(
                    queryset,
                    metadata_qs,
                    qsearch,
                    deep_queryset=deep_qsearch,
                    ordering=("-created_at",) if not sort else None,
                )
            except Exception as err:
                # Search backend unavailable/broken: degrade gracefully to
                # metadata-only matching rather than failing the command.
                rprint(
                    f"[yellow]Search backend error, falling back to metadata search: {err}[/yellow]",
                    file=sys.stderr,
                )
                queryset = metadata_qs
    if sort:
        queryset = queryset.order_by(sort)
    # NOTE(review): limit=0 is treated the same as "no limit" here.
    if limit:
        queryset = queryset[:limit]
    count = 0
    if csv:
        cols = [col.strip() for col in csv.split(",") if col.strip()]
        if not cols:
            rprint("[red]No CSV columns provided[/red]", file=sys.stderr)
            return 2
        rows: list[str] = []
        if with_headers:
            rows.append(",".join(cols))
        # iterator() streams rows from the DB in chunks to bound memory use.
        for snapshot in queryset.iterator(chunk_size=500):
            rows.append(snapshot.to_csv(cols=cols, separator=","))
            count += 1
        output = "\n".join(rows)
        if output:
            sys.stdout.write(output)
            if not output.endswith("\n"):
                sys.stdout.write("\n")
        rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
        return 0
    for snapshot in queryset:
        if is_tty:
            # Human-readable colored one-liner per snapshot.
            status_color = {
                "queued": "yellow",
                "started": "blue",
                "sealed": "green",
            }.get(snapshot.status, "dim")
            rprint(f"[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}")
        else:
            # Machine-readable JSONL for piping into update/delete.
            write_record(snapshot.to_json())
        count += 1
    rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
    return 0
# =============================================================================
# UPDATE
# =============================================================================
def update_snapshots(
    status: str | None = None,
    tag: str | None = None,
) -> int:
    """
    Apply CLI-flag updates to Snapshots identified by stdin JSONL records.

    Each incoming record is matched to an existing Snapshot by its "id"
    field; records without an id are skipped silently, unknown ids are
    skipped with a warning. Only the fields named by the CLI flags are
    modified (PATCH semantics).

    Exit codes:
        0: Success
        1: No input provided via stdin
    """
    from django.utils import timezone
    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.core.models import Snapshot

    emit_jsonl = not sys.stdout.isatty()
    incoming = list(read_stdin())
    if not incoming:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    n_updated = 0
    for rec in incoming:
        sid = rec.get("id")
        if not sid:
            continue
        try:
            snap = Snapshot.objects.get(id=sid)
        except Snapshot.DoesNotExist:
            rprint(f"[yellow]Snapshot not found: {sid}[/yellow]", file=sys.stderr)
            continue
        if status:
            # Changing status also resets the retry timer so workers re-queue it.
            snap.status = status
            snap.retry_at = timezone.now()
        if tag:
            snap.save()  # Ensure saved before M2M
            from archivebox.core.models import Tag
            tag_obj, _ = Tag.objects.get_or_create(name=tag)
            snap.tags.add(tag_obj)
        snap.save()
        n_updated += 1
        if emit_jsonl:
            write_record(snap.to_json())

    rprint(f"[green]Updated {n_updated} snapshots[/green]", file=sys.stderr)
    return 0
# =============================================================================
# DELETE
# =============================================================================
def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete the Snapshots named by stdin JSONL records.

    Records are matched to Snapshots by their "id" field. Actual deletion
    requires the --yes flag; --dry-run previews the matches without
    deleting (and without requiring --yes).

    Exit codes:
        0: Success (including dry runs and "nothing matched")
        1: No input, no valid ids, or missing --yes confirmation
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.core.models import Snapshot

    incoming = list(read_stdin())
    if not incoming:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    ids = [rec.get("id") for rec in incoming if rec.get("id")]
    if not ids:
        rprint("[yellow]No valid snapshot IDs in input[/yellow]", file=sys.stderr)
        return 1

    matched = Snapshot.objects.filter(id__in=ids)
    n_matched = matched.count()
    if n_matched == 0:
        rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
        return 0

    if dry_run:
        # Preview mode: show what would be removed, touch nothing.
        rprint(f"[yellow]Would delete {n_matched} snapshots (dry run)[/yellow]", file=sys.stderr)
        for snap in matched:
            rprint(f" [dim]{snap.id}[/dim] {snap.url[:60]}", file=sys.stderr)
        return 0

    if not yes:
        rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
        return 1

    # Perform deletion (QuerySet.delete returns (total_rows, per_model_dict)).
    n_deleted, _ = matched.delete()
    rprint(f"[green]Deleted {n_deleted} snapshots[/green]", file=sys.stderr)
    return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
    """Manage Snapshot records."""
    # Click group entry point: subcommands (create/list/update/delete)
    # are registered below via @main.command(...).
    pass
@main.command("create")
@click.argument("urls", nargs=-1)
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)")
def create_cmd(urls: tuple, tag: str, status: str, depth: int):
    """Create Snapshots from URLs or stdin JSONL."""
    # Thin CLI wrapper: delegate to create_snapshots() and use its
    # return value as the process exit code.
    sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))
@main.command("list")
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
@click.option("--url__icontains", help="Filter by URL contains")
@click.option("--url__istartswith", help="Filter by URL starts with")
@click.option("--tag", "-t", help="Filter by tag name")
@click.option("--crawl-id", help="Filter by crawl ID")
@click.option("--limit", "-n", type=int, help="Limit number of results")
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title")
@click.option("--with-headers", is_flag=True, help="Include column headers in structured output")
@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query")
@click.argument("query", nargs=-1)
def list_cmd(
    status: str | None,
    url__icontains: str | None,
    url__istartswith: str | None,
    tag: str | None,
    crawl_id: str | None,
    limit: int | None,
    sort: str | None,
    csv: str | None,
    with_headers: bool,
    search: str | None,
    query: tuple[str, ...],
):
    """List Snapshots as JSONL."""
    # Thin CLI wrapper: forward all flags to list_snapshots().
    # The variadic QUERY args are joined into a single search string.
    sys.exit(
        list_snapshots(
            status=status,
            url__icontains=url__icontains,
            url__istartswith=url__istartswith,
            tag=tag,
            crawl_id=crawl_id,
            limit=limit,
            sort=sort,
            csv=csv,
            with_headers=with_headers,
            search=search,
            query=" ".join(query),
        ),
    )
@main.command("update")
@click.option("--status", "-s", help="Set status")
@click.option("--tag", "-t", help="Add tag")
def update_cmd(status: str | None, tag: str | None):
    """Update Snapshots from stdin JSONL."""
    # Thin CLI wrapper: delegate to update_snapshots() and exit with its code.
    sys.exit(update_snapshots(status=status, tag=tag))
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Snapshots from stdin JSONL."""
    # Thin CLI wrapper: delegate to delete_snapshots() and exit with its code.
    sys.exit(delete_snapshots(yes=yes, dry_run=dry_run))
if __name__ == "__main__":
    # Allow running this module directly (outside the archivebox CLI dispatcher).
    main()