#!/usr/bin/env python3

"""
archivebox snapshot [args...] [--filters]

Manage Snapshot records.

Actions:
    create - Create Snapshots from URLs or Crawl JSONL
    list   - List Snapshots as JSONL (with optional filters)
    update - Update Snapshots from stdin JSONL
    delete - Delete Snapshots from stdin JSONL

Examples:
    # Create
    archivebox snapshot create https://example.com --tag=news
    archivebox crawl create https://example.com | archivebox snapshot create

    # List with filters
    archivebox snapshot list --status=queued
    archivebox snapshot list --url__icontains=example.com

    # Update
    archivebox snapshot list --tag=old | archivebox snapshot update --tag=new

    # Delete
    archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
"""

__package__ = "archivebox.cli"
__command__ = "archivebox snapshot"

import sys
from collections.abc import Iterable

import rich_click as click
from rich import print as rprint

from django.db.models import Q, Sum
from django.db.models.functions import Coalesce

from archivebox.cli.cli_utils import apply_filters


# =============================================================================
# CREATE
# =============================================================================

def create_snapshots(
    urls: Iterable[str],
    tag: str = "",
    status: str = "queued",
    depth: int = 0,
    created_by_id: int | None = None,
) -> int:
    """
    Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).

    Pass-through: records that are not Crawl/Snapshot/URL are output unchanged.

    Exit codes:
        0: Success
        1: Failure
    """
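    # Illustrative sketch of the JSONL record shapes handled below (the exact
    # field sets are defined by Crawl.from_json() and Snapshot.from_json(),
    # not here):
    #
    #   {"type": "Crawl", "id": "...", ...}              -> one Snapshot per crawl URL
    #   {"type": "Snapshot", "url": "https://...", ...}  -> created directly
    #   {"url": "https://example.com"}                   -> treated as a bare URL
    #
    # Records of any other type are passed through to stdout unchanged.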
    from archivebox.misc.jsonl import (
        read_args_or_stdin,
        write_record,
        TYPE_SNAPSHOT,
        TYPE_CRAWL,
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.core.models import Snapshot
    from archivebox.crawls.models import Crawl

    created_by_id = created_by_id or get_or_create_system_user_pk()
    is_tty = sys.stdout.isatty()

    # Collect all input records
    records = list(read_args_or_stdin(urls))

    if not records:
        rprint("[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
        return 1

    # Process each record - handle Crawls and plain URLs/Snapshots
    created_snapshots = []
    pass_through_count = 0

    for record in records:
        record_type = record.get("type", "")

        try:
            if record_type == TYPE_CRAWL:
                # Pass through the Crawl record itself first
                if not is_tty:
                    write_record(record)

                # Input is a Crawl - get or create it, then create Snapshots for its URLs
                crawl = None
                crawl_id = record.get("id")
                if crawl_id:
                    try:
                        crawl = Crawl.objects.get(id=crawl_id)
                    except Crawl.DoesNotExist:
                        crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})
                else:
                    crawl = Crawl.from_json(record, overrides={"created_by_id": created_by_id})

                if not crawl:
                    continue

                # Create snapshots for each URL in the crawl
                for url in crawl.get_urls_list():
                    merged_tags = crawl.tags_str
                    if tag:
                        merged_tags = f"{merged_tags},{tag}" if merged_tags else tag

                    snapshot_record = {
                        "url": url,
                        "tags": merged_tags,
                        "crawl_id": str(crawl.id),
                        "depth": depth,
                        "status": status,
                    }
                    snapshot = Snapshot.from_json(snapshot_record, overrides={"created_by_id": created_by_id})
                    if snapshot:
                        created_snapshots.append(snapshot)
                        if not is_tty:
                            write_record(snapshot.to_json())

            elif record_type == TYPE_SNAPSHOT or record.get("url"):
                # Input is a Snapshot or plain URL
                if tag and not record.get("tags"):
                    record["tags"] = tag
                if status:
                    record["status"] = status
                record["depth"] = record.get("depth", depth)

                snapshot = Snapshot.from_json(record, overrides={"created_by_id": created_by_id})
                if snapshot:
                    created_snapshots.append(snapshot)
                    if not is_tty:
                        write_record(snapshot.to_json())
            else:
                # Pass-through: output records we don't handle
                if not is_tty:
                    write_record(record)
                pass_through_count += 1
        except Exception as e:
            rprint(f"[red]Error creating snapshot: {e}[/red]", file=sys.stderr)
            continue

    if not created_snapshots:
        if pass_through_count > 0:
            rprint(f"[dim]Passed through {pass_through_count} records, no new snapshots[/dim]", file=sys.stderr)
            return 0
        rprint("[red]No snapshots created[/red]", file=sys.stderr)
        return 1

    rprint(f"[green]Created {len(created_snapshots)} snapshots[/green]", file=sys.stderr)
    if is_tty:
        for snapshot in created_snapshots:
            rprint(f"  [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr)

    return 0

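
# A minimal pipeline sketch showing how the pass-through behavior composes
# (assumes `archivebox crawl create` emits Crawl JSONL on stdout, as in the
# module docstring; `jq` is used only for illustration):
#
#   archivebox crawl create https://example.com \
#       | archivebox snapshot create --tag=news \
#       | jq -r .url
#
# Downstream stages see both the original Crawl records and the newly created
# Snapshot records, since unhandled record types are forwarded unchanged.
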
# =============================================================================
# LIST
# =============================================================================

def list_snapshots(
    status: str | None = None,
    url__icontains: str | None = None,
    url__istartswith: str | None = None,
    tag: str | None = None,
    crawl_id: str | None = None,
    limit: int | None = None,
    sort: str | None = None,
    csv: str | None = None,
    with_headers: bool = False,
    search: str | None = None,
    query: str | None = None,
) -> int:
    """
    List Snapshots as JSONL with optional filters.

    Exit codes:
        0: Success (even if no results)
        2: Invalid arguments (--with-headers without --csv, or empty --csv column list)
    """
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import Snapshot
    from archivebox.search import (
        get_default_search_mode,
        get_search_mode,
        prioritize_metadata_matches,
        query_search_index,
    )

    if with_headers and not csv:
        rprint("[red]--with-headers requires --csv[/red]", file=sys.stderr)
        return 2

    is_tty = sys.stdout.isatty() and not csv

    queryset = Snapshot.objects.annotate(
        output_size_sum=Coalesce(Sum("archiveresult__output_size"), 0),
    ).order_by("-created_at")

    # Apply filters
    filter_kwargs = {
        "status": status,
        "url__icontains": url__icontains,
        "url__istartswith": url__istartswith,
        "crawl_id": crawl_id,
    }
    queryset = apply_filters(queryset, filter_kwargs)

    # Tag filter requires special handling (M2M)
    if tag:
        queryset = queryset.filter(tags__name__iexact=tag)

    query = (query or "").strip()
    if query:
        metadata_qs = queryset.filter(
            Q(title__icontains=query)
            | Q(url__icontains=query)
            | Q(timestamp__icontains=query)
            | Q(tags__name__icontains=query),
        )

        requested_search_mode = (search or "").strip().lower()
        if requested_search_mode == "content":
            requested_search_mode = "contents"
        search_mode = get_default_search_mode() if not requested_search_mode else get_search_mode(requested_search_mode)

        if search_mode == "meta":
            queryset = metadata_qs
        else:
            try:
                deep_qsearch = None
                if search_mode == "deep":
                    qsearch = query_search_index(query, search_mode="contents")
                    deep_qsearch = query_search_index(query, search_mode="deep")
                else:
                    qsearch = query_search_index(query, search_mode=search_mode)
                queryset = prioritize_metadata_matches(
                    queryset,
                    metadata_qs,
                    qsearch,
                    deep_queryset=deep_qsearch,
                    ordering=("-created_at",) if not sort else None,
                )
            except Exception as err:
                rprint(
                    f"[yellow]Search backend error, falling back to metadata search: {err}[/yellow]",
                    file=sys.stderr,
                )
                queryset = metadata_qs

    if sort:
        queryset = queryset.order_by(sort)

    if limit:
        queryset = queryset[:limit]

    count = 0

    if csv:
        cols = [col.strip() for col in csv.split(",") if col.strip()]
        if not cols:
            rprint("[red]No CSV columns provided[/red]", file=sys.stderr)
            return 2

        rows: list[str] = []
        if with_headers:
            rows.append(",".join(cols))
        for snapshot in queryset.iterator(chunk_size=500):
            rows.append(snapshot.to_csv(cols=cols, separator=","))
            count += 1

        output = "\n".join(rows)
        if output:
            sys.stdout.write(output)
            if not output.endswith("\n"):
                sys.stdout.write("\n")

        rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
        return 0

    for snapshot in queryset:
        if is_tty:
            status_color = {
                "queued": "yellow",
                "started": "blue",
                "sealed": "green",
            }.get(snapshot.status, "dim")
            rprint(f"[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}")
        else:
            write_record(snapshot.to_json())
        count += 1

    rprint(f"[dim]Listed {count} snapshots[/dim]", file=sys.stderr)
    return 0

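
# Example --csv invocation (a sketch: available columns depend on what
# Snapshot.to_csv() supports, and the row values below are made up purely
# for illustration):
#
#   archivebox snapshot list --status=sealed --csv=timestamp,url,title --with-headers
#
#   timestamp,url,title
#   1712345678.0,https://example.com,Example Domain
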
# =============================================================================
# UPDATE
# =============================================================================

def update_snapshots(
    status: str | None = None,
    tag: str | None = None,
) -> int:
    """
    Update Snapshots from stdin JSONL.

    Reads Snapshot records from stdin and applies updates.
    Uses PATCH semantics - only specified fields are updated.

    Exit codes:
        0: Success
        1: No input or error
    """
    from django.utils import timezone

    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.core.models import Snapshot

    is_tty = sys.stdout.isatty()

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    updated_count = 0

    for record in records:
        snapshot_id = record.get("id")
        if not snapshot_id:
            continue

        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)

            # Apply updates from CLI flags (override stdin values)
            if status:
                snapshot.status = status
                snapshot.retry_at = timezone.now()

            if tag:
                # Add tag to existing tags
                snapshot.save()  # Ensure saved before M2M
                from archivebox.core.models import Tag
                tag_obj, _ = Tag.objects.get_or_create(name=tag)
                snapshot.tags.add(tag_obj)

            snapshot.save()
            updated_count += 1

            if not is_tty:
                write_record(snapshot.to_json())
        except Snapshot.DoesNotExist:
            rprint(f"[yellow]Snapshot not found: {snapshot_id}[/yellow]", file=sys.stderr)
            continue

    rprint(f"[green]Updated {updated_count} snapshots[/green]", file=sys.stderr)
    return 0


# =============================================================================
# DELETE
# =============================================================================

def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Snapshots from stdin JSONL.

    Requires --yes flag to confirm deletion.

    Exit codes:
        0: Success
        1: No input or missing --yes flag
    """
    from archivebox.misc.jsonl import read_stdin
    from archivebox.core.models import Snapshot

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    snapshot_ids = [r.get("id") for r in records if r.get("id")]
    if not snapshot_ids:
        rprint("[yellow]No valid snapshot IDs in input[/yellow]", file=sys.stderr)
        return 1

    snapshots = Snapshot.objects.filter(id__in=snapshot_ids)
    count = snapshots.count()

    if count == 0:
        rprint("[yellow]No matching snapshots found[/yellow]", file=sys.stderr)
        return 0

    if dry_run:
        rprint(f"[yellow]Would delete {count} snapshots (dry run)[/yellow]", file=sys.stderr)
        for snapshot in snapshots:
            rprint(f"  [dim]{snapshot.id}[/dim] {snapshot.url[:60]}", file=sys.stderr)
        return 0

    if not yes:
        rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
        return 1

    # Perform deletion
    # NOTE: QuerySet.delete() returns the total rows removed, which includes
    # any cascaded related objects (e.g. ArchiveResults), not just Snapshots
    deleted_count, _ = snapshots.delete()
    rprint(f"[green]Deleted {deleted_count} snapshots[/green]", file=sys.stderr)
    return 0

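
# Typical update/delete pipelines (mirroring the module docstring examples;
# pair delete with --dry-run to preview before committing with --yes):
#
#   archivebox snapshot list --status=queued | archivebox snapshot update --status=started
#   archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --dry-run
#   archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
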
# =============================================================================
# CLI Commands
# =============================================================================

@click.group()
def main():
    """Manage Snapshot records."""
    pass


@main.command("create")
@click.argument("urls", nargs=-1)
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
@click.option("--depth", "-d", type=int, default=0, help="Crawl depth (default: 0)")
def create_cmd(urls: tuple, tag: str, status: str, depth: int):
    """Create Snapshots from URLs or stdin JSONL."""
    sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth))


@main.command("list")
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
@click.option("--url__icontains", help="Filter by URL contains")
@click.option("--url__istartswith", help="Filter by URL starts with")
@click.option("--tag", "-t", help="Filter by tag name")
@click.option("--crawl-id", help="Filter by crawl ID")
@click.option("--limit", "-n", type=int, help="Limit number of results")
@click.option("--sort", "-o", type=str, help="Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at")
@click.option("--csv", "-C", type=str, help="Print output as CSV with the provided fields, e.g.: timestamp,url,title")
@click.option("--with-headers", is_flag=True, help="Include column headers in CSV output (requires --csv)")
@click.option("--search", type=click.Choice(["meta", "content", "contents", "deep"]), help="Search mode to use for the query")
@click.argument("query", nargs=-1)
def list_cmd(
    status: str | None,
    url__icontains: str | None,
    url__istartswith: str | None,
    tag: str | None,
    crawl_id: str | None,
    limit: int | None,
    sort: str | None,
    csv: str | None,
    with_headers: bool,
    search: str | None,
    query: tuple[str, ...],
):
    """List Snapshots as JSONL."""
    sys.exit(
        list_snapshots(
            status=status,
            url__icontains=url__icontains,
            url__istartswith=url__istartswith,
            tag=tag,
            crawl_id=crawl_id,
            limit=limit,
            sort=sort,
            csv=csv,
            with_headers=with_headers,
            search=search,
            query=" ".join(query),
        ),
    )


@main.command("update")
@click.option("--status", "-s", help="Set status")
@click.option("--tag", "-t", help="Add tag")
def update_cmd(status: str | None, tag: str | None):
    """Update Snapshots from stdin JSONL."""
    sys.exit(update_snapshots(status=status, tag=tag))


@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Snapshots from stdin JSONL."""
    sys.exit(delete_snapshots(yes=yes, dry_run=dry_run))


if __name__ == "__main__":
    main()