Files
ArchiveBox/archivebox/cli/archivebox_crawl.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

385 lines
11 KiB
Python

#!/usr/bin/env python3
"""
archivebox crawl <action> [args...] [--filters]

Manage Crawl records.

Actions:
    create - Create Crawl jobs from URLs
    list   - List Crawls as JSONL (with optional filters)
    update - Update Crawls from stdin JSONL
    delete - Delete Crawls from stdin JSONL

Examples:
    # Create
    archivebox crawl create https://example.com https://foo.com --depth=1
    archivebox crawl create --tag=news https://example.com

    # List with filters
    archivebox crawl list --status=queued
    archivebox crawl list --urls__icontains=example.com

    # Update
    archivebox crawl list --status=started | archivebox crawl update --status=queued

    # Delete
    archivebox crawl list --urls__icontains=spam.com | archivebox crawl delete --yes

    # Full pipeline
    archivebox crawl create https://example.com | archivebox snapshot create | archivebox run
"""
# Pin the package name explicitly (Python consults __package__ when resolving
# relative imports).
__package__ = "archivebox.cli"
# Human-readable command string for this module.
# NOTE(review): presumably consumed by the CLI loader/help machinery — nothing
# in this file reads it; confirm against the caller.
__command__ = "archivebox crawl"
import sys
from collections.abc import Iterable
import rich_click as click
from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
# =============================================================================
# CREATE
# =============================================================================
def create_crawl(
    urls: Iterable[str],
    depth: int = 0,
    tag: str = "",
    status: str = "queued",
    created_by_id: int | None = None,
) -> int:
    """
    Create a single Crawl job from URLs given as arguments or read from stdin.

    Every collected URL is joined into one newline-separated ``urls`` value and
    handed to ``Crawl.from_json``. When stdout is not a TTY the created crawl is
    written as a JSONL record; human-readable messages always go to stderr.

    Pass-through: records with a non-Crawl ``type`` that carry neither ``url``
    nor ``urls``, and Crawl records that already have an ``id``, are re-emitted
    unchanged (only when stdout is piped) so they survive a shell pipeline.

    Args:
        urls: URLs (or other input accepted by ``read_args_or_stdin``).
        depth: Stored as the crawl's ``max_depth``.
        tag: Stored as the crawl's ``tags_str``.
        status: Initial status of the crawl.
        created_by_id: Owner pk; falls back to the system user when falsy.

    Exit codes:
        0: Success (including "only pass-through records, nothing to create")
        1: Failure (no input, no valid URLs, or crawl creation failed)
    """
    # User-controlled text (URLs, exception messages) must be escaped before it
    # is embedded in Rich markup: otherwise "[...]" is swallowed as a style tag
    # and "[/...]" raises MarkupError.
    from rich.markup import escape

    from archivebox.misc.jsonl import read_args_or_stdin, write_record, TYPE_CRAWL
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl

    created_by_id = created_by_id or get_or_create_system_user_pk()
    is_tty = sys.stdout.isatty()

    # Collect all input records up front so we can fail fast on empty input.
    records = list(read_args_or_stdin(urls))
    if not records:
        rprint("[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]", file=sys.stderr)
        return 1

    # Separate pass-through records from records that contribute URLs.
    url_list = []
    pass_through_records = []
    for record in records:
        record_type = record.get("type", "")

        # Typed, non-Crawl records with no URL payload are not ours to handle.
        if record_type and record_type != TYPE_CRAWL and not record.get("url") and not record.get("urls"):
            pass_through_records.append(record)
            continue

        # Crawl records that already have an id exist already: forward as-is.
        if record_type == TYPE_CRAWL and record.get("id"):
            pass_through_records.append(record)
            continue

        # Single-URL field.
        url = record.get("url")
        if url:
            url_list.append(url)

        # Multi-URL field: newline-separated, blank lines and "#" comments skipped.
        urls_field = record.get("urls")
        if urls_field:
            for line in urls_field.split("\n"):
                line = line.strip()
                if line and not line.startswith("#"):
                    url_list.append(line)

    # Emit pass-through records before the new crawl (piped output only).
    if not is_tty:
        for record in pass_through_records:
            write_record(record)

    if not url_list:
        if pass_through_records:
            # Pass-through records but no URLs is a valid no-op, not an error.
            rprint(f"[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]", file=sys.stderr)
            return 0
        rprint("[red]No valid URLs found[/red]", file=sys.stderr)
        return 1

    try:
        # One crawl record holding every URL as a newline-separated string.
        crawl_record = {
            "urls": "\n".join(url_list),
            "max_depth": depth,
            "tags_str": tag,
            "status": status,
            "label": "",
        }
        crawl = Crawl.from_json(crawl_record, overrides={"created_by_id": created_by_id})
        if not crawl:
            rprint("[red]Failed to create crawl[/red]", file=sys.stderr)
            return 1

        # Machine-readable JSONL goes to stdout only when it is piped.
        if not is_tty:
            write_record(crawl.to_json())

        rprint(f"[green]Created crawl with {len(url_list)} URLs[/green]", file=sys.stderr)

        # Interactive use: show the crawl id and a short preview of its URLs.
        if is_tty:
            rprint(f" [dim]{crawl.id}[/dim]", file=sys.stderr)
            for url in url_list[:5]:  # preview at most 5 URLs
                rprint(f" {escape(url[:70])}", file=sys.stderr)
            if len(url_list) > 5:
                rprint(f" ... and {len(url_list) - 5} more", file=sys.stderr)
        return 0
    except Exception as e:
        # CLI boundary: report the failure and turn it into a non-zero exit code.
        rprint(f"[red]Error creating crawl: {escape(str(e))}[/red]", file=sys.stderr)
        return 1
# =============================================================================
# LIST
# =============================================================================
def list_crawls(
    status: str | None = None,
    urls__icontains: str | None = None,
    max_depth: int | None = None,
    limit: int | None = None,
) -> int:
    """
    List Crawls, newest first, with optional filters.

    When stdout is a TTY each crawl is printed as a coloured one-line summary
    (status, id, first 50 characters of its URLs); otherwise each crawl is
    written as a JSONL record. A summary count is always printed to stderr.

    Args:
        status: Filter value for the ``status`` field.
        urls__icontains: Filter value for the ``urls__icontains`` lookup.
        max_depth: Filter value for the ``max_depth`` field.
        limit: Passed to ``apply_filters`` as ``limit``.

    Exit codes:
        0: Success (even if no results)
    """
    # Stored values (URLs, status) are user-controlled; escape them so that
    # square brackets are shown literally instead of being parsed as Rich markup.
    from rich.markup import escape

    from archivebox.misc.jsonl import write_record
    from archivebox.crawls.models import Crawl

    is_tty = sys.stdout.isatty()

    # Loop-invariant status -> colour table; unknown statuses render "dim".
    status_colors = {
        "queued": "yellow",
        "started": "blue",
        "sealed": "green",
    }

    queryset = Crawl.objects.all().order_by("-created_at")

    # Apply filters
    filter_kwargs = {
        "status": status,
        "urls__icontains": urls__icontains,
        "max_depth": max_depth,
    }
    queryset = apply_filters(queryset, filter_kwargs, limit=limit)

    count = 0
    for crawl in queryset:
        if is_tty:
            status_color = status_colors.get(crawl.status, "dim")
            status_label = escape(f"{crawl.status:8}")
            # Flatten the newline-separated URL list into a one-line preview.
            url_preview = escape(crawl.urls[:50].replace("\n", " "))
            rprint(f"[{status_color}]{status_label}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...")
        else:
            write_record(crawl.to_json())
        count += 1

    rprint(f"[dim]Listed {count} crawls[/dim]", file=sys.stderr)
    return 0
# =============================================================================
# UPDATE
# =============================================================================
def update_crawls(
    status: str | None = None,
    max_depth: int | None = None,
) -> int:
    """
    Update Crawls identified by JSONL records read from stdin.

    Each input record's ``id`` is looked up as a Crawl; records without an
    ``id`` are skipped silently and unknown ids are reported on stderr.
    Uses PATCH semantics - only the fields given as arguments are changed.
    When stdout is not a TTY, every updated crawl is written back as JSONL.

    Args:
        status: New status; when set, ``retry_at`` is also reset to now.
        max_depth: New max depth (``0`` is a valid value, hence the
            ``is not None`` check).

    Exit codes:
        0: Success
        1: No input or error
    """
    from django.utils import timezone

    # Ids come straight from stdin; escape them so that square brackets in a
    # malformed id cannot be parsed as Rich markup in the "not found" message.
    from rich.markup import escape

    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.crawls.models import Crawl

    is_tty = sys.stdout.isatty()

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    updated_count = 0
    for record in records:
        crawl_id = record.get("id")
        if not crawl_id:
            continue
        try:
            crawl = Crawl.objects.get(id=crawl_id)

            # Apply updates from CLI flags
            if status:
                crawl.status = status
                crawl.retry_at = timezone.now()
            if max_depth is not None:
                crawl.max_depth = max_depth
            crawl.save()
            updated_count += 1

            if not is_tty:
                write_record(crawl.to_json())
        except Crawl.DoesNotExist:
            rprint(f"[yellow]Crawl not found: {escape(str(crawl_id))}[/yellow]", file=sys.stderr)
            continue

    rprint(f"[green]Updated {updated_count} crawls[/green]", file=sys.stderr)
    return 0
# =============================================================================
# DELETE
# =============================================================================
def delete_crawls(yes: bool = False, dry_run: bool = False) -> int:
    """
    Delete Crawls identified by JSONL records read from stdin.

    Requires --yes to actually delete. With --dry-run the matching crawls are
    only listed on stderr and nothing is deleted (checked before --yes).

    Args:
        yes: Confirm the deletion.
        dry_run: Only show what would be deleted.

    Exit codes:
        0: Success (including dry run and "no matching crawls")
        1: No input, no ids in the input, or missing --yes flag
    """
    # Stored URLs are user-controlled; escape them before embedding in markup.
    from rich.markup import escape

    from archivebox.misc.jsonl import read_stdin
    from archivebox.crawls.models import Crawl

    records = list(read_stdin())
    if not records:
        rprint("[yellow]No records provided via stdin[/yellow]", file=sys.stderr)
        return 1

    crawl_ids = [r.get("id") for r in records if r.get("id")]
    if not crawl_ids:
        rprint("[yellow]No valid crawl IDs in input[/yellow]", file=sys.stderr)
        return 1

    crawls = Crawl.objects.filter(id__in=crawl_ids)
    count = crawls.count()
    if count == 0:
        rprint("[yellow]No matching crawls found[/yellow]", file=sys.stderr)
        return 0

    if dry_run:
        rprint(f"[yellow]Would delete {count} crawls (dry run)[/yellow]", file=sys.stderr)
        for crawl in crawls:
            url_preview = escape(crawl.urls[:50].replace("\n", " "))
            rprint(f" [dim]{crawl.id}[/dim] {url_preview}...", file=sys.stderr)
        return 0

    if not yes:
        rprint("[red]Use --yes to confirm deletion[/red]", file=sys.stderr)
        return 1

    # Perform deletion. QuerySet.delete() returns (total, {model_label: n}) and
    # the total also counts cascaded related objects, so report the per-model
    # figure for Crawl; fall back to the total if the label is missing.
    total_deleted, deleted_by_model = crawls.delete()
    deleted_count = deleted_by_model.get(Crawl._meta.label, total_deleted)
    rprint(f"[green]Deleted {deleted_count} crawls[/green]", file=sys.stderr)
    return 0
# =============================================================================
# CLI Commands
# =============================================================================
@click.group()
def main():
    """Manage Crawl records."""
    # Intentionally empty: the group only dispatches to the subcommands
    # registered on it via @main.command(...).
@main.command("create")
@click.argument("urls", nargs=-1)
@click.option("--depth", "-d", type=int, default=0, help="Max crawl depth (default: 0)")
@click.option("--tag", "-t", default="", help="Comma-separated tags to add")
@click.option("--status", "-s", default="queued", help="Initial status (default: queued)")
def create_cmd(urls: tuple, depth: int, tag: str, status: str):
    """Create a Crawl job from URLs or stdin."""
    # Delegate to create_crawl() and use its return value as the process exit code.
    exit_code = create_crawl(urls, depth=depth, tag=tag, status=status)
    sys.exit(exit_code)
@main.command("list")
@click.option("--status", "-s", help="Filter by status (queued, started, sealed)")
@click.option("--urls__icontains", help="Filter by URLs contains")
@click.option("--max-depth", type=int, help="Filter by max depth")
@click.option("--limit", "-n", type=int, help="Limit number of results")
def list_cmd(
    status: str | None,
    urls__icontains: str | None,
    max_depth: int | None,
    limit: int | None,
):
    """List Crawls as JSONL."""
    # Forward the CLI options unchanged to list_crawls(); its return value
    # becomes the process exit code.
    options = {
        "status": status,
        "urls__icontains": urls__icontains,
        "max_depth": max_depth,
        "limit": limit,
    }
    exit_code = list_crawls(**options)
    sys.exit(exit_code)
@main.command("update")
@click.option("--status", "-s", help="Set status")
@click.option("--max-depth", type=int, help="Set max depth")
def update_cmd(status: str | None, max_depth: int | None):
    """Update Crawls from stdin JSONL."""
    # Delegate to update_crawls() and exit with the code it returns.
    exit_code = update_crawls(status=status, max_depth=max_depth)
    sys.exit(exit_code)
@main.command("delete")
@click.option("--yes", "-y", is_flag=True, help="Confirm deletion")
@click.option("--dry-run", is_flag=True, help="Show what would be deleted")
def delete_cmd(yes: bool, dry_run: bool):
    """Delete Crawls from stdin JSONL."""
    # Delegate to delete_crawls() and exit with the code it returns.
    exit_code = delete_crawls(yes=yes, dry_run=dry_run)
    sys.exit(exit_code)
# Script entry point: when this module is executed directly, run the click
# group so its subcommands are dispatched from the command line.
if __name__ == "__main__":
    main()