#!/usr/bin/env python3

"""
archivebox crawl [urls_or_snapshot_ids...] [--depth=N] [--plugin=NAME]

Discover outgoing links from URLs or existing Snapshots.

If a URL is passed, creates a Snapshot for it first, then runs parser plugins.
If a snapshot_id is passed, runs parser plugins on the existing Snapshot.
Outputs discovered outlink URLs as JSONL. Pipe the output to `archivebox snapshot`
to archive the discovered URLs.

Input formats:
- Plain URLs (one per line)
- Snapshot UUIDs (one per line)
- JSONL: {"type": "Snapshot", "url": "...", ...}
- JSONL: {"type": "Snapshot", "id": "...", ...}

Output (JSONL):
{"type": "Snapshot", "url": "https://discovered-url.com", "via_extractor": "...", ...}

Examples:
    # Discover links from a page (creates snapshot first)
    archivebox crawl https://example.com

    # Discover links from an existing snapshot
    archivebox crawl 01234567-89ab-cdef-0123-456789abcdef

    # Full recursive crawl pipeline
    archivebox crawl https://example.com | archivebox snapshot | archivebox extract

    # Use only a specific parser plugin
    archivebox crawl --plugin=parse_html_urls https://example.com

    # Chain: create snapshot, then crawl its outlinks
    archivebox snapshot https://example.com | archivebox crawl | archivebox snapshot | archivebox extract
"""

__package__ = 'archivebox.cli'
__command__ = 'archivebox crawl'

import sys
import json
from pathlib import Path
from typing import Optional

import rich_click as click

from archivebox.misc.util import docstring


def discover_outlinks(
    args: tuple,
    depth: int = 1,
    plugin: str = '',
    wait: bool = True,
    records: Optional[list] = None,
) -> int:
    """
    Discover outgoing links from URLs or existing Snapshots.

    Accepts URLs or snapshot_ids. For URLs, creates Snapshots first.
    Runs parser plugins and outputs discovered URLs as JSONL.
    The output can be piped to `archivebox snapshot` to archive the discovered links.

    Exit codes:
        0: Success
        1: Failure
    """
    from rich import print as rprint
    from django.utils import timezone

    from archivebox.misc.jsonl import (
        read_args_or_stdin,
        write_record,
        TYPE_SNAPSHOT,
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.crawls.models import Crawl
    from archivebox.config import CONSTANTS
    from workers.orchestrator import Orchestrator

    created_by_id = get_or_create_system_user_pk()
    is_tty = sys.stdout.isatty()

    # Collect all input records (accept pre-read records from main(), which may have already consumed stdin)
    if records is None:
        records = list(read_args_or_stdin(args))

    if not records:
        rprint('[yellow]No URLs or snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        return 1
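
    # Each record yielded by read_args_or_stdin is a dict. The shapes below are illustrative,
    # based on the input formats listed in the module docstring (how bare URLs and UUIDs are
    # normalized into dicts is an assumption about read_args_or_stdin, not verified here):
    #   {"type": "Snapshot", "url": "https://example.com"}                  # new URL to snapshot
    #   {"type": "Snapshot", "id": "01234567-89ab-cdef-0123-456789abcdef"}  # existing Snapshot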
    # Separate records into existing snapshots vs new URLs
    existing_snapshot_ids = []
    new_url_records = []

    for record in records:
        # Check if it's an existing snapshot (has id but no url, or looks like a UUID)
        if record.get('id') and not record.get('url'):
            existing_snapshot_ids.append(record['id'])
        elif record.get('id'):
            # Has both id and url - check if snapshot exists
            try:
                Snapshot.objects.get(id=record['id'])
                existing_snapshot_ids.append(record['id'])
            except Snapshot.DoesNotExist:
                new_url_records.append(record)
        elif record.get('url'):
            new_url_records.append(record)

    # For new URLs, create a Crawl and Snapshots
    snapshot_ids = list(existing_snapshot_ids)

    if new_url_records:
        # Create a Crawl to manage this operation
        sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__crawl.txt'
        sources_file.parent.mkdir(parents=True, exist_ok=True)
        sources_file.write_text('\n'.join(r.get('url', '') for r in new_url_records if r.get('url')))

        crawl = Crawl.from_file(
            sources_file,
            max_depth=depth,
            label=f'crawl --depth={depth}',
            created_by=created_by_id,
        )
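
        # Depth bookkeeping: the --depth value is passed to the Crawl as max_depth, while each
        # Snapshot record carries its own depth (defaulting to 0 for these root records;
        # discovered outlinks are emitted below with parent depth + 1).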
        # Create snapshots for new URLs
        for record in new_url_records:
            try:
                record['crawl_id'] = str(crawl.id)
                record['depth'] = record.get('depth', 0)
                overrides = {'created_by_id': created_by_id}
                snapshot = Snapshot.from_jsonl(record, overrides=overrides)
                if snapshot:
                    snapshot_ids.append(str(snapshot.id))
            except Exception as e:
                rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
                continue

    if not snapshot_ids:
        rprint('[red]No snapshots to process[/red]', file=sys.stderr)
        return 1

    if existing_snapshot_ids:
        rprint(f'[blue]Using {len(existing_snapshot_ids)} existing snapshots[/blue]', file=sys.stderr)
    if new_url_records:
        rprint(f'[blue]Created {len(snapshot_ids) - len(existing_snapshot_ids)} new snapshots[/blue]', file=sys.stderr)

    rprint(f'[blue]Running parser plugins on {len(snapshot_ids)} snapshots...[/blue]', file=sys.stderr)

    # Create ArchiveResults for plugins
    # If --plugin is specified, only run that one. Otherwise, run all available plugins.
    # The orchestrator will handle dependency ordering (plugins declare deps in config.json)
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)

            if plugin:
                # User specified a single plugin to run
                ArchiveResult.objects.get_or_create(
                    snapshot=snapshot,
                    extractor=plugin,
                    defaults={
                        'status': ArchiveResult.StatusChoices.QUEUED,
                        'retry_at': timezone.now(),
                    },
                )
            else:
                # Create pending ArchiveResults for all enabled plugins
                # This uses hook discovery to find available plugins dynamically
                snapshot.create_pending_archiveresults()

            # Mark snapshot as started
            snapshot.status = Snapshot.StatusChoices.STARTED
            snapshot.retry_at = timezone.now()
            snapshot.save()
        except Snapshot.DoesNotExist:
            continue

    # Run plugins
    if wait:
        rprint('[blue]Running outlink plugins...[/blue]', file=sys.stderr)
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.runloop()

    # Collect discovered URLs from urls.jsonl files
    # Uses dynamic discovery - any plugin that outputs urls.jsonl is considered a parser
    from archivebox.hooks import collect_urls_from_plugins

    discovered_urls = {}
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
            snapshot_dir = Path(snapshot.output_dir)

            # Dynamically collect urls.jsonl from ANY plugin subdirectory
            for entry in collect_urls_from_plugins(snapshot_dir):
                url = entry.get('url')
                if url and url not in discovered_urls:
                    # Add metadata for crawl tracking
                    entry['type'] = TYPE_SNAPSHOT
                    entry['depth'] = snapshot.depth + 1
                    entry['via_snapshot'] = str(snapshot.id)
                    discovered_urls[url] = entry
        except Snapshot.DoesNotExist:
            continue

    rprint(f'[green]Discovered {len(discovered_urls)} URLs[/green]', file=sys.stderr)

    # Output discovered URLs as JSONL (when piped) or human-readable (when TTY)
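    # An emitted record looks roughly like this (illustrative: url and via_extractor come from
    # the plugin's urls.jsonl entry; only type, depth, and via_snapshot are added above):
    #   {"type": "Snapshot", "url": "https://discovered-url.com", "depth": 1,
    #    "via_snapshot": "01234567-89ab-cdef-0123-456789abcdef", "via_extractor": "parse_html_urls"}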
    for url, entry in discovered_urls.items():
        if is_tty:
            via = entry.get('via_extractor', 'unknown')
            rprint(f' [dim]{via}[/dim] {url[:80]}', file=sys.stderr)
        else:
            write_record(entry)

    return 0


def process_crawl_by_id(crawl_id: str) -> int:
    """
    Process a single Crawl by ID (used by workers).

    Triggers the Crawl's state machine tick() which will:
    - Transition from queued -> started (creates root snapshot)
    - Transition from started -> sealed (when all snapshots done)
    """
    from rich import print as rprint
    from archivebox.crawls.models import Crawl

    try:
        crawl = Crawl.objects.get(id=crawl_id)
    except Crawl.DoesNotExist:
        rprint(f'[red]Crawl {crawl_id} not found[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Processing Crawl {crawl.id} (status={crawl.status})[/blue]', file=sys.stderr)

    try:
        crawl.sm.tick()
        crawl.refresh_from_db()
        rprint(f'[green]Crawl complete (status={crawl.status})[/green]', file=sys.stderr)
        return 0
    except Exception as e:
        rprint(f'[red]Crawl error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
        return 1


def is_crawl_id(value: str) -> bool:
    """Check if value looks like a Crawl UUID."""
    import re

    uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I)
    if not uuid_pattern.match(value):
        return False

    # Verify it's actually a Crawl (not a Snapshot or other object)
    from archivebox.crawls.models import Crawl
    return Crawl.objects.filter(id=value).exists()


@click.command()
@click.option('--depth', '-d', type=int, default=1, help='Max depth for recursive crawling (default: 1)')
@click.option('--plugin', '-p', default='', help='Use only this parser plugin (e.g., parse_html_urls, parse_dom_outlinks)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
def main(depth: int, plugin: str, wait: bool, args: tuple):
    """Discover outgoing links from URLs or existing Snapshots, or process a Crawl by ID"""
    from archivebox.misc.jsonl import read_args_or_stdin

    # Read all input
    records = list(read_args_or_stdin(args))

    if not records:
        from rich import print as rprint
        rprint('[yellow]No URLs, Snapshot IDs, or Crawl IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        sys.exit(1)

    # Check if input looks like existing Crawl IDs to process
    # If ALL inputs are Crawl UUIDs, process them
    all_are_crawl_ids = all(
        is_crawl_id(r.get('id') or r.get('url', ''))
        for r in records
    )

    if all_are_crawl_ids:
        # Process existing Crawls by ID
        exit_code = 0
        for record in records:
            crawl_id = record.get('id') or record.get('url')
            result = process_crawl_by_id(crawl_id)
            if result != 0:
                exit_code = result
        sys.exit(exit_code)
    else:
        # Default behavior: discover outlinks from input (URLs or Snapshot IDs).
        # Pass the records we already read so stdin is not consumed twice.
        sys.exit(discover_outlinks(args, depth=depth, plugin=plugin, wait=wait, records=records))


if __name__ == '__main__':
    main()