From dd2302ad92fde449cc0c0c4860e0846e195c6fef Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Dec 2025 16:12:53 -0800 Subject: [PATCH] new jsonl cli interface --- TODO_cli_refactor.md | 131 ++++++ archivebox.ts | 4 +- archivebox/cli/__init__.py | 31 +- archivebox/cli/archivebox_archiveresult.py | 365 ++++++++++++++++ archivebox/cli/archivebox_binary.py | 304 +++++++++++++ archivebox/cli/archivebox_crawl.py | 356 ++++++++++----- archivebox/cli/archivebox_extract.py | 265 ------------ archivebox/cli/archivebox_init.py | 2 +- archivebox/cli/archivebox_machine.py | 113 +++++ archivebox/cli/archivebox_orchestrator.py | 67 --- archivebox/cli/archivebox_process.py | 121 ++++++ archivebox/cli/archivebox_remove.py | 98 ----- archivebox/cli/archivebox_run.py | 155 +++++++ archivebox/cli/archivebox_search.py | 131 ------ archivebox/cli/archivebox_snapshot.py | 406 ++++++++++++------ archivebox/cli/archivebox_tag.py | 307 +++++++++++++ archivebox/cli/tests_piping.py | 73 ++-- archivebox/core/forms.py | 2 +- archivebox/core/models.py | 193 +++++++-- archivebox/crawls/models.py | 69 ++- archivebox/hooks.py | 64 +-- archivebox/machine/models.py | 208 ++++++++- archivebox/misc/jsonl.py | 35 +- ...n_Crawl__00_install_puppeteer_chromium.py} | 87 +++- .../on_Crawl__10_chrome_validate_config.py | 172 -------- ...bg.js => on_Crawl__30_chrome_launch.bg.js} | 4 +- ...l_istilldontcareaboutcookies_extension.js} | 0 .../singlefile/on_Crawl__04_singlefile.js | 268 ------------ ..._Crawl__20_install_singlefile_extension.js | 281 ++++++++++++ .../singlefile/tests/test_singlefile.py | 181 ++------ .../{captcha2 => twocaptcha}/config.json | 0 ...Crawl__20_install_twocaptcha_extension.js} | 4 +- ...configure_twocaptcha_extension_options.js} | 6 +- .../templates/icon.html | 0 .../tests/test_twocaptcha.py} | 18 +- ... 
on_Crawl__20_install_ublock_extension.js} | 0
 ...config.py => on_Crawl__10_install_wget.py} | 0
 37 files changed, 2919 insertions(+), 1602 deletions(-)
 create mode 100644 TODO_cli_refactor.md
 create mode 100644 archivebox/cli/archivebox_archiveresult.py
 create mode 100644 archivebox/cli/archivebox_binary.py
 delete mode 100644 archivebox/cli/archivebox_extract.py
 create mode 100644 archivebox/cli/archivebox_machine.py
 delete mode 100644 archivebox/cli/archivebox_orchestrator.py
 create mode 100644 archivebox/cli/archivebox_process.py
 delete mode 100644 archivebox/cli/archivebox_remove.py
 create mode 100644 archivebox/cli/archivebox_run.py
 delete mode 100644 archivebox/cli/archivebox_search.py
 create mode 100644 archivebox/cli/archivebox_tag.py
 rename archivebox/plugins/chrome/{on_Crawl__00_chrome_install.py => on_Crawl__00_install_puppeteer_chromium.py} (68%)
 delete mode 100644 archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py
 rename archivebox/plugins/chrome/{on_Crawl__20_chrome_launch.bg.js => on_Crawl__30_chrome_launch.bg.js} (98%)
 rename archivebox/plugins/istilldontcareaboutcookies/{on_Crawl__02_istilldontcareaboutcookies.js => on_Crawl__20_install_istilldontcareaboutcookies_extension.js} (100%)
 delete mode 100755 archivebox/plugins/singlefile/on_Crawl__04_singlefile.js
 create mode 100755 archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js
 rename archivebox/plugins/{captcha2 => twocaptcha}/config.json (100%)
 rename archivebox/plugins/{captcha2/on_Crawl__01_captcha2.js => twocaptcha/on_Crawl__20_install_twocaptcha_extension.js} (97%)
 rename archivebox/plugins/{captcha2/on_Crawl__11_captcha2_config.js => twocaptcha/on_Crawl__25_configure_twocaptcha_extension_options.js} (97%)
 rename archivebox/plugins/{captcha2 => twocaptcha}/templates/icon.html (100%)
 rename archivebox/plugins/{captcha2/tests/test_captcha2.py => twocaptcha/tests/test_twocaptcha.py} (90%)
 rename archivebox/plugins/ublock/{on_Crawl__03_ublock.js => on_Crawl__20_install_ublock_extension.js} (100%)
 rename archivebox/plugins/wget/{on_Crawl__10_wget_validate_config.py => on_Crawl__10_install_wget.py} (100%)

diff --git a/TODO_cli_refactor.md b/TODO_cli_refactor.md
new file mode 100644
index 00000000..0ce5e092
--- /dev/null
+++ b/TODO_cli_refactor.md
@@ -0,0 +1,131 @@
+# ArchiveBox CLI Refactor TODO
+
+## Design Decisions
+
+1. **Keep `archivebox add`** as high-level convenience command
+2. **Unified `archivebox run`** for processing (replaces per-model `run` and `orchestrator`)
+3. **Expose all models** including binary, process, machine
+4. **Clean break** from old command structure (no backward compatibility aliases)
+
+## Final Architecture
+
+```
+archivebox <model> <action> [args...] [--filters]
+archivebox run [stdin JSONL]
+```
+
+### Actions (4 per model):
+- `create` - Create records (from args, stdin, or JSONL), dedupes by indexed fields
+- `list` - Query records (with filters, returns JSONL)
+- `update` - Modify records (from stdin JSONL, PATCH semantics)
+- `delete` - Remove records (from stdin JSONL, requires --yes)
+
+### Unified Run Command:
+- `archivebox run` - Process queued work
+  - With stdin JSONL: Process piped records, exit when complete
+  - Without stdin (TTY): Run orchestrator in foreground until killed
+
+### Models (7 total):
+- `crawl` - Crawl jobs
+- `snapshot` - Individual archived pages
+- `archiveresult` - Plugin extraction results
+- `tag` - Tags/labels
+- `binary` - Detected binaries (chrome, wget, etc.)
+- `process` - Process execution records (read-only)
+- `machine` - Machine/host records (read-only)
+
+---
+
+## Implementation Checklist
+
+### Phase 1: Unified Run Command
+- [x] Create `archivebox/cli/archivebox_run.py` - unified processing command
+
+### Phase 2: Core Model Commands
+- [x] Refactor `archivebox/cli/archivebox_snapshot.py` to Click group with create|list|update|delete
+- [x] Refactor `archivebox/cli/archivebox_crawl.py` to Click group with create|list|update|delete
+- [x] Create `archivebox/cli/archivebox_archiveresult.py` with create|list|update|delete
+- [x] Create `archivebox/cli/archivebox_tag.py` with create|list|update|delete
+
+### Phase 3: System Model Commands
+- [x] Create `archivebox/cli/archivebox_binary.py` with create|list|update|delete
+- [x] Create `archivebox/cli/archivebox_process.py` with list only (read-only)
+- [x] Create `archivebox/cli/archivebox_machine.py` with list only (read-only)
+
+### Phase 4: Registry & Cleanup
+- [x] Update `archivebox/cli/__init__.py` command registry
+- [x] Delete `archivebox/cli/archivebox_extract.py`
+- [x] Delete `archivebox/cli/archivebox_remove.py`
+- [x] Delete `archivebox/cli/archivebox_search.py`
+- [x] Delete `archivebox/cli/archivebox_orchestrator.py`
+- [x] Update `archivebox/cli/archivebox_add.py` internals (no changes needed - uses models directly)
+- [x] Update `archivebox/cli/tests_piping.py`
+
+### Phase 5: Tests for New Commands
+- [ ] Add tests for `archivebox run` command
+- [ ] Add tests for `archivebox crawl create|list|update|delete`
+- [ ] Add tests for `archivebox snapshot create|list|update|delete`
+- [ ] Add tests for `archivebox archiveresult create|list|update|delete`
+- [ ] Add tests for `archivebox tag create|list|update|delete`
+- [ ] Add tests for `archivebox binary create|list|update|delete`
+- [ ] Add tests for `archivebox process list`
+- [ ] Add tests for `archivebox machine list`
+
+---
+
+## Usage Examples
+
+### Basic CRUD
+```bash
+# Create
+archivebox crawl create https://example.com https://foo.com --depth=1
+archivebox snapshot create https://example.com --tag=news
+
+# List with filters
+archivebox crawl list --status=queued
+archivebox snapshot list --url__icontains=example.com
+archivebox archiveresult list --status=failed --plugin=screenshot
+
+# Update (reads JSONL from stdin, applies changes)
+archivebox snapshot list --tag=old | archivebox snapshot update --tag=new
+
+# Delete (requires --yes)
+archivebox crawl list --url__icontains=example.com | archivebox crawl delete --yes
+```
+
+### Unified Run Command
+```bash
+# Run orchestrator in foreground (replaces `archivebox orchestrator`)
+archivebox run
+
+# Process specific records (pipe any JSONL type, exits when done)
+archivebox snapshot list --status=queued | archivebox run
+archivebox archiveresult list --status=failed | archivebox run
+archivebox crawl list --status=queued | archivebox run
+
+# Mixed types work too - run handles any JSONL
+cat mixed_records.jsonl | archivebox run
+```
+
+### Composed Workflows
+```bash
+# Full pipeline (replaces old `archivebox add`)
+archivebox crawl create https://example.com --status=queued \
+    | archivebox snapshot create --status=queued \
+    | archivebox archiveresult create --status=queued \
+    | archivebox run
+
+# Re-run failed extractions
+archivebox archiveresult list --status=failed | archivebox run
+
+# Delete all snapshots for a domain
+archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
+```
+
+### Keep `archivebox add` as convenience
+```bash +# This remains the simple user-friendly interface: +archivebox add https://example.com --depth=1 --tag=news + +# Internally equivalent to the composed pipeline above +``` diff --git a/archivebox.ts b/archivebox.ts index bf27cac5..e21b549d 100644 --- a/archivebox.ts +++ b/archivebox.ts @@ -478,7 +478,7 @@ interface LoadedChromeExtension extends ChromeExtension { const CHROME_EXTENSIONS: LoadedChromeExtension[] = [ // Content access / unblocking / blocking plugins - {webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', name: 'captcha2'}, // https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer + {webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', name: 'twocaptcha'}, // https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer {webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm', name: 'istilldontcareaboutcookies'}, {webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm', name: 'ublock'}, // {webstore_id: 'mlomiejdfkolichcflejclcbmpeaniij', name: 'ghostery'}, @@ -1123,7 +1123,7 @@ async function setup2CaptchaExtension({browser, extensions}) { try { // open a new tab to finish setting up the 2captcha extension manually using its extension options page page = await browser.newPage() - const { options_url } = extensions.filter(ext => ext.name === 'captcha2')[0] + const { options_url } = extensions.filter(ext => ext.name === 'twocaptcha')[0] await page.goto(options_url) await wait(2_500) await page.bringToFront() diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 5a33e11a..c0d35a54 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -27,36 +27,43 @@ class ArchiveBoxGroup(click.Group): 'init': 'archivebox.cli.archivebox_init.main', 'install': 'archivebox.cli.archivebox_install.main', } + # Model commands (CRUD operations via subcommands) + model_commands = { + 'crawl': 'archivebox.cli.archivebox_crawl.main', + 'snapshot': 'archivebox.cli.archivebox_snapshot.main', + 'archiveresult': 'archivebox.cli.archivebox_archiveresult.main', + 'tag': 'archivebox.cli.archivebox_tag.main', + 'binary': 'archivebox.cli.archivebox_binary.main', + 'process': 'archivebox.cli.archivebox_process.main', + 'machine': 'archivebox.cli.archivebox_machine.main', + } archive_commands = { + # High-level commands 'add': 'archivebox.cli.archivebox_add.main', - 'remove': 'archivebox.cli.archivebox_remove.main', + 'run': 'archivebox.cli.archivebox_run.main', 'update': 'archivebox.cli.archivebox_update.main', - 'search': 'archivebox.cli.archivebox_search.main', 'status': 'archivebox.cli.archivebox_status.main', 'config': 'archivebox.cli.archivebox_config.main', 'schedule': 'archivebox.cli.archivebox_schedule.main', 'server': 'archivebox.cli.archivebox_server.main', 'shell': 'archivebox.cli.archivebox_shell.main', 'manage': 'archivebox.cli.archivebox_manage.main', - # Worker/orchestrator commands - 'orchestrator': 'archivebox.cli.archivebox_orchestrator.main', + # Worker command 'worker': 'archivebox.cli.archivebox_worker.main', - # Task commands (called by workers as subprocesses) - 'crawl': 'archivebox.cli.archivebox_crawl.main', - 'snapshot': 'archivebox.cli.archivebox_snapshot.main', - 'extract': 'archivebox.cli.archivebox_extract.main', } all_subcommands = { **meta_commands, **setup_commands, + **model_commands, **archive_commands, } renamed_commands = { 'setup': 'install', - 'list': 'search', 'import': 'add', 'archive': 'add', - 'export': 'search', + # Old commands replaced by new model commands + 'orchestrator': 'run', + 'extract': 
'archiveresult', } @classmethod @@ -110,9 +117,9 @@ def cli(ctx, help=False): if help or ctx.invoked_subcommand is None: ctx.invoke(ctx.command.get_command(ctx, 'help')) - # if the subcommand is in the archive_commands dict and is not 'manage', + # if the subcommand is in archive_commands or model_commands, # then we need to set up the django environment and check that we're in a valid data folder - if subcommand in ArchiveBoxGroup.archive_commands: + if subcommand in ArchiveBoxGroup.archive_commands or subcommand in ArchiveBoxGroup.model_commands: # print('SETUP DJANGO AND CHECK DATA FOLDER') try: from archivebox.config.django import setup_django diff --git a/archivebox/cli/archivebox_archiveresult.py b/archivebox/cli/archivebox_archiveresult.py new file mode 100644 index 00000000..1f725a03 --- /dev/null +++ b/archivebox/cli/archivebox_archiveresult.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python3 + +""" +archivebox archiveresult [args...] [--filters] + +Manage ArchiveResult records (plugin extraction results). + +Actions: + create - Create ArchiveResults for Snapshots (queue extractions) + list - List ArchiveResults as JSONL (with optional filters) + update - Update ArchiveResults from stdin JSONL + delete - Delete ArchiveResults from stdin JSONL + +Examples: + # Create ArchiveResults for snapshots (queue for extraction) + archivebox snapshot list --status=queued | archivebox archiveresult create + archivebox archiveresult create --plugin=screenshot --snapshot-id= + + # List with filters + archivebox archiveresult list --status=failed + archivebox archiveresult list --plugin=screenshot --status=succeeded + + # Update (reset failed extractions to queued) + archivebox archiveresult list --status=failed | archivebox archiveresult update --status=queued + + # Delete + archivebox archiveresult list --plugin=singlefile | archivebox archiveresult delete --yes + + # Re-run failed extractions + archivebox archiveresult list --status=failed | archivebox run +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox archiveresult' + +import sys +from typing import Optional + +import rich_click as click +from rich import print as rprint + + +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + + if limit: + queryset = queryset[:limit] + + return queryset + + +# ============================================================================= +# CREATE +# ============================================================================= + +def create_archiveresults( + snapshot_id: Optional[str] = None, + plugin: Optional[str] = None, + status: str = 'queued', +) -> int: + """ + Create ArchiveResults for Snapshots. + + Reads Snapshot records from stdin and creates ArchiveResult entries. + If --plugin is specified, only creates results for that plugin. + Otherwise, creates results for all pending plugins. 
+ + Exit codes: + 0: Success + 1: Failure + """ + from django.utils import timezone + + from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT + from archivebox.core.models import Snapshot, ArchiveResult + + is_tty = sys.stdout.isatty() + + # If snapshot_id provided directly, use that + if snapshot_id: + try: + snapshots = [Snapshot.objects.get(id=snapshot_id)] + except Snapshot.DoesNotExist: + rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr) + return 1 + else: + # Read from stdin + records = list(read_stdin()) + if not records: + rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + # Filter to only Snapshot records + snapshot_ids = [] + for record in records: + if record.get('type') == TYPE_SNAPSHOT: + if record.get('id'): + snapshot_ids.append(record['id']) + elif record.get('id'): + # Assume it's a snapshot ID if no type specified + snapshot_ids.append(record['id']) + + if not snapshot_ids: + rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr) + return 1 + + snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids)) + + if not snapshots: + rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr) + return 1 + + created_count = 0 + for snapshot in snapshots: + if plugin: + # Create for specific plugin only + result, created = ArchiveResult.objects.get_or_create( + snapshot=snapshot, + plugin=plugin, + defaults={ + 'status': status, + 'retry_at': timezone.now(), + } + ) + if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]: + # Reset for retry + result.status = status + result.retry_at = timezone.now() + result.save() + + if not is_tty: + write_record(result.to_json()) + created_count += 1 + else: + # Create all pending plugins + snapshot.create_pending_archiveresults() + for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED): + if not is_tty: + write_record(result.to_json()) + created_count += 1 + + rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# LIST +# ============================================================================= + +def list_archiveresults( + status: Optional[str] = None, + plugin: Optional[str] = None, + snapshot_id: Optional[str] = None, + limit: Optional[int] = None, +) -> int: + """ + List ArchiveResults as JSONL with optional filters. 
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.core.models import ArchiveResult + + is_tty = sys.stdout.isatty() + + queryset = ArchiveResult.objects.all().order_by('-start_ts') + + # Apply filters + filter_kwargs = { + 'status': status, + 'plugin': plugin, + 'snapshot_id': snapshot_id, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for result in queryset: + if is_tty: + status_color = { + 'queued': 'yellow', + 'started': 'blue', + 'succeeded': 'green', + 'failed': 'red', + 'skipped': 'dim', + 'backoff': 'magenta', + }.get(result.status, 'dim') + rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}') + else: + write_record(result.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr) + return 0 + + +# ============================================================================= +# UPDATE +# ============================================================================= + +def update_archiveresults( + status: Optional[str] = None, +) -> int: + """ + Update ArchiveResults from stdin JSONL. + + Reads ArchiveResult records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. + + Exit codes: + 0: Success + 1: No input or error + """ + from django.utils import timezone + + from archivebox.misc.jsonl import read_stdin, write_record + from archivebox.core.models import ArchiveResult + + is_tty = sys.stdout.isatty() + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + updated_count = 0 + for record in records: + result_id = record.get('id') + if not result_id: + continue + + try: + result = ArchiveResult.objects.get(id=result_id) + + # Apply updates from CLI flags + if status: + result.status = status + result.retry_at = timezone.now() + + result.save() + updated_count += 1 + + if not is_tty: + write_record(result.to_json()) + + except ArchiveResult.DoesNotExist: + rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr) + continue + + rprint(f'[green]Updated {updated_count} archive results[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + +def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete ArchiveResults from stdin JSONL. + + Requires --yes flag to confirm deletion. 
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.core.models import ArchiveResult + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + result_ids = [r.get('id') for r in records if r.get('id')] + + if not result_ids: + rprint('[yellow]No valid archive result IDs in input[/yellow]', file=sys.stderr) + return 1 + + results = ArchiveResult.objects.filter(id__in=result_ids) + count = results.count() + + if count == 0: + rprint('[yellow]No matching archive results found[/yellow]', file=sys.stderr) + return 0 + + if dry_run: + rprint(f'[yellow]Would delete {count} archive results (dry run)[/yellow]', file=sys.stderr) + for result in results[:10]: + rprint(f' [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}', file=sys.stderr) + if count > 10: + rprint(f' ... and {count - 10} more', file=sys.stderr) + return 0 + + if not yes: + rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = results.delete() + rprint(f'[green]Deleted {deleted_count} archive results[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage ArchiveResult records (plugin extraction results).""" + pass + + +@main.command('create') +@click.option('--snapshot-id', help='Snapshot ID to create results for') +@click.option('--plugin', '-p', help='Plugin name (e.g., screenshot, singlefile)') +@click.option('--status', '-s', default='queued', help='Initial status (default: queued)') +def create_cmd(snapshot_id: Optional[str], plugin: Optional[str], status: str): + """Create ArchiveResults for Snapshots from stdin JSONL.""" + sys.exit(create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status)) + + +@main.command('list') +@click.option('--status', '-s', help='Filter by status (queued, started, succeeded, failed, skipped)') +@click.option('--plugin', '-p', help='Filter by plugin name') +@click.option('--snapshot-id', help='Filter by snapshot ID') +@click.option('--limit', '-n', type=int, help='Limit number of results') +def list_cmd(status: Optional[str], plugin: Optional[str], + snapshot_id: Optional[str], limit: Optional[int]): + """List ArchiveResults as JSONL.""" + sys.exit(list_archiveresults( + status=status, + plugin=plugin, + snapshot_id=snapshot_id, + limit=limit, + )) + + +@main.command('update') +@click.option('--status', '-s', help='Set status') +def update_cmd(status: Optional[str]): + """Update ArchiveResults from stdin JSONL.""" + sys.exit(update_archiveresults(status=status)) + + +@main.command('delete') +@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') +@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +def delete_cmd(yes: bool, dry_run: bool): + """Delete ArchiveResults from stdin JSONL.""" + sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run)) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_binary.py b/archivebox/cli/archivebox_binary.py new file mode 100644 index 00000000..98ab33be --- /dev/null +++ b/archivebox/cli/archivebox_binary.py @@ -0,0 +1,304 @@ +#!/usr/bin/env python3 + +""" +archivebox binary [args...] 
[--filters] + +Manage Binary records (detected executables like chrome, wget, etc.). + +Actions: + create - Create/register a Binary + list - List Binaries as JSONL (with optional filters) + update - Update Binaries from stdin JSONL + delete - Delete Binaries from stdin JSONL + +Examples: + # List all binaries + archivebox binary list + + # List specific binary + archivebox binary list --name=chrome + + # List binaries with specific version + archivebox binary list --version__icontains=120 + + # Delete old binary entries + archivebox binary list --name=chrome | archivebox binary delete --yes +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox binary' + +import sys +from typing import Optional + +import rich_click as click +from rich import print as rprint + + +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + + if limit: + queryset = queryset[:limit] + + return queryset + + +# ============================================================================= +# CREATE +# ============================================================================= + +def create_binary( + name: str, + abspath: str, + version: str = '', +) -> int: + """ + Create/register a Binary. + + Exit codes: + 0: Success + 1: Failure + """ + from archivebox.misc.jsonl import write_record + from archivebox.machine.models import Binary + + is_tty = sys.stdout.isatty() + + if not name or not abspath: + rprint('[red]Both --name and --abspath are required[/red]', file=sys.stderr) + return 1 + + try: + binary, created = Binary.objects.get_or_create( + name=name, + abspath=abspath, + defaults={'version': version} + ) + + if not is_tty: + write_record(binary.to_json()) + + if created: + rprint(f'[green]Created binary: {name} at {abspath}[/green]', file=sys.stderr) + else: + rprint(f'[dim]Binary already exists: {name} at {abspath}[/dim]', file=sys.stderr) + + return 0 + + except Exception as e: + rprint(f'[red]Error creating binary: {e}[/red]', file=sys.stderr) + return 1 + + +# ============================================================================= +# LIST +# ============================================================================= + +def list_binaries( + name: Optional[str] = None, + abspath__icontains: Optional[str] = None, + version__icontains: Optional[str] = None, + limit: Optional[int] = None, +) -> int: + """ + List Binaries as JSONL with optional filters. 
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.machine.models import Binary + + is_tty = sys.stdout.isatty() + + queryset = Binary.objects.all().order_by('name', '-loaded_at') + + # Apply filters + filter_kwargs = { + 'name': name, + 'abspath__icontains': abspath__icontains, + 'version__icontains': version__icontains, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for binary in queryset: + if is_tty: + rprint(f'[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}') + else: + write_record(binary.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} binaries[/dim]', file=sys.stderr) + return 0 + + +# ============================================================================= +# UPDATE +# ============================================================================= + +def update_binaries( + version: Optional[str] = None, + abspath: Optional[str] = None, +) -> int: + """ + Update Binaries from stdin JSONL. + + Reads Binary records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. + + Exit codes: + 0: Success + 1: No input or error + """ + from archivebox.misc.jsonl import read_stdin, write_record + from archivebox.machine.models import Binary + + is_tty = sys.stdout.isatty() + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + updated_count = 0 + for record in records: + binary_id = record.get('id') + if not binary_id: + continue + + try: + binary = Binary.objects.get(id=binary_id) + + # Apply updates from CLI flags + if version: + binary.version = version + if abspath: + binary.abspath = abspath + + binary.save() + updated_count += 1 + + if not is_tty: + write_record(binary.to_json()) + + except Binary.DoesNotExist: + rprint(f'[yellow]Binary not found: {binary_id}[/yellow]', file=sys.stderr) + continue + + rprint(f'[green]Updated {updated_count} binaries[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + +def delete_binaries(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete Binaries from stdin JSONL. + + Requires --yes flag to confirm deletion. 
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.machine.models import Binary + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + binary_ids = [r.get('id') for r in records if r.get('id')] + + if not binary_ids: + rprint('[yellow]No valid binary IDs in input[/yellow]', file=sys.stderr) + return 1 + + binaries = Binary.objects.filter(id__in=binary_ids) + count = binaries.count() + + if count == 0: + rprint('[yellow]No matching binaries found[/yellow]', file=sys.stderr) + return 0 + + if dry_run: + rprint(f'[yellow]Would delete {count} binaries (dry run)[/yellow]', file=sys.stderr) + for binary in binaries: + rprint(f' {binary.name} {binary.abspath}', file=sys.stderr) + return 0 + + if not yes: + rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = binaries.delete() + rprint(f'[green]Deleted {deleted_count} binaries[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage Binary records (detected executables).""" + pass + + +@main.command('create') +@click.option('--name', '-n', required=True, help='Binary name (e.g., chrome, wget)') +@click.option('--abspath', '-p', required=True, help='Absolute path to binary') +@click.option('--version', '-v', default='', help='Binary version') +def create_cmd(name: str, abspath: str, version: str): + """Create/register a Binary.""" + sys.exit(create_binary(name=name, abspath=abspath, version=version)) + + +@main.command('list') +@click.option('--name', '-n', help='Filter by name') +@click.option('--abspath__icontains', help='Filter by path contains') +@click.option('--version__icontains', help='Filter by version contains') +@click.option('--limit', type=int, help='Limit number of results') +def list_cmd(name: Optional[str], abspath__icontains: Optional[str], + version__icontains: Optional[str], limit: Optional[int]): + """List Binaries as JSONL.""" + sys.exit(list_binaries( + name=name, + abspath__icontains=abspath__icontains, + version__icontains=version__icontains, + limit=limit, + )) + + +@main.command('update') +@click.option('--version', '-v', help='Set version') +@click.option('--abspath', '-p', help='Set path') +def update_cmd(version: Optional[str], abspath: Optional[str]): + """Update Binaries from stdin JSONL.""" + sys.exit(update_binaries(version=version, abspath=abspath)) + + +@main.command('delete') +@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') +@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +def delete_cmd(yes: bool, dry_run: bool): + """Delete Binaries from stdin JSONL.""" + sys.exit(delete_binaries(yes=yes, dry_run=dry_run)) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_crawl.py b/archivebox/cli/archivebox_crawl.py index d8c3c7ad..d0621fcc 100644 --- a/archivebox/cli/archivebox_crawl.py +++ b/archivebox/cli/archivebox_crawl.py @@ -1,108 +1,134 @@ #!/usr/bin/env python3 """ -archivebox crawl [urls...] [--depth=N] [--tag=TAG] +archivebox crawl [args...] [--filters] -Create Crawl jobs from URLs. Accepts URLs as arguments, from stdin, or via JSONL. 
-Does NOT immediately start the crawl - pipe to `archivebox snapshot` to process. +Manage Crawl records. -Input formats: - - Plain URLs (one per line) - - JSONL: {"url": "...", "depth": 1, "tags": "..."} - -Output (JSONL): - {"type": "Crawl", "id": "...", "urls": "...", "status": "queued", ...} +Actions: + create - Create Crawl jobs from URLs + list - List Crawls as JSONL (with optional filters) + update - Update Crawls from stdin JSONL + delete - Delete Crawls from stdin JSONL Examples: - # Create a crawl job - archivebox crawl https://example.com + # Create + archivebox crawl create https://example.com https://foo.com --depth=1 + archivebox crawl create --tag=news https://example.com - # Create crawl with depth - archivebox crawl --depth=1 https://example.com + # List with filters + archivebox crawl list --status=queued + archivebox crawl list --urls__icontains=example.com - # Full pipeline: create crawl, create snapshots, run extractors - archivebox crawl https://example.com | archivebox snapshot | archivebox extract + # Update + archivebox crawl list --status=started | archivebox crawl update --status=queued - # Process existing Crawl by ID (runs the crawl state machine) - archivebox crawl 01234567-89ab-cdef-0123-456789abcdef + # Delete + archivebox crawl list --urls__icontains=spam.com | archivebox crawl delete --yes + + # Full pipeline + archivebox crawl create https://example.com | archivebox snapshot create | archivebox run """ __package__ = 'archivebox.cli' __command__ = 'archivebox crawl' import sys -from typing import Optional +from typing import Optional, Iterable import rich_click as click +from rich import print as rprint -def create_crawls( - records: list, +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + + if limit: + queryset = queryset[:limit] + + return queryset + + +# ============================================================================= +# CREATE +# ============================================================================= + +def create_crawl( + urls: Iterable[str], depth: int = 0, tag: str = '', + status: str = 'queued', created_by_id: Optional[int] = None, ) -> int: """ - Create a single Crawl job from all input URLs. + Create a Crawl job from URLs. - Takes pre-read records, creates one Crawl with all URLs, outputs JSONL. - Does NOT start the crawl - just creates the job in QUEUED state. + Takes URLs as args or stdin, creates one Crawl with all URLs, outputs JSONL. Exit codes: 0: Success 1: Failure """ - from rich import print as rprint - - from archivebox.misc.jsonl import write_record + from archivebox.misc.jsonl import read_args_or_stdin, write_record from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.crawls.models import Crawl created_by_id = created_by_id or get_or_create_system_user_pk() is_tty = sys.stdout.isatty() + # Collect all input records + records = list(read_args_or_stdin(urls)) + if not records: rprint('[yellow]No URLs provided. 
Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr) return 1 # Collect all URLs into a single newline-separated string - urls = [] + url_list = [] for record in records: url = record.get('url') if url: - urls.append(url) + url_list.append(url) - if not urls: + if not url_list: rprint('[red]No valid URLs found[/red]', file=sys.stderr) return 1 try: # Build crawl record with all URLs as newline-separated string crawl_record = { - 'urls': '\n'.join(urls), + 'urls': '\n'.join(url_list), 'max_depth': depth, 'tags_str': tag, + 'status': status, 'label': '', } - crawl = Crawl.from_jsonl(crawl_record, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json(crawl_record, overrides={'created_by_id': created_by_id}) if not crawl: rprint('[red]Failed to create crawl[/red]', file=sys.stderr) return 1 # Output JSONL record (only when piped) if not is_tty: - write_record(crawl.to_jsonl()) + write_record(crawl.to_json()) - rprint(f'[green]Created crawl with {len(urls)} URLs[/green]', file=sys.stderr) + rprint(f'[green]Created crawl with {len(url_list)} URLs[/green]', file=sys.stderr) # If TTY, show human-readable output if is_tty: rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr) - for url in urls[:5]: # Show first 5 URLs + for url in url_list[:5]: # Show first 5 URLs rprint(f' {url[:70]}', file=sys.stderr) - if len(urls) > 5: - rprint(f' ... and {len(urls) - 5} more', file=sys.stderr) + if len(url_list) > 5: + rprint(f' ... and {len(url_list) - 5} more', file=sys.stderr) return 0 @@ -111,81 +137,217 @@ def create_crawls( return 1 -def process_crawl_by_id(crawl_id: str) -> int: - """ - Process a single Crawl by ID (used by workers). +# ============================================================================= +# LIST +# ============================================================================= - Triggers the Crawl's state machine tick() which will: - - Transition from queued -> started (creates root snapshot) - - Transition from started -> sealed (when all snapshots done) +def list_crawls( + status: Optional[str] = None, + urls__icontains: Optional[str] = None, + max_depth: Optional[int] = None, + limit: Optional[int] = None, +) -> int: """ - from rich import print as rprint + List Crawls as JSONL with optional filters. 
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record from archivebox.crawls.models import Crawl - try: - crawl = Crawl.objects.get(id=crawl_id) - except Crawl.DoesNotExist: - rprint(f'[red]Crawl {crawl_id} not found[/red]', file=sys.stderr) - return 1 + is_tty = sys.stdout.isatty() - rprint(f'[blue]Processing Crawl {crawl.id} (status={crawl.status})[/blue]', file=sys.stderr) + queryset = Crawl.objects.all().order_by('-created_at') - try: - crawl.sm.tick() - crawl.refresh_from_db() - rprint(f'[green]Crawl complete (status={crawl.status})[/green]', file=sys.stderr) - return 0 - except Exception as e: - rprint(f'[red]Crawl error: {type(e).__name__}: {e}[/red]', file=sys.stderr) - return 1 + # Apply filters + filter_kwargs = { + 'status': status, + 'urls__icontains': urls__icontains, + 'max_depth': max_depth, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for crawl in queryset: + if is_tty: + status_color = { + 'queued': 'yellow', + 'started': 'blue', + 'sealed': 'green', + }.get(crawl.status, 'dim') + url_preview = crawl.urls[:50].replace('\n', ' ') + rprint(f'[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...') + else: + write_record(crawl.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} crawls[/dim]', file=sys.stderr) + return 0 -def is_crawl_id(value: str) -> bool: - """Check if value looks like a Crawl UUID.""" - import re - uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I) - if not uuid_pattern.match(value): - return False - # Verify it's actually a Crawl (not a Snapshot or other object) +# ============================================================================= +# UPDATE +# ============================================================================= + +def update_crawls( + status: Optional[str] = None, + max_depth: Optional[int] = None, +) -> int: + """ + Update Crawls from stdin JSONL. + + Reads Crawl records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. + + Exit codes: + 0: Success + 1: No input or error + """ + from django.utils import timezone + + from archivebox.misc.jsonl import read_stdin, write_record from archivebox.crawls.models import Crawl - return Crawl.objects.filter(id=value).exists() + is_tty = sys.stdout.isatty() -@click.command() -@click.option('--depth', '-d', type=int, default=0, help='Max depth for recursive crawling (default: 0, no recursion)') -@click.option('--tag', '-t', default='', help='Comma-separated tags to add to snapshots') -@click.argument('args', nargs=-1) -def main(depth: int, tag: str, args: tuple): - """Create Crawl jobs from URLs, or process existing Crawls by ID""" - from archivebox.misc.jsonl import read_args_or_stdin - - # Read all input - records = list(read_args_or_stdin(args)) - + records = list(read_stdin()) if not records: - from rich import print as rprint - rprint('[yellow]No URLs or Crawl IDs provided. 
Pass as arguments or via stdin.[/yellow]', file=sys.stderr) - sys.exit(1) + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 - # Check if input looks like existing Crawl IDs to process - # If ALL inputs are Crawl UUIDs, process them - all_are_crawl_ids = all( - is_crawl_id(r.get('id') or r.get('url', '')) - for r in records - ) + updated_count = 0 + for record in records: + crawl_id = record.get('id') + if not crawl_id: + continue - if all_are_crawl_ids: - # Process existing Crawls by ID - exit_code = 0 - for record in records: - crawl_id = record.get('id') or record.get('url') - result = process_crawl_by_id(crawl_id) - if result != 0: - exit_code = result - sys.exit(exit_code) - else: - # Default behavior: create Crawl jobs from URLs - sys.exit(create_crawls(records, depth=depth, tag=tag)) + try: + crawl = Crawl.objects.get(id=crawl_id) + + # Apply updates from CLI flags + if status: + crawl.status = status + crawl.retry_at = timezone.now() + if max_depth is not None: + crawl.max_depth = max_depth + + crawl.save() + updated_count += 1 + + if not is_tty: + write_record(crawl.to_json()) + + except Crawl.DoesNotExist: + rprint(f'[yellow]Crawl not found: {crawl_id}[/yellow]', file=sys.stderr) + continue + + rprint(f'[green]Updated {updated_count} crawls[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + +def delete_crawls(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete Crawls from stdin JSONL. + + Requires --yes flag to confirm deletion. + + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.crawls.models import Crawl + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + crawl_ids = [r.get('id') for r in records if r.get('id')] + + if not crawl_ids: + rprint('[yellow]No valid crawl IDs in input[/yellow]', file=sys.stderr) + return 1 + + crawls = Crawl.objects.filter(id__in=crawl_ids) + count = crawls.count() + + if count == 0: + rprint('[yellow]No matching crawls found[/yellow]', file=sys.stderr) + return 0 + + if dry_run: + rprint(f'[yellow]Would delete {count} crawls (dry run)[/yellow]', file=sys.stderr) + for crawl in crawls: + url_preview = crawl.urls[:50].replace('\n', ' ') + rprint(f' [dim]{crawl.id}[/dim] {url_preview}...', file=sys.stderr) + return 0 + + if not yes: + rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = crawls.delete() + rprint(f'[green]Deleted {deleted_count} crawls[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage Crawl records.""" + pass + + +@main.command('create') +@click.argument('urls', nargs=-1) +@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)') +@click.option('--tag', '-t', default='', help='Comma-separated tags to add') +@click.option('--status', '-s', default='queued', help='Initial status (default: queued)') +def create_cmd(urls: tuple, depth: int, tag: str, status: str): + """Create a Crawl job from URLs or stdin.""" + sys.exit(create_crawl(urls, depth=depth, 
tag=tag, status=status)) + + +@main.command('list') +@click.option('--status', '-s', help='Filter by status (queued, started, sealed)') +@click.option('--urls__icontains', help='Filter by URLs contains') +@click.option('--max-depth', type=int, help='Filter by max depth') +@click.option('--limit', '-n', type=int, help='Limit number of results') +def list_cmd(status: Optional[str], urls__icontains: Optional[str], + max_depth: Optional[int], limit: Optional[int]): + """List Crawls as JSONL.""" + sys.exit(list_crawls( + status=status, + urls__icontains=urls__icontains, + max_depth=max_depth, + limit=limit, + )) + + +@main.command('update') +@click.option('--status', '-s', help='Set status') +@click.option('--max-depth', type=int, help='Set max depth') +def update_cmd(status: Optional[str], max_depth: Optional[int]): + """Update Crawls from stdin JSONL.""" + sys.exit(update_crawls(status=status, max_depth=max_depth)) + + +@main.command('delete') +@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') +@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +def delete_cmd(yes: bool, dry_run: bool): + """Delete Crawls from stdin JSONL.""" + sys.exit(delete_crawls(yes=yes, dry_run=dry_run)) if __name__ == '__main__': diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py deleted file mode 100644 index 7dc043ae..00000000 --- a/archivebox/cli/archivebox_extract.py +++ /dev/null @@ -1,265 +0,0 @@ -#!/usr/bin/env python3 - -""" -archivebox extract [snapshot_ids...] [--plugins=NAMES] - -Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL. - -Input formats: - - Snapshot UUIDs (one per line) - - JSONL: {"type": "Snapshot", "id": "...", "url": "..."} - - JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."} - -Output (JSONL): - {"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."} - -Examples: - # Extract specific snapshot - archivebox extract 01234567-89ab-cdef-0123-456789abcdef - - # Pipe from snapshot command - archivebox snapshot https://example.com | archivebox extract - - # Run specific plugins only - archivebox extract --plugins=screenshot,singlefile 01234567-89ab-cdef-0123-456789abcdef - - # Chain commands - archivebox crawl https://example.com | archivebox snapshot | archivebox extract -""" - -__package__ = 'archivebox.cli' -__command__ = 'archivebox extract' - -import sys -from typing import Optional, List - -import rich_click as click - - -def process_archiveresult_by_id(archiveresult_id: str) -> int: - """ - Run extraction for a single ArchiveResult by ID (used by workers). - - Triggers the ArchiveResult's state machine tick() to run the extractor plugin. 
- """ - from rich import print as rprint - from archivebox.core.models import ArchiveResult - - try: - archiveresult = ArchiveResult.objects.get(id=archiveresult_id) - except ArchiveResult.DoesNotExist: - rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr) - return 1 - - rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr) - - try: - # Trigger state machine tick - this runs the actual extraction - archiveresult.sm.tick() - archiveresult.refresh_from_db() - - if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED: - print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]') - return 0 - elif archiveresult.status == ArchiveResult.StatusChoices.FAILED: - print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr) - return 1 - else: - # Still in progress or backoff - not a failure - print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]') - return 0 - - except Exception as e: - print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr) - return 1 - - -def run_plugins( - args: tuple, - plugins: str = '', - wait: bool = True, -) -> int: - """ - Run plugins on Snapshots from input. - - Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL. - - Exit codes: - 0: Success - 1: Failure - """ - from rich import print as rprint - from django.utils import timezone - - from archivebox.misc.jsonl import ( - read_args_or_stdin, write_record, - TYPE_SNAPSHOT, TYPE_ARCHIVERESULT - ) - from archivebox.core.models import Snapshot, ArchiveResult - from archivebox.workers.orchestrator import Orchestrator - - is_tty = sys.stdout.isatty() - - # Parse comma-separated plugins list once (reused in creation and filtering) - plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else [] - - # Collect all input records - records = list(read_args_or_stdin(args)) - - if not records: - rprint('[yellow]No snapshots provided. 
Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr) - return 1 - - # Gather snapshot IDs to process - snapshot_ids = set() - for record in records: - record_type = record.get('type') - - if record_type == TYPE_SNAPSHOT: - snapshot_id = record.get('id') - if snapshot_id: - snapshot_ids.add(snapshot_id) - elif record.get('url'): - # Look up by URL (get most recent if multiple exist) - snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first() - if snap: - snapshot_ids.add(str(snap.id)) - else: - rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr) - - elif record_type == TYPE_ARCHIVERESULT: - snapshot_id = record.get('snapshot_id') - if snapshot_id: - snapshot_ids.add(snapshot_id) - - elif 'id' in record: - # Assume it's a snapshot ID - snapshot_ids.add(record['id']) - - if not snapshot_ids: - rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr) - return 1 - - # Get snapshots and ensure they have pending ArchiveResults - processed_count = 0 - for snapshot_id in snapshot_ids: - try: - snapshot = Snapshot.objects.get(id=snapshot_id) - except Snapshot.DoesNotExist: - rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr) - continue - - # Create pending ArchiveResults if needed - if plugins_list: - # Only create for specific plugins - for plugin_name in plugins_list: - result, created = ArchiveResult.objects.get_or_create( - snapshot=snapshot, - plugin=plugin_name, - defaults={ - 'status': ArchiveResult.StatusChoices.QUEUED, - 'retry_at': timezone.now(), - } - ) - if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]: - # Reset for retry - result.status = ArchiveResult.StatusChoices.QUEUED - result.retry_at = timezone.now() - result.save() - else: - # Create all pending plugins - snapshot.create_pending_archiveresults() - - # Reset snapshot status to allow processing - if snapshot.status == Snapshot.StatusChoices.SEALED: - snapshot.status = Snapshot.StatusChoices.STARTED - snapshot.retry_at = timezone.now() - snapshot.save() - - processed_count += 1 - - if processed_count == 0: - rprint('[red]No snapshots to process[/red]', file=sys.stderr) - return 1 - - rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr) - - # Run orchestrator if --wait (default) - if wait: - rprint('[blue]Running plugins...[/blue]', file=sys.stderr) - orchestrator = Orchestrator(exit_on_idle=True) - orchestrator.runloop() - - # Output results as JSONL (when piped) or human-readable (when TTY) - for snapshot_id in snapshot_ids: - try: - snapshot = Snapshot.objects.get(id=snapshot_id) - results = snapshot.archiveresult_set.all() - if plugins_list: - results = results.filter(plugin__in=plugins_list) - - for result in results: - if is_tty: - status_color = { - 'succeeded': 'green', - 'failed': 'red', - 'skipped': 'yellow', - }.get(result.status, 'dim') - rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr) - else: - write_record(result.to_jsonl()) - except Snapshot.DoesNotExist: - continue - - return 0 - - -def is_archiveresult_id(value: str) -> bool: - """Check if value looks like an ArchiveResult UUID.""" - import re - uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I) - if not uuid_pattern.match(value): - return False - # Verify it's actually an ArchiveResult (not a Snapshot or other object) - 
from archivebox.core.models import ArchiveResult - return ArchiveResult.objects.filter(id=value).exists() - - -@click.command() -@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)') -@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)') -@click.argument('args', nargs=-1) -def main(plugins: str, wait: bool, args: tuple): - """Run plugins on Snapshots, or process existing ArchiveResults by ID""" - from archivebox.misc.jsonl import read_args_or_stdin - - # Read all input - records = list(read_args_or_stdin(args)) - - if not records: - from rich import print as rprint - rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr) - sys.exit(1) - - # Check if input looks like existing ArchiveResult IDs to process - all_are_archiveresult_ids = all( - is_archiveresult_id(r.get('id') or r.get('url', '')) - for r in records - ) - - if all_are_archiveresult_ids: - # Process existing ArchiveResults by ID - exit_code = 0 - for record in records: - archiveresult_id = record.get('id') or record.get('url') - result = process_archiveresult_by_id(archiveresult_id) - if result != 0: - exit_code = result - sys.exit(exit_code) - else: - # Default behavior: run plugins on Snapshots from input - sys.exit(run_plugins(args, plugins=plugins, wait=wait)) - - -if __name__ == '__main__': - main() diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py index ed67c77d..5ef6c9ca 100755 --- a/archivebox/cli/archivebox_init.py +++ b/archivebox/cli/archivebox_init.py @@ -127,7 +127,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None: if pending_links: for link_dict in pending_links.values(): - Snapshot.from_jsonl(link_dict) + Snapshot.from_json(link_dict) # Hint for orphaned snapshot directories print() diff --git a/archivebox/cli/archivebox_machine.py b/archivebox/cli/archivebox_machine.py new file mode 100644 index 00000000..e63eac41 --- /dev/null +++ b/archivebox/cli/archivebox_machine.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 + +""" +archivebox machine [--filters] + +Manage Machine records (system-managed, mostly read-only). + +Machine records track the host machines where ArchiveBox runs. +They are created automatically by the system and are primarily for debugging. 
+ +Actions: + list - List Machines as JSONL (with optional filters) + +Examples: + # List all machines + archivebox machine list + + # List machines by hostname + archivebox machine list --hostname__icontains=myserver +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox machine' + +import sys +from typing import Optional + +import rich_click as click +from rich import print as rprint + + +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + + if limit: + queryset = queryset[:limit] + + return queryset + + +# ============================================================================= +# LIST +# ============================================================================= + +def list_machines( + hostname__icontains: Optional[str] = None, + os_platform: Optional[str] = None, + limit: Optional[int] = None, +) -> int: + """ + List Machines as JSONL with optional filters. + + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.machine.models import Machine + + is_tty = sys.stdout.isatty() + + queryset = Machine.objects.all().order_by('-created_at') + + # Apply filters + filter_kwargs = { + 'hostname__icontains': hostname__icontains, + 'os_platform': os_platform, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for machine in queryset: + if is_tty: + rprint(f'[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}') + else: + write_record(machine.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} machines[/dim]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage Machine records (read-only, system-managed).""" + pass + + +@main.command('list') +@click.option('--hostname__icontains', help='Filter by hostname contains') +@click.option('--os-platform', help='Filter by OS platform') +@click.option('--limit', '-n', type=int, help='Limit number of results') +def list_cmd(hostname__icontains: Optional[str], os_platform: Optional[str], limit: Optional[int]): + """List Machines as JSONL.""" + sys.exit(list_machines( + hostname__icontains=hostname__icontains, + os_platform=os_platform, + limit=limit, + )) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_orchestrator.py b/archivebox/cli/archivebox_orchestrator.py deleted file mode 100644 index 4b272727..00000000 --- a/archivebox/cli/archivebox_orchestrator.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python3 - -""" -archivebox orchestrator [--daemon] - -Start the orchestrator process that manages workers. - -The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult) -and lazily spawns worker processes when there is work to be done. -""" - -__package__ = 'archivebox.cli' -__command__ = 'archivebox orchestrator' - -import sys - -import rich_click as click - -from archivebox.misc.util import docstring - - -def orchestrator(daemon: bool = False, watch: bool = False) -> int: - """ - Start the orchestrator process. - - The orchestrator: - 1. 
Polls each model queue (Crawl, Snapshot, ArchiveResult) - 2. Spawns worker processes when there is work to do - 3. Monitors worker health and restarts failed workers - 4. Exits when all queues are empty (unless --daemon) - - Args: - daemon: Run forever (don't exit when idle) - watch: Just watch the queues without spawning workers (for debugging) - - Exit codes: - 0: All work completed successfully - 1: Error occurred - """ - from archivebox.workers.orchestrator import Orchestrator - - if Orchestrator.is_running(): - print('[yellow]Orchestrator is already running[/yellow]') - return 0 - - try: - orchestrator_instance = Orchestrator(exit_on_idle=not daemon) - orchestrator_instance.runloop() - return 0 - except KeyboardInterrupt: - return 0 - except Exception as e: - print(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr) - return 1 - - -@click.command() -@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)") -@click.option('--watch', '-w', is_flag=True, help="Watch queues without spawning workers") -@docstring(orchestrator.__doc__) -def main(daemon: bool, watch: bool): - """Start the ArchiveBox orchestrator process""" - sys.exit(orchestrator(daemon=daemon, watch=watch)) - - -if __name__ == '__main__': - main() diff --git a/archivebox/cli/archivebox_process.py b/archivebox/cli/archivebox_process.py new file mode 100644 index 00000000..9784650b --- /dev/null +++ b/archivebox/cli/archivebox_process.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 + +""" +archivebox process [--filters] + +Manage Process records (system-managed, mostly read-only). + +Process records track executions of binaries during extraction. +They are created automatically by the system and are primarily for debugging. + +Actions: + list - List Processes as JSONL (with optional filters) + +Examples: + # List all processes + archivebox process list + + # List processes by binary + archivebox process list --binary-name=chrome + + # List recent processes + archivebox process list --limit=10 +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox process' + +import sys +from typing import Optional + +import rich_click as click +from rich import print as rprint + + +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + + if limit: + queryset = queryset[:limit] + + return queryset + + +# ============================================================================= +# LIST +# ============================================================================= + +def list_processes( + binary_name: Optional[str] = None, + machine_id: Optional[str] = None, + limit: Optional[int] = None, +) -> int: + """ + List Processes as JSONL with optional filters. 
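The `--binary-name` and `--machine-id` flags translate directly into Django ORM lookups. Roughly, `archivebox process list --binary-name=chrome --limit=10` boils down to the query below (a sketch, assuming a configured Django environment for the collection):

```python
# Rough ORM equivalent of: archivebox process list --binary-name=chrome --limit=10
# (a sketch; assumes a configured Django environment for the collection)
from archivebox.machine.models import Process

queryset = (
    Process.objects
    .select_related('binary', 'machine')    # same query optimization the CLI uses
    .filter(binary__name='chrome')          # --binary-name=chrome
    .order_by('-start_ts')[:10]             # newest first, limited to 10
)
for process in queryset:
    name = process.binary.name if process.binary else 'unknown'
    print(process.returncode, name, process.id)
```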
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.machine.models import Process + + is_tty = sys.stdout.isatty() + + queryset = Process.objects.all().select_related('binary', 'machine').order_by('-start_ts') + + # Apply filters + filter_kwargs = {} + if binary_name: + filter_kwargs['binary__name'] = binary_name + if machine_id: + filter_kwargs['machine_id'] = machine_id + + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for process in queryset: + if is_tty: + binary_name_str = process.binary.name if process.binary else 'unknown' + exit_code = process.returncode if process.returncode is not None else '?' + status_color = 'green' if process.returncode == 0 else 'red' if process.returncode else 'yellow' + rprint(f'[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]') + else: + write_record(process.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} processes[/dim]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage Process records (read-only, system-managed).""" + pass + + +@main.command('list') +@click.option('--binary-name', '-b', help='Filter by binary name') +@click.option('--machine-id', '-m', help='Filter by machine ID') +@click.option('--limit', '-n', type=int, help='Limit number of results') +def list_cmd(binary_name: Optional[str], machine_id: Optional[str], limit: Optional[int]): + """List Processes as JSONL.""" + sys.exit(list_processes( + binary_name=binary_name, + machine_id=machine_id, + limit=limit, + )) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py deleted file mode 100644 index 374b60d3..00000000 --- a/archivebox/cli/archivebox_remove.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python3 - -__package__ = 'archivebox.cli' -__command__ = 'archivebox remove' - -import shutil -from pathlib import Path -from typing import Iterable - -import rich_click as click - -from django.db.models import QuerySet - -from archivebox.config import DATA_DIR -from archivebox.config.django import setup_django -from archivebox.misc.util import enforce_types, docstring -from archivebox.misc.checks import check_data_folder -from archivebox.misc.logging_util import ( - log_list_started, - log_list_finished, - log_removal_started, - log_removal_finished, - TimedProgress, -) - - -@enforce_types -def remove(filter_patterns: Iterable[str]=(), - filter_type: str='exact', - snapshots: QuerySet | None=None, - after: float | None=None, - before: float | None=None, - yes: bool=False, - delete: bool=False, - out_dir: Path=DATA_DIR) -> QuerySet: - """Remove the specified URLs from the archive""" - - setup_django() - check_data_folder() - - from archivebox.cli.archivebox_search import get_snapshots - - log_list_started(filter_patterns, filter_type) - timer = TimedProgress(360, prefix=' ') - try: - snapshots = get_snapshots( - snapshots=snapshots, - filter_patterns=list(filter_patterns) if filter_patterns else None, - filter_type=filter_type, - after=after, - before=before, - ) - finally: - timer.end() - - if not snapshots.exists(): - log_removal_finished(0, 0) - raise SystemExit(1) - - log_list_finished(snapshots) - log_removal_started(snapshots, yes=yes, 
delete=delete) - - timer = TimedProgress(360, prefix=' ') - try: - for snapshot in snapshots: - if delete: - shutil.rmtree(snapshot.output_dir, ignore_errors=True) - finally: - timer.end() - - to_remove = snapshots.count() - - from archivebox.search import flush_search_index - from archivebox.core.models import Snapshot - - flush_search_index(snapshots=snapshots) - snapshots.delete() - all_snapshots = Snapshot.objects.all() - log_removal_finished(all_snapshots.count(), to_remove) - - return all_snapshots - - -@click.command() -@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm') -@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index') -@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp') -@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp') -@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs') -@click.argument('filter_patterns', nargs=-1) -@docstring(remove.__doc__) -def main(**kwargs): - """Remove the specified URLs from the archive""" - remove(**kwargs) - - -if __name__ == '__main__': - main() diff --git a/archivebox/cli/archivebox_run.py b/archivebox/cli/archivebox_run.py new file mode 100644 index 00000000..6efd9018 --- /dev/null +++ b/archivebox/cli/archivebox_run.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 + +""" +archivebox run [--daemon] + +Unified command for processing queued work. + +Modes: + - With stdin JSONL: Process piped records, exit when complete + - Without stdin (TTY): Run orchestrator in foreground until killed + +Examples: + # Run orchestrator in foreground (replaces `archivebox orchestrator`) + archivebox run + + # Run as daemon (don't exit on idle) + archivebox run --daemon + + # Process specific records (pipe any JSONL type, exits when done) + archivebox snapshot list --status=queued | archivebox run + archivebox archiveresult list --status=failed | archivebox run + archivebox crawl list --status=queued | archivebox run + + # Mixed types work too + cat mixed_records.jsonl | archivebox run +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox run' + +import sys + +import rich_click as click +from rich import print as rprint + + +def process_stdin_records() -> int: + """ + Process JSONL records from stdin. + + Reads records, queues them for processing, then runs orchestrator until complete. + Handles any record type: Crawl, Snapshot, ArchiveResult, etc. + + Returns exit code (0 = success, 1 = error). 
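The `read_stdin()` helper referenced here lives in `archivebox.misc.jsonl`; conceptually it yields one parsed record per non-empty input line, along the lines of this simplified sketch (the real helper may also accept bare URLs/IDs and apply more validation):

```python
# Simplified sketch of stdin JSONL parsing in the spirit of archivebox.misc.jsonl.read_stdin
# (illustrative only; the real helper may handle bare URLs/IDs and other edge cases).
import json
import sys
from typing import Iterator

def read_stdin_sketch() -> Iterator[dict]:
    if sys.stdin.isatty():
        return                          # nothing piped in
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        yield json.loads(line)          # e.g. {"type": "Snapshot", "id": "...", "status": "queued"}

if __name__ == '__main__':
    for record in read_stdin_sketch():
        print(record.get('type'), record.get('id'))
```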
+ """ + from django.utils import timezone + + from archivebox.misc.jsonl import read_stdin, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT + from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.crawls.models import Crawl + from archivebox.workers.orchestrator import Orchestrator + + records = list(read_stdin()) + + if not records: + return 0 # Nothing to process + + queued_count = 0 + + for record in records: + record_type = record.get('type') + record_id = record.get('id') + + if not record_id: + continue + + try: + if record_type == TYPE_CRAWL: + crawl = Crawl.objects.get(id=record_id) + if crawl.status in [Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]: + crawl.retry_at = timezone.now() + crawl.save() + queued_count += 1 + + elif record_type == TYPE_SNAPSHOT: + snapshot = Snapshot.objects.get(id=record_id) + if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]: + snapshot.retry_at = timezone.now() + snapshot.save() + queued_count += 1 + + elif record_type == TYPE_ARCHIVERESULT: + archiveresult = ArchiveResult.objects.get(id=record_id) + if archiveresult.status in [ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED, ArchiveResult.StatusChoices.BACKOFF]: + archiveresult.retry_at = timezone.now() + archiveresult.save() + queued_count += 1 + + except (Crawl.DoesNotExist, Snapshot.DoesNotExist, ArchiveResult.DoesNotExist): + rprint(f'[yellow]Record not found: {record_type} {record_id}[/yellow]', file=sys.stderr) + continue + + if queued_count == 0: + rprint('[yellow]No records to process[/yellow]', file=sys.stderr) + return 0 + + rprint(f'[blue]Processing {queued_count} records...[/blue]', file=sys.stderr) + + # Run orchestrator until all queued work is done + orchestrator = Orchestrator(exit_on_idle=True) + orchestrator.runloop() + + return 0 + + +def run_orchestrator(daemon: bool = False) -> int: + """ + Run the orchestrator process. + + The orchestrator: + 1. Polls each model queue (Crawl, Snapshot, ArchiveResult) + 2. Spawns worker processes when there is work to do + 3. Monitors worker health and restarts failed workers + 4. Exits when all queues are empty (unless --daemon) + + Args: + daemon: Run forever (don't exit when idle) + + Returns exit code (0 = success, 1 = error). + """ + from archivebox.workers.orchestrator import Orchestrator + + if Orchestrator.is_running(): + rprint('[yellow]Orchestrator is already running[/yellow]', file=sys.stderr) + return 0 + + try: + orchestrator = Orchestrator(exit_on_idle=not daemon) + orchestrator.runloop() + return 0 + except KeyboardInterrupt: + return 0 + except Exception as e: + rprint(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr) + return 1 + + +@click.command() +@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)") +def main(daemon: bool): + """ + Process queued work. + + When stdin is piped: Process those specific records and exit. + When run standalone: Run orchestrator in foreground. 
+ """ + # Check if stdin has data (non-TTY means piped input) + if not sys.stdin.isatty(): + sys.exit(process_stdin_records()) + else: + sys.exit(run_orchestrator(daemon=daemon)) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_search.py b/archivebox/cli/archivebox_search.py deleted file mode 100644 index 055e952d..00000000 --- a/archivebox/cli/archivebox_search.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 - -__package__ = 'archivebox.cli' -__command__ = 'archivebox search' - -from pathlib import Path -from typing import Optional, List, Any - -import rich_click as click -from rich import print - -from django.db.models import QuerySet - -from archivebox.config import DATA_DIR -from archivebox.misc.logging import stderr -from archivebox.misc.util import enforce_types, docstring - -# Filter types for URL matching -LINK_FILTERS = { - 'exact': lambda pattern: {'url': pattern}, - 'substring': lambda pattern: {'url__icontains': pattern}, - 'regex': lambda pattern: {'url__iregex': pattern}, - 'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'}, - 'tag': lambda pattern: {'tags__name': pattern}, - 'timestamp': lambda pattern: {'timestamp': pattern}, -} - -STATUS_CHOICES = ['indexed', 'archived', 'unarchived'] - - - -def get_snapshots(snapshots: Optional[QuerySet]=None, - filter_patterns: Optional[List[str]]=None, - filter_type: str='substring', - after: Optional[float]=None, - before: Optional[float]=None, - out_dir: Path=DATA_DIR) -> QuerySet: - """Filter and return Snapshots matching the given criteria.""" - from archivebox.core.models import Snapshot - - if snapshots: - result = snapshots - else: - result = Snapshot.objects.all() - - if after is not None: - result = result.filter(timestamp__gte=after) - if before is not None: - result = result.filter(timestamp__lt=before) - if filter_patterns: - result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type) - - if not result: - stderr('[!] 
No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow') - - return result - - -@enforce_types -def search(filter_patterns: list[str] | None=None, - filter_type: str='substring', - status: str='indexed', - before: float | None=None, - after: float | None=None, - sort: str | None=None, - json: bool=False, - html: bool=False, - csv: str | None=None, - with_headers: bool=False): - """List, filter, and export information about archive entries""" - from archivebox.core.models import Snapshot - - if with_headers and not (json or html or csv): - stderr('[X] --with-headers requires --json, --html or --csv\n', color='red') - raise SystemExit(2) - - # Query DB directly - no filesystem scanning - snapshots = get_snapshots( - filter_patterns=list(filter_patterns) if filter_patterns else None, - filter_type=filter_type, - before=before, - after=after, - ) - - # Apply status filter - if status == 'archived': - snapshots = snapshots.filter(downloaded_at__isnull=False) - elif status == 'unarchived': - snapshots = snapshots.filter(downloaded_at__isnull=True) - # 'indexed' = all snapshots (no filter) - - if sort: - snapshots = snapshots.order_by(sort) - - # Export to requested format - if json: - output = snapshots.to_json(with_headers=with_headers) - elif html: - output = snapshots.to_html(with_headers=with_headers) - elif csv: - output = snapshots.to_csv(cols=csv.split(','), header=with_headers) - else: - from archivebox.misc.logging_util import printable_folders - # Convert to dict for printable_folders - folders = {s.output_dir: s for s in snapshots} - output = printable_folders(folders, with_headers) - - print(output) - return output - - -@click.command() -@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs') -@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status') -@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp') -@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp') -@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at') -@click.option('--json', '-J', is_flag=True, help='Print output in JSON format') -@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)') -@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title') -@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output') -@click.help_option('--help', '-h') -@click.argument('filter_patterns', nargs=-1) -@docstring(search.__doc__) -def main(**kwargs): - return search(**kwargs) - - - -if __name__ == '__main__': - main() diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py index dc540139..87e7482b 100644 --- a/archivebox/cli/archivebox_snapshot.py +++ b/archivebox/cli/archivebox_snapshot.py @@ -1,93 +1,76 @@ #!/usr/bin/env python3 """ -archivebox snapshot [urls_or_crawl_ids...] [--tag=TAG] [--plugins=NAMES] +archivebox snapshot [args...] [--filters] -Create Snapshots from URLs or Crawl jobs. Accepts URLs, Crawl JSONL, or Crawl IDs. +Manage Snapshot records. 
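The list/update pairing is designed for shell pipes, but the same composition works from Python. A small sketch driving the documented `--tag=old` to `--tag=new` retag (assumes `archivebox` is on PATH and the working directory is an ArchiveBox collection):

```python
# Sketch: drive `archivebox snapshot list --tag=old | archivebox snapshot update --tag=new`
# from Python (assumes `archivebox` is on PATH and cwd is an ArchiveBox collection).
import subprocess

listing = subprocess.Popen(
    ["archivebox", "snapshot", "list", "--tag=old"],
    stdout=subprocess.PIPE,
)
updating = subprocess.run(
    ["archivebox", "snapshot", "update", "--tag=new"],
    stdin=listing.stdout,
)
listing.stdout.close()   # let the list side notice if the update side exits early
listing.wait()
print("update exit code:", updating.returncode)
```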
-Input formats: - - Plain URLs (one per line) - - JSONL: {"type": "Crawl", "id": "...", "urls": "..."} - - JSONL: {"type": "Snapshot", "url": "...", "title": "...", "tags": "..."} - - Crawl UUIDs (one per line) - -Output (JSONL): - {"type": "Snapshot", "id": "...", "url": "...", "status": "queued", ...} +Actions: + create - Create Snapshots from URLs or Crawl JSONL + list - List Snapshots as JSONL (with optional filters) + update - Update Snapshots from stdin JSONL + delete - Delete Snapshots from stdin JSONL Examples: - # Create snapshots from URLs directly - archivebox snapshot https://example.com https://foo.com + # Create + archivebox snapshot create https://example.com --tag=news + archivebox crawl create https://example.com | archivebox snapshot create - # Pipe from crawl command - archivebox crawl https://example.com | archivebox snapshot + # List with filters + archivebox snapshot list --status=queued + archivebox snapshot list --url__icontains=example.com - # Chain with extract - archivebox crawl https://example.com | archivebox snapshot | archivebox extract + # Update + archivebox snapshot list --tag=old | archivebox snapshot update --tag=new - # Run specific plugins after creating snapshots - archivebox snapshot --plugins=screenshot,singlefile https://example.com - - # Process existing Snapshot by ID - archivebox snapshot 01234567-89ab-cdef-0123-456789abcdef + # Delete + archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes """ __package__ = 'archivebox.cli' __command__ = 'archivebox snapshot' import sys -from typing import Optional +from typing import Optional, Iterable import rich_click as click - -from archivebox.misc.util import docstring +from rich import print as rprint -def process_snapshot_by_id(snapshot_id: str) -> int: - """ - Process a single Snapshot by ID (used by workers). +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value - Triggers the Snapshot's state machine tick() which will: - - Transition from queued -> started (creates pending ArchiveResults) - - Transition from started -> sealed (when all ArchiveResults done) - """ - from rich import print as rprint - from archivebox.core.models import Snapshot + if filters: + queryset = queryset.filter(**filters) - try: - snapshot = Snapshot.objects.get(id=snapshot_id) - except Snapshot.DoesNotExist: - rprint(f'[red]Snapshot {snapshot_id} not found[/red]', file=sys.stderr) - return 1 + if limit: + queryset = queryset[:limit] - rprint(f'[blue]Processing Snapshot {snapshot.id} {snapshot.url[:50]} (status={snapshot.status})[/blue]', file=sys.stderr) + return queryset - try: - snapshot.sm.tick() - snapshot.refresh_from_db() - rprint(f'[green]Snapshot complete (status={snapshot.status})[/green]', file=sys.stderr) - return 0 - except Exception as e: - rprint(f'[red]Snapshot error: {type(e).__name__}: {e}[/red]', file=sys.stderr) - return 1 +# ============================================================================= +# CREATE +# ============================================================================= def create_snapshots( - args: tuple, + urls: Iterable[str], tag: str = '', - plugins: str = '', + status: str = 'queued', + depth: int = 0, created_by_id: Optional[int] = None, ) -> int: """ - Create Snapshots from URLs, Crawl JSONL, or Crawl IDs. 
- - Reads from args or stdin, creates Snapshot objects, outputs JSONL. - If --plugins is passed, also runs specified plugins (blocking). + Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records). Exit codes: 0: Success 1: Failure """ - from rich import print as rprint from django.utils import timezone from archivebox.misc.jsonl import ( @@ -102,7 +85,7 @@ def create_snapshots( is_tty = sys.stdout.isatty() # Collect all input records - records = list(read_args_or_stdin(args)) + records = list(read_args_or_stdin(urls)) if not records: rprint('[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr) @@ -122,47 +105,44 @@ def create_snapshots( try: crawl = Crawl.objects.get(id=crawl_id) except Crawl.DoesNotExist: - # Crawl doesn't exist, create it - crawl = Crawl.from_jsonl(record, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id}) else: - # No ID, create new crawl - crawl = Crawl.from_jsonl(record, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id}) if not crawl: continue # Create snapshots for each URL in the crawl for url in crawl.get_urls_list(): - # Merge CLI tags with crawl tags merged_tags = crawl.tags_str if tag: - if merged_tags: - merged_tags = f"{merged_tags},{tag}" - else: - merged_tags = tag + merged_tags = f"{merged_tags},{tag}" if merged_tags else tag snapshot_record = { 'url': url, 'tags': merged_tags, 'crawl_id': str(crawl.id), - 'depth': 0, + 'depth': depth, + 'status': status, } - snapshot = Snapshot.from_jsonl(snapshot_record, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json(snapshot_record, overrides={'created_by_id': created_by_id}) if snapshot: created_snapshots.append(snapshot) if not is_tty: - write_record(snapshot.to_jsonl()) + write_record(snapshot.to_json()) elif record_type == TYPE_SNAPSHOT or record.get('url'): # Input is a Snapshot or plain URL - # Add tags if provided via CLI if tag and not record.get('tags'): record['tags'] = tag + if status: + record['status'] = status + record['depth'] = record.get('depth', depth) - snapshot = Snapshot.from_jsonl(record, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id}) if snapshot: created_snapshots.append(snapshot) if not is_tty: - write_record(snapshot.to_jsonl()) + write_record(snapshot.to_json()) except Exception as e: rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr) @@ -174,93 +154,237 @@ def create_snapshots( rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr) - # If TTY, show human-readable output if is_tty: for snapshot in created_snapshots: rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr) - # If --plugins is passed, create ArchiveResults and run the orchestrator - if plugins: - from archivebox.core.models import ArchiveResult - from archivebox.workers.orchestrator import Orchestrator - - # Parse comma-separated plugins list - plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] - - # Create ArchiveResults for the specific plugins on each snapshot - for snapshot in created_snapshots: - for plugin_name in plugins_list: - result, created = ArchiveResult.objects.get_or_create( - snapshot=snapshot, - plugin=plugin_name, - defaults={ - 'status': ArchiveResult.StatusChoices.QUEUED, - 'retry_at': timezone.now(), - } - ) - if not 
created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]: - # Reset for retry - result.status = ArchiveResult.StatusChoices.QUEUED - result.retry_at = timezone.now() - result.save() - - rprint(f'[blue]Running plugins: {plugins}...[/blue]', file=sys.stderr) - orchestrator = Orchestrator(exit_on_idle=True) - orchestrator.runloop() - return 0 -def is_snapshot_id(value: str) -> bool: - """Check if value looks like a Snapshot UUID.""" - import re - uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I) - if not uuid_pattern.match(value): - return False - # Verify it's actually a Snapshot (not a Crawl or other object) +# ============================================================================= +# LIST +# ============================================================================= + +def list_snapshots( + status: Optional[str] = None, + url__icontains: Optional[str] = None, + url__istartswith: Optional[str] = None, + tag: Optional[str] = None, + crawl_id: Optional[str] = None, + limit: Optional[int] = None, +) -> int: + """ + List Snapshots as JSONL with optional filters. + + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record from archivebox.core.models import Snapshot - return Snapshot.objects.filter(id=value).exists() + + is_tty = sys.stdout.isatty() + + queryset = Snapshot.objects.all().order_by('-created_at') + + # Apply filters + filter_kwargs = { + 'status': status, + 'url__icontains': url__icontains, + 'url__istartswith': url__istartswith, + 'crawl_id': crawl_id, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + # Tag filter requires special handling (M2M) + if tag: + queryset = queryset.filter(tags__name__iexact=tag) + + count = 0 + for snapshot in queryset: + if is_tty: + status_color = { + 'queued': 'yellow', + 'started': 'blue', + 'sealed': 'green', + }.get(snapshot.status, 'dim') + rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}') + else: + write_record(snapshot.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr) + return 0 -@click.command() -@click.option('--tag', '-t', default='', help='Comma-separated tags to add to each snapshot') -@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run after creating snapshots (e.g., screenshot,singlefile)') -@click.argument('args', nargs=-1) -def main(tag: str, plugins: str, args: tuple): - """Create Snapshots from URLs/Crawls, or process existing Snapshots by ID""" - from archivebox.misc.jsonl import read_args_or_stdin +# ============================================================================= +# UPDATE +# ============================================================================= - # Read all input - records = list(read_args_or_stdin(args)) +def update_snapshots( + status: Optional[str] = None, + tag: Optional[str] = None, +) -> int: + """ + Update Snapshots from stdin JSONL. + Reads Snapshot records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. 
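Because `update` only reads the `id` from each stdin record and takes the new values from its own flags, update input is trivial to generate. A sketch (the IDs and filename below are hypothetical placeholders):

```python
# Sketch: generate stdin input for `archivebox snapshot update --status=queued`.
# The update command only needs an "id" per record; new values come from the CLI flags.
# (IDs below are hypothetical placeholders.)
import json

snapshot_ids = [
    "01234567-89ab-cdef-0123-456789abcdef",
    "01234567-89ab-cdef-0123-456789abcde0",
]
for snapshot_id in snapshot_ids:
    print(json.dumps({"type": "Snapshot", "id": snapshot_id}))

# usage (hypothetical filename):
#   python make_update_input.py | archivebox snapshot update --status=queued
```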
+ + Exit codes: + 0: Success + 1: No input or error + """ + from django.utils import timezone + + from archivebox.misc.jsonl import read_stdin, write_record + from archivebox.core.models import Snapshot + + is_tty = sys.stdout.isatty() + + records = list(read_stdin()) if not records: - from rich import print as rprint - rprint('[yellow]No URLs, Crawl IDs, or Snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr) - sys.exit(1) + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 - # Check if input looks like existing Snapshot IDs to process - # If ALL inputs are UUIDs with no URL and exist as Snapshots, process them - all_are_snapshot_ids = all( - is_snapshot_id(r.get('id') or r.get('url', '')) - for r in records - if r.get('type') != 'Crawl' # Don't check Crawl records as Snapshot IDs - ) + updated_count = 0 + for record in records: + snapshot_id = record.get('id') + if not snapshot_id: + continue - # But also check that we're not receiving Crawl JSONL - has_crawl_records = any(r.get('type') == 'Crawl' for r in records) + try: + snapshot = Snapshot.objects.get(id=snapshot_id) - if all_are_snapshot_ids and not has_crawl_records: - # Process existing Snapshots by ID - exit_code = 0 - for record in records: - snapshot_id = record.get('id') or record.get('url') - result = process_snapshot_by_id(snapshot_id) - if result != 0: - exit_code = result - sys.exit(exit_code) - else: - # Create new Snapshots from URLs or Crawls - sys.exit(create_snapshots(args, tag=tag, plugins=plugins)) + # Apply updates from CLI flags (override stdin values) + if status: + snapshot.status = status + snapshot.retry_at = timezone.now() + if tag: + # Add tag to existing tags + snapshot.save() # Ensure saved before M2M + from archivebox.core.models import Tag + tag_obj, _ = Tag.objects.get_or_create(name=tag) + snapshot.tags.add(tag_obj) + + snapshot.save() + updated_count += 1 + + if not is_tty: + write_record(snapshot.to_json()) + + except Snapshot.DoesNotExist: + rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr) + continue + + rprint(f'[green]Updated {updated_count} snapshots[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + +def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete Snapshots from stdin JSONL. + + Requires --yes flag to confirm deletion. 
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.core.models import Snapshot + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + snapshot_ids = [r.get('id') for r in records if r.get('id')] + + if not snapshot_ids: + rprint('[yellow]No valid snapshot IDs in input[/yellow]', file=sys.stderr) + return 1 + + snapshots = Snapshot.objects.filter(id__in=snapshot_ids) + count = snapshots.count() + + if count == 0: + rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr) + return 0 + + if dry_run: + rprint(f'[yellow]Would delete {count} snapshots (dry run)[/yellow]', file=sys.stderr) + for snapshot in snapshots: + rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr) + return 0 + + if not yes: + rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = snapshots.delete() + rprint(f'[green]Deleted {deleted_count} snapshots[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage Snapshot records.""" + pass + + +@main.command('create') +@click.argument('urls', nargs=-1) +@click.option('--tag', '-t', default='', help='Comma-separated tags to add') +@click.option('--status', '-s', default='queued', help='Initial status (default: queued)') +@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)') +def create_cmd(urls: tuple, tag: str, status: str, depth: int): + """Create Snapshots from URLs or stdin JSONL.""" + sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth)) + + +@main.command('list') +@click.option('--status', '-s', help='Filter by status (queued, started, sealed)') +@click.option('--url__icontains', help='Filter by URL contains') +@click.option('--url__istartswith', help='Filter by URL starts with') +@click.option('--tag', '-t', help='Filter by tag name') +@click.option('--crawl-id', help='Filter by crawl ID') +@click.option('--limit', '-n', type=int, help='Limit number of results') +def list_cmd(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str], + tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]): + """List Snapshots as JSONL.""" + sys.exit(list_snapshots( + status=status, + url__icontains=url__icontains, + url__istartswith=url__istartswith, + tag=tag, + crawl_id=crawl_id, + limit=limit, + )) + + +@main.command('update') +@click.option('--status', '-s', help='Set status') +@click.option('--tag', '-t', help='Add tag') +def update_cmd(status: Optional[str], tag: Optional[str]): + """Update Snapshots from stdin JSONL.""" + sys.exit(update_snapshots(status=status, tag=tag)) + + +@main.command('delete') +@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') +@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +def delete_cmd(yes: bool, dry_run: bool): + """Delete Snapshots from stdin JSONL.""" + sys.exit(delete_snapshots(yes=yes, dry_run=dry_run)) if __name__ == '__main__': diff --git a/archivebox/cli/archivebox_tag.py b/archivebox/cli/archivebox_tag.py new file mode 100644 index 00000000..c9461396 --- /dev/null +++ b/archivebox/cli/archivebox_tag.py @@ -0,0 +1,307 @@ +#!/usr/bin/env 
python3 + +""" +archivebox tag [args...] [--filters] + +Manage Tag records. + +Actions: + create - Create Tags + list - List Tags as JSONL (with optional filters) + update - Update Tags from stdin JSONL + delete - Delete Tags from stdin JSONL + +Examples: + # Create + archivebox tag create news tech science + archivebox tag create "important stuff" + + # List + archivebox tag list + archivebox tag list --name__icontains=news + + # Update (rename tags) + archivebox tag list --name=oldname | archivebox tag update --name=newname + + # Delete + archivebox tag list --name=unused | archivebox tag delete --yes +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox tag' + +import sys +from typing import Optional, Iterable + +import rich_click as click +from rich import print as rprint + + +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + + if limit: + queryset = queryset[:limit] + + return queryset + + +# ============================================================================= +# CREATE +# ============================================================================= + +def create_tags(names: Iterable[str]) -> int: + """ + Create Tags from names. + + Exit codes: + 0: Success + 1: Failure + """ + from archivebox.misc.jsonl import write_record + from archivebox.core.models import Tag + + is_tty = sys.stdout.isatty() + + # Convert to list if needed + name_list = list(names) if names else [] + + if not name_list: + rprint('[yellow]No tag names provided. Pass names as arguments.[/yellow]', file=sys.stderr) + return 1 + + created_count = 0 + for name in name_list: + name = name.strip() + if not name: + continue + + tag, created = Tag.objects.get_or_create(name=name) + + if not is_tty: + write_record(tag.to_json()) + + if created: + created_count += 1 + rprint(f'[green]Created tag: {name}[/green]', file=sys.stderr) + else: + rprint(f'[dim]Tag already exists: {name}[/dim]', file=sys.stderr) + + rprint(f'[green]Created {created_count} new tags[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# LIST +# ============================================================================= + +def list_tags( + name: Optional[str] = None, + name__icontains: Optional[str] = None, + limit: Optional[int] = None, +) -> int: + """ + List Tags as JSONL with optional filters. 
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.core.models import Tag + + is_tty = sys.stdout.isatty() + + queryset = Tag.objects.all().order_by('name') + + # Apply filters + filter_kwargs = { + 'name': name, + 'name__icontains': name__icontains, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for tag in queryset: + snapshot_count = tag.snapshot_set.count() + if is_tty: + rprint(f'[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]') + else: + write_record(tag.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} tags[/dim]', file=sys.stderr) + return 0 + + +# ============================================================================= +# UPDATE +# ============================================================================= + +def update_tags(name: Optional[str] = None) -> int: + """ + Update Tags from stdin JSONL. + + Reads Tag records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. + + Exit codes: + 0: Success + 1: No input or error + """ + from archivebox.misc.jsonl import read_stdin, write_record + from archivebox.core.models import Tag + + is_tty = sys.stdout.isatty() + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + updated_count = 0 + for record in records: + tag_id = record.get('id') + old_name = record.get('name') + + if not tag_id and not old_name: + continue + + try: + if tag_id: + tag = Tag.objects.get(id=tag_id) + else: + tag = Tag.objects.get(name=old_name) + + # Apply updates from CLI flags + if name: + tag.name = name + tag.save() + + updated_count += 1 + + if not is_tty: + write_record(tag.to_json()) + + except Tag.DoesNotExist: + rprint(f'[yellow]Tag not found: {tag_id or old_name}[/yellow]', file=sys.stderr) + continue + + rprint(f'[green]Updated {updated_count} tags[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + +def delete_tags(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete Tags from stdin JSONL. + + Requires --yes flag to confirm deletion. 
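Piped records may carry either an `id` or just a `name`, so the delete lookup is composed from OR'd `Q` objects. Distilled into a sketch (assumes a configured Django environment; the names below are placeholders):

```python
# Distilled sketch of the id-or-name matching used by `archivebox tag delete`
# (assumes a configured Django environment; ids/names below are placeholders).
from django.db.models import Q
from archivebox.core.models import Tag

tag_ids = []                        # from stdin records that had an "id"
tag_names = ["unused", "spam"]      # from stdin records that only had a "name"

query = Q()
if tag_ids:
    query |= Q(id__in=tag_ids)
if tag_names:
    query |= Q(name__in=tag_names)

matching = Tag.objects.filter(query)
print(matching.count(), "tags would be deleted")
```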
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.core.models import Tag + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + # Collect tag IDs or names + tag_ids = [] + tag_names = [] + for r in records: + if r.get('id'): + tag_ids.append(r['id']) + elif r.get('name'): + tag_names.append(r['name']) + + if not tag_ids and not tag_names: + rprint('[yellow]No valid tag IDs or names in input[/yellow]', file=sys.stderr) + return 1 + + from django.db.models import Q + query = Q() + if tag_ids: + query |= Q(id__in=tag_ids) + if tag_names: + query |= Q(name__in=tag_names) + + tags = Tag.objects.filter(query) + count = tags.count() + + if count == 0: + rprint('[yellow]No matching tags found[/yellow]', file=sys.stderr) + return 0 + + if dry_run: + rprint(f'[yellow]Would delete {count} tags (dry run)[/yellow]', file=sys.stderr) + for tag in tags: + rprint(f' {tag.name}', file=sys.stderr) + return 0 + + if not yes: + rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = tags.delete() + rprint(f'[green]Deleted {deleted_count} tags[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage Tag records.""" + pass + + +@main.command('create') +@click.argument('names', nargs=-1) +def create_cmd(names: tuple): + """Create Tags from names.""" + sys.exit(create_tags(names)) + + +@main.command('list') +@click.option('--name', help='Filter by exact name') +@click.option('--name__icontains', help='Filter by name contains') +@click.option('--limit', '-n', type=int, help='Limit number of results') +def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]): + """List Tags as JSONL.""" + sys.exit(list_tags(name=name, name__icontains=name__icontains, limit=limit)) + + +@main.command('update') +@click.option('--name', '-n', help='Set new name') +def update_cmd(name: Optional[str]): + """Update Tags from stdin JSONL.""" + sys.exit(update_tags(name=name)) + + +@main.command('delete') +@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') +@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +def delete_cmd(yes: bool, dry_run: bool): + """Delete Tags from stdin JSONL.""" + sys.exit(delete_tags(yes=yes, dry_run=dry_run)) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/tests_piping.py b/archivebox/cli/tests_piping.py index f6aee426..47953232 100644 --- a/archivebox/cli/tests_piping.py +++ b/archivebox/cli/tests_piping.py @@ -1,17 +1,18 @@ #!/usr/bin/env python3 """ -Tests for CLI piping workflow: crawl | snapshot | extract +Tests for CLI piping workflow: crawl | snapshot | archiveresult | run This module tests the JSONL-based piping between CLI commands as described in: https://github.com/ArchiveBox/ArchiveBox/issues/1363 Workflows tested: - archivebox crawl URL -> Crawl JSONL - archivebox snapshot -> Snapshot JSONL (accepts Crawl or URL input) - archivebox extract -> ArchiveResult JSONL (accepts Snapshot input) + archivebox crawl create URL -> Crawl JSONL + archivebox snapshot create -> Snapshot JSONL (accepts Crawl or URL input) + archivebox archiveresult create -> ArchiveResult JSONL (accepts 
Snapshot input) + archivebox run -> Process queued records (accepts any JSONL) Pipeline: - archivebox crawl URL | archivebox snapshot | archivebox extract + archivebox crawl create URL | archivebox snapshot create | archivebox archiveresult create | archivebox run Each command should: - Accept URLs, IDs, or JSONL as input (args or stdin) @@ -154,13 +155,13 @@ class TestJSONLParsing(unittest.TestCase): class TestJSONLOutput(unittest.TestCase): """Test JSONL output formatting.""" - def test_crawl_to_jsonl(self): - """Crawl model should serialize to JSONL correctly.""" + def test_crawl_to_json(self): + """Crawl model should serialize to JSON correctly.""" from archivebox.misc.jsonl import TYPE_CRAWL - # Create a mock crawl with to_jsonl method configured + # Create a mock crawl with to_json method configured mock_crawl = MagicMock() - mock_crawl.to_jsonl.return_value = { + mock_crawl.to_json.return_value = { 'type': TYPE_CRAWL, 'schema_version': '0.9.0', 'id': 'test-crawl-uuid', @@ -172,7 +173,7 @@ class TestJSONLOutput(unittest.TestCase): 'created_at': None, } - result = mock_crawl.to_jsonl() + result = mock_crawl.to_json() self.assertEqual(result['type'], TYPE_CRAWL) self.assertEqual(result['id'], 'test-crawl-uuid') self.assertEqual(result['urls'], 'https://example.com') @@ -351,8 +352,8 @@ class TestSnapshotCommand(unittest.TestCase): # using real Snapshot instances. -class TestExtractCommand(unittest.TestCase): - """Unit tests for archivebox extract command.""" +class TestArchiveResultCommand(unittest.TestCase): + """Unit tests for archivebox archiveresult command.""" def setUp(self): """Set up test environment.""" @@ -363,8 +364,8 @@ class TestExtractCommand(unittest.TestCase): """Clean up test environment.""" shutil.rmtree(self.test_dir, ignore_errors=True) - def test_extract_accepts_snapshot_id(self): - """extract should accept snapshot IDs as input.""" + def test_archiveresult_accepts_snapshot_id(self): + """archiveresult should accept snapshot IDs as input.""" from archivebox.misc.jsonl import read_args_or_stdin uuid = '01234567-89ab-cdef-0123-456789abcdef' @@ -374,8 +375,8 @@ class TestExtractCommand(unittest.TestCase): self.assertEqual(len(records), 1) self.assertEqual(records[0]['id'], uuid) - def test_extract_accepts_jsonl_snapshot(self): - """extract should accept JSONL Snapshot records.""" + def test_archiveresult_accepts_jsonl_snapshot(self): + """archiveresult should accept JSONL Snapshot records.""" from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n') @@ -387,8 +388,8 @@ class TestExtractCommand(unittest.TestCase): self.assertEqual(records[0]['type'], TYPE_SNAPSHOT) self.assertEqual(records[0]['id'], 'abc123') - def test_extract_gathers_snapshot_ids(self): - """extract should gather snapshot IDs from various input formats.""" + def test_archiveresult_gathers_snapshot_ids(self): + """archiveresult should gather snapshot IDs from various input formats.""" from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT records = [ @@ -529,7 +530,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # Create crawl with multiple URLs (as newline-separated string) urls = 'https://test-crawl-1.example.com\nhttps://test-crawl-2.example.com' - crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id}) self.assertIsNotNone(crawl) self.assertIsNotNone(crawl.id) 
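The renamed `from_json`/`to_json` helpers make the in-process equivalent of the shell pipeline short. A sketch in the spirit of these integration tests (assumes a configured Django environment; `created_by` overrides are omitted here and assumed to fall back to the model defaults / system user):

```python
# In-process sketch of: archivebox crawl create URL | archivebox snapshot create
# (assumes a configured Django environment; created_by falls back to model defaults).
import json
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot

crawl = Crawl.from_json({'urls': 'https://example.com'})
print(json.dumps(crawl.to_json()))                    # what `crawl create` would emit

for url in crawl.get_urls_list():
    snapshot = Snapshot.from_json({'url': url, 'crawl_id': str(crawl.id)})
    if snapshot:
        print(json.dumps(snapshot.to_json()))         # what `snapshot create` would emit
```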
@@ -543,7 +544,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): self.assertIn('https://test-crawl-2.example.com', urls_list) # Verify output format - output = crawl.to_jsonl() + output = crawl.to_json() self.assertEqual(output['type'], TYPE_CRAWL) self.assertIn('id', output) self.assertEqual(output['urls'], urls) @@ -566,8 +567,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # Step 1: Create crawl (simulating 'archivebox crawl') urls = 'https://crawl-to-snap-1.example.com\nhttps://crawl-to-snap-2.example.com' - crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id}) - crawl_output = crawl.to_jsonl() + crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id}) + crawl_output = crawl.to_json() # Step 2: Parse crawl output as snapshot input stdin = StringIO(json.dumps(crawl_output) + '\n') @@ -581,7 +582,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # Step 3: Create snapshots from crawl URLs created_snapshots = [] for url in crawl.get_urls_list(): - snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id}) if snapshot: created_snapshots.append(snapshot) @@ -589,7 +590,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # Verify snapshot output for snapshot in created_snapshots: - output = snapshot.to_jsonl() + output = snapshot.to_json() self.assertEqual(output['type'], TYPE_SNAPSHOT) self.assertIn(output['url'], [ 'https://crawl-to-snap-1.example.com', @@ -619,13 +620,13 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # Create snapshot overrides = {'created_by_id': created_by_id} - snapshot = Snapshot.from_jsonl(records[0], overrides=overrides) + snapshot = Snapshot.from_json(records[0], overrides=overrides) self.assertIsNotNone(snapshot.id) self.assertEqual(snapshot.url, url) # Verify output format - output = snapshot.to_jsonl() + output = snapshot.to_json() self.assertEqual(output['type'], TYPE_SNAPSHOT) self.assertIn('id', output) self.assertEqual(output['url'], url) @@ -647,8 +648,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # Step 1: Create snapshot (simulating 'archivebox snapshot') url = 'https://test-extract-1.example.com' overrides = {'created_by_id': created_by_id} - snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides) - snapshot_output = snapshot.to_jsonl() + snapshot = Snapshot.from_json({'url': url}, overrides=overrides) + snapshot_output = snapshot.to_json() # Step 2: Parse snapshot output as extract input stdin = StringIO(json.dumps(snapshot_output) + '\n') @@ -686,8 +687,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # === archivebox crawl https://example.com === url = 'https://test-pipeline-full.example.com' - crawl = Crawl.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id}) - crawl_jsonl = json.dumps(crawl.to_jsonl()) + crawl = Crawl.from_json({'url': url}, overrides={'created_by_id': created_by_id}) + crawl_jsonl = json.dumps(crawl.to_json()) # === | archivebox snapshot === stdin = StringIO(crawl_jsonl + '\n') @@ -705,7 +706,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): if crawl_id: db_crawl = Crawl.objects.get(id=crawl_id) for crawl_url in db_crawl.get_urls_list(): - snapshot = Snapshot.from_jsonl({'url': crawl_url}, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json({'url': crawl_url}, overrides={'created_by_id': created_by_id}) if snapshot: 
created_snapshots.append(snapshot) @@ -713,7 +714,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): self.assertEqual(created_snapshots[0].url, url) # === | archivebox extract === - snapshot_jsonl_lines = [json.dumps(s.to_jsonl()) for s in created_snapshots] + snapshot_jsonl_lines = [json.dumps(s.to_json()) for s in created_snapshots] stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n') stdin.isatty = lambda: False @@ -757,12 +758,12 @@ class TestDepthWorkflows(unittest.TestCase): # Create crawl with depth 0 url = 'https://depth0-test.example.com' - crawl = Crawl.from_jsonl({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id}) self.assertEqual(crawl.max_depth, 0) # Create snapshot - snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id}) self.assertEqual(snapshot.url, url) def test_depth_metadata_in_crawl(self): @@ -773,7 +774,7 @@ class TestDepthWorkflows(unittest.TestCase): created_by_id = get_or_create_system_user_pk() # Create crawl with depth - crawl = Crawl.from_jsonl( + crawl = Crawl.from_json( {'url': 'https://depth-meta-test.example.com', 'max_depth': 2}, overrides={'created_by_id': created_by_id} ) @@ -781,7 +782,7 @@ class TestDepthWorkflows(unittest.TestCase): self.assertEqual(crawl.max_depth, 2) # Verify in JSONL output - output = crawl.to_jsonl() + output = crawl.to_json() self.assertEqual(output['max_depth'], 2) diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index dd7d04da..b749951d 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -158,7 +158,7 @@ class AddLinkForm(forms.Form): 'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite' } binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'} - extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'} + extensions = {'twocaptcha', 'istilldontcareaboutcookies', 'ublock'} # Populate plugin field choices self.fields['chrome_plugins'].choices = [ diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 883733c5..1dca0810 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1,6 +1,6 @@ __package__ = 'archivebox.core' -from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING +from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING, Iterator, Set from archivebox.uuid_compat import uuid7 from datetime import datetime, timedelta from django_stubs_ext.db.models import TypedModelMeta @@ -41,6 +41,8 @@ from archivebox.machine.models import NetworkInterface, Binary class Tag(ModelWithSerializers): + JSONL_TYPE = 'Tag' + # Keep AutoField for compatibility with main branch migrations # Don't use UUIDField here - requires complex FK transformation id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') @@ -91,26 +93,66 @@ class Tag(ModelWithSerializers): def api_url(self) -> str: return reverse_lazy('api-1:get_tag', args=[self.id]) - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Tag model instance to a JSONL record. + Convert Tag model instance to a JSON-serializable dict. 
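The generator-based `to_jsonl()` methods added below all share one dedup convention: a caller-supplied `seen` set of `(type, id)` tuples guards against emitting the same record twice when object graphs overlap. A standalone sketch of the pattern, with simplified stand-ins for the real model records:

```python
# Standalone sketch of the (type, id) `seen` dedup convention used by the to_jsonl() generators.
from typing import Iterator, Set, Tuple

def emit_once(record_type: str, record_id: str, seen: Set[Tuple[str, str]]) -> Iterator[dict]:
    key = (record_type, record_id)
    if key in seen:
        return                      # already yielded earlier in this stream
    seen.add(key)
    yield {"type": record_type, "id": record_id}

seen: Set[Tuple[str, str]] = set()
stream = [
    *emit_once("Binary", "b1", seen),
    *emit_once("Binary", "b1", seen),   # duplicate, skipped
    *emit_once("Process", "p1", seen),
]
print(stream)   # -> [{'type': 'Binary', 'id': 'b1'}, {'type': 'Process', 'id': 'p1'}]
```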
""" from archivebox.config import VERSION return { - 'type': 'Tag', + 'type': self.JSONL_TYPE, 'schema_version': VERSION, 'id': str(self.id), 'name': self.name, 'slug': self.slug, } - @staticmethod - def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None): + def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]: """ - Create/update Tag from JSONL record. + Yield this Tag as a JSON record. Args: - record: JSONL record with 'name' field + seen: Set of (type, id) tuples already emitted (for deduplication) + **kwargs: Passed to children (none for Tag, leaf node) + + Yields: + dict: JSON-serializable record for this tag + """ + if seen is not None: + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + yield self.to_json() + + @classmethod + def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['Tag']: + """ + Create/update Tags from an iterable of JSONL records. + Filters to only records with type='Tag'. + + Args: + records: Iterable of dicts (JSONL records) + overrides: Optional dict with 'snapshot' to auto-attach tags + + Returns: + List of Tag instances (skips None results) + """ + results = [] + for record in records: + record_type = record.get('type', cls.JSONL_TYPE) + if record_type == cls.JSONL_TYPE: + instance = cls.from_json(record, overrides=overrides) + if instance: + results.append(instance) + return results + + @staticmethod + def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'Tag | None': + """ + Create/update a single Tag from a JSON record dict. + + Args: + record: Dict with 'name' field overrides: Optional dict with 'snapshot' to auto-attach tag Returns: @@ -289,6 +331,8 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)): class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine): + JSONL_TYPE = 'Snapshot' + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) @@ -968,38 +1012,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea Each line is a JSON record with a 'type' field: - Snapshot: snapshot metadata (crawl_id, url, tags, etc.) - - ArchiveResult: extractor results (plugin, status, output, etc.) - Binary: binary info used for the extraction - Process: process execution details (cmd, exit_code, timing, etc.) + - ArchiveResult: extractor results (plugin, status, output, etc.) 
""" import json index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME index_path.parent.mkdir(parents=True, exist_ok=True) - # Track unique binaries and processes to avoid duplicates - binaries_seen = set() - processes_seen = set() - with open(index_path, 'w') as f: - # Write Snapshot record first (to_jsonl includes crawl_id, fs_version) - f.write(json.dumps(self.to_jsonl()) + '\n') - - # Write ArchiveResult records with their associated Binary and Process - # Use select_related to optimize queries - for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'): - # Write Binary record if not already written - if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen: - binaries_seen.add(ar.process.binary_id) - f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n') - - # Write Process record if not already written - if ar.process and ar.process_id not in processes_seen: - processes_seen.add(ar.process_id) - f.write(json.dumps(ar.process.to_jsonl()) + '\n') - - # Write ArchiveResult record - f.write(json.dumps(ar.to_jsonl()) + '\n') + for record in self.to_jsonl(): + f.write(json.dumps(record) + '\n') def read_index_jsonl(self) -> dict: """ @@ -1420,14 +1444,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea return False - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Snapshot model instance to a JSONL record. + Convert Snapshot model instance to a JSON-serializable dict. Includes all fields needed to fully reconstruct/identify this snapshot. """ from archivebox.config import VERSION return { - 'type': 'Snapshot', + 'type': self.JSONL_TYPE, 'schema_version': VERSION, 'id': str(self.id), 'crawl_id': str(self.crawl_id), @@ -1442,12 +1466,68 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea 'fs_version': self.fs_version, } - @staticmethod - def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True): + def to_jsonl(self, seen: Set[tuple] = None, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]: """ - Create/update Snapshot from JSONL record or dict. + Yield this Snapshot and optionally related objects as JSON records. - Unified method that handles: + Uses select_related for efficient querying. Deduplicates automatically. 
+ + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + archiveresult: Include related ArchiveResults (default: True) + process: Include Process for each ArchiveResult (default: True) + binary: Include Binary for each Process (default: True) + machine: Include Machine for each Process (default: False) + iface: Include NetworkInterface for each Process (default: False) + **kwargs: Additional options passed to children + + Yields: + dict: JSON-serializable records + """ + if seen is None: + seen = set() + + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + + yield self.to_json() + + if archiveresult: + # Use select_related to optimize queries + for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'): + yield from ar.to_jsonl(seen=seen, process=process, binary=binary, machine=machine, iface=iface, **kwargs) + + @classmethod + def from_jsonl(cls, records, overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> list['Snapshot']: + """ + Create/update Snapshots from an iterable of JSONL records. + Filters to only records with type='Snapshot' (or no type). + + Args: + records: Iterable of dicts (JSONL records) + overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id' + queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True) + + Returns: + List of Snapshot instances (skips None results) + """ + results = [] + for record in records: + record_type = record.get('type', cls.JSONL_TYPE) + if record_type == cls.JSONL_TYPE: + instance = cls.from_json(record, overrides=overrides, queue_for_extraction=queue_for_extraction) + if instance: + results.append(instance) + return results + + @staticmethod + def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> 'Snapshot | None': + """ + Create/update a single Snapshot from a JSON record dict. + + Handles: - ID-based patching: {"id": "...", "title": "new title"} - URL-based create/update: {"url": "...", "title": "...", "tags": "..."} - Auto-creates Crawl if not provided @@ -2054,8 +2134,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea result['canonical'] = self.canonical_outputs() return result - def to_json(self, indent: int = 4) -> str: - """Convert to JSON string""" + def to_json_str(self, indent: int = 4) -> str: + """Convert to JSON string for file output.""" return to_json(self.to_dict(extended=True), indent=indent) def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str: @@ -2203,6 +2283,8 @@ class SnapshotMachine(BaseStateMachine, strict_states=True): class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine): + JSONL_TYPE = 'ArchiveResult' + class StatusChoices(models.TextChoices): QUEUED = 'queued', 'Queued' STARTED = 'started', 'Started' @@ -2274,13 +2356,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi """Convenience property to access the user who created this archive result via its snapshot's crawl.""" return self.snapshot.crawl.created_by - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert ArchiveResult model instance to a JSONL record. + Convert ArchiveResult model instance to a JSON-serializable dict. 
""" from archivebox.config import VERSION record = { - 'type': 'ArchiveResult', + 'type': self.JSONL_TYPE, 'schema_version': VERSION, 'id': str(self.id), 'snapshot_id': str(self.snapshot_id), @@ -2308,6 +2390,31 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi record['process_id'] = str(self.process_id) return record + def to_jsonl(self, seen: Set[tuple] = None, process: bool = True, **kwargs) -> Iterator[dict]: + """ + Yield this ArchiveResult and optionally related objects as JSON records. + + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + process: Include related Process and its children (default: True) + **kwargs: Passed to Process.to_jsonl() (e.g., binary=True, machine=False) + + Yields: + dict: JSON-serializable records + """ + if seen is None: + seen = set() + + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + + yield self.to_json() + + if process and self.process: + yield from self.process.to_jsonl(seen=seen, **kwargs) + def save(self, *args, **kwargs): is_new = self._state.adding diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 3e1a53f9..9e756f29 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -1,6 +1,6 @@ __package__ = 'archivebox.crawls' -from typing import TYPE_CHECKING, Iterable +from typing import TYPE_CHECKING, Iterable, Iterator, Set from datetime import timedelta from archivebox.uuid_compat import uuid7 from pathlib import Path @@ -59,6 +59,8 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats): class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine): + JSONL_TYPE = 'Crawl' + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False) @@ -134,13 +136,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith def api_url(self) -> str: return reverse_lazy('api-1:get_crawl', args=[self.id]) - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Crawl model instance to a JSONL record. + Convert Crawl model instance to a JSON-serializable dict. """ from archivebox.config import VERSION return { - 'type': 'Crawl', + 'type': self.JSONL_TYPE, 'schema_version': VERSION, 'id': str(self.id), 'urls': self.urls, @@ -151,10 +153,63 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith 'created_at': self.created_at.isoformat() if self.created_at else None, } - @staticmethod - def from_jsonl(record: dict, overrides: dict = None): + def to_jsonl(self, seen: Set[tuple] = None, snapshot: bool = True, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]: """ - Create or get a Crawl from a JSONL record. + Yield this Crawl and optionally related objects as JSON records. 
+ + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + snapshot: Include related Snapshots (default: True) + archiveresult: Include ArchiveResults for each Snapshot (default: True) + process: Include Process for each ArchiveResult (default: True) + binary: Include Binary for each Process (default: True) + machine: Include Machine for each Process (default: False) + iface: Include NetworkInterface for each Process (default: False) + **kwargs: Additional options passed to children + + Yields: + dict: JSON-serializable records + """ + if seen is None: + seen = set() + + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + + yield self.to_json() + + if snapshot: + for snap in self.snapshot_set.all(): + yield from snap.to_jsonl(seen=seen, archiveresult=archiveresult, process=process, binary=binary, machine=machine, iface=iface, **kwargs) + + @classmethod + def from_jsonl(cls, records, overrides: dict = None) -> list['Crawl']: + """ + Create/update Crawls from an iterable of JSONL records. + Filters to only records with type='Crawl' (or no type). + + Args: + records: Iterable of dicts (JSONL records) + overrides: Dict of field overrides (e.g., created_by_id) + + Returns: + List of Crawl instances (skips None results) + """ + results = [] + for record in records: + record_type = record.get('type', cls.JSONL_TYPE) + if record_type == cls.JSONL_TYPE: + instance = cls.from_json(record, overrides=overrides) + if instance: + results.append(instance) + return results + + @staticmethod + def from_json(record: dict, overrides: dict = None) -> 'Crawl | None': + """ + Create or get a single Crawl from a JSON record dict. Args: record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label' diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 6485f2c0..2a506e9b 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -1176,7 +1176,9 @@ def create_model_record(record: Dict[str, Any]) -> Any: def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]: """ Process JSONL records from hook output. - Dispatches to Model.from_jsonl() for each record type. + + Uses Model.from_jsonl() which automatically filters by JSONL_TYPE. + Each model only processes records matching its type. 
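+
+    Illustrative example, assuming a hook emitted one Snapshot and one Tag
+    record, both are created successfully, and overrides carries the calling
+    snapshot:
+
+        stats = process_hook_records(result['records'], overrides={'snapshot': snapshot})
+        # stats == {'Snapshot': 1, 'Tag': 1, 'Binary': 0, 'Machine': 0}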
Args: records: List of JSONL record dicts from result['records'] @@ -1185,54 +1187,26 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any Returns: Dict with counts by record type """ - stats = {} + from archivebox.core.models import Snapshot, Tag + from archivebox.machine.models import Binary, Machine + overrides = overrides or {} - for record in records: - record_type = record.get('type') - if not record_type: - continue + # Filter out ArchiveResult records (they update the calling AR, not create new ones) + filtered_records = [r for r in records if r.get('type') != 'ArchiveResult'] - # Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones) - if record_type == 'ArchiveResult': - continue + # Each model's from_jsonl() filters to only its own type + snapshots = Snapshot.from_jsonl(filtered_records, overrides) + tags = Tag.from_jsonl(filtered_records, overrides) + binaries = Binary.from_jsonl(filtered_records, overrides) + machines = Machine.from_jsonl(filtered_records, overrides) - try: - # Dispatch to appropriate model's from_jsonl() method - if record_type == 'Snapshot': - from archivebox.core.models import Snapshot - obj = Snapshot.from_jsonl(record.copy(), overrides) - if obj: - stats['Snapshot'] = stats.get('Snapshot', 0) + 1 - - elif record_type == 'Tag': - from archivebox.core.models import Tag - obj = Tag.from_jsonl(record.copy(), overrides) - if obj: - stats['Tag'] = stats.get('Tag', 0) + 1 - - elif record_type == 'Binary': - from archivebox.machine.models import Binary - obj = Binary.from_jsonl(record.copy(), overrides) - if obj: - stats['Binary'] = stats.get('Binary', 0) + 1 - - elif record_type == 'Machine': - from archivebox.machine.models import Machine - obj = Machine.from_jsonl(record.copy(), overrides) - if obj: - stats['Machine'] = stats.get('Machine', 0) + 1 - - else: - import sys - print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr) - - except Exception as e: - import sys - print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr) - continue - - return stats + return { + 'Snapshot': len(snapshots), + 'Tag': len(tags), + 'Binary': len(binaries), + 'Machine': len(machines), + } def process_is_alive(pid_file: Path) -> bool: diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index 2d15bf1f..c0659afd 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -1,6 +1,7 @@ __package__ = 'archivebox.machine' import socket +from typing import Iterator, Set from archivebox.uuid_compat import uuid7 from datetime import timedelta @@ -29,6 +30,8 @@ class MachineManager(models.Manager): class Machine(ModelWithHealthStats): + JSONL_TYPE = 'Machine' + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) @@ -69,13 +72,35 @@ class Machine(ModelWithHealthStats): ) return _CURRENT_MACHINE - @staticmethod - def from_jsonl(record: dict, overrides: dict = None): + @classmethod + def from_jsonl(cls, records, overrides: dict = None) -> list['Machine']: """ - Update Machine config from JSONL record. + Update Machine configs from an iterable of JSONL records. + Filters to only records with type='Machine'. Args: - record: JSONL record with '_method': 'update', 'key': '...', 'value': '...' 
+ records: Iterable of dicts (JSONL records) + overrides: Not used + + Returns: + List of Machine instances (skips None results) + """ + results = [] + for record in records: + record_type = record.get('type', cls.JSONL_TYPE) + if record_type == cls.JSONL_TYPE: + instance = cls.from_json(record, overrides=overrides) + if instance: + results.append(instance) + return results + + @staticmethod + def from_json(record: dict, overrides: dict = None) -> 'Machine | None': + """ + Update a single Machine config from a JSON record dict. + + Args: + record: Dict with '_method': 'update', 'key': '...', 'value': '...' overrides: Not used Returns: @@ -94,6 +119,44 @@ class Machine(ModelWithHealthStats): return machine return None + def to_json(self) -> dict: + """ + Convert Machine model instance to a JSON-serializable dict. + """ + from archivebox.config import VERSION + return { + 'type': self.JSONL_TYPE, + 'schema_version': VERSION, + 'id': str(self.id), + 'guid': self.guid, + 'hostname': self.hostname, + 'hw_in_docker': self.hw_in_docker, + 'hw_in_vm': self.hw_in_vm, + 'os_arch': self.os_arch, + 'os_family': self.os_family, + 'os_platform': self.os_platform, + 'os_release': self.os_release, + 'created_at': self.created_at.isoformat() if self.created_at else None, + } + + def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]: + """ + Yield this Machine as a JSON record. + + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + **kwargs: Passed to children (none for Machine, leaf node) + + Yields: + dict: JSON-serializable record for this machine + """ + if seen is not None: + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + yield self.to_json() + class NetworkInterfaceManager(models.Manager): def current(self) -> 'NetworkInterface': @@ -101,6 +164,8 @@ class NetworkInterfaceManager(models.Manager): class NetworkInterface(ModelWithHealthStats): + JSONL_TYPE = 'NetworkInterface' + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) @@ -139,6 +204,46 @@ class NetworkInterface(ModelWithHealthStats): ) return _CURRENT_INTERFACE + def to_json(self) -> dict: + """ + Convert NetworkInterface model instance to a JSON-serializable dict. + """ + from archivebox.config import VERSION + return { + 'type': self.JSONL_TYPE, + 'schema_version': VERSION, + 'id': str(self.id), + 'machine_id': str(self.machine_id), + 'hostname': self.hostname, + 'iface': self.iface, + 'ip_public': self.ip_public, + 'ip_local': self.ip_local, + 'mac_address': self.mac_address, + 'dns_server': self.dns_server, + 'isp': self.isp, + 'city': self.city, + 'region': self.region, + 'country': self.country, + 'created_at': self.created_at.isoformat() if self.created_at else None, + } + + def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]: + """ + Yield this NetworkInterface as a JSON record. 
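+
+        Illustrative sketch of the seen-based deduplication shared by all
+        to_jsonl() methods, assuming a NetworkInterface instance named iface:
+
+            seen = set()
+            records = list(iface.to_jsonl(seen=seen)) + list(iface.to_jsonl(seen=seen))
+            # len(records) == 1, the second call skips the already-emitted record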
+ + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + **kwargs: Passed to children (none for NetworkInterface, leaf node) + + Yields: + dict: JSON-serializable record for this network interface + """ + if seen is not None: + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + yield self.to_json() class BinaryManager(models.Manager): @@ -165,7 +270,7 @@ class BinaryManager(models.Manager): class Binary(ModelWithHealthStats): """ - Tracks an binary on a specific machine. + Tracks a binary on a specific machine. Follows the unified state machine pattern: - queued: Binary needs to be installed @@ -176,6 +281,7 @@ class Binary(ModelWithHealthStats): State machine calls run() which executes on_Binary__install_* hooks to install the binary using the specified providers. """ + JSONL_TYPE = 'Binary' class StatusChoices(models.TextChoices): QUEUED = 'queued', 'Queued' @@ -242,13 +348,13 @@ class Binary(ModelWithHealthStats): 'is_valid': self.is_valid, } - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Binary model instance to a JSONL record. + Convert Binary model instance to a JSON-serializable dict. """ from archivebox.config import VERSION return { - 'type': 'Binary', + 'type': self.JSONL_TYPE, 'schema_version': VERSION, 'id': str(self.id), 'machine_id': str(self.machine_id), @@ -260,17 +366,57 @@ class Binary(ModelWithHealthStats): 'status': self.status, } - @staticmethod - def from_jsonl(record: dict, overrides: dict = None): + def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]: """ - Create/update Binary from JSONL record. + Yield this Binary as a JSON record. + + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + **kwargs: Passed to children (none for Binary, leaf node) + + Yields: + dict: JSON-serializable record for this binary + """ + if seen is not None: + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + yield self.to_json() + + @classmethod + def from_jsonl(cls, records, overrides: dict = None) -> list['Binary']: + """ + Create/update Binaries from an iterable of JSONL records. + Filters to only records with type='Binary'. + + Args: + records: Iterable of dicts (JSONL records) + overrides: Not used + + Returns: + List of Binary instances (skips None results) + """ + results = [] + for record in records: + record_type = record.get('type', cls.JSONL_TYPE) + if record_type == cls.JSONL_TYPE: + instance = cls.from_json(record, overrides=overrides) + if instance: + results.append(instance) + return results + + @staticmethod + def from_json(record: dict, overrides: dict = None) -> 'Binary | None': + """ + Create/update a single Binary from a JSON record dict. Handles two cases: 1. From binaries.jsonl: creates queued binary with name, binproviders, overrides 2. From hook output: updates binary with abspath, version, sha256, binprovider Args: - record: JSONL record with 'name' and either: + record: Dict with 'name' and either: - 'binproviders', 'overrides' (from binaries.jsonl) - 'abspath', 'version', 'sha256', 'binprovider' (from hook output) overrides: Not used @@ -494,6 +640,7 @@ class Process(ModelWithHealthStats): State machine calls launch() to spawn the process and monitors its lifecycle. 
""" + JSONL_TYPE = 'Process' class StatusChoices(models.TextChoices): QUEUED = 'queued', 'Queued' @@ -624,13 +771,13 @@ class Process(ModelWithHealthStats): return self.archiveresult.hook_name return '' - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Process model instance to a JSONL record. + Convert Process model instance to a JSON-serializable dict. """ from archivebox.config import VERSION record = { - 'type': 'Process', + 'type': self.JSONL_TYPE, 'schema_version': VERSION, 'id': str(self.id), 'machine_id': str(self.machine_id), @@ -650,6 +797,37 @@ class Process(ModelWithHealthStats): record['timeout'] = self.timeout return record + def to_jsonl(self, seen: Set[tuple] = None, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]: + """ + Yield this Process and optionally related objects as JSON records. + + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + binary: Include related Binary (default: True) + machine: Include related Machine (default: False) + iface: Include related NetworkInterface (default: False) + **kwargs: Passed to children + + Yields: + dict: JSON-serializable records + """ + if seen is None: + seen = set() + + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + + yield self.to_json() + + if binary and self.binary: + yield from self.binary.to_jsonl(seen=seen, **kwargs) + if machine and self.machine: + yield from self.machine.to_jsonl(seen=seen, **kwargs) + if iface and self.iface: + yield from self.iface.to_jsonl(seen=seen, **kwargs) + def update_and_requeue(self, **kwargs): """ Update process fields and requeue for worker state machine. diff --git a/archivebox/misc/jsonl.py b/archivebox/misc/jsonl.py index 1e555a0a..df1163ab 100644 --- a/archivebox/misc/jsonl.py +++ b/archivebox/misc/jsonl.py @@ -24,7 +24,7 @@ __package__ = 'archivebox.misc' import sys import json -from typing import Iterator, Dict, Any, Optional, TextIO, Callable +from typing import Iterator, Dict, Any, Optional, TextIO from pathlib import Path @@ -150,36 +150,3 @@ def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] = count += 1 return count - -def filter_by_type(records: Iterator[Dict[str, Any]], record_type: str) -> Iterator[Dict[str, Any]]: - """ - Filter records by type. - """ - for record in records: - if record.get('type') == record_type: - yield record - - -def process_records( - records: Iterator[Dict[str, Any]], - handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]] -) -> Iterator[Dict[str, Any]]: - """ - Process records through type-specific handlers. - - Args: - records: Input record iterator - handlers: Dict mapping type names to handler functions - Handlers return output records or None to skip - - Yields output records from handlers. 
- """ - for record in records: - record_type = record.get('type') - handler = handlers.get(record_type) - if handler: - result = handler(record) - if result: - yield result - - diff --git a/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py b/archivebox/plugins/chrome/on_Crawl__00_install_puppeteer_chromium.py similarity index 68% rename from archivebox/plugins/chrome/on_Crawl__00_chrome_install.py rename to archivebox/plugins/chrome/on_Crawl__00_install_puppeteer_chromium.py index 4c6bbbdd..6730333f 100644 --- a/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py +++ b/archivebox/plugins/chrome/on_Crawl__00_install_puppeteer_chromium.py @@ -3,7 +3,12 @@ Install hook for Chrome/Chromium and puppeteer-core. Runs at crawl start to install/find Chromium and puppeteer-core. -Outputs JSONL for Binary and Machine config updates. +Also validates config and computes derived values. + +Outputs: + - JSONL for Binary and Machine config updates + - COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env + Respects CHROME_BINARY env var for custom binary paths. Uses `npx @puppeteer/browsers install chromium@latest` and parses output. @@ -19,6 +24,28 @@ import subprocess from pathlib import Path +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def detect_docker() -> bool: + """Detect if running inside Docker container.""" + return ( + os.path.exists('/.dockerenv') or + os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or + os.path.exists('/run/.containerenv') + ) + + def get_chrome_version(binary_path: str) -> str | None: """Get Chrome/Chromium version string.""" try: @@ -131,13 +158,41 @@ def install_chromium() -> dict | None: def main(): + warnings = [] + errors = [] + computed = {} + # Install puppeteer-core if NODE_MODULES_DIR is set install_puppeteer_core() + # Check if Chrome is enabled + chrome_enabled = get_env_bool('CHROME_ENABLED', True) + + # Detect Docker and adjust sandbox + in_docker = detect_docker() + computed['IN_DOCKER'] = str(in_docker).lower() + + chrome_sandbox = get_env_bool('CHROME_SANDBOX', True) + if in_docker and chrome_sandbox: + warnings.append( + "Running in Docker with CHROME_SANDBOX=true. " + "Chrome may fail to start. Consider setting CHROME_SANDBOX=false." 
+ ) + # Auto-disable sandbox in Docker unless explicitly set + if not get_env('CHROME_SANDBOX'): + computed['CHROME_SANDBOX'] = 'false' + + # Check Node.js availability + node_binary = get_env('NODE_BINARY', 'node') + computed['NODE_BINARY'] = node_binary + # Check if CHROME_BINARY is already set and valid - configured_binary = os.environ.get('CHROME_BINARY', '').strip() + configured_binary = get_env('CHROME_BINARY', '') if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK): version = get_chrome_version(configured_binary) + computed['CHROME_BINARY'] = configured_binary + computed['CHROME_VERSION'] = version or 'unknown' + print(json.dumps({ 'type': 'Binary', 'name': 'chromium', @@ -145,12 +200,22 @@ def main(): 'version': version, 'binprovider': 'env', })) + + # Output computed values + for key, value in computed.items(): + print(f"COMPUTED:{key}={value}") + for warning in warnings: + print(f"WARNING:{warning}", file=sys.stderr) + sys.exit(0) # Install/find Chromium via puppeteer result = install_chromium() if result and result.get('abspath'): + computed['CHROME_BINARY'] = result['abspath'] + computed['CHROME_VERSION'] = result['version'] or 'unknown' + print(json.dumps({ 'type': 'Binary', 'name': result['name'], @@ -174,9 +239,25 @@ def main(): 'value': result['version'], })) + # Output computed values + for key, value in computed.items(): + print(f"COMPUTED:{key}={value}") + for warning in warnings: + print(f"WARNING:{warning}", file=sys.stderr) + sys.exit(0) else: - print("Chromium binary not found", file=sys.stderr) + errors.append("Chromium binary not found") + computed['CHROME_BINARY'] = '' + + # Output computed values and errors + for key, value in computed.items(): + print(f"COMPUTED:{key}={value}") + for warning in warnings: + print(f"WARNING:{warning}", file=sys.stderr) + for error in errors: + print(f"ERROR:{error}", file=sys.stderr) + sys.exit(1) diff --git a/archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py b/archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py deleted file mode 100644 index 7aa8639c..00000000 --- a/archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env python3 -""" -Validate and compute derived Chrome config values. - -This hook runs early in the Crawl lifecycle to: -1. Auto-detect Chrome binary location -2. Compute sandbox settings based on Docker detection -3. Validate binary availability and version -4. 
Set computed env vars for subsequent hooks - -Output: - - COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env - - Binary JSONL records to stdout when binaries are found -""" - -import json -import os -import sys - -from abx_pkg import Binary, EnvProvider - - -# Chrome binary search order -CHROME_BINARY_NAMES = [ - 'chromium', - 'chromium-browser', - 'google-chrome', - 'google-chrome-stable', - 'chrome', -] - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def detect_docker() -> bool: - """Detect if running inside Docker container.""" - return ( - os.path.exists('/.dockerenv') or - os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or - os.path.exists('/run/.containerenv') - ) - - -def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None: - """Find Chrome binary using abx-pkg, checking configured path first.""" - # Try configured binary first - if configured: - try: - binary = Binary(name=configured, binproviders=[provider]).load() - if binary.abspath: - return binary - except Exception: - pass - - # Search common names - for name in CHROME_BINARY_NAMES: - try: - binary = Binary(name=name, binproviders=[provider]).load() - if binary.abspath: - return binary - except Exception: - continue - - return None - - -def output_binary(binary: Binary, name: str): - """Output Binary JSONL record to stdout.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'env', - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - warnings = [] - errors = [] - computed = {} - - # Get config values - chrome_binary = get_env('CHROME_BINARY', 'chromium') - chrome_sandbox = get_env_bool('CHROME_SANDBOX', True) - screenshot_enabled = get_env_bool('SCREENSHOT_ENABLED', True) - pdf_enabled = get_env_bool('PDF_ENABLED', True) - dom_enabled = get_env_bool('DOM_ENABLED', True) - - # Compute USE_CHROME (derived from extractor enabled flags) - use_chrome = screenshot_enabled or pdf_enabled or dom_enabled - computed['USE_CHROME'] = str(use_chrome).lower() - - # Detect Docker and adjust sandbox - in_docker = detect_docker() - computed['IN_DOCKER'] = str(in_docker).lower() - - if in_docker and chrome_sandbox: - warnings.append( - "Running in Docker with CHROME_SANDBOX=true. " - "Chrome may fail to start. Consider setting CHROME_SANDBOX=false." - ) - # Auto-disable sandbox in Docker unless explicitly set - if not get_env('CHROME_SANDBOX'): - computed['CHROME_SANDBOX'] = 'false' - - # Find Chrome binary using abx-pkg - provider = EnvProvider() - if use_chrome: - chrome = find_chrome_binary(chrome_binary, provider) - if not chrome or not chrome.abspath: - errors.append( - f"Chrome binary not found (tried: {chrome_binary}). " - "Install Chrome/Chromium or set CHROME_BINARY path." 
- ) - computed['CHROME_BINARY'] = '' - else: - computed['CHROME_BINARY'] = str(chrome.abspath) - computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown' - - # Output Binary JSONL record for Chrome - output_binary(chrome, name='chrome') - - # Check Node.js for Puppeteer - node_binary_name = get_env('NODE_BINARY', 'node') - try: - node = Binary(name=node_binary_name, binproviders=[provider]).load() - node_path = str(node.abspath) if node.abspath else '' - except Exception: - node = None - node_path = '' - - if use_chrome and not node_path: - errors.append( - f"Node.js not found (tried: {node_binary_name}). " - "Install Node.js or set NODE_BINARY path for Puppeteer." - ) - else: - computed['NODE_BINARY'] = node_path - if node and node.abspath: - # Output Binary JSONL record for Node - output_binary(node, name='node') - - # Output computed values - for key, value in computed.items(): - print(f"COMPUTED:{key}={value}") - - for warning in warnings: - print(f"WARNING:{warning}", file=sys.stderr) - - for error in errors: - print(f"ERROR:{error}", file=sys.stderr) - - sys.exit(1 if errors else 0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js similarity index 98% rename from archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js rename to archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js index c2d62775..d025be81 100644 --- a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js +++ b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js @@ -9,7 +9,7 @@ * --load-extension and --disable-extensions-except flags. * * Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id= --source-url= - * Output: Creates chrome/ directory under crawl output dir with: + * Output: Writes to current directory (executor creates chrome/ dir): * - cdp_url.txt: WebSocket URL for CDP connection * - chrome.pid: Chromium process ID (for cleanup) * - port.txt: Debug port number @@ -42,7 +42,7 @@ const { // Extractor metadata const PLUGIN_NAME = 'chrome_launch'; -const OUTPUT_DIR = 'chrome'; +const OUTPUT_DIR = '.'; // Global state for cleanup let chromePid = null; diff --git a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__20_install_istilldontcareaboutcookies_extension.js similarity index 100% rename from archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js rename to archivebox/plugins/istilldontcareaboutcookies/on_Crawl__20_install_istilldontcareaboutcookies_extension.js diff --git a/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js b/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js deleted file mode 100755 index 7637bf98..00000000 --- a/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env node -/** - * SingleFile Extension Plugin - * - * Installs and uses the SingleFile Chrome extension for archiving complete web pages. - * Falls back to single-file-cli if the extension is not available. 
- * - * Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle - * - * Priority: 04 (early) - Must install before Chrome session starts at Crawl level - * Hook: on_Crawl (runs once per crawl, not per snapshot) - * - * This extension automatically: - * - Saves complete web pages as single HTML files - * - Inlines all resources (CSS, JS, images, fonts) - * - Preserves page fidelity better than wget/curl - * - Works with SPAs and dynamically loaded content - */ - -const path = require('path'); -const fs = require('fs'); -const { promisify } = require('util'); -const { exec } = require('child_process'); - -const execAsync = promisify(exec); - -// Import extension utilities -const extensionUtils = require('../chrome/chrome_utils.js'); - -// Extension metadata -const EXTENSION = { - webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', - name: 'singlefile', -}; - -// Get extensions directory from environment or use default -const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || - path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions'); - -const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR || - path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads'); - -const OUTPUT_DIR = '.'; -const OUTPUT_FILE = 'singlefile.html'; - -/** - * Install the SingleFile extension - */ -async function installSinglefileExtension() { - console.log('[*] Installing SingleFile extension...'); - - // Install the extension - const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR); - - if (!extension) { - console.error('[❌] Failed to install SingleFile extension'); - return null; - } - - console.log('[+] SingleFile extension installed'); - console.log('[+] Web pages will be saved as single HTML files'); - - return extension; -} - -/** - * Wait for a specified amount of time - */ -function wait(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); -} - -/** - * Save a page using the SingleFile extension - * - * @param {Object} page - Puppeteer page object - * @param {Object} extension - Extension metadata with dispatchAction method - * @param {Object} options - Additional options - * @returns {Promise} - Path to saved file or null on failure - */ -async function saveSinglefileWithExtension(page, extension, options = {}) { - if (!extension || !extension.version) { - throw new Error('SingleFile extension not found or not loaded'); - } - - const url = await page.url(); - - // Check for unsupported URL schemes - const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob']; - const scheme = url.split(':')[0]; - if (URL_SCHEMES_IGNORED.includes(scheme)) { - console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`); - return null; - } - - // Ensure downloads directory exists - await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true }); - - // Get list of existing files to ignore - const files_before = new Set( - (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)) - .filter(fn => fn.endsWith('.html')) - ); - - // Output directory is current directory (hook already runs in output dir) - const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); - - console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`); - - // Bring page to front (extension action button acts on foreground tab) - await page.bringToFront(); - - // Trigger the extension's action (toolbar button 
click) - await extension.dispatchAction(); - - // Wait for file to appear in downloads directory - const check_delay = 3000; // 3 seconds - const max_tries = 10; - let files_new = []; - - for (let attempt = 0; attempt < max_tries; attempt++) { - await wait(check_delay); - - const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)) - .filter(fn => fn.endsWith('.html')); - - files_new = files_after.filter(file => !files_before.has(file)); - - if (files_new.length === 0) { - continue; - } - - // Find the matching file by checking if it contains the URL in the HTML header - for (const file of files_new) { - const dl_path = path.join(CHROME_DOWNLOADS_DIR, file); - const dl_text = await fs.promises.readFile(dl_path, 'utf-8'); - const dl_header = dl_text.split('meta charset')[0]; - - if (dl_header.includes(`url: ${url}`)) { - console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`); - await fs.promises.rename(dl_path, out_path); - return out_path; - } - } - } - - console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`); - console.warn(`[⚠️] New files found: ${files_new.join(', ')}`); - return null; -} - -/** - * Save a page using single-file-cli (fallback method) - * - * @param {string} url - URL to archive - * @param {Object} options - Additional options - * @returns {Promise} - Path to saved file or null on failure - */ -async function saveSinglefileWithCLI(url, options = {}) { - console.log('[*] Falling back to single-file-cli...'); - - // Find single-file binary - let binary = null; - try { - const { stdout } = await execAsync('which single-file'); - binary = stdout.trim(); - } catch (err) { - console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli'); - return null; - } - - // Output directory is current directory (hook already runs in output dir) - const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); - - // Build command - const cmd = [ - binary, - '--browser-headless', - url, - out_path, - ]; - - // Add optional args - if (options.userAgent) { - cmd.splice(2, 0, '--browser-user-agent', options.userAgent); - } - if (options.cookiesFile && fs.existsSync(options.cookiesFile)) { - cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile); - } - if (options.ignoreSSL) { - cmd.splice(2, 0, '--browser-ignore-insecure-certs'); - } - - // Execute - try { - const timeout = options.timeout || 120000; - await execAsync(cmd.join(' '), { timeout }); - - if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) { - console.log(`[+] SingleFile saved via CLI: ${out_path}`); - return out_path; - } - - console.error('[❌] SingleFile CLI completed but no output file found'); - return null; - } catch (err) { - console.error(`[❌] SingleFile CLI error: ${err.message}`); - return null; - } -} - -/** - * Main entry point - install extension before archiving - */ -async function main() { - // Check if extension is already cached - const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json'); - - if (fs.existsSync(cacheFile)) { - try { - const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8')); - const manifestPath = path.join(cached.unpacked_path, 'manifest.json'); - - if (fs.existsSync(manifestPath)) { - console.log('[*] SingleFile extension already installed (using cache)'); - return cached; - } - } catch (e) { - // Cache file corrupted, re-install - console.warn('[⚠️] Extension cache corrupted, re-installing...'); - } - } - - // Install 
extension - const extension = await installSinglefileExtension(); - - // Export extension metadata for chrome plugin to load - if (extension) { - // Write extension info to a cache file that chrome plugin can read - await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true }); - await fs.promises.writeFile( - cacheFile, - JSON.stringify(extension, null, 2) - ); - console.log(`[+] Extension metadata written to ${cacheFile}`); - } - - return extension; -} - -// Export functions for use by other plugins -module.exports = { - EXTENSION, - installSinglefileExtension, - saveSinglefileWithExtension, - saveSinglefileWithCLI, -}; - -// Run if executed directly -if (require.main === module) { - main().then(() => { - console.log('[✓] SingleFile extension setup complete'); - process.exit(0); - }).catch(err => { - console.error('[❌] SingleFile extension setup failed:', err); - process.exit(1); - }); -} diff --git a/archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js b/archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js new file mode 100755 index 00000000..59bbda46 --- /dev/null +++ b/archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js @@ -0,0 +1,281 @@ +#!/usr/bin/env node +/** + * SingleFile Extension Plugin + * + * DISABLED: Extension functionality commented out - using single-file-cli only + * + * Installs and uses the SingleFile Chrome extension for archiving complete web pages. + * Falls back to single-file-cli if the extension is not available. + * + * Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle + * + * Priority: 04 (early) - Must install before Chrome session starts at Crawl level + * Hook: on_Crawl (runs once per crawl, not per snapshot) + * + * This extension automatically: + * - Saves complete web pages as single HTML files + * - Inlines all resources (CSS, JS, images, fonts) + * - Preserves page fidelity better than wget/curl + * - Works with SPAs and dynamically loaded content + */ + +const path = require('path'); +const fs = require('fs'); +const { promisify } = require('util'); +const { exec } = require('child_process'); + +const execAsync = promisify(exec); + +// DISABLED: Extension functionality - using single-file-cli only +// // Import extension utilities +// const extensionUtils = require('../chrome/chrome_utils.js'); + +// // Extension metadata +// const EXTENSION = { +// webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', +// name: 'singlefile', +// }; + +// // Get extensions directory from environment or use default +// const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || +// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions'); + +// const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR || +// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads'); + +const OUTPUT_DIR = '.'; +const OUTPUT_FILE = 'singlefile.html'; + +// DISABLED: Extension functionality - using single-file-cli only +// /** +// * Install the SingleFile extension +// */ +// async function installSinglefileExtension() { +// console.log('[*] Installing SingleFile extension...'); + +// // Install the extension +// const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR); + +// if (!extension) { +// console.error('[❌] Failed to install SingleFile extension'); +// return null; +// } + +// console.log('[+] SingleFile extension installed'); +// 
console.log('[+] Web pages will be saved as single HTML files'); + +// return extension; +// } + +// /** +// * Wait for a specified amount of time +// */ +// function wait(ms) { +// return new Promise(resolve => setTimeout(resolve, ms)); +// } + +// /** +// * Save a page using the SingleFile extension +// * +// * @param {Object} page - Puppeteer page object +// * @param {Object} extension - Extension metadata with dispatchAction method +// * @param {Object} options - Additional options +// * @returns {Promise} - Path to saved file or null on failure +// */ +// async function saveSinglefileWithExtension(page, extension, options = {}) { +// if (!extension || !extension.version) { +// throw new Error('SingleFile extension not found or not loaded'); +// } + +// const url = await page.url(); + +// // Check for unsupported URL schemes +// const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob']; +// const scheme = url.split(':')[0]; +// if (URL_SCHEMES_IGNORED.includes(scheme)) { +// console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`); +// return null; +// } + +// // Ensure downloads directory exists +// await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true }); + +// // Get list of existing files to ignore +// const files_before = new Set( +// (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)) +// .filter(fn => fn.endsWith('.html')) +// ); + +// // Output directory is current directory (hook already runs in output dir) +// const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); + +// console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`); + +// // Bring page to front (extension action button acts on foreground tab) +// await page.bringToFront(); + +// // Trigger the extension's action (toolbar button click) +// await extension.dispatchAction(); + +// // Wait for file to appear in downloads directory +// const check_delay = 3000; // 3 seconds +// const max_tries = 10; +// let files_new = []; + +// for (let attempt = 0; attempt < max_tries; attempt++) { +// await wait(check_delay); + +// const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)) +// .filter(fn => fn.endsWith('.html')); + +// files_new = files_after.filter(file => !files_before.has(file)); + +// if (files_new.length === 0) { +// continue; +// } + +// // Find the matching file by checking if it contains the URL in the HTML header +// for (const file of files_new) { +// const dl_path = path.join(CHROME_DOWNLOADS_DIR, file); +// const dl_text = await fs.promises.readFile(dl_path, 'utf-8'); +// const dl_header = dl_text.split('meta charset')[0]; + +// if (dl_header.includes(`url: ${url}`)) { +// console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`); +// await fs.promises.rename(dl_path, out_path); +// return out_path; +// } +// } +// } + +// console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`); +// console.warn(`[⚠️] New files found: ${files_new.join(', ')}`); +// return null; +// } + +/** + * Save a page using single-file-cli (fallback method) + * + * @param {string} url - URL to archive + * @param {Object} options - Additional options + * @returns {Promise} - Path to saved file or null on failure + */ +async function saveSinglefileWithCLI(url, options = {}) { + console.log('[*] Falling back to single-file-cli...'); + + // Find single-file binary + let binary = null; + try { + const { stdout } = await execAsync('which single-file'); + 
binary = stdout.trim(); + } catch (err) { + console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli'); + return null; + } + + // Output directory is current directory (hook already runs in output dir) + const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); + + // Build command + const cmd = [ + binary, + '--browser-headless', + url, + out_path, + ]; + + // Add optional args + if (options.userAgent) { + cmd.splice(2, 0, '--browser-user-agent', options.userAgent); + } + if (options.cookiesFile && fs.existsSync(options.cookiesFile)) { + cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile); + } + if (options.ignoreSSL) { + cmd.splice(2, 0, '--browser-ignore-insecure-certs'); + } + + // Execute + try { + const timeout = options.timeout || 120000; + await execAsync(cmd.join(' '), { timeout }); + + if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) { + console.log(`[+] SingleFile saved via CLI: ${out_path}`); + return out_path; + } + + console.error('[❌] SingleFile CLI completed but no output file found'); + return null; + } catch (err) { + console.error(`[❌] SingleFile CLI error: ${err.message}`); + return null; + } +} + +// DISABLED: Extension functionality - using single-file-cli only +// /** +// * Main entry point - install extension before archiving +// */ +// async function main() { +// // Check if extension is already cached +// const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json'); + +// if (fs.existsSync(cacheFile)) { +// try { +// const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8')); +// const manifestPath = path.join(cached.unpacked_path, 'manifest.json'); + +// if (fs.existsSync(manifestPath)) { +// console.log('[*] SingleFile extension already installed (using cache)'); +// return cached; +// } +// } catch (e) { +// // Cache file corrupted, re-install +// console.warn('[⚠️] Extension cache corrupted, re-installing...'); +// } +// } + +// // Install extension +// const extension = await installSinglefileExtension(); + +// // Export extension metadata for chrome plugin to load +// if (extension) { +// // Write extension info to a cache file that chrome plugin can read +// await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true }); +// await fs.promises.writeFile( +// cacheFile, +// JSON.stringify(extension, null, 2) +// ); +// console.log(`[+] Extension metadata written to ${cacheFile}`); +// } + +// return extension; +// } + +// Export functions for use by other plugins +module.exports = { + // DISABLED: Extension functionality - using single-file-cli only + // EXTENSION, + // installSinglefileExtension, + // saveSinglefileWithExtension, + saveSinglefileWithCLI, +}; + +// DISABLED: Extension functionality - using single-file-cli only +// // Run if executed directly +// if (require.main === module) { +// main().then(() => { +// console.log('[✓] SingleFile extension setup complete'); +// process.exit(0); +// }).catch(err => { +// console.error('[❌] SingleFile extension setup failed:', err); +// process.exit(1); +// }); +// } + +// No-op when run directly (extension install disabled) +if (require.main === module) { + console.log('[*] SingleFile extension install disabled - using single-file-cli only'); + process.exit(0); +} diff --git a/archivebox/plugins/singlefile/tests/test_singlefile.py b/archivebox/plugins/singlefile/tests/test_singlefile.py index aace617f..8d6d01b0 100644 --- a/archivebox/plugins/singlefile/tests/test_singlefile.py +++ b/archivebox/plugins/singlefile/tests/test_singlefile.py @@ 
-2,16 +2,15 @@ Integration tests for singlefile plugin Tests verify: -1. Hook script exists and has correct metadata -2. Extension installation and caching works -3. Chrome/node dependencies available -4. Hook can be executed successfully +1. Hook scripts exist with correct naming +2. CLI-based singlefile extraction works +3. Dependencies available via abx-pkg +4. Output contains valid HTML """ import json import os import subprocess -import sys import tempfile from pathlib import Path @@ -20,177 +19,63 @@ import pytest PLUGIN_DIR = Path(__file__).parent.parent PLUGINS_ROOT = PLUGIN_DIR.parent -INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_singlefile.*'), None) -NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py' +SNAPSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_singlefile.py'), None) TEST_URL = "https://example.com" -def test_install_script_exists(): - """Verify install script exists""" - assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}" +def test_snapshot_hook_exists(): + """Verify snapshot extraction hook exists""" + assert SNAPSHOT_HOOK is not None and SNAPSHOT_HOOK.exists(), f"Snapshot hook not found in {PLUGIN_DIR}" -def test_extension_metadata(): - """Test that SingleFile extension has correct metadata""" - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions") - - result = subprocess.run( - ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"], - capture_output=True, - text=True, - env=env - ) - - assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}" - - metadata = json.loads(result.stdout) - assert metadata["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle" - assert metadata["name"] == "singlefile" - - -def test_install_creates_cache(): - """Test that install creates extension cache""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - # Check output mentions installation - assert "SingleFile" in result.stdout or "singlefile" in result.stdout - - # Check cache file was created - cache_file = ext_dir / "singlefile.extension.json" - assert cache_file.exists(), "Cache file should be created" - - # Verify cache content - cache_data = json.loads(cache_file.read_text()) - assert cache_data["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle" - assert cache_data["name"] == "singlefile" - - -def test_install_twice_uses_cache(): - """Test that running install twice uses existing cache on second run""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - - # First install - downloads the extension - result1 = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - assert result1.returncode == 0, f"First install failed: {result1.stderr}" - - # Verify cache was created - cache_file = ext_dir / "singlefile.extension.json" - assert cache_file.exists(), "Cache file should exist after first install" - - # Second install - should use cache - result2 = subprocess.run( - ["node", 
str(INSTALL_SCRIPT)],
-            capture_output=True,
-            text=True,
-            env=env,
-            timeout=30
-        )
-        assert result2.returncode == 0, f"Second install failed: {result2.stderr}"
-
-        # Second run should be faster (uses cache) and mention cache
-        assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0
-
-
-def test_no_configuration_required():
-    """Test that SingleFile works without configuration"""
-    with tempfile.TemporaryDirectory() as tmpdir:
-        ext_dir = Path(tmpdir) / "chrome_extensions"
-        ext_dir.mkdir(parents=True)
-
-        env = os.environ.copy()
-        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
-        # No API keys needed
-
-        result = subprocess.run(
-            ["node", str(INSTALL_SCRIPT)],
-            capture_output=True,
-            text=True,
-            env=env,
-            timeout=60
-        )
-
-        # Should work without API keys
-        assert result.returncode == 0
-
-
-def test_priority_order():
-    """Test that singlefile has correct priority (04)"""
-    # Extract priority from filename
-    filename = INSTALL_SCRIPT.name
-    assert "04" in filename, "SingleFile should have priority 04"
-    assert filename.startswith("on_Crawl__04_"), "Should follow priority naming convention for Crawl hooks"
-
-
-def test_output_directory_structure():
-    """Test that plugin defines correct output structure"""
-    # Verify the script mentions singlefile output directory
-    script_content = INSTALL_SCRIPT.read_text()
-
-    # Should mention singlefile output directory
-    assert "singlefile" in script_content.lower()
-    # Should mention HTML output
-    assert ".html" in script_content or "html" in script_content.lower()
+def test_snapshot_hook_priority():
+    """Test that snapshot hook has correct priority (50)"""
+    filename = SNAPSHOT_HOOK.name
+    assert "50" in filename, "SingleFile snapshot hook should have priority 50"
+    assert filename.startswith("on_Snapshot__50_"), "Should follow priority naming convention"
 
 
 def test_verify_deps_with_abx_pkg():
-    """Verify dependencies are available via abx-pkg after hook installation."""
-    from abx_pkg import Binary, EnvProvider, BinProviderOverrides
+    """Verify dependencies are available via abx-pkg."""
+    from abx_pkg import Binary, EnvProvider
 
     EnvProvider.model_rebuild()
 
-    # Verify node is available (singlefile uses Chrome extension, needs Node)
+    # Verify node is available
     node_binary = Binary(name='node', binproviders=[EnvProvider()])
     node_loaded = node_binary.load()
     assert node_loaded and node_loaded.abspath, "Node.js required for singlefile plugin"
 
 
-def test_singlefile_hook_runs():
-    """Verify singlefile hook can be executed and completes."""
-    # Prerequisites checked by earlier test
-
+def test_singlefile_cli_archives_example_com():
+    """Test that singlefile CLI archives example.com and produces valid HTML."""
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
 
-        # Run singlefile extraction hook
+        env = os.environ.copy()
+        env['SINGLEFILE_ENABLED'] = 'true'
+
+        # Run singlefile snapshot hook
         result = subprocess.run(
-            ['node', str(INSTALL_SCRIPT), f'--url={TEST_URL}', '--snapshot-id=test789'],
+            ['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
             cwd=tmpdir,
             capture_output=True,
             text=True,
+            env=env,
             timeout=120
         )
 
-        # Hook should complete successfully (even if it just installs extension)
         assert result.returncode == 0, f"Hook execution failed: {result.stderr}"
 
-        # Verify extension installation happens
-        assert 'SingleFile extension' in result.stdout or result.returncode == 0, "Should install extension or complete"
+        # Verify output file exists
+        output_file = tmpdir / 'singlefile.html'
+        assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}"
+
+        # Verify it contains real HTML
+        html_content = output_file.read_text()
+        assert len(html_content) > 500, "Output file too small to be valid HTML"
+        assert '' in html_content or '
-    const captchaExt = extensions.find(ext => ext.name === 'captcha2');
+    const captchaExt = extensions.find(ext => ext.name === 'twocaptcha');
 
     if (!captchaExt) {
         console.error('[*] 2captcha extension not installed, skipping configuration');
@@ -236,7 +236,7 @@ async function main() {
     const snapshotId = args.snapshot_id;
 
     if (!url || !snapshotId) {
-        console.error('Usage: on_Snapshot__21_captcha2_config.js --url= --snapshot-id=');
+        console.error('Usage: on_Snapshot__21_twocaptcha_config.js --url= --snapshot-id=');
         process.exit(1);
     }
diff --git a/archivebox/plugins/captcha2/templates/icon.html b/archivebox/plugins/twocaptcha/templates/icon.html
similarity index 100%
rename from archivebox/plugins/captcha2/templates/icon.html
rename to archivebox/plugins/twocaptcha/templates/icon.html
diff --git a/archivebox/plugins/captcha2/tests/test_captcha2.py b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py
similarity index 90%
rename from archivebox/plugins/captcha2/tests/test_captcha2.py
rename to archivebox/plugins/twocaptcha/tests/test_twocaptcha.py
index bc08a072..ab4f4a4b 100644
--- a/archivebox/plugins/captcha2/tests/test_captcha2.py
+++ b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py
@@ -1,5 +1,5 @@
 """
-Unit tests for captcha2 plugin
+Unit tests for twocaptcha plugin
 
 Tests invoke the plugin hooks as external processes and verify outputs/side effects.
 """
@@ -14,8 +14,8 @@ import pytest
 
 PLUGIN_DIR = Path(__file__).parent.parent
-INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2.*'), None)
-CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2_config.*'), None)
+INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_twocaptcha_extension.*'), None)
+CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_configure_twocaptcha_extension_options.*'), None)
 
 
 def test_install_script_exists():
@@ -29,7 +29,7 @@ def test_config_script_exists():
 
 
 def test_extension_metadata():
-    """Test that captcha2 extension has correct metadata"""
+    """Test that twocaptcha extension has correct metadata"""
     with tempfile.TemporaryDirectory() as tmpdir:
         env = os.environ.copy()
         env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
@@ -46,7 +46,7 @@ def test_extension_metadata():
         metadata = json.loads(result.stdout)
 
         assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
-        assert metadata["name"] == "captcha2"
+        assert metadata["name"] == "twocaptcha"
 
 
 def test_install_creates_cache():
@@ -72,13 +72,13 @@ def test_install_creates_cache():
         assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout
 
         # Check cache file was created
-        cache_file = ext_dir / "captcha2.extension.json"
+        cache_file = ext_dir / "twocaptcha.extension.json"
         assert cache_file.exists(), "Cache file should be created"
 
         # Verify cache content
         cache_data = json.loads(cache_file.read_text())
         assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
-        assert cache_data["name"] == "captcha2"
+        assert cache_data["name"] == "twocaptcha"
         assert "unpacked_path" in cache_data
         assert "version" in cache_data
 
@@ -104,7 +104,7 @@ def test_install_twice_uses_cache():
         assert result1.returncode == 0, f"First install failed: {result1.stderr}"
 
         # Verify cache was created
-        cache_file = ext_dir / "captcha2.extension.json"
+        cache_file = ext_dir / "twocaptcha.extension.json"
         assert cache_file.exists(), "Cache file should exist after first install"
 
         # Second install - should use cache
@@ -175,7 +175,7 @@ def test_config_script_structure():
     script_content = CONFIG_SCRIPT.read_text()
 
     # Should mention configuration marker file
-    assert "CONFIG_MARKER" in script_content or "captcha2_configured" in script_content
+    assert "CONFIG_MARKER" in script_content or "twocaptcha_configured" in script_content
 
     # Should mention API key
     assert "API_KEY_2CAPTCHA" in script_content
diff --git a/archivebox/plugins/ublock/on_Crawl__03_ublock.js b/archivebox/plugins/ublock/on_Crawl__20_install_ublock_extension.js
similarity index 100%
rename from archivebox/plugins/ublock/on_Crawl__03_ublock.js
rename to archivebox/plugins/ublock/on_Crawl__20_install_ublock_extension.js
diff --git a/archivebox/plugins/wget/on_Crawl__10_wget_validate_config.py b/archivebox/plugins/wget/on_Crawl__10_install_wget.py
similarity index 100%
rename from archivebox/plugins/wget/on_Crawl__10_wget_validate_config.py
rename to archivebox/plugins/wget/on_Crawl__10_install_wget.py