From dd2302ad92fde449cc0c0c4860e0846e195c6fef Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Tue, 30 Dec 2025 16:12:53 -0800 Subject: [PATCH] new jsonl cli interface --- TODO_cli_refactor.md | 131 ++++++ archivebox.ts | 4 +- archivebox/cli/__init__.py | 31 +- archivebox/cli/archivebox_archiveresult.py | 365 ++++++++++++++++ archivebox/cli/archivebox_binary.py | 304 +++++++++++++ archivebox/cli/archivebox_crawl.py | 356 ++++++++++----- archivebox/cli/archivebox_extract.py | 265 ------------ archivebox/cli/archivebox_init.py | 2 +- archivebox/cli/archivebox_machine.py | 113 +++++ archivebox/cli/archivebox_orchestrator.py | 67 --- archivebox/cli/archivebox_process.py | 121 ++++++ archivebox/cli/archivebox_remove.py | 98 ----- archivebox/cli/archivebox_run.py | 155 +++++++ archivebox/cli/archivebox_search.py | 131 ------ archivebox/cli/archivebox_snapshot.py | 406 ++++++++++++------ archivebox/cli/archivebox_tag.py | 307 +++++++++++++ archivebox/cli/tests_piping.py | 73 ++-- archivebox/core/forms.py | 2 +- archivebox/core/models.py | 193 +++++++-- archivebox/crawls/models.py | 69 ++- archivebox/hooks.py | 64 +-- archivebox/machine/models.py | 208 ++++++++- archivebox/misc/jsonl.py | 35 +- ...n_Crawl__00_install_puppeteer_chromium.py} | 87 +++- .../on_Crawl__10_chrome_validate_config.py | 172 -------- ...bg.js => on_Crawl__30_chrome_launch.bg.js} | 4 +- ...l_istilldontcareaboutcookies_extension.js} | 0 .../singlefile/on_Crawl__04_singlefile.js | 268 ------------ ..._Crawl__20_install_singlefile_extension.js | 281 ++++++++++++ .../singlefile/tests/test_singlefile.py | 181 ++------ .../{captcha2 => twocaptcha}/config.json | 0 ...Crawl__20_install_twocaptcha_extension.js} | 4 +- ...configure_twocaptcha_extension_options.js} | 6 +- .../templates/icon.html | 0 .../tests/test_twocaptcha.py} | 18 +- ... 
on_Crawl__20_install_ublock_extension.js} | 0
 ...config.py => on_Crawl__10_install_wget.py} | 0
 37 files changed, 2919 insertions(+), 1602 deletions(-)
 create mode 100644 TODO_cli_refactor.md
 create mode 100644 archivebox/cli/archivebox_archiveresult.py
 create mode 100644 archivebox/cli/archivebox_binary.py
 delete mode 100644 archivebox/cli/archivebox_extract.py
 create mode 100644 archivebox/cli/archivebox_machine.py
 delete mode 100644 archivebox/cli/archivebox_orchestrator.py
 create mode 100644 archivebox/cli/archivebox_process.py
 delete mode 100644 archivebox/cli/archivebox_remove.py
 create mode 100644 archivebox/cli/archivebox_run.py
 delete mode 100644 archivebox/cli/archivebox_search.py
 create mode 100644 archivebox/cli/archivebox_tag.py
 rename archivebox/plugins/chrome/{on_Crawl__00_chrome_install.py => on_Crawl__00_install_puppeteer_chromium.py} (68%)
 delete mode 100644 archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py
 rename archivebox/plugins/chrome/{on_Crawl__20_chrome_launch.bg.js => on_Crawl__30_chrome_launch.bg.js} (98%)
 rename archivebox/plugins/istilldontcareaboutcookies/{on_Crawl__02_istilldontcareaboutcookies.js => on_Crawl__20_install_istilldontcareaboutcookies_extension.js} (100%)
 delete mode 100755 archivebox/plugins/singlefile/on_Crawl__04_singlefile.js
 create mode 100755 archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js
 rename archivebox/plugins/{captcha2 => twocaptcha}/config.json (100%)
 rename archivebox/plugins/{captcha2/on_Crawl__01_captcha2.js => twocaptcha/on_Crawl__20_install_twocaptcha_extension.js} (97%)
 rename archivebox/plugins/{captcha2/on_Crawl__11_captcha2_config.js => twocaptcha/on_Crawl__25_configure_twocaptcha_extension_options.js} (97%)
 rename archivebox/plugins/{captcha2 => twocaptcha}/templates/icon.html (100%)
 rename archivebox/plugins/{captcha2/tests/test_captcha2.py => twocaptcha/tests/test_twocaptcha.py} (90%)
 rename archivebox/plugins/ublock/{on_Crawl__03_ublock.js => on_Crawl__20_install_ublock_extension.js} (100%)
 rename archivebox/plugins/wget/{on_Crawl__10_wget_validate_config.py => on_Crawl__10_install_wget.py} (100%)

diff --git a/TODO_cli_refactor.md b/TODO_cli_refactor.md
new file mode 100644
index 00000000..0ce5e092
--- /dev/null
+++ b/TODO_cli_refactor.md
@@ -0,0 +1,131 @@
+# ArchiveBox CLI Refactor TODO
+
+## Design Decisions
+
+1. **Keep `archivebox add`** as high-level convenience command
+2. **Unified `archivebox run`** for processing (replaces per-model `run` and `orchestrator`)
+3. **Expose all models** including binary, process, machine
+4. **Clean break** from old command structure (no backward compatibility aliases)
+
+## Final Architecture
+
+```
+archivebox <model> <action> [args...] [--filters]
+archivebox run [stdin JSONL]
+```
+
+### Actions (4 per model):
+- `create` - Create records (from args, stdin, or JSONL), dedupes by indexed fields
+- `list` - Query records (with filters, returns JSONL)
+- `update` - Modify records (from stdin JSONL, PATCH semantics)
+- `delete` - Remove records (from stdin JSONL, requires --yes)
+
+### Unified Run Command:
+- `archivebox run` - Process queued work
+  - With stdin JSONL: Process piped records, exit when complete
+  - Without stdin (TTY): Run orchestrator in foreground until killed
+
+### Models (7 total):
+- `crawl` - Crawl jobs
+- `snapshot` - Individual archived pages
+- `archiveresult` - Plugin extraction results
+- `tag` - Tags/labels
+- `binary` - Detected binaries (chrome, wget, etc.)
+- `process` - Process execution records (read-only)
+- `machine` - Machine/host records (read-only)
+
+---
+
+## Implementation Checklist
+
+### Phase 1: Unified Run Command
+- [x] Create `archivebox/cli/archivebox_run.py` - unified processing command
+
+### Phase 2: Core Model Commands
+- [x] Refactor `archivebox/cli/archivebox_snapshot.py` to Click group with create|list|update|delete
+- [x] Refactor `archivebox/cli/archivebox_crawl.py` to Click group with create|list|update|delete
+- [x] Create `archivebox/cli/archivebox_archiveresult.py` with create|list|update|delete
+- [x] Create `archivebox/cli/archivebox_tag.py` with create|list|update|delete
+
+### Phase 3: System Model Commands
+- [x] Create `archivebox/cli/archivebox_binary.py` with create|list|update|delete
+- [x] Create `archivebox/cli/archivebox_process.py` with list only (read-only)
+- [x] Create `archivebox/cli/archivebox_machine.py` with list only (read-only)
+
+### Phase 4: Registry & Cleanup
+- [x] Update `archivebox/cli/__init__.py` command registry
+- [x] Delete `archivebox/cli/archivebox_extract.py`
+- [x] Delete `archivebox/cli/archivebox_remove.py`
+- [x] Delete `archivebox/cli/archivebox_search.py`
+- [x] Delete `archivebox/cli/archivebox_orchestrator.py`
+- [x] Update `archivebox/cli/archivebox_add.py` internals (no changes needed - uses models directly)
+- [x] Update `archivebox/cli/tests_piping.py`
+
+### Phase 5: Tests for New Commands
+- [ ] Add tests for `archivebox run` command
+- [ ] Add tests for `archivebox crawl create|list|update|delete`
+- [ ] Add tests for `archivebox snapshot create|list|update|delete`
+- [ ] Add tests for `archivebox archiveresult create|list|update|delete`
+- [ ] Add tests for `archivebox tag create|list|update|delete`
+- [ ] Add tests for `archivebox binary create|list|update|delete`
+- [ ] Add tests for `archivebox process list`
+- [ ] Add tests for `archivebox machine list`
+
+---
+
+## Usage Examples
+
+### Basic CRUD
+```bash
+# Create
+archivebox crawl create https://example.com https://foo.com --depth=1
+archivebox snapshot create https://example.com --tag=news
+
+# List with filters
+archivebox crawl list --status=queued
+archivebox snapshot list --url__icontains=example.com
+archivebox archiveresult list --status=failed --plugin=screenshot
+
+# Update (reads JSONL from stdin, applies changes)
+archivebox snapshot list --tag=old | archivebox snapshot update --tag=new
+
+# Delete (requires --yes)
+archivebox crawl list --url__icontains=example.com | archivebox crawl delete --yes
+```
+
+### Unified Run Command
+```bash
+# Run orchestrator in foreground (replaces `archivebox orchestrator`)
+archivebox run
+
+# Process specific records (pipe any JSONL type, exits when done)
+archivebox snapshot list --status=queued | archivebox run
+archivebox archiveresult list --status=failed | archivebox run
+archivebox crawl list --status=queued | archivebox run
+
+# Mixed types work too - run handles any JSONL
+cat mixed_records.jsonl | archivebox run
+```
+
+### Composed Workflows
+```bash
+# Full pipeline (replaces old `archivebox add`)
+archivebox crawl create https://example.com --status=queued \
+    | archivebox snapshot create --status=queued \
+    | archivebox archiveresult create --status=queued \
+    | archivebox run
+
+# Re-run failed extractions
+archivebox archiveresult list --status=failed | archivebox run
+
+# Delete all snapshots for a domain
+archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes
+```
+
+### Keep `archivebox add` as convenience
+```bash +# This remains the simple user-friendly interface: +archivebox add https://example.com --depth=1 --tag=news + +# Internally equivalent to the composed pipeline above +``` diff --git a/archivebox.ts b/archivebox.ts index bf27cac5..e21b549d 100644 --- a/archivebox.ts +++ b/archivebox.ts @@ -478,7 +478,7 @@ interface LoadedChromeExtension extends ChromeExtension { const CHROME_EXTENSIONS: LoadedChromeExtension[] = [ // Content access / unblocking / blocking plugins - {webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', name: 'captcha2'}, // https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer + {webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo', name: 'twocaptcha'}, // https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer {webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm', name: 'istilldontcareaboutcookies'}, {webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm', name: 'ublock'}, // {webstore_id: 'mlomiejdfkolichcflejclcbmpeaniij', name: 'ghostery'}, @@ -1123,7 +1123,7 @@ async function setup2CaptchaExtension({browser, extensions}) { try { // open a new tab to finish setting up the 2captcha extension manually using its extension options page page = await browser.newPage() - const { options_url } = extensions.filter(ext => ext.name === 'captcha2')[0] + const { options_url } = extensions.filter(ext => ext.name === 'twocaptcha')[0] await page.goto(options_url) await wait(2_500) await page.bringToFront() diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 5a33e11a..c0d35a54 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -27,36 +27,43 @@ class ArchiveBoxGroup(click.Group): 'init': 'archivebox.cli.archivebox_init.main', 'install': 'archivebox.cli.archivebox_install.main', } + # Model commands (CRUD operations via subcommands) + model_commands = { + 'crawl': 'archivebox.cli.archivebox_crawl.main', + 'snapshot': 'archivebox.cli.archivebox_snapshot.main', + 'archiveresult': 'archivebox.cli.archivebox_archiveresult.main', + 'tag': 'archivebox.cli.archivebox_tag.main', + 'binary': 'archivebox.cli.archivebox_binary.main', + 'process': 'archivebox.cli.archivebox_process.main', + 'machine': 'archivebox.cli.archivebox_machine.main', + } archive_commands = { + # High-level commands 'add': 'archivebox.cli.archivebox_add.main', - 'remove': 'archivebox.cli.archivebox_remove.main', + 'run': 'archivebox.cli.archivebox_run.main', 'update': 'archivebox.cli.archivebox_update.main', - 'search': 'archivebox.cli.archivebox_search.main', 'status': 'archivebox.cli.archivebox_status.main', 'config': 'archivebox.cli.archivebox_config.main', 'schedule': 'archivebox.cli.archivebox_schedule.main', 'server': 'archivebox.cli.archivebox_server.main', 'shell': 'archivebox.cli.archivebox_shell.main', 'manage': 'archivebox.cli.archivebox_manage.main', - # Worker/orchestrator commands - 'orchestrator': 'archivebox.cli.archivebox_orchestrator.main', + # Worker command 'worker': 'archivebox.cli.archivebox_worker.main', - # Task commands (called by workers as subprocesses) - 'crawl': 'archivebox.cli.archivebox_crawl.main', - 'snapshot': 'archivebox.cli.archivebox_snapshot.main', - 'extract': 'archivebox.cli.archivebox_extract.main', } all_subcommands = { **meta_commands, **setup_commands, + **model_commands, **archive_commands, } renamed_commands = { 'setup': 'install', - 'list': 'search', 'import': 'add', 'archive': 'add', - 'export': 'search', + # Old commands replaced by new model commands + 'orchestrator': 'run', + 'extract': 
'archiveresult', } @classmethod @@ -110,9 +117,9 @@ def cli(ctx, help=False): if help or ctx.invoked_subcommand is None: ctx.invoke(ctx.command.get_command(ctx, 'help')) - # if the subcommand is in the archive_commands dict and is not 'manage', + # if the subcommand is in archive_commands or model_commands, # then we need to set up the django environment and check that we're in a valid data folder - if subcommand in ArchiveBoxGroup.archive_commands: + if subcommand in ArchiveBoxGroup.archive_commands or subcommand in ArchiveBoxGroup.model_commands: # print('SETUP DJANGO AND CHECK DATA FOLDER') try: from archivebox.config.django import setup_django diff --git a/archivebox/cli/archivebox_archiveresult.py b/archivebox/cli/archivebox_archiveresult.py new file mode 100644 index 00000000..1f725a03 --- /dev/null +++ b/archivebox/cli/archivebox_archiveresult.py @@ -0,0 +1,365 @@ +#!/usr/bin/env python3 + +""" +archivebox archiveresult [args...] [--filters] + +Manage ArchiveResult records (plugin extraction results). + +Actions: + create - Create ArchiveResults for Snapshots (queue extractions) + list - List ArchiveResults as JSONL (with optional filters) + update - Update ArchiveResults from stdin JSONL + delete - Delete ArchiveResults from stdin JSONL + +Examples: + # Create ArchiveResults for snapshots (queue for extraction) + archivebox snapshot list --status=queued | archivebox archiveresult create + archivebox archiveresult create --plugin=screenshot --snapshot-id= + + # List with filters + archivebox archiveresult list --status=failed + archivebox archiveresult list --plugin=screenshot --status=succeeded + + # Update (reset failed extractions to queued) + archivebox archiveresult list --status=failed | archivebox archiveresult update --status=queued + + # Delete + archivebox archiveresult list --plugin=singlefile | archivebox archiveresult delete --yes + + # Re-run failed extractions + archivebox archiveresult list --status=failed | archivebox run +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox archiveresult' + +import sys +from typing import Optional + +import rich_click as click +from rich import print as rprint + + +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + + if limit: + queryset = queryset[:limit] + + return queryset + + +# ============================================================================= +# CREATE +# ============================================================================= + +def create_archiveresults( + snapshot_id: Optional[str] = None, + plugin: Optional[str] = None, + status: str = 'queued', +) -> int: + """ + Create ArchiveResults for Snapshots. + + Reads Snapshot records from stdin and creates ArchiveResult entries. + If --plugin is specified, only creates results for that plugin. + Otherwise, creates results for all pending plugins. 
+ + Exit codes: + 0: Success + 1: Failure + """ + from django.utils import timezone + + from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT + from archivebox.core.models import Snapshot, ArchiveResult + + is_tty = sys.stdout.isatty() + + # If snapshot_id provided directly, use that + if snapshot_id: + try: + snapshots = [Snapshot.objects.get(id=snapshot_id)] + except Snapshot.DoesNotExist: + rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr) + return 1 + else: + # Read from stdin + records = list(read_stdin()) + if not records: + rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + # Filter to only Snapshot records + snapshot_ids = [] + for record in records: + if record.get('type') == TYPE_SNAPSHOT: + if record.get('id'): + snapshot_ids.append(record['id']) + elif record.get('id'): + # Assume it's a snapshot ID if no type specified + snapshot_ids.append(record['id']) + + if not snapshot_ids: + rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr) + return 1 + + snapshots = list(Snapshot.objects.filter(id__in=snapshot_ids)) + + if not snapshots: + rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr) + return 1 + + created_count = 0 + for snapshot in snapshots: + if plugin: + # Create for specific plugin only + result, created = ArchiveResult.objects.get_or_create( + snapshot=snapshot, + plugin=plugin, + defaults={ + 'status': status, + 'retry_at': timezone.now(), + } + ) + if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]: + # Reset for retry + result.status = status + result.retry_at = timezone.now() + result.save() + + if not is_tty: + write_record(result.to_json()) + created_count += 1 + else: + # Create all pending plugins + snapshot.create_pending_archiveresults() + for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED): + if not is_tty: + write_record(result.to_json()) + created_count += 1 + + rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# LIST +# ============================================================================= + +def list_archiveresults( + status: Optional[str] = None, + plugin: Optional[str] = None, + snapshot_id: Optional[str] = None, + limit: Optional[int] = None, +) -> int: + """ + List ArchiveResults as JSONL with optional filters. 
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.core.models import ArchiveResult + + is_tty = sys.stdout.isatty() + + queryset = ArchiveResult.objects.all().order_by('-start_ts') + + # Apply filters + filter_kwargs = { + 'status': status, + 'plugin': plugin, + 'snapshot_id': snapshot_id, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for result in queryset: + if is_tty: + status_color = { + 'queued': 'yellow', + 'started': 'blue', + 'succeeded': 'green', + 'failed': 'red', + 'skipped': 'dim', + 'backoff': 'magenta', + }.get(result.status, 'dim') + rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}') + else: + write_record(result.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} archive results[/dim]', file=sys.stderr) + return 0 + + +# ============================================================================= +# UPDATE +# ============================================================================= + +def update_archiveresults( + status: Optional[str] = None, +) -> int: + """ + Update ArchiveResults from stdin JSONL. + + Reads ArchiveResult records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. + + Exit codes: + 0: Success + 1: No input or error + """ + from django.utils import timezone + + from archivebox.misc.jsonl import read_stdin, write_record + from archivebox.core.models import ArchiveResult + + is_tty = sys.stdout.isatty() + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + updated_count = 0 + for record in records: + result_id = record.get('id') + if not result_id: + continue + + try: + result = ArchiveResult.objects.get(id=result_id) + + # Apply updates from CLI flags + if status: + result.status = status + result.retry_at = timezone.now() + + result.save() + updated_count += 1 + + if not is_tty: + write_record(result.to_json()) + + except ArchiveResult.DoesNotExist: + rprint(f'[yellow]ArchiveResult not found: {result_id}[/yellow]', file=sys.stderr) + continue + + rprint(f'[green]Updated {updated_count} archive results[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + +def delete_archiveresults(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete ArchiveResults from stdin JSONL. + + Requires --yes flag to confirm deletion. 
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.core.models import ArchiveResult + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + result_ids = [r.get('id') for r in records if r.get('id')] + + if not result_ids: + rprint('[yellow]No valid archive result IDs in input[/yellow]', file=sys.stderr) + return 1 + + results = ArchiveResult.objects.filter(id__in=result_ids) + count = results.count() + + if count == 0: + rprint('[yellow]No matching archive results found[/yellow]', file=sys.stderr) + return 0 + + if dry_run: + rprint(f'[yellow]Would delete {count} archive results (dry run)[/yellow]', file=sys.stderr) + for result in results[:10]: + rprint(f' [dim]{result.id}[/dim] {result.plugin} {result.snapshot.url[:40]}', file=sys.stderr) + if count > 10: + rprint(f' ... and {count - 10} more', file=sys.stderr) + return 0 + + if not yes: + rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = results.delete() + rprint(f'[green]Deleted {deleted_count} archive results[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage ArchiveResult records (plugin extraction results).""" + pass + + +@main.command('create') +@click.option('--snapshot-id', help='Snapshot ID to create results for') +@click.option('--plugin', '-p', help='Plugin name (e.g., screenshot, singlefile)') +@click.option('--status', '-s', default='queued', help='Initial status (default: queued)') +def create_cmd(snapshot_id: Optional[str], plugin: Optional[str], status: str): + """Create ArchiveResults for Snapshots from stdin JSONL.""" + sys.exit(create_archiveresults(snapshot_id=snapshot_id, plugin=plugin, status=status)) + + +@main.command('list') +@click.option('--status', '-s', help='Filter by status (queued, started, succeeded, failed, skipped)') +@click.option('--plugin', '-p', help='Filter by plugin name') +@click.option('--snapshot-id', help='Filter by snapshot ID') +@click.option('--limit', '-n', type=int, help='Limit number of results') +def list_cmd(status: Optional[str], plugin: Optional[str], + snapshot_id: Optional[str], limit: Optional[int]): + """List ArchiveResults as JSONL.""" + sys.exit(list_archiveresults( + status=status, + plugin=plugin, + snapshot_id=snapshot_id, + limit=limit, + )) + + +@main.command('update') +@click.option('--status', '-s', help='Set status') +def update_cmd(status: Optional[str]): + """Update ArchiveResults from stdin JSONL.""" + sys.exit(update_archiveresults(status=status)) + + +@main.command('delete') +@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') +@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +def delete_cmd(yes: bool, dry_run: bool): + """Delete ArchiveResults from stdin JSONL.""" + sys.exit(delete_archiveresults(yes=yes, dry_run=dry_run)) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_binary.py b/archivebox/cli/archivebox_binary.py new file mode 100644 index 00000000..98ab33be --- /dev/null +++ b/archivebox/cli/archivebox_binary.py @@ -0,0 +1,304 @@ +#!/usr/bin/env python3 + +""" +archivebox binary [args...] 
[--filters] + +Manage Binary records (detected executables like chrome, wget, etc.). + +Actions: + create - Create/register a Binary + list - List Binaries as JSONL (with optional filters) + update - Update Binaries from stdin JSONL + delete - Delete Binaries from stdin JSONL + +Examples: + # List all binaries + archivebox binary list + + # List specific binary + archivebox binary list --name=chrome + + # List binaries with specific version + archivebox binary list --version__icontains=120 + + # Delete old binary entries + archivebox binary list --name=chrome | archivebox binary delete --yes +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox binary' + +import sys +from typing import Optional + +import rich_click as click +from rich import print as rprint + + +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + + if limit: + queryset = queryset[:limit] + + return queryset + + +# ============================================================================= +# CREATE +# ============================================================================= + +def create_binary( + name: str, + abspath: str, + version: str = '', +) -> int: + """ + Create/register a Binary. + + Exit codes: + 0: Success + 1: Failure + """ + from archivebox.misc.jsonl import write_record + from archivebox.machine.models import Binary + + is_tty = sys.stdout.isatty() + + if not name or not abspath: + rprint('[red]Both --name and --abspath are required[/red]', file=sys.stderr) + return 1 + + try: + binary, created = Binary.objects.get_or_create( + name=name, + abspath=abspath, + defaults={'version': version} + ) + + if not is_tty: + write_record(binary.to_json()) + + if created: + rprint(f'[green]Created binary: {name} at {abspath}[/green]', file=sys.stderr) + else: + rprint(f'[dim]Binary already exists: {name} at {abspath}[/dim]', file=sys.stderr) + + return 0 + + except Exception as e: + rprint(f'[red]Error creating binary: {e}[/red]', file=sys.stderr) + return 1 + + +# ============================================================================= +# LIST +# ============================================================================= + +def list_binaries( + name: Optional[str] = None, + abspath__icontains: Optional[str] = None, + version__icontains: Optional[str] = None, + limit: Optional[int] = None, +) -> int: + """ + List Binaries as JSONL with optional filters. 
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.machine.models import Binary + + is_tty = sys.stdout.isatty() + + queryset = Binary.objects.all().order_by('name', '-loaded_at') + + # Apply filters + filter_kwargs = { + 'name': name, + 'abspath__icontains': abspath__icontains, + 'version__icontains': version__icontains, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for binary in queryset: + if is_tty: + rprint(f'[cyan]{binary.name:20}[/cyan] [dim]{binary.version:15}[/dim] {binary.abspath}') + else: + write_record(binary.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} binaries[/dim]', file=sys.stderr) + return 0 + + +# ============================================================================= +# UPDATE +# ============================================================================= + +def update_binaries( + version: Optional[str] = None, + abspath: Optional[str] = None, +) -> int: + """ + Update Binaries from stdin JSONL. + + Reads Binary records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. + + Exit codes: + 0: Success + 1: No input or error + """ + from archivebox.misc.jsonl import read_stdin, write_record + from archivebox.machine.models import Binary + + is_tty = sys.stdout.isatty() + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + updated_count = 0 + for record in records: + binary_id = record.get('id') + if not binary_id: + continue + + try: + binary = Binary.objects.get(id=binary_id) + + # Apply updates from CLI flags + if version: + binary.version = version + if abspath: + binary.abspath = abspath + + binary.save() + updated_count += 1 + + if not is_tty: + write_record(binary.to_json()) + + except Binary.DoesNotExist: + rprint(f'[yellow]Binary not found: {binary_id}[/yellow]', file=sys.stderr) + continue + + rprint(f'[green]Updated {updated_count} binaries[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + +def delete_binaries(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete Binaries from stdin JSONL. + + Requires --yes flag to confirm deletion. 
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.machine.models import Binary + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + binary_ids = [r.get('id') for r in records if r.get('id')] + + if not binary_ids: + rprint('[yellow]No valid binary IDs in input[/yellow]', file=sys.stderr) + return 1 + + binaries = Binary.objects.filter(id__in=binary_ids) + count = binaries.count() + + if count == 0: + rprint('[yellow]No matching binaries found[/yellow]', file=sys.stderr) + return 0 + + if dry_run: + rprint(f'[yellow]Would delete {count} binaries (dry run)[/yellow]', file=sys.stderr) + for binary in binaries: + rprint(f' {binary.name} {binary.abspath}', file=sys.stderr) + return 0 + + if not yes: + rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = binaries.delete() + rprint(f'[green]Deleted {deleted_count} binaries[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage Binary records (detected executables).""" + pass + + +@main.command('create') +@click.option('--name', '-n', required=True, help='Binary name (e.g., chrome, wget)') +@click.option('--abspath', '-p', required=True, help='Absolute path to binary') +@click.option('--version', '-v', default='', help='Binary version') +def create_cmd(name: str, abspath: str, version: str): + """Create/register a Binary.""" + sys.exit(create_binary(name=name, abspath=abspath, version=version)) + + +@main.command('list') +@click.option('--name', '-n', help='Filter by name') +@click.option('--abspath__icontains', help='Filter by path contains') +@click.option('--version__icontains', help='Filter by version contains') +@click.option('--limit', type=int, help='Limit number of results') +def list_cmd(name: Optional[str], abspath__icontains: Optional[str], + version__icontains: Optional[str], limit: Optional[int]): + """List Binaries as JSONL.""" + sys.exit(list_binaries( + name=name, + abspath__icontains=abspath__icontains, + version__icontains=version__icontains, + limit=limit, + )) + + +@main.command('update') +@click.option('--version', '-v', help='Set version') +@click.option('--abspath', '-p', help='Set path') +def update_cmd(version: Optional[str], abspath: Optional[str]): + """Update Binaries from stdin JSONL.""" + sys.exit(update_binaries(version=version, abspath=abspath)) + + +@main.command('delete') +@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') +@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +def delete_cmd(yes: bool, dry_run: bool): + """Delete Binaries from stdin JSONL.""" + sys.exit(delete_binaries(yes=yes, dry_run=dry_run)) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_crawl.py b/archivebox/cli/archivebox_crawl.py index d8c3c7ad..d0621fcc 100644 --- a/archivebox/cli/archivebox_crawl.py +++ b/archivebox/cli/archivebox_crawl.py @@ -1,108 +1,134 @@ #!/usr/bin/env python3 """ -archivebox crawl [urls...] [--depth=N] [--tag=TAG] +archivebox crawl [args...] [--filters] -Create Crawl jobs from URLs. Accepts URLs as arguments, from stdin, or via JSONL. 
-Does NOT immediately start the crawl - pipe to `archivebox snapshot` to process. +Manage Crawl records. -Input formats: - - Plain URLs (one per line) - - JSONL: {"url": "...", "depth": 1, "tags": "..."} - -Output (JSONL): - {"type": "Crawl", "id": "...", "urls": "...", "status": "queued", ...} +Actions: + create - Create Crawl jobs from URLs + list - List Crawls as JSONL (with optional filters) + update - Update Crawls from stdin JSONL + delete - Delete Crawls from stdin JSONL Examples: - # Create a crawl job - archivebox crawl https://example.com + # Create + archivebox crawl create https://example.com https://foo.com --depth=1 + archivebox crawl create --tag=news https://example.com - # Create crawl with depth - archivebox crawl --depth=1 https://example.com + # List with filters + archivebox crawl list --status=queued + archivebox crawl list --urls__icontains=example.com - # Full pipeline: create crawl, create snapshots, run extractors - archivebox crawl https://example.com | archivebox snapshot | archivebox extract + # Update + archivebox crawl list --status=started | archivebox crawl update --status=queued - # Process existing Crawl by ID (runs the crawl state machine) - archivebox crawl 01234567-89ab-cdef-0123-456789abcdef + # Delete + archivebox crawl list --urls__icontains=spam.com | archivebox crawl delete --yes + + # Full pipeline + archivebox crawl create https://example.com | archivebox snapshot create | archivebox run """ __package__ = 'archivebox.cli' __command__ = 'archivebox crawl' import sys -from typing import Optional +from typing import Optional, Iterable import rich_click as click +from rich import print as rprint -def create_crawls( - records: list, +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + + if limit: + queryset = queryset[:limit] + + return queryset + + +# ============================================================================= +# CREATE +# ============================================================================= + +def create_crawl( + urls: Iterable[str], depth: int = 0, tag: str = '', + status: str = 'queued', created_by_id: Optional[int] = None, ) -> int: """ - Create a single Crawl job from all input URLs. + Create a Crawl job from URLs. - Takes pre-read records, creates one Crawl with all URLs, outputs JSONL. - Does NOT start the crawl - just creates the job in QUEUED state. + Takes URLs as args or stdin, creates one Crawl with all URLs, outputs JSONL. Exit codes: 0: Success 1: Failure """ - from rich import print as rprint - - from archivebox.misc.jsonl import write_record + from archivebox.misc.jsonl import read_args_or_stdin, write_record from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.crawls.models import Crawl created_by_id = created_by_id or get_or_create_system_user_pk() is_tty = sys.stdout.isatty() + # Collect all input records + records = list(read_args_or_stdin(urls)) + if not records: rprint('[yellow]No URLs provided. 
Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr) return 1 # Collect all URLs into a single newline-separated string - urls = [] + url_list = [] for record in records: url = record.get('url') if url: - urls.append(url) + url_list.append(url) - if not urls: + if not url_list: rprint('[red]No valid URLs found[/red]', file=sys.stderr) return 1 try: # Build crawl record with all URLs as newline-separated string crawl_record = { - 'urls': '\n'.join(urls), + 'urls': '\n'.join(url_list), 'max_depth': depth, 'tags_str': tag, + 'status': status, 'label': '', } - crawl = Crawl.from_jsonl(crawl_record, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json(crawl_record, overrides={'created_by_id': created_by_id}) if not crawl: rprint('[red]Failed to create crawl[/red]', file=sys.stderr) return 1 # Output JSONL record (only when piped) if not is_tty: - write_record(crawl.to_jsonl()) + write_record(crawl.to_json()) - rprint(f'[green]Created crawl with {len(urls)} URLs[/green]', file=sys.stderr) + rprint(f'[green]Created crawl with {len(url_list)} URLs[/green]', file=sys.stderr) # If TTY, show human-readable output if is_tty: rprint(f' [dim]{crawl.id}[/dim]', file=sys.stderr) - for url in urls[:5]: # Show first 5 URLs + for url in url_list[:5]: # Show first 5 URLs rprint(f' {url[:70]}', file=sys.stderr) - if len(urls) > 5: - rprint(f' ... and {len(urls) - 5} more', file=sys.stderr) + if len(url_list) > 5: + rprint(f' ... and {len(url_list) - 5} more', file=sys.stderr) return 0 @@ -111,81 +137,217 @@ def create_crawls( return 1 -def process_crawl_by_id(crawl_id: str) -> int: - """ - Process a single Crawl by ID (used by workers). +# ============================================================================= +# LIST +# ============================================================================= - Triggers the Crawl's state machine tick() which will: - - Transition from queued -> started (creates root snapshot) - - Transition from started -> sealed (when all snapshots done) +def list_crawls( + status: Optional[str] = None, + urls__icontains: Optional[str] = None, + max_depth: Optional[int] = None, + limit: Optional[int] = None, +) -> int: """ - from rich import print as rprint + List Crawls as JSONL with optional filters. 
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record from archivebox.crawls.models import Crawl - try: - crawl = Crawl.objects.get(id=crawl_id) - except Crawl.DoesNotExist: - rprint(f'[red]Crawl {crawl_id} not found[/red]', file=sys.stderr) - return 1 + is_tty = sys.stdout.isatty() - rprint(f'[blue]Processing Crawl {crawl.id} (status={crawl.status})[/blue]', file=sys.stderr) + queryset = Crawl.objects.all().order_by('-created_at') - try: - crawl.sm.tick() - crawl.refresh_from_db() - rprint(f'[green]Crawl complete (status={crawl.status})[/green]', file=sys.stderr) - return 0 - except Exception as e: - rprint(f'[red]Crawl error: {type(e).__name__}: {e}[/red]', file=sys.stderr) - return 1 + # Apply filters + filter_kwargs = { + 'status': status, + 'urls__icontains': urls__icontains, + 'max_depth': max_depth, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for crawl in queryset: + if is_tty: + status_color = { + 'queued': 'yellow', + 'started': 'blue', + 'sealed': 'green', + }.get(crawl.status, 'dim') + url_preview = crawl.urls[:50].replace('\n', ' ') + rprint(f'[{status_color}]{crawl.status:8}[/{status_color}] [dim]{crawl.id}[/dim] {url_preview}...') + else: + write_record(crawl.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} crawls[/dim]', file=sys.stderr) + return 0 -def is_crawl_id(value: str) -> bool: - """Check if value looks like a Crawl UUID.""" - import re - uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I) - if not uuid_pattern.match(value): - return False - # Verify it's actually a Crawl (not a Snapshot or other object) +# ============================================================================= +# UPDATE +# ============================================================================= + +def update_crawls( + status: Optional[str] = None, + max_depth: Optional[int] = None, +) -> int: + """ + Update Crawls from stdin JSONL. + + Reads Crawl records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. + + Exit codes: + 0: Success + 1: No input or error + """ + from django.utils import timezone + + from archivebox.misc.jsonl import read_stdin, write_record from archivebox.crawls.models import Crawl - return Crawl.objects.filter(id=value).exists() + is_tty = sys.stdout.isatty() -@click.command() -@click.option('--depth', '-d', type=int, default=0, help='Max depth for recursive crawling (default: 0, no recursion)') -@click.option('--tag', '-t', default='', help='Comma-separated tags to add to snapshots') -@click.argument('args', nargs=-1) -def main(depth: int, tag: str, args: tuple): - """Create Crawl jobs from URLs, or process existing Crawls by ID""" - from archivebox.misc.jsonl import read_args_or_stdin - - # Read all input - records = list(read_args_or_stdin(args)) - + records = list(read_stdin()) if not records: - from rich import print as rprint - rprint('[yellow]No URLs or Crawl IDs provided. 
Pass as arguments or via stdin.[/yellow]', file=sys.stderr) - sys.exit(1) + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 - # Check if input looks like existing Crawl IDs to process - # If ALL inputs are Crawl UUIDs, process them - all_are_crawl_ids = all( - is_crawl_id(r.get('id') or r.get('url', '')) - for r in records - ) + updated_count = 0 + for record in records: + crawl_id = record.get('id') + if not crawl_id: + continue - if all_are_crawl_ids: - # Process existing Crawls by ID - exit_code = 0 - for record in records: - crawl_id = record.get('id') or record.get('url') - result = process_crawl_by_id(crawl_id) - if result != 0: - exit_code = result - sys.exit(exit_code) - else: - # Default behavior: create Crawl jobs from URLs - sys.exit(create_crawls(records, depth=depth, tag=tag)) + try: + crawl = Crawl.objects.get(id=crawl_id) + + # Apply updates from CLI flags + if status: + crawl.status = status + crawl.retry_at = timezone.now() + if max_depth is not None: + crawl.max_depth = max_depth + + crawl.save() + updated_count += 1 + + if not is_tty: + write_record(crawl.to_json()) + + except Crawl.DoesNotExist: + rprint(f'[yellow]Crawl not found: {crawl_id}[/yellow]', file=sys.stderr) + continue + + rprint(f'[green]Updated {updated_count} crawls[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + +def delete_crawls(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete Crawls from stdin JSONL. + + Requires --yes flag to confirm deletion. + + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.crawls.models import Crawl + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + crawl_ids = [r.get('id') for r in records if r.get('id')] + + if not crawl_ids: + rprint('[yellow]No valid crawl IDs in input[/yellow]', file=sys.stderr) + return 1 + + crawls = Crawl.objects.filter(id__in=crawl_ids) + count = crawls.count() + + if count == 0: + rprint('[yellow]No matching crawls found[/yellow]', file=sys.stderr) + return 0 + + if dry_run: + rprint(f'[yellow]Would delete {count} crawls (dry run)[/yellow]', file=sys.stderr) + for crawl in crawls: + url_preview = crawl.urls[:50].replace('\n', ' ') + rprint(f' [dim]{crawl.id}[/dim] {url_preview}...', file=sys.stderr) + return 0 + + if not yes: + rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = crawls.delete() + rprint(f'[green]Deleted {deleted_count} crawls[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage Crawl records.""" + pass + + +@main.command('create') +@click.argument('urls', nargs=-1) +@click.option('--depth', '-d', type=int, default=0, help='Max crawl depth (default: 0)') +@click.option('--tag', '-t', default='', help='Comma-separated tags to add') +@click.option('--status', '-s', default='queued', help='Initial status (default: queued)') +def create_cmd(urls: tuple, depth: int, tag: str, status: str): + """Create a Crawl job from URLs or stdin.""" + sys.exit(create_crawl(urls, depth=depth, 
tag=tag, status=status)) + + +@main.command('list') +@click.option('--status', '-s', help='Filter by status (queued, started, sealed)') +@click.option('--urls__icontains', help='Filter by URLs contains') +@click.option('--max-depth', type=int, help='Filter by max depth') +@click.option('--limit', '-n', type=int, help='Limit number of results') +def list_cmd(status: Optional[str], urls__icontains: Optional[str], + max_depth: Optional[int], limit: Optional[int]): + """List Crawls as JSONL.""" + sys.exit(list_crawls( + status=status, + urls__icontains=urls__icontains, + max_depth=max_depth, + limit=limit, + )) + + +@main.command('update') +@click.option('--status', '-s', help='Set status') +@click.option('--max-depth', type=int, help='Set max depth') +def update_cmd(status: Optional[str], max_depth: Optional[int]): + """Update Crawls from stdin JSONL.""" + sys.exit(update_crawls(status=status, max_depth=max_depth)) + + +@main.command('delete') +@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') +@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +def delete_cmd(yes: bool, dry_run: bool): + """Delete Crawls from stdin JSONL.""" + sys.exit(delete_crawls(yes=yes, dry_run=dry_run)) if __name__ == '__main__': diff --git a/archivebox/cli/archivebox_extract.py b/archivebox/cli/archivebox_extract.py deleted file mode 100644 index 7dc043ae..00000000 --- a/archivebox/cli/archivebox_extract.py +++ /dev/null @@ -1,265 +0,0 @@ -#!/usr/bin/env python3 - -""" -archivebox extract [snapshot_ids...] [--plugins=NAMES] - -Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL. - -Input formats: - - Snapshot UUIDs (one per line) - - JSONL: {"type": "Snapshot", "id": "...", "url": "..."} - - JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."} - -Output (JSONL): - {"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."} - -Examples: - # Extract specific snapshot - archivebox extract 01234567-89ab-cdef-0123-456789abcdef - - # Pipe from snapshot command - archivebox snapshot https://example.com | archivebox extract - - # Run specific plugins only - archivebox extract --plugins=screenshot,singlefile 01234567-89ab-cdef-0123-456789abcdef - - # Chain commands - archivebox crawl https://example.com | archivebox snapshot | archivebox extract -""" - -__package__ = 'archivebox.cli' -__command__ = 'archivebox extract' - -import sys -from typing import Optional, List - -import rich_click as click - - -def process_archiveresult_by_id(archiveresult_id: str) -> int: - """ - Run extraction for a single ArchiveResult by ID (used by workers). - - Triggers the ArchiveResult's state machine tick() to run the extractor plugin. 
- """ - from rich import print as rprint - from archivebox.core.models import ArchiveResult - - try: - archiveresult = ArchiveResult.objects.get(id=archiveresult_id) - except ArchiveResult.DoesNotExist: - rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr) - return 1 - - rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr) - - try: - # Trigger state machine tick - this runs the actual extraction - archiveresult.sm.tick() - archiveresult.refresh_from_db() - - if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED: - print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]') - return 0 - elif archiveresult.status == ArchiveResult.StatusChoices.FAILED: - print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr) - return 1 - else: - # Still in progress or backoff - not a failure - print(f'[yellow]Extraction status: {archiveresult.status}[/yellow]') - return 0 - - except Exception as e: - print(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr) - return 1 - - -def run_plugins( - args: tuple, - plugins: str = '', - wait: bool = True, -) -> int: - """ - Run plugins on Snapshots from input. - - Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL. - - Exit codes: - 0: Success - 1: Failure - """ - from rich import print as rprint - from django.utils import timezone - - from archivebox.misc.jsonl import ( - read_args_or_stdin, write_record, - TYPE_SNAPSHOT, TYPE_ARCHIVERESULT - ) - from archivebox.core.models import Snapshot, ArchiveResult - from archivebox.workers.orchestrator import Orchestrator - - is_tty = sys.stdout.isatty() - - # Parse comma-separated plugins list once (reused in creation and filtering) - plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else [] - - # Collect all input records - records = list(read_args_or_stdin(args)) - - if not records: - rprint('[yellow]No snapshots provided. 
Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr) - return 1 - - # Gather snapshot IDs to process - snapshot_ids = set() - for record in records: - record_type = record.get('type') - - if record_type == TYPE_SNAPSHOT: - snapshot_id = record.get('id') - if snapshot_id: - snapshot_ids.add(snapshot_id) - elif record.get('url'): - # Look up by URL (get most recent if multiple exist) - snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first() - if snap: - snapshot_ids.add(str(snap.id)) - else: - rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr) - - elif record_type == TYPE_ARCHIVERESULT: - snapshot_id = record.get('snapshot_id') - if snapshot_id: - snapshot_ids.add(snapshot_id) - - elif 'id' in record: - # Assume it's a snapshot ID - snapshot_ids.add(record['id']) - - if not snapshot_ids: - rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr) - return 1 - - # Get snapshots and ensure they have pending ArchiveResults - processed_count = 0 - for snapshot_id in snapshot_ids: - try: - snapshot = Snapshot.objects.get(id=snapshot_id) - except Snapshot.DoesNotExist: - rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr) - continue - - # Create pending ArchiveResults if needed - if plugins_list: - # Only create for specific plugins - for plugin_name in plugins_list: - result, created = ArchiveResult.objects.get_or_create( - snapshot=snapshot, - plugin=plugin_name, - defaults={ - 'status': ArchiveResult.StatusChoices.QUEUED, - 'retry_at': timezone.now(), - } - ) - if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]: - # Reset for retry - result.status = ArchiveResult.StatusChoices.QUEUED - result.retry_at = timezone.now() - result.save() - else: - # Create all pending plugins - snapshot.create_pending_archiveresults() - - # Reset snapshot status to allow processing - if snapshot.status == Snapshot.StatusChoices.SEALED: - snapshot.status = Snapshot.StatusChoices.STARTED - snapshot.retry_at = timezone.now() - snapshot.save() - - processed_count += 1 - - if processed_count == 0: - rprint('[red]No snapshots to process[/red]', file=sys.stderr) - return 1 - - rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr) - - # Run orchestrator if --wait (default) - if wait: - rprint('[blue]Running plugins...[/blue]', file=sys.stderr) - orchestrator = Orchestrator(exit_on_idle=True) - orchestrator.runloop() - - # Output results as JSONL (when piped) or human-readable (when TTY) - for snapshot_id in snapshot_ids: - try: - snapshot = Snapshot.objects.get(id=snapshot_id) - results = snapshot.archiveresult_set.all() - if plugins_list: - results = results.filter(plugin__in=plugins_list) - - for result in results: - if is_tty: - status_color = { - 'succeeded': 'green', - 'failed': 'red', - 'skipped': 'yellow', - }.get(result.status, 'dim') - rprint(f' [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr) - else: - write_record(result.to_jsonl()) - except Snapshot.DoesNotExist: - continue - - return 0 - - -def is_archiveresult_id(value: str) -> bool: - """Check if value looks like an ArchiveResult UUID.""" - import re - uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I) - if not uuid_pattern.match(value): - return False - # Verify it's actually an ArchiveResult (not a Snapshot or other object) - 
from archivebox.core.models import ArchiveResult - return ArchiveResult.objects.filter(id=value).exists() - - -@click.command() -@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)') -@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)') -@click.argument('args', nargs=-1) -def main(plugins: str, wait: bool, args: tuple): - """Run plugins on Snapshots, or process existing ArchiveResults by ID""" - from archivebox.misc.jsonl import read_args_or_stdin - - # Read all input - records = list(read_args_or_stdin(args)) - - if not records: - from rich import print as rprint - rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr) - sys.exit(1) - - # Check if input looks like existing ArchiveResult IDs to process - all_are_archiveresult_ids = all( - is_archiveresult_id(r.get('id') or r.get('url', '')) - for r in records - ) - - if all_are_archiveresult_ids: - # Process existing ArchiveResults by ID - exit_code = 0 - for record in records: - archiveresult_id = record.get('id') or record.get('url') - result = process_archiveresult_by_id(archiveresult_id) - if result != 0: - exit_code = result - sys.exit(exit_code) - else: - # Default behavior: run plugins on Snapshots from input - sys.exit(run_plugins(args, plugins=plugins, wait=wait)) - - -if __name__ == '__main__': - main() diff --git a/archivebox/cli/archivebox_init.py b/archivebox/cli/archivebox_init.py index ed67c77d..5ef6c9ca 100755 --- a/archivebox/cli/archivebox_init.py +++ b/archivebox/cli/archivebox_init.py @@ -127,7 +127,7 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None: if pending_links: for link_dict in pending_links.values(): - Snapshot.from_jsonl(link_dict) + Snapshot.from_json(link_dict) # Hint for orphaned snapshot directories print() diff --git a/archivebox/cli/archivebox_machine.py b/archivebox/cli/archivebox_machine.py new file mode 100644 index 00000000..e63eac41 --- /dev/null +++ b/archivebox/cli/archivebox_machine.py @@ -0,0 +1,113 @@ +#!/usr/bin/env python3 + +""" +archivebox machine [--filters] + +Manage Machine records (system-managed, mostly read-only). + +Machine records track the host machines where ArchiveBox runs. +They are created automatically by the system and are primarily for debugging. 
+ +Actions: + list - List Machines as JSONL (with optional filters) + +Examples: + # List all machines + archivebox machine list + + # List machines by hostname + archivebox machine list --hostname__icontains=myserver +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox machine' + +import sys +from typing import Optional + +import rich_click as click +from rich import print as rprint + + +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + + if limit: + queryset = queryset[:limit] + + return queryset + + +# ============================================================================= +# LIST +# ============================================================================= + +def list_machines( + hostname__icontains: Optional[str] = None, + os_platform: Optional[str] = None, + limit: Optional[int] = None, +) -> int: + """ + List Machines as JSONL with optional filters. + + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.machine.models import Machine + + is_tty = sys.stdout.isatty() + + queryset = Machine.objects.all().order_by('-created_at') + + # Apply filters + filter_kwargs = { + 'hostname__icontains': hostname__icontains, + 'os_platform': os_platform, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for machine in queryset: + if is_tty: + rprint(f'[cyan]{machine.hostname:30}[/cyan] [dim]{machine.os_platform:10}[/dim] {machine.id}') + else: + write_record(machine.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} machines[/dim]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage Machine records (read-only, system-managed).""" + pass + + +@main.command('list') +@click.option('--hostname__icontains', help='Filter by hostname contains') +@click.option('--os-platform', help='Filter by OS platform') +@click.option('--limit', '-n', type=int, help='Limit number of results') +def list_cmd(hostname__icontains: Optional[str], os_platform: Optional[str], limit: Optional[int]): + """List Machines as JSONL.""" + sys.exit(list_machines( + hostname__icontains=hostname__icontains, + os_platform=os_platform, + limit=limit, + )) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_orchestrator.py b/archivebox/cli/archivebox_orchestrator.py deleted file mode 100644 index 4b272727..00000000 --- a/archivebox/cli/archivebox_orchestrator.py +++ /dev/null @@ -1,67 +0,0 @@ -#!/usr/bin/env python3 - -""" -archivebox orchestrator [--daemon] - -Start the orchestrator process that manages workers. - -The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult) -and lazily spawns worker processes when there is work to be done. -""" - -__package__ = 'archivebox.cli' -__command__ = 'archivebox orchestrator' - -import sys - -import rich_click as click - -from archivebox.misc.util import docstring - - -def orchestrator(daemon: bool = False, watch: bool = False) -> int: - """ - Start the orchestrator process. - - The orchestrator: - 1. 
Polls each model queue (Crawl, Snapshot, ArchiveResult) - 2. Spawns worker processes when there is work to do - 3. Monitors worker health and restarts failed workers - 4. Exits when all queues are empty (unless --daemon) - - Args: - daemon: Run forever (don't exit when idle) - watch: Just watch the queues without spawning workers (for debugging) - - Exit codes: - 0: All work completed successfully - 1: Error occurred - """ - from archivebox.workers.orchestrator import Orchestrator - - if Orchestrator.is_running(): - print('[yellow]Orchestrator is already running[/yellow]') - return 0 - - try: - orchestrator_instance = Orchestrator(exit_on_idle=not daemon) - orchestrator_instance.runloop() - return 0 - except KeyboardInterrupt: - return 0 - except Exception as e: - print(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr) - return 1 - - -@click.command() -@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)") -@click.option('--watch', '-w', is_flag=True, help="Watch queues without spawning workers") -@docstring(orchestrator.__doc__) -def main(daemon: bool, watch: bool): - """Start the ArchiveBox orchestrator process""" - sys.exit(orchestrator(daemon=daemon, watch=watch)) - - -if __name__ == '__main__': - main() diff --git a/archivebox/cli/archivebox_process.py b/archivebox/cli/archivebox_process.py new file mode 100644 index 00000000..9784650b --- /dev/null +++ b/archivebox/cli/archivebox_process.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python3 + +""" +archivebox process [--filters] + +Manage Process records (system-managed, mostly read-only). + +Process records track executions of binaries during extraction. +They are created automatically by the system and are primarily for debugging. + +Actions: + list - List Processes as JSONL (with optional filters) + +Examples: + # List all processes + archivebox process list + + # List processes by binary + archivebox process list --binary-name=chrome + + # List recent processes + archivebox process list --limit=10 +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox process' + +import sys +from typing import Optional + +import rich_click as click +from rich import print as rprint + + +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + + if limit: + queryset = queryset[:limit] + + return queryset + + +# ============================================================================= +# LIST +# ============================================================================= + +def list_processes( + binary_name: Optional[str] = None, + machine_id: Optional[str] = None, + limit: Optional[int] = None, +) -> int: + """ + List Processes as JSONL with optional filters. 
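The `--binary-name` and `--machine-id` flags translate directly into Django ORM lookups. Roughly, `archivebox process list --binary-name=chrome --limit=10` boils down to the query below (a sketch, assuming a configured Django environment for the collection):

```python
# Rough ORM equivalent of: archivebox process list --binary-name=chrome --limit=10
# (a sketch; assumes a configured Django environment for the collection)
from archivebox.machine.models import Process

queryset = (
    Process.objects
    .select_related('binary', 'machine')    # same query optimization the CLI uses
    .filter(binary__name='chrome')          # --binary-name=chrome
    .order_by('-start_ts')[:10]             # newest first, limited to 10
)
for process in queryset:
    name = process.binary.name if process.binary else 'unknown'
    print(process.returncode, name, process.id)
```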
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.machine.models import Process + + is_tty = sys.stdout.isatty() + + queryset = Process.objects.all().select_related('binary', 'machine').order_by('-start_ts') + + # Apply filters + filter_kwargs = {} + if binary_name: + filter_kwargs['binary__name'] = binary_name + if machine_id: + filter_kwargs['machine_id'] = machine_id + + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for process in queryset: + if is_tty: + binary_name_str = process.binary.name if process.binary else 'unknown' + exit_code = process.returncode if process.returncode is not None else '?' + status_color = 'green' if process.returncode == 0 else 'red' if process.returncode else 'yellow' + rprint(f'[{status_color}]exit={exit_code:3}[/{status_color}] [cyan]{binary_name_str:15}[/cyan] [dim]{process.id}[/dim]') + else: + write_record(process.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} processes[/dim]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage Process records (read-only, system-managed).""" + pass + + +@main.command('list') +@click.option('--binary-name', '-b', help='Filter by binary name') +@click.option('--machine-id', '-m', help='Filter by machine ID') +@click.option('--limit', '-n', type=int, help='Limit number of results') +def list_cmd(binary_name: Optional[str], machine_id: Optional[str], limit: Optional[int]): + """List Processes as JSONL.""" + sys.exit(list_processes( + binary_name=binary_name, + machine_id=machine_id, + limit=limit, + )) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_remove.py b/archivebox/cli/archivebox_remove.py deleted file mode 100644 index 374b60d3..00000000 --- a/archivebox/cli/archivebox_remove.py +++ /dev/null @@ -1,98 +0,0 @@ -#!/usr/bin/env python3 - -__package__ = 'archivebox.cli' -__command__ = 'archivebox remove' - -import shutil -from pathlib import Path -from typing import Iterable - -import rich_click as click - -from django.db.models import QuerySet - -from archivebox.config import DATA_DIR -from archivebox.config.django import setup_django -from archivebox.misc.util import enforce_types, docstring -from archivebox.misc.checks import check_data_folder -from archivebox.misc.logging_util import ( - log_list_started, - log_list_finished, - log_removal_started, - log_removal_finished, - TimedProgress, -) - - -@enforce_types -def remove(filter_patterns: Iterable[str]=(), - filter_type: str='exact', - snapshots: QuerySet | None=None, - after: float | None=None, - before: float | None=None, - yes: bool=False, - delete: bool=False, - out_dir: Path=DATA_DIR) -> QuerySet: - """Remove the specified URLs from the archive""" - - setup_django() - check_data_folder() - - from archivebox.cli.archivebox_search import get_snapshots - - log_list_started(filter_patterns, filter_type) - timer = TimedProgress(360, prefix=' ') - try: - snapshots = get_snapshots( - snapshots=snapshots, - filter_patterns=list(filter_patterns) if filter_patterns else None, - filter_type=filter_type, - after=after, - before=before, - ) - finally: - timer.end() - - if not snapshots.exists(): - log_removal_finished(0, 0) - raise SystemExit(1) - - log_list_finished(snapshots) - log_removal_started(snapshots, yes=yes, 
delete=delete) - - timer = TimedProgress(360, prefix=' ') - try: - for snapshot in snapshots: - if delete: - shutil.rmtree(snapshot.output_dir, ignore_errors=True) - finally: - timer.end() - - to_remove = snapshots.count() - - from archivebox.search import flush_search_index - from archivebox.core.models import Snapshot - - flush_search_index(snapshots=snapshots) - snapshots.delete() - all_snapshots = Snapshot.objects.all() - log_removal_finished(all_snapshots.count(), to_remove) - - return all_snapshots - - -@click.command() -@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm') -@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index') -@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp') -@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp') -@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs') -@click.argument('filter_patterns', nargs=-1) -@docstring(remove.__doc__) -def main(**kwargs): - """Remove the specified URLs from the archive""" - remove(**kwargs) - - -if __name__ == '__main__': - main() diff --git a/archivebox/cli/archivebox_run.py b/archivebox/cli/archivebox_run.py new file mode 100644 index 00000000..6efd9018 --- /dev/null +++ b/archivebox/cli/archivebox_run.py @@ -0,0 +1,155 @@ +#!/usr/bin/env python3 + +""" +archivebox run [--daemon] + +Unified command for processing queued work. + +Modes: + - With stdin JSONL: Process piped records, exit when complete + - Without stdin (TTY): Run orchestrator in foreground until killed + +Examples: + # Run orchestrator in foreground (replaces `archivebox orchestrator`) + archivebox run + + # Run as daemon (don't exit on idle) + archivebox run --daemon + + # Process specific records (pipe any JSONL type, exits when done) + archivebox snapshot list --status=queued | archivebox run + archivebox archiveresult list --status=failed | archivebox run + archivebox crawl list --status=queued | archivebox run + + # Mixed types work too + cat mixed_records.jsonl | archivebox run +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox run' + +import sys + +import rich_click as click +from rich import print as rprint + + +def process_stdin_records() -> int: + """ + Process JSONL records from stdin. + + Reads records, queues them for processing, then runs orchestrator until complete. + Handles any record type: Crawl, Snapshot, ArchiveResult, etc. + + Returns exit code (0 = success, 1 = error). 
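The `read_stdin()` helper referenced here lives in `archivebox.misc.jsonl`; conceptually it yields one parsed record per non-empty input line, along the lines of this simplified sketch (the real helper may also accept bare URLs/IDs and apply more validation):

```python
# Simplified sketch of stdin JSONL parsing in the spirit of archivebox.misc.jsonl.read_stdin
# (illustrative only; the real helper may handle bare URLs/IDs and other edge cases).
import json
import sys
from typing import Iterator

def read_stdin_sketch() -> Iterator[dict]:
    if sys.stdin.isatty():
        return                          # nothing piped in
    for line in sys.stdin:
        line = line.strip()
        if not line:
            continue
        yield json.loads(line)          # e.g. {"type": "Snapshot", "id": "...", "status": "queued"}

if __name__ == '__main__':
    for record in read_stdin_sketch():
        print(record.get('type'), record.get('id'))
```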
+ """ + from django.utils import timezone + + from archivebox.misc.jsonl import read_stdin, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT + from archivebox.core.models import Snapshot, ArchiveResult + from archivebox.crawls.models import Crawl + from archivebox.workers.orchestrator import Orchestrator + + records = list(read_stdin()) + + if not records: + return 0 # Nothing to process + + queued_count = 0 + + for record in records: + record_type = record.get('type') + record_id = record.get('id') + + if not record_id: + continue + + try: + if record_type == TYPE_CRAWL: + crawl = Crawl.objects.get(id=record_id) + if crawl.status in [Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]: + crawl.retry_at = timezone.now() + crawl.save() + queued_count += 1 + + elif record_type == TYPE_SNAPSHOT: + snapshot = Snapshot.objects.get(id=record_id) + if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]: + snapshot.retry_at = timezone.now() + snapshot.save() + queued_count += 1 + + elif record_type == TYPE_ARCHIVERESULT: + archiveresult = ArchiveResult.objects.get(id=record_id) + if archiveresult.status in [ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED, ArchiveResult.StatusChoices.BACKOFF]: + archiveresult.retry_at = timezone.now() + archiveresult.save() + queued_count += 1 + + except (Crawl.DoesNotExist, Snapshot.DoesNotExist, ArchiveResult.DoesNotExist): + rprint(f'[yellow]Record not found: {record_type} {record_id}[/yellow]', file=sys.stderr) + continue + + if queued_count == 0: + rprint('[yellow]No records to process[/yellow]', file=sys.stderr) + return 0 + + rprint(f'[blue]Processing {queued_count} records...[/blue]', file=sys.stderr) + + # Run orchestrator until all queued work is done + orchestrator = Orchestrator(exit_on_idle=True) + orchestrator.runloop() + + return 0 + + +def run_orchestrator(daemon: bool = False) -> int: + """ + Run the orchestrator process. + + The orchestrator: + 1. Polls each model queue (Crawl, Snapshot, ArchiveResult) + 2. Spawns worker processes when there is work to do + 3. Monitors worker health and restarts failed workers + 4. Exits when all queues are empty (unless --daemon) + + Args: + daemon: Run forever (don't exit when idle) + + Returns exit code (0 = success, 1 = error). + """ + from archivebox.workers.orchestrator import Orchestrator + + if Orchestrator.is_running(): + rprint('[yellow]Orchestrator is already running[/yellow]', file=sys.stderr) + return 0 + + try: + orchestrator = Orchestrator(exit_on_idle=not daemon) + orchestrator.runloop() + return 0 + except KeyboardInterrupt: + return 0 + except Exception as e: + rprint(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr) + return 1 + + +@click.command() +@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)") +def main(daemon: bool): + """ + Process queued work. + + When stdin is piped: Process those specific records and exit. + When run standalone: Run orchestrator in foreground. 
+ """ + # Check if stdin has data (non-TTY means piped input) + if not sys.stdin.isatty(): + sys.exit(process_stdin_records()) + else: + sys.exit(run_orchestrator(daemon=daemon)) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/archivebox_search.py b/archivebox/cli/archivebox_search.py deleted file mode 100644 index 055e952d..00000000 --- a/archivebox/cli/archivebox_search.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 - -__package__ = 'archivebox.cli' -__command__ = 'archivebox search' - -from pathlib import Path -from typing import Optional, List, Any - -import rich_click as click -from rich import print - -from django.db.models import QuerySet - -from archivebox.config import DATA_DIR -from archivebox.misc.logging import stderr -from archivebox.misc.util import enforce_types, docstring - -# Filter types for URL matching -LINK_FILTERS = { - 'exact': lambda pattern: {'url': pattern}, - 'substring': lambda pattern: {'url__icontains': pattern}, - 'regex': lambda pattern: {'url__iregex': pattern}, - 'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'}, - 'tag': lambda pattern: {'tags__name': pattern}, - 'timestamp': lambda pattern: {'timestamp': pattern}, -} - -STATUS_CHOICES = ['indexed', 'archived', 'unarchived'] - - - -def get_snapshots(snapshots: Optional[QuerySet]=None, - filter_patterns: Optional[List[str]]=None, - filter_type: str='substring', - after: Optional[float]=None, - before: Optional[float]=None, - out_dir: Path=DATA_DIR) -> QuerySet: - """Filter and return Snapshots matching the given criteria.""" - from archivebox.core.models import Snapshot - - if snapshots: - result = snapshots - else: - result = Snapshot.objects.all() - - if after is not None: - result = result.filter(timestamp__gte=after) - if before is not None: - result = result.filter(timestamp__lt=before) - if filter_patterns: - result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type) - - if not result: - stderr('[!] 
No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow') - - return result - - -@enforce_types -def search(filter_patterns: list[str] | None=None, - filter_type: str='substring', - status: str='indexed', - before: float | None=None, - after: float | None=None, - sort: str | None=None, - json: bool=False, - html: bool=False, - csv: str | None=None, - with_headers: bool=False): - """List, filter, and export information about archive entries""" - from archivebox.core.models import Snapshot - - if with_headers and not (json or html or csv): - stderr('[X] --with-headers requires --json, --html or --csv\n', color='red') - raise SystemExit(2) - - # Query DB directly - no filesystem scanning - snapshots = get_snapshots( - filter_patterns=list(filter_patterns) if filter_patterns else None, - filter_type=filter_type, - before=before, - after=after, - ) - - # Apply status filter - if status == 'archived': - snapshots = snapshots.filter(downloaded_at__isnull=False) - elif status == 'unarchived': - snapshots = snapshots.filter(downloaded_at__isnull=True) - # 'indexed' = all snapshots (no filter) - - if sort: - snapshots = snapshots.order_by(sort) - - # Export to requested format - if json: - output = snapshots.to_json(with_headers=with_headers) - elif html: - output = snapshots.to_html(with_headers=with_headers) - elif csv: - output = snapshots.to_csv(cols=csv.split(','), header=with_headers) - else: - from archivebox.misc.logging_util import printable_folders - # Convert to dict for printable_folders - folders = {s.output_dir: s for s in snapshots} - output = printable_folders(folders, with_headers) - - print(output) - return output - - -@click.command() -@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs') -@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status') -@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp') -@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp') -@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at') -@click.option('--json', '-J', is_flag=True, help='Print output in JSON format') -@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)') -@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title') -@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output') -@click.help_option('--help', '-h') -@click.argument('filter_patterns', nargs=-1) -@docstring(search.__doc__) -def main(**kwargs): - return search(**kwargs) - - - -if __name__ == '__main__': - main() diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py index dc540139..87e7482b 100644 --- a/archivebox/cli/archivebox_snapshot.py +++ b/archivebox/cli/archivebox_snapshot.py @@ -1,93 +1,76 @@ #!/usr/bin/env python3 """ -archivebox snapshot [urls_or_crawl_ids...] [--tag=TAG] [--plugins=NAMES] +archivebox snapshot [args...] [--filters] -Create Snapshots from URLs or Crawl jobs. Accepts URLs, Crawl JSONL, or Crawl IDs. +Manage Snapshot records. 
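The list/update pairing is designed for shell pipes, but the same composition works from Python. A small sketch driving the documented `--tag=old` to `--tag=new` retag (assumes `archivebox` is on PATH and the working directory is an ArchiveBox collection):

```python
# Sketch: drive `archivebox snapshot list --tag=old | archivebox snapshot update --tag=new`
# from Python (assumes `archivebox` is on PATH and cwd is an ArchiveBox collection).
import subprocess

listing = subprocess.Popen(
    ["archivebox", "snapshot", "list", "--tag=old"],
    stdout=subprocess.PIPE,
)
updating = subprocess.run(
    ["archivebox", "snapshot", "update", "--tag=new"],
    stdin=listing.stdout,
)
listing.stdout.close()   # let the list side notice if the update side exits early
listing.wait()
print("update exit code:", updating.returncode)
```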
-Input formats: - - Plain URLs (one per line) - - JSONL: {"type": "Crawl", "id": "...", "urls": "..."} - - JSONL: {"type": "Snapshot", "url": "...", "title": "...", "tags": "..."} - - Crawl UUIDs (one per line) - -Output (JSONL): - {"type": "Snapshot", "id": "...", "url": "...", "status": "queued", ...} +Actions: + create - Create Snapshots from URLs or Crawl JSONL + list - List Snapshots as JSONL (with optional filters) + update - Update Snapshots from stdin JSONL + delete - Delete Snapshots from stdin JSONL Examples: - # Create snapshots from URLs directly - archivebox snapshot https://example.com https://foo.com + # Create + archivebox snapshot create https://example.com --tag=news + archivebox crawl create https://example.com | archivebox snapshot create - # Pipe from crawl command - archivebox crawl https://example.com | archivebox snapshot + # List with filters + archivebox snapshot list --status=queued + archivebox snapshot list --url__icontains=example.com - # Chain with extract - archivebox crawl https://example.com | archivebox snapshot | archivebox extract + # Update + archivebox snapshot list --tag=old | archivebox snapshot update --tag=new - # Run specific plugins after creating snapshots - archivebox snapshot --plugins=screenshot,singlefile https://example.com - - # Process existing Snapshot by ID - archivebox snapshot 01234567-89ab-cdef-0123-456789abcdef + # Delete + archivebox snapshot list --url__icontains=spam.com | archivebox snapshot delete --yes """ __package__ = 'archivebox.cli' __command__ = 'archivebox snapshot' import sys -from typing import Optional +from typing import Optional, Iterable import rich_click as click - -from archivebox.misc.util import docstring +from rich import print as rprint -def process_snapshot_by_id(snapshot_id: str) -> int: - """ - Process a single Snapshot by ID (used by workers). +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value - Triggers the Snapshot's state machine tick() which will: - - Transition from queued -> started (creates pending ArchiveResults) - - Transition from started -> sealed (when all ArchiveResults done) - """ - from rich import print as rprint - from archivebox.core.models import Snapshot + if filters: + queryset = queryset.filter(**filters) - try: - snapshot = Snapshot.objects.get(id=snapshot_id) - except Snapshot.DoesNotExist: - rprint(f'[red]Snapshot {snapshot_id} not found[/red]', file=sys.stderr) - return 1 + if limit: + queryset = queryset[:limit] - rprint(f'[blue]Processing Snapshot {snapshot.id} {snapshot.url[:50]} (status={snapshot.status})[/blue]', file=sys.stderr) + return queryset - try: - snapshot.sm.tick() - snapshot.refresh_from_db() - rprint(f'[green]Snapshot complete (status={snapshot.status})[/green]', file=sys.stderr) - return 0 - except Exception as e: - rprint(f'[red]Snapshot error: {type(e).__name__}: {e}[/red]', file=sys.stderr) - return 1 +# ============================================================================= +# CREATE +# ============================================================================= def create_snapshots( - args: tuple, + urls: Iterable[str], tag: str = '', - plugins: str = '', + status: str = 'queued', + depth: int = 0, created_by_id: Optional[int] = None, ) -> int: """ - Create Snapshots from URLs, Crawl JSONL, or Crawl IDs. 
- - Reads from args or stdin, creates Snapshot objects, outputs JSONL. - If --plugins is passed, also runs specified plugins (blocking). + Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records). Exit codes: 0: Success 1: Failure """ - from rich import print as rprint from django.utils import timezone from archivebox.misc.jsonl import ( @@ -102,7 +85,7 @@ def create_snapshots( is_tty = sys.stdout.isatty() # Collect all input records - records = list(read_args_or_stdin(args)) + records = list(read_args_or_stdin(urls)) if not records: rprint('[yellow]No URLs or Crawls provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr) @@ -122,47 +105,44 @@ def create_snapshots( try: crawl = Crawl.objects.get(id=crawl_id) except Crawl.DoesNotExist: - # Crawl doesn't exist, create it - crawl = Crawl.from_jsonl(record, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id}) else: - # No ID, create new crawl - crawl = Crawl.from_jsonl(record, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id}) if not crawl: continue # Create snapshots for each URL in the crawl for url in crawl.get_urls_list(): - # Merge CLI tags with crawl tags merged_tags = crawl.tags_str if tag: - if merged_tags: - merged_tags = f"{merged_tags},{tag}" - else: - merged_tags = tag + merged_tags = f"{merged_tags},{tag}" if merged_tags else tag snapshot_record = { 'url': url, 'tags': merged_tags, 'crawl_id': str(crawl.id), - 'depth': 0, + 'depth': depth, + 'status': status, } - snapshot = Snapshot.from_jsonl(snapshot_record, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json(snapshot_record, overrides={'created_by_id': created_by_id}) if snapshot: created_snapshots.append(snapshot) if not is_tty: - write_record(snapshot.to_jsonl()) + write_record(snapshot.to_json()) elif record_type == TYPE_SNAPSHOT or record.get('url'): # Input is a Snapshot or plain URL - # Add tags if provided via CLI if tag and not record.get('tags'): record['tags'] = tag + if status: + record['status'] = status + record['depth'] = record.get('depth', depth) - snapshot = Snapshot.from_jsonl(record, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id}) if snapshot: created_snapshots.append(snapshot) if not is_tty: - write_record(snapshot.to_jsonl()) + write_record(snapshot.to_json()) except Exception as e: rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr) @@ -174,93 +154,237 @@ def create_snapshots( rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr) - # If TTY, show human-readable output if is_tty: for snapshot in created_snapshots: rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr) - # If --plugins is passed, create ArchiveResults and run the orchestrator - if plugins: - from archivebox.core.models import ArchiveResult - from archivebox.workers.orchestrator import Orchestrator - - # Parse comma-separated plugins list - plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] - - # Create ArchiveResults for the specific plugins on each snapshot - for snapshot in created_snapshots: - for plugin_name in plugins_list: - result, created = ArchiveResult.objects.get_or_create( - snapshot=snapshot, - plugin=plugin_name, - defaults={ - 'status': ArchiveResult.StatusChoices.QUEUED, - 'retry_at': timezone.now(), - } - ) - if not 
created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]: - # Reset for retry - result.status = ArchiveResult.StatusChoices.QUEUED - result.retry_at = timezone.now() - result.save() - - rprint(f'[blue]Running plugins: {plugins}...[/blue]', file=sys.stderr) - orchestrator = Orchestrator(exit_on_idle=True) - orchestrator.runloop() - return 0 -def is_snapshot_id(value: str) -> bool: - """Check if value looks like a Snapshot UUID.""" - import re - uuid_pattern = re.compile(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', re.I) - if not uuid_pattern.match(value): - return False - # Verify it's actually a Snapshot (not a Crawl or other object) +# ============================================================================= +# LIST +# ============================================================================= + +def list_snapshots( + status: Optional[str] = None, + url__icontains: Optional[str] = None, + url__istartswith: Optional[str] = None, + tag: Optional[str] = None, + crawl_id: Optional[str] = None, + limit: Optional[int] = None, +) -> int: + """ + List Snapshots as JSONL with optional filters. + + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record from archivebox.core.models import Snapshot - return Snapshot.objects.filter(id=value).exists() + + is_tty = sys.stdout.isatty() + + queryset = Snapshot.objects.all().order_by('-created_at') + + # Apply filters + filter_kwargs = { + 'status': status, + 'url__icontains': url__icontains, + 'url__istartswith': url__istartswith, + 'crawl_id': crawl_id, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + # Tag filter requires special handling (M2M) + if tag: + queryset = queryset.filter(tags__name__iexact=tag) + + count = 0 + for snapshot in queryset: + if is_tty: + status_color = { + 'queued': 'yellow', + 'started': 'blue', + 'sealed': 'green', + }.get(snapshot.status, 'dim') + rprint(f'[{status_color}]{snapshot.status:8}[/{status_color}] [dim]{snapshot.id}[/dim] {snapshot.url[:60]}') + else: + write_record(snapshot.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr) + return 0 -@click.command() -@click.option('--tag', '-t', default='', help='Comma-separated tags to add to each snapshot') -@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run after creating snapshots (e.g., screenshot,singlefile)') -@click.argument('args', nargs=-1) -def main(tag: str, plugins: str, args: tuple): - """Create Snapshots from URLs/Crawls, or process existing Snapshots by ID""" - from archivebox.misc.jsonl import read_args_or_stdin +# ============================================================================= +# UPDATE +# ============================================================================= - # Read all input - records = list(read_args_or_stdin(args)) +def update_snapshots( + status: Optional[str] = None, + tag: Optional[str] = None, +) -> int: + """ + Update Snapshots from stdin JSONL. + Reads Snapshot records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. 
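Because `update` only reads the `id` from each stdin record and takes the new values from its own flags, update input is trivial to generate. A sketch (the IDs and filename below are hypothetical placeholders):

```python
# Sketch: generate stdin input for `archivebox snapshot update --status=queued`.
# The update command only needs an "id" per record; new values come from the CLI flags.
# (IDs below are hypothetical placeholders.)
import json

snapshot_ids = [
    "01234567-89ab-cdef-0123-456789abcdef",
    "01234567-89ab-cdef-0123-456789abcde0",
]
for snapshot_id in snapshot_ids:
    print(json.dumps({"type": "Snapshot", "id": snapshot_id}))

# usage (hypothetical filename):
#   python make_update_input.py | archivebox snapshot update --status=queued
```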
+ + Exit codes: + 0: Success + 1: No input or error + """ + from django.utils import timezone + + from archivebox.misc.jsonl import read_stdin, write_record + from archivebox.core.models import Snapshot + + is_tty = sys.stdout.isatty() + + records = list(read_stdin()) if not records: - from rich import print as rprint - rprint('[yellow]No URLs, Crawl IDs, or Snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr) - sys.exit(1) + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 - # Check if input looks like existing Snapshot IDs to process - # If ALL inputs are UUIDs with no URL and exist as Snapshots, process them - all_are_snapshot_ids = all( - is_snapshot_id(r.get('id') or r.get('url', '')) - for r in records - if r.get('type') != 'Crawl' # Don't check Crawl records as Snapshot IDs - ) + updated_count = 0 + for record in records: + snapshot_id = record.get('id') + if not snapshot_id: + continue - # But also check that we're not receiving Crawl JSONL - has_crawl_records = any(r.get('type') == 'Crawl' for r in records) + try: + snapshot = Snapshot.objects.get(id=snapshot_id) - if all_are_snapshot_ids and not has_crawl_records: - # Process existing Snapshots by ID - exit_code = 0 - for record in records: - snapshot_id = record.get('id') or record.get('url') - result = process_snapshot_by_id(snapshot_id) - if result != 0: - exit_code = result - sys.exit(exit_code) - else: - # Create new Snapshots from URLs or Crawls - sys.exit(create_snapshots(args, tag=tag, plugins=plugins)) + # Apply updates from CLI flags (override stdin values) + if status: + snapshot.status = status + snapshot.retry_at = timezone.now() + if tag: + # Add tag to existing tags + snapshot.save() # Ensure saved before M2M + from archivebox.core.models import Tag + tag_obj, _ = Tag.objects.get_or_create(name=tag) + snapshot.tags.add(tag_obj) + + snapshot.save() + updated_count += 1 + + if not is_tty: + write_record(snapshot.to_json()) + + except Snapshot.DoesNotExist: + rprint(f'[yellow]Snapshot not found: {snapshot_id}[/yellow]', file=sys.stderr) + continue + + rprint(f'[green]Updated {updated_count} snapshots[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + +def delete_snapshots(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete Snapshots from stdin JSONL. + + Requires --yes flag to confirm deletion. 
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.core.models import Snapshot + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + snapshot_ids = [r.get('id') for r in records if r.get('id')] + + if not snapshot_ids: + rprint('[yellow]No valid snapshot IDs in input[/yellow]', file=sys.stderr) + return 1 + + snapshots = Snapshot.objects.filter(id__in=snapshot_ids) + count = snapshots.count() + + if count == 0: + rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr) + return 0 + + if dry_run: + rprint(f'[yellow]Would delete {count} snapshots (dry run)[/yellow]', file=sys.stderr) + for snapshot in snapshots: + rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr) + return 0 + + if not yes: + rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = snapshots.delete() + rprint(f'[green]Deleted {deleted_count} snapshots[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage Snapshot records.""" + pass + + +@main.command('create') +@click.argument('urls', nargs=-1) +@click.option('--tag', '-t', default='', help='Comma-separated tags to add') +@click.option('--status', '-s', default='queued', help='Initial status (default: queued)') +@click.option('--depth', '-d', type=int, default=0, help='Crawl depth (default: 0)') +def create_cmd(urls: tuple, tag: str, status: str, depth: int): + """Create Snapshots from URLs or stdin JSONL.""" + sys.exit(create_snapshots(urls, tag=tag, status=status, depth=depth)) + + +@main.command('list') +@click.option('--status', '-s', help='Filter by status (queued, started, sealed)') +@click.option('--url__icontains', help='Filter by URL contains') +@click.option('--url__istartswith', help='Filter by URL starts with') +@click.option('--tag', '-t', help='Filter by tag name') +@click.option('--crawl-id', help='Filter by crawl ID') +@click.option('--limit', '-n', type=int, help='Limit number of results') +def list_cmd(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str], + tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]): + """List Snapshots as JSONL.""" + sys.exit(list_snapshots( + status=status, + url__icontains=url__icontains, + url__istartswith=url__istartswith, + tag=tag, + crawl_id=crawl_id, + limit=limit, + )) + + +@main.command('update') +@click.option('--status', '-s', help='Set status') +@click.option('--tag', '-t', help='Add tag') +def update_cmd(status: Optional[str], tag: Optional[str]): + """Update Snapshots from stdin JSONL.""" + sys.exit(update_snapshots(status=status, tag=tag)) + + +@main.command('delete') +@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') +@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +def delete_cmd(yes: bool, dry_run: bool): + """Delete Snapshots from stdin JSONL.""" + sys.exit(delete_snapshots(yes=yes, dry_run=dry_run)) if __name__ == '__main__': diff --git a/archivebox/cli/archivebox_tag.py b/archivebox/cli/archivebox_tag.py new file mode 100644 index 00000000..c9461396 --- /dev/null +++ b/archivebox/cli/archivebox_tag.py @@ -0,0 +1,307 @@ +#!/usr/bin/env 
python3 + +""" +archivebox tag [args...] [--filters] + +Manage Tag records. + +Actions: + create - Create Tags + list - List Tags as JSONL (with optional filters) + update - Update Tags from stdin JSONL + delete - Delete Tags from stdin JSONL + +Examples: + # Create + archivebox tag create news tech science + archivebox tag create "important stuff" + + # List + archivebox tag list + archivebox tag list --name__icontains=news + + # Update (rename tags) + archivebox tag list --name=oldname | archivebox tag update --name=newname + + # Delete + archivebox tag list --name=unused | archivebox tag delete --yes +""" + +__package__ = 'archivebox.cli' +__command__ = 'archivebox tag' + +import sys +from typing import Optional, Iterable + +import rich_click as click +from rich import print as rprint + + +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """Apply Django-style filters from CLI kwargs to a QuerySet.""" + filters = {} + for key, value in filter_kwargs.items(): + if value is not None and key not in ('limit', 'offset'): + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + + if limit: + queryset = queryset[:limit] + + return queryset + + +# ============================================================================= +# CREATE +# ============================================================================= + +def create_tags(names: Iterable[str]) -> int: + """ + Create Tags from names. + + Exit codes: + 0: Success + 1: Failure + """ + from archivebox.misc.jsonl import write_record + from archivebox.core.models import Tag + + is_tty = sys.stdout.isatty() + + # Convert to list if needed + name_list = list(names) if names else [] + + if not name_list: + rprint('[yellow]No tag names provided. Pass names as arguments.[/yellow]', file=sys.stderr) + return 1 + + created_count = 0 + for name in name_list: + name = name.strip() + if not name: + continue + + tag, created = Tag.objects.get_or_create(name=name) + + if not is_tty: + write_record(tag.to_json()) + + if created: + created_count += 1 + rprint(f'[green]Created tag: {name}[/green]', file=sys.stderr) + else: + rprint(f'[dim]Tag already exists: {name}[/dim]', file=sys.stderr) + + rprint(f'[green]Created {created_count} new tags[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# LIST +# ============================================================================= + +def list_tags( + name: Optional[str] = None, + name__icontains: Optional[str] = None, + limit: Optional[int] = None, +) -> int: + """ + List Tags as JSONL with optional filters. 
+ + Exit codes: + 0: Success (even if no results) + """ + from archivebox.misc.jsonl import write_record + from archivebox.core.models import Tag + + is_tty = sys.stdout.isatty() + + queryset = Tag.objects.all().order_by('name') + + # Apply filters + filter_kwargs = { + 'name': name, + 'name__icontains': name__icontains, + } + queryset = apply_filters(queryset, filter_kwargs, limit=limit) + + count = 0 + for tag in queryset: + snapshot_count = tag.snapshot_set.count() + if is_tty: + rprint(f'[cyan]{tag.name:30}[/cyan] [dim]({snapshot_count} snapshots)[/dim]') + else: + write_record(tag.to_json()) + count += 1 + + rprint(f'[dim]Listed {count} tags[/dim]', file=sys.stderr) + return 0 + + +# ============================================================================= +# UPDATE +# ============================================================================= + +def update_tags(name: Optional[str] = None) -> int: + """ + Update Tags from stdin JSONL. + + Reads Tag records from stdin and applies updates. + Uses PATCH semantics - only specified fields are updated. + + Exit codes: + 0: Success + 1: No input or error + """ + from archivebox.misc.jsonl import read_stdin, write_record + from archivebox.core.models import Tag + + is_tty = sys.stdout.isatty() + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + updated_count = 0 + for record in records: + tag_id = record.get('id') + old_name = record.get('name') + + if not tag_id and not old_name: + continue + + try: + if tag_id: + tag = Tag.objects.get(id=tag_id) + else: + tag = Tag.objects.get(name=old_name) + + # Apply updates from CLI flags + if name: + tag.name = name + tag.save() + + updated_count += 1 + + if not is_tty: + write_record(tag.to_json()) + + except Tag.DoesNotExist: + rprint(f'[yellow]Tag not found: {tag_id or old_name}[/yellow]', file=sys.stderr) + continue + + rprint(f'[green]Updated {updated_count} tags[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# DELETE +# ============================================================================= + +def delete_tags(yes: bool = False, dry_run: bool = False) -> int: + """ + Delete Tags from stdin JSONL. + + Requires --yes flag to confirm deletion. 
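Piped records may carry either an `id` or just a `name`, so the delete lookup is composed from OR'd `Q` objects. Distilled into a sketch (assumes a configured Django environment; the names below are placeholders):

```python
# Distilled sketch of the id-or-name matching used by `archivebox tag delete`
# (assumes a configured Django environment; ids/names below are placeholders).
from django.db.models import Q
from archivebox.core.models import Tag

tag_ids = []                        # from stdin records that had an "id"
tag_names = ["unused", "spam"]      # from stdin records that only had a "name"

query = Q()
if tag_ids:
    query |= Q(id__in=tag_ids)
if tag_names:
    query |= Q(name__in=tag_names)

matching = Tag.objects.filter(query)
print(matching.count(), "tags would be deleted")
```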
+ + Exit codes: + 0: Success + 1: No input or missing --yes flag + """ + from archivebox.misc.jsonl import read_stdin + from archivebox.core.models import Tag + + records = list(read_stdin()) + if not records: + rprint('[yellow]No records provided via stdin[/yellow]', file=sys.stderr) + return 1 + + # Collect tag IDs or names + tag_ids = [] + tag_names = [] + for r in records: + if r.get('id'): + tag_ids.append(r['id']) + elif r.get('name'): + tag_names.append(r['name']) + + if not tag_ids and not tag_names: + rprint('[yellow]No valid tag IDs or names in input[/yellow]', file=sys.stderr) + return 1 + + from django.db.models import Q + query = Q() + if tag_ids: + query |= Q(id__in=tag_ids) + if tag_names: + query |= Q(name__in=tag_names) + + tags = Tag.objects.filter(query) + count = tags.count() + + if count == 0: + rprint('[yellow]No matching tags found[/yellow]', file=sys.stderr) + return 0 + + if dry_run: + rprint(f'[yellow]Would delete {count} tags (dry run)[/yellow]', file=sys.stderr) + for tag in tags: + rprint(f' {tag.name}', file=sys.stderr) + return 0 + + if not yes: + rprint('[red]Use --yes to confirm deletion[/red]', file=sys.stderr) + return 1 + + # Perform deletion + deleted_count, _ = tags.delete() + rprint(f'[green]Deleted {deleted_count} tags[/green]', file=sys.stderr) + return 0 + + +# ============================================================================= +# CLI Commands +# ============================================================================= + +@click.group() +def main(): + """Manage Tag records.""" + pass + + +@main.command('create') +@click.argument('names', nargs=-1) +def create_cmd(names: tuple): + """Create Tags from names.""" + sys.exit(create_tags(names)) + + +@main.command('list') +@click.option('--name', help='Filter by exact name') +@click.option('--name__icontains', help='Filter by name contains') +@click.option('--limit', '-n', type=int, help='Limit number of results') +def list_cmd(name: Optional[str], name__icontains: Optional[str], limit: Optional[int]): + """List Tags as JSONL.""" + sys.exit(list_tags(name=name, name__icontains=name__icontains, limit=limit)) + + +@main.command('update') +@click.option('--name', '-n', help='Set new name') +def update_cmd(name: Optional[str]): + """Update Tags from stdin JSONL.""" + sys.exit(update_tags(name=name)) + + +@main.command('delete') +@click.option('--yes', '-y', is_flag=True, help='Confirm deletion') +@click.option('--dry-run', is_flag=True, help='Show what would be deleted') +def delete_cmd(yes: bool, dry_run: bool): + """Delete Tags from stdin JSONL.""" + sys.exit(delete_tags(yes=yes, dry_run=dry_run)) + + +if __name__ == '__main__': + main() diff --git a/archivebox/cli/tests_piping.py b/archivebox/cli/tests_piping.py index f6aee426..47953232 100644 --- a/archivebox/cli/tests_piping.py +++ b/archivebox/cli/tests_piping.py @@ -1,17 +1,18 @@ #!/usr/bin/env python3 """ -Tests for CLI piping workflow: crawl | snapshot | extract +Tests for CLI piping workflow: crawl | snapshot | archiveresult | run This module tests the JSONL-based piping between CLI commands as described in: https://github.com/ArchiveBox/ArchiveBox/issues/1363 Workflows tested: - archivebox crawl URL -> Crawl JSONL - archivebox snapshot -> Snapshot JSONL (accepts Crawl or URL input) - archivebox extract -> ArchiveResult JSONL (accepts Snapshot input) + archivebox crawl create URL -> Crawl JSONL + archivebox snapshot create -> Snapshot JSONL (accepts Crawl or URL input) + archivebox archiveresult create -> ArchiveResult JSONL (accepts 
Snapshot input) + archivebox run -> Process queued records (accepts any JSONL) Pipeline: - archivebox crawl URL | archivebox snapshot | archivebox extract + archivebox crawl create URL | archivebox snapshot create | archivebox archiveresult create | archivebox run Each command should: - Accept URLs, IDs, or JSONL as input (args or stdin) @@ -154,13 +155,13 @@ class TestJSONLParsing(unittest.TestCase): class TestJSONLOutput(unittest.TestCase): """Test JSONL output formatting.""" - def test_crawl_to_jsonl(self): - """Crawl model should serialize to JSONL correctly.""" + def test_crawl_to_json(self): + """Crawl model should serialize to JSON correctly.""" from archivebox.misc.jsonl import TYPE_CRAWL - # Create a mock crawl with to_jsonl method configured + # Create a mock crawl with to_json method configured mock_crawl = MagicMock() - mock_crawl.to_jsonl.return_value = { + mock_crawl.to_json.return_value = { 'type': TYPE_CRAWL, 'schema_version': '0.9.0', 'id': 'test-crawl-uuid', @@ -172,7 +173,7 @@ class TestJSONLOutput(unittest.TestCase): 'created_at': None, } - result = mock_crawl.to_jsonl() + result = mock_crawl.to_json() self.assertEqual(result['type'], TYPE_CRAWL) self.assertEqual(result['id'], 'test-crawl-uuid') self.assertEqual(result['urls'], 'https://example.com') @@ -351,8 +352,8 @@ class TestSnapshotCommand(unittest.TestCase): # using real Snapshot instances. -class TestExtractCommand(unittest.TestCase): - """Unit tests for archivebox extract command.""" +class TestArchiveResultCommand(unittest.TestCase): + """Unit tests for archivebox archiveresult command.""" def setUp(self): """Set up test environment.""" @@ -363,8 +364,8 @@ class TestExtractCommand(unittest.TestCase): """Clean up test environment.""" shutil.rmtree(self.test_dir, ignore_errors=True) - def test_extract_accepts_snapshot_id(self): - """extract should accept snapshot IDs as input.""" + def test_archiveresult_accepts_snapshot_id(self): + """archiveresult should accept snapshot IDs as input.""" from archivebox.misc.jsonl import read_args_or_stdin uuid = '01234567-89ab-cdef-0123-456789abcdef' @@ -374,8 +375,8 @@ class TestExtractCommand(unittest.TestCase): self.assertEqual(len(records), 1) self.assertEqual(records[0]['id'], uuid) - def test_extract_accepts_jsonl_snapshot(self): - """extract should accept JSONL Snapshot records.""" + def test_archiveresult_accepts_jsonl_snapshot(self): + """archiveresult should accept JSONL Snapshot records.""" from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n') @@ -387,8 +388,8 @@ class TestExtractCommand(unittest.TestCase): self.assertEqual(records[0]['type'], TYPE_SNAPSHOT) self.assertEqual(records[0]['id'], 'abc123') - def test_extract_gathers_snapshot_ids(self): - """extract should gather snapshot IDs from various input formats.""" + def test_archiveresult_gathers_snapshot_ids(self): + """archiveresult should gather snapshot IDs from various input formats.""" from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT records = [ @@ -529,7 +530,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # Create crawl with multiple URLs (as newline-separated string) urls = 'https://test-crawl-1.example.com\nhttps://test-crawl-2.example.com' - crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id}) self.assertIsNotNone(crawl) self.assertIsNotNone(crawl.id) 
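The renamed `from_json`/`to_json` helpers make the in-process equivalent of the shell pipeline short. A sketch in the spirit of these integration tests (assumes a configured Django environment; `created_by` overrides are omitted here and assumed to fall back to the model defaults / system user):

```python
# In-process sketch of: archivebox crawl create URL | archivebox snapshot create
# (assumes a configured Django environment; created_by falls back to model defaults).
import json
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot

crawl = Crawl.from_json({'urls': 'https://example.com'})
print(json.dumps(crawl.to_json()))                    # what `crawl create` would emit

for url in crawl.get_urls_list():
    snapshot = Snapshot.from_json({'url': url, 'crawl_id': str(crawl.id)})
    if snapshot:
        print(json.dumps(snapshot.to_json()))         # what `snapshot create` would emit
```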
@@ -543,7 +544,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): self.assertIn('https://test-crawl-2.example.com', urls_list) # Verify output format - output = crawl.to_jsonl() + output = crawl.to_json() self.assertEqual(output['type'], TYPE_CRAWL) self.assertIn('id', output) self.assertEqual(output['urls'], urls) @@ -566,8 +567,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # Step 1: Create crawl (simulating 'archivebox crawl') urls = 'https://crawl-to-snap-1.example.com\nhttps://crawl-to-snap-2.example.com' - crawl = Crawl.from_jsonl({'urls': urls}, overrides={'created_by_id': created_by_id}) - crawl_output = crawl.to_jsonl() + crawl = Crawl.from_json({'urls': urls}, overrides={'created_by_id': created_by_id}) + crawl_output = crawl.to_json() # Step 2: Parse crawl output as snapshot input stdin = StringIO(json.dumps(crawl_output) + '\n') @@ -581,7 +582,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # Step 3: Create snapshots from crawl URLs created_snapshots = [] for url in crawl.get_urls_list(): - snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id}) if snapshot: created_snapshots.append(snapshot) @@ -589,7 +590,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # Verify snapshot output for snapshot in created_snapshots: - output = snapshot.to_jsonl() + output = snapshot.to_json() self.assertEqual(output['type'], TYPE_SNAPSHOT) self.assertIn(output['url'], [ 'https://crawl-to-snap-1.example.com', @@ -619,13 +620,13 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # Create snapshot overrides = {'created_by_id': created_by_id} - snapshot = Snapshot.from_jsonl(records[0], overrides=overrides) + snapshot = Snapshot.from_json(records[0], overrides=overrides) self.assertIsNotNone(snapshot.id) self.assertEqual(snapshot.url, url) # Verify output format - output = snapshot.to_jsonl() + output = snapshot.to_json() self.assertEqual(output['type'], TYPE_SNAPSHOT) self.assertIn('id', output) self.assertEqual(output['url'], url) @@ -647,8 +648,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # Step 1: Create snapshot (simulating 'archivebox snapshot') url = 'https://test-extract-1.example.com' overrides = {'created_by_id': created_by_id} - snapshot = Snapshot.from_jsonl({'url': url}, overrides=overrides) - snapshot_output = snapshot.to_jsonl() + snapshot = Snapshot.from_json({'url': url}, overrides=overrides) + snapshot_output = snapshot.to_json() # Step 2: Parse snapshot output as extract input stdin = StringIO(json.dumps(snapshot_output) + '\n') @@ -686,8 +687,8 @@ class TestPipingWorkflowIntegration(unittest.TestCase): # === archivebox crawl https://example.com === url = 'https://test-pipeline-full.example.com' - crawl = Crawl.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id}) - crawl_jsonl = json.dumps(crawl.to_jsonl()) + crawl = Crawl.from_json({'url': url}, overrides={'created_by_id': created_by_id}) + crawl_jsonl = json.dumps(crawl.to_json()) # === | archivebox snapshot === stdin = StringIO(crawl_jsonl + '\n') @@ -705,7 +706,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): if crawl_id: db_crawl = Crawl.objects.get(id=crawl_id) for crawl_url in db_crawl.get_urls_list(): - snapshot = Snapshot.from_jsonl({'url': crawl_url}, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json({'url': crawl_url}, overrides={'created_by_id': created_by_id}) if snapshot: 
created_snapshots.append(snapshot) @@ -713,7 +714,7 @@ class TestPipingWorkflowIntegration(unittest.TestCase): self.assertEqual(created_snapshots[0].url, url) # === | archivebox extract === - snapshot_jsonl_lines = [json.dumps(s.to_jsonl()) for s in created_snapshots] + snapshot_jsonl_lines = [json.dumps(s.to_json()) for s in created_snapshots] stdin = StringIO('\n'.join(snapshot_jsonl_lines) + '\n') stdin.isatty = lambda: False @@ -757,12 +758,12 @@ class TestDepthWorkflows(unittest.TestCase): # Create crawl with depth 0 url = 'https://depth0-test.example.com' - crawl = Crawl.from_jsonl({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id}) + crawl = Crawl.from_json({'url': url, 'max_depth': 0}, overrides={'created_by_id': created_by_id}) self.assertEqual(crawl.max_depth, 0) # Create snapshot - snapshot = Snapshot.from_jsonl({'url': url}, overrides={'created_by_id': created_by_id}) + snapshot = Snapshot.from_json({'url': url}, overrides={'created_by_id': created_by_id}) self.assertEqual(snapshot.url, url) def test_depth_metadata_in_crawl(self): @@ -773,7 +774,7 @@ class TestDepthWorkflows(unittest.TestCase): created_by_id = get_or_create_system_user_pk() # Create crawl with depth - crawl = Crawl.from_jsonl( + crawl = Crawl.from_json( {'url': 'https://depth-meta-test.example.com', 'max_depth': 2}, overrides={'created_by_id': created_by_id} ) @@ -781,7 +782,7 @@ class TestDepthWorkflows(unittest.TestCase): self.assertEqual(crawl.max_depth, 2) # Verify in JSONL output - output = crawl.to_jsonl() + output = crawl.to_json() self.assertEqual(output['max_depth'], 2) diff --git a/archivebox/core/forms.py b/archivebox/core/forms.py index dd7d04da..b749951d 100644 --- a/archivebox/core/forms.py +++ b/archivebox/core/forms.py @@ -158,7 +158,7 @@ class AddLinkForm(forms.Form): 'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite' } binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'} - extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'} + extensions = {'twocaptcha', 'istilldontcareaboutcookies', 'ublock'} # Populate plugin field choices self.fields['chrome_plugins'].choices = [ diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 883733c5..1dca0810 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1,6 +1,6 @@ __package__ = 'archivebox.core' -from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING +from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING, Iterator, Set from archivebox.uuid_compat import uuid7 from datetime import datetime, timedelta from django_stubs_ext.db.models import TypedModelMeta @@ -41,6 +41,8 @@ from archivebox.machine.models import NetworkInterface, Binary class Tag(ModelWithSerializers): + JSONL_TYPE = 'Tag' + # Keep AutoField for compatibility with main branch migrations # Don't use UUIDField here - requires complex FK transformation id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID') @@ -91,26 +93,66 @@ class Tag(ModelWithSerializers): def api_url(self) -> str: return reverse_lazy('api-1:get_tag', args=[self.id]) - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Tag model instance to a JSONL record. + Convert Tag model instance to a JSON-serializable dict. 
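The generator-based `to_jsonl()` methods added below all share one dedup convention: a caller-supplied `seen` set of `(type, id)` tuples guards against emitting the same record twice when object graphs overlap. A standalone sketch of the pattern, with simplified stand-ins for the real model records:

```python
# Standalone sketch of the (type, id) `seen` dedup convention used by the to_jsonl() generators.
from typing import Iterator, Set, Tuple

def emit_once(record_type: str, record_id: str, seen: Set[Tuple[str, str]]) -> Iterator[dict]:
    key = (record_type, record_id)
    if key in seen:
        return                      # already yielded earlier in this stream
    seen.add(key)
    yield {"type": record_type, "id": record_id}

seen: Set[Tuple[str, str]] = set()
stream = [
    *emit_once("Binary", "b1", seen),
    *emit_once("Binary", "b1", seen),   # duplicate, skipped
    *emit_once("Process", "p1", seen),
]
print(stream)   # -> [{'type': 'Binary', 'id': 'b1'}, {'type': 'Process', 'id': 'p1'}]
```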
""" from archivebox.config import VERSION return { - 'type': 'Tag', + 'type': self.JSONL_TYPE, 'schema_version': VERSION, 'id': str(self.id), 'name': self.name, 'slug': self.slug, } - @staticmethod - def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None): + def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]: """ - Create/update Tag from JSONL record. + Yield this Tag as a JSON record. Args: - record: JSONL record with 'name' field + seen: Set of (type, id) tuples already emitted (for deduplication) + **kwargs: Passed to children (none for Tag, leaf node) + + Yields: + dict: JSON-serializable record for this tag + """ + if seen is not None: + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + yield self.to_json() + + @classmethod + def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['Tag']: + """ + Create/update Tags from an iterable of JSONL records. + Filters to only records with type='Tag'. + + Args: + records: Iterable of dicts (JSONL records) + overrides: Optional dict with 'snapshot' to auto-attach tags + + Returns: + List of Tag instances (skips None results) + """ + results = [] + for record in records: + record_type = record.get('type', cls.JSONL_TYPE) + if record_type == cls.JSONL_TYPE: + instance = cls.from_json(record, overrides=overrides) + if instance: + results.append(instance) + return results + + @staticmethod + def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'Tag | None': + """ + Create/update a single Tag from a JSON record dict. + + Args: + record: Dict with 'name' field overrides: Optional dict with 'snapshot' to auto-attach tag Returns: @@ -289,6 +331,8 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)): class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine): + JSONL_TYPE = 'Snapshot' + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) @@ -968,38 +1012,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea Each line is a JSON record with a 'type' field: - Snapshot: snapshot metadata (crawl_id, url, tags, etc.) - - ArchiveResult: extractor results (plugin, status, output, etc.) - Binary: binary info used for the extraction - Process: process execution details (cmd, exit_code, timing, etc.) + - ArchiveResult: extractor results (plugin, status, output, etc.) 
""" import json index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME index_path.parent.mkdir(parents=True, exist_ok=True) - # Track unique binaries and processes to avoid duplicates - binaries_seen = set() - processes_seen = set() - with open(index_path, 'w') as f: - # Write Snapshot record first (to_jsonl includes crawl_id, fs_version) - f.write(json.dumps(self.to_jsonl()) + '\n') - - # Write ArchiveResult records with their associated Binary and Process - # Use select_related to optimize queries - for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'): - # Write Binary record if not already written - if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen: - binaries_seen.add(ar.process.binary_id) - f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n') - - # Write Process record if not already written - if ar.process and ar.process_id not in processes_seen: - processes_seen.add(ar.process_id) - f.write(json.dumps(ar.process.to_jsonl()) + '\n') - - # Write ArchiveResult record - f.write(json.dumps(ar.to_jsonl()) + '\n') + for record in self.to_jsonl(): + f.write(json.dumps(record) + '\n') def read_index_jsonl(self) -> dict: """ @@ -1420,14 +1444,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea return False - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Snapshot model instance to a JSONL record. + Convert Snapshot model instance to a JSON-serializable dict. Includes all fields needed to fully reconstruct/identify this snapshot. """ from archivebox.config import VERSION return { - 'type': 'Snapshot', + 'type': self.JSONL_TYPE, 'schema_version': VERSION, 'id': str(self.id), 'crawl_id': str(self.crawl_id), @@ -1442,12 +1466,68 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea 'fs_version': self.fs_version, } - @staticmethod - def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True): + def to_jsonl(self, seen: Set[tuple] = None, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]: """ - Create/update Snapshot from JSONL record or dict. + Yield this Snapshot and optionally related objects as JSON records. - Unified method that handles: + Uses select_related for efficient querying. Deduplicates automatically. 
+ + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + archiveresult: Include related ArchiveResults (default: True) + process: Include Process for each ArchiveResult (default: True) + binary: Include Binary for each Process (default: True) + machine: Include Machine for each Process (default: False) + iface: Include NetworkInterface for each Process (default: False) + **kwargs: Additional options passed to children + + Yields: + dict: JSON-serializable records + """ + if seen is None: + seen = set() + + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + + yield self.to_json() + + if archiveresult: + # Use select_related to optimize queries + for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'): + yield from ar.to_jsonl(seen=seen, process=process, binary=binary, machine=machine, iface=iface, **kwargs) + + @classmethod + def from_jsonl(cls, records, overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> list['Snapshot']: + """ + Create/update Snapshots from an iterable of JSONL records. + Filters to only records with type='Snapshot' (or no type). + + Args: + records: Iterable of dicts (JSONL records) + overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id' + queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True) + + Returns: + List of Snapshot instances (skips None results) + """ + results = [] + for record in records: + record_type = record.get('type', cls.JSONL_TYPE) + if record_type == cls.JSONL_TYPE: + instance = cls.from_json(record, overrides=overrides, queue_for_extraction=queue_for_extraction) + if instance: + results.append(instance) + return results + + @staticmethod + def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> 'Snapshot | None': + """ + Create/update a single Snapshot from a JSON record dict. + + Handles: - ID-based patching: {"id": "...", "title": "new title"} - URL-based create/update: {"url": "...", "title": "...", "tags": "..."} - Auto-creates Crawl if not provided @@ -2054,8 +2134,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea result['canonical'] = self.canonical_outputs() return result - def to_json(self, indent: int = 4) -> str: - """Convert to JSON string""" + def to_json_str(self, indent: int = 4) -> str: + """Convert to JSON string for file output.""" return to_json(self.to_dict(extended=True), indent=indent) def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str: @@ -2203,6 +2283,8 @@ class SnapshotMachine(BaseStateMachine, strict_states=True): class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine): + JSONL_TYPE = 'ArchiveResult' + class StatusChoices(models.TextChoices): QUEUED = 'queued', 'Queued' STARTED = 'started', 'Started' @@ -2274,13 +2356,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi """Convenience property to access the user who created this archive result via its snapshot's crawl.""" return self.snapshot.crawl.created_by - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert ArchiveResult model instance to a JSONL record. + Convert ArchiveResult model instance to a JSON-serializable dict. 
""" from archivebox.config import VERSION record = { - 'type': 'ArchiveResult', + 'type': self.JSONL_TYPE, 'schema_version': VERSION, 'id': str(self.id), 'snapshot_id': str(self.snapshot_id), @@ -2308,6 +2390,31 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi record['process_id'] = str(self.process_id) return record + def to_jsonl(self, seen: Set[tuple] = None, process: bool = True, **kwargs) -> Iterator[dict]: + """ + Yield this ArchiveResult and optionally related objects as JSON records. + + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + process: Include related Process and its children (default: True) + **kwargs: Passed to Process.to_jsonl() (e.g., binary=True, machine=False) + + Yields: + dict: JSON-serializable records + """ + if seen is None: + seen = set() + + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + + yield self.to_json() + + if process and self.process: + yield from self.process.to_jsonl(seen=seen, **kwargs) + def save(self, *args, **kwargs): is_new = self._state.adding diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 3e1a53f9..9e756f29 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -1,6 +1,6 @@ __package__ = 'archivebox.crawls' -from typing import TYPE_CHECKING, Iterable +from typing import TYPE_CHECKING, Iterable, Iterator, Set from datetime import timedelta from archivebox.uuid_compat import uuid7 from pathlib import Path @@ -59,6 +59,8 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats): class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine): + JSONL_TYPE = 'Crawl' + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False) @@ -134,13 +136,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith def api_url(self) -> str: return reverse_lazy('api-1:get_crawl', args=[self.id]) - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Crawl model instance to a JSONL record. + Convert Crawl model instance to a JSON-serializable dict. """ from archivebox.config import VERSION return { - 'type': 'Crawl', + 'type': self.JSONL_TYPE, 'schema_version': VERSION, 'id': str(self.id), 'urls': self.urls, @@ -151,10 +153,63 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith 'created_at': self.created_at.isoformat() if self.created_at else None, } - @staticmethod - def from_jsonl(record: dict, overrides: dict = None): + def to_jsonl(self, seen: Set[tuple] = None, snapshot: bool = True, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]: """ - Create or get a Crawl from a JSONL record. + Yield this Crawl and optionally related objects as JSON records. 
+ + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + snapshot: Include related Snapshots (default: True) + archiveresult: Include ArchiveResults for each Snapshot (default: True) + process: Include Process for each ArchiveResult (default: True) + binary: Include Binary for each Process (default: True) + machine: Include Machine for each Process (default: False) + iface: Include NetworkInterface for each Process (default: False) + **kwargs: Additional options passed to children + + Yields: + dict: JSON-serializable records + """ + if seen is None: + seen = set() + + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + + yield self.to_json() + + if snapshot: + for snap in self.snapshot_set.all(): + yield from snap.to_jsonl(seen=seen, archiveresult=archiveresult, process=process, binary=binary, machine=machine, iface=iface, **kwargs) + + @classmethod + def from_jsonl(cls, records, overrides: dict = None) -> list['Crawl']: + """ + Create/update Crawls from an iterable of JSONL records. + Filters to only records with type='Crawl' (or no type). + + Args: + records: Iterable of dicts (JSONL records) + overrides: Dict of field overrides (e.g., created_by_id) + + Returns: + List of Crawl instances (skips None results) + """ + results = [] + for record in records: + record_type = record.get('type', cls.JSONL_TYPE) + if record_type == cls.JSONL_TYPE: + instance = cls.from_json(record, overrides=overrides) + if instance: + results.append(instance) + return results + + @staticmethod + def from_json(record: dict, overrides: dict = None) -> 'Crawl | None': + """ + Create or get a single Crawl from a JSON record dict. Args: record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label' diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 6485f2c0..2a506e9b 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -1176,7 +1176,9 @@ def create_model_record(record: Dict[str, Any]) -> Any: def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]: """ Process JSONL records from hook output. - Dispatches to Model.from_jsonl() for each record type. + + Uses Model.from_jsonl() which automatically filters by JSONL_TYPE. + Each model only processes records matching its type. 
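+
+    Illustrative example, assuming a hook emitted one Snapshot and one Tag
+    record, both are created successfully, and overrides carries the calling
+    snapshot:
+
+        stats = process_hook_records(result['records'], overrides={'snapshot': snapshot})
+        # stats == {'Snapshot': 1, 'Tag': 1, 'Binary': 0, 'Machine': 0}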
Args: records: List of JSONL record dicts from result['records'] @@ -1185,54 +1187,26 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any Returns: Dict with counts by record type """ - stats = {} + from archivebox.core.models import Snapshot, Tag + from archivebox.machine.models import Binary, Machine + overrides = overrides or {} - for record in records: - record_type = record.get('type') - if not record_type: - continue + # Filter out ArchiveResult records (they update the calling AR, not create new ones) + filtered_records = [r for r in records if r.get('type') != 'ArchiveResult'] - # Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones) - if record_type == 'ArchiveResult': - continue + # Each model's from_jsonl() filters to only its own type + snapshots = Snapshot.from_jsonl(filtered_records, overrides) + tags = Tag.from_jsonl(filtered_records, overrides) + binaries = Binary.from_jsonl(filtered_records, overrides) + machines = Machine.from_jsonl(filtered_records, overrides) - try: - # Dispatch to appropriate model's from_jsonl() method - if record_type == 'Snapshot': - from archivebox.core.models import Snapshot - obj = Snapshot.from_jsonl(record.copy(), overrides) - if obj: - stats['Snapshot'] = stats.get('Snapshot', 0) + 1 - - elif record_type == 'Tag': - from archivebox.core.models import Tag - obj = Tag.from_jsonl(record.copy(), overrides) - if obj: - stats['Tag'] = stats.get('Tag', 0) + 1 - - elif record_type == 'Binary': - from archivebox.machine.models import Binary - obj = Binary.from_jsonl(record.copy(), overrides) - if obj: - stats['Binary'] = stats.get('Binary', 0) + 1 - - elif record_type == 'Machine': - from archivebox.machine.models import Machine - obj = Machine.from_jsonl(record.copy(), overrides) - if obj: - stats['Machine'] = stats.get('Machine', 0) + 1 - - else: - import sys - print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr) - - except Exception as e: - import sys - print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr) - continue - - return stats + return { + 'Snapshot': len(snapshots), + 'Tag': len(tags), + 'Binary': len(binaries), + 'Machine': len(machines), + } def process_is_alive(pid_file: Path) -> bool: diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index 2d15bf1f..c0659afd 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -1,6 +1,7 @@ __package__ = 'archivebox.machine' import socket +from typing import Iterator, Set from archivebox.uuid_compat import uuid7 from datetime import timedelta @@ -29,6 +30,8 @@ class MachineManager(models.Manager): class Machine(ModelWithHealthStats): + JSONL_TYPE = 'Machine' + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) @@ -69,13 +72,35 @@ class Machine(ModelWithHealthStats): ) return _CURRENT_MACHINE - @staticmethod - def from_jsonl(record: dict, overrides: dict = None): + @classmethod + def from_jsonl(cls, records, overrides: dict = None) -> list['Machine']: """ - Update Machine config from JSONL record. + Update Machine configs from an iterable of JSONL records. + Filters to only records with type='Machine'. Args: - record: JSONL record with '_method': 'update', 'key': '...', 'value': '...' 
+ records: Iterable of dicts (JSONL records) + overrides: Not used + + Returns: + List of Machine instances (skips None results) + """ + results = [] + for record in records: + record_type = record.get('type', cls.JSONL_TYPE) + if record_type == cls.JSONL_TYPE: + instance = cls.from_json(record, overrides=overrides) + if instance: + results.append(instance) + return results + + @staticmethod + def from_json(record: dict, overrides: dict = None) -> 'Machine | None': + """ + Update a single Machine config from a JSON record dict. + + Args: + record: Dict with '_method': 'update', 'key': '...', 'value': '...' overrides: Not used Returns: @@ -94,6 +119,44 @@ class Machine(ModelWithHealthStats): return machine return None + def to_json(self) -> dict: + """ + Convert Machine model instance to a JSON-serializable dict. + """ + from archivebox.config import VERSION + return { + 'type': self.JSONL_TYPE, + 'schema_version': VERSION, + 'id': str(self.id), + 'guid': self.guid, + 'hostname': self.hostname, + 'hw_in_docker': self.hw_in_docker, + 'hw_in_vm': self.hw_in_vm, + 'os_arch': self.os_arch, + 'os_family': self.os_family, + 'os_platform': self.os_platform, + 'os_release': self.os_release, + 'created_at': self.created_at.isoformat() if self.created_at else None, + } + + def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]: + """ + Yield this Machine as a JSON record. + + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + **kwargs: Passed to children (none for Machine, leaf node) + + Yields: + dict: JSON-serializable record for this machine + """ + if seen is not None: + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + yield self.to_json() + class NetworkInterfaceManager(models.Manager): def current(self) -> 'NetworkInterface': @@ -101,6 +164,8 @@ class NetworkInterfaceManager(models.Manager): class NetworkInterface(ModelWithHealthStats): + JSONL_TYPE = 'NetworkInterface' + id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) modified_at = models.DateTimeField(auto_now=True) @@ -139,6 +204,46 @@ class NetworkInterface(ModelWithHealthStats): ) return _CURRENT_INTERFACE + def to_json(self) -> dict: + """ + Convert NetworkInterface model instance to a JSON-serializable dict. + """ + from archivebox.config import VERSION + return { + 'type': self.JSONL_TYPE, + 'schema_version': VERSION, + 'id': str(self.id), + 'machine_id': str(self.machine_id), + 'hostname': self.hostname, + 'iface': self.iface, + 'ip_public': self.ip_public, + 'ip_local': self.ip_local, + 'mac_address': self.mac_address, + 'dns_server': self.dns_server, + 'isp': self.isp, + 'city': self.city, + 'region': self.region, + 'country': self.country, + 'created_at': self.created_at.isoformat() if self.created_at else None, + } + + def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]: + """ + Yield this NetworkInterface as a JSON record. 
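+
+        Illustrative sketch of the seen-based deduplication shared by all
+        to_jsonl() methods, assuming a NetworkInterface instance named iface:
+
+            seen = set()
+            records = list(iface.to_jsonl(seen=seen)) + list(iface.to_jsonl(seen=seen))
+            # len(records) == 1, the second call skips the already-emitted record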
+ + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + **kwargs: Passed to children (none for NetworkInterface, leaf node) + + Yields: + dict: JSON-serializable record for this network interface + """ + if seen is not None: + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + yield self.to_json() class BinaryManager(models.Manager): @@ -165,7 +270,7 @@ class BinaryManager(models.Manager): class Binary(ModelWithHealthStats): """ - Tracks an binary on a specific machine. + Tracks a binary on a specific machine. Follows the unified state machine pattern: - queued: Binary needs to be installed @@ -176,6 +281,7 @@ class Binary(ModelWithHealthStats): State machine calls run() which executes on_Binary__install_* hooks to install the binary using the specified providers. """ + JSONL_TYPE = 'Binary' class StatusChoices(models.TextChoices): QUEUED = 'queued', 'Queued' @@ -242,13 +348,13 @@ class Binary(ModelWithHealthStats): 'is_valid': self.is_valid, } - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Binary model instance to a JSONL record. + Convert Binary model instance to a JSON-serializable dict. """ from archivebox.config import VERSION return { - 'type': 'Binary', + 'type': self.JSONL_TYPE, 'schema_version': VERSION, 'id': str(self.id), 'machine_id': str(self.machine_id), @@ -260,17 +366,57 @@ class Binary(ModelWithHealthStats): 'status': self.status, } - @staticmethod - def from_jsonl(record: dict, overrides: dict = None): + def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]: """ - Create/update Binary from JSONL record. + Yield this Binary as a JSON record. + + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + **kwargs: Passed to children (none for Binary, leaf node) + + Yields: + dict: JSON-serializable record for this binary + """ + if seen is not None: + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + yield self.to_json() + + @classmethod + def from_jsonl(cls, records, overrides: dict = None) -> list['Binary']: + """ + Create/update Binaries from an iterable of JSONL records. + Filters to only records with type='Binary'. + + Args: + records: Iterable of dicts (JSONL records) + overrides: Not used + + Returns: + List of Binary instances (skips None results) + """ + results = [] + for record in records: + record_type = record.get('type', cls.JSONL_TYPE) + if record_type == cls.JSONL_TYPE: + instance = cls.from_json(record, overrides=overrides) + if instance: + results.append(instance) + return results + + @staticmethod + def from_json(record: dict, overrides: dict = None) -> 'Binary | None': + """ + Create/update a single Binary from a JSON record dict. Handles two cases: 1. From binaries.jsonl: creates queued binary with name, binproviders, overrides 2. From hook output: updates binary with abspath, version, sha256, binprovider Args: - record: JSONL record with 'name' and either: + record: Dict with 'name' and either: - 'binproviders', 'overrides' (from binaries.jsonl) - 'abspath', 'version', 'sha256', 'binprovider' (from hook output) overrides: Not used @@ -494,6 +640,7 @@ class Process(ModelWithHealthStats): State machine calls launch() to spawn the process and monitors its lifecycle. 
""" + JSONL_TYPE = 'Process' class StatusChoices(models.TextChoices): QUEUED = 'queued', 'Queued' @@ -624,13 +771,13 @@ class Process(ModelWithHealthStats): return self.archiveresult.hook_name return '' - def to_jsonl(self) -> dict: + def to_json(self) -> dict: """ - Convert Process model instance to a JSONL record. + Convert Process model instance to a JSON-serializable dict. """ from archivebox.config import VERSION record = { - 'type': 'Process', + 'type': self.JSONL_TYPE, 'schema_version': VERSION, 'id': str(self.id), 'machine_id': str(self.machine_id), @@ -650,6 +797,37 @@ class Process(ModelWithHealthStats): record['timeout'] = self.timeout return record + def to_jsonl(self, seen: Set[tuple] = None, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]: + """ + Yield this Process and optionally related objects as JSON records. + + Args: + seen: Set of (type, id) tuples already emitted (for deduplication) + binary: Include related Binary (default: True) + machine: Include related Machine (default: False) + iface: Include related NetworkInterface (default: False) + **kwargs: Passed to children + + Yields: + dict: JSON-serializable records + """ + if seen is None: + seen = set() + + key = (self.JSONL_TYPE, str(self.id)) + if key in seen: + return + seen.add(key) + + yield self.to_json() + + if binary and self.binary: + yield from self.binary.to_jsonl(seen=seen, **kwargs) + if machine and self.machine: + yield from self.machine.to_jsonl(seen=seen, **kwargs) + if iface and self.iface: + yield from self.iface.to_jsonl(seen=seen, **kwargs) + def update_and_requeue(self, **kwargs): """ Update process fields and requeue for worker state machine. diff --git a/archivebox/misc/jsonl.py b/archivebox/misc/jsonl.py index 1e555a0a..df1163ab 100644 --- a/archivebox/misc/jsonl.py +++ b/archivebox/misc/jsonl.py @@ -24,7 +24,7 @@ __package__ = 'archivebox.misc' import sys import json -from typing import Iterator, Dict, Any, Optional, TextIO, Callable +from typing import Iterator, Dict, Any, Optional, TextIO from pathlib import Path @@ -150,36 +150,3 @@ def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] = count += 1 return count - -def filter_by_type(records: Iterator[Dict[str, Any]], record_type: str) -> Iterator[Dict[str, Any]]: - """ - Filter records by type. - """ - for record in records: - if record.get('type') == record_type: - yield record - - -def process_records( - records: Iterator[Dict[str, Any]], - handlers: Dict[str, Callable[[Dict[str, Any]], Optional[Dict[str, Any]]]] -) -> Iterator[Dict[str, Any]]: - """ - Process records through type-specific handlers. - - Args: - records: Input record iterator - handlers: Dict mapping type names to handler functions - Handlers return output records or None to skip - - Yields output records from handlers. 
- """ - for record in records: - record_type = record.get('type') - handler = handlers.get(record_type) - if handler: - result = handler(record) - if result: - yield result - - diff --git a/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py b/archivebox/plugins/chrome/on_Crawl__00_install_puppeteer_chromium.py similarity index 68% rename from archivebox/plugins/chrome/on_Crawl__00_chrome_install.py rename to archivebox/plugins/chrome/on_Crawl__00_install_puppeteer_chromium.py index 4c6bbbdd..6730333f 100644 --- a/archivebox/plugins/chrome/on_Crawl__00_chrome_install.py +++ b/archivebox/plugins/chrome/on_Crawl__00_install_puppeteer_chromium.py @@ -3,7 +3,12 @@ Install hook for Chrome/Chromium and puppeteer-core. Runs at crawl start to install/find Chromium and puppeteer-core. -Outputs JSONL for Binary and Machine config updates. +Also validates config and computes derived values. + +Outputs: + - JSONL for Binary and Machine config updates + - COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env + Respects CHROME_BINARY env var for custom binary paths. Uses `npx @puppeteer/browsers install chromium@latest` and parses output. @@ -19,6 +24,28 @@ import subprocess from pathlib import Path +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def detect_docker() -> bool: + """Detect if running inside Docker container.""" + return ( + os.path.exists('/.dockerenv') or + os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or + os.path.exists('/run/.containerenv') + ) + + def get_chrome_version(binary_path: str) -> str | None: """Get Chrome/Chromium version string.""" try: @@ -131,13 +158,41 @@ def install_chromium() -> dict | None: def main(): + warnings = [] + errors = [] + computed = {} + # Install puppeteer-core if NODE_MODULES_DIR is set install_puppeteer_core() + # Check if Chrome is enabled + chrome_enabled = get_env_bool('CHROME_ENABLED', True) + + # Detect Docker and adjust sandbox + in_docker = detect_docker() + computed['IN_DOCKER'] = str(in_docker).lower() + + chrome_sandbox = get_env_bool('CHROME_SANDBOX', True) + if in_docker and chrome_sandbox: + warnings.append( + "Running in Docker with CHROME_SANDBOX=true. " + "Chrome may fail to start. Consider setting CHROME_SANDBOX=false." 
+ ) + # Auto-disable sandbox in Docker unless explicitly set + if not get_env('CHROME_SANDBOX'): + computed['CHROME_SANDBOX'] = 'false' + + # Check Node.js availability + node_binary = get_env('NODE_BINARY', 'node') + computed['NODE_BINARY'] = node_binary + # Check if CHROME_BINARY is already set and valid - configured_binary = os.environ.get('CHROME_BINARY', '').strip() + configured_binary = get_env('CHROME_BINARY', '') if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK): version = get_chrome_version(configured_binary) + computed['CHROME_BINARY'] = configured_binary + computed['CHROME_VERSION'] = version or 'unknown' + print(json.dumps({ 'type': 'Binary', 'name': 'chromium', @@ -145,12 +200,22 @@ def main(): 'version': version, 'binprovider': 'env', })) + + # Output computed values + for key, value in computed.items(): + print(f"COMPUTED:{key}={value}") + for warning in warnings: + print(f"WARNING:{warning}", file=sys.stderr) + sys.exit(0) # Install/find Chromium via puppeteer result = install_chromium() if result and result.get('abspath'): + computed['CHROME_BINARY'] = result['abspath'] + computed['CHROME_VERSION'] = result['version'] or 'unknown' + print(json.dumps({ 'type': 'Binary', 'name': result['name'], @@ -174,9 +239,25 @@ def main(): 'value': result['version'], })) + # Output computed values + for key, value in computed.items(): + print(f"COMPUTED:{key}={value}") + for warning in warnings: + print(f"WARNING:{warning}", file=sys.stderr) + sys.exit(0) else: - print("Chromium binary not found", file=sys.stderr) + errors.append("Chromium binary not found") + computed['CHROME_BINARY'] = '' + + # Output computed values and errors + for key, value in computed.items(): + print(f"COMPUTED:{key}={value}") + for warning in warnings: + print(f"WARNING:{warning}", file=sys.stderr) + for error in errors: + print(f"ERROR:{error}", file=sys.stderr) + sys.exit(1) diff --git a/archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py b/archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py deleted file mode 100644 index 7aa8639c..00000000 --- a/archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py +++ /dev/null @@ -1,172 +0,0 @@ -#!/usr/bin/env python3 -""" -Validate and compute derived Chrome config values. - -This hook runs early in the Crawl lifecycle to: -1. Auto-detect Chrome binary location -2. Compute sandbox settings based on Docker detection -3. Validate binary availability and version -4. 
Set computed env vars for subsequent hooks - -Output: - - COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env - - Binary JSONL records to stdout when binaries are found -""" - -import json -import os -import sys - -from abx_pkg import Binary, EnvProvider - - -# Chrome binary search order -CHROME_BINARY_NAMES = [ - 'chromium', - 'chromium-browser', - 'google-chrome', - 'google-chrome-stable', - 'chrome', -] - -def get_env(name: str, default: str = '') -> str: - return os.environ.get(name, default).strip() - -def get_env_bool(name: str, default: bool = False) -> bool: - val = get_env(name, '').lower() - if val in ('true', '1', 'yes', 'on'): - return True - if val in ('false', '0', 'no', 'off'): - return False - return default - - -def detect_docker() -> bool: - """Detect if running inside Docker container.""" - return ( - os.path.exists('/.dockerenv') or - os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or - os.path.exists('/run/.containerenv') - ) - - -def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None: - """Find Chrome binary using abx-pkg, checking configured path first.""" - # Try configured binary first - if configured: - try: - binary = Binary(name=configured, binproviders=[provider]).load() - if binary.abspath: - return binary - except Exception: - pass - - # Search common names - for name in CHROME_BINARY_NAMES: - try: - binary = Binary(name=name, binproviders=[provider]).load() - if binary.abspath: - return binary - except Exception: - continue - - return None - - -def output_binary(binary: Binary, name: str): - """Output Binary JSONL record to stdout.""" - machine_id = os.environ.get('MACHINE_ID', '') - - record = { - 'type': 'Binary', - 'name': name, - 'abspath': str(binary.abspath), - 'version': str(binary.version) if binary.version else '', - 'sha256': binary.sha256 or '', - 'binprovider': 'env', - 'machine_id': machine_id, - } - print(json.dumps(record)) - - -def main(): - warnings = [] - errors = [] - computed = {} - - # Get config values - chrome_binary = get_env('CHROME_BINARY', 'chromium') - chrome_sandbox = get_env_bool('CHROME_SANDBOX', True) - screenshot_enabled = get_env_bool('SCREENSHOT_ENABLED', True) - pdf_enabled = get_env_bool('PDF_ENABLED', True) - dom_enabled = get_env_bool('DOM_ENABLED', True) - - # Compute USE_CHROME (derived from extractor enabled flags) - use_chrome = screenshot_enabled or pdf_enabled or dom_enabled - computed['USE_CHROME'] = str(use_chrome).lower() - - # Detect Docker and adjust sandbox - in_docker = detect_docker() - computed['IN_DOCKER'] = str(in_docker).lower() - - if in_docker and chrome_sandbox: - warnings.append( - "Running in Docker with CHROME_SANDBOX=true. " - "Chrome may fail to start. Consider setting CHROME_SANDBOX=false." - ) - # Auto-disable sandbox in Docker unless explicitly set - if not get_env('CHROME_SANDBOX'): - computed['CHROME_SANDBOX'] = 'false' - - # Find Chrome binary using abx-pkg - provider = EnvProvider() - if use_chrome: - chrome = find_chrome_binary(chrome_binary, provider) - if not chrome or not chrome.abspath: - errors.append( - f"Chrome binary not found (tried: {chrome_binary}). " - "Install Chrome/Chromium or set CHROME_BINARY path." 
- ) - computed['CHROME_BINARY'] = '' - else: - computed['CHROME_BINARY'] = str(chrome.abspath) - computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown' - - # Output Binary JSONL record for Chrome - output_binary(chrome, name='chrome') - - # Check Node.js for Puppeteer - node_binary_name = get_env('NODE_BINARY', 'node') - try: - node = Binary(name=node_binary_name, binproviders=[provider]).load() - node_path = str(node.abspath) if node.abspath else '' - except Exception: - node = None - node_path = '' - - if use_chrome and not node_path: - errors.append( - f"Node.js not found (tried: {node_binary_name}). " - "Install Node.js or set NODE_BINARY path for Puppeteer." - ) - else: - computed['NODE_BINARY'] = node_path - if node and node.abspath: - # Output Binary JSONL record for Node - output_binary(node, name='node') - - # Output computed values - for key, value in computed.items(): - print(f"COMPUTED:{key}={value}") - - for warning in warnings: - print(f"WARNING:{warning}", file=sys.stderr) - - for error in errors: - print(f"ERROR:{error}", file=sys.stderr) - - sys.exit(1 if errors else 0) - - -if __name__ == '__main__': - main() diff --git a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js similarity index 98% rename from archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js rename to archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js index c2d62775..d025be81 100644 --- a/archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js +++ b/archivebox/plugins/chrome/on_Crawl__30_chrome_launch.bg.js @@ -9,7 +9,7 @@ * --load-extension and --disable-extensions-except flags. * * Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id= --source-url= - * Output: Creates chrome/ directory under crawl output dir with: + * Output: Writes to current directory (executor creates chrome/ dir): * - cdp_url.txt: WebSocket URL for CDP connection * - chrome.pid: Chromium process ID (for cleanup) * - port.txt: Debug port number @@ -42,7 +42,7 @@ const { // Extractor metadata const PLUGIN_NAME = 'chrome_launch'; -const OUTPUT_DIR = 'chrome'; +const OUTPUT_DIR = '.'; // Global state for cleanup let chromePid = null; diff --git a/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js b/archivebox/plugins/istilldontcareaboutcookies/on_Crawl__20_install_istilldontcareaboutcookies_extension.js similarity index 100% rename from archivebox/plugins/istilldontcareaboutcookies/on_Crawl__02_istilldontcareaboutcookies.js rename to archivebox/plugins/istilldontcareaboutcookies/on_Crawl__20_install_istilldontcareaboutcookies_extension.js diff --git a/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js b/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js deleted file mode 100755 index 7637bf98..00000000 --- a/archivebox/plugins/singlefile/on_Crawl__04_singlefile.js +++ /dev/null @@ -1,268 +0,0 @@ -#!/usr/bin/env node -/** - * SingleFile Extension Plugin - * - * Installs and uses the SingleFile Chrome extension for archiving complete web pages. - * Falls back to single-file-cli if the extension is not available. 
- * - * Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle - * - * Priority: 04 (early) - Must install before Chrome session starts at Crawl level - * Hook: on_Crawl (runs once per crawl, not per snapshot) - * - * This extension automatically: - * - Saves complete web pages as single HTML files - * - Inlines all resources (CSS, JS, images, fonts) - * - Preserves page fidelity better than wget/curl - * - Works with SPAs and dynamically loaded content - */ - -const path = require('path'); -const fs = require('fs'); -const { promisify } = require('util'); -const { exec } = require('child_process'); - -const execAsync = promisify(exec); - -// Import extension utilities -const extensionUtils = require('../chrome/chrome_utils.js'); - -// Extension metadata -const EXTENSION = { - webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', - name: 'singlefile', -}; - -// Get extensions directory from environment or use default -const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || - path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions'); - -const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR || - path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads'); - -const OUTPUT_DIR = '.'; -const OUTPUT_FILE = 'singlefile.html'; - -/** - * Install the SingleFile extension - */ -async function installSinglefileExtension() { - console.log('[*] Installing SingleFile extension...'); - - // Install the extension - const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR); - - if (!extension) { - console.error('[❌] Failed to install SingleFile extension'); - return null; - } - - console.log('[+] SingleFile extension installed'); - console.log('[+] Web pages will be saved as single HTML files'); - - return extension; -} - -/** - * Wait for a specified amount of time - */ -function wait(ms) { - return new Promise(resolve => setTimeout(resolve, ms)); -} - -/** - * Save a page using the SingleFile extension - * - * @param {Object} page - Puppeteer page object - * @param {Object} extension - Extension metadata with dispatchAction method - * @param {Object} options - Additional options - * @returns {Promise} - Path to saved file or null on failure - */ -async function saveSinglefileWithExtension(page, extension, options = {}) { - if (!extension || !extension.version) { - throw new Error('SingleFile extension not found or not loaded'); - } - - const url = await page.url(); - - // Check for unsupported URL schemes - const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob']; - const scheme = url.split(':')[0]; - if (URL_SCHEMES_IGNORED.includes(scheme)) { - console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`); - return null; - } - - // Ensure downloads directory exists - await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true }); - - // Get list of existing files to ignore - const files_before = new Set( - (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)) - .filter(fn => fn.endsWith('.html')) - ); - - // Output directory is current directory (hook already runs in output dir) - const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); - - console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`); - - // Bring page to front (extension action button acts on foreground tab) - await page.bringToFront(); - - // Trigger the extension's action (toolbar button 
click) - await extension.dispatchAction(); - - // Wait for file to appear in downloads directory - const check_delay = 3000; // 3 seconds - const max_tries = 10; - let files_new = []; - - for (let attempt = 0; attempt < max_tries; attempt++) { - await wait(check_delay); - - const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)) - .filter(fn => fn.endsWith('.html')); - - files_new = files_after.filter(file => !files_before.has(file)); - - if (files_new.length === 0) { - continue; - } - - // Find the matching file by checking if it contains the URL in the HTML header - for (const file of files_new) { - const dl_path = path.join(CHROME_DOWNLOADS_DIR, file); - const dl_text = await fs.promises.readFile(dl_path, 'utf-8'); - const dl_header = dl_text.split('meta charset')[0]; - - if (dl_header.includes(`url: ${url}`)) { - console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`); - await fs.promises.rename(dl_path, out_path); - return out_path; - } - } - } - - console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`); - console.warn(`[⚠️] New files found: ${files_new.join(', ')}`); - return null; -} - -/** - * Save a page using single-file-cli (fallback method) - * - * @param {string} url - URL to archive - * @param {Object} options - Additional options - * @returns {Promise} - Path to saved file or null on failure - */ -async function saveSinglefileWithCLI(url, options = {}) { - console.log('[*] Falling back to single-file-cli...'); - - // Find single-file binary - let binary = null; - try { - const { stdout } = await execAsync('which single-file'); - binary = stdout.trim(); - } catch (err) { - console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli'); - return null; - } - - // Output directory is current directory (hook already runs in output dir) - const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); - - // Build command - const cmd = [ - binary, - '--browser-headless', - url, - out_path, - ]; - - // Add optional args - if (options.userAgent) { - cmd.splice(2, 0, '--browser-user-agent', options.userAgent); - } - if (options.cookiesFile && fs.existsSync(options.cookiesFile)) { - cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile); - } - if (options.ignoreSSL) { - cmd.splice(2, 0, '--browser-ignore-insecure-certs'); - } - - // Execute - try { - const timeout = options.timeout || 120000; - await execAsync(cmd.join(' '), { timeout }); - - if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) { - console.log(`[+] SingleFile saved via CLI: ${out_path}`); - return out_path; - } - - console.error('[❌] SingleFile CLI completed but no output file found'); - return null; - } catch (err) { - console.error(`[❌] SingleFile CLI error: ${err.message}`); - return null; - } -} - -/** - * Main entry point - install extension before archiving - */ -async function main() { - // Check if extension is already cached - const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json'); - - if (fs.existsSync(cacheFile)) { - try { - const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8')); - const manifestPath = path.join(cached.unpacked_path, 'manifest.json'); - - if (fs.existsSync(manifestPath)) { - console.log('[*] SingleFile extension already installed (using cache)'); - return cached; - } - } catch (e) { - // Cache file corrupted, re-install - console.warn('[⚠️] Extension cache corrupted, re-installing...'); - } - } - - // Install 
extension - const extension = await installSinglefileExtension(); - - // Export extension metadata for chrome plugin to load - if (extension) { - // Write extension info to a cache file that chrome plugin can read - await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true }); - await fs.promises.writeFile( - cacheFile, - JSON.stringify(extension, null, 2) - ); - console.log(`[+] Extension metadata written to ${cacheFile}`); - } - - return extension; -} - -// Export functions for use by other plugins -module.exports = { - EXTENSION, - installSinglefileExtension, - saveSinglefileWithExtension, - saveSinglefileWithCLI, -}; - -// Run if executed directly -if (require.main === module) { - main().then(() => { - console.log('[✓] SingleFile extension setup complete'); - process.exit(0); - }).catch(err => { - console.error('[❌] SingleFile extension setup failed:', err); - process.exit(1); - }); -} diff --git a/archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js b/archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js new file mode 100755 index 00000000..59bbda46 --- /dev/null +++ b/archivebox/plugins/singlefile/on_Crawl__20_install_singlefile_extension.js @@ -0,0 +1,281 @@ +#!/usr/bin/env node +/** + * SingleFile Extension Plugin + * + * DISABLED: Extension functionality commented out - using single-file-cli only + * + * Installs and uses the SingleFile Chrome extension for archiving complete web pages. + * Falls back to single-file-cli if the extension is not available. + * + * Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle + * + * Priority: 04 (early) - Must install before Chrome session starts at Crawl level + * Hook: on_Crawl (runs once per crawl, not per snapshot) + * + * This extension automatically: + * - Saves complete web pages as single HTML files + * - Inlines all resources (CSS, JS, images, fonts) + * - Preserves page fidelity better than wget/curl + * - Works with SPAs and dynamically loaded content + */ + +const path = require('path'); +const fs = require('fs'); +const { promisify } = require('util'); +const { exec } = require('child_process'); + +const execAsync = promisify(exec); + +// DISABLED: Extension functionality - using single-file-cli only +// // Import extension utilities +// const extensionUtils = require('../chrome/chrome_utils.js'); + +// // Extension metadata +// const EXTENSION = { +// webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle', +// name: 'singlefile', +// }; + +// // Get extensions directory from environment or use default +// const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR || +// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions'); + +// const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR || +// path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads'); + +const OUTPUT_DIR = '.'; +const OUTPUT_FILE = 'singlefile.html'; + +// DISABLED: Extension functionality - using single-file-cli only +// /** +// * Install the SingleFile extension +// */ +// async function installSinglefileExtension() { +// console.log('[*] Installing SingleFile extension...'); + +// // Install the extension +// const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR); + +// if (!extension) { +// console.error('[❌] Failed to install SingleFile extension'); +// return null; +// } + +// console.log('[+] SingleFile extension installed'); +// 
console.log('[+] Web pages will be saved as single HTML files'); + +// return extension; +// } + +// /** +// * Wait for a specified amount of time +// */ +// function wait(ms) { +// return new Promise(resolve => setTimeout(resolve, ms)); +// } + +// /** +// * Save a page using the SingleFile extension +// * +// * @param {Object} page - Puppeteer page object +// * @param {Object} extension - Extension metadata with dispatchAction method +// * @param {Object} options - Additional options +// * @returns {Promise} - Path to saved file or null on failure +// */ +// async function saveSinglefileWithExtension(page, extension, options = {}) { +// if (!extension || !extension.version) { +// throw new Error('SingleFile extension not found or not loaded'); +// } + +// const url = await page.url(); + +// // Check for unsupported URL schemes +// const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob']; +// const scheme = url.split(':')[0]; +// if (URL_SCHEMES_IGNORED.includes(scheme)) { +// console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`); +// return null; +// } + +// // Ensure downloads directory exists +// await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true }); + +// // Get list of existing files to ignore +// const files_before = new Set( +// (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)) +// .filter(fn => fn.endsWith('.html')) +// ); + +// // Output directory is current directory (hook already runs in output dir) +// const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); + +// console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`); + +// // Bring page to front (extension action button acts on foreground tab) +// await page.bringToFront(); + +// // Trigger the extension's action (toolbar button click) +// await extension.dispatchAction(); + +// // Wait for file to appear in downloads directory +// const check_delay = 3000; // 3 seconds +// const max_tries = 10; +// let files_new = []; + +// for (let attempt = 0; attempt < max_tries; attempt++) { +// await wait(check_delay); + +// const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR)) +// .filter(fn => fn.endsWith('.html')); + +// files_new = files_after.filter(file => !files_before.has(file)); + +// if (files_new.length === 0) { +// continue; +// } + +// // Find the matching file by checking if it contains the URL in the HTML header +// for (const file of files_new) { +// const dl_path = path.join(CHROME_DOWNLOADS_DIR, file); +// const dl_text = await fs.promises.readFile(dl_path, 'utf-8'); +// const dl_header = dl_text.split('meta charset')[0]; + +// if (dl_header.includes(`url: ${url}`)) { +// console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`); +// await fs.promises.rename(dl_path, out_path); +// return out_path; +// } +// } +// } + +// console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`); +// console.warn(`[⚠️] New files found: ${files_new.join(', ')}`); +// return null; +// } + +/** + * Save a page using single-file-cli (fallback method) + * + * @param {string} url - URL to archive + * @param {Object} options - Additional options + * @returns {Promise} - Path to saved file or null on failure + */ +async function saveSinglefileWithCLI(url, options = {}) { + console.log('[*] Falling back to single-file-cli...'); + + // Find single-file binary + let binary = null; + try { + const { stdout } = await execAsync('which single-file'); + 
binary = stdout.trim(); + } catch (err) { + console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli'); + return null; + } + + // Output directory is current directory (hook already runs in output dir) + const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE); + + // Build command + const cmd = [ + binary, + '--browser-headless', + url, + out_path, + ]; + + // Add optional args + if (options.userAgent) { + cmd.splice(2, 0, '--browser-user-agent', options.userAgent); + } + if (options.cookiesFile && fs.existsSync(options.cookiesFile)) { + cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile); + } + if (options.ignoreSSL) { + cmd.splice(2, 0, '--browser-ignore-insecure-certs'); + } + + // Execute + try { + const timeout = options.timeout || 120000; + await execAsync(cmd.join(' '), { timeout }); + + if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) { + console.log(`[+] SingleFile saved via CLI: ${out_path}`); + return out_path; + } + + console.error('[❌] SingleFile CLI completed but no output file found'); + return null; + } catch (err) { + console.error(`[❌] SingleFile CLI error: ${err.message}`); + return null; + } +} + +// DISABLED: Extension functionality - using single-file-cli only +// /** +// * Main entry point - install extension before archiving +// */ +// async function main() { +// // Check if extension is already cached +// const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json'); + +// if (fs.existsSync(cacheFile)) { +// try { +// const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8')); +// const manifestPath = path.join(cached.unpacked_path, 'manifest.json'); + +// if (fs.existsSync(manifestPath)) { +// console.log('[*] SingleFile extension already installed (using cache)'); +// return cached; +// } +// } catch (e) { +// // Cache file corrupted, re-install +// console.warn('[⚠️] Extension cache corrupted, re-installing...'); +// } +// } + +// // Install extension +// const extension = await installSinglefileExtension(); + +// // Export extension metadata for chrome plugin to load +// if (extension) { +// // Write extension info to a cache file that chrome plugin can read +// await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true }); +// await fs.promises.writeFile( +// cacheFile, +// JSON.stringify(extension, null, 2) +// ); +// console.log(`[+] Extension metadata written to ${cacheFile}`); +// } + +// return extension; +// } + +// Export functions for use by other plugins +module.exports = { + // DISABLED: Extension functionality - using single-file-cli only + // EXTENSION, + // installSinglefileExtension, + // saveSinglefileWithExtension, + saveSinglefileWithCLI, +}; + +// DISABLED: Extension functionality - using single-file-cli only +// // Run if executed directly +// if (require.main === module) { +// main().then(() => { +// console.log('[✓] SingleFile extension setup complete'); +// process.exit(0); +// }).catch(err => { +// console.error('[❌] SingleFile extension setup failed:', err); +// process.exit(1); +// }); +// } + +// No-op when run directly (extension install disabled) +if (require.main === module) { + console.log('[*] SingleFile extension install disabled - using single-file-cli only'); + process.exit(0); +} diff --git a/archivebox/plugins/singlefile/tests/test_singlefile.py b/archivebox/plugins/singlefile/tests/test_singlefile.py index aace617f..8d6d01b0 100644 --- a/archivebox/plugins/singlefile/tests/test_singlefile.py +++ b/archivebox/plugins/singlefile/tests/test_singlefile.py @@ 
-2,16 +2,15 @@ Integration tests for singlefile plugin Tests verify: -1. Hook script exists and has correct metadata -2. Extension installation and caching works -3. Chrome/node dependencies available -4. Hook can be executed successfully +1. Hook scripts exist with correct naming +2. CLI-based singlefile extraction works +3. Dependencies available via abx-pkg +4. Output contains valid HTML """ import json import os import subprocess -import sys import tempfile from pathlib import Path @@ -20,177 +19,63 @@ import pytest PLUGIN_DIR = Path(__file__).parent.parent PLUGINS_ROOT = PLUGIN_DIR.parent -INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_singlefile.*'), None) -NPM_PROVIDER_HOOK = PLUGINS_ROOT / 'npm' / 'on_Binary__install_using_npm_provider.py' +SNAPSHOT_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_singlefile.py'), None) TEST_URL = "https://example.com" -def test_install_script_exists(): - """Verify install script exists""" - assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}" +def test_snapshot_hook_exists(): + """Verify snapshot extraction hook exists""" + assert SNAPSHOT_HOOK is not None and SNAPSHOT_HOOK.exists(), f"Snapshot hook not found in {PLUGIN_DIR}" -def test_extension_metadata(): - """Test that SingleFile extension has correct metadata""" - with tempfile.TemporaryDirectory() as tmpdir: - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions") - - result = subprocess.run( - ["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"], - capture_output=True, - text=True, - env=env - ) - - assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}" - - metadata = json.loads(result.stdout) - assert metadata["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle" - assert metadata["name"] == "singlefile" - - -def test_install_creates_cache(): - """Test that install creates extension cache""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - - result = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - - # Check output mentions installation - assert "SingleFile" in result.stdout or "singlefile" in result.stdout - - # Check cache file was created - cache_file = ext_dir / "singlefile.extension.json" - assert cache_file.exists(), "Cache file should be created" - - # Verify cache content - cache_data = json.loads(cache_file.read_text()) - assert cache_data["webstore_id"] == "mpiodijhokgodhhofbcjdecpffjipkle" - assert cache_data["name"] == "singlefile" - - -def test_install_twice_uses_cache(): - """Test that running install twice uses existing cache on second run""" - with tempfile.TemporaryDirectory() as tmpdir: - ext_dir = Path(tmpdir) / "chrome_extensions" - ext_dir.mkdir(parents=True) - - env = os.environ.copy() - env["CHROME_EXTENSIONS_DIR"] = str(ext_dir) - - # First install - downloads the extension - result1 = subprocess.run( - ["node", str(INSTALL_SCRIPT)], - capture_output=True, - text=True, - env=env, - timeout=60 - ) - assert result1.returncode == 0, f"First install failed: {result1.stderr}" - - # Verify cache was created - cache_file = ext_dir / "singlefile.extension.json" - assert cache_file.exists(), "Cache file should exist after first install" - - # Second install - should use cache - result2 = subprocess.run( - ["node", 
str(INSTALL_SCRIPT)],
-            capture_output=True,
-            text=True,
-            env=env,
-            timeout=30
-        )
-        assert result2.returncode == 0, f"Second install failed: {result2.stderr}"
-
-        # Second run should be faster (uses cache) and mention cache
-        assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0
-
-
-def test_no_configuration_required():
-    """Test that SingleFile works without configuration"""
-    with tempfile.TemporaryDirectory() as tmpdir:
-        ext_dir = Path(tmpdir) / "chrome_extensions"
-        ext_dir.mkdir(parents=True)
-
-        env = os.environ.copy()
-        env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
-        # No API keys needed
-
-        result = subprocess.run(
-            ["node", str(INSTALL_SCRIPT)],
-            capture_output=True,
-            text=True,
-            env=env,
-            timeout=60
-        )
-
-        # Should work without API keys
-        assert result.returncode == 0
-
-
-def test_priority_order():
-    """Test that singlefile has correct priority (04)"""
-    # Extract priority from filename
-    filename = INSTALL_SCRIPT.name
-    assert "04" in filename, "SingleFile should have priority 04"
-    assert filename.startswith("on_Crawl__04_"), "Should follow priority naming convention for Crawl hooks"
-
-
-def test_output_directory_structure():
-    """Test that plugin defines correct output structure"""
-    # Verify the script mentions singlefile output directory
-    script_content = INSTALL_SCRIPT.read_text()
-
-    # Should mention singlefile output directory
-    assert "singlefile" in script_content.lower()
-    # Should mention HTML output
-    assert ".html" in script_content or "html" in script_content.lower()
+def test_snapshot_hook_priority():
+    """Test that snapshot hook has correct priority (50)"""
+    filename = SNAPSHOT_HOOK.name
+    assert "50" in filename, "SingleFile snapshot hook should have priority 50"
+    assert filename.startswith("on_Snapshot__50_"), "Should follow priority naming convention"
 
 
 def test_verify_deps_with_abx_pkg():
-    """Verify dependencies are available via abx-pkg after hook installation."""
-    from abx_pkg import Binary, EnvProvider, BinProviderOverrides
+    """Verify dependencies are available via abx-pkg."""
+    from abx_pkg import Binary, EnvProvider
 
     EnvProvider.model_rebuild()
 
-    # Verify node is available (singlefile uses Chrome extension, needs Node)
+    # Verify node is available
     node_binary = Binary(name='node', binproviders=[EnvProvider()])
     node_loaded = node_binary.load()
     assert node_loaded and node_loaded.abspath, "Node.js required for singlefile plugin"
 
 
-def test_singlefile_hook_runs():
-    """Verify singlefile hook can be executed and completes."""
-    # Prerequisites checked by earlier test
-
+def test_singlefile_cli_archives_example_com():
+    """Test that singlefile CLI archives example.com and produces valid HTML."""
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
 
-        # Run singlefile extraction hook
+        env = os.environ.copy()
+        env['SINGLEFILE_ENABLED'] = 'true'
+
+        # Run singlefile snapshot hook
         result = subprocess.run(
-            ['node', str(INSTALL_SCRIPT), f'--url={TEST_URL}', '--snapshot-id=test789'],
+            ['python', str(SNAPSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
             cwd=tmpdir,
             capture_output=True,
             text=True,
+            env=env,
             timeout=120
         )
 
-        # Hook should complete successfully (even if it just installs extension)
         assert result.returncode == 0, f"Hook execution failed: {result.stderr}"
 
-        # Verify extension installation happens
-        assert 'SingleFile extension' in result.stdout or result.returncode == 0, "Should install extension or complete"
+        # Verify output file exists
+        output_file = tmpdir / 'singlefile.html'
+        assert output_file.exists(), f"singlefile.html not created. stdout: {result.stdout}, stderr: {result.stderr}"
+
+        # Verify it contains real HTML
+        html_content = output_file.read_text()
+        assert len(html_content) > 500, "Output file too small to be valid HTML"
+        assert '' in html_content or '
-    const captchaExt = extensions.find(ext => ext.name === 'captcha2');
+    const captchaExt = extensions.find(ext => ext.name === 'twocaptcha');
 
     if (!captchaExt) {
         console.error('[*] 2captcha extension not installed, skipping configuration');
@@ -236,7 +236,7 @@ async function main() {
     const snapshotId = args.snapshot_id;
 
     if (!url || !snapshotId) {
-        console.error('Usage: on_Snapshot__21_captcha2_config.js --url= --snapshot-id=');
+        console.error('Usage: on_Snapshot__21_twocaptcha_config.js --url= --snapshot-id=');
         process.exit(1);
     }
diff --git a/archivebox/plugins/captcha2/templates/icon.html b/archivebox/plugins/twocaptcha/templates/icon.html
similarity index 100%
rename from archivebox/plugins/captcha2/templates/icon.html
rename to archivebox/plugins/twocaptcha/templates/icon.html
diff --git a/archivebox/plugins/captcha2/tests/test_captcha2.py b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py
similarity index 90%
rename from archivebox/plugins/captcha2/tests/test_captcha2.py
rename to archivebox/plugins/twocaptcha/tests/test_twocaptcha.py
index bc08a072..ab4f4a4b 100644
--- a/archivebox/plugins/captcha2/tests/test_captcha2.py
+++ b/archivebox/plugins/twocaptcha/tests/test_twocaptcha.py
@@ -1,5 +1,5 @@
 """
-Unit tests for captcha2 plugin
+Unit tests for twocaptcha plugin
 
 Tests invoke the plugin hooks as external processes and verify outputs/side effects.
 """
@@ -14,8 +14,8 @@ import pytest
 
 PLUGIN_DIR = Path(__file__).parent.parent
-INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2.*'), None)
-CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2_config.*'), None)
+INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_install_twocaptcha_extension.*'), None)
+CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_configure_twocaptcha_extension_options.*'), None)
 
 
 def test_install_script_exists():
@@ -29,7 +29,7 @@ def test_config_script_exists():
 
 
 def test_extension_metadata():
-    """Test that captcha2 extension has correct metadata"""
+    """Test that twocaptcha extension has correct metadata"""
     with tempfile.TemporaryDirectory() as tmpdir:
         env = os.environ.copy()
         env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
@@ -46,7 +46,7 @@ def test_extension_metadata():
         metadata = json.loads(result.stdout)
 
         assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
-        assert metadata["name"] == "captcha2"
+        assert metadata["name"] == "twocaptcha"
 
 
 def test_install_creates_cache():
@@ -72,13 +72,13 @@ def test_install_creates_cache():
         assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout
 
         # Check cache file was created
-        cache_file = ext_dir / "captcha2.extension.json"
+        cache_file = ext_dir / "twocaptcha.extension.json"
         assert cache_file.exists(), "Cache file should be created"
 
         # Verify cache content
         cache_data = json.loads(cache_file.read_text())
         assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
-        assert cache_data["name"] == "captcha2"
+        assert cache_data["name"] == "twocaptcha"
         assert "unpacked_path" in cache_data
         assert "version" in cache_data
 
@@ -104,7 +104,7 @@ def test_install_twice_uses_cache():
         assert result1.returncode == 0, f"First install failed: {result1.stderr}"
 
         # Verify cache was created
-        cache_file = ext_dir / "captcha2.extension.json"
+        cache_file = ext_dir / "twocaptcha.extension.json"
         assert cache_file.exists(), "Cache file should exist after first install"
 
         # Second install - should use cache
@@ -175,7 +175,7 @@ def test_config_script_structure():
     script_content = CONFIG_SCRIPT.read_text()
 
     # Should mention configuration marker file
-    assert "CONFIG_MARKER" in script_content or "captcha2_configured" in script_content
+    assert "CONFIG_MARKER" in script_content or "twocaptcha_configured" in script_content
 
     # Should mention API key
     assert "API_KEY_2CAPTCHA" in script_content
diff --git a/archivebox/plugins/ublock/on_Crawl__03_ublock.js b/archivebox/plugins/ublock/on_Crawl__20_install_ublock_extension.js
similarity index 100%
rename from archivebox/plugins/ublock/on_Crawl__03_ublock.js
rename to archivebox/plugins/ublock/on_Crawl__20_install_ublock_extension.js
diff --git a/archivebox/plugins/wget/on_Crawl__10_wget_validate_config.py b/archivebox/plugins/wget/on_Crawl__10_install_wget.py
similarity index 100%
rename from archivebox/plugins/wget/on_Crawl__10_wget_validate_config.py
rename to archivebox/plugins/wget/on_Crawl__10_install_wget.py