wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -37,7 +37,13 @@ class ArchiveBoxGroup(click.Group):
'server': 'archivebox.cli.archivebox_server.main',
'shell': 'archivebox.cli.archivebox_shell.main',
'manage': 'archivebox.cli.archivebox_manage.main',
# Worker/orchestrator commands
'orchestrator': 'archivebox.cli.archivebox_orchestrator.main',
'worker': 'archivebox.cli.archivebox_worker.main',
# Task commands (called by workers as subprocesses)
'crawl': 'archivebox.cli.archivebox_crawl.main',
'snapshot': 'archivebox.cli.archivebox_snapshot.main',
'extract': 'archivebox.cli.archivebox_extract.main',
}
all_subcommands = {
**meta_commands,
@@ -118,11 +124,14 @@ def cli(ctx, help=False):
raise
def main(args=None, prog_name=None):
def main(args=None, prog_name=None, stdin=None):
# show `docker run archivebox xyz` in help messages if running in docker
IN_DOCKER = os.environ.get('IN_DOCKER', False) in ('1', 'true', 'True', 'TRUE', 'yes')
IS_TTY = sys.stdin.isatty()
prog_name = prog_name or (f'docker compose run{"" if IS_TTY else " -T"} archivebox' if IN_DOCKER else 'archivebox')
# stdin param allows passing input data from caller (used by __main__.py)
# currently not used by click-based CLI, but kept for backwards compatibility
try:
cli(args=args, prog_name=prog_name)

View File

@@ -16,214 +16,135 @@ from archivebox.misc.util import enforce_types, docstring
from archivebox import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG
from archivebox.config.permissions import USER, HOSTNAME
from archivebox.parsers import PARSERS
if TYPE_CHECKING:
from core.models import Snapshot
ORCHESTRATOR = None
@enforce_types
def add(urls: str | list[str],
depth: int | str=0,
tag: str='',
parser: str="auto",
extract: str="",
plugins: str="",
persona: str='Default',
overwrite: bool=False,
update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
index_only: bool=False,
bg: bool=False,
created_by_id: int | None=None) -> QuerySet['Snapshot']:
"""Add a new URL or list of URLs to your archive"""
"""Add a new URL or list of URLs to your archive.
global ORCHESTRATOR
The new flow is:
1. Save URLs to sources file
2. Create Seed pointing to the file
3. Create Crawl with max_depth
4. Create root Snapshot pointing to file:// URL (depth=0)
5. Orchestrator runs parser extractors on root snapshot
6. Parser extractors output to urls.jsonl
7. URLs are added to Crawl.urls and child Snapshots are created
8. Repeat until max_depth is reached
"""
from rich import print
depth = int(depth)
assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
# import models once django is set up
from crawls.models import Seed, Crawl
from workers.orchestrator import Orchestrator
from archivebox.base_models.models import get_or_create_system_user_pk
assert depth in (0, 1, 2, 3, 4), 'Depth must be 0-4'
# import models once django is set up
from core.models import Snapshot
from crawls.models import Seed, Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
from workers.orchestrator import Orchestrator
created_by_id = created_by_id or get_or_create_system_user_pk()
# 1. save the provided urls to sources/2024-11-05__23-59-59__cli_add.txt
# 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
sources_file.parent.mkdir(parents=True, exist_ok=True)
sources_file.write_text(urls if isinstance(urls, str) else '\n'.join(urls))
# 2. create a new Seed pointing to the sources/2024-11-05__23-59-59__cli_add.txt
# 2. Create a new Seed pointing to the sources file
cli_args = [*sys.argv]
if cli_args[0].lower().endswith('archivebox'):
cli_args[0] = 'archivebox' # full path to archivebox bin to just archivebox e.g. /Volumes/NVME/Users/squash/archivebox/.venv/bin/archivebox -> archivebox
cli_args[0] = 'archivebox'
cmd_str = ' '.join(cli_args)
seed = Seed.from_file(sources_file, label=f'{USER}@{HOSTNAME} $ {cmd_str}', parser=parser, tag=tag, created_by=created_by_id, config={
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'EXTRACTORS': extract,
'DEFAULT_PERSONA': persona or 'Default',
})
# 3. create a new Crawl pointing to the Seed
crawl = Crawl.from_seed(seed, max_depth=depth)
# 4. start the Orchestrator & wait until it completes
# ... orchestrator will create the root Snapshot, which creates pending ArchiveResults, which gets run by the ArchiveResultActors ...
# from crawls.actors import CrawlActor
# from core.actors import SnapshotActor, ArchiveResultActor
if not bg:
orchestrator = Orchestrator(exit_on_idle=True, max_concurrent_actors=4)
orchestrator.start()
# 5. return the list of new Snapshots created
seed = Seed.from_file(
sources_file,
label=f'{USER}@{HOSTNAME} $ {cmd_str}',
parser=parser,
tag=tag,
created_by=created_by_id,
config={
'ONLY_NEW': not update,
'INDEX_ONLY': index_only,
'OVERWRITE': overwrite,
'EXTRACTORS': plugins,
'DEFAULT_PERSONA': persona or 'Default',
}
)
# 3. Create a new Crawl pointing to the Seed (status=queued)
crawl = Crawl.from_seed(seed, max_depth=depth)
print(f'[green]\\[+] Created Crawl {crawl.id} with max_depth={depth}[/green]')
print(f' [dim]Seed: {seed.uri}[/dim]')
# 4. The CrawlMachine will create the root Snapshot when started
# Root snapshot URL = file:///path/to/sources/...txt
# Parser extractors will run on it and discover URLs
# Those URLs become child Snapshots (depth=1)
if index_only:
# Just create the crawl but don't start processing
print('[yellow]\\[*] Index-only mode - crawl created but not started[/yellow]')
# Create root snapshot manually
crawl.create_root_snapshot()
return crawl.snapshot_set.all()
# 5. Start the orchestrator to process the queue
# The orchestrator will:
# - Process Crawl -> create root Snapshot
# - Process root Snapshot -> run parser extractors -> discover URLs
# - Create child Snapshots from discovered URLs
# - Process child Snapshots -> run extractors
# - Repeat until max_depth reached
if bg:
# Background mode: start orchestrator and return immediately
print('[yellow]\\[*] Running in background mode - starting orchestrator...[/yellow]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.start() # Fork to background
else:
# Foreground mode: run orchestrator until all work is done
print(f'[green]\\[*] Starting orchestrator to process crawl...[/green]')
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop() # Block until complete
# 6. Return the list of Snapshots in this crawl
return crawl.snapshot_set.all()
@click.command()
@click.option('--depth', '-d', type=click.Choice(('0', '1')), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
@click.option('--parser', type=click.Choice(['auto', *PARSERS.keys()]), default='auto', help='Parser for reading input URLs')
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
# @click.option('--update-all', is_flag=True, help='Update ALL links in index when finished adding new ones')
@click.option('--bg', is_flag=True, help='Run crawl in background worker instead of immediately')
@click.option('--bg', is_flag=True, help='Run archiving in background (start orchestrator and return immediately)')
@click.argument('urls', nargs=-1, type=click.Path())
@docstring(add.__doc__)
def main(**kwargs):
"""Add a new URL or list of URLs to your archive"""
add(**kwargs)
if __name__ == '__main__':
main()
# OLD VERSION:
# def add(urls: Union[str, List[str]],
# tag: str='',
# depth: int=0,
# update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
# update_all: bool=False,
# index_only: bool=False,
# overwrite: bool=False,
# # duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
# init: bool=False,
# extractors: str="",
# parser: str="auto",
# created_by_id: int | None=None,
# out_dir: Path=DATA_DIR) -> List[Link]:
# """Add a new URL or list of URLs to your archive"""
# from core.models import Snapshot, Tag
# # from workers.supervisord_util import start_cli_workers, tail_worker_logs
# # from workers.tasks import bg_archive_link
# assert depth in (0, 1), 'Depth must be 0 or 1 (depth >1 is not supported yet)'
# extractors = extractors.split(",") if extractors else []
# if init:
# run_subcommand('init', stdin=None, pwd=out_dir)
# # Load list of links from the existing index
# check_data_folder()
# # worker = start_cli_workers()
# new_links: List[Link] = []
# all_links = load_main_index(out_dir=out_dir)
# log_importing_started(urls=urls, depth=depth, index_only=index_only)
# if isinstance(urls, str):
# # save verbatim stdin to sources
# write_ahead_log = save_text_as_source(urls, filename='{ts}-import.txt', out_dir=out_dir)
# elif isinstance(urls, list):
# # save verbatim args to sources
# write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
# new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)
# # If we're going one level deeper, download each link and look for more links
# new_links_depth = []
# if new_links and depth == 1:
# log_crawl_started(new_links)
# for new_link in new_links:
# try:
# downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
# new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
# except Exception as err:
# stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red')
# imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
# new_links = dedupe_links(all_links, imported_links)
# write_main_index(links=new_links, out_dir=out_dir, created_by_id=created_by_id)
# all_links = load_main_index(out_dir=out_dir)
# tags = [
# Tag.objects.get_or_create(name=name.strip(), defaults={'created_by_id': created_by_id})[0]
# for name in tag.split(',')
# if name.strip()
# ]
# if tags:
# for link in imported_links:
# snapshot = Snapshot.objects.get(url=link.url)
# snapshot.tags.add(*tags)
# snapshot.tags_str(nocache=True)
# snapshot.save()
# # print(f' √ Tagged {len(imported_links)} Snapshots with {len(tags)} tags {tags_str}')
# if index_only:
# # mock archive all the links using the fake index_only extractor method in order to update their state
# if overwrite:
# archive_links(imported_links, overwrite=overwrite, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
# else:
# archive_links(new_links, overwrite=False, methods=['index_only'], out_dir=out_dir, created_by_id=created_by_id)
# else:
# # fully run the archive extractor methods for each link
# archive_kwargs = {
# "out_dir": out_dir,
# "created_by_id": created_by_id,
# }
# if extractors:
# archive_kwargs["methods"] = extractors
# stderr()
# ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
# if update:
# stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
# archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
# elif update_all:
# stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
# archive_links(all_links, overwrite=overwrite, **archive_kwargs)
# elif overwrite:
# stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
# archive_links(imported_links, overwrite=True, **archive_kwargs)
# elif new_links:
# stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
# archive_links(new_links, overwrite=False, **archive_kwargs)
# # tail_worker_logs(worker['stdout_logfile'])
# # if CAN_UPGRADE:
# # hint(f"There's a new version of ArchiveBox available! Your current version is {VERSION}. You can upgrade to {VERSIONS_AVAILABLE['recommended_version']['tag_name']} ({VERSIONS_AVAILABLE['recommended_version']['html_url']}). For more on how to upgrade: https://github.com/ArchiveBox/ArchiveBox/wiki/Upgrading-or-Merging-Archives\n")
# return new_links

View File

@@ -20,15 +20,15 @@ def config(*keys,
**kwargs) -> None:
"""Get and set your ArchiveBox project configuration values"""
import archivebox
from archivebox.misc.checks import check_data_folder
from archivebox.misc.logging_util import printable_config
from archivebox.config.collection import load_all_config, write_config_file, get_real_name
from archivebox.config.configset import get_flat_config, get_all_configs
check_data_folder()
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
CONFIGS = archivebox.pm.hook.get_CONFIGS()
FLAT_CONFIG = get_flat_config()
CONFIGS = get_all_configs()
config_options: list[str] = list(kwargs.pop('key=value', []) or keys or [f'{key}={val}' for key, val in kwargs.items()])
no_args = not (get or set or reset or config_options)
@@ -105,7 +105,7 @@ def config(*keys,
if new_config:
before = FLAT_CONFIG
matching_config = write_config_file(new_config)
after = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
after = {**load_all_config(), **get_flat_config()}
print(printable_config(matching_config))
side_effect_changes = {}

View File

@@ -0,0 +1,302 @@
#!/usr/bin/env python3
"""
archivebox crawl [urls_or_snapshot_ids...] [--depth=N] [--plugin=NAME]
Discover outgoing links from URLs or existing Snapshots.
If a URL is passed, creates a Snapshot for it first, then runs parser plugins.
If a snapshot_id is passed, runs parser plugins on the existing Snapshot.
Outputs discovered outlink URLs as JSONL.
Pipe the output to `archivebox snapshot` to archive the discovered URLs.
Input formats:
- Plain URLs (one per line)
- Snapshot UUIDs (one per line)
- JSONL: {"type": "Snapshot", "url": "...", ...}
- JSONL: {"type": "Snapshot", "id": "...", ...}
Output (JSONL):
{"type": "Snapshot", "url": "https://discovered-url.com", "via_extractor": "...", ...}
Examples:
# Discover links from a page (creates snapshot first)
archivebox crawl https://example.com
# Discover links from an existing snapshot
archivebox crawl 01234567-89ab-cdef-0123-456789abcdef
# Full recursive crawl pipeline
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
# Use only specific parser plugin
archivebox crawl --plugin=parse_html_urls https://example.com
# Chain: create snapshot, then crawl its outlinks
archivebox snapshot https://example.com | archivebox crawl | archivebox snapshot | archivebox extract
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox crawl'
import sys
import json
from pathlib import Path
from typing import Optional
import rich_click as click
from archivebox.misc.util import docstring
def discover_outlinks(
    args: tuple,
    depth: int = 1,
    plugin: str = '',
    wait: bool = True,
) -> int:
    """
    Discover outgoing links from URLs or existing Snapshots.

    Accepts URLs or snapshot_ids. For URLs, creates Snapshots first.
    Runs parser plugins, outputs discovered URLs as JSONL.
    The output can be piped to `archivebox snapshot` to archive the discovered links.

    Args:
        args: positional inputs (URLs, Snapshot UUIDs, or JSONL lines); if empty,
              input is read from stdin via read_args_or_stdin.
              NOTE(review): this re-reads stdin — if the caller already consumed
              stdin, records will be empty here. Confirm against main().
        depth: max_depth recorded on the Crawl created for any new URLs.
        plugin: if non-empty, only this parser plugin gets an ArchiveResult;
                otherwise all enabled plugins are queued.
        wait: when True, block on an Orchestrator runloop until plugins finish.

    Exit codes:
        0: Success
        1: Failure (no input, or no snapshots could be created/found)
    """
    from rich import print as rprint
    from django.utils import timezone
    from archivebox.misc.jsonl import (
        read_args_or_stdin, write_record,
        TYPE_SNAPSHOT, get_or_create_snapshot
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    from core.models import Snapshot, ArchiveResult
    from crawls.models import Seed, Crawl
    from archivebox.config import CONSTANTS
    from workers.orchestrator import Orchestrator

    created_by_id = get_or_create_system_user_pk()
    # TTY -> human-readable output on stderr; piped -> machine JSONL on stdout
    is_tty = sys.stdout.isatty()

    # Collect all input records
    records = list(read_args_or_stdin(args))
    if not records:
        rprint('[yellow]No URLs or snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        return 1

    # Separate records into existing snapshots vs new URLs
    existing_snapshot_ids = []
    new_url_records = []
    for record in records:
        # Check if it's an existing snapshot (has id but no url, or looks like a UUID)
        if record.get('id') and not record.get('url'):
            existing_snapshot_ids.append(record['id'])
        elif record.get('id'):
            # Has both id and url - check if snapshot exists
            try:
                Snapshot.objects.get(id=record['id'])
                existing_snapshot_ids.append(record['id'])
            except Snapshot.DoesNotExist:
                # id is stale/unknown: treat it as a new URL to snapshot
                new_url_records.append(record)
        elif record.get('url'):
            new_url_records.append(record)

    # For new URLs, create a Crawl and Snapshots
    snapshot_ids = list(existing_snapshot_ids)
    if new_url_records:
        # Create a Crawl to manage this operation; the Seed points at a
        # sources file written with one URL per line.
        sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__crawl.txt'
        sources_file.parent.mkdir(parents=True, exist_ok=True)
        sources_file.write_text('\n'.join(r.get('url', '') for r in new_url_records if r.get('url')))
        seed = Seed.from_file(
            sources_file,
            label=f'crawl --depth={depth}',
            created_by=created_by_id,
        )
        crawl = Crawl.from_seed(seed, max_depth=depth)
        # Create snapshots for new URLs, tagging each record with the crawl id
        for record in new_url_records:
            try:
                record['crawl_id'] = str(crawl.id)
                record['depth'] = record.get('depth', 0)
                snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
                snapshot_ids.append(str(snapshot.id))
            except Exception as e:
                # best-effort: one bad record should not abort the whole batch
                rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
                continue

    if not snapshot_ids:
        rprint('[red]No snapshots to process[/red]', file=sys.stderr)
        return 1

    if existing_snapshot_ids:
        rprint(f'[blue]Using {len(existing_snapshot_ids)} existing snapshots[/blue]', file=sys.stderr)
    if new_url_records:
        rprint(f'[blue]Created {len(snapshot_ids) - len(existing_snapshot_ids)} new snapshots[/blue]', file=sys.stderr)
    rprint(f'[blue]Running parser plugins on {len(snapshot_ids)} snapshots...[/blue]', file=sys.stderr)

    # Create ArchiveResults for plugins
    # If --plugin is specified, only run that one. Otherwise, run all available plugins.
    # The orchestrator will handle dependency ordering (plugins declare deps in config.json)
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
            if plugin:
                # User specified a single plugin to run
                ArchiveResult.objects.get_or_create(
                    snapshot=snapshot,
                    extractor=plugin,
                    defaults={
                        'status': ArchiveResult.StatusChoices.QUEUED,
                        'retry_at': timezone.now(),
                        'created_by_id': snapshot.created_by_id,
                    }
                )
            else:
                # Create pending ArchiveResults for all enabled plugins
                # This uses hook discovery to find available plugins dynamically
                snapshot.create_pending_archiveresults()
            # Mark snapshot as started so the orchestrator picks it up
            snapshot.status = Snapshot.StatusChoices.STARTED
            snapshot.retry_at = timezone.now()
            snapshot.save()
        except Snapshot.DoesNotExist:
            continue

    # Run plugins (blocking) unless --no-wait was given
    if wait:
        rprint('[blue]Running outlink plugins...[/blue]', file=sys.stderr)
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.runloop()

    # Collect discovered URLs from urls.jsonl files
    # Uses dynamic discovery - any plugin that outputs urls.jsonl is considered a parser
    from archivebox.hooks import collect_urls_from_extractors
    discovered_urls = {}  # url -> JSONL entry; dict keeps first occurrence, dedupes
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
            snapshot_dir = Path(snapshot.output_dir)
            # Dynamically collect urls.jsonl from ANY plugin subdirectory
            for entry in collect_urls_from_extractors(snapshot_dir):
                url = entry.get('url')
                if url and url not in discovered_urls:
                    # Add metadata for crawl tracking
                    entry['type'] = TYPE_SNAPSHOT
                    entry['depth'] = snapshot.depth + 1
                    entry['via_snapshot'] = str(snapshot.id)
                    discovered_urls[url] = entry
        except Snapshot.DoesNotExist:
            continue

    rprint(f'[green]Discovered {len(discovered_urls)} URLs[/green]', file=sys.stderr)

    # Output discovered URLs as JSONL (when piped) or human-readable (when TTY)
    for url, entry in discovered_urls.items():
        if is_tty:
            via = entry.get('via_extractor', 'unknown')
            rprint(f' [dim]{via}[/dim] {url[:80]}', file=sys.stderr)
        else:
            write_record(entry)
    return 0
def process_crawl_by_id(crawl_id: str) -> int:
    """
    Process a single Crawl by ID (used by workers).

    Triggers the Crawl's state machine tick() which will:
    - Transition from queued -> started (creates root snapshot)
    - Transition from started -> sealed (when all snapshots done)

    Returns 0 on success, 1 if the Crawl is missing or tick() raised.
    """
    from rich import print as rprint
    from crawls.models import Crawl

    # Look up the crawl; a missing row is a hard failure for the worker.
    crawl = Crawl.objects.filter(id=crawl_id).first()
    if crawl is None:
        rprint(f'[red]Crawl {crawl_id} not found[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Processing Crawl {crawl.id} (status={crawl.status})[/blue]', file=sys.stderr)
    try:
        # Advance the state machine, then re-read the row to report the
        # status it landed on.
        crawl.sm.tick()
        crawl.refresh_from_db()
        rprint(f'[green]Crawl complete (status={crawl.status})[/green]', file=sys.stderr)
    except Exception as err:
        rprint(f'[red]Crawl error: {type(err).__name__}: {err}[/red]', file=sys.stderr)
        return 1
    return 0
def is_crawl_id(value: str) -> bool:
    """Check if value looks like a Crawl UUID."""
    import re
    # Cheap syntactic check first: bail out before touching the DB.
    if not re.match(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', value, re.I):
        return False
    # Verify it's actually a Crawl (not a Snapshot or other object)
    from crawls.models import Crawl
    return Crawl.objects.filter(id=value).exists()
@click.command()
@click.option('--depth', '-d', type=int, default=1, help='Max depth for recursive crawling (default: 1)')
@click.option('--plugin', '-p', default='', help='Use only this parser plugin (e.g., parse_html_urls, parse_dom_outlinks)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
def main(depth: int, plugin: str, wait: bool, args: tuple):
    """Discover outgoing links from URLs or existing Snapshots, or process Crawl by ID"""
    import json
    from rich import print as rprint
    from archivebox.misc.jsonl import read_args_or_stdin

    # Read ALL input up-front. When args is empty this consumes stdin, which
    # cannot be re-read afterwards - see the BUGFIX note below.
    records = list(read_args_or_stdin(args))
    if not records:
        rprint('[yellow]No URLs, Snapshot IDs, or Crawl IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        sys.exit(1)

    # Check if input looks like existing Crawl IDs to process
    # If ALL inputs are Crawl UUIDs, process them
    all_are_crawl_ids = all(
        is_crawl_id(r.get('id') or r.get('url', ''))
        for r in records
    )
    if all_are_crawl_ids:
        # Process existing Crawls by ID; report failure if ANY crawl failed
        exit_code = 0
        for record in records:
            crawl_id = record.get('id') or record.get('url')
            result = process_crawl_by_id(crawl_id)
            if result != 0:
                exit_code = result
        sys.exit(exit_code)
    else:
        # Default behavior: discover outlinks from input (URLs or Snapshot IDs).
        # BUGFIX: discover_outlinks() calls read_args_or_stdin() itself, so
        # passing the original `args` tuple would re-read stdin - which was
        # already consumed above, silently dropping all piped input. Instead,
        # re-serialize the already-parsed records as JSONL argument lines
        # (read_args_or_stdin accepts JSONL in args as well as stdin).
        jsonl_args = tuple(json.dumps(r) for r in records)
        sys.exit(discover_outlinks(jsonl_args, depth=depth, plugin=plugin, wait=wait))


if __name__ == '__main__':
    main()

View File

@@ -1,49 +1,262 @@
#!/usr/bin/env python3
"""
archivebox extract [snapshot_ids...] [--plugin=NAME]
Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.
Input formats:
- Snapshot UUIDs (one per line)
- JSONL: {"type": "Snapshot", "id": "...", "url": "..."}
- JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."}
Output (JSONL):
{"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."}
Examples:
# Extract specific snapshot
archivebox extract 01234567-89ab-cdef-0123-456789abcdef
# Pipe from snapshot command
archivebox snapshot https://example.com | archivebox extract
# Run specific plugin only
archivebox extract --plugin=screenshot 01234567-89ab-cdef-0123-456789abcdef
# Chain commands
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox extract'
import sys
from typing import TYPE_CHECKING, Generator
from typing import Optional, List
import rich_click as click
from django.db.models import Q
from archivebox.misc.util import enforce_types, docstring
def process_archiveresult_by_id(archiveresult_id: str) -> int:
    """
    Run extraction for a single ArchiveResult by ID (used by workers).

    Triggers the ArchiveResult's state machine tick() to run the extractor.

    Returns 0 on success or still-pending status, 1 on failure.
    """
    from rich import print as rprint
    from core.models import ArchiveResult

    try:
        archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
    except ArchiveResult.DoesNotExist:
        rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Extracting {archiveresult.extractor} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)

    try:
        # Trigger state machine tick - this runs the actual extraction
        archiveresult.sm.tick()
        archiveresult.refresh_from_db()
        # BUGFIX: these status messages used plain print() with rich markup,
        # which would emit literal "[green]...[/green]" tags; use rprint so
        # the markup is actually rendered like everywhere else in this file.
        if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
            rprint(f'[green]Extraction succeeded: {archiveresult.output}[/green]')
            return 0
        elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
            rprint(f'[red]Extraction failed: {archiveresult.output}[/red]', file=sys.stderr)
            return 1
        else:
            # Still in progress or backoff - not a failure
            rprint(f'[yellow]Extraction status: {archiveresult.status}[/yellow]')
            return 0
    except Exception as e:
        rprint(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
        return 1
def run_plugins(
    args: tuple,
    plugin: str = '',
    wait: bool = True,
) -> int:
    """
    Run plugins on Snapshots from input.

    Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL.

    Args:
        args: positional inputs (Snapshot UUIDs or JSONL lines); if empty,
              input is read from stdin via read_args_or_stdin.
              NOTE(review): this re-reads stdin — if the caller already consumed
              stdin, records will be empty here. Confirm against main().
        plugin: if non-empty, only this extractor gets queued/reported;
                otherwise all pending plugins are created for each snapshot.
        wait: when True, block on an Orchestrator runloop until plugins finish.

    Exit codes:
        0: Success
        1: Failure (no input, or no valid snapshots found)
    """
    from rich import print as rprint
    from django.utils import timezone
    from archivebox.misc.jsonl import (
        read_args_or_stdin, write_record, archiveresult_to_jsonl,
        TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
    )
    from core.models import Snapshot, ArchiveResult
    from workers.orchestrator import Orchestrator

    # TTY -> human-readable output on stderr; piped -> machine JSONL on stdout
    is_tty = sys.stdout.isatty()

    # Collect all input records
    records = list(read_args_or_stdin(args))
    if not records:
        rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
        return 1

    # Gather snapshot IDs to process (a set, so duplicate inputs collapse)
    snapshot_ids = set()
    for record in records:
        record_type = record.get('type')
        if record_type == TYPE_SNAPSHOT:
            snapshot_id = record.get('id')
            if snapshot_id:
                snapshot_ids.add(snapshot_id)
            elif record.get('url'):
                # Look up by URL
                try:
                    snap = Snapshot.objects.get(url=record['url'])
                    snapshot_ids.add(str(snap.id))
                except Snapshot.DoesNotExist:
                    rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)
        elif record_type == TYPE_ARCHIVERESULT:
            # ArchiveResult records point back at their parent snapshot
            snapshot_id = record.get('snapshot_id')
            if snapshot_id:
                snapshot_ids.add(snapshot_id)
        elif 'id' in record:
            # Untyped record: assume it's a snapshot ID
            snapshot_ids.add(record['id'])

    if not snapshot_ids:
        rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
        return 1

    # Get snapshots and ensure they have pending ArchiveResults
    processed_count = 0
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
            continue
        # Create pending ArchiveResults if needed
        if plugin:
            # Only create for specific plugin
            result, created = ArchiveResult.objects.get_or_create(
                snapshot=snapshot,
                extractor=plugin,
                defaults={
                    'status': ArchiveResult.StatusChoices.QUEUED,
                    'retry_at': timezone.now(),
                    'created_by_id': snapshot.created_by_id,
                }
            )
            if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
                # Reset for retry
                result.status = ArchiveResult.StatusChoices.QUEUED
                result.retry_at = timezone.now()
                result.save()
        else:
            # Create all pending plugins
            snapshot.create_pending_archiveresults()
        # Reset snapshot status to allow processing
        # NOTE(review): indentation reconstructed from a whitespace-stripped
        # source — confirm retry_at/save apply to every snapshot, not only
        # to SEALED ones.
        if snapshot.status == Snapshot.StatusChoices.SEALED:
            snapshot.status = Snapshot.StatusChoices.STARTED
        snapshot.retry_at = timezone.now()
        snapshot.save()
        processed_count += 1

    if processed_count == 0:
        rprint('[red]No snapshots to process[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)

    # Run orchestrator if --wait (default)
    if wait:
        rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.runloop()

    # Output results as JSONL (when piped) or human-readable (when TTY)
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
            results = snapshot.archiveresult_set.all()
            if plugin:
                results = results.filter(extractor=plugin)
            for result in results:
                if is_tty:
                    status_color = {
                        'succeeded': 'green',
                        'failed': 'red',
                        'skipped': 'yellow',
                    }.get(result.status, 'dim')
                    rprint(f' [{status_color}]{result.status}[/{status_color}] {result.extractor}{result.output or ""}', file=sys.stderr)
                else:
                    write_record(archiveresult_to_jsonl(result))
        except Snapshot.DoesNotExist:
            continue
    return 0
def is_archiveresult_id(value: str) -> bool:
    """Check if value looks like an ArchiveResult UUID."""
    import re
    # Cheap syntactic check first: bail out before touching the DB.
    if not re.match(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', value, re.I):
        return False
    # Verify it's actually an ArchiveResult (not a Snapshot or other object)
    from core.models import ArchiveResult
    return ArchiveResult.objects.filter(id=value).exists()
# <user>@<machine_id>#<datetime>/absolute/path/to/binary
# 2014.24.01
@click.command()
@click.option('--plugin', '-p', default='', help='Run only this plugin (e.g., screenshot, singlefile)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
def main(plugin: str, wait: bool, args: tuple):
    """Run plugins on Snapshots, or process existing ArchiveResults by ID"""
    import json
    from rich import print as rprint
    from archivebox.misc.jsonl import read_args_or_stdin

    # Read ALL input up-front. When args is empty this consumes stdin, which
    # cannot be re-read afterwards - see the BUGFIX note below.
    records = list(read_args_or_stdin(args))
    if not records:
        rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        sys.exit(1)

    # Check if input looks like existing ArchiveResult IDs to process
    all_are_archiveresult_ids = all(
        is_archiveresult_id(r.get('id') or r.get('url', ''))
        for r in records
    )
    if all_are_archiveresult_ids:
        # Process existing ArchiveResults by ID; report failure if ANY failed
        exit_code = 0
        for record in records:
            archiveresult_id = record.get('id') or record.get('url')
            result = process_archiveresult_by_id(archiveresult_id)
            if result != 0:
                exit_code = result
        sys.exit(exit_code)
    else:
        # Default behavior: run plugins on Snapshots from input.
        # BUGFIX: run_plugins() calls read_args_or_stdin() itself, so passing
        # the original `args` tuple would re-read stdin - which was already
        # consumed above, silently dropping all piped input. Instead,
        # re-serialize the already-parsed records as JSONL argument lines
        # (read_args_or_stdin accepts JSONL in args as well as stdin).
        jsonl_args = tuple(json.dumps(r) for r in records)
        sys.exit(run_plugins(jsonl_args, plugin=plugin, wait=wait))


if __name__ == '__main__':
    main()

View File

@@ -21,10 +21,9 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
from archivebox.config import CONSTANTS, VERSION, DATA_DIR
from archivebox.config.common import SERVER_CONFIG
from archivebox.config.collection import write_config_file
from archivebox.index import load_main_index, write_main_index, fix_invalid_folder_locations, get_invalid_folders
from archivebox.index.schema import Link
from archivebox.index.json import parse_json_main_index, parse_json_links_details
from archivebox.index.sql import apply_migrations
from archivebox.misc.folders import fix_invalid_folder_locations, get_invalid_folders
from archivebox.misc.legacy import parse_json_main_index, parse_json_links_details, SnapshotDict
from archivebox.misc.db import apply_migrations
# if os.access(out_dir / CONSTANTS.JSON_INDEX_FILENAME, os.F_OK):
# print("[red]:warning: This folder contains a JSON index. It is deprecated, and will no longer be kept up to date automatically.[/red]", file=sys.stderr)
@@ -100,10 +99,10 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
from core.models import Snapshot
all_links = Snapshot.objects.none()
pending_links: dict[str, Link] = {}
pending_links: dict[str, SnapshotDict] = {}
if existing_index:
all_links = load_main_index(DATA_DIR, warn=False)
all_links = Snapshot.objects.all()
print(f' √ Loaded {all_links.count()} links from existing main index.')
if quick:
@@ -119,9 +118,9 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
# Links in JSON index but not in main index
orphaned_json_links = {
link.url: link
for link in parse_json_main_index(DATA_DIR)
if not all_links.filter(url=link.url).exists()
link_dict['url']: link_dict
for link_dict in parse_json_main_index(DATA_DIR)
if not all_links.filter(url=link_dict['url']).exists()
}
if orphaned_json_links:
pending_links.update(orphaned_json_links)
@@ -129,9 +128,9 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
# Links in data dir indexes but not in main index
orphaned_data_dir_links = {
link.url: link
for link in parse_json_links_details(DATA_DIR)
if not all_links.filter(url=link.url).exists()
link_dict['url']: link_dict
for link_dict in parse_json_links_details(DATA_DIR)
if not all_links.filter(url=link_dict['url']).exists()
}
if orphaned_data_dir_links:
pending_links.update(orphaned_data_dir_links)
@@ -159,7 +158,8 @@ def init(force: bool=False, quick: bool=False, install: bool=False, setup: bool=
print(' archivebox init --quick', file=sys.stderr)
raise SystemExit(1)
write_main_index(list(pending_links.values()), DATA_DIR)
if pending_links:
Snapshot.objects.create_from_dicts(list(pending_links.values()))
print('\n[green]----------------------------------------------------------------------[/green]')

View File

@@ -4,7 +4,7 @@ __package__ = 'archivebox.cli'
import os
import sys
from typing import Optional, List
import shutil
import rich_click as click
from rich import print
@@ -13,149 +13,86 @@ from archivebox.misc.util import docstring, enforce_types
@enforce_types
def install(binproviders: Optional[List[str]]=None, binaries: Optional[List[str]]=None, dry_run: bool=False) -> None:
"""Automatically install all ArchiveBox dependencies and extras"""
# if running as root:
# - run init to create index + lib dir
# - chown -R 911 DATA_DIR
# - install all binaries as root
# - chown -R 911 LIB_DIR
# else:
# - run init to create index + lib dir as current user
# - install all binaries as current user
# - recommend user re-run with sudo if any deps need to be installed as root
def install(dry_run: bool=False) -> None:
"""Detect and install ArchiveBox dependencies by running a dependency-check crawl"""
import abx
import archivebox
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, SudoPermission
from archivebox.config.paths import DATA_DIR, ARCHIVE_DIR, get_or_create_working_lib_dir
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
from archivebox.config.paths import ARCHIVE_DIR
from archivebox.misc.logging import stderr
from archivebox.cli.archivebox_init import init
from archivebox.misc.system import run as run_shell
if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
init() # must init full index because we need a db to store InstalledBinary entries in
print('\n[green][+] Installing ArchiveBox dependencies automatically...[/green]')
# we never want the data dir to be owned by root, detect owner of existing owner of DATA_DIR to try and guess desired non-root UID
print('\n[green][+] Detecting ArchiveBox dependencies...[/green]')
if IS_ROOT:
EUID = os.geteuid()
# if we have sudo/root permissions, take advantage of them just while installing dependencies
print()
print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue] with [red]sudo[/red] only for dependencies that need it.[/yellow]')
print(f' DATA_DIR, LIB_DIR, and TMP_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
print(f'[yellow]:warning: Running as UID=[blue]{EUID}[/blue].[/yellow]')
print(f' DATA_DIR will be owned by [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue].')
print()
LIB_DIR = get_or_create_working_lib_dir()
package_manager_names = ', '.join(
f'[yellow]{binprovider.name}[/yellow]'
for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values()))
if not binproviders or (binproviders and binprovider.name in binproviders)
)
print(f'[+] Setting up package managers {package_manager_names}...')
for binprovider in reversed(list(abx.as_dict(abx.pm.hook.get_BINPROVIDERS()).values())):
if binproviders and binprovider.name not in binproviders:
continue
try:
binprovider.setup()
except Exception:
# it's ok, installing binaries below will automatically set up package managers as needed
# e.g. if user does not have npm available we cannot set it up here yet, but once npm Binary is installed
# the next package that depends on npm will automatically call binprovider.setup() during its own install
pass
print()
for binary in reversed(list(abx.as_dict(abx.pm.hook.get_BINARIES()).values())):
if binary.name in ('archivebox', 'django', 'sqlite', 'python'):
# obviously must already be installed if we are running
continue
if binaries and binary.name not in binaries:
continue
providers = ' [grey53]or[/grey53] '.join(
provider.name for provider in binary.binproviders_supported
if not binproviders or (binproviders and provider.name in binproviders)
)
if not providers:
continue
print(f'[+] Detecting / Installing [yellow]{binary.name.ljust(22)}[/yellow] using [red]{providers}[/red]...')
try:
with SudoPermission(uid=0, fallback=True):
# print(binary.load_or_install(fresh=True).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'}))
if binproviders:
providers_supported_by_binary = [provider.name for provider in binary.binproviders_supported]
for binprovider_name in binproviders:
if binprovider_name not in providers_supported_by_binary:
continue
try:
if dry_run:
# always show install commands when doing a dry run
sys.stderr.write("\033[2;49;90m") # grey53
result = binary.install(binproviders=[binprovider_name], dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
sys.stderr.write("\033[00m\n") # reset
else:
loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, binproviders=[binprovider_name], fresh=True, dry_run=dry_run, quiet=False)
result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
if result and result['loaded_version']:
break
except Exception as e:
print(f'[red]:cross_mark: Failed to install {binary.name} as using {binprovider_name} as user {ARCHIVEBOX_USER}: {e}[/red]')
else:
if dry_run:
sys.stderr.write("\033[2;49;90m") # grey53
binary.install(dry_run=dry_run).model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
sys.stderr.write("\033[00m\n") # reset
else:
loaded_binary = archivebox.pm.hook.binary_load_or_install(binary=binary, fresh=True, dry_run=dry_run)
result = loaded_binary.model_dump(exclude={'overrides', 'bin_dir', 'hook_type'})
if IS_ROOT and LIB_DIR:
with SudoPermission(uid=0):
if ARCHIVEBOX_USER == 0:
os.system(f'chmod -R 777 "{LIB_DIR.resolve()}"')
else:
os.system(f'chown -R {ARCHIVEBOX_USER} "{LIB_DIR.resolve()}"')
except Exception as e:
print(f'[red]:cross_mark: Failed to install {binary.name} as user {ARCHIVEBOX_USER}: {e}[/red]')
if binaries and len(binaries) == 1:
# if we are only installing a single binary, raise the exception so the user can see what went wrong
raise
if dry_run:
print('[dim]Dry run - would create a crawl to detect dependencies[/dim]')
return
# Set up Django
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from crawls.models import Seed, Crawl
from archivebox.base_models.models import get_or_create_system_user_pk
# Create a seed and crawl for dependency detection
# Using a minimal crawl that will trigger on_Crawl hooks
created_by_id = get_or_create_system_user_pk()
seed = Seed.objects.create(
uri='archivebox://install',
label='Dependency detection',
created_by_id=created_by_id,
)
crawl = Crawl.objects.create(
seed=seed,
max_depth=0,
created_by_id=created_by_id,
status='queued',
)
print(f'[+] Created dependency detection crawl: {crawl.id}')
print('[+] Running crawl to detect binaries via on_Crawl hooks...')
print()
# Run the crawl synchronously (this triggers on_Crawl hooks)
from workers.orchestrator import Orchestrator
orchestrator = Orchestrator(exit_on_idle=True)
orchestrator.runloop()
print()
# Check for superuser
from django.contrib.auth import get_user_model
User = get_user_model()
if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
stderr('\n[+] Don\'t forget to create a new admin user for the Web UI...', color='green')
stderr(' archivebox manage createsuperuser')
# run_subcommand('manage', subcommand_args=['createsuperuser'], pwd=out_dir)
print('\n[green][√] Set up ArchiveBox and its dependencies successfully.[/green]\n', file=sys.stderr)
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
extra_args = []
if binproviders:
extra_args.append(f'--binproviders={",".join(binproviders)}')
if binaries:
extra_args.append(f'--binaries={",".join(binaries)}')
proc = run_shell([ARCHIVEBOX_BINARY.load().abspath, 'version', *extra_args], capture_output=False, cwd=DATA_DIR)
raise SystemExit(proc.returncode)
print()
# Run version to show full status
archivebox_path = shutil.which('archivebox') or sys.executable
if 'python' in archivebox_path:
os.system(f'{sys.executable} -m archivebox version')
else:
os.system(f'{archivebox_path} version')
@click.command()
@click.option('--binproviders', '-p', type=str, help='Select binproviders to use DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)', default=None)
@click.option('--binaries', '-b', type=str, help='Select binaries to install DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)', default=None)
@click.option('--dry-run', '-d', is_flag=True, help='Show what would be installed without actually installing anything', default=False)
@click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False)
@docstring(install.__doc__)
def main(**kwargs) -> None:
install(**kwargs)

View File

@@ -0,0 +1,67 @@
#!/usr/bin/env python3
"""
archivebox orchestrator [--daemon]
Start the orchestrator process that manages workers.
The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
and lazily spawns worker processes when there is work to be done.
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox orchestrator'
import sys
import rich_click as click
from archivebox.misc.util import docstring
def orchestrator(daemon: bool = False, watch: bool = False) -> int:
    """
    Start the orchestrator process.
    The orchestrator:
    1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
    2. Spawns worker processes when there is work to do
    3. Monitors worker health and restarts failed workers
    4. Exits when all queues are empty (unless --daemon)
    Args:
        daemon: Run forever (don't exit when idle)
        watch: Just watch the queues without spawning workers (for debugging)
    Exit codes:
        0: All work completed successfully
        1: Error occurred
    """
    # Use rich's print so the [yellow]/[red] markup below actually renders;
    # this module otherwise only has the builtin print in scope, which would
    # emit the markup tags literally.
    from rich import print as rprint
    from workers.orchestrator import Orchestrator

    if Orchestrator.is_running():
        rprint('[yellow]Orchestrator is already running[/yellow]')
        return 0

    if watch:
        # --watch is advertised by the CLI but not implemented yet; warn loudly
        # instead of silently ignoring the flag and spawning workers anyway.
        rprint('[yellow]--watch is not implemented yet; running normally[/yellow]', file=sys.stderr)

    try:
        orchestrator_instance = Orchestrator(exit_on_idle=not daemon)
        orchestrator_instance.runloop()
        return 0
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop a long-running orchestrator, not an error.
        return 0
    except Exception as e:
        rprint(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
        return 1
@click.command()
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--watch', '-w', is_flag=True, help="Watch queues without spawning workers")
@docstring(orchestrator.__doc__)
def main(daemon: bool, watch: bool):
    """Start the ArchiveBox orchestrator process"""
    # Delegate to orchestrator() and surface its return value as the exit code.
    exit_code = orchestrator(daemon=daemon, watch=watch)
    sys.exit(exit_code)
if __name__ == '__main__':
main()

View File

@@ -12,10 +12,7 @@ import rich_click as click
from django.db.models import QuerySet
from archivebox.config import DATA_DIR
from archivebox.index.schema import Link
from archivebox.config.django import setup_django
from archivebox.index import load_main_index
from archivebox.index.sql import remove_from_sql_main_index
from archivebox.misc.util import enforce_types, docstring
from archivebox.misc.checks import check_data_folder
from archivebox.misc.logging_util import (
@@ -35,7 +32,7 @@ def remove(filter_patterns: Iterable[str]=(),
before: float | None=None,
yes: bool=False,
delete: bool=False,
out_dir: Path=DATA_DIR) -> Iterable[Link]:
out_dir: Path=DATA_DIR) -> QuerySet:
"""Remove the specified URLs from the archive"""
setup_django()
@@ -63,27 +60,27 @@ def remove(filter_patterns: Iterable[str]=(),
log_removal_finished(0, 0)
raise SystemExit(1)
log_links = [link.as_link() for link in snapshots]
log_list_finished(log_links)
log_removal_started(log_links, yes=yes, delete=delete)
log_list_finished(snapshots)
log_removal_started(snapshots, yes=yes, delete=delete)
timer = TimedProgress(360, prefix=' ')
try:
for snapshot in snapshots:
if delete:
shutil.rmtree(snapshot.as_link().link_dir, ignore_errors=True)
shutil.rmtree(snapshot.output_dir, ignore_errors=True)
finally:
timer.end()
to_remove = snapshots.count()
from archivebox.search import flush_search_index
from core.models import Snapshot
flush_search_index(snapshots=snapshots)
remove_from_sql_main_index(snapshots=snapshots, out_dir=out_dir)
all_snapshots = load_main_index(out_dir=out_dir)
snapshots.delete()
all_snapshots = Snapshot.objects.all()
log_removal_finished(all_snapshots.count(), to_remove)
return all_snapshots

View File

@@ -35,9 +35,12 @@ def schedule(add: bool=False,
depth = int(depth)
import shutil
from crontab import CronTab, CronSlices
from archivebox.misc.system import dedupe_cron_jobs
from abx_plugin_pip.binaries import ARCHIVEBOX_BINARY
# Find the archivebox binary path
ARCHIVEBOX_ABSPATH = shutil.which('archivebox') or sys.executable.replace('python', 'archivebox')
Path(CONSTANTS.LOGS_DIR).mkdir(exist_ok=True)
@@ -58,7 +61,7 @@ def schedule(add: bool=False,
'cd',
quoted(out_dir),
'&&',
quoted(ARCHIVEBOX_BINARY.load().abspath),
quoted(ARCHIVEBOX_ABSPATH),
*([
'add',
*(['--overwrite'] if overwrite else []),

View File

@@ -4,7 +4,7 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox search'
from pathlib import Path
from typing import Optional, List, Iterable
from typing import Optional, List, Any
import rich_click as click
from rich import print
@@ -12,11 +12,19 @@ from rich import print
from django.db.models import QuerySet
from archivebox.config import DATA_DIR
from archivebox.index import LINK_FILTERS
from archivebox.index.schema import Link
from archivebox.misc.logging import stderr
from archivebox.misc.util import enforce_types, docstring
# Filter types for URL matching
LINK_FILTERS = {
'exact': lambda pattern: {'url': pattern},
'substring': lambda pattern: {'url__icontains': pattern},
'regex': lambda pattern: {'url__iregex': pattern},
'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
'tag': lambda pattern: {'tags__name': pattern},
'timestamp': lambda pattern: {'timestamp': pattern},
}
STATUS_CHOICES = [
'indexed', 'archived', 'unarchived', 'present', 'valid', 'invalid',
'duplicate', 'orphaned', 'corrupted', 'unrecognized'
@@ -24,38 +32,37 @@ STATUS_CHOICES = [
def list_links(snapshots: Optional[QuerySet]=None,
filter_patterns: Optional[List[str]]=None,
filter_type: str='substring',
after: Optional[float]=None,
before: Optional[float]=None,
out_dir: Path=DATA_DIR) -> Iterable[Link]:
from archivebox.index import load_main_index
from archivebox.index import snapshot_filter
def get_snapshots(snapshots: Optional[QuerySet]=None,
filter_patterns: Optional[List[str]]=None,
filter_type: str='substring',
after: Optional[float]=None,
before: Optional[float]=None,
out_dir: Path=DATA_DIR) -> QuerySet:
"""Filter and return Snapshots matching the given criteria."""
from core.models import Snapshot
if snapshots:
all_snapshots = snapshots
result = snapshots
else:
all_snapshots = load_main_index(out_dir=out_dir)
result = Snapshot.objects.all()
if after is not None:
all_snapshots = all_snapshots.filter(timestamp__gte=after)
result = result.filter(timestamp__gte=after)
if before is not None:
all_snapshots = all_snapshots.filter(timestamp__lt=before)
result = result.filter(timestamp__lt=before)
if filter_patterns:
all_snapshots = snapshot_filter(all_snapshots, filter_patterns, filter_type)
result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type)
if not all_snapshots:
if not result:
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
return all_snapshots
return result
def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict[str, Link | None]:
def list_folders(snapshots: QuerySet, status: str, out_dir: Path=DATA_DIR) -> dict[str, Any]:
from archivebox.misc.checks import check_data_folder
from archivebox.index import (
from archivebox.misc.folders import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
@@ -67,7 +74,7 @@ def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict
get_corrupted_folders,
get_unrecognized_folders,
)
check_data_folder()
STATUS_FUNCTIONS = {
@@ -84,7 +91,7 @@ def list_folders(links: list[Link], status: str, out_dir: Path=DATA_DIR) -> dict
}
try:
return STATUS_FUNCTIONS[status](links, out_dir=out_dir)
return STATUS_FUNCTIONS[status](snapshots, out_dir=out_dir)
except KeyError:
raise ValueError('Status not recognized.')
@@ -109,7 +116,7 @@ def search(filter_patterns: list[str] | None=None,
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
raise SystemExit(2)
snapshots = list_links(
snapshots = get_snapshots(
filter_patterns=list(filter_patterns) if filter_patterns else None,
filter_type=filter_type,
before=before,
@@ -120,20 +127,24 @@ def search(filter_patterns: list[str] | None=None,
snapshots = snapshots.order_by(sort)
folders = list_folders(
links=snapshots,
snapshots=snapshots,
status=status,
out_dir=DATA_DIR,
)
if json:
from archivebox.index.json import generate_json_index_from_links
output = generate_json_index_from_links(folders.values(), with_headers)
from core.models import Snapshot
# Filter for non-None snapshots
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_json(with_headers=with_headers)
elif html:
from archivebox.index.html import generate_index_from_links
output = generate_index_from_links(folders.values(), with_headers)
from core.models import Snapshot
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_html(with_headers=with_headers)
elif csv:
from archivebox.index.csv import links_to_csv
output = links_to_csv(folders.values(), csv.split(','), with_headers)
from core.models import Snapshot
valid_snapshots = [s for s in folders.values() if s is not None]
output = Snapshot.objects.filter(pk__in=[s.pk for s in valid_snapshots]).to_csv(cols=csv.split(','), header=with_headers)
else:
from archivebox.misc.logging_util import printable_folders
output = printable_folders(folders, with_headers)

View File

@@ -0,0 +1,218 @@
#!/usr/bin/env python3
"""
archivebox snapshot [urls...] [--depth=N] [--tag=TAG] [--plugins=...]
Create Snapshots from URLs. Accepts URLs as arguments, from stdin, or via JSONL.
Input formats:
- Plain URLs (one per line)
- JSONL: {"type": "Snapshot", "url": "...", "title": "...", "tags": "..."}
Output (JSONL):
{"type": "Snapshot", "id": "...", "url": "...", "status": "queued", ...}
Examples:
# Create snapshots from URLs
archivebox snapshot https://example.com https://foo.com
# Pipe from stdin
echo 'https://example.com' | archivebox snapshot
# Chain with extract
archivebox snapshot https://example.com | archivebox extract
# With crawl depth
archivebox snapshot --depth=1 https://example.com
"""
__package__ = 'archivebox.cli'
__command__ = 'archivebox snapshot'
import sys
from typing import Optional
import rich_click as click
from archivebox.misc.util import docstring
def process_snapshot_by_id(snapshot_id: str) -> int:
    """
    Process a single Snapshot by ID (used by workers).
    Triggers the Snapshot's state machine tick() which will:
    - Transition from queued -> started (creates pending ArchiveResults)
    - Transition from started -> sealed (when all ArchiveResults done)
    Returns 0 on success, 1 if the Snapshot is missing or tick() raises.
    """
    from rich import print as rprint
    from core.models import Snapshot

    # Look up the snapshot; report a missing ID as a failure exit code.
    snapshot = Snapshot.objects.filter(id=snapshot_id).first()
    if snapshot is None:
        rprint(f'[red]Snapshot {snapshot_id} not found[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Processing Snapshot {snapshot.id} {snapshot.url[:50]} (status={snapshot.status})[/blue]', file=sys.stderr)

    try:
        # Advance the state machine, then reload to report the resulting status.
        snapshot.sm.tick()
        snapshot.refresh_from_db()
        rprint(f'[green]Snapshot complete (status={snapshot.status})[/green]', file=sys.stderr)
    except Exception as err:
        rprint(f'[red]Snapshot error: {type(err).__name__}: {err}[/red]', file=sys.stderr)
        return 1
    return 0
def create_snapshots(
    urls: tuple,
    depth: int = 0,
    tag: str = '',
    plugins: str = '',
    created_by_id: Optional[int] = None,
) -> int:
    """
    Create Snapshots from URLs or JSONL records.
    Reads from args or stdin, creates Snapshot objects, outputs JSONL.
    If --plugins is passed, also runs specified plugins (blocking).

    Args:
        urls: raw CLI args; parsed into records by read_args_or_stdin
              (stdin is consulted when no args are given)
        depth: when > 0, a Crawl is created to manage recursive discovery
        tag: comma-separated tags applied to records that have none
        plugins: when non-empty, run the orchestrator after creating snapshots
        created_by_id: owner user pk; defaults to the system user

    Exit codes:
        0: Success
        1: Failure
    """
    from rich import print as rprint
    from django.utils import timezone
    # NOTE: TYPE_TAG is imported here but never used in this function.
    from archivebox.misc.jsonl import (
        read_args_or_stdin, write_record, snapshot_to_jsonl,
        TYPE_SNAPSHOT, TYPE_TAG, get_or_create_snapshot
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    from core.models import Snapshot
    from crawls.models import Seed, Crawl
    from archivebox.config import CONSTANTS
    created_by_id = created_by_id or get_or_create_system_user_pk()
    # JSONL output is suppressed when stdout is a terminal (human-readable mode).
    is_tty = sys.stdout.isatty()
    # Collect all input records
    records = list(read_args_or_stdin(urls))
    if not records:
        rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
        return 1
    # If depth > 0, we need a Crawl to manage recursive discovery
    crawl = None
    if depth > 0:
        # Create a seed for this batch: persist the input URLs to a timestamped
        # sources file, then build the Seed/Crawl pair from it.
        sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__snapshot.txt'
        sources_file.parent.mkdir(parents=True, exist_ok=True)
        sources_file.write_text('\n'.join(r.get('url', '') for r in records if r.get('url')))
        seed = Seed.from_file(
            sources_file,
            label=f'snapshot --depth={depth}',
            created_by=created_by_id,
        )
        crawl = Crawl.from_seed(seed, max_depth=depth)
    # Process each record
    created_snapshots = []
    for record in records:
        # Skip records that are neither typed as Snapshots nor carry a URL.
        if record.get('type') != TYPE_SNAPSHOT and 'url' not in record:
            continue
        try:
            # Add crawl info if we have one
            if crawl:
                record['crawl_id'] = str(crawl.id)
                record['depth'] = record.get('depth', 0)
            # Add tags if provided via CLI (record-level tags take precedence)
            if tag and not record.get('tags'):
                record['tags'] = tag
            # Get or create the snapshot
            snapshot = get_or_create_snapshot(record, created_by_id=created_by_id)
            created_snapshots.append(snapshot)
            # Output JSONL record (only when piped)
            if not is_tty:
                write_record(snapshot_to_jsonl(snapshot))
        except Exception as e:
            # Best-effort batch: one bad record doesn't abort the rest.
            rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
            continue
    if not created_snapshots:
        rprint('[red]No snapshots created[/red]', file=sys.stderr)
        return 1
    rprint(f'[green]Created {len(created_snapshots)} snapshots[/green]', file=sys.stderr)
    # If TTY, show human-readable output
    if is_tty:
        for snapshot in created_snapshots:
            rprint(f' [dim]{snapshot.id}[/dim] {snapshot.url[:60]}', file=sys.stderr)
    # If --plugins is passed, run the orchestrator for those plugins
    if plugins:
        from workers.orchestrator import Orchestrator
        # NOTE: the 'or "all"' fallback below is dead code — this branch only
        # runs when plugins is truthy.
        # NOTE(review): the plugins list is not passed to the Orchestrator;
        # presumably the runloop processes ALL queued work — confirm whether
        # per-plugin filtering is intended here.
        rprint(f'[blue]Running plugins: {plugins or "all"}...[/blue]', file=sys.stderr)
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.runloop()
    return 0
def is_snapshot_id(value: str) -> bool:
    """Check if value looks like a Snapshot UUID (hyphenated 8-4-4-4-12 hex)."""
    import re
    # Build the canonical hyphenated-UUID pattern from its group widths.
    widths = (8, 4, 4, 4, 12)
    pattern = '^' + '-'.join('[0-9a-f]{%d}' % w for w in widths) + '$'
    return re.match(pattern, value, re.IGNORECASE) is not None
@click.command()
@click.option('--depth', '-d', type=int, default=0, help='Recursively crawl linked pages up to N levels deep')
@click.option('--tag', '-t', default='', help='Comma-separated tags to add to each snapshot')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run after creating snapshots (e.g. title,screenshot)')
@click.argument('args', nargs=-1)
def main(depth: int, tag: str, plugins: str, args: tuple):
    """Create Snapshots from URLs, or process existing Snapshots by ID"""
    from archivebox.misc.jsonl import read_args_or_stdin
    # Read all input
    records = list(read_args_or_stdin(args))
    if not records:
        from rich import print as rprint
        rprint('[yellow]No URLs or Snapshot IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        sys.exit(1)
    # Check if input looks like existing Snapshot IDs to process
    # If ALL inputs are UUIDs with no URL, assume we're processing existing Snapshots
    all_are_ids = all(
        (r.get('id') and not r.get('url')) or is_snapshot_id(r.get('url', ''))
        for r in records
    )
    if all_are_ids:
        # Process existing Snapshots by ID; return the last non-zero exit code.
        exit_code = 0
        for record in records:
            snapshot_id = record.get('id') or record.get('url')
            result = process_snapshot_by_id(snapshot_id)
            if result != 0:
                exit_code = result
        sys.exit(exit_code)
    else:
        # Create new Snapshots from URLs
        # NOTE(review): create_snapshots() calls read_args_or_stdin(args) again;
        # when input arrived via stdin (args empty), stdin was already consumed
        # above and the second read may see nothing — confirm read_args_or_stdin
        # buffers stdin, or pass the parsed records through instead.
        sys.exit(create_snapshots(args, depth=depth, tag=tag, plugins=plugins))
if __name__ == '__main__':
main()

View File

@@ -10,9 +10,8 @@ from rich import print
from archivebox.misc.util import enforce_types, docstring
from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR
from archivebox.config.common import SHELL_CONFIG
from archivebox.index.json import parse_json_links_details
from archivebox.index import (
load_main_index,
from archivebox.misc.legacy import parse_json_links_details
from archivebox.misc.folders import (
get_indexed_folders,
get_archived_folders,
get_invalid_folders,
@@ -33,7 +32,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
"""Print out some info and statistics about the archive collection"""
from django.contrib.auth import get_user_model
from archivebox.index.sql import get_admins
from archivebox.misc.db import get_admins
from core.models import Snapshot
User = get_user_model()
@@ -44,7 +43,7 @@ def status(out_dir: Path=DATA_DIR) -> None:
print(f' Index size: {size} across {num_files} files')
print()
links = load_main_index(out_dir=out_dir)
links = Snapshot.objects.all()
num_sql_links = links.count()
num_link_details = sum(1 for link in parse_json_links_details(out_dir=out_dir))
print(f' > SQL Main Index: {num_sql_links} links'.ljust(36), f'(found in {CONSTANTS.SQL_INDEX_FILENAME})')

View File

@@ -8,8 +8,7 @@ import rich_click as click
from typing import Iterable
from archivebox.misc.util import enforce_types, docstring
from archivebox.index import (
LINK_FILTERS,
from archivebox.misc.folders import (
get_indexed_folders,
get_archived_folders,
get_unarchived_folders,
@@ -22,6 +21,16 @@ from archivebox.index import (
get_unrecognized_folders,
)
# Filter types for URL matching
LINK_FILTERS = {
'exact': lambda pattern: {'url': pattern},
'substring': lambda pattern: {'url__icontains': pattern},
'regex': lambda pattern: {'url__iregex': pattern},
'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
'tag': lambda pattern: {'tags__name': pattern},
'timestamp': lambda pattern: {'timestamp': pattern},
}
@enforce_types
def update(filter_patterns: Iterable[str]=(),
@@ -33,15 +42,66 @@ def update(filter_patterns: Iterable[str]=(),
after: float | None=None,
status: str='indexed',
filter_type: str='exact',
extract: str="") -> None:
plugins: str="",
max_workers: int=4) -> None:
"""Import any new links from subscriptions and retry any previously failed/skipped links"""
from rich import print
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from core.models import Snapshot
from workers.orchestrator import parallel_archive
from workers.orchestrator import Orchestrator
orchestrator = Orchestrator(exit_on_idle=False)
orchestrator.start()
# Get snapshots to update based on filters
snapshots = Snapshot.objects.all()
if filter_patterns:
snapshots = Snapshot.objects.filter_by_patterns(list(filter_patterns), filter_type)
if status == 'unarchived':
snapshots = snapshots.filter(downloaded_at__isnull=True)
elif status == 'archived':
snapshots = snapshots.filter(downloaded_at__isnull=False)
if before:
from datetime import datetime
snapshots = snapshots.filter(bookmarked_at__lt=datetime.fromtimestamp(before))
if after:
from datetime import datetime
snapshots = snapshots.filter(bookmarked_at__gt=datetime.fromtimestamp(after))
if resume:
snapshots = snapshots.filter(timestamp__gte=str(resume))
snapshot_ids = list(snapshots.values_list('pk', flat=True))
if not snapshot_ids:
print('[yellow]No snapshots found matching the given filters[/yellow]')
return
print(f'[green]\\[*] Found {len(snapshot_ids)} snapshots to update[/green]')
if index_only:
print('[yellow]Index-only mode - skipping archiving[/yellow]')
return
methods = plugins.split(',') if plugins else None
# Queue snapshots for archiving via the state machine system
# Workers will pick them up and run the plugins
if len(snapshot_ids) > 1 and max_workers > 1:
parallel_archive(snapshot_ids, max_workers=max_workers, overwrite=overwrite, methods=methods)
else:
# Queue snapshots by setting status to queued
for snapshot in snapshots:
Snapshot.objects.filter(id=snapshot.id).update(
status=Snapshot.StatusChoices.QUEUED,
retry_at=timezone.now(),
)
print(f'[green]Queued {len(snapshot_ids)} snapshots for archiving[/green]')
@click.command()
@@ -71,7 +131,8 @@ Update only links or data directories that have the given status:
unrecognized {get_unrecognized_folders.__doc__}
''')
@click.option('--filter-type', '-t', type=click.Choice([*LINK_FILTERS.keys(), 'search']), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.option('--extract', '-e', default='', help='Comma-separated list of extractors to use e.g. title,favicon,screenshot,singlefile,...')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to use e.g. title,favicon,screenshot,singlefile,...')
@click.option('--max-workers', '-j', type=int, default=4, help='Number of parallel worker processes for archiving')
@click.argument('filter_patterns', nargs=-1)
@docstring(update.__doc__)
def main(**kwargs):

View File

@@ -3,7 +3,10 @@
__package__ = 'archivebox.cli'
import sys
from typing import Iterable
import os
import platform
from pathlib import Path
from typing import Iterable, Optional
import rich_click as click
@@ -12,7 +15,6 @@ from archivebox.misc.util import docstring, enforce_types
@enforce_types
def version(quiet: bool=False,
binproviders: Iterable[str]=(),
binaries: Iterable[str]=()) -> list[str]:
"""Print the ArchiveBox version, debug metadata, and installed dependency versions"""
@@ -22,37 +24,24 @@ def version(quiet: bool=False,
if quiet or '--version' in sys.argv:
return []
# Only do slower imports when getting full version info
import os
import platform
from pathlib import Path
from rich.panel import Panel
from rich.console import Console
from abx_pkg import Binary
import abx
import archivebox
from archivebox.config import CONSTANTS, DATA_DIR
from archivebox.config.version import get_COMMIT_HASH, get_BUILD_TIME
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, RUNNING_AS_UID, RUNNING_AS_GID, IN_DOCKER
from archivebox.config.paths import get_data_locations, get_code_locations
from archivebox.config.common import SHELL_CONFIG, STORAGE_CONFIG, SEARCH_BACKEND_CONFIG
from archivebox.misc.logging_util import printable_folder_status
from abx_plugin_default_binproviders import apt, brew, env
from archivebox.config.configset import get_config
console = Console()
prnt = console.print
LDAP_ENABLED = archivebox.pm.hook.get_SCOPE_CONFIG().LDAP_ENABLED
# Check if LDAP is enabled (simple config lookup)
config = get_config()
LDAP_ENABLED = config.get('LDAP_ENABLED', False)
# 0.7.1
# ArchiveBox v0.7.1+editable COMMIT_HASH=951bba5 BUILD_TIME=2023-12-17 16:46:05 1702860365
# IN_DOCKER=False IN_QEMU=False ARCH=arm64 OS=Darwin PLATFORM=macOS-14.2-arm64-arm-64bit PYTHON=Cpython
# FS_ATOMIC=True FS_REMOTE=False FS_USER=501:20 FS_PERMS=644
# DEBUG=False IS_TTY=True TZ=UTC SEARCH_BACKEND=ripgrep LDAP=False
p = platform.uname()
COMMIT_HASH = get_COMMIT_HASH()
prnt(
@@ -68,15 +57,26 @@ def version(quiet: bool=False,
f'PLATFORM={platform.platform()}',
f'PYTHON={sys.implementation.name.title()}' + (' (venv)' if CONSTANTS.IS_INSIDE_VENV else ''),
)
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
prnt(
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
)
try:
OUTPUT_IS_REMOTE_FS = get_data_locations().DATA_DIR.is_mount or get_data_locations().ARCHIVE_DIR.is_mount
except Exception:
OUTPUT_IS_REMOTE_FS = False
try:
DATA_DIR_STAT = CONSTANTS.DATA_DIR.stat()
prnt(
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
f'FS_UID={DATA_DIR_STAT.st_uid}:{DATA_DIR_STAT.st_gid}',
f'FS_PERMS={STORAGE_CONFIG.OUTPUT_PERMISSIONS}',
f'FS_ATOMIC={STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
)
except Exception:
prnt(
f'EUID={os.geteuid()}:{os.getegid()} UID={RUNNING_AS_UID}:{RUNNING_AS_GID} PUID={ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}',
)
prnt(
f'DEBUG={SHELL_CONFIG.DEBUG}',
f'IS_TTY={SHELL_CONFIG.IS_TTY}',
@@ -84,14 +84,11 @@ def version(quiet: bool=False,
f'ID={CONSTANTS.MACHINE_ID}:{CONSTANTS.COLLECTION_ID}',
f'SEARCH_BACKEND={SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE}',
f'LDAP={LDAP_ENABLED}',
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
)
prnt()
if not (os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) and os.access(CONSTANTS.CONFIG_FILE, os.R_OK)):
PANEL_TEXT = '\n'.join((
# '',
# f'[yellow]CURRENT DIR =[/yellow] [red]{os.getcwd()}[/red]',
'',
'[violet]Hint:[/violet] [green]cd[/green] into a collection [blue]DATA_DIR[/blue] and run [green]archivebox version[/green] again...',
' [grey53]OR[/grey53] run [green]archivebox init[/green] to create a new collection in the current dir.',
@@ -105,77 +102,94 @@ def version(quiet: bool=False,
prnt('[pale_green1][i] Binary Dependencies:[/pale_green1]')
failures = []
BINARIES = abx.as_dict(archivebox.pm.hook.get_BINARIES())
for name, binary in list(BINARIES.items()):
if binary.name == 'archivebox':
continue
# skip if the binary is not in the requested list of binaries
if binaries and binary.name not in binaries:
continue
# skip if the binary is not supported by any of the requested binproviders
if binproviders and binary.binproviders_supported and not any(provider.name in binproviders for provider in binary.binproviders_supported):
continue
err = None
try:
loaded_bin = binary.load()
except Exception as e:
err = e
loaded_bin = binary
provider_summary = f'[dark_sea_green3]{loaded_bin.binprovider.name.ljust(10)}[/dark_sea_green3]' if loaded_bin.binprovider else '[grey23]not found[/grey23] '
if loaded_bin.abspath:
abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
if ' ' in abspath:
abspath = abspath.replace(' ', r'\ ')
else:
abspath = f'[red]{err}[/red]'
prnt('', '[green]√[/green]' if loaded_bin.is_valid else '[red]X[/red]', '', loaded_bin.name.ljust(21), str(loaded_bin.version).ljust(12), provider_summary, abspath, overflow='ignore', crop=False)
if not loaded_bin.is_valid:
failures.append(loaded_bin.name)
prnt()
prnt('[gold3][i] Package Managers:[/gold3]')
BINPROVIDERS = abx.as_dict(archivebox.pm.hook.get_BINPROVIDERS())
for name, binprovider in list(BINPROVIDERS.items()):
err = None
if binproviders and binprovider.name not in binproviders:
continue
# TODO: implement a BinProvider.BINARY() method that gets the loaded binary for a binprovider's INSTALLER_BIN
loaded_bin = binprovider.INSTALLER_BINARY or Binary(name=binprovider.INSTALLER_BIN, binproviders=[env, apt, brew])
abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
abspath = None
if loaded_bin.abspath:
abspath = str(loaded_bin.abspath).replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
if ' ' in abspath:
abspath = abspath.replace(' ', r'\ ')
PATH = str(binprovider.PATH).replace(str(DATA_DIR), '[light_slate_blue].[/light_slate_blue]').replace(str(Path('~').expanduser()), '~')
ownership_summary = f'UID=[blue]{str(binprovider.EUID).ljust(4)}[/blue]'
provider_summary = f'[dark_sea_green3]{str(abspath).ljust(52)}[/dark_sea_green3]' if abspath else f'[grey23]{"not available".ljust(52)}[/grey23]'
prnt('', '[green]√[/green]' if binprovider.is_valid else '[grey53]-[/grey53]', '', binprovider.name.ljust(11), provider_summary, ownership_summary, f'PATH={PATH}', overflow='ellipsis', soft_wrap=True)
if not (binaries or binproviders):
# dont show source code / data dir info if we just want to get version info for a binary or binprovider
# Setup Django before importing models
from archivebox.config.django import setup_django
setup_django()
from machine.models import Machine, InstalledBinary
machine = Machine.current()
# Get all *_BINARY config values
binary_config_keys = [key for key in config.keys() if key.endswith('_BINARY')]
if not binary_config_keys:
prnt('', '[grey53]No binary dependencies defined in config.[/grey53]')
else:
for key in sorted(set(binary_config_keys)):
# Get the actual binary name/path from config value
bin_value = config.get(key, '').strip()
if not bin_value:
continue
# Check if it's a path (has slashes) or just a name
is_path = '/' in bin_value
if is_path:
# It's a full path - match against abspath
bin_name = Path(bin_value).name
# Skip if user specified specific binaries and this isn't one
if binaries and bin_name not in binaries:
continue
# Find InstalledBinary where abspath ends with this path
installed = InstalledBinary.objects.filter(
machine=machine,
abspath__endswith=bin_value,
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
else:
# It's just a binary name - match against name
bin_name = bin_value
# Skip if user specified specific binaries and this isn't one
if binaries and bin_name not in binaries:
continue
# Find InstalledBinary by name
installed = InstalledBinary.objects.filter(
machine=machine,
name__iexact=bin_name,
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
if installed and installed.is_valid:
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
version_str = (installed.version or 'unknown')[:15]
provider = (installed.binprovider or 'env')[:8]
prnt('', '[green]√[/green]', '', bin_name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
else:
prnt('', '[red]X[/red]', '', bin_name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
failures.append(bin_name)
# Show hint if no binaries are installed yet
has_any_installed = InstalledBinary.objects.filter(machine=machine).exclude(abspath='').exists()
if not has_any_installed:
prnt()
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
if not binaries:
# Show code and data locations
prnt()
prnt('[deep_sky_blue3][i] Code locations:[/deep_sky_blue3]')
for name, path in get_code_locations().items():
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
try:
for name, path in get_code_locations().items():
if isinstance(path, dict):
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
except Exception as e:
prnt(f' [red]Error getting code locations: {e}[/red]')
prnt()
if os.access(CONSTANTS.ARCHIVE_DIR, os.R_OK) or os.access(CONSTANTS.CONFIG_FILE, os.R_OK):
prnt('[bright_yellow][i] Data locations:[/bright_yellow]')
for name, path in get_data_locations().items():
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
from archivebox.misc.checks import check_data_dir_permissions
try:
for name, path in get_data_locations().items():
if isinstance(path, dict):
prnt(printable_folder_status(name, path), overflow='ignore', crop=False)
except Exception as e:
prnt(f' [red]Error getting data locations: {e}[/red]')
check_data_dir_permissions()
try:
from archivebox.misc.checks import check_data_dir_permissions
check_data_dir_permissions()
except Exception:
pass
else:
prnt()
prnt('[red][i] Data locations:[/red] (not in a data directory)')
@@ -194,7 +208,6 @@ def version(quiet: bool=False,
@click.command()
@click.option('--quiet', '-q', is_flag=True, help='Only print ArchiveBox version number and nothing else. (equivalent to archivebox --version)')
@click.option('--binproviders', '-p', help='Select binproviders to detect DEFAULT=env,apt,brew,sys_pip,venv_pip,lib_pip,pipx,sys_npm,lib_npm,puppeteer,playwright (all)')
@click.option('--binaries', '-b', help='Select binaries to detect DEFAULT=curl,wget,git,yt-dlp,chrome,single-file,readability-extractor,postlight-parser,... (all)')
@docstring(version.__doc__)
def main(**kwargs):

View File

@@ -4,29 +4,46 @@ __package__ = 'archivebox.cli'
__command__ = 'archivebox worker'
import sys
import json
import rich_click as click
from archivebox.misc.util import docstring
def worker(worker_type: str, daemon: bool = False, plugin: str | None = None):
    """Run a queue-processing worker of the given type.

    Supported worker types:
      - crawl:         handles Crawl objects (parse seeds, create snapshots)
      - snapshot:      handles Snapshot objects (create archive results)
      - archiveresult: handles ArchiveResult objects (run plugins)

    Each worker polls the database for queued items, claims them atomically,
    and spawns subprocess tasks to process each one.
    """
    from workers.worker import get_worker_class

    worker_cls = get_worker_class(worker_type)

    # Assemble constructor arguments for the worker class.
    init_kwargs = {'daemon': daemon}
    if plugin and worker_type == 'archiveresult':
        # the model field predates the "plugin" terminology, hence 'extractor'
        init_kwargs['extractor'] = plugin

    worker_cls(**init_kwargs).runloop()
@click.command()
@click.argument('worker_type')
@click.option('--wait-for-first-event', is_flag=True)
@click.option('--exit-on-idle', is_flag=True)
def main(worker_type: str, wait_for_first_event: bool, exit_on_idle: bool):
"""Start an ArchiveBox worker process of the given type"""
from workers.worker import get_worker_type
# allow piping in events to process from stdin
# if not sys.stdin.isatty():
# for line in sys.stdin.readlines():
# Event.dispatch(event=json.loads(line), parent=None)
# run the actor
Worker = get_worker_type(worker_type)
for event in Worker.run(wait_for_first_event=wait_for_first_event, exit_on_idle=exit_on_idle):
print(event)
@click.argument('worker_type', type=click.Choice(['crawl', 'snapshot', 'archiveresult']))
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--plugin', '-p', default=None, help='Filter by plugin (archiveresult only)')
@docstring(worker.__doc__)
def main(worker_type: str, daemon: bool, plugin: str | None):
"""Start an ArchiveBox worker process"""
worker(worker_type, daemon=daemon, plugin=plugin)
if __name__ == '__main__':

View File

@@ -31,7 +31,6 @@ DATA_DIR = 'data.tests'
os.environ.update(TEST_CONFIG)
from ..main import init
from ..index import load_main_index
from archivebox.config.constants import (
SQL_INDEX_FILENAME,
JSON_INDEX_FILENAME,

View File

@@ -0,0 +1,966 @@
#!/usr/bin/env python3
"""
Tests for CLI piping workflow: crawl | snapshot | extract
This module tests the JSONL-based piping between CLI commands as described in:
https://github.com/ArchiveBox/ArchiveBox/issues/1363
Workflows tested:
archivebox snapshot URL | archivebox extract
archivebox crawl URL | archivebox snapshot | archivebox extract
archivebox crawl --plugin=PARSER URL | archivebox snapshot | archivebox extract
Each command should:
- Accept URLs, snapshot_ids, or JSONL as input (args or stdin)
- Output JSONL to stdout when piped (not TTY)
- Output human-readable to stderr when TTY
"""
__package__ = 'archivebox.cli'
import os
import sys
import json
import shutil
import tempfile
import unittest
from io import StringIO
from pathlib import Path
from unittest.mock import patch, MagicMock
# Test configuration - disable slow extractors
TEST_CONFIG = {
'USE_COLOR': 'False',
'SHOW_PROGRESS': 'False',
'SAVE_ARCHIVE_DOT_ORG': 'False',
'SAVE_TITLE': 'True', # Fast extractor
'SAVE_FAVICON': 'False',
'SAVE_WGET': 'False',
'SAVE_WARC': 'False',
'SAVE_PDF': 'False',
'SAVE_SCREENSHOT': 'False',
'SAVE_DOM': 'False',
'SAVE_SINGLEFILE': 'False',
'SAVE_READABILITY': 'False',
'SAVE_MERCURY': 'False',
'SAVE_GIT': 'False',
'SAVE_MEDIA': 'False',
'SAVE_HEADERS': 'False',
'USE_CURL': 'False',
'USE_WGET': 'False',
'USE_GIT': 'False',
'USE_CHROME': 'False',
'USE_YOUTUBEDL': 'False',
'USE_NODE': 'False',
}
os.environ.update(TEST_CONFIG)
# =============================================================================
# JSONL Utility Tests
# =============================================================================
class TestJSONLParsing(unittest.TestCase):
    """Unit tests for the JSONL input-line parsing helpers."""

    def test_parse_plain_url(self):
        """A bare URL line is turned into a Snapshot record."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        record = parse_line('https://example.com')
        self.assertIsNotNone(record)
        self.assertEqual(record['type'], TYPE_SNAPSHOT)
        self.assertEqual(record['url'], 'https://example.com')

    def test_parse_jsonl_snapshot(self):
        """A JSONL Snapshot line keeps every field it carries."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        record = parse_line('{"type": "Snapshot", "url": "https://example.com", "tags": "test,demo"}')
        self.assertIsNotNone(record)
        self.assertEqual(record['type'], TYPE_SNAPSHOT)
        self.assertEqual(record['url'], 'https://example.com')
        self.assertEqual(record['tags'], 'test,demo')

    def test_parse_jsonl_with_id(self):
        """An 'id' field in a JSONL line is carried through."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        record = parse_line('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}')
        self.assertIsNotNone(record)
        self.assertEqual(record['id'], 'abc123')
        self.assertEqual(record['url'], 'https://example.com')

    def test_parse_uuid_as_snapshot_id(self):
        """A bare UUID line is treated as an existing snapshot id."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        snapshot_uuid = '01234567-89ab-cdef-0123-456789abcdef'
        record = parse_line(snapshot_uuid)
        self.assertIsNotNone(record)
        self.assertEqual(record['type'], TYPE_SNAPSHOT)
        self.assertEqual(record['id'], snapshot_uuid)

    def test_parse_empty_line(self):
        """Blank or whitespace-only lines parse to None."""
        from archivebox.misc.jsonl import parse_line
        for blank in ('', ' ', '\n'):
            self.assertIsNone(parse_line(blank))

    def test_parse_comment_line(self):
        """Lines starting with '#' (optionally indented) parse to None."""
        from archivebox.misc.jsonl import parse_line
        for comment in ('# This is a comment', ' # Indented comment'):
            self.assertIsNone(parse_line(comment))

    def test_parse_invalid_url(self):
        """Non-URLs and unsupported schemes parse to None."""
        from archivebox.misc.jsonl import parse_line
        self.assertIsNone(parse_line('not-a-url'))
        # only http/https/file schemes are accepted
        self.assertIsNone(parse_line('ftp://example.com'))

    def test_parse_file_url(self):
        """file:// URLs are accepted as Snapshot records."""
        from archivebox.misc.jsonl import parse_line, TYPE_SNAPSHOT
        record = parse_line('file:///path/to/file.txt')
        self.assertIsNotNone(record)
        self.assertEqual(record['type'], TYPE_SNAPSHOT)
        self.assertEqual(record['url'], 'file:///path/to/file.txt')
class TestJSONLOutput(unittest.TestCase):
    """Test JSONL output formatting.

    Uses MagicMock stand-ins for the Django models so the serializers can be
    exercised without a database.  The attribute assignments below presumably
    mirror the fields each *_to_jsonl() helper reads — confirm against
    archivebox/misc/jsonl.py if the serializers change.
    """
    def test_snapshot_to_jsonl(self):
        """Snapshot model should serialize to JSONL correctly."""
        from archivebox.misc.jsonl import snapshot_to_jsonl, TYPE_SNAPSHOT
        # Create a mock snapshot
        mock_snapshot = MagicMock()
        mock_snapshot.id = 'test-uuid-1234'
        mock_snapshot.url = 'https://example.com'
        mock_snapshot.title = 'Example Title'
        # tags_str is a *method* on the model, hence .return_value
        mock_snapshot.tags_str.return_value = 'tag1,tag2'
        mock_snapshot.bookmarked_at = None
        mock_snapshot.created_at = None
        mock_snapshot.timestamp = '1234567890'
        mock_snapshot.depth = 0
        mock_snapshot.status = 'queued'
        result = snapshot_to_jsonl(mock_snapshot)
        # The serialized record must carry the type tag plus identifying fields.
        self.assertEqual(result['type'], TYPE_SNAPSHOT)
        self.assertEqual(result['id'], 'test-uuid-1234')
        self.assertEqual(result['url'], 'https://example.com')
        self.assertEqual(result['title'], 'Example Title')

    def test_archiveresult_to_jsonl(self):
        """ArchiveResult model should serialize to JSONL correctly."""
        from archivebox.misc.jsonl import archiveresult_to_jsonl, TYPE_ARCHIVERESULT
        mock_result = MagicMock()
        mock_result.id = 'result-uuid-5678'
        # snapshot_id links the result back to its parent Snapshot
        mock_result.snapshot_id = 'snapshot-uuid-1234'
        mock_result.extractor = 'title'
        mock_result.status = 'succeeded'
        mock_result.output = 'Example Title'
        mock_result.start_ts = None
        mock_result.end_ts = None
        result = archiveresult_to_jsonl(mock_result)
        self.assertEqual(result['type'], TYPE_ARCHIVERESULT)
        self.assertEqual(result['id'], 'result-uuid-5678')
        self.assertEqual(result['snapshot_id'], 'snapshot-uuid-1234')
        self.assertEqual(result['extractor'], 'title')
        self.assertEqual(result['status'], 'succeeded')
class TestReadArgsOrStdin(unittest.TestCase):
    """Tests for reading input records from CLI args or a (piped) stdin."""

    @staticmethod
    def _piped(text):
        """Return a StringIO that reports itself as non-TTY (i.e. piped input)."""
        stream = StringIO(text)
        stream.isatty = lambda: False
        return stream

    def test_read_from_args(self):
        """URLs passed as positional args become one record each."""
        from archivebox.misc.jsonl import read_args_or_stdin
        records = list(read_args_or_stdin(('https://example1.com', 'https://example2.com')))
        self.assertEqual(len(records), 2)
        self.assertEqual(records[0]['url'], 'https://example1.com')
        self.assertEqual(records[1]['url'], 'https://example2.com')

    def test_read_from_stdin(self):
        """With no args, URLs are read line-by-line from the piped stream."""
        from archivebox.misc.jsonl import read_args_or_stdin
        stream = self._piped('https://example1.com\nhttps://example2.com\n')
        records = list(read_args_or_stdin((), stream=stream))
        self.assertEqual(len(records), 2)
        self.assertEqual(records[0]['url'], 'https://example1.com')
        self.assertEqual(records[1]['url'], 'https://example2.com')

    def test_read_jsonl_from_stdin(self):
        """JSONL lines on stdin are parsed into full records."""
        from archivebox.misc.jsonl import read_args_or_stdin
        stream = self._piped('{"type": "Snapshot", "url": "https://example.com", "tags": "test"}\n')
        records = list(read_args_or_stdin((), stream=stream))
        self.assertEqual(len(records), 1)
        self.assertEqual(records[0]['url'], 'https://example.com')
        self.assertEqual(records[0]['tags'], 'test')

    def test_skip_tty_stdin(self):
        """An interactive (TTY) stdin is ignored so the command never blocks."""
        from archivebox.misc.jsonl import read_args_or_stdin
        stream = StringIO('https://example.com')
        stream.isatty = lambda: True  # simulate an interactive terminal
        records = list(read_args_or_stdin((), stream=stream))
        self.assertEqual(len(records), 0)
# =============================================================================
# Unit Tests for Individual Commands
# =============================================================================
class TestCrawlCommand(unittest.TestCase):
"""Unit tests for archivebox crawl command."""
def setUp(self):
"""Set up test environment."""
self.test_dir = tempfile.mkdtemp()
os.environ['DATA_DIR'] = self.test_dir
def tearDown(self):
"""Clean up test environment."""
shutil.rmtree(self.test_dir, ignore_errors=True)
def test_crawl_accepts_url(self):
"""crawl should accept URLs as input."""
from archivebox.misc.jsonl import read_args_or_stdin
args = ('https://example.com',)
records = list(read_args_or_stdin(args))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['url'], 'https://example.com')
def test_crawl_accepts_snapshot_id(self):
"""crawl should accept snapshot IDs as input."""
from archivebox.misc.jsonl import read_args_or_stdin
uuid = '01234567-89ab-cdef-0123-456789abcdef'
args = (uuid,)
records = list(read_args_or_stdin(args))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['id'], uuid)
def test_crawl_accepts_jsonl(self):
"""crawl should accept JSONL with snapshot info."""
from archivebox.misc.jsonl import read_args_or_stdin
stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['id'], 'abc123')
self.assertEqual(records[0]['url'], 'https://example.com')
def test_crawl_separates_existing_vs_new(self):
"""crawl should identify existing snapshots vs new URLs."""
# This tests the logic in discover_outlinks() that separates
# records with 'id' (existing) from records with just 'url' (new)
records = [
{'type': 'Snapshot', 'id': 'existing-id-1'}, # Existing (id only)
{'type': 'Snapshot', 'url': 'https://new-url.com'}, # New (url only)
{'type': 'Snapshot', 'id': 'existing-id-2', 'url': 'https://existing.com'}, # Existing (has id)
]
existing = []
new = []
for record in records:
if record.get('id') and not record.get('url'):
existing.append(record['id'])
elif record.get('id'):
existing.append(record['id']) # Has both id and url - treat as existing
elif record.get('url'):
new.append(record)
self.assertEqual(len(existing), 2)
self.assertEqual(len(new), 1)
self.assertEqual(new[0]['url'], 'https://new-url.com')
class TestSnapshotCommand(unittest.TestCase):
"""Unit tests for archivebox snapshot command."""
def setUp(self):
"""Set up test environment."""
self.test_dir = tempfile.mkdtemp()
os.environ['DATA_DIR'] = self.test_dir
def tearDown(self):
"""Clean up test environment."""
shutil.rmtree(self.test_dir, ignore_errors=True)
def test_snapshot_accepts_url(self):
"""snapshot should accept URLs as input."""
from archivebox.misc.jsonl import read_args_or_stdin
args = ('https://example.com',)
records = list(read_args_or_stdin(args))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['url'], 'https://example.com')
def test_snapshot_accepts_jsonl_with_metadata(self):
"""snapshot should accept JSONL with tags and other metadata."""
from archivebox.misc.jsonl import read_args_or_stdin
stdin = StringIO('{"type": "Snapshot", "url": "https://example.com", "tags": "tag1,tag2", "title": "Test"}\n')
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['url'], 'https://example.com')
self.assertEqual(records[0]['tags'], 'tag1,tag2')
self.assertEqual(records[0]['title'], 'Test')
def test_snapshot_output_format(self):
"""snapshot output should include id and url."""
from archivebox.misc.jsonl import snapshot_to_jsonl
mock_snapshot = MagicMock()
mock_snapshot.id = 'test-id'
mock_snapshot.url = 'https://example.com'
mock_snapshot.title = 'Test'
mock_snapshot.tags_str.return_value = ''
mock_snapshot.bookmarked_at = None
mock_snapshot.created_at = None
mock_snapshot.timestamp = '123'
mock_snapshot.depth = 0
mock_snapshot.status = 'queued'
output = snapshot_to_jsonl(mock_snapshot)
self.assertIn('id', output)
self.assertIn('url', output)
self.assertEqual(output['type'], 'Snapshot')
class TestExtractCommand(unittest.TestCase):
"""Unit tests for archivebox extract command."""
def setUp(self):
"""Set up test environment."""
self.test_dir = tempfile.mkdtemp()
os.environ['DATA_DIR'] = self.test_dir
def tearDown(self):
"""Clean up test environment."""
shutil.rmtree(self.test_dir, ignore_errors=True)
def test_extract_accepts_snapshot_id(self):
"""extract should accept snapshot IDs as input."""
from archivebox.misc.jsonl import read_args_or_stdin
uuid = '01234567-89ab-cdef-0123-456789abcdef'
args = (uuid,)
records = list(read_args_or_stdin(args))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['id'], uuid)
def test_extract_accepts_jsonl_snapshot(self):
"""extract should accept JSONL Snapshot records."""
from archivebox.misc.jsonl import read_args_or_stdin, TYPE_SNAPSHOT
stdin = StringIO('{"type": "Snapshot", "id": "abc123", "url": "https://example.com"}\n')
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
self.assertEqual(records[0]['id'], 'abc123')
def test_extract_gathers_snapshot_ids(self):
"""extract should gather snapshot IDs from various input formats."""
from archivebox.misc.jsonl import TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
records = [
{'type': TYPE_SNAPSHOT, 'id': 'snap-1'},
{'type': TYPE_SNAPSHOT, 'id': 'snap-2', 'url': 'https://example.com'},
{'type': TYPE_ARCHIVERESULT, 'snapshot_id': 'snap-3'},
{'id': 'snap-4'}, # Bare id
]
snapshot_ids = set()
for record in records:
record_type = record.get('type')
if record_type == TYPE_SNAPSHOT:
snapshot_id = record.get('id')
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif record_type == TYPE_ARCHIVERESULT:
snapshot_id = record.get('snapshot_id')
if snapshot_id:
snapshot_ids.add(snapshot_id)
elif 'id' in record:
snapshot_ids.add(record['id'])
self.assertEqual(len(snapshot_ids), 4)
self.assertIn('snap-1', snapshot_ids)
self.assertIn('snap-2', snapshot_ids)
self.assertIn('snap-3', snapshot_ids)
self.assertIn('snap-4', snapshot_ids)
# =============================================================================
# URL Collection Tests
# =============================================================================
class TestURLCollection(unittest.TestCase):
    """Tests for gathering urls.jsonl entries produced by extractors."""

    def setUp(self):
        """Build a fake snapshot output dir: two parser outputs plus one non-parser dir."""
        self.test_dir = Path(tempfile.mkdtemp())

        wget_dir = self.test_dir / 'wget'
        wget_dir.mkdir()
        (wget_dir / 'urls.jsonl').write_text(
            '{"url": "https://wget-link-1.com"}\n'
            '{"url": "https://wget-link-2.com"}\n'
        )

        html_dir = self.test_dir / 'parse_html_urls'
        html_dir.mkdir()
        (html_dir / 'urls.jsonl').write_text(
            '{"url": "https://html-link-1.com"}\n'
            '{"url": "https://html-link-2.com", "title": "HTML Link 2"}\n'
        )

        # screenshot is not a parser: present, but emits no urls.jsonl
        (self.test_dir / 'screenshot').mkdir()

    def tearDown(self):
        """Remove the fixture tree."""
        shutil.rmtree(self.test_dir, ignore_errors=True)

    def test_collect_urls_from_extractors(self):
        """Every urls.jsonl under an extractor subdir should be picked up."""
        from archivebox.hooks import collect_urls_from_extractors
        entries = collect_urls_from_extractors(self.test_dir)
        self.assertEqual(len(entries), 4)
        # each entry is tagged with the extractor dir it came from
        sources = {entry['via_extractor'] for entry in entries}
        self.assertIn('wget', sources)
        self.assertIn('parse_html_urls', sources)
        self.assertNotIn('screenshot', sources)  # had no urls.jsonl

    def test_collect_urls_preserves_metadata(self):
        """Extra fields in urls.jsonl entries should survive collection."""
        from archivebox.hooks import collect_urls_from_extractors
        entries = collect_urls_from_extractors(self.test_dir)
        with_title = [entry for entry in entries if entry.get('title') == 'HTML Link 2']
        self.assertEqual(len(with_title), 1)
        self.assertEqual(with_title[0]['url'], 'https://html-link-2.com')

    def test_collect_urls_empty_dir(self):
        """A missing directory should yield no entries."""
        from archivebox.hooks import collect_urls_from_extractors
        missing = self.test_dir / 'nonexistent'
        self.assertEqual(len(collect_urls_from_extractors(missing)), 0)
# =============================================================================
# Integration Tests
# =============================================================================
class TestPipingWorkflowIntegration(unittest.TestCase):
"""
Integration tests for the complete piping workflow.
These tests require Django to be set up and use the actual database.
"""
@classmethod
def setUpClass(cls):
"""Set up Django and test database."""
cls.test_dir = tempfile.mkdtemp()
os.environ['DATA_DIR'] = cls.test_dir
# Initialize Django
from archivebox.config.django import setup_django
setup_django()
# Initialize the archive
from archivebox.cli.archivebox_init import init
init()
@classmethod
def tearDownClass(cls):
"""Clean up test database."""
shutil.rmtree(cls.test_dir, ignore_errors=True)
def test_snapshot_creates_and_outputs_jsonl(self):
"""
Test: archivebox snapshot URL
Should create a Snapshot and output JSONL when piped.
"""
from core.models import Snapshot
from archivebox.misc.jsonl import (
read_args_or_stdin, write_record, snapshot_to_jsonl,
TYPE_SNAPSHOT, get_or_create_snapshot
)
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
# Simulate input
url = 'https://test-snapshot-1.example.com'
records = list(read_args_or_stdin((url,)))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['url'], url)
# Create snapshot
snapshot = get_or_create_snapshot(records[0], created_by_id=created_by_id)
self.assertIsNotNone(snapshot.id)
self.assertEqual(snapshot.url, url)
# Verify output format
output = snapshot_to_jsonl(snapshot)
self.assertEqual(output['type'], TYPE_SNAPSHOT)
self.assertIn('id', output)
self.assertEqual(output['url'], url)
def test_extract_accepts_snapshot_from_previous_command(self):
"""
Test: archivebox snapshot URL | archivebox extract
Extract should accept JSONL output from snapshot command.
"""
from core.models import Snapshot, ArchiveResult
from archivebox.misc.jsonl import (
snapshot_to_jsonl, read_args_or_stdin, get_or_create_snapshot,
TYPE_SNAPSHOT
)
from archivebox.base_models.models import get_or_create_system_user_pk
created_by_id = get_or_create_system_user_pk()
# Step 1: Create snapshot (simulating 'archivebox snapshot')
url = 'https://test-extract-1.example.com'
snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
snapshot_output = snapshot_to_jsonl(snapshot)
# Step 2: Parse snapshot output as extract input
stdin = StringIO(json.dumps(snapshot_output) + '\n')
stdin.isatty = lambda: False
records = list(read_args_or_stdin((), stream=stdin))
self.assertEqual(len(records), 1)
self.assertEqual(records[0]['type'], TYPE_SNAPSHOT)
self.assertEqual(records[0]['id'], str(snapshot.id))
# Step 3: Gather snapshot IDs (as extract does)
snapshot_ids = set()
for record in records:
if record.get('type') == TYPE_SNAPSHOT and record.get('id'):
snapshot_ids.add(record['id'])
self.assertIn(str(snapshot.id), snapshot_ids)
def test_crawl_outputs_discovered_urls(self):
    """
    Test: archivebox crawl URL
    Should create snapshot, run plugins, output discovered URLs.

    Fakes one extractor's urls.jsonl output under a snapshot dir, then
    collects and annotates the discovered URLs the way 'archivebox crawl'
    does before printing them.
    """
    from archivebox.hooks import collect_urls_from_extractors
    from archivebox.misc.jsonl import TYPE_SNAPSHOT

    # Build a fake snapshot directory holding a single extractor's output
    fake_snapshot_dir = Path(self.test_dir) / 'archive' / 'test-crawl-snapshot'
    fake_snapshot_dir.mkdir(parents=True, exist_ok=True)
    extractor_dir = fake_snapshot_dir / 'parse_html_urls'
    extractor_dir.mkdir()
    (extractor_dir / 'urls.jsonl').write_text(
        '{"url": "https://discovered-1.com"}\n'
        '{"url": "https://discovered-2.com", "title": "Discovered 2"}\n'
    )

    # Harvest the discovered URLs exactly as the crawl command would
    found = collect_urls_from_extractors(fake_snapshot_dir)
    self.assertEqual(len(found), 2)

    # Attach the crawl metadata that the crawl command adds per record
    for record in found:
        record['type'] = TYPE_SNAPSHOT
        record['depth'] = 1
        record['via_snapshot'] = 'test-crawl-snapshot'

    # First record should be a depth-1 Snapshot pointing at the first outlink
    self.assertEqual(found[0]['type'], TYPE_SNAPSHOT)
    self.assertEqual(found[0]['depth'], 1)
    self.assertEqual(found[0]['url'], 'https://discovered-1.com')
def test_full_pipeline_snapshot_extract(self):
    """
    Test: archivebox snapshot URL | archivebox extract
    This is equivalent to: archivebox add URL

    Pipes the JSONL emitted by 'snapshot' into 'extract' and verifies the
    snapshot ID is delivered intact and the Snapshot row exists in the DB.
    """
    from core.models import Snapshot
    # NOTE: the original also imported TYPE_SNAPSHOT here but never used it — removed.
    from archivebox.misc.jsonl import (
        get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    created_by_id = get_or_create_system_user_pk()

    # === archivebox snapshot https://example.com ===
    url = 'https://test-pipeline-1.example.com'
    snapshot = get_or_create_snapshot({'url': url}, created_by_id=created_by_id)
    snapshot_jsonl = json.dumps(snapshot_to_jsonl(snapshot))

    # === | archivebox extract ===
    stdin = StringIO(snapshot_jsonl + '\n')
    stdin.isatty = lambda: False  # simulate piped (non-TTY) stdin
    records = list(read_args_or_stdin((), stream=stdin))

    # Extract should receive the snapshot ID
    self.assertEqual(len(records), 1)
    self.assertEqual(records[0]['id'], str(snapshot.id))

    # Verify snapshot exists in DB
    db_snapshot = Snapshot.objects.get(id=snapshot.id)
    self.assertEqual(db_snapshot.url, url)
def test_full_pipeline_crawl_snapshot_extract(self):
    """
    Test: archivebox crawl URL | archivebox snapshot | archivebox extract
    This is equivalent to: archivebox add --depth=1 URL

    Simulates the three-stage pipeline end to end: crawl discovers two
    outlinks, snapshot persists them, extract receives their IDs.
    """
    from core.models import Snapshot
    from archivebox.misc.jsonl import (
        get_or_create_snapshot, snapshot_to_jsonl, read_args_or_stdin,
        TYPE_SNAPSHOT
    )
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.hooks import collect_urls_from_extractors

    system_user_pk = get_or_create_system_user_pk()

    # === archivebox crawl https://example.com ===
    # Create the snapshot for the root URL being crawled
    root_url = 'https://test-crawl-pipeline.example.com'
    root_snapshot = get_or_create_snapshot({'url': root_url}, created_by_id=system_user_pk)

    # Fake extractor output listing two outlinks discovered on the root page
    root_dir = Path(self.test_dir) / 'archive' / str(root_snapshot.timestamp)
    root_dir.mkdir(parents=True, exist_ok=True)
    (root_dir / 'parse_html_urls').mkdir(exist_ok=True)
    (root_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
        '{"url": "https://outlink-1.example.com"}\n'
        '{"url": "https://outlink-2.example.com"}\n'
    )

    # Crawl's stdout: one depth-1 Snapshot JSONL record per discovered URL
    outlinks = collect_urls_from_extractors(root_dir)
    crawl_lines = [
        json.dumps({**outlink, 'type': TYPE_SNAPSHOT, 'depth': 1})
        for outlink in outlinks
    ]

    # === | archivebox snapshot ===
    pipe = StringIO('\n'.join(crawl_lines) + '\n')
    pipe.isatty = lambda: False
    snapshot_records = list(read_args_or_stdin((), stream=pipe))
    self.assertEqual(len(snapshot_records), 2)

    # Persist a Snapshot row for each discovered URL
    persisted = [
        get_or_create_snapshot(record, created_by_id=system_user_pk)
        for record in snapshot_records
    ]
    self.assertEqual(len(persisted), 2)

    # === | archivebox extract ===
    extract_input = '\n'.join(json.dumps(snapshot_to_jsonl(s)) for s in persisted)
    pipe = StringIO(extract_input + '\n')
    pipe.isatty = lambda: False
    extract_records = list(read_args_or_stdin((), stream=pipe))
    self.assertEqual(len(extract_records), 2)

    # Every piped record must correspond to a persisted outlink snapshot
    for record in extract_records:
        db_snapshot = Snapshot.objects.get(id=record['id'])
        self.assertIn(db_snapshot.url, [
            'https://outlink-1.example.com',
            'https://outlink-2.example.com'
        ])
class TestDepthWorkflows(unittest.TestCase):
    """Test various depth crawl workflows."""

    @classmethod
    def setUpClass(cls):
        """Set up Django and test database."""
        cls.test_dir = tempfile.mkdtemp()
        os.environ['DATA_DIR'] = cls.test_dir
        from archivebox.config.django import setup_django
        setup_django()
        from archivebox.cli.archivebox_init import init
        init()

    @classmethod
    def tearDownClass(cls):
        """Clean up test database."""
        shutil.rmtree(cls.test_dir, ignore_errors=True)

    def test_depth_0_workflow(self):
        """
        Test: archivebox snapshot URL | archivebox extract
        Depth 0: Only archive the specified URL, no crawling.
        """
        from core.models import Snapshot
        from archivebox.misc.jsonl import get_or_create_snapshot
        from archivebox.base_models.models import get_or_create_system_user_pk

        system_user_pk = get_or_create_system_user_pk()

        # Archive a single URL with no crawling
        target_url = 'https://depth0-test.example.com'
        created = get_or_create_snapshot({'url': target_url}, created_by_id=system_user_pk)

        # Exactly one snapshot should exist for that URL, and nothing else
        self.assertEqual(Snapshot.objects.filter(url=target_url).count(), 1)
        self.assertEqual(created.url, target_url)

    def test_depth_1_workflow(self):
        """
        Test: archivebox crawl URL | archivebox snapshot | archivebox extract
        Depth 1: Archive URL + all outlinks from that URL.
        """
        # This is tested in test_full_pipeline_crawl_snapshot_extract
        pass

    def test_depth_metadata_propagation(self):
        """Test that depth metadata propagates through the pipeline."""
        from archivebox.misc.jsonl import TYPE_SNAPSHOT

        # Simulated crawl output, two hops deep, carrying depth metadata
        simulated_output = [
            {'type': TYPE_SNAPSHOT, 'url': 'https://hop1.com', 'depth': 1, 'via_snapshot': 'root'},
            {'type': TYPE_SNAPSHOT, 'url': 'https://hop2.com', 'depth': 2, 'via_snapshot': 'hop1'},
        ]

        # Every record must retain both depth and provenance fields
        for record in simulated_output:
            self.assertIn('depth', record)
            self.assertIn('via_snapshot', record)
class TestParserPluginWorkflows(unittest.TestCase):
    """Test workflows with specific parser plugins."""

    @classmethod
    def setUpClass(cls):
        """Set up Django and test database."""
        cls.test_dir = tempfile.mkdtemp()
        os.environ['DATA_DIR'] = cls.test_dir
        from archivebox.config.django import setup_django
        setup_django()
        from archivebox.cli.archivebox_init import init
        init()

    @classmethod
    def tearDownClass(cls):
        """Clean up test database."""
        shutil.rmtree(cls.test_dir, ignore_errors=True)

    def test_html_parser_workflow(self):
        """
        Test: archivebox crawl --plugin=parse_html_urls URL | archivebox snapshot | archivebox extract
        """
        from archivebox.hooks import collect_urls_from_extractors
        from archivebox.misc.jsonl import TYPE_SNAPSHOT

        # Fake the HTML parser's output under a snapshot directory
        fake_dir = Path(self.test_dir) / 'archive' / 'html-parser-test'
        fake_dir.mkdir(parents=True, exist_ok=True)
        (fake_dir / 'parse_html_urls').mkdir(exist_ok=True)
        (fake_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
            '{"url": "https://html-discovered.com", "title": "HTML Link"}\n'
        )

        # One URL should be collected, attributed to the HTML parser
        found = collect_urls_from_extractors(fake_dir)
        self.assertEqual(len(found), 1)
        self.assertEqual(found[0]['url'], 'https://html-discovered.com')
        self.assertEqual(found[0]['via_extractor'], 'parse_html_urls')

    def test_rss_parser_workflow(self):
        """
        Test: archivebox crawl --plugin=parse_rss_urls URL | archivebox snapshot | archivebox extract
        """
        from archivebox.hooks import collect_urls_from_extractors

        # Fake the RSS parser's output with two feed items
        fake_dir = Path(self.test_dir) / 'archive' / 'rss-parser-test'
        fake_dir.mkdir(parents=True, exist_ok=True)
        (fake_dir / 'parse_rss_urls').mkdir(exist_ok=True)
        (fake_dir / 'parse_rss_urls' / 'urls.jsonl').write_text(
            '{"url": "https://rss-item-1.com", "title": "RSS Item 1"}\n'
            '{"url": "https://rss-item-2.com", "title": "RSS Item 2"}\n'
        )

        # Both feed items collected, each attributed to the RSS parser
        found = collect_urls_from_extractors(fake_dir)
        self.assertEqual(len(found), 2)
        self.assertTrue(all(record['via_extractor'] == 'parse_rss_urls' for record in found))

    def test_multiple_parsers_dedupe(self):
        """
        Multiple parsers may discover the same URL - should be deduplicated.
        """
        from archivebox.hooks import collect_urls_from_extractors

        # Two different extractors report the same URL
        fake_dir = Path(self.test_dir) / 'archive' / 'dedupe-test'
        fake_dir.mkdir(parents=True, exist_ok=True)
        (fake_dir / 'parse_html_urls').mkdir(exist_ok=True)
        (fake_dir / 'parse_html_urls' / 'urls.jsonl').write_text(
            '{"url": "https://same-url.com"}\n'
        )
        (fake_dir / 'wget').mkdir(exist_ok=True)
        (fake_dir / 'wget' / 'urls.jsonl').write_text(
            '{"url": "https://same-url.com"}\n'  # Same URL, different extractor
        )

        # Both entries are returned (deduplication happens at the crawl command level)
        everything = collect_urls_from_extractors(fake_dir)
        self.assertEqual(len(everything), 2)

        # Verify both extractors found the same URL
        distinct_urls = {record['url'] for record in everything}
        self.assertEqual(distinct_urls, {'https://same-url.com'})
class TestEdgeCases(unittest.TestCase):
    """Test edge cases and error handling."""

    def test_empty_input(self):
        """Commands should handle empty input gracefully."""
        from archivebox.misc.jsonl import read_args_or_stdin

        # No args and an (empty) TTY stdin — reader must yield nothing, not block
        fake_tty = StringIO('')
        fake_tty.isatty = lambda: True
        parsed = list(read_args_or_stdin((), stream=fake_tty))
        self.assertEqual(len(parsed), 0)

    def test_malformed_jsonl(self):
        """Should skip malformed JSONL lines."""
        from archivebox.misc.jsonl import read_args_or_stdin

        # One garbage line sandwiched between two valid records
        piped = StringIO(
            '{"url": "https://good.com"}\n'
            'not valid json\n'
            '{"url": "https://also-good.com"}\n'
        )
        piped.isatty = lambda: False
        parsed = list(read_args_or_stdin((), stream=piped))

        # Only the two valid records survive; the bad line is dropped
        self.assertEqual(len(parsed), 2)
        parsed_urls = {record['url'] for record in parsed}
        self.assertEqual(parsed_urls, {'https://good.com', 'https://also-good.com'})

    def test_mixed_input_formats(self):
        """Should handle mixed URLs and JSONL."""
        from archivebox.misc.jsonl import read_args_or_stdin

        # A plain URL, a full JSONL record, and a bare UUID, all on one stream
        piped = StringIO(
            'https://plain-url.com\n'
            '{"type": "Snapshot", "url": "https://jsonl-url.com", "tags": "test"}\n'
            '01234567-89ab-cdef-0123-456789abcdef\n'  # UUID
        )
        piped.isatty = lambda: False
        parsed = list(read_args_or_stdin((), stream=piped))
        self.assertEqual(len(parsed), 3)

        # Plain URL
        self.assertEqual(parsed[0]['url'], 'https://plain-url.com')
        # JSONL with metadata
        self.assertEqual(parsed[1]['url'], 'https://jsonl-url.com')
        self.assertEqual(parsed[1]['tags'], 'test')
        # UUID
        self.assertEqual(parsed[2]['id'], '01234567-89ab-cdef-0123-456789abcdef')
# Allow running this test module directly (outside a test runner invocation).
if __name__ == '__main__':
    unittest.main()