WIP: checkpoint working tree before rebasing onto dev

2026-04-06 07:47:53 +10:00 · 2026-03-22 20:23:45 -07:00
parent a6548df8d0
commit f400a2cd67
87 changed files with 12607 additions and 1808 deletions
--- a/archivebox/cli/archivebox_add.py
+++ b/archivebox/cli/archivebox_add.py
@@ -47,11 +47,13 @@ def _collect_input_urls(args: tuple[str, ...]) -> list[str]:
 def add(urls: str | list[str],
        depth: int | str=0,
        tag: str='',
+        url_allowlist: str='',
+        url_denylist: str='',
        parser: str="auto",
        plugins: str="",
        persona: str='Default',
        overwrite: bool=False,
-        update: bool=not ARCHIVING_CONFIG.ONLY_NEW,
+        update: bool | None=None,
        index_only: bool=False,
        bg: bool=False,
        created_by_id: int | None=None) -> tuple['Crawl', QuerySet['Snapshot']]:
@@ -85,6 +87,8 @@ def add(urls: str | list[str],

    created_by_id = created_by_id or get_or_create_system_user_pk()
    started_at = timezone.now()
+    if update is None:
+        update = not ARCHIVING_CONFIG.ONLY_NEW

    # 1. Save the provided URLs to sources/2024-11-05__23-59-59__cli_add.txt
    sources_file = CONSTANTS.SOURCES_DIR / f'{timezone.now().strftime("%Y-%m-%d__%H-%M-%S")}__cli_add.txt'
@@ -120,6 +124,8 @@ def add(urls: str | list[str],
            'PLUGINS': plugins,
            'DEFAULT_PERSONA': persona_name,
            'PARSER': parser,
+            **({'URL_ALLOWLIST': url_allowlist} if url_allowlist else {}),
+            **({'URL_DENYLIST': url_denylist} if url_denylist else {}),
        }
    )

@@ -150,6 +156,9 @@ def add(urls: str | list[str],
            snapshot.ensure_crawl_symlink()
        return crawl, crawl.snapshot_set.all()

+    if bg:
+        crawl.create_snapshots_from_urls()
+
    # 5. Start the crawl runner to process the queue
    #    The runner will:
    #    - Process Crawl -> create Snapshots from all URLs
@@ -192,8 +201,7 @@ def add(urls: str | list[str],
            except Exception:
                rel_output_str = str(crawl.output_dir)

-            # Build admin URL from SERVER_CONFIG
-            bind_addr = SERVER_CONFIG.BIND_ADDR
+            bind_addr = SERVER_CONFIG.BIND_ADDR or '127.0.0.1:8000'
            if bind_addr.startswith('http://') or bind_addr.startswith('https://'):
                base_url = bind_addr
            else:
@@ -218,11 +226,13 @@ def add(urls: str | list[str],
@click.command()
@click.option('--depth', '-d', type=click.Choice([str(i) for i in range(5)]), default='0', help='Recursively archive linked pages up to N hops away')
@click.option('--tag', '-t', default='', help='Comma-separated list of tags to add to each snapshot e.g. tag1,tag2,tag3')
+@click.option('--url-allowlist', '--domain-allowlist', default='', help='Comma-separated URL/domain allowlist for this crawl')
+@click.option('--url-denylist', '--domain-denylist', default='', help='Comma-separated URL/domain denylist for this crawl')
@click.option('--parser', default='auto', help='Parser for reading input URLs (auto, txt, html, rss, json, jsonl, netscape, ...)')
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run e.g. title,favicon,screenshot,singlefile,...')
@click.option('--persona', default='Default', help='Authentication profile to use when archiving')
@click.option('--overwrite', '-F', is_flag=True, help='Overwrite existing data if URLs have been archived previously')
-@click.option('--update', is_flag=True, default=ARCHIVING_CONFIG.ONLY_NEW, help='Retry any previously skipped/failed URLs when re-adding them')
+@click.option('--update', is_flag=True, default=None, help='Retry any previously skipped/failed URLs when re-adding them')
@click.option('--index-only', is_flag=True, help='Just add the URLs to the index without archiving them now')
@click.option('--bg', is_flag=True, help='Run archiving in background (queue work and return immediately)')
@click.argument('urls', nargs=-1, type=click.Path())
--- a/archivebox/cli/archivebox_archiveresult.py
+++ b/archivebox/cli/archivebox_archiveresult.py
@@ -42,6 +42,16 @@ from rich import print as rprint
 from archivebox.cli.cli_utils import apply_filters


+def build_archiveresult_request(snapshot_id: str, plugin: str, hook_name: str = '', status: str = 'queued') -> dict:  # Build a plain-dict ArchiveResult request record; this function itself only returns the dict.
+    return {
+        'type': 'ArchiveResult',  # record type tag
+        'snapshot_id': str(snapshot_id),  # coerced with str() so callers may pass a non-str id object
+        'plugin': plugin,
+        'hook_name': hook_name,  # stays '' when the caller does not name a specific hook
+        'status': status,  # defaults to 'queued'
+    }
+
+
 # =============================================================================
 # CREATE
 # =============================================================================
@@ -52,21 +62,21 @@ def create_archiveresults(
    status: str = 'queued',
 ) -> int:
    """
-    Create ArchiveResults for Snapshots.
+    Create ArchiveResult request records for Snapshots.

-    Reads Snapshot records from stdin and creates ArchiveResult entries.
+    Reads Snapshot records from stdin and emits ArchiveResult request JSONL.
    Pass-through: Non-Snapshot/ArchiveResult records are output unchanged.
-    If --plugin is specified, only creates results for that plugin.
-    Otherwise, creates results for all pending plugins.
+    If --plugin is specified, only emits requests for that plugin.
+    Otherwise, emits requests for all enabled snapshot hooks.

    Exit codes:
        0: Success
        1: Failure
    """
-    from django.utils import timezone
-
+    from archivebox.config.configset import get_config
+    from archivebox.hooks import discover_hooks
    from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
-    from archivebox.core.models import Snapshot, ArchiveResult
+    from archivebox.core.models import Snapshot

    is_tty = sys.stdout.isatty()

@@ -135,33 +145,20 @@ def create_archiveresults(
    created_count = 0
    for snapshot in snapshots:
        if plugin:
-            # Create for specific plugin only
-            result, created = ArchiveResult.objects.get_or_create(
-                snapshot=snapshot,
-                plugin=plugin,
-                defaults={
-                    'status': status,
-                    'retry_at': timezone.now(),
-                }
-            )
-            if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
-                # Reset for retry
-                result.status = status
-                result.retry_at = timezone.now()
-                result.save()
-
            if not is_tty:
-                write_record(result.to_json())
+                write_record(build_archiveresult_request(snapshot.id, plugin, status=status))
            created_count += 1
        else:
-            # Create all pending plugins
-            snapshot.create_pending_archiveresults()
-            for result in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.QUEUED):
+            config = get_config(crawl=snapshot.crawl, snapshot=snapshot)
+            hooks = discover_hooks('Snapshot', config=config)
+            for hook_path in hooks:
+                hook_name = hook_path.name
+                plugin_name = hook_path.parent.name
                if not is_tty:
-                    write_record(result.to_json())
+                    write_record(build_archiveresult_request(snapshot.id, plugin_name, hook_name=hook_name, status=status))
                created_count += 1

-    rprint(f'[green]Created/queued {created_count} archive results[/green]', file=sys.stderr)
+    rprint(f'[green]Created {created_count} archive result request records[/green]', file=sys.stderr)
    return 0


@@ -205,6 +202,7 @@ def list_archiveresults(
                'succeeded': 'green',
                'failed': 'red',
                'skipped': 'dim',
+                'noresults': 'dim',
                'backoff': 'magenta',
            }.get(result.status, 'dim')
            rprint(f'[{status_color}]{result.status:10}[/{status_color}] {result.plugin:15} [dim]{result.id}[/dim] {result.snapshot.url[:40]}')
@@ -233,8 +231,6 @@ def update_archiveresults(
        0: Success
        1: No input or error
    """
-    from django.utils import timezone
-
    from archivebox.misc.jsonl import read_stdin, write_record
    from archivebox.core.models import ArchiveResult

@@ -257,7 +253,6 @@ def update_archiveresults(
            # Apply updates from CLI flags
            if status:
                result.status = status
-                result.retry_at = timezone.now()

            result.save()
            updated_count += 1
--- a/archivebox/cli/archivebox_extract.py
+++ b/archivebox/cli/archivebox_extract.py
@@ -38,15 +38,16 @@ import rich_click as click

 def process_archiveresult_by_id(archiveresult_id: str) -> int:
    """
-    Run extraction for a single ArchiveResult by ID (used by workers).
+    Re-run extraction for a single ArchiveResult by ID.

-    Triggers the ArchiveResult's state machine tick() to run the extractor
-    plugin, but only after claiming ownership via retry_at. This keeps direct
-    CLI execution aligned with the worker lifecycle and prevents duplicate hook
-    runs if another process already owns the same ArchiveResult.
+    ArchiveResults are projected status rows, not queued work items. Re-running
+    a single result means resetting that row and queueing its parent snapshot
+    through the shared crawl runner with the corresponding plugin selected.
    """
    from rich import print as rprint
+    from django.utils import timezone
    from archivebox.core.models import ArchiveResult
+    from archivebox.services.runner import run_crawl

    try:
        archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
@@ -57,16 +58,27 @@ def process_archiveresult_by_id(archiveresult_id: str) -> int:
    rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)

    try:
-        # Claim-before-tick is the required calling pattern for direct
-        # state-machine drivers. If another worker already owns this row,
-        # report that and exit without running duplicate extractor side effects.
-        if not archiveresult.tick_claimed(lock_seconds=120):
-            print(f'[yellow]Extraction already claimed by another process: {archiveresult.plugin}[/yellow]')
-            return 0
+        archiveresult.reset_for_retry()
+        snapshot = archiveresult.snapshot
+        snapshot.status = snapshot.StatusChoices.QUEUED
+        snapshot.retry_at = timezone.now()
+        snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
+
+        crawl = snapshot.crawl
+        if crawl.status != crawl.StatusChoices.STARTED:
+            crawl.status = crawl.StatusChoices.QUEUED
+        crawl.retry_at = timezone.now()
+        crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
+
+        run_crawl(str(crawl.id), snapshot_ids=[str(snapshot.id)], selected_plugins=[archiveresult.plugin])
+        archiveresult.refresh_from_db()

        if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
            print(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]')
            return 0
+        elif archiveresult.status == ArchiveResult.StatusChoices.NORESULTS:
+            print(f'[dim]Extraction completed with no results: {archiveresult.output_str}[/dim]')
+            return 0
        elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
            print(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
            return 1
@@ -121,8 +133,9 @@ def run_plugins(
        rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
        return 1

-    # Gather snapshot IDs to process
+    # Gather snapshot IDs and optional plugin constraints to process
    snapshot_ids = set()
+    requested_plugins_by_snapshot: dict[str, set[str]] = defaultdict(set)
    for record in records:
        record_type = record.get('type')

@@ -142,6 +155,9 @@ def run_plugins(
            snapshot_id = record.get('snapshot_id')
            if snapshot_id:
                snapshot_ids.add(snapshot_id)
+                plugin_name = record.get('plugin')
+                if plugin_name and not plugins_list:
+                    requested_plugins_by_snapshot[str(snapshot_id)].add(str(plugin_name))

        elif 'id' in record:
            # Assume it's a snapshot ID
@@ -160,26 +176,15 @@ def run_plugins(
            rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
            continue

-        # Create pending ArchiveResults if needed
-        if plugins_list:
-            # Only create for specific plugins
-            for plugin_name in plugins_list:
-                result, created = ArchiveResult.objects.get_or_create(
-                    snapshot=snapshot,
-                    plugin=plugin_name,
-                    defaults={
-                        'status': ArchiveResult.StatusChoices.QUEUED,
-                        'retry_at': timezone.now(),
-                    }
-                )
-                if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
-                    # Reset for retry
-                    result.status = ArchiveResult.StatusChoices.QUEUED
-                    result.retry_at = timezone.now()
-                    result.save()
-        else:
-            # Create all pending plugins
-            snapshot.create_pending_archiveresults()
+        for plugin_name in requested_plugins_by_snapshot.get(str(snapshot.id), set()):
+            existing_result = snapshot.archiveresult_set.filter(plugin=plugin_name).order_by('-created_at').first()
+            if existing_result and existing_result.status in [
+                ArchiveResult.StatusChoices.FAILED,
+                ArchiveResult.StatusChoices.SKIPPED,
+                ArchiveResult.StatusChoices.NORESULTS,
+                ArchiveResult.StatusChoices.BACKOFF,
+            ]:
+                existing_result.reset_for_retry()

        # Reset snapshot status to allow processing
        if snapshot.status == Snapshot.StatusChoices.SEALED:
@@ -207,10 +212,15 @@ def run_plugins(
            snapshot_ids_by_crawl[str(snapshot.crawl_id)].add(str(snapshot.id))

        for crawl_id, crawl_snapshot_ids in snapshot_ids_by_crawl.items():
+            selected_plugins = plugins_list or sorted({
+                plugin
+                for snapshot_id in crawl_snapshot_ids
+                for plugin in requested_plugins_by_snapshot.get(str(snapshot_id), set())
+            }) or None
            run_crawl(
                crawl_id,
                snapshot_ids=sorted(crawl_snapshot_ids),
-                selected_plugins=plugins_list or None,
+                selected_plugins=selected_plugins,
            )

    # Output results as JSONL (when piped) or human-readable (when TTY)
--- a/archivebox/cli/archivebox_list.py
+++ b/archivebox/cli/archivebox_list.py
@@ -18,9 +18,13 @@ from archivebox.cli.archivebox_snapshot import list_snapshots
@click.option('--tag', '-t', help='Filter by tag name')
@click.option('--crawl-id', help='Filter by crawl ID')
@click.option('--limit', '-n', type=int, help='Limit number of results')
+@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
+@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: timestamp,url,title')
+@click.option('--with-headers', is_flag=True, help='Include column headers in structured output')
 def main(status: Optional[str], url__icontains: Optional[str], url__istartswith: Optional[str],
-         tag: Optional[str], crawl_id: Optional[str], limit: Optional[int]) -> None:
-    """List Snapshots as JSONL."""
+         tag: Optional[str], crawl_id: Optional[str], limit: Optional[int],
+         sort: Optional[str], csv: Optional[str], with_headers: bool) -> None:
+    """List Snapshots."""
    sys.exit(list_snapshots(
        status=status,
        url__icontains=url__icontains,
@@ -28,6 +32,9 @@ def main(status: Optional[str], url__icontains: Optional[str], url__istartswith:
        tag=tag,
        crawl_id=crawl_id,
        limit=limit,
+        sort=sort,
+        csv=csv,
+        with_headers=with_headers,
    ))


--- a/archivebox/cli/archivebox_persona.py
+++ b/archivebox/cli/archivebox_persona.py
@@ -42,6 +42,7 @@ import rich_click as click
 from rich import print as rprint

 from archivebox.cli.cli_utils import apply_filters
+from archivebox.personas import importers as persona_importers


 # =============================================================================
@@ -440,8 +441,6 @@ def create_personas(
        browser_binary = get_browser_binary(import_from)
        if browser_binary:
            rprint(f'[dim]Using {import_from} binary: {browser_binary}[/dim]', file=sys.stderr)
-    else:
-        browser_binary = None

    created_count = 0
    for name in name_list:
@@ -450,7 +449,7 @@ def create_personas(
            continue

        # Validate persona name to prevent path traversal
-        is_valid, error_msg = validate_persona_name(name)
+        is_valid, error_msg = persona_importers.validate_persona_name(name)
        if not is_valid:
            rprint(f'[red]Invalid persona name "{name}": {error_msg}[/red]', file=sys.stderr)
            continue
@@ -468,49 +467,29 @@ def create_personas(

        # Import browser profile if requested
        if import_from in CHROMIUM_BROWSERS and source_profile_dir is not None:
-            persona_chrome_dir = Path(persona.CHROME_USER_DATA_DIR)
-
-            # Copy the browser profile
-            rprint(f'[dim]Copying browser profile to {persona_chrome_dir}...[/dim]', file=sys.stderr)
-
            try:
-                # Remove existing chrome_user_data if it exists
-                if persona_chrome_dir.exists():
-                    shutil.rmtree(persona_chrome_dir)
-
-                # Copy the profile directory
-                # We copy the entire user data dir, not just Default profile
-                shutil.copytree(
-                    source_profile_dir,
-                    persona_chrome_dir,
-                    symlinks=True,
-                    ignore=shutil.ignore_patterns(
-                        'Cache', 'Code Cache', 'GPUCache', 'ShaderCache',
-                        'Service Worker', 'GCM Store', '*.log', 'Crashpad',
-                        'BrowserMetrics', 'BrowserMetrics-spare.pma',
-                        'SingletonLock', 'SingletonSocket', 'SingletonCookie',
-                    ),
+                import_source = persona_importers.resolve_browser_import_source(import_from, profile_dir=profile)
+                import_result = persona_importers.import_persona_from_source(
+                    persona,
+                    import_source,
+                    copy_profile=True,
+                    import_cookies=True,
+                    capture_storage=False,
                )
-                rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
-
-                # Extract cookies via CDP
-                rprint('[dim]Extracting cookies via CDP...[/dim]', file=sys.stderr)
-
-                if extract_cookies_via_cdp(
-                    persona_chrome_dir,
-                    cookies_file,
-                    profile_dir=profile,
-                    chrome_binary=browser_binary,
-                ):
-                    rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
-                else:
-                    rprint('[yellow]Could not extract cookies automatically.[/yellow]', file=sys.stderr)
-                    rprint('[dim]You can manually export cookies using a browser extension.[/dim]', file=sys.stderr)
-
            except Exception as e:
-                rprint(f'[red]Failed to copy browser profile: {e}[/red]', file=sys.stderr)
+                rprint(f'[red]Failed to import browser profile: {e}[/red]', file=sys.stderr)
                return 1

+            if import_result.profile_copied:
+                rprint('[green]Copied browser profile to persona[/green]', file=sys.stderr)
+            if import_result.cookies_imported:
+                rprint(f'[green]Extracted cookies to {cookies_file}[/green]', file=sys.stderr)
+            elif not import_result.profile_copied:
+                rprint('[yellow]Could not import cookies automatically.[/yellow]', file=sys.stderr)
+
+            for warning in import_result.warnings:
+                rprint(f'[yellow]{warning}[/yellow]', file=sys.stderr)
+
        if not is_tty:
            write_record({
                'id': str(persona.id) if hasattr(persona, 'id') else None,
@@ -616,7 +595,7 @@ def update_personas(name: Optional[str] = None) -> int:
            # Apply updates from CLI flags
            if name:
                # Validate new name to prevent path traversal
-                is_valid, error_msg = validate_persona_name(name)
+                is_valid, error_msg = persona_importers.validate_persona_name(name)
                if not is_valid:
                    rprint(f'[red]Invalid new persona name "{name}": {error_msg}[/red]', file=sys.stderr)
                    continue
--- a/archivebox/cli/archivebox_pluginmap.py
+++ b/archivebox/cli/archivebox_pluginmap.py
@@ -89,56 +89,6 @@ SNAPSHOT_MACHINE_DIAGRAM = """
 └─────────────────────────────────────────────────────────────────────────────┘
 """

-ARCHIVERESULT_MACHINE_DIAGRAM = """
-┌─────────────────────────────────────────────────────────────────────────────┐
-│                          ArchiveResultMachine                               │
-├─────────────────────────────────────────────────────────────────────────────┤
-│                                                                             │
-│   ┌─────────────┐                                                           │
-│   │   QUEUED    │◄─────────────────┐                                        │
-│   │  (initial)  │                  │                                        │
-│   └──┬───────┬──┘                  │                                        │
-│      │       │                     │ tick() unless can_start()              │
-│      │       │ exceeded_max_       │                                        │
-│      │       │ attempts            │                                        │
-│      │       ▼                     │                                        │
-│      │  ┌──────────┐               │                                        │
-│      │  │ SKIPPED  │               │                                        │
-│      │  │ (final)  │               │                                        │
-│      │  └──────────┘               │                                        │
-│      │ tick() when                 │                                        │
-│      │ can_start()                 │                                        │
-│      ▼                             │                                        │
-│   ┌─────────────┐                  │                                        │
-│   │   STARTED   │──────────────────┘                                        │
-│   │             │◄─────────────────────────────────────────────────┐        │
-│   │ enter:      │                      │                           │        │
-│   │ result.run()│ tick() unless        │                           │        │
-│   │ (execute    │ is_finished()        │                           │        │
-│   │  hook via   │──────────────────────┘                           │        │
-│   │  run_hook())│                                                  │        │
-│   └──────┬──────┘                                                  │        │
-│          │                                                         │        │
-│          │ tick() checks status set by hook output                 │        │
-│          ├─────────────┬─────────────┬─────────────┐               │        │
-│          ▼             ▼             ▼             ▼               │        │
-│   ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐         │        │
-│   │ SUCCEEDED │ │  FAILED   │ │  SKIPPED  │ │  BACKOFF  │         │        │
-│   │  (final)  │ │  (final)  │ │  (final)  │ │           │         │        │
-│   └───────────┘ └───────────┘ └───────────┘ └──┬──────┬─┘         │        │
-│                                                 │      │            │        │
-│                                   exceeded_max_ │      │ can_start()│        │
-│                                   attempts      │      │ loops back │        │
-│                                        ▼        │      └────────────┘        │
-│                                   ┌──────────┐  │                            │
-│                                   │ SKIPPED  │◄─┘                            │
-│                                   │ (final)  │                               │
-│                                   └──────────┘                               │
-│                                                                             │
-│   Each ArchiveResult runs ONE specific hook (stored in .hook_name field)    │
-└─────────────────────────────────────────────────────────────────────────────┘
-"""
-
 BINARY_MACHINE_DIAGRAM = """
 ┌─────────────────────────────────────────────────────────────────────────────┐
 │                             BinaryMachine                                   │
@@ -193,8 +143,8 @@ def pluginmap(
    """
    Show a map of all state machines and their associated plugin hooks.

-    Displays ASCII art diagrams of the core model state machines (Crawl, Snapshot,
-    ArchiveResult, Binary) and lists all auto-detected on_Modelname_xyz hooks
+    Displays ASCII art diagrams of the core queued model state machines (Crawl,
+    Snapshot, Binary) and lists all auto-detected on_Modelname_xyz hooks
    that will run for each model's transitions.
    """
    from rich.console import Console
@@ -257,17 +207,6 @@ def pluginmap(
        prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
        prnt()

-    # Show diagrams first (unless quiet mode)
-    if not quiet:
-        # Show ArchiveResult diagram separately since it's different
-        prnt(Panel(
-            ARCHIVERESULT_MACHINE_DIAGRAM,
-            title='[bold green]ArchiveResultMachine[/bold green]',
-            border_style='green',
-            expand=False,
-        ))
-        prnt()
-
    for event_name, info in model_events.items():
        # Discover hooks for this event
        hooks = discover_hooks(event_name, filter_disabled=not show_disabled)
--- a/archivebox/cli/archivebox_run.py
+++ b/archivebox/cli/archivebox_run.py
@@ -145,17 +145,25 @@ def process_stdin_records() -> int:
                    try:
                        archiveresult = ArchiveResult.objects.get(id=record_id)
                    except ArchiveResult.DoesNotExist:
-                        archiveresult = ArchiveResult.from_json(record)
+                        archiveresult = None
                else:
-                    # New archiveresult - create it
-                    archiveresult = ArchiveResult.from_json(record)
+                    archiveresult = None

+                snapshot_id = record.get('snapshot_id')
+                plugin_name = record.get('plugin')
+                snapshot = None
                if archiveresult:
-                    archiveresult.retry_at = timezone.now()
-                    if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]:
-                        archiveresult.status = ArchiveResult.StatusChoices.QUEUED
-                    archiveresult.save()
+                    if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.NORESULTS, ArchiveResult.StatusChoices.BACKOFF]:
+                        archiveresult.reset_for_retry()
                    snapshot = archiveresult.snapshot
+                    plugin_name = plugin_name or archiveresult.plugin
+                elif snapshot_id:
+                    try:
+                        snapshot = Snapshot.objects.get(id=snapshot_id)
+                    except Snapshot.DoesNotExist:
+                        snapshot = None
+
+                if snapshot:
                    snapshot.retry_at = timezone.now()
                    if snapshot.status != Snapshot.StatusChoices.STARTED:
                        snapshot.status = Snapshot.StatusChoices.QUEUED
@@ -167,9 +175,9 @@ def process_stdin_records() -> int:
                    crawl.save(update_fields=['status', 'retry_at', 'modified_at'])
                    crawl_id = str(snapshot.crawl_id)
                    snapshot_ids_by_crawl[crawl_id].add(str(snapshot.id))
-                    if archiveresult.plugin:
-                        plugin_names_by_crawl[crawl_id].add(archiveresult.plugin)
-                    output_records.append(archiveresult.to_json())
+                    if plugin_name:
+                        plugin_names_by_crawl[crawl_id].add(str(plugin_name))
+                    output_records.append(record if not archiveresult else archiveresult.to_json())
                    queued_count += 1

            elif record_type == TYPE_BINARY:
@@ -234,9 +242,11 @@ def run_runner(daemon: bool = False) -> int:
    """
    from django.utils import timezone
    from archivebox.machine.models import Machine, Process
-    from archivebox.services.runner import run_pending_crawls
+    from archivebox.services.runner import recover_orphaned_crawls, recover_orphaned_snapshots, run_pending_crawls

    Process.cleanup_stale_running()
+    recover_orphaned_snapshots()
+    recover_orphaned_crawls()
    Machine.current()
    current = Process.current()
    if current.process_type != Process.TypeChoices.ORCHESTRATOR:
@@ -305,6 +315,13 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
            traceback.print_exc()
            sys.exit(1)

+    if daemon:
+        if not sys.stdin.isatty():
+            exit_code = process_stdin_records()
+            if exit_code != 0:
+                sys.exit(exit_code)
+        sys.exit(run_runner(daemon=True))
+
    if not sys.stdin.isatty():
        sys.exit(process_stdin_records())
    else:
--- a/archivebox/cli/archivebox_server.py
+++ b/archivebox/cli/archivebox_server.py
@@ -3,9 +3,7 @@
 __package__ = 'archivebox.cli'

 from typing import Iterable
-import os
 import sys
-import subprocess

 import rich_click as click
 from rich import print
@@ -14,6 +12,41 @@ from archivebox.misc.util import docstring, enforce_types
 from archivebox.config.common import SERVER_CONFIG


+def stop_existing_background_runner(*, machine, process_model, supervisor=None, stop_worker_fn=None, log=print) -> int:
+    """Stop any existing orchestrator process so the server can take ownership; returns how many RUNNING orchestrators were found (0 if none)."""
+    process_model.cleanup_stale_running(machine=machine)  # first cleanup pass, before querying (presumably reconciles stale RUNNING rows — confirm)

+    running_runners = list(process_model.objects.filter(  # materialized into a list so the count returned below reflects the pre-stop query
+        machine=machine,
+        status=process_model.StatusChoices.RUNNING,
+        process_type=process_model.TypeChoices.ORCHESTRATOR,
+    ).order_by('created_at'))  # oldest first

+    if not running_runners:
+        return 0  # nothing to stop; no message is logged in this case

+    log('[yellow][*] Stopping existing ArchiveBox background runner...[/yellow]')

+    if supervisor is not None and stop_worker_fn is not None:  # supervisor path is taken only when both the handle and the stop callable are provided
+        for worker_name in ('worker_runner', 'worker_runner_watch'):
+            try:
+                stop_worker_fn(supervisor, worker_name)
+            except Exception:
+                pass  # best-effort: any error from stop_worker_fn is ignored — NOTE(review): consider reporting via log

+    for proc in running_runners:
+        try:
+            proc.kill_tree(graceful_timeout=2.0)
+        except Exception:  # fall back to terminate() if kill_tree() raises
+            try:
+                proc.terminate(graceful_timeout=2.0)
+            except Exception:
+                pass  # best-effort: failure of the fallback is ignored too — NOTE(review): the process may still be alive here

+    process_model.cleanup_stale_running(machine=machine)  # second cleanup pass, after the stop attempts
+    return len(running_runners)  # count of runners found RUNNING at query time, not the number confirmed stopped
+
+
@enforce_types
 def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
          reload: bool=False,
@@ -39,25 +72,6 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
    if debug or reload:
        SHELL_CONFIG.DEBUG = True

-    if run_in_debug:
-        os.environ['ARCHIVEBOX_RUNSERVER'] = '1'
-        if reload:
-            os.environ['ARCHIVEBOX_AUTORELOAD'] = '1'
-            from archivebox.config.common import STORAGE_CONFIG
-            pidfile = str(STORAGE_CONFIG.TMP_DIR / 'runserver.pid')
-            os.environ['ARCHIVEBOX_RUNSERVER_PIDFILE'] = pidfile
-
-            from django.utils.autoreload import DJANGO_AUTORELOAD_ENV
-            is_reloader_child = os.environ.get(DJANGO_AUTORELOAD_ENV) == 'true'
-            if not is_reloader_child:
-                env = os.environ.copy()
-                subprocess.Popen(
-                    [sys.executable, '-m', 'archivebox', 'manage', 'runner_watch', f'--pidfile={pidfile}'],
-                    env=env,
-                    stdout=subprocess.DEVNULL,
-                    stderr=subprocess.DEVNULL,
-                )
-
    from django.contrib.auth.models import User
    
    if not User.objects.filter(is_superuser=True).exclude(username='system').exists():
@@ -81,73 +95,62 @@ def server(runserver_args: Iterable[str]=(SERVER_CONFIG.BIND_ADDR,),
    except IndexError:
        pass

+    from archivebox.workers.supervisord_util import (
+        get_existing_supervisord_process,
+        get_worker,
+        stop_worker,
+        start_server_workers,
+        is_port_in_use,
+    )
+    from archivebox.machine.models import Machine, Process
+
+    # Check if port is already in use
+    if is_port_in_use(host, int(port)):
+        print(f'[red][X] Error: Port {port} is already in use[/red]')
+        print(f'    Another process (possibly daphne or runserver) is already listening on {host}:{port}')
+        print('    Stop the conflicting process or choose a different port')
+        sys.exit(1)
+
+    machine = Machine.current()
+    stop_existing_background_runner(
+        machine=machine,
+        process_model=Process,
+        supervisor=get_existing_supervisord_process(),
+        stop_worker_fn=stop_worker,
+    )
+
+    supervisor = get_existing_supervisord_process()
+    if supervisor:
+        server_worker_name = 'worker_runserver' if run_in_debug else 'worker_daphne'
+        server_proc = get_worker(supervisor, server_worker_name)
+        server_state = server_proc.get('statename') if isinstance(server_proc, dict) else None
+        if server_state == 'RUNNING':
+            runner_proc = get_worker(supervisor, 'worker_runner')
+            runner_watch_proc = get_worker(supervisor, 'worker_runner_watch')
+            runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
+            runner_watch_state = runner_watch_proc.get('statename') if isinstance(runner_watch_proc, dict) else None
+            print('[red][X] Error: ArchiveBox server is already running[/red]')
+            print(f'    [green]√[/green] Web server ({server_worker_name}) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
+            if runner_state == 'RUNNING':
+                print('    [green]√[/green] Background runner (worker_runner) is RUNNING')
+            if runner_watch_state == 'RUNNING':
+                print('    [green]√[/green] Reload watcher (worker_runner_watch) is RUNNING')
+            print()
+            print('[yellow]To stop the existing server, run:[/yellow]')
+            print('    pkill -f "archivebox server"')
+            print('    pkill -f supervisord')
+            sys.exit(1)
+
    if run_in_debug:
-        from django.core.management import call_command
        print('[green][+] Starting ArchiveBox webserver in DEBUG mode...[/green]')
-        print(f'    [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
-        print(f'    [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
-        print('    > Writing ArchiveBox error log to ./logs/errors.log')
-        if not reload:
-            runserver_args.append('--noreload')  # '--insecure'
-        if nothreading:
-            runserver_args.append('--nothreading')
-        call_command("runserver", *runserver_args)
    else:
-        from archivebox.workers.supervisord_util import (
-            get_existing_supervisord_process,
-            get_worker,
-            start_server_workers,
-            is_port_in_use,
-        )
-        from archivebox.machine.models import Machine, Process
-
-        # Check if port is already in use
-        if is_port_in_use(host, int(port)):
-            print(f'[red][X] Error: Port {port} is already in use[/red]')
-            print(f'    Another process (possibly daphne) is already listening on {host}:{port}')
-            print('    Stop the conflicting process or choose a different port')
-            sys.exit(1)
-
-        # Check if the background crawl runner is already running for this data directory
-        if Process.objects.filter(
-            machine=Machine.current(),
-            status=Process.StatusChoices.RUNNING,
-            process_type=Process.TypeChoices.ORCHESTRATOR,
-        ).exists():
-            print('[red][X] Error: ArchiveBox background runner is already running for this data directory[/red]')
-            print('    Stop the existing runner before starting a new server')
-            print('    To stop: pkill -f "archivebox run --daemon"')
-            sys.exit(1)
-
-        # Check if supervisord is already running
-        supervisor = get_existing_supervisord_process()
-        if supervisor:
-            daphne_proc = get_worker(supervisor, 'worker_daphne')
-            daphne_state = daphne_proc.get('statename') if isinstance(daphne_proc, dict) else None
-
-            # If daphne is already running, error out
-            if daphne_state == 'RUNNING':
-                runner_proc = get_worker(supervisor, 'worker_runner')
-                runner_state = runner_proc.get('statename') if isinstance(runner_proc, dict) else None
-                print('[red][X] Error: ArchiveBox server is already running[/red]')
-                print(f'    [green]√[/green] Web server (worker_daphne) is RUNNING on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
-                if runner_state == 'RUNNING':
-                    print('    [green]√[/green] Background runner (worker_runner) is RUNNING')
-                print()
-                print('[yellow]To stop the existing server, run:[/yellow]')
-                print('    pkill -f "archivebox server"')
-                print('    pkill -f supervisord')
-                sys.exit(1)
-            # Otherwise, daphne is not running - fall through to start it
-
-        # No existing workers found - start new ones
        print('[green][+] Starting ArchiveBox webserver...[/green]')
-        print(f'    [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
-        print(f'    [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
-        print('    > Writing ArchiveBox error log to ./logs/errors.log')
-        print()
-        start_server_workers(host=host, port=port, daemonize=daemonize)
-        print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")
+    print(f'    [blink][green]>[/green][/blink] Starting ArchiveBox webserver on [deep_sky_blue4][link=http://{host}:{port}]http://{host}:{port}[/link][/deep_sky_blue4]')
+    print(f'    [green]>[/green] Log in to ArchiveBox Admin UI on [deep_sky_blue3][link=http://{host}:{port}/admin]http://{host}:{port}/admin[/link][/deep_sky_blue3]')
+    print('    > Writing ArchiveBox error log to ./logs/errors.log')
+    print()
+    start_server_workers(host=host, port=port, daemonize=daemonize, debug=run_in_debug, reload=reload, nothreading=nothreading)
+    print("\n[i][green][🟩] ArchiveBox server shut down gracefully.[/green][/i]")


@click.command()
--- a/archivebox/cli/archivebox_snapshot.py
+++ b/archivebox/cli/archivebox_snapshot.py
@@ -172,6 +172,9 @@ def list_snapshots(
    tag: Optional[str] = None,
    crawl_id: Optional[str] = None,
    limit: Optional[int] = None,
+    sort: Optional[str] = None,
+    csv: Optional[str] = None,
+    with_headers: bool = False,
 ) -> int:
    """
    List Snapshots as JSONL with optional filters.
@@ -182,7 +185,11 @@ def list_snapshots(
    from archivebox.misc.jsonl import write_record
    from archivebox.core.models import Snapshot

-    is_tty = sys.stdout.isatty()
+    if with_headers and not csv:
+        rprint('[red]--with-headers requires --csv[/red]', file=sys.stderr)
+        return 2
+
+    is_tty = sys.stdout.isatty() and not csv

    queryset = Snapshot.objects.all().order_by('-created_at')

@@ -199,7 +206,29 @@ def list_snapshots(
    if tag:
        queryset = queryset.filter(tags__name__iexact=tag)

+    if sort:
+        queryset = queryset.order_by(sort)
+
    count = 0
+    if csv:
+        cols = [col.strip() for col in csv.split(',') if col.strip()]
+        if not cols:
+            rprint('[red]No CSV columns provided[/red]', file=sys.stderr)
+            return 2
+        rows: list[str] = []
+        if with_headers:
+            rows.append(','.join(cols))
+        for snapshot in queryset.iterator(chunk_size=500):
+            rows.append(snapshot.to_csv(cols=cols, separator=','))
+            count += 1
+        output = '\n'.join(rows)
+        if output:
+            sys.stdout.write(output)
+            if not output.endswith('\n'):
+                sys.stdout.write('\n')
+        rprint(f'[dim]Listed {count} snapshots[/dim]', file=sys.stderr)
+        return 0
+
    for snapshot in queryset:
        if is_tty:
            status_color = {