mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Merge remote-tracking branch 'origin/dev' into claude/analyze-test-coverage-mWgwv
This commit is contained in:
@@ -28,7 +28,7 @@ Process(cmd=['archivebox', 'add', 'https://example.com']) # CLI entry
|
|||||||
**File:** `archivebox/machine/models.py`
|
**File:** `archivebox/machine/models.py`
|
||||||
|
|
||||||
```python
|
```python
|
||||||
class Process(ModelWithStateMachine):
|
class Process(ModelWithHealthStats):
|
||||||
# ... existing fields ...
|
# ... existing fields ...
|
||||||
|
|
||||||
# NEW: Parent process FK for hierarchy tracking
|
# NEW: Parent process FK for hierarchy tracking
|
||||||
@@ -621,6 +621,18 @@ class Process(ModelWithHealthStats):
|
|||||||
|
|
||||||
return self
|
return self
|
||||||
|
|
||||||
|
def is_alive(self) -> bool:
|
||||||
|
"""Check if this process is still running."""
|
||||||
|
from archivebox.misc.process_utils import validate_pid_file
|
||||||
|
|
||||||
|
if self.status == self.StatusChoices.EXITED:
|
||||||
|
return False
|
||||||
|
|
||||||
|
if not self.pid:
|
||||||
|
return False
|
||||||
|
|
||||||
|
return validate_pid_file(self.pid_file, self.cmd_file)
|
||||||
|
|
||||||
def kill(self, signal_num: int = 15) -> bool:
|
def kill(self, signal_num: int = 15) -> bool:
|
||||||
"""
|
"""
|
||||||
Kill this process and update status.
|
Kill this process and update status.
|
||||||
@@ -700,7 +712,7 @@ class Process(ModelWithHealthStats):
|
|||||||
Wait for process to exit, polling periodically.
|
Wait for process to exit, polling periodically.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
timeout: Max seconds to wait (None = use self.timeout, or config.TIMEOUT * 5 if that's also None)
|
timeout: Max seconds to wait (None = use self.timeout)
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
exit_code
|
exit_code
|
||||||
@@ -709,10 +721,8 @@ class Process(ModelWithHealthStats):
|
|||||||
TimeoutError if process doesn't exit in time
|
TimeoutError if process doesn't exit in time
|
||||||
"""
|
"""
|
||||||
import time
|
import time
|
||||||
from archivebox import config
|
|
||||||
|
|
||||||
# Require a timeout - default to config.TIMEOUT * 5 (typically 300s)
|
timeout = timeout or self.timeout
|
||||||
timeout = timeout or self.timeout or (config.TIMEOUT * 5)
|
|
||||||
start = time.time()
|
start = time.time()
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
@@ -1692,6 +1702,230 @@ class ProcessAdmin(admin.ModelAdmin):
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
## Phase 8: Code Consolidation (Delete Redundant Logic)
|
||||||
|
|
||||||
|
The goal is to consolidate all subprocess management into `Process` model methods, eliminating duplicate logic scattered across the codebase.
|
||||||
|
|
||||||
|
### 8.1 Files to Simplify/Delete
|
||||||
|
|
||||||
|
| File | Current Lines | After Consolidation | Savings |
|
||||||
|
|------|--------------|---------------------|---------|
|
||||||
|
| `workers/pid_utils.py` | ~192 lines | DELETE entirely | -192 |
|
||||||
|
| `misc/process_utils.py` | ~85 lines | Keep as low-level utils | 0 |
|
||||||
|
| `hooks.py` (run_hook) | ~100 lines | -50 lines (use Process.launch) | -50 |
|
||||||
|
| `hooks.py` (kill/alive) | ~50 lines | DELETE (use Process.kill/is_running) | -50 |
|
||||||
|
| `crawls/models.py` (cleanup) | ~100 lines | -70 lines (use Process.kill) | -70 |
|
||||||
|
| `supervisord_util.py` | ~50 lines process mgmt | -30 lines | -30 |
|
||||||
|
| **TOTAL** | | | **~-390 lines** |
|
||||||
|
|
||||||
|
### 8.2 Detailed Consolidation Map
|
||||||
|
|
||||||
|
#### `workers/pid_utils.py` → DELETE ENTIRELY
|
||||||
|
|
||||||
|
| Current Function | Replacement |
|
||||||
|
|------------------|-------------|
|
||||||
|
| `write_pid_file(worker_type, worker_id)` | `Process.current()` auto-creates |
|
||||||
|
| `read_pid_file(path)` | `Process.objects.get_by_pid(pid)` |
|
||||||
|
| `remove_pid_file(path)` | Manual cleanup in `Process.kill()` and legacy hook cleanup code |
|
||||||
|
| `is_process_alive(pid)` | `Process.is_running` / `Process.proc is not None` |
|
||||||
|
| `get_all_pid_files()` | `Process.objects.filter(machine=Machine.current(), status=Process.StatusChoices.RUNNING)` |
|
||||||
|
| `get_all_worker_pids(type)` | `Process.objects.filter(machine=Machine.current(), process_type=type, status=Process.StatusChoices.RUNNING)` |
|
||||||
|
| `cleanup_stale_pid_files()` | `Process.cleanup_stale_running()` |
|
||||||
|
| `get_running_worker_count(type)` | `Process.objects.filter(...).count()` |
|
||||||
|
| `get_next_worker_id(type)` | Use `Max(worker_id)+1` under transaction or DB sequence to avoid race conditions |
|
||||||
|
| `stop_worker(pid, graceful)` | `Process.terminate(graceful_timeout)` or `Process.kill_tree()` |
|
||||||
|
|
||||||
|
#### `hooks.py` Changes
|
||||||
|
|
||||||
|
**Current `run_hook()` lines 374-398:**
|
||||||
|
```python
|
||||||
|
# DELETE these lines - replaced by Process.launch()
|
||||||
|
stdout_file = output_dir / 'stdout.log'
|
||||||
|
stderr_file = output_dir / 'stderr.log'
|
||||||
|
pid_file = output_dir / 'hook.pid'
|
||||||
|
cmd_file = output_dir / 'cmd.sh'
|
||||||
|
write_cmd_file(cmd_file, cmd)
|
||||||
|
with open(stdout_file, 'w') as out, open(stderr_file, 'w') as err:
|
||||||
|
process = subprocess.Popen(cmd, ...)
|
||||||
|
write_pid_file_with_mtime(pid_file, process.pid, time.time())
|
||||||
|
```
|
||||||
|
|
||||||
|
**New `run_hook()` using Process:**
|
||||||
|
```python
|
||||||
|
# Only store env delta or allowlist to avoid leaking secrets
|
||||||
|
env_delta = {k: v for k, v in env.items() if k in ALLOWED_ENV_VARS}
|
||||||
|
|
||||||
|
hook_process = Process.objects.create(
|
||||||
|
parent=parent_process,
|
||||||
|
process_type=Process.TypeChoices.HOOK,
|
||||||
|
cmd=cmd, pwd=str(output_dir), env=env_delta, timeout=timeout,
|
||||||
|
)
|
||||||
|
hook_process.launch(background=is_background)
|
||||||
|
# stdout/stderr/pid_file all handled internally by Process.launch()
|
||||||
|
```
|
||||||
|
|
||||||
|
**DELETE these functions entirely:**
|
||||||
|
```python
|
||||||
|
def process_is_alive(pid_file: Path) -> bool: # lines 1238-1256
|
||||||
|
def kill_process(pid_file: Path, sig, validate): # lines 1259-1282
|
||||||
|
```
|
||||||
|
|
||||||
|
**Replace with:**
|
||||||
|
```python
|
||||||
|
# Use Process methods directly:
|
||||||
|
process.is_running # replaces process_is_alive()
|
||||||
|
process.kill() # replaces kill_process()
|
||||||
|
```
|
||||||
|
|
||||||
|
#### `crawls/models.py` Changes
|
||||||
|
|
||||||
|
**Current `Crawl.cleanup()` lines 418-493:**
|
||||||
|
```python
|
||||||
|
# DELETE all this inline process logic:
|
||||||
|
def is_process_alive(pid):
|
||||||
|
try:
|
||||||
|
os.kill(pid, 0)
|
||||||
|
return True
|
||||||
|
except (OSError, ProcessLookupError):
|
||||||
|
return False
|
||||||
|
|
||||||
|
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
||||||
|
if not validate_pid_file(pid_file, cmd_file):
|
||||||
|
pid_file.unlink(missing_ok=True)
|
||||||
|
continue
|
||||||
|
pid = int(pid_file.read_text().strip())
|
||||||
|
os.killpg(pid, signal.SIGTERM)
|
||||||
|
time.sleep(2)
|
||||||
|
if not is_process_alive(pid):
|
||||||
|
pid_file.unlink(missing_ok=True)
|
||||||
|
continue
|
||||||
|
os.killpg(pid, signal.SIGKILL)
|
||||||
|
# ... more cleanup logic
|
||||||
|
```
|
||||||
|
|
||||||
|
**New `Crawl.cleanup()` using Process:**
|
||||||
|
```python
|
||||||
|
def cleanup(self):
|
||||||
|
# Kill all running child processes for this crawl
|
||||||
|
for snapshot in self.snapshot_set.all():
|
||||||
|
for ar in snapshot.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED):
|
||||||
|
if ar.process_id:
|
||||||
|
# Kill hook process and all its children
|
||||||
|
ar.process.kill()
|
||||||
|
for child in ar.process.children.filter(status='running'):
|
||||||
|
child.kill()
|
||||||
|
|
||||||
|
# Run on_CrawlEnd hooks (foreground)
|
||||||
|
# ... existing hook running logic ...
|
||||||
|
```
|
||||||
|
|
||||||
|
#### `supervisord_util.py` Changes
|
||||||
|
|
||||||
|
**Current global tracking:**
|
||||||
|
```python
|
||||||
|
_supervisord_proc = None # subprocess.Popen reference
|
||||||
|
|
||||||
|
def stop_existing_supervisord_process():
|
||||||
|
global _supervisord_proc
|
||||||
|
if _supervisord_proc and _supervisord_proc.poll() is None:
|
||||||
|
_supervisord_proc.terminate()
|
||||||
|
_supervisord_proc.wait(timeout=5)
|
||||||
|
# ... fallback to PID file ...
|
||||||
|
```
|
||||||
|
|
||||||
|
**New using Process model:**
|
||||||
|
```python
|
||||||
|
_supervisord_db_process = None # Process model instance
|
||||||
|
|
||||||
|
def start_new_supervisord_process():
|
||||||
|
# ... existing subprocess.Popen ...
|
||||||
|
global _supervisord_db_process
|
||||||
|
_supervisord_db_process = Process.objects.create(
|
||||||
|
parent=Process.current(),
|
||||||
|
process_type=Process.TypeChoices.SUPERVISORD,
|
||||||
|
pid=proc.pid,
|
||||||
|
cmd=['supervisord', f'--configuration={CONFIG_FILE}'],
|
||||||
|
started_at=timezone.now(),
|
||||||
|
status=Process.StatusChoices.RUNNING,
|
||||||
|
)
|
||||||
|
|
||||||
|
def stop_existing_supervisord_process():
|
||||||
|
global _supervisord_db_process
|
||||||
|
if _supervisord_db_process:
|
||||||
|
_supervisord_db_process.kill() # Handles children, PID validation, etc.
|
||||||
|
_supervisord_db_process = None
|
||||||
|
```
|
||||||
|
|
||||||
|
#### `workers/worker.py` Changes
|
||||||
|
|
||||||
|
**Current:**
|
||||||
|
```python
|
||||||
|
from .pid_utils import write_pid_file, remove_pid_file, ...
|
||||||
|
|
||||||
|
def on_startup(self):
|
||||||
|
self.pid = os.getpid()
|
||||||
|
self.pid_file = write_pid_file(self.name, self.worker_id)
|
||||||
|
|
||||||
|
def on_shutdown(self, error=None):
|
||||||
|
if self.pid_file:
|
||||||
|
remove_pid_file(self.pid_file)
|
||||||
|
```
|
||||||
|
|
||||||
|
**New:**
|
||||||
|
```python
|
||||||
|
# No import needed - Process.current() handles everything
|
||||||
|
|
||||||
|
def on_startup(self):
|
||||||
|
self.db_process = Process.current()
|
||||||
|
# Process.current() auto-detects type, finds parent via PPID, creates record
|
||||||
|
|
||||||
|
def on_shutdown(self, error=None):
|
||||||
|
if self.db_process:
|
||||||
|
self.db_process.exit_code = 0 if error is None else 1
|
||||||
|
self.db_process.status = Process.StatusChoices.EXITED
|
||||||
|
self.db_process.ended_at = timezone.now()
|
||||||
|
self.db_process.save()
|
||||||
|
```
|
||||||
|
|
||||||
|
### 8.3 New Process Model Methods Summary
|
||||||
|
|
||||||
|
All process operations now go through `Process`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Getting current process
|
||||||
|
Process.current() # Creates/retrieves Process for os.getpid()
|
||||||
|
|
||||||
|
# Spawning new process
|
||||||
|
proc = Process.objects.create(parent=Process.current(), cmd=[...], ...)
|
||||||
|
proc.launch(background=False) # Handles Popen, PID file, stdout/stderr
|
||||||
|
|
||||||
|
# Checking process status
|
||||||
|
proc.is_running # True if OS process exists and matches
|
||||||
|
proc.proc # psutil.Process or None (validated)
|
||||||
|
proc.poll() # Returns exit_code or None
|
||||||
|
|
||||||
|
# Terminating process
|
||||||
|
proc.kill() # Safe kill with PID validation
|
||||||
|
proc.kill(SIGKILL) # Force kill
|
||||||
|
|
||||||
|
# Waiting for completion
|
||||||
|
proc.wait(timeout=30) # Blocks until exit or timeout
|
||||||
|
|
||||||
|
# Cleanup
|
||||||
|
Process.cleanup_stale_running() # Mark orphaned processes as EXITED
|
||||||
|
```
|
||||||
|
|
||||||
|
### 8.4 Benefits
|
||||||
|
|
||||||
|
1. **Single Source of Truth**: All process state in database, queryable
|
||||||
|
2. **PID Reuse Protection**: `Process.proc` validates via psutil.create_time()
|
||||||
|
3. **Hierarchy Tracking**: `Process.parent` / `Process.children` for tree traversal
|
||||||
|
4. **Machine-Scoped**: All queries filter by `machine=Machine.current()`
|
||||||
|
5. **Audit Trail**: Every subprocess is logged with timestamps, exit codes
|
||||||
|
6. **No Stale PID Files**: Process records update status automatically
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## Open Questions
|
## Open Questions
|
||||||
|
|
||||||
1. **Performance**: Deep hierarchies with many children could slow queries. Consider:
|
1. **Performance**: Deep hierarchies with many children could slow queries. Consider:
|
||||||
|
|||||||
265
archivebox/cli/archivebox_extract.py
Normal file
265
archivebox/cli/archivebox_extract.py
Normal file
@@ -0,0 +1,265 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
archivebox extract [snapshot_ids...] [--plugins=NAMES]
|
||||||
|
|
||||||
|
Run plugins on Snapshots. Accepts snapshot IDs as arguments, from stdin, or via JSONL.
|
||||||
|
|
||||||
|
Input formats:
|
||||||
|
- Snapshot UUIDs (one per line)
|
||||||
|
- JSONL: {"type": "Snapshot", "id": "...", "url": "..."}
|
||||||
|
- JSONL: {"type": "ArchiveResult", "snapshot_id": "...", "plugin": "..."}
|
||||||
|
|
||||||
|
Output (JSONL):
|
||||||
|
{"type": "ArchiveResult", "id": "...", "snapshot_id": "...", "plugin": "...", "status": "..."}
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
# Extract specific snapshot
|
||||||
|
archivebox extract 01234567-89ab-cdef-0123-456789abcdef
|
||||||
|
|
||||||
|
# Pipe from snapshot command
|
||||||
|
archivebox snapshot https://example.com | archivebox extract
|
||||||
|
|
||||||
|
# Run specific plugins only
|
||||||
|
archivebox extract --plugins=screenshot,singlefile 01234567-89ab-cdef-0123-456789abcdef
|
||||||
|
|
||||||
|
# Chain commands
|
||||||
|
archivebox crawl https://example.com | archivebox snapshot | archivebox extract
|
||||||
|
"""
|
||||||
|
|
||||||
|
__package__ = 'archivebox.cli'
|
||||||
|
__command__ = 'archivebox extract'
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from typing import Optional, List
|
||||||
|
|
||||||
|
import rich_click as click
|
||||||
|
|
||||||
|
|
||||||
|
def process_archiveresult_by_id(archiveresult_id: str) -> int:
    """
    Run extraction for a single ArchiveResult by ID (used by workers).

    Triggers the ArchiveResult's state machine tick() to run the extractor plugin.

    Args:
        archiveresult_id: primary key (UUID) of the ArchiveResult to process.

    Returns:
        0 on success or when extraction is still in progress/backoff,
        1 when the record is missing, extraction failed, or an exception occurred.
    """
    from rich import print as rprint
    from archivebox.core.models import ArchiveResult

    try:
        archiveresult = ArchiveResult.objects.get(id=archiveresult_id)
    except ArchiveResult.DoesNotExist:
        rprint(f'[red]ArchiveResult {archiveresult_id} not found[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Extracting {archiveresult.plugin} for {archiveresult.snapshot.url}[/blue]', file=sys.stderr)

    try:
        # Trigger state machine tick - this runs the actual extraction
        archiveresult.sm.tick()
        # tick() may update the row out-of-band; reload before inspecting status
        archiveresult.refresh_from_db()

        # BUGFIX: these three messages previously used the builtin print(),
        # which renders rich markup like [green]...[/green] literally; use
        # rprint so the tags are interpreted.  They also go to stderr so
        # stdout stays reserved for this CLI's JSONL output.
        if archiveresult.status == ArchiveResult.StatusChoices.SUCCEEDED:
            rprint(f'[green]Extraction succeeded: {archiveresult.output_str}[/green]', file=sys.stderr)
            return 0
        elif archiveresult.status == ArchiveResult.StatusChoices.FAILED:
            rprint(f'[red]Extraction failed: {archiveresult.output_str}[/red]', file=sys.stderr)
            return 1
        else:
            # Still in progress or backoff - not a failure
            rprint(f'[yellow]Extraction status: {archiveresult.status}[/yellow]', file=sys.stderr)
            return 0

    except Exception as e:
        rprint(f'[red]Extraction error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
        return 1
||||||
|
|
||||||
|
|
||||||
|
def run_plugins(
    args: tuple,
    plugins: str = '',
    wait: bool = True,
) -> int:
    """
    Run plugins on Snapshots from input.

    Reads Snapshot IDs or JSONL from args/stdin, runs plugins, outputs JSONL.

    Args:
        args: positional CLI arguments (snapshot IDs or JSONL lines);
            stdin is consulted by read_args_or_stdin() as well.
        plugins: comma-separated plugin names to run ('' = all pending plugins).
        wait: when True (default), run the Orchestrator until queued work finishes.

    Exit codes:
        0: Success
        1: Failure
    """
    from rich import print as rprint
    from django.utils import timezone

    from archivebox.misc.jsonl import (
        read_args_or_stdin, write_record,
        TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
    )
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.workers.orchestrator import Orchestrator

    # TTY -> human-readable summary on stderr; piped -> JSONL records on stdout
    is_tty = sys.stdout.isatty()

    # Parse comma-separated plugins list once (reused in creation and filtering)
    plugins_list = [p.strip() for p in plugins.split(',') if p.strip()] if plugins else []

    # Collect all input records
    records = list(read_args_or_stdin(args))

    if not records:
        rprint('[yellow]No snapshots provided. Pass snapshot IDs as arguments or via stdin.[/yellow]', file=sys.stderr)
        return 1

    # Gather snapshot IDs to process (accepts Snapshot records, ArchiveResult
    # records, bare-id records, or URL-only records resolved via DB lookup)
    snapshot_ids = set()
    for record in records:
        record_type = record.get('type')

        if record_type == TYPE_SNAPSHOT:
            snapshot_id = record.get('id')
            if snapshot_id:
                snapshot_ids.add(snapshot_id)
            elif record.get('url'):
                # Look up by URL (get most recent if multiple exist)
                snap = Snapshot.objects.filter(url=record['url']).order_by('-created_at').first()
                if snap:
                    snapshot_ids.add(str(snap.id))
                else:
                    rprint(f'[yellow]Snapshot not found for URL: {record["url"]}[/yellow]', file=sys.stderr)

        elif record_type == TYPE_ARCHIVERESULT:
            snapshot_id = record.get('snapshot_id')
            if snapshot_id:
                snapshot_ids.add(snapshot_id)

        elif 'id' in record:
            # Assume it's a snapshot ID
            snapshot_ids.add(record['id'])

    if not snapshot_ids:
        rprint('[red]No valid snapshot IDs found in input[/red]', file=sys.stderr)
        return 1

    # Get snapshots and ensure they have pending ArchiveResults
    processed_count = 0
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
        except Snapshot.DoesNotExist:
            rprint(f'[yellow]Snapshot {snapshot_id} not found[/yellow]', file=sys.stderr)
            continue

        # Create pending ArchiveResults if needed
        if plugins_list:
            # Only create for specific plugins
            for plugin_name in plugins_list:
                result, created = ArchiveResult.objects.get_or_create(
                    snapshot=snapshot,
                    plugin=plugin_name,
                    defaults={
                        'status': ArchiveResult.StatusChoices.QUEUED,
                        'retry_at': timezone.now(),
                    }
                )
                if not created and result.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED]:
                    # Reset for retry
                    result.status = ArchiveResult.StatusChoices.QUEUED
                    result.retry_at = timezone.now()
                    result.save()
        else:
            # Create all pending plugins
            snapshot.create_pending_archiveresults()

        # Reset snapshot status to allow processing
        # (a SEALED snapshot is otherwise skipped by the workers)
        if snapshot.status == Snapshot.StatusChoices.SEALED:
            snapshot.status = Snapshot.StatusChoices.STARTED
            snapshot.retry_at = timezone.now()
            snapshot.save()

        processed_count += 1

    if processed_count == 0:
        rprint('[red]No snapshots to process[/red]', file=sys.stderr)
        return 1

    rprint(f'[blue]Queued {processed_count} snapshots for extraction[/blue]', file=sys.stderr)

    # Run orchestrator if --wait (default)
    if wait:
        rprint('[blue]Running plugins...[/blue]', file=sys.stderr)
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.runloop()

    # Output results as JSONL (when piped) or human-readable (when TTY)
    for snapshot_id in snapshot_ids:
        try:
            snapshot = Snapshot.objects.get(id=snapshot_id)
            results = snapshot.archiveresult_set.all()
            if plugins_list:
                results = results.filter(plugin__in=plugins_list)

            for result in results:
                if is_tty:
                    status_color = {
                        'succeeded': 'green',
                        'failed': 'red',
                        'skipped': 'yellow',
                    }.get(result.status, 'dim')
                    rprint(f'  [{status_color}]{result.status}[/{status_color}] {result.plugin} → {result.output_str or ""}', file=sys.stderr)
                else:
                    write_record(result.to_jsonl())
        except Snapshot.DoesNotExist:
            # Snapshot vanished between queueing and reporting; skip silently
            continue

    return 0
||||||
|
|
||||||
|
|
||||||
|
def is_archiveresult_id(value: str) -> bool:
    """Return True if *value* is a UUID that belongs to an existing ArchiveResult."""
    import re
    # Cheap syntactic gate first: case-insensitive 8-4-4-4-12 hex UUID shape.
    if re.match(r'^[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}$', value, re.I) is None:
        return False
    # Only touch the database once the string is plausibly a UUID,
    # to confirm it names an ArchiveResult (not a Snapshot or other object).
    from archivebox.core.models import ArchiveResult
    return ArchiveResult.objects.filter(id=value).exists()
||||||
|
|
||||||
|
|
||||||
|
@click.command()
@click.option('--plugins', '-p', default='', help='Comma-separated list of plugins to run (e.g., screenshot,singlefile)')
@click.option('--wait/--no-wait', default=True, help='Wait for plugins to complete (default: wait)')
@click.argument('args', nargs=-1)
def main(plugins: str, wait: bool, args: tuple):
    """Run plugins on Snapshots, or process existing ArchiveResults by ID"""
    from archivebox.misc.jsonl import read_args_or_stdin

    # Read all input
    records = list(read_args_or_stdin(args))

    if not records:
        from rich import print as rprint
        rprint('[yellow]No Snapshot IDs or ArchiveResult IDs provided. Pass as arguments or via stdin.[/yellow]', file=sys.stderr)
        sys.exit(1)

    # Check if input looks like existing ArchiveResult IDs to process.
    # NOTE(review): this runs one DB existence query per record (inside
    # is_archiveresult_id) before deciding which mode to use.
    all_are_archiveresult_ids = all(
        is_archiveresult_id(r.get('id') or r.get('url', ''))
        for r in records
    )

    if all_are_archiveresult_ids:
        # Process existing ArchiveResults by ID; exit non-zero if any failed
        exit_code = 0
        for record in records:
            archiveresult_id = record.get('id') or record.get('url')
            result = process_archiveresult_by_id(archiveresult_id)
            if result != 0:
                exit_code = result
        sys.exit(exit_code)
    else:
        # Default behavior: run plugins on Snapshots from input.
        # NOTE(review): run_plugins() calls read_args_or_stdin(args) again —
        # if the records above came from stdin, that stream is already
        # exhausted here and run_plugins() may see no input; confirm.
        sys.exit(run_plugins(args, plugins=plugins, wait=wait))
||||||
67
archivebox/cli/archivebox_orchestrator.py
Normal file
67
archivebox/cli/archivebox_orchestrator.py
Normal file
@@ -0,0 +1,67 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
"""
|
||||||
|
archivebox orchestrator [--daemon]
|
||||||
|
|
||||||
|
Start the orchestrator process that manages workers.
|
||||||
|
|
||||||
|
The orchestrator polls queues for each model type (Crawl, Snapshot, ArchiveResult)
|
||||||
|
and lazily spawns worker processes when there is work to be done.
|
||||||
|
"""
|
||||||
|
|
||||||
|
__package__ = 'archivebox.cli'
|
||||||
|
__command__ = 'archivebox orchestrator'
|
||||||
|
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import rich_click as click
|
||||||
|
|
||||||
|
from archivebox.misc.util import docstring
|
||||||
|
|
||||||
|
|
||||||
|
def orchestrator(daemon: bool = False, watch: bool = False) -> int:
    """
    Start the orchestrator process.

    The orchestrator:
    1. Polls each model queue (Crawl, Snapshot, ArchiveResult)
    2. Spawns worker processes when there is work to do
    3. Monitors worker health and restarts failed workers
    4. Exits when all queues are empty (unless --daemon)

    Args:
        daemon: Run forever (don't exit when idle)
        watch: Just watch the queues without spawning workers (for debugging)

    Exit codes:
        0: All work completed successfully
        1: Error occurred
    """
    from rich import print as rprint

    from archivebox.workers.orchestrator import Orchestrator

    # TODO(review): the `watch` flag is accepted but never forwarded to
    # Orchestrator below — it currently has no effect.

    if Orchestrator.is_running():
        # BUGFIX: this message previously used the builtin print(), which
        # renders the [yellow]...[/yellow] markup literally; rprint interprets it.
        rprint('[yellow]Orchestrator is already running[/yellow]')
        return 0

    try:
        orchestrator_instance = Orchestrator(exit_on_idle=not daemon)
        orchestrator_instance.runloop()
        return 0
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop a daemonized orchestrator, not an error
        return 0
    except Exception as e:
        rprint(f'[red]Orchestrator error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
        return 1
||||||
|
|
||||||
|
|
||||||
|
@click.command()
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--watch', '-w', is_flag=True, help="Watch queues without spawning workers")
@docstring(orchestrator.__doc__)
def main(daemon: bool, watch: bool):
    """Start the ArchiveBox orchestrator process"""
    # Thin CLI wrapper: forward parsed flags and exit with orchestrator()'s return code.
    sys.exit(orchestrator(daemon=daemon, watch=watch))
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
98
archivebox/cli/archivebox_remove.py
Normal file
98
archivebox/cli/archivebox_remove.py
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
__package__ = 'archivebox.cli'
|
||||||
|
__command__ = 'archivebox remove'
|
||||||
|
|
||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Iterable
|
||||||
|
|
||||||
|
import rich_click as click
|
||||||
|
|
||||||
|
from django.db.models import QuerySet
|
||||||
|
|
||||||
|
from archivebox.config import DATA_DIR
|
||||||
|
from archivebox.config.django import setup_django
|
||||||
|
from archivebox.misc.util import enforce_types, docstring
|
||||||
|
from archivebox.misc.checks import check_data_folder
|
||||||
|
from archivebox.misc.logging_util import (
|
||||||
|
log_list_started,
|
||||||
|
log_list_finished,
|
||||||
|
log_removal_started,
|
||||||
|
log_removal_finished,
|
||||||
|
TimedProgress,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
def remove(filter_patterns: Iterable[str]=(),
           filter_type: str='exact',
           snapshots: QuerySet | None=None,
           after: float | None=None,
           before: float | None=None,
           yes: bool=False,
           delete: bool=False,
           out_dir: Path=DATA_DIR) -> QuerySet:
    """Remove the specified URLs from the archive.

    Matching Snapshot rows are always deleted from the index (and the search
    index is flushed); their on-disk archive folders are removed only when
    delete=True.  Raises SystemExit(1) when nothing matches the filters.

    NOTE(review): `out_dir` is accepted but not referenced in this body, and
    `yes` is only forwarded to log_removal_started (presumably the
    confirmation prompt lives there — confirm).
    """

    setup_django()
    check_data_folder()

    from archivebox.cli.archivebox_search import get_snapshots

    log_list_started(filter_patterns, filter_type)
    timer = TimedProgress(360, prefix='      ')
    try:
        # Resolve the set of Snapshots to operate on from patterns/date bounds
        snapshots = get_snapshots(
            snapshots=snapshots,
            filter_patterns=list(filter_patterns) if filter_patterns else None,
            filter_type=filter_type,
            after=after,
            before=before,
        )
    finally:
        timer.end()

    if not snapshots.exists():
        # Nothing matched: report 0 removed and abort with a non-zero exit
        log_removal_finished(0, 0)
        raise SystemExit(1)

    log_list_finished(snapshots)
    log_removal_started(snapshots, yes=yes, delete=delete)

    timer = TimedProgress(360, prefix='      ')
    try:
        # Only the on-disk folders are touched here; DB rows are deleted below
        for snapshot in snapshots:
            if delete:
                shutil.rmtree(snapshot.output_dir, ignore_errors=True)
    finally:
        timer.end()

    # Count before .delete() empties the queryset
    to_remove = snapshots.count()

    from archivebox.search import flush_search_index
    from archivebox.core.models import Snapshot

    flush_search_index(snapshots=snapshots)
    snapshots.delete()
    all_snapshots = Snapshot.objects.all()
    log_removal_finished(all_snapshots.count(), to_remove)

    return all_snapshots
||||||
|
|
||||||
|
|
||||||
|
@click.command()
@click.option('--yes', is_flag=True, help='Remove links instantly without prompting to confirm')
@click.option('--delete', is_flag=True, help='Delete the archived content and metadata folder in addition to removing from index')
@click.option('--before', type=float, help='Remove only URLs bookmarked before timestamp')
@click.option('--after', type=float, help='Remove only URLs bookmarked after timestamp')
@click.option('--filter-type', '-f', type=click.Choice(('exact', 'substring', 'domain', 'regex', 'tag')), default='exact', help='Type of pattern matching to use when filtering URLs')
@click.argument('filter_patterns', nargs=-1)
@docstring(remove.__doc__)
def main(**kwargs):
    """Remove the specified URLs from the archive"""
    # Thin CLI wrapper: click option names line up with remove()'s keyword args.
    remove(**kwargs)
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
131
archivebox/cli/archivebox_search.py
Normal file
131
archivebox/cli/archivebox_search.py
Normal file
@@ -0,0 +1,131 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
|
__package__ = 'archivebox.cli'
|
||||||
|
__command__ = 'archivebox search'
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional, List, Any
|
||||||
|
|
||||||
|
import rich_click as click
|
||||||
|
from rich import print
|
||||||
|
|
||||||
|
from django.db.models import QuerySet
|
||||||
|
|
||||||
|
from archivebox.config import DATA_DIR
|
||||||
|
from archivebox.misc.logging import stderr
|
||||||
|
from archivebox.misc.util import enforce_types, docstring
|
||||||
|
|
||||||
|
# Filter types for URL matching.
# Maps a filter-type name to a callable that turns a pattern string into
# Django ORM lookup kwargs for filtering Snapshot querysets.
LINK_FILTERS = {
    'exact': lambda pattern: {'url': pattern},
    'substring': lambda pattern: {'url__icontains': pattern},
    'regex': lambda pattern: {'url__iregex': pattern},
    # NOTE(review): only matches http:// URLs — https:// URLs on the same
    # domain will NOT match this filter; confirm whether that is intended.
    'domain': lambda pattern: {'url__istartswith': f'http://{pattern}'},
    'tag': lambda pattern: {'tags__name': pattern},
    'timestamp': lambda pattern: {'timestamp': pattern},
}

# Snapshot status values accepted by the CLI's --status option.
STATUS_CHOICES = ['indexed', 'archived', 'unarchived']
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
def get_snapshots(snapshots: Optional[QuerySet]=None,
                  filter_patterns: Optional[List[str]]=None,
                  filter_type: str='substring',
                  after: Optional[float]=None,
                  before: Optional[float]=None,
                  out_dir: Path=DATA_DIR) -> QuerySet:
    """Filter and return Snapshots matching the given criteria.

    Args:
        snapshots: base queryset to start from (defaults to all Snapshots).
        filter_patterns: URL/tag/timestamp patterns to match.
        filter_type: how to interpret the patterns (see LINK_FILTERS keys).
        after/before: timestamp bounds (inclusive lower, exclusive upper).
        out_dir: accepted for API symmetry; not referenced in this body.
    """
    from archivebox.core.models import Snapshot

    if snapshots:
        result = snapshots
    else:
        result = Snapshot.objects.all()

    if after is not None:
        result = result.filter(timestamp__gte=after)
    if before is not None:
        result = result.filter(timestamp__lt=before)
    if filter_patterns:
        # NOTE(review): this restarts from Snapshot.objects, silently
        # discarding the passed-in `snapshots` queryset and the after/before
        # filters applied above — confirm whether patterns are meant to be
        # combined with those filters instead of replacing them.
        result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type)

    # `if not result` evaluates the queryset (one COUNT/len query) just to warn
    if not result:
        stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')

    return result
||||||
|
|
||||||
|
|
||||||
|
@enforce_types
|
||||||
|
def search(filter_patterns: list[str] | None=None,
|
||||||
|
filter_type: str='substring',
|
||||||
|
status: str='indexed',
|
||||||
|
before: float | None=None,
|
||||||
|
after: float | None=None,
|
||||||
|
sort: str | None=None,
|
||||||
|
json: bool=False,
|
||||||
|
html: bool=False,
|
||||||
|
csv: str | None=None,
|
||||||
|
with_headers: bool=False):
|
||||||
|
"""List, filter, and export information about archive entries"""
|
||||||
|
from archivebox.core.models import Snapshot
|
||||||
|
|
||||||
|
if with_headers and not (json or html or csv):
|
||||||
|
stderr('[X] --with-headers requires --json, --html or --csv\n', color='red')
|
||||||
|
raise SystemExit(2)
|
||||||
|
|
||||||
|
# Query DB directly - no filesystem scanning
|
||||||
|
snapshots = get_snapshots(
|
||||||
|
filter_patterns=list(filter_patterns) if filter_patterns else None,
|
||||||
|
filter_type=filter_type,
|
||||||
|
before=before,
|
||||||
|
after=after,
|
||||||
|
)
|
||||||
|
|
||||||
|
# Apply status filter
|
||||||
|
if status == 'archived':
|
||||||
|
snapshots = snapshots.filter(downloaded_at__isnull=False)
|
||||||
|
elif status == 'unarchived':
|
||||||
|
snapshots = snapshots.filter(downloaded_at__isnull=True)
|
||||||
|
# 'indexed' = all snapshots (no filter)
|
||||||
|
|
||||||
|
if sort:
|
||||||
|
snapshots = snapshots.order_by(sort)
|
||||||
|
|
||||||
|
# Export to requested format
|
||||||
|
if json:
|
||||||
|
output = snapshots.to_json(with_headers=with_headers)
|
||||||
|
elif html:
|
||||||
|
output = snapshots.to_html(with_headers=with_headers)
|
||||||
|
elif csv:
|
||||||
|
output = snapshots.to_csv(cols=csv.split(','), header=with_headers)
|
||||||
|
else:
|
||||||
|
from archivebox.misc.logging_util import printable_folders
|
||||||
|
# Convert to dict for printable_folders
|
||||||
|
folders = {s.output_dir: s for s in snapshots}
|
||||||
|
output = printable_folders(folders, with_headers)
|
||||||
|
|
||||||
|
print(output)
|
||||||
|
return output
|
||||||
|
|
||||||
|
|
||||||
|
@click.command()
|
||||||
|
@click.option('--filter-type', '-f', type=click.Choice(['search', *LINK_FILTERS.keys()]), default='substring', help='Pattern matching type for filtering URLs')
|
||||||
|
@click.option('--status', '-s', type=click.Choice(STATUS_CHOICES), default='indexed', help='List snapshots with the given status')
|
||||||
|
@click.option('--before', '-b', type=float, help='List snapshots bookmarked before the given UNIX timestamp')
|
||||||
|
@click.option('--after', '-a', type=float, help='List snapshots bookmarked after the given UNIX timestamp')
|
||||||
|
@click.option('--sort', '-o', type=str, help='Field to sort by, e.g. url, created_at, bookmarked_at, downloaded_at')
|
||||||
|
@click.option('--json', '-J', is_flag=True, help='Print output in JSON format')
|
||||||
|
@click.option('--html', '-M', is_flag=True, help='Print output in HTML format (suitable for viewing statically without a server)')
|
||||||
|
@click.option('--csv', '-C', type=str, help='Print output as CSV with the provided fields, e.g.: created_at,url,title')
|
||||||
|
@click.option('--with-headers', '-H', is_flag=True, help='Include extra CSV/HTML headers in the output')
|
||||||
|
@click.help_option('--help', '-h')
|
||||||
|
@click.argument('filter_patterns', nargs=-1)
|
||||||
|
@docstring(search.__doc__)
|
||||||
|
def main(**kwargs):
|
||||||
|
return search(**kwargs)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
@@ -1,6 +1,6 @@
|
|||||||
__package__ = 'archivebox.core'
|
__package__ = 'archivebox.core'
|
||||||
|
|
||||||
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING, Iterator, Set
|
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
|
||||||
from archivebox.uuid_compat import uuid7
|
from archivebox.uuid_compat import uuid7
|
||||||
from datetime import datetime, timedelta
|
from datetime import datetime, timedelta
|
||||||
from django_stubs_ext.db.models import TypedModelMeta
|
from django_stubs_ext.db.models import TypedModelMeta
|
||||||
@@ -41,8 +41,6 @@ from archivebox.machine.models import NetworkInterface, Binary
|
|||||||
|
|
||||||
|
|
||||||
class Tag(ModelWithSerializers):
|
class Tag(ModelWithSerializers):
|
||||||
JSONL_TYPE = 'Tag'
|
|
||||||
|
|
||||||
# Keep AutoField for compatibility with main branch migrations
|
# Keep AutoField for compatibility with main branch migrations
|
||||||
# Don't use UUIDField here - requires complex FK transformation
|
# Don't use UUIDField here - requires complex FK transformation
|
||||||
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
|
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
|
||||||
@@ -93,66 +91,26 @@ class Tag(ModelWithSerializers):
|
|||||||
def api_url(self) -> str:
|
def api_url(self) -> str:
|
||||||
return reverse_lazy('api-1:get_tag', args=[self.id])
|
return reverse_lazy('api-1:get_tag', args=[self.id])
|
||||||
|
|
||||||
def to_json(self) -> dict:
|
def to_jsonl(self) -> dict:
|
||||||
"""
|
"""
|
||||||
Convert Tag model instance to a JSON-serializable dict.
|
Convert Tag model instance to a JSONL record.
|
||||||
"""
|
"""
|
||||||
from archivebox.config import VERSION
|
from archivebox.config import VERSION
|
||||||
return {
|
return {
|
||||||
'type': self.JSONL_TYPE,
|
'type': 'Tag',
|
||||||
'schema_version': VERSION,
|
'schema_version': VERSION,
|
||||||
'id': str(self.id),
|
'id': str(self.id),
|
||||||
'name': self.name,
|
'name': self.name,
|
||||||
'slug': self.slug,
|
'slug': self.slug,
|
||||||
}
|
}
|
||||||
|
|
||||||
def to_jsonl(self, seen: Set[tuple] = None, **kwargs) -> Iterator[dict]:
|
|
||||||
"""
|
|
||||||
Yield this Tag as a JSON record.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
seen: Set of (type, id) tuples already emitted (for deduplication)
|
|
||||||
**kwargs: Passed to children (none for Tag, leaf node)
|
|
||||||
|
|
||||||
Yields:
|
|
||||||
dict: JSON-serializable record for this tag
|
|
||||||
"""
|
|
||||||
if seen is not None:
|
|
||||||
key = (self.JSONL_TYPE, str(self.id))
|
|
||||||
if key in seen:
|
|
||||||
return
|
|
||||||
seen.add(key)
|
|
||||||
yield self.to_json()
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['Tag']:
|
|
||||||
"""
|
|
||||||
Create/update Tags from an iterable of JSONL records.
|
|
||||||
Filters to only records with type='Tag'.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
records: Iterable of dicts (JSONL records)
|
|
||||||
overrides: Optional dict with 'snapshot' to auto-attach tags
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of Tag instances (skips None results)
|
|
||||||
"""
|
|
||||||
results = []
|
|
||||||
for record in records:
|
|
||||||
record_type = record.get('type', cls.JSONL_TYPE)
|
|
||||||
if record_type == cls.JSONL_TYPE:
|
|
||||||
instance = cls.from_json(record, overrides=overrides)
|
|
||||||
if instance:
|
|
||||||
results.append(instance)
|
|
||||||
return results
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'Tag | None':
|
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
|
||||||
"""
|
"""
|
||||||
Create/update a single Tag from a JSON record dict.
|
Create/update Tag from JSONL record.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
record: Dict with 'name' field
|
record: JSONL record with 'name' field
|
||||||
overrides: Optional dict with 'snapshot' to auto-attach tag
|
overrides: Optional dict with 'snapshot' to auto-attach tag
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@@ -331,8 +289,6 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
|
|||||||
|
|
||||||
|
|
||||||
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
||||||
JSONL_TYPE = 'Snapshot'
|
|
||||||
|
|
||||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||||
modified_at = models.DateTimeField(auto_now=True)
|
modified_at = models.DateTimeField(auto_now=True)
|
||||||
@@ -469,7 +425,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
|
|
||||||
def _fs_next_version(self, version: str) -> str:
|
def _fs_next_version(self, version: str) -> str:
|
||||||
"""Get next version in migration chain (0.7/0.8 had same layout, only 0.8→0.9 migration needed)"""
|
"""Get next version in migration chain (0.7/0.8 had same layout, only 0.8→0.9 migration needed)"""
|
||||||
# Treat 0.7.0 and 0.8.0 as equivalent (both used data/archive/{timestamp})
|
# Treat 0.7.0 and 0.8.0 as equivalent (both used archive/{timestamp})
|
||||||
if version in ('0.7.0', '0.8.0'):
|
if version in ('0.7.0', '0.8.0'):
|
||||||
return '0.9.0'
|
return '0.9.0'
|
||||||
return self._fs_current_version()
|
return self._fs_current_version()
|
||||||
@@ -478,8 +434,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
"""
|
"""
|
||||||
Migrate from flat to nested structure.
|
Migrate from flat to nested structure.
|
||||||
|
|
||||||
0.8.x: data/archive/{timestamp}/{extractor}/
|
0.8.x: archive/{timestamp}/
|
||||||
0.9.x: data/users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/{plugin}/
|
0.9.x: users/{user}/snapshots/YYYYMMDD/{domain}/{uuid}/
|
||||||
|
|
||||||
Transaction handling:
|
Transaction handling:
|
||||||
1. Copy files INSIDE transaction
|
1. Copy files INSIDE transaction
|
||||||
@@ -597,8 +553,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
Calculate storage path for specific filesystem version.
|
Calculate storage path for specific filesystem version.
|
||||||
Centralizes path logic so it's reusable.
|
Centralizes path logic so it's reusable.
|
||||||
|
|
||||||
0.7.x/0.8.x: data/archive/{timestamp}
|
0.7.x/0.8.x: archive/{timestamp}
|
||||||
0.9.x: data/users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/
|
0.9.x: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/
|
||||||
"""
|
"""
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
|
||||||
@@ -1012,18 +968,38 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
|
|
||||||
Each line is a JSON record with a 'type' field:
|
Each line is a JSON record with a 'type' field:
|
||||||
- Snapshot: snapshot metadata (crawl_id, url, tags, etc.)
|
- Snapshot: snapshot metadata (crawl_id, url, tags, etc.)
|
||||||
|
- ArchiveResult: extractor results (plugin, status, output, etc.)
|
||||||
- Binary: binary info used for the extraction
|
- Binary: binary info used for the extraction
|
||||||
- Process: process execution details (cmd, exit_code, timing, etc.)
|
- Process: process execution details (cmd, exit_code, timing, etc.)
|
||||||
- ArchiveResult: extractor results (plugin, status, output, etc.)
|
|
||||||
"""
|
"""
|
||||||
import json
|
import json
|
||||||
|
|
||||||
index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
|
index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
|
||||||
index_path.parent.mkdir(parents=True, exist_ok=True)
|
index_path.parent.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
|
# Track unique binaries and processes to avoid duplicates
|
||||||
|
binaries_seen = set()
|
||||||
|
processes_seen = set()
|
||||||
|
|
||||||
with open(index_path, 'w') as f:
|
with open(index_path, 'w') as f:
|
||||||
for record in self.to_jsonl():
|
# Write Snapshot record first (to_jsonl includes crawl_id, fs_version)
|
||||||
f.write(json.dumps(record) + '\n')
|
f.write(json.dumps(self.to_jsonl()) + '\n')
|
||||||
|
|
||||||
|
# Write ArchiveResult records with their associated Binary and Process
|
||||||
|
# Use select_related to optimize queries
|
||||||
|
for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'):
|
||||||
|
# Write Binary record if not already written
|
||||||
|
if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen:
|
||||||
|
binaries_seen.add(ar.process.binary_id)
|
||||||
|
f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n')
|
||||||
|
|
||||||
|
# Write Process record if not already written
|
||||||
|
if ar.process and ar.process_id not in processes_seen:
|
||||||
|
processes_seen.add(ar.process_id)
|
||||||
|
f.write(json.dumps(ar.process.to_jsonl()) + '\n')
|
||||||
|
|
||||||
|
# Write ArchiveResult record
|
||||||
|
f.write(json.dumps(ar.to_jsonl()) + '\n')
|
||||||
|
|
||||||
def read_index_jsonl(self) -> dict:
|
def read_index_jsonl(self) -> dict:
|
||||||
"""
|
"""
|
||||||
@@ -1407,22 +1383,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
Clean up background ArchiveResult hooks.
|
Clean up background ArchiveResult hooks.
|
||||||
|
|
||||||
Called by the state machine when entering the 'sealed' state.
|
Called by the state machine when entering the 'sealed' state.
|
||||||
Gracefully terminates background hooks using plugin-specific timeouts:
|
Kills any background hooks and finalizes their ArchiveResults.
|
||||||
1. Send SIGTERM to all background hook processes
|
|
||||||
2. Wait up to each hook's plugin-specific timeout
|
|
||||||
3. Send SIGKILL to any hooks still running after timeout
|
|
||||||
"""
|
"""
|
||||||
from archivebox.hooks import graceful_terminate_background_hooks
|
from archivebox.misc.process_utils import safe_kill_process
|
||||||
from archivebox.config.configset import get_config
|
|
||||||
|
|
||||||
|
# Kill any background ArchiveResult hooks
|
||||||
if not self.OUTPUT_DIR.exists():
|
if not self.OUTPUT_DIR.exists():
|
||||||
return
|
return
|
||||||
|
|
||||||
# Get merged config for plugin-specific timeout lookup
|
# Find all .pid files in this snapshot's output directory
|
||||||
config = get_config(crawl=self.crawl, snapshot=self)
|
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
||||||
|
cmd_file = pid_file.parent / 'cmd.sh'
|
||||||
# Gracefully terminate all background hooks with plugin-specific timeouts
|
safe_kill_process(pid_file, cmd_file)
|
||||||
graceful_terminate_background_hooks(self.OUTPUT_DIR, config)
|
|
||||||
|
|
||||||
# Update all STARTED ArchiveResults from filesystem
|
# Update all STARTED ArchiveResults from filesystem
|
||||||
results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
|
results = self.archiveresult_set.filter(status=ArchiveResult.StatusChoices.STARTED)
|
||||||
@@ -1435,32 +1407,35 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
|
|
||||||
Used by state machine to determine if snapshot is finished.
|
Used by state machine to determine if snapshot is finished.
|
||||||
"""
|
"""
|
||||||
from archivebox.hooks import process_is_alive
|
from archivebox.misc.process_utils import validate_pid_file
|
||||||
|
|
||||||
if not self.OUTPUT_DIR.exists():
|
if not self.OUTPUT_DIR.exists():
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# Check all .pid files in the snapshot directory (hook-specific names)
|
for plugin_dir in self.OUTPUT_DIR.iterdir():
|
||||||
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
if not plugin_dir.is_dir():
|
||||||
if process_is_alive(pid_file):
|
continue
|
||||||
|
pid_file = plugin_dir / 'hook.pid'
|
||||||
|
cmd_file = plugin_dir / 'cmd.sh'
|
||||||
|
if validate_pid_file(pid_file, cmd_file):
|
||||||
return True
|
return True
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def to_json(self) -> dict:
|
def to_jsonl(self) -> dict:
|
||||||
"""
|
"""
|
||||||
Convert Snapshot model instance to a JSON-serializable dict.
|
Convert Snapshot model instance to a JSONL record.
|
||||||
Includes all fields needed to fully reconstruct/identify this snapshot.
|
Includes all fields needed to fully reconstruct/identify this snapshot.
|
||||||
"""
|
"""
|
||||||
from archivebox.config import VERSION
|
from archivebox.config import VERSION
|
||||||
return {
|
return {
|
||||||
'type': self.JSONL_TYPE,
|
'type': 'Snapshot',
|
||||||
'schema_version': VERSION,
|
'schema_version': VERSION,
|
||||||
'id': str(self.id),
|
'id': str(self.id),
|
||||||
'crawl_id': str(self.crawl_id),
|
'crawl_id': str(self.crawl_id),
|
||||||
'url': self.url,
|
'url': self.url,
|
||||||
'title': self.title,
|
'title': self.title,
|
||||||
'tags_str': self.tags_str(),
|
'tags': self.tags_str(),
|
||||||
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
|
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
|
||||||
'created_at': self.created_at.isoformat() if self.created_at else None,
|
'created_at': self.created_at.isoformat() if self.created_at else None,
|
||||||
'timestamp': self.timestamp,
|
'timestamp': self.timestamp,
|
||||||
@@ -1469,68 +1444,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
'fs_version': self.fs_version,
|
'fs_version': self.fs_version,
|
||||||
}
|
}
|
||||||
|
|
||||||
def to_jsonl(self, seen: Set[tuple] = None, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]:
|
|
||||||
"""
|
|
||||||
Yield this Snapshot and optionally related objects as JSON records.
|
|
||||||
|
|
||||||
Uses select_related for efficient querying. Deduplicates automatically.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
seen: Set of (type, id) tuples already emitted (for deduplication)
|
|
||||||
archiveresult: Include related ArchiveResults (default: True)
|
|
||||||
process: Include Process for each ArchiveResult (default: True)
|
|
||||||
binary: Include Binary for each Process (default: True)
|
|
||||||
machine: Include Machine for each Process (default: False)
|
|
||||||
iface: Include NetworkInterface for each Process (default: False)
|
|
||||||
**kwargs: Additional options passed to children
|
|
||||||
|
|
||||||
Yields:
|
|
||||||
dict: JSON-serializable records
|
|
||||||
"""
|
|
||||||
if seen is None:
|
|
||||||
seen = set()
|
|
||||||
|
|
||||||
key = (self.JSONL_TYPE, str(self.id))
|
|
||||||
if key in seen:
|
|
||||||
return
|
|
||||||
seen.add(key)
|
|
||||||
|
|
||||||
yield self.to_json()
|
|
||||||
|
|
||||||
if archiveresult:
|
|
||||||
# Use select_related to optimize queries
|
|
||||||
for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'):
|
|
||||||
yield from ar.to_jsonl(seen=seen, process=process, binary=binary, machine=machine, iface=iface, **kwargs)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_jsonl(cls, records, overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> list['Snapshot']:
|
|
||||||
"""
|
|
||||||
Create/update Snapshots from an iterable of JSONL records.
|
|
||||||
Filters to only records with type='Snapshot' (or no type).
|
|
||||||
|
|
||||||
Args:
|
|
||||||
records: Iterable of dicts (JSONL records)
|
|
||||||
overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'
|
|
||||||
queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of Snapshot instances (skips None results)
|
|
||||||
"""
|
|
||||||
results = []
|
|
||||||
for record in records:
|
|
||||||
record_type = record.get('type', cls.JSONL_TYPE)
|
|
||||||
if record_type == cls.JSONL_TYPE:
|
|
||||||
instance = cls.from_json(record, overrides=overrides, queue_for_extraction=queue_for_extraction)
|
|
||||||
if instance:
|
|
||||||
results.append(instance)
|
|
||||||
return results
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> 'Snapshot | None':
|
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
|
||||||
"""
|
"""
|
||||||
Create/update a single Snapshot from a JSON record dict.
|
Create/update Snapshot from JSONL record or dict.
|
||||||
|
|
||||||
Handles:
|
Unified method that handles:
|
||||||
- ID-based patching: {"id": "...", "title": "new title"}
|
- ID-based patching: {"id": "...", "title": "new title"}
|
||||||
- URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
|
- URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
|
||||||
- Auto-creates Crawl if not provided
|
- Auto-creates Crawl if not provided
|
||||||
@@ -2137,8 +2056,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
result['canonical'] = self.canonical_outputs()
|
result['canonical'] = self.canonical_outputs()
|
||||||
return result
|
return result
|
||||||
|
|
||||||
def to_json_str(self, indent: int = 4) -> str:
|
def to_json(self, indent: int = 4) -> str:
|
||||||
"""Convert to JSON string for file output."""
|
"""Convert to JSON string"""
|
||||||
return to_json(self.to_dict(extended=True), indent=indent)
|
return to_json(self.to_dict(extended=True), indent=indent)
|
||||||
|
|
||||||
def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
|
def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
|
||||||
@@ -2286,8 +2205,6 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
|||||||
|
|
||||||
|
|
||||||
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
|
||||||
JSONL_TYPE = 'ArchiveResult'
|
|
||||||
|
|
||||||
class StatusChoices(models.TextChoices):
|
class StatusChoices(models.TextChoices):
|
||||||
QUEUED = 'queued', 'Queued'
|
QUEUED = 'queued', 'Queued'
|
||||||
STARTED = 'started', 'Started'
|
STARTED = 'started', 'Started'
|
||||||
@@ -2321,7 +2238,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
process = models.OneToOneField(
|
process = models.OneToOneField(
|
||||||
'machine.Process',
|
'machine.Process',
|
||||||
on_delete=models.PROTECT,
|
on_delete=models.PROTECT,
|
||||||
null=False,
|
null=False, # Required after migration 4
|
||||||
related_name='archiveresult',
|
related_name='archiveresult',
|
||||||
help_text='Process execution details for this archive result'
|
help_text='Process execution details for this archive result'
|
||||||
)
|
)
|
||||||
@@ -2359,13 +2276,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
|
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
|
||||||
return self.snapshot.crawl.created_by
|
return self.snapshot.crawl.created_by
|
||||||
|
|
||||||
def to_json(self) -> dict:
|
def to_jsonl(self) -> dict:
|
||||||
"""
|
"""
|
||||||
Convert ArchiveResult model instance to a JSON-serializable dict.
|
Convert ArchiveResult model instance to a JSONL record.
|
||||||
"""
|
"""
|
||||||
from archivebox.config import VERSION
|
from archivebox.config import VERSION
|
||||||
record = {
|
record = {
|
||||||
'type': self.JSONL_TYPE,
|
'type': 'ArchiveResult',
|
||||||
'schema_version': VERSION,
|
'schema_version': VERSION,
|
||||||
'id': str(self.id),
|
'id': str(self.id),
|
||||||
'snapshot_id': str(self.snapshot_id),
|
'snapshot_id': str(self.snapshot_id),
|
||||||
@@ -2393,121 +2310,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
record['process_id'] = str(self.process_id)
|
record['process_id'] = str(self.process_id)
|
||||||
return record
|
return record
|
||||||
|
|
||||||
def to_jsonl(self, seen: Set[tuple] = None, process: bool = True, **kwargs) -> Iterator[dict]:
|
|
||||||
"""
|
|
||||||
Yield this ArchiveResult and optionally related objects as JSON records.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
seen: Set of (type, id) tuples already emitted (for deduplication)
|
|
||||||
process: Include related Process and its children (default: True)
|
|
||||||
**kwargs: Passed to Process.to_jsonl() (e.g., binary=True, machine=False)
|
|
||||||
|
|
||||||
Yields:
|
|
||||||
dict: JSON-serializable records
|
|
||||||
"""
|
|
||||||
if seen is None:
|
|
||||||
seen = set()
|
|
||||||
|
|
||||||
key = (self.JSONL_TYPE, str(self.id))
|
|
||||||
if key in seen:
|
|
||||||
return
|
|
||||||
seen.add(key)
|
|
||||||
|
|
||||||
yield self.to_json()
|
|
||||||
|
|
||||||
if process and self.process:
|
|
||||||
yield from self.process.to_jsonl(seen=seen, **kwargs)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['ArchiveResult']:
|
|
||||||
"""
|
|
||||||
Create/update ArchiveResults from an iterable of JSONL records.
|
|
||||||
Filters to only records with type='ArchiveResult'.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
records: Iterable of dicts (JSONL records)
|
|
||||||
overrides: Dict of field overrides
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of ArchiveResult instances (skips None results)
|
|
||||||
"""
|
|
||||||
results = []
|
|
||||||
for record in records:
|
|
||||||
record_type = record.get('type', cls.JSONL_TYPE)
|
|
||||||
if record_type == cls.JSONL_TYPE:
|
|
||||||
instance = cls.from_json(record, overrides=overrides)
|
|
||||||
if instance:
|
|
||||||
results.append(instance)
|
|
||||||
return results
|
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'ArchiveResult | None':
|
|
||||||
"""
|
|
||||||
Create or update a single ArchiveResult from a JSON record dict.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
record: Dict with 'snapshot_id' and 'plugin' (required for create),
|
|
||||||
or 'id' (for update)
|
|
||||||
overrides: Dict of field overrides (e.g., config overrides)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
ArchiveResult instance or None if invalid
|
|
||||||
"""
|
|
||||||
from django.utils import timezone
|
|
||||||
|
|
||||||
overrides = overrides or {}
|
|
||||||
|
|
||||||
# If 'id' is provided, lookup and update existing
|
|
||||||
result_id = record.get('id')
|
|
||||||
if result_id:
|
|
||||||
try:
|
|
||||||
result = ArchiveResult.objects.get(id=result_id)
|
|
||||||
# Update fields from record
|
|
||||||
if record.get('status'):
|
|
||||||
result.status = record['status']
|
|
||||||
result.retry_at = timezone.now()
|
|
||||||
result.save()
|
|
||||||
return result
|
|
||||||
except ArchiveResult.DoesNotExist:
|
|
||||||
pass # Fall through to create
|
|
||||||
|
|
||||||
# Required fields for creation
|
|
||||||
snapshot_id = record.get('snapshot_id')
|
|
||||||
plugin = record.get('plugin')
|
|
||||||
|
|
||||||
if not snapshot_id or not plugin:
|
|
||||||
return None
|
|
||||||
|
|
||||||
try:
|
|
||||||
snapshot = Snapshot.objects.get(id=snapshot_id)
|
|
||||||
except Snapshot.DoesNotExist:
|
|
||||||
return None
|
|
||||||
|
|
||||||
# Check if result already exists for this snapshot+plugin
|
|
||||||
existing = ArchiveResult.objects.filter(
|
|
||||||
snapshot=snapshot,
|
|
||||||
plugin=plugin,
|
|
||||||
).first()
|
|
||||||
|
|
||||||
if existing:
|
|
||||||
# Update existing result if status provided
|
|
||||||
if record.get('status'):
|
|
||||||
existing.status = record['status']
|
|
||||||
existing.retry_at = timezone.now()
|
|
||||||
existing.save()
|
|
||||||
return existing
|
|
||||||
|
|
||||||
# Create new ArchiveResult
|
|
||||||
result = ArchiveResult(
|
|
||||||
snapshot=snapshot,
|
|
||||||
plugin=plugin,
|
|
||||||
status=record.get('status', ArchiveResult.StatusChoices.QUEUED),
|
|
||||||
retry_at=timezone.now(),
|
|
||||||
hook_name=record.get('hook_name', ''),
|
|
||||||
)
|
|
||||||
result.save()
|
|
||||||
return result
|
|
||||||
|
|
||||||
def save(self, *args, **kwargs):
|
def save(self, *args, **kwargs):
|
||||||
is_new = self._state.adding
|
is_new = self._state.adding
|
||||||
|
|
||||||
@@ -2795,26 +2597,9 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
self.save()
|
self.save()
|
||||||
return
|
return
|
||||||
|
|
||||||
# Derive hook basename for hook-specific filenames
|
# Read and parse JSONL output from stdout.log
|
||||||
# e.g., "on_Snapshot__50_wget.py" -> "on_Snapshot__50_wget"
|
stdout_file = plugin_dir / 'stdout.log'
|
||||||
hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook'
|
|
||||||
|
|
||||||
# Read and parse JSONL output from hook-specific stdout log
|
|
||||||
stdout_file = plugin_dir / f'{hook_basename}.stdout.log'
|
|
||||||
stderr_file = plugin_dir / f'{hook_basename}.stderr.log'
|
|
||||||
returncode_file = plugin_dir / f'{hook_basename}.returncode'
|
|
||||||
|
|
||||||
stdout = stdout_file.read_text() if stdout_file.exists() else ''
|
stdout = stdout_file.read_text() if stdout_file.exists() else ''
|
||||||
stderr = stderr_file.read_text() if stderr_file.exists() else ''
|
|
||||||
|
|
||||||
# Read returncode from file (written by graceful_terminate_background_hooks)
|
|
||||||
returncode = None
|
|
||||||
if returncode_file.exists():
|
|
||||||
try:
|
|
||||||
rc_text = returncode_file.read_text().strip()
|
|
||||||
returncode = int(rc_text) if rc_text else None
|
|
||||||
except (ValueError, OSError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
records = []
|
records = []
|
||||||
for line in stdout.splitlines():
|
for line in stdout.splitlines():
|
||||||
@@ -2849,43 +2634,12 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
self._set_binary_from_cmd(hook_data['cmd'])
|
self._set_binary_from_cmd(hook_data['cmd'])
|
||||||
# Note: cmd_version is derived from binary.version, not stored on Process
|
# Note: cmd_version is derived from binary.version, not stored on Process
|
||||||
else:
|
else:
|
||||||
# No ArchiveResult JSONL record - determine status from returncode
|
# No ArchiveResult record = failed
|
||||||
if returncode is not None:
|
self.status = self.StatusChoices.FAILED
|
||||||
if returncode == 0:
|
self.output_str = 'Hook did not output ArchiveResult record'
|
||||||
self.status = self.StatusChoices.SUCCEEDED
|
|
||||||
self.output_str = 'Hook completed successfully (no JSONL output)'
|
|
||||||
elif returncode < 0:
|
|
||||||
# Negative = killed by signal (e.g., -9 for SIGKILL, -15 for SIGTERM)
|
|
||||||
sig_num = abs(returncode)
|
|
||||||
sig_name = {9: 'SIGKILL', 15: 'SIGTERM'}.get(sig_num, f'signal {sig_num}')
|
|
||||||
self.status = self.StatusChoices.FAILED
|
|
||||||
self.output_str = f'Hook killed by {sig_name}'
|
|
||||||
if stderr:
|
|
||||||
self.output_str += f'\n\nstderr:\n{stderr[:2000]}'
|
|
||||||
else:
|
|
||||||
self.status = self.StatusChoices.FAILED
|
|
||||||
self.output_str = f'Hook failed with exit code {returncode}'
|
|
||||||
if stderr:
|
|
||||||
self.output_str += f'\n\nstderr:\n{stderr[:2000]}'
|
|
||||||
else:
|
|
||||||
# No returncode file and no JSONL = failed
|
|
||||||
self.status = self.StatusChoices.FAILED
|
|
||||||
self.output_str = 'Hook did not output ArchiveResult record'
|
|
||||||
if stderr:
|
|
||||||
self.output_str += f'\n\nstderr:\n{stderr[:2000]}'
|
|
||||||
|
|
||||||
# Walk filesystem and populate output_files, output_size, output_mimetypes
|
# Walk filesystem and populate output_files, output_size, output_mimetypes
|
||||||
# Exclude hook output files (hook-specific names like on_Snapshot__50_wget.stdout.log)
|
exclude_names = {'stdout.log', 'stderr.log', 'hook.pid', 'listener.pid'}
|
||||||
def is_hook_output_file(name: str) -> bool:
|
|
||||||
"""Check if a file is a hook output file that should be excluded."""
|
|
||||||
return (
|
|
||||||
name.endswith('.stdout.log') or
|
|
||||||
name.endswith('.stderr.log') or
|
|
||||||
name.endswith('.pid') or
|
|
||||||
name.endswith('.returncode') or
|
|
||||||
(name.endswith('.sh') and name.startswith('on_'))
|
|
||||||
)
|
|
||||||
|
|
||||||
mime_sizes = defaultdict(int)
|
mime_sizes = defaultdict(int)
|
||||||
total_size = 0
|
total_size = 0
|
||||||
output_files = {}
|
output_files = {}
|
||||||
@@ -2893,7 +2647,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
for file_path in plugin_dir.rglob('*'):
|
for file_path in plugin_dir.rglob('*'):
|
||||||
if not file_path.is_file():
|
if not file_path.is_file():
|
||||||
continue
|
continue
|
||||||
if is_hook_output_file(file_path.name):
|
if file_path.name in exclude_names:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
@@ -2951,10 +2705,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
}
|
}
|
||||||
process_hook_records(filtered_records, overrides=overrides)
|
process_hook_records(filtered_records, overrides=overrides)
|
||||||
|
|
||||||
# Cleanup PID files, returncode files, and empty logs (hook-specific names)
|
# Cleanup PID files and empty logs
|
||||||
pid_file = plugin_dir / f'{hook_basename}.pid'
|
pid_file = plugin_dir / 'hook.pid'
|
||||||
pid_file.unlink(missing_ok=True)
|
pid_file.unlink(missing_ok=True)
|
||||||
returncode_file.unlink(missing_ok=True)
|
stderr_file = plugin_dir / 'stderr.log'
|
||||||
if stdout_file.exists() and stdout_file.stat().st_size == 0:
|
if stdout_file.exists() and stdout_file.stat().st_size == 0:
|
||||||
stdout_file.unlink()
|
stdout_file.unlink()
|
||||||
if stderr_file.exists() and stderr_file.stat().st_size == 0:
|
if stderr_file.exists() and stderr_file.stat().st_size == 0:
|
||||||
@@ -3060,9 +2814,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
plugin_dir = Path(self.pwd) if self.pwd else None
|
plugin_dir = Path(self.pwd) if self.pwd else None
|
||||||
if not plugin_dir:
|
if not plugin_dir:
|
||||||
return False
|
return False
|
||||||
# Use hook-specific pid filename
|
pid_file = plugin_dir / 'hook.pid'
|
||||||
hook_basename = Path(self.hook_name).stem if self.hook_name else 'hook'
|
|
||||||
pid_file = plugin_dir / f'{hook_basename}.pid'
|
|
||||||
return pid_file.exists()
|
return pid_file.exists()
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
__package__ = 'archivebox.crawls'
|
__package__ = 'archivebox.crawls'
|
||||||
|
|
||||||
from typing import TYPE_CHECKING, Iterable, Iterator, Set
|
from typing import TYPE_CHECKING, Iterable
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from archivebox.uuid_compat import uuid7
|
from archivebox.uuid_compat import uuid7
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
@@ -59,8 +59,6 @@ class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
|
|||||||
|
|
||||||
|
|
||||||
class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine):
|
class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWithStateMachine):
|
||||||
JSONL_TYPE = 'Crawl'
|
|
||||||
|
|
||||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||||
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
|
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
|
||||||
@@ -136,13 +134,13 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
def api_url(self) -> str:
|
def api_url(self) -> str:
|
||||||
return reverse_lazy('api-1:get_crawl', args=[self.id])
|
return reverse_lazy('api-1:get_crawl', args=[self.id])
|
||||||
|
|
||||||
def to_json(self) -> dict:
|
def to_jsonl(self) -> dict:
|
||||||
"""
|
"""
|
||||||
Convert Crawl model instance to a JSON-serializable dict.
|
Convert Crawl model instance to a JSONL record.
|
||||||
"""
|
"""
|
||||||
from archivebox.config import VERSION
|
from archivebox.config import VERSION
|
||||||
return {
|
return {
|
||||||
'type': self.JSONL_TYPE,
|
'type': 'Crawl',
|
||||||
'schema_version': VERSION,
|
'schema_version': VERSION,
|
||||||
'id': str(self.id),
|
'id': str(self.id),
|
||||||
'urls': self.urls,
|
'urls': self.urls,
|
||||||
@@ -153,63 +151,10 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
'created_at': self.created_at.isoformat() if self.created_at else None,
|
'created_at': self.created_at.isoformat() if self.created_at else None,
|
||||||
}
|
}
|
||||||
|
|
||||||
def to_jsonl(self, seen: Set[tuple] = None, snapshot: bool = True, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]:
|
|
||||||
"""
|
|
||||||
Yield this Crawl and optionally related objects as JSON records.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
seen: Set of (type, id) tuples already emitted (for deduplication)
|
|
||||||
snapshot: Include related Snapshots (default: True)
|
|
||||||
archiveresult: Include ArchiveResults for each Snapshot (default: True)
|
|
||||||
process: Include Process for each ArchiveResult (default: True)
|
|
||||||
binary: Include Binary for each Process (default: True)
|
|
||||||
machine: Include Machine for each Process (default: False)
|
|
||||||
iface: Include NetworkInterface for each Process (default: False)
|
|
||||||
**kwargs: Additional options passed to children
|
|
||||||
|
|
||||||
Yields:
|
|
||||||
dict: JSON-serializable records
|
|
||||||
"""
|
|
||||||
if seen is None:
|
|
||||||
seen = set()
|
|
||||||
|
|
||||||
key = (self.JSONL_TYPE, str(self.id))
|
|
||||||
if key in seen:
|
|
||||||
return
|
|
||||||
seen.add(key)
|
|
||||||
|
|
||||||
yield self.to_json()
|
|
||||||
|
|
||||||
if snapshot:
|
|
||||||
for snap in self.snapshot_set.all():
|
|
||||||
yield from snap.to_jsonl(seen=seen, archiveresult=archiveresult, process=process, binary=binary, machine=machine, iface=iface, **kwargs)
|
|
||||||
|
|
||||||
@classmethod
|
|
||||||
def from_jsonl(cls, records, overrides: dict = None) -> list['Crawl']:
|
|
||||||
"""
|
|
||||||
Create/update Crawls from an iterable of JSONL records.
|
|
||||||
Filters to only records with type='Crawl' (or no type).
|
|
||||||
|
|
||||||
Args:
|
|
||||||
records: Iterable of dicts (JSONL records)
|
|
||||||
overrides: Dict of field overrides (e.g., created_by_id)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
List of Crawl instances (skips None results)
|
|
||||||
"""
|
|
||||||
results = []
|
|
||||||
for record in records:
|
|
||||||
record_type = record.get('type', cls.JSONL_TYPE)
|
|
||||||
if record_type == cls.JSONL_TYPE:
|
|
||||||
instance = cls.from_json(record, overrides=overrides)
|
|
||||||
if instance:
|
|
||||||
results.append(instance)
|
|
||||||
return results
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def from_json(record: dict, overrides: dict = None) -> 'Crawl | None':
|
def from_jsonl(record: dict, overrides: dict = None):
|
||||||
"""
|
"""
|
||||||
Create or get a single Crawl from a JSON record dict.
|
Create or get a Crawl from a JSONL record.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label'
|
record: Dict with 'urls' (required), optional 'max_depth', 'tags_str', 'label'
|
||||||
@@ -250,45 +195,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
)
|
)
|
||||||
return crawl
|
return crawl
|
||||||
|
|
||||||
@staticmethod
|
|
||||||
def extract_domain_from_url(url: str) -> str:
|
|
||||||
"""
|
|
||||||
Extract domain from URL for path structure.
|
|
||||||
Uses full hostname with sanitized special chars.
|
|
||||||
|
|
||||||
Examples:
|
|
||||||
https://example.com:8080 → example.com_8080
|
|
||||||
https://sub.example.com → sub.example.com
|
|
||||||
file:///path → localhost
|
|
||||||
data:text/html → data
|
|
||||||
"""
|
|
||||||
from urllib.parse import urlparse
|
|
||||||
|
|
||||||
try:
|
|
||||||
parsed = urlparse(url)
|
|
||||||
|
|
||||||
if parsed.scheme in ('http', 'https'):
|
|
||||||
if parsed.port:
|
|
||||||
return f"{parsed.hostname}_{parsed.port}".replace(':', '_')
|
|
||||||
return parsed.hostname or 'unknown'
|
|
||||||
elif parsed.scheme == 'file':
|
|
||||||
return 'localhost'
|
|
||||||
elif parsed.scheme:
|
|
||||||
return parsed.scheme
|
|
||||||
else:
|
|
||||||
return 'unknown'
|
|
||||||
except Exception:
|
|
||||||
return 'unknown'
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def output_dir_parent(self) -> str:
|
def output_dir_parent(self) -> str:
|
||||||
"""Construct parent directory: users/{username}/crawls/{YYYYMMDD}/{domain}"""
|
"""Construct parent directory: users/{user_id}/crawls/{YYYYMMDD}"""
|
||||||
date_str = self.created_at.strftime('%Y%m%d')
|
date_str = self.created_at.strftime('%Y%m%d')
|
||||||
username = self.created_by.username
|
return f'users/{self.created_by_id}/crawls/{date_str}'
|
||||||
# Get domain from first URL
|
|
||||||
first_url = self.get_urls_list()[0] if self.get_urls_list() else ''
|
|
||||||
domain = self.extract_domain_from_url(first_url) if first_url else 'unknown'
|
|
||||||
return f'users/{username}/crawls/{date_str}/{domain}'
|
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def output_dir_name(self) -> str:
|
def output_dir_name(self) -> str:
|
||||||
@@ -506,84 +417,18 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
|
|
||||||
def cleanup(self):
|
def cleanup(self):
|
||||||
"""Clean up background hooks and run on_CrawlEnd hooks."""
|
"""Clean up background hooks and run on_CrawlEnd hooks."""
|
||||||
import os
|
|
||||||
import signal
|
|
||||||
import time
|
|
||||||
from pathlib import Path
|
|
||||||
from archivebox.hooks import run_hook, discover_hooks
|
from archivebox.hooks import run_hook, discover_hooks
|
||||||
from archivebox.misc.process_utils import validate_pid_file
|
from archivebox.misc.process_utils import safe_kill_process
|
||||||
|
|
||||||
def is_process_alive(pid):
|
|
||||||
"""Check if a process exists."""
|
|
||||||
try:
|
|
||||||
os.kill(pid, 0) # Signal 0 checks existence without killing
|
|
||||||
return True
|
|
||||||
except (OSError, ProcessLookupError):
|
|
||||||
return False
|
|
||||||
|
|
||||||
# Kill any background processes by scanning for all .pid files
|
# Kill any background processes by scanning for all .pid files
|
||||||
if self.OUTPUT_DIR.exists():
|
if self.OUTPUT_DIR.exists():
|
||||||
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
|
||||||
# Validate PID before killing to avoid killing unrelated processes
|
|
||||||
cmd_file = pid_file.parent / 'cmd.sh'
|
cmd_file = pid_file.parent / 'cmd.sh'
|
||||||
if not validate_pid_file(pid_file, cmd_file):
|
# safe_kill_process now waits for termination and escalates to SIGKILL
|
||||||
# PID reused by different process or process dead
|
# Returns True only if process is confirmed dead
|
||||||
|
killed = safe_kill_process(pid_file, cmd_file)
|
||||||
|
if killed:
|
||||||
pid_file.unlink(missing_ok=True)
|
pid_file.unlink(missing_ok=True)
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
|
||||||
pid = int(pid_file.read_text().strip())
|
|
||||||
|
|
||||||
# Step 1: Send SIGTERM for graceful shutdown
|
|
||||||
try:
|
|
||||||
# Try to kill process group first (handles detached processes like Chrome)
|
|
||||||
try:
|
|
||||||
os.killpg(pid, signal.SIGTERM)
|
|
||||||
except (OSError, ProcessLookupError):
|
|
||||||
# Fall back to killing just the process
|
|
||||||
os.kill(pid, signal.SIGTERM)
|
|
||||||
except ProcessLookupError:
|
|
||||||
# Already dead
|
|
||||||
pid_file.unlink(missing_ok=True)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Step 2: Wait for graceful shutdown
|
|
||||||
time.sleep(2)
|
|
||||||
|
|
||||||
# Step 3: Check if still alive
|
|
||||||
if not is_process_alive(pid):
|
|
||||||
# Process terminated gracefully
|
|
||||||
pid_file.unlink(missing_ok=True)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Step 4: Process still alive, force kill ENTIRE process group with SIGKILL
|
|
||||||
try:
|
|
||||||
try:
|
|
||||||
# Always kill entire process group with SIGKILL (not individual processes)
|
|
||||||
os.killpg(pid, signal.SIGKILL)
|
|
||||||
except (OSError, ProcessLookupError) as e:
|
|
||||||
# Process group kill failed, try single process as fallback
|
|
||||||
os.kill(pid, signal.SIGKILL)
|
|
||||||
except ProcessLookupError:
|
|
||||||
# Process died between check and kill
|
|
||||||
pid_file.unlink(missing_ok=True)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Step 5: Wait and verify death
|
|
||||||
time.sleep(1)
|
|
||||||
|
|
||||||
if is_process_alive(pid):
|
|
||||||
# Process is unkillable (likely in UNE state on macOS)
|
|
||||||
# This happens when Chrome crashes in kernel syscall (IOSurface)
|
|
||||||
# Log but don't block cleanup - process will remain until reboot
|
|
||||||
print(f'[yellow]⚠️ Process {pid} is unkillable (likely crashed in kernel). Will remain until reboot.[/yellow]')
|
|
||||||
else:
|
|
||||||
# Successfully killed
|
|
||||||
pid_file.unlink(missing_ok=True)
|
|
||||||
|
|
||||||
except (ValueError, OSError) as e:
|
|
||||||
# Invalid PID file or permission error
|
|
||||||
pass
|
|
||||||
|
|
||||||
# Run on_CrawlEnd hooks
|
# Run on_CrawlEnd hooks
|
||||||
from archivebox.config.configset import get_config
|
from archivebox.config.configset import get_config
|
||||||
|
|||||||
@@ -365,14 +365,11 @@ def run_hook(
|
|||||||
# Old convention: __background in stem (for backwards compatibility)
|
# Old convention: __background in stem (for backwards compatibility)
|
||||||
is_background = '.bg.' in script.name or '__background' in script.stem
|
is_background = '.bg.' in script.name or '__background' in script.stem
|
||||||
|
|
||||||
# Set up output files for ALL hooks - use hook-specific names to avoid conflicts
|
# Set up output files for ALL hooks (useful for debugging)
|
||||||
# when multiple hooks run in the same plugin directory
|
stdout_file = output_dir / 'stdout.log'
|
||||||
# e.g., on_Snapshot__20_chrome_tab.bg.js -> on_Snapshot__20_chrome_tab.bg.stdout.log
|
stderr_file = output_dir / 'stderr.log'
|
||||||
hook_basename = script.stem # e.g., "on_Snapshot__20_chrome_tab.bg"
|
pid_file = output_dir / 'hook.pid'
|
||||||
stdout_file = output_dir / f'{hook_basename}.stdout.log'
|
cmd_file = output_dir / 'cmd.sh'
|
||||||
stderr_file = output_dir / f'{hook_basename}.stderr.log'
|
|
||||||
pid_file = output_dir / f'{hook_basename}.pid'
|
|
||||||
cmd_file = output_dir / f'{hook_basename}.sh'
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
# Write command script for validation
|
# Write command script for validation
|
||||||
@@ -424,14 +421,8 @@ def run_hook(
|
|||||||
# Detect new files created by the hook
|
# Detect new files created by the hook
|
||||||
files_after = set(output_dir.rglob('*')) if output_dir.exists() else set()
|
files_after = set(output_dir.rglob('*')) if output_dir.exists() else set()
|
||||||
new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()]
|
new_files = [str(f.relative_to(output_dir)) for f in (files_after - files_before) if f.is_file()]
|
||||||
# Exclude the log/pid/sh files themselves from new_files (hook-specific names)
|
# Exclude the log files themselves from new_files
|
||||||
hook_output_files = {
|
new_files = [f for f in new_files if f not in ('stdout.log', 'stderr.log', 'hook.pid')]
|
||||||
f'{hook_basename}.stdout.log',
|
|
||||||
f'{hook_basename}.stderr.log',
|
|
||||||
f'{hook_basename}.pid',
|
|
||||||
f'{hook_basename}.sh',
|
|
||||||
}
|
|
||||||
new_files = [f for f in new_files if f not in hook_output_files]
|
|
||||||
|
|
||||||
# Parse JSONL output from stdout
|
# Parse JSONL output from stdout
|
||||||
# Each line starting with { that has 'type' field is a record
|
# Each line starting with { that has 'type' field is a record
|
||||||
@@ -1185,9 +1176,7 @@ def create_model_record(record: Dict[str, Any]) -> Any:
|
|||||||
def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]:
|
def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]:
|
||||||
"""
|
"""
|
||||||
Process JSONL records from hook output.
|
Process JSONL records from hook output.
|
||||||
|
Dispatches to Model.from_jsonl() for each record type.
|
||||||
Uses Model.from_jsonl() which automatically filters by JSONL_TYPE.
|
|
||||||
Each model only processes records matching its type.
|
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
records: List of JSONL record dicts from result['records']
|
records: List of JSONL record dicts from result['records']
|
||||||
@@ -1196,250 +1185,51 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
|
|||||||
Returns:
|
Returns:
|
||||||
Dict with counts by record type
|
Dict with counts by record type
|
||||||
"""
|
"""
|
||||||
from archivebox.core.models import Snapshot, Tag
|
stats = {}
|
||||||
from archivebox.machine.models import Binary, Machine
|
|
||||||
|
|
||||||
overrides = overrides or {}
|
overrides = overrides or {}
|
||||||
|
|
||||||
# Filter out ArchiveResult records (they update the calling AR, not create new ones)
|
for record in records:
|
||||||
filtered_records = [r for r in records if r.get('type') != 'ArchiveResult']
|
record_type = record.get('type')
|
||||||
|
if not record_type:
|
||||||
|
continue
|
||||||
|
|
||||||
# Each model's from_jsonl() filters to only its own type
|
# Skip ArchiveResult records (they update the calling ArchiveResult, not create new ones)
|
||||||
snapshots = Snapshot.from_jsonl(filtered_records, overrides)
|
if record_type == 'ArchiveResult':
|
||||||
tags = Tag.from_jsonl(filtered_records, overrides)
|
|
||||||
binaries = Binary.from_jsonl(filtered_records, overrides)
|
|
||||||
machines = Machine.from_jsonl(filtered_records, overrides)
|
|
||||||
|
|
||||||
return {
|
|
||||||
'Snapshot': len(snapshots),
|
|
||||||
'Tag': len(tags),
|
|
||||||
'Binary': len(binaries),
|
|
||||||
'Machine': len(machines),
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
def process_is_alive(pid_file: Path) -> bool:
|
|
||||||
"""
|
|
||||||
Check if process in PID file is still running.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pid_file: Path to hook.pid file
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
True if process is alive, False otherwise
|
|
||||||
"""
|
|
||||||
if not pid_file.exists():
|
|
||||||
return False
|
|
||||||
|
|
||||||
try:
|
|
||||||
pid = int(pid_file.read_text().strip())
|
|
||||||
os.kill(pid, 0) # Signal 0 = check if process exists without killing it
|
|
||||||
return True
|
|
||||||
except (OSError, ValueError):
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def kill_process(pid_file: Path, sig: int = signal.SIGTERM, validate: bool = True):
|
|
||||||
"""
|
|
||||||
Kill process in PID file with optional validation.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
pid_file: Path to hook-specific .pid file (e.g., on_Snapshot__20_chrome_tab.bg.pid)
|
|
||||||
sig: Signal to send (default SIGTERM)
|
|
||||||
validate: If True, validate process identity before killing (default: True)
|
|
||||||
"""
|
|
||||||
from archivebox.misc.process_utils import safe_kill_process
|
|
||||||
|
|
||||||
if validate:
|
|
||||||
# Use safe kill with validation
|
|
||||||
# Derive cmd file from pid file: on_Snapshot__20_chrome_tab.bg.pid -> on_Snapshot__20_chrome_tab.bg.sh
|
|
||||||
cmd_file = pid_file.with_suffix('.sh')
|
|
||||||
safe_kill_process(pid_file, cmd_file, signal_num=sig)
|
|
||||||
else:
|
|
||||||
# Legacy behavior - kill without validation
|
|
||||||
if not pid_file.exists():
|
|
||||||
return
|
|
||||||
try:
|
|
||||||
pid = int(pid_file.read_text().strip())
|
|
||||||
os.kill(pid, sig)
|
|
||||||
except (OSError, ValueError):
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def graceful_terminate_background_hooks(
|
|
||||||
output_dir: Path,
|
|
||||||
config: Dict[str, Any],
|
|
||||||
poll_interval: float = 0.5,
|
|
||||||
) -> Dict[str, Dict[str, Any]]:
|
|
||||||
"""
|
|
||||||
Gracefully terminate all background hooks in an output directory.
|
|
||||||
|
|
||||||
Termination strategy:
|
|
||||||
1. Send SIGTERM to all background hook processes (polite shutdown request)
|
|
||||||
2. For each hook, wait up to its plugin-specific timeout
|
|
||||||
3. Send SIGKILL to any hooks still running after their timeout expires
|
|
||||||
4. Reap each process with waitpid() to get exit code
|
|
||||||
5. Write returncode to .returncode file for update_from_output()
|
|
||||||
|
|
||||||
Args:
|
|
||||||
output_dir: Snapshot output directory containing plugin subdirs with .pid files
|
|
||||||
config: Merged config dict from get_config() for timeout lookup
|
|
||||||
poll_interval: Seconds between process liveness checks (default: 0.5s)
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Dict mapping hook names to result info:
|
|
||||||
{
|
|
||||||
'hook_name': {
|
|
||||||
'status': 'sigterm' | 'sigkill' | 'already_dead' | 'invalid',
|
|
||||||
'returncode': int or None,
|
|
||||||
'pid': int or None,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Example:
|
|
||||||
from archivebox.config.configset import get_config
|
|
||||||
config = get_config(crawl=my_crawl, snapshot=my_snapshot)
|
|
||||||
results = graceful_terminate_background_hooks(snapshot.OUTPUT_DIR, config)
|
|
||||||
# {'on_Snapshot__20_chrome_tab.bg': {'status': 'sigterm', 'returncode': 0, 'pid': 12345}}
|
|
||||||
"""
|
|
||||||
from archivebox.misc.process_utils import validate_pid_file
|
|
||||||
|
|
||||||
if not output_dir.exists():
|
|
||||||
return {}
|
|
||||||
|
|
||||||
results = {}
|
|
||||||
|
|
||||||
# Collect all pid files and their metadata
|
|
||||||
pid_files = list(output_dir.glob('**/*.pid'))
|
|
||||||
if not pid_files:
|
|
||||||
return {}
|
|
||||||
|
|
||||||
# Phase 1: Send SIGTERM to all background hook processes
|
|
||||||
active_hooks = [] # List of (pid_file, hook_name, plugin_name, timeout, pid)
|
|
||||||
for pid_file in pid_files:
|
|
||||||
hook_name = pid_file.stem # e.g., "on_Snapshot__20_chrome_tab.bg"
|
|
||||||
cmd_file = pid_file.with_suffix('.sh')
|
|
||||||
|
|
||||||
# Validate and get PID
|
|
||||||
if not validate_pid_file(pid_file, cmd_file):
|
|
||||||
results[hook_name] = {'status': 'invalid', 'returncode': None, 'pid': None}
|
|
||||||
pid_file.unlink(missing_ok=True)
|
|
||||||
continue
|
continue
|
||||||
|
|
||||||
try:
|
try:
|
||||||
pid = int(pid_file.read_text().strip())
|
# Dispatch to appropriate model's from_jsonl() method
|
||||||
except (ValueError, OSError):
|
if record_type == 'Snapshot':
|
||||||
results[hook_name] = {'status': 'invalid', 'returncode': None, 'pid': None}
|
from archivebox.core.models import Snapshot
|
||||||
pid_file.unlink(missing_ok=True)
|
obj = Snapshot.from_jsonl(record.copy(), overrides)
|
||||||
|
if obj:
|
||||||
|
stats['Snapshot'] = stats.get('Snapshot', 0) + 1
|
||||||
|
|
||||||
|
elif record_type == 'Tag':
|
||||||
|
from archivebox.core.models import Tag
|
||||||
|
obj = Tag.from_jsonl(record.copy(), overrides)
|
||||||
|
if obj:
|
||||||
|
stats['Tag'] = stats.get('Tag', 0) + 1
|
||||||
|
|
||||||
|
elif record_type == 'Binary':
|
||||||
|
from archivebox.machine.models import Binary
|
||||||
|
obj = Binary.from_jsonl(record.copy(), overrides)
|
||||||
|
if obj:
|
||||||
|
stats['Binary'] = stats.get('Binary', 0) + 1
|
||||||
|
|
||||||
|
elif record_type == 'Machine':
|
||||||
|
from archivebox.machine.models import Machine
|
||||||
|
obj = Machine.from_jsonl(record.copy(), overrides)
|
||||||
|
if obj:
|
||||||
|
stats['Machine'] = stats.get('Machine', 0) + 1
|
||||||
|
|
||||||
|
else:
|
||||||
|
import sys
|
||||||
|
print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
import sys
|
||||||
|
print(f"Warning: Failed to create {record_type}: {e}", file=sys.stderr)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Check if process is still alive
|
return stats
|
||||||
if not process_is_alive(pid_file):
|
|
||||||
# Process already dead - try to reap it and get exit code
|
|
||||||
returncode = _reap_process(pid)
|
|
||||||
results[hook_name] = {'status': 'already_dead', 'returncode': returncode, 'pid': pid}
|
|
||||||
_write_returncode_file(pid_file, returncode)
|
|
||||||
pid_file.unlink(missing_ok=True)
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Get plugin name from parent directory (e.g., "chrome_session")
|
|
||||||
plugin_name = pid_file.parent.name
|
|
||||||
|
|
||||||
# Get plugin-specific timeout
|
|
||||||
plugin_config = get_plugin_special_config(plugin_name, config)
|
|
||||||
timeout = plugin_config['timeout']
|
|
||||||
|
|
||||||
# Send SIGTERM
|
|
||||||
try:
|
|
||||||
os.kill(pid, signal.SIGTERM)
|
|
||||||
except (OSError, ProcessLookupError):
|
|
||||||
returncode = _reap_process(pid)
|
|
||||||
results[hook_name] = {'status': 'already_dead', 'returncode': returncode, 'pid': pid}
|
|
||||||
_write_returncode_file(pid_file, returncode)
|
|
||||||
pid_file.unlink(missing_ok=True)
|
|
||||||
continue
|
|
||||||
|
|
||||||
active_hooks.append((pid_file, hook_name, plugin_name, timeout, pid))
|
|
||||||
|
|
||||||
# Phase 2: Wait for each hook's timeout, then SIGKILL if still running
|
|
||||||
for pid_file, hook_name, plugin_name, timeout, pid in active_hooks:
|
|
||||||
deadline = time.time() + timeout
|
|
||||||
exited_cleanly = False
|
|
||||||
|
|
||||||
# Poll until deadline or process exits
|
|
||||||
while time.time() < deadline:
|
|
||||||
if not process_is_alive(pid_file):
|
|
||||||
exited_cleanly = True
|
|
||||||
break
|
|
||||||
time.sleep(poll_interval)
|
|
||||||
|
|
||||||
if exited_cleanly:
|
|
||||||
# Process exited from SIGTERM - reap it to get exit code
|
|
||||||
returncode = _reap_process(pid)
|
|
||||||
results[hook_name] = {'status': 'sigterm', 'returncode': returncode, 'pid': pid}
|
|
||||||
else:
|
|
||||||
# Timeout expired, send SIGKILL
|
|
||||||
try:
|
|
||||||
os.kill(pid, signal.SIGKILL)
|
|
||||||
except (OSError, ProcessLookupError):
|
|
||||||
pass # Process died between check and kill
|
|
||||||
|
|
||||||
# Wait briefly for SIGKILL to take effect, then reap
|
|
||||||
time.sleep(0.1)
|
|
||||||
returncode = _reap_process(pid)
|
|
||||||
|
|
||||||
# returncode from SIGKILL is typically -9 (negative signal number)
|
|
||||||
results[hook_name] = {'status': 'sigkill', 'returncode': returncode, 'pid': pid}
|
|
||||||
|
|
||||||
# Write returncode file for update_from_output() to read
|
|
||||||
_write_returncode_file(pid_file, results[hook_name]['returncode'])
|
|
||||||
pid_file.unlink(missing_ok=True)
|
|
||||||
|
|
||||||
return results
|
|
||||||
|
|
||||||
|
|
||||||
def _reap_process(pid: int) -> Optional[int]:
|
|
||||||
"""
|
|
||||||
Reap a terminated process and return its exit code.
|
|
||||||
|
|
||||||
Uses os.waitpid() with WNOHANG to avoid blocking.
|
|
||||||
Returns None if process cannot be reaped (not a child, already reaped, etc).
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
# WNOHANG: return immediately if process hasn't exited
|
|
||||||
# We call this after we know process is dead, so it should return immediately
|
|
||||||
wpid, status = os.waitpid(pid, os.WNOHANG)
|
|
||||||
if wpid == 0:
|
|
||||||
# Process still running (shouldn't happen since we checked)
|
|
||||||
return None
|
|
||||||
if os.WIFEXITED(status):
|
|
||||||
return os.WEXITSTATUS(status)
|
|
||||||
elif os.WIFSIGNALED(status):
|
|
||||||
# Killed by signal - return negative signal number (convention)
|
|
||||||
return -os.WTERMSIG(status)
|
|
||||||
return None
|
|
||||||
except ChildProcessError:
|
|
||||||
# Not our child process (was started by subprocess.Popen which already reaped it,
|
|
||||||
# or process was started by different parent). This is expected for hooks.
|
|
||||||
return None
|
|
||||||
except OSError:
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def _write_returncode_file(pid_file: Path, returncode: Optional[int]) -> None:
|
|
||||||
"""
|
|
||||||
Write returncode to a .returncode file next to the .pid file.
|
|
||||||
|
|
||||||
This allows update_from_output() to know the exit code even for background hooks.
|
|
||||||
"""
|
|
||||||
returncode_file = pid_file.with_suffix('.returncode')
|
|
||||||
try:
|
|
||||||
if returncode is not None:
|
|
||||||
returncode_file.write_text(str(returncode))
|
|
||||||
else:
|
|
||||||
# Unknown exit code - write empty file to indicate process was terminated
|
|
||||||
returncode_file.write_text('')
|
|
||||||
except OSError:
|
|
||||||
pass # Best effort
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
101
archivebox/machine/migrations/0002_process_parent_and_type.py
Normal file
101
archivebox/machine/migrations/0002_process_parent_and_type.py
Normal file
@@ -0,0 +1,101 @@
|
|||||||
|
# Generated on 2025-12-31
# Adds parent FK and process_type field to Process model

from django.db import migrations, models
import django.db.models.deletion


class Migration(migrations.Migration):
    # Schema change for machine.Process: adds a self-referential parent FK
    # (process hierarchy), a process_type discriminator column, and the
    # supporting indexes used by parent/status and PID-reuse lookups.

    dependencies = [
        ('machine', '0001_initial'),
    ]

    operations = [
        # Database and state operations are separated because SQLite's
        # ALTER TABLE support is limited: the raw SQL adds the columns and
        # indexes directly, while the state_operations keep Django's in-memory
        # model state in sync without touching the database again.
        migrations.SeparateDatabaseAndState(
            database_operations=[
                migrations.RunSQL(
                    sql="""
                    -- Add parent_id FK column to machine_process
                    ALTER TABLE machine_process ADD COLUMN parent_id TEXT REFERENCES machine_process(id) ON DELETE SET NULL;
                    CREATE INDEX IF NOT EXISTS machine_process_parent_id_idx ON machine_process(parent_id);

                    -- Add process_type column with default 'binary'
                    ALTER TABLE machine_process ADD COLUMN process_type VARCHAR(16) NOT NULL DEFAULT 'binary';
                    CREATE INDEX IF NOT EXISTS machine_process_process_type_idx ON machine_process(process_type);

                    -- Add composite index for parent + status queries
                    CREATE INDEX IF NOT EXISTS machine_process_parent_status_idx ON machine_process(parent_id, status);

                    -- Add composite index for machine + pid + started_at (for PID reuse protection)
                    CREATE INDEX IF NOT EXISTS machine_process_machine_pid_started_idx ON machine_process(machine_id, pid, started_at);
                    """,
                    # Migration is irreversible due to SQLite limitations:
                    # SQLite doesn't support DROP COLUMN, would require table rebuild
                    reverse_sql=migrations.RunSQL.noop
                ),
            ],
            state_operations=[
                # Add parent FK (state only - column already added via SQL above)
                migrations.AddField(
                    model_name='process',
                    name='parent',
                    field=models.ForeignKey(
                        blank=True,
                        help_text='Parent process that spawned this one',
                        null=True,
                        on_delete=django.db.models.deletion.SET_NULL,
                        related_name='children',
                        to='machine.process',
                    ),
                ),
                # Add process_type field (state only)
                migrations.AddField(
                    model_name='process',
                    name='process_type',
                    field=models.CharField(
                        choices=[
                            ('cli', 'CLI Command'),
                            ('supervisord', 'Supervisord Daemon'),
                            ('orchestrator', 'Orchestrator'),
                            ('worker', 'Worker Process'),
                            ('hook', 'Hook Script'),
                            ('binary', 'Binary Execution'),
                        ],
                        default='binary',
                        help_text='Type of process in the execution hierarchy',
                        max_length=16,
                    ),
                ),
                # Add indexes - names must match the SQL index names exactly so
                # Django's state agrees with what exists in the database.
                migrations.AddIndex(
                    model_name='process',
                    index=models.Index(
                        fields=['parent'],
                        name='machine_process_parent_id_idx',
                    ),
                ),
                migrations.AddIndex(
                    model_name='process',
                    index=models.Index(
                        fields=['process_type'],
                        name='machine_process_process_type_idx',
                    ),
                ),
                migrations.AddIndex(
                    model_name='process',
                    index=models.Index(
                        fields=['parent', 'status'],
                        name='machine_process_parent_status_idx',
                    ),
                ),
                migrations.AddIndex(
                    model_name='process',
                    index=models.Index(
                        fields=['machine', 'pid', 'started_at'],
                        name='machine_process_machine_pid_started_idx',
                    ),
                ),
            ],
        ),
    ]
|
||||||
File diff suppressed because it is too large
Load Diff
@@ -70,15 +70,54 @@ def write_cmd_file(cmd_file: Path, cmd: list[str]):
|
|||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
def safe_kill_process(pid_file: Path, cmd_file: Optional[Path] = None, signal_num: int = 15, timeout: float = 3.0) -> bool:
    """
    Kill process after validation, with graceful wait and SIGKILL escalation.

    Args:
        pid_file: Path to the .pid file identifying the target process.
        cmd_file: Optional .cmd file used by validate_pid_file() to guard
            against PID reuse (a different process now owning the same PID).
        signal_num: Initial signal to send (default 15 = SIGTERM).
        timeout: Seconds to wait for a graceful exit before escalating to SIGKILL.

    Returns:
        True only if process is confirmed dead (either already dead or killed
        successfully); False if the pid file was unreadable, the signal could
        not be delivered, or the process somehow survived SIGKILL.
    """
    import time
    import signal

    # A stale/invalid pid file means the recorded process is already gone.
    if not validate_pid_file(pid_file, cmd_file):
        pid_file.unlink(missing_ok=True)  # Clean stale file
        return True  # Process already dead, consider it killed

    try:
        pid = int(pid_file.read_text().strip())

        # Send initial signal (SIGTERM by default)
        try:
            os.kill(pid, signal_num)
        except ProcessLookupError:
            # Process already dead
            return True

        # Wait for the process to terminate gracefully.
        # Use a monotonic clock: wall-clock adjustments (NTP, manual changes)
        # must not stretch or shrink the timeout window.
        start_time = time.monotonic()
        while time.monotonic() - start_time < timeout:
            try:
                os.kill(pid, 0)  # Signal 0 = existence check only
                time.sleep(0.1)
            except ProcessLookupError:
                # Process terminated
                return True

        # Process didn't terminate within the grace period, escalate to SIGKILL
        try:
            os.kill(pid, signal.SIGKILL)
            time.sleep(0.5)  # Brief wait after SIGKILL
            # Verify it's actually dead
            try:
                os.kill(pid, 0)
                # Process still alive after SIGKILL - unusual (e.g. zombie or
                # uninterruptible D-state); report failure honestly.
                return False
            except ProcessLookupError:
                # Process finally dead
                return True
        except ProcessLookupError:
            # Process died between the timeout check and the SIGKILL
            return True

    except (OSError, ValueError):
        # Unreadable/garbled pid file, or signal delivery denied (EPERM etc.)
        return False
|
||||||
|
|||||||
21
archivebox/plugins/captcha2/config.json
Normal file
21
archivebox/plugins/captcha2/config.json
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
{
|
||||||
|
"$schema": "http://json-schema.org/draft-07/schema#",
|
||||||
|
"type": "object",
|
||||||
|
"additionalProperties": false,
|
||||||
|
"required_plugins": ["chrome"],
|
||||||
|
"properties": {
|
||||||
|
"CAPTCHA2_ENABLED": {
|
||||||
|
"type": "boolean",
|
||||||
|
"default": true,
|
||||||
|
"x-aliases": ["USE_CAPTCHA2"],
|
||||||
|
"description": "Enable Captcha2 browser extension for CAPTCHA solving"
|
||||||
|
},
|
||||||
|
"CAPTCHA2_TIMEOUT": {
|
||||||
|
"type": "integer",
|
||||||
|
"default": 60,
|
||||||
|
"minimum": 5,
|
||||||
|
"x-fallback": "TIMEOUT",
|
||||||
|
"description": "Timeout for CAPTCHA solving in seconds"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
121
archivebox/plugins/captcha2/on_Crawl__01_captcha2.js
Executable file
121
archivebox/plugins/captcha2/on_Crawl__01_captcha2.js
Executable file
@@ -0,0 +1,121 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
/**
|
||||||
|
* 2Captcha Extension Plugin
|
||||||
|
*
|
||||||
|
* Installs and configures the 2captcha Chrome extension for automatic
|
||||||
|
* CAPTCHA solving during page archiving.
|
||||||
|
*
|
||||||
|
* Extension: https://chromewebstore.google.com/detail/ifibfemgeogfhoebkmokieepdoobkbpo
|
||||||
|
* Documentation: https://2captcha.com/blog/how-to-use-2captcha-solver-extension-in-puppeteer
|
||||||
|
*
|
||||||
|
* Priority: 01 (early) - Must install before Chrome session starts at Crawl level
|
||||||
|
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||||
|
*
|
||||||
|
* Requirements:
|
||||||
|
* - API_KEY_2CAPTCHA environment variable must be set
|
||||||
|
* - Extension will automatically solve reCAPTCHA, hCaptcha, Cloudflare Turnstile, etc.
|
||||||
|
*/
|
||||||
|
|
||||||
|
const path = require('path');
|
||||||
|
const fs = require('fs');
|
||||||
|
|
||||||
|
// Import extension utilities
|
||||||
|
const extensionUtils = require('../chrome/chrome_utils.js');
|
||||||
|
|
||||||
|
// Extension metadata
|
||||||
|
const EXTENSION = {
|
||||||
|
webstore_id: 'ifibfemgeogfhoebkmokieepdoobkbpo',
|
||||||
|
name: 'captcha2',
|
||||||
|
};
|
||||||
|
|
||||||
|
// Get extensions directory from environment or use default
|
||||||
|
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
||||||
|
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Install and configure the 2captcha extension
|
||||||
|
*/
|
||||||
|
async function installCaptchaExtension() {
|
||||||
|
console.log('[*] Installing 2captcha extension...');
|
||||||
|
|
||||||
|
// Install the extension
|
||||||
|
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
|
||||||
|
|
||||||
|
if (!extension) {
|
||||||
|
console.error('[❌] Failed to install 2captcha extension');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Check if API key is configured
|
||||||
|
const apiKey = process.env.API_KEY_2CAPTCHA;
|
||||||
|
if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
|
||||||
|
console.warn('[⚠️] 2captcha extension installed but API_KEY_2CAPTCHA not configured');
|
||||||
|
console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
|
||||||
|
} else {
|
||||||
|
console.log('[+] 2captcha extension installed and API key configured');
|
||||||
|
}
|
||||||
|
|
||||||
|
return extension;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Note: 2captcha configuration is now handled by chrome plugin
|
||||||
|
* during first-time browser setup to avoid repeated configuration on every snapshot.
|
||||||
|
* The API key is injected via chrome.storage API once per browser session.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main entry point - install extension before archiving
|
||||||
|
*/
|
||||||
|
async function main() {
|
||||||
|
// Check if extension is already cached
|
||||||
|
const cacheFile = path.join(EXTENSIONS_DIR, 'captcha2.extension.json');
|
||||||
|
|
||||||
|
if (fs.existsSync(cacheFile)) {
|
||||||
|
try {
|
||||||
|
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
|
||||||
|
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
|
||||||
|
|
||||||
|
if (fs.existsSync(manifestPath)) {
|
||||||
|
console.log('[*] 2captcha extension already installed (using cache)');
|
||||||
|
return cached;
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// Cache file corrupted, re-install
|
||||||
|
console.warn('[⚠️] Extension cache corrupted, re-installing...');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Install extension
|
||||||
|
const extension = await installCaptchaExtension();
|
||||||
|
|
||||||
|
// Export extension metadata for chrome plugin to load
|
||||||
|
if (extension) {
|
||||||
|
// Write extension info to a cache file that chrome plugin can read
|
||||||
|
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
|
||||||
|
await fs.promises.writeFile(
|
||||||
|
cacheFile,
|
||||||
|
JSON.stringify(extension, null, 2)
|
||||||
|
);
|
||||||
|
console.log(`[+] Extension metadata written to ${cacheFile}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return extension;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Export functions for use by other plugins
|
||||||
|
module.exports = {
|
||||||
|
EXTENSION,
|
||||||
|
installCaptchaExtension,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Run if executed directly
|
||||||
|
if (require.main === module) {
|
||||||
|
main().then(() => {
|
||||||
|
console.log('[✓] 2captcha extension setup complete');
|
||||||
|
process.exit(0);
|
||||||
|
}).catch(err => {
|
||||||
|
console.error('[❌] 2captcha extension setup failed:', err);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
|
}
|
||||||
279
archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js
Executable file
279
archivebox/plugins/captcha2/on_Crawl__11_captcha2_config.js
Executable file
@@ -0,0 +1,279 @@
|
|||||||
|
#!/usr/bin/env node
/**
 * 2Captcha Extension Configuration
 *
 * Configures the 2captcha extension with API key after Crawl-level Chrome session starts.
 * Runs once per crawl to inject API key into extension storage.
 *
 * Priority: 11 (after chrome_launch at 20)
 * Hook: on_Crawl (runs once per crawl, not per snapshot)
 *
 * Requirements:
 * - API_KEY_2CAPTCHA environment variable must be set
 * - chrome plugin must have loaded extensions (extensions.json must exist)
 */

const path = require('path');
const fs = require('fs');
// Add NODE_MODULES_DIR to module resolution paths if set
if (process.env.NODE_MODULES_DIR) module.paths.unshift(process.env.NODE_MODULES_DIR);
const puppeteer = require('puppeteer-core');

// Get crawl's chrome directory from environment variable set by hooks.py
function getCrawlChromeSessionDir() {
    const crawlOutputDir = process.env.CRAWL_OUTPUT_DIR || '';
    if (!crawlOutputDir) {
        return null;
    }
    return path.join(crawlOutputDir, 'chrome');
}

const CHROME_SESSION_DIR = getCrawlChromeSessionDir() || '../chrome';
// Marker file: presence means this browser session was already configured
const CONFIG_MARKER = path.join(CHROME_SESSION_DIR, '.captcha2_configured');

// Get environment variable with default (trimmed)
function getEnv(name, defaultValue = '') {
    return (process.env[name] || defaultValue).trim();
}

// Parse --key=value command line arguments into an object (dashes -> underscores)
function parseArgs() {
    const args = {};
    process.argv.slice(2).forEach(arg => {
        if (arg.startsWith('--')) {
            const [key, ...valueParts] = arg.slice(2).split('=');
            args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
        }
    });
    return args;
}

/**
 * Inject the 2captcha API key into the running extension.
 *
 * Tries, in order:
 *   1. Evaluating in the extension's background page / service worker
 *   2. Filling the extension's options page form
 * Returns { success, skipped?, method?, error? }.
 */
async function configure2Captcha() {
    // Check if already configured in this session
    if (fs.existsSync(CONFIG_MARKER)) {
        console.error('[*] 2captcha already configured in this browser session');
        return { success: true, skipped: true };
    }

    // Check if API key is set
    const apiKey = getEnv('API_KEY_2CAPTCHA');
    if (!apiKey || apiKey === 'YOUR_API_KEY_HERE') {
        console.warn('[⚠️] 2captcha extension loaded but API_KEY_2CAPTCHA not configured');
        console.warn('[⚠️] Set API_KEY_2CAPTCHA environment variable to enable automatic CAPTCHA solving');
        return { success: false, error: 'API_KEY_2CAPTCHA not configured' };
    }

    // Load extensions metadata written by the chrome plugin
    const extensionsFile = path.join(CHROME_SESSION_DIR, 'extensions.json');
    if (!fs.existsSync(extensionsFile)) {
        return { success: false, error: 'extensions.json not found - chrome plugin must run first' };
    }

    const extensions = JSON.parse(fs.readFileSync(extensionsFile, 'utf-8'));
    const captchaExt = extensions.find(ext => ext.name === 'captcha2');

    if (!captchaExt) {
        console.error('[*] 2captcha extension not installed, skipping configuration');
        return { success: true, skipped: true };
    }

    console.error('[*] Configuring 2captcha extension with API key...');

    try {
        // Connect to the existing Chrome session via CDP
        const cdpFile = path.join(CHROME_SESSION_DIR, 'cdp_url.txt');
        if (!fs.existsSync(cdpFile)) {
            return { success: false, error: 'CDP URL not found - chrome plugin must run first' };
        }

        const cdpUrl = fs.readFileSync(cdpFile, 'utf-8').trim();
        const browser = await puppeteer.connect({ browserWSEndpoint: cdpUrl });

        try {
            // Method 1: Try to inject via extension background page
            if (captchaExt.target && captchaExt.target_ctx) {
                console.error('[*] Attempting to configure via extension background page...');

                // Reconnect to the browser to get fresh target context
                const targets = await browser.targets();
                const extTarget = targets.find(t =>
                    t.url().startsWith(`chrome-extension://${captchaExt.id}`)
                );

                if (extTarget) {
                    // MV3 extensions expose a service worker; MV2 a background page
                    const extContext = await extTarget.worker() || await extTarget.page();

                    if (extContext) {
                        await extContext.evaluate((key) => {
                            // Try all common storage patterns (key name varies by extension version)
                            if (typeof chrome !== 'undefined' && chrome.storage) {
                                chrome.storage.local.set({
                                    apiKey: key,
                                    api_key: key,
                                    '2captcha_apikey': key,
                                    apikey: key,
                                    'solver-api-key': key,
                                });
                                chrome.storage.sync.set({
                                    apiKey: key,
                                    api_key: key,
                                    '2captcha_apikey': key,
                                    apikey: key,
                                    'solver-api-key': key,
                                });
                            }

                            // Also try localStorage as fallback
                            if (typeof localStorage !== 'undefined') {
                                localStorage.setItem('apiKey', key);
                                localStorage.setItem('2captcha_apikey', key);
                                localStorage.setItem('solver-api-key', key);
                            }
                        }, apiKey);

                        console.error('[+] 2captcha API key configured successfully via background page');

                        // Mark as configured
                        fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());

                        return { success: true, method: 'background_page' };
                    }
                }
            }

            // Method 2: Try to configure via options page
            console.error('[*] Attempting to configure via options page...');
            const optionsUrl = `chrome-extension://${captchaExt.id}/options.html`;
            const configPage = await browser.newPage();

            try {
                await configPage.goto(optionsUrl, { waitUntil: 'networkidle0', timeout: 10000 });

                const configured = await configPage.evaluate((key) => {
                    // Try to find API key input field
                    const selectors = [
                        'input[name*="apikey" i]',
                        'input[id*="apikey" i]',
                        'input[name*="api-key" i]',
                        'input[id*="api-key" i]',
                        'input[name*="key" i]',
                        'input[placeholder*="api" i]',
                        'input[type="text"]',
                    ];

                    for (const selector of selectors) {
                        const input = document.querySelector(selector);
                        if (input) {
                            input.value = key;
                            input.dispatchEvent(new Event('input', { bubbles: true }));
                            input.dispatchEvent(new Event('change', { bubbles: true }));

                            // Try to find and click save button
                            const saveSelectors = [
                                'button[type="submit"]',
                                'input[type="submit"]',
                                'button:contains("Save")',
                                'button:contains("Apply")',
                            ];

                            for (const btnSel of saveSelectors) {
                                const btn = document.querySelector(btnSel);
                                if (btn) {
                                    btn.click();
                                    break;
                                }
                            }

                            // Also save to storage
                            if (typeof chrome !== 'undefined' && chrome.storage) {
                                chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                                chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                            }

                            return true;
                        }
                    }

                    // Fallback: Just save to storage
                    if (typeof chrome !== 'undefined' && chrome.storage) {
                        chrome.storage.local.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                        chrome.storage.sync.set({ apiKey: key, api_key: key, '2captcha_apikey': key });
                        return true;
                    }

                    return false;
                }, apiKey);

                await configPage.close();

                if (configured) {
                    console.error('[+] 2captcha API key configured successfully via options page');

                    // Mark as configured
                    fs.writeFileSync(CONFIG_MARKER, new Date().toISOString());

                    return { success: true, method: 'options_page' };
                }
            } catch (e) {
                console.warn(`[⚠️] Failed to configure via options page: ${e.message}`);
                try {
                    await configPage.close();
                } catch (e2) {}
            }

            return { success: false, error: 'Could not configure via any method' };
        } finally {
            // Disconnect (not close) - the crawl-level session keeps running
            browser.disconnect();
        }
    } catch (e) {
        return { success: false, error: `${e.name}: ${e.message}` };
    }
}

async function main() {
    const args = parseArgs();
    const url = args.url;
    const snapshotId = args.snapshot_id;

    if (!url || !snapshotId) {
        // FIX: usage message previously named the wrong script
        // (on_Snapshot__21_captcha2_config.js); this file is the on_Crawl hook.
        console.error('Usage: on_Crawl__11_captcha2_config.js --url=<url> --snapshot-id=<uuid>');
        process.exit(1);
    }

    let status = 'failed';
    let error = '';

    try {
        const result = await configure2Captcha();

        if (result.skipped) {
            status = 'skipped';
        } else if (result.success) {
            status = 'succeeded';
        } else {
            status = 'failed';
            error = result.error || 'Configuration failed';
        }
    } catch (e) {
        error = `${e.name}: ${e.message}`;
        status = 'failed';
    }

    if (error) {
        console.error(`ERROR: ${error}`);
    }

    // Config hooks don't emit JSONL - they're utility hooks for setup
    // Exit code indicates success/failure

    process.exit(status === 'succeeded' || status === 'skipped' ? 0 : 1);
}

main().catch(e => {
    console.error(`Fatal error: ${e.message}`);
    process.exit(1);
});
|
||||||
0
archivebox/plugins/captcha2/templates/icon.html
Normal file
0
archivebox/plugins/captcha2/templates/icon.html
Normal file
184
archivebox/plugins/captcha2/tests/test_captcha2.py
Normal file
184
archivebox/plugins/captcha2/tests/test_captcha2.py
Normal file
@@ -0,0 +1,184 @@
|
|||||||
|
"""
|
||||||
|
Unit tests for captcha2 plugin
|
||||||
|
|
||||||
|
Tests invoke the plugin hooks as external processes and verify outputs/side effects.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import subprocess
|
||||||
|
import tempfile
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
PLUGIN_DIR = Path(__file__).parent.parent
|
||||||
|
INSTALL_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2.*'), None)
|
||||||
|
CONFIG_SCRIPT = next(PLUGIN_DIR.glob('on_Crawl__*_captcha2_config.*'), None)
|
||||||
|
|
||||||
|
|
||||||
|
def test_install_script_exists():
|
||||||
|
"""Verify install script exists"""
|
||||||
|
assert INSTALL_SCRIPT.exists(), f"Install script not found: {INSTALL_SCRIPT}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_config_script_exists():
|
||||||
|
"""Verify config script exists"""
|
||||||
|
assert CONFIG_SCRIPT.exists(), f"Config script not found: {CONFIG_SCRIPT}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_extension_metadata():
|
||||||
|
"""Test that captcha2 extension has correct metadata"""
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
env = os.environ.copy()
|
||||||
|
env["CHROME_EXTENSIONS_DIR"] = str(Path(tmpdir) / "chrome_extensions")
|
||||||
|
|
||||||
|
# Just check the script can be loaded
|
||||||
|
result = subprocess.run(
|
||||||
|
["node", "-e", f"const ext = require('{INSTALL_SCRIPT}'); console.log(JSON.stringify(ext.EXTENSION))"],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
env=env
|
||||||
|
)
|
||||||
|
|
||||||
|
assert result.returncode == 0, f"Failed to load extension metadata: {result.stderr}"
|
||||||
|
|
||||||
|
metadata = json.loads(result.stdout)
|
||||||
|
assert metadata["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
|
||||||
|
assert metadata["name"] == "captcha2"
|
||||||
|
|
||||||
|
|
||||||
|
def test_install_creates_cache():
|
||||||
|
"""Test that install creates extension cache"""
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
ext_dir = Path(tmpdir) / "chrome_extensions"
|
||||||
|
ext_dir.mkdir(parents=True)
|
||||||
|
|
||||||
|
env = os.environ.copy()
|
||||||
|
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
|
||||||
|
env["API_KEY_2CAPTCHA"] = "test_api_key"
|
||||||
|
|
||||||
|
# Run install script
|
||||||
|
result = subprocess.run(
|
||||||
|
["node", str(INSTALL_SCRIPT)],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
env=env,
|
||||||
|
timeout=60
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check output mentions installation
|
||||||
|
assert "[*] Installing 2captcha extension" in result.stdout or "[*] 2captcha extension already installed" in result.stdout
|
||||||
|
|
||||||
|
# Check cache file was created
|
||||||
|
cache_file = ext_dir / "captcha2.extension.json"
|
||||||
|
assert cache_file.exists(), "Cache file should be created"
|
||||||
|
|
||||||
|
# Verify cache content
|
||||||
|
cache_data = json.loads(cache_file.read_text())
|
||||||
|
assert cache_data["webstore_id"] == "ifibfemgeogfhoebkmokieepdoobkbpo"
|
||||||
|
assert cache_data["name"] == "captcha2"
|
||||||
|
assert "unpacked_path" in cache_data
|
||||||
|
assert "version" in cache_data
|
||||||
|
|
||||||
|
|
||||||
|
def test_install_twice_uses_cache():
|
||||||
|
"""Test that running install twice uses existing cache on second run"""
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
ext_dir = Path(tmpdir) / "chrome_extensions"
|
||||||
|
ext_dir.mkdir(parents=True)
|
||||||
|
|
||||||
|
env = os.environ.copy()
|
||||||
|
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
|
||||||
|
env["API_KEY_2CAPTCHA"] = "test_api_key"
|
||||||
|
|
||||||
|
# First install - downloads the extension
|
||||||
|
result1 = subprocess.run(
|
||||||
|
["node", str(INSTALL_SCRIPT)],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
env=env,
|
||||||
|
timeout=60
|
||||||
|
)
|
||||||
|
assert result1.returncode == 0, f"First install failed: {result1.stderr}"
|
||||||
|
|
||||||
|
# Verify cache was created
|
||||||
|
cache_file = ext_dir / "captcha2.extension.json"
|
||||||
|
assert cache_file.exists(), "Cache file should exist after first install"
|
||||||
|
|
||||||
|
# Second install - should use cache
|
||||||
|
result2 = subprocess.run(
|
||||||
|
["node", str(INSTALL_SCRIPT)],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
env=env,
|
||||||
|
timeout=30
|
||||||
|
)
|
||||||
|
assert result2.returncode == 0, f"Second install failed: {result2.stderr}"
|
||||||
|
|
||||||
|
# Second run should mention cache reuse
|
||||||
|
assert "already installed" in result2.stdout or "cache" in result2.stdout.lower() or result2.returncode == 0
|
||||||
|
|
||||||
|
|
||||||
|
def test_install_warns_without_api_key():
|
||||||
|
"""Test that install warns when API key not configured"""
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
ext_dir = Path(tmpdir) / "chrome_extensions"
|
||||||
|
ext_dir.mkdir(parents=True)
|
||||||
|
|
||||||
|
env = os.environ.copy()
|
||||||
|
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
|
||||||
|
# Don't set API_KEY_2CAPTCHA
|
||||||
|
|
||||||
|
# Run install script
|
||||||
|
result = subprocess.run(
|
||||||
|
["node", str(INSTALL_SCRIPT)],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
env=env,
|
||||||
|
timeout=60
|
||||||
|
)
|
||||||
|
|
||||||
|
# Should warn about missing API key
|
||||||
|
combined_output = result.stdout + result.stderr
|
||||||
|
assert "API_KEY_2CAPTCHA not configured" in combined_output or "Set API_KEY_2CAPTCHA" in combined_output
|
||||||
|
|
||||||
|
|
||||||
|
def test_install_success_with_api_key():
|
||||||
|
"""Test that install succeeds when API key is configured"""
|
||||||
|
with tempfile.TemporaryDirectory() as tmpdir:
|
||||||
|
ext_dir = Path(tmpdir) / "chrome_extensions"
|
||||||
|
ext_dir.mkdir(parents=True)
|
||||||
|
|
||||||
|
env = os.environ.copy()
|
||||||
|
env["CHROME_EXTENSIONS_DIR"] = str(ext_dir)
|
||||||
|
env["API_KEY_2CAPTCHA"] = "test_valid_api_key_123"
|
||||||
|
|
||||||
|
# Run install script
|
||||||
|
result = subprocess.run(
|
||||||
|
["node", str(INSTALL_SCRIPT)],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
env=env,
|
||||||
|
timeout=60
|
||||||
|
)
|
||||||
|
|
||||||
|
# Should mention API key configured
|
||||||
|
combined_output = result.stdout + result.stderr
|
||||||
|
assert "API key configured" in combined_output or "API_KEY_2CAPTCHA" in combined_output
|
||||||
|
|
||||||
|
|
||||||
|
def test_config_script_structure():
|
||||||
|
"""Test that config script has proper structure"""
|
||||||
|
# Verify the script exists and contains expected markers
|
||||||
|
script_content = CONFIG_SCRIPT.read_text()
|
||||||
|
|
||||||
|
# Should mention configuration marker file
|
||||||
|
assert "CONFIG_MARKER" in script_content or "captcha2_configured" in script_content
|
||||||
|
|
||||||
|
# Should mention API key
|
||||||
|
assert "API_KEY_2CAPTCHA" in script_content
|
||||||
|
|
||||||
|
# Should have main function or be executable
|
||||||
|
assert "async function" in script_content or "main" in script_content
|
||||||
184
archivebox/plugins/chrome/on_Crawl__00_chrome_install.py
Normal file
184
archivebox/plugins/chrome/on_Crawl__00_chrome_install.py
Normal file
@@ -0,0 +1,184 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Install hook for Chrome/Chromium and puppeteer-core.
|
||||||
|
|
||||||
|
Runs at crawl start to install/find Chromium and puppeteer-core.
|
||||||
|
Outputs JSONL for Binary and Machine config updates.
|
||||||
|
Respects CHROME_BINARY env var for custom binary paths.
|
||||||
|
Uses `npx @puppeteer/browsers install chromium@latest` and parses output.
|
||||||
|
|
||||||
|
NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||||
|
--load-extension and --disable-extensions-except flags, which are needed for
|
||||||
|
loading unpacked extensions in headless mode.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
|
||||||
|
def get_chrome_version(binary_path: str) -> str | None:
|
||||||
|
"""Get Chrome/Chromium version string."""
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
[binary_path, '--version'],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=5
|
||||||
|
)
|
||||||
|
if result.returncode == 0:
|
||||||
|
return result.stdout.strip()
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def install_puppeteer_core() -> bool:
|
||||||
|
"""Install puppeteer-core to NODE_MODULES_DIR if not present."""
|
||||||
|
node_modules_dir = os.environ.get('NODE_MODULES_DIR', '').strip()
|
||||||
|
if not node_modules_dir:
|
||||||
|
# No isolated node_modules, skip (will use global)
|
||||||
|
return True
|
||||||
|
|
||||||
|
node_modules_path = Path(node_modules_dir)
|
||||||
|
if (node_modules_path / 'puppeteer-core').exists():
|
||||||
|
return True
|
||||||
|
|
||||||
|
# Get npm prefix from NODE_MODULES_DIR (parent of node_modules)
|
||||||
|
npm_prefix = node_modules_path.parent
|
||||||
|
|
||||||
|
try:
|
||||||
|
print(f"[*] Installing puppeteer-core to {npm_prefix}...", file=sys.stderr)
|
||||||
|
result = subprocess.run(
|
||||||
|
['npm', 'install', '--prefix', str(npm_prefix), 'puppeteer-core', '@puppeteer/browsers'],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
timeout=60
|
||||||
|
)
|
||||||
|
if result.returncode == 0:
|
||||||
|
print(f"[+] puppeteer-core installed", file=sys.stderr)
|
||||||
|
return True
|
||||||
|
else:
|
||||||
|
print(f"[!] Failed to install puppeteer-core: {result.stderr}", file=sys.stderr)
|
||||||
|
return False
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[!] Failed to install puppeteer-core: {e}", file=sys.stderr)
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def install_chromium() -> dict | None:
|
||||||
|
"""Install Chromium using @puppeteer/browsers and parse output for binary path.
|
||||||
|
|
||||||
|
Output format: "chromium@<version> <path_to_binary>"
|
||||||
|
e.g.: "chromium@1563294 /Users/x/.cache/puppeteer/chromium/.../Chromium"
|
||||||
|
|
||||||
|
Note: npx is fast when chromium is already cached - it returns the path without re-downloading.
|
||||||
|
"""
|
||||||
|
try:
|
||||||
|
print("[*] Installing Chromium via @puppeteer/browsers...", file=sys.stderr)
|
||||||
|
|
||||||
|
# Use --path to install to puppeteer's standard cache location
|
||||||
|
cache_path = os.path.expanduser('~/.cache/puppeteer')
|
||||||
|
|
||||||
|
result = subprocess.run(
|
||||||
|
['npx', '@puppeteer/browsers', 'install', 'chromium@1563297', f'--path={cache_path}'],
|
||||||
|
capture_output=True,
|
||||||
|
text=True,
|
||||||
|
stdin=subprocess.DEVNULL,
|
||||||
|
timeout=300
|
||||||
|
)
|
||||||
|
|
||||||
|
if result.returncode != 0:
|
||||||
|
print(f"[!] Failed to install Chromium: {result.stderr}", file=sys.stderr)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Parse output: "chromium@1563294 /path/to/Chromium"
|
||||||
|
output = result.stdout.strip()
|
||||||
|
parts = output.split(' ', 1)
|
||||||
|
if len(parts) != 2:
|
||||||
|
print(f"[!] Failed to parse install output: {output}", file=sys.stderr)
|
||||||
|
return None
|
||||||
|
|
||||||
|
version_str = parts[0] # "chromium@1563294"
|
||||||
|
binary_path = parts[1].strip()
|
||||||
|
|
||||||
|
if not binary_path or not os.path.exists(binary_path):
|
||||||
|
print(f"[!] Binary not found at: {binary_path}", file=sys.stderr)
|
||||||
|
return None
|
||||||
|
|
||||||
|
# Extract version number
|
||||||
|
version = version_str.split('@')[1] if '@' in version_str else None
|
||||||
|
|
||||||
|
print(f"[+] Chromium installed: {binary_path}", file=sys.stderr)
|
||||||
|
|
||||||
|
return {
|
||||||
|
'name': 'chromium',
|
||||||
|
'abspath': binary_path,
|
||||||
|
'version': version,
|
||||||
|
'binprovider': 'puppeteer',
|
||||||
|
}
|
||||||
|
|
||||||
|
except subprocess.TimeoutExpired:
|
||||||
|
print("[!] Chromium install timed out", file=sys.stderr)
|
||||||
|
except FileNotFoundError:
|
||||||
|
print("[!] npx not found - is Node.js installed?", file=sys.stderr)
|
||||||
|
except Exception as e:
|
||||||
|
print(f"[!] Failed to install Chromium: {e}", file=sys.stderr)
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
# Install puppeteer-core if NODE_MODULES_DIR is set
|
||||||
|
install_puppeteer_core()
|
||||||
|
|
||||||
|
# Check if CHROME_BINARY is already set and valid
|
||||||
|
configured_binary = os.environ.get('CHROME_BINARY', '').strip()
|
||||||
|
if configured_binary and os.path.isfile(configured_binary) and os.access(configured_binary, os.X_OK):
|
||||||
|
version = get_chrome_version(configured_binary)
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Binary',
|
||||||
|
'name': 'chromium',
|
||||||
|
'abspath': configured_binary,
|
||||||
|
'version': version,
|
||||||
|
'binprovider': 'env',
|
||||||
|
}))
|
||||||
|
sys.exit(0)
|
||||||
|
|
||||||
|
# Install/find Chromium via puppeteer
|
||||||
|
result = install_chromium()
|
||||||
|
|
||||||
|
if result and result.get('abspath'):
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Binary',
|
||||||
|
'name': result['name'],
|
||||||
|
'abspath': result['abspath'],
|
||||||
|
'version': result['version'],
|
||||||
|
'binprovider': result['binprovider'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/CHROME_BINARY',
|
||||||
|
'value': result['abspath'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
if result['version']:
|
||||||
|
print(json.dumps({
|
||||||
|
'type': 'Machine',
|
||||||
|
'_method': 'update',
|
||||||
|
'key': 'config/CHROMIUM_VERSION',
|
||||||
|
'value': result['version'],
|
||||||
|
}))
|
||||||
|
|
||||||
|
sys.exit(0)
|
||||||
|
else:
|
||||||
|
print("Chromium binary not found", file=sys.stderr)
|
||||||
|
sys.exit(1)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
172
archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py
Normal file
172
archivebox/plugins/chrome/on_Crawl__10_chrome_validate_config.py
Normal file
@@ -0,0 +1,172 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Validate and compute derived Chrome config values.
|
||||||
|
|
||||||
|
This hook runs early in the Crawl lifecycle to:
|
||||||
|
1. Auto-detect Chrome binary location
|
||||||
|
2. Compute sandbox settings based on Docker detection
|
||||||
|
3. Validate binary availability and version
|
||||||
|
4. Set computed env vars for subsequent hooks
|
||||||
|
|
||||||
|
Output:
|
||||||
|
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
|
||||||
|
- Binary JSONL records to stdout when binaries are found
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from abx_pkg import Binary, EnvProvider
|
||||||
|
|
||||||
|
|
||||||
|
# Chrome binary search order
|
||||||
|
CHROME_BINARY_NAMES = [
|
||||||
|
'chromium',
|
||||||
|
'chromium-browser',
|
||||||
|
'google-chrome',
|
||||||
|
'google-chrome-stable',
|
||||||
|
'chrome',
|
||||||
|
]
|
||||||
|
|
||||||
|
def get_env(name: str, default: str = '') -> str:
|
||||||
|
return os.environ.get(name, default).strip()
|
||||||
|
|
||||||
|
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||||
|
val = get_env(name, '').lower()
|
||||||
|
if val in ('true', '1', 'yes', 'on'):
|
||||||
|
return True
|
||||||
|
if val in ('false', '0', 'no', 'off'):
|
||||||
|
return False
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def detect_docker() -> bool:
|
||||||
|
"""Detect if running inside Docker container."""
|
||||||
|
return (
|
||||||
|
os.path.exists('/.dockerenv') or
|
||||||
|
os.environ.get('IN_DOCKER', '').lower() in ('true', '1', 'yes') or
|
||||||
|
os.path.exists('/run/.containerenv')
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def find_chrome_binary(configured: str, provider: EnvProvider) -> Binary | None:
|
||||||
|
"""Find Chrome binary using abx-pkg, checking configured path first."""
|
||||||
|
# Try configured binary first
|
||||||
|
if configured:
|
||||||
|
try:
|
||||||
|
binary = Binary(name=configured, binproviders=[provider]).load()
|
||||||
|
if binary.abspath:
|
||||||
|
return binary
|
||||||
|
except Exception:
|
||||||
|
pass
|
||||||
|
|
||||||
|
# Search common names
|
||||||
|
for name in CHROME_BINARY_NAMES:
|
||||||
|
try:
|
||||||
|
binary = Binary(name=name, binproviders=[provider]).load()
|
||||||
|
if binary.abspath:
|
||||||
|
return binary
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def output_binary(binary: Binary, name: str):
|
||||||
|
"""Output Binary JSONL record to stdout."""
|
||||||
|
machine_id = os.environ.get('MACHINE_ID', '')
|
||||||
|
|
||||||
|
record = {
|
||||||
|
'type': 'Binary',
|
||||||
|
'name': name,
|
||||||
|
'abspath': str(binary.abspath),
|
||||||
|
'version': str(binary.version) if binary.version else '',
|
||||||
|
'sha256': binary.sha256 or '',
|
||||||
|
'binprovider': 'env',
|
||||||
|
'machine_id': machine_id,
|
||||||
|
}
|
||||||
|
print(json.dumps(record))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
warnings = []
|
||||||
|
errors = []
|
||||||
|
computed = {}
|
||||||
|
|
||||||
|
# Get config values
|
||||||
|
chrome_binary = get_env('CHROME_BINARY', 'chromium')
|
||||||
|
chrome_sandbox = get_env_bool('CHROME_SANDBOX', True)
|
||||||
|
screenshot_enabled = get_env_bool('SCREENSHOT_ENABLED', True)
|
||||||
|
pdf_enabled = get_env_bool('PDF_ENABLED', True)
|
||||||
|
dom_enabled = get_env_bool('DOM_ENABLED', True)
|
||||||
|
|
||||||
|
# Compute USE_CHROME (derived from extractor enabled flags)
|
||||||
|
use_chrome = screenshot_enabled or pdf_enabled or dom_enabled
|
||||||
|
computed['USE_CHROME'] = str(use_chrome).lower()
|
||||||
|
|
||||||
|
# Detect Docker and adjust sandbox
|
||||||
|
in_docker = detect_docker()
|
||||||
|
computed['IN_DOCKER'] = str(in_docker).lower()
|
||||||
|
|
||||||
|
if in_docker and chrome_sandbox:
|
||||||
|
warnings.append(
|
||||||
|
"Running in Docker with CHROME_SANDBOX=true. "
|
||||||
|
"Chrome may fail to start. Consider setting CHROME_SANDBOX=false."
|
||||||
|
)
|
||||||
|
# Auto-disable sandbox in Docker unless explicitly set
|
||||||
|
if not get_env('CHROME_SANDBOX'):
|
||||||
|
computed['CHROME_SANDBOX'] = 'false'
|
||||||
|
|
||||||
|
# Find Chrome binary using abx-pkg
|
||||||
|
provider = EnvProvider()
|
||||||
|
if use_chrome:
|
||||||
|
chrome = find_chrome_binary(chrome_binary, provider)
|
||||||
|
if not chrome or not chrome.abspath:
|
||||||
|
errors.append(
|
||||||
|
f"Chrome binary not found (tried: {chrome_binary}). "
|
||||||
|
"Install Chrome/Chromium or set CHROME_BINARY path."
|
||||||
|
)
|
||||||
|
computed['CHROME_BINARY'] = ''
|
||||||
|
else:
|
||||||
|
computed['CHROME_BINARY'] = str(chrome.abspath)
|
||||||
|
computed['CHROME_VERSION'] = str(chrome.version) if chrome.version else 'unknown'
|
||||||
|
|
||||||
|
# Output Binary JSONL record for Chrome
|
||||||
|
output_binary(chrome, name='chrome')
|
||||||
|
|
||||||
|
# Check Node.js for Puppeteer
|
||||||
|
node_binary_name = get_env('NODE_BINARY', 'node')
|
||||||
|
try:
|
||||||
|
node = Binary(name=node_binary_name, binproviders=[provider]).load()
|
||||||
|
node_path = str(node.abspath) if node.abspath else ''
|
||||||
|
except Exception:
|
||||||
|
node = None
|
||||||
|
node_path = ''
|
||||||
|
|
||||||
|
if use_chrome and not node_path:
|
||||||
|
errors.append(
|
||||||
|
f"Node.js not found (tried: {node_binary_name}). "
|
||||||
|
"Install Node.js or set NODE_BINARY path for Puppeteer."
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
computed['NODE_BINARY'] = node_path
|
||||||
|
if node and node.abspath:
|
||||||
|
# Output Binary JSONL record for Node
|
||||||
|
output_binary(node, name='node')
|
||||||
|
|
||||||
|
# Output computed values
|
||||||
|
for key, value in computed.items():
|
||||||
|
print(f"COMPUTED:{key}={value}")
|
||||||
|
|
||||||
|
for warning in warnings:
|
||||||
|
print(f"WARNING:{warning}", file=sys.stderr)
|
||||||
|
|
||||||
|
for error in errors:
|
||||||
|
print(f"ERROR:{error}", file=sys.stderr)
|
||||||
|
|
||||||
|
sys.exit(1 if errors else 0)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
245
archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js
Normal file
245
archivebox/plugins/chrome/on_Crawl__20_chrome_launch.bg.js
Normal file
@@ -0,0 +1,245 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
/**
|
||||||
|
* Launch a shared Chromium browser session for the entire crawl.
|
||||||
|
*
|
||||||
|
* This runs once per crawl and keeps Chromium alive for all snapshots to share.
|
||||||
|
* Each snapshot creates its own tab via on_Snapshot__20_chrome_tab.bg.js.
|
||||||
|
*
|
||||||
|
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
|
||||||
|
* --load-extension and --disable-extensions-except flags.
|
||||||
|
*
|
||||||
|
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
|
||||||
|
* Output: Creates chrome/ directory under crawl output dir with:
|
||||||
|
* - cdp_url.txt: WebSocket URL for CDP connection
|
||||||
|
* - chrome.pid: Chromium process ID (for cleanup)
|
||||||
|
* - port.txt: Debug port number
|
||||||
|
* - extensions.json: Loaded extensions metadata
|
||||||
|
*
|
||||||
|
* Environment variables:
|
||||||
|
* NODE_MODULES_DIR: Path to node_modules directory for module resolution
|
||||||
|
* CHROME_BINARY: Path to Chromium binary (falls back to auto-detection)
|
||||||
|
* CHROME_RESOLUTION: Page resolution (default: 1440,2000)
|
||||||
|
* CHROME_HEADLESS: Run in headless mode (default: true)
|
||||||
|
* CHROME_CHECK_SSL_VALIDITY: Whether to check SSL certificates (default: true)
|
||||||
|
* CHROME_EXTENSIONS_DIR: Directory containing Chrome extensions
|
||||||
|
*/
|
||||||
|
|
||||||
|
// Add NODE_MODULES_DIR to module resolution paths if set
|
||||||
|
if (process.env.NODE_MODULES_DIR) {
|
||||||
|
module.paths.unshift(process.env.NODE_MODULES_DIR);
|
||||||
|
}
|
||||||
|
|
||||||
|
const fs = require('fs');
|
||||||
|
const path = require('path');
|
||||||
|
const puppeteer = require('puppeteer-core');
|
||||||
|
const {
|
||||||
|
findChromium,
|
||||||
|
launchChromium,
|
||||||
|
killChrome,
|
||||||
|
getEnv,
|
||||||
|
writePidWithMtime,
|
||||||
|
} = require('./chrome_utils.js');
|
||||||
|
|
||||||
|
// Extractor metadata
|
||||||
|
const PLUGIN_NAME = 'chrome_launch';
|
||||||
|
const OUTPUT_DIR = 'chrome';
|
||||||
|
|
||||||
|
// Global state for cleanup
|
||||||
|
let chromePid = null;
|
||||||
|
let browserInstance = null;
|
||||||
|
|
||||||
|
// Parse command line arguments
|
||||||
|
function parseArgs() {
|
||||||
|
const args = {};
|
||||||
|
process.argv.slice(2).forEach((arg) => {
|
||||||
|
if (arg.startsWith('--')) {
|
||||||
|
const [key, ...valueParts] = arg.slice(2).split('=');
|
||||||
|
args[key.replace(/-/g, '_')] = valueParts.join('=') || true;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
return args;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Cleanup handler for SIGTERM
|
||||||
|
async function cleanup() {
|
||||||
|
console.error('[*] Cleaning up Chrome session...');
|
||||||
|
|
||||||
|
// Try graceful browser close first
|
||||||
|
if (browserInstance) {
|
||||||
|
try {
|
||||||
|
console.error('[*] Closing browser gracefully...');
|
||||||
|
await browserInstance.close();
|
||||||
|
browserInstance = null;
|
||||||
|
console.error('[+] Browser closed gracefully');
|
||||||
|
} catch (e) {
|
||||||
|
console.error(`[!] Graceful close failed: ${e.message}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Kill Chrome process
|
||||||
|
if (chromePid) {
|
||||||
|
await killChrome(chromePid, OUTPUT_DIR);
|
||||||
|
}
|
||||||
|
|
||||||
|
process.exit(0);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Register signal handlers
|
||||||
|
process.on('SIGTERM', cleanup);
|
||||||
|
process.on('SIGINT', cleanup);
|
||||||
|
|
||||||
|
async function main() {
|
||||||
|
const args = parseArgs();
|
||||||
|
const crawlId = args.crawl_id;
|
||||||
|
|
||||||
|
try {
|
||||||
|
const binary = findChromium();
|
||||||
|
if (!binary) {
|
||||||
|
console.error('ERROR: Chromium binary not found');
|
||||||
|
console.error('DEPENDENCY_NEEDED=chromium');
|
||||||
|
console.error('BIN_PROVIDERS=puppeteer,env,playwright,apt,brew');
|
||||||
|
console.error('INSTALL_HINT=npx @puppeteer/browsers install chromium@latest');
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Get Chromium version
|
||||||
|
let version = '';
|
||||||
|
try {
|
||||||
|
const { execSync } = require('child_process');
|
||||||
|
version = execSync(`"${binary}" --version`, { encoding: 'utf8', timeout: 5000 })
|
||||||
|
.trim()
|
||||||
|
.slice(0, 64);
|
||||||
|
} catch (e) {}
|
||||||
|
|
||||||
|
console.error(`[*] Using browser: ${binary}`);
|
||||||
|
if (version) console.error(`[*] Version: ${version}`);
|
||||||
|
|
||||||
|
// Load installed extensions
|
||||||
|
const extensionsDir = getEnv('CHROME_EXTENSIONS_DIR') ||
|
||||||
|
path.join(getEnv('DATA_DIR', '.'), 'personas', getEnv('ACTIVE_PERSONA', 'Default'), 'chrome_extensions');
|
||||||
|
|
||||||
|
const installedExtensions = [];
|
||||||
|
const extensionPaths = [];
|
||||||
|
if (fs.existsSync(extensionsDir)) {
|
||||||
|
const files = fs.readdirSync(extensionsDir);
|
||||||
|
for (const file of files) {
|
||||||
|
if (file.endsWith('.extension.json')) {
|
||||||
|
try {
|
||||||
|
const extPath = path.join(extensionsDir, file);
|
||||||
|
const extData = JSON.parse(fs.readFileSync(extPath, 'utf-8'));
|
||||||
|
if (extData.unpacked_path && fs.existsSync(extData.unpacked_path)) {
|
||||||
|
installedExtensions.push(extData);
|
||||||
|
extensionPaths.push(extData.unpacked_path);
|
||||||
|
console.error(`[*] Loading extension: ${extData.name || file}`);
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
console.warn(`[!] Skipping invalid extension cache: ${file}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (installedExtensions.length > 0) {
|
||||||
|
console.error(`[+] Found ${installedExtensions.length} extension(s) to load`);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write hook's own PID
|
||||||
|
const hookStartTime = Date.now() / 1000;
|
||||||
|
if (!fs.existsSync(OUTPUT_DIR)) {
|
||||||
|
fs.mkdirSync(OUTPUT_DIR, { recursive: true });
|
||||||
|
}
|
||||||
|
writePidWithMtime(path.join(OUTPUT_DIR, 'hook.pid'), process.pid, hookStartTime);
|
||||||
|
|
||||||
|
// Launch Chromium using consolidated function
|
||||||
|
const result = await launchChromium({
|
||||||
|
binary,
|
||||||
|
outputDir: OUTPUT_DIR,
|
||||||
|
extensionPaths,
|
||||||
|
});
|
||||||
|
|
||||||
|
if (!result.success) {
|
||||||
|
console.error(`ERROR: ${result.error}`);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
chromePid = result.pid;
|
||||||
|
const cdpUrl = result.cdpUrl;
|
||||||
|
|
||||||
|
// Write extensions metadata
|
||||||
|
if (installedExtensions.length > 0) {
|
||||||
|
fs.writeFileSync(
|
||||||
|
path.join(OUTPUT_DIR, 'extensions.json'),
|
||||||
|
JSON.stringify(installedExtensions, null, 2)
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Connect puppeteer for extension verification
|
||||||
|
console.error(`[*] Connecting puppeteer to CDP...`);
|
||||||
|
const browser = await puppeteer.connect({
|
||||||
|
browserWSEndpoint: cdpUrl,
|
||||||
|
defaultViewport: null,
|
||||||
|
});
|
||||||
|
browserInstance = browser;
|
||||||
|
|
||||||
|
// Verify extensions loaded
|
||||||
|
if (extensionPaths.length > 0) {
|
||||||
|
await new Promise(r => setTimeout(r, 3000));
|
||||||
|
|
||||||
|
const targets = browser.targets();
|
||||||
|
console.error(`[*] All browser targets (${targets.length}):`);
|
||||||
|
for (const t of targets) {
|
||||||
|
console.error(` - ${t.type()}: ${t.url().slice(0, 80)}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const extTargets = targets.filter(t =>
|
||||||
|
t.url().startsWith('chrome-extension://') ||
|
||||||
|
t.type() === 'service_worker' ||
|
||||||
|
t.type() === 'background_page'
|
||||||
|
);
|
||||||
|
|
||||||
|
// Filter out built-in extensions
|
||||||
|
const builtinIds = [
|
||||||
|
'nkeimhogjdpnpccoofpliimaahmaaome',
|
||||||
|
'fignfifoniblkonapihmkfakmlgkbkcf',
|
||||||
|
'ahfgeienlihckogmohjhadlkjgocpleb',
|
||||||
|
'mhjfbmdgcfjbbpaeojofohoefgiehjai',
|
||||||
|
];
|
||||||
|
const customExtTargets = extTargets.filter(t => {
|
||||||
|
const url = t.url();
|
||||||
|
if (!url.startsWith('chrome-extension://')) return false;
|
||||||
|
const extId = url.split('://')[1].split('/')[0];
|
||||||
|
return !builtinIds.includes(extId);
|
||||||
|
});
|
||||||
|
|
||||||
|
console.error(`[+] Found ${customExtTargets.length} custom extension target(s)`);
|
||||||
|
|
||||||
|
for (const target of customExtTargets) {
|
||||||
|
const url = target.url();
|
||||||
|
const extId = url.split('://')[1].split('/')[0];
|
||||||
|
console.error(`[+] Extension loaded: ${extId} (${target.type()})`);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (customExtTargets.length === 0 && extensionPaths.length > 0) {
|
||||||
|
console.error(`[!] Warning: No custom extensions detected. Extension loading may have failed.`);
|
||||||
|
console.error(`[!] Make sure you are using Chromium, not Chrome (Chrome 137+ removed --load-extension support)`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.error(`[+] Chromium session started for crawl ${crawlId}`);
|
||||||
|
console.error(`[+] CDP URL: ${cdpUrl}`);
|
||||||
|
console.error(`[+] PID: ${chromePid}`);
|
||||||
|
|
||||||
|
// Stay alive to handle cleanup on SIGTERM
|
||||||
|
console.log('[*] Chromium launch hook staying alive to handle cleanup...');
|
||||||
|
setInterval(() => {}, 1000000);
|
||||||
|
|
||||||
|
} catch (e) {
|
||||||
|
console.error(`ERROR: ${e.name}: ${e.message}`);
|
||||||
|
process.exit(1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
main().catch((e) => {
|
||||||
|
console.error(`Fatal error: ${e.message}`);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
@@ -0,0 +1,115 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
/**
|
||||||
|
* I Still Don't Care About Cookies Extension Plugin
|
||||||
|
*
|
||||||
|
* Installs and configures the "I still don't care about cookies" Chrome extension
|
||||||
|
* for automatic cookie consent banner dismissal during page archiving.
|
||||||
|
*
|
||||||
|
* Extension: https://chromewebstore.google.com/detail/edibdbjcniadpccecjdfdjjppcpchdlm
|
||||||
|
*
|
||||||
|
* Priority: 02 (early) - Must install before Chrome session starts at Crawl level
|
||||||
|
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||||
|
*
|
||||||
|
* This extension automatically:
|
||||||
|
* - Dismisses cookie consent popups
|
||||||
|
* - Removes cookie banners
|
||||||
|
* - Accepts necessary cookies to proceed with browsing
|
||||||
|
* - Works on thousands of websites out of the box
|
||||||
|
*/
|
||||||
|
|
||||||
|
const path = require('path');
|
||||||
|
const fs = require('fs');
|
||||||
|
|
||||||
|
// Import extension utilities
|
||||||
|
const extensionUtils = require('../chrome/chrome_utils.js');
|
||||||
|
|
||||||
|
// Extension metadata
|
||||||
|
const EXTENSION = {
|
||||||
|
webstore_id: 'edibdbjcniadpccecjdfdjjppcpchdlm',
|
||||||
|
name: 'istilldontcareaboutcookies',
|
||||||
|
};
|
||||||
|
|
||||||
|
// Get extensions directory from environment or use default
|
||||||
|
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
||||||
|
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Install the I Still Don't Care About Cookies extension
|
||||||
|
*/
|
||||||
|
async function installCookiesExtension() {
|
||||||
|
console.log('[*] Installing I Still Don\'t Care About Cookies extension...');
|
||||||
|
|
||||||
|
// Install the extension
|
||||||
|
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
|
||||||
|
|
||||||
|
if (!extension) {
|
||||||
|
console.error('[❌] Failed to install I Still Don\'t Care About Cookies extension');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('[+] I Still Don\'t Care About Cookies extension installed');
|
||||||
|
console.log('[+] Cookie banners will be automatically dismissed during archiving');
|
||||||
|
|
||||||
|
return extension;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Note: This extension works out of the box with no configuration needed.
|
||||||
|
* It automatically detects and dismisses cookie banners on page load.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main entry point - install extension before archiving
|
||||||
|
*/
|
||||||
|
async function main() {
|
||||||
|
// Check if extension is already cached
|
||||||
|
const cacheFile = path.join(EXTENSIONS_DIR, 'istilldontcareaboutcookies.extension.json');
|
||||||
|
|
||||||
|
if (fs.existsSync(cacheFile)) {
|
||||||
|
try {
|
||||||
|
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
|
||||||
|
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
|
||||||
|
|
||||||
|
if (fs.existsSync(manifestPath)) {
|
||||||
|
console.log('[*] I Still Don\'t Care About Cookies extension already installed (using cache)');
|
||||||
|
return cached;
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// Cache file corrupted, re-install
|
||||||
|
console.warn('[⚠️] Extension cache corrupted, re-installing...');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Install extension
|
||||||
|
const extension = await installCookiesExtension();
|
||||||
|
|
||||||
|
// Export extension metadata for chrome plugin to load
|
||||||
|
if (extension) {
|
||||||
|
// Write extension info to a cache file that chrome plugin can read
|
||||||
|
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
|
||||||
|
await fs.promises.writeFile(
|
||||||
|
cacheFile,
|
||||||
|
JSON.stringify(extension, null, 2)
|
||||||
|
);
|
||||||
|
console.log(`[+] Extension metadata written to ${cacheFile}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return extension;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Export functions for use by other plugins
|
||||||
|
module.exports = {
|
||||||
|
EXTENSION,
|
||||||
|
installCookiesExtension,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Run if executed directly
|
||||||
|
if (require.main === module) {
|
||||||
|
main().then(() => {
|
||||||
|
console.log('[✓] I Still Don\'t Care About Cookies extension setup complete');
|
||||||
|
process.exit(0);
|
||||||
|
}).catch(err => {
|
||||||
|
console.error('[❌] I Still Don\'t Care About Cookies extension setup failed:', err);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
|
}
|
||||||
268
archivebox/plugins/singlefile/on_Crawl__04_singlefile.js
Executable file
268
archivebox/plugins/singlefile/on_Crawl__04_singlefile.js
Executable file
@@ -0,0 +1,268 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
/**
|
||||||
|
* SingleFile Extension Plugin
|
||||||
|
*
|
||||||
|
* Installs and uses the SingleFile Chrome extension for archiving complete web pages.
|
||||||
|
* Falls back to single-file-cli if the extension is not available.
|
||||||
|
*
|
||||||
|
* Extension: https://chromewebstore.google.com/detail/mpiodijhokgodhhofbcjdecpffjipkle
|
||||||
|
*
|
||||||
|
* Priority: 04 (early) - Must install before Chrome session starts at Crawl level
|
||||||
|
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||||
|
*
|
||||||
|
* This extension automatically:
|
||||||
|
* - Saves complete web pages as single HTML files
|
||||||
|
* - Inlines all resources (CSS, JS, images, fonts)
|
||||||
|
* - Preserves page fidelity better than wget/curl
|
||||||
|
* - Works with SPAs and dynamically loaded content
|
||||||
|
*/
|
||||||
|
|
||||||
|
const path = require('path');
|
||||||
|
const fs = require('fs');
|
||||||
|
const { promisify } = require('util');
|
||||||
|
const { exec } = require('child_process');
|
||||||
|
|
||||||
|
const execAsync = promisify(exec);
|
||||||
|
|
||||||
|
// Import extension utilities
|
||||||
|
const extensionUtils = require('../chrome/chrome_utils.js');
|
||||||
|
|
||||||
|
// Extension metadata
|
||||||
|
const EXTENSION = {
|
||||||
|
webstore_id: 'mpiodijhokgodhhofbcjdecpffjipkle',
|
||||||
|
name: 'singlefile',
|
||||||
|
};
|
||||||
|
|
||||||
|
// Get extensions directory from environment or use default
|
||||||
|
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
||||||
|
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||||
|
|
||||||
|
const CHROME_DOWNLOADS_DIR = process.env.CHROME_DOWNLOADS_DIR ||
|
||||||
|
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_downloads');
|
||||||
|
|
||||||
|
const OUTPUT_DIR = '.';
|
||||||
|
const OUTPUT_FILE = 'singlefile.html';
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Install the SingleFile extension
|
||||||
|
*/
|
||||||
|
async function installSinglefileExtension() {
|
||||||
|
console.log('[*] Installing SingleFile extension...');
|
||||||
|
|
||||||
|
// Install the extension
|
||||||
|
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
|
||||||
|
|
||||||
|
if (!extension) {
|
||||||
|
console.error('[❌] Failed to install SingleFile extension');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('[+] SingleFile extension installed');
|
||||||
|
console.log('[+] Web pages will be saved as single HTML files');
|
||||||
|
|
||||||
|
return extension;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Wait for a specified amount of time
|
||||||
|
*/
|
||||||
|
function wait(ms) {
|
||||||
|
return new Promise(resolve => setTimeout(resolve, ms));
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Save a page using the SingleFile extension
|
||||||
|
*
|
||||||
|
* @param {Object} page - Puppeteer page object
|
||||||
|
* @param {Object} extension - Extension metadata with dispatchAction method
|
||||||
|
* @param {Object} options - Additional options
|
||||||
|
* @returns {Promise<string|null>} - Path to saved file or null on failure
|
||||||
|
*/
|
||||||
|
async function saveSinglefileWithExtension(page, extension, options = {}) {
|
||||||
|
if (!extension || !extension.version) {
|
||||||
|
throw new Error('SingleFile extension not found or not loaded');
|
||||||
|
}
|
||||||
|
|
||||||
|
const url = await page.url();
|
||||||
|
|
||||||
|
// Check for unsupported URL schemes
|
||||||
|
const URL_SCHEMES_IGNORED = ['about', 'chrome', 'chrome-extension', 'data', 'javascript', 'blob'];
|
||||||
|
const scheme = url.split(':')[0];
|
||||||
|
if (URL_SCHEMES_IGNORED.includes(scheme)) {
|
||||||
|
console.log(`[⚠️] Skipping SingleFile for URL scheme: ${scheme}`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Ensure downloads directory exists
|
||||||
|
await fs.promises.mkdir(CHROME_DOWNLOADS_DIR, { recursive: true });
|
||||||
|
|
||||||
|
// Get list of existing files to ignore
|
||||||
|
const files_before = new Set(
|
||||||
|
(await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||||
|
.filter(fn => fn.endsWith('.html'))
|
||||||
|
);
|
||||||
|
|
||||||
|
// Output directory is current directory (hook already runs in output dir)
|
||||||
|
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||||
|
|
||||||
|
console.log(`[🛠️] Saving SingleFile HTML using extension (${extension.id})...`);
|
||||||
|
|
||||||
|
// Bring page to front (extension action button acts on foreground tab)
|
||||||
|
await page.bringToFront();
|
||||||
|
|
||||||
|
// Trigger the extension's action (toolbar button click)
|
||||||
|
await extension.dispatchAction();
|
||||||
|
|
||||||
|
// Wait for file to appear in downloads directory
|
||||||
|
const check_delay = 3000; // 3 seconds
|
||||||
|
const max_tries = 10;
|
||||||
|
let files_new = [];
|
||||||
|
|
||||||
|
for (let attempt = 0; attempt < max_tries; attempt++) {
|
||||||
|
await wait(check_delay);
|
||||||
|
|
||||||
|
const files_after = (await fs.promises.readdir(CHROME_DOWNLOADS_DIR))
|
||||||
|
.filter(fn => fn.endsWith('.html'));
|
||||||
|
|
||||||
|
files_new = files_after.filter(file => !files_before.has(file));
|
||||||
|
|
||||||
|
if (files_new.length === 0) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Find the matching file by checking if it contains the URL in the HTML header
|
||||||
|
for (const file of files_new) {
|
||||||
|
const dl_path = path.join(CHROME_DOWNLOADS_DIR, file);
|
||||||
|
const dl_text = await fs.promises.readFile(dl_path, 'utf-8');
|
||||||
|
const dl_header = dl_text.split('meta charset')[0];
|
||||||
|
|
||||||
|
if (dl_header.includes(`url: ${url}`)) {
|
||||||
|
console.log(`[✍️] Moving SingleFile download from ${file} to ${out_path}`);
|
||||||
|
await fs.promises.rename(dl_path, out_path);
|
||||||
|
return out_path;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.warn(`[❌] Couldn't find matching SingleFile HTML in ${CHROME_DOWNLOADS_DIR} after waiting ${(check_delay * max_tries) / 1000}s`);
|
||||||
|
console.warn(`[⚠️] New files found: ${files_new.join(', ')}`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Save a page using single-file-cli (fallback method)
|
||||||
|
*
|
||||||
|
* @param {string} url - URL to archive
|
||||||
|
* @param {Object} options - Additional options
|
||||||
|
* @returns {Promise<string|null>} - Path to saved file or null on failure
|
||||||
|
*/
|
||||||
|
async function saveSinglefileWithCLI(url, options = {}) {
|
||||||
|
console.log('[*] Falling back to single-file-cli...');
|
||||||
|
|
||||||
|
// Find single-file binary
|
||||||
|
let binary = null;
|
||||||
|
try {
|
||||||
|
const { stdout } = await execAsync('which single-file');
|
||||||
|
binary = stdout.trim();
|
||||||
|
} catch (err) {
|
||||||
|
console.error('[❌] single-file-cli not found. Install with: npm install -g single-file-cli');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Output directory is current directory (hook already runs in output dir)
|
||||||
|
const out_path = path.join(OUTPUT_DIR, OUTPUT_FILE);
|
||||||
|
|
||||||
|
// Build command
|
||||||
|
const cmd = [
|
||||||
|
binary,
|
||||||
|
'--browser-headless',
|
||||||
|
url,
|
||||||
|
out_path,
|
||||||
|
];
|
||||||
|
|
||||||
|
// Add optional args
|
||||||
|
if (options.userAgent) {
|
||||||
|
cmd.splice(2, 0, '--browser-user-agent', options.userAgent);
|
||||||
|
}
|
||||||
|
if (options.cookiesFile && fs.existsSync(options.cookiesFile)) {
|
||||||
|
cmd.splice(2, 0, '--browser-cookies-file', options.cookiesFile);
|
||||||
|
}
|
||||||
|
if (options.ignoreSSL) {
|
||||||
|
cmd.splice(2, 0, '--browser-ignore-insecure-certs');
|
||||||
|
}
|
||||||
|
|
||||||
|
// Execute
|
||||||
|
try {
|
||||||
|
const timeout = options.timeout || 120000;
|
||||||
|
await execAsync(cmd.join(' '), { timeout });
|
||||||
|
|
||||||
|
if (fs.existsSync(out_path) && fs.statSync(out_path).size > 0) {
|
||||||
|
console.log(`[+] SingleFile saved via CLI: ${out_path}`);
|
||||||
|
return out_path;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.error('[❌] SingleFile CLI completed but no output file found');
|
||||||
|
return null;
|
||||||
|
} catch (err) {
|
||||||
|
console.error(`[❌] SingleFile CLI error: ${err.message}`);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main entry point - install extension before archiving
|
||||||
|
*/
|
||||||
|
async function main() {
|
||||||
|
// Check if extension is already cached
|
||||||
|
const cacheFile = path.join(EXTENSIONS_DIR, 'singlefile.extension.json');
|
||||||
|
|
||||||
|
if (fs.existsSync(cacheFile)) {
|
||||||
|
try {
|
||||||
|
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
|
||||||
|
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
|
||||||
|
|
||||||
|
if (fs.existsSync(manifestPath)) {
|
||||||
|
console.log('[*] SingleFile extension already installed (using cache)');
|
||||||
|
return cached;
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// Cache file corrupted, re-install
|
||||||
|
console.warn('[⚠️] Extension cache corrupted, re-installing...');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Install extension
|
||||||
|
const extension = await installSinglefileExtension();
|
||||||
|
|
||||||
|
// Export extension metadata for chrome plugin to load
|
||||||
|
if (extension) {
|
||||||
|
// Write extension info to a cache file that chrome plugin can read
|
||||||
|
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
|
||||||
|
await fs.promises.writeFile(
|
||||||
|
cacheFile,
|
||||||
|
JSON.stringify(extension, null, 2)
|
||||||
|
);
|
||||||
|
console.log(`[+] Extension metadata written to ${cacheFile}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return extension;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Export functions for use by other plugins
|
||||||
|
module.exports = {
|
||||||
|
EXTENSION,
|
||||||
|
installSinglefileExtension,
|
||||||
|
saveSinglefileWithExtension,
|
||||||
|
saveSinglefileWithCLI,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Run if executed directly
|
||||||
|
if (require.main === module) {
|
||||||
|
main().then(() => {
|
||||||
|
console.log('[✓] SingleFile extension setup complete');
|
||||||
|
process.exit(0);
|
||||||
|
}).catch(err => {
|
||||||
|
console.error('[❌] SingleFile extension setup failed:', err);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
|
}
|
||||||
116
archivebox/plugins/ublock/on_Crawl__03_ublock.js
Executable file
116
archivebox/plugins/ublock/on_Crawl__03_ublock.js
Executable file
@@ -0,0 +1,116 @@
|
|||||||
|
#!/usr/bin/env node
|
||||||
|
/**
|
||||||
|
* uBlock Origin Extension Plugin
|
||||||
|
*
|
||||||
|
* Installs and configures the uBlock Origin Chrome extension for ad blocking
|
||||||
|
* and privacy protection during page archiving.
|
||||||
|
*
|
||||||
|
* Extension: https://chromewebstore.google.com/detail/cjpalhdlnbpafiamejdnhcphjbkeiagm
|
||||||
|
*
|
||||||
|
* Priority: 03 (early) - Must install before Chrome session starts at Crawl level
|
||||||
|
* Hook: on_Crawl (runs once per crawl, not per snapshot)
|
||||||
|
*
|
||||||
|
* This extension automatically:
|
||||||
|
* - Blocks ads, trackers, and malware domains
|
||||||
|
* - Reduces page load time and bandwidth usage
|
||||||
|
* - Improves privacy during archiving
|
||||||
|
* - Removes clutter from archived pages
|
||||||
|
* - Uses efficient blocking with filter lists
|
||||||
|
*/
|
||||||
|
|
||||||
|
const path = require('path');
|
||||||
|
const fs = require('fs');
|
||||||
|
|
||||||
|
// Import extension utilities
|
||||||
|
const extensionUtils = require('../chrome/chrome_utils.js');
|
||||||
|
|
||||||
|
// Extension metadata
|
||||||
|
const EXTENSION = {
|
||||||
|
webstore_id: 'cjpalhdlnbpafiamejdnhcphjbkeiagm',
|
||||||
|
name: 'ublock',
|
||||||
|
};
|
||||||
|
|
||||||
|
// Get extensions directory from environment or use default
|
||||||
|
const EXTENSIONS_DIR = process.env.CHROME_EXTENSIONS_DIR ||
|
||||||
|
path.join(process.env.DATA_DIR || './data', 'personas', process.env.ACTIVE_PERSONA || 'Default', 'chrome_extensions');
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Install the uBlock Origin extension
|
||||||
|
*/
|
||||||
|
async function installUblockExtension() {
|
||||||
|
console.log('[*] Installing uBlock Origin extension...');
|
||||||
|
|
||||||
|
// Install the extension
|
||||||
|
const extension = await extensionUtils.loadOrInstallExtension(EXTENSION, EXTENSIONS_DIR);
|
||||||
|
|
||||||
|
if (!extension) {
|
||||||
|
console.error('[❌] Failed to install uBlock Origin extension');
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log('[+] uBlock Origin extension installed');
|
||||||
|
console.log('[+] Ads and trackers will be blocked during archiving');
|
||||||
|
|
||||||
|
return extension;
|
||||||
|
}
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Note: uBlock Origin works automatically with default filter lists.
|
||||||
|
* No configuration needed - blocks ads, trackers, and malware domains out of the box.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/**
|
||||||
|
* Main entry point - install extension before archiving
|
||||||
|
*/
|
||||||
|
async function main() {
|
||||||
|
// Check if extension is already cached
|
||||||
|
const cacheFile = path.join(EXTENSIONS_DIR, 'ublock.extension.json');
|
||||||
|
|
||||||
|
if (fs.existsSync(cacheFile)) {
|
||||||
|
try {
|
||||||
|
const cached = JSON.parse(fs.readFileSync(cacheFile, 'utf-8'));
|
||||||
|
const manifestPath = path.join(cached.unpacked_path, 'manifest.json');
|
||||||
|
|
||||||
|
if (fs.existsSync(manifestPath)) {
|
||||||
|
console.log('[*] uBlock Origin extension already installed (using cache)');
|
||||||
|
return cached;
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
// Cache file corrupted, re-install
|
||||||
|
console.warn('[⚠️] Extension cache corrupted, re-installing...');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Install extension
|
||||||
|
const extension = await installUblockExtension();
|
||||||
|
|
||||||
|
// Export extension metadata for chrome plugin to load
|
||||||
|
if (extension) {
|
||||||
|
// Write extension info to a cache file that chrome plugin can read
|
||||||
|
await fs.promises.mkdir(EXTENSIONS_DIR, { recursive: true });
|
||||||
|
await fs.promises.writeFile(
|
||||||
|
cacheFile,
|
||||||
|
JSON.stringify(extension, null, 2)
|
||||||
|
);
|
||||||
|
console.log(`[+] Extension metadata written to ${cacheFile}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
return extension;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Export functions for use by other plugins
|
||||||
|
module.exports = {
|
||||||
|
EXTENSION,
|
||||||
|
installUblockExtension,
|
||||||
|
};
|
||||||
|
|
||||||
|
// Run if executed directly
|
||||||
|
if (require.main === module) {
|
||||||
|
main().then(() => {
|
||||||
|
console.log('[✓] uBlock Origin extension setup complete');
|
||||||
|
process.exit(0);
|
||||||
|
}).catch(err => {
|
||||||
|
console.error('[❌] uBlock Origin extension setup failed:', err);
|
||||||
|
process.exit(1);
|
||||||
|
});
|
||||||
|
}
|
||||||
130
archivebox/plugins/wget/on_Crawl__10_wget_validate_config.py
Normal file
130
archivebox/plugins/wget/on_Crawl__10_wget_validate_config.py
Normal file
@@ -0,0 +1,130 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Validate and compute derived wget config values.
|
||||||
|
|
||||||
|
This hook runs early in the Crawl lifecycle to:
|
||||||
|
1. Validate config values with warnings (not hard errors)
|
||||||
|
2. Compute derived values (USE_WGET from WGET_ENABLED)
|
||||||
|
3. Check binary availability and version
|
||||||
|
|
||||||
|
Output:
|
||||||
|
- COMPUTED:KEY=VALUE lines that hooks.py parses and adds to env
|
||||||
|
- Binary JSONL records to stdout when binaries are found
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import shutil
|
||||||
|
import subprocess
|
||||||
|
import sys
|
||||||
|
|
||||||
|
from abx_pkg import Binary, EnvProvider
|
||||||
|
|
||||||
|
|
||||||
|
# Read config from environment (already validated by JSONSchema)
|
||||||
|
def get_env(name: str, default: str = '') -> str:
|
||||||
|
return os.environ.get(name, default).strip()
|
||||||
|
|
||||||
|
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||||
|
val = get_env(name, '').lower()
|
||||||
|
if val in ('true', '1', 'yes', 'on'):
|
||||||
|
return True
|
||||||
|
if val in ('false', '0', 'no', 'off'):
|
||||||
|
return False
|
||||||
|
return default
|
||||||
|
|
||||||
|
def get_env_int(name: str, default: int = 0) -> int:
|
||||||
|
try:
|
||||||
|
return int(get_env(name, str(default)))
|
||||||
|
except ValueError:
|
||||||
|
return default
|
||||||
|
|
||||||
|
|
||||||
|
def output_binary(binary: Binary, name: str):
|
||||||
|
"""Output Binary JSONL record to stdout."""
|
||||||
|
machine_id = os.environ.get('MACHINE_ID', '')
|
||||||
|
|
||||||
|
record = {
|
||||||
|
'type': 'Binary',
|
||||||
|
'name': name,
|
||||||
|
'abspath': str(binary.abspath),
|
||||||
|
'version': str(binary.version) if binary.version else '',
|
||||||
|
'sha256': binary.sha256 or '',
|
||||||
|
'binprovider': 'env',
|
||||||
|
'machine_id': machine_id,
|
||||||
|
}
|
||||||
|
print(json.dumps(record))
|
||||||
|
|
||||||
|
|
||||||
|
def main():
|
||||||
|
warnings = []
|
||||||
|
errors = []
|
||||||
|
computed = {}
|
||||||
|
|
||||||
|
# Get config values
|
||||||
|
wget_enabled = get_env_bool('WGET_ENABLED', True)
|
||||||
|
wget_save_warc = get_env_bool('WGET_SAVE_WARC', True)
|
||||||
|
wget_timeout = get_env_int('WGET_TIMEOUT') or get_env_int('TIMEOUT', 60)
|
||||||
|
wget_binary = get_env('WGET_BINARY', 'wget')
|
||||||
|
|
||||||
|
# Compute derived values (USE_WGET for backward compatibility)
|
||||||
|
use_wget = wget_enabled
|
||||||
|
computed['USE_WGET'] = str(use_wget).lower()
|
||||||
|
|
||||||
|
# Validate timeout with warning (not error)
|
||||||
|
if use_wget and wget_timeout < 20:
|
||||||
|
warnings.append(
|
||||||
|
f"WGET_TIMEOUT={wget_timeout} is very low. "
|
||||||
|
"wget may fail to archive sites if set to less than ~20 seconds. "
|
||||||
|
"Consider setting WGET_TIMEOUT=60 or higher."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Check binary availability using abx-pkg
|
||||||
|
provider = EnvProvider()
|
||||||
|
try:
|
||||||
|
binary = Binary(name=wget_binary, binproviders=[provider]).load()
|
||||||
|
binary_path = str(binary.abspath) if binary.abspath else ''
|
||||||
|
except Exception:
|
||||||
|
binary = None
|
||||||
|
binary_path = ''
|
||||||
|
|
||||||
|
if not binary_path:
|
||||||
|
if use_wget:
|
||||||
|
errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set WGET_ENABLED=false.")
|
||||||
|
computed['WGET_BINARY'] = ''
|
||||||
|
else:
|
||||||
|
computed['WGET_BINARY'] = binary_path
|
||||||
|
wget_version = str(binary.version) if binary.version else 'unknown'
|
||||||
|
computed['WGET_VERSION'] = wget_version
|
||||||
|
|
||||||
|
# Output Binary JSONL record
|
||||||
|
output_binary(binary, name='wget')
|
||||||
|
|
||||||
|
# Check for compression support
|
||||||
|
if computed.get('WGET_BINARY'):
|
||||||
|
try:
|
||||||
|
result = subprocess.run(
|
||||||
|
[computed['WGET_BINARY'], '--compression=auto', '--help'],
|
||||||
|
capture_output=True, timeout=5
|
||||||
|
)
|
||||||
|
computed['WGET_AUTO_COMPRESSION'] = 'true' if result.returncode == 0 else 'false'
|
||||||
|
except Exception:
|
||||||
|
computed['WGET_AUTO_COMPRESSION'] = 'false'
|
||||||
|
|
||||||
|
# Output results
|
||||||
|
# Format: KEY=VALUE lines that hooks.py will parse and add to env
|
||||||
|
for key, value in computed.items():
|
||||||
|
print(f"COMPUTED:{key}={value}")
|
||||||
|
|
||||||
|
for warning in warnings:
|
||||||
|
print(f"WARNING:{warning}", file=sys.stderr)
|
||||||
|
|
||||||
|
for error in errors:
|
||||||
|
print(f"ERROR:{error}", file=sys.stderr)
|
||||||
|
|
||||||
|
# Exit with error if any hard errors
|
||||||
|
sys.exit(1 if errors else 0)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
main()
|
||||||
@@ -30,7 +30,7 @@ __package__ = 'archivebox.workers'
|
|||||||
import os
|
import os
|
||||||
import time
|
import time
|
||||||
from typing import Type
|
from typing import Type
|
||||||
from multiprocessing import Process
|
from multiprocessing import Process as MPProcess
|
||||||
|
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
@@ -38,12 +38,6 @@ from rich import print
|
|||||||
|
|
||||||
from archivebox.misc.logging_util import log_worker_event
|
from archivebox.misc.logging_util import log_worker_event
|
||||||
from .worker import Worker, CrawlWorker, SnapshotWorker, ArchiveResultWorker
|
from .worker import Worker, CrawlWorker, SnapshotWorker, ArchiveResultWorker
|
||||||
from .pid_utils import (
|
|
||||||
write_pid_file,
|
|
||||||
remove_pid_file,
|
|
||||||
get_all_worker_pids,
|
|
||||||
cleanup_stale_pid_files,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def _run_orchestrator_process(exit_on_idle: bool) -> None:
|
def _run_orchestrator_process(exit_on_idle: bool) -> None:
|
||||||
@@ -78,6 +72,7 @@ class Orchestrator:
|
|||||||
self.pid: int = os.getpid()
|
self.pid: int = os.getpid()
|
||||||
self.pid_file = None
|
self.pid_file = None
|
||||||
self.idle_count: int = 0
|
self.idle_count: int = 0
|
||||||
|
self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running()
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
|
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
|
||||||
@@ -85,16 +80,26 @@ class Orchestrator:
|
|||||||
@classmethod
|
@classmethod
|
||||||
def is_running(cls) -> bool:
|
def is_running(cls) -> bool:
|
||||||
"""Check if an orchestrator is already running."""
|
"""Check if an orchestrator is already running."""
|
||||||
workers = get_all_worker_pids('orchestrator')
|
from archivebox.machine.models import Process
|
||||||
return len(workers) > 0
|
|
||||||
|
# Clean up stale processes before counting
|
||||||
|
Process.cleanup_stale_running()
|
||||||
|
return Process.get_running_count(process_type=Process.TypeChoices.ORCHESTRATOR) > 0
|
||||||
|
|
||||||
def on_startup(self) -> None:
|
def on_startup(self) -> None:
|
||||||
"""Called when orchestrator starts."""
|
"""Called when orchestrator starts."""
|
||||||
self.pid = os.getpid()
|
from archivebox.machine.models import Process
|
||||||
self.pid_file = write_pid_file('orchestrator', worker_id=0)
|
|
||||||
|
|
||||||
# Clean up any stale PID files from previous runs
|
self.pid = os.getpid()
|
||||||
stale_count = cleanup_stale_pid_files()
|
# Register orchestrator process in database with explicit type
|
||||||
|
self.db_process = Process.current()
|
||||||
|
# Ensure the process type is correctly set to ORCHESTRATOR
|
||||||
|
if self.db_process.process_type != Process.TypeChoices.ORCHESTRATOR:
|
||||||
|
self.db_process.process_type = Process.TypeChoices.ORCHESTRATOR
|
||||||
|
self.db_process.save(update_fields=['process_type'])
|
||||||
|
|
||||||
|
# Clean up any stale Process records from previous runs
|
||||||
|
stale_count = Process.cleanup_stale_running()
|
||||||
|
|
||||||
# Collect startup metadata
|
# Collect startup metadata
|
||||||
metadata = {
|
metadata = {
|
||||||
@@ -112,11 +117,16 @@ class Orchestrator:
|
|||||||
pid=self.pid,
|
pid=self.pid,
|
||||||
metadata=metadata,
|
metadata=metadata,
|
||||||
)
|
)
|
||||||
|
|
||||||
def on_shutdown(self, error: BaseException | None = None) -> None:
|
def on_shutdown(self, error: BaseException | None = None) -> None:
|
||||||
"""Called when orchestrator shuts down."""
|
"""Called when orchestrator shuts down."""
|
||||||
if self.pid_file:
|
# Update Process record status
|
||||||
remove_pid_file(self.pid_file)
|
if hasattr(self, 'db_process') and self.db_process:
|
||||||
|
# KeyboardInterrupt is a graceful shutdown, not an error
|
||||||
|
self.db_process.exit_code = 1 if error and not isinstance(error, KeyboardInterrupt) else 0
|
||||||
|
self.db_process.status = self.db_process.StatusChoices.EXITED
|
||||||
|
self.db_process.ended_at = timezone.now()
|
||||||
|
self.db_process.save()
|
||||||
|
|
||||||
log_worker_event(
|
log_worker_event(
|
||||||
worker_type='Orchestrator',
|
worker_type='Orchestrator',
|
||||||
@@ -125,10 +135,19 @@ class Orchestrator:
|
|||||||
pid=self.pid,
|
pid=self.pid,
|
||||||
error=error if error and not isinstance(error, KeyboardInterrupt) else None,
|
error=error if error and not isinstance(error, KeyboardInterrupt) else None,
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_total_worker_count(self) -> int:
|
def get_total_worker_count(self) -> int:
|
||||||
"""Get total count of running workers across all types."""
|
"""Get total count of running workers across all types."""
|
||||||
cleanup_stale_pid_files()
|
from archivebox.machine.models import Process
|
||||||
|
import time
|
||||||
|
|
||||||
|
# Throttle cleanup to once every 30 seconds to avoid performance issues
|
||||||
|
CLEANUP_THROTTLE_SECONDS = 30
|
||||||
|
now = time.time()
|
||||||
|
if now - self._last_cleanup_time > CLEANUP_THROTTLE_SECONDS:
|
||||||
|
Process.cleanup_stale_running()
|
||||||
|
self._last_cleanup_time = now
|
||||||
|
|
||||||
return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES)
|
return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES)
|
||||||
|
|
||||||
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
|
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
|
||||||
@@ -287,7 +306,7 @@ class Orchestrator:
|
|||||||
Returns the PID of the new process.
|
Returns the PID of the new process.
|
||||||
"""
|
"""
|
||||||
# Use module-level function to avoid pickle errors with local functions
|
# Use module-level function to avoid pickle errors with local functions
|
||||||
proc = Process(
|
proc = MPProcess(
|
||||||
target=_run_orchestrator_process,
|
target=_run_orchestrator_process,
|
||||||
args=(self.exit_on_idle,),
|
args=(self.exit_on_idle,),
|
||||||
name='orchestrator'
|
name='orchestrator'
|
||||||
|
|||||||
@@ -1,191 +0,0 @@
|
|||||||
"""
|
|
||||||
PID file utilities for tracking worker and orchestrator processes.
|
|
||||||
|
|
||||||
PID files are stored in data/tmp/workers/ and contain:
|
|
||||||
- Line 1: PID
|
|
||||||
- Line 2: Worker type (orchestrator, crawl, snapshot, archiveresult)
|
|
||||||
- Line 3: Extractor filter (optional, for archiveresult workers)
|
|
||||||
- Line 4: Started at ISO timestamp
|
|
||||||
"""
|
|
||||||
|
|
||||||
__package__ = 'archivebox.workers'
|
|
||||||
|
|
||||||
import os
|
|
||||||
import signal
|
|
||||||
from pathlib import Path
|
|
||||||
from datetime import datetime, timezone
|
|
||||||
|
|
||||||
from django.conf import settings
|
|
||||||
|
|
||||||
|
|
||||||
def get_pid_dir() -> Path:
|
|
||||||
"""Get the directory for PID files, creating it if needed."""
|
|
||||||
pid_dir = Path(settings.DATA_DIR) / 'tmp' / 'workers'
|
|
||||||
pid_dir.mkdir(parents=True, exist_ok=True)
|
|
||||||
return pid_dir
|
|
||||||
|
|
||||||
|
|
||||||
def write_pid_file(worker_type: str, worker_id: int = 0, extractor: str | None = None) -> Path:
|
|
||||||
"""
|
|
||||||
Write a PID file for the current process.
|
|
||||||
Returns the path to the PID file.
|
|
||||||
"""
|
|
||||||
pid_dir = get_pid_dir()
|
|
||||||
|
|
||||||
if worker_type == 'orchestrator':
|
|
||||||
pid_file = pid_dir / 'orchestrator.pid'
|
|
||||||
else:
|
|
||||||
pid_file = pid_dir / f'{worker_type}_worker_{worker_id}.pid'
|
|
||||||
|
|
||||||
content = f"{os.getpid()}\n{worker_type}\n{extractor or ''}\n{datetime.now(timezone.utc).isoformat()}\n"
|
|
||||||
pid_file.write_text(content)
|
|
||||||
|
|
||||||
return pid_file
|
|
||||||
|
|
||||||
|
|
||||||
def read_pid_file(path: Path) -> dict | None:
|
|
||||||
"""
|
|
||||||
Read and parse a PID file.
|
|
||||||
Returns dict with pid, worker_type, extractor, started_at or None if invalid.
|
|
||||||
"""
|
|
||||||
try:
|
|
||||||
if not path.exists():
|
|
||||||
return None
|
|
||||||
|
|
||||||
lines = path.read_text().strip().split('\n')
|
|
||||||
if len(lines) < 4:
|
|
||||||
return None
|
|
||||||
|
|
||||||
return {
|
|
||||||
'pid': int(lines[0]),
|
|
||||||
'worker_type': lines[1],
|
|
||||||
'extractor': lines[2] or None,
|
|
||||||
'started_at': datetime.fromisoformat(lines[3]),
|
|
||||||
'pid_file': path,
|
|
||||||
}
|
|
||||||
except (ValueError, IndexError, OSError):
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def remove_pid_file(path: Path) -> None:
|
|
||||||
"""Remove a PID file if it exists."""
|
|
||||||
try:
|
|
||||||
path.unlink(missing_ok=True)
|
|
||||||
except OSError:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
def is_process_alive(pid: int) -> bool:
|
|
||||||
"""Check if a process with the given PID is still running."""
|
|
||||||
try:
|
|
||||||
os.kill(pid, 0) # Signal 0 doesn't kill, just checks
|
|
||||||
return True
|
|
||||||
except (OSError, ProcessLookupError):
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def get_all_pid_files() -> list[Path]:
|
|
||||||
"""Get all PID files in the workers directory."""
|
|
||||||
pid_dir = get_pid_dir()
|
|
||||||
return list(pid_dir.glob('*.pid'))
|
|
||||||
|
|
||||||
|
|
||||||
def get_all_worker_pids(worker_type: str | None = None) -> list[dict]:
|
|
||||||
"""
|
|
||||||
Get info about all running workers.
|
|
||||||
Optionally filter by worker_type.
|
|
||||||
"""
|
|
||||||
workers = []
|
|
||||||
|
|
||||||
for pid_file in get_all_pid_files():
|
|
||||||
info = read_pid_file(pid_file)
|
|
||||||
if info is None:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Skip if process is dead
|
|
||||||
if not is_process_alive(info['pid']):
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Filter by type if specified
|
|
||||||
if worker_type and info['worker_type'] != worker_type:
|
|
||||||
continue
|
|
||||||
|
|
||||||
workers.append(info)
|
|
||||||
|
|
||||||
return workers
|
|
||||||
|
|
||||||
|
|
||||||
def cleanup_stale_pid_files() -> int:
|
|
||||||
"""
|
|
||||||
Remove PID files for processes that are no longer running.
|
|
||||||
Returns the number of stale files removed.
|
|
||||||
"""
|
|
||||||
removed = 0
|
|
||||||
|
|
||||||
for pid_file in get_all_pid_files():
|
|
||||||
info = read_pid_file(pid_file)
|
|
||||||
if info is None:
|
|
||||||
# Invalid PID file, remove it
|
|
||||||
remove_pid_file(pid_file)
|
|
||||||
removed += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not is_process_alive(info['pid']):
|
|
||||||
remove_pid_file(pid_file)
|
|
||||||
removed += 1
|
|
||||||
|
|
||||||
return removed
|
|
||||||
|
|
||||||
|
|
||||||
def get_running_worker_count(worker_type: str) -> int:
|
|
||||||
"""Get the count of running workers of a specific type."""
|
|
||||||
return len(get_all_worker_pids(worker_type))
|
|
||||||
|
|
||||||
|
|
||||||
def get_next_worker_id(worker_type: str) -> int:
|
|
||||||
"""Get the next available worker ID for a given type."""
|
|
||||||
existing_ids = set()
|
|
||||||
|
|
||||||
for pid_file in get_all_pid_files():
|
|
||||||
# Parse worker ID from filename like "snapshot_worker_3.pid"
|
|
||||||
name = pid_file.stem
|
|
||||||
if name.startswith(f'{worker_type}_worker_'):
|
|
||||||
try:
|
|
||||||
worker_id = int(name.split('_')[-1])
|
|
||||||
existing_ids.add(worker_id)
|
|
||||||
except ValueError:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Find the lowest unused ID
|
|
||||||
next_id = 0
|
|
||||||
while next_id in existing_ids:
|
|
||||||
next_id += 1
|
|
||||||
|
|
||||||
return next_id
|
|
||||||
|
|
||||||
|
|
||||||
def stop_worker(pid: int, graceful: bool = True) -> bool:
|
|
||||||
"""
|
|
||||||
Stop a worker process.
|
|
||||||
If graceful=True, sends SIGTERM first, then SIGKILL after timeout.
|
|
||||||
Returns True if process was stopped.
|
|
||||||
"""
|
|
||||||
if not is_process_alive(pid):
|
|
||||||
return True
|
|
||||||
|
|
||||||
try:
|
|
||||||
if graceful:
|
|
||||||
os.kill(pid, signal.SIGTERM)
|
|
||||||
# Give it a moment to shut down
|
|
||||||
import time
|
|
||||||
for _ in range(10): # Wait up to 1 second
|
|
||||||
time.sleep(0.1)
|
|
||||||
if not is_process_alive(pid):
|
|
||||||
return True
|
|
||||||
# Force kill if still running
|
|
||||||
os.kill(pid, signal.SIGKILL)
|
|
||||||
else:
|
|
||||||
os.kill(pid, signal.SIGKILL)
|
|
||||||
return True
|
|
||||||
except (OSError, ProcessLookupError):
|
|
||||||
return True # Process already dead
|
|
||||||
@@ -17,7 +17,7 @@ import traceback
|
|||||||
from typing import ClassVar, Any
|
from typing import ClassVar, Any
|
||||||
from datetime import timedelta
|
from datetime import timedelta
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from multiprocessing import Process, cpu_count
|
from multiprocessing import Process as MPProcess, cpu_count
|
||||||
|
|
||||||
from django.db.models import QuerySet
|
from django.db.models import QuerySet
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
@@ -26,13 +26,6 @@ from django.conf import settings
|
|||||||
from rich import print
|
from rich import print
|
||||||
|
|
||||||
from archivebox.misc.logging_util import log_worker_event
|
from archivebox.misc.logging_util import log_worker_event
|
||||||
from .pid_utils import (
|
|
||||||
write_pid_file,
|
|
||||||
remove_pid_file,
|
|
||||||
get_all_worker_pids,
|
|
||||||
get_next_worker_id,
|
|
||||||
cleanup_stale_pid_files,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
CPU_COUNT = cpu_count()
|
CPU_COUNT = cpu_count()
|
||||||
@@ -133,8 +126,15 @@ class Worker:
|
|||||||
|
|
||||||
def on_startup(self) -> None:
|
def on_startup(self) -> None:
|
||||||
"""Called when worker starts."""
|
"""Called when worker starts."""
|
||||||
|
from archivebox.machine.models import Process
|
||||||
|
|
||||||
self.pid = os.getpid()
|
self.pid = os.getpid()
|
||||||
self.pid_file = write_pid_file(self.name, self.worker_id)
|
# Register this worker process in the database
|
||||||
|
self.db_process = Process.current()
|
||||||
|
# Explicitly set process_type to WORKER to prevent mis-detection
|
||||||
|
if self.db_process.process_type != Process.TypeChoices.WORKER:
|
||||||
|
self.db_process.process_type = Process.TypeChoices.WORKER
|
||||||
|
self.db_process.save(update_fields=['process_type'])
|
||||||
|
|
||||||
# Determine worker type for logging
|
# Determine worker type for logging
|
||||||
worker_type_name = self.__class__.__name__
|
worker_type_name = self.__class__.__name__
|
||||||
@@ -160,9 +160,12 @@ class Worker:
|
|||||||
|
|
||||||
def on_shutdown(self, error: BaseException | None = None) -> None:
|
def on_shutdown(self, error: BaseException | None = None) -> None:
|
||||||
"""Called when worker shuts down."""
|
"""Called when worker shuts down."""
|
||||||
# Remove PID file
|
# Update Process record status
|
||||||
if self.pid_file:
|
if hasattr(self, 'db_process') and self.db_process:
|
||||||
remove_pid_file(self.pid_file)
|
self.db_process.exit_code = 1 if error else 0
|
||||||
|
self.db_process.status = self.db_process.StatusChoices.EXITED
|
||||||
|
self.db_process.ended_at = timezone.now()
|
||||||
|
self.db_process.save()
|
||||||
|
|
||||||
# Determine worker type for logging
|
# Determine worker type for logging
|
||||||
worker_type_name = self.__class__.__name__
|
worker_type_name = self.__class__.__name__
|
||||||
@@ -288,11 +291,13 @@ class Worker:
|
|||||||
Fork a new worker as a subprocess.
|
Fork a new worker as a subprocess.
|
||||||
Returns the PID of the new process.
|
Returns the PID of the new process.
|
||||||
"""
|
"""
|
||||||
|
from archivebox.machine.models import Process
|
||||||
|
|
||||||
if worker_id is None:
|
if worker_id is None:
|
||||||
worker_id = get_next_worker_id(cls.name)
|
worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
|
||||||
|
|
||||||
# Use module-level function for pickling compatibility
|
# Use module-level function for pickling compatibility
|
||||||
proc = Process(
|
proc = MPProcess(
|
||||||
target=_run_worker,
|
target=_run_worker,
|
||||||
args=(cls.name, worker_id, daemon),
|
args=(cls.name, worker_id, daemon),
|
||||||
kwargs=kwargs,
|
kwargs=kwargs,
|
||||||
@@ -304,15 +309,31 @@ class Worker:
|
|||||||
return proc.pid
|
return proc.pid
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_running_workers(cls) -> list[dict]:
|
def get_running_workers(cls) -> list:
|
||||||
"""Get info about all running workers of this type."""
|
"""Get info about all running workers of this type."""
|
||||||
cleanup_stale_pid_files()
|
from archivebox.machine.models import Process
|
||||||
return get_all_worker_pids(cls.name)
|
|
||||||
|
Process.cleanup_stale_running()
|
||||||
|
# Convert Process objects to dicts to match the expected API contract
|
||||||
|
processes = Process.get_running(process_type=Process.TypeChoices.WORKER)
|
||||||
|
# Note: worker_id is not stored on Process model, it's dynamically generated
|
||||||
|
# We return process_id (UUID) and pid (OS process ID) instead
|
||||||
|
return [
|
||||||
|
{
|
||||||
|
'pid': p.pid,
|
||||||
|
'process_id': str(p.id), # UUID of Process record
|
||||||
|
'started_at': p.started_at.isoformat() if p.started_at else None,
|
||||||
|
'status': p.status,
|
||||||
|
}
|
||||||
|
for p in processes
|
||||||
|
]
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def get_worker_count(cls) -> int:
|
def get_worker_count(cls) -> int:
|
||||||
"""Get count of running workers of this type."""
|
"""Get count of running workers of this type."""
|
||||||
return len(cls.get_running_workers())
|
from archivebox.machine.models import Process
|
||||||
|
|
||||||
|
return Process.get_running_count(process_type=Process.TypeChoices.WORKER)
|
||||||
|
|
||||||
|
|
||||||
class CrawlWorker(Worker):
|
class CrawlWorker(Worker):
|
||||||
@@ -402,11 +423,13 @@ class ArchiveResultWorker(Worker):
|
|||||||
@classmethod
|
@classmethod
|
||||||
def start(cls, worker_id: int | None = None, daemon: bool = False, plugin: str | None = None, **kwargs: Any) -> int:
|
def start(cls, worker_id: int | None = None, daemon: bool = False, plugin: str | None = None, **kwargs: Any) -> int:
|
||||||
"""Fork a new worker as subprocess with optional plugin filter."""
|
"""Fork a new worker as subprocess with optional plugin filter."""
|
||||||
|
from archivebox.machine.models import Process
|
||||||
|
|
||||||
if worker_id is None:
|
if worker_id is None:
|
||||||
worker_id = get_next_worker_id(cls.name)
|
worker_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
|
||||||
|
|
||||||
# Use module-level function for pickling compatibility
|
# Use module-level function for pickling compatibility
|
||||||
proc = Process(
|
proc = MPProcess(
|
||||||
target=_run_worker,
|
target=_run_worker,
|
||||||
args=(cls.name, worker_id, daemon),
|
args=(cls.name, worker_id, daemon),
|
||||||
kwargs={'plugin': plugin, **kwargs},
|
kwargs={'plugin': plugin, **kwargs},
|
||||||
|
|||||||
Reference in New Issue
Block a user