Refactor ArchiveBox onto abx-dl bus runner

This commit is contained in:
Nick Sweeting
2026-03-21 11:47:57 -07:00
parent ee9ed440d1
commit c87079aa0a
45 changed files with 1282 additions and 6396 deletions

View File

@@ -1,133 +0,0 @@
import os
import signal
import sqlite3
import subprocess
import sys
import time
from pathlib import Path
def _run(cmd, data_dir: Path, env: dict, timeout: int = 120):
return subprocess.run(
cmd,
cwd=data_dir,
env=env,
capture_output=True,
text=True,
timeout=timeout,
)
def _make_env(data_dir: Path) -> dict:
env = os.environ.copy()
env["DATA_DIR"] = str(data_dir)
env["USE_COLOR"] = "False"
env["SHOW_PROGRESS"] = "False"
env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
env["PLUGINS"] = "favicon"
# Keep it fast but still real hooks
env["SAVE_FAVICON"] = "True"
env["SAVE_TITLE"] = "False"
env["SAVE_WGET"] = "False"
env["SAVE_WARC"] = "False"
env["SAVE_PDF"] = "False"
env["SAVE_SCREENSHOT"] = "False"
env["SAVE_DOM"] = "False"
env["SAVE_SINGLEFILE"] = "False"
env["SAVE_READABILITY"] = "False"
env["SAVE_MERCURY"] = "False"
env["SAVE_GIT"] = "False"
env["SAVE_YTDLP"] = "False"
env["SAVE_HEADERS"] = "False"
env["SAVE_HTMLTOTEXT"] = "False"
return env
def _count_running_processes(db_path: Path, where: str) -> int:
for _ in range(50):
try:
conn = sqlite3.connect(db_path, timeout=1)
cur = conn.cursor()
count = cur.execute(
f"SELECT COUNT(*) FROM machine_process WHERE status = 'running' AND {where}"
).fetchone()[0]
conn.close()
return count
except sqlite3.OperationalError:
time.sleep(0.1)
return 0
def _wait_for_count(db_path: Path, where: str, target: int, timeout: int = 20) -> bool:
    """Poll until at least *target* matching running processes exist.

    Returns True as soon as the count reaches *target*, False if the
    timeout elapses first.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        if _count_running_processes(db_path, where) >= target:
            return True
        time.sleep(0.1)
    return False
def test_add_parents_workers_to_orchestrator(tmp_path):
    """`archivebox add` must parent its crawl workers to the orchestrator process."""
    data_dir = tmp_path / "data"
    data_dir.mkdir()
    env = _make_env(data_dir)

    # Initialize a fresh archive, then run one real (favicon-only) add.
    init_result = _run([sys.executable, "-m", "archivebox", "init", "--quick"], data_dir, env)
    assert init_result.returncode == 0, init_result.stderr
    add_result = _run(
        [sys.executable, "-m", "archivebox", "add", "--plugins=favicon", "https://example.com"],
        data_dir,
        env,
        timeout=120,
    )
    assert add_result.returncode == 0, add_result.stderr

    # Inspect the process table: the newest orchestrator must own >=1 crawl worker.
    conn = sqlite3.connect(data_dir / "index.sqlite3")
    cursor = conn.cursor()
    orchestrator_row = cursor.execute(
        "SELECT id FROM machine_process WHERE process_type = 'orchestrator' ORDER BY created_at DESC LIMIT 1"
    ).fetchone()
    assert orchestrator_row is not None
    crawl_worker_count = cursor.execute(
        "SELECT COUNT(*) FROM machine_process WHERE process_type = 'worker' AND worker_type = 'crawl' "
        "AND parent_id = ?",
        (orchestrator_row[0],),
    ).fetchone()[0]
    conn.close()
    assert crawl_worker_count >= 1, "Expected crawl worker to be parented to orchestrator"
def test_add_interrupt_cleans_orphaned_processes(tmp_path):
    """SIGINT during `archivebox add` must not leave running workers/hooks behind."""
    data_dir = tmp_path / "data"
    data_dir.mkdir()
    env = _make_env(data_dir)

    init_result = _run([sys.executable, "-m", "archivebox", "init", "--quick"], data_dir, env)
    assert init_result.returncode == 0, init_result.stderr

    # Start the add in the background so we can interrupt it mid-crawl.
    add_proc = subprocess.Popen(
        [sys.executable, "-m", "archivebox", "add", "--plugins=favicon", "https://example.com"],
        cwd=data_dir,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    db_path = data_dir / "index.sqlite3"
    worker_started = _wait_for_count(db_path, "process_type = 'worker'", 1, timeout=20)
    assert worker_started, "Expected at least one worker to start before interrupt"

    add_proc.send_signal(signal.SIGINT)
    add_proc.wait(timeout=30)

    # Give the runner up to 30s to reap its worker/hook children.
    leftover_filter = "process_type IN ('worker','hook')"
    deadline = time.time() + 30
    while time.time() < deadline:
        if _count_running_processes(db_path, leftover_filter) == 0:
            break
        time.sleep(0.2)
    assert _count_running_processes(db_path, leftover_filter) == 0, (
        "Expected no running worker/hook processes after interrupt"
    )

View File

@@ -1,5 +1,5 @@
"""
Tests for JSONL piping contracts and `archivebox run` / `archivebox orchestrator`.
Tests for JSONL piping contracts and `archivebox run`.
This file covers both:
- low-level JSONL/stdin parsing behavior that makes CLI piping work
@@ -252,8 +252,8 @@ def test_snapshot_list_stdout_pipes_into_run(initialized_archive):
assert snapshot_status == "sealed"
def test_archiveresult_list_stdout_pipes_into_orchestrator_alias(initialized_archive):
"""`archivebox archiveresult list | archivebox orchestrator` should preserve clean JSONL stdout."""
def test_archiveresult_list_stdout_pipes_into_run(initialized_archive):
"""`archivebox archiveresult list | archivebox run` should preserve clean JSONL stdout."""
url = create_test_url()
snapshot_stdout, snapshot_stderr, snapshot_code = run_archivebox_cmd(
@@ -279,18 +279,17 @@ def test_archiveresult_list_stdout_pipes_into_orchestrator_alias(initialized_arc
assert list_code == 0, list_stderr
_assert_stdout_is_jsonl_only(list_stdout)
orchestrator_stdout, orchestrator_stderr, orchestrator_code = run_archivebox_cmd(
["orchestrator"],
run_stdout, run_stderr, run_code = run_archivebox_cmd(
["run"],
stdin=list_stdout,
data_dir=initialized_archive,
timeout=120,
env=PIPE_TEST_ENV,
)
assert orchestrator_code == 0, orchestrator_stderr
_assert_stdout_is_jsonl_only(orchestrator_stdout)
assert "renamed to `archivebox run`" in orchestrator_stderr
assert run_code == 0, run_stderr
_assert_stdout_is_jsonl_only(run_stdout)
run_records = parse_jsonl_output(orchestrator_stdout)
run_records = parse_jsonl_output(run_stdout)
assert any(
record.get("type") == "ArchiveResult" and record.get("id") == archiveresult["id"]
for record in run_records

View File

@@ -1,342 +0,0 @@
"""
Tests for BinaryWorker processing Binary queue.
Tests cover:
- BinaryWorker is spawned by Orchestrator when Binary queue has work
- Binary hooks (on_Binary__*) actually run and install binaries
- Binary status transitions from QUEUED -> INSTALLED
- BinaryWorker exits after idle timeout
"""
import json
import sqlite3
from archivebox.tests.conftest import (
run_archivebox_cmd,
)
class TestBinaryWorkerSpawning:
    """Tests for BinaryWorker lifecycle."""

    def test_binary_worker_spawns_when_binary_queued(self, initialized_archive):
        """Orchestrator spawns BinaryWorker when Binary queue has work."""
        # Create a Binary record via CLI
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',  # Use env provider to detect system python
        }
        # Use `archivebox run` to create the Binary (this queues it)
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=60,  # Increased timeout to allow for binary installation
        )
        assert code == 0, f"Failed to create Binary: {stderr}"
        # Verify Binary was created in DB
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        binaries = c.execute(
            "SELECT name, status, abspath FROM machine_binary WHERE name='python3'"
        ).fetchall()
        conn.close()
        assert len(binaries) >= 1, "Binary was not created in database"
        name, status, abspath = binaries[0]
        assert name == 'python3'
        # Status should be INSTALLED after BinaryWorker processed it
        # (or QUEUED if worker timed out before installing)
        assert status in ['installed', 'queued']

    def test_binary_hooks_actually_run(self, initialized_archive):
        """Binary installation hooks (on_Binary__*) run and update abspath."""
        # Create a Binary for python3 (guaranteed to exist on system)
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0, f"Failed to process Binary: {stderr}"
        # Query database to check if hooks ran and populated abspath
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT name, status, abspath, version FROM machine_binary WHERE name='python3'"
        ).fetchone()
        conn.close()
        assert result is not None, "Binary not found in database"
        name, status, abspath, version = result
        # If hooks ran successfully, abspath should be populated.
        # NOTE(review): deliberately conditional — the worker may legitimately
        # time out leaving the record queued; only assert on success.
        if status == 'installed':
            assert abspath, f"Binary installed but abspath is empty: {abspath}"
            assert '/python3' in abspath or '\\python3' in abspath, \
                f"abspath doesn't look like a python3 path: {abspath}"
            # Version should also be populated
            assert version, f"Binary installed but version is empty: {version}"

    def test_binary_status_transitions(self, initialized_archive):
        """Binary status correctly transitions QUEUED -> INSTALLED."""
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }
        # Create and process the Binary
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0
        # Check final status
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        status = c.execute(
            "SELECT status FROM machine_binary WHERE name='python3'"
        ).fetchone()
        conn.close()
        assert status is not None
        # Should be installed (or queued if worker timed out)
        assert status[0] in ['installed', 'queued']
class TestBinaryWorkerHooks:
    """Tests for specific Binary hook providers."""

    def test_env_provider_hook_detects_system_binary(self, initialized_archive):
        """on_Binary__15_env_discover.py hook detects system binaries."""
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0
        # Check that env provider hook populated the Binary
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT binprovider, abspath FROM machine_binary WHERE name='python3' AND status='installed'"
        ).fetchone()
        conn.close()
        # Only assert when the install actually completed (row may be absent if queued)
        if result:
            binprovider, abspath = result
            assert binprovider == 'env', f"Expected env provider, got: {binprovider}"
            assert abspath, "abspath should be populated by env provider"

    def test_multiple_binaries_processed_in_batch(self, initialized_archive):
        """BinaryWorker processes multiple queued binaries."""
        # Create multiple Binary records, piped as one JSONL stream
        binaries = [
            {'type': 'Binary', 'name': 'python3', 'binproviders': 'env'},
            {'type': 'Binary', 'name': 'curl', 'binproviders': 'env'},
        ]
        stdin = '\n'.join(json.dumps(b) for b in binaries)
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=stdin,
            data_dir=initialized_archive,
            timeout=90,  # Need more time for multiple binaries
        )
        assert code == 0
        # Both should be processed
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        installed = c.execute(
            "SELECT name FROM machine_binary WHERE name IN ('python3', 'curl')"
        ).fetchall()
        conn.close()
        assert len(installed) >= 1, "At least one binary should be created"

    def test_puppeteer_binary_sets_skip_download_for_hooks(self, initialized_archive):
        """Puppeteer installs expose skip-download env to Binary hooks."""
        # Install a throwaway user plugin whose hook echoes back the env it saw
        # via a Machine config record, so we can assert on the hook's environment.
        user_plugins_dir = initialized_archive / 'test_plugins'
        plugin_dir = user_plugins_dir / 'inspectnpm'
        plugin_dir.mkdir(parents=True, exist_ok=True)
        hook = plugin_dir / 'on_Binary__10_inspectnpm_install.py'
        hook.write_text(
            """#!/usr/bin/env python3
import argparse
import json
import os
import shutil
import sys
parser = argparse.ArgumentParser()
parser.add_argument('--machine-id', required=True)
parser.add_argument('--binary-id', required=True)
parser.add_argument('--name', required=True)
parser.add_argument('--binproviders', default='*')
args = parser.parse_args()
record = {
    'type': 'Binary',
    'name': args.name,
    'abspath': shutil.which('python3') or sys.executable,
    'version': '1.0.0',
    'sha256': '',
    'binprovider': 'inspectnpm',
    'machine_id': args.machine_id,
    'binary_id': args.binary_id,
}
print(json.dumps(record))
print(json.dumps({
    'type': 'Machine',
    'config': {
        'SEEN_PUPPETEER_SKIP_DOWNLOAD': os.environ.get('PUPPETEER_SKIP_DOWNLOAD', ''),
        'SEEN_PUPPETEER_SKIP_CHROMIUM_DOWNLOAD': os.environ.get('PUPPETEER_SKIP_CHROMIUM_DOWNLOAD', ''),
    },
}))
"""
        )
        binary_record = {
            'type': 'Binary',
            'name': 'puppeteer',
            'binproviders': 'inspectnpm',
        }
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            env={
                'ARCHIVEBOX_USER_PLUGINS_DIR': str(user_plugins_dir),
                'PLUGINS': 'inspectnpm',
            },
            timeout=60,
        )
        assert code == 0, f"Failed to process puppeteer Binary: {stderr}"
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT status, binprovider FROM machine_binary WHERE name='puppeteer'"
        ).fetchone()
        hook_rows = c.execute(
            "SELECT cmd, env FROM machine_process WHERE process_type='hook' ORDER BY created_at DESC"
        ).fetchall()
        conn.close()
        assert result is not None, "Puppeteer binary not found in database"
        status, binprovider = result
        assert status == 'installed', f"Expected puppeteer to install, got: {status}"
        assert binprovider == 'inspectnpm', f"Expected inspectnpm provider, got: {binprovider}"
        # Find the recorded hook process for our plugin and inspect the env it ran with
        hook_env = None
        for cmd_json, env_json in hook_rows:
            cmd = json.loads(cmd_json)
            if any('inspectnpm' in part for part in cmd):
                hook_env = json.loads(env_json)
                break
        assert hook_env is not None, "Inspectnpm hook process not found"
        assert hook_env.get('PUPPETEER_SKIP_DOWNLOAD') == 'true'
        assert hook_env.get('PUPPETEER_SKIP_CHROMIUM_DOWNLOAD') == 'true'
class TestBinaryWorkerEdgeCases:
    """Tests for edge cases and error handling."""

    def test_nonexistent_binary_stays_queued(self, initialized_archive):
        """Binary that doesn't exist stays queued (doesn't fail permanently)."""
        binary_record = {
            'type': 'Binary',
            'name': 'nonexistent-binary-xyz-12345',
            'binproviders': 'env',
        }
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        # Command should still succeed (orchestrator doesn't fail on binary install failures)
        assert code == 0
        # Binary should remain queued (not installed)
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT status FROM machine_binary WHERE name='nonexistent-binary-xyz-12345'"
        ).fetchone()
        conn.close()
        # Row may be absent entirely; only assert status when it exists
        if result:
            status = result[0]
            # Should stay queued since installation failed
            assert status == 'queued', f"Expected queued, got: {status}"

    def test_binary_worker_respects_machine_isolation(self, initialized_archive):
        """BinaryWorker only processes binaries for current machine."""
        # This is implicitly tested by other tests - Binary.objects.filter(machine=current)
        # ensures only current machine's binaries are processed
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0
        # Check that machine_id is set correctly
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT machine_id FROM machine_binary WHERE name='python3'"
        ).fetchone()
        conn.close()
        assert result is not None
        machine_id = result[0]
        assert machine_id, "machine_id should be set on Binary"

View File

@@ -369,9 +369,9 @@ class TestProcessCurrent(TestCase):
self.assertEqual(proc1.id, proc2.id)
def test_process_detect_type_orchestrator(self):
"""_detect_process_type should detect orchestrator."""
with patch('sys.argv', ['archivebox', 'manage', 'orchestrator']):
def test_process_detect_type_runner(self):
"""_detect_process_type should detect the background runner command."""
with patch('sys.argv', ['archivebox', 'run', '--daemon']):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)
@@ -381,11 +381,11 @@ class TestProcessCurrent(TestCase):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.CLI)
def test_process_detect_type_worker(self):
"""_detect_process_type should detect workers."""
with patch('sys.argv', ['python', '-m', 'crawl_worker']):
def test_process_detect_type_binary(self):
"""_detect_process_type should detect non-ArchiveBox subprocesses as binary processes."""
with patch('sys.argv', ['/usr/bin/wget', 'https://example.com']):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.WORKER)
self.assertEqual(result, Process.TypeChoices.BINARY)
class TestProcessHierarchy(TestCase):

View File

@@ -1,484 +0,0 @@
"""
Unit tests for the Orchestrator and Worker classes.
Tests cover:
1. Orchestrator lifecycle (startup, shutdown)
2. Queue polling and worker spawning
3. Idle detection and exit logic
4. Worker registration and management
5. Process model methods (replacing old pid_utils)
"""
import os
import time
from datetime import datetime, timedelta
from unittest.mock import patch
from typing import ClassVar
import pytest
from django.test import TestCase
from django.utils import timezone
from archivebox.workers.orchestrator import Orchestrator
from archivebox.workers.worker import Worker
class FakeWorker(Worker):
    # Minimal Worker stand-in so Orchestrator spawning logic can be unit-tested
    # without launching real worker processes.
    name: ClassVar[str] = 'crawl'
    MAX_CONCURRENT_TASKS: ClassVar[int] = 5
    # Class-level list lets each test dictate what get_running_workers() reports.
    running_workers: ClassVar[list[dict[str, object]]] = []

    @classmethod
    def get_running_workers(cls) -> list[dict[str, object]]:
        return cls.running_workers
class TestOrchestratorUnit(TestCase):
    """Unit tests for Orchestrator class (mocked dependencies)."""

    def test_orchestrator_creation(self):
        """Orchestrator should initialize with correct defaults."""
        orchestrator = Orchestrator(exit_on_idle=True)
        self.assertTrue(orchestrator.exit_on_idle)
        self.assertEqual(orchestrator.idle_count, 0)
        self.assertIsNone(orchestrator.pid_file)

    def test_orchestrator_repr(self):
        """Orchestrator __repr__ should include PID."""
        orchestrator = Orchestrator()
        repr_str = repr(orchestrator)
        self.assertIn('Orchestrator', repr_str)
        self.assertIn(str(os.getpid()), repr_str)

    def test_has_pending_work(self):
        """has_pending_work should check if any queue has items."""
        orchestrator = Orchestrator()
        self.assertFalse(orchestrator.has_pending_work({'crawl': 0, 'snapshot': 0}))
        self.assertTrue(orchestrator.has_pending_work({'crawl': 0, 'snapshot': 5}))
        self.assertTrue(orchestrator.has_pending_work({'crawl': 10, 'snapshot': 0}))

    def test_should_exit_not_exit_on_idle(self):
        """should_exit should return False when exit_on_idle is False."""
        orchestrator = Orchestrator(exit_on_idle=False)
        orchestrator.idle_count = 100  # well past any idle threshold
        self.assertFalse(orchestrator.should_exit({'crawl': 0}))

    def test_should_exit_pending_work(self):
        """should_exit should return False when there's pending work."""
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.idle_count = 100
        self.assertFalse(orchestrator.should_exit({'crawl': 5}))

    @patch.object(Orchestrator, 'has_running_workers')
    def test_should_exit_running_workers(self, mock_has_workers):
        """should_exit should return False when workers are running."""
        mock_has_workers.return_value = True
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.idle_count = 100
        self.assertFalse(orchestrator.should_exit({'crawl': 0}))

    @patch.object(Orchestrator, 'has_running_workers')
    @patch.object(Orchestrator, 'has_future_work')
    def test_should_exit_idle_timeout(self, mock_future, mock_workers):
        """should_exit should return True after idle timeout with no work."""
        mock_workers.return_value = False
        mock_future.return_value = False
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.idle_count = orchestrator.IDLE_TIMEOUT
        self.assertTrue(orchestrator.should_exit({'crawl': 0, 'snapshot': 0}))

    @patch.object(Orchestrator, 'has_running_workers')
    @patch.object(Orchestrator, 'has_future_work')
    def test_should_exit_below_idle_timeout(self, mock_future, mock_workers):
        """should_exit should return False below idle timeout."""
        mock_workers.return_value = False
        mock_future.return_value = False
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.idle_count = orchestrator.IDLE_TIMEOUT - 1  # one tick short
        self.assertFalse(orchestrator.should_exit({'crawl': 0}))

    def test_should_spawn_worker_no_queue(self):
        """should_spawn_worker should return False when queue is empty."""
        orchestrator = Orchestrator()
        FakeWorker.running_workers = []
        self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 0))

    def test_should_spawn_worker_at_limit(self):
        """should_spawn_worker should return False when at per-type limit."""
        orchestrator = Orchestrator()
        running_workers: list[dict[str, object]] = [{'worker_id': worker_id} for worker_id in range(orchestrator.MAX_CRAWL_WORKERS)]
        FakeWorker.running_workers = running_workers
        self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 10))

    @patch.object(Orchestrator, 'get_total_worker_count')
    def test_should_spawn_worker_at_total_limit(self, mock_total):
        """should_spawn_worker should return False when at total limit."""
        orchestrator = Orchestrator()
        mock_total.return_value = 0
        running_workers: list[dict[str, object]] = [{'worker_id': worker_id} for worker_id in range(orchestrator.MAX_CRAWL_WORKERS)]
        FakeWorker.running_workers = running_workers
        self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 10))

    @patch.object(Orchestrator, 'get_total_worker_count')
    def test_should_spawn_worker_success(self, mock_total):
        """should_spawn_worker should return True when conditions are met."""
        orchestrator = Orchestrator()
        mock_total.return_value = 0
        FakeWorker.running_workers = []
        self.assertTrue(orchestrator.should_spawn_worker(FakeWorker, 10))

    @patch.object(Orchestrator, 'get_total_worker_count')
    def test_should_spawn_worker_enough_workers(self, mock_total):
        """should_spawn_worker should return False when enough workers for queue."""
        orchestrator = Orchestrator()
        mock_total.return_value = 2
        FakeWorker.running_workers = [{}]  # 1 worker running
        self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 3))
class TestOrchestratorWithProcess(TestCase):
    """Test Orchestrator using Process model for tracking."""

    def setUp(self):
        """Reset process cache."""
        # Module-level caches would leak Process/Machine rows between tests
        import archivebox.machine.models as models
        models._CURRENT_MACHINE = None
        models._CURRENT_PROCESS = None

    def test_is_running_no_orchestrator(self):
        """is_running should return False when no orchestrator process exists."""
        from archivebox.machine.models import Process
        # Clean up any stale processes first
        Process.cleanup_stale_running()
        # Mark any running orchestrators as exited for clean test state
        Process.objects.filter(
            process_type=Process.TypeChoices.ORCHESTRATOR,
            status=Process.StatusChoices.RUNNING
        ).update(status=Process.StatusChoices.EXITED)
        self.assertFalse(Orchestrator.is_running())

    def test_is_running_with_orchestrator_process(self):
        """is_running should return True when orchestrator Process exists."""
        from archivebox.machine.models import Process, Machine
        import psutil
        machine = Machine.current()
        current_proc = psutil.Process(os.getpid())
        # Create an orchestrator Process record
        proc = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.ORCHESTRATOR,
            status=Process.StatusChoices.RUNNING,
            pid=os.getpid(),  # Use current PID so it appears alive
            started_at=datetime.fromtimestamp(current_proc.create_time(), tz=timezone.get_current_timezone()),
            cmd=current_proc.cmdline(),
        )
        try:
            # Should detect running orchestrator
            self.assertTrue(Orchestrator.is_running())
        finally:
            # Clean up
            proc.status = Process.StatusChoices.EXITED
            proc.save()

    def test_orchestrator_uses_process_for_is_running(self):
        """Orchestrator.is_running should use Process.get_running_count."""
        from archivebox.machine.models import Process
        # Verify is_running uses Process model, not pid files
        with patch.object(Process, 'get_running_count') as mock_count:
            mock_count.return_value = 1
            result = Orchestrator.is_running()
        # Should have called Process.get_running_count with orchestrator type
        mock_count.assert_called()
        self.assertTrue(result)

    def test_orchestrator_scoped_worker_count(self):
        """Orchestrator with crawl_id should count only descendant workers."""
        from archivebox.machine.models import Process, Machine
        machine = Machine.current()
        orchestrator = Orchestrator(exit_on_idle=True, crawl_id='test-crawl')
        orchestrator.db_process = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.ORCHESTRATOR,
            status=Process.StatusChoices.RUNNING,
            pid=12345,
            started_at=timezone.now(),
        )
        # Prevent cleanup from marking fake PIDs as exited
        orchestrator._last_cleanup_time = time.time()
        # Worker parented to this orchestrator — should be counted
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.WORKER,
            worker_type='crawl',
            status=Process.StatusChoices.RUNNING,
            pid=12346,
            parent=orchestrator.db_process,
            started_at=timezone.now(),
        )
        # Unparented worker on the same machine — must NOT be counted
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.WORKER,
            worker_type='crawl',
            status=Process.StatusChoices.RUNNING,
            pid=12347,
            started_at=timezone.now(),
        )
        self.assertEqual(orchestrator.get_total_worker_count(), 1)
class TestProcessBasedWorkerTracking(TestCase):
    """Test Process model methods that replace pid_utils functionality."""

    def setUp(self):
        """Reset caches."""
        import archivebox.machine.models as models
        models._CURRENT_MACHINE = None
        models._CURRENT_PROCESS = None

    def test_process_current_creates_record(self):
        """Process.current() should create a Process record for current PID."""
        from archivebox.machine.models import Process
        proc = Process.current()
        self.assertIsNotNone(proc)
        self.assertEqual(proc.pid, os.getpid())
        self.assertEqual(proc.status, Process.StatusChoices.RUNNING)
        self.assertIsNotNone(proc.machine)
        self.assertIsNotNone(proc.started_at)

    def test_process_current_caches_result(self):
        """Process.current() should return cached Process within interval."""
        from archivebox.machine.models import Process
        proc1 = Process.current()
        proc2 = Process.current()
        self.assertEqual(proc1.id, proc2.id)

    def test_process_get_running_count(self):
        """Process.get_running_count should count running processes by type."""
        from archivebox.machine.models import Process, Machine
        machine = Machine.current()
        # Create some worker processes
        for i in range(3):
            Process.objects.create(
                machine=machine,
                process_type=Process.TypeChoices.WORKER,
                status=Process.StatusChoices.RUNNING,
                pid=99990 + i,  # Fake PIDs
                started_at=timezone.now(),
            )
        count = Process.get_running_count(process_type=Process.TypeChoices.WORKER)
        # >= because other tests / the test runner itself may add workers
        self.assertGreaterEqual(count, 3)

    def test_process_get_next_worker_id(self):
        """Process.get_next_worker_id should return count of running workers."""
        from archivebox.machine.models import Process, Machine
        machine = Machine.current()
        # Create 2 worker processes
        for i in range(2):
            Process.objects.create(
                machine=machine,
                process_type=Process.TypeChoices.WORKER,
                status=Process.StatusChoices.RUNNING,
                pid=99980 + i,
                started_at=timezone.now(),
            )
        next_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
        self.assertGreaterEqual(next_id, 2)

    def test_process_cleanup_stale_running(self):
        """Process.cleanup_stale_running should mark stale processes as exited."""
        from archivebox.machine.models import Process, Machine, PID_REUSE_WINDOW
        machine = Machine.current()
        # Create a stale process (old started_at, fake PID)
        stale_proc = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.WORKER,
            status=Process.StatusChoices.RUNNING,
            pid=999999,  # Fake PID that doesn't exist
            started_at=timezone.now() - PID_REUSE_WINDOW - timedelta(hours=1),
        )
        cleaned = Process.cleanup_stale_running()
        self.assertGreaterEqual(cleaned, 1)
        stale_proc.refresh_from_db()
        self.assertEqual(stale_proc.status, Process.StatusChoices.EXITED)

    def test_process_get_running(self):
        """Process.get_running should return queryset of running processes."""
        from archivebox.machine.models import Process, Machine
        machine = Machine.current()
        # Create a running process
        proc = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
            pid=99970,
            started_at=timezone.now(),
        )
        running = Process.get_running(process_type=Process.TypeChoices.HOOK)
        self.assertIn(proc, running)

    def test_process_type_detection(self):
        """Process._detect_process_type should detect process type from argv."""
        from archivebox.machine.models import Process
        # Test detection logic
        with patch('sys.argv', ['archivebox', 'manage', 'orchestrator']):
            result = Process._detect_process_type()
            self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)
        with patch('sys.argv', ['archivebox', 'add', 'http://example.com']):
            result = Process._detect_process_type()
            self.assertEqual(result, Process.TypeChoices.CLI)
        with patch('sys.argv', ['supervisord', '-c', 'config.ini']):
            result = Process._detect_process_type()
            self.assertEqual(result, Process.TypeChoices.SUPERVISORD)
class TestProcessLifecycle(TestCase):
    """Test Process model lifecycle methods."""

    def setUp(self):
        """Reset caches and create a machine."""
        import archivebox.machine.models as models
        models._CURRENT_MACHINE = None
        models._CURRENT_PROCESS = None
        self.machine = models.Machine.current()

    def test_process_is_running_property(self):
        """Process.is_running should check actual OS process."""
        from archivebox.machine.models import Process
        proc = Process.current()
        # Should be running (current process exists)
        self.assertTrue(proc.is_running)
        # Create a process with fake PID
        fake_proc = Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=999999,
            started_at=timezone.now(),
        )
        # Should not be running (PID doesn't exist)
        self.assertFalse(fake_proc.is_running)

    def test_process_poll(self):
        """Process.poll should check and update exit status."""
        from archivebox.machine.models import Process
        # Create a process with fake PID (already exited)
        proc = Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=999999,
            started_at=timezone.now(),
        )
        exit_code = proc.poll()
        # Should have detected exit and updated status
        self.assertIsNotNone(exit_code)
        proc.refresh_from_db()
        self.assertEqual(proc.status, Process.StatusChoices.EXITED)

    def test_process_terminate_already_dead(self):
        """Process.terminate should handle already-dead processes."""
        from archivebox.machine.models import Process
        # Create a process with fake PID
        proc = Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=999999,
            started_at=timezone.now(),
        )
        result = proc.terminate()
        # Should return False (was already dead)
        self.assertFalse(result)
        proc.refresh_from_db()
        self.assertEqual(proc.status, Process.StatusChoices.EXITED)

    def test_process_tree_traversal(self):
        """Process parent/children relationships should work."""
        from archivebox.machine.models import Process
        # Create parent process
        parent = Process.objects.create(
            machine=self.machine,
            process_type=Process.TypeChoices.CLI,
            status=Process.StatusChoices.RUNNING,
            pid=1,
            started_at=timezone.now(),
        )
        # Create child process
        child = Process.objects.create(
            machine=self.machine,
            parent=parent,
            process_type=Process.TypeChoices.WORKER,
            status=Process.StatusChoices.RUNNING,
            pid=2,
            started_at=timezone.now(),
        )
        # Test relationships
        self.assertEqual(child.parent, parent)
        self.assertIn(child, parent.children.all())
        self.assertEqual(child.root, parent)
        self.assertEqual(child.depth, 1)
        self.assertEqual(parent.depth, 0)
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -1,138 +0,0 @@
import os
import sqlite3
import subprocess
from pathlib import Path
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}
if len(snapshot_id) == 32:
hyphenated = f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}"
candidates.add(hyphenated)
elif len(snapshot_id) == 36 and '-' in snapshot_id:
candidates.add(snapshot_id.replace('-', ''))
for needle in candidates:
for path in data_dir.rglob(needle):
if path.is_dir():
return path
return None
def _find_html_with_text(root: Path, needle: str) -> list[Path]:
hits: list[Path] = []
for path in root.rglob("*.htm*"):
if not path.is_file():
continue
try:
if needle in path.read_text(errors="ignore"):
hits.append(path)
except Exception:
continue
return hits
def test_add_real_world_example_domain(tmp_path):
    """End-to-end smoke test: archive the live https://example.com.

    Runs real `archivebox init` + `archivebox add` subprocesses, then verifies
    the SQLite index rows, the BinaryWorker install bookkeeping, and the
    on-disk extractor outputs (title, HTML, htmltotext).
    NOTE(review): requires network access and the `archivebox` CLI on PATH.
    """
    os.chdir(tmp_path)
    # Use a short /tmp-based scratch dir -- presumably to keep unix socket
    # paths under the OS length limit; TODO confirm.
    tmp_short = Path("/tmp") / f"abx-{tmp_path.name}"
    tmp_short.mkdir(parents=True, exist_ok=True)
    env = os.environ.copy()
    env["TMP_DIR"] = str(tmp_short)
    env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
    # Enable only the extractors this test asserts on; disable slow/heavy ones.
    env["SAVE_TITLE"] = "True"
    env["SAVE_WGET"] = "True"
    env["SAVE_SINGLEFILE"] = "True"
    env["SAVE_READABILITY"] = "False"
    env["SAVE_HTMLTOTEXT"] = "True"
    env["SAVE_HEADERS"] = "True"
    env["SAVE_PDF"] = "False"
    env["SAVE_SCREENSHOT"] = "False"
    env["SAVE_ARCHIVEDOTORG"] = "False"
    env["SAVE_YTDLP"] = "False"
    env["SAVE_GIT"] = "False"
    # Create a fresh ArchiveBox data dir inside tmp_path.
    init = subprocess.run(
        ["archivebox", "init"],
        capture_output=True,
        text=True,
        timeout=120,
        env=env,
    )
    assert init.returncode == 0, f"archivebox init failed: {init.stderr}"
    # Archive the URL with an explicit plugin list (generous timeout: real network).
    result = subprocess.run(
        ["archivebox", "add", "--plugins=title,wget,singlefile,htmltotext,headers", "https://example.com"],
        capture_output=True,
        text=True,
        timeout=900,
        env=env,
    )
    assert result.returncode == 0, (
        "archivebox add failed.\n"
        f"stdout:\n{result.stdout}\n"
        f"stderr:\n{result.stderr}"
    )
    # --- Verify the SQLite index directly ---
    conn = sqlite3.connect(tmp_path / "index.sqlite3")
    c = conn.cursor()
    snapshot_row = c.execute(
        "SELECT id, url, title FROM core_snapshot WHERE url = ?",
        ("https://example.com",),
    ).fetchone()
    assert snapshot_row is not None, "Snapshot for https://example.com not found in DB"
    snapshot_id, snapshot_url, snapshot_title = snapshot_row
    assert snapshot_title and "Example Domain" in snapshot_title, (
        f"Expected title to contain Example Domain, got: {snapshot_title}"
    )
    # No extractor should have recorded a failure for this snapshot.
    failed_results = c.execute(
        "SELECT COUNT(*) FROM core_archiveresult WHERE snapshot_id = ? AND status = 'failed'",
        (snapshot_id,),
    ).fetchone()[0]
    assert failed_results == 0, "Some archive results failed for example.com snapshot"
    # machine_process rows prove binary installs ran through the worker machinery.
    binary_workers = c.execute(
        "SELECT COUNT(*) FROM machine_process WHERE process_type = 'worker' AND worker_type = 'binary'"
    ).fetchone()[0]
    assert binary_workers > 0, "Expected BinaryWorker to run installs via BinaryMachine"
    failed_binary_workers = c.execute(
        "SELECT COUNT(*) FROM machine_process WHERE process_type = 'worker' AND worker_type = 'binary' "
        "AND exit_code IS NOT NULL AND exit_code != 0"
    ).fetchone()[0]
    assert failed_binary_workers == 0, "BinaryWorker reported non-zero exit codes"
    queued_binaries = c.execute(
        "SELECT name FROM machine_binary WHERE status != 'installed'"
    ).fetchall()
    assert not queued_binaries, f"Some binaries did not install: {queued_binaries}"
    conn.close()
    # --- Verify the on-disk outputs ---
    snapshot_dir = _find_snapshot_dir(tmp_path, str(snapshot_id))
    assert snapshot_dir is not None, "Snapshot output directory not found"
    title_path = snapshot_dir / "title" / "title.txt"
    assert title_path.exists(), f"Missing title output: {title_path}"
    assert "Example Domain" in title_path.read_text(errors="ignore")
    # Collect HTML hits from both layouts: <extractor>/ and *_<extractor>/ dirs.
    html_sources = []
    for candidate in ("wget", "singlefile", "dom"):
        for candidate_dir in (snapshot_dir / candidate, *snapshot_dir.glob(f"*_{candidate}")):
            if candidate_dir.exists():
                html_sources.extend(_find_html_with_text(candidate_dir, "Example Domain"))
    assert len(html_sources) >= 2, (
        "Expected HTML outputs from multiple extractors to contain Example Domain "
        f"(found {len(html_sources)})."
    )
    # htmltotext output may likewise live in either directory layout.
    text_hits = 0
    for path in (
        *snapshot_dir.glob("*_htmltotext/htmltotext.txt"),
        snapshot_dir / "htmltotext" / "htmltotext.txt",
    ):
        if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
            text_hits += 1
    assert text_hits >= 1, (
        "Expected htmltotext output to contain Example Domain "
        f"(htmltotext hits={text_hits})."
    )

View File

@@ -1,84 +0,0 @@
from datetime import timedelta
from typing import cast
from unittest.mock import patch
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.test import TestCase
from django.utils import timezone
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.workers.orchestrator import Orchestrator
from archivebox.workers.worker import CrawlWorker
class TestScheduledCrawlMaterialization(TestCase):
    """Verify how the Orchestrator turns due CrawlSchedules into new Crawls."""

    def setUp(self):
        # cast() only narrows the manager type for the type checker.
        user_manager = cast(UserManager, get_user_model().objects)
        self.user = user_manager.create_user(
            username='schedule-user',
            password='password',
        )

    def _create_due_schedule(self) -> CrawlSchedule:
        """Create a daily CrawlSchedule whose template crawl is 2 days old (overdue)."""
        # Template crawl is SEALED with retry_at=None -- presumably so workers
        # never pick it up directly; verify against Crawl queue semantics.
        template = Crawl.objects.create(
            urls='https://example.com/feed.xml',
            max_depth=1,
            tags_str='scheduled',
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
        )
        schedule = CrawlSchedule.objects.create(
            template=template,
            schedule='daily',
            is_enabled=True,
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
        )
        # Backdate via queryset update() to bypass auto-set timestamps,
        # making the daily schedule overdue.
        past = timezone.now() - timedelta(days=2)
        Crawl.objects.filter(pk=template.pk).update(created_at=past, modified_at=past)
        template.refresh_from_db()
        schedule.refresh_from_db()
        return schedule

    def test_global_orchestrator_materializes_due_schedule(self):
        """A long-running (exit_on_idle=False) orchestrator materializes a due schedule."""
        schedule = self._create_due_schedule()
        orchestrator = Orchestrator(exit_on_idle=False)
        orchestrator._materialize_due_schedules()
        scheduled_crawls = Crawl.objects.filter(schedule=schedule).order_by('created_at')
        # Two crawls now reference the schedule: presumably the template plus
        # the newly materialized copy -- confirm against CrawlSchedule linking.
        self.assertEqual(scheduled_crawls.count(), 2)
        queued_crawl = scheduled_crawls.last()
        self.assertIsNotNone(queued_crawl)
        assert queued_crawl is not None  # narrow Optional for the type checker
        # The materialized crawl copies the template's settings and starts QUEUED.
        self.assertEqual(queued_crawl.status, Crawl.StatusChoices.QUEUED)
        self.assertEqual(queued_crawl.urls, 'https://example.com/feed.xml')
        self.assertEqual(queued_crawl.max_depth, 1)
        self.assertEqual(queued_crawl.tags_str, 'scheduled')

    def test_one_shot_orchestrator_does_not_materialize_due_schedule(self):
        """One-shot and crawl-scoped orchestrators must NOT materialize schedules."""
        schedule = self._create_due_schedule()
        Orchestrator(exit_on_idle=True)._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)
        # An orchestrator pinned to a single crawl_id must not materialize either.
        Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template.id))._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)

    @patch.object(CrawlWorker, 'start')
    def test_global_orchestrator_waits_one_tick_before_spawning_materialized_schedule(self, mock_start):
        """Materialized crawls are queued on one tick but not spawned in that same tick."""
        schedule = self._create_due_schedule()
        orchestrator = Orchestrator(exit_on_idle=False)
        # Force claiming to succeed so only the tick ordering is under test.
        with patch.object(orchestrator, '_claim_crawl', return_value=True):
            queue_sizes = orchestrator.check_queues_and_spawn_workers()
        self.assertEqual(queue_sizes['crawl'], 1)
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 2)
        # No CrawlWorker started during the tick that materialized the crawl.
        mock_start.assert_not_called()

View File

@@ -1,76 +0,0 @@
from pathlib import Path
from types import SimpleNamespace
from typing import Any, cast
from unittest.mock import patch
from django.test import SimpleTestCase
from archivebox.workers.worker import SnapshotWorker
class TestSnapshotWorkerRetryForegroundHooks(SimpleTestCase):
    """Unit tests for SnapshotWorker._retry_failed_empty_foreground_hooks."""

    def _make_worker(self):
        """Build a SnapshotWorker with its collaborators stubbed out (no DB, no subprocesses)."""
        stub = SnapshotWorker.__new__(SnapshotWorker)  # bypass __init__ entirely
        stub.pid = 12345
        cast(Any, stub).snapshot = SimpleNamespace(
            status='started',
            refresh_from_db=lambda: None,
        )
        stub._snapshot_exceeded_hard_timeout = lambda: False
        stub._seal_snapshot_due_to_timeout = lambda: None
        stub._run_hook = lambda *args, **kwargs: SimpleNamespace()
        stub._wait_for_hook = lambda process, ar: None
        return stub

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_skips_successful_hook_with_only_inline_output(self, log_mock):
        # A hook that succeeded with only inline output (no files) must not be replayed.
        stub = self._make_worker()
        succeeded_result = SimpleNamespace(
            status='succeeded',
            output_files={},
            output_str='scrolled 600px',
            output_json=None,
            refresh_from_db=lambda: None,
        )
        stub._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__45_infiniscroll.js'), succeeded_result)],
            config={},
        )
        log_mock.assert_not_called()

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_replays_failed_hook_with_no_outputs(self, log_mock):
        # A failed hook with no outputs at all must be replayed exactly once.
        stub = self._make_worker()
        recorded_runs = []
        recorded_waits = []

        def tracking_run_hook(*args, **kwargs):
            recorded_runs.append((args, kwargs))
            return SimpleNamespace()

        def tracking_wait_for_hook(process, ar):
            recorded_waits.append((process, ar))
            # Simulate the replay succeeding and producing a real output file.
            ar.status = 'succeeded'
            ar.output_files = {'singlefile.html': {}}

        failed_result = SimpleNamespace(
            status='failed',
            output_files={},
            output_str='',
            output_json=None,
            refresh_from_db=lambda: None,
        )
        stub._run_hook = tracking_run_hook
        stub._wait_for_hook = tracking_wait_for_hook
        stub._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__50_singlefile.py'), failed_result)],
            config={},
        )
        assert len(recorded_runs) == 1
        assert len(recorded_waits) == 1
        log_mock.assert_called_once()

View File

@@ -1,143 +0,0 @@
import threading
import time
import pytest
from django.db import close_old_connections
from django.utils import timezone
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.machine.models import Binary, Machine
from archivebox.workers.worker import BinaryWorker
def get_fresh_machine() -> Machine:
    """Flush the machine-module caches, then return Machine.current()."""
    import archivebox.machine.models as models_module

    # Drop the cached machine singleton and binaries so each test
    # resolves a brand-new Machine row.
    models_module._CURRENT_MACHINE = None
    models_module._CURRENT_BINARIES.clear()
    return Machine.current()
@pytest.mark.django_db
def test_claim_processing_lock_does_not_steal_future_retry_at():
    """
    retry_at doubles as both schedule and ownership lock: once one process
    claims a due row and pushes retry_at into the future, a second reader
    must not be able to re-claim that future timestamp and repeat the same
    side effects.
    """
    current_machine = get_fresh_machine()
    fresh_binary = Binary.objects.create(
        machine=current_machine,
        name='claim-test',
        binproviders='env',
        status=Binary.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )
    # Two independent in-memory copies of the same row, as two processes would see it.
    first_reader = Binary.objects.get(pk=fresh_binary.pk)
    second_reader = Binary.objects.get(pk=fresh_binary.pk)

    # First claim wins and moves retry_at into the future...
    assert first_reader.claim_processing_lock(lock_seconds=30) is True
    second_reader.refresh_from_db()
    assert second_reader.retry_at > timezone.now()
    # ...so a subsequent claim against the future timestamp must fail.
    assert second_reader.claim_processing_lock(lock_seconds=30) is False
@pytest.mark.django_db
def test_binary_worker_skips_binary_claimed_by_other_owner(monkeypatch):
    """
    BinaryWorker must never run install side effects for a Binary whose
    retry_at lock is already held by another process.
    """
    current_machine = get_fresh_machine()
    claimed_binary = Binary.objects.create(
        machine=current_machine,
        name='claimed-binary',
        binproviders='env',
        status=Binary.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )
    # A separate copy of the row claims the lock first, as another process would.
    lock_holder = Binary.objects.get(pk=claimed_binary.pk)
    assert lock_holder.claim_processing_lock(lock_seconds=30) is True

    # Record every Binary.run() invocation; the worker should make none.
    run_invocations: list[str] = []

    def recording_run(self):
        run_invocations.append(self.name)
        self.status = self.StatusChoices.INSTALLED
        self.abspath = '/tmp/fake-binary'
        self.version = '1.0'
        self.save(update_fields=['status', 'abspath', 'version', 'modified_at'])

    monkeypatch.setattr(Binary, 'run', recording_run)

    BinaryWorker(binary_id=str(claimed_binary.id))._process_single_binary()
    assert run_invocations == []
# transaction=True: commits are real, so the background thread's writes are
# visible to the main thread (no wrapping test transaction).
@pytest.mark.django_db(transaction=True)
def test_crawl_install_declared_binaries_waits_for_existing_owner(monkeypatch):
    """
    Crawl.install_declared_binaries should wait for the current owner of a Binary
    to finish instead of launching a duplicate install against shared provider
    state such as the npm tree.
    """
    machine = get_fresh_machine()
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )
    binary = Binary.objects.create(
        machine=machine,
        name='puppeteer',
        binproviders='npm',
        status=Binary.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )
    # Simulate another process already owning the install lock on the binary.
    owner = Binary.objects.get(pk=binary.pk)
    assert owner.claim_processing_lock(lock_seconds=30) is True
    # Record any direct Binary.run() calls -- there should be none.
    calls: list[str] = []
    def fake_run(self):
        calls.append(self.name)
        self.status = self.StatusChoices.INSTALLED
        self.abspath = '/tmp/should-not-run'
        self.version = '1.0'
        self.save(update_fields=['status', 'abspath', 'version', 'modified_at'])
    monkeypatch.setattr(Binary, 'run', fake_run)
    def finish_existing_install():
        # Background thread: after a short delay, finish the install the way
        # the lock owner would (INSTALLED status, lock released via retry_at=None).
        close_old_connections()  # each thread needs its own DB connection
        try:
            time.sleep(0.3)
            Binary.objects.filter(pk=binary.pk).update(
                status=Binary.StatusChoices.INSTALLED,
                retry_at=None,
                abspath='/tmp/finished-by-owner',
                version='1.0',
                modified_at=timezone.now(),
            )
        finally:
            close_old_connections()
    thread = threading.Thread(target=finish_existing_install, daemon=True)
    thread.start()
    # Should block until the owner finishes instead of installing itself.
    crawl.install_declared_binaries({'puppeteer'}, machine=machine)
    thread.join(timeout=5)
    binary.refresh_from_db()
    # The final state must be exactly what the owner thread wrote...
    assert binary.status == Binary.StatusChoices.INSTALLED
    assert binary.abspath == '/tmp/finished-by-owner'
    # ...and no duplicate install was launched by install_declared_binaries.
    assert calls == []

File diff suppressed because it is too large Load Diff