codecov, migrations, orchestrator fixes

This commit is contained in:
Nick Sweeting
2026-01-01 16:57:04 -08:00
parent 60422adc87
commit 9008cefca2
21 changed files with 153 additions and 57 deletions

View File

@@ -1515,6 +1515,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
parent_snapshot = overrides.get('snapshot') # Parent snapshot
created_by_id = overrides.get('created_by_id') or (parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk())
# DEBUG: Check if crawl_id in record matches overrides crawl
import sys
record_crawl_id = record.get('crawl_id')
if record_crawl_id and crawl and str(crawl.id) != str(record_crawl_id):
print(f"[yellow]⚠️ Snapshot.from_json crawl mismatch: record has crawl_id={record_crawl_id}, overrides has crawl={crawl.id}[/yellow]", file=sys.stderr)
# If no crawl provided, inherit from parent or auto-create one
if not crawl:
if parent_snapshot:
@@ -1536,6 +1542,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
label=f'auto-created for {url[:50]}',
created_by_id=created_by_id,
)
print(f"[red]⚠️ Snapshot.from_json auto-created new crawl {crawl.id} for url={url}[/red]", file=sys.stderr)
# Parse tags
tags_str = record.get('tags', '')
@@ -1546,8 +1553,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if tag.strip()
))
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
# Check for existing snapshot with same URL in same crawl
# (URLs can exist in multiple crawls, but should be unique within a crawl)
snapshot = Snapshot.objects.filter(url=url, crawl=crawl).order_by('-created_at').first()
title = record.get('title')
timestamp = record.get('timestamp')
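
The dedup change above boils down to scoping the existing-snapshot lookup to the current crawl. Restated side by side for clarity (same `url` and `crawl` variables as in the diff; nothing new is introduced):

```python
# Before: newest Snapshot with this URL anywhere, regardless of crawl
snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()

# After: newest Snapshot with this URL within the current crawl only,
# so the same URL can legitimately exist once per crawl
snapshot = Snapshot.objects.filter(url=url, crawl=crawl).order_by('-created_at').first()
```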

View File

@@ -892,15 +892,34 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
"""
plugin_upper = plugin_name.upper()
# 1. Enabled: PLUGINNAME_ENABLED (default True)
# 1. Enabled: Check PLUGINS whitelist first, then PLUGINNAME_ENABLED (default True)
# Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
if enabled is None:
enabled = True
elif isinstance(enabled, str):
# Handle string values from config file ("true"/"false")
enabled = enabled.lower() not in ('false', '0', 'no', '')
# Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon)
plugins_whitelist = config.get('PLUGINS', '')
if plugins_whitelist:
# PLUGINS whitelist is specified - only enable plugins in the list
plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()]
if plugin_name.lower() not in plugin_names:
# Plugin not in whitelist - explicitly disabled
enabled = False
else:
# Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
if enabled is None:
enabled = True # Default to enabled if in whitelist
elif isinstance(enabled, str):
enabled = enabled.lower() not in ('false', '0', 'no', '')
else:
# No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
if enabled is None:
enabled = True
elif isinstance(enabled, str):
# Handle string values from config file ("true"/"false")
enabled = enabled.lower() not in ('false', '0', 'no', '')
# 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
timeout_key = f'{plugin_upper}_TIMEOUT'
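
The two branches above repeat the `PLUGINNAME_ENABLED` parsing verbatim; the intended precedence (PLUGINS whitelist first, then the per-plugin flag, default enabled) can be summarized with an equivalent sketch. The helper name is illustrative and not part of the codebase:

```python
def plugin_is_enabled(plugin_name: str, config: dict) -> bool:
    """Equivalent of the branching above: PLUGINS whitelist wins, then PLUGINNAME_ENABLED."""
    whitelist = [p.strip().lower() for p in config.get('PLUGINS', '').split(',') if p.strip()]
    if whitelist and plugin_name.lower() not in whitelist:
        return False                          # whitelist given and plugin not listed
    enabled = config.get(f'{plugin_name.upper()}_ENABLED')
    if enabled is None:
        return True                           # default: enabled
    if isinstance(enabled, str):
        return enabled.lower() not in ('false', '0', 'no', '')  # "true"/"false" from config files
    return bool(enabled)
```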

View File

@@ -80,7 +80,8 @@ class TestAccessibilityWithChrome(TestCase):
# Run accessibility hook with the active Chrome session
result = subprocess.run(
['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,
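
`get_test_env()` itself is not shown in any of these hunks; the tests presumably import it from a shared helper. A minimal sketch of what such a helper typically looks like, purely as an assumption about its shape (the override below is hypothetical, not from the diff):

```python
import os

def get_test_env() -> dict[str, str]:
    """Assumed shape of the helper: inherit the parent environment so `node`
    and Chrome resolve on PATH, plus any test-specific overrides."""
    env = os.environ.copy()
    env.setdefault('NODE_ENV', 'test')  # hypothetical override, for illustration only
    return env
```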

View File

@@ -208,7 +208,8 @@ def test_chrome_launch_and_tab_creation():
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-123', '--crawl-id=test-crawl-123'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,
@@ -268,7 +269,8 @@ def test_chrome_navigation():
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123', '--crawl-id=test-crawl-nav'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,
@@ -279,7 +281,8 @@ def test_chrome_navigation():
# Navigate to URL
result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=120,
@@ -414,7 +417,8 @@ def test_multiple_snapshots_share_chrome():
# Create tab for this snapshot
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), f'--url=https://example.com/{snap_num}', f'--snapshot-id=snap-{snap_num}', '--crawl-id=test-multi-crawl'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,

View File

@@ -80,7 +80,8 @@ class TestConsolelogWithChrome(TestCase):
# Run consolelog hook with the active Chrome session
result = subprocess.run(
['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=120, # Longer timeout as it waits for navigation

View File

@@ -68,7 +68,8 @@ def test_extracts_dom_from_example_com():
capture_output=True,
text=True,
timeout=120
)
timeout=120,
env=get_test_env())
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -152,7 +153,8 @@ def test_staticfile_present_skips():
capture_output=True,
text=True,
timeout=30
)
timeout=30,
env=get_test_env())
assert result.returncode == 0, "Should exit 0 when permanently skipping"

View File

@@ -50,7 +50,8 @@ def test_node_is_available():
capture_output=True,
text=True,
timeout=10
)
timeout=10,
env=get_test_env())
assert result.returncode == 0, f"node not executable: {result.stderr}"
assert result.stdout.startswith('v'), f"Unexpected node version format: {result.stdout}"
@@ -72,7 +73,8 @@ def test_extracts_headers_from_example_com():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -133,7 +135,8 @@ def test_headers_output_structure():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -192,7 +195,8 @@ def test_falls_back_to_http_when_chrome_unavailable():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -309,7 +313,8 @@ def test_handles_https_urls():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
if result.returncode == 0:
output_headers_file = tmpdir / 'headers.json'
@@ -334,7 +339,8 @@ def test_handles_404_gracefully():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
# May succeed or fail depending on server behavior
# If it succeeds, verify 404 status is captured

View File

@@ -123,7 +123,8 @@ def test_scrolls_page_and_outputs_stats():
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
cwd=str(infiniscroll_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,
@@ -187,7 +188,8 @@ def test_config_scroll_limit_honored():
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
cwd=str(infiniscroll_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,
@@ -246,7 +248,8 @@ def test_config_timeout_honored():
start_time = time.time()
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
cwd=str(infiniscroll_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=30,

View File

@@ -154,7 +154,8 @@ def test_extension_loads_in_chromium():
# Step 1: Install the extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
env=get_test_env(),
capture_output=True,
text=True,
env=env,
@@ -291,7 +292,8 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir),
env=get_test_env(),
capture_output=True,
text=True,
env=env,
@@ -443,7 +445,8 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
cwd=str(script_dir),
env=get_test_env(),
capture_output=True,
text=True,
env=env,
@@ -557,7 +560,8 @@ def test_hides_cookie_consent_on_filmin():
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
env=get_test_env(),
capture_output=True,
text=True,
env=env_with_ext,

View File

@@ -80,7 +80,8 @@ class TestParseDomOutlinksWithChrome(TestCase):
# Run outlinks hook with the active Chrome session
result = subprocess.run(
['node', str(OUTLINKS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,

View File

@@ -69,7 +69,8 @@ def test_extracts_pdf_from_example_com():
capture_output=True,
text=True,
timeout=120
)
timeout=120,
env=get_test_env())
# Parse clean JSONL output (hook might fail due to network issues)
result_json = None

View File

@@ -81,7 +81,8 @@ class TestRedirectsWithChrome(TestCase):
# Run redirects hook with the active Chrome session
result = subprocess.run(
['node', str(REDIRECTS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,

View File

@@ -80,7 +80,8 @@ class TestResponsesWithChrome(TestCase):
# Run responses hook with the active Chrome session
result = subprocess.run(
['node', str(RESPONSES_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=120, # Longer timeout as it waits for navigation

View File

@@ -65,7 +65,8 @@ def test_extracts_screenshot_from_example_com():
cwd=tmpdir,
capture_output=True,
text=True,
timeout=120
timeout=120,
env=get_test_env()
)
assert result.returncode == 0, f"Extraction failed: {result.stderr}"

View File

@@ -80,7 +80,8 @@ class TestSEOWithChrome(TestCase):
# Run SEO hook with the active Chrome session
result = subprocess.run(
['node', str(SEO_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,

View File

@@ -80,7 +80,8 @@ class TestSSLWithChrome(TestCase):
# Run SSL hook with the active Chrome session
result = subprocess.run(
['node', str(SSL_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,

View File

@@ -80,7 +80,8 @@ class TestStaticfileWithChrome(TestCase):
# Run staticfile hook with the active Chrome session
result = subprocess.run(
['node', str(STATICFILE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=120, # Longer timeout as it waits for navigation

View File

@@ -53,7 +53,8 @@ def test_extracts_title_from_example_com():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -105,7 +106,8 @@ def test_falls_back_to_http_when_chrome_unavailable():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -219,7 +221,8 @@ def test_handles_https_urls():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
if result.returncode == 0:
# Hook writes to current directory
@@ -249,7 +252,8 @@ def test_handles_404_gracefully():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
# May succeed or fail depending on server behavior
# example.com returns "Example Domain" even for 404s
@@ -272,7 +276,8 @@ def test_handles_redirects():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
# Should succeed and follow redirect
if result.returncode == 0:

View File

@@ -283,7 +283,8 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
cwd=str(script_dir),
env=get_test_env(),
capture_output=True,
text=True,
env=env,
@@ -482,7 +483,8 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir),
env=get_test_env(),
capture_output=True,
text=True,
env=env,

View File

@@ -30,6 +30,7 @@ __package__ = 'archivebox.workers'
import os
import time
from typing import Type
from datetime import timedelta
from multiprocessing import Process as MPProcess
from django.utils import timezone
@@ -67,12 +68,19 @@ class Orchestrator:
MAX_WORKERS_PER_TYPE: int = 8 # Max workers per model type
MAX_TOTAL_WORKERS: int = 24 # Max workers across all types
def __init__(self, exit_on_idle: bool = True):
def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None):
self.exit_on_idle = exit_on_idle
self.crawl_id = crawl_id # If set, only process work for this crawl
self.pid: int = os.getpid()
self.pid_file = None
self.idle_count: int = 0
self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running()
# CRITICAL: In foreground mode (exit_on_idle=True), use ONLY 1 worker
# to keep execution strictly sequential and deterministic
if self.exit_on_idle:
self.MAX_WORKERS_PER_TYPE = 1
self.MAX_TOTAL_WORKERS = 1
def __repr__(self) -> str:
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
@@ -315,15 +323,12 @@ class Orchestrator:
# Enable progress bars only in TTY + foreground mode
show_progress = IS_TTY and self.exit_on_idle
# Debug
print(f"[yellow]DEBUG: IS_TTY={IS_TTY}, exit_on_idle={self.exit_on_idle}, show_progress={show_progress}[/yellow]")
self.on_startup()
task_ids = {}
if not show_progress:
# No progress bars - just run normally
self._run_orchestrator_loop(None, task_ids, None, None)
self._run_orchestrator_loop(None, task_ids)
else:
# Redirect worker subprocess output to /dev/null
devnull_fd = os.open(os.devnull, os.O_WRONLY)
@@ -356,7 +361,7 @@ class Orchestrator:
TaskProgressColumn(),
console=orchestrator_console,
) as progress:
self._run_orchestrator_loop(progress, task_ids, None, None)
self._run_orchestrator_loop(progress, task_ids)
# Restore original console
logging_module.CONSOLE = original_console
@@ -374,7 +379,7 @@ class Orchestrator:
pass
# stdout_for_console is closed by orchestrator_console
def _run_orchestrator_loop(self, progress, task_ids, read_fd, console):
def _run_orchestrator_loop(self, progress, task_ids):
"""Run the main orchestrator loop with optional progress display."""
try:
while True:
@@ -385,12 +390,28 @@ class Orchestrator:
if progress:
from archivebox.core.models import Snapshot
# Get all started snapshots
active_snapshots = list(Snapshot.objects.filter(status='started'))
# Get all started snapshots (optionally filtered by crawl_id)
snapshot_filter = {'status': 'started'}
if self.crawl_id:
snapshot_filter['crawl_id'] = self.crawl_id
else:
# Only if processing all crawls, filter by recent modified_at to avoid stale snapshots
recent_cutoff = timezone.now() - timedelta(minutes=5)
snapshot_filter['modified_at__gte'] = recent_cutoff
active_snapshots = list(Snapshot.objects.filter(**snapshot_filter))
# Track which snapshots are still active
active_ids = set()
# Debug: check for duplicates
snapshot_urls = [s.url for s in active_snapshots]
if len(active_snapshots) != len(set(snapshot_urls)):
# We have duplicate URLs - let's deduplicate by showing snapshot ID
show_id = True
else:
show_id = False
for snapshot in active_snapshots:
active_ids.add(snapshot.id)
@@ -421,7 +442,11 @@ class Orchestrator:
# Build description with URL + current plugin
url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url
description = f"{url}{current_plugin}"
if show_id:
# Show snapshot ID if there are duplicate URLs
description = f"[{str(snapshot.id)[:8]}] {url}{current_plugin}"
else:
description = f"{url}{current_plugin}"
# Create or update task
if snapshot.id not in task_ids:
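
For reference, the progress-bar queryset built above resolves as follows (this just restates the diff's logic as a standalone helper; the function name is illustrative, not project code):

```python
from datetime import timedelta
from django.utils import timezone

def active_snapshot_filter(crawl_id: str | None) -> dict:
    """Mirror of the filter built in _run_orchestrator_loop above."""
    snapshot_filter = {'status': 'started'}
    if crawl_id:
        # Scoped orchestrator: only show snapshots belonging to this crawl
        snapshot_filter['crawl_id'] = crawl_id
    else:
        # Global orchestrator: hide snapshots not touched in the last 5 minutes
        snapshot_filter['modified_at__gte'] = timezone.now() - timedelta(minutes=5)
    return snapshot_filter

# active_snapshots = list(Snapshot.objects.filter(**active_snapshot_filter(self.crawl_id)))
```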

View File

@@ -63,9 +63,10 @@ class Worker:
POLL_INTERVAL: ClassVar[float] = 0.2 # How often to check for new work (seconds)
IDLE_TIMEOUT: ClassVar[int] = 50 # Exit after N idle iterations (10 sec at 0.2 poll interval)
def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any):
def __init__(self, worker_id: int = 0, daemon: bool = False, crawl_id: str | None = None, **kwargs: Any):
self.worker_id = worker_id
self.daemon = daemon
self.crawl_id = crawl_id # If set, only process work for this crawl
self.pid: int = os.getpid()
self.pid_file: Path | None = None
self.idle_count: int = 0
@@ -346,6 +347,13 @@ class CrawlWorker(Worker):
from archivebox.crawls.models import Crawl
return Crawl
def get_queue(self) -> QuerySet:
"""Get queue of Crawls ready for processing, optionally filtered by crawl_id."""
qs = super().get_queue()
if self.crawl_id:
qs = qs.filter(id=self.crawl_id)
return qs
class SnapshotWorker(Worker):
"""Worker for processing Snapshot objects."""