diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 928aa990..403c441e 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -1515,6 +1515,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
         parent_snapshot = overrides.get('snapshot')  # Parent snapshot
         created_by_id = overrides.get('created_by_id') or (parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk())
 
+        # DEBUG: Check if crawl_id in record matches overrides crawl
+        import sys
+        record_crawl_id = record.get('crawl_id')
+        if record_crawl_id and crawl and str(crawl.id) != str(record_crawl_id):
+            print(f"[yellow]⚠️ Snapshot.from_json crawl mismatch: record has crawl_id={record_crawl_id}, overrides has crawl={crawl.id}[/yellow]", file=sys.stderr)
+
         # If no crawl provided, inherit from parent or auto-create one
         if not crawl:
             if parent_snapshot:
@@ -1536,6 +1542,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                     label=f'auto-created for {url[:50]}',
                     created_by_id=created_by_id,
                 )
+                print(f"[red]⚠️ Snapshot.from_json auto-created new crawl {crawl.id} for url={url}[/red]", file=sys.stderr)
 
         # Parse tags
         tags_str = record.get('tags', '')
@@ -1546,8 +1553,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
             if tag.strip()
         ))
 
-        # Get most recent snapshot with this URL (URLs can exist in multiple crawls)
-        snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
+        # Check for existing snapshot with same URL in same crawl
+        # (URLs can exist in multiple crawls, but should be unique within a crawl)
+        snapshot = Snapshot.objects.filter(url=url, crawl=crawl).order_by('-created_at').first()
 
         title = record.get('title')
         timestamp = record.get('timestamp')
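Review note: a minimal sketch (not part of the patch) of the lookup change above, assuming two crawls that both contain the same URL:

    # Old behavior: the newest Snapshot for a URL matched regardless of crawl, so
    # importing a record for crawl_b could silently reuse crawl_a's snapshot:
    snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()

    # New behavior: the lookup is scoped to the crawl being imported, so a URL is
    # deduplicated only within its own crawl:
    snapshot = Snapshot.objects.filter(url=url, crawl=crawl).order_by('-created_at').first()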
diff --git a/archivebox/hooks.py b/archivebox/hooks.py
index f955974b..116671ac 100644
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -892,15 +892,34 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
     """
     plugin_upper = plugin_name.upper()
 
-    # 1. Enabled: PLUGINNAME_ENABLED (default True)
+    # 1. Enabled: Check PLUGINS whitelist first, then PLUGINNAME_ENABLED (default True)
     # Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
-    enabled_key = f'{plugin_upper}_ENABLED'
-    enabled = config.get(enabled_key)
-    if enabled is None:
-        enabled = True
-    elif isinstance(enabled, str):
-        # Handle string values from config file ("true"/"false")
-        enabled = enabled.lower() not in ('false', '0', 'no', '')
+
+    # Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon)
+    plugins_whitelist = config.get('PLUGINS', '')
+    if plugins_whitelist:
+        # PLUGINS whitelist is specified - only enable plugins in the list
+        plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()]
+        if plugin_name.lower() not in plugin_names:
+            # Plugin not in whitelist - explicitly disabled
+            enabled = False
+        else:
+            # Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED
+            enabled_key = f'{plugin_upper}_ENABLED'
+            enabled = config.get(enabled_key)
+            if enabled is None:
+                enabled = True  # Default to enabled if in whitelist
+            elif isinstance(enabled, str):
+                enabled = enabled.lower() not in ('false', '0', 'no', '')
+    else:
+        # No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
+        enabled_key = f'{plugin_upper}_ENABLED'
+        enabled = config.get(enabled_key)
+        if enabled is None:
+            enabled = True
+        elif isinstance(enabled, str):
+            # Handle string values from config file ("true"/"false")
+            enabled = enabled.lower() not in ('false', '0', 'no', '')
 
     # 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
     timeout_key = f'{plugin_upper}_TIMEOUT'
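Review note: illustrative expectations for the whitelist logic above (key names are taken from this hunk; treating config as a plain dict is an assumption):

    config = {'PLUGINS': 'wget,favicon', 'WGET_ENABLED': 'false'}
    # 'wget':    whitelisted but explicitly disabled via WGET_ENABLED -> enabled is False
    # 'favicon': whitelisted, no *_ENABLED override                   -> enabled is True
    # 'title':   not in the whitelist                                 -> enabled is False
    # With no PLUGINS key at all, every plugin falls back to PLUGINNAME_ENABLED (default True).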
diff --git a/archivebox/plugins/accessibility/tests/test_accessibility.py b/archivebox/plugins/accessibility/tests/test_accessibility.py
index 0c85b145..4fc8a1fe 100644
--- a/archivebox/plugins/accessibility/tests/test_accessibility.py
+++ b/archivebox/plugins/accessibility/tests/test_accessibility.py
@@ -80,7 +80,8 @@ class TestAccessibilityWithChrome(TestCase):
         # Run accessibility hook with the active Chrome session
         result = subprocess.run(
             ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
             cwd=str(snapshot_chrome_dir),
+            env=get_test_env(),
             capture_output=True,
             text=True,
             timeout=60,
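Review note: this and the remaining test-suite hunks all make the same one-line change: pass the baseline test environment to subprocess.run() via its env keyword (this assumes each suite already imports a get_test_env() helper). The resulting call shape:

    result = subprocess.run(
        ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
        cwd=str(snapshot_chrome_dir),
        env=get_test_env(),   # env is an argument to subprocess.run(), not to str()
        capture_output=True,
        text=True,
        timeout=60,
    )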
diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py
index d455ba41..6c801a5e 100644
--- a/archivebox/plugins/chrome/tests/test_chrome.py
+++ b/archivebox/plugins/chrome/tests/test_chrome.py
@@ -208,7 +208,8 @@ def test_chrome_launch_and_tab_creation():
     env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
     result = subprocess.run(
         ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-123', '--crawl-id=test-crawl-123'],
         cwd=str(snapshot_chrome_dir),
+        env={**get_test_env(), **env},
         capture_output=True,
         text=True,
         timeout=60,
@@ -268,7 +269,8 @@ def test_chrome_navigation():
 
     result = subprocess.run(
         ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123', '--crawl-id=test-crawl-nav'],
         cwd=str(snapshot_chrome_dir),
+        env=get_test_env(),
         capture_output=True,
         text=True,
         timeout=60,
@@ -279,7 +281,8 @@ def test_chrome_navigation():
     # Navigate to URL
     result = subprocess.run(
         ['node', str(CHROME_NAVIGATE_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123'],
         cwd=str(snapshot_chrome_dir),
+        env=get_test_env(),
         capture_output=True,
         text=True,
         timeout=120,
@@ -414,7 +417,8 @@ def test_multiple_snapshots_share_chrome():
         # Create tab for this snapshot
         result = subprocess.run(
             ['node', str(CHROME_TAB_HOOK), f'--url=https://example.com/{snap_num}', f'--snapshot-id=snap-{snap_num}', '--crawl-id=test-multi-crawl'],
             cwd=str(snapshot_chrome_dir),
+            env=get_test_env(),
             capture_output=True,
             text=True,
             timeout=60,
diff --git a/archivebox/plugins/consolelog/tests/test_consolelog.py b/archivebox/plugins/consolelog/tests/test_consolelog.py
index 741776f0..ca75f130 100644
--- a/archivebox/plugins/consolelog/tests/test_consolelog.py
+++ b/archivebox/plugins/consolelog/tests/test_consolelog.py
@@ -80,7 +80,8 @@ class TestConsolelogWithChrome(TestCase):
         # Run consolelog hook with the active Chrome session
         result = subprocess.run(
             ['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
             cwd=str(snapshot_chrome_dir),
+            env=get_test_env(),
             capture_output=True,
             text=True,
             timeout=120,  # Longer timeout as it waits for navigation
diff --git a/archivebox/plugins/dom/tests/test_dom.py b/archivebox/plugins/dom/tests/test_dom.py
index 7fe69d64..fea41b8d 100644
--- a/archivebox/plugins/dom/tests/test_dom.py
+++ b/archivebox/plugins/dom/tests/test_dom.py
@@ -68,7 +68,8 @@ def test_extracts_dom_from_example_com():
         capture_output=True,
         text=True,
-        timeout=120
-    )
+        timeout=120,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
 
@@ -152,7 +153,8 @@ def test_staticfile_present_skips():
         capture_output=True,
         text=True,
-        timeout=30
-    )
+        timeout=30,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, "Should exit 0 when permanently skipping"
diff --git a/archivebox/plugins/headers/tests/test_headers.py b/archivebox/plugins/headers/tests/test_headers.py
index e9fd1298..0930737c 100644
--- a/archivebox/plugins/headers/tests/test_headers.py
+++ b/archivebox/plugins/headers/tests/test_headers.py
@@ -50,7 +50,8 @@ def test_node_is_available():
         capture_output=True,
         text=True,
-        timeout=10
-    )
+        timeout=10,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, f"node not executable: {result.stderr}"
     assert result.stdout.startswith('v'), f"Unexpected node version format: {result.stdout}"
@@ -72,7 +73,8 @@ def test_extracts_headers_from_example_com():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
 
@@ -133,7 +135,8 @@ def test_headers_output_structure():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
 
@@ -192,7 +195,8 @@ def test_falls_back_to_http_when_chrome_unavailable():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
 
@@ -309,7 +313,8 @@ def test_handles_https_urls():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     if result.returncode == 0:
         output_headers_file = tmpdir / 'headers.json'
@@ -334,7 +339,8 @@ def test_handles_404_gracefully():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     # May succeed or fail depending on server behavior
     # If it succeeds, verify 404 status is captured
diff --git a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
index eee44ce4..16a7631d 100644
--- a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
+++ b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py
@@ -123,7 +123,8 @@ def test_scrolls_page_and_outputs_stats():
 
     result = subprocess.run(
         ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
         cwd=str(infiniscroll_dir),
+        env=get_test_env(),
         capture_output=True,
         text=True,
         timeout=60,
@@ -187,7 +188,8 @@ def test_config_scroll_limit_honored():
 
     result = subprocess.run(
         ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
         cwd=str(infiniscroll_dir),
+        env=get_test_env(),
         capture_output=True,
         text=True,
         timeout=60,
@@ -246,7 +248,8 @@ def test_config_timeout_honored():
     start_time = time.time()
     result = subprocess.run(
         ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
         cwd=str(infiniscroll_dir),
+        env=get_test_env(),
         capture_output=True,
         text=True,
         timeout=30,
diff --git a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py
index 13a62e58..a9525c89 100644
--- a/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py
+++ b/archivebox/plugins/istilldontcareaboutcookies/tests/test_istilldontcareaboutcookies.py
@@ -154,7 +154,7 @@ def test_extension_loads_in_chromium():
     # Step 1: Install the extension
     result = subprocess.run(
         ['node', str(INSTALL_SCRIPT)],
         cwd=str(tmpdir),
         capture_output=True,
         text=True,
-        env=env,
+        env={**get_test_env(), **env},  # merge baseline test env; test-specific keys win
@@ -291,7 +291,7 @@ const puppeteer = require('puppeteer-core');
 
     result = subprocess.run(
         ['node', str(script_path)],
         cwd=str(tmpdir),
         capture_output=True,
         text=True,
-        env=env,
+        env={**get_test_env(), **env},
@@ -443,7 +443,7 @@ const puppeteer = require('puppeteer-core');
 
     result = subprocess.run(
         ['node', str(script_path)],
         cwd=str(script_dir),
         capture_output=True,
         text=True,
-        env=env,
+        env={**get_test_env(), **env},
@@ -557,7 +557,7 @@ def test_hides_cookie_consent_on_filmin():
 
     result = subprocess.run(
         ['node', str(INSTALL_SCRIPT)],
         cwd=str(tmpdir),
         capture_output=True,
         text=True,
-        env=env_with_ext,
+        env={**get_test_env(), **env_with_ext},
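Review note: subprocess.run() cannot take the env keyword twice, so in suites that already build a test-specific env dict (the first chrome hunk above, which sets CRAWL_OUTPUT_DIR, plus the istilldontcareaboutcookies hunks above and the ublock hunks further down) the baseline is merged into the existing dict instead of added as a second keyword. This assumes get_test_env() returns a plain mapping and that test-specific keys should win on conflict:

    env = {**get_test_env(), **env}  # later (test-specific) keys override the baseline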
diff --git a/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py b/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py
index 7f519ea2..d87df28d 100644
--- a/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py
+++ b/archivebox/plugins/parse_dom_outlinks/tests/test_parse_dom_outlinks.py
@@ -80,7 +80,8 @@ class TestParseDomOutlinksWithChrome(TestCase):
         # Run outlinks hook with the active Chrome session
         result = subprocess.run(
             ['node', str(OUTLINKS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
             cwd=str(snapshot_chrome_dir),
+            env=get_test_env(),
             capture_output=True,
             text=True,
             timeout=60,
diff --git a/archivebox/plugins/pdf/tests/test_pdf.py b/archivebox/plugins/pdf/tests/test_pdf.py
index c160cfdc..8751faef 100644
--- a/archivebox/plugins/pdf/tests/test_pdf.py
+++ b/archivebox/plugins/pdf/tests/test_pdf.py
@@ -69,7 +69,8 @@ def test_extracts_pdf_from_example_com():
         capture_output=True,
         text=True,
-        timeout=120
-    )
+        timeout=120,
+        env=get_test_env(),
+    )
 
     # Parse clean JSONL output (hook might fail due to network issues)
     result_json = None
diff --git a/archivebox/plugins/redirects/tests/test_redirects.py b/archivebox/plugins/redirects/tests/test_redirects.py
index 06d95246..934b14c7 100644
--- a/archivebox/plugins/redirects/tests/test_redirects.py
+++ b/archivebox/plugins/redirects/tests/test_redirects.py
@@ -81,7 +81,8 @@ class TestRedirectsWithChrome(TestCase):
         # Run redirects hook with the active Chrome session
         result = subprocess.run(
             ['node', str(REDIRECTS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
             cwd=str(snapshot_chrome_dir),
+            env=get_test_env(),
             capture_output=True,
             text=True,
             timeout=60,
diff --git a/archivebox/plugins/responses/tests/test_responses.py b/archivebox/plugins/responses/tests/test_responses.py
index 129d92a3..ea710c83 100644
--- a/archivebox/plugins/responses/tests/test_responses.py
+++ b/archivebox/plugins/responses/tests/test_responses.py
@@ -80,7 +80,8 @@ class TestResponsesWithChrome(TestCase):
         # Run responses hook with the active Chrome session
         result = subprocess.run(
             ['node', str(RESPONSES_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
             cwd=str(snapshot_chrome_dir),
+            env=get_test_env(),
             capture_output=True,
             text=True,
             timeout=120,  # Longer timeout as it waits for navigation
diff --git a/archivebox/plugins/screenshot/tests/test_screenshot.py b/archivebox/plugins/screenshot/tests/test_screenshot.py
index 24d4960d..1aed35e6 100644
--- a/archivebox/plugins/screenshot/tests/test_screenshot.py
+++ b/archivebox/plugins/screenshot/tests/test_screenshot.py
@@ -65,7 +65,8 @@ def test_extracts_screenshot_from_example_com():
         cwd=tmpdir,
         capture_output=True,
         text=True,
-        timeout=120
+        timeout=120,
+        env=get_test_env()
     )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
diff --git a/archivebox/plugins/seo/tests/test_seo.py b/archivebox/plugins/seo/tests/test_seo.py
index 2b01a356..23beaa76 100644
--- a/archivebox/plugins/seo/tests/test_seo.py
+++ b/archivebox/plugins/seo/tests/test_seo.py
@@ -80,7 +80,8 @@ class TestSEOWithChrome(TestCase):
         # Run SEO hook with the active Chrome session
         result = subprocess.run(
             ['node', str(SEO_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
             cwd=str(snapshot_chrome_dir),
+            env=get_test_env(),
             capture_output=True,
             text=True,
             timeout=60,
diff --git a/archivebox/plugins/ssl/tests/test_ssl.py b/archivebox/plugins/ssl/tests/test_ssl.py
index cf131d9b..48ec0a6c 100644
--- a/archivebox/plugins/ssl/tests/test_ssl.py
+++ b/archivebox/plugins/ssl/tests/test_ssl.py
@@ -80,7 +80,8 @@ class TestSSLWithChrome(TestCase):
         # Run SSL hook with the active Chrome session
         result = subprocess.run(
             ['node', str(SSL_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
             cwd=str(snapshot_chrome_dir),
+            env=get_test_env(),
             capture_output=True,
             text=True,
             timeout=60,
diff --git a/archivebox/plugins/staticfile/tests/test_staticfile.py b/archivebox/plugins/staticfile/tests/test_staticfile.py
index 05af3a02..f80d0839 100644
--- a/archivebox/plugins/staticfile/tests/test_staticfile.py
+++ b/archivebox/plugins/staticfile/tests/test_staticfile.py
@@ -80,7 +80,8 @@ class TestStaticfileWithChrome(TestCase):
         # Run staticfile hook with the active Chrome session
         result = subprocess.run(
             ['node', str(STATICFILE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
             cwd=str(snapshot_chrome_dir),
+            env=get_test_env(),
             capture_output=True,
             text=True,
             timeout=120,  # Longer timeout as it waits for navigation
diff --git a/archivebox/plugins/title/tests/test_title.py b/archivebox/plugins/title/tests/test_title.py
index 285f7309..91b548d6 100644
--- a/archivebox/plugins/title/tests/test_title.py
+++ b/archivebox/plugins/title/tests/test_title.py
@@ -53,7 +53,8 @@ def test_extracts_title_from_example_com():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
 
@@ -105,7 +106,8 @@ def test_falls_back_to_http_when_chrome_unavailable():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     assert result.returncode == 0, f"Extraction failed: {result.stderr}"
 
@@ -219,7 +221,8 @@ def test_handles_https_urls():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     if result.returncode == 0:
         # Hook writes to current directory
@@ -249,7 +252,8 @@ def test_handles_404_gracefully():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     # May succeed or fail depending on server behavior
     # example.com returns "Example Domain" even for 404s
@@ -272,7 +276,8 @@ def test_handles_redirects():
         capture_output=True,
         text=True,
-        timeout=60
-    )
+        timeout=60,
+        env=get_test_env(),
+    )
 
     # Should succeed and follow redirect
     if result.returncode == 0:
diff --git a/archivebox/plugins/ublock/tests/test_ublock.py b/archivebox/plugins/ublock/tests/test_ublock.py
index 91492d4e..5489739d 100644
--- a/archivebox/plugins/ublock/tests/test_ublock.py
+++ b/archivebox/plugins/ublock/tests/test_ublock.py
@@ -283,7 +283,7 @@ const puppeteer = require('puppeteer-core');
 
     result = subprocess.run(
         ['node', str(script_path)],
         cwd=str(script_dir),
         capture_output=True,
         text=True,
-        env=env,
+        env={**get_test_env(), **env},
@@ -482,7 +482,7 @@ const puppeteer = require('puppeteer-core');
 
     result = subprocess.run(
         ['node', str(script_path)],
         cwd=str(tmpdir),
         capture_output=True,
         text=True,
-        env=env,
+        env={**get_test_env(), **env},
diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py
index b074d529..99d4e27e 100644
--- a/archivebox/workers/orchestrator.py
+++ b/archivebox/workers/orchestrator.py
@@ -30,6 +30,7 @@ __package__ = 'archivebox.workers'
 import os
 import time
 from typing import Type
+from datetime import timedelta
 from multiprocessing import Process as MPProcess
 
 from django.utils import timezone
@@ -67,12 +68,19 @@ class Orchestrator:
     MAX_WORKERS_PER_TYPE: int = 8   # Max workers per model type
     MAX_TOTAL_WORKERS: int = 24     # Max workers across all types
 
-    def __init__(self, exit_on_idle: bool = True):
+    def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None):
         self.exit_on_idle = exit_on_idle
+        self.crawl_id = crawl_id  # If set, only process work for this crawl
         self.pid: int = os.getpid()
         self.pid_file = None
         self.idle_count: int = 0
         self._last_cleanup_time: float = 0.0  # For throttling cleanup_stale_running()
+
+        # CRITICAL: In foreground mode (exit_on_idle=True), use ONLY 1 worker
+        # to keep execution strictly sequential and deterministic
+        if self.exit_on_idle:
+            self.MAX_WORKERS_PER_TYPE = 1
+            self.MAX_TOTAL_WORKERS = 1
 
     def __repr__(self) -> str:
         return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
@@ -315,15 +323,12 @@ class Orchestrator:
         # Enable progress bars only in TTY + foreground mode
         show_progress = IS_TTY and self.exit_on_idle
 
-        # Debug
-        print(f"[yellow]DEBUG: IS_TTY={IS_TTY}, exit_on_idle={self.exit_on_idle}, show_progress={show_progress}[/yellow]")
-
         self.on_startup()
         task_ids = {}
 
         if not show_progress:
             # No progress bars - just run normally
-            self._run_orchestrator_loop(None, task_ids, None, None)
+            self._run_orchestrator_loop(None, task_ids)
         else:
             # Redirect worker subprocess output to /dev/null
             devnull_fd = os.open(os.devnull, os.O_WRONLY)
@@ -356,7 +361,7 @@ class Orchestrator:
                 TaskProgressColumn(),
                 console=orchestrator_console,
             ) as progress:
-                self._run_orchestrator_loop(progress, task_ids, None, None)
+                self._run_orchestrator_loop(progress, task_ids)
 
             # Restore original console
             logging_module.CONSOLE = original_console
@@ -374,7 +379,7 @@ class Orchestrator:
             pass
         # stdout_for_console is closed by orchestrator_console
 
-    def _run_orchestrator_loop(self, progress, task_ids, read_fd, console):
+    def _run_orchestrator_loop(self, progress, task_ids):
         """Run the main orchestrator loop with optional progress display."""
         try:
             while True:
@@ -385,12 +390,28 @@ class Orchestrator:
 
                 if progress:
                     from archivebox.core.models import Snapshot
 
-                    # Get all started snapshots
-                    active_snapshots = list(Snapshot.objects.filter(status='started'))
+                    # Get all started snapshots (optionally filtered by crawl_id)
+                    snapshot_filter = {'status': 'started'}
+                    if self.crawl_id:
+                        snapshot_filter['crawl_id'] = self.crawl_id
+                    else:
+                        # When processing all crawls, filter by recent modified_at to avoid listing stale snapshots
+                        recent_cutoff = timezone.now() - timedelta(minutes=5)
+                        snapshot_filter['modified_at__gte'] = recent_cutoff
+
+                    active_snapshots = list(Snapshot.objects.filter(**snapshot_filter))
 
                     # Track which snapshots are still active
                     active_ids = set()
 
+                    # Check for duplicate URLs among active snapshots
+                    snapshot_urls = [s.url for s in active_snapshots]
+                    if len(active_snapshots) != len(set(snapshot_urls)):
+                        # Duplicate URLs present - disambiguate by showing snapshot ID
+                        show_id = True
+                    else:
+                        show_id = False
+
                     for snapshot in active_snapshots:
                         active_ids.add(snapshot.id)
@@ -421,7 +442,11 @@ class Orchestrator:
 
                         # Build description with URL + current plugin
                         url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url
-                        description = f"{url}{current_plugin}"
+                        if show_id:
+                            # Show snapshot ID if there are duplicate URLs
+                            description = f"[{str(snapshot.id)[:8]}] {url}{current_plugin}"
+                        else:
+                            description = f"{url}{current_plugin}"
 
                         # Create or update task
                         if snapshot.id not in task_ids:
diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py
index 918b2bba..898c4210 100644
--- a/archivebox/workers/worker.py
+++ b/archivebox/workers/worker.py
@@ -63,9 +63,10 @@ class Worker:
     POLL_INTERVAL: ClassVar[float] = 0.2  # How often to check for new work (seconds)
     IDLE_TIMEOUT: ClassVar[int] = 50      # Exit after N idle iterations (10 sec at 0.2 poll interval)
 
-    def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any):
+    def __init__(self, worker_id: int = 0, daemon: bool = False, crawl_id: str | None = None, **kwargs: Any):
         self.worker_id = worker_id
         self.daemon = daemon
+        self.crawl_id = crawl_id  # If set, only process work for this crawl
         self.pid: int = os.getpid()
         self.pid_file: Path | None = None
         self.idle_count: int = 0
@@ -346,6 +347,13 @@ class CrawlWorker(Worker):
         from archivebox.crawls.models import Crawl
         return Crawl
 
+    def get_queue(self) -> QuerySet:
+        """Get queue of Crawls ready for processing, optionally filtered by crawl_id."""
+        qs = super().get_queue()
+        if self.crawl_id:
+            qs = qs.filter(id=self.crawl_id)
+        return qs
+
 
 class SnapshotWorker(Worker):
     """Worker for processing Snapshot objects."""
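Review note: a minimal usage sketch of the new crawl-scoped foreground mode (constructor arguments are from this diff; the start() entrypoint name is an assumption):

    from archivebox.workers.orchestrator import Orchestrator

    # exit_on_idle=True forces MAX_WORKERS_PER_TYPE = MAX_TOTAL_WORKERS = 1, so snapshots
    # are processed strictly sequentially; crawl_id narrows the CrawlWorker queue and the
    # progress display to a single crawl.
    orchestrator = Orchestrator(exit_on_idle=True, crawl_id='1f0a2b3c')  # id value illustrative
    orchestrator.start()  # assumed entrypoint; exits once the queue goes idle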