codecov, migrations, orchestrator fixes

This commit is contained in:
Nick Sweeting
2026-01-01 16:57:04 -08:00
parent 60422adc87
commit 9008cefca2
21 changed files with 153 additions and 57 deletions

View File

@@ -1515,6 +1515,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
parent_snapshot = overrides.get('snapshot') # Parent snapshot
created_by_id = overrides.get('created_by_id') or (parent_snapshot.created_by.pk if parent_snapshot else get_or_create_system_user_pk())
# DEBUG: Check if crawl_id in record matches overrides crawl
import sys
record_crawl_id = record.get('crawl_id')
if record_crawl_id and crawl and str(crawl.id) != str(record_crawl_id):
print(f"[yellow]⚠️ Snapshot.from_json crawl mismatch: record has crawl_id={record_crawl_id}, overrides has crawl={crawl.id}[/yellow]", file=sys.stderr)
# If no crawl provided, inherit from parent or auto-create one
if not crawl:
if parent_snapshot:
@@ -1536,6 +1542,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
label=f'auto-created for {url[:50]}',
created_by_id=created_by_id,
)
print(f"[red]⚠️ Snapshot.from_json auto-created new crawl {crawl.id} for url={url}[/red]", file=sys.stderr)
# Parse tags
tags_str = record.get('tags', '')
@@ -1546,8 +1553,9 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if tag.strip()
))
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()
# Check for existing snapshot with same URL in same crawl
# (URLs can exist in multiple crawls, but should be unique within a crawl)
snapshot = Snapshot.objects.filter(url=url, crawl=crawl).order_by('-created_at').first()
title = record.get('title')
timestamp = record.get('timestamp')
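
The dedup change above boils down to scoping the existing-snapshot lookup to the current crawl. Restated side by side for clarity (same `url` and `crawl` variables as in the diff; nothing new is introduced):

```python
# Before: newest Snapshot with this URL anywhere, regardless of crawl
snapshot = Snapshot.objects.filter(url=url).order_by('-created_at').first()

# After: newest Snapshot with this URL within the current crawl only,
# so the same URL can legitimately exist once per crawl
snapshot = Snapshot.objects.filter(url=url, crawl=crawl).order_by('-created_at').first()
```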

View File

@@ -892,15 +892,34 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
"""
plugin_upper = plugin_name.upper()
# 1. Enabled: PLUGINNAME_ENABLED (default True)
# 1. Enabled: Check PLUGINS whitelist first, then PLUGINNAME_ENABLED (default True)
# Old names (USE_*, SAVE_*) are aliased in config.json via x-aliases
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
if enabled is None:
enabled = True
elif isinstance(enabled, str):
# Handle string values from config file ("true"/"false")
enabled = enabled.lower() not in ('false', '0', 'no', '')
# Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon)
plugins_whitelist = config.get('PLUGINS', '')
if plugins_whitelist:
# PLUGINS whitelist is specified - only enable plugins in the list
plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()]
if plugin_name.lower() not in plugin_names:
# Plugin not in whitelist - explicitly disabled
enabled = False
else:
# Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
if enabled is None:
enabled = True # Default to enabled if in whitelist
elif isinstance(enabled, str):
enabled = enabled.lower() not in ('false', '0', 'no', '')
else:
# No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
if enabled is None:
enabled = True
elif isinstance(enabled, str):
# Handle string values from config file ("true"/"false")
enabled = enabled.lower() not in ('false', '0', 'no', '')
# 2. Timeout: PLUGINNAME_TIMEOUT (fallback to TIMEOUT, default 300)
timeout_key = f'{plugin_upper}_TIMEOUT'
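
The two branches above repeat the `PLUGINNAME_ENABLED` parsing verbatim; the intended precedence (PLUGINS whitelist first, then the per-plugin flag, default enabled) can be summarized with an equivalent sketch. The helper name is illustrative and not part of the codebase:

```python
def plugin_is_enabled(plugin_name: str, config: dict) -> bool:
    """Equivalent of the branching above: PLUGINS whitelist wins, then PLUGINNAME_ENABLED."""
    whitelist = [p.strip().lower() for p in config.get('PLUGINS', '').split(',') if p.strip()]
    if whitelist and plugin_name.lower() not in whitelist:
        return False                          # whitelist given and plugin not listed
    enabled = config.get(f'{plugin_name.upper()}_ENABLED')
    if enabled is None:
        return True                           # default: enabled
    if isinstance(enabled, str):
        return enabled.lower() not in ('false', '0', 'no', '')  # "true"/"false" from config files
    return bool(enabled)
```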

View File

@@ -80,7 +80,8 @@ class TestAccessibilityWithChrome(TestCase):
# Run accessibility hook with the active Chrome session
result = subprocess.run(
['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,
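
`get_test_env()` itself is not shown in any of these hunks; the tests presumably import it from a shared helper. A minimal sketch of what such a helper typically looks like, purely as an assumption about its shape (the override below is hypothetical, not from the diff):

```python
import os

def get_test_env() -> dict[str, str]:
    """Assumed shape of the helper: inherit the parent environment so `node`
    and Chrome resolve on PATH, plus any test-specific overrides."""
    env = os.environ.copy()
    env.setdefault('NODE_ENV', 'test')  # hypothetical override, for illustration only
    return env
```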

View File

@@ -208,7 +208,8 @@ def test_chrome_launch_and_tab_creation():
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-123', '--crawl-id=test-crawl-123'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,
@@ -268,7 +269,8 @@ def test_chrome_navigation():
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123', '--crawl-id=test-crawl-nav'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,
@@ -279,7 +281,8 @@ def test_chrome_navigation():
# Navigate to URL
result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=120,
@@ -414,7 +417,8 @@ def test_multiple_snapshots_share_chrome():
# Create tab for this snapshot
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), f'--url=https://example.com/{snap_num}', f'--snapshot-id=snap-{snap_num}', '--crawl-id=test-multi-crawl'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,

View File

@@ -80,7 +80,8 @@ class TestConsolelogWithChrome(TestCase):
# Run consolelog hook with the active Chrome session
result = subprocess.run(
['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=120, # Longer timeout as it waits for navigation

View File

@@ -68,7 +68,8 @@ def test_extracts_dom_from_example_com():
capture_output=True,
text=True,
timeout=120
)
timeout=120,
env=get_test_env())
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -152,7 +153,8 @@ def test_staticfile_present_skips():
capture_output=True,
text=True,
timeout=30
)
timeout=30,
env=get_test_env())
assert result.returncode == 0, "Should exit 0 when permanently skipping"

View File

@@ -50,7 +50,8 @@ def test_node_is_available():
capture_output=True,
text=True,
timeout=10
)
timeout=10,
env=get_test_env())
assert result.returncode == 0, f"node not executable: {result.stderr}"
assert result.stdout.startswith('v'), f"Unexpected node version format: {result.stdout}"
@@ -72,7 +73,8 @@ def test_extracts_headers_from_example_com():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -133,7 +135,8 @@ def test_headers_output_structure():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -192,7 +195,8 @@ def test_falls_back_to_http_when_chrome_unavailable():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -309,7 +313,8 @@ def test_handles_https_urls():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
if result.returncode == 0:
output_headers_file = tmpdir / 'headers.json'
@@ -334,7 +339,8 @@ def test_handles_404_gracefully():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
# May succeed or fail depending on server behavior
# If it succeeds, verify 404 status is captured

View File

@@ -123,7 +123,8 @@ def test_scrolls_page_and_outputs_stats():
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
cwd=str(infiniscroll_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,
@@ -187,7 +188,8 @@ def test_config_scroll_limit_honored():
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
cwd=str(infiniscroll_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,
@@ -246,7 +248,8 @@ def test_config_timeout_honored():
start_time = time.time()
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
cwd=str(infiniscroll_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=30,

View File

@@ -154,7 +154,8 @@ def test_extension_loads_in_chromium():
# Step 1: Install the extension
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
env=get_test_env(),
capture_output=True,
text=True,
env=env,
@@ -291,7 +292,8 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir),
env=get_test_env(),
capture_output=True,
text=True,
env=env,
@@ -443,7 +445,8 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
cwd=str(script_dir),
env=get_test_env(),
capture_output=True,
text=True,
env=env,
@@ -557,7 +560,8 @@ def test_hides_cookie_consent_on_filmin():
result = subprocess.run(
['node', str(INSTALL_SCRIPT)],
cwd=str(tmpdir),
env=get_test_env(),
capture_output=True,
text=True,
env=env_with_ext,

View File

@@ -80,7 +80,8 @@ class TestParseDomOutlinksWithChrome(TestCase):
# Run outlinks hook with the active Chrome session
result = subprocess.run(
['node', str(OUTLINKS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,

View File

@@ -69,7 +69,8 @@ def test_extracts_pdf_from_example_com():
capture_output=True,
text=True,
timeout=120
)
timeout=120,
env=get_test_env())
# Parse clean JSONL output (hook might fail due to network issues)
result_json = None

View File

@@ -81,7 +81,8 @@ class TestRedirectsWithChrome(TestCase):
# Run redirects hook with the active Chrome session
result = subprocess.run(
['node', str(REDIRECTS_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,

View File

@@ -80,7 +80,8 @@ class TestResponsesWithChrome(TestCase):
# Run responses hook with the active Chrome session
result = subprocess.run(
['node', str(RESPONSES_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=120, # Longer timeout as it waits for navigation

View File

@@ -65,7 +65,8 @@ def test_extracts_screenshot_from_example_com():
cwd=tmpdir,
capture_output=True,
text=True,
timeout=120
timeout=120,
env=get_test_env()
)
assert result.returncode == 0, f"Extraction failed: {result.stderr}"

View File

@@ -80,7 +80,8 @@ class TestSEOWithChrome(TestCase):
# Run SEO hook with the active Chrome session
result = subprocess.run(
['node', str(SEO_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,

View File

@@ -80,7 +80,8 @@ class TestSSLWithChrome(TestCase):
# Run SSL hook with the active Chrome session
result = subprocess.run(
['node', str(SSL_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=60,

View File

@@ -80,7 +80,8 @@ class TestStaticfileWithChrome(TestCase):
# Run staticfile hook with the active Chrome session
result = subprocess.run(
['node', str(STATICFILE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
env=get_test_env(),
capture_output=True,
text=True,
timeout=120, # Longer timeout as it waits for navigation

View File

@@ -53,7 +53,8 @@ def test_extracts_title_from_example_com():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -105,7 +106,8 @@ def test_falls_back_to_http_when_chrome_unavailable():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
@@ -219,7 +221,8 @@ def test_handles_https_urls():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
if result.returncode == 0:
# Hook writes to current directory
@@ -249,7 +252,8 @@ def test_handles_404_gracefully():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
# May succeed or fail depending on server behavior
# example.com returns "Example Domain" even for 404s
@@ -272,7 +276,8 @@ def test_handles_redirects():
capture_output=True,
text=True,
timeout=60
)
timeout=60,
env=get_test_env())
# Should succeed and follow redirect
if result.returncode == 0:

View File

@@ -283,7 +283,8 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
cwd=str(script_dir),
env=get_test_env(),
capture_output=True,
text=True,
env=env,
@@ -482,7 +483,8 @@ const puppeteer = require('puppeteer-core');
result = subprocess.run(
['node', str(script_path)],
cwd=str(tmpdir),
env=get_test_env(),
capture_output=True,
text=True,
env=env,

View File

@@ -30,6 +30,7 @@ __package__ = 'archivebox.workers'
import os
import time
from typing import Type
from datetime import timedelta
from multiprocessing import Process as MPProcess
from django.utils import timezone
@@ -67,12 +68,19 @@ class Orchestrator:
MAX_WORKERS_PER_TYPE: int = 8 # Max workers per model type
MAX_TOTAL_WORKERS: int = 24 # Max workers across all types
def __init__(self, exit_on_idle: bool = True):
def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None):
self.exit_on_idle = exit_on_idle
self.crawl_id = crawl_id # If set, only process work for this crawl
self.pid: int = os.getpid()
self.pid_file = None
self.idle_count: int = 0
self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running()
# CRITICAL: In foreground mode (exit_on_idle=True), use ONLY 1 worker
# to keep execution strictly sequential and deterministic
if self.exit_on_idle:
self.MAX_WORKERS_PER_TYPE = 1
self.MAX_TOTAL_WORKERS = 1
def __repr__(self) -> str:
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
@@ -315,15 +323,12 @@ class Orchestrator:
# Enable progress bars only in TTY + foreground mode
show_progress = IS_TTY and self.exit_on_idle
# Debug
print(f"[yellow]DEBUG: IS_TTY={IS_TTY}, exit_on_idle={self.exit_on_idle}, show_progress={show_progress}[/yellow]")
self.on_startup()
task_ids = {}
if not show_progress:
# No progress bars - just run normally
self._run_orchestrator_loop(None, task_ids, None, None)
self._run_orchestrator_loop(None, task_ids)
else:
# Redirect worker subprocess output to /dev/null
devnull_fd = os.open(os.devnull, os.O_WRONLY)
@@ -356,7 +361,7 @@ class Orchestrator:
TaskProgressColumn(),
console=orchestrator_console,
) as progress:
self._run_orchestrator_loop(progress, task_ids, None, None)
self._run_orchestrator_loop(progress, task_ids)
# Restore original console
logging_module.CONSOLE = original_console
@@ -374,7 +379,7 @@ class Orchestrator:
pass
# stdout_for_console is closed by orchestrator_console
def _run_orchestrator_loop(self, progress, task_ids, read_fd, console):
def _run_orchestrator_loop(self, progress, task_ids):
"""Run the main orchestrator loop with optional progress display."""
try:
while True:
@@ -385,12 +390,28 @@ class Orchestrator:
if progress:
from archivebox.core.models import Snapshot
# Get all started snapshots
active_snapshots = list(Snapshot.objects.filter(status='started'))
# Get all started snapshots (optionally filtered by crawl_id)
snapshot_filter = {'status': 'started'}
if self.crawl_id:
snapshot_filter['crawl_id'] = self.crawl_id
else:
# Only if processing all crawls, filter by recent modified_at to avoid stale snapshots
recent_cutoff = timezone.now() - timedelta(minutes=5)
snapshot_filter['modified_at__gte'] = recent_cutoff
active_snapshots = list(Snapshot.objects.filter(**snapshot_filter))
# Track which snapshots are still active
active_ids = set()
# Debug: check for duplicates
snapshot_urls = [s.url for s in active_snapshots]
if len(active_snapshots) != len(set(snapshot_urls)):
# We have duplicate URLs - let's deduplicate by showing snapshot ID
show_id = True
else:
show_id = False
for snapshot in active_snapshots:
active_ids.add(snapshot.id)
@@ -421,7 +442,11 @@ class Orchestrator:
# Build description with URL + current plugin
url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url
description = f"{url}{current_plugin}"
if show_id:
# Show snapshot ID if there are duplicate URLs
description = f"[{str(snapshot.id)[:8]}] {url}{current_plugin}"
else:
description = f"{url}{current_plugin}"
# Create or update task
if snapshot.id not in task_ids:
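
For reference, the progress-bar queryset built above resolves as follows (this just restates the diff's logic as a standalone helper; the function name is illustrative, not project code):

```python
from datetime import timedelta
from django.utils import timezone

def active_snapshot_filter(crawl_id: str | None) -> dict:
    """Mirror of the filter built in _run_orchestrator_loop above."""
    snapshot_filter = {'status': 'started'}
    if crawl_id:
        # Scoped orchestrator: only show snapshots belonging to this crawl
        snapshot_filter['crawl_id'] = crawl_id
    else:
        # Global orchestrator: hide snapshots not touched in the last 5 minutes
        snapshot_filter['modified_at__gte'] = timezone.now() - timedelta(minutes=5)
    return snapshot_filter

# active_snapshots = list(Snapshot.objects.filter(**active_snapshot_filter(self.crawl_id)))
```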

View File

@@ -63,9 +63,10 @@ class Worker:
POLL_INTERVAL: ClassVar[float] = 0.2 # How often to check for new work (seconds)
IDLE_TIMEOUT: ClassVar[int] = 50 # Exit after N idle iterations (10 sec at 0.2 poll interval)
def __init__(self, worker_id: int = 0, daemon: bool = False, **kwargs: Any):
def __init__(self, worker_id: int = 0, daemon: bool = False, crawl_id: str | None = None, **kwargs: Any):
self.worker_id = worker_id
self.daemon = daemon
self.crawl_id = crawl_id # If set, only process work for this crawl
self.pid: int = os.getpid()
self.pid_file: Path | None = None
self.idle_count: int = 0
@@ -346,6 +347,13 @@ class CrawlWorker(Worker):
from archivebox.crawls.models import Crawl
return Crawl
def get_queue(self) -> QuerySet:
"""Get queue of Crawls ready for processing, optionally filtered by crawl_id."""
qs = super().get_queue()
if self.crawl_id:
qs = qs.filter(id=self.crawl_id)
return qs
class SnapshotWorker(Worker):
"""Worker for processing Snapshot objects."""