diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index fede3847..ae6afbbb 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -26,7 +26,8 @@
       "Bash(grep:*)",
       "WebFetch(domain:python-statemachine.readthedocs.io)",
       "Bash(./bin/run_plugin_tests.sh:*)",
-      "Bash(done)"
+      "Bash(done)",
+      "Bash(coverage erase:*)"
     ]
   }
 }
diff --git a/.gitignore b/.gitignore
index 5f6ffcae..832334e7 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,6 +10,7 @@ tests/out/
 .coverage
 .coverage.*
 coverage.json
+coverage/
 htmlcov/
 
 # Python and Node dependencies
diff --git a/archivebox/config/constants.py b/archivebox/config/constants.py
index ec11dff9..a863837c 100644
--- a/archivebox/config/constants.py
+++ b/archivebox/config/constants.py
@@ -173,6 +173,8 @@ class ConstantsDict(Mapping):
         CUSTOM_TEMPLATES_DIR_NAME,
         CUSTOM_PLUGINS_DIR_NAME,
         CRONTABS_DIR_NAME,
+        "invalid",
+        "users",
         # Backwards compatibility with old directory names
         "user_plugins",      # old name for USER_PLUGINS_DIR (now 'plugins')
         "user_templates",    # old name for CUSTOM_TEMPLATES_DIR (now 'templates')
diff --git a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
index 676639c7..ddcdcd28 100644
--- a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
+++ b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
@@ -57,18 +57,8 @@ class Migration(migrations.Migration):
             name='snapshot',
             options={'verbose_name': 'Snapshot', 'verbose_name_plural': 'Snapshots'},
         ),
-        migrations.RemoveField(
-            model_name='archiveresult',
-            name='cmd',
-        ),
-        migrations.RemoveField(
-            model_name='archiveresult',
-            name='cmd_version',
-        ),
-        migrations.RemoveField(
-            model_name='archiveresult',
-            name='pwd',
-        ),
+        # NOTE: RemoveField for cmd, cmd_version, pwd moved to migration 0027
+        # to allow data migration to Process records first
         migrations.AddField(
             model_name='archiveresult',
             name='config',
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index c3731354..928aa990 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -2208,7 +2208,7 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
     tick = (
         queued.to.itself(unless='can_start') |
         queued.to(started, cond='can_start') |
-        started.to.itself(unless='is_finished') |
+        started.to.itself(unless='is_finished', on='on_started_to_started') |
         started.to(sealed, cond='is_finished')
     )
 
@@ -2243,6 +2243,13 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
             status=Snapshot.StatusChoices.STARTED,
         )
 
+    def on_started_to_started(self):
+        """Called when Snapshot stays in started state (archiveresults not finished yet)."""
+        # Bump retry_at so we check again in a few seconds
+        self.snapshot.update_and_requeue(
+            retry_at=timezone.now() + timedelta(seconds=5),
+        )
+
     @sealed.enter
     def enter_sealed(self):
         # Clean up background hooks
diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index dd849d2a..e8415918 100755
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -502,7 +502,7 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
     tick = (
         queued.to.itself(unless='can_start') |
         queued.to(started, cond='can_start') |
-        started.to.itself(unless='is_finished') |
+        started.to.itself(unless='is_finished', on='on_started_to_started') |
         started.to(sealed, cond='is_finished')
     )
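
The `on='on_started_to_started'` hook added to both state machines above is what turns the idle self-transition into a "check again in a few seconds" requeue instead of a silent no-op. A minimal sketch of the pattern using python-statemachine directly (plain `StateMachine`, illustrative names, not ArchiveBox's `BaseStateMachine` or real models):

```python
from statemachine import State, StateMachine


class RetryingJob(StateMachine):
    """Toy machine: `tick` keeps landing back in `started` until the work is done."""
    queued = State(initial=True)
    started = State()
    sealed = State(final=True)

    tick = (
        queued.to(started)
        | started.to.itself(unless='is_finished', on='on_still_running')
        | started.to(sealed, cond='is_finished')
    )

    def __init__(self):
        self.checks_remaining = 3  # stand-in for "are all ArchiveResults finished yet?"
        super().__init__()

    def is_finished(self):
        return self.checks_remaining <= 0

    def on_still_running(self):
        # ArchiveBox bumps retry_at here; the toy version just counts down.
        self.checks_remaining -= 1


job = RetryingJob()
while job.current_state.id != 'sealed':
    job.tick()  # started -> started fires on_still_running until is_finished becomes true
```

Each tick that stays in `started` runs the `on` callback, which in the real code pushes `retry_at` a few seconds into the future so the worker re-polls the snapshot/crawl later rather than busy-waiting.
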
diff --git a/archivebox/hooks.py b/archivebox/hooks.py
index 9078e02a..f955974b 100644
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -1201,6 +1201,14 @@ def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any
         # Dispatch to appropriate model's from_json() method
         if record_type == 'Snapshot':
             from archivebox.core.models import Snapshot
+
+            # Check if discovered snapshot exceeds crawl max_depth
+            snapshot_depth = record.get('depth', 0)
+            crawl = overrides.get('crawl')
+            if crawl and snapshot_depth > crawl.max_depth:
+                # Skip - this URL was discovered but exceeds max crawl depth
+                continue
+
             obj = Snapshot.from_json(record.copy(), overrides)
             if obj:
                 stats['Snapshot'] = stats.get('Snapshot', 0) + 1
diff --git a/archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py b/archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py
index 6ec7bcb9..caccdac5 100755
--- a/archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py
+++ b/archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py
@@ -163,8 +163,10 @@ def fetch_content(url: str) -> str:
 
 @click.command()
 @click.option('--url', required=True, help='Netscape bookmark file URL to parse')
-@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
-def main(url: str, snapshot_id: str = None):
+@click.option('--snapshot-id', required=False, help='Parent Snapshot UUID')
+@click.option('--crawl-id', required=False, help='Crawl UUID')
+@click.option('--depth', type=int, default=0, help='Current depth level')
+def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
     """Parse Netscape bookmark HTML and extract URLs."""
 
     try:
@@ -188,7 +190,12 @@ def main(url: str, snapshot_id: str = None):
             'type': 'Snapshot',
             'url': unescape(bookmark_url),
             'plugin': PLUGIN_NAME,
+            'depth': depth + 1,
         }
+        if snapshot_id:
+            entry['parent_snapshot_id'] = snapshot_id
+        if crawl_id:
+            entry['crawl_id'] = crawl_id
         if title:
             entry['title'] = unescape(title)
         if tags_str:
diff --git a/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py b/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py
index 491555d4..d899c742 100755
--- a/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py
+++ b/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py
@@ -100,8 +100,10 @@ def fetch_content(url: str) -> str:
 
 @click.command()
 @click.option('--url', required=True, help='URL to parse (file:// or https://)')
-@click.option('--snapshot-id', required=False, help='Snapshot UUID (unused but required by hook runner)')
-def main(url: str, snapshot_id: str = None):
+@click.option('--snapshot-id', required=False, help='Parent Snapshot UUID')
+@click.option('--crawl-id', required=False, help='Crawl UUID')
+@click.option('--depth', type=int, default=0, help='Current depth level')
+def main(url: str, snapshot_id: str = None, crawl_id: str = None, depth: int = 0):
     """Parse plain text and extract URLs."""
 
     try:
@@ -123,9 +125,12 @@ def main(url: str, snapshot_id: str = None):
             'type': 'Snapshot',
             'url': found_url,
             'plugin': PLUGIN_NAME,
+            'depth': depth + 1,
         }
         if snapshot_id:
             record['parent_snapshot_id'] = snapshot_id
+        if crawl_id:
+            record['crawl_id'] = crawl_id
         print(json.dumps(record))
 
         # Emit ArchiveResult record to mark completion
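
With the parser hooks now tagging every discovered URL with `depth` (and `parent_snapshot_id` / `crawl_id` when known), the new check in `hooks.py` can drop anything that falls outside the crawl's depth budget before a Snapshot is ever created. A standalone sketch of that filtering idea, with illustrative names only (this is not the actual `process_hook_records()` implementation):

```python
import json


def filter_discovered_urls(jsonl_lines, max_depth):
    """Keep only Snapshot records whose depth fits within the crawl's max_depth."""
    kept = []
    for line in jsonl_lines:
        record = json.loads(line)
        if record.get('type') == 'Snapshot' and record.get('depth', 0) > max_depth:
            continue  # discovered URL is deeper than the crawl allows
        kept.append(record)
    return kept


# A depth-0 parser run emitting children at depth 1 and 2:
parser_output = [
    json.dumps({'type': 'Snapshot', 'url': 'https://example.com/a', 'depth': 1}),
    json.dumps({'type': 'Snapshot', 'url': 'https://example.com/a/b', 'depth': 2}),
]
print(filter_discovered_urls(parser_output, max_depth=1))
# -> only the depth-1 record survives a max_depth=1 crawl
```
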
diff --git a/archivebox/tests/test_migrations_08_to_09.py b/archivebox/tests/test_migrations_08_to_09.py
index 487911a5..37f5ce83 100644
--- a/archivebox/tests/test_migrations_08_to_09.py
+++ b/archivebox/tests/test_migrations_08_to_09.py
@@ -30,6 +30,7 @@ from .test_migrations_helpers import (
     verify_foreign_keys,
     verify_all_snapshots_in_output,
     verify_crawl_count,
+    verify_process_migration,
 )
 
 
@@ -260,6 +261,54 @@ class TestMigrationFrom08x(unittest.TestCase):
         self.assertTrue('ArchiveBox' in output or 'version' in output.lower(),
                         f"Version output missing expected content: {output[:500]}")
 
+    def test_migration_creates_process_records(self):
+        """Migration should create Process records for all ArchiveResults."""
+        result = run_archivebox(self.work_dir, ['init'], timeout=45)
+        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
+
+        # Verify Process records created
+        expected_count = len(self.original_data['archiveresults'])
+        ok, msg = verify_process_migration(self.db_path, expected_count)
+        self.assertTrue(ok, msg)
+
+    def test_migration_creates_binary_records(self):
+        """Migration should create Binary records from cmd_version data."""
+        result = run_archivebox(self.work_dir, ['init'], timeout=45)
+        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
+
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        # Check Binary records exist
+        cursor.execute("SELECT COUNT(*) FROM machine_binary")
+        binary_count = cursor.fetchone()[0]
+
+        # Should have at least one binary per unique extractor
+        extractors = set(ar['extractor'] for ar in self.original_data['archiveresults'])
+        self.assertGreaterEqual(binary_count, len(extractors),
+                                f"Expected at least {len(extractors)} Binaries, got {binary_count}")
+
+        conn.close()
+
+    def test_migration_preserves_cmd_data(self):
+        """Migration should preserve cmd data in Process.cmd field."""
+        result = run_archivebox(self.work_dir, ['init'], timeout=45)
+        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
+
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        # Check that Process records have cmd arrays
+        cursor.execute("SELECT cmd FROM machine_process WHERE cmd != '[]'")
+        cmd_records = cursor.fetchall()
+
+        # All Processes should have non-empty cmd (test data has json.dumps([extractor, '--version']))
+        expected_count = len(self.original_data['archiveresults'])
+        self.assertEqual(len(cmd_records), expected_count,
+                         f"Expected {expected_count} Processes with cmd, got {len(cmd_records)}")
+
+        conn.close()
+
 
 class TestMigrationDataIntegrity08x(unittest.TestCase):
     """Comprehensive data integrity tests for 0.8.x migrations."""
diff --git a/archivebox/tests/test_migrations_helpers.py b/archivebox/tests/test_migrations_helpers.py
index 55544a11..ffdf1b4d 100644
--- a/archivebox/tests/test_migrations_helpers.py
+++ b/archivebox/tests/test_migrations_helpers.py
@@ -730,44 +730,26 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
         tag_id = cursor.lastrowid
         created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
 
-    # Create Seeds first (required for 0.8.x Crawls)
-    test_seeds = [
-        ('https://example.com', 'auto', 'Example Seed'),
-        ('https://github.com/ArchiveBox', 'auto', 'GitHub Seed'),
-    ]
-
-    created_data['seeds'] = []
-    for uri, extractor, label in test_seeds:
-        seed_id = generate_uuid()
-        cursor.execute("""
-            INSERT INTO crawls_seed (id, created_at, created_by_id, modified_at, uri,
-                                     extractor, tags_str, label, config, output_dir, notes,
-                                     num_uses_failed, num_uses_succeeded)
-            VALUES (?, datetime('now'), ?, datetime('now'), ?, ?, '', ?, '{}', '', '', 0, 0)
-        """, (seed_id, user_id, uri, extractor, label))
-        created_data['seeds'].append({'id': seed_id, 'uri': uri, 'label': label})
-
-    # Create 2 Crawls (linked to Seeds)
+    # Create 2 Crawls (0.9.0 schema - no seeds)
     test_crawls = [
-        ('https://example.com\nhttps://example.org', 0, 'Example Crawl', created_data['seeds'][0]['id']),
-        ('https://github.com/ArchiveBox', 1, 'GitHub Crawl', created_data['seeds'][1]['id']),
+        ('https://example.com\nhttps://example.org', 0, 'Example Crawl'),
+        ('https://github.com/ArchiveBox', 1, 'GitHub Crawl'),
     ]
 
-    for i, (urls, max_depth, label, seed_id) in enumerate(test_crawls):
+    for i, (urls, max_depth, label) in enumerate(test_crawls):
         crawl_id = generate_uuid()
         cursor.execute("""
-            INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, seed_id, urls,
+            INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, urls,
                                       config, max_depth, tags_str, label, status, retry_at,
                                       num_uses_failed, num_uses_succeeded)
-            VALUES (?, datetime('now'), ?, datetime('now'), ?, ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
-        """, (crawl_id, user_id, seed_id, urls, max_depth, label))
+            VALUES (?, datetime('now'), ?, datetime('now'), ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
+        """, (crawl_id, user_id, urls, max_depth, label))
         created_data['crawls'].append({
             'id': crawl_id,
             'urls': urls,
             'max_depth': max_depth,
             'label': label,
-            'seed_id': seed_id,
         })
 
     # Create 5 snapshots linked to crawls
@@ -1146,3 +1128,64 @@ def verify_crawl_count(db_path: Path, expected: int) -> Tuple[bool, str]:
     if count == expected:
         return True, f"Crawl count OK: {count}"
     return False, f"Crawl count mismatch: expected {expected}, got {count}"
+
+
+def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -> Tuple[bool, str]:
+    """
+    Verify that ArchiveResults were properly migrated to Process records.
+
+    Checks:
+    1. All ArchiveResults have process_id set
+    2. Process count matches ArchiveResult count
+    3. Binary records created for unique cmd_version values
+    4. Status mapping is correct
+    """
+    conn = sqlite3.connect(str(db_path))
+    cursor = conn.cursor()
+
+    # Check all ArchiveResults have process_id
+    cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NULL")
+    null_count = cursor.fetchone()[0]
+
+    if null_count > 0:
+        conn.close()
+        return False, f"Found {null_count} ArchiveResults without process_id"
+
+    # Check Process count
+    cursor.execute("SELECT COUNT(*) FROM machine_process")
+    process_count = cursor.fetchone()[0]
+
+    if process_count != expected_archiveresult_count:
+        conn.close()
+        return False, f"Expected {expected_archiveresult_count} Processes, got {process_count}"
+
+    # Check status mapping
+    cursor.execute("""
+        SELECT ar.status, p.status, p.exit_code
+        FROM core_archiveresult ar
+        JOIN machine_process p ON ar.process_id = p.id
+    """)
+
+    status_errors = []
+    for ar_status, p_status, p_exit_code in cursor.fetchall():
+        expected_p_status, expected_exit_code = {
+            'queued': ('queued', None),
+            'started': ('running', None),
+            'backoff': ('queued', None),
+            'succeeded': ('exited', 0),
+            'failed': ('exited', 1),
+            'skipped': ('exited', None),
+        }.get(ar_status, ('queued', None))
+
+        if p_status != expected_p_status:
+            status_errors.append(f"AR status {ar_status} → Process {p_status}, expected {expected_p_status}")
+
+        if p_exit_code != expected_exit_code:
+            status_errors.append(f"AR status {ar_status} → exit_code {p_exit_code}, expected {expected_exit_code}")
+
+    if status_errors:
+        conn.close()
+        return False, f"Status mapping errors: {'; '.join(status_errors[:5])}"
+
+    conn.close()
+    return True, f"Process migration verified: {process_count} Processes created"
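
For context, the data migration that `verify_process_migration()` exercises is not part of this diff (only the NOTE in migration 0025 points at it). A hedged sketch of the general shape of that conversion, where model and field names are assumptions inferred from the test assertions above, not copied from the real migration 0027:

```python
from django.db import migrations

# Assumed mapping, mirroring what the verification helper expects.
STATUS_MAP = {
    'queued':    ('queued',  None),
    'started':   ('running', None),
    'backoff':   ('queued',  None),
    'succeeded': ('exited',  0),
    'failed':    ('exited',  1),
    'skipped':   ('exited',  None),
}


def archiveresults_to_processes(apps, schema_editor):
    """Create one Process per ArchiveResult before the old cmd/cmd_version/pwd columns are dropped."""
    ArchiveResult = apps.get_model('core', 'ArchiveResult')
    Process = apps.get_model('machine', 'Process')

    for ar in ArchiveResult.objects.all().iterator():
        proc_status, exit_code = STATUS_MAP.get(ar.status, ('queued', None))
        process = Process.objects.create(
            cmd=ar.cmd or [],   # preserved so the old cmd data survives the column drop
            status=proc_status,
            exit_code=exit_code,
        )
        ar.process = process
        ar.save(update_fields=['process'])

# Wired into a migration roughly as:
#   operations = [migrations.RunPython(archiveresults_to_processes, migrations.RunPython.noop)]
```
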
diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py
index 6f86a592..b074d529 100644
--- a/archivebox/workers/orchestrator.py
+++ b/archivebox/workers/orchestrator.py
@@ -175,8 +175,50 @@
         """Spawn a new worker process. Returns PID or None if spawn failed."""
         try:
             pid = WorkerClass.start(daemon=False)
-            # Worker spawning is logged by the worker itself in on_startup()
-            return pid
+
+            # CRITICAL: Block until worker registers itself in Process table
+            # This prevents race condition where orchestrator spawns multiple workers
+            # before any of them finish on_startup() and register
+            from archivebox.machine.models import Process
+            import time
+
+            timeout = 5.0        # seconds to wait for worker registration
+            poll_interval = 0.1  # check every 100ms
+            elapsed = 0.0
+            spawn_time = timezone.now()
+
+            while elapsed < timeout:
+                # Check if worker process is registered with strict criteria:
+                # 1. Correct PID
+                # 2. WORKER process type
+                # 3. RUNNING status
+                # 4. Parent is this orchestrator
+                # 5. Started recently (within last 10 seconds)
+                worker_process = Process.objects.filter(
+                    pid=pid,
+                    process_type=Process.TypeChoices.WORKER,
+                    status=Process.StatusChoices.RUNNING,
+                    parent_id=self.db_process.id,
+                    started_at__gte=spawn_time - timedelta(seconds=10),
+                ).first()
+
+                if worker_process:
+                    # Worker successfully registered!
+                    return pid
+
+                time.sleep(poll_interval)
+                elapsed += poll_interval
+
+            # Timeout - worker failed to register
+            log_worker_event(
+                worker_type='Orchestrator',
+                event='Worker failed to register in time',
+                indent_level=0,
+                pid=self.pid,
+                metadata={'worker_type': WorkerClass.name, 'worker_pid': pid, 'timeout': timeout},
+            )
+            return None
+
         except Exception as e:
             log_worker_event(
                 worker_type='Orchestrator',
@@ -266,48 +308,75 @@
     def runloop(self) -> None:
         """Main orchestrator loop."""
         from rich.progress import Progress, BarColumn, TextColumn, TaskProgressColumn
-        from archivebox.misc.logging import IS_TTY
-        import archivebox.misc.logging as logging_module
+        from archivebox.misc.logging import IS_TTY, CONSOLE
+        import sys
+        import os
 
         # Enable progress bars only in TTY + foreground mode
         show_progress = IS_TTY and self.exit_on_idle
 
-        # Save original consoles
-        original_console = logging_module.CONSOLE
-        original_stderr = logging_module.STDERR
+        # Debug
+        print(f"[yellow]DEBUG: IS_TTY={IS_TTY}, exit_on_idle={self.exit_on_idle}, show_progress={show_progress}[/yellow]")
 
-        # Create Progress with the console it will control
-        progress = Progress(
-            TextColumn("[cyan]{task.description}"),
-            BarColumn(bar_width=40),
-            TaskProgressColumn(),
-            transient=False,
-            console=original_console,  # Use the original console
-        ) if show_progress else None
+        self.on_startup()
+        task_ids = {}
 
-        task_ids = {}  # snapshot_id -> task_id
+        if not show_progress:
+            # No progress bars - just run normally
+            self._run_orchestrator_loop(None, task_ids, None, None)
+        else:
+            # Redirect worker subprocess output to /dev/null
+            devnull_fd = os.open(os.devnull, os.O_WRONLY)
 
-        # Wrapper to convert console.print() to console.log() for Rich Progress
-        class ConsoleLogWrapper:
-            def __init__(self, console):
-                self._console = console
-            def print(self, *args, **kwargs):
-                # Use log() instead of print() to work with Live display
-                self._console.log(*args)
-            def __getattr__(self, name):
-                return getattr(self._console, name)
+            # Save original stdout/stderr (make 2 copies - one for Console, one for restoring)
+            original_stdout = sys.stdout.fileno()
+            original_stderr = sys.stderr.fileno()
+            stdout_for_console = os.dup(original_stdout)
+            stdout_for_restore = os.dup(original_stdout)
+            stderr_for_restore = os.dup(original_stderr)
 
+            try:
+                # Redirect stdout/stderr to /dev/null (workers will inherit this)
+                os.dup2(devnull_fd, original_stdout)
+                os.dup2(devnull_fd, original_stderr)
+
+                # Create Console using saved stdout (not the redirected one)
+                from rich.console import Console
+                import archivebox.misc.logging as logging_module
+                orchestrator_console = Console(file=os.fdopen(stdout_for_console, 'w'), force_terminal=True)
+
+                # Update global CONSOLE so orchestrator logs appear too
+                original_console = logging_module.CONSOLE
+                logging_module.CONSOLE = orchestrator_console
+
+                # Now create Progress and run loop (DON'T restore stdout/stderr - workers need /dev/null)
+                with Progress(
+                    TextColumn("[cyan]{task.description}"),
+                    BarColumn(bar_width=40),
+                    TaskProgressColumn(),
+                    console=orchestrator_console,
+                ) as progress:
+                    self._run_orchestrator_loop(progress, task_ids, None, None)
+
+                # Restore original console
+                logging_module.CONSOLE = original_console
+            finally:
+                # Restore stdout/stderr
+                os.dup2(stdout_for_restore, original_stdout)
+                os.dup2(stderr_for_restore, original_stderr)
+
+                # Cleanup
+                try:
+                    os.close(devnull_fd)
+                    os.close(stdout_for_restore)
+                    os.close(stderr_for_restore)
+                except:
+                    pass
+                # stdout_for_console is closed by orchestrator_console
+
+    def _run_orchestrator_loop(self, progress, task_ids, read_fd, console):
+        """Run the main orchestrator loop with optional progress display."""
         try:
-            if progress:
-                progress.start()
-                # Wrap progress.console so print() calls become log() calls
-                wrapped_console = ConsoleLogWrapper(progress.console)
-                logging_module.CONSOLE = wrapped_console
-                logging_module.STDERR = wrapped_console
-
-                # Call on_startup AFTER redirecting consoles
-                self.on_startup()
-
             while True:
                 # Check queues and spawn workers
                 queue_sizes = self.check_queues_and_spawn_workers()
@@ -333,12 +402,33 @@
                         status__in=['succeeded', 'skipped', 'failed']
                     ).count()
 
+                    # Find currently running hook (ordered by hook_name to get lowest step number)
+                    current_ar = snapshot.archiveresult_set.filter(status='started').order_by('hook_name').first()
+                    if not current_ar:
+                        # If nothing running, show next queued item (ordered to get next in sequence)
+                        current_ar = snapshot.archiveresult_set.filter(status='queued').order_by('hook_name').first()
+
+                    current_plugin = ''
+                    if current_ar:
+                        # Use hook_name if available, otherwise plugin name
+                        hook_name = current_ar.hook_name or current_ar.plugin or ''
+                        # Extract just the hook name without path (e.g., "on_Snapshot__50_wget.py" -> "wget")
+                        if hook_name:
+                            # Clean up the name: remove prefix and extension
+                            clean_name = hook_name.split('__')[-1] if '__' in hook_name else hook_name
+                            clean_name = clean_name.replace('.py', '').replace('.sh', '').replace('.bg', '')
+                            current_plugin = f" • {clean_name}"
+
+                    # Build description with URL + current plugin
+                    url = snapshot.url[:50] + '...' if len(snapshot.url) > 50 else snapshot.url
+                    description = f"{url}{current_plugin}"
+
                     # Create or update task
                     if snapshot.id not in task_ids:
-                        url = snapshot.url[:60] + '...' if len(snapshot.url) > 60 else snapshot.url
-                        task_ids[snapshot.id] = progress.add_task(url, total=total, completed=completed)
+                        task_ids[snapshot.id] = progress.add_task(description, total=total, completed=completed)
                     else:
-                        progress.update(task_ids[snapshot.id], completed=completed)
+                        # Update both progress and description
+                        progress.update(task_ids[snapshot.id], description=description, completed=completed)
 
                 # Remove tasks for snapshots that are no longer active
                 for snapshot_id in list(task_ids.keys()):
@@ -373,12 +463,6 @@
                 raise
         else:
             self.on_shutdown()
-        finally:
-            if progress:
-                # Restore original consoles
-                logging_module.CONSOLE = original_console
-                logging_module.STDERR = original_stderr
-                progress.stop()
 
     def start(self) -> int:
         """
diff --git a/bin/test_plugins.sh b/bin/test_plugins.sh
index eead957a..3e8305bf 100755
--- a/bin/test_plugins.sh
+++ b/bin/test_plugins.sh
@@ -1,14 +1,20 @@
 #!/bin/bash
-# Run ArchiveBox plugin tests
+# Run ArchiveBox plugin tests with coverage
 #
 # All plugin tests use pytest and are located in pluginname/tests/test_*.py
 #
-# Usage: ./bin/run_plugin_tests.sh [plugin_name]
+# Usage: ./bin/test_plugins.sh [plugin_name] [--no-coverage]
 #
 # Examples:
-#   ./bin/run_plugin_tests.sh              # Run all plugin tests
-#   ./bin/run_plugin_tests.sh chrome       # Run chrome plugin tests
-#   ./bin/run_plugin_tests.sh parse_*      # Run all parse_* plugin tests
+#   ./bin/test_plugins.sh                  # Run all plugin tests with coverage
+#   ./bin/test_plugins.sh chrome           # Run chrome plugin tests with coverage
+#   ./bin/test_plugins.sh parse_*          # Run all parse_* plugin tests with coverage
+#   ./bin/test_plugins.sh --no-coverage    # Run all tests without coverage
+#
+# Coverage results are saved to .coverage and can be viewed with:
+#   coverage combine
+#   coverage report
+#   coverage json
 
 set -e
 
@@ -18,11 +24,43 @@ RED='\033[0;31m'
 YELLOW='\033[1;33m'
 NC='\033[0m' # No Color
 
+# Save root directory first
+ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+
 # Parse arguments
-PLUGIN_FILTER="${1:-}"
+PLUGIN_FILTER=""
+ENABLE_COVERAGE=true
+
+for arg in "$@"; do
+    if [ "$arg" = "--no-coverage" ]; then
+        ENABLE_COVERAGE=false
+    else
+        PLUGIN_FILTER="$arg"
+    fi
+done
+
+# Reset coverage data if collecting coverage
+if [ "$ENABLE_COVERAGE" = true ]; then
+    echo "Resetting coverage data..."
+    cd "$ROOT_DIR" || exit 1
+    coverage erase
+    rm -rf "$ROOT_DIR/coverage/js" 2>/dev/null
+    mkdir -p "$ROOT_DIR/coverage/js"
+
+    # Enable Python subprocess coverage
+    export COVERAGE_PROCESS_START="$ROOT_DIR/pyproject.toml"
+    export PYTHONPATH="$ROOT_DIR:$PYTHONPATH"  # For sitecustomize.py
+
+    # Enable Node.js V8 coverage (built-in, no packages needed)
+    export NODE_V8_COVERAGE="$ROOT_DIR/coverage/js"
+
+    echo "Python coverage: enabled (subprocess support)"
+    echo "JavaScript coverage: enabled (NODE_V8_COVERAGE)"
+    echo ""
+fi
 
 # Change to plugins directory
-cd "$(dirname "$0")/../archivebox/plugins" || exit 1
+cd "$ROOT_DIR/archivebox/plugins" || exit 1
 
 echo "=========================================="
 echo "ArchiveBox Plugin Tests"
@@ -34,6 +72,12 @@ if [ -n "$PLUGIN_FILTER" ]; then
 else
     echo "Running all plugin tests"
 fi
+
+if [ "$ENABLE_COVERAGE" = true ]; then
+    echo "Coverage: enabled"
+else
+    echo "Coverage: disabled"
+fi
 echo ""
 
 # Track results
@@ -67,7 +111,13 @@ for test_dir in $TEST_DIRS; do
 
     echo -e "${YELLOW}[RUNNING]${NC} $plugin_name"
 
-    if python -m pytest "$test_dir" -p no:django -v --tb=short 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then
+    # Build pytest command with optional coverage
+    PYTEST_CMD="python -m pytest $test_dir -p no:django -v --tb=short"
+    if [ "$ENABLE_COVERAGE" = true ]; then
+        PYTEST_CMD="$PYTEST_CMD --cov=$plugin_name --cov-append --cov-branch"
+    fi
+
+    if eval "$PYTEST_CMD" 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then
         echo -e "${GREEN}[PASSED]${NC} $plugin_name"
         PASSED_PLUGINS=$((PASSED_PLUGINS + 1))
     else
@@ -91,6 +141,41 @@ if [ $TOTAL_PLUGINS -eq 0 ]; then
     exit 0
 elif [ $FAILED_PLUGINS -eq 0 ]; then
     echo -e "${GREEN}✓ All plugin tests passed!${NC}"
+
+    # Show coverage summary if enabled
+    if [ "$ENABLE_COVERAGE" = true ]; then
+        echo ""
+        echo "=========================================="
+        echo "Python Coverage Summary"
+        echo "=========================================="
+        # Coverage data is in ROOT_DIR, combine and report from there
+        cd "$ROOT_DIR" || exit 1
+        # Copy coverage data from plugins dir if it exists
+        if [ -f "$ROOT_DIR/archivebox/plugins/.coverage" ]; then
+            cp "$ROOT_DIR/archivebox/plugins/.coverage" "$ROOT_DIR/.coverage"
+        fi
+        coverage combine 2>/dev/null || true
+        coverage report --include="archivebox/plugins/*" --omit="*/tests/*" 2>&1 | head -50
+        echo ""
+
+        echo "=========================================="
+        echo "JavaScript Coverage Summary"
+        echo "=========================================="
+        if [ -d "$ROOT_DIR/coverage/js" ] && [ "$(ls -A "$ROOT_DIR/coverage/js" 2>/dev/null)" ]; then
+            node "$ROOT_DIR/bin/convert_v8_coverage.js" "$ROOT_DIR/coverage/js"
+        else
+            echo "No JavaScript coverage data collected"
+            echo "(JS hooks may not have been executed during tests)"
+        fi
+        echo ""
+
+        echo "For detailed coverage reports (from project root):"
+        echo "  Python:     coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'"
+        echo "  Python:     coverage json   # LLM-friendly format"
+        echo "  Python:     coverage html   # Interactive HTML report"
+        echo "  JavaScript: node bin/convert_v8_coverage.js coverage/js"
+    fi
+
     exit 0
 else
     echo -e "${RED}✗ Some plugin tests failed${NC}"
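
One piece the coverage setup above relies on but does not show: exporting `COVERAGE_PROCESS_START` only measures pytest's own process unless every spawned Python subprocess also calls `coverage.process_startup()`. The `export PYTHONPATH="$ROOT_DIR:$PYTHONPATH"  # For sitecustomize.py` line suggests a `sitecustomize.py` at the repo root for exactly that purpose. A minimal sketch of what such a file typically contains (this is coverage.py's documented subprocess hook, not necessarily the exact file in the repo):

```python
# sitecustomize.py -- imported automatically by Python at interpreter startup
try:
    import coverage
    coverage.process_startup()  # no-op unless COVERAGE_PROCESS_START is set
except ImportError:
    pass  # coverage not installed: hooks still run, just without measurement
```
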