diff --git a/.gitignore b/.gitignore
index 832334e7..f161c55f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -39,11 +39,13 @@ tmp/
 data/
 data*/
 output/
+logs/
 index.sqlite3
 queue.sqlite3
 *.sqlite*
 data.*
 .archivebox_id
+ArchiveBox.conf
 
 # vim
 *.sw?
diff --git a/CLAUDE.md b/CLAUDE.md
index e0446e65..5adf1178 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -158,6 +158,63 @@ env['SAVE_FAVICON'] = 'False'
 #### Timeout Settings
 Use appropriate timeouts for migration tests (45s for init, 60s default).
 
+### Plugin Testing & Code Coverage
+
+**Target: 80-90% coverage** for critical plugins (screenshot, chrome, singlefile, dom)
+
+```bash
+# Run plugin tests with coverage (both Python + JavaScript)
+bash bin/test_plugins.sh screenshot
+
+# View coverage reports
+bash bin/test_plugins.sh --coverage-report
+# Or individual reports:
+coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'
+```
+
+#### Plugin Test Structure
+
+Tests are **completely isolated** from ArchiveBox - they replicate the production directory structure in temp dirs:
+
+```python
+# Correct production paths:
+# Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/
+# Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/
+
+with tempfile.TemporaryDirectory() as tmpdir:
+    data_dir = Path(tmpdir)
+
+    # Crawl-level plugin (e.g., chrome launcher)
+    crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / 'crawl-123'
+    chrome_dir = crawl_dir / 'chrome'
+    chrome_dir.mkdir(parents=True)
+
+    # Snapshot-level plugin (e.g., screenshot)
+    snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-456'
+    screenshot_dir = snapshot_dir / 'screenshot'
+    screenshot_dir.mkdir(parents=True)
+
+    # Run hook in its output directory
+    result = subprocess.run(
+        ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=snap-456'],
+        cwd=str(screenshot_dir),
+        env=get_test_env(),
+        capture_output=True,
+        timeout=120
+    )
+```
+
+#### Coverage Improvement Loop
+
+To improve from ~20% to 80%+:
+
+1. **Run tests**: `bash bin/test_plugins.sh screenshot` → Shows: `19.1% (13/68 ranges)`
+2. **Identify gaps**: Check the hook file for untested paths (session connection vs. fallback, config branches, error cases)
+3. **Add tests**: Test both execution paths (connect to session + launch own browser), skip conditions, error cases, config variations
+4. **Verify**: Re-run tests → Should show: `85%+ (58+/68 ranges)`
+
+**Critical**: JavaScript hooks have TWO execution paths that must both be tested (connect to session ~50% + launch own browser ~30% + shared code ~20%). Testing only one path caps coverage at roughly 50%!
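To make step 3 of the loop concrete, here is a minimal sketch of a parametrized test that exercises both execution paths of a JavaScript hook. It reuses the `SCREENSHOT_HOOK` and `get_test_env()` helpers from the structure example above; the `--cdp-url` flag, the helper import path, and the `screenshot.png` output name are illustrative assumptions, not confirmed interfaces:

```python
import subprocess
import tempfile
from pathlib import Path

import pytest

# Assumed test helpers (names taken from the example above; import path is hypothetical)
from tests.helpers import SCREENSHOT_HOOK, get_test_env


@pytest.mark.parametrize('use_session', [True, False])
def test_screenshot_hook_covers_both_paths(use_session):
    """Exercise both the connect-to-session path and the launch-own-browser path."""
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-456'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        cmd = ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=snap-456']
        if use_session:
            # Connect path: point the hook at an already-running browser
            # (the flag name is an assumption for illustration only)
            cmd.append('--cdp-url=http://127.0.0.1:9222')

        result = subprocess.run(
            cmd,
            cwd=str(screenshot_dir),
            env=get_test_env(),
            capture_output=True,
            timeout=120,
        )
        assert result.returncode == 0
        # Output filename is an assumption; adjust to the hook's real output
        assert (screenshot_dir / 'screenshot.png').exists()
```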
+ ## Database Migrations ### Generate and Apply Migrations diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py index 743f1626..4c720282 100644 --- a/archivebox/cli/__init__.py +++ b/archivebox/cli/__init__.py @@ -41,9 +41,11 @@ class ArchiveBoxGroup(click.Group): archive_commands = { # High-level commands 'add': 'archivebox.cli.archivebox_add.main', + 'remove': 'archivebox.cli.archivebox_remove.main', 'run': 'archivebox.cli.archivebox_run.main', 'update': 'archivebox.cli.archivebox_update.main', 'status': 'archivebox.cli.archivebox_status.main', + 'search': 'archivebox.cli.archivebox_search.main', 'config': 'archivebox.cli.archivebox_config.main', 'schedule': 'archivebox.cli.archivebox_schedule.main', 'server': 'archivebox.cli.archivebox_server.main', diff --git a/archivebox/cli/archivebox_install.py b/archivebox/cli/archivebox_install.py index 2e86dc69..3c8a4e35 100755 --- a/archivebox/cli/archivebox_install.py +++ b/archivebox/cli/archivebox_install.py @@ -13,8 +13,15 @@ from archivebox.misc.util import docstring, enforce_types @enforce_types -def install(dry_run: bool=False) -> None: - """Detect and install ArchiveBox dependencies by running a dependency-check crawl""" +def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bool = False) -> None: + """Detect and install ArchiveBox dependencies by running a dependency-check crawl + + Examples: + archivebox install # Install all dependencies + archivebox install wget curl # Install only wget and curl + archivebox install --binproviders=pip yt-dlp # Install yt-dlp using only pip + archivebox install --binproviders=brew,apt # Install all deps using only brew or apt + """ from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP from archivebox.config.paths import ARCHIVE_DIR @@ -24,7 +31,14 @@ def install(dry_run: bool=False) -> None: if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()): init() # must init full index because we need a db to store Binary entries in - print('\n[green][+] Detecting ArchiveBox dependencies...[/green]') + # Show what we're installing + if binaries: + print(f'\n[green][+] Installing specific binaries: {", ".join(binaries)}[/green]') + else: + print('\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]') + + if binproviders != '*': + print(f'[green][+] Using providers: {binproviders}[/green]') if IS_ROOT: EUID = os.geteuid() @@ -49,6 +63,19 @@ def install(dry_run: bool=False) -> None: # Using a minimal crawl that will trigger on_Crawl hooks created_by_id = get_or_create_system_user_pk() + # Build config for this crawl using existing PLUGINS filter + crawl_config = {} + + # Combine binary names and provider names into PLUGINS list + plugins = [] + if binaries: + plugins.extend(binaries) + if binproviders != '*': + plugins.extend(binproviders.split(',')) + + if plugins: + crawl_config['PLUGINS'] = ','.join(plugins) + crawl, created = Crawl.objects.get_or_create( urls='archivebox://install', defaults={ @@ -56,6 +83,7 @@ def install(dry_run: bool=False) -> None: 'created_by_id': created_by_id, 'max_depth': 0, 'status': 'queued', + 'config': crawl_config, } ) @@ -63,9 +91,12 @@ def install(dry_run: bool=False) -> None: if not created: crawl.status = 'queued' crawl.retry_at = timezone.now() + crawl.config = crawl_config # Update config crawl.save() print(f'[+] Created dependency detection crawl: {crawl.id}') + if crawl_config: + print(f'[+] Crawl config: {crawl_config}') print(f'[+] Crawl status: {crawl.status}, 
retry_at: {crawl.retry_at}') # Verify the crawl is in the queue @@ -100,15 +131,15 @@ def install(dry_run: bool=False) -> None: print() - # Run version to show full status - archivebox_path = shutil.which('archivebox') or sys.executable - if 'python' in archivebox_path: - os.system(f'{sys.executable} -m archivebox version') - else: - os.system(f'{archivebox_path} version') + # Show version to display full status including installed binaries + # Django is already loaded, so just import and call the function directly + from archivebox.cli.archivebox_version import version as show_version + show_version(quiet=False) @click.command() +@click.argument('binaries', nargs=-1, type=str, required=False) +@click.option('--binproviders', '-p', default='*', help='Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all', show_default=True) @click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False) @docstring(install.__doc__) def main(**kwargs) -> None: diff --git a/archivebox/cli/archivebox_search.py b/archivebox/cli/archivebox_search.py index 055e952d..b066b474 100644 --- a/archivebox/cli/archivebox_search.py +++ b/archivebox/cli/archivebox_search.py @@ -50,6 +50,9 @@ def get_snapshots(snapshots: Optional[QuerySet]=None, if filter_patterns: result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type) + # Prefetch crawl relationship to avoid N+1 queries when accessing output_dir + result = result.select_related('crawl', 'crawl__created_by') + if not result: stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow') diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index 996f1820..2fbd05c0 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -145,16 +145,29 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di # Check if needs migration (0.8.x → 0.9.x) if snapshot.fs_migration_needed: try: - snapshot.save() # Triggers migration + creates symlink + # Manually trigger filesystem migration without full save() + # This avoids UNIQUE constraint issues while still migrating files + cleanup_info = None + if hasattr(snapshot, '_fs_migrate_from_0_8_0_to_0_9_0'): + cleanup_info = snapshot._fs_migrate_from_0_8_0_to_0_9_0() + + # Update only fs_version field using queryset update (bypasses validation) + from archivebox.core.models import Snapshot as SnapshotModel + SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0') + + # Commit the transaction + transaction.commit() + + # Manually call cleanup since we bypassed normal save() flow + if cleanup_info: + old_dir, new_dir = cleanup_info + snapshot._cleanup_old_migration_dir(old_dir, new_dir) + stats['migrated'] += 1 print(f" [{stats['processed']}] Migrated: {entry_path.name}") except Exception as e: - # Snapshot already exists in DB with different crawl - skip it - if 'UNIQUE constraint failed' in str(e): - stats['skipped'] += 1 - print(f" [{stats['processed']}] Skipped (already in DB): {entry_path.name}") - else: - raise + stats['skipped'] += 1 + print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}") else: stats['skipped'] += 1 diff --git a/archivebox/cli/archivebox_version.py b/archivebox/cli/archivebox_version.py index 76cbcd19..4f80bfe2 100755 --- a/archivebox/cli/archivebox_version.py +++ b/archivebox/cli/archivebox_version.py @@ -104,40 +104,47 @@ def version(quiet: 
bool=False, failures = [] # Setup Django before importing models - from archivebox.config.django import setup_django - setup_django() + try: + from archivebox.config.django import setup_django + setup_django() - from archivebox.machine.models import Machine, Binary + from archivebox.machine.models import Machine, Binary - machine = Machine.current() + machine = Machine.current() - # Get all binaries from the database - all_installed = Binary.objects.filter( - machine=machine - ).exclude(abspath='').exclude(abspath__isnull=True).order_by('name') + # Get all binaries from the database with timeout protection + all_installed = Binary.objects.filter( + machine=machine + ).exclude(abspath='').exclude(abspath__isnull=True).order_by('name') - if not all_installed.exists(): - prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]') - else: - for installed in all_installed: - # Skip if user specified specific binaries and this isn't one - if binaries and installed.name not in binaries: - continue + if not all_installed.exists(): + prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]') + else: + for installed in all_installed: + # Skip if user specified specific binaries and this isn't one + if binaries and installed.name not in binaries: + continue - if installed.is_valid: - display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~') - version_str = (installed.version or 'unknown')[:15] - provider = (installed.binprovider or 'env')[:8] - prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False) - else: - prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False) - failures.append(installed.name) + if installed.is_valid: + display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~') + version_str = (installed.version or 'unknown')[:15] + provider = (installed.binprovider or 'env')[:8] + prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False) + else: + prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False) + failures.append(installed.name) - # Show hint if no binaries are installed yet - has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists() - if not has_any_installed: + # Show hint if no binaries are installed yet + has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists() + if not has_any_installed: + prnt() + prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]') + + except Exception as e: + # Handle database errors gracefully (locked, missing, etc.) 
prnt() - prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]') + prnt('', f'[yellow]Warning: Could not query binaries from database: {e}[/yellow]') + prnt('', '[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]') if not binaries: # Show code and data locations diff --git a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py index 8ad24966..2133309c 100644 --- a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py +++ b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py @@ -116,7 +116,7 @@ def upgrade_core_tables(apps, schema_editor): retry_at DATETIME, depth INTEGER NOT NULL DEFAULT 0, - fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0', + fs_version VARCHAR(10) NOT NULL DEFAULT '0.8.0', config TEXT NOT NULL DEFAULT '{}', notes TEXT NOT NULL DEFAULT '', num_uses_succeeded INTEGER NOT NULL DEFAULT 0, @@ -326,6 +326,16 @@ class Migration(migrations.Migration): name='modified_at', field=models.DateTimeField(auto_now=True), ), + # Declare fs_version (already created in database with DEFAULT '0.8.0') + migrations.AddField( + model_name='snapshot', + name='fs_version', + field=models.CharField( + max_length=10, + default='0.8.0', + help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().' + ), + ), # SnapshotTag table already exists from v0.7.2, just declare it in state migrations.CreateModel( diff --git a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py index ddcdcd28..600b9f4e 100644 --- a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py +++ b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py @@ -150,11 +150,7 @@ class Migration(migrations.Migration): name='downloaded_at', field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True), ), - migrations.AddField( - model_name='snapshot', - name='fs_version', - field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). 
Used to trigger lazy migration on save().', max_length=10), - ), + # NOTE: fs_version already added by migration 0023 with default='0.8.0' # NOTE: modified_at already added by migration 0023 migrations.AddField( model_name='snapshot', diff --git a/archivebox/core/migrations/0026_add_process_to_archiveresult.py b/archivebox/core/migrations/0026_add_process_to_archiveresult.py index eef7b265..e76b8597 100644 --- a/archivebox/core/migrations/0026_add_process_to_archiveresult.py +++ b/archivebox/core/migrations/0026_add_process_to_archiveresult.py @@ -8,7 +8,7 @@ class Migration(migrations.Migration): dependencies = [ ('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'), - ('machine', '0003_add_process_type_and_parent'), + ('machine', '0007_add_process_type_and_parent'), ] operations = [ diff --git a/archivebox/core/migrations/0027_copy_archiveresult_to_process.py b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py new file mode 100644 index 00000000..5b0666c5 --- /dev/null +++ b/archivebox/core/migrations/0027_copy_archiveresult_to_process.py @@ -0,0 +1,388 @@ +# Generated by hand on 2026-01-01 +# Copies ArchiveResult cmd/pwd/cmd_version data to Process records before removing old fields + +from django.db import migrations, connection +import json +from pathlib import Path + + +def parse_cmd_field(cmd_raw): + """ + Parse cmd field which could be: + 1. JSON array string: '["wget", "-p", "url"]' + 2. Space-separated string: 'wget -p url' + 3. NULL/empty + + Returns list of strings. + """ + if not cmd_raw: + return [] + + cmd_raw = cmd_raw.strip() + + if not cmd_raw: + return [] + + # Try to parse as JSON first + if cmd_raw.startswith('['): + try: + parsed = json.loads(cmd_raw) + if isinstance(parsed, list): + return [str(x) for x in parsed] + except json.JSONDecodeError: + pass + + # Fallback: split by spaces (simple approach, doesn't handle quoted strings) + # This is acceptable since old cmd fields were mostly simple commands + return cmd_raw.split() + + +def get_or_create_current_machine(cursor): + """Get or create Machine.current() using raw SQL.""" + import uuid + import socket + from datetime import datetime + + # Simple machine detection - get hostname as guid + hostname = socket.gethostname() + guid = f'host_{hostname}' # Simple but stable identifier + + # Check if machine exists + cursor.execute("SELECT id FROM machine_machine WHERE guid = ?", [guid]) + row = cursor.fetchone() + + if row: + return row[0] + + # Create new machine + machine_id = str(uuid.uuid4()) + now = datetime.now().isoformat() + + # Check which columns exist (schema differs between 0.8.x and 0.9.x) + cursor.execute("PRAGMA table_info(machine_machine)") + machine_cols = {row[1] for row in cursor.fetchall()} + + # Build INSERT statement based on available columns + if 'config' in machine_cols: + # 0.9.x schema with config column + cursor.execute(""" + INSERT INTO machine_machine ( + id, created_at, modified_at, guid, hostname, + hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid, + os_arch, os_family, os_platform, os_release, os_kernel, + stats, config, num_uses_failed, num_uses_succeeded + ) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '', + '', '', '', '', '', '{}', '{}', 0, 0) + """, [machine_id, now, now, guid, hostname]) + else: + # 0.8.x schema without config column + cursor.execute(""" + INSERT INTO machine_machine ( + id, created_at, modified_at, guid, hostname, + hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid, + os_arch, os_family, os_platform, os_release, 
os_kernel, + stats, num_uses_failed, num_uses_succeeded + ) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '', + '', '', '', '', '', '{}', 0, 0) + """, [machine_id, now, now, guid, hostname]) + + return machine_id + + +def get_or_create_binary(cursor, machine_id, name, abspath, version): + """ + Get or create Binary record. + + Args: + cursor: DB cursor + machine_id: Machine FK + name: Binary name (basename of command) + abspath: Absolute path to binary (or just name if path unknown) + version: Version string + + Returns: + binary_id (str) + """ + import uuid + from datetime import datetime + + # If abspath is just a name without slashes, it's not a full path + # Store it in both fields for simplicity + if '/' not in abspath: + # Not a full path - store as-is + pass + + # Check if binary exists with same machine, name, abspath, version + cursor.execute(""" + SELECT id FROM machine_binary + WHERE machine_id = ? AND name = ? AND abspath = ? AND version = ? + """, [machine_id, name, abspath, version]) + + row = cursor.fetchone() + if row: + return row[0] + + # Create new binary + binary_id = str(uuid.uuid4()) + now = datetime.now().isoformat() + + # Check which columns exist (schema differs between 0.8.x and 0.9.x) + cursor.execute("PRAGMA table_info(machine_binary)") + binary_cols = {row[1] for row in cursor.fetchall()} + + # Use only columns that exist in current schema + # 0.8.x schema: id, created_at, modified_at, machine_id, name, binprovider, abspath, version, sha256, num_uses_failed, num_uses_succeeded + # 0.9.x schema adds: binproviders, overrides, status, retry_at, output_dir + if 'binproviders' in binary_cols: + # 0.9.x schema + cursor.execute(""" + INSERT INTO machine_binary ( + id, created_at, modified_at, machine_id, + name, binproviders, overrides, binprovider, abspath, version, sha256, + status, retry_at, output_dir, + num_uses_failed, num_uses_succeeded + ) VALUES (?, ?, ?, ?, ?, 'env', '{}', 'env', ?, ?, '', + 'succeeded', NULL, '', 0, 0) + """, [binary_id, now, now, machine_id, name, abspath, version]) + else: + # 0.8.x schema (simpler) + cursor.execute(""" + INSERT INTO machine_binary ( + id, created_at, modified_at, machine_id, + name, binprovider, abspath, version, sha256, + num_uses_failed, num_uses_succeeded + ) VALUES (?, ?, ?, ?, ?, 'env', ?, ?, '', 0, 0) + """, [binary_id, now, now, machine_id, name, abspath, version]) + + return binary_id + + +def map_status(old_status): + """ + Map old ArchiveResult status to Process status and exit_code. + + Args: + old_status: One of: queued, started, backoff, succeeded, failed, skipped + + Returns: + (process_status, exit_code) tuple + """ + status_map = { + 'queued': ('queued', None), + 'started': ('running', None), + 'backoff': ('queued', None), + 'succeeded': ('exited', 0), + 'failed': ('exited', 1), + 'skipped': ('exited', None), # Skipped = exited without error + } + + return status_map.get(old_status, ('queued', None)) + + +def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at, ended_at, binary_id): + """ + Create a Process record. 
+ + Returns: + process_id (str) + """ + import uuid + from datetime import datetime + + process_id = str(uuid.uuid4()) + now = datetime.now().isoformat() + + # Convert cmd array to JSON + cmd_json = json.dumps(cmd) + + # Set retry_at to now for queued processes, NULL otherwise + retry_at = now if status == 'queued' else None + + cursor.execute(""" + INSERT INTO machine_process ( + id, created_at, modified_at, machine_id, parent_id, process_type, + pwd, cmd, env, timeout, + pid, exit_code, stdout, stderr, + started_at, ended_at, + binary_id, iface_id, url, + status, retry_at + ) VALUES (?, ?, ?, ?, NULL, 'cli', + ?, ?, '{}', 120, + NULL, ?, '', '', + ?, ?, + ?, NULL, NULL, + ?, ?) + """, [ + process_id, now, now, machine_id, + pwd, cmd_json, + exit_code, + started_at, ended_at, + binary_id, + status, retry_at + ]) + + return process_id + + +def copy_archiveresult_data_to_process(apps, schema_editor): + """ + Copy old ArchiveResult execution data (cmd, pwd, cmd_version) to Process records. + + For each ArchiveResult without a process_id: + 1. Parse cmd field (handle both JSON array and space-separated string) + 2. Extract binary name/path from cmd[0] + 3. Get or create Binary record with machine, name, abspath, version + 4. Create Process record with mapped fields + 5. Link ArchiveResult.process_id to new Process + + Status mapping: + - queued → queued (exit_code=None) + - started → running (exit_code=None) + - backoff → queued (exit_code=None) + - succeeded → exited (exit_code=0) + - failed → exited (exit_code=1) + - skipped → exited (exit_code=None) + """ + cursor = connection.cursor() + + # Check if old fields still exist (skip if fresh install or already migrated) + cursor.execute("PRAGMA table_info(core_archiveresult)") + cols = {row[1] for row in cursor.fetchall()} + + print(f'DEBUG 0027: Columns found: {sorted(cols)}') + print(f'DEBUG 0027: Has cmd={("cmd" in cols)}, pwd={("pwd" in cols)}, cmd_version={("cmd_version" in cols)}, process_id={("process_id" in cols)}') + + if 'cmd' not in cols or 'pwd' not in cols or 'cmd_version' not in cols: + print('✓ Fresh install or fields already removed - skipping data copy') + return + + # Check if process_id field exists (should exist from 0026) + if 'process_id' not in cols: + print('✗ ERROR: process_id field not found. 
Migration 0026 must run first.') + return + + # Get or create Machine.current() + machine_id = get_or_create_current_machine(cursor) + + # Get ArchiveResults without process_id that have cmd data + # Use plugin (extractor was renamed to plugin in migration 0025) + cursor.execute(""" + SELECT id, snapshot_id, plugin, cmd, pwd, cmd_version, + status, start_ts, end_ts, created_at + FROM core_archiveresult + WHERE process_id IS NULL + AND (cmd IS NOT NULL OR pwd IS NOT NULL) + """) + + results = cursor.fetchall() + + if not results: + print('✓ No ArchiveResults need Process migration') + return + + print(f'Migrating {len(results)} ArchiveResults to Process records...') + + migrated_count = 0 + skipped_count = 0 + error_count = 0 + + for i, row in enumerate(results): + ar_id, snapshot_id, plugin, cmd_raw, pwd, cmd_version, status, start_ts, end_ts, created_at = row + + if i == 0: + print(f'DEBUG 0027: First row: ar_id={ar_id}, plugin={plugin}, cmd={cmd_raw[:50] if cmd_raw else None}, status={status}') + + try: + # Parse cmd field + cmd_array = parse_cmd_field(cmd_raw) + + if i == 0: + print(f'DEBUG 0027: Parsed cmd: {cmd_array}') + + # Extract binary info from cmd[0] if available + binary_id = None + if cmd_array and cmd_array[0]: + binary_name = Path(cmd_array[0]).name or plugin # Fallback to plugin name + binary_abspath = cmd_array[0] + binary_version = cmd_version or '' + + # Get or create Binary record + binary_id = get_or_create_binary( + cursor, machine_id, binary_name, binary_abspath, binary_version + ) + + if i == 0: + print(f'DEBUG 0027: Created Binary: id={binary_id}, name={binary_name}') + + # Map status + process_status, exit_code = map_status(status) + + # Set timestamps + started_at = start_ts or created_at + ended_at = end_ts if process_status == 'exited' else None + + # Create Process record + process_id = create_process( + cursor=cursor, + machine_id=machine_id, + pwd=pwd or '', + cmd=cmd_array, + status=process_status, + exit_code=exit_code, + started_at=started_at, + ended_at=ended_at, + binary_id=binary_id, + ) + + if i == 0: + print(f'DEBUG 0027: Created Process: id={process_id}') + + # Link ArchiveResult to Process + cursor.execute( + "UPDATE core_archiveresult SET process_id = ? 
WHERE id = ?", + [process_id, ar_id] + ) + + migrated_count += 1 + + if i == 0: + print(f'DEBUG 0027: Linked ArchiveResult to Process') + + except Exception as e: + print(f'✗ Error migrating ArchiveResult {ar_id}: {e}') + import traceback + traceback.print_exc() + error_count += 1 + continue + + print(f'✓ Migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors') + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0026_add_process_to_archiveresult'), + ('machine', '0007_add_process_type_and_parent'), + ] + + operations = [ + # First, copy data from old fields to Process + migrations.RunPython( + copy_archiveresult_data_to_process, + reverse_code=migrations.RunPython.noop, + ), + + # Now safe to remove old fields (moved from 0025) + migrations.RemoveField( + model_name='archiveresult', + name='cmd', + ), + migrations.RemoveField( + model_name='archiveresult', + name='cmd_version', + ), + migrations.RemoveField( + model_name='archiveresult', + name='pwd', + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 403c441e..471a410d 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -362,24 +362,22 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Migrate filesystem if needed (happens automatically on save) if self.pk and self.fs_migration_needed: - from django.db import transaction - with transaction.atomic(): - # Walk through migration chain automatically - current = self.fs_version - target = self._fs_current_version() + # Walk through migration chain automatically + current = self.fs_version + target = self._fs_current_version() - while current != target: - next_ver = self._fs_next_version(current) - method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}' + while current != target: + next_ver = self._fs_next_version(current) + method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}' - # Only run if method exists (most are no-ops) - if hasattr(self, method): - getattr(self, method)() + # Only run if method exists (most are no-ops) + if hasattr(self, method): + getattr(self, method)() - current = next_ver + current = next_ver - # Update version (still in transaction) - self.fs_version = target + # Update version + self.fs_version = target super().save(*args, **kwargs) if self.url not in self.crawl.urls: @@ -486,33 +484,58 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea # Convert index.json to index.jsonl in the new directory self.convert_index_json_to_jsonl() - # Create backwards-compat symlink (INSIDE transaction) - symlink_path = CONSTANTS.ARCHIVE_DIR / self.timestamp - if symlink_path.is_symlink(): - symlink_path.unlink() + # Schedule cleanup AFTER transaction commits successfully + # This ensures DB changes are committed before we delete old files + from django.db import transaction + transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, new_dir)) - if not symlink_path.exists() or symlink_path == old_dir: - symlink_path.symlink_to(new_dir, target_is_directory=True) + # Return cleanup info for manual cleanup if needed (when called directly) + return (old_dir, new_dir) - # Schedule old directory deletion AFTER transaction commits - transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir)) - - def _cleanup_old_migration_dir(self, old_dir: Path): + def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path): """ - 
Delete old directory after successful migration. + Delete old directory and create symlink after successful migration. Called via transaction.on_commit() after DB commit succeeds. """ import shutil import logging + print(f"[DEBUG] _cleanup_old_migration_dir called: old_dir={old_dir}, new_dir={new_dir}") + + # Delete old directory if old_dir.exists() and not old_dir.is_symlink(): + print(f"[DEBUG] Attempting to delete old directory: {old_dir}") try: shutil.rmtree(old_dir) + print(f"[DEBUG] Successfully deleted old directory: {old_dir}") except Exception as e: # Log but don't raise - migration succeeded, this is just cleanup + print(f"[DEBUG] Failed to delete old directory {old_dir}: {e}") logging.getLogger('archivebox.migration').warning( f"Could not remove old migration directory {old_dir}: {e}" ) + return # Don't create symlink if cleanup failed + else: + print(f"[DEBUG] Old directory doesn't exist or is already a symlink: {old_dir}") + + # Create backwards-compat symlink (after old dir is deleted) + symlink_path = old_dir # Same path as old_dir + if symlink_path.is_symlink(): + print(f"[DEBUG] Unlinking existing symlink: {symlink_path}") + symlink_path.unlink() + + if not symlink_path.exists(): + print(f"[DEBUG] Creating symlink: {symlink_path} -> {new_dir}") + try: + symlink_path.symlink_to(new_dir, target_is_directory=True) + print(f"[DEBUG] Successfully created symlink") + except Exception as e: + print(f"[DEBUG] Failed to create symlink: {e}") + logging.getLogger('archivebox.migration').warning( + f"Could not create symlink from {symlink_path} to {new_dir}: {e}" + ) + else: + print(f"[DEBUG] Symlink path already exists: {symlink_path}") # ========================================================================= # Path Calculation and Migration Helpers @@ -1616,8 +1639,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea This enables step-based execution where all hooks in a step can run in parallel. 
""" from archivebox.hooks import discover_hooks + from archivebox.config.configset import get_config - hooks = discover_hooks('Snapshot') + # Get merged config with crawl-specific PLUGINS filter + config = get_config(crawl=self.crawl, snapshot=self) + hooks = discover_hooks('Snapshot', config=config) archiveresults = [] for hook_path in hooks: @@ -2212,22 +2238,19 @@ class SnapshotMachine(BaseStateMachine, strict_states=True): started = State(value=Snapshot.StatusChoices.STARTED) sealed = State(value=Snapshot.StatusChoices.SEALED, final=True) - # Tick Event + # Tick Event (polled by workers) tick = ( queued.to.itself(unless='can_start') | - queued.to(started, cond='can_start') | - started.to.itself(unless='is_finished', on='on_started_to_started') | - started.to(sealed, cond='is_finished') + queued.to(started, cond='can_start') ) + # Manual event (triggered by last ArchiveResult finishing) + seal = started.to(sealed) + def can_start(self) -> bool: can_start = bool(self.snapshot.url) return can_start - def is_finished(self) -> bool: - """Check if snapshot processing is complete - delegates to model method.""" - return self.snapshot.is_finished_processing() - @queued.enter def enter_queued(self): self.snapshot.update_and_requeue( @@ -2237,29 +2260,34 @@ class SnapshotMachine(BaseStateMachine, strict_states=True): @started.enter def enter_started(self): - # lock the snapshot while we create the pending archiveresults - self.snapshot.update_and_requeue( - retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying - ) + import sys + + print(f'[cyan] 🔄 SnapshotMachine.enter_started() - creating archiveresults for {self.snapshot.url}[/cyan]', file=sys.stderr) # Run the snapshot - creates pending archiveresults for all enabled plugins self.snapshot.run() - # unlock the snapshot after we're done + set status = started - self.snapshot.update_and_requeue( - retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s - status=Snapshot.StatusChoices.STARTED, - ) + # Check if any archiveresults were created + ar_count = self.snapshot.archiveresult_set.count() + print(f'[cyan] 🔄 ArchiveResult count: {ar_count}[/cyan]', file=sys.stderr) - def on_started_to_started(self): - """Called when Snapshot stays in started state (archiveresults not finished yet).""" - # Bump retry_at so we check again in a few seconds - self.snapshot.update_and_requeue( - retry_at=timezone.now() + timedelta(seconds=5), - ) + if ar_count == 0: + # No archiveresults created, seal immediately + print(f'[cyan] 🔄 No archiveresults created, sealing snapshot immediately[/cyan]', file=sys.stderr) + self.seal() + else: + # Set status = started with retry_at far future (so workers don't claim us - we're waiting for ARs) + # Last AR will manually call self.seal() when done + self.snapshot.update_and_requeue( + retry_at=timezone.now() + timedelta(days=365), + status=Snapshot.StatusChoices.STARTED, + ) + print(f'[cyan] 🔄 {ar_count} archiveresults created, waiting for them to finish[/cyan]', file=sys.stderr) @sealed.enter def enter_sealed(self): + import sys + # Clean up background hooks self.snapshot.cleanup() @@ -2268,6 +2296,21 @@ class SnapshotMachine(BaseStateMachine, strict_states=True): status=Snapshot.StatusChoices.SEALED, ) + print(f'[cyan] ✅ SnapshotMachine.enter_sealed() - sealed {self.snapshot.url}[/cyan]', file=sys.stderr) + + # Check if this is the last snapshot for the parent crawl - if so, seal the crawl + if self.snapshot.crawl: + crawl = self.snapshot.crawl + remaining_active = 
Snapshot.objects.filter( + crawl=crawl, + status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED] + ).count() + + if remaining_active == 0: + print(f'[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]', file=sys.stderr) + # Seal the parent crawl + crawl.sm.seal() + class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine): class StatusChoices(models.TextChoices): @@ -3102,8 +3145,30 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True): end_ts=None, ) + def _check_and_seal_parent_snapshot(self): + """Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.""" + import sys + + snapshot = self.archiveresult.snapshot + + # Check if all archiveresults are finished (in final states) + remaining_active = snapshot.archiveresult_set.exclude( + status__in=[ + ArchiveResult.StatusChoices.SUCCEEDED, + ArchiveResult.StatusChoices.FAILED, + ArchiveResult.StatusChoices.SKIPPED, + ] + ).count() + + if remaining_active == 0: + print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr) + # Seal the parent snapshot + snapshot.sm.seal() + @succeeded.enter def enter_succeeded(self): + import sys + self.archiveresult.update_and_requeue( retry_at=None, status=ArchiveResult.StatusChoices.SUCCEEDED, @@ -3113,8 +3178,15 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True): # Update health stats for ArchiveResult, Snapshot, and Crawl cascade self.archiveresult.cascade_health_update(success=True) + print(f'[cyan] ✅ ArchiveResult succeeded: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/cyan]', file=sys.stderr) + + # Check if this is the last AR to finish - seal parent snapshot if so + self._check_and_seal_parent_snapshot() + @failed.enter def enter_failed(self): + import sys + self.archiveresult.update_and_requeue( retry_at=None, status=ArchiveResult.StatusChoices.FAILED, @@ -3124,16 +3196,25 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True): # Update health stats for ArchiveResult, Snapshot, and Crawl cascade self.archiveresult.cascade_health_update(success=False) + print(f'[red] ❌ ArchiveResult failed: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/red]', file=sys.stderr) + + # Check if this is the last AR to finish - seal parent snapshot if so + self._check_and_seal_parent_snapshot() + @skipped.enter def enter_skipped(self): + import sys + self.archiveresult.update_and_requeue( retry_at=None, status=ArchiveResult.StatusChoices.SKIPPED, end_ts=timezone.now(), ) - def after_transition(self, event: str, source: State, target: State): - self.archiveresult.snapshot.update_and_requeue() # bump snapshot retry time so it picks up all the new changes + print(f'[dim] ⏭️ ArchiveResult skipped: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/dim]', file=sys.stderr) + + # Check if this is the last AR to finish - seal parent snapshot if so + self._check_and_seal_parent_snapshot() # ============================================================================= diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index e8415918..40bbb6c2 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -240,19 +240,26 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith if not first_url: raise ValueError(f'Crawl {self.id} has no URLs to create root snapshot from') + # Try to get existing snapshot try: - 
return Snapshot.objects.get(crawl=self, url=first_url) + snapshot = Snapshot.objects.get(crawl=self, url=first_url) + # If exists and already queued/started, return it as-is + if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]: + # Update retry_at to now so it can be picked up immediately + snapshot.retry_at = timezone.now() + snapshot.save(update_fields=['retry_at']) + return snapshot except Snapshot.DoesNotExist: pass - root_snapshot, _ = Snapshot.objects.update_or_create( - crawl=self, url=first_url, - defaults={ - 'status': Snapshot.INITIAL_STATE, - 'retry_at': timezone.now(), - 'timestamp': str(timezone.now().timestamp()), - 'depth': 0, - }, + # Create new snapshot + root_snapshot = Snapshot.objects.create( + crawl=self, + url=first_url, + status=Snapshot.INITIAL_STATE, + retry_at=timezone.now(), + timestamp=str(timezone.now().timestamp()), + depth=0, ) return root_snapshot @@ -362,14 +369,14 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith return created_snapshots - def run(self) -> 'Snapshot': + def run(self) -> 'Snapshot | None': """ Execute this Crawl: run hooks, process JSONL, create snapshots. Called by the state machine when entering the 'started' state. Returns: - The root Snapshot for this crawl + The root Snapshot for this crawl, or None for system crawls that don't create snapshots """ import time from pathlib import Path @@ -407,8 +414,18 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith # Foreground hook - process JSONL records records = result.get('records', []) + if records: + print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]') + for record in records[:3]: # Show first 3 + print(f' Record: type={record.get("type")}, keys={list(record.keys())[:5]}') overrides = {'crawl': self} - process_hook_records(records, overrides=overrides) + stats = process_hook_records(records, overrides=overrides) + if stats: + print(f'[green]✓ Created: {stats}[/green]') + + # System crawls (archivebox://*) don't create snapshots - they just run hooks + if first_url.startswith('archivebox://'): + return None # Create snapshots from URLs root_snapshot = self.create_root_snapshot() @@ -498,14 +515,15 @@ class CrawlMachine(BaseStateMachine, strict_states=True): started = State(value=Crawl.StatusChoices.STARTED) sealed = State(value=Crawl.StatusChoices.SEALED, final=True) - # Tick Event + # Tick Event (polled by workers) tick = ( queued.to.itself(unless='can_start') | - queued.to(started, cond='can_start') | - started.to.itself(unless='is_finished', on='on_started_to_started') | - started.to(sealed, cond='is_finished') + queued.to(started, cond='can_start') ) + # Manual event (triggered by last Snapshot sealing) + seal = started.to(sealed) + def can_start(self) -> bool: if not self.crawl.urls: print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]') @@ -516,55 +534,38 @@ class CrawlMachine(BaseStateMachine, strict_states=True): return False return True - def is_finished(self) -> bool: - from archivebox.core.models import Snapshot - - # Check if any snapshots exist for this crawl - snapshots = Snapshot.objects.filter(crawl=self.crawl) - - # If no snapshots exist, allow finishing (e.g., archivebox://install crawls that only run hooks) - if not snapshots.exists(): - return True - - # If snapshots exist, check if all are sealed - # Snapshots handle their own background hooks via the step system, - # so we just need to wait for all snapshots to reach sealed state - if 
snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists(): - return False - - return True - @started.enter def enter_started(self): - # Lock the crawl by bumping retry_at so other workers don't pick it up while we create snapshots - self.crawl.update_and_requeue( - retry_at=timezone.now() + timedelta(seconds=30), # Lock for 30 seconds - ) + import sys + from archivebox.core.models import Snapshot + + print(f'[cyan]🔄 CrawlMachine.enter_started() - creating snapshots for {self.crawl.id}[/cyan]', file=sys.stderr) try: # Run the crawl - runs hooks, processes JSONL, creates snapshots - self.crawl.run() + root_snapshot = self.crawl.run() + + if root_snapshot: + print(f'[cyan]🔄 Created root snapshot: {root_snapshot.url}[/cyan]', file=sys.stderr) + # Update status to STARTED + # Set retry_at to far future so workers don't claim us (we're waiting for snapshots to finish) + # Last snapshot will manually call self.seal() when done + self.crawl.update_and_requeue( + retry_at=timezone.now() + timedelta(days=365), + status=Crawl.StatusChoices.STARTED, + ) + else: + # No snapshots (system crawl like archivebox://install) + print(f'[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr) + # Seal immediately since there's no work to do + self.seal() - # Update status to STARTED once snapshots are created - # Set retry_at to future so we don't busy-loop - wait for snapshots to process - self.crawl.update_and_requeue( - retry_at=timezone.now() + timedelta(seconds=5), # Check again in 5s - status=Crawl.StatusChoices.STARTED, - ) except Exception as e: print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]') import traceback traceback.print_exc() - # Re-raise so the worker knows it failed raise - def on_started_to_started(self): - """Called when Crawl stays in started state (snapshots not sealed yet).""" - # Bump retry_at so we check again in a few seconds - self.crawl.update_and_requeue( - retry_at=timezone.now() + timedelta(seconds=5), - ) - @sealed.enter def enter_sealed(self): # Clean up background hooks and run on_CrawlEnd hooks diff --git a/archivebox/hooks.py b/archivebox/hooks.py index e6778670..b21022dc 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -480,7 +480,7 @@ def run_hook( returncode=returncode, stdout=stdout, stderr=stderr, - output_json=output_json, + output_json=None, # Legacy field, we now use records for JSONL output_files=new_files, duration_ms=duration_ms, hook=str(script), @@ -922,10 +922,14 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[ if plugins_whitelist: # PLUGINS whitelist is specified - only enable plugins in the list plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()] + import sys + print(f"DEBUG: PLUGINS whitelist='{plugins_whitelist}', checking plugin '{plugin_name}', plugin_names={plugin_names}", file=sys.stderr) if plugin_name.lower() not in plugin_names: # Plugin not in whitelist - explicitly disabled + print(f"DEBUG: Plugin '{plugin_name}' NOT in whitelist, disabling", file=sys.stderr) enabled = False else: + print(f"DEBUG: Plugin '{plugin_name}' IS in whitelist, enabling", file=sys.stderr) # Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED enabled_key = f'{plugin_upper}_ENABLED' enabled = config.get(enabled_key) @@ -935,6 +939,8 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[ enabled = enabled.lower() not in ('false', '0', 'no', '') else: # No 
PLUGINS whitelist - use PLUGINNAME_ENABLED (default True) + import sys + print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_name}_ENABLED", file=sys.stderr) enabled_key = f'{plugin_upper}_ENABLED' enabled = config.get(enabled_key) if enabled is None: diff --git a/archivebox/machine/migrations/0005_converge_binary_model.py b/archivebox/machine/migrations/0005_converge_binary_model.py new file mode 100644 index 00000000..e7e3a733 --- /dev/null +++ b/archivebox/machine/migrations/0005_converge_binary_model.py @@ -0,0 +1,72 @@ +# Generated by hand on 2026-01-01 +# Converges machine app for 0.8.6rc0 → 0.9.x migration path +# Drops old InstalledBinary table and ensures Binary table exists + +from django.db import migrations, connection + + +def converge_binary_table(apps, schema_editor): + """ + Drop machine_installedbinary if it exists (0.8.6rc0 path). + Create machine_binary if it doesn't exist (needed by Process model). + """ + cursor = connection.cursor() + + # Check what tables exist + cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name IN ('machine_installedbinary', 'machine_binary')") + existing_tables = {row[0] for row in cursor.fetchall()} + + print(f'DEBUG 0005: Existing tables: {existing_tables}') + + # Drop old InstalledBinary table if it exists (0.8.6rc0 path) + if 'machine_installedbinary' in existing_tables: + print('✓ Dropping machine_installedbinary table (0.8.6rc0 divergence)') + cursor.execute("DROP TABLE IF EXISTS machine_installedbinary") + + # Create Binary table if it doesn't exist + # This handles the case where 0.8.6rc0's 0001_initial didn't create it + if 'machine_binary' not in existing_tables: + print('✓ Creating machine_binary table with correct schema') + cursor.execute(""" + CREATE TABLE machine_binary ( + id TEXT PRIMARY KEY NOT NULL, + created_at DATETIME NOT NULL, + modified_at DATETIME NOT NULL, + num_uses_succeeded INTEGER NOT NULL DEFAULT 0, + num_uses_failed INTEGER NOT NULL DEFAULT 0, + machine_id TEXT NOT NULL REFERENCES machine_machine(id) ON DELETE CASCADE, + name VARCHAR(63) NOT NULL, + binproviders VARCHAR(255) NOT NULL DEFAULT 'env', + overrides TEXT NOT NULL DEFAULT '{}', + binprovider VARCHAR(63) NOT NULL DEFAULT 'env', + abspath VARCHAR(255) NOT NULL, + version VARCHAR(128) NOT NULL, + sha256 VARCHAR(64) NOT NULL DEFAULT '', + status VARCHAR(16) NOT NULL DEFAULT 'succeeded', + retry_at DATETIME NULL, + output_dir VARCHAR(255) NOT NULL DEFAULT '' + ) + """) + + # Create indexes + cursor.execute("CREATE INDEX machine_binary_machine_id_idx ON machine_binary(machine_id)") + cursor.execute("CREATE INDEX machine_binary_name_idx ON machine_binary(name)") + cursor.execute("CREATE INDEX machine_binary_abspath_idx ON machine_binary(abspath)") + + print('✓ machine_binary table created') + else: + print('✓ machine_binary table already exists') + + +class Migration(migrations.Migration): + + dependencies = [ + ('machine', '0001_initial'), + ] + + operations = [ + migrations.RunPython( + converge_binary_table, + reverse_code=migrations.RunPython.noop, + ), + ] diff --git a/archivebox/machine/migrations/0002_process.py b/archivebox/machine/migrations/0006_process.py similarity index 98% rename from archivebox/machine/migrations/0002_process.py rename to archivebox/machine/migrations/0006_process.py index c3aed18e..6a2139f0 100644 --- a/archivebox/machine/migrations/0002_process.py +++ b/archivebox/machine/migrations/0006_process.py @@ -9,7 +9,7 @@ from django.db import migrations, models class Migration(migrations.Migration): 
dependencies = [ - ('machine', '0001_initial'), + ('machine', '0005_converge_binary_model'), ] operations = [ diff --git a/archivebox/machine/migrations/0003_add_process_type_and_parent.py b/archivebox/machine/migrations/0007_add_process_type_and_parent.py similarity index 96% rename from archivebox/machine/migrations/0003_add_process_type_and_parent.py rename to archivebox/machine/migrations/0007_add_process_type_and_parent.py index ae97725c..b63fa400 100644 --- a/archivebox/machine/migrations/0003_add_process_type_and_parent.py +++ b/archivebox/machine/migrations/0007_add_process_type_and_parent.py @@ -7,7 +7,7 @@ from django.db import migrations, models class Migration(migrations.Migration): dependencies = [ - ('machine', '0002_process'), + ('machine', '0006_process'), ] operations = [ diff --git a/archivebox/machine/migrations/0008_add_worker_type_field.py b/archivebox/machine/migrations/0008_add_worker_type_field.py new file mode 100644 index 00000000..0588e60c --- /dev/null +++ b/archivebox/machine/migrations/0008_add_worker_type_field.py @@ -0,0 +1,18 @@ +# Generated by Django 6.0 on 2026-01-02 03:36 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('machine', '0007_add_process_type_and_parent'), + ] + + operations = [ + migrations.AddField( + model_name='process', + name='worker_type', + field=models.CharField(blank=True, db_index=True, default='', help_text='Worker type name for WORKER processes (crawl, snapshot, archiveresult)', max_length=32), + ), + ] diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py index 7c1068b9..417e4c9f 100755 --- a/archivebox/machine/models.py +++ b/archivebox/machine/models.py @@ -203,13 +203,14 @@ class BinaryManager(models.Manager): class Binary(ModelWithHealthStats): """ - Tracks an binary on a specific machine. + Tracks a binary on a specific machine. - Follows the unified state machine pattern: + Simple state machine with 2 states: - queued: Binary needs to be installed - - started: Installation in progress - - succeeded: Binary installed successfully (abspath, version, sha256 populated) - - failed: Installation failed + - installed: Binary installed successfully (abspath, version, sha256 populated) + + Installation is synchronous during queued→installed transition. + If installation fails, Binary stays in queued with retry_at set for later retry. State machine calls run() which executes on_Binary__install_* hooks to install the binary using the specified providers. 
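For reference, the simplified two-state pattern described in this docstring can be sketched standalone with python-statemachine, in the same style the BinaryMachine below uses. The class here is an illustrative toy, not the real model:

```python
from statemachine import StateMachine, State


class InstallMachine(StateMachine):
    """Toy sketch of the 2-state queued -> installed pattern described above."""

    queued = State('queued', initial=True)
    installed = State('installed', final=True)

    # tick() is polled by a worker; installation happens synchronously during the transition
    tick = (
        queued.to.itself(unless='can_install')
        | queued.to(installed, cond='can_install', on='do_install')
    )

    def __init__(self, binary_name: str, binproviders: str = 'env'):
        self.binary_name = binary_name
        self.binproviders = binproviders
        self.abspath = ''
        super().__init__()

    def can_install(self) -> bool:
        # Mirrors BinaryMachine.can_install(): need a name and at least one provider
        return bool(self.binary_name and self.binproviders)

    def do_install(self):
        # Placeholder for running on_Binary__install_* hooks; on failure the real
        # model stays queued with retry_at bumped instead of transitioning
        self.abspath = f'/usr/bin/{self.binary_name}'


machine = InstallMachine('wget')
machine.tick()
print(machine.current_state.id)  # 'installed'
```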
@@ -217,9 +218,7 @@ class Binary(ModelWithHealthStats): class StatusChoices(models.TextChoices): QUEUED = 'queued', 'Queued' - STARTED = 'started', 'Started' - SUCCEEDED = 'succeeded', 'Succeeded' - FAILED = 'failed', 'Failed' + INSTALLED = 'installed', 'Installed' id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True) created_at = models.DateTimeField(default=timezone.now, db_index=True) @@ -323,8 +322,31 @@ class Binary(ModelWithHealthStats): machine = Machine.current() overrides = overrides or {} - # Case 1: From binaries.jsonl - create queued binary - if 'binproviders' in record or ('overrides' in record and not record.get('abspath')): + # Case 1: Already installed (from on_Crawl hooks) - has abspath AND binproviders + # This happens when on_Crawl hooks detect already-installed binaries + abspath = record.get('abspath') + version = record.get('version') + binproviders = record.get('binproviders') + + if abspath and version and binproviders: + # Binary is already installed, create INSTALLED record with binproviders filter + binary, _ = Binary.objects.update_or_create( + machine=machine, + name=name, + defaults={ + 'abspath': abspath, + 'version': version, + 'sha256': record.get('sha256', ''), + 'binprovider': record.get('binprovider', 'env'), + 'binproviders': binproviders, # Preserve the filter + 'status': Binary.StatusChoices.INSTALLED, + 'retry_at': None, + } + ) + return binary + + # Case 2: From binaries.json - create queued binary (needs installation) + if 'binproviders' in record or ('overrides' in record and not abspath): binary, created = Binary.objects.get_or_create( machine=machine, name=name, @@ -337,25 +359,23 @@ class Binary(ModelWithHealthStats): ) return binary - # Case 2: From hook output - update with installation results - abspath = record.get('abspath') - version = record.get('version') - if not abspath or not version: - return None + # Case 3: From on_Binary__install hook output - update with installation results + if abspath and version: + binary, _ = Binary.objects.update_or_create( + machine=machine, + name=name, + defaults={ + 'abspath': abspath, + 'version': version, + 'sha256': record.get('sha256', ''), + 'binprovider': record.get('binprovider', 'env'), + 'status': Binary.StatusChoices.INSTALLED, + 'retry_at': None, + } + ) + return binary - binary, _ = Binary.objects.update_or_create( - machine=machine, - name=name, - defaults={ - 'abspath': abspath, - 'version': version, - 'sha256': record.get('sha256', ''), - 'binprovider': record.get('binprovider', 'env'), - 'status': Binary.StatusChoices.SUCCEEDED, - 'retry_at': None, - } - ) - return binary + return None @property def OUTPUT_DIR(self): @@ -403,8 +423,7 @@ class Binary(ModelWithHealthStats): # Discover ALL on_Binary__install_* hooks hooks = discover_hooks('Binary', config=config) if not hooks: - self.status = self.StatusChoices.FAILED - self.save() + # No hooks available - stay queued, will retry later return # Run each hook - they decide if they can handle this binary @@ -456,15 +475,21 @@ class Binary(ModelWithHealthStats): self.version = record.get('version', '') self.sha256 = record.get('sha256', '') self.binprovider = record.get('binprovider', 'env') - self.status = self.StatusChoices.SUCCEEDED + self.status = self.StatusChoices.INSTALLED self.save() + + # Symlink binary into LIB_BIN_DIR if configured + from django.conf import settings + lib_bin_dir = getattr(settings, 'LIB_BIN_DIR', None) + if lib_bin_dir: + self.symlink_to_lib_bin(lib_bin_dir) + return except 
json.JSONDecodeError: continue - # No hook succeeded - self.status = self.StatusChoices.FAILED - self.save() + # No hook succeeded - leave status as QUEUED (will retry later) + # Don't set to FAILED since we don't have that status anymore def cleanup(self): """ @@ -484,10 +509,75 @@ class Binary(ModelWithHealthStats): for plugin_dir in output_dir.iterdir(): if not plugin_dir.is_dir(): continue + pid_file = plugin_dir / 'hook.pid' cmd_file = plugin_dir / 'cmd.sh' safe_kill_process(pid_file, cmd_file) + def symlink_to_lib_bin(self, lib_bin_dir: str | Path) -> Path | None: + """ + Symlink this binary into LIB_BIN_DIR for unified PATH management. + + After a binary is installed by any binprovider (pip, npm, brew, apt, etc), + we symlink it into LIB_BIN_DIR so that: + 1. All binaries can be found in a single directory + 2. PATH only needs LIB_BIN_DIR prepended (not multiple provider-specific paths) + 3. Binary priorities are clear (symlink points to the canonical install location) + + Args: + lib_bin_dir: Path to LIB_BIN_DIR (e.g., /data/lib/arm64-darwin/bin) + + Returns: + Path to the created symlink, or None if symlinking failed + + Example: + >>> binary = Binary.objects.get(name='yt-dlp') + >>> binary.symlink_to_lib_bin('/data/lib/arm64-darwin/bin') + Path('/data/lib/arm64-darwin/bin/yt-dlp') + """ + import sys + from pathlib import Path + + if not self.abspath: + return None + + binary_abspath = Path(self.abspath).resolve() + lib_bin_dir = Path(lib_bin_dir).resolve() + + # Create LIB_BIN_DIR if it doesn't exist + try: + lib_bin_dir.mkdir(parents=True, exist_ok=True) + except (OSError, PermissionError) as e: + print(f"Failed to create LIB_BIN_DIR {lib_bin_dir}: {e}", file=sys.stderr) + return None + + # Get binary name (last component of path) + binary_name = binary_abspath.name + symlink_path = lib_bin_dir / binary_name + + # Remove existing symlink/file if it exists + if symlink_path.exists() or symlink_path.is_symlink(): + try: + # Check if it's already pointing to the right place + if symlink_path.is_symlink() and symlink_path.resolve() == binary_abspath: + # Already correctly symlinked, nothing to do + return symlink_path + + # Remove old symlink/file + symlink_path.unlink() + except (OSError, PermissionError) as e: + print(f"Failed to remove existing file at {symlink_path}: {e}", file=sys.stderr) + return None + + # Create new symlink + try: + symlink_path.symlink_to(binary_abspath) + print(f"Symlinked {binary_name} -> {symlink_path}", file=sys.stderr) + return symlink_path + except (OSError, PermissionError) as e: + print(f"Failed to create symlink {symlink_path} -> {binary_abspath}: {e}", file=sys.stderr) + return None + # ============================================================================= # Process Model @@ -627,6 +717,16 @@ class Process(models.Model): help_text='Type of process (cli, worker, orchestrator, binary, supervisord)' ) + # Worker type (only for WORKER processes: crawl, snapshot, archiveresult) + worker_type = models.CharField( + max_length=32, + default='', + null=False, + blank=True, + db_index=True, + help_text='Worker type name for WORKER processes (crawl, snapshot, archiveresult)' + ) + # Execution metadata pwd = models.CharField(max_length=512, default='', null=False, blank=True, help_text='Working directory for process execution') @@ -895,11 +995,16 @@ class Process(models.Model): ppid = os.getppid() machine = machine or Machine.current() + # Debug logging + import sys + print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", 
file=sys.stderr) + # Get parent process start time from OS try: os_parent = psutil.Process(ppid) os_parent_start = os_parent.create_time() except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess): + print(f"DEBUG _find_parent_process: Parent process {ppid} not accessible", file=sys.stderr) return None # Parent process doesn't exist # Find matching Process record @@ -910,12 +1015,18 @@ class Process(models.Model): started_at__gte=timezone.now() - PID_REUSE_WINDOW, ).order_by('-started_at') + print(f"DEBUG _find_parent_process: Found {candidates.count()} candidates for ppid={ppid}", file=sys.stderr) + for candidate in candidates: if candidate.started_at: db_start_time = candidate.started_at.timestamp() - if abs(db_start_time - os_parent_start) < START_TIME_TOLERANCE: + time_diff = abs(db_start_time - os_parent_start) + print(f"DEBUG _find_parent_process: Checking candidate id={candidate.id} time_diff={time_diff:.2f}s tolerance={START_TIME_TOLERANCE}s", file=sys.stderr) + if time_diff < START_TIME_TOLERANCE: + print(f"DEBUG _find_parent_process: MATCH! Returning parent id={candidate.id} pid={candidate.pid}", file=sys.stderr) return candidate + print(f"DEBUG _find_parent_process: No matching parent found for ppid={ppid}", file=sys.stderr) return None # No matching ArchiveBox parent process @classmethod @@ -1584,69 +1695,38 @@ class BinaryMachine(BaseStateMachine, strict_states=True): """ State machine for managing Binary installation lifecycle. - Hook Lifecycle: + Simple 2-state machine: ┌─────────────────────────────────────────────────────────────┐ │ QUEUED State │ │ • Binary needs to be installed │ └─────────────────────────────────────────────────────────────┘ - ↓ tick() when can_start() + ↓ tick() when can_install() + ↓ Synchronous installation during transition ┌─────────────────────────────────────────────────────────────┐ - │ STARTED State → enter_started() │ - │ 1. binary.run() │ - │ • discover_hooks('Binary') → all on_Binary__install_* │ - │ • Try each provider hook in sequence: │ - │ - run_hook(script, output_dir, ...) │ - │ - If returncode == 0: │ - │ * Read stdout.log │ - │ * Parse JSONL for 'Binary' record with abspath │ - │ * Update self: abspath, version, sha256, provider │ - │ * Set status=SUCCEEDED, RETURN │ - │ • If no hook succeeds: set status=FAILED │ - └─────────────────────────────────────────────────────────────┘ - ↓ tick() checks status - ┌─────────────────────────────────────────────────────────────┐ - │ SUCCEEDED / FAILED │ - │ • Set by binary.run() based on hook results │ - │ • Health stats incremented (num_uses_succeeded/failed) │ + │ INSTALLED State │ + │ • Binary installed (abspath, version, sha256 set) │ + │ • Health stats incremented │ └─────────────────────────────────────────────────────────────┘ + + If installation fails, Binary stays in QUEUED with retry_at bumped. 
""" model_attr_name = 'binary' # States queued = State(value=Binary.StatusChoices.QUEUED, initial=True) - started = State(value=Binary.StatusChoices.STARTED) - succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True) - failed = State(value=Binary.StatusChoices.FAILED, final=True) + installed = State(value=Binary.StatusChoices.INSTALLED, final=True) - # Tick Event - transitions based on conditions + # Tick Event - install happens during transition tick = ( - queued.to.itself(unless='can_start') | - queued.to(started, cond='can_start') | - started.to.itself(unless='is_finished') | - started.to(succeeded, cond='is_succeeded') | - started.to(failed, cond='is_failed') + queued.to.itself(unless='can_install') | + queued.to(installed, cond='can_install', on='on_install') ) - def can_start(self) -> bool: + def can_install(self) -> bool: """Check if binary installation can start.""" return bool(self.binary.name and self.binary.binproviders) - def is_succeeded(self) -> bool: - """Check if installation succeeded (status was set by run()).""" - return self.binary.status == Binary.StatusChoices.SUCCEEDED - - def is_failed(self) -> bool: - """Check if installation failed (status was set by run()).""" - return self.binary.status == Binary.StatusChoices.FAILED - - def is_finished(self) -> bool: - """Check if installation has completed (success or failure).""" - return self.binary.status in ( - Binary.StatusChoices.SUCCEEDED, - Binary.StatusChoices.FAILED, - ) - @queued.enter def enter_queued(self): """Binary is queued for installation.""" @@ -1655,43 +1735,48 @@ class BinaryMachine(BaseStateMachine, strict_states=True): status=Binary.StatusChoices.QUEUED, ) - @started.enter - def enter_started(self): - """Start binary installation.""" - # Lock the binary while installation runs - self.binary.update_and_requeue( - retry_at=timezone.now() + timedelta(seconds=300), # 5 min timeout for installation - status=Binary.StatusChoices.STARTED, - ) + def on_install(self): + """Called during queued→installed transition. 
Runs installation synchronously.""" + import sys - # Run installation hooks + print(f'[cyan] 🔄 BinaryMachine.on_install() - installing {self.binary.name}[/cyan]', file=sys.stderr) + + # Run installation hooks (synchronous, updates abspath/version/sha256 and sets status) self.binary.run() - # Save updated status (run() updates status to succeeded/failed) - self.binary.save() + # Check if installation succeeded by looking at updated status + # Note: Binary.run() updates self.binary.status internally but doesn't refresh our reference + self.binary.refresh_from_db() - @succeeded.enter - def enter_succeeded(self): + if self.binary.status != Binary.StatusChoices.INSTALLED: + # Installation failed - abort transition, stay in queued + print(f'[red] ❌ BinaryMachine - {self.binary.name} installation failed, retrying later[/red]', file=sys.stderr) + + # Bump retry_at to try again later + self.binary.update_and_requeue( + retry_at=timezone.now() + timedelta(seconds=300), # Retry in 5 minutes + status=Binary.StatusChoices.QUEUED, # Ensure we stay queued + ) + + # Increment health stats for failure + self.binary.increment_health_stats(success=False) + + # Abort the transition - this will raise an exception and keep us in queued + raise Exception(f'Binary {self.binary.name} installation failed') + + print(f'[cyan] ✅ BinaryMachine - {self.binary.name} installed successfully[/cyan]', file=sys.stderr) + + @installed.enter + def enter_installed(self): """Binary installed successfully.""" self.binary.update_and_requeue( retry_at=None, - status=Binary.StatusChoices.SUCCEEDED, + status=Binary.StatusChoices.INSTALLED, ) # Increment health stats self.binary.increment_health_stats(success=True) - @failed.enter - def enter_failed(self): - """Binary installation failed.""" - self.binary.update_and_requeue( - retry_at=None, - status=Binary.StatusChoices.FAILED, - ) - - # Increment health stats - self.binary.increment_health_stats(success=False) - # ============================================================================= # Process State Machine diff --git a/archivebox/plugins/accessibility/tests/test_accessibility.py b/archivebox/plugins/accessibility/tests/test_accessibility.py index 4fc8a1fe..0c85b145 100644 --- a/archivebox/plugins/accessibility/tests/test_accessibility.py +++ b/archivebox/plugins/accessibility/tests/test_accessibility.py @@ -80,8 +80,7 @@ class TestAccessibilityWithChrome(TestCase): # Run accessibility hook with the active Chrome session result = subprocess.run( ['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir, - env=get_test_env()), + cwd=str(snapshot_chrome_dir), capture_output=True, text=True, timeout=60, diff --git a/archivebox/plugins/chrome/on_Crawl__01_chrome_install.py b/archivebox/plugins/chrome/on_Crawl__01_chrome_install.py old mode 100644 new mode 100755 diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py index 6c801a5e..0702f95f 100644 --- a/archivebox/plugins/chrome/tests/test_chrome.py +++ b/archivebox/plugins/chrome/tests/test_chrome.py @@ -39,30 +39,36 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import ( CHROME_NAVIGATE_HOOK, ) -# Get LIB_DIR and NODE_MODULES_DIR from shared helpers -LIB_DIR = get_lib_dir() -NODE_MODULES_DIR = get_node_modules_dir() -NPM_PREFIX = LIB_DIR / 'npm' - -# Chromium install location (relative to DATA_DIR) -CHROMIUM_INSTALL_DIR = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium' - - 
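
Side note on the BinaryMachine hunk above: with only QUEUED and INSTALLED states, a failed install aborts the transition and bumps `retry_at`, so the binary is simply picked up again on a later orchestrator tick. The following is a minimal, self-contained sketch of that retry flow — `FakeBinary`, `tick()`, and the field layout are simplified stand-ins for illustration, not the actual ArchiveBox orchestrator or model code:

```python
# Hypothetical sketch of the 2-state retry behavior (QUEUED -> INSTALLED).
# All names here are illustrative stand-ins, not real ArchiveBox APIs.
from dataclasses import dataclass
from datetime import datetime, timedelta


@dataclass
class FakeBinary:
    name: str
    status: str = 'queued'
    retry_at: datetime | None = None
    install_attempts: int = 0

    def run(self) -> None:
        # Pretend installation fails twice, then succeeds on the third attempt.
        self.install_attempts += 1
        if self.install_attempts >= 3:
            self.status = 'installed'


def tick(binary: FakeBinary, now: datetime) -> None:
    """One orchestrator tick: only QUEUED binaries whose retry_at has passed are retried."""
    if binary.status != 'queued':
        return                      # INSTALLED is final, nothing to do
    if binary.retry_at and binary.retry_at > now:
        return                      # not due yet, stay queued
    binary.run()
    if binary.status == 'installed':
        binary.retry_at = None      # final state reached
    else:
        # Equivalent of the aborted transition: stay QUEUED, bump retry_at.
        binary.retry_at = now + timedelta(seconds=300)


if __name__ == '__main__':
    b = FakeBinary(name='yt-dlp')
    now = datetime.now()
    for _ in range(3):
        tick(b, now)
        now += timedelta(seconds=301)   # advance past the retry window each time
    assert b.status == 'installed' and b.install_attempts == 3
    print(b.status, b.install_attempts)
```

The design choice this models: there is no terminal FAILED state to get stuck in, so transient install failures (offline package index, missing provider) self-heal on a later tick once `retry_at` expires.
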
@pytest.fixture(scope="session", autouse=True) -def ensure_chromium_and_puppeteer_installed(): - """Ensure Chromium and puppeteer are installed before running tests.""" +def ensure_chromium_and_puppeteer_installed(tmp_path_factory): + """Ensure Chromium and puppeteer are installed before running tests. + + Puppeteer handles Chromium installation automatically in its own cache. + We only need to install puppeteer itself to LIB_DIR/npm. + """ from abx_pkg import Binary, NpmProvider, BinProviderOverrides + # Set DATA_DIR if not already set (required by abx_pkg) + if not os.environ.get('DATA_DIR'): + # Use isolated temp dir for direct pytest runs + test_data_dir = tmp_path_factory.mktemp('chrome_test_data') + os.environ['DATA_DIR'] = str(test_data_dir) + + # Compute paths AFTER setting DATA_DIR + lib_dir = get_lib_dir() + node_modules_dir = get_node_modules_dir() + npm_prefix = lib_dir / 'npm' + # Rebuild pydantic models NpmProvider.model_rebuild() - # Install puppeteer-core if not available - puppeteer_core_path = NODE_MODULES_DIR / 'puppeteer-core' + # Install puppeteer if not available (it will handle Chromium in its own cache) + puppeteer_core_path = node_modules_dir / 'puppeteer-core' if not puppeteer_core_path.exists(): - print(f"\n[*] Installing puppeteer to {NPM_PREFIX}...") - NPM_PREFIX.mkdir(parents=True, exist_ok=True) + print(f"\n[*] Installing puppeteer to {npm_prefix}...") + npm_prefix.mkdir(parents=True, exist_ok=True) - provider = NpmProvider(npm_prefix=NPM_PREFIX) + provider = NpmProvider(npm_prefix=npm_prefix) try: binary = Binary( name='puppeteer', @@ -70,36 +76,25 @@ def ensure_chromium_and_puppeteer_installed(): overrides={'npm': {'packages': ['puppeteer@^23.5.0']}} ) binary.install() - print(f"[*] Puppeteer installed successfully to {NPM_PREFIX}") + print(f"[*] Puppeteer installed successfully to {npm_prefix}") except Exception as e: pytest.skip(f"Failed to install puppeteer: {e}") - # Install Chromium via @puppeteer/browsers if not available + # Find Chromium binary (puppeteer installs it automatically in its cache) chromium_binary = find_chromium_binary() if not chromium_binary: - print(f"\n[*] Installing Chromium to {CHROMIUM_INSTALL_DIR}...") - CHROMIUM_INSTALL_DIR.mkdir(parents=True, exist_ok=True) - - result = subprocess.run( - ['npx', '@puppeteer/browsers', 'install', 'chromium@latest'], - cwd=str(CHROMIUM_INSTALL_DIR.parent), - capture_output=True, - text=True, - timeout=300 - ) - if result.returncode != 0: - pytest.skip(f"Failed to install Chromium: {result.stderr}") - - chromium_binary = find_chromium_binary() - if not chromium_binary: - pytest.skip("Chromium installed but binary not found") - - print(f"[*] Chromium installed: {chromium_binary}") + pytest.skip("Chromium not found - puppeteer should install it automatically") # Set CHROME_BINARY env var for tests os.environ['CHROME_BINARY'] = chromium_binary +# Get paths from helpers (will use DATA_DIR if set, or compute based on __file__) +LIB_DIR = get_lib_dir() +NODE_MODULES_DIR = get_node_modules_dir() +NPM_PREFIX = LIB_DIR / 'npm' + + def test_hook_scripts_exist(): """Verify chrome hooks exist.""" assert CHROME_LAUNCH_HOOK.exists(), f"Hook not found: {CHROME_LAUNCH_HOOK}" @@ -208,8 +203,7 @@ def test_chrome_launch_and_tab_creation(): env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) result = subprocess.run( ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-123', '--crawl-id=test-crawl-123'], - cwd=str(snapshot_chrome_dir, - env=get_test_env()), + cwd=str(snapshot_chrome_dir), 
capture_output=True, text=True, timeout=60, @@ -269,8 +263,7 @@ def test_chrome_navigation(): result = subprocess.run( ['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123', '--crawl-id=test-crawl-nav'], - cwd=str(snapshot_chrome_dir, - env=get_test_env()), + cwd=str(snapshot_chrome_dir), capture_output=True, text=True, timeout=60, @@ -281,8 +274,7 @@ def test_chrome_navigation(): # Navigate to URL result = subprocess.run( ['node', str(CHROME_NAVIGATE_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123'], - cwd=str(snapshot_chrome_dir, - env=get_test_env()), + cwd=str(snapshot_chrome_dir), capture_output=True, text=True, timeout=120, @@ -417,8 +409,7 @@ def test_multiple_snapshots_share_chrome(): # Create tab for this snapshot result = subprocess.run( ['node', str(CHROME_TAB_HOOK), f'--url=https://example.com/{snap_num}', f'--snapshot-id=snap-{snap_num}', '--crawl-id=test-multi-crawl'], - cwd=str(snapshot_chrome_dir, - env=get_test_env()), + cwd=str(snapshot_chrome_dir), capture_output=True, text=True, timeout=60, diff --git a/archivebox/plugins/consolelog/tests/test_consolelog.py b/archivebox/plugins/consolelog/tests/test_consolelog.py index ca75f130..741776f0 100644 --- a/archivebox/plugins/consolelog/tests/test_consolelog.py +++ b/archivebox/plugins/consolelog/tests/test_consolelog.py @@ -80,8 +80,7 @@ class TestConsolelogWithChrome(TestCase): # Run consolelog hook with the active Chrome session result = subprocess.run( ['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], - cwd=str(snapshot_chrome_dir, - env=get_test_env()), + cwd=str(snapshot_chrome_dir), capture_output=True, text=True, timeout=120, # Longer timeout as it waits for navigation diff --git a/archivebox/plugins/gallerydl/on_Crawl__10_gallerydl_install.py b/archivebox/plugins/gallerydl/on_Crawl__10_gallerydl_install.py new file mode 100755 index 00000000..df627ab4 --- /dev/null +++ b/archivebox/plugins/gallerydl/on_Crawl__10_gallerydl_install.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +Detect gallery-dl binary and emit Binary JSONL record. 
+ +Output: Binary JSONL record to stdout if gallery-dl is found +""" + +import json +import os +import sys + +from abx_pkg import Binary, EnvProvider + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary_found(binary: Binary, name: str): + """Output Binary JSONL record for an installed binary.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'abspath': str(binary.abspath), + 'version': str(binary.version) if binary.version else '', + 'sha256': binary.sha256 or '', + 'binprovider': 'env', # Already installed + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def output_binary_missing(name: str, binproviders: str): + """Output Binary JSONL record for a missing binary that needs installation.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, # Providers that can install it + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + gallerydl_enabled = get_env_bool('GALLERYDL_ENABLED', True) + gallerydl_binary = get_env('GALLERYDL_BINARY', 'gallery-dl') + + if not gallerydl_enabled: + sys.exit(0) + + provider = EnvProvider() + try: + binary = Binary(name=gallerydl_binary, binproviders=[provider]).load() + if binary.abspath: + # Binary found + output_binary_found(binary, name='gallery-dl') + else: + # Binary not found + output_binary_missing(name='gallery-dl', binproviders='pip') + except Exception: + # Binary not found + output_binary_missing(name='gallery-dl', binproviders='pip') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/git/on_Crawl__09_git_install.py b/archivebox/plugins/git/on_Crawl__09_git_install.py new file mode 100755 index 00000000..4179ed81 --- /dev/null +++ b/archivebox/plugins/git/on_Crawl__09_git_install.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +Detect git binary and emit Binary JSONL record. 
+ +Output: Binary JSONL record to stdout if git is found +""" + +import json +import os +import sys + +from abx_pkg import Binary, EnvProvider + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary_found(binary: Binary, name: str): + """Output Binary JSONL record for an installed binary.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'abspath': str(binary.abspath), + 'version': str(binary.version) if binary.version else '', + 'sha256': binary.sha256 or '', + 'binprovider': 'env', # Already installed + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def output_binary_missing(name: str, binproviders: str): + """Output Binary JSONL record for a missing binary that needs installation.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, # Providers that can install it + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + git_enabled = get_env_bool('GIT_ENABLED', True) + git_binary = get_env('GIT_BINARY', 'git') + + if not git_enabled: + sys.exit(0) + + provider = EnvProvider() + try: + binary = Binary(name=git_binary, binproviders=[provider]).load() + if binary.abspath: + # Binary found + output_binary_found(binary, name='git') + else: + # Binary not found + output_binary_missing(name='git', binproviders='apt,brew') + except Exception: + # Binary not found + output_binary_missing(name='git', binproviders='apt,brew') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py index 16a7631d..eee44ce4 100644 --- a/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py +++ b/archivebox/plugins/infiniscroll/tests/test_infiniscroll.py @@ -123,8 +123,7 @@ def test_scrolls_page_and_outputs_stats(): result = subprocess.run( ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'], - cwd=str(infiniscroll_dir, - env=get_test_env()), + cwd=str(infiniscroll_dir), capture_output=True, text=True, timeout=60, @@ -188,8 +187,7 @@ def test_config_scroll_limit_honored(): result = subprocess.run( ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'], - cwd=str(infiniscroll_dir, - env=get_test_env()), + cwd=str(infiniscroll_dir), capture_output=True, text=True, timeout=60, @@ -248,8 +246,7 @@ def test_config_timeout_honored(): start_time = time.time() result = subprocess.run( ['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'], - cwd=str(infiniscroll_dir, - env=get_test_env()), + cwd=str(infiniscroll_dir), capture_output=True, text=True, timeout=30, diff --git a/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py b/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py new file mode 100755 index 00000000..57fe5e7e --- /dev/null +++ b/archivebox/plugins/mercury/on_Crawl__12_mercury_install.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +Detect mercury-parser binary and emit Binary JSONL record. 
+ +Output: Binary JSONL record to stdout if mercury-parser is found +""" + +import json +import os +import sys + +from abx_pkg import Binary, EnvProvider + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary_found(binary: Binary, name: str): + """Output Binary JSONL record for an installed binary.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'abspath': str(binary.abspath), + 'version': str(binary.version) if binary.version else '', + 'sha256': binary.sha256 or '', + 'binprovider': 'env', # Already installed + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def output_binary_missing(name: str, binproviders: str): + """Output Binary JSONL record for a missing binary that needs installation.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, # Providers that can install it + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + mercury_enabled = get_env_bool('MERCURY_ENABLED', True) + mercury_binary = get_env('MERCURY_BINARY', 'mercury-parser') + + if not mercury_enabled: + sys.exit(0) + + provider = EnvProvider() + try: + binary = Binary(name=mercury_binary, binproviders=[provider]).load() + if binary.abspath: + # Binary found + output_binary_found(binary, name='mercury-parser') + else: + # Binary not found + output_binary_missing(name='mercury-parser', binproviders='npm') + except Exception: + # Binary not found + output_binary_missing(name='mercury-parser', binproviders='npm') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/readability/on_Crawl__11_readability_install.py b/archivebox/plugins/readability/on_Crawl__11_readability_install.py new file mode 100755 index 00000000..ea0791ef --- /dev/null +++ b/archivebox/plugins/readability/on_Crawl__11_readability_install.py @@ -0,0 +1,83 @@ +#!/usr/bin/env python3 +""" +Detect readability-extractor binary and emit Binary JSONL record. 
+ +Output: Binary JSONL record to stdout if readability is found +""" + +import json +import os +import sys + +from abx_pkg import Binary, EnvProvider + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary_found(binary: Binary, name: str): + """Output Binary JSONL record for an installed binary.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'abspath': str(binary.abspath), + 'version': str(binary.version) if binary.version else '', + 'sha256': binary.sha256 or '', + 'binprovider': 'env', # Already installed + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def output_binary_missing(name: str, binproviders: str): + """Output Binary JSONL record for a missing binary that needs installation.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, # Providers that can install it + 'overrides': { + 'packages': ['git+https://github.com/ArchiveBox/readability-extractor.git'], + }, + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + readability_enabled = get_env_bool('READABILITY_ENABLED', True) + readability_binary = get_env('READABILITY_BINARY', 'readability-extractor') + + if not readability_enabled: + sys.exit(0) + + provider = EnvProvider() + try: + binary = Binary(name=readability_binary, binproviders=[provider]).load() + if binary.abspath: + # Binary found + output_binary_found(binary, name='readability-extractor') + else: + # Binary not found + output_binary_missing(name='readability-extractor', binproviders='npm') + except Exception: + # Binary not found + output_binary_missing(name='readability-extractor', binproviders='npm') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/screenshot/tests/test_screenshot.py b/archivebox/plugins/screenshot/tests/test_screenshot.py index be431803..04c89f7e 100644 --- a/archivebox/plugins/screenshot/tests/test_screenshot.py +++ b/archivebox/plugins/screenshot/tests/test_screenshot.py @@ -27,11 +27,21 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import ( run_hook_and_parse, LIB_DIR, NODE_MODULES_DIR, + CHROME_PLUGIN_DIR, ) +# Import chrome test fixture to ensure puppeteer is installed +from archivebox.plugins.chrome.tests.test_chrome import ensure_chromium_and_puppeteer_installed + PLUGIN_DIR = get_plugin_dir(__file__) SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*') + +# Get Chrome hooks for setting up sessions +CHROME_LAUNCH_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Crawl__*_chrome_launch.*') +CHROME_TAB_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Snapshot__*_chrome_tab.*') +CHROME_NAVIGATE_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Snapshot__*_chrome_navigate.*') + TEST_URL = 'https://example.com' @@ -53,18 +63,162 @@ def test_verify_deps_with_abx_pkg(): def test_extracts_screenshot_from_example_com(): - """Test full workflow: extract screenshot from real example.com via hook.""" - # Prerequisites checked by earlier test + """Test full workflow: extract screenshot from real example.com via hook. 
+ + Replicates production directory structure: + DATA_DIR/users/testuser/crawls/{crawl-id}/chrome/ + DATA_DIR/users/testuser/crawls/{crawl-id}/snapshots/{snap-id}/chrome/ + DATA_DIR/users/testuser/crawls/{crawl-id}/snapshots/{snap-id}/screenshot/ + + This exercises the "connect to existing session" code path which is the primary + path in production and accounts for ~50% of the code. + """ + import signal + import time + import os with tempfile.TemporaryDirectory() as tmpdir: - tmpdir = Path(tmpdir) + # Replicate exact production directory structure + data_dir = Path(tmpdir) + crawl_id = 'test-screenshot-crawl' + snapshot_id = 'test-screenshot-snap' + + # Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/ + crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / crawl_id + chrome_dir = crawl_dir / 'chrome' + chrome_dir.mkdir(parents=True) + + # Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/ + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / snapshot_id + snapshot_chrome_dir = snapshot_dir / 'chrome' + snapshot_chrome_dir.mkdir(parents=True) + + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir() - # Run screenshot extraction hook env = get_test_env() - print(f"\n[DEBUG] NODE_V8_COVERAGE={env.get('NODE_V8_COVERAGE', 'NOT SET')}", file=sys.stderr) + env['CHROME_HEADLESS'] = 'true' + + # Step 1: Launch Chrome session at crawl level (background process) + chrome_launch_process = subprocess.Popen( + ['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'], + cwd=str(chrome_dir), + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + text=True, + env=env + ) + + # Wait for Chrome to launch + for i in range(15): + if chrome_launch_process.poll() is not None: + stdout, stderr = chrome_launch_process.communicate() + pytest.fail(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}") + if (chrome_dir / 'cdp_url.txt').exists(): + break + time.sleep(1) + + assert (chrome_dir / 'cdp_url.txt').exists(), "Chrome CDP URL file should exist" + assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist" + + chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip()) + + try: + # Step 2: Create tab at snapshot level + env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) + result = subprocess.run( + ['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'], + cwd=str(snapshot_chrome_dir), + capture_output=True, + text=True, + timeout=60, + env=env + ) + assert result.returncode == 0, f"Tab creation failed: {result.stderr}" + assert (snapshot_chrome_dir / 'cdp_url.txt').exists(), "Snapshot CDP URL should exist" + + # Step 3: Navigate to URL + result = subprocess.run( + ['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'], + cwd=str(snapshot_chrome_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + assert result.returncode == 0, f"Navigation failed: {result.stderr}" + assert (snapshot_chrome_dir / 'navigation.json').exists(), "Navigation JSON should exist" + + # Step 4: Take screenshot (should connect to existing session) + # Screenshot hook runs in screenshot/ dir and looks for ../chrome/cdp_url.txt + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + + assert 
result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}\nStdout: {result.stdout}" + + # Parse JSONL output + result_json = None + for line in result.stdout.strip().split('\n'): + line = line.strip() + if line.startswith('{'): + try: + record = json.loads(line) + if record.get('type') == 'ArchiveResult': + result_json = record + break + except json.JSONDecodeError: + pass + + assert result_json, "Should have ArchiveResult JSONL output" + assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" + assert 'screenshot.png' in result_json['output_str'], f"Output should be screenshot.png: {result_json}" + + # Verify filesystem output + screenshot_file = screenshot_dir / 'screenshot.png' + assert screenshot_file.exists(), f"screenshot.png not created at {screenshot_file}" + + # Verify file is valid PNG + file_size = screenshot_file.stat().st_size + assert file_size > 1000, f"Screenshot too small: {file_size} bytes" + assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes" + + # Check PNG magic bytes + screenshot_data = screenshot_file.read_bytes() + assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file" + + finally: + # Cleanup: Kill Chrome + try: + chrome_launch_process.send_signal(signal.SIGTERM) + chrome_launch_process.wait(timeout=5) + except: + pass + try: + os.kill(chrome_pid, signal.SIGKILL) + except OSError: + pass + + +def test_extracts_screenshot_without_session(): + """Test screenshot extraction without existing Chrome session (fallback to own browser).""" + with tempfile.TemporaryDirectory() as tmpdir: + # Create proper snapshot directory structure + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-fallback' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + # Don't set up Chrome session or staticfile - screenshot should launch its own browser + env = get_test_env() result = subprocess.run( - ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'], - cwd=tmpdir, + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-fallback'], + cwd=str(screenshot_dir), capture_output=True, text=True, timeout=120, @@ -73,7 +227,7 @@ def test_extracts_screenshot_from_example_com(): assert result.returncode == 0, f"Extraction failed: {result.stderr}" - # Parse JSONL output (clean format without RESULT_JSON= prefix) + # Parse JSONL output result_json = None for line in result.stdout.strip().split('\n'): line = line.strip() @@ -88,20 +242,54 @@ def test_extracts_screenshot_from_example_com(): assert result_json, "Should have ArchiveResult JSONL output" assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" - assert result_json['output_str'] == 'screenshot.png' + assert 'screenshot.png' in result_json['output_str'] - # Verify filesystem output (hook creates screenshot.png directly in working dir) - screenshot_file = tmpdir / 'screenshot.png' + # Verify file created + screenshot_file = screenshot_dir / 'screenshot.png' assert screenshot_file.exists(), "screenshot.png not created" + assert screenshot_file.stat().st_size > 1000, "Screenshot too small" - # Verify file is valid PNG - file_size = screenshot_file.stat().st_size - assert file_size > 1000, f"Screenshot too small: {file_size} bytes" - assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes" - # Check PNG magic bytes - screenshot_data = 
screenshot_file.read_bytes() - assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file" +def test_skips_when_staticfile_exists(): + """Test that screenshot skips when staticfile extractor already handled the URL.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-skip' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + # Create staticfile output to simulate staticfile extractor already ran + staticfile_dir = snapshot_dir / 'staticfile' + staticfile_dir.mkdir() + (staticfile_dir / 'index.html').write_text('') + + env = get_test_env() + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-skip'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=30, + env=env + ) + + assert result.returncode == 0, f"Should exit successfully: {result.stderr}" + + # Should emit skipped status + result_json = None + for line in result.stdout.strip().split('\n'): + line = line.strip() + if line.startswith('{'): + try: + record = json.loads(line) + if record.get('type') == 'ArchiveResult': + result_json = record + break + except json.JSONDecodeError: + pass + + assert result_json, "Should have ArchiveResult JSONL output" + assert result_json['status'] == 'skipped', f"Should skip: {result_json}" def test_config_save_screenshot_false_skips(): @@ -134,13 +322,11 @@ def test_config_save_screenshot_false_skips(): def test_reports_missing_chrome(): """Test that script reports error when Chrome is not found.""" - import os - with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) # Set CHROME_BINARY to nonexistent path - env = os.environ.copy() + env = get_test_env() env['CHROME_BINARY'] = '/nonexistent/chrome' result = subprocess.run( @@ -158,6 +344,59 @@ def test_reports_missing_chrome(): assert 'chrome' in combined.lower() or 'browser' in combined.lower() or 'ERROR=' in combined +def test_custom_resolution_and_user_agent(): + """Test that CHROME_RESOLUTION and CHROME_USER_AGENT configs are respected.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-config' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + env = get_test_env() + env['CHROME_RESOLUTION'] = '800,600' + env['CHROME_USER_AGENT'] = 'Test/1.0' + + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-config'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + + assert result.returncode == 0, f"Extraction failed: {result.stderr}" + + screenshot_file = screenshot_dir / 'screenshot.png' + assert screenshot_file.exists(), "screenshot.png not created" + # Resolution affects file size + assert screenshot_file.stat().st_size > 500, "Screenshot too small" + + +def test_ssl_check_disabled(): + """Test that CHROME_CHECK_SSL_VALIDITY=False allows invalid certificates.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ssl' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + env = get_test_env() + env['CHROME_CHECK_SSL_VALIDITY'] = 'False' + + result = subprocess.run( + ['node', 
str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ssl'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + + assert result.returncode == 0, f"Should succeed: {result.stderr}" + assert (screenshot_dir / 'screenshot.png').exists() + + def test_config_timeout_honored(): """Test that CHROME_TIMEOUT config is respected.""" import os @@ -182,5 +421,410 @@ def test_config_timeout_honored(): assert result.returncode in (0, 1), "Should complete without hanging" +def test_missing_url_argument(): + """Test that hook fails gracefully when URL argument is missing.""" + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + env = get_test_env() + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), '--snapshot-id=test-missing-url'], + cwd=tmpdir, + capture_output=True, + text=True, + timeout=30, + env=env + ) + + # Should exit with error + assert result.returncode != 0, "Should fail when URL is missing" + assert 'Usage:' in result.stderr or 'url' in result.stderr.lower() + + +def test_missing_snapshot_id_argument(): + """Test that hook fails gracefully when snapshot-id argument is missing.""" + with tempfile.TemporaryDirectory() as tmpdir: + tmpdir = Path(tmpdir) + + env = get_test_env() + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}'], + cwd=tmpdir, + capture_output=True, + text=True, + timeout=30, + env=env + ) + + # Should exit with error + assert result.returncode != 0, "Should fail when snapshot-id is missing" + assert 'Usage:' in result.stderr or 'snapshot' in result.stderr.lower() + + +def test_invalid_resolution_format(): + """Test that invalid CHROME_RESOLUTION format is handled gracefully.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-badres' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + env = get_test_env() + # Invalid resolution formats to test parseResolution error handling + for bad_resolution in ['invalid', '1440', '1440x2000', 'abc,def']: + env['CHROME_RESOLUTION'] = bad_resolution + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-badres'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + # Should either fail gracefully or fall back to default + # (depending on implementation - script should not crash with uncaught error) + assert result.returncode in (0, 1), f"Script should handle bad resolution: {bad_resolution}" + + +def test_boolean_env_var_parsing(): + """Test that boolean environment variables are parsed correctly.""" + import time + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-bool' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + env = get_test_env() + + # Test various boolean formats for CHROME_HEADLESS + for bool_val in ['true', '1', 'yes', 'on', 'True', 'TRUE']: + env['CHROME_HEADLESS'] = bool_val + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-bool'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + # Should either succeed or fail, but shouldn't crash on boolean parsing + assert result.returncode in (0, 1), f"Should handle boolean value: 
{bool_val}" + + # Clean up screenshot file if created + screenshot_file = screenshot_dir / 'screenshot.png' + if screenshot_file.exists(): + screenshot_file.unlink() + + time.sleep(0.5) # Brief pause between attempts + + +def test_integer_env_var_parsing(): + """Test that integer environment variables are parsed correctly.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-int' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + env = get_test_env() + + # Test valid and invalid integer formats for CHROME_TIMEOUT + test_cases = [ + ('60', True), # Valid integer + ('invalid', True), # Invalid - should use default + ('', True), # Empty - should use default + ] + + for timeout_val, should_work in test_cases: + env['CHROME_TIMEOUT'] = timeout_val + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-int'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + # Should either succeed or fail gracefully, but shouldn't crash on int parsing + assert result.returncode in (0, 1), f"Should handle timeout value: {timeout_val}" + + # Clean up screenshot file if created + screenshot_file = screenshot_dir / 'screenshot.png' + if screenshot_file.exists(): + screenshot_file.unlink() + + +def test_extracts_screenshot_with_all_config_options(): + """Test screenshot with comprehensive config to exercise all code paths.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-full' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + # Set ALL config options to exercise all code paths + env = get_test_env() + env['CHROME_HEADLESS'] = 'true' + env['CHROME_RESOLUTION'] = '800,600' + env['CHROME_USER_AGENT'] = 'TestBot/1.0' + env['CHROME_CHECK_SSL_VALIDITY'] = 'false' # Exercises checkSsl branch + env['CHROME_TIMEOUT'] = '60' + + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-full'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + + assert result.returncode == 0, f"Screenshot should succeed: {result.stderr}" + + # Verify JSONL output with success + result_json = None + for line in result.stdout.strip().split('\n'): + if line.strip().startswith('{'): + try: + record = json.loads(line) + if record.get('type') == 'ArchiveResult': + result_json = record + break + except json.JSONDecodeError: + pass + + assert result_json, "Should have ArchiveResult JSONL output" + assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" + assert 'screenshot.png' in result_json['output_str'] + + # Verify file created + screenshot_file = screenshot_dir / 'screenshot.png' + assert screenshot_file.exists(), "screenshot.png should be created" + assert screenshot_file.stat().st_size > 1000, "Screenshot should have content" + + +def test_headless_mode_false(): + """Test headless=false code path specifically.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-headless' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + env = get_test_env() + # Explicitly test 
headless=false (exercises the ternary false branch) + env['CHROME_HEADLESS'] = 'false' + + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-headless-false'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + # Should work or fail gracefully + assert result.returncode in (0, 1), f"Headless=false should handle: {result.stderr}" + + +def test_invalid_url_causes_error(): + """Test error path with invalid URL that causes navigation failure.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-invalid' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + env = get_test_env() + env['CHROME_TIMEOUT'] = '5' # Short timeout + + # Use invalid URL to trigger error path + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), '--url=http://this-domain-does-not-exist-12345.invalid', '--snapshot-id=snap-invalid'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=30, + env=env + ) + + # Should fail due to navigation error + assert result.returncode != 0, "Should fail on invalid URL" + # Should NOT emit JSONL (transient error) + jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')] + assert len(jsonl_lines) == 0, f"Should not emit JSONL on error: {jsonl_lines}" + + +def test_with_corrupted_cdp_url_falls_back(): + """Test that corrupted CDP URL file causes fallback to launching browser.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-corrupt-cdp' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + # Create chrome directory with corrupted CDP URL + chrome_dir = snapshot_dir / 'chrome' + chrome_dir.mkdir() + (chrome_dir / 'cdp_url.txt').write_text('ws://127.0.0.1:99999/invalid') + + env = get_test_env() + env['CHROME_HEADLESS'] = 'true' + env['CHROME_TIMEOUT'] = '5' # Short timeout for fast test + + # Screenshot should try CDP, fail quickly, then fall back to launching own browser + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-corrupt-cdp'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=30, + env=env + ) + + # Should succeed by falling back to launching browser + assert result.returncode == 0, f"Should fallback and succeed: {result.stderr}" + assert 'Failed to connect to CDP' in result.stderr, "Should log CDP connection failure" + + # Verify screenshot was created via fallback path + screenshot_file = screenshot_dir / 'screenshot.png' + assert screenshot_file.exists(), "Screenshot should be created via fallback" + + +def test_user_agent_is_applied(): + """Test that CHROME_USER_AGENT is actually applied when launching browser.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ua' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + env = get_test_env() + env['CHROME_USER_AGENT'] = 'CustomBot/9.9.9 (Testing)' + env['CHROME_HEADLESS'] = 'true' + + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ua'], + cwd=str(screenshot_dir), + 
capture_output=True, + text=True, + timeout=120, + env=env + ) + + # Should succeed with custom user agent + assert result.returncode == 0, f"Should succeed with custom UA: {result.stderr}" + screenshot_file = screenshot_dir / 'screenshot.png' + assert screenshot_file.exists(), "Screenshot should be created" + + +def test_check_ssl_false_branch(): + """Test CHROME_CHECK_SSL_VALIDITY=false adds ignore-certificate-errors arg.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-nossl' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + env = get_test_env() + env['CHROME_CHECK_SSL_VALIDITY'] = 'false' + env['CHROME_HEADLESS'] = 'true' + + # Test with both boolean false and string 'false' + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-nossl'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + + assert result.returncode == 0, f"Should work with SSL check disabled: {result.stderr}" + assert (screenshot_dir / 'screenshot.png').exists() + + +def test_alternative_env_var_names(): + """Test fallback environment variable names (TIMEOUT vs CHROME_TIMEOUT, etc).""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-altenv' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + env = get_test_env() + # Use alternative env var names (without CHROME_ prefix) + env['TIMEOUT'] = '45' + env['RESOLUTION'] = '1024,768' + env['USER_AGENT'] = 'AltBot/1.0' + env['CHECK_SSL_VALIDITY'] = 'false' + + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-altenv'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + + assert result.returncode == 0, f"Should work with alternative env vars: {result.stderr}" + assert (screenshot_dir / 'screenshot.png').exists() + + +def test_very_large_resolution(): + """Test screenshot with very large resolution.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-large' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + env = get_test_env() + env['CHROME_RESOLUTION'] = '3840,2160' # 4K resolution + env['CHROME_HEADLESS'] = 'true' + + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-large'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + + assert result.returncode == 0, f"Should handle large resolution: {result.stderr}" + screenshot_file = screenshot_dir / 'screenshot.png' + assert screenshot_file.exists() + # 4K screenshot should be larger + assert screenshot_file.stat().st_size > 5000, "4K screenshot should be substantial" + + +def test_very_small_resolution(): + """Test screenshot with very small resolution.""" + with tempfile.TemporaryDirectory() as tmpdir: + data_dir = Path(tmpdir) + snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-small' + screenshot_dir = snapshot_dir / 'screenshot' + screenshot_dir.mkdir(parents=True) + + env = get_test_env() + env['CHROME_RESOLUTION'] = '320,240' # 
Very small + env['CHROME_HEADLESS'] = 'true' + + result = subprocess.run( + ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-small'], + cwd=str(screenshot_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + + assert result.returncode == 0, f"Should handle small resolution: {result.stderr}" + assert (screenshot_dir / 'screenshot.png').exists() + + if __name__ == '__main__': pytest.main([__file__, '-v']) diff --git a/archivebox/plugins/singlefile/on_Crawl__08_singlefile_install.py b/archivebox/plugins/singlefile/on_Crawl__08_singlefile_install.py new file mode 100755 index 00000000..b1bb2a68 --- /dev/null +++ b/archivebox/plugins/singlefile/on_Crawl__08_singlefile_install.py @@ -0,0 +1,85 @@ +#!/usr/bin/env python3 +""" +Detect single-file binary and emit Binary JSONL record. + +Output: Binary JSONL record to stdout if single-file is found +""" + +import json +import os +import sys + +from abx_pkg import Binary, EnvProvider + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary_found(binary: Binary, name: str): + """Output Binary JSONL record for an installed binary.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'abspath': str(binary.abspath), + 'version': str(binary.version) if binary.version else '', + 'sha256': binary.sha256 or '', + 'binprovider': 'env', # Already installed + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def output_binary_missing(name: str, binproviders: str): + """Output Binary JSONL record for a missing binary that needs installation.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, # Providers that can install it + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + singlefile_enabled = get_env_bool('SINGLEFILE_ENABLED', True) + + if not singlefile_enabled: + sys.exit(0) + + provider = EnvProvider() + found = False + + # Try single-file-cli first, then single-file + for binary_name in ['single-file-cli', 'single-file']: + try: + binary = Binary(name=binary_name, binproviders=[provider]).load() + if binary.abspath: + # Binary found + output_binary_found(binary, name='single-file') + found = True + break + except Exception: + continue + + if not found: + # Binary not found + output_binary_missing(name='single-file', binproviders='npm') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/archivebox/plugins/wget/on_Crawl__06_wget_install.py b/archivebox/plugins/wget/on_Crawl__06_wget_install.py old mode 100644 new mode 100755 index d3116ed3..3e21596f --- a/archivebox/plugins/wget/on_Crawl__06_wget_install.py +++ b/archivebox/plugins/wget/on_Crawl__06_wget_install.py @@ -40,8 +40,8 @@ def get_env_int(name: str, default: int = 0) -> int: return default -def output_binary(binary: Binary, name: str): - """Output Binary JSONL record to stdout.""" +def output_binary_found(binary: Binary, name: str): + """Output Binary JSONL record for an installed binary.""" machine_id = os.environ.get('MACHINE_ID', '') record = { @@ -50,7 +50,20 @@ def output_binary(binary: Binary, name: str): 'abspath': str(binary.abspath), 'version': str(binary.version) if 
binary.version else '', 'sha256': binary.sha256 or '', - 'binprovider': 'env', + 'binprovider': 'env', # Already installed + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def output_binary_missing(name: str, binproviders: str): + """Output Binary JSONL record for a missing binary that needs installation.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, # Providers that can install it 'machine_id': machine_id, } print(json.dumps(record)) @@ -89,16 +102,19 @@ def main(): binary_path = '' if not binary_path: - if use_wget: - errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set WGET_ENABLED=false.") + # Binary not found computed['WGET_BINARY'] = '' + if use_wget: + # Emit Binary record for installation + output_binary_missing(name='wget', binproviders='apt,brew') else: + # Binary found computed['WGET_BINARY'] = binary_path wget_version = str(binary.version) if binary.version else 'unknown' computed['WGET_VERSION'] = wget_version - # Output Binary JSONL record - output_binary(binary, name='wget') + # Output Binary JSONL record for installed binary + output_binary_found(binary, name='wget') # Check for compression support if computed.get('WGET_BINARY'): diff --git a/archivebox/plugins/ytdlp/on_Crawl__07_ytdlp_install.py b/archivebox/plugins/ytdlp/on_Crawl__07_ytdlp_install.py new file mode 100755 index 00000000..212d21bb --- /dev/null +++ b/archivebox/plugins/ytdlp/on_Crawl__07_ytdlp_install.py @@ -0,0 +1,80 @@ +#!/usr/bin/env python3 +""" +Detect yt-dlp binary and emit Binary JSONL record. + +Output: Binary JSONL record to stdout if yt-dlp is found +""" + +import json +import os +import sys + +from abx_pkg import Binary, EnvProvider + + +def get_env(name: str, default: str = '') -> str: + return os.environ.get(name, default).strip() + +def get_env_bool(name: str, default: bool = False) -> bool: + val = get_env(name, '').lower() + if val in ('true', '1', 'yes', 'on'): + return True + if val in ('false', '0', 'no', 'off'): + return False + return default + + +def output_binary_found(binary: Binary, name: str): + """Output Binary JSONL record for an installed binary.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'abspath': str(binary.abspath), + 'version': str(binary.version) if binary.version else '', + 'sha256': binary.sha256 or '', + 'binprovider': 'env', # Already installed + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def output_binary_missing(name: str, binproviders: str): + """Output Binary JSONL record for a missing binary that needs installation.""" + machine_id = os.environ.get('MACHINE_ID', '') + + record = { + 'type': 'Binary', + 'name': name, + 'binproviders': binproviders, # Providers that can install it + 'machine_id': machine_id, + } + print(json.dumps(record)) + + +def main(): + ytdlp_enabled = get_env_bool('YTDLP_ENABLED', True) + ytdlp_binary = get_env('YTDLP_BINARY', 'yt-dlp') + + if not ytdlp_enabled: + sys.exit(0) + + provider = EnvProvider() + try: + binary = Binary(name=ytdlp_binary, binproviders=[provider]).load() + if binary.abspath: + # Binary found + output_binary_found(binary, name='yt-dlp') + else: + # Binary not found + output_binary_missing(name='yt-dlp', binproviders='pip,brew,apt') + except Exception: + # Binary not found + output_binary_missing(name='yt-dlp', binproviders='pip,brew,apt') + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git 
a/tests/fixtures.py b/archivebox/tests/fixtures.py similarity index 100% rename from tests/fixtures.py rename to archivebox/tests/fixtures.py diff --git a/tests/test_add.py b/archivebox/tests/test_add.py similarity index 100% rename from tests/test_add.py rename to archivebox/tests/test_add.py diff --git a/tests/test_cli_add.py b/archivebox/tests/test_cli_add.py similarity index 100% rename from tests/test_cli_add.py rename to archivebox/tests/test_cli_add.py diff --git a/tests/test_cli_config.py b/archivebox/tests/test_cli_config.py similarity index 100% rename from tests/test_cli_config.py rename to archivebox/tests/test_cli_config.py diff --git a/tests/test_cli_extract.py b/archivebox/tests/test_cli_extract.py similarity index 100% rename from tests/test_cli_extract.py rename to archivebox/tests/test_cli_extract.py diff --git a/tests/test_cli_help.py b/archivebox/tests/test_cli_help.py similarity index 100% rename from tests/test_cli_help.py rename to archivebox/tests/test_cli_help.py diff --git a/tests/test_cli_init.py b/archivebox/tests/test_cli_init.py similarity index 100% rename from tests/test_cli_init.py rename to archivebox/tests/test_cli_init.py diff --git a/tests/test_cli_install.py b/archivebox/tests/test_cli_install.py similarity index 100% rename from tests/test_cli_install.py rename to archivebox/tests/test_cli_install.py diff --git a/tests/test_cli_manage.py b/archivebox/tests/test_cli_manage.py similarity index 100% rename from tests/test_cli_manage.py rename to archivebox/tests/test_cli_manage.py diff --git a/tests/test_cli_remove.py b/archivebox/tests/test_cli_remove.py similarity index 100% rename from tests/test_cli_remove.py rename to archivebox/tests/test_cli_remove.py diff --git a/tests/test_cli_schedule.py b/archivebox/tests/test_cli_schedule.py similarity index 100% rename from tests/test_cli_schedule.py rename to archivebox/tests/test_cli_schedule.py diff --git a/tests/test_cli_search.py b/archivebox/tests/test_cli_search.py similarity index 100% rename from tests/test_cli_search.py rename to archivebox/tests/test_cli_search.py diff --git a/tests/test_cli_server.py b/archivebox/tests/test_cli_server.py similarity index 100% rename from tests/test_cli_server.py rename to archivebox/tests/test_cli_server.py diff --git a/tests/test_cli_shell.py b/archivebox/tests/test_cli_shell.py similarity index 100% rename from tests/test_cli_shell.py rename to archivebox/tests/test_cli_shell.py diff --git a/tests/test_cli_status.py b/archivebox/tests/test_cli_status.py similarity index 100% rename from tests/test_cli_status.py rename to archivebox/tests/test_cli_status.py diff --git a/tests/test_cli_update.py b/archivebox/tests/test_cli_update.py similarity index 100% rename from tests/test_cli_update.py rename to archivebox/tests/test_cli_update.py diff --git a/tests/test_cli_version.py b/archivebox/tests/test_cli_version.py similarity index 100% rename from tests/test_cli_version.py rename to archivebox/tests/test_cli_version.py diff --git a/tests/test_config.py b/archivebox/tests/test_config.py similarity index 100% rename from tests/test_config.py rename to archivebox/tests/test_config.py diff --git a/tests/test_crawl.py b/archivebox/tests/test_crawl.py similarity index 100% rename from tests/test_crawl.py rename to archivebox/tests/test_crawl.py diff --git a/tests/test_extract.py b/archivebox/tests/test_extract.py similarity index 100% rename from tests/test_extract.py rename to archivebox/tests/test_extract.py diff --git a/tests/test_extractors.py 
b/archivebox/tests/test_extractors.py similarity index 100% rename from tests/test_extractors.py rename to archivebox/tests/test_extractors.py diff --git a/tests/test_init.py b/archivebox/tests/test_init.py similarity index 100% rename from tests/test_init.py rename to archivebox/tests/test_init.py diff --git a/tests/test_install.py b/archivebox/tests/test_install.py similarity index 100% rename from tests/test_install.py rename to archivebox/tests/test_install.py diff --git a/tests/test_list.py b/archivebox/tests/test_list.py similarity index 100% rename from tests/test_list.py rename to archivebox/tests/test_list.py diff --git a/archivebox/tests/test_migrations_08_to_09.py b/archivebox/tests/test_migrations_08_to_09.py index 37f5ce83..389204e9 100644 --- a/archivebox/tests/test_migrations_08_to_09.py +++ b/archivebox/tests/test_migrations_08_to_09.py @@ -10,6 +10,7 @@ Migration tests from 0.8.x to 0.9.x. - New fields like depth, retry_at, etc. """ +import json import shutil import sqlite3 import subprocess @@ -78,29 +79,43 @@ class TestMigrationFrom08x(unittest.TestCase): self.assertTrue(ok, msg) def test_migration_preserves_crawls(self): - """Migration should preserve all Crawl records.""" + """Migration should preserve all Crawl records and create default crawl if needed.""" result = run_archivebox(self.work_dir, ['init'], timeout=45) self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}") + # Count snapshots with NULL crawl_id in original data + snapshots_without_crawl = sum(1 for s in self.original_data['snapshots'] if s['crawl_id'] is None) + + # Expected count: original crawls + 1 default crawl if any snapshots had NULL crawl_id expected_count = len(self.original_data['crawls']) + if snapshots_without_crawl > 0: + expected_count += 1 # Migration 0024 creates a default crawl + ok, msg = verify_crawl_count(self.db_path, expected_count) self.assertTrue(ok, msg) def test_migration_preserves_snapshot_crawl_links(self): - """Migration should preserve snapshot-to-crawl relationships.""" + """Migration should preserve snapshot-to-crawl relationships and assign default crawl to orphans.""" result = run_archivebox(self.work_dir, ['init'], timeout=45) self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}") conn = sqlite3.connect(str(self.db_path)) cursor = conn.cursor() - # Check EVERY snapshot still has its crawl_id + # Check EVERY snapshot has a crawl_id after migration for snapshot in self.original_data['snapshots']: cursor.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?", (snapshot['url'],)) row = cursor.fetchone() self.assertIsNotNone(row, f"Snapshot {snapshot['url']} not found after migration") - self.assertEqual(row[0], snapshot['crawl_id'], - f"Crawl ID mismatch for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}") + + if snapshot['crawl_id'] is not None: + # Snapshots that had a crawl should keep it + self.assertEqual(row[0], snapshot['crawl_id'], + f"Crawl ID changed for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}") + else: + # Snapshots without a crawl should now have one (the default crawl) + self.assertIsNotNone(row[0], + f"Snapshot {snapshot['url']} should have been assigned to default crawl but has NULL") conn.close() @@ -153,7 +168,7 @@ class TestMigrationFrom08x(unittest.TestCase): result = run_archivebox(self.work_dir, ['init'], timeout=45) self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}") - result = run_archivebox(self.work_dir, ['list']) + result = 
run_archivebox(self.work_dir, ['snapshot', 'list']) self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}") # Verify ALL snapshots appear in output @@ -475,357 +490,227 @@ class TestFilesystemMigration08to09(unittest.TestCase): """Clean up temporary directory.""" shutil.rmtree(self.work_dir, ignore_errors=True) - def test_filesystem_migration_with_real_archiving(self): + def test_archiveresult_files_preserved_after_migration(self): """ - Test that filesystem migration works with real archived content. + Test that ArchiveResult output files are reorganized into new structure. - Steps: - 1. Initialize archivebox - 2. Archive https://example.com (creates real files) - 3. Manually set fs_version to 0.8.0 - 4. Trigger migration by saving snapshot - 5. Verify files are organized correctly + This test verifies that: + 1. Migration preserves ArchiveResult data in Process/Binary records + 2. Running `archivebox update` reorganizes files into new structure + 3. New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext + 4. All files are moved (no data loss) + 5. Old archive/timestamp/ directories are cleaned up """ - # Step 1: Initialize - result = run_archivebox(self.work_dir, ['init'], timeout=45) - self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}") + # Use the real 0.7.2 database which has actual ArchiveResults with files + gold_db = Path('/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data') + if not gold_db.exists(): + self.skipTest(f"Gold standard database not found at {gold_db}") - # Step 2: Archive example.com with ALL extractors enabled - # This ensures we test migration with all file types - try: - result = run_archivebox( - self.work_dir, - ['add', '--depth=0', 'https://example.com'], - timeout=300, # 5 minutes for all extractors - env={ - 'SAVE_TITLE': 'True', - 'SAVE_FAVICON': 'True', - 'SAVE_WGET': 'True', - 'SAVE_SCREENSHOT': 'True', - 'SAVE_DOM': 'True', - 'SAVE_SINGLEFILE': 'True', - 'SAVE_READABILITY': 'True', - 'SAVE_MERCURY': 'True', - 'SAVE_PDF': 'True', - 'SAVE_YTDLP': 'True', - 'SAVE_ARCHIVEDOTORG': 'True', - 'SAVE_HEADERS': 'True', - 'SAVE_HTMLTOTEXT': 'True', - 'SAVE_GIT': 'True', - } - ) - except subprocess.TimeoutExpired as e: - # If timeout, still continue - we want to test with whatever files were created - print(f"\n[!] 
Add command timed out after {e.timeout}s, continuing with partial results...") - # Note: Snapshot may still have been created even if command timed out + # Copy gold database to test directory + import shutil + for item in gold_db.iterdir(): + if item.is_dir(): + shutil.copytree(item, self.work_dir / item.name, dirs_exist_ok=True) + else: + shutil.copy2(item, self.work_dir / item.name) - # Step 3: Get the snapshot and verify files were created - conn = sqlite3.connect(str(self.db_path)) - cursor = conn.cursor() - cursor.execute("SELECT id, url, timestamp, fs_version FROM core_snapshot WHERE url = ?", ('https://example.com',)) - row = cursor.fetchone() - conn.close() - - if not row: - self.skipTest("Failed to create snapshot for https://example.com") - - snapshot_id, url, timestamp, fs_version = row - - # Verify initial fs_version is 0.9.0 (current version) - self.assertEqual(fs_version, '0.9.0', f"Expected new snapshot to have fs_version='0.9.0', got '{fs_version}'") - - # Verify output directory exists - output_dir = self.work_dir / 'archive' / timestamp - self.assertTrue(output_dir.exists(), f"Output directory not found: {output_dir}") - - # List all files created (for debugging) - files_before = list(output_dir.rglob('*')) - files_before_count = len([f for f in files_before if f.is_file()]) - print(f"\n[*] Files created by archiving: {files_before_count}") - for f in sorted(files_before): - if f.is_file(): - print(f" {f.relative_to(output_dir)}") - - # Step 4: Manually set fs_version to 0.8.0 to simulate old snapshot - conn = sqlite3.connect(str(self.db_path)) - cursor = conn.cursor() - cursor.execute("UPDATE core_snapshot SET fs_version = '0.8.0' WHERE id = ?", (snapshot_id,)) - conn.commit() - - # Verify the update worked - cursor.execute("SELECT fs_version FROM core_snapshot WHERE id = ?", (snapshot_id,)) - updated_version = cursor.fetchone()[0] - conn.close() - self.assertEqual(updated_version, '0.8.0', "Failed to set fs_version to 0.8.0") - - # Step 5: Trigger migration by running a command that loads and saves the snapshot - # We'll use the Python API directly to trigger save() - import os - import sys - import django - - # Setup Django - os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings') - os.environ['DATA_DIR'] = str(self.work_dir) - - # Add parent dir to path so we can import archivebox - sys.path.insert(0, str(Path(__file__).parent.parent.parent)) - - try: - django.setup() - from archivebox.core.models import Snapshot - - # Load the snapshot (should trigger migration on save) - snapshot = Snapshot.objects.get(url='https://example.com') - - # Verify fs_migration_needed returns True - self.assertTrue(snapshot.fs_migration_needed, - f"fs_migration_needed should be True for fs_version='0.8.0'") - - # Save to trigger migration - print(f"\n[*] Triggering filesystem migration by saving snapshot...") - snapshot.save() - - # Refresh from DB - snapshot.refresh_from_db() - - # Verify migration completed - self.assertEqual(snapshot.fs_version, '0.9.0', - f"Migration failed: fs_version is still '{snapshot.fs_version}'") - self.assertFalse(snapshot.fs_migration_needed, - "fs_migration_needed should be False after migration") - - print(f"[√] Filesystem migration completed: 0.8.0 -> 0.9.0") - - except Exception as e: - self.fail(f"Failed to trigger migration via Django: {e}") - - # Step 6: Verify files still exist and are accessible - # For 0.8 -> 0.9, the migration is a no-op, so files should be in the same place - files_after = list(output_dir.rglob('*')) - 
files_after_count = len([f for f in files_after if f.is_file()]) - - print(f"\n[*] Files after migration: {files_after_count}") - - # Verify no files were lost - self.assertGreaterEqual(files_after_count, files_before_count, - f"Files were lost during migration: {files_before_count} -> {files_after_count}") - - -class TestDBOnlyCommands(unittest.TestCase): - """Test that status/search/list commands only use DB, not filesystem.""" - - def setUp(self): - """Create a temporary directory with 0.8.x schema and data.""" - self.work_dir = Path(tempfile.mkdtemp()) - self.db_path = self.work_dir / 'index.sqlite3' - - create_data_dir_structure(self.work_dir) - conn = sqlite3.connect(str(self.db_path)) - conn.executescript(SCHEMA_0_8) - conn.close() - self.original_data = seed_0_8_data(self.db_path) - - def tearDown(self): - """Clean up temporary directory.""" - shutil.rmtree(self.work_dir, ignore_errors=True) - - def test_status_works_with_empty_archive(self): - """Status command should work with empty archive/ (queries DB only).""" - result = run_archivebox(self.work_dir, ['init'], timeout=45) - self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}") - - # Add a snapshot to DB - result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60) - - # Empty the archive directory (but keep it existing) + # Count archive directories and files BEFORE migration archive_dir = self.work_dir / 'archive' - if archive_dir.exists(): - for item in archive_dir.iterdir(): - if item.is_dir(): - shutil.rmtree(item) - else: - item.unlink() + dirs_before = list(archive_dir.glob('*')) if archive_dir.exists() else [] + dirs_before_count = len([d for d in dirs_before if d.is_dir()]) - # Status should still work (queries DB only, doesn't scan filesystem) - result = run_archivebox(self.work_dir, ['status']) - self.assertEqual(result.returncode, 0, - f"Status should work with empty archive: {result.stderr}") + # Count total files in all archive directories + files_before = [] + for d in dirs_before: + if d.is_dir(): + files_before.extend([f for f in d.rglob('*') if f.is_file()]) + files_before_count = len(files_before) - # Should show count from DB - output = result.stdout + result.stderr - self.assertIn('Total', output, - "Status should show DB statistics even with no files") + # Sample some specific files to check they're preserved + sample_files = [ + 'favicon.ico', + 'screenshot.png', + 'singlefile.html', + 'headers.json', + ] + sample_paths_before = {} + for d in dirs_before: + if d.is_dir(): + for sample_file in sample_files: + matching = list(d.glob(sample_file)) + if matching: + sample_paths_before[f"{d.name}/{sample_file}"] = matching[0] - def test_list_works_with_empty_archive(self): - """List command should work with empty archive/ (queries DB only).""" - result = run_archivebox(self.work_dir, ['init'], timeout=45) - self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}") + print(f"\n[*] Archive directories before migration: {dirs_before_count}") + print(f"[*] Total files before migration: {files_before_count}") + print(f"[*] Sample files found: {len(sample_paths_before)}") - # Add a snapshot to DB - result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60) + # Run init to trigger migration + result = run_archivebox(self.work_dir, ['init'], timeout=60) + self.assertEqual(result.returncode, 0, f"Init (migration) failed: {result.stderr}") - # Empty the archive directory (but keep it existing) - archive_dir = self.work_dir / 'archive' - if 
archive_dir.exists(): - for item in archive_dir.iterdir(): - if item.is_dir(): - shutil.rmtree(item) - else: - item.unlink() + # Count archive directories and files AFTER migration + dirs_after = list(archive_dir.glob('*')) if archive_dir.exists() else [] + dirs_after_count = len([d for d in dirs_after if d.is_dir()]) - # List should still work (queries DB only, doesn't scan filesystem) - result = run_archivebox(self.work_dir, ['list']) - self.assertEqual(result.returncode, 0, - f"List should work with empty archive: {result.stderr}") + files_after = [] + for d in dirs_after: + if d.is_dir(): + files_after.extend([f for f in d.rglob('*') if f.is_file()]) + files_after_count = len(files_after) - # Should show snapshot from DB - output = result.stdout + result.stderr - self.assertIn('example.com', output, - "Snapshot should appear in list output even with no files") + # Verify sample files still exist + sample_paths_after = {} + for d in dirs_after: + if d.is_dir(): + for sample_file in sample_files: + matching = list(d.glob(sample_file)) + if matching: + sample_paths_after[f"{d.name}/{sample_file}"] = matching[0] - def test_search_works_with_empty_archive(self): - """Search command should work with empty archive/ (queries DB only).""" - result = run_archivebox(self.work_dir, ['init'], timeout=45) - self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}") + print(f"[*] Archive directories after migration: {dirs_after_count}") + print(f"[*] Total files after migration: {files_after_count}") + print(f"[*] Sample files found: {len(sample_paths_after)}") - # Add a snapshot to DB - result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60) + # Verify files still in old structure after migration (not moved yet) + self.assertEqual(dirs_before_count, dirs_after_count, + f"Archive directories lost during migration: {dirs_before_count} -> {dirs_after_count}") + self.assertEqual(files_before_count, files_after_count, + f"Files lost during migration: {files_before_count} -> {files_after_count}") - # Empty the archive directory (but keep it existing) - archive_dir = self.work_dir / 'archive' - if archive_dir.exists(): - for item in archive_dir.iterdir(): - if item.is_dir(): - shutil.rmtree(item) - else: - item.unlink() - - # Search should still work (queries DB only, doesn't scan filesystem) - result = run_archivebox(self.work_dir, ['search']) - self.assertEqual(result.returncode, 0, - f"Search should work with empty archive: {result.stderr}") - - # Should show snapshot from DB - output = result.stdout + result.stderr - self.assertIn('example.com', output, - "Snapshot should appear in search output even with no files") - - -class TestUpdateCommandArchitecture(unittest.TestCase): - """Test new update command architecture: filters=DB only, no filters=scan filesystem.""" - - def setUp(self): - """Create a temporary directory with 0.8.x schema and data.""" - self.work_dir = Path(tempfile.mkdtemp()) - self.db_path = self.work_dir / 'index.sqlite3' - create_data_dir_structure(self.work_dir) - - def tearDown(self): - """Clean up temporary directory.""" - shutil.rmtree(self.work_dir, ignore_errors=True) - - def test_update_with_filters_uses_db_only(self): - """Update with filters should only query DB, not scan filesystem.""" - # Initialize with data - conn = sqlite3.connect(str(self.db_path)) - conn.executescript(SCHEMA_0_8) - conn.close() - seed_0_8_data(self.db_path) - - result = run_archivebox(self.work_dir, ['init'], timeout=45) - self.assertEqual(result.returncode, 0, 
f"Init failed: {result.stderr}") - - # Run update with filter - should not scan filesystem - # Use a URL from the seeded data - result = run_archivebox(self.work_dir, ['update', 'example.com'], timeout=120) - # Should complete successfully (or with orchestrator error, which is okay) - # The key is it should not scan filesystem - - def test_update_without_filters_imports_orphans(self): - """Update without filters should scan filesystem and import orphaned directories.""" - # Initialize empty DB - result = run_archivebox(self.work_dir, ['init'], timeout=45) - self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}") - - # Create an orphaned directory in archive/ - timestamp = '1609459200' - orphan_dir = self.work_dir / 'archive' / timestamp - orphan_dir.mkdir(parents=True, exist_ok=True) - - index_data = { - 'url': 'https://orphan.example.com', - 'timestamp': timestamp, - 'title': 'Orphaned Snapshot', - } - (orphan_dir / 'index.json').write_text(json.dumps(index_data)) - (orphan_dir / 'index.html').write_text('Orphan') - - # Count snapshots before update - conn = sqlite3.connect(str(self.db_path)) - cursor = conn.cursor() - cursor.execute("SELECT COUNT(*) FROM core_snapshot") - count_before = cursor.fetchone()[0] - conn.close() - - # Run full update (no filters) - should scan filesystem + # Run update to trigger filesystem reorganization + print(f"\n[*] Running archivebox update to reorganize filesystem...") result = run_archivebox(self.work_dir, ['update'], timeout=120) + self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}") - # Check if orphan was imported - conn = sqlite3.connect(str(self.db_path)) - cursor = conn.cursor() - cursor.execute("SELECT COUNT(*) FROM core_snapshot WHERE url = ?", - ('https://orphan.example.com',)) - orphan_count = cursor.fetchone()[0] - conn.close() + # Check new filesystem structure + # New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext + users_dir = self.work_dir / 'users' + snapshots_base = None - # If update succeeded, orphan should be imported - if result.returncode == 0: - self.assertGreaterEqual(orphan_count, 1, - "Orphaned snapshot should be imported by update") + if users_dir.exists(): + # Find the snapshots directory + for user_dir in users_dir.iterdir(): + if user_dir.is_dir(): + user_snapshots = user_dir / 'snapshots' + if user_snapshots.exists(): + snapshots_base = user_snapshots + break + print(f"[*] New structure base: {snapshots_base}") -class TestTimestampUniqueness(unittest.TestCase): - """Test timestamp uniqueness constraint.""" + # Count files in new structure + # Structure: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/files... 
+ files_new_structure = [] + new_sample_files = {} - def setUp(self): - """Create a temporary directory.""" - self.work_dir = Path(tempfile.mkdtemp()) - self.db_path = self.work_dir / 'index.sqlite3' - create_data_dir_structure(self.work_dir) + if snapshots_base and snapshots_base.exists(): + for date_dir in snapshots_base.iterdir(): + if date_dir.is_dir(): + for domain_dir in date_dir.iterdir(): + if domain_dir.is_dir(): + for snap_dir in domain_dir.iterdir(): + if snap_dir.is_dir(): + # Files are directly in snap-uuid/ directory (no plugin subdirs) + for f in snap_dir.rglob('*'): + if f.is_file(): + files_new_structure.append(f) + # Track sample files + if f.name in sample_files: + new_sample_files[f"{snap_dir.name}/{f.name}"] = f - def tearDown(self): - """Clean up temporary directory.""" - shutil.rmtree(self.work_dir, ignore_errors=True) + files_new_count = len(files_new_structure) + print(f"[*] Files in new structure: {files_new_count}") + print(f"[*] Sample files in new structure: {len(new_sample_files)}") - def test_timestamp_uniqueness_constraint_exists(self): - """Database should have timestamp uniqueness constraint after migration.""" - # Initialize with 0.8.x and migrate - conn = sqlite3.connect(str(self.db_path)) - conn.executescript(SCHEMA_0_8) - conn.close() + # Check old structure (should be gone or empty) + old_archive_dir = self.work_dir / 'archive' + old_files_remaining = [] + unmigrated_dirs = [] + if old_archive_dir.exists(): + for d in old_archive_dir.glob('*'): + # Only count REAL directories, not symlinks (symlinks are the migrated ones) + if not d.is_symlink() and d.is_dir() and d.name.replace('.', '').isdigit(): + # This is a timestamp directory (old structure) + files_in_dir = [f for f in d.rglob('*') if f.is_file()] + if files_in_dir: + unmigrated_dirs.append((d.name, len(files_in_dir))) + old_files_remaining.extend(files_in_dir) - result = run_archivebox(self.work_dir, ['init'], timeout=45) - self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}") + old_files_count = len(old_files_remaining) + print(f"[*] Files remaining in old structure: {old_files_count}") + if unmigrated_dirs: + print(f"[*] Unmigrated directories: {unmigrated_dirs}") - # Check if unique_timestamp constraint exists + # CRITICAL: Verify files were moved to new structure + self.assertGreater(files_new_count, 0, + "No files found in new structure after update") + + # CRITICAL: Verify old structure is cleaned up + self.assertEqual(old_files_count, 0, + f"Old structure not cleaned up: {old_files_count} files still in archive/timestamp/ directories") + + # CRITICAL: Verify all files were moved (total count should match) + total_after_update = files_new_count + old_files_count + self.assertEqual(files_before_count, total_after_update, + f"Files lost during reorganization: {files_before_count} before → {total_after_update} after") + + # CRITICAL: Verify sample files exist in new structure + self.assertGreater(len(new_sample_files), 0, + "Sample files not found in new structure") + + # Verify new path format + for path_key, file_path in new_sample_files.items(): + # Path should contain: snapshots/YYYYMMDD/domain/snap-uuid/output.ext (no plugin subdirs) + path_parts = file_path.parts + self.assertIn('snapshots', path_parts, + f"New path should contain 'snapshots': {file_path}") + self.assertIn('users', path_parts, + f"New path should contain 'users': {file_path}") + print(f" ✓ {path_key} → {file_path.relative_to(self.work_dir)}") + + # Verify Process and Binary records were created conn =
sqlite3.connect(str(self.db_path)) cursor = conn.cursor() - # Query sqlite_master for constraints - cursor.execute(""" - SELECT sql FROM sqlite_master - WHERE type='table' AND name='core_snapshot' - """) - table_sql = cursor.fetchone()[0] + cursor.execute("SELECT COUNT(*) FROM core_archiveresult") + archiveresult_count = cursor.fetchone()[0] + + cursor.execute("SELECT COUNT(*) FROM machine_process") + process_count = cursor.fetchone()[0] + + cursor.execute("SELECT COUNT(*) FROM machine_binary") + binary_count = cursor.fetchone()[0] + + cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NOT NULL") + linked_count = cursor.fetchone()[0] + conn.close() - # Should contain unique_timestamp constraint or UNIQUE(timestamp) - has_constraint = 'unique_timestamp' in table_sql.lower() or \ - 'unique' in table_sql.lower() and 'timestamp' in table_sql.lower() + print(f"[*] ArchiveResults: {archiveresult_count}") + print(f"[*] Process records created: {process_count}") + print(f"[*] Binary records created: {binary_count}") + print(f"[*] ArchiveResults linked to Process: {linked_count}") + + # Verify data migration happened correctly + # The 0.7.2 gold database has 44 ArchiveResults + self.assertEqual(archiveresult_count, 44, + f"Expected 44 ArchiveResults from 0.7.2 database, got {archiveresult_count}") + + # Each ArchiveResult should create one Process record + self.assertEqual(process_count, 44, + f"Expected 44 Process records (1 per ArchiveResult), got {process_count}") + + # The 44 ArchiveResults use 7 unique binaries (curl, wget, etc.) + self.assertEqual(binary_count, 7, + f"Expected 7 unique Binary records, got {binary_count}") + + # ALL ArchiveResults should be linked to Process records + self.assertEqual(linked_count, 44, + f"Expected all 44 ArchiveResults linked to Process, got {linked_count}") + + - self.assertTrue(has_constraint, - f"Timestamp uniqueness constraint should exist. 
Table SQL: {table_sql}") if __name__ == '__main__': diff --git a/tests/test_recursive_crawl.py b/archivebox/tests/test_recursive_crawl.py similarity index 100% rename from tests/test_recursive_crawl.py rename to archivebox/tests/test_recursive_crawl.py diff --git a/tests/test_remove.py b/archivebox/tests/test_remove.py similarity index 100% rename from tests/test_remove.py rename to archivebox/tests/test_remove.py diff --git a/tests/test_schedule.py b/archivebox/tests/test_schedule.py similarity index 100% rename from tests/test_schedule.py rename to archivebox/tests/test_schedule.py diff --git a/tests/test_search.py b/archivebox/tests/test_search.py similarity index 100% rename from tests/test_search.py rename to archivebox/tests/test_search.py diff --git a/tests/test_snapshot.py b/archivebox/tests/test_snapshot.py similarity index 100% rename from tests/test_snapshot.py rename to archivebox/tests/test_snapshot.py diff --git a/tests/test_status.py b/archivebox/tests/test_status.py similarity index 100% rename from tests/test_status.py rename to archivebox/tests/test_status.py diff --git a/tests/test_title.py b/archivebox/tests/test_title.py similarity index 100% rename from tests/test_title.py rename to archivebox/tests/test_title.py diff --git a/tests/test_update.py b/archivebox/tests/test_update.py similarity index 100% rename from tests/test_update.py rename to archivebox/tests/test_update.py diff --git a/tests/test_util.py b/archivebox/tests/test_util.py similarity index 100% rename from tests/test_util.py rename to archivebox/tests/test_util.py diff --git a/tests/test_version.py b/archivebox/tests/test_version.py similarity index 100% rename from tests/test_version.py rename to archivebox/tests/test_version.py diff --git a/archivebox/workers/orchestrator.py b/archivebox/workers/orchestrator.py index 7dbe9a0d..ed8bf832 100644 --- a/archivebox/workers/orchestrator.py +++ b/archivebox/workers/orchestrator.py @@ -76,11 +76,11 @@ class Orchestrator: self.idle_count: int = 0 self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running() - # CRITICAL: In foreground mode (exit_on_idle=True), use ONLY 1 worker - # to keep execution strictly sequential and deterministic + # In foreground mode (exit_on_idle=True), limit workers but allow enough + # for crawl progression: 1 CrawlWorker + 1 SnapshotWorker + 1 ArchiveResultWorker if self.exit_on_idle: self.MAX_WORKERS_PER_TYPE = 1 - self.MAX_TOTAL_WORKERS = 1 + self.MAX_TOTAL_WORKERS = 3 # Allow one worker of each type to run concurrently def __repr__(self) -> str: return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]' @@ -157,32 +157,41 @@ class Orchestrator: self._last_cleanup_time = now return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES) + + def get_running_workers_for_type(self, WorkerClass: Type[Worker]) -> int: + """Get count of running workers for a specific worker type.""" + return len(WorkerClass.get_running_workers()) def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool: """Determine if we should spawn a new worker of the given type.""" if queue_count == 0: return False - + # Check per-type limit running_workers = WorkerClass.get_running_workers() - if len(running_workers) >= self.MAX_WORKERS_PER_TYPE: + running_count = len(running_workers) + + if running_count >= self.MAX_WORKERS_PER_TYPE: return False - + # Check total limit - if self.get_total_worker_count() >= self.MAX_TOTAL_WORKERS: + total_workers = self.get_total_worker_count() + if total_workers >= 
self.MAX_TOTAL_WORKERS: return False - + # Check if we already have enough workers for the queue size # Spawn more gradually - don't flood with workers - if len(running_workers) > 0 and queue_count <= len(running_workers) * WorkerClass.MAX_CONCURRENT_TASKS: + if running_count > 0 and queue_count <= running_count * WorkerClass.MAX_CONCURRENT_TASKS: return False - + return True def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None: """Spawn a new worker process. Returns PID or None if spawn failed.""" try: + print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]') pid = WorkerClass.start(daemon=False, crawl_id=self.crawl_id) + print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]') # CRITICAL: Block until worker registers itself in Process table # This prevents race condition where orchestrator spawns multiple workers @@ -202,6 +211,15 @@ class Orchestrator: # 3. RUNNING status # 4. Parent is this orchestrator # 5. Started recently (within last 10 seconds) + + # Debug: Check all processes with this PID first + if elapsed < 0.5: + all_procs = list(Process.objects.filter(pid=pid)) + print(f'[yellow]DEBUG spawn_worker: elapsed={elapsed:.1f}s pid={pid} orchestrator_id={self.db_process.id}[/yellow]') + print(f'[yellow] Found {len(all_procs)} Process records for pid={pid}[/yellow]') + for p in all_procs: + print(f'[yellow] -> type={p.process_type} status={p.status} parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]') + worker_process = Process.objects.filter( pid=pid, process_type=Process.TypeChoices.WORKER, @@ -212,6 +230,7 @@ class Orchestrator: if worker_process: # Worker successfully registered! + print(f'[green]DEBUG spawn_worker: Worker registered! Returning pid={pid}[/green]') return pid time.sleep(poll_interval) @@ -244,7 +263,7 @@ class Orchestrator: Returns dict of queue sizes by worker type. 
""" queue_sizes = {} - + for WorkerClass in self.WORKER_TYPES: # Get queue for this worker type # Need to instantiate worker to get queue (for model access) @@ -392,11 +411,18 @@ class Orchestrator: def _run_orchestrator_loop(self, progress, task_ids): """Run the main orchestrator loop with optional progress display.""" + last_queue_sizes = {} + last_snapshot_count = None try: while True: # Check queues and spawn workers queue_sizes = self.check_queues_and_spawn_workers() + # Debug queue sizes (only when changed) + if progress and queue_sizes != last_queue_sizes: + progress.console.print(f'[yellow]DEBUG: Queue sizes: {queue_sizes}[/yellow]') + last_queue_sizes = queue_sizes.copy() + # Update progress bars if progress: from archivebox.core.models import Snapshot @@ -412,6 +438,11 @@ class Orchestrator: active_snapshots = list(Snapshot.objects.filter(**snapshot_filter)) + # Debug snapshot count (only when changed) + if len(active_snapshots) != last_snapshot_count: + progress.console.print(f'[yellow]DEBUG: Found {len(active_snapshots)} active snapshots (crawl_id={self.crawl_id})[/yellow]') + last_snapshot_count = len(active_snapshots) + # Track which snapshots are still active active_ids = set() @@ -461,7 +492,9 @@ class Orchestrator: del task_ids[snapshot_id] # Track idle state - if self.has_pending_work(queue_sizes) or self.has_running_workers(): + has_pending = self.has_pending_work(queue_sizes) + has_running = self.has_running_workers() + if has_pending or has_running: self.idle_count = 0 self.on_tick(queue_sizes) else: diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index 7b1127cc..5a0c0980 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -60,8 +60,8 @@ class Worker: # Configuration (can be overridden by subclasses) MAX_TICK_TIME: ClassVar[int] = 60 MAX_CONCURRENT_TASKS: ClassVar[int] = 1 - POLL_INTERVAL: ClassVar[float] = 0.2 # How often to check for new work (seconds) - IDLE_TIMEOUT: ClassVar[int] = 50 # Exit after N idle iterations (10 sec at 0.2 poll interval) + POLL_INTERVAL: ClassVar[float] = 0.1 # How often to check for new work (seconds) + IDLE_TIMEOUT: ClassVar[int] = 100 # Exit after N idle iterations (10 sec at 0.1 poll interval) def __init__(self, worker_id: int = 0, daemon: bool = False, crawl_id: str | None = None, **kwargs: Any): self.worker_id = worker_id @@ -93,7 +93,9 @@ class Worker: Returns the claimed object or None if queue is empty or claim failed. 
""" Model = self.get_model() - obj = self.get_queue().first() + + queue = self.get_queue() + obj = queue.first() if obj is None: return None @@ -132,10 +134,17 @@ class Worker: self.pid = os.getpid() # Register this worker process in the database self.db_process = Process.current() - # Explicitly set process_type to WORKER to prevent mis-detection + # Explicitly set process_type to WORKER and store worker type name + update_fields = [] if self.db_process.process_type != Process.TypeChoices.WORKER: self.db_process.process_type = Process.TypeChoices.WORKER - self.db_process.save(update_fields=['process_type']) + update_fields.append('process_type') + # Store worker type name (crawl/snapshot/archiveresult) in worker_type field + if not self.db_process.worker_type: + self.db_process.worker_type = self.name + update_fields.append('worker_type') + if update_fields: + self.db_process.save(update_fields=update_fields) # Determine worker type for logging worker_type_name = self.__class__.__name__ @@ -316,7 +325,12 @@ class Worker: Process.cleanup_stale_running() # Convert Process objects to dicts to match the expected API contract - processes = Process.get_running(process_type=Process.TypeChoices.WORKER) + # Filter by worker_type to get only workers of this specific type (crawl/snapshot/archiveresult) + processes = Process.objects.filter( + process_type=Process.TypeChoices.WORKER, + worker_type=cls.name, # Filter by specific worker type + status__in=['running', 'started'] + ) # Note: worker_id is not stored on Process model, it's dynamically generated # We return process_id (UUID) and pid (OS process ID) instead return [ @@ -334,7 +348,11 @@ class Worker: """Get count of running workers of this type.""" from archivebox.machine.models import Process - return Process.get_running_count(process_type=Process.TypeChoices.WORKER) + return Process.objects.filter( + process_type=Process.TypeChoices.WORKER, + worker_type=cls.name, # Filter by specific worker type + status__in=['running', 'started'] + ).count() class CrawlWorker(Worker): diff --git a/bin/test_plugins.sh b/bin/test_plugins.sh index 3e8305bf..e3257da6 100755 --- a/bin/test_plugins.sh +++ b/bin/test_plugins.sh @@ -3,18 +3,23 @@ # # All plugin tests use pytest and are located in pluginname/tests/test_*.py # -# Usage: ./bin/test_plugins.sh [plugin_name] [--no-coverage] +# Usage: ./bin/test_plugins.sh [plugin_name] [--no-coverage] [--coverage-report] # # Examples: # ./bin/test_plugins.sh # Run all plugin tests with coverage # ./bin/test_plugins.sh chrome # Run chrome plugin tests with coverage # ./bin/test_plugins.sh parse_* # Run all parse_* plugin tests with coverage # ./bin/test_plugins.sh --no-coverage # Run all tests without coverage +# ./bin/test_plugins.sh --coverage-report # Just show coverage report without running tests # -# Coverage results are saved to .coverage and can be viewed with: -# coverage combine -# coverage report +# For running individual hooks with coverage: +# NODE_V8_COVERAGE=./coverage/js node .js [args] # JS hooks +# coverage run --parallel-mode .py [args] # Python hooks +# +# Coverage results are saved to .coverage (Python) and coverage/js (JavaScript): +# coverage combine && coverage report # coverage json +# ./bin/test_plugins.sh --coverage-report set -e @@ -30,15 +35,134 @@ ROOT_DIR="$(cd "$(dirname "$0")/.." 
&& pwd)" # Parse arguments PLUGIN_FILTER="" ENABLE_COVERAGE=true +COVERAGE_REPORT_ONLY=false for arg in "$@"; do if [ "$arg" = "--no-coverage" ]; then ENABLE_COVERAGE=false + elif [ "$arg" = "--coverage-report" ]; then + COVERAGE_REPORT_ONLY=true else PLUGIN_FILTER="$arg" fi done +# Function to show JS coverage report (inlined from convert_v8_coverage.js) +show_js_coverage() { + local coverage_dir="$1" + + if [ ! -d "$coverage_dir" ] || [ -z "$(ls -A "$coverage_dir" 2>/dev/null)" ]; then + echo "No JavaScript coverage data collected" + echo "(JS hooks may not have been executed during tests)" + return + fi + + node - "$coverage_dir" << 'ENDJS' +const fs = require('fs'); +const path = require('path'); +const coverageDir = process.argv[2]; + +const files = fs.readdirSync(coverageDir).filter(f => f.startsWith('coverage-') && f.endsWith('.json')); +if (files.length === 0) { + console.log('No coverage files found'); + process.exit(0); +} + +const coverageByFile = {}; + +files.forEach(file => { + const data = JSON.parse(fs.readFileSync(path.join(coverageDir, file), 'utf8')); + data.result.forEach(script => { + const url = script.url; + if (url.startsWith('node:') || url.includes('node_modules')) return; + + if (!coverageByFile[url]) { + coverageByFile[url] = { totalRanges: 0, executedRanges: 0 }; + } + + script.functions.forEach(func => { + func.ranges.forEach(range => { + coverageByFile[url].totalRanges++; + if (range.count > 0) coverageByFile[url].executedRanges++; + }); + }); + }); +}); + +const allFiles = Object.keys(coverageByFile).sort(); +const pluginFiles = allFiles.filter(url => url.includes('archivebox/plugins')); +const otherFiles = allFiles.filter(url => !url.startsWith('node:') && !url.includes('archivebox/plugins')); + +console.log('Total files with coverage: ' + allFiles.length + '\n'); +console.log('Plugin files: ' + pluginFiles.length); +console.log('Node internal: ' + allFiles.filter(u => u.startsWith('node:')).length); +console.log('Other: ' + otherFiles.length + '\n'); + +console.log('JavaScript Coverage Report'); +console.log('='.repeat(80)); +console.log(''); + +if (otherFiles.length > 0) { + console.log('Non-plugin files with coverage:'); + otherFiles.forEach(url => console.log(' ' + url)); + console.log(''); +} + +if (pluginFiles.length === 0) { + console.log('No plugin files covered'); + process.exit(0); +} + +let totalRanges = 0, totalExecuted = 0; + +pluginFiles.forEach(url => { + const cov = coverageByFile[url]; + const pct = cov.totalRanges > 0 ? (cov.executedRanges / cov.totalRanges * 100).toFixed(1) : '0.0'; + const match = url.match(/archivebox\/plugins\/.+/); + const displayPath = match ? match[0] : url; + console.log(displayPath + ': ' + pct + '% (' + cov.executedRanges + '/' + cov.totalRanges + ' ranges)'); + totalRanges += cov.totalRanges; + totalExecuted += cov.executedRanges; +}); + +console.log(''); +console.log('-'.repeat(80)); +const overallPct = totalRanges > 0 ? 
(totalExecuted / totalRanges * 100).toFixed(1) : '0.0'; +console.log('Total: ' + overallPct + '% (' + totalExecuted + '/' + totalRanges + ' ranges)'); +ENDJS +} + +# If --coverage-report only, just show the report and exit +if [ "$COVERAGE_REPORT_ONLY" = true ]; then + cd "$ROOT_DIR" || exit 1 + echo "==========================================" + echo "Python Coverage Summary" + echo "==========================================" + coverage combine 2>/dev/null || true + coverage report --include="archivebox/plugins/*" --omit="*/tests/*" + echo "" + + echo "==========================================" + echo "JavaScript Coverage Summary" + echo "==========================================" + show_js_coverage "$ROOT_DIR/coverage/js" + echo "" + + echo "For detailed coverage reports:" + echo " Python: coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'" + echo " Python: coverage json # LLM-friendly format" + echo " Python: coverage html # Interactive HTML report" + exit 0 +fi + +# Set DATA_DIR for tests (required by abx_pkg and plugins) +# Use temp dir to isolate tests from project files +if [ -z "$DATA_DIR" ]; then + export DATA_DIR=$(mktemp -d -t archivebox_plugin_tests.XXXXXX) + # Clean up on exit + trap "rm -rf '$DATA_DIR'" EXIT +fi + # Reset coverage data if collecting coverage if [ "$ENABLE_COVERAGE" = true ]; then echo "Resetting coverage data..." @@ -161,19 +285,14 @@ elif [ $FAILED_PLUGINS -eq 0 ]; then echo "==========================================" echo "JavaScript Coverage Summary" echo "==========================================" - if [ -d "$ROOT_DIR/coverage/js" ] && [ "$(ls -A "$ROOT_DIR/coverage/js" 2>/dev/null)" ]; then - node "$ROOT_DIR/bin/convert_v8_coverage.js" "$ROOT_DIR/coverage/js" - else - echo "No JavaScript coverage data collected" - echo "(JS hooks may not have been executed during tests)" - fi + show_js_coverage "$ROOT_DIR/coverage/js" echo "" echo "For detailed coverage reports (from project root):" echo " Python: coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'" echo " Python: coverage json # LLM-friendly format" echo " Python: coverage html # Interactive HTML report" - echo " JavaScript: node bin/convert_v8_coverage.js coverage/js" + echo " JavaScript: ./bin/test_plugins.sh --coverage-report" fi exit 0 diff --git a/archivebox/Architecture.md b/old/Architecture.md similarity index 100% rename from archivebox/Architecture.md rename to old/Architecture.md diff --git a/TODO_archivebox_jsonl_cli.md b/old/TODO_archivebox_jsonl_cli.md similarity index 100% rename from TODO_archivebox_jsonl_cli.md rename to old/TODO_archivebox_jsonl_cli.md diff --git a/TODO_cli_refactor.md b/old/TODO_cli_refactor.md similarity index 100% rename from TODO_cli_refactor.md rename to old/TODO_cli_refactor.md diff --git a/TODO_hook_concurrency.md b/old/TODO_hook_concurrency.md similarity index 100% rename from TODO_hook_concurrency.md rename to old/TODO_hook_concurrency.md diff --git a/TODO_process_tracking.md b/old/TODO_process_tracking.md similarity index 100% rename from TODO_process_tracking.md rename to old/TODO_process_tracking.md diff --git a/archivebox.ts b/old/archivebox.ts similarity index 100% rename from archivebox.ts rename to old/archivebox.ts diff --git a/tests/__init__.py b/tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/conftest.py b/tests/conftest.py deleted file mode 100644 index 5871ed8e..00000000 --- a/tests/conftest.py +++ /dev/null @@ -1 +0,0 @@ -import pytest diff 
--git a/tests/test_cli_crawl.py b/tests/test_cli_crawl.py deleted file mode 100644 index 40bcceae..00000000 --- a/tests/test_cli_crawl.py +++ /dev/null @@ -1,97 +0,0 @@ -#!/usr/bin/env python3 -""" -Tests for archivebox crawl command. -Verify crawl creates snapshots with depth. -""" - -import os -import subprocess -import sqlite3 - -from .fixtures import * - - -def test_crawl_creates_snapshots(tmp_path, process, disable_extractors_dict): - """Test that crawl command works on existing snapshots.""" - os.chdir(tmp_path) - - # First add a snapshot - subprocess.run( - ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'], - capture_output=True, - env=disable_extractors_dict, - ) - - # Then run crawl on it - result = subprocess.run( - ['archivebox', 'crawl', '--depth=0', 'https://example.com'], - capture_output=True, - env=disable_extractors_dict, - timeout=30, - ) - - assert result.returncode in [0, 1, 2] # May succeed or fail depending on URL - - # Check snapshot was created - conn = sqlite3.connect("index.sqlite3") - c = conn.cursor() - count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] - conn.close() - - assert count == 1 - - -def test_crawl_with_depth_0(tmp_path, process, disable_extractors_dict): - """Test crawl with depth=0 works on existing snapshot.""" - os.chdir(tmp_path) - - # First add a snapshot - subprocess.run( - ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'], - capture_output=True, - env=disable_extractors_dict, - ) - - # Then crawl it - subprocess.run( - ['archivebox', 'crawl', '--depth=0', 'https://example.com'], - capture_output=True, - env=disable_extractors_dict, - timeout=30, - ) - - conn = sqlite3.connect("index.sqlite3") - c = conn.cursor() - count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0] - conn.close() - - # Should have at least 1 snapshot from the add command - assert count >= 1 - - -def test_crawl_creates_crawl_record(tmp_path, process, disable_extractors_dict): - """Test that add+crawl creates Crawl records.""" - os.chdir(tmp_path) - - # First add a snapshot (this creates a Crawl) - subprocess.run( - ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'], - capture_output=True, - env=disable_extractors_dict, - ) - - # Then crawl it - subprocess.run( - ['archivebox', 'crawl', '--depth=0', 'https://example.com'], - capture_output=True, - env=disable_extractors_dict, - timeout=30, - ) - - conn = sqlite3.connect("index.sqlite3") - c = conn.cursor() - crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0] - conn.close() - - # Should have at least 1 crawl from the add command - assert crawl_count >= 1 diff --git a/tests/test_cli_snapshot.py b/tests/test_cli_snapshot.py deleted file mode 100644 index cfb91cc6..00000000 --- a/tests/test_cli_snapshot.py +++ /dev/null @@ -1,63 +0,0 @@ -#!/usr/bin/env python3 -""" -Tests for archivebox snapshot command. -Verify snapshot command works with snapshot IDs/URLs. 
-""" - -import os -import subprocess -import sqlite3 - -from .fixtures import * - - -def test_snapshot_command_works_with_url(tmp_path, process, disable_extractors_dict): - """Test that snapshot command works with URL.""" - os.chdir(tmp_path) - - # Add a snapshot first - subprocess.run( - ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'], - capture_output=True, - env=disable_extractors_dict, - ) - - # Try to view/interact with snapshot - result = subprocess.run( - ['archivebox', 'snapshot', 'https://example.com'], - capture_output=True, - text=True, - env=disable_extractors_dict, - timeout=30, - ) - - # Should complete (exit code depends on implementation) - assert result.returncode in [0, 1, 2] - - -def test_snapshot_command_with_timestamp(tmp_path, process, disable_extractors_dict): - """Test snapshot command with timestamp ID.""" - os.chdir(tmp_path) - - # Add snapshot - subprocess.run( - ['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'], - capture_output=True, - env=disable_extractors_dict, - ) - - # Get snapshot timestamp - conn = sqlite3.connect("index.sqlite3") - c = conn.cursor() - timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0] - conn.close() - - # Try snapshot command with timestamp - result = subprocess.run( - ['archivebox', 'snapshot', str(timestamp)], - capture_output=True, - env=disable_extractors_dict, - timeout=30, - ) - - assert result.returncode in [0, 1, 2]