move tests into subfolder, add missing install hooks

Nick Sweeting
2026-01-02 00:22:07 -08:00
parent c2afb40350
commit 65ee09ceab
80 changed files with 2659 additions and 859 deletions

.gitignore vendored
View File

@@ -39,11 +39,13 @@ tmp/
data/
data*/
output/
logs/
index.sqlite3
queue.sqlite3
*.sqlite*
data.*
.archivebox_id
ArchiveBox.conf
# vim
*.sw?

View File

@@ -158,6 +158,63 @@ env['SAVE_FAVICON'] = 'False'
#### Timeout Settings
Use appropriate timeouts for migration tests (45s for init, 60s default).
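For example, a CLI-based migration test might apply those budgets like this (a minimal sketch; the test name and pytest `tmp_path` fixture are illustrative, not taken from the actual test suite):
```python
import subprocess

def test_init_then_status(tmp_path):
    # `archivebox init` sets up the DB/index, so it gets the longer 45s budget
    init = subprocess.run(['archivebox', 'init'], cwd=tmp_path,
                          capture_output=True, timeout=45)
    assert init.returncode == 0
    # Subsequent commands use the default 60s timeout
    status = subprocess.run(['archivebox', 'status'], cwd=tmp_path,
                            capture_output=True, timeout=60)
    assert status.returncode == 0
```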
### Plugin Testing & Code Coverage
**Target: 80-90% coverage** for critical plugins (screenshot, chrome, singlefile, dom)
```bash
# Run plugin tests with coverage (both Python + JavaScript)
bash bin/test_plugins.sh screenshot
# View coverage reports
bash bin/test_plugins.sh --coverage-report
# Or individual reports:
coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'
```
#### Plugin Test Structure
Tests are **completely isolated** from ArchiveBox - they replicate the production directory structure in temp dirs:
```python
# Correct production paths:
# Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/
# Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
# Crawl-level plugin (e.g., chrome launcher)
crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / 'crawl-123'
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir(parents=True)
# Snapshot-level plugin (e.g., screenshot)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-456'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
# Run hook in its output directory
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=snap-456'],
cwd=str(screenshot_dir),
env=get_test_env(),
capture_output=True,
timeout=120
)
```
#### Coverage Improvement Loop
To improve from ~20% to 80%+:
1. **Run tests**: `bash bin/test_plugins.sh screenshot` → Shows: `19.1% (13/68 ranges)`
2. **Identify gaps**: Check hook file for untested paths (session connection vs fallback, config branches, error cases)
3. **Add tests**: Test both execution paths (connect to session + launch own browser), skip conditions, error cases, config variations
4. **Verify**: Re-run tests → Should show: `85%+ (58+/68 ranges)`
**Critical**: JavaScript hooks have TWO paths that both must be tested (connect to session ~50% + launch browser ~30% + shared ~20%). Testing only one path = max 50% coverage possible!
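A minimal sketch of covering both paths, reusing the isolated temp-dir layout above (`get_test_env()` and `SCREENSHOT_HOOK` come from the plugin's test helpers as in the earlier example; the `CHROME_CDP_URL` variable and `chrome_session_cdp_url` fixture are hypothetical stand-ins for however the hook actually detects an existing session):
```python
import subprocess
import tempfile
from pathlib import Path

def run_screenshot_hook(extra_env: dict) -> subprocess.CompletedProcess:
    """Run the screenshot hook in an isolated snapshot output dir."""
    with tempfile.TemporaryDirectory() as tmpdir:
        out_dir = (Path(tmpdir) / 'users' / 'testuser' / 'snapshots' /
                   '20240101' / 'example.com' / 'snap-456' / 'screenshot')
        out_dir.mkdir(parents=True)
        env = {**get_test_env(), **extra_env}
        return subprocess.run(
            ['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=snap-456'],
            cwd=str(out_dir), env=env, capture_output=True, timeout=120,
        )

def test_launches_own_browser():
    # Fallback path: no session available, the hook must launch its own browser
    assert run_screenshot_hook({}).returncode == 0

def test_connects_to_existing_session(chrome_session_cdp_url):
    # Session path: point the hook at an already-running Chrome (fixture is hypothetical)
    assert run_screenshot_hook({'CHROME_CDP_URL': chrome_session_cdp_url}).returncode == 0
```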
## Database Migrations
### Generate and Apply Migrations

View File

@@ -41,9 +41,11 @@ class ArchiveBoxGroup(click.Group):
archive_commands = {
# High-level commands
'add': 'archivebox.cli.archivebox_add.main',
'remove': 'archivebox.cli.archivebox_remove.main',
'run': 'archivebox.cli.archivebox_run.main',
'update': 'archivebox.cli.archivebox_update.main',
'status': 'archivebox.cli.archivebox_status.main',
'search': 'archivebox.cli.archivebox_search.main',
'config': 'archivebox.cli.archivebox_config.main',
'schedule': 'archivebox.cli.archivebox_schedule.main',
'server': 'archivebox.cli.archivebox_server.main',

View File

@@ -13,8 +13,15 @@ from archivebox.misc.util import docstring, enforce_types
@enforce_types
def install(dry_run: bool=False) -> None:
"""Detect and install ArchiveBox dependencies by running a dependency-check crawl"""
def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bool = False) -> None:
"""Detect and install ArchiveBox dependencies by running a dependency-check crawl
Examples:
archivebox install # Install all dependencies
archivebox install wget curl # Install only wget and curl
archivebox install --binproviders=pip yt-dlp # Install yt-dlp using only pip
archivebox install --binproviders=brew,apt # Install all deps using only brew or apt
"""
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
from archivebox.config.paths import ARCHIVE_DIR
@@ -24,7 +31,14 @@ def install(dry_run: bool=False) -> None:
if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
init() # must init full index because we need a db to store Binary entries in
print('\n[green][+] Detecting ArchiveBox dependencies...[/green]')
# Show what we're installing
if binaries:
print(f'\n[green][+] Installing specific binaries: {", ".join(binaries)}[/green]')
else:
print('\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]')
if binproviders != '*':
print(f'[green][+] Using providers: {binproviders}[/green]')
if IS_ROOT:
EUID = os.geteuid()
@@ -49,6 +63,19 @@ def install(dry_run: bool=False) -> None:
# Using a minimal crawl that will trigger on_Crawl hooks
created_by_id = get_or_create_system_user_pk()
# Build config for this crawl using existing PLUGINS filter
crawl_config = {}
# Combine binary names and provider names into PLUGINS list
plugins = []
if binaries:
plugins.extend(binaries)
if binproviders != '*':
plugins.extend(binproviders.split(','))
if plugins:
crawl_config['PLUGINS'] = ','.join(plugins)
crawl, created = Crawl.objects.get_or_create(
urls='archivebox://install',
defaults={
@@ -56,6 +83,7 @@ def install(dry_run: bool=False) -> None:
'created_by_id': created_by_id,
'max_depth': 0,
'status': 'queued',
'config': crawl_config,
}
)
@@ -63,9 +91,12 @@ def install(dry_run: bool=False) -> None:
if not created:
crawl.status = 'queued'
crawl.retry_at = timezone.now()
crawl.config = crawl_config # Update config
crawl.save()
print(f'[+] Created dependency detection crawl: {crawl.id}')
if crawl_config:
print(f'[+] Crawl config: {crawl_config}')
print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
# Verify the crawl is in the queue
@@ -100,15 +131,15 @@ def install(dry_run: bool=False) -> None:
print()
# Run version to show full status
archivebox_path = shutil.which('archivebox') or sys.executable
if 'python' in archivebox_path:
os.system(f'{sys.executable} -m archivebox version')
else:
os.system(f'{archivebox_path} version')
# Show version to display full status including installed binaries
# Django is already loaded, so just import and call the function directly
from archivebox.cli.archivebox_version import version as show_version
show_version(quiet=False)
@click.command()
@click.argument('binaries', nargs=-1, type=str, required=False)
@click.option('--binproviders', '-p', default='*', help='Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all', show_default=True)
@click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False)
@docstring(install.__doc__)
def main(**kwargs) -> None:

View File

@@ -50,6 +50,9 @@ def get_snapshots(snapshots: Optional[QuerySet]=None,
if filter_patterns:
result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type)
# Prefetch crawl relationship to avoid N+1 queries when accessing output_dir
result = result.select_related('crawl', 'crawl__created_by')
if not result:
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')

View File

@@ -145,16 +145,29 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di
# Check if needs migration (0.8.x → 0.9.x)
if snapshot.fs_migration_needed:
try:
snapshot.save() # Triggers migration + creates symlink
# Manually trigger filesystem migration without full save()
# This avoids UNIQUE constraint issues while still migrating files
cleanup_info = None
if hasattr(snapshot, '_fs_migrate_from_0_8_0_to_0_9_0'):
cleanup_info = snapshot._fs_migrate_from_0_8_0_to_0_9_0()
# Update only fs_version field using queryset update (bypasses validation)
from archivebox.core.models import Snapshot as SnapshotModel
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
# Commit the transaction
transaction.commit()
# Manually call cleanup since we bypassed normal save() flow
if cleanup_info:
old_dir, new_dir = cleanup_info
snapshot._cleanup_old_migration_dir(old_dir, new_dir)
stats['migrated'] += 1
print(f" [{stats['processed']}] Migrated: {entry_path.name}")
except Exception as e:
# Snapshot already exists in DB with different crawl - skip it
if 'UNIQUE constraint failed' in str(e):
stats['skipped'] += 1
print(f" [{stats['processed']}] Skipped (already in DB): {entry_path.name}")
else:
raise
stats['skipped'] += 1
print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
else:
stats['skipped'] += 1

View File

@@ -104,40 +104,47 @@ def version(quiet: bool=False,
failures = []
# Setup Django before importing models
from archivebox.config.django import setup_django
setup_django()
try:
from archivebox.config.django import setup_django
setup_django()
from archivebox.machine.models import Machine, Binary
from archivebox.machine.models import Machine, Binary
machine = Machine.current()
machine = Machine.current()
# Get all binaries from the database
all_installed = Binary.objects.filter(
machine=machine
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
# Get all binaries from the database with timeout protection
all_installed = Binary.objects.filter(
machine=machine
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
if not all_installed.exists():
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
else:
for installed in all_installed:
# Skip if user specified specific binaries and this isn't one
if binaries and installed.name not in binaries:
continue
if not all_installed.exists():
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
else:
for installed in all_installed:
# Skip if user specified specific binaries and this isn't one
if binaries and installed.name not in binaries:
continue
if installed.is_valid:
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
version_str = (installed.version or 'unknown')[:15]
provider = (installed.binprovider or 'env')[:8]
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
else:
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
failures.append(installed.name)
if installed.is_valid:
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
version_str = (installed.version or 'unknown')[:15]
provider = (installed.binprovider or 'env')[:8]
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
else:
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
failures.append(installed.name)
# Show hint if no binaries are installed yet
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
if not has_any_installed:
# Show hint if no binaries are installed yet
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
if not has_any_installed:
prnt()
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
except Exception as e:
# Handle database errors gracefully (locked, missing, etc.)
prnt()
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
prnt('', f'[yellow]Warning: Could not query binaries from database: {e}[/yellow]')
prnt('', '[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]')
if not binaries:
# Show code and data locations

View File

@@ -116,7 +116,7 @@ def upgrade_core_tables(apps, schema_editor):
retry_at DATETIME,
depth INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
fs_version VARCHAR(10) NOT NULL DEFAULT '0.8.0',
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
@@ -326,6 +326,16 @@ class Migration(migrations.Migration):
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
# Declare fs_version (already created in database with DEFAULT '0.8.0')
migrations.AddField(
model_name='snapshot',
name='fs_version',
field=models.CharField(
max_length=10,
default='0.8.0',
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
),
),
# SnapshotTag table already exists from v0.7.2, just declare it in state
migrations.CreateModel(

View File

@@ -150,11 +150,7 @@ class Migration(migrations.Migration):
name='downloaded_at',
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
),
migrations.AddField(
model_name='snapshot',
name='fs_version',
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
),
# NOTE: fs_version already added by migration 0023 with default='0.8.0'
# NOTE: modified_at already added by migration 0023
migrations.AddField(
model_name='snapshot',

View File

@@ -8,7 +8,7 @@ class Migration(migrations.Migration):
dependencies = [
('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'),
('machine', '0003_add_process_type_and_parent'),
('machine', '0007_add_process_type_and_parent'),
]
operations = [

View File

@@ -0,0 +1,388 @@
# Generated by hand on 2026-01-01
# Copies ArchiveResult cmd/pwd/cmd_version data to Process records before removing old fields
from django.db import migrations, connection
import json
from pathlib import Path
def parse_cmd_field(cmd_raw):
"""
Parse cmd field which could be:
1. JSON array string: '["wget", "-p", "url"]'
2. Space-separated string: 'wget -p url'
3. NULL/empty
Returns list of strings.
"""
if not cmd_raw:
return []
cmd_raw = cmd_raw.strip()
if not cmd_raw:
return []
# Try to parse as JSON first
if cmd_raw.startswith('['):
try:
parsed = json.loads(cmd_raw)
if isinstance(parsed, list):
return [str(x) for x in parsed]
except json.JSONDecodeError:
pass
# Fallback: split by spaces (simple approach, doesn't handle quoted strings)
# This is acceptable since old cmd fields were mostly simple commands
return cmd_raw.split()
def get_or_create_current_machine(cursor):
"""Get or create Machine.current() using raw SQL."""
import uuid
import socket
from datetime import datetime
# Simple machine detection - get hostname as guid
hostname = socket.gethostname()
guid = f'host_{hostname}' # Simple but stable identifier
# Check if machine exists
cursor.execute("SELECT id FROM machine_machine WHERE guid = ?", [guid])
row = cursor.fetchone()
if row:
return row[0]
# Create new machine
machine_id = str(uuid.uuid4())
now = datetime.now().isoformat()
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
cursor.execute("PRAGMA table_info(machine_machine)")
machine_cols = {row[1] for row in cursor.fetchall()}
# Build INSERT statement based on available columns
if 'config' in machine_cols:
# 0.9.x schema with config column
cursor.execute("""
INSERT INTO machine_machine (
id, created_at, modified_at, guid, hostname,
hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
os_arch, os_family, os_platform, os_release, os_kernel,
stats, config, num_uses_failed, num_uses_succeeded
) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
'', '', '', '', '', '{}', '{}', 0, 0)
""", [machine_id, now, now, guid, hostname])
else:
# 0.8.x schema without config column
cursor.execute("""
INSERT INTO machine_machine (
id, created_at, modified_at, guid, hostname,
hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
os_arch, os_family, os_platform, os_release, os_kernel,
stats, num_uses_failed, num_uses_succeeded
) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
'', '', '', '', '', '{}', 0, 0)
""", [machine_id, now, now, guid, hostname])
return machine_id
def get_or_create_binary(cursor, machine_id, name, abspath, version):
"""
Get or create Binary record.
Args:
cursor: DB cursor
machine_id: Machine FK
name: Binary name (basename of command)
abspath: Absolute path to binary (or just name if path unknown)
version: Version string
Returns:
binary_id (str)
"""
import uuid
from datetime import datetime
# If abspath is just a name without slashes, it's not a full path
# Store it in both fields for simplicity
if '/' not in abspath:
# Not a full path - store as-is
pass
# Check if binary exists with same machine, name, abspath, version
cursor.execute("""
SELECT id FROM machine_binary
WHERE machine_id = ? AND name = ? AND abspath = ? AND version = ?
""", [machine_id, name, abspath, version])
row = cursor.fetchone()
if row:
return row[0]
# Create new binary
binary_id = str(uuid.uuid4())
now = datetime.now().isoformat()
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
cursor.execute("PRAGMA table_info(machine_binary)")
binary_cols = {row[1] for row in cursor.fetchall()}
# Use only columns that exist in current schema
# 0.8.x schema: id, created_at, modified_at, machine_id, name, binprovider, abspath, version, sha256, num_uses_failed, num_uses_succeeded
# 0.9.x schema adds: binproviders, overrides, status, retry_at, output_dir
if 'binproviders' in binary_cols:
# 0.9.x schema
cursor.execute("""
INSERT INTO machine_binary (
id, created_at, modified_at, machine_id,
name, binproviders, overrides, binprovider, abspath, version, sha256,
status, retry_at, output_dir,
num_uses_failed, num_uses_succeeded
) VALUES (?, ?, ?, ?, ?, 'env', '{}', 'env', ?, ?, '',
'succeeded', NULL, '', 0, 0)
""", [binary_id, now, now, machine_id, name, abspath, version])
else:
# 0.8.x schema (simpler)
cursor.execute("""
INSERT INTO machine_binary (
id, created_at, modified_at, machine_id,
name, binprovider, abspath, version, sha256,
num_uses_failed, num_uses_succeeded
) VALUES (?, ?, ?, ?, ?, 'env', ?, ?, '', 0, 0)
""", [binary_id, now, now, machine_id, name, abspath, version])
return binary_id
def map_status(old_status):
"""
Map old ArchiveResult status to Process status and exit_code.
Args:
old_status: One of: queued, started, backoff, succeeded, failed, skipped
Returns:
(process_status, exit_code) tuple
"""
status_map = {
'queued': ('queued', None),
'started': ('running', None),
'backoff': ('queued', None),
'succeeded': ('exited', 0),
'failed': ('exited', 1),
'skipped': ('exited', None), # Skipped = exited without error
}
return status_map.get(old_status, ('queued', None))
def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at, ended_at, binary_id):
"""
Create a Process record.
Returns:
process_id (str)
"""
import uuid
from datetime import datetime
process_id = str(uuid.uuid4())
now = datetime.now().isoformat()
# Convert cmd array to JSON
cmd_json = json.dumps(cmd)
# Set retry_at to now for queued processes, NULL otherwise
retry_at = now if status == 'queued' else None
cursor.execute("""
INSERT INTO machine_process (
id, created_at, modified_at, machine_id, parent_id, process_type,
pwd, cmd, env, timeout,
pid, exit_code, stdout, stderr,
started_at, ended_at,
binary_id, iface_id, url,
status, retry_at
) VALUES (?, ?, ?, ?, NULL, 'cli',
?, ?, '{}', 120,
NULL, ?, '', '',
?, ?,
?, NULL, NULL,
?, ?)
""", [
process_id, now, now, machine_id,
pwd, cmd_json,
exit_code,
started_at, ended_at,
binary_id,
status, retry_at
])
return process_id
def copy_archiveresult_data_to_process(apps, schema_editor):
"""
Copy old ArchiveResult execution data (cmd, pwd, cmd_version) to Process records.
For each ArchiveResult without a process_id:
1. Parse cmd field (handle both JSON array and space-separated string)
2. Extract binary name/path from cmd[0]
3. Get or create Binary record with machine, name, abspath, version
4. Create Process record with mapped fields
5. Link ArchiveResult.process_id to new Process
Status mapping:
- queued → queued (exit_code=None)
- started → running (exit_code=None)
- backoff → queued (exit_code=None)
- succeeded → exited (exit_code=0)
- failed → exited (exit_code=1)
- skipped → exited (exit_code=None)
"""
cursor = connection.cursor()
# Check if old fields still exist (skip if fresh install or already migrated)
cursor.execute("PRAGMA table_info(core_archiveresult)")
cols = {row[1] for row in cursor.fetchall()}
print(f'DEBUG 0027: Columns found: {sorted(cols)}')
print(f'DEBUG 0027: Has cmd={("cmd" in cols)}, pwd={("pwd" in cols)}, cmd_version={("cmd_version" in cols)}, process_id={("process_id" in cols)}')
if 'cmd' not in cols or 'pwd' not in cols or 'cmd_version' not in cols:
print('✓ Fresh install or fields already removed - skipping data copy')
return
# Check if process_id field exists (should exist from 0026)
if 'process_id' not in cols:
print('✗ ERROR: process_id field not found. Migration 0026 must run first.')
return
# Get or create Machine.current()
machine_id = get_or_create_current_machine(cursor)
# Get ArchiveResults without process_id that have cmd data
# Use plugin (extractor was renamed to plugin in migration 0025)
cursor.execute("""
SELECT id, snapshot_id, plugin, cmd, pwd, cmd_version,
status, start_ts, end_ts, created_at
FROM core_archiveresult
WHERE process_id IS NULL
AND (cmd IS NOT NULL OR pwd IS NOT NULL)
""")
results = cursor.fetchall()
if not results:
print('✓ No ArchiveResults need Process migration')
return
print(f'Migrating {len(results)} ArchiveResults to Process records...')
migrated_count = 0
skipped_count = 0
error_count = 0
for i, row in enumerate(results):
ar_id, snapshot_id, plugin, cmd_raw, pwd, cmd_version, status, start_ts, end_ts, created_at = row
if i == 0:
print(f'DEBUG 0027: First row: ar_id={ar_id}, plugin={plugin}, cmd={cmd_raw[:50] if cmd_raw else None}, status={status}')
try:
# Parse cmd field
cmd_array = parse_cmd_field(cmd_raw)
if i == 0:
print(f'DEBUG 0027: Parsed cmd: {cmd_array}')
# Extract binary info from cmd[0] if available
binary_id = None
if cmd_array and cmd_array[0]:
binary_name = Path(cmd_array[0]).name or plugin # Fallback to plugin name
binary_abspath = cmd_array[0]
binary_version = cmd_version or ''
# Get or create Binary record
binary_id = get_or_create_binary(
cursor, machine_id, binary_name, binary_abspath, binary_version
)
if i == 0:
print(f'DEBUG 0027: Created Binary: id={binary_id}, name={binary_name}')
# Map status
process_status, exit_code = map_status(status)
# Set timestamps
started_at = start_ts or created_at
ended_at = end_ts if process_status == 'exited' else None
# Create Process record
process_id = create_process(
cursor=cursor,
machine_id=machine_id,
pwd=pwd or '',
cmd=cmd_array,
status=process_status,
exit_code=exit_code,
started_at=started_at,
ended_at=ended_at,
binary_id=binary_id,
)
if i == 0:
print(f'DEBUG 0027: Created Process: id={process_id}')
# Link ArchiveResult to Process
cursor.execute(
"UPDATE core_archiveresult SET process_id = ? WHERE id = ?",
[process_id, ar_id]
)
migrated_count += 1
if i == 0:
print(f'DEBUG 0027: Linked ArchiveResult to Process')
except Exception as e:
print(f'✗ Error migrating ArchiveResult {ar_id}: {e}')
import traceback
traceback.print_exc()
error_count += 1
continue
print(f'✓ Migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors')
class Migration(migrations.Migration):
dependencies = [
('core', '0026_add_process_to_archiveresult'),
('machine', '0007_add_process_type_and_parent'),
]
operations = [
# First, copy data from old fields to Process
migrations.RunPython(
copy_archiveresult_data_to_process,
reverse_code=migrations.RunPython.noop,
),
# Now safe to remove old fields (moved from 0025)
migrations.RemoveField(
model_name='archiveresult',
name='cmd',
),
migrations.RemoveField(
model_name='archiveresult',
name='cmd_version',
),
migrations.RemoveField(
model_name='archiveresult',
name='pwd',
),
]

View File

@@ -362,24 +362,22 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Migrate filesystem if needed (happens automatically on save)
if self.pk and self.fs_migration_needed:
from django.db import transaction
with transaction.atomic():
# Walk through migration chain automatically
current = self.fs_version
target = self._fs_current_version()
# Walk through migration chain automatically
current = self.fs_version
target = self._fs_current_version()
while current != target:
next_ver = self._fs_next_version(current)
method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
while current != target:
next_ver = self._fs_next_version(current)
method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
# Only run if method exists (most are no-ops)
if hasattr(self, method):
getattr(self, method)()
# Only run if method exists (most are no-ops)
if hasattr(self, method):
getattr(self, method)()
current = next_ver
current = next_ver
# Update version (still in transaction)
self.fs_version = target
# Update version
self.fs_version = target
super().save(*args, **kwargs)
if self.url not in self.crawl.urls:
@@ -486,33 +484,58 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Convert index.json to index.jsonl in the new directory
self.convert_index_json_to_jsonl()
# Create backwards-compat symlink (INSIDE transaction)
symlink_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
if symlink_path.is_symlink():
symlink_path.unlink()
# Schedule cleanup AFTER transaction commits successfully
# This ensures DB changes are committed before we delete old files
from django.db import transaction
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, new_dir))
if not symlink_path.exists() or symlink_path == old_dir:
symlink_path.symlink_to(new_dir, target_is_directory=True)
# Return cleanup info for manual cleanup if needed (when called directly)
return (old_dir, new_dir)
# Schedule old directory deletion AFTER transaction commits
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir))
def _cleanup_old_migration_dir(self, old_dir: Path):
def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path):
"""
Delete old directory after successful migration.
Delete old directory and create symlink after successful migration.
Called via transaction.on_commit() after DB commit succeeds.
"""
import shutil
import logging
print(f"[DEBUG] _cleanup_old_migration_dir called: old_dir={old_dir}, new_dir={new_dir}")
# Delete old directory
if old_dir.exists() and not old_dir.is_symlink():
print(f"[DEBUG] Attempting to delete old directory: {old_dir}")
try:
shutil.rmtree(old_dir)
print(f"[DEBUG] Successfully deleted old directory: {old_dir}")
except Exception as e:
# Log but don't raise - migration succeeded, this is just cleanup
print(f"[DEBUG] Failed to delete old directory {old_dir}: {e}")
logging.getLogger('archivebox.migration').warning(
f"Could not remove old migration directory {old_dir}: {e}"
)
return # Don't create symlink if cleanup failed
else:
print(f"[DEBUG] Old directory doesn't exist or is already a symlink: {old_dir}")
# Create backwards-compat symlink (after old dir is deleted)
symlink_path = old_dir # Same path as old_dir
if symlink_path.is_symlink():
print(f"[DEBUG] Unlinking existing symlink: {symlink_path}")
symlink_path.unlink()
if not symlink_path.exists():
print(f"[DEBUG] Creating symlink: {symlink_path} -> {new_dir}")
try:
symlink_path.symlink_to(new_dir, target_is_directory=True)
print(f"[DEBUG] Successfully created symlink")
except Exception as e:
print(f"[DEBUG] Failed to create symlink: {e}")
logging.getLogger('archivebox.migration').warning(
f"Could not create symlink from {symlink_path} to {new_dir}: {e}"
)
else:
print(f"[DEBUG] Symlink path already exists: {symlink_path}")
# =========================================================================
# Path Calculation and Migration Helpers
@@ -1616,8 +1639,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
This enables step-based execution where all hooks in a step can run in parallel.
"""
from archivebox.hooks import discover_hooks
from archivebox.config.configset import get_config
hooks = discover_hooks('Snapshot')
# Get merged config with crawl-specific PLUGINS filter
config = get_config(crawl=self.crawl, snapshot=self)
hooks = discover_hooks('Snapshot', config=config)
archiveresults = []
for hook_path in hooks:
@@ -2212,22 +2238,19 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
started = State(value=Snapshot.StatusChoices.STARTED)
sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
# Tick Event
# Tick Event (polled by workers)
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished', on='on_started_to_started') |
started.to(sealed, cond='is_finished')
queued.to(started, cond='can_start')
)
# Manual event (triggered by last ArchiveResult finishing)
seal = started.to(sealed)
def can_start(self) -> bool:
can_start = bool(self.snapshot.url)
return can_start
def is_finished(self) -> bool:
"""Check if snapshot processing is complete - delegates to model method."""
return self.snapshot.is_finished_processing()
@queued.enter
def enter_queued(self):
self.snapshot.update_and_requeue(
@@ -2237,29 +2260,34 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
@started.enter
def enter_started(self):
# lock the snapshot while we create the pending archiveresults
self.snapshot.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
)
import sys
print(f'[cyan] 🔄 SnapshotMachine.enter_started() - creating archiveresults for {self.snapshot.url}[/cyan]', file=sys.stderr)
# Run the snapshot - creates pending archiveresults for all enabled plugins
self.snapshot.run()
# unlock the snapshot after we're done + set status = started
self.snapshot.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s
status=Snapshot.StatusChoices.STARTED,
)
# Check if any archiveresults were created
ar_count = self.snapshot.archiveresult_set.count()
print(f'[cyan] 🔄 ArchiveResult count: {ar_count}[/cyan]', file=sys.stderr)
def on_started_to_started(self):
"""Called when Snapshot stays in started state (archiveresults not finished yet)."""
# Bump retry_at so we check again in a few seconds
self.snapshot.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=5),
)
if ar_count == 0:
# No archiveresults created, seal immediately
print(f'[cyan] 🔄 No archiveresults created, sealing snapshot immediately[/cyan]', file=sys.stderr)
self.seal()
else:
# Set status = started with retry_at far future (so workers don't claim us - we're waiting for ARs)
# Last AR will manually call self.seal() when done
self.snapshot.update_and_requeue(
retry_at=timezone.now() + timedelta(days=365),
status=Snapshot.StatusChoices.STARTED,
)
print(f'[cyan] 🔄 {ar_count} archiveresults created, waiting for them to finish[/cyan]', file=sys.stderr)
@sealed.enter
def enter_sealed(self):
import sys
# Clean up background hooks
self.snapshot.cleanup()
@@ -2268,6 +2296,21 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
status=Snapshot.StatusChoices.SEALED,
)
print(f'[cyan] ✅ SnapshotMachine.enter_sealed() - sealed {self.snapshot.url}[/cyan]', file=sys.stderr)
# Check if this is the last snapshot for the parent crawl - if so, seal the crawl
if self.snapshot.crawl:
crawl = self.snapshot.crawl
remaining_active = Snapshot.objects.filter(
crawl=crawl,
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
).count()
if remaining_active == 0:
print(f'[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]', file=sys.stderr)
# Seal the parent crawl
crawl.sm.seal()
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
class StatusChoices(models.TextChoices):
@@ -3102,8 +3145,30 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
end_ts=None,
)
def _check_and_seal_parent_snapshot(self):
"""Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot."""
import sys
snapshot = self.archiveresult.snapshot
# Check if all archiveresults are finished (in final states)
remaining_active = snapshot.archiveresult_set.exclude(
status__in=[
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
]
).count()
if remaining_active == 0:
print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr)
# Seal the parent snapshot
snapshot.sm.seal()
@succeeded.enter
def enter_succeeded(self):
import sys
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.SUCCEEDED,
@@ -3113,8 +3178,15 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
self.archiveresult.cascade_health_update(success=True)
print(f'[cyan] ✅ ArchiveResult succeeded: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/cyan]', file=sys.stderr)
# Check if this is the last AR to finish - seal parent snapshot if so
self._check_and_seal_parent_snapshot()
@failed.enter
def enter_failed(self):
import sys
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
@@ -3124,16 +3196,25 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
self.archiveresult.cascade_health_update(success=False)
print(f'[red] ❌ ArchiveResult failed: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/red]', file=sys.stderr)
# Check if this is the last AR to finish - seal parent snapshot if so
self._check_and_seal_parent_snapshot()
@skipped.enter
def enter_skipped(self):
import sys
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,
end_ts=timezone.now(),
)
def after_transition(self, event: str, source: State, target: State):
self.archiveresult.snapshot.update_and_requeue() # bump snapshot retry time so it picks up all the new changes
print(f'[dim] ⏭️ ArchiveResult skipped: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/dim]', file=sys.stderr)
# Check if this is the last AR to finish - seal parent snapshot if so
self._check_and_seal_parent_snapshot()
# =============================================================================

View File

@@ -240,19 +240,26 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
if not first_url:
raise ValueError(f'Crawl {self.id} has no URLs to create root snapshot from')
# Try to get existing snapshot
try:
return Snapshot.objects.get(crawl=self, url=first_url)
snapshot = Snapshot.objects.get(crawl=self, url=first_url)
# If exists and already queued/started, return it as-is
if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]:
# Update retry_at to now so it can be picked up immediately
snapshot.retry_at = timezone.now()
snapshot.save(update_fields=['retry_at'])
return snapshot
except Snapshot.DoesNotExist:
pass
root_snapshot, _ = Snapshot.objects.update_or_create(
crawl=self, url=first_url,
defaults={
'status': Snapshot.INITIAL_STATE,
'retry_at': timezone.now(),
'timestamp': str(timezone.now().timestamp()),
'depth': 0,
},
# Create new snapshot
root_snapshot = Snapshot.objects.create(
crawl=self,
url=first_url,
status=Snapshot.INITIAL_STATE,
retry_at=timezone.now(),
timestamp=str(timezone.now().timestamp()),
depth=0,
)
return root_snapshot
@@ -362,14 +369,14 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
return created_snapshots
def run(self) -> 'Snapshot':
def run(self) -> 'Snapshot | None':
"""
Execute this Crawl: run hooks, process JSONL, create snapshots.
Called by the state machine when entering the 'started' state.
Returns:
The root Snapshot for this crawl
The root Snapshot for this crawl, or None for system crawls that don't create snapshots
"""
import time
from pathlib import Path
@@ -407,8 +414,18 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# Foreground hook - process JSONL records
records = result.get('records', [])
if records:
print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]')
for record in records[:3]: # Show first 3
print(f' Record: type={record.get("type")}, keys={list(record.keys())[:5]}')
overrides = {'crawl': self}
process_hook_records(records, overrides=overrides)
stats = process_hook_records(records, overrides=overrides)
if stats:
print(f'[green]✓ Created: {stats}[/green]')
# System crawls (archivebox://*) don't create snapshots - they just run hooks
if first_url.startswith('archivebox://'):
return None
# Create snapshots from URLs
root_snapshot = self.create_root_snapshot()
@@ -498,14 +515,15 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
started = State(value=Crawl.StatusChoices.STARTED)
sealed = State(value=Crawl.StatusChoices.SEALED, final=True)
# Tick Event
# Tick Event (polled by workers)
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished', on='on_started_to_started') |
started.to(sealed, cond='is_finished')
queued.to(started, cond='can_start')
)
# Manual event (triggered by last Snapshot sealing)
seal = started.to(sealed)
def can_start(self) -> bool:
if not self.crawl.urls:
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
@@ -516,55 +534,38 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
return False
return True
def is_finished(self) -> bool:
from archivebox.core.models import Snapshot
# Check if any snapshots exist for this crawl
snapshots = Snapshot.objects.filter(crawl=self.crawl)
# If no snapshots exist, allow finishing (e.g., archivebox://install crawls that only run hooks)
if not snapshots.exists():
return True
# If snapshots exist, check if all are sealed
# Snapshots handle their own background hooks via the step system,
# so we just need to wait for all snapshots to reach sealed state
if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
return False
return True
@started.enter
def enter_started(self):
# Lock the crawl by bumping retry_at so other workers don't pick it up while we create snapshots
self.crawl.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=30), # Lock for 30 seconds
)
import sys
from archivebox.core.models import Snapshot
print(f'[cyan]🔄 CrawlMachine.enter_started() - creating snapshots for {self.crawl.id}[/cyan]', file=sys.stderr)
try:
# Run the crawl - runs hooks, processes JSONL, creates snapshots
self.crawl.run()
root_snapshot = self.crawl.run()
if root_snapshot:
print(f'[cyan]🔄 Created root snapshot: {root_snapshot.url}[/cyan]', file=sys.stderr)
# Update status to STARTED
# Set retry_at to far future so workers don't claim us (we're waiting for snapshots to finish)
# Last snapshot will manually call self.seal() when done
self.crawl.update_and_requeue(
retry_at=timezone.now() + timedelta(days=365),
status=Crawl.StatusChoices.STARTED,
)
else:
# No snapshots (system crawl like archivebox://install)
print(f'[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr)
# Seal immediately since there's no work to do
self.seal()
# Update status to STARTED once snapshots are created
# Set retry_at to future so we don't busy-loop - wait for snapshots to process
self.crawl.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=5), # Check again in 5s
status=Crawl.StatusChoices.STARTED,
)
except Exception as e:
print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
import traceback
traceback.print_exc()
# Re-raise so the worker knows it failed
raise
def on_started_to_started(self):
"""Called when Crawl stays in started state (snapshots not sealed yet)."""
# Bump retry_at so we check again in a few seconds
self.crawl.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=5),
)
@sealed.enter
def enter_sealed(self):
# Clean up background hooks and run on_CrawlEnd hooks

View File

@@ -480,7 +480,7 @@ def run_hook(
returncode=returncode,
stdout=stdout,
stderr=stderr,
output_json=output_json,
output_json=None, # Legacy field, we now use records for JSONL
output_files=new_files,
duration_ms=duration_ms,
hook=str(script),
@@ -922,10 +922,14 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
if plugins_whitelist:
# PLUGINS whitelist is specified - only enable plugins in the list
plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()]
import sys
print(f"DEBUG: PLUGINS whitelist='{plugins_whitelist}', checking plugin '{plugin_name}', plugin_names={plugin_names}", file=sys.stderr)
if plugin_name.lower() not in plugin_names:
# Plugin not in whitelist - explicitly disabled
print(f"DEBUG: Plugin '{plugin_name}' NOT in whitelist, disabling", file=sys.stderr)
enabled = False
else:
print(f"DEBUG: Plugin '{plugin_name}' IS in whitelist, enabling", file=sys.stderr)
# Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
@@ -935,6 +939,8 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
enabled = enabled.lower() not in ('false', '0', 'no', '')
else:
# No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
import sys
print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_name}_ENABLED", file=sys.stderr)
enabled_key = f'{plugin_upper}_ENABLED'
enabled = config.get(enabled_key)
if enabled is None:

View File

@@ -0,0 +1,72 @@
# Generated by hand on 2026-01-01
# Converges machine app for 0.8.6rc0 → 0.9.x migration path
# Drops old InstalledBinary table and ensures Binary table exists
from django.db import migrations, connection
def converge_binary_table(apps, schema_editor):
"""
Drop machine_installedbinary if it exists (0.8.6rc0 path).
Create machine_binary if it doesn't exist (needed by Process model).
"""
cursor = connection.cursor()
# Check what tables exist
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name IN ('machine_installedbinary', 'machine_binary')")
existing_tables = {row[0] for row in cursor.fetchall()}
print(f'DEBUG 0005: Existing tables: {existing_tables}')
# Drop old InstalledBinary table if it exists (0.8.6rc0 path)
if 'machine_installedbinary' in existing_tables:
print('✓ Dropping machine_installedbinary table (0.8.6rc0 divergence)')
cursor.execute("DROP TABLE IF EXISTS machine_installedbinary")
# Create Binary table if it doesn't exist
# This handles the case where 0.8.6rc0's 0001_initial didn't create it
if 'machine_binary' not in existing_tables:
print('✓ Creating machine_binary table with correct schema')
cursor.execute("""
CREATE TABLE machine_binary (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL,
modified_at DATETIME NOT NULL,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
machine_id TEXT NOT NULL REFERENCES machine_machine(id) ON DELETE CASCADE,
name VARCHAR(63) NOT NULL,
binproviders VARCHAR(255) NOT NULL DEFAULT 'env',
overrides TEXT NOT NULL DEFAULT '{}',
binprovider VARCHAR(63) NOT NULL DEFAULT 'env',
abspath VARCHAR(255) NOT NULL,
version VARCHAR(128) NOT NULL,
sha256 VARCHAR(64) NOT NULL DEFAULT '',
status VARCHAR(16) NOT NULL DEFAULT 'succeeded',
retry_at DATETIME NULL,
output_dir VARCHAR(255) NOT NULL DEFAULT ''
)
""")
# Create indexes
cursor.execute("CREATE INDEX machine_binary_machine_id_idx ON machine_binary(machine_id)")
cursor.execute("CREATE INDEX machine_binary_name_idx ON machine_binary(name)")
cursor.execute("CREATE INDEX machine_binary_abspath_idx ON machine_binary(abspath)")
print('✓ machine_binary table created')
else:
print('✓ machine_binary table already exists')
class Migration(migrations.Migration):
dependencies = [
('machine', '0001_initial'),
]
operations = [
migrations.RunPython(
converge_binary_table,
reverse_code=migrations.RunPython.noop,
),
]

View File

@@ -9,7 +9,7 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('machine', '0001_initial'),
('machine', '0005_converge_binary_model'),
]
operations = [

View File

@@ -7,7 +7,7 @@ from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('machine', '0002_process'),
('machine', '0006_process'),
]
operations = [

View File

@@ -0,0 +1,18 @@
# Generated by Django 6.0 on 2026-01-02 03:36
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('machine', '0007_add_process_type_and_parent'),
]
operations = [
migrations.AddField(
model_name='process',
name='worker_type',
field=models.CharField(blank=True, db_index=True, default='', help_text='Worker type name for WORKER processes (crawl, snapshot, archiveresult)', max_length=32),
),
]

View File

@@ -203,13 +203,14 @@ class BinaryManager(models.Manager):
class Binary(ModelWithHealthStats):
"""
Tracks an binary on a specific machine.
Tracks a binary on a specific machine.
Follows the unified state machine pattern:
Simple state machine with 2 states:
- queued: Binary needs to be installed
- started: Installation in progress
- succeeded: Binary installed successfully (abspath, version, sha256 populated)
- failed: Installation failed
- installed: Binary installed successfully (abspath, version, sha256 populated)
Installation is synchronous during queued→installed transition.
If installation fails, Binary stays in queued with retry_at set for later retry.
State machine calls run() which executes on_Binary__install_* hooks
to install the binary using the specified providers.
@@ -217,9 +218,7 @@ class Binary(ModelWithHealthStats):
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
SUCCEEDED = 'succeeded', 'Succeeded'
FAILED = 'failed', 'Failed'
INSTALLED = 'installed', 'Installed'
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
@@ -323,8 +322,31 @@ class Binary(ModelWithHealthStats):
machine = Machine.current()
overrides = overrides or {}
# Case 1: From binaries.jsonl - create queued binary
if 'binproviders' in record or ('overrides' in record and not record.get('abspath')):
# Case 1: Already installed (from on_Crawl hooks) - has abspath AND binproviders
# This happens when on_Crawl hooks detect already-installed binaries
abspath = record.get('abspath')
version = record.get('version')
binproviders = record.get('binproviders')
if abspath and version and binproviders:
# Binary is already installed, create INSTALLED record with binproviders filter
binary, _ = Binary.objects.update_or_create(
machine=machine,
name=name,
defaults={
'abspath': abspath,
'version': version,
'sha256': record.get('sha256', ''),
'binprovider': record.get('binprovider', 'env'),
'binproviders': binproviders, # Preserve the filter
'status': Binary.StatusChoices.INSTALLED,
'retry_at': None,
}
)
return binary
# Case 2: From binaries.jsonl - create queued binary (needs installation)
if 'binproviders' in record or ('overrides' in record and not abspath):
binary, created = Binary.objects.get_or_create(
machine=machine,
name=name,
@@ -337,25 +359,23 @@ class Binary(ModelWithHealthStats):
)
return binary
# Case 2: From hook output - update with installation results
abspath = record.get('abspath')
version = record.get('version')
if not abspath or not version:
return None
# Case 3: From on_Binary__install hook output - update with installation results
if abspath and version:
binary, _ = Binary.objects.update_or_create(
machine=machine,
name=name,
defaults={
'abspath': abspath,
'version': version,
'sha256': record.get('sha256', ''),
'binprovider': record.get('binprovider', 'env'),
'status': Binary.StatusChoices.INSTALLED,
'retry_at': None,
}
)
return binary
binary, _ = Binary.objects.update_or_create(
machine=machine,
name=name,
defaults={
'abspath': abspath,
'version': version,
'sha256': record.get('sha256', ''),
'binprovider': record.get('binprovider', 'env'),
'status': Binary.StatusChoices.SUCCEEDED,
'retry_at': None,
}
)
return binary
return None
@property
def OUTPUT_DIR(self):
@@ -403,8 +423,7 @@ class Binary(ModelWithHealthStats):
# Discover ALL on_Binary__install_* hooks
hooks = discover_hooks('Binary', config=config)
if not hooks:
self.status = self.StatusChoices.FAILED
self.save()
# No hooks available - stay queued, will retry later
return
# Run each hook - they decide if they can handle this binary
@@ -456,15 +475,21 @@ class Binary(ModelWithHealthStats):
self.version = record.get('version', '')
self.sha256 = record.get('sha256', '')
self.binprovider = record.get('binprovider', 'env')
self.status = self.StatusChoices.SUCCEEDED
self.status = self.StatusChoices.INSTALLED
self.save()
# Symlink binary into LIB_BIN_DIR if configured
from django.conf import settings
lib_bin_dir = getattr(settings, 'LIB_BIN_DIR', None)
if lib_bin_dir:
self.symlink_to_lib_bin(lib_bin_dir)
return
except json.JSONDecodeError:
continue
# No hook succeeded
self.status = self.StatusChoices.FAILED
self.save()
# No hook succeeded - leave status as QUEUED (will retry later)
# Don't set to FAILED since we don't have that status anymore
def cleanup(self):
"""
@@ -484,10 +509,75 @@ class Binary(ModelWithHealthStats):
for plugin_dir in output_dir.iterdir():
if not plugin_dir.is_dir():
continue
pid_file = plugin_dir / 'hook.pid'
cmd_file = plugin_dir / 'cmd.sh'
safe_kill_process(pid_file, cmd_file)
def symlink_to_lib_bin(self, lib_bin_dir: str | Path) -> Path | None:
"""
Symlink this binary into LIB_BIN_DIR for unified PATH management.
After a binary is installed by any binprovider (pip, npm, brew, apt, etc),
we symlink it into LIB_BIN_DIR so that:
1. All binaries can be found in a single directory
2. PATH only needs LIB_BIN_DIR prepended (not multiple provider-specific paths)
3. Binary priorities are clear (symlink points to the canonical install location)
Args:
lib_bin_dir: Path to LIB_BIN_DIR (e.g., /data/lib/arm64-darwin/bin)
Returns:
Path to the created symlink, or None if symlinking failed
Example:
>>> binary = Binary.objects.get(name='yt-dlp')
>>> binary.symlink_to_lib_bin('/data/lib/arm64-darwin/bin')
Path('/data/lib/arm64-darwin/bin/yt-dlp')
"""
import sys
from pathlib import Path
if not self.abspath:
return None
binary_abspath = Path(self.abspath).resolve()
lib_bin_dir = Path(lib_bin_dir).resolve()
# Create LIB_BIN_DIR if it doesn't exist
try:
lib_bin_dir.mkdir(parents=True, exist_ok=True)
except (OSError, PermissionError) as e:
print(f"Failed to create LIB_BIN_DIR {lib_bin_dir}: {e}", file=sys.stderr)
return None
# Get binary name (last component of path)
binary_name = binary_abspath.name
symlink_path = lib_bin_dir / binary_name
# Remove existing symlink/file if it exists
if symlink_path.exists() or symlink_path.is_symlink():
try:
# Check if it's already pointing to the right place
if symlink_path.is_symlink() and symlink_path.resolve() == binary_abspath:
# Already correctly symlinked, nothing to do
return symlink_path
# Remove old symlink/file
symlink_path.unlink()
except (OSError, PermissionError) as e:
print(f"Failed to remove existing file at {symlink_path}: {e}", file=sys.stderr)
return None
# Create new symlink
try:
symlink_path.symlink_to(binary_abspath)
print(f"Symlinked {binary_name} -> {symlink_path}", file=sys.stderr)
return symlink_path
except (OSError, PermissionError) as e:
print(f"Failed to create symlink {symlink_path} -> {binary_abspath}: {e}", file=sys.stderr)
return None
# =============================================================================
# Process Model
@@ -627,6 +717,16 @@ class Process(models.Model):
help_text='Type of process (cli, worker, orchestrator, binary, supervisord)'
)
# Worker type (only for WORKER processes: crawl, snapshot, archiveresult)
worker_type = models.CharField(
max_length=32,
default='',
null=False,
blank=True,
db_index=True,
help_text='Worker type name for WORKER processes (crawl, snapshot, archiveresult)'
)
# Execution metadata
pwd = models.CharField(max_length=512, default='', null=False, blank=True,
help_text='Working directory for process execution')
@@ -895,11 +995,16 @@ class Process(models.Model):
ppid = os.getppid()
machine = machine or Machine.current()
# Debug logging
import sys
print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", file=sys.stderr)
# Get parent process start time from OS
try:
os_parent = psutil.Process(ppid)
os_parent_start = os_parent.create_time()
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
print(f"DEBUG _find_parent_process: Parent process {ppid} not accessible", file=sys.stderr)
return None # Parent process doesn't exist
# Find matching Process record
@@ -910,12 +1015,18 @@ class Process(models.Model):
started_at__gte=timezone.now() - PID_REUSE_WINDOW,
).order_by('-started_at')
print(f"DEBUG _find_parent_process: Found {candidates.count()} candidates for ppid={ppid}", file=sys.stderr)
for candidate in candidates:
if candidate.started_at:
db_start_time = candidate.started_at.timestamp()
if abs(db_start_time - os_parent_start) < START_TIME_TOLERANCE:
time_diff = abs(db_start_time - os_parent_start)
print(f"DEBUG _find_parent_process: Checking candidate id={candidate.id} time_diff={time_diff:.2f}s tolerance={START_TIME_TOLERANCE}s", file=sys.stderr)
if time_diff < START_TIME_TOLERANCE:
print(f"DEBUG _find_parent_process: MATCH! Returning parent id={candidate.id} pid={candidate.pid}", file=sys.stderr)
return candidate
print(f"DEBUG _find_parent_process: No matching parent found for ppid={ppid}", file=sys.stderr)
return None # No matching ArchiveBox parent process
@classmethod
@@ -1584,69 +1695,38 @@ class BinaryMachine(BaseStateMachine, strict_states=True):
"""
State machine for managing Binary installation lifecycle.
Hook Lifecycle:
Simple 2-state machine:
┌─────────────────────────────────────────────────────────────┐
│ QUEUED State │
│ • Binary needs to be installed │
└─────────────────────────────────────────────────────────────┘
↓ tick() when can_start()
↓ tick() when can_install()
↓ Synchronous installation during transition
┌─────────────────────────────────────────────────────────────┐
│ STARTED State → enter_started()
1. binary.run()
• discover_hooks('Binary') → all on_Binary__install_*
│ • Try each provider hook in sequence: │
│ - run_hook(script, output_dir, ...) │
│ - If returncode == 0: │
│ * Read stdout.log │
│ * Parse JSONL for 'Binary' record with abspath │
│ * Update self: abspath, version, sha256, provider │
│ * Set status=SUCCEEDED, RETURN │
│ • If no hook succeeds: set status=FAILED │
└─────────────────────────────────────────────────────────────┘
↓ tick() checks status
┌─────────────────────────────────────────────────────────────┐
│ SUCCEEDED / FAILED │
│ • Set by binary.run() based on hook results │
│ • Health stats incremented (num_uses_succeeded/failed) │
INSTALLED State
• Binary installed (abspath, version, sha256 set)
• Health stats incremented
└─────────────────────────────────────────────────────────────┘
If installation fails, Binary stays in QUEUED with retry_at bumped.
"""
model_attr_name = 'binary'
# States
queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
started = State(value=Binary.StatusChoices.STARTED)
succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
failed = State(value=Binary.StatusChoices.FAILED, final=True)
installed = State(value=Binary.StatusChoices.INSTALLED, final=True)
# Tick Event - transitions based on conditions
# Tick Event - install happens during transition
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(succeeded, cond='is_succeeded') |
started.to(failed, cond='is_failed')
queued.to.itself(unless='can_install') |
queued.to(installed, cond='can_install', on='on_install')
)
def can_start(self) -> bool:
def can_install(self) -> bool:
"""Check if binary installation can start."""
return bool(self.binary.name and self.binary.binproviders)
def is_succeeded(self) -> bool:
"""Check if installation succeeded (status was set by run())."""
return self.binary.status == Binary.StatusChoices.SUCCEEDED
def is_failed(self) -> bool:
"""Check if installation failed (status was set by run())."""
return self.binary.status == Binary.StatusChoices.FAILED
def is_finished(self) -> bool:
"""Check if installation has completed (success or failure)."""
return self.binary.status in (
Binary.StatusChoices.SUCCEEDED,
Binary.StatusChoices.FAILED,
)
@queued.enter
def enter_queued(self):
"""Binary is queued for installation."""
@@ -1655,43 +1735,48 @@ class BinaryMachine(BaseStateMachine, strict_states=True):
status=Binary.StatusChoices.QUEUED,
)
@started.enter
def enter_started(self):
"""Start binary installation."""
# Lock the binary while installation runs
self.binary.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=300), # 5 min timeout for installation
status=Binary.StatusChoices.STARTED,
)
def on_install(self):
"""Called during queued→installed transition. Runs installation synchronously."""
import sys
# Run installation hooks
print(f'[cyan] 🔄 BinaryMachine.on_install() - installing {self.binary.name}[/cyan]', file=sys.stderr)
# Run installation hooks (synchronous, updates abspath/version/sha256 and sets status)
self.binary.run()
# Save updated status (run() updates status to succeeded/failed)
self.binary.save()
# Check if installation succeeded by looking at updated status
# Note: Binary.run() updates self.binary.status internally but doesn't refresh our reference
self.binary.refresh_from_db()
@succeeded.enter
def enter_succeeded(self):
if self.binary.status != Binary.StatusChoices.INSTALLED:
# Installation failed - abort transition, stay in queued
print(f'[red] ❌ BinaryMachine - {self.binary.name} installation failed, retrying later[/red]', file=sys.stderr)
# Bump retry_at to try again later
self.binary.update_and_requeue(
retry_at=timezone.now() + timedelta(seconds=300), # Retry in 5 minutes
status=Binary.StatusChoices.QUEUED, # Ensure we stay queued
)
# Increment health stats for failure
self.binary.increment_health_stats(success=False)
# Abort the transition - this will raise an exception and keep us in queued
raise Exception(f'Binary {self.binary.name} installation failed')
print(f'[cyan] ✅ BinaryMachine - {self.binary.name} installed successfully[/cyan]', file=sys.stderr)
@installed.enter
def enter_installed(self):
"""Binary installed successfully."""
self.binary.update_and_requeue(
retry_at=None,
status=Binary.StatusChoices.SUCCEEDED,
status=Binary.StatusChoices.INSTALLED,
)
# Increment health stats
self.binary.increment_health_stats(success=True)
@failed.enter
def enter_failed(self):
"""Binary installation failed."""
self.binary.update_and_requeue(
retry_at=None,
status=Binary.StatusChoices.FAILED,
)
# Increment health stats
self.binary.increment_health_stats(success=False)
# =============================================================================
# Process State Machine
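The BinaryMachine above folds installation into the queued→installed transition: `can_install` gates the tick, `on_install` runs `binary.run()` synchronously, and raising inside `on_install` aborts the transition so the record stays queued for a later retry. A minimal sketch of that pattern with python-statemachine (class and names here are illustrative, not ArchiveBox code):

```python
from statemachine import StateMachine, State


class InstallMachine(StateMachine):
    """Toy two-state machine mirroring the queued -> installed tick pattern."""
    queued = State(initial=True)
    installed = State(final=True)

    # Same shape as BinaryMachine.tick: stay queued until the condition holds,
    # then perform the install synchronously during the transition itself.
    tick = (
        queued.to.itself(unless="can_install")
        | queued.to(installed, cond="can_install", on="on_install")
    )

    def can_install(self) -> bool:
        # Stands in for "binary.name and binary.binproviders are set"
        return True

    def on_install(self):
        # Stands in for binary.run(); raising here is how the real machine
        # aborts the transition and leaves the Binary queued with retry_at bumped.
        print("installing...")


machine = InstallMachine()
machine.tick()
assert machine.current_state.id == "installed"
```

The real machine additionally persists status and health stats in enter_installed, which the sketch omits.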

View File

@@ -80,8 +80,7 @@ class TestAccessibilityWithChrome(TestCase):
# Run accessibility hook with the active Chrome session
result = subprocess.run(
['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,

View File

View File

@@ -39,30 +39,36 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
CHROME_NAVIGATE_HOOK,
)
# Get LIB_DIR and NODE_MODULES_DIR from shared helpers
LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = get_node_modules_dir()
NPM_PREFIX = LIB_DIR / 'npm'
# Chromium install location (relative to DATA_DIR)
CHROMIUM_INSTALL_DIR = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
@pytest.fixture(scope="session", autouse=True)
def ensure_chromium_and_puppeteer_installed():
"""Ensure Chromium and puppeteer are installed before running tests."""
def ensure_chromium_and_puppeteer_installed(tmp_path_factory):
"""Ensure Chromium and puppeteer are installed before running tests.
Puppeteer handles Chromium installation automatically in its own cache.
We only need to install puppeteer itself to LIB_DIR/npm.
"""
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
# Set DATA_DIR if not already set (required by abx_pkg)
if not os.environ.get('DATA_DIR'):
# Use isolated temp dir for direct pytest runs
test_data_dir = tmp_path_factory.mktemp('chrome_test_data')
os.environ['DATA_DIR'] = str(test_data_dir)
# Compute paths AFTER setting DATA_DIR
lib_dir = get_lib_dir()
node_modules_dir = get_node_modules_dir()
npm_prefix = lib_dir / 'npm'
# Rebuild pydantic models
NpmProvider.model_rebuild()
# Install puppeteer-core if not available
puppeteer_core_path = NODE_MODULES_DIR / 'puppeteer-core'
# Install puppeteer if not available (it will handle Chromium in its own cache)
puppeteer_core_path = node_modules_dir / 'puppeteer-core'
if not puppeteer_core_path.exists():
print(f"\n[*] Installing puppeteer to {NPM_PREFIX}...")
NPM_PREFIX.mkdir(parents=True, exist_ok=True)
print(f"\n[*] Installing puppeteer to {npm_prefix}...")
npm_prefix.mkdir(parents=True, exist_ok=True)
provider = NpmProvider(npm_prefix=NPM_PREFIX)
provider = NpmProvider(npm_prefix=npm_prefix)
try:
binary = Binary(
name='puppeteer',
@@ -70,36 +76,25 @@ def ensure_chromium_and_puppeteer_installed():
overrides={'npm': {'packages': ['puppeteer@^23.5.0']}}
)
binary.install()
print(f"[*] Puppeteer installed successfully to {NPM_PREFIX}")
print(f"[*] Puppeteer installed successfully to {npm_prefix}")
except Exception as e:
pytest.skip(f"Failed to install puppeteer: {e}")
# Install Chromium via @puppeteer/browsers if not available
# Find Chromium binary (puppeteer installs it automatically in its cache)
chromium_binary = find_chromium_binary()
if not chromium_binary:
print(f"\n[*] Installing Chromium to {CHROMIUM_INSTALL_DIR}...")
CHROMIUM_INSTALL_DIR.mkdir(parents=True, exist_ok=True)
result = subprocess.run(
['npx', '@puppeteer/browsers', 'install', 'chromium@latest'],
cwd=str(CHROMIUM_INSTALL_DIR.parent),
capture_output=True,
text=True,
timeout=300
)
if result.returncode != 0:
pytest.skip(f"Failed to install Chromium: {result.stderr}")
chromium_binary = find_chromium_binary()
if not chromium_binary:
pytest.skip("Chromium installed but binary not found")
print(f"[*] Chromium installed: {chromium_binary}")
pytest.skip("Chromium not found - puppeteer should install it automatically")
# Set CHROME_BINARY env var for tests
os.environ['CHROME_BINARY'] = chromium_binary
# Get paths from helpers (will use DATA_DIR if set, or compute based on __file__)
LIB_DIR = get_lib_dir()
NODE_MODULES_DIR = get_node_modules_dir()
NPM_PREFIX = LIB_DIR / 'npm'
def test_hook_scripts_exist():
"""Verify chrome hooks exist."""
assert CHROME_LAUNCH_HOOK.exists(), f"Hook not found: {CHROME_LAUNCH_HOOK}"
@@ -208,8 +203,7 @@ def test_chrome_launch_and_tab_creation():
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-123', '--crawl-id=test-crawl-123'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,
@@ -269,8 +263,7 @@ def test_chrome_navigation():
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123', '--crawl-id=test-crawl-nav'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,
@@ -281,8 +274,7 @@ def test_chrome_navigation():
# Navigate to URL
result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
@@ -417,8 +409,7 @@ def test_multiple_snapshots_share_chrome():
# Create tab for this snapshot
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), f'--url=https://example.com/{snap_num}', f'--snapshot-id=snap-{snap_num}', '--crawl-id=test-multi-crawl'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,

View File

@@ -80,8 +80,7 @@ class TestConsolelogWithChrome(TestCase):
# Run consolelog hook with the active Chrome session
result = subprocess.run(
['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir,
env=get_test_env()),
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120, # Longer timeout as it waits for navigation

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Detect gallery-dl binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if gallery-dl is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
gallerydl_enabled = get_env_bool('GALLERYDL_ENABLED', True)
gallerydl_binary = get_env('GALLERYDL_BINARY', 'gallery-dl')
if not gallerydl_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=gallerydl_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='gallery-dl')
else:
# Binary not found
output_binary_missing(name='gallery-dl', binproviders='pip')
except Exception:
# Binary not found
output_binary_missing(name='gallery-dl', binproviders='pip')
sys.exit(0)
if __name__ == '__main__':
main()
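Each of these install-detection hooks emits at most one Binary JSONL record on stdout: a "found" record with abspath/version/sha256 and binprovider='env', or a "missing" record carrying only the name and the binproviders that could install it. A rough sketch of how a caller might tell the two apart when collecting hook output (the hook filename below is a placeholder, not the actual path):

```python
import json
import subprocess
import sys

# Placeholder filename: run one detection hook and inspect its Binary record(s).
proc = subprocess.run(
    [sys.executable, 'on_Crawl__XX_gallerydl_install.py'],
    capture_output=True, text=True, timeout=60,
)
for line in proc.stdout.splitlines():
    line = line.strip()
    if not line.startswith('{'):
        continue
    record = json.loads(line)
    if record.get('type') != 'Binary':
        continue
    if record.get('abspath'):
        # Already installed: abspath/version/sha256 populated, binprovider='env'
        print(f"found {record['name']} {record['version']} at {record['abspath']}")
    else:
        # Missing: only name + binproviders, so a BinaryMachine can install it later
        print(f"{record['name']} missing, installable via {record['binproviders']}")
```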

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Detect git binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if git is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
git_enabled = get_env_bool('GIT_ENABLED', True)
git_binary = get_env('GIT_BINARY', 'git')
if not git_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=git_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='git')
else:
# Binary not found
output_binary_missing(name='git', binproviders='apt,brew')
except Exception:
# Binary not found
output_binary_missing(name='git', binproviders='apt,brew')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -123,8 +123,7 @@ def test_scrolls_page_and_outputs_stats():
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
cwd=str(infiniscroll_dir,
env=get_test_env()),
cwd=str(infiniscroll_dir),
capture_output=True,
text=True,
timeout=60,
@@ -188,8 +187,7 @@ def test_config_scroll_limit_honored():
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
cwd=str(infiniscroll_dir,
env=get_test_env()),
cwd=str(infiniscroll_dir),
capture_output=True,
text=True,
timeout=60,
@@ -248,8 +246,7 @@ def test_config_timeout_honored():
start_time = time.time()
result = subprocess.run(
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
cwd=str(infiniscroll_dir,
env=get_test_env()),
cwd=str(infiniscroll_dir),
capture_output=True,
text=True,
timeout=30,

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Detect mercury-parser binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if mercury-parser is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
mercury_enabled = get_env_bool('MERCURY_ENABLED', True)
mercury_binary = get_env('MERCURY_BINARY', 'mercury-parser')
if not mercury_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=mercury_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='mercury-parser')
else:
# Binary not found
output_binary_missing(name='mercury-parser', binproviders='npm')
except Exception:
# Binary not found
output_binary_missing(name='mercury-parser', binproviders='npm')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -0,0 +1,83 @@
#!/usr/bin/env python3
"""
Detect readability-extractor binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if readability is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'overrides': {
'packages': ['git+https://github.com/ArchiveBox/readability-extractor.git'],
},
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
readability_enabled = get_env_bool('READABILITY_ENABLED', True)
readability_binary = get_env('READABILITY_BINARY', 'readability-extractor')
if not readability_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=readability_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='readability-extractor')
else:
# Binary not found
output_binary_missing(name='readability-extractor', binproviders='npm')
except Exception:
# Binary not found
output_binary_missing(name='readability-extractor', binproviders='npm')
sys.exit(0)
if __name__ == '__main__':
main()
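Unlike the other detectors, the readability hook's "missing" record also carries an `overrides` key pointing npm at a git URL rather than a published package. The chrome test fixture above installs puppeteer with the same overrides mechanism; a hedged sketch of how such a record could translate into an abx_pkg install (the npm prefix path is assumed, not an ArchiveBox default):

```python
from pathlib import Path
from abx_pkg import Binary, NpmProvider

# Assumed npm prefix; in the tests this comes from get_lib_dir() / 'npm'
npm_prefix = Path('/tmp/archivebox-lib/npm')
npm_prefix.mkdir(parents=True, exist_ok=True)

NpmProvider.model_rebuild()
provider = NpmProvider(npm_prefix=npm_prefix)

# Map the hook's "missing" record + overrides onto an install call
binary = Binary(
    name='readability-extractor',
    binproviders=[provider],
    overrides={'npm': {'packages': ['git+https://github.com/ArchiveBox/readability-extractor.git']}},
)
binary.install()
print(f"installed readability-extractor under {npm_prefix}")
```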

View File

@@ -27,11 +27,21 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
run_hook_and_parse,
LIB_DIR,
NODE_MODULES_DIR,
CHROME_PLUGIN_DIR,
)
# Import chrome test fixture to ensure puppeteer is installed
from archivebox.plugins.chrome.tests.test_chrome import ensure_chromium_and_puppeteer_installed
PLUGIN_DIR = get_plugin_dir(__file__)
SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')
# Get Chrome hooks for setting up sessions
CHROME_LAUNCH_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Crawl__*_chrome_launch.*')
CHROME_TAB_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Snapshot__*_chrome_tab.*')
CHROME_NAVIGATE_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Snapshot__*_chrome_navigate.*')
TEST_URL = 'https://example.com'
@@ -53,18 +63,162 @@ def test_verify_deps_with_abx_pkg():
def test_extracts_screenshot_from_example_com():
"""Test full workflow: extract screenshot from real example.com via hook."""
# Prerequisites checked by earlier test
"""Test full workflow: extract screenshot from real example.com via hook.
Replicates production directory structure:
DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/chrome/
DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snap-id}/chrome/
DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snap-id}/screenshot/
This exercises the "connect to existing session" code path, which is the primary
path in production and accounts for ~50% of the code.
"""
import signal
import time
import os
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Replicate exact production directory structure
data_dir = Path(tmpdir)
crawl_id = 'test-screenshot-crawl'
snapshot_id = 'test-screenshot-snap'
# Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/
crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / crawl_id
chrome_dir = crawl_dir / 'chrome'
chrome_dir.mkdir(parents=True)
# Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / snapshot_id
snapshot_chrome_dir = snapshot_dir / 'chrome'
snapshot_chrome_dir.mkdir(parents=True)
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir()
# Run screenshot extraction hook
env = get_test_env()
print(f"\n[DEBUG] NODE_V8_COVERAGE={env.get('NODE_V8_COVERAGE', 'NOT SET')}", file=sys.stderr)
env['CHROME_HEADLESS'] = 'true'
# Step 1: Launch Chrome session at crawl level (background process)
chrome_launch_process = subprocess.Popen(
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
cwd=str(chrome_dir),
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
env=env
)
# Wait for Chrome to launch
for i in range(15):
if chrome_launch_process.poll() is not None:
stdout, stderr = chrome_launch_process.communicate()
pytest.fail(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
if (chrome_dir / 'cdp_url.txt').exists():
break
time.sleep(1)
assert (chrome_dir / 'cdp_url.txt').exists(), "Chrome CDP URL file should exist"
assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
try:
# Step 2: Create tab at snapshot level
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,
env=env
)
assert result.returncode == 0, f"Tab creation failed: {result.stderr}"
assert (snapshot_chrome_dir / 'cdp_url.txt').exists(), "Snapshot CDP URL should exist"
# Step 3: Navigate to URL
result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Navigation failed: {result.stderr}"
assert (snapshot_chrome_dir / 'navigation.json').exists(), "Navigation JSON should exist"
# Step 4: Take screenshot (should connect to existing session)
# Screenshot hook runs in screenshot/ dir and looks for ../chrome/cdp_url.txt
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}\nStdout: {result.stdout}"
# Parse JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
assert 'screenshot.png' in result_json['output_str'], f"Output should be screenshot.png: {result_json}"
# Verify filesystem output
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), f"screenshot.png not created at {screenshot_file}"
# Verify file is valid PNG
file_size = screenshot_file.stat().st_size
assert file_size > 1000, f"Screenshot too small: {file_size} bytes"
assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes"
# Check PNG magic bytes
screenshot_data = screenshot_file.read_bytes()
assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file"
finally:
# Cleanup: Kill Chrome
try:
chrome_launch_process.send_signal(signal.SIGTERM)
chrome_launch_process.wait(timeout=5)
except:
pass
try:
os.kill(chrome_pid, signal.SIGKILL)
except OSError:
pass
def test_extracts_screenshot_without_session():
"""Test screenshot extraction without existing Chrome session (fallback to own browser)."""
with tempfile.TemporaryDirectory() as tmpdir:
# Create proper snapshot directory structure
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-fallback'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
# Don't set up Chrome session or staticfile - screenshot should launch its own browser
env = get_test_env()
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
cwd=tmpdir,
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-fallback'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
@@ -73,7 +227,7 @@ def test_extracts_screenshot_from_example_com():
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
# Parse JSONL output (clean format without RESULT_JSON= prefix)
# Parse JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
@@ -88,20 +242,54 @@ def test_extracts_screenshot_from_example_com():
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
assert result_json['output_str'] == 'screenshot.png'
assert 'screenshot.png' in result_json['output_str']
# Verify filesystem output (hook creates screenshot.png directly in working dir)
screenshot_file = tmpdir / 'screenshot.png'
# Verify file created
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "screenshot.png not created"
assert screenshot_file.stat().st_size > 1000, "Screenshot too small"
# Verify file is valid PNG
file_size = screenshot_file.stat().st_size
assert file_size > 1000, f"Screenshot too small: {file_size} bytes"
assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes"
# Check PNG magic bytes
screenshot_data = screenshot_file.read_bytes()
assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file"
def test_skips_when_staticfile_exists():
"""Test that screenshot skips when staticfile extractor already handled the URL."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-skip'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
# Create staticfile output to simulate staticfile extractor already ran
staticfile_dir = snapshot_dir / 'staticfile'
staticfile_dir.mkdir()
(staticfile_dir / 'index.html').write_text('<html></html>')
env = get_test_env()
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-skip'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=30,
env=env
)
assert result.returncode == 0, f"Should exit successfully: {result.stderr}"
# Should emit skipped status
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'skipped', f"Should skip: {result_json}"
def test_config_save_screenshot_false_skips():
@@ -134,13 +322,11 @@ def test_config_save_screenshot_false_skips():
def test_reports_missing_chrome():
"""Test that script reports error when Chrome is not found."""
import os
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Set CHROME_BINARY to nonexistent path
env = os.environ.copy()
env = get_test_env()
env['CHROME_BINARY'] = '/nonexistent/chrome'
result = subprocess.run(
@@ -158,6 +344,59 @@ def test_reports_missing_chrome():
assert 'chrome' in combined.lower() or 'browser' in combined.lower() or 'ERROR=' in combined
def test_custom_resolution_and_user_agent():
"""Test that CHROME_RESOLUTION and CHROME_USER_AGENT configs are respected."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-config'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_RESOLUTION'] = '800,600'
env['CHROME_USER_AGENT'] = 'Test/1.0'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-config'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "screenshot.png not created"
# Resolution affects file size
assert screenshot_file.stat().st_size > 500, "Screenshot too small"
def test_ssl_check_disabled():
"""Test that CHROME_CHECK_SSL_VALIDITY=False allows invalid certificates."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ssl'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_CHECK_SSL_VALIDITY'] = 'False'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ssl'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Should succeed: {result.stderr}"
assert (screenshot_dir / 'screenshot.png').exists()
def test_config_timeout_honored():
"""Test that CHROME_TIMEOUT config is respected."""
import os
@@ -182,5 +421,410 @@ def test_config_timeout_honored():
assert result.returncode in (0, 1), "Should complete without hanging"
def test_missing_url_argument():
"""Test that hook fails gracefully when URL argument is missing."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = get_test_env()
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), '--snapshot-id=test-missing-url'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should exit with error
assert result.returncode != 0, "Should fail when URL is missing"
assert 'Usage:' in result.stderr or 'url' in result.stderr.lower()
def test_missing_snapshot_id_argument():
"""Test that hook fails gracefully when snapshot-id argument is missing."""
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
env = get_test_env()
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}'],
cwd=tmpdir,
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should exit with error
assert result.returncode != 0, "Should fail when snapshot-id is missing"
assert 'Usage:' in result.stderr or 'snapshot' in result.stderr.lower()
def test_invalid_resolution_format():
"""Test that invalid CHROME_RESOLUTION format is handled gracefully."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-badres'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Invalid resolution formats to test parseResolution error handling
for bad_resolution in ['invalid', '1440', '1440x2000', 'abc,def']:
env['CHROME_RESOLUTION'] = bad_resolution
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-badres'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should either fail gracefully or fall back to default
# (depending on implementation - script should not crash with uncaught error)
assert result.returncode in (0, 1), f"Script should handle bad resolution: {bad_resolution}"
def test_boolean_env_var_parsing():
"""Test that boolean environment variables are parsed correctly."""
import time
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-bool'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Test various boolean formats for CHROME_HEADLESS
for bool_val in ['true', '1', 'yes', 'on', 'True', 'TRUE']:
env['CHROME_HEADLESS'] = bool_val
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-bool'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should either succeed or fail, but shouldn't crash on boolean parsing
assert result.returncode in (0, 1), f"Should handle boolean value: {bool_val}"
# Clean up screenshot file if created
screenshot_file = screenshot_dir / 'screenshot.png'
if screenshot_file.exists():
screenshot_file.unlink()
time.sleep(0.5) # Brief pause between attempts
def test_integer_env_var_parsing():
"""Test that integer environment variables are parsed correctly."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-int'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Test valid and invalid integer formats for CHROME_TIMEOUT
test_cases = [
('60', True), # Valid integer
('invalid', True), # Invalid - should use default
('', True), # Empty - should use default
]
for timeout_val, should_work in test_cases:
env['CHROME_TIMEOUT'] = timeout_val
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-int'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should either succeed or fail gracefully, but shouldn't crash on int parsing
assert result.returncode in (0, 1), f"Should handle timeout value: {timeout_val}"
# Clean up screenshot file if created
screenshot_file = screenshot_dir / 'screenshot.png'
if screenshot_file.exists():
screenshot_file.unlink()
def test_extracts_screenshot_with_all_config_options():
"""Test screenshot with comprehensive config to exercise all code paths."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-full'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
# Set ALL config options to exercise all code paths
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
env['CHROME_RESOLUTION'] = '800,600'
env['CHROME_USER_AGENT'] = 'TestBot/1.0'
env['CHROME_CHECK_SSL_VALIDITY'] = 'false' # Exercises checkSsl branch
env['CHROME_TIMEOUT'] = '60'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-full'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Screenshot should succeed: {result.stderr}"
# Verify JSONL output with success
result_json = None
for line in result.stdout.strip().split('\n'):
if line.strip().startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
assert result_json, "Should have ArchiveResult JSONL output"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
assert 'screenshot.png' in result_json['output_str']
# Verify file created
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "screenshot.png should be created"
assert screenshot_file.stat().st_size > 1000, "Screenshot should have content"
def test_headless_mode_false():
"""Test headless=false code path specifically."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-headless'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Explicitly test headless=false (exercises the ternary false branch)
env['CHROME_HEADLESS'] = 'false'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-headless-false'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should work or fail gracefully
assert result.returncode in (0, 1), f"Headless=false should handle: {result.stderr}"
def test_invalid_url_causes_error():
"""Test error path with invalid URL that causes navigation failure."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-invalid'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_TIMEOUT'] = '5' # Short timeout
# Use invalid URL to trigger error path
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), '--url=http://this-domain-does-not-exist-12345.invalid', '--snapshot-id=snap-invalid'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should fail due to navigation error
assert result.returncode != 0, "Should fail on invalid URL"
# Should NOT emit JSONL (transient error)
jsonl_lines = [line for line in result.stdout.strip().split('\n') if line.strip().startswith('{')]
assert len(jsonl_lines) == 0, f"Should not emit JSONL on error: {jsonl_lines}"
def test_with_corrupted_cdp_url_falls_back():
"""Test that corrupted CDP URL file causes fallback to launching browser."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-corrupt-cdp'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
# Create chrome directory with corrupted CDP URL
chrome_dir = snapshot_dir / 'chrome'
chrome_dir.mkdir()
(chrome_dir / 'cdp_url.txt').write_text('ws://127.0.0.1:99999/invalid')
env = get_test_env()
env['CHROME_HEADLESS'] = 'true'
env['CHROME_TIMEOUT'] = '5' # Short timeout for fast test
# Screenshot should try CDP, fail quickly, then fall back to launching own browser
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-corrupt-cdp'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=30,
env=env
)
# Should succeed by falling back to launching browser
assert result.returncode == 0, f"Should fallback and succeed: {result.stderr}"
assert 'Failed to connect to CDP' in result.stderr, "Should log CDP connection failure"
# Verify screenshot was created via fallback path
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "Screenshot should be created via fallback"
def test_user_agent_is_applied():
"""Test that CHROME_USER_AGENT is actually applied when launching browser."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ua'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_USER_AGENT'] = 'CustomBot/9.9.9 (Testing)'
env['CHROME_HEADLESS'] = 'true'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ua'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
# Should succeed with custom user agent
assert result.returncode == 0, f"Should succeed with custom UA: {result.stderr}"
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists(), "Screenshot should be created"
def test_check_ssl_false_branch():
"""Test CHROME_CHECK_SSL_VALIDITY=false adds ignore-certificate-errors arg."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-nossl'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_CHECK_SSL_VALIDITY'] = 'false'
env['CHROME_HEADLESS'] = 'true'
# Pass the string 'false' to exercise the SSL-check-disabled branch
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-nossl'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Should work with SSL check disabled: {result.stderr}"
assert (screenshot_dir / 'screenshot.png').exists()
def test_alternative_env_var_names():
"""Test fallback environment variable names (TIMEOUT vs CHROME_TIMEOUT, etc)."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-altenv'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
# Use alternative env var names (without CHROME_ prefix)
env['TIMEOUT'] = '45'
env['RESOLUTION'] = '1024,768'
env['USER_AGENT'] = 'AltBot/1.0'
env['CHECK_SSL_VALIDITY'] = 'false'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-altenv'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Should work with alternative env vars: {result.stderr}"
assert (screenshot_dir / 'screenshot.png').exists()
def test_very_large_resolution():
"""Test screenshot with very large resolution."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-large'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_RESOLUTION'] = '3840,2160' # 4K resolution
env['CHROME_HEADLESS'] = 'true'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-large'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Should handle large resolution: {result.stderr}"
screenshot_file = screenshot_dir / 'screenshot.png'
assert screenshot_file.exists()
# 4K screenshot should be larger
assert screenshot_file.stat().st_size > 5000, "4K screenshot should be substantial"
def test_very_small_resolution():
"""Test screenshot with very small resolution."""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir)
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-small'
screenshot_dir = snapshot_dir / 'screenshot'
screenshot_dir.mkdir(parents=True)
env = get_test_env()
env['CHROME_RESOLUTION'] = '320,240' # Very small
env['CHROME_HEADLESS'] = 'true'
result = subprocess.run(
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-small'],
cwd=str(screenshot_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
assert result.returncode == 0, f"Should handle small resolution: {result.stderr}"
assert (screenshot_dir / 'screenshot.png').exists()
if __name__ == '__main__':
pytest.main([__file__, '-v'])
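Several of the screenshot tests above repeat the same loop to pull the ArchiveResult record out of a hook's stdout. A small shared helper (purely a suggestion, not an existing test utility) would keep that parsing in one place:

```python
import json


def parse_archiveresult(stdout: str) -> dict | None:
    """Return the first ArchiveResult JSONL record found in hook stdout, else None."""
    for line in stdout.strip().splitlines():
        line = line.strip()
        if not line.startswith('{'):
            continue
        try:
            record = json.loads(line)
        except json.JSONDecodeError:
            continue
        if record.get('type') == 'ArchiveResult':
            return record
    return None


# Usage inside a test:
#   result_json = parse_archiveresult(result.stdout)
#   assert result_json and result_json['status'] == 'succeeded'
```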

View File

@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""
Detect single-file binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if single-file is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
singlefile_enabled = get_env_bool('SINGLEFILE_ENABLED', True)
if not singlefile_enabled:
sys.exit(0)
provider = EnvProvider()
found = False
# Try single-file-cli first, then single-file
for binary_name in ['single-file-cli', 'single-file']:
try:
binary = Binary(name=binary_name, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='single-file')
found = True
break
except Exception:
continue
if not found:
# Binary not found
output_binary_missing(name='single-file', binproviders='npm')
sys.exit(0)
if __name__ == '__main__':
main()

30
archivebox/plugins/wget/on_Crawl__06_wget_install.py Normal file → Executable file
View File

@@ -40,8 +40,8 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def output_binary(binary: Binary, name: str):
"""Output Binary JSONL record to stdout."""
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
@@ -50,7 +50,20 @@ def output_binary(binary: Binary, name: str):
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
@@ -89,16 +102,19 @@ def main():
binary_path = ''
if not binary_path:
if use_wget:
errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set WGET_ENABLED=false.")
# Binary not found
computed['WGET_BINARY'] = ''
if use_wget:
# Emit Binary record for installation
output_binary_missing(name='wget', binproviders='apt,brew')
else:
# Binary found
computed['WGET_BINARY'] = binary_path
wget_version = str(binary.version) if binary.version else 'unknown'
computed['WGET_VERSION'] = wget_version
# Output Binary JSONL record
output_binary(binary, name='wget')
# Output Binary JSONL record for installed binary
output_binary_found(binary, name='wget')
# Check for compression support
if computed.get('WGET_BINARY'):

View File

@@ -0,0 +1,80 @@
#!/usr/bin/env python3
"""
Detect yt-dlp binary and emit Binary JSONL record.
Output: Binary JSONL record to stdout if yt-dlp is found
"""
import json
import os
import sys
from abx_pkg import Binary, EnvProvider
def get_env(name: str, default: str = '') -> str:
return os.environ.get(name, default).strip()
def get_env_bool(name: str, default: bool = False) -> bool:
val = get_env(name, '').lower()
if val in ('true', '1', 'yes', 'on'):
return True
if val in ('false', '0', 'no', 'off'):
return False
return default
def output_binary_found(binary: Binary, name: str):
"""Output Binary JSONL record for an installed binary."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'abspath': str(binary.abspath),
'version': str(binary.version) if binary.version else '',
'sha256': binary.sha256 or '',
'binprovider': 'env', # Already installed
'machine_id': machine_id,
}
print(json.dumps(record))
def output_binary_missing(name: str, binproviders: str):
"""Output Binary JSONL record for a missing binary that needs installation."""
machine_id = os.environ.get('MACHINE_ID', '')
record = {
'type': 'Binary',
'name': name,
'binproviders': binproviders, # Providers that can install it
'machine_id': machine_id,
}
print(json.dumps(record))
def main():
ytdlp_enabled = get_env_bool('YTDLP_ENABLED', True)
ytdlp_binary = get_env('YTDLP_BINARY', 'yt-dlp')
if not ytdlp_enabled:
sys.exit(0)
provider = EnvProvider()
try:
binary = Binary(name=ytdlp_binary, binproviders=[provider]).load()
if binary.abspath:
# Binary found
output_binary_found(binary, name='yt-dlp')
else:
# Binary not found
output_binary_missing(name='yt-dlp', binproviders='pip,brew,apt')
except Exception:
# Binary not found
output_binary_missing(name='yt-dlp', binproviders='pip,brew,apt')
sys.exit(0)
if __name__ == '__main__':
main()

View File

@@ -10,6 +10,7 @@ Migration tests from 0.8.x to 0.9.x.
- New fields like depth, retry_at, etc.
"""
import json
import shutil
import sqlite3
import subprocess
@@ -78,29 +79,43 @@ class TestMigrationFrom08x(unittest.TestCase):
self.assertTrue(ok, msg)
def test_migration_preserves_crawls(self):
"""Migration should preserve all Crawl records."""
"""Migration should preserve all Crawl records and create default crawl if needed."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Count snapshots with NULL crawl_id in original data
snapshots_without_crawl = sum(1 for s in self.original_data['snapshots'] if s['crawl_id'] is None)
# Expected count: original crawls + 1 default crawl if any snapshots had NULL crawl_id
expected_count = len(self.original_data['crawls'])
if snapshots_without_crawl > 0:
expected_count += 1 # Migration 0024 creates a default crawl
ok, msg = verify_crawl_count(self.db_path, expected_count)
self.assertTrue(ok, msg)
def test_migration_preserves_snapshot_crawl_links(self):
"""Migration should preserve snapshot-to-crawl relationships."""
"""Migration should preserve snapshot-to-crawl relationships and assign default crawl to orphans."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
# Check EVERY snapshot still has its crawl_id
# Check EVERY snapshot has a crawl_id after migration
for snapshot in self.original_data['snapshots']:
cursor.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?", (snapshot['url'],))
row = cursor.fetchone()
self.assertIsNotNone(row, f"Snapshot {snapshot['url']} not found after migration")
self.assertEqual(row[0], snapshot['crawl_id'],
f"Crawl ID mismatch for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}")
if snapshot['crawl_id'] is not None:
# Snapshots that had a crawl should keep it
self.assertEqual(row[0], snapshot['crawl_id'],
f"Crawl ID changed for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}")
else:
# Snapshots without a crawl should now have one (the default crawl)
self.assertIsNotNone(row[0],
f"Snapshot {snapshot['url']} should have been assigned to default crawl but has NULL")
conn.close()
@@ -153,7 +168,7 @@ class TestMigrationFrom08x(unittest.TestCase):
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
result = run_archivebox(self.work_dir, ['list'])
result = run_archivebox(self.work_dir, ['snapshot', 'list'])
self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
# Verify ALL snapshots appear in output
@@ -475,357 +490,227 @@ class TestFilesystemMigration08to09(unittest.TestCase):
"""Clean up temporary directory."""
shutil.rmtree(self.work_dir, ignore_errors=True)
def test_filesystem_migration_with_real_archiving(self):
def test_archiveresult_files_preserved_after_migration(self):
"""
Test that filesystem migration works with real archived content.
Test that ArchiveResult output files are reorganized into the new structure.
Steps:
1. Initialize archivebox
2. Archive https://example.com (creates real files)
3. Manually set fs_version to 0.8.0
4. Trigger migration by saving snapshot
5. Verify files are organized correctly
This test verifies that:
1. Migration preserves ArchiveResult data in Process/Binary records
2. Running `archivebox update` reorganizes files into the new structure
3. New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
4. All files are moved (no data loss)
5. Old archive/timestamp/ directories are cleaned up
"""
# Step 1: Initialize
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Use the real 0.7.2 database which has actual ArchiveResults with files
gold_db = Path('/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data')
if not gold_db.exists():
self.skipTest(f"Gold standard database not found at {gold_db}")
# Step 2: Archive example.com with ALL extractors enabled
# This ensures we test migration with all file types
try:
result = run_archivebox(
self.work_dir,
['add', '--depth=0', 'https://example.com'],
timeout=300, # 5 minutes for all extractors
env={
'SAVE_TITLE': 'True',
'SAVE_FAVICON': 'True',
'SAVE_WGET': 'True',
'SAVE_SCREENSHOT': 'True',
'SAVE_DOM': 'True',
'SAVE_SINGLEFILE': 'True',
'SAVE_READABILITY': 'True',
'SAVE_MERCURY': 'True',
'SAVE_PDF': 'True',
'SAVE_YTDLP': 'True',
'SAVE_ARCHIVEDOTORG': 'True',
'SAVE_HEADERS': 'True',
'SAVE_HTMLTOTEXT': 'True',
'SAVE_GIT': 'True',
}
)
except subprocess.TimeoutExpired as e:
# If timeout, still continue - we want to test with whatever files were created
print(f"\n[!] Add command timed out after {e.timeout}s, continuing with partial results...")
# Note: Snapshot may still have been created even if command timed out
# Copy gold database to test directory
import shutil
for item in gold_db.iterdir():
if item.is_dir():
shutil.copytree(item, self.work_dir / item.name, dirs_exist_ok=True)
else:
shutil.copy2(item, self.work_dir / item.name)
# Step 3: Get the snapshot and verify files were created
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute("SELECT id, url, timestamp, fs_version FROM core_snapshot WHERE url = ?", ('https://example.com',))
row = cursor.fetchone()
conn.close()
if not row:
self.skipTest("Failed to create snapshot for https://example.com")
snapshot_id, url, timestamp, fs_version = row
# Verify initial fs_version is 0.9.0 (current version)
self.assertEqual(fs_version, '0.9.0', f"Expected new snapshot to have fs_version='0.9.0', got '{fs_version}'")
# Verify output directory exists
output_dir = self.work_dir / 'archive' / timestamp
self.assertTrue(output_dir.exists(), f"Output directory not found: {output_dir}")
# List all files created (for debugging)
files_before = list(output_dir.rglob('*'))
files_before_count = len([f for f in files_before if f.is_file()])
print(f"\n[*] Files created by archiving: {files_before_count}")
for f in sorted(files_before):
if f.is_file():
print(f" {f.relative_to(output_dir)}")
# Step 4: Manually set fs_version to 0.8.0 to simulate old snapshot
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute("UPDATE core_snapshot SET fs_version = '0.8.0' WHERE id = ?", (snapshot_id,))
conn.commit()
# Verify the update worked
cursor.execute("SELECT fs_version FROM core_snapshot WHERE id = ?", (snapshot_id,))
updated_version = cursor.fetchone()[0]
conn.close()
self.assertEqual(updated_version, '0.8.0', "Failed to set fs_version to 0.8.0")
# Step 5: Trigger migration by running a command that loads and saves the snapshot
# We'll use the Python API directly to trigger save()
import os
import sys
import django
# Setup Django
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
os.environ['DATA_DIR'] = str(self.work_dir)
# Add parent dir to path so we can import archivebox
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
try:
django.setup()
from archivebox.core.models import Snapshot
# Load the snapshot (should trigger migration on save)
snapshot = Snapshot.objects.get(url='https://example.com')
# Verify fs_migration_needed returns True
self.assertTrue(snapshot.fs_migration_needed,
f"fs_migration_needed should be True for fs_version='0.8.0'")
# Save to trigger migration
print(f"\n[*] Triggering filesystem migration by saving snapshot...")
snapshot.save()
# Refresh from DB
snapshot.refresh_from_db()
# Verify migration completed
self.assertEqual(snapshot.fs_version, '0.9.0',
f"Migration failed: fs_version is still '{snapshot.fs_version}'")
self.assertFalse(snapshot.fs_migration_needed,
"fs_migration_needed should be False after migration")
print(f"[√] Filesystem migration completed: 0.8.0 -> 0.9.0")
except Exception as e:
self.fail(f"Failed to trigger migration via Django: {e}")
# Step 6: Verify files still exist and are accessible
# For 0.8 -> 0.9, the migration is a no-op, so files should be in the same place
files_after = list(output_dir.rglob('*'))
files_after_count = len([f for f in files_after if f.is_file()])
print(f"\n[*] Files after migration: {files_after_count}")
# Verify no files were lost
self.assertGreaterEqual(files_after_count, files_before_count,
f"Files were lost during migration: {files_before_count} -> {files_after_count}")
class TestDBOnlyCommands(unittest.TestCase):
"""Test that status/search/list commands only use DB, not filesystem."""
def setUp(self):
"""Create a temporary directory with 0.8.x schema and data."""
self.work_dir = Path(tempfile.mkdtemp())
self.db_path = self.work_dir / 'index.sqlite3'
create_data_dir_structure(self.work_dir)
conn = sqlite3.connect(str(self.db_path))
conn.executescript(SCHEMA_0_8)
conn.close()
self.original_data = seed_0_8_data(self.db_path)
def tearDown(self):
"""Clean up temporary directory."""
shutil.rmtree(self.work_dir, ignore_errors=True)
def test_status_works_with_empty_archive(self):
"""Status command should work with empty archive/ (queries DB only)."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Add a snapshot to DB
result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60)
# Empty the archive directory (but keep it existing)
# Count archive directories and files BEFORE migration
archive_dir = self.work_dir / 'archive'
if archive_dir.exists():
for item in archive_dir.iterdir():
if item.is_dir():
shutil.rmtree(item)
else:
item.unlink()
dirs_before = list(archive_dir.glob('*')) if archive_dir.exists() else []
dirs_before_count = len([d for d in dirs_before if d.is_dir()])
# Status should still work (queries DB only, doesn't scan filesystem)
result = run_archivebox(self.work_dir, ['status'])
self.assertEqual(result.returncode, 0,
f"Status should work with empty archive: {result.stderr}")
# Count total files in all archive directories
files_before = []
for d in dirs_before:
if d.is_dir():
files_before.extend([f for f in d.rglob('*') if f.is_file()])
files_before_count = len(files_before)
# Should show count from DB
output = result.stdout + result.stderr
self.assertIn('Total', output,
"Status should show DB statistics even with no files")
# Sample some specific files to check they're preserved
sample_files = [
'favicon.ico',
'screenshot.png',
'singlefile.html',
'headers.json',
]
sample_paths_before = {}
for d in dirs_before:
if d.is_dir():
for sample_file in sample_files:
matching = list(d.glob(sample_file))
if matching:
sample_paths_before[f"{d.name}/{sample_file}"] = matching[0]
def test_list_works_with_empty_archive(self):
"""List command should work with empty archive/ (queries DB only)."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
print(f"\n[*] Archive directories before migration: {dirs_before_count}")
print(f"[*] Total files before migration: {files_before_count}")
print(f"[*] Sample files found: {len(sample_paths_before)}")
# Add a snapshot to DB
result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60)
# Run init to trigger migration
result = run_archivebox(self.work_dir, ['init'], timeout=60)
self.assertEqual(result.returncode, 0, f"Init (migration) failed: {result.stderr}")
# Empty the archive directory (but keep it existing)
archive_dir = self.work_dir / 'archive'
if archive_dir.exists():
for item in archive_dir.iterdir():
if item.is_dir():
shutil.rmtree(item)
else:
item.unlink()
# Count archive directories and files AFTER migration
dirs_after = list(archive_dir.glob('*')) if archive_dir.exists() else []
dirs_after_count = len([d for d in dirs_after if d.is_dir()])
# List should still work (queries DB only, doesn't scan filesystem)
result = run_archivebox(self.work_dir, ['list'])
self.assertEqual(result.returncode, 0,
f"List should work with empty archive: {result.stderr}")
files_after = []
for d in dirs_after:
if d.is_dir():
files_after.extend([f for f in d.rglob('*') if f.is_file()])
files_after_count = len(files_after)
# Should show snapshot from DB
output = result.stdout + result.stderr
self.assertIn('example.com', output,
"Snapshot should appear in list output even with no files")
# Verify sample files still exist
sample_paths_after = {}
for d in dirs_after:
if d.is_dir():
for sample_file in sample_files:
matching = list(d.glob(sample_file))
if matching:
sample_paths_after[f"{d.name}/{sample_file}"] = matching[0]
def test_search_works_with_empty_archive(self):
"""Search command should work with empty archive/ (queries DB only)."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
print(f"[*] Archive directories after migration: {dirs_after_count}")
print(f"[*] Total files after migration: {files_after_count}")
print(f"[*] Sample files found: {len(sample_paths_after)}")
# Add a snapshot to DB
result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60)
# Verify files still in old structure after migration (not moved yet)
self.assertEqual(dirs_before_count, dirs_after_count,
f"Archive directories lost during migration: {dirs_before_count} -> {dirs_after_count}")
self.assertEqual(files_before_count, files_after_count,
f"Files lost during migration: {files_before_count} -> {files_after_count}")
# Empty the archive directory (but keep it existing)
archive_dir = self.work_dir / 'archive'
if archive_dir.exists():
for item in archive_dir.iterdir():
if item.is_dir():
shutil.rmtree(item)
else:
item.unlink()
# Search should still work (queries DB only, doesn't scan filesystem)
result = run_archivebox(self.work_dir, ['search'])
self.assertEqual(result.returncode, 0,
f"Search should work with empty archive: {result.stderr}")
# Should show snapshot from DB
output = result.stdout + result.stderr
self.assertIn('example.com', output,
"Snapshot should appear in search output even with no files")
class TestUpdateCommandArchitecture(unittest.TestCase):
"""Test new update command architecture: filters=DB only, no filters=scan filesystem."""
def setUp(self):
"""Create a temporary directory with 0.8.x schema and data."""
self.work_dir = Path(tempfile.mkdtemp())
self.db_path = self.work_dir / 'index.sqlite3'
create_data_dir_structure(self.work_dir)
def tearDown(self):
"""Clean up temporary directory."""
shutil.rmtree(self.work_dir, ignore_errors=True)
def test_update_with_filters_uses_db_only(self):
"""Update with filters should only query DB, not scan filesystem."""
# Initialize with data
conn = sqlite3.connect(str(self.db_path))
conn.executescript(SCHEMA_0_8)
conn.close()
seed_0_8_data(self.db_path)
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Run update with filter - should not scan filesystem
# Use a URL from the seeded data
result = run_archivebox(self.work_dir, ['update', 'example.com'], timeout=120)
# Should complete successfully (or with orchestrator error, which is okay)
# The key is it should not scan filesystem
def test_update_without_filters_imports_orphans(self):
"""Update without filters should scan filesystem and import orphaned directories."""
# Initialize empty DB
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Create an orphaned directory in archive/
timestamp = '1609459200'
orphan_dir = self.work_dir / 'archive' / timestamp
orphan_dir.mkdir(parents=True, exist_ok=True)
index_data = {
'url': 'https://orphan.example.com',
'timestamp': timestamp,
'title': 'Orphaned Snapshot',
}
(orphan_dir / 'index.json').write_text(json.dumps(index_data))
(orphan_dir / 'index.html').write_text('<html>Orphan</html>')
# Count snapshots before update
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM core_snapshot")
count_before = cursor.fetchone()[0]
conn.close()
# Run full update (no filters) - should scan filesystem
# Run update to trigger filesystem reorganization
print(f"\n[*] Running archivebox update to reorganize filesystem...")
result = run_archivebox(self.work_dir, ['update'], timeout=120)
self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")
# Check if orphan was imported
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute("SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
('https://orphan.example.com',))
orphan_count = cursor.fetchone()[0]
conn.close()
# Check new filesystem structure
# New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
users_dir = self.work_dir / 'users'
snapshots_base = None
# If update succeeded, orphan should be imported
if result.returncode == 0:
self.assertGreaterEqual(orphan_count, 1,
"Orphaned snapshot should be imported by update")
if users_dir.exists():
# Find the snapshots directory
for user_dir in users_dir.iterdir():
if user_dir.is_dir():
user_snapshots = user_dir / 'snapshots'
if user_snapshots.exists():
snapshots_base = user_snapshots
break
print(f"[*] New structure base: {snapshots_base}")
class TestTimestampUniqueness(unittest.TestCase):
"""Test timestamp uniqueness constraint."""
# Count files in new structure
# Structure: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/files...
files_new_structure = []
new_sample_files = {}
def setUp(self):
"""Create a temporary directory."""
self.work_dir = Path(tempfile.mkdtemp())
self.db_path = self.work_dir / 'index.sqlite3'
create_data_dir_structure(self.work_dir)
if snapshots_base and snapshots_base.exists():
for date_dir in snapshots_base.iterdir():
if date_dir.is_dir():
for domain_dir in date_dir.iterdir():
if domain_dir.is_dir():
for snap_dir in domain_dir.iterdir():
if snap_dir.is_dir():
# Files are directly in snap-uuid/ directory (no plugin subdirs)
for f in snap_dir.rglob('*'):
if f.is_file():
files_new_structure.append(f)
# Track sample files
if f.name in sample_files:
new_sample_files[f"{snap_dir.name}/{f.name}"] = f
def tearDown(self):
"""Clean up temporary directory."""
shutil.rmtree(self.work_dir, ignore_errors=True)
files_new_count = len(files_new_structure)
print(f"[*] Files in new structure: {files_new_count}")
print(f"[*] Sample files in new structure: {len(new_sample_files)}")
def test_timestamp_uniqueness_constraint_exists(self):
"""Database should have timestamp uniqueness constraint after migration."""
# Initialize with 0.8.x and migrate
conn = sqlite3.connect(str(self.db_path))
conn.executescript(SCHEMA_0_8)
conn.close()
# Check old structure (should be gone or empty)
old_archive_dir = self.work_dir / 'archive'
old_files_remaining = []
unmigrated_dirs = []
if old_archive_dir.exists():
for d in old_archive_dir.glob('*'):
# Only count REAL directories, not symlinks (symlinks are the migrated ones)
if d.is_dir() and not d.is_symlink() and d.name.replace('.', '').isdigit():
# This is a timestamp directory (old structure)
files_in_dir = [f for f in d.rglob('*') if f.is_file()]
if files_in_dir:
unmigrated_dirs.append((d.name, len(files_in_dir)))
old_files_remaining.extend(files_in_dir)
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
old_files_count = len(old_files_remaining)
print(f"[*] Files remaining in old structure: {old_files_count}")
if unmigrated_dirs:
print(f"[*] Unmigrated directories: {unmigrated_dirs}")
# Check if unique_timestamp constraint exists
# CRITICAL: Verify files were moved to new structure
self.assertGreater(files_new_count, 0,
"No files found in new structure after update")
# CRITICAL: Verify old structure is cleaned up
self.assertEqual(old_files_count, 0,
f"Old structure not cleaned up: {old_files_count} files still in archive/timestamp/ directories")
# CRITICAL: Verify all files were moved (total count should match)
total_after_update = files_new_count + old_files_count
self.assertEqual(files_before_count, total_after_update,
f"Files lost during reorganization: {files_before_count} before → {total_after_update} after")
# CRITICAL: Verify sample files exist in new structure
self.assertGreater(len(new_sample_files), 0,
f"Sample files not found in new structure")
# Verify new path format
for path_key, file_path in new_sample_files.items():
# Path should contain: snapshots/YYYYMMDD/domain/snap-uuid/plugin/file
path_parts = file_path.parts
self.assertIn('snapshots', path_parts,
f"New path should contain 'snapshots': {file_path}")
self.assertIn('users', path_parts,
f"New path should contain 'users': {file_path}")
print(f"{path_key}{file_path.relative_to(self.work_dir)}")
# Verify Process and Binary records were created
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
# Query sqlite_master for constraints
cursor.execute("""
SELECT sql FROM sqlite_master
WHERE type='table' AND name='core_snapshot'
""")
table_sql = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
archiveresult_count = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM machine_process")
process_count = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM machine_binary")
binary_count = cursor.fetchone()[0]
cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NOT NULL")
linked_count = cursor.fetchone()[0]
conn.close()
# Should contain unique_timestamp constraint or UNIQUE(timestamp)
table_sql_lower = table_sql.lower()
has_constraint = ('unique_timestamp' in table_sql_lower) or \
('unique' in table_sql_lower and 'timestamp' in table_sql_lower)
print(f"[*] ArchiveResults: {archiveresult_count}")
print(f"[*] Process records created: {process_count}")
print(f"[*] Binary records created: {binary_count}")
print(f"[*] ArchiveResults linked to Process: {linked_count}")
# Verify data migration happened correctly
# The 0.7.2 gold database has 44 ArchiveResults
self.assertEqual(archiveresult_count, 44,
f"Expected 44 ArchiveResults from 0.7.2 database, got {archiveresult_count}")
# Each ArchiveResult should create one Process record
self.assertEqual(process_count, 44,
f"Expected 44 Process records (1 per ArchiveResult), got {process_count}")
# The 44 ArchiveResults use 7 unique binaries (curl, wget, etc.)
self.assertEqual(binary_count, 7,
f"Expected 7 unique Binary records, got {binary_count}")
# ALL ArchiveResults should be linked to Process records
self.assertEqual(linked_count, 44,
f"Expected all 44 ArchiveResults linked to Process, got {linked_count}")
self.assertTrue(has_constraint,
f"Timestamp uniqueness constraint should exist. Table SQL: {table_sql}")
if __name__ == '__main__':
    unittest.main()

View File

@@ -76,11 +76,11 @@ class Orchestrator:
self.idle_count: int = 0
self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running()
# CRITICAL: In foreground mode (exit_on_idle=True), use ONLY 1 worker
# to keep execution strictly sequential and deterministic
# In foreground mode (exit_on_idle=True), limit workers but allow enough
# for crawl progression: 1 CrawlWorker + 1 SnapshotWorker + 1 ArchiveResultWorker
if self.exit_on_idle:
self.MAX_WORKERS_PER_TYPE = 1
self.MAX_TOTAL_WORKERS = 1
self.MAX_TOTAL_WORKERS = 3 # Allow one worker of each type to run concurrently
def __repr__(self) -> str:
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
@@ -157,32 +157,41 @@ class Orchestrator:
self._last_cleanup_time = now
return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES)
def get_running_workers_for_type(self, WorkerClass: Type[Worker]) -> int:
"""Get count of running workers for a specific worker type."""
return len(WorkerClass.get_running_workers())
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
"""Determine if we should spawn a new worker of the given type."""
if queue_count == 0:
return False
# Check per-type limit
running_workers = WorkerClass.get_running_workers()
if len(running_workers) >= self.MAX_WORKERS_PER_TYPE:
running_count = len(running_workers)
if running_count >= self.MAX_WORKERS_PER_TYPE:
return False
# Check total limit
if self.get_total_worker_count() >= self.MAX_TOTAL_WORKERS:
total_workers = self.get_total_worker_count()
if total_workers >= self.MAX_TOTAL_WORKERS:
return False
# Check if we already have enough workers for the queue size
# Spawn more gradually - don't flood with workers
if len(running_workers) > 0 and queue_count <= len(running_workers) * WorkerClass.MAX_CONCURRENT_TASKS:
if running_count > 0 and queue_count <= running_count * WorkerClass.MAX_CONCURRENT_TASKS:
return False
return True
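# Illustrative walk-through (comments only), assuming MAX_CONCURRENT_TASKS=1 and the
# foreground limits set in __init__ above (MAX_WORKERS_PER_TYPE=1, MAX_TOTAL_WORKERS=3),
# with the total-worker count still under MAX_TOTAL_WORKERS:
#   queue_count=0                     -> False (nothing to do)
#   queue_count=5, 0 workers running  -> True  (spawn the first worker of this type)
#   queue_count=5, 1 worker running   -> False (per-type limit already reached)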
def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None:
"""Spawn a new worker process. Returns PID or None if spawn failed."""
try:
print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]')
pid = WorkerClass.start(daemon=False, crawl_id=self.crawl_id)
print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')
# CRITICAL: Block until worker registers itself in Process table
# This prevents race condition where orchestrator spawns multiple workers
@@ -202,6 +211,15 @@ class Orchestrator:
# 3. RUNNING status
# 4. Parent is this orchestrator
# 5. Started recently (within last 10 seconds)
# Debug: Check all processes with this PID first
if elapsed < 0.5:
all_procs = list(Process.objects.filter(pid=pid))
print(f'[yellow]DEBUG spawn_worker: elapsed={elapsed:.1f}s pid={pid} orchestrator_id={self.db_process.id}[/yellow]')
print(f'[yellow] Found {len(all_procs)} Process records for pid={pid}[/yellow]')
for p in all_procs:
print(f'[yellow] -> type={p.process_type} status={p.status} parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]')
worker_process = Process.objects.filter(
pid=pid,
process_type=Process.TypeChoices.WORKER,
@@ -212,6 +230,7 @@ class Orchestrator:
if worker_process:
# Worker successfully registered!
print(f'[green]DEBUG spawn_worker: Worker registered! Returning pid={pid}[/green]')
return pid
time.sleep(poll_interval)
@@ -244,7 +263,7 @@ class Orchestrator:
Returns dict of queue sizes by worker type.
"""
queue_sizes = {}
for WorkerClass in self.WORKER_TYPES:
# Get queue for this worker type
# Need to instantiate worker to get queue (for model access)
@@ -392,11 +411,18 @@ class Orchestrator:
def _run_orchestrator_loop(self, progress, task_ids):
"""Run the main orchestrator loop with optional progress display."""
last_queue_sizes = {}
last_snapshot_count = None
try:
while True:
# Check queues and spawn workers
queue_sizes = self.check_queues_and_spawn_workers()
# Debug queue sizes (only when changed)
if progress and queue_sizes != last_queue_sizes:
progress.console.print(f'[yellow]DEBUG: Queue sizes: {queue_sizes}[/yellow]')
last_queue_sizes = queue_sizes.copy()
# Update progress bars
if progress:
from archivebox.core.models import Snapshot
@@ -412,6 +438,11 @@ class Orchestrator:
active_snapshots = list(Snapshot.objects.filter(**snapshot_filter))
# Debug snapshot count (only when changed)
if len(active_snapshots) != last_snapshot_count:
progress.console.print(f'[yellow]DEBUG: Found {len(active_snapshots)} active snapshots (crawl_id={self.crawl_id})[/yellow]')
last_snapshot_count = len(active_snapshots)
# Track which snapshots are still active
active_ids = set()
@@ -461,7 +492,9 @@ class Orchestrator:
del task_ids[snapshot_id]
# Track idle state
if self.has_pending_work(queue_sizes) or self.has_running_workers():
has_pending = self.has_pending_work(queue_sizes)
has_running = self.has_running_workers()
if has_pending or has_running:
self.idle_count = 0
self.on_tick(queue_sizes)
else:

View File

@@ -60,8 +60,8 @@ class Worker:
# Configuration (can be overridden by subclasses)
MAX_TICK_TIME: ClassVar[int] = 60
MAX_CONCURRENT_TASKS: ClassVar[int] = 1
POLL_INTERVAL: ClassVar[float] = 0.2 # How often to check for new work (seconds)
IDLE_TIMEOUT: ClassVar[int] = 50 # Exit after N idle iterations (10 sec at 0.2 poll interval)
POLL_INTERVAL: ClassVar[float] = 0.1 # How often to check for new work (seconds)
IDLE_TIMEOUT: ClassVar[int] = 100 # Exit after N idle iterations (10 sec at 0.1 poll interval)
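# Illustration: a worker exits after roughly IDLE_TIMEOUT * POLL_INTERVAL
# = 100 * 0.1 = 10 seconds without claimable work -- the same wall-clock budget as the
# previous 50 * 0.2 settings, but with finer-grained polling.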
def __init__(self, worker_id: int = 0, daemon: bool = False, crawl_id: str | None = None, **kwargs: Any):
self.worker_id = worker_id
@@ -93,7 +93,9 @@ class Worker:
Returns the claimed object or None if queue is empty or claim failed.
"""
Model = self.get_model()
obj = self.get_queue().first()
queue = self.get_queue()
obj = queue.first()
if obj is None:
return None
@@ -132,10 +134,17 @@ class Worker:
self.pid = os.getpid()
# Register this worker process in the database
self.db_process = Process.current()
# Explicitly set process_type to WORKER to prevent mis-detection
# Explicitly set process_type to WORKER and store worker type name
update_fields = []
if self.db_process.process_type != Process.TypeChoices.WORKER:
self.db_process.process_type = Process.TypeChoices.WORKER
self.db_process.save(update_fields=['process_type'])
update_fields.append('process_type')
# Store worker type name (crawl/snapshot/archiveresult) in worker_type field
if not self.db_process.worker_type:
self.db_process.worker_type = self.name
update_fields.append('worker_type')
if update_fields:
self.db_process.save(update_fields=update_fields)
# Determine worker type for logging
worker_type_name = self.__class__.__name__
@@ -316,7 +325,12 @@ class Worker:
Process.cleanup_stale_running()
# Convert Process objects to dicts to match the expected API contract
processes = Process.get_running(process_type=Process.TypeChoices.WORKER)
# Filter by worker_type to get only workers of this specific type (crawl/snapshot/archiveresult)
processes = Process.objects.filter(
process_type=Process.TypeChoices.WORKER,
worker_type=cls.name, # Filter by specific worker type
status__in=['running', 'started']
)
# Note: worker_id is not stored on Process model, it's dynamically generated
# We return process_id (UUID) and pid (OS process ID) instead
return [
@@ -334,7 +348,11 @@ class Worker:
"""Get count of running workers of this type."""
from archivebox.machine.models import Process
return Process.get_running_count(process_type=Process.TypeChoices.WORKER)
return Process.objects.filter(
process_type=Process.TypeChoices.WORKER,
worker_type=cls.name, # Filter by specific worker type
status__in=['running', 'started']
).count()
class CrawlWorker(Worker):

View File

@@ -3,18 +3,23 @@
#
# All plugin tests use pytest and are located in pluginname/tests/test_*.py
#
# Usage: ./bin/test_plugins.sh [plugin_name] [--no-coverage]
# Usage: ./bin/test_plugins.sh [plugin_name] [--no-coverage] [--coverage-report]
#
# Examples:
# ./bin/test_plugins.sh # Run all plugin tests with coverage
# ./bin/test_plugins.sh chrome # Run chrome plugin tests with coverage
# ./bin/test_plugins.sh parse_* # Run all parse_* plugin tests with coverage
# ./bin/test_plugins.sh --no-coverage # Run all tests without coverage
# ./bin/test_plugins.sh --coverage-report # Just show coverage report without running tests
#
# Coverage results are saved to .coverage and can be viewed with:
# coverage combine
# coverage report
# For running individual hooks with coverage:
# NODE_V8_COVERAGE=./coverage/js node <hook>.js [args] # JS hooks
# coverage run --parallel-mode <hook>.py [args] # Python hooks
#
# Coverage results are saved to .coverage (Python) and coverage/js (JavaScript):
# coverage combine && coverage report
# coverage json
# ./bin/test_plugins.sh --coverage-report
set -e
@@ -30,15 +35,134 @@ ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
# Parse arguments
PLUGIN_FILTER=""
ENABLE_COVERAGE=true
COVERAGE_REPORT_ONLY=false
for arg in "$@"; do
if [ "$arg" = "--no-coverage" ]; then
ENABLE_COVERAGE=false
elif [ "$arg" = "--coverage-report" ]; then
COVERAGE_REPORT_ONLY=true
else
PLUGIN_FILTER="$arg"
fi
done
# Function to show JS coverage report (inlined from convert_v8_coverage.js)
show_js_coverage() {
local coverage_dir="$1"
if [ ! -d "$coverage_dir" ] || [ -z "$(ls -A "$coverage_dir" 2>/dev/null)" ]; then
echo "No JavaScript coverage data collected"
echo "(JS hooks may not have been executed during tests)"
return
fi
node - "$coverage_dir" << 'ENDJS'
const fs = require('fs');
const path = require('path');
const coverageDir = process.argv[2];
const files = fs.readdirSync(coverageDir).filter(f => f.startsWith('coverage-') && f.endsWith('.json'));
if (files.length === 0) {
console.log('No coverage files found');
process.exit(0);
}
const coverageByFile = {};
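// Each NODE_V8_COVERAGE file holds V8 precise-coverage output, roughly this shape (sketch):
//   { result: [ { url, functions: [ { ranges: [ { startOffset, endOffset, count } ] } ] } ] }
// A range with count > 0 executed at least once; coverage below is approximated as
// executed ranges / total ranges per file.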
files.forEach(file => {
const data = JSON.parse(fs.readFileSync(path.join(coverageDir, file), 'utf8'));
data.result.forEach(script => {
const url = script.url;
if (url.startsWith('node:') || url.includes('node_modules')) return;
if (!coverageByFile[url]) {
coverageByFile[url] = { totalRanges: 0, executedRanges: 0 };
}
script.functions.forEach(func => {
func.ranges.forEach(range => {
coverageByFile[url].totalRanges++;
if (range.count > 0) coverageByFile[url].executedRanges++;
});
});
});
});
const allFiles = Object.keys(coverageByFile).sort();
const pluginFiles = allFiles.filter(url => url.includes('archivebox/plugins'));
const otherFiles = allFiles.filter(url => !url.startsWith('node:') && !url.includes('archivebox/plugins'));
console.log('Total files with coverage: ' + allFiles.length + '\n');
console.log('Plugin files: ' + pluginFiles.length);
console.log('Node internal: ' + allFiles.filter(u => u.startsWith('node:')).length + ' (always 0: node: scripts are skipped during collection above)');
console.log('Other: ' + otherFiles.length + '\n');
console.log('JavaScript Coverage Report');
console.log('='.repeat(80));
console.log('');
if (otherFiles.length > 0) {
console.log('Non-plugin files with coverage:');
otherFiles.forEach(url => console.log(' ' + url));
console.log('');
}
if (pluginFiles.length === 0) {
console.log('No plugin files covered');
process.exit(0);
}
let totalRanges = 0, totalExecuted = 0;
pluginFiles.forEach(url => {
const cov = coverageByFile[url];
const pct = cov.totalRanges > 0 ? (cov.executedRanges / cov.totalRanges * 100).toFixed(1) : '0.0';
const match = url.match(/archivebox\/plugins\/.+/);
const displayPath = match ? match[0] : url;
console.log(displayPath + ': ' + pct + '% (' + cov.executedRanges + '/' + cov.totalRanges + ' ranges)');
totalRanges += cov.totalRanges;
totalExecuted += cov.executedRanges;
});
console.log('');
console.log('-'.repeat(80));
const overallPct = totalRanges > 0 ? (totalExecuted / totalRanges * 100).toFixed(1) : '0.0';
console.log('Total: ' + overallPct + '% (' + totalExecuted + '/' + totalRanges + ' ranges)');
ENDJS
}
# If --coverage-report only, just show the report and exit
if [ "$COVERAGE_REPORT_ONLY" = true ]; then
cd "$ROOT_DIR" || exit 1
echo "=========================================="
echo "Python Coverage Summary"
echo "=========================================="
coverage combine 2>/dev/null || true
coverage report --include="archivebox/plugins/*" --omit="*/tests/*"
echo ""
echo "=========================================="
echo "JavaScript Coverage Summary"
echo "=========================================="
show_js_coverage "$ROOT_DIR/coverage/js"
echo ""
echo "For detailed coverage reports:"
echo " Python: coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'"
echo " Python: coverage json # LLM-friendly format"
echo " Python: coverage html # Interactive HTML report"
exit 0
fi
# Set DATA_DIR for tests (required by abx_pkg and plugins)
# Use temp dir to isolate tests from project files
if [ -z "$DATA_DIR" ]; then
export DATA_DIR=$(mktemp -d -t archivebox_plugin_tests.XXXXXX)
# Clean up on exit
trap "rm -rf '$DATA_DIR'" EXIT
fi
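# Hedged usage note: to inspect test artifacts after a run, set DATA_DIR to a persistent
# path yourself so the temp-dir cleanup trap above is never installed, e.g.:
#   DATA_DIR=/tmp/archivebox_plugin_debug ./bin/test_plugins.sh screenshot
# (the path is illustrative, not part of this script)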
# Reset coverage data if collecting coverage
if [ "$ENABLE_COVERAGE" = true ]; then
echo "Resetting coverage data..."
@@ -161,19 +285,14 @@ elif [ $FAILED_PLUGINS -eq 0 ]; then
echo "=========================================="
echo "JavaScript Coverage Summary"
echo "=========================================="
if [ -d "$ROOT_DIR/coverage/js" ] && [ "$(ls -A "$ROOT_DIR/coverage/js" 2>/dev/null)" ]; then
node "$ROOT_DIR/bin/convert_v8_coverage.js" "$ROOT_DIR/coverage/js"
else
echo "No JavaScript coverage data collected"
echo "(JS hooks may not have been executed during tests)"
fi
show_js_coverage "$ROOT_DIR/coverage/js"
echo ""
echo "For detailed coverage reports (from project root):"
echo " Python: coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'"
echo " Python: coverage json # LLM-friendly format"
echo " Python: coverage html # Interactive HTML report"
echo " JavaScript: node bin/convert_v8_coverage.js coverage/js"
echo " JavaScript: ./bin/test_plugins.sh --coverage-report"
fi
exit 0

View File

View File

@@ -1 +0,0 @@
import pytest

View File

@@ -1,97 +0,0 @@
#!/usr/bin/env python3
"""
Tests for archivebox crawl command.
Verify crawl creates snapshots with depth.
"""
import os
import subprocess
import sqlite3
from .fixtures import *
def test_crawl_creates_snapshots(tmp_path, process, disable_extractors_dict):
"""Test that crawl command works on existing snapshots."""
os.chdir(tmp_path)
# First add a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Then run crawl on it
result = subprocess.run(
['archivebox', 'crawl', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
assert result.returncode in [0, 1, 2] # May succeed or fail depending on URL
# Check snapshot was created
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
conn.close()
assert count == 1
def test_crawl_with_depth_0(tmp_path, process, disable_extractors_dict):
"""Test crawl with depth=0 works on existing snapshot."""
os.chdir(tmp_path)
# First add a snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Then crawl it
subprocess.run(
['archivebox', 'crawl', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
conn.close()
# Should have at least 1 snapshot from the add command
assert count >= 1
def test_crawl_creates_crawl_record(tmp_path, process, disable_extractors_dict):
"""Test that add+crawl creates Crawl records."""
os.chdir(tmp_path)
# First add a snapshot (this creates a Crawl)
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Then crawl it
subprocess.run(
['archivebox', 'crawl', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
conn.close()
# Should have at least 1 crawl from the add command
assert crawl_count >= 1

View File

@@ -1,63 +0,0 @@
#!/usr/bin/env python3
"""
Tests for archivebox snapshot command.
Verify snapshot command works with snapshot IDs/URLs.
"""
import os
import subprocess
import sqlite3
from .fixtures import *
def test_snapshot_command_works_with_url(tmp_path, process, disable_extractors_dict):
"""Test that snapshot command works with URL."""
os.chdir(tmp_path)
# Add a snapshot first
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Try to view/interact with snapshot
result = subprocess.run(
['archivebox', 'snapshot', 'https://example.com'],
capture_output=True,
text=True,
env=disable_extractors_dict,
timeout=30,
)
# Should complete (exit code depends on implementation)
assert result.returncode in [0, 1, 2]
def test_snapshot_command_with_timestamp(tmp_path, process, disable_extractors_dict):
"""Test snapshot command with timestamp ID."""
os.chdir(tmp_path)
# Add snapshot
subprocess.run(
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
capture_output=True,
env=disable_extractors_dict,
)
# Get snapshot timestamp
conn = sqlite3.connect("index.sqlite3")
c = conn.cursor()
timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
conn.close()
# Try snapshot command with timestamp
result = subprocess.run(
['archivebox', 'snapshot', str(timestamp)],
capture_output=True,
env=disable_extractors_dict,
timeout=30,
)
assert result.returncode in [0, 1, 2]