mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
move tests into subfolder, add missing install hooks
This commit is contained in:
2
.gitignore
vendored
2
.gitignore
vendored
@@ -39,11 +39,13 @@ tmp/
|
||||
data/
|
||||
data*/
|
||||
output/
|
||||
logs/
|
||||
index.sqlite3
|
||||
queue.sqlite3
|
||||
*.sqlite*
|
||||
data.*
|
||||
.archivebox_id
|
||||
ArchiveBox.conf
|
||||
|
||||
# vim
|
||||
*.sw?
|
||||
|
||||
57
CLAUDE.md
57
CLAUDE.md
@@ -158,6 +158,63 @@ env['SAVE_FAVICON'] = 'False'
|
||||
#### Timeout Settings
|
||||
Use appropriate timeouts for migration tests (45s for init, 60s default).
|
||||
|
||||
### Plugin Testing & Code Coverage
|
||||
|
||||
**Target: 80-90% coverage** for critical plugins (screenshot, chrome, singlefile, dom)
|
||||
|
||||
```bash
|
||||
# Run plugin tests with coverage (both Python + JavaScript)
|
||||
bash bin/test_plugins.sh screenshot
|
||||
|
||||
# View coverage reports
|
||||
bash bin/test_plugins.sh --coverage-report
|
||||
# Or individual reports:
|
||||
coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'
|
||||
```
|
||||
|
||||
#### Plugin Test Structure
|
||||
|
||||
Tests are **completely isolated** from ArchiveBox - they replicate production directory structure in temp dirs:
|
||||
|
||||
```python
|
||||
# Correct production paths:
|
||||
# Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/
|
||||
# Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
data_dir = Path(tmpdir)
|
||||
|
||||
# Crawl-level plugin (e.g., chrome launcher)
|
||||
crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / 'crawl-123'
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir(parents=True)
|
||||
|
||||
# Snapshot-level plugin (e.g., screenshot)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-456'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
|
||||
# Run hook in its output directory
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), '--url=https://example.com', '--snapshot-id=snap-456'],
|
||||
cwd=str(screenshot_dir),
|
||||
env=get_test_env(),
|
||||
capture_output=True,
|
||||
timeout=120
|
||||
)
|
||||
```
|
||||
|
||||
#### Coverage Improvement Loop
|
||||
|
||||
To improve from ~20% to 80%+:
|
||||
|
||||
1. **Run tests**: `bash bin/test_plugins.sh screenshot` → Shows: `19.1% (13/68 ranges)`
|
||||
2. **Identify gaps**: Check hook file for untested paths (session connection vs fallback, config branches, error cases)
|
||||
3. **Add tests**: Test both execution paths (connect to session + launch own browser), skip conditions, error cases, config variations
|
||||
4. **Verify**: Re-run tests → Should show: `85%+ (58+/68 ranges)`
|
||||
|
||||
**Critical**: JavaScript hooks have TWO paths that both must be tested (connect to session ~50% + launch browser ~30% + shared ~20%). Testing only one path = max 50% coverage possible!
|
||||
|
||||
## Database Migrations
|
||||
|
||||
### Generate and Apply Migrations
|
||||
|
||||
@@ -41,9 +41,11 @@ class ArchiveBoxGroup(click.Group):
|
||||
archive_commands = {
|
||||
# High-level commands
|
||||
'add': 'archivebox.cli.archivebox_add.main',
|
||||
'remove': 'archivebox.cli.archivebox_remove.main',
|
||||
'run': 'archivebox.cli.archivebox_run.main',
|
||||
'update': 'archivebox.cli.archivebox_update.main',
|
||||
'status': 'archivebox.cli.archivebox_status.main',
|
||||
'search': 'archivebox.cli.archivebox_search.main',
|
||||
'config': 'archivebox.cli.archivebox_config.main',
|
||||
'schedule': 'archivebox.cli.archivebox_schedule.main',
|
||||
'server': 'archivebox.cli.archivebox_server.main',
|
||||
|
||||
@@ -13,8 +13,15 @@ from archivebox.misc.util import docstring, enforce_types
|
||||
|
||||
|
||||
@enforce_types
|
||||
def install(dry_run: bool=False) -> None:
|
||||
"""Detect and install ArchiveBox dependencies by running a dependency-check crawl"""
|
||||
def install(binaries: tuple[str, ...] = (), binproviders: str = '*', dry_run: bool = False) -> None:
|
||||
"""Detect and install ArchiveBox dependencies by running a dependency-check crawl
|
||||
|
||||
Examples:
|
||||
archivebox install # Install all dependencies
|
||||
archivebox install wget curl # Install only wget and curl
|
||||
archivebox install --binproviders=pip yt-dlp # Install yt-dlp using only pip
|
||||
archivebox install --binproviders=brew,apt # Install all deps using only brew or apt
|
||||
"""
|
||||
|
||||
from archivebox.config.permissions import IS_ROOT, ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
|
||||
from archivebox.config.paths import ARCHIVE_DIR
|
||||
@@ -24,7 +31,14 @@ def install(dry_run: bool=False) -> None:
|
||||
if not (os.access(ARCHIVE_DIR, os.R_OK) and ARCHIVE_DIR.is_dir()):
|
||||
init() # must init full index because we need a db to store Binary entries in
|
||||
|
||||
print('\n[green][+] Detecting ArchiveBox dependencies...[/green]')
|
||||
# Show what we're installing
|
||||
if binaries:
|
||||
print(f'\n[green][+] Installing specific binaries: {", ".join(binaries)}[/green]')
|
||||
else:
|
||||
print('\n[green][+] Detecting and installing all ArchiveBox dependencies...[/green]')
|
||||
|
||||
if binproviders != '*':
|
||||
print(f'[green][+] Using providers: {binproviders}[/green]')
|
||||
|
||||
if IS_ROOT:
|
||||
EUID = os.geteuid()
|
||||
@@ -49,6 +63,19 @@ def install(dry_run: bool=False) -> None:
|
||||
# Using a minimal crawl that will trigger on_Crawl hooks
|
||||
created_by_id = get_or_create_system_user_pk()
|
||||
|
||||
# Build config for this crawl using existing PLUGINS filter
|
||||
crawl_config = {}
|
||||
|
||||
# Combine binary names and provider names into PLUGINS list
|
||||
plugins = []
|
||||
if binaries:
|
||||
plugins.extend(binaries)
|
||||
if binproviders != '*':
|
||||
plugins.extend(binproviders.split(','))
|
||||
|
||||
if plugins:
|
||||
crawl_config['PLUGINS'] = ','.join(plugins)
|
||||
|
||||
crawl, created = Crawl.objects.get_or_create(
|
||||
urls='archivebox://install',
|
||||
defaults={
|
||||
@@ -56,6 +83,7 @@ def install(dry_run: bool=False) -> None:
|
||||
'created_by_id': created_by_id,
|
||||
'max_depth': 0,
|
||||
'status': 'queued',
|
||||
'config': crawl_config,
|
||||
}
|
||||
)
|
||||
|
||||
@@ -63,9 +91,12 @@ def install(dry_run: bool=False) -> None:
|
||||
if not created:
|
||||
crawl.status = 'queued'
|
||||
crawl.retry_at = timezone.now()
|
||||
crawl.config = crawl_config # Update config
|
||||
crawl.save()
|
||||
|
||||
print(f'[+] Created dependency detection crawl: {crawl.id}')
|
||||
if crawl_config:
|
||||
print(f'[+] Crawl config: {crawl_config}')
|
||||
print(f'[+] Crawl status: {crawl.status}, retry_at: {crawl.retry_at}')
|
||||
|
||||
# Verify the crawl is in the queue
|
||||
@@ -100,15 +131,15 @@ def install(dry_run: bool=False) -> None:
|
||||
|
||||
print()
|
||||
|
||||
# Run version to show full status
|
||||
archivebox_path = shutil.which('archivebox') or sys.executable
|
||||
if 'python' in archivebox_path:
|
||||
os.system(f'{sys.executable} -m archivebox version')
|
||||
else:
|
||||
os.system(f'{archivebox_path} version')
|
||||
# Show version to display full status including installed binaries
|
||||
# Django is already loaded, so just import and call the function directly
|
||||
from archivebox.cli.archivebox_version import version as show_version
|
||||
show_version(quiet=False)
|
||||
|
||||
|
||||
@click.command()
|
||||
@click.argument('binaries', nargs=-1, type=str, required=False)
|
||||
@click.option('--binproviders', '-p', default='*', help='Comma-separated list of providers to use (pip,npm,brew,apt,env,custom) or * for all', show_default=True)
|
||||
@click.option('--dry-run', '-d', is_flag=True, help='Show what would happen without actually running', default=False)
|
||||
@docstring(install.__doc__)
|
||||
def main(**kwargs) -> None:
|
||||
|
||||
@@ -50,6 +50,9 @@ def get_snapshots(snapshots: Optional[QuerySet]=None,
|
||||
if filter_patterns:
|
||||
result = Snapshot.objects.filter_by_patterns(filter_patterns, filter_type)
|
||||
|
||||
# Prefetch crawl relationship to avoid N+1 queries when accessing output_dir
|
||||
result = result.select_related('crawl', 'crawl__created_by')
|
||||
|
||||
if not result:
|
||||
stderr('[!] No Snapshots matched your filters:', filter_patterns, f'({filter_type})', color='lightyellow')
|
||||
|
||||
|
||||
@@ -145,16 +145,29 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> di
|
||||
# Check if needs migration (0.8.x → 0.9.x)
|
||||
if snapshot.fs_migration_needed:
|
||||
try:
|
||||
snapshot.save() # Triggers migration + creates symlink
|
||||
# Manually trigger filesystem migration without full save()
|
||||
# This avoids UNIQUE constraint issues while still migrating files
|
||||
cleanup_info = None
|
||||
if hasattr(snapshot, '_fs_migrate_from_0_8_0_to_0_9_0'):
|
||||
cleanup_info = snapshot._fs_migrate_from_0_8_0_to_0_9_0()
|
||||
|
||||
# Update only fs_version field using queryset update (bypasses validation)
|
||||
from archivebox.core.models import Snapshot as SnapshotModel
|
||||
SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
|
||||
|
||||
# Commit the transaction
|
||||
transaction.commit()
|
||||
|
||||
# Manually call cleanup since we bypassed normal save() flow
|
||||
if cleanup_info:
|
||||
old_dir, new_dir = cleanup_info
|
||||
snapshot._cleanup_old_migration_dir(old_dir, new_dir)
|
||||
|
||||
stats['migrated'] += 1
|
||||
print(f" [{stats['processed']}] Migrated: {entry_path.name}")
|
||||
except Exception as e:
|
||||
# Snapshot already exists in DB with different crawl - skip it
|
||||
if 'UNIQUE constraint failed' in str(e):
|
||||
stats['skipped'] += 1
|
||||
print(f" [{stats['processed']}] Skipped (already in DB): {entry_path.name}")
|
||||
else:
|
||||
raise
|
||||
stats['skipped'] += 1
|
||||
print(f" [{stats['processed']}] Skipped (error: {e}): {entry_path.name}")
|
||||
else:
|
||||
stats['skipped'] += 1
|
||||
|
||||
|
||||
@@ -104,40 +104,47 @@ def version(quiet: bool=False,
|
||||
failures = []
|
||||
|
||||
# Setup Django before importing models
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
try:
|
||||
from archivebox.config.django import setup_django
|
||||
setup_django()
|
||||
|
||||
from archivebox.machine.models import Machine, Binary
|
||||
from archivebox.machine.models import Machine, Binary
|
||||
|
||||
machine = Machine.current()
|
||||
machine = Machine.current()
|
||||
|
||||
# Get all binaries from the database
|
||||
all_installed = Binary.objects.filter(
|
||||
machine=machine
|
||||
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
|
||||
# Get all binaries from the database with timeout protection
|
||||
all_installed = Binary.objects.filter(
|
||||
machine=machine
|
||||
).exclude(abspath='').exclude(abspath__isnull=True).order_by('name')
|
||||
|
||||
if not all_installed.exists():
|
||||
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
|
||||
else:
|
||||
for installed in all_installed:
|
||||
# Skip if user specified specific binaries and this isn't one
|
||||
if binaries and installed.name not in binaries:
|
||||
continue
|
||||
if not all_installed.exists():
|
||||
prnt('', '[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]')
|
||||
else:
|
||||
for installed in all_installed:
|
||||
# Skip if user specified specific binaries and this isn't one
|
||||
if binaries and installed.name not in binaries:
|
||||
continue
|
||||
|
||||
if installed.is_valid:
|
||||
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
|
||||
version_str = (installed.version or 'unknown')[:15]
|
||||
provider = (installed.binprovider or 'env')[:8]
|
||||
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
|
||||
else:
|
||||
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
|
||||
failures.append(installed.name)
|
||||
if installed.is_valid:
|
||||
display_path = installed.abspath.replace(str(DATA_DIR), '.').replace(str(Path('~').expanduser()), '~')
|
||||
version_str = (installed.version or 'unknown')[:15]
|
||||
provider = (installed.binprovider or 'env')[:8]
|
||||
prnt('', '[green]√[/green]', '', installed.name.ljust(18), version_str.ljust(16), provider.ljust(8), display_path, overflow='ignore', crop=False)
|
||||
else:
|
||||
prnt('', '[red]X[/red]', '', installed.name.ljust(18), '[grey53]not installed[/grey53]', overflow='ignore', crop=False)
|
||||
failures.append(installed.name)
|
||||
|
||||
# Show hint if no binaries are installed yet
|
||||
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
|
||||
if not has_any_installed:
|
||||
# Show hint if no binaries are installed yet
|
||||
has_any_installed = Binary.objects.filter(machine=machine).exclude(abspath='').exists()
|
||||
if not has_any_installed:
|
||||
prnt()
|
||||
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
|
||||
|
||||
except Exception as e:
|
||||
# Handle database errors gracefully (locked, missing, etc.)
|
||||
prnt()
|
||||
prnt('', '[grey53]Run [green]archivebox install[/green] to detect and install dependencies.[/grey53]')
|
||||
prnt('', f'[yellow]Warning: Could not query binaries from database: {e}[/yellow]')
|
||||
prnt('', '[grey53]Run [green]archivebox init[/green] and [green]archivebox install[/green] to set up dependencies.[/grey53]')
|
||||
|
||||
if not binaries:
|
||||
# Show code and data locations
|
||||
|
||||
@@ -116,7 +116,7 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
retry_at DATETIME,
|
||||
|
||||
depth INTEGER NOT NULL DEFAULT 0,
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.8.0',
|
||||
config TEXT NOT NULL DEFAULT '{}',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
@@ -326,6 +326,16 @@ class Migration(migrations.Migration):
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
# Declare fs_version (already created in database with DEFAULT '0.8.0')
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='fs_version',
|
||||
field=models.CharField(
|
||||
max_length=10,
|
||||
default='0.8.0',
|
||||
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
|
||||
),
|
||||
),
|
||||
|
||||
# SnapshotTag table already exists from v0.7.2, just declare it in state
|
||||
migrations.CreateModel(
|
||||
|
||||
@@ -150,11 +150,7 @@ class Migration(migrations.Migration):
|
||||
name='downloaded_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='fs_version',
|
||||
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
|
||||
),
|
||||
# NOTE: fs_version already added by migration 0023 with default='0.8.0'
|
||||
# NOTE: modified_at already added by migration 0023
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
|
||||
@@ -8,7 +8,7 @@ class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'),
|
||||
('machine', '0003_add_process_type_and_parent'),
|
||||
('machine', '0007_add_process_type_and_parent'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
|
||||
388
archivebox/core/migrations/0027_copy_archiveresult_to_process.py
Normal file
388
archivebox/core/migrations/0027_copy_archiveresult_to_process.py
Normal file
@@ -0,0 +1,388 @@
|
||||
# Generated by hand on 2026-01-01
|
||||
# Copies ArchiveResult cmd/pwd/cmd_version data to Process records before removing old fields
|
||||
|
||||
from django.db import migrations, connection
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def parse_cmd_field(cmd_raw):
|
||||
"""
|
||||
Parse cmd field which could be:
|
||||
1. JSON array string: '["wget", "-p", "url"]'
|
||||
2. Space-separated string: 'wget -p url'
|
||||
3. NULL/empty
|
||||
|
||||
Returns list of strings.
|
||||
"""
|
||||
if not cmd_raw:
|
||||
return []
|
||||
|
||||
cmd_raw = cmd_raw.strip()
|
||||
|
||||
if not cmd_raw:
|
||||
return []
|
||||
|
||||
# Try to parse as JSON first
|
||||
if cmd_raw.startswith('['):
|
||||
try:
|
||||
parsed = json.loads(cmd_raw)
|
||||
if isinstance(parsed, list):
|
||||
return [str(x) for x in parsed]
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Fallback: split by spaces (simple approach, doesn't handle quoted strings)
|
||||
# This is acceptable since old cmd fields were mostly simple commands
|
||||
return cmd_raw.split()
|
||||
|
||||
|
||||
def get_or_create_current_machine(cursor):
|
||||
"""Get or create Machine.current() using raw SQL."""
|
||||
import uuid
|
||||
import socket
|
||||
from datetime import datetime
|
||||
|
||||
# Simple machine detection - get hostname as guid
|
||||
hostname = socket.gethostname()
|
||||
guid = f'host_{hostname}' # Simple but stable identifier
|
||||
|
||||
# Check if machine exists
|
||||
cursor.execute("SELECT id FROM machine_machine WHERE guid = ?", [guid])
|
||||
row = cursor.fetchone()
|
||||
|
||||
if row:
|
||||
return row[0]
|
||||
|
||||
# Create new machine
|
||||
machine_id = str(uuid.uuid4())
|
||||
now = datetime.now().isoformat()
|
||||
|
||||
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
|
||||
cursor.execute("PRAGMA table_info(machine_machine)")
|
||||
machine_cols = {row[1] for row in cursor.fetchall()}
|
||||
|
||||
# Build INSERT statement based on available columns
|
||||
if 'config' in machine_cols:
|
||||
# 0.9.x schema with config column
|
||||
cursor.execute("""
|
||||
INSERT INTO machine_machine (
|
||||
id, created_at, modified_at, guid, hostname,
|
||||
hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
|
||||
os_arch, os_family, os_platform, os_release, os_kernel,
|
||||
stats, config, num_uses_failed, num_uses_succeeded
|
||||
) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
|
||||
'', '', '', '', '', '{}', '{}', 0, 0)
|
||||
""", [machine_id, now, now, guid, hostname])
|
||||
else:
|
||||
# 0.8.x schema without config column
|
||||
cursor.execute("""
|
||||
INSERT INTO machine_machine (
|
||||
id, created_at, modified_at, guid, hostname,
|
||||
hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
|
||||
os_arch, os_family, os_platform, os_release, os_kernel,
|
||||
stats, num_uses_failed, num_uses_succeeded
|
||||
) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
|
||||
'', '', '', '', '', '{}', 0, 0)
|
||||
""", [machine_id, now, now, guid, hostname])
|
||||
|
||||
return machine_id
|
||||
|
||||
|
||||
def get_or_create_binary(cursor, machine_id, name, abspath, version):
|
||||
"""
|
||||
Get or create Binary record.
|
||||
|
||||
Args:
|
||||
cursor: DB cursor
|
||||
machine_id: Machine FK
|
||||
name: Binary name (basename of command)
|
||||
abspath: Absolute path to binary (or just name if path unknown)
|
||||
version: Version string
|
||||
|
||||
Returns:
|
||||
binary_id (str)
|
||||
"""
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
# If abspath is just a name without slashes, it's not a full path
|
||||
# Store it in both fields for simplicity
|
||||
if '/' not in abspath:
|
||||
# Not a full path - store as-is
|
||||
pass
|
||||
|
||||
# Check if binary exists with same machine, name, abspath, version
|
||||
cursor.execute("""
|
||||
SELECT id FROM machine_binary
|
||||
WHERE machine_id = ? AND name = ? AND abspath = ? AND version = ?
|
||||
""", [machine_id, name, abspath, version])
|
||||
|
||||
row = cursor.fetchone()
|
||||
if row:
|
||||
return row[0]
|
||||
|
||||
# Create new binary
|
||||
binary_id = str(uuid.uuid4())
|
||||
now = datetime.now().isoformat()
|
||||
|
||||
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
|
||||
cursor.execute("PRAGMA table_info(machine_binary)")
|
||||
binary_cols = {row[1] for row in cursor.fetchall()}
|
||||
|
||||
# Use only columns that exist in current schema
|
||||
# 0.8.x schema: id, created_at, modified_at, machine_id, name, binprovider, abspath, version, sha256, num_uses_failed, num_uses_succeeded
|
||||
# 0.9.x schema adds: binproviders, overrides, status, retry_at, output_dir
|
||||
if 'binproviders' in binary_cols:
|
||||
# 0.9.x schema
|
||||
cursor.execute("""
|
||||
INSERT INTO machine_binary (
|
||||
id, created_at, modified_at, machine_id,
|
||||
name, binproviders, overrides, binprovider, abspath, version, sha256,
|
||||
status, retry_at, output_dir,
|
||||
num_uses_failed, num_uses_succeeded
|
||||
) VALUES (?, ?, ?, ?, ?, 'env', '{}', 'env', ?, ?, '',
|
||||
'succeeded', NULL, '', 0, 0)
|
||||
""", [binary_id, now, now, machine_id, name, abspath, version])
|
||||
else:
|
||||
# 0.8.x schema (simpler)
|
||||
cursor.execute("""
|
||||
INSERT INTO machine_binary (
|
||||
id, created_at, modified_at, machine_id,
|
||||
name, binprovider, abspath, version, sha256,
|
||||
num_uses_failed, num_uses_succeeded
|
||||
) VALUES (?, ?, ?, ?, ?, 'env', ?, ?, '', 0, 0)
|
||||
""", [binary_id, now, now, machine_id, name, abspath, version])
|
||||
|
||||
return binary_id
|
||||
|
||||
|
||||
def map_status(old_status):
|
||||
"""
|
||||
Map old ArchiveResult status to Process status and exit_code.
|
||||
|
||||
Args:
|
||||
old_status: One of: queued, started, backoff, succeeded, failed, skipped
|
||||
|
||||
Returns:
|
||||
(process_status, exit_code) tuple
|
||||
"""
|
||||
status_map = {
|
||||
'queued': ('queued', None),
|
||||
'started': ('running', None),
|
||||
'backoff': ('queued', None),
|
||||
'succeeded': ('exited', 0),
|
||||
'failed': ('exited', 1),
|
||||
'skipped': ('exited', None), # Skipped = exited without error
|
||||
}
|
||||
|
||||
return status_map.get(old_status, ('queued', None))
|
||||
|
||||
|
||||
def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at, ended_at, binary_id):
|
||||
"""
|
||||
Create a Process record.
|
||||
|
||||
Returns:
|
||||
process_id (str)
|
||||
"""
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
process_id = str(uuid.uuid4())
|
||||
now = datetime.now().isoformat()
|
||||
|
||||
# Convert cmd array to JSON
|
||||
cmd_json = json.dumps(cmd)
|
||||
|
||||
# Set retry_at to now for queued processes, NULL otherwise
|
||||
retry_at = now if status == 'queued' else None
|
||||
|
||||
cursor.execute("""
|
||||
INSERT INTO machine_process (
|
||||
id, created_at, modified_at, machine_id, parent_id, process_type,
|
||||
pwd, cmd, env, timeout,
|
||||
pid, exit_code, stdout, stderr,
|
||||
started_at, ended_at,
|
||||
binary_id, iface_id, url,
|
||||
status, retry_at
|
||||
) VALUES (?, ?, ?, ?, NULL, 'cli',
|
||||
?, ?, '{}', 120,
|
||||
NULL, ?, '', '',
|
||||
?, ?,
|
||||
?, NULL, NULL,
|
||||
?, ?)
|
||||
""", [
|
||||
process_id, now, now, machine_id,
|
||||
pwd, cmd_json,
|
||||
exit_code,
|
||||
started_at, ended_at,
|
||||
binary_id,
|
||||
status, retry_at
|
||||
])
|
||||
|
||||
return process_id
|
||||
|
||||
|
||||
def copy_archiveresult_data_to_process(apps, schema_editor):
|
||||
"""
|
||||
Copy old ArchiveResult execution data (cmd, pwd, cmd_version) to Process records.
|
||||
|
||||
For each ArchiveResult without a process_id:
|
||||
1. Parse cmd field (handle both JSON array and space-separated string)
|
||||
2. Extract binary name/path from cmd[0]
|
||||
3. Get or create Binary record with machine, name, abspath, version
|
||||
4. Create Process record with mapped fields
|
||||
5. Link ArchiveResult.process_id to new Process
|
||||
|
||||
Status mapping:
|
||||
- queued → queued (exit_code=None)
|
||||
- started → running (exit_code=None)
|
||||
- backoff → queued (exit_code=None)
|
||||
- succeeded → exited (exit_code=0)
|
||||
- failed → exited (exit_code=1)
|
||||
- skipped → exited (exit_code=None)
|
||||
"""
|
||||
cursor = connection.cursor()
|
||||
|
||||
# Check if old fields still exist (skip if fresh install or already migrated)
|
||||
cursor.execute("PRAGMA table_info(core_archiveresult)")
|
||||
cols = {row[1] for row in cursor.fetchall()}
|
||||
|
||||
print(f'DEBUG 0027: Columns found: {sorted(cols)}')
|
||||
print(f'DEBUG 0027: Has cmd={("cmd" in cols)}, pwd={("pwd" in cols)}, cmd_version={("cmd_version" in cols)}, process_id={("process_id" in cols)}')
|
||||
|
||||
if 'cmd' not in cols or 'pwd' not in cols or 'cmd_version' not in cols:
|
||||
print('✓ Fresh install or fields already removed - skipping data copy')
|
||||
return
|
||||
|
||||
# Check if process_id field exists (should exist from 0026)
|
||||
if 'process_id' not in cols:
|
||||
print('✗ ERROR: process_id field not found. Migration 0026 must run first.')
|
||||
return
|
||||
|
||||
# Get or create Machine.current()
|
||||
machine_id = get_or_create_current_machine(cursor)
|
||||
|
||||
# Get ArchiveResults without process_id that have cmd data
|
||||
# Use plugin (extractor was renamed to plugin in migration 0025)
|
||||
cursor.execute("""
|
||||
SELECT id, snapshot_id, plugin, cmd, pwd, cmd_version,
|
||||
status, start_ts, end_ts, created_at
|
||||
FROM core_archiveresult
|
||||
WHERE process_id IS NULL
|
||||
AND (cmd IS NOT NULL OR pwd IS NOT NULL)
|
||||
""")
|
||||
|
||||
results = cursor.fetchall()
|
||||
|
||||
if not results:
|
||||
print('✓ No ArchiveResults need Process migration')
|
||||
return
|
||||
|
||||
print(f'Migrating {len(results)} ArchiveResults to Process records...')
|
||||
|
||||
migrated_count = 0
|
||||
skipped_count = 0
|
||||
error_count = 0
|
||||
|
||||
for i, row in enumerate(results):
|
||||
ar_id, snapshot_id, plugin, cmd_raw, pwd, cmd_version, status, start_ts, end_ts, created_at = row
|
||||
|
||||
if i == 0:
|
||||
print(f'DEBUG 0027: First row: ar_id={ar_id}, plugin={plugin}, cmd={cmd_raw[:50] if cmd_raw else None}, status={status}')
|
||||
|
||||
try:
|
||||
# Parse cmd field
|
||||
cmd_array = parse_cmd_field(cmd_raw)
|
||||
|
||||
if i == 0:
|
||||
print(f'DEBUG 0027: Parsed cmd: {cmd_array}')
|
||||
|
||||
# Extract binary info from cmd[0] if available
|
||||
binary_id = None
|
||||
if cmd_array and cmd_array[0]:
|
||||
binary_name = Path(cmd_array[0]).name or plugin # Fallback to plugin name
|
||||
binary_abspath = cmd_array[0]
|
||||
binary_version = cmd_version or ''
|
||||
|
||||
# Get or create Binary record
|
||||
binary_id = get_or_create_binary(
|
||||
cursor, machine_id, binary_name, binary_abspath, binary_version
|
||||
)
|
||||
|
||||
if i == 0:
|
||||
print(f'DEBUG 0027: Created Binary: id={binary_id}, name={binary_name}')
|
||||
|
||||
# Map status
|
||||
process_status, exit_code = map_status(status)
|
||||
|
||||
# Set timestamps
|
||||
started_at = start_ts or created_at
|
||||
ended_at = end_ts if process_status == 'exited' else None
|
||||
|
||||
# Create Process record
|
||||
process_id = create_process(
|
||||
cursor=cursor,
|
||||
machine_id=machine_id,
|
||||
pwd=pwd or '',
|
||||
cmd=cmd_array,
|
||||
status=process_status,
|
||||
exit_code=exit_code,
|
||||
started_at=started_at,
|
||||
ended_at=ended_at,
|
||||
binary_id=binary_id,
|
||||
)
|
||||
|
||||
if i == 0:
|
||||
print(f'DEBUG 0027: Created Process: id={process_id}')
|
||||
|
||||
# Link ArchiveResult to Process
|
||||
cursor.execute(
|
||||
"UPDATE core_archiveresult SET process_id = ? WHERE id = ?",
|
||||
[process_id, ar_id]
|
||||
)
|
||||
|
||||
migrated_count += 1
|
||||
|
||||
if i == 0:
|
||||
print(f'DEBUG 0027: Linked ArchiveResult to Process')
|
||||
|
||||
except Exception as e:
|
||||
print(f'✗ Error migrating ArchiveResult {ar_id}: {e}')
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
error_count += 1
|
||||
continue
|
||||
|
||||
print(f'✓ Migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0026_add_process_to_archiveresult'),
|
||||
('machine', '0007_add_process_type_and_parent'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
# First, copy data from old fields to Process
|
||||
migrations.RunPython(
|
||||
copy_archiveresult_data_to_process,
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
|
||||
# Now safe to remove old fields (moved from 0025)
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='cmd',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='cmd_version',
|
||||
),
|
||||
migrations.RemoveField(
|
||||
model_name='archiveresult',
|
||||
name='pwd',
|
||||
),
|
||||
]
|
||||
@@ -362,24 +362,22 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
# Migrate filesystem if needed (happens automatically on save)
|
||||
if self.pk and self.fs_migration_needed:
|
||||
from django.db import transaction
|
||||
with transaction.atomic():
|
||||
# Walk through migration chain automatically
|
||||
current = self.fs_version
|
||||
target = self._fs_current_version()
|
||||
# Walk through migration chain automatically
|
||||
current = self.fs_version
|
||||
target = self._fs_current_version()
|
||||
|
||||
while current != target:
|
||||
next_ver = self._fs_next_version(current)
|
||||
method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
|
||||
while current != target:
|
||||
next_ver = self._fs_next_version(current)
|
||||
method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
|
||||
|
||||
# Only run if method exists (most are no-ops)
|
||||
if hasattr(self, method):
|
||||
getattr(self, method)()
|
||||
# Only run if method exists (most are no-ops)
|
||||
if hasattr(self, method):
|
||||
getattr(self, method)()
|
||||
|
||||
current = next_ver
|
||||
current = next_ver
|
||||
|
||||
# Update version (still in transaction)
|
||||
self.fs_version = target
|
||||
# Update version
|
||||
self.fs_version = target
|
||||
|
||||
super().save(*args, **kwargs)
|
||||
if self.url not in self.crawl.urls:
|
||||
@@ -486,33 +484,58 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
# Convert index.json to index.jsonl in the new directory
|
||||
self.convert_index_json_to_jsonl()
|
||||
|
||||
# Create backwards-compat symlink (INSIDE transaction)
|
||||
symlink_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
|
||||
if symlink_path.is_symlink():
|
||||
symlink_path.unlink()
|
||||
# Schedule cleanup AFTER transaction commits successfully
|
||||
# This ensures DB changes are committed before we delete old files
|
||||
from django.db import transaction
|
||||
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, new_dir))
|
||||
|
||||
if not symlink_path.exists() or symlink_path == old_dir:
|
||||
symlink_path.symlink_to(new_dir, target_is_directory=True)
|
||||
# Return cleanup info for manual cleanup if needed (when called directly)
|
||||
return (old_dir, new_dir)
|
||||
|
||||
# Schedule old directory deletion AFTER transaction commits
|
||||
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir))
|
||||
|
||||
def _cleanup_old_migration_dir(self, old_dir: Path):
|
||||
def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path):
|
||||
"""
|
||||
Delete old directory after successful migration.
|
||||
Delete old directory and create symlink after successful migration.
|
||||
Called via transaction.on_commit() after DB commit succeeds.
|
||||
"""
|
||||
import shutil
|
||||
import logging
|
||||
|
||||
print(f"[DEBUG] _cleanup_old_migration_dir called: old_dir={old_dir}, new_dir={new_dir}")
|
||||
|
||||
# Delete old directory
|
||||
if old_dir.exists() and not old_dir.is_symlink():
|
||||
print(f"[DEBUG] Attempting to delete old directory: {old_dir}")
|
||||
try:
|
||||
shutil.rmtree(old_dir)
|
||||
print(f"[DEBUG] Successfully deleted old directory: {old_dir}")
|
||||
except Exception as e:
|
||||
# Log but don't raise - migration succeeded, this is just cleanup
|
||||
print(f"[DEBUG] Failed to delete old directory {old_dir}: {e}")
|
||||
logging.getLogger('archivebox.migration').warning(
|
||||
f"Could not remove old migration directory {old_dir}: {e}"
|
||||
)
|
||||
return # Don't create symlink if cleanup failed
|
||||
else:
|
||||
print(f"[DEBUG] Old directory doesn't exist or is already a symlink: {old_dir}")
|
||||
|
||||
# Create backwards-compat symlink (after old dir is deleted)
|
||||
symlink_path = old_dir # Same path as old_dir
|
||||
if symlink_path.is_symlink():
|
||||
print(f"[DEBUG] Unlinking existing symlink: {symlink_path}")
|
||||
symlink_path.unlink()
|
||||
|
||||
if not symlink_path.exists():
|
||||
print(f"[DEBUG] Creating symlink: {symlink_path} -> {new_dir}")
|
||||
try:
|
||||
symlink_path.symlink_to(new_dir, target_is_directory=True)
|
||||
print(f"[DEBUG] Successfully created symlink")
|
||||
except Exception as e:
|
||||
print(f"[DEBUG] Failed to create symlink: {e}")
|
||||
logging.getLogger('archivebox.migration').warning(
|
||||
f"Could not create symlink from {symlink_path} to {new_dir}: {e}"
|
||||
)
|
||||
else:
|
||||
print(f"[DEBUG] Symlink path already exists: {symlink_path}")
|
||||
|
||||
# =========================================================================
|
||||
# Path Calculation and Migration Helpers
|
||||
@@ -1616,8 +1639,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
This enables step-based execution where all hooks in a step can run in parallel.
|
||||
"""
|
||||
from archivebox.hooks import discover_hooks
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
hooks = discover_hooks('Snapshot')
|
||||
# Get merged config with crawl-specific PLUGINS filter
|
||||
config = get_config(crawl=self.crawl, snapshot=self)
|
||||
hooks = discover_hooks('Snapshot', config=config)
|
||||
archiveresults = []
|
||||
|
||||
for hook_path in hooks:
|
||||
@@ -2212,22 +2238,19 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
started = State(value=Snapshot.StatusChoices.STARTED)
|
||||
sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
|
||||
|
||||
# Tick Event
|
||||
# Tick Event (polled by workers)
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished', on='on_started_to_started') |
|
||||
started.to(sealed, cond='is_finished')
|
||||
queued.to(started, cond='can_start')
|
||||
)
|
||||
|
||||
# Manual event (triggered by last ArchiveResult finishing)
|
||||
seal = started.to(sealed)
|
||||
|
||||
def can_start(self) -> bool:
|
||||
can_start = bool(self.snapshot.url)
|
||||
return can_start
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if snapshot processing is complete - delegates to model method."""
|
||||
return self.snapshot.is_finished_processing()
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
self.snapshot.update_and_requeue(
|
||||
@@ -2237,29 +2260,34 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
# lock the snapshot while we create the pending archiveresults
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
|
||||
)
|
||||
import sys
|
||||
|
||||
print(f'[cyan] 🔄 SnapshotMachine.enter_started() - creating archiveresults for {self.snapshot.url}[/cyan]', file=sys.stderr)
|
||||
|
||||
# Run the snapshot - creates pending archiveresults for all enabled plugins
|
||||
self.snapshot.run()
|
||||
|
||||
# unlock the snapshot after we're done + set status = started
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
# Check if any archiveresults were created
|
||||
ar_count = self.snapshot.archiveresult_set.count()
|
||||
print(f'[cyan] 🔄 ArchiveResult count: {ar_count}[/cyan]', file=sys.stderr)
|
||||
|
||||
def on_started_to_started(self):
|
||||
"""Called when Snapshot stays in started state (archiveresults not finished yet)."""
|
||||
# Bump retry_at so we check again in a few seconds
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=5),
|
||||
)
|
||||
if ar_count == 0:
|
||||
# No archiveresults created, seal immediately
|
||||
print(f'[cyan] 🔄 No archiveresults created, sealing snapshot immediately[/cyan]', file=sys.stderr)
|
||||
self.seal()
|
||||
else:
|
||||
# Set status = started with retry_at far future (so workers don't claim us - we're waiting for ARs)
|
||||
# Last AR will manually call self.seal() when done
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(days=365),
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
print(f'[cyan] 🔄 {ar_count} archiveresults created, waiting for them to finish[/cyan]', file=sys.stderr)
|
||||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
import sys
|
||||
|
||||
# Clean up background hooks
|
||||
self.snapshot.cleanup()
|
||||
|
||||
@@ -2268,6 +2296,21 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
)
|
||||
|
||||
print(f'[cyan] ✅ SnapshotMachine.enter_sealed() - sealed {self.snapshot.url}[/cyan]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last snapshot for the parent crawl - if so, seal the crawl
|
||||
if self.snapshot.crawl:
|
||||
crawl = self.snapshot.crawl
|
||||
remaining_active = Snapshot.objects.filter(
|
||||
crawl=crawl,
|
||||
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
|
||||
).count()
|
||||
|
||||
if remaining_active == 0:
|
||||
print(f'[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]', file=sys.stderr)
|
||||
# Seal the parent crawl
|
||||
crawl.sm.seal()
|
||||
|
||||
|
||||
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
|
||||
class StatusChoices(models.TextChoices):
|
||||
@@ -3102,8 +3145,30 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
end_ts=None,
|
||||
)
|
||||
|
||||
def _check_and_seal_parent_snapshot(self):
|
||||
"""Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot."""
|
||||
import sys
|
||||
|
||||
snapshot = self.archiveresult.snapshot
|
||||
|
||||
# Check if all archiveresults are finished (in final states)
|
||||
remaining_active = snapshot.archiveresult_set.exclude(
|
||||
status__in=[
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
]
|
||||
).count()
|
||||
|
||||
if remaining_active == 0:
|
||||
print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr)
|
||||
# Seal the parent snapshot
|
||||
snapshot.sm.seal()
|
||||
|
||||
@succeeded.enter
|
||||
def enter_succeeded(self):
|
||||
import sys
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
@@ -3113,8 +3178,15 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
|
||||
self.archiveresult.cascade_health_update(success=True)
|
||||
|
||||
print(f'[cyan] ✅ ArchiveResult succeeded: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/cyan]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last AR to finish - seal parent snapshot if so
|
||||
self._check_and_seal_parent_snapshot()
|
||||
|
||||
@failed.enter
|
||||
def enter_failed(self):
|
||||
import sys
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.FAILED,
|
||||
@@ -3124,16 +3196,25 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
|
||||
self.archiveresult.cascade_health_update(success=False)
|
||||
|
||||
print(f'[red] ❌ ArchiveResult failed: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/red]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last AR to finish - seal parent snapshot if so
|
||||
self._check_and_seal_parent_snapshot()
|
||||
|
||||
@skipped.enter
|
||||
def enter_skipped(self):
|
||||
import sys
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SKIPPED,
|
||||
end_ts=timezone.now(),
|
||||
)
|
||||
|
||||
def after_transition(self, event: str, source: State, target: State):
|
||||
self.archiveresult.snapshot.update_and_requeue() # bump snapshot retry time so it picks up all the new changes
|
||||
print(f'[dim] ⏭️ ArchiveResult skipped: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/dim]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last AR to finish - seal parent snapshot if so
|
||||
self._check_and_seal_parent_snapshot()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
||||
@@ -240,19 +240,26 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
if not first_url:
|
||||
raise ValueError(f'Crawl {self.id} has no URLs to create root snapshot from')
|
||||
|
||||
# Try to get existing snapshot
|
||||
try:
|
||||
return Snapshot.objects.get(crawl=self, url=first_url)
|
||||
snapshot = Snapshot.objects.get(crawl=self, url=first_url)
|
||||
# If exists and already queued/started, return it as-is
|
||||
if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]:
|
||||
# Update retry_at to now so it can be picked up immediately
|
||||
snapshot.retry_at = timezone.now()
|
||||
snapshot.save(update_fields=['retry_at'])
|
||||
return snapshot
|
||||
except Snapshot.DoesNotExist:
|
||||
pass
|
||||
|
||||
root_snapshot, _ = Snapshot.objects.update_or_create(
|
||||
crawl=self, url=first_url,
|
||||
defaults={
|
||||
'status': Snapshot.INITIAL_STATE,
|
||||
'retry_at': timezone.now(),
|
||||
'timestamp': str(timezone.now().timestamp()),
|
||||
'depth': 0,
|
||||
},
|
||||
# Create new snapshot
|
||||
root_snapshot = Snapshot.objects.create(
|
||||
crawl=self,
|
||||
url=first_url,
|
||||
status=Snapshot.INITIAL_STATE,
|
||||
retry_at=timezone.now(),
|
||||
timestamp=str(timezone.now().timestamp()),
|
||||
depth=0,
|
||||
)
|
||||
return root_snapshot
|
||||
|
||||
@@ -362,14 +369,14 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
|
||||
return created_snapshots
|
||||
|
||||
def run(self) -> 'Snapshot':
|
||||
def run(self) -> 'Snapshot | None':
|
||||
"""
|
||||
Execute this Crawl: run hooks, process JSONL, create snapshots.
|
||||
|
||||
Called by the state machine when entering the 'started' state.
|
||||
|
||||
Returns:
|
||||
The root Snapshot for this crawl
|
||||
The root Snapshot for this crawl, or None for system crawls that don't create snapshots
|
||||
"""
|
||||
import time
|
||||
from pathlib import Path
|
||||
@@ -407,8 +414,18 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
|
||||
# Foreground hook - process JSONL records
|
||||
records = result.get('records', [])
|
||||
if records:
|
||||
print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]')
|
||||
for record in records[:3]: # Show first 3
|
||||
print(f' Record: type={record.get("type")}, keys={list(record.keys())[:5]}')
|
||||
overrides = {'crawl': self}
|
||||
process_hook_records(records, overrides=overrides)
|
||||
stats = process_hook_records(records, overrides=overrides)
|
||||
if stats:
|
||||
print(f'[green]✓ Created: {stats}[/green]')
|
||||
|
||||
# System crawls (archivebox://*) don't create snapshots - they just run hooks
|
||||
if first_url.startswith('archivebox://'):
|
||||
return None
|
||||
|
||||
# Create snapshots from URLs
|
||||
root_snapshot = self.create_root_snapshot()
|
||||
@@ -498,14 +515,15 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
|
||||
started = State(value=Crawl.StatusChoices.STARTED)
|
||||
sealed = State(value=Crawl.StatusChoices.SEALED, final=True)
|
||||
|
||||
# Tick Event
|
||||
# Tick Event (polled by workers)
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished', on='on_started_to_started') |
|
||||
started.to(sealed, cond='is_finished')
|
||||
queued.to(started, cond='can_start')
|
||||
)
|
||||
|
||||
# Manual event (triggered by last Snapshot sealing)
|
||||
seal = started.to(sealed)
|
||||
|
||||
def can_start(self) -> bool:
|
||||
if not self.crawl.urls:
|
||||
print(f'[red]⚠️ Crawl {self.crawl.id} cannot start: no URLs[/red]')
|
||||
@@ -516,55 +534,38 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
|
||||
return False
|
||||
return True
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
# Check if any snapshots exist for this crawl
|
||||
snapshots = Snapshot.objects.filter(crawl=self.crawl)
|
||||
|
||||
# If no snapshots exist, allow finishing (e.g., archivebox://install crawls that only run hooks)
|
||||
if not snapshots.exists():
|
||||
return True
|
||||
|
||||
# If snapshots exist, check if all are sealed
|
||||
# Snapshots handle their own background hooks via the step system,
|
||||
# so we just need to wait for all snapshots to reach sealed state
|
||||
if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
# Lock the crawl by bumping retry_at so other workers don't pick it up while we create snapshots
|
||||
self.crawl.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=30), # Lock for 30 seconds
|
||||
)
|
||||
import sys
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
print(f'[cyan]🔄 CrawlMachine.enter_started() - creating snapshots for {self.crawl.id}[/cyan]', file=sys.stderr)
|
||||
|
||||
try:
|
||||
# Run the crawl - runs hooks, processes JSONL, creates snapshots
|
||||
self.crawl.run()
|
||||
root_snapshot = self.crawl.run()
|
||||
|
||||
if root_snapshot:
|
||||
print(f'[cyan]🔄 Created root snapshot: {root_snapshot.url}[/cyan]', file=sys.stderr)
|
||||
# Update status to STARTED
|
||||
# Set retry_at to far future so workers don't claim us (we're waiting for snapshots to finish)
|
||||
# Last snapshot will manually call self.seal() when done
|
||||
self.crawl.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(days=365),
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
else:
|
||||
# No snapshots (system crawl like archivebox://install)
|
||||
print(f'[cyan]🔄 No snapshots created, sealing crawl immediately[/cyan]', file=sys.stderr)
|
||||
# Seal immediately since there's no work to do
|
||||
self.seal()
|
||||
|
||||
# Update status to STARTED once snapshots are created
|
||||
# Set retry_at to future so we don't busy-loop - wait for snapshots to process
|
||||
self.crawl.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=5), # Check again in 5s
|
||||
status=Crawl.StatusChoices.STARTED,
|
||||
)
|
||||
except Exception as e:
|
||||
print(f'[red]⚠️ Crawl {self.crawl.id} failed to start: {e}[/red]')
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
# Re-raise so the worker knows it failed
|
||||
raise
|
||||
|
||||
def on_started_to_started(self):
|
||||
"""Called when Crawl stays in started state (snapshots not sealed yet)."""
|
||||
# Bump retry_at so we check again in a few seconds
|
||||
self.crawl.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=5),
|
||||
)
|
||||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
# Clean up background hooks and run on_CrawlEnd hooks
|
||||
|
||||
@@ -480,7 +480,7 @@ def run_hook(
|
||||
returncode=returncode,
|
||||
stdout=stdout,
|
||||
stderr=stderr,
|
||||
output_json=output_json,
|
||||
output_json=None, # Legacy field, we now use records for JSONL
|
||||
output_files=new_files,
|
||||
duration_ms=duration_ms,
|
||||
hook=str(script),
|
||||
@@ -922,10 +922,14 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
|
||||
if plugins_whitelist:
|
||||
# PLUGINS whitelist is specified - only enable plugins in the list
|
||||
plugin_names = [p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()]
|
||||
import sys
|
||||
print(f"DEBUG: PLUGINS whitelist='{plugins_whitelist}', checking plugin '{plugin_name}', plugin_names={plugin_names}", file=sys.stderr)
|
||||
if plugin_name.lower() not in plugin_names:
|
||||
# Plugin not in whitelist - explicitly disabled
|
||||
print(f"DEBUG: Plugin '{plugin_name}' NOT in whitelist, disabling", file=sys.stderr)
|
||||
enabled = False
|
||||
else:
|
||||
print(f"DEBUG: Plugin '{plugin_name}' IS in whitelist, enabling", file=sys.stderr)
|
||||
# Plugin is in whitelist - check if explicitly disabled by PLUGINNAME_ENABLED
|
||||
enabled_key = f'{plugin_upper}_ENABLED'
|
||||
enabled = config.get(enabled_key)
|
||||
@@ -935,6 +939,8 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
|
||||
enabled = enabled.lower() not in ('false', '0', 'no', '')
|
||||
else:
|
||||
# No PLUGINS whitelist - use PLUGINNAME_ENABLED (default True)
|
||||
import sys
|
||||
print(f"DEBUG: NO PLUGINS whitelist in config, checking {plugin_name}_ENABLED", file=sys.stderr)
|
||||
enabled_key = f'{plugin_upper}_ENABLED'
|
||||
enabled = config.get(enabled_key)
|
||||
if enabled is None:
|
||||
|
||||
72
archivebox/machine/migrations/0005_converge_binary_model.py
Normal file
72
archivebox/machine/migrations/0005_converge_binary_model.py
Normal file
@@ -0,0 +1,72 @@
|
||||
# Generated by hand on 2026-01-01
|
||||
# Converges machine app for 0.8.6rc0 → 0.9.x migration path
|
||||
# Drops old InstalledBinary table and ensures Binary table exists
|
||||
|
||||
from django.db import migrations, connection
|
||||
|
||||
|
||||
def converge_binary_table(apps, schema_editor):
|
||||
"""
|
||||
Drop machine_installedbinary if it exists (0.8.6rc0 path).
|
||||
Create machine_binary if it doesn't exist (needed by Process model).
|
||||
"""
|
||||
cursor = connection.cursor()
|
||||
|
||||
# Check what tables exist
|
||||
cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name IN ('machine_installedbinary', 'machine_binary')")
|
||||
existing_tables = {row[0] for row in cursor.fetchall()}
|
||||
|
||||
print(f'DEBUG 0005: Existing tables: {existing_tables}')
|
||||
|
||||
# Drop old InstalledBinary table if it exists (0.8.6rc0 path)
|
||||
if 'machine_installedbinary' in existing_tables:
|
||||
print('✓ Dropping machine_installedbinary table (0.8.6rc0 divergence)')
|
||||
cursor.execute("DROP TABLE IF EXISTS machine_installedbinary")
|
||||
|
||||
# Create Binary table if it doesn't exist
|
||||
# This handles the case where 0.8.6rc0's 0001_initial didn't create it
|
||||
if 'machine_binary' not in existing_tables:
|
||||
print('✓ Creating machine_binary table with correct schema')
|
||||
cursor.execute("""
|
||||
CREATE TABLE machine_binary (
|
||||
id TEXT PRIMARY KEY NOT NULL,
|
||||
created_at DATETIME NOT NULL,
|
||||
modified_at DATETIME NOT NULL,
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
num_uses_failed INTEGER NOT NULL DEFAULT 0,
|
||||
machine_id TEXT NOT NULL REFERENCES machine_machine(id) ON DELETE CASCADE,
|
||||
name VARCHAR(63) NOT NULL,
|
||||
binproviders VARCHAR(255) NOT NULL DEFAULT 'env',
|
||||
overrides TEXT NOT NULL DEFAULT '{}',
|
||||
binprovider VARCHAR(63) NOT NULL DEFAULT 'env',
|
||||
abspath VARCHAR(255) NOT NULL,
|
||||
version VARCHAR(128) NOT NULL,
|
||||
sha256 VARCHAR(64) NOT NULL DEFAULT '',
|
||||
status VARCHAR(16) NOT NULL DEFAULT 'succeeded',
|
||||
retry_at DATETIME NULL,
|
||||
output_dir VARCHAR(255) NOT NULL DEFAULT ''
|
||||
)
|
||||
""")
|
||||
|
||||
# Create indexes
|
||||
cursor.execute("CREATE INDEX machine_binary_machine_id_idx ON machine_binary(machine_id)")
|
||||
cursor.execute("CREATE INDEX machine_binary_name_idx ON machine_binary(name)")
|
||||
cursor.execute("CREATE INDEX machine_binary_abspath_idx ON machine_binary(abspath)")
|
||||
|
||||
print('✓ machine_binary table created')
|
||||
else:
|
||||
print('✓ machine_binary table already exists')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('machine', '0001_initial'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.RunPython(
|
||||
converge_binary_table,
|
||||
reverse_code=migrations.RunPython.noop,
|
||||
),
|
||||
]
|
||||
@@ -9,7 +9,7 @@ from django.db import migrations, models
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('machine', '0001_initial'),
|
||||
('machine', '0005_converge_binary_model'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
@@ -7,7 +7,7 @@ from django.db import migrations, models
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('machine', '0002_process'),
|
||||
('machine', '0006_process'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
18
archivebox/machine/migrations/0008_add_worker_type_field.py
Normal file
18
archivebox/machine/migrations/0008_add_worker_type_field.py
Normal file
@@ -0,0 +1,18 @@
|
||||
# Generated by Django 6.0 on 2026-01-02 03:36
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('machine', '0007_add_process_type_and_parent'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
migrations.AddField(
|
||||
model_name='process',
|
||||
name='worker_type',
|
||||
field=models.CharField(blank=True, db_index=True, default='', help_text='Worker type name for WORKER processes (crawl, snapshot, archiveresult)', max_length=32),
|
||||
),
|
||||
]
|
||||
@@ -203,13 +203,14 @@ class BinaryManager(models.Manager):
|
||||
|
||||
class Binary(ModelWithHealthStats):
|
||||
"""
|
||||
Tracks an binary on a specific machine.
|
||||
Tracks a binary on a specific machine.
|
||||
|
||||
Follows the unified state machine pattern:
|
||||
Simple state machine with 2 states:
|
||||
- queued: Binary needs to be installed
|
||||
- started: Installation in progress
|
||||
- succeeded: Binary installed successfully (abspath, version, sha256 populated)
|
||||
- failed: Installation failed
|
||||
- installed: Binary installed successfully (abspath, version, sha256 populated)
|
||||
|
||||
Installation is synchronous during queued→installed transition.
|
||||
If installation fails, Binary stays in queued with retry_at set for later retry.
|
||||
|
||||
State machine calls run() which executes on_Binary__install_* hooks
|
||||
to install the binary using the specified providers.
|
||||
@@ -217,9 +218,7 @@ class Binary(ModelWithHealthStats):
|
||||
|
||||
class StatusChoices(models.TextChoices):
|
||||
QUEUED = 'queued', 'Queued'
|
||||
STARTED = 'started', 'Started'
|
||||
SUCCEEDED = 'succeeded', 'Succeeded'
|
||||
FAILED = 'failed', 'Failed'
|
||||
INSTALLED = 'installed', 'Installed'
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
@@ -323,8 +322,31 @@ class Binary(ModelWithHealthStats):
|
||||
machine = Machine.current()
|
||||
overrides = overrides or {}
|
||||
|
||||
# Case 1: From binaries.jsonl - create queued binary
|
||||
if 'binproviders' in record or ('overrides' in record and not record.get('abspath')):
|
||||
# Case 1: Already installed (from on_Crawl hooks) - has abspath AND binproviders
|
||||
# This happens when on_Crawl hooks detect already-installed binaries
|
||||
abspath = record.get('abspath')
|
||||
version = record.get('version')
|
||||
binproviders = record.get('binproviders')
|
||||
|
||||
if abspath and version and binproviders:
|
||||
# Binary is already installed, create INSTALLED record with binproviders filter
|
||||
binary, _ = Binary.objects.update_or_create(
|
||||
machine=machine,
|
||||
name=name,
|
||||
defaults={
|
||||
'abspath': abspath,
|
||||
'version': version,
|
||||
'sha256': record.get('sha256', ''),
|
||||
'binprovider': record.get('binprovider', 'env'),
|
||||
'binproviders': binproviders, # Preserve the filter
|
||||
'status': Binary.StatusChoices.INSTALLED,
|
||||
'retry_at': None,
|
||||
}
|
||||
)
|
||||
return binary
|
||||
|
||||
# Case 2: From binaries.json - create queued binary (needs installation)
|
||||
if 'binproviders' in record or ('overrides' in record and not abspath):
|
||||
binary, created = Binary.objects.get_or_create(
|
||||
machine=machine,
|
||||
name=name,
|
||||
@@ -337,25 +359,23 @@ class Binary(ModelWithHealthStats):
|
||||
)
|
||||
return binary
|
||||
|
||||
# Case 2: From hook output - update with installation results
|
||||
abspath = record.get('abspath')
|
||||
version = record.get('version')
|
||||
if not abspath or not version:
|
||||
return None
|
||||
# Case 3: From on_Binary__install hook output - update with installation results
|
||||
if abspath and version:
|
||||
binary, _ = Binary.objects.update_or_create(
|
||||
machine=machine,
|
||||
name=name,
|
||||
defaults={
|
||||
'abspath': abspath,
|
||||
'version': version,
|
||||
'sha256': record.get('sha256', ''),
|
||||
'binprovider': record.get('binprovider', 'env'),
|
||||
'status': Binary.StatusChoices.INSTALLED,
|
||||
'retry_at': None,
|
||||
}
|
||||
)
|
||||
return binary
|
||||
|
||||
binary, _ = Binary.objects.update_or_create(
|
||||
machine=machine,
|
||||
name=name,
|
||||
defaults={
|
||||
'abspath': abspath,
|
||||
'version': version,
|
||||
'sha256': record.get('sha256', ''),
|
||||
'binprovider': record.get('binprovider', 'env'),
|
||||
'status': Binary.StatusChoices.SUCCEEDED,
|
||||
'retry_at': None,
|
||||
}
|
||||
)
|
||||
return binary
|
||||
return None
|
||||
|
||||
@property
|
||||
def OUTPUT_DIR(self):
|
||||
@@ -403,8 +423,7 @@ class Binary(ModelWithHealthStats):
|
||||
# Discover ALL on_Binary__install_* hooks
|
||||
hooks = discover_hooks('Binary', config=config)
|
||||
if not hooks:
|
||||
self.status = self.StatusChoices.FAILED
|
||||
self.save()
|
||||
# No hooks available - stay queued, will retry later
|
||||
return
|
||||
|
||||
# Run each hook - they decide if they can handle this binary
|
||||
@@ -456,15 +475,21 @@ class Binary(ModelWithHealthStats):
|
||||
self.version = record.get('version', '')
|
||||
self.sha256 = record.get('sha256', '')
|
||||
self.binprovider = record.get('binprovider', 'env')
|
||||
self.status = self.StatusChoices.SUCCEEDED
|
||||
self.status = self.StatusChoices.INSTALLED
|
||||
self.save()
|
||||
|
||||
# Symlink binary into LIB_BIN_DIR if configured
|
||||
from django.conf import settings
|
||||
lib_bin_dir = getattr(settings, 'LIB_BIN_DIR', None)
|
||||
if lib_bin_dir:
|
||||
self.symlink_to_lib_bin(lib_bin_dir)
|
||||
|
||||
return
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
# No hook succeeded
|
||||
self.status = self.StatusChoices.FAILED
|
||||
self.save()
|
||||
# No hook succeeded - leave status as QUEUED (will retry later)
|
||||
# Don't set to FAILED since we don't have that status anymore
|
||||
|
||||
def cleanup(self):
|
||||
"""
|
||||
@@ -484,10 +509,75 @@ class Binary(ModelWithHealthStats):
|
||||
for plugin_dir in output_dir.iterdir():
|
||||
if not plugin_dir.is_dir():
|
||||
continue
|
||||
|
||||
pid_file = plugin_dir / 'hook.pid'
|
||||
cmd_file = plugin_dir / 'cmd.sh'
|
||||
safe_kill_process(pid_file, cmd_file)
|
||||
|
||||
def symlink_to_lib_bin(self, lib_bin_dir: str | Path) -> Path | None:
|
||||
"""
|
||||
Symlink this binary into LIB_BIN_DIR for unified PATH management.
|
||||
|
||||
After a binary is installed by any binprovider (pip, npm, brew, apt, etc),
|
||||
we symlink it into LIB_BIN_DIR so that:
|
||||
1. All binaries can be found in a single directory
|
||||
2. PATH only needs LIB_BIN_DIR prepended (not multiple provider-specific paths)
|
||||
3. Binary priorities are clear (symlink points to the canonical install location)
|
||||
|
||||
Args:
|
||||
lib_bin_dir: Path to LIB_BIN_DIR (e.g., /data/lib/arm64-darwin/bin)
|
||||
|
||||
Returns:
|
||||
Path to the created symlink, or None if symlinking failed
|
||||
|
||||
Example:
|
||||
>>> binary = Binary.objects.get(name='yt-dlp')
|
||||
>>> binary.symlink_to_lib_bin('/data/lib/arm64-darwin/bin')
|
||||
Path('/data/lib/arm64-darwin/bin/yt-dlp')
|
||||
"""
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
if not self.abspath:
|
||||
return None
|
||||
|
||||
binary_abspath = Path(self.abspath).resolve()
|
||||
lib_bin_dir = Path(lib_bin_dir).resolve()
|
||||
|
||||
# Create LIB_BIN_DIR if it doesn't exist
|
||||
try:
|
||||
lib_bin_dir.mkdir(parents=True, exist_ok=True)
|
||||
except (OSError, PermissionError) as e:
|
||||
print(f"Failed to create LIB_BIN_DIR {lib_bin_dir}: {e}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
# Get binary name (last component of path)
|
||||
binary_name = binary_abspath.name
|
||||
symlink_path = lib_bin_dir / binary_name
|
||||
|
||||
# Remove existing symlink/file if it exists
|
||||
if symlink_path.exists() or symlink_path.is_symlink():
|
||||
try:
|
||||
# Check if it's already pointing to the right place
|
||||
if symlink_path.is_symlink() and symlink_path.resolve() == binary_abspath:
|
||||
# Already correctly symlinked, nothing to do
|
||||
return symlink_path
|
||||
|
||||
# Remove old symlink/file
|
||||
symlink_path.unlink()
|
||||
except (OSError, PermissionError) as e:
|
||||
print(f"Failed to remove existing file at {symlink_path}: {e}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
# Create new symlink
|
||||
try:
|
||||
symlink_path.symlink_to(binary_abspath)
|
||||
print(f"Symlinked {binary_name} -> {symlink_path}", file=sys.stderr)
|
||||
return symlink_path
|
||||
except (OSError, PermissionError) as e:
|
||||
print(f"Failed to create symlink {symlink_path} -> {binary_abspath}: {e}", file=sys.stderr)
|
||||
return None
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Process Model
|
||||
@@ -627,6 +717,16 @@ class Process(models.Model):
|
||||
help_text='Type of process (cli, worker, orchestrator, binary, supervisord)'
|
||||
)
|
||||
|
||||
# Worker type (only for WORKER processes: crawl, snapshot, archiveresult)
|
||||
worker_type = models.CharField(
|
||||
max_length=32,
|
||||
default='',
|
||||
null=False,
|
||||
blank=True,
|
||||
db_index=True,
|
||||
help_text='Worker type name for WORKER processes (crawl, snapshot, archiveresult)'
|
||||
)
|
||||
|
||||
# Execution metadata
|
||||
pwd = models.CharField(max_length=512, default='', null=False, blank=True,
|
||||
help_text='Working directory for process execution')
|
||||
@@ -895,11 +995,16 @@ class Process(models.Model):
|
||||
ppid = os.getppid()
|
||||
machine = machine or Machine.current()
|
||||
|
||||
# Debug logging
|
||||
import sys
|
||||
print(f"DEBUG _find_parent_process: my_pid={os.getpid()}, ppid={ppid}", file=sys.stderr)
|
||||
|
||||
# Get parent process start time from OS
|
||||
try:
|
||||
os_parent = psutil.Process(ppid)
|
||||
os_parent_start = os_parent.create_time()
|
||||
except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
|
||||
print(f"DEBUG _find_parent_process: Parent process {ppid} not accessible", file=sys.stderr)
|
||||
return None # Parent process doesn't exist
|
||||
|
||||
# Find matching Process record
|
||||
@@ -910,12 +1015,18 @@ class Process(models.Model):
|
||||
started_at__gte=timezone.now() - PID_REUSE_WINDOW,
|
||||
).order_by('-started_at')
|
||||
|
||||
print(f"DEBUG _find_parent_process: Found {candidates.count()} candidates for ppid={ppid}", file=sys.stderr)
|
||||
|
||||
for candidate in candidates:
|
||||
if candidate.started_at:
|
||||
db_start_time = candidate.started_at.timestamp()
|
||||
if abs(db_start_time - os_parent_start) < START_TIME_TOLERANCE:
|
||||
time_diff = abs(db_start_time - os_parent_start)
|
||||
print(f"DEBUG _find_parent_process: Checking candidate id={candidate.id} time_diff={time_diff:.2f}s tolerance={START_TIME_TOLERANCE}s", file=sys.stderr)
|
||||
if time_diff < START_TIME_TOLERANCE:
|
||||
print(f"DEBUG _find_parent_process: MATCH! Returning parent id={candidate.id} pid={candidate.pid}", file=sys.stderr)
|
||||
return candidate
|
||||
|
||||
print(f"DEBUG _find_parent_process: No matching parent found for ppid={ppid}", file=sys.stderr)
|
||||
return None # No matching ArchiveBox parent process
|
||||
|
||||
@classmethod
|
||||
@@ -1584,69 +1695,38 @@ class BinaryMachine(BaseStateMachine, strict_states=True):
|
||||
"""
|
||||
State machine for managing Binary installation lifecycle.
|
||||
|
||||
Hook Lifecycle:
|
||||
Simple 2-state machine:
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ QUEUED State │
|
||||
│ • Binary needs to be installed │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() when can_start()
|
||||
↓ tick() when can_install()
|
||||
↓ Synchronous installation during transition
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ STARTED State → enter_started() │
|
||||
│ 1. binary.run() │
|
||||
│ • discover_hooks('Binary') → all on_Binary__install_* │
|
||||
│ • Try each provider hook in sequence: │
|
||||
│ - run_hook(script, output_dir, ...) │
|
||||
│ - If returncode == 0: │
|
||||
│ * Read stdout.log │
|
||||
│ * Parse JSONL for 'Binary' record with abspath │
|
||||
│ * Update self: abspath, version, sha256, provider │
|
||||
│ * Set status=SUCCEEDED, RETURN │
|
||||
│ • If no hook succeeds: set status=FAILED │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
↓ tick() checks status
|
||||
┌─────────────────────────────────────────────────────────────┐
|
||||
│ SUCCEEDED / FAILED │
|
||||
│ • Set by binary.run() based on hook results │
|
||||
│ • Health stats incremented (num_uses_succeeded/failed) │
|
||||
│ INSTALLED State │
|
||||
│ • Binary installed (abspath, version, sha256 set) │
|
||||
│ • Health stats incremented │
|
||||
└─────────────────────────────────────────────────────────────┘
|
||||
|
||||
If installation fails, Binary stays in QUEUED with retry_at bumped.
|
||||
"""
|
||||
|
||||
model_attr_name = 'binary'
|
||||
|
||||
# States
|
||||
queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
|
||||
started = State(value=Binary.StatusChoices.STARTED)
|
||||
succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
|
||||
failed = State(value=Binary.StatusChoices.FAILED, final=True)
|
||||
installed = State(value=Binary.StatusChoices.INSTALLED, final=True)
|
||||
|
||||
# Tick Event - transitions based on conditions
|
||||
# Tick Event - install happens during transition
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished') |
|
||||
started.to(succeeded, cond='is_succeeded') |
|
||||
started.to(failed, cond='is_failed')
|
||||
queued.to.itself(unless='can_install') |
|
||||
queued.to(installed, cond='can_install', on='on_install')
|
||||
)
|
||||
|
||||
def can_start(self) -> bool:
|
||||
def can_install(self) -> bool:
|
||||
"""Check if binary installation can start."""
|
||||
return bool(self.binary.name and self.binary.binproviders)
|
||||
|
||||
def is_succeeded(self) -> bool:
|
||||
"""Check if installation succeeded (status was set by run())."""
|
||||
return self.binary.status == Binary.StatusChoices.SUCCEEDED
|
||||
|
||||
def is_failed(self) -> bool:
|
||||
"""Check if installation failed (status was set by run())."""
|
||||
return self.binary.status == Binary.StatusChoices.FAILED
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if installation has completed (success or failure)."""
|
||||
return self.binary.status in (
|
||||
Binary.StatusChoices.SUCCEEDED,
|
||||
Binary.StatusChoices.FAILED,
|
||||
)
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
"""Binary is queued for installation."""
|
||||
@@ -1655,43 +1735,48 @@ class BinaryMachine(BaseStateMachine, strict_states=True):
|
||||
status=Binary.StatusChoices.QUEUED,
|
||||
)
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
"""Start binary installation."""
|
||||
# Lock the binary while installation runs
|
||||
self.binary.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=300), # 5 min timeout for installation
|
||||
status=Binary.StatusChoices.STARTED,
|
||||
)
|
||||
def on_install(self):
|
||||
"""Called during queued→installed transition. Runs installation synchronously."""
|
||||
import sys
|
||||
|
||||
# Run installation hooks
|
||||
print(f'[cyan] 🔄 BinaryMachine.on_install() - installing {self.binary.name}[/cyan]', file=sys.stderr)
|
||||
|
||||
# Run installation hooks (synchronous, updates abspath/version/sha256 and sets status)
|
||||
self.binary.run()
|
||||
|
||||
# Save updated status (run() updates status to succeeded/failed)
|
||||
self.binary.save()
|
||||
# Check if installation succeeded by looking at updated status
|
||||
# Note: Binary.run() updates self.binary.status internally but doesn't refresh our reference
|
||||
self.binary.refresh_from_db()
|
||||
|
||||
@succeeded.enter
|
||||
def enter_succeeded(self):
|
||||
if self.binary.status != Binary.StatusChoices.INSTALLED:
|
||||
# Installation failed - abort transition, stay in queued
|
||||
print(f'[red] ❌ BinaryMachine - {self.binary.name} installation failed, retrying later[/red]', file=sys.stderr)
|
||||
|
||||
# Bump retry_at to try again later
|
||||
self.binary.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=300), # Retry in 5 minutes
|
||||
status=Binary.StatusChoices.QUEUED, # Ensure we stay queued
|
||||
)
|
||||
|
||||
# Increment health stats for failure
|
||||
self.binary.increment_health_stats(success=False)
|
||||
|
||||
# Abort the transition - this will raise an exception and keep us in queued
|
||||
raise Exception(f'Binary {self.binary.name} installation failed')
|
||||
|
||||
print(f'[cyan] ✅ BinaryMachine - {self.binary.name} installed successfully[/cyan]', file=sys.stderr)
|
||||
|
||||
@installed.enter
|
||||
def enter_installed(self):
|
||||
"""Binary installed successfully."""
|
||||
self.binary.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=Binary.StatusChoices.SUCCEEDED,
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
)
|
||||
|
||||
# Increment health stats
|
||||
self.binary.increment_health_stats(success=True)
|
||||
|
||||
@failed.enter
|
||||
def enter_failed(self):
|
||||
"""Binary installation failed."""
|
||||
self.binary.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=Binary.StatusChoices.FAILED,
|
||||
)
|
||||
|
||||
# Increment health stats
|
||||
self.binary.increment_health_stats(success=False)
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Process State Machine
|
||||
|
||||
@@ -80,8 +80,7 @@ class TestAccessibilityWithChrome(TestCase):
|
||||
# Run accessibility hook with the active Chrome session
|
||||
result = subprocess.run(
|
||||
['node', str(ACCESSIBILITY_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(snapshot_chrome_dir,
|
||||
env=get_test_env()),
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
|
||||
0
archivebox/plugins/chrome/on_Crawl__01_chrome_install.py
Normal file → Executable file
0
archivebox/plugins/chrome/on_Crawl__01_chrome_install.py
Normal file → Executable file
@@ -39,30 +39,36 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
CHROME_NAVIGATE_HOOK,
|
||||
)
|
||||
|
||||
# Get LIB_DIR and NODE_MODULES_DIR from shared helpers
|
||||
LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = get_node_modules_dir()
|
||||
NPM_PREFIX = LIB_DIR / 'npm'
|
||||
|
||||
# Chromium install location (relative to DATA_DIR)
|
||||
CHROMIUM_INSTALL_DIR = Path(os.environ.get('DATA_DIR', '.')).resolve() / 'chromium'
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
|
||||
def ensure_chromium_and_puppeteer_installed():
|
||||
"""Ensure Chromium and puppeteer are installed before running tests."""
|
||||
def ensure_chromium_and_puppeteer_installed(tmp_path_factory):
|
||||
"""Ensure Chromium and puppeteer are installed before running tests.
|
||||
|
||||
Puppeteer handles Chromium installation automatically in its own cache.
|
||||
We only need to install puppeteer itself to LIB_DIR/npm.
|
||||
"""
|
||||
from abx_pkg import Binary, NpmProvider, BinProviderOverrides
|
||||
|
||||
# Set DATA_DIR if not already set (required by abx_pkg)
|
||||
if not os.environ.get('DATA_DIR'):
|
||||
# Use isolated temp dir for direct pytest runs
|
||||
test_data_dir = tmp_path_factory.mktemp('chrome_test_data')
|
||||
os.environ['DATA_DIR'] = str(test_data_dir)
|
||||
|
||||
# Compute paths AFTER setting DATA_DIR
|
||||
lib_dir = get_lib_dir()
|
||||
node_modules_dir = get_node_modules_dir()
|
||||
npm_prefix = lib_dir / 'npm'
|
||||
|
||||
# Rebuild pydantic models
|
||||
NpmProvider.model_rebuild()
|
||||
|
||||
# Install puppeteer-core if not available
|
||||
puppeteer_core_path = NODE_MODULES_DIR / 'puppeteer-core'
|
||||
# Install puppeteer if not available (it will handle Chromium in its own cache)
|
||||
puppeteer_core_path = node_modules_dir / 'puppeteer-core'
|
||||
if not puppeteer_core_path.exists():
|
||||
print(f"\n[*] Installing puppeteer to {NPM_PREFIX}...")
|
||||
NPM_PREFIX.mkdir(parents=True, exist_ok=True)
|
||||
print(f"\n[*] Installing puppeteer to {npm_prefix}...")
|
||||
npm_prefix.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
provider = NpmProvider(npm_prefix=NPM_PREFIX)
|
||||
provider = NpmProvider(npm_prefix=npm_prefix)
|
||||
try:
|
||||
binary = Binary(
|
||||
name='puppeteer',
|
||||
@@ -70,36 +76,25 @@ def ensure_chromium_and_puppeteer_installed():
|
||||
overrides={'npm': {'packages': ['puppeteer@^23.5.0']}}
|
||||
)
|
||||
binary.install()
|
||||
print(f"[*] Puppeteer installed successfully to {NPM_PREFIX}")
|
||||
print(f"[*] Puppeteer installed successfully to {npm_prefix}")
|
||||
except Exception as e:
|
||||
pytest.skip(f"Failed to install puppeteer: {e}")
|
||||
|
||||
# Install Chromium via @puppeteer/browsers if not available
|
||||
# Find Chromium binary (puppeteer installs it automatically in its cache)
|
||||
chromium_binary = find_chromium_binary()
|
||||
if not chromium_binary:
|
||||
print(f"\n[*] Installing Chromium to {CHROMIUM_INSTALL_DIR}...")
|
||||
CHROMIUM_INSTALL_DIR.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
result = subprocess.run(
|
||||
['npx', '@puppeteer/browsers', 'install', 'chromium@latest'],
|
||||
cwd=str(CHROMIUM_INSTALL_DIR.parent),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=300
|
||||
)
|
||||
if result.returncode != 0:
|
||||
pytest.skip(f"Failed to install Chromium: {result.stderr}")
|
||||
|
||||
chromium_binary = find_chromium_binary()
|
||||
if not chromium_binary:
|
||||
pytest.skip("Chromium installed but binary not found")
|
||||
|
||||
print(f"[*] Chromium installed: {chromium_binary}")
|
||||
pytest.skip("Chromium not found - puppeteer should install it automatically")
|
||||
|
||||
# Set CHROME_BINARY env var for tests
|
||||
os.environ['CHROME_BINARY'] = chromium_binary
|
||||
|
||||
|
||||
# Get paths from helpers (will use DATA_DIR if set, or compute based on __file__)
|
||||
LIB_DIR = get_lib_dir()
|
||||
NODE_MODULES_DIR = get_node_modules_dir()
|
||||
NPM_PREFIX = LIB_DIR / 'npm'
|
||||
|
||||
|
||||
def test_hook_scripts_exist():
|
||||
"""Verify chrome hooks exist."""
|
||||
assert CHROME_LAUNCH_HOOK.exists(), f"Hook not found: {CHROME_LAUNCH_HOOK}"
|
||||
@@ -208,8 +203,7 @@ def test_chrome_launch_and_tab_creation():
|
||||
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-123', '--crawl-id=test-crawl-123'],
|
||||
cwd=str(snapshot_chrome_dir,
|
||||
env=get_test_env()),
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
@@ -269,8 +263,7 @@ def test_chrome_navigation():
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_TAB_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123', '--crawl-id=test-crawl-nav'],
|
||||
cwd=str(snapshot_chrome_dir,
|
||||
env=get_test_env()),
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
@@ -281,8 +274,7 @@ def test_chrome_navigation():
|
||||
# Navigate to URL
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_NAVIGATE_HOOK), '--url=https://example.com', '--snapshot-id=snap-nav-123'],
|
||||
cwd=str(snapshot_chrome_dir,
|
||||
env=get_test_env()),
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
@@ -417,8 +409,7 @@ def test_multiple_snapshots_share_chrome():
|
||||
# Create tab for this snapshot
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_TAB_HOOK), f'--url=https://example.com/{snap_num}', f'--snapshot-id=snap-{snap_num}', '--crawl-id=test-multi-crawl'],
|
||||
cwd=str(snapshot_chrome_dir,
|
||||
env=get_test_env()),
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
|
||||
@@ -80,8 +80,7 @@ class TestConsolelogWithChrome(TestCase):
|
||||
# Run consolelog hook with the active Chrome session
|
||||
result = subprocess.run(
|
||||
['node', str(CONSOLELOG_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(snapshot_chrome_dir,
|
||||
env=get_test_env()),
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120, # Longer timeout as it waits for navigation
|
||||
|
||||
80
archivebox/plugins/gallerydl/on_Crawl__10_gallerydl_install.py
Executable file
80
archivebox/plugins/gallerydl/on_Crawl__10_gallerydl_install.py
Executable file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Detect gallery-dl binary and emit Binary JSONL record.
|
||||
|
||||
Output: Binary JSONL record to stdout if gallery-dl is found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||
val = get_env(name, '').lower()
|
||||
if val in ('true', '1', 'yes', 'on'):
|
||||
return True
|
||||
if val in ('false', '0', 'no', 'off'):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def output_binary_found(binary: Binary, name: str):
|
||||
"""Output Binary JSONL record for an installed binary."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'abspath': str(binary.abspath),
|
||||
'version': str(binary.version) if binary.version else '',
|
||||
'sha256': binary.sha256 or '',
|
||||
'binprovider': 'env', # Already installed
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def output_binary_missing(name: str, binproviders: str):
|
||||
"""Output Binary JSONL record for a missing binary that needs installation."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'binproviders': binproviders, # Providers that can install it
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
|
||||
gallerydl_enabled = get_env_bool('GALLERYDL_ENABLED', True)
|
||||
gallerydl_binary = get_env('GALLERYDL_BINARY', 'gallery-dl')
|
||||
|
||||
if not gallerydl_enabled:
|
||||
sys.exit(0)
|
||||
|
||||
provider = EnvProvider()
|
||||
try:
|
||||
binary = Binary(name=gallerydl_binary, binproviders=[provider]).load()
|
||||
if binary.abspath:
|
||||
# Binary found
|
||||
output_binary_found(binary, name='gallery-dl')
|
||||
else:
|
||||
# Binary not found
|
||||
output_binary_missing(name='gallery-dl', binproviders='pip')
|
||||
except Exception:
|
||||
# Binary not found
|
||||
output_binary_missing(name='gallery-dl', binproviders='pip')
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
80
archivebox/plugins/git/on_Crawl__09_git_install.py
Executable file
80
archivebox/plugins/git/on_Crawl__09_git_install.py
Executable file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Detect git binary and emit Binary JSONL record.
|
||||
|
||||
Output: Binary JSONL record to stdout if git is found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||
val = get_env(name, '').lower()
|
||||
if val in ('true', '1', 'yes', 'on'):
|
||||
return True
|
||||
if val in ('false', '0', 'no', 'off'):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def output_binary_found(binary: Binary, name: str):
|
||||
"""Output Binary JSONL record for an installed binary."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'abspath': str(binary.abspath),
|
||||
'version': str(binary.version) if binary.version else '',
|
||||
'sha256': binary.sha256 or '',
|
||||
'binprovider': 'env', # Already installed
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def output_binary_missing(name: str, binproviders: str):
|
||||
"""Output Binary JSONL record for a missing binary that needs installation."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'binproviders': binproviders, # Providers that can install it
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
|
||||
git_enabled = get_env_bool('GIT_ENABLED', True)
|
||||
git_binary = get_env('GIT_BINARY', 'git')
|
||||
|
||||
if not git_enabled:
|
||||
sys.exit(0)
|
||||
|
||||
provider = EnvProvider()
|
||||
try:
|
||||
binary = Binary(name=git_binary, binproviders=[provider]).load()
|
||||
if binary.abspath:
|
||||
# Binary found
|
||||
output_binary_found(binary, name='git')
|
||||
else:
|
||||
# Binary not found
|
||||
output_binary_missing(name='git', binproviders='apt,brew')
|
||||
except Exception:
|
||||
# Binary not found
|
||||
output_binary_missing(name='git', binproviders='apt,brew')
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -123,8 +123,7 @@ def test_scrolls_page_and_outputs_stats():
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-infiniscroll'],
|
||||
cwd=str(infiniscroll_dir,
|
||||
env=get_test_env()),
|
||||
cwd=str(infiniscroll_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
@@ -188,8 +187,7 @@ def test_config_scroll_limit_honored():
|
||||
|
||||
result = subprocess.run(
|
||||
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-limit'],
|
||||
cwd=str(infiniscroll_dir,
|
||||
env=get_test_env()),
|
||||
cwd=str(infiniscroll_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
@@ -248,8 +246,7 @@ def test_config_timeout_honored():
|
||||
start_time = time.time()
|
||||
result = subprocess.run(
|
||||
['node', str(INFINISCROLL_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-timeout'],
|
||||
cwd=str(infiniscroll_dir,
|
||||
env=get_test_env()),
|
||||
cwd=str(infiniscroll_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=30,
|
||||
|
||||
80
archivebox/plugins/mercury/on_Crawl__12_mercury_install.py
Executable file
80
archivebox/plugins/mercury/on_Crawl__12_mercury_install.py
Executable file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Detect mercury-parser binary and emit Binary JSONL record.
|
||||
|
||||
Output: Binary JSONL record to stdout if mercury-parser is found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||
val = get_env(name, '').lower()
|
||||
if val in ('true', '1', 'yes', 'on'):
|
||||
return True
|
||||
if val in ('false', '0', 'no', 'off'):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def output_binary_found(binary: Binary, name: str):
|
||||
"""Output Binary JSONL record for an installed binary."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'abspath': str(binary.abspath),
|
||||
'version': str(binary.version) if binary.version else '',
|
||||
'sha256': binary.sha256 or '',
|
||||
'binprovider': 'env', # Already installed
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def output_binary_missing(name: str, binproviders: str):
|
||||
"""Output Binary JSONL record for a missing binary that needs installation."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'binproviders': binproviders, # Providers that can install it
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
|
||||
mercury_enabled = get_env_bool('MERCURY_ENABLED', True)
|
||||
mercury_binary = get_env('MERCURY_BINARY', 'mercury-parser')
|
||||
|
||||
if not mercury_enabled:
|
||||
sys.exit(0)
|
||||
|
||||
provider = EnvProvider()
|
||||
try:
|
||||
binary = Binary(name=mercury_binary, binproviders=[provider]).load()
|
||||
if binary.abspath:
|
||||
# Binary found
|
||||
output_binary_found(binary, name='mercury-parser')
|
||||
else:
|
||||
# Binary not found
|
||||
output_binary_missing(name='mercury-parser', binproviders='npm')
|
||||
except Exception:
|
||||
# Binary not found
|
||||
output_binary_missing(name='mercury-parser', binproviders='npm')
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
83
archivebox/plugins/readability/on_Crawl__11_readability_install.py
Executable file
83
archivebox/plugins/readability/on_Crawl__11_readability_install.py
Executable file
@@ -0,0 +1,83 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Detect readability-extractor binary and emit Binary JSONL record.
|
||||
|
||||
Output: Binary JSONL record to stdout if readability is found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
|
||||
return os.environ.get(name, default).strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
|
||||
val = get_env(name, '').lower()
|
||||
if val in ('true', '1', 'yes', 'on'):
|
||||
return True
|
||||
if val in ('false', '0', 'no', 'off'):
|
||||
return False
|
||||
return default
|
||||
|
||||
|
||||
def output_binary_found(binary: Binary, name: str):
|
||||
"""Output Binary JSONL record for an installed binary."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'abspath': str(binary.abspath),
|
||||
'version': str(binary.version) if binary.version else '',
|
||||
'sha256': binary.sha256 or '',
|
||||
'binprovider': 'env', # Already installed
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def output_binary_missing(name: str, binproviders: str):
|
||||
"""Output Binary JSONL record for a missing binary that needs installation."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'binproviders': binproviders, # Providers that can install it
|
||||
'overrides': {
|
||||
'packages': ['git+https://github.com/ArchiveBox/readability-extractor.git'],
|
||||
},
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
|
||||
readability_enabled = get_env_bool('READABILITY_ENABLED', True)
|
||||
readability_binary = get_env('READABILITY_BINARY', 'readability-extractor')
|
||||
|
||||
if not readability_enabled:
|
||||
sys.exit(0)
|
||||
|
||||
provider = EnvProvider()
|
||||
try:
|
||||
binary = Binary(name=readability_binary, binproviders=[provider]).load()
|
||||
if binary.abspath:
|
||||
# Binary found
|
||||
output_binary_found(binary, name='readability-extractor')
|
||||
else:
|
||||
# Binary not found
|
||||
output_binary_missing(name='readability-extractor', binproviders='npm')
|
||||
except Exception:
|
||||
# Binary not found
|
||||
output_binary_missing(name='readability-extractor', binproviders='npm')
|
||||
|
||||
sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
@@ -27,11 +27,21 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
|
||||
run_hook_and_parse,
|
||||
LIB_DIR,
|
||||
NODE_MODULES_DIR,
|
||||
CHROME_PLUGIN_DIR,
|
||||
)
|
||||
|
||||
# Import chrome test fixture to ensure puppeteer is installed
|
||||
from archivebox.plugins.chrome.tests.test_chrome import ensure_chromium_and_puppeteer_installed
|
||||
|
||||
|
||||
PLUGIN_DIR = get_plugin_dir(__file__)
|
||||
SCREENSHOT_HOOK = get_hook_script(PLUGIN_DIR, 'on_Snapshot__*_screenshot.*')
|
||||
|
||||
# Get Chrome hooks for setting up sessions
|
||||
CHROME_LAUNCH_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Crawl__*_chrome_launch.*')
|
||||
CHROME_TAB_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Snapshot__*_chrome_tab.*')
|
||||
CHROME_NAVIGATE_HOOK = get_hook_script(CHROME_PLUGIN_DIR, 'on_Snapshot__*_chrome_navigate.*')
|
||||
|
||||
TEST_URL = 'https://example.com'
|
||||
|
||||
|
||||
@@ -53,18 +63,162 @@ def test_verify_deps_with_abx_pkg():
|
||||
|
||||
|
||||
def test_extracts_screenshot_from_example_com():
|
||||
"""Test full workflow: extract screenshot from real example.com via hook."""
|
||||
# Prerequisites checked by earlier test
|
||||
"""Test full workflow: extract screenshot from real example.com via hook.
|
||||
|
||||
Replicates production directory structure:
|
||||
DATA_DIR/users/testuser/crawls/{crawl-id}/chrome/
|
||||
DATA_DIR/users/testuser/crawls/{crawl-id}/snapshots/{snap-id}/chrome/
|
||||
DATA_DIR/users/testuser/crawls/{crawl-id}/snapshots/{snap-id}/screenshot/
|
||||
|
||||
This exercises the "connect to existing session" code path which is the primary
|
||||
path in production and accounts for ~50% of the code.
|
||||
"""
|
||||
import signal
|
||||
import time
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
# Replicate exact production directory structure
|
||||
data_dir = Path(tmpdir)
|
||||
crawl_id = 'test-screenshot-crawl'
|
||||
snapshot_id = 'test-screenshot-snap'
|
||||
|
||||
# Crawl: DATA_DIR/users/{username}/crawls/YYYYMMDD/example.com/{crawl-id}/{plugin}/
|
||||
crawl_dir = data_dir / 'users' / 'testuser' / 'crawls' / '20240101' / 'example.com' / crawl_id
|
||||
chrome_dir = crawl_dir / 'chrome'
|
||||
chrome_dir.mkdir(parents=True)
|
||||
|
||||
# Snapshot: DATA_DIR/users/{username}/snapshots/YYYYMMDD/example.com/{snapshot-uuid}/{plugin}/
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / snapshot_id
|
||||
snapshot_chrome_dir = snapshot_dir / 'chrome'
|
||||
snapshot_chrome_dir.mkdir(parents=True)
|
||||
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir()
|
||||
|
||||
# Run screenshot extraction hook
|
||||
env = get_test_env()
|
||||
print(f"\n[DEBUG] NODE_V8_COVERAGE={env.get('NODE_V8_COVERAGE', 'NOT SET')}", file=sys.stderr)
|
||||
env['CHROME_HEADLESS'] = 'true'
|
||||
|
||||
# Step 1: Launch Chrome session at crawl level (background process)
|
||||
chrome_launch_process = subprocess.Popen(
|
||||
['node', str(CHROME_LAUNCH_HOOK), f'--crawl-id={crawl_id}'],
|
||||
cwd=str(chrome_dir),
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
env=env
|
||||
)
|
||||
|
||||
# Wait for Chrome to launch
|
||||
for i in range(15):
|
||||
if chrome_launch_process.poll() is not None:
|
||||
stdout, stderr = chrome_launch_process.communicate()
|
||||
pytest.fail(f"Chrome launch failed:\nStdout: {stdout}\nStderr: {stderr}")
|
||||
if (chrome_dir / 'cdp_url.txt').exists():
|
||||
break
|
||||
time.sleep(1)
|
||||
|
||||
assert (chrome_dir / 'cdp_url.txt').exists(), "Chrome CDP URL file should exist"
|
||||
assert (chrome_dir / 'chrome.pid').exists(), "Chrome PID file should exist"
|
||||
|
||||
chrome_pid = int((chrome_dir / 'chrome.pid').read_text().strip())
|
||||
|
||||
try:
|
||||
# Step 2: Create tab at snapshot level
|
||||
env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_TAB_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=60,
|
||||
env=env
|
||||
)
|
||||
assert result.returncode == 0, f"Tab creation failed: {result.stderr}"
|
||||
assert (snapshot_chrome_dir / 'cdp_url.txt').exists(), "Snapshot CDP URL should exist"
|
||||
|
||||
# Step 3: Navigate to URL
|
||||
result = subprocess.run(
|
||||
['node', str(CHROME_NAVIGATE_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(snapshot_chrome_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
assert result.returncode == 0, f"Navigation failed: {result.stderr}"
|
||||
assert (snapshot_chrome_dir / 'navigation.json').exists(), "Navigation JSON should exist"
|
||||
|
||||
# Step 4: Take screenshot (should connect to existing session)
|
||||
# Screenshot hook runs in screenshot/ dir and looks for ../chrome/cdp_url.txt
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', f'--snapshot-id={snapshot_id}'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
env=env
|
||||
)
|
||||
|
||||
assert result.returncode == 0, f"Screenshot extraction failed:\nStderr: {result.stderr}\nStdout: {result.stdout}"
|
||||
|
||||
# Parse JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'ArchiveResult':
|
||||
result_json = record
|
||||
break
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
assert result_json, "Should have ArchiveResult JSONL output"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
assert 'screenshot.png' in result_json['output_str'], f"Output should be screenshot.png: {result_json}"
|
||||
|
||||
# Verify filesystem output
|
||||
screenshot_file = screenshot_dir / 'screenshot.png'
|
||||
assert screenshot_file.exists(), f"screenshot.png not created at {screenshot_file}"
|
||||
|
||||
# Verify file is valid PNG
|
||||
file_size = screenshot_file.stat().st_size
|
||||
assert file_size > 1000, f"Screenshot too small: {file_size} bytes"
|
||||
assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes"
|
||||
|
||||
# Check PNG magic bytes
|
||||
screenshot_data = screenshot_file.read_bytes()
|
||||
assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file"
|
||||
|
||||
finally:
|
||||
# Cleanup: Kill Chrome
|
||||
try:
|
||||
chrome_launch_process.send_signal(signal.SIGTERM)
|
||||
chrome_launch_process.wait(timeout=5)
|
||||
except:
|
||||
pass
|
||||
try:
|
||||
os.kill(chrome_pid, signal.SIGKILL)
|
||||
except OSError:
|
||||
pass
|
||||
|
||||
|
||||
def test_extracts_screenshot_without_session():
|
||||
"""Test screenshot extraction without existing Chrome session (fallback to own browser)."""
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
# Create proper snapshot directory structure
|
||||
data_dir = Path(tmpdir)
|
||||
snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-fallback'
|
||||
screenshot_dir = snapshot_dir / 'screenshot'
|
||||
screenshot_dir.mkdir(parents=True)
|
||||
|
||||
# Don't set up Chrome session or staticfile - screenshot should launch its own browser
|
||||
env = get_test_env()
|
||||
result = subprocess.run(
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=test789'],
|
||||
cwd=tmpdir,
|
||||
['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-fallback'],
|
||||
cwd=str(screenshot_dir),
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=120,
|
||||
@@ -73,7 +227,7 @@ def test_extracts_screenshot_from_example_com():
|
||||
|
||||
assert result.returncode == 0, f"Extraction failed: {result.stderr}"
|
||||
|
||||
# Parse JSONL output (clean format without RESULT_JSON= prefix)
|
||||
# Parse JSONL output
|
||||
result_json = None
|
||||
for line in result.stdout.strip().split('\n'):
|
||||
line = line.strip()
|
||||
@@ -88,20 +242,54 @@ def test_extracts_screenshot_from_example_com():
|
||||
|
||||
assert result_json, "Should have ArchiveResult JSONL output"
|
||||
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
|
||||
assert result_json['output_str'] == 'screenshot.png'
|
||||
assert 'screenshot.png' in result_json['output_str']
|
||||
|
||||
# Verify filesystem output (hook creates screenshot.png directly in working dir)
|
||||
screenshot_file = tmpdir / 'screenshot.png'
|
||||
# Verify file created
|
||||
screenshot_file = screenshot_dir / 'screenshot.png'
|
||||
assert screenshot_file.exists(), "screenshot.png not created"
|
||||
assert screenshot_file.stat().st_size > 1000, "Screenshot too small"
|
||||
|
||||
# Verify file is valid PNG
|
||||
file_size = screenshot_file.stat().st_size
|
||||
assert file_size > 1000, f"Screenshot too small: {file_size} bytes"
|
||||
assert file_size < 10 * 1024 * 1024, f"Screenshot suspiciously large: {file_size} bytes"
|
||||
|
||||
# Check PNG magic bytes
|
||||
screenshot_data = screenshot_file.read_bytes()
|
||||
assert screenshot_data[:8] == b'\x89PNG\r\n\x1a\n', "Should be valid PNG file"
|
||||
def test_skips_when_staticfile_exists():
    """Screenshot hook must no-op (status=skipped) when the staticfile extractor already ran."""
    with tempfile.TemporaryDirectory() as tmpdir:
        # Replicate the production snapshot layout under a throwaway DATA_DIR.
        base = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-skip'
        screenshot_dir = base / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        # Simulate a prior staticfile extractor run for the same snapshot.
        staticfile_dir = base / 'staticfile'
        staticfile_dir.mkdir()
        (staticfile_dir / 'index.html').write_text('<html></html>')

        result = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-skip'],
            cwd=str(screenshot_dir),
            env=get_test_env(),
            capture_output=True,
            text=True,
            timeout=30,
        )

        assert result.returncode == 0, f"Should exit successfully: {result.stderr}"

        # The skip is reported via an ArchiveResult JSONL record on stdout.
        result_json = None
        for candidate in result.stdout.strip().split('\n'):
            candidate = candidate.strip()
            if not candidate.startswith('{'):
                continue
            try:
                record = json.loads(candidate)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break

        assert result_json, "Should have ArchiveResult JSONL output"
        assert result_json['status'] == 'skipped', f"Should skip: {result_json}"
|
||||
|
||||
|
||||
def test_config_save_screenshot_false_skips():
|
||||
@@ -134,13 +322,11 @@ def test_config_save_screenshot_false_skips():
|
||||
|
||||
def test_reports_missing_chrome():
|
||||
"""Test that script reports error when Chrome is not found."""
|
||||
import os
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
tmpdir = Path(tmpdir)
|
||||
|
||||
# Set CHROME_BINARY to nonexistent path
|
||||
env = os.environ.copy()
|
||||
env = get_test_env()
|
||||
env['CHROME_BINARY'] = '/nonexistent/chrome'
|
||||
|
||||
result = subprocess.run(
|
||||
@@ -158,6 +344,59 @@ def test_reports_missing_chrome():
|
||||
assert 'chrome' in combined.lower() or 'browser' in combined.lower() or 'ERROR=' in combined
|
||||
|
||||
|
||||
def test_custom_resolution_and_user_agent():
    """CHROME_RESOLUTION and CHROME_USER_AGENT config values must be honored by the hook."""
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-config'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        # Override the default viewport and user agent via env config.
        env = get_test_env()
        env.update({'CHROME_RESOLUTION': '800,600', 'CHROME_USER_AGENT': 'Test/1.0'})

        result = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-config'],
            cwd=str(screenshot_dir),
            env=env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert result.returncode == 0, f"Extraction failed: {result.stderr}"

        screenshot_file = screenshot_dir / 'screenshot.png'
        assert screenshot_file.exists(), "screenshot.png not created"
        # Resolution affects file size
        assert screenshot_file.stat().st_size > 500, "Screenshot too small"
|
||||
|
||||
|
||||
def test_ssl_check_disabled():
    """CHROME_CHECK_SSL_VALIDITY=False must allow captures despite invalid certificates."""
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ssl'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        env = get_test_env()
        env['CHROME_CHECK_SSL_VALIDITY'] = 'False'

        hook_cmd = ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ssl']
        result = subprocess.run(
            hook_cmd,
            cwd=str(screenshot_dir),
            env=env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert result.returncode == 0, f"Should succeed: {result.stderr}"
        assert (screenshot_dir / 'screenshot.png').exists()
|
||||
|
||||
|
||||
def test_config_timeout_honored():
|
||||
"""Test that CHROME_TIMEOUT config is respected."""
|
||||
import os
|
||||
@@ -182,5 +421,410 @@ def test_config_timeout_honored():
|
||||
assert result.returncode in (0, 1), "Should complete without hanging"
|
||||
|
||||
|
||||
def test_missing_url_argument():
    """Omitting --url must make the hook exit non-zero with a usage hint on stderr."""
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        result = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), '--snapshot-id=test-missing-url'],
            cwd=workdir,
            env=get_test_env(),
            capture_output=True,
            text=True,
            timeout=30,
        )

        # Argument validation should happen before any browser work.
        assert result.returncode != 0, "Should fail when URL is missing"
        assert 'Usage:' in result.stderr or 'url' in result.stderr.lower()
|
||||
|
||||
|
||||
def test_missing_snapshot_id_argument():
    """Omitting --snapshot-id must make the hook exit non-zero with a usage hint on stderr."""
    with tempfile.TemporaryDirectory() as tmpdir:
        workdir = Path(tmpdir)

        result = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}'],
            cwd=workdir,
            env=get_test_env(),
            capture_output=True,
            text=True,
            timeout=30,
        )

        # Argument validation should happen before any browser work.
        assert result.returncode != 0, "Should fail when snapshot-id is missing"
        assert 'Usage:' in result.stderr or 'snapshot' in result.stderr.lower()
|
||||
|
||||
|
||||
def test_invalid_resolution_format():
    """Malformed CHROME_RESOLUTION values must not crash the hook with an uncaught error."""
    bad_resolutions = ('invalid', '1440', '1440x2000', 'abc,def')
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-badres'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        env = get_test_env()
        for bad_resolution in bad_resolutions:
            env['CHROME_RESOLUTION'] = bad_resolution
            result = subprocess.run(
                ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-badres'],
                cwd=str(screenshot_dir),
                env=env,
                capture_output=True,
                text=True,
                timeout=120,
            )
            # The script may fail cleanly (exit 1) or fall back to a default resolution
            # (exit 0), but must never die with an unhandled exception / other exit code.
            assert result.returncode in (0, 1), f"Script should handle bad resolution: {bad_resolution}"
|
||||
|
||||
|
||||
def test_boolean_env_var_parsing():
    """Assorted truthy spellings of CHROME_HEADLESS must not crash the hook's boolean parser."""
    import time

    truthy_spellings = ('true', '1', 'yes', 'on', 'True', 'TRUE')
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-bool'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        env = get_test_env()
        for bool_val in truthy_spellings:
            env['CHROME_HEADLESS'] = bool_val
            result = subprocess.run(
                ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-bool'],
                cwd=str(screenshot_dir),
                env=env,
                capture_output=True,
                text=True,
                timeout=120,
            )
            # Success or a clean failure are both fine; a crash on boolean parsing is not.
            assert result.returncode in (0, 1), f"Should handle boolean value: {bool_val}"

            # Remove any screenshot so the next spelling starts from a clean slate.
            (screenshot_dir / 'screenshot.png').unlink(missing_ok=True)

            time.sleep(0.5)  # Brief pause between attempts
|
||||
|
||||
|
||||
def test_integer_env_var_parsing():
    """Test that integer environment variables are parsed correctly.

    Runs the hook with a valid, an invalid, and an empty CHROME_TIMEOUT value.
    Invalid/empty values should make the script fall back to its default
    timeout rather than crash the Node process.

    Note: the original test carried a `should_work` flag per case that was
    never read; every case is asserted identically, so the flag was removed.
    """
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir)
        snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-int'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        env = get_test_env()

        # '60' is a valid integer; 'invalid' and '' must fall back to the default.
        for timeout_val in ('60', 'invalid', ''):
            env['CHROME_TIMEOUT'] = timeout_val
            result = subprocess.run(
                ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-int'],
                cwd=str(screenshot_dir),
                capture_output=True,
                text=True,
                timeout=120,
                env=env
            )
            # Should either succeed or fail gracefully, but shouldn't crash on int parsing
            assert result.returncode in (0, 1), f"Should handle timeout value: {timeout_val}"

            # Clean up screenshot file (if created) so iterations stay independent.
            (screenshot_dir / 'screenshot.png').unlink(missing_ok=True)
|
||||
|
||||
|
||||
def test_extracts_screenshot_with_all_config_options():
    """Exercise every config knob at once and verify a successful end-to-end screenshot."""
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-full'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        # Set ALL config options to exercise all code paths.
        env = get_test_env()
        env.update({
            'CHROME_HEADLESS': 'true',
            'CHROME_RESOLUTION': '800,600',
            'CHROME_USER_AGENT': 'TestBot/1.0',
            'CHROME_CHECK_SSL_VALIDITY': 'false',  # Exercises checkSsl branch
            'CHROME_TIMEOUT': '60',
        })

        result = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-full'],
            cwd=str(screenshot_dir),
            env=env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert result.returncode == 0, f"Screenshot should succeed: {result.stderr}"

        # Locate the ArchiveResult record in the JSONL stdout stream.
        result_json = None
        for raw in result.stdout.strip().split('\n'):
            raw = raw.strip()
            if not raw.startswith('{'):
                continue
            try:
                record = json.loads(raw)
            except json.JSONDecodeError:
                continue
            if record.get('type') == 'ArchiveResult':
                result_json = record
                break

        assert result_json, "Should have ArchiveResult JSONL output"
        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
        assert 'screenshot.png' in result_json['output_str']

        # Verify the file landed in the hook's working directory.
        screenshot_file = screenshot_dir / 'screenshot.png'
        assert screenshot_file.exists(), "screenshot.png should be created"
        assert screenshot_file.stat().st_size > 1000, "Screenshot should have content"
|
||||
|
||||
|
||||
def test_headless_mode_false():
    """Exercise the CHROME_HEADLESS=false branch of the launcher config."""
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-headless'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        env = get_test_env()
        env['CHROME_HEADLESS'] = 'false'  # Exercises the ternary false branch

        result = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-headless-false'],
            cwd=str(screenshot_dir),
            env=env,
            capture_output=True,
            text=True,
            timeout=120,
        )
        # Headed mode may be impossible in CI; a clean failure is acceptable.
        assert result.returncode in (0, 1), f"Headless=false should handle: {result.stderr}"
|
||||
|
||||
|
||||
def test_invalid_url_causes_error():
    """A non-resolvable URL must make the hook exit non-zero without emitting JSONL."""
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-invalid'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        env = get_test_env()
        env['CHROME_TIMEOUT'] = '5'  # Short timeout

        hook_cmd = [
            'node',
            str(SCREENSHOT_HOOK),
            '--url=http://this-domain-does-not-exist-12345.invalid',
            '--snapshot-id=snap-invalid',
        ]
        result = subprocess.run(
            hook_cmd,
            cwd=str(screenshot_dir),
            env=env,
            capture_output=True,
            text=True,
            timeout=30,
        )

        # Navigation failure => non-zero exit.
        assert result.returncode != 0, "Should fail on invalid URL"
        # Transient errors must NOT produce an ArchiveResult record.
        jsonl_lines = [ln for ln in result.stdout.strip().split('\n') if ln.strip().startswith('{')]
        assert len(jsonl_lines) == 0, f"Should not emit JSONL on error: {jsonl_lines}"
|
||||
|
||||
|
||||
def test_with_corrupted_cdp_url_falls_back():
    """A bogus cdp_url.txt must trigger fallback to launching a fresh browser."""
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-corrupt-cdp'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        # Plant a sibling chrome/ dir with an unreachable CDP endpoint.
        chrome_dir = snapshot_dir / 'chrome'
        chrome_dir.mkdir()
        (chrome_dir / 'cdp_url.txt').write_text('ws://127.0.0.1:99999/invalid')

        env = get_test_env()
        env['CHROME_HEADLESS'] = 'true'
        env['CHROME_TIMEOUT'] = '5'  # Short timeout for fast test

        # The hook should try CDP, fail fast, then launch its own browser.
        result = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-corrupt-cdp'],
            cwd=str(screenshot_dir),
            env=env,
            capture_output=True,
            text=True,
            timeout=30,
        )

        assert result.returncode == 0, f"Should fallback and succeed: {result.stderr}"
        assert 'Failed to connect to CDP' in result.stderr, "Should log CDP connection failure"
        assert (screenshot_dir / 'screenshot.png').exists(), "Screenshot should be created via fallback"
|
||||
|
||||
|
||||
def test_user_agent_is_applied():
    """A custom CHROME_USER_AGENT must not prevent a successful capture."""
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-ua'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        env = get_test_env()
        env.update({
            'CHROME_USER_AGENT': 'CustomBot/9.9.9 (Testing)',
            'CHROME_HEADLESS': 'true',
        })

        result = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-ua'],
            cwd=str(screenshot_dir),
            env=env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert result.returncode == 0, f"Should succeed with custom UA: {result.stderr}"
        assert (screenshot_dir / 'screenshot.png').exists(), "Screenshot should be created"
|
||||
|
||||
|
||||
def test_check_ssl_false_branch():
    """Test CHROME_CHECK_SSL_VALIDITY=false adds ignore-certificate-errors arg."""
    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir)
        snapshot_dir = data_dir / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-nossl'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        env = get_test_env()
        env['CHROME_CHECK_SSL_VALIDITY'] = 'false'
        env['CHROME_HEADLESS'] = 'true'

        # NOTE(review): only the string 'false' is exercised here, and the launch
        # arg itself is not inspected — we only verify the hook still succeeds
        # end-to-end with SSL checking disabled.
        result = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-nossl'],
            cwd=str(screenshot_dir),
            capture_output=True,
            text=True,
            timeout=120,
            env=env
        )

        assert result.returncode == 0, f"Should work with SSL check disabled: {result.stderr}"
        assert (screenshot_dir / 'screenshot.png').exists()
|
||||
|
||||
|
||||
def test_alternative_env_var_names():
    """Un-prefixed env vars (TIMEOUT, RESOLUTION, ...) must be accepted as fallbacks."""
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-altenv'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        # Alternative names without the CHROME_ prefix.
        env = get_test_env()
        env.update({
            'TIMEOUT': '45',
            'RESOLUTION': '1024,768',
            'USER_AGENT': 'AltBot/1.0',
            'CHECK_SSL_VALIDITY': 'false',
        })

        result = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-altenv'],
            cwd=str(screenshot_dir),
            env=env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert result.returncode == 0, f"Should work with alternative env vars: {result.stderr}"
        assert (screenshot_dir / 'screenshot.png').exists()
|
||||
|
||||
|
||||
def test_very_large_resolution():
    """A 4K CHROME_RESOLUTION must be handled and yield a proportionally larger file."""
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-large'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        env = get_test_env()
        env.update({
            'CHROME_RESOLUTION': '3840,2160',  # 4K resolution
            'CHROME_HEADLESS': 'true',
        })

        result = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-large'],
            cwd=str(screenshot_dir),
            env=env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert result.returncode == 0, f"Should handle large resolution: {result.stderr}"
        screenshot_file = screenshot_dir / 'screenshot.png'
        assert screenshot_file.exists()
        # 4K screenshot should be larger
        assert screenshot_file.stat().st_size > 5000, "4K screenshot should be substantial"
|
||||
|
||||
|
||||
def test_very_small_resolution():
    """A tiny CHROME_RESOLUTION must still produce a valid screenshot file."""
    with tempfile.TemporaryDirectory() as tmpdir:
        snapshot_dir = Path(tmpdir) / 'users' / 'testuser' / 'snapshots' / '20240101' / 'example.com' / 'snap-small'
        screenshot_dir = snapshot_dir / 'screenshot'
        screenshot_dir.mkdir(parents=True)

        env = get_test_env()
        env.update({
            'CHROME_RESOLUTION': '320,240',  # Very small
            'CHROME_HEADLESS': 'true',
        })

        result = subprocess.run(
            ['node', str(SCREENSHOT_HOOK), f'--url={TEST_URL}', '--snapshot-id=snap-small'],
            cwd=str(screenshot_dir),
            env=env,
            capture_output=True,
            text=True,
            timeout=120,
        )

        assert result.returncode == 0, f"Should handle small resolution: {result.stderr}"
        assert (screenshot_dir / 'screenshot.png').exists()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly (equivalent to `pytest -v` on it).
    pytest.main([__file__, '-v'])
|
||||
|
||||
85
archivebox/plugins/singlefile/on_Crawl__08_singlefile_install.py
Executable file
85
archivebox/plugins/singlefile/on_Crawl__08_singlefile_install.py
Executable file
@@ -0,0 +1,85 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Detect single-file binary and emit Binary JSONL record.
|
||||
|
||||
Output: Binary JSONL record to stdout if single-file is found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Return env var *name* with surrounding whitespace stripped, or *default* when unset."""
    raw_value = os.environ.get(name, default)
    return raw_value.strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse env var *name* as a boolean; unrecognized or unset values return *default*."""
    val = get_env(name, '').lower()
    truthy = ('true', '1', 'yes', 'on')
    falsy = ('false', '0', 'no', 'off')
    if val in truthy:
        return True
    if val in falsy:
        return False
    # Anything else (including empty/unset) falls back to the caller's default.
    return default
|
||||
|
||||
|
||||
def output_binary_found(binary: Binary, name: str):
    """Emit a Binary JSONL record on stdout describing an already-installed binary."""
    record = {
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',  # Already installed
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))
|
||||
|
||||
|
||||
def output_binary_missing(name: str, binproviders: str):
    """Emit a Binary JSONL record on stdout for a binary that still needs installation."""
    record = {
        'type': 'Binary',
        'name': name,
        'binproviders': binproviders,  # Providers that can install it
        'machine_id': os.environ.get('MACHINE_ID', ''),
    }
    print(json.dumps(record))
|
||||
|
||||
|
||||
def main():
    """Detect a single-file binary on the PATH and emit a Binary JSONL record.

    Probes 'single-file-cli' first, then the legacy 'single-file' name; when
    neither resolves, emits a "missing" record pointing at the npm provider.
    Always exits 0 (a missing binary is reported, not a failure).
    """
    if not get_env_bool('SINGLEFILE_ENABLED', True):
        sys.exit(0)

    provider = EnvProvider()

    # Prefer the standalone CLI package name, then the legacy alias.
    for binary_name in ('single-file-cli', 'single-file'):
        try:
            binary = Binary(name=binary_name, binproviders=[provider]).load()
        except Exception:
            # Probe failure for this name — try the next candidate.
            continue
        if binary.abspath:
            output_binary_found(binary, name='single-file')
            break
    else:
        # No candidate resolved: report it as installable via npm.
        output_binary_missing(name='single-file', binproviders='npm')

    sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this install hook directly for debugging.
    main()
|
||||
30
archivebox/plugins/wget/on_Crawl__06_wget_install.py
Normal file → Executable file
30
archivebox/plugins/wget/on_Crawl__06_wget_install.py
Normal file → Executable file
@@ -40,8 +40,8 @@ def get_env_int(name: str, default: int = 0) -> int:
|
||||
return default
|
||||
|
||||
|
||||
def output_binary(binary: Binary, name: str):
|
||||
"""Output Binary JSONL record to stdout."""
|
||||
def output_binary_found(binary: Binary, name: str):
|
||||
"""Output Binary JSONL record for an installed binary."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
@@ -50,7 +50,20 @@ def output_binary(binary: Binary, name: str):
|
||||
'abspath': str(binary.abspath),
|
||||
'version': str(binary.version) if binary.version else '',
|
||||
'sha256': binary.sha256 or '',
|
||||
'binprovider': 'env',
|
||||
'binprovider': 'env', # Already installed
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
|
||||
|
||||
def output_binary_missing(name: str, binproviders: str):
|
||||
"""Output Binary JSONL record for a missing binary that needs installation."""
|
||||
machine_id = os.environ.get('MACHINE_ID', '')
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': name,
|
||||
'binproviders': binproviders, # Providers that can install it
|
||||
'machine_id': machine_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
@@ -89,16 +102,19 @@ def main():
|
||||
binary_path = ''
|
||||
|
||||
if not binary_path:
|
||||
if use_wget:
|
||||
errors.append(f"WGET_BINARY={wget_binary} not found. Install wget or set WGET_ENABLED=false.")
|
||||
# Binary not found
|
||||
computed['WGET_BINARY'] = ''
|
||||
if use_wget:
|
||||
# Emit Binary record for installation
|
||||
output_binary_missing(name='wget', binproviders='apt,brew')
|
||||
else:
|
||||
# Binary found
|
||||
computed['WGET_BINARY'] = binary_path
|
||||
wget_version = str(binary.version) if binary.version else 'unknown'
|
||||
computed['WGET_VERSION'] = wget_version
|
||||
|
||||
# Output Binary JSONL record
|
||||
output_binary(binary, name='wget')
|
||||
# Output Binary JSONL record for installed binary
|
||||
output_binary_found(binary, name='wget')
|
||||
|
||||
# Check for compression support
|
||||
if computed.get('WGET_BINARY'):
|
||||
|
||||
80
archivebox/plugins/ytdlp/on_Crawl__07_ytdlp_install.py
Executable file
80
archivebox/plugins/ytdlp/on_Crawl__07_ytdlp_install.py
Executable file
@@ -0,0 +1,80 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Detect yt-dlp binary and emit Binary JSONL record.
|
||||
|
||||
Output: Binary JSONL record to stdout if yt-dlp is found
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
|
||||
from abx_pkg import Binary, EnvProvider
|
||||
|
||||
|
||||
def get_env(name: str, default: str = '') -> str:
    """Return the value of env var *name* with whitespace stripped, or *default* if unset."""
    return os.environ.get(name, default).strip()
|
||||
|
||||
def get_env_bool(name: str, default: bool = False) -> bool:
    """Parse env var *name* as a boolean; unrecognized or unset values return *default*."""
    val = get_env(name, '').lower()
    if val in ('true', '1', 'yes', 'on'):
        return True
    if val in ('false', '0', 'no', 'off'):
        return False
    # Anything else (including empty/unset) falls back to the caller's default.
    return default
|
||||
|
||||
|
||||
def output_binary_found(binary: Binary, name: str):
    """Output Binary JSONL record for an installed binary.

    Args:
        binary: abx_pkg Binary that has been .load()ed (abspath/version populated).
        name: Canonical binary name to report (may differ from the probed name).
    """
    machine_id = os.environ.get('MACHINE_ID', '')  # '' when the orchestrator didn't set it

    record = {
        'type': 'Binary',
        'name': name,
        'abspath': str(binary.abspath),
        'version': str(binary.version) if binary.version else '',
        'sha256': binary.sha256 or '',
        'binprovider': 'env',  # Already installed
        'machine_id': machine_id,
    }
    print(json.dumps(record))  # single JSONL record on stdout
|
||||
|
||||
|
||||
def output_binary_missing(name: str, binproviders: str):
    """Output Binary JSONL record for a missing binary that needs installation.

    Args:
        name: Canonical binary name.
        binproviders: Comma-separated providers that could install it (e.g. 'pip,brew,apt').
    """
    machine_id = os.environ.get('MACHINE_ID', '')  # '' when the orchestrator didn't set it

    record = {
        'type': 'Binary',
        'name': name,
        'binproviders': binproviders,  # Providers that can install it
        'machine_id': machine_id,
    }
    print(json.dumps(record))  # single JSONL record on stdout
|
||||
|
||||
|
||||
def main():
    """Detect yt-dlp on the PATH and emit a Binary JSONL record (found or missing).

    Always exits 0 — a missing binary is reported via JSONL, not treated as
    a hook failure.
    """
    ytdlp_enabled = get_env_bool('YTDLP_ENABLED', True)
    ytdlp_binary = get_env('YTDLP_BINARY', 'yt-dlp')

    # Respect the kill-switch: emit nothing when the plugin is disabled.
    if not ytdlp_enabled:
        sys.exit(0)

    provider = EnvProvider()
    fallback_providers = 'pip,brew,apt'
    try:
        binary = Binary(name=ytdlp_binary, binproviders=[provider]).load()
        if binary.abspath:
            output_binary_found(binary, name='yt-dlp')
        else:
            # Lookup succeeded but nothing resolved on the PATH.
            output_binary_missing(name='yt-dlp', binproviders=fallback_providers)
    except Exception:
        # Treat any probe failure the same as "not installed".
        output_binary_missing(name='yt-dlp', binproviders=fallback_providers)

    sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Allow running this install hook directly for debugging.
    main()
|
||||
@@ -10,6 +10,7 @@ Migration tests from 0.8.x to 0.9.x.
|
||||
- New fields like depth, retry_at, etc.
|
||||
"""
|
||||
|
||||
import json
|
||||
import shutil
|
||||
import sqlite3
|
||||
import subprocess
|
||||
@@ -78,29 +79,43 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
self.assertTrue(ok, msg)
|
||||
|
||||
def test_migration_preserves_crawls(self):
|
||||
"""Migration should preserve all Crawl records."""
|
||||
"""Migration should preserve all Crawl records and create default crawl if needed."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Count snapshots with NULL crawl_id in original data
|
||||
snapshots_without_crawl = sum(1 for s in self.original_data['snapshots'] if s['crawl_id'] is None)
|
||||
|
||||
# Expected count: original crawls + 1 default crawl if any snapshots had NULL crawl_id
|
||||
expected_count = len(self.original_data['crawls'])
|
||||
if snapshots_without_crawl > 0:
|
||||
expected_count += 1 # Migration 0024 creates a default crawl
|
||||
|
||||
ok, msg = verify_crawl_count(self.db_path, expected_count)
|
||||
self.assertTrue(ok, msg)
|
||||
|
||||
def test_migration_preserves_snapshot_crawl_links(self):
|
||||
"""Migration should preserve snapshot-to-crawl relationships."""
|
||||
"""Migration should preserve snapshot-to-crawl relationships and assign default crawl to orphans."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Check EVERY snapshot still has its crawl_id
|
||||
# Check EVERY snapshot has a crawl_id after migration
|
||||
for snapshot in self.original_data['snapshots']:
|
||||
cursor.execute("SELECT crawl_id FROM core_snapshot WHERE url = ?", (snapshot['url'],))
|
||||
row = cursor.fetchone()
|
||||
self.assertIsNotNone(row, f"Snapshot {snapshot['url']} not found after migration")
|
||||
self.assertEqual(row[0], snapshot['crawl_id'],
|
||||
f"Crawl ID mismatch for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}")
|
||||
|
||||
if snapshot['crawl_id'] is not None:
|
||||
# Snapshots that had a crawl should keep it
|
||||
self.assertEqual(row[0], snapshot['crawl_id'],
|
||||
f"Crawl ID changed for {snapshot['url']}: expected {snapshot['crawl_id']}, got {row[0]}")
|
||||
else:
|
||||
# Snapshots without a crawl should now have one (the default crawl)
|
||||
self.assertIsNotNone(row[0],
|
||||
f"Snapshot {snapshot['url']} should have been assigned to default crawl but has NULL")
|
||||
|
||||
conn.close()
|
||||
|
||||
@@ -153,7 +168,7 @@ class TestMigrationFrom08x(unittest.TestCase):
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
result = run_archivebox(self.work_dir, ['list'])
|
||||
result = run_archivebox(self.work_dir, ['snapshot', 'list'])
|
||||
self.assertEqual(result.returncode, 0, f"List failed after migration: {result.stderr}")
|
||||
|
||||
# Verify ALL snapshots appear in output
|
||||
@@ -475,357 +490,227 @@ class TestFilesystemMigration08to09(unittest.TestCase):
|
||||
"""Clean up temporary directory."""
|
||||
shutil.rmtree(self.work_dir, ignore_errors=True)
|
||||
|
||||
def test_filesystem_migration_with_real_archiving(self):
|
||||
def test_archiveresult_files_preserved_after_migration(self):
|
||||
"""
|
||||
Test that filesystem migration works with real archived content.
|
||||
Test that ArchiveResult output files are reorganized into new structure.
|
||||
|
||||
Steps:
|
||||
1. Initialize archivebox
|
||||
2. Archive https://example.com (creates real files)
|
||||
3. Manually set fs_version to 0.8.0
|
||||
4. Trigger migration by saving snapshot
|
||||
5. Verify files are organized correctly
|
||||
This test verifies that:
|
||||
1. Migration preserves ArchiveResult data in Process/Binary records
|
||||
2. Running `archivebox update` reorganizes files into new structure
|
||||
3. New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
|
||||
4. All files are moved (no data loss)
|
||||
5. Old archive/timestamp/ directories are cleaned up
|
||||
"""
|
||||
# Step 1: Initialize
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
# Use the real 0.7.2 database which has actual ArchiveResults with files
|
||||
gold_db = Path('/Users/squash/Local/Code/archiveboxes/archivebox-migration-path/archivebox-v0.7.2/data')
|
||||
if not gold_db.exists():
|
||||
self.skipTest(f"Gold standard database not found at {gold_db}")
|
||||
|
||||
# Step 2: Archive example.com with ALL extractors enabled
|
||||
# This ensures we test migration with all file types
|
||||
try:
|
||||
result = run_archivebox(
|
||||
self.work_dir,
|
||||
['add', '--depth=0', 'https://example.com'],
|
||||
timeout=300, # 5 minutes for all extractors
|
||||
env={
|
||||
'SAVE_TITLE': 'True',
|
||||
'SAVE_FAVICON': 'True',
|
||||
'SAVE_WGET': 'True',
|
||||
'SAVE_SCREENSHOT': 'True',
|
||||
'SAVE_DOM': 'True',
|
||||
'SAVE_SINGLEFILE': 'True',
|
||||
'SAVE_READABILITY': 'True',
|
||||
'SAVE_MERCURY': 'True',
|
||||
'SAVE_PDF': 'True',
|
||||
'SAVE_YTDLP': 'True',
|
||||
'SAVE_ARCHIVEDOTORG': 'True',
|
||||
'SAVE_HEADERS': 'True',
|
||||
'SAVE_HTMLTOTEXT': 'True',
|
||||
'SAVE_GIT': 'True',
|
||||
}
|
||||
)
|
||||
except subprocess.TimeoutExpired as e:
|
||||
# If timeout, still continue - we want to test with whatever files were created
|
||||
print(f"\n[!] Add command timed out after {e.timeout}s, continuing with partial results...")
|
||||
# Note: Snapshot may still have been created even if command timed out
|
||||
# Copy gold database to test directory
|
||||
import shutil
|
||||
for item in gold_db.iterdir():
|
||||
if item.is_dir():
|
||||
shutil.copytree(item, self.work_dir / item.name, dirs_exist_ok=True)
|
||||
else:
|
||||
shutil.copy2(item, self.work_dir / item.name)
|
||||
|
||||
# Step 3: Get the snapshot and verify files were created
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT id, url, timestamp, fs_version FROM core_snapshot WHERE url = ?", ('https://example.com',))
|
||||
row = cursor.fetchone()
|
||||
conn.close()
|
||||
|
||||
if not row:
|
||||
self.skipTest("Failed to create snapshot for https://example.com")
|
||||
|
||||
snapshot_id, url, timestamp, fs_version = row
|
||||
|
||||
# Verify initial fs_version is 0.9.0 (current version)
|
||||
self.assertEqual(fs_version, '0.9.0', f"Expected new snapshot to have fs_version='0.9.0', got '{fs_version}'")
|
||||
|
||||
# Verify output directory exists
|
||||
output_dir = self.work_dir / 'archive' / timestamp
|
||||
self.assertTrue(output_dir.exists(), f"Output directory not found: {output_dir}")
|
||||
|
||||
# List all files created (for debugging)
|
||||
files_before = list(output_dir.rglob('*'))
|
||||
files_before_count = len([f for f in files_before if f.is_file()])
|
||||
print(f"\n[*] Files created by archiving: {files_before_count}")
|
||||
for f in sorted(files_before):
|
||||
if f.is_file():
|
||||
print(f" {f.relative_to(output_dir)}")
|
||||
|
||||
# Step 4: Manually set fs_version to 0.8.0 to simulate old snapshot
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("UPDATE core_snapshot SET fs_version = '0.8.0' WHERE id = ?", (snapshot_id,))
|
||||
conn.commit()
|
||||
|
||||
# Verify the update worked
|
||||
cursor.execute("SELECT fs_version FROM core_snapshot WHERE id = ?", (snapshot_id,))
|
||||
updated_version = cursor.fetchone()[0]
|
||||
conn.close()
|
||||
self.assertEqual(updated_version, '0.8.0', "Failed to set fs_version to 0.8.0")
|
||||
|
||||
# Step 5: Trigger migration by running a command that loads and saves the snapshot
|
||||
# We'll use the Python API directly to trigger save()
|
||||
import os
|
||||
import sys
|
||||
import django
|
||||
|
||||
# Setup Django
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
|
||||
os.environ['DATA_DIR'] = str(self.work_dir)
|
||||
|
||||
# Add parent dir to path so we can import archivebox
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
|
||||
|
||||
try:
|
||||
django.setup()
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
# Load the snapshot (should trigger migration on save)
|
||||
snapshot = Snapshot.objects.get(url='https://example.com')
|
||||
|
||||
# Verify fs_migration_needed returns True
|
||||
self.assertTrue(snapshot.fs_migration_needed,
|
||||
f"fs_migration_needed should be True for fs_version='0.8.0'")
|
||||
|
||||
# Save to trigger migration
|
||||
print(f"\n[*] Triggering filesystem migration by saving snapshot...")
|
||||
snapshot.save()
|
||||
|
||||
# Refresh from DB
|
||||
snapshot.refresh_from_db()
|
||||
|
||||
# Verify migration completed
|
||||
self.assertEqual(snapshot.fs_version, '0.9.0',
|
||||
f"Migration failed: fs_version is still '{snapshot.fs_version}'")
|
||||
self.assertFalse(snapshot.fs_migration_needed,
|
||||
"fs_migration_needed should be False after migration")
|
||||
|
||||
print(f"[√] Filesystem migration completed: 0.8.0 -> 0.9.0")
|
||||
|
||||
except Exception as e:
|
||||
self.fail(f"Failed to trigger migration via Django: {e}")
|
||||
|
||||
# Step 6: Verify files still exist and are accessible
|
||||
# For 0.8 -> 0.9, the migration is a no-op, so files should be in the same place
|
||||
files_after = list(output_dir.rglob('*'))
|
||||
files_after_count = len([f for f in files_after if f.is_file()])
|
||||
|
||||
print(f"\n[*] Files after migration: {files_after_count}")
|
||||
|
||||
# Verify no files were lost
|
||||
self.assertGreaterEqual(files_after_count, files_before_count,
|
||||
f"Files were lost during migration: {files_before_count} -> {files_after_count}")
|
||||
|
||||
|
||||
class TestDBOnlyCommands(unittest.TestCase):
|
||||
"""Test that status/search/list commands only use DB, not filesystem."""
|
||||
|
||||
def setUp(self):
|
||||
"""Create a temporary directory with 0.8.x schema and data."""
|
||||
self.work_dir = Path(tempfile.mkdtemp())
|
||||
self.db_path = self.work_dir / 'index.sqlite3'
|
||||
|
||||
create_data_dir_structure(self.work_dir)
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
conn.executescript(SCHEMA_0_8)
|
||||
conn.close()
|
||||
self.original_data = seed_0_8_data(self.db_path)
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up temporary directory."""
|
||||
shutil.rmtree(self.work_dir, ignore_errors=True)
|
||||
|
||||
def test_status_works_with_empty_archive(self):
|
||||
"""Status command should work with empty archive/ (queries DB only)."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Add a snapshot to DB
|
||||
result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60)
|
||||
|
||||
# Empty the archive directory (but keep it existing)
|
||||
# Count archive directories and files BEFORE migration
|
||||
archive_dir = self.work_dir / 'archive'
|
||||
if archive_dir.exists():
|
||||
for item in archive_dir.iterdir():
|
||||
if item.is_dir():
|
||||
shutil.rmtree(item)
|
||||
else:
|
||||
item.unlink()
|
||||
dirs_before = list(archive_dir.glob('*')) if archive_dir.exists() else []
|
||||
dirs_before_count = len([d for d in dirs_before if d.is_dir()])
|
||||
|
||||
# Status should still work (queries DB only, doesn't scan filesystem)
|
||||
result = run_archivebox(self.work_dir, ['status'])
|
||||
self.assertEqual(result.returncode, 0,
|
||||
f"Status should work with empty archive: {result.stderr}")
|
||||
# Count total files in all archive directories
|
||||
files_before = []
|
||||
for d in dirs_before:
|
||||
if d.is_dir():
|
||||
files_before.extend([f for f in d.rglob('*') if f.is_file()])
|
||||
files_before_count = len(files_before)
|
||||
|
||||
# Should show count from DB
|
||||
output = result.stdout + result.stderr
|
||||
self.assertIn('Total', output,
|
||||
"Status should show DB statistics even with no files")
|
||||
# Sample some specific files to check they're preserved
|
||||
sample_files = [
|
||||
'favicon.ico',
|
||||
'screenshot.png',
|
||||
'singlefile.html',
|
||||
'headers.json',
|
||||
]
|
||||
sample_paths_before = {}
|
||||
for d in dirs_before:
|
||||
if d.is_dir():
|
||||
for sample_file in sample_files:
|
||||
matching = list(d.glob(sample_file))
|
||||
if matching:
|
||||
sample_paths_before[f"{d.name}/{sample_file}"] = matching[0]
|
||||
|
||||
def test_list_works_with_empty_archive(self):
|
||||
"""List command should work with empty archive/ (queries DB only)."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
print(f"\n[*] Archive directories before migration: {dirs_before_count}")
|
||||
print(f"[*] Total files before migration: {files_before_count}")
|
||||
print(f"[*] Sample files found: {len(sample_paths_before)}")
|
||||
|
||||
# Add a snapshot to DB
|
||||
result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60)
|
||||
# Run init to trigger migration
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=60)
|
||||
self.assertEqual(result.returncode, 0, f"Init (migration) failed: {result.stderr}")
|
||||
|
||||
# Empty the archive directory (but keep it existing)
|
||||
archive_dir = self.work_dir / 'archive'
|
||||
if archive_dir.exists():
|
||||
for item in archive_dir.iterdir():
|
||||
if item.is_dir():
|
||||
shutil.rmtree(item)
|
||||
else:
|
||||
item.unlink()
|
||||
# Count archive directories and files AFTER migration
|
||||
dirs_after = list(archive_dir.glob('*')) if archive_dir.exists() else []
|
||||
dirs_after_count = len([d for d in dirs_after if d.is_dir()])
|
||||
|
||||
# List should still work (queries DB only, doesn't scan filesystem)
|
||||
result = run_archivebox(self.work_dir, ['list'])
|
||||
self.assertEqual(result.returncode, 0,
|
||||
f"List should work with empty archive: {result.stderr}")
|
||||
files_after = []
|
||||
for d in dirs_after:
|
||||
if d.is_dir():
|
||||
files_after.extend([f for f in d.rglob('*') if f.is_file()])
|
||||
files_after_count = len(files_after)
|
||||
|
||||
# Should show snapshot from DB
|
||||
output = result.stdout + result.stderr
|
||||
self.assertIn('example.com', output,
|
||||
"Snapshot should appear in list output even with no files")
|
||||
# Verify sample files still exist
|
||||
sample_paths_after = {}
|
||||
for d in dirs_after:
|
||||
if d.is_dir():
|
||||
for sample_file in sample_files:
|
||||
matching = list(d.glob(sample_file))
|
||||
if matching:
|
||||
sample_paths_after[f"{d.name}/{sample_file}"] = matching[0]
|
||||
|
||||
def test_search_works_with_empty_archive(self):
|
||||
"""Search command should work with empty archive/ (queries DB only)."""
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
print(f"[*] Archive directories after migration: {dirs_after_count}")
|
||||
print(f"[*] Total files after migration: {files_after_count}")
|
||||
print(f"[*] Sample files found: {len(sample_paths_after)}")
|
||||
|
||||
# Add a snapshot to DB
|
||||
result = run_archivebox(self.work_dir, ['add', 'https://example.com'], timeout=60)
|
||||
# Verify files still in old structure after migration (not moved yet)
|
||||
self.assertEqual(dirs_before_count, dirs_after_count,
|
||||
f"Archive directories lost during migration: {dirs_before_count} -> {dirs_after_count}")
|
||||
self.assertEqual(files_before_count, files_after_count,
|
||||
f"Files lost during migration: {files_before_count} -> {files_after_count}")
|
||||
|
||||
# Empty the archive directory (but keep it existing)
|
||||
archive_dir = self.work_dir / 'archive'
|
||||
if archive_dir.exists():
|
||||
for item in archive_dir.iterdir():
|
||||
if item.is_dir():
|
||||
shutil.rmtree(item)
|
||||
else:
|
||||
item.unlink()
|
||||
|
||||
# Search should still work (queries DB only, doesn't scan filesystem)
|
||||
result = run_archivebox(self.work_dir, ['search'])
|
||||
self.assertEqual(result.returncode, 0,
|
||||
f"Search should work with empty archive: {result.stderr}")
|
||||
|
||||
# Should show snapshot from DB
|
||||
output = result.stdout + result.stderr
|
||||
self.assertIn('example.com', output,
|
||||
"Snapshot should appear in search output even with no files")
|
||||
|
||||
|
||||
class TestUpdateCommandArchitecture(unittest.TestCase):
|
||||
"""Test new update command architecture: filters=DB only, no filters=scan filesystem."""
|
||||
|
||||
def setUp(self):
|
||||
"""Create a temporary directory with 0.8.x schema and data."""
|
||||
self.work_dir = Path(tempfile.mkdtemp())
|
||||
self.db_path = self.work_dir / 'index.sqlite3'
|
||||
create_data_dir_structure(self.work_dir)
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up temporary directory."""
|
||||
shutil.rmtree(self.work_dir, ignore_errors=True)
|
||||
|
||||
def test_update_with_filters_uses_db_only(self):
|
||||
"""Update with filters should only query DB, not scan filesystem."""
|
||||
# Initialize with data
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
conn.executescript(SCHEMA_0_8)
|
||||
conn.close()
|
||||
seed_0_8_data(self.db_path)
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Run update with filter - should not scan filesystem
|
||||
# Use a URL from the seeded data
|
||||
result = run_archivebox(self.work_dir, ['update', 'example.com'], timeout=120)
|
||||
# Should complete successfully (or with orchestrator error, which is okay)
|
||||
# The key is it should not scan filesystem
|
||||
|
||||
def test_update_without_filters_imports_orphans(self):
|
||||
"""Update without filters should scan filesystem and import orphaned directories."""
|
||||
# Initialize empty DB
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
|
||||
# Create an orphaned directory in archive/
|
||||
timestamp = '1609459200'
|
||||
orphan_dir = self.work_dir / 'archive' / timestamp
|
||||
orphan_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
index_data = {
|
||||
'url': 'https://orphan.example.com',
|
||||
'timestamp': timestamp,
|
||||
'title': 'Orphaned Snapshot',
|
||||
}
|
||||
(orphan_dir / 'index.json').write_text(json.dumps(index_data))
|
||||
(orphan_dir / 'index.html').write_text('<html>Orphan</html>')
|
||||
|
||||
# Count snapshots before update
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT COUNT(*) FROM core_snapshot")
|
||||
count_before = cursor.fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
# Run full update (no filters) - should scan filesystem
|
||||
# Run update to trigger filesystem reorganization
|
||||
print(f"\n[*] Running archivebox update to reorganize filesystem...")
|
||||
result = run_archivebox(self.work_dir, ['update'], timeout=120)
|
||||
self.assertEqual(result.returncode, 0, f"Update failed: {result.stderr}")
|
||||
|
||||
# Check if orphan was imported
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
cursor.execute("SELECT COUNT(*) FROM core_snapshot WHERE url = ?",
|
||||
('https://orphan.example.com',))
|
||||
orphan_count = cursor.fetchone()[0]
|
||||
conn.close()
|
||||
# Check new filesystem structure
|
||||
# New structure: users/username/snapshots/YYYYMMDD/example.com/snap-uuid-here/output.ext
|
||||
users_dir = self.work_dir / 'users'
|
||||
snapshots_base = None
|
||||
|
||||
# If update succeeded, orphan should be imported
|
||||
if result.returncode == 0:
|
||||
self.assertGreaterEqual(orphan_count, 1,
|
||||
"Orphaned snapshot should be imported by update")
|
||||
if users_dir.exists():
|
||||
# Find the snapshots directory
|
||||
for user_dir in users_dir.iterdir():
|
||||
if user_dir.is_dir():
|
||||
user_snapshots = user_dir / 'snapshots'
|
||||
if user_snapshots.exists():
|
||||
snapshots_base = user_snapshots
|
||||
break
|
||||
|
||||
print(f"[*] New structure base: {snapshots_base}")
|
||||
|
||||
class TestTimestampUniqueness(unittest.TestCase):
|
||||
"""Test timestamp uniqueness constraint."""
|
||||
# Count files in new structure
|
||||
# Structure: users/{username}/snapshots/YYYYMMDD/{domain}/{uuid}/files...
|
||||
files_new_structure = []
|
||||
new_sample_files = {}
|
||||
|
||||
def setUp(self):
|
||||
"""Create a temporary directory."""
|
||||
self.work_dir = Path(tempfile.mkdtemp())
|
||||
self.db_path = self.work_dir / 'index.sqlite3'
|
||||
create_data_dir_structure(self.work_dir)
|
||||
if snapshots_base and snapshots_base.exists():
|
||||
for date_dir in snapshots_base.iterdir():
|
||||
if date_dir.is_dir():
|
||||
for domain_dir in date_dir.iterdir():
|
||||
if domain_dir.is_dir():
|
||||
for snap_dir in domain_dir.iterdir():
|
||||
if snap_dir.is_dir():
|
||||
# Files are directly in snap-uuid/ directory (no plugin subdirs)
|
||||
for f in snap_dir.rglob('*'):
|
||||
if f.is_file():
|
||||
files_new_structure.append(f)
|
||||
# Track sample files
|
||||
if f.name in sample_files:
|
||||
new_sample_files[f"{snap_dir.name}/{f.name}"] = f
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up temporary directory."""
|
||||
shutil.rmtree(self.work_dir, ignore_errors=True)
|
||||
files_new_count = len(files_new_structure)
|
||||
print(f"[*] Files in new structure: {files_new_count}")
|
||||
print(f"[*] Sample files in new structure: {len(new_sample_files)}")
|
||||
|
||||
def test_timestamp_uniqueness_constraint_exists(self):
|
||||
"""Database should have timestamp uniqueness constraint after migration."""
|
||||
# Initialize with 0.8.x and migrate
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
conn.executescript(SCHEMA_0_8)
|
||||
conn.close()
|
||||
# Check old structure (should be gone or empty)
|
||||
old_archive_dir = self.work_dir / 'archive'
|
||||
old_files_remaining = []
|
||||
unmigrated_dirs = []
|
||||
if old_archive_dir.exists():
|
||||
for d in old_archive_dir.glob('*'):
|
||||
# Only count REAL directories, not symlinks (symlinks are the migrated ones)
|
||||
if d.is_dir(follow_symlinks=False) and d.name.replace('.', '').isdigit():
|
||||
# This is a timestamp directory (old structure)
|
||||
files_in_dir = [f for f in d.rglob('*') if f.is_file()]
|
||||
if files_in_dir:
|
||||
unmigrated_dirs.append((d.name, len(files_in_dir)))
|
||||
old_files_remaining.extend(files_in_dir)
|
||||
|
||||
result = run_archivebox(self.work_dir, ['init'], timeout=45)
|
||||
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
|
||||
old_files_count = len(old_files_remaining)
|
||||
print(f"[*] Files remaining in old structure: {old_files_count}")
|
||||
if unmigrated_dirs:
|
||||
print(f"[*] Unmigrated directories: {unmigrated_dirs}")
|
||||
|
||||
# Check if unique_timestamp constraint exists
|
||||
# CRITICAL: Verify files were moved to new structure
|
||||
self.assertGreater(files_new_count, 0,
|
||||
"No files found in new structure after update")
|
||||
|
||||
# CRITICAL: Verify old structure is cleaned up
|
||||
self.assertEqual(old_files_count, 0,
|
||||
f"Old structure not cleaned up: {old_files_count} files still in archive/timestamp/ directories")
|
||||
|
||||
# CRITICAL: Verify all files were moved (total count should match)
|
||||
total_after_update = files_new_count + old_files_count
|
||||
self.assertEqual(files_before_count, total_after_update,
|
||||
f"Files lost during reorganization: {files_before_count} before → {total_after_update} after")
|
||||
|
||||
# CRITICAL: Verify sample files exist in new structure
|
||||
self.assertGreater(len(new_sample_files), 0,
|
||||
f"Sample files not found in new structure")
|
||||
|
||||
# Verify new path format
|
||||
for path_key, file_path in new_sample_files.items():
|
||||
# Path should contain: snapshots/YYYYMMDD/domain/snap-uuid/plugin/file
|
||||
path_parts = file_path.parts
|
||||
self.assertIn('snapshots', path_parts,
|
||||
f"New path should contain 'snapshots': {file_path}")
|
||||
self.assertIn('users', path_parts,
|
||||
f"New path should contain 'users': {file_path}")
|
||||
print(f" ✓ {path_key} → {file_path.relative_to(self.work_dir)}")
|
||||
|
||||
# Verify Process and Binary records were created
|
||||
conn = sqlite3.connect(str(self.db_path))
|
||||
cursor = conn.cursor()
|
||||
|
||||
# Query sqlite_master for constraints
|
||||
cursor.execute("""
|
||||
SELECT sql FROM sqlite_master
|
||||
WHERE type='table' AND name='core_snapshot'
|
||||
""")
|
||||
table_sql = cursor.fetchone()[0]
|
||||
cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
|
||||
archiveresult_count = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM machine_process")
|
||||
process_count = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM machine_binary")
|
||||
binary_count = cursor.fetchone()[0]
|
||||
|
||||
cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NOT NULL")
|
||||
linked_count = cursor.fetchone()[0]
|
||||
|
||||
conn.close()
|
||||
|
||||
# Should contain unique_timestamp constraint or UNIQUE(timestamp)
|
||||
has_constraint = 'unique_timestamp' in table_sql.lower() or \
|
||||
'unique' in table_sql.lower() and 'timestamp' in table_sql.lower()
|
||||
print(f"[*] ArchiveResults: {archiveresult_count}")
|
||||
print(f"[*] Process records created: {process_count}")
|
||||
print(f"[*] Binary records created: {binary_count}")
|
||||
print(f"[*] ArchiveResults linked to Process: {linked_count}")
|
||||
|
||||
# Verify data migration happened correctly
|
||||
# The 0.7.2 gold database has 44 ArchiveResults
|
||||
self.assertEqual(archiveresult_count, 44,
|
||||
f"Expected 44 ArchiveResults from 0.7.2 database, got {archiveresult_count}")
|
||||
|
||||
# Each ArchiveResult should create one Process record
|
||||
self.assertEqual(process_count, 44,
|
||||
f"Expected 44 Process records (1 per ArchiveResult), got {process_count}")
|
||||
|
||||
# The 44 ArchiveResults use 7 unique binaries (curl, wget, etc.)
|
||||
self.assertEqual(binary_count, 7,
|
||||
f"Expected 7 unique Binary records, got {binary_count}")
|
||||
|
||||
# ALL ArchiveResults should be linked to Process records
|
||||
self.assertEqual(linked_count, 44,
|
||||
f"Expected all 44 ArchiveResults linked to Process, got {linked_count}")
|
||||
|
||||
|
||||
|
||||
self.assertTrue(has_constraint,
|
||||
f"Timestamp uniqueness constraint should exist. Table SQL: {table_sql}")
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
@@ -76,11 +76,11 @@ class Orchestrator:
|
||||
self.idle_count: int = 0
|
||||
self._last_cleanup_time: float = 0.0 # For throttling cleanup_stale_running()
|
||||
|
||||
# CRITICAL: In foreground mode (exit_on_idle=True), use ONLY 1 worker
|
||||
# to keep execution strictly sequential and deterministic
|
||||
# In foreground mode (exit_on_idle=True), limit workers but allow enough
|
||||
# for crawl progression: 1 CrawlWorker + 1 SnapshotWorker + 1 ArchiveResultWorker
|
||||
if self.exit_on_idle:
|
||||
self.MAX_WORKERS_PER_TYPE = 1
|
||||
self.MAX_TOTAL_WORKERS = 1
|
||||
self.MAX_TOTAL_WORKERS = 3 # Allow one worker of each type to run concurrently
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f'[underline]Orchestrator[/underline]\\[pid={self.pid}]'
|
||||
@@ -157,32 +157,41 @@ class Orchestrator:
|
||||
self._last_cleanup_time = now
|
||||
|
||||
return sum(len(W.get_running_workers()) for W in self.WORKER_TYPES)
|
||||
|
||||
def get_running_workers_for_type(self, WorkerClass: Type[Worker]) -> int:
|
||||
"""Get count of running workers for a specific worker type."""
|
||||
return len(WorkerClass.get_running_workers())
|
||||
|
||||
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
|
||||
"""Determine if we should spawn a new worker of the given type."""
|
||||
if queue_count == 0:
|
||||
return False
|
||||
|
||||
|
||||
# Check per-type limit
|
||||
running_workers = WorkerClass.get_running_workers()
|
||||
if len(running_workers) >= self.MAX_WORKERS_PER_TYPE:
|
||||
running_count = len(running_workers)
|
||||
|
||||
if running_count >= self.MAX_WORKERS_PER_TYPE:
|
||||
return False
|
||||
|
||||
|
||||
# Check total limit
|
||||
if self.get_total_worker_count() >= self.MAX_TOTAL_WORKERS:
|
||||
total_workers = self.get_total_worker_count()
|
||||
if total_workers >= self.MAX_TOTAL_WORKERS:
|
||||
return False
|
||||
|
||||
|
||||
# Check if we already have enough workers for the queue size
|
||||
# Spawn more gradually - don't flood with workers
|
||||
if len(running_workers) > 0 and queue_count <= len(running_workers) * WorkerClass.MAX_CONCURRENT_TASKS:
|
||||
if running_count > 0 and queue_count <= running_count * WorkerClass.MAX_CONCURRENT_TASKS:
|
||||
return False
|
||||
|
||||
|
||||
return True
|
||||
|
||||
def spawn_worker(self, WorkerClass: Type[Worker]) -> int | None:
|
||||
"""Spawn a new worker process. Returns PID or None if spawn failed."""
|
||||
try:
|
||||
print(f'[yellow]DEBUG: Spawning {WorkerClass.name} worker with crawl_id={self.crawl_id}...[/yellow]')
|
||||
pid = WorkerClass.start(daemon=False, crawl_id=self.crawl_id)
|
||||
print(f'[yellow]DEBUG: Spawned {WorkerClass.name} worker with PID={pid}[/yellow]')
|
||||
|
||||
# CRITICAL: Block until worker registers itself in Process table
|
||||
# This prevents race condition where orchestrator spawns multiple workers
|
||||
@@ -202,6 +211,15 @@ class Orchestrator:
|
||||
# 3. RUNNING status
|
||||
# 4. Parent is this orchestrator
|
||||
# 5. Started recently (within last 10 seconds)
|
||||
|
||||
# Debug: Check all processes with this PID first
|
||||
if elapsed < 0.5:
|
||||
all_procs = list(Process.objects.filter(pid=pid))
|
||||
print(f'[yellow]DEBUG spawn_worker: elapsed={elapsed:.1f}s pid={pid} orchestrator_id={self.db_process.id}[/yellow]')
|
||||
print(f'[yellow] Found {len(all_procs)} Process records for pid={pid}[/yellow]')
|
||||
for p in all_procs:
|
||||
print(f'[yellow] -> type={p.process_type} status={p.status} parent_id={p.parent_id} match={p.parent_id == self.db_process.id}[/yellow]')
|
||||
|
||||
worker_process = Process.objects.filter(
|
||||
pid=pid,
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
@@ -212,6 +230,7 @@ class Orchestrator:
|
||||
|
||||
if worker_process:
|
||||
# Worker successfully registered!
|
||||
print(f'[green]DEBUG spawn_worker: Worker registered! Returning pid={pid}[/green]')
|
||||
return pid
|
||||
|
||||
time.sleep(poll_interval)
|
||||
@@ -244,7 +263,7 @@ class Orchestrator:
|
||||
Returns dict of queue sizes by worker type.
|
||||
"""
|
||||
queue_sizes = {}
|
||||
|
||||
|
||||
for WorkerClass in self.WORKER_TYPES:
|
||||
# Get queue for this worker type
|
||||
# Need to instantiate worker to get queue (for model access)
|
||||
@@ -392,11 +411,18 @@ class Orchestrator:
|
||||
|
||||
def _run_orchestrator_loop(self, progress, task_ids):
|
||||
"""Run the main orchestrator loop with optional progress display."""
|
||||
last_queue_sizes = {}
|
||||
last_snapshot_count = None
|
||||
try:
|
||||
while True:
|
||||
# Check queues and spawn workers
|
||||
queue_sizes = self.check_queues_and_spawn_workers()
|
||||
|
||||
# Debug queue sizes (only when changed)
|
||||
if progress and queue_sizes != last_queue_sizes:
|
||||
progress.console.print(f'[yellow]DEBUG: Queue sizes: {queue_sizes}[/yellow]')
|
||||
last_queue_sizes = queue_sizes.copy()
|
||||
|
||||
# Update progress bars
|
||||
if progress:
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -412,6 +438,11 @@ class Orchestrator:
|
||||
|
||||
active_snapshots = list(Snapshot.objects.filter(**snapshot_filter))
|
||||
|
||||
# Debug snapshot count (only when changed)
|
||||
if len(active_snapshots) != last_snapshot_count:
|
||||
progress.console.print(f'[yellow]DEBUG: Found {len(active_snapshots)} active snapshots (crawl_id={self.crawl_id})[/yellow]')
|
||||
last_snapshot_count = len(active_snapshots)
|
||||
|
||||
# Track which snapshots are still active
|
||||
active_ids = set()
|
||||
|
||||
@@ -461,7 +492,9 @@ class Orchestrator:
|
||||
del task_ids[snapshot_id]
|
||||
|
||||
# Track idle state
|
||||
if self.has_pending_work(queue_sizes) or self.has_running_workers():
|
||||
has_pending = self.has_pending_work(queue_sizes)
|
||||
has_running = self.has_running_workers()
|
||||
if has_pending or has_running:
|
||||
self.idle_count = 0
|
||||
self.on_tick(queue_sizes)
|
||||
else:
|
||||
|
||||
@@ -60,8 +60,8 @@ class Worker:
|
||||
# Configuration (can be overridden by subclasses)
|
||||
MAX_TICK_TIME: ClassVar[int] = 60
|
||||
MAX_CONCURRENT_TASKS: ClassVar[int] = 1
|
||||
POLL_INTERVAL: ClassVar[float] = 0.2 # How often to check for new work (seconds)
|
||||
IDLE_TIMEOUT: ClassVar[int] = 50 # Exit after N idle iterations (10 sec at 0.2 poll interval)
|
||||
POLL_INTERVAL: ClassVar[float] = 0.1 # How often to check for new work (seconds)
|
||||
IDLE_TIMEOUT: ClassVar[int] = 100 # Exit after N idle iterations (10 sec at 0.1 poll interval)
|
||||
|
||||
def __init__(self, worker_id: int = 0, daemon: bool = False, crawl_id: str | None = None, **kwargs: Any):
|
||||
self.worker_id = worker_id
|
||||
@@ -93,7 +93,9 @@ class Worker:
|
||||
Returns the claimed object or None if queue is empty or claim failed.
|
||||
"""
|
||||
Model = self.get_model()
|
||||
obj = self.get_queue().first()
|
||||
|
||||
queue = self.get_queue()
|
||||
obj = queue.first()
|
||||
if obj is None:
|
||||
return None
|
||||
|
||||
@@ -132,10 +134,17 @@ class Worker:
|
||||
self.pid = os.getpid()
|
||||
# Register this worker process in the database
|
||||
self.db_process = Process.current()
|
||||
# Explicitly set process_type to WORKER to prevent mis-detection
|
||||
# Explicitly set process_type to WORKER and store worker type name
|
||||
update_fields = []
|
||||
if self.db_process.process_type != Process.TypeChoices.WORKER:
|
||||
self.db_process.process_type = Process.TypeChoices.WORKER
|
||||
self.db_process.save(update_fields=['process_type'])
|
||||
update_fields.append('process_type')
|
||||
# Store worker type name (crawl/snapshot/archiveresult) in worker_type field
|
||||
if not self.db_process.worker_type:
|
||||
self.db_process.worker_type = self.name
|
||||
update_fields.append('worker_type')
|
||||
if update_fields:
|
||||
self.db_process.save(update_fields=update_fields)
|
||||
|
||||
# Determine worker type for logging
|
||||
worker_type_name = self.__class__.__name__
|
||||
@@ -316,7 +325,12 @@ class Worker:
|
||||
|
||||
Process.cleanup_stale_running()
|
||||
# Convert Process objects to dicts to match the expected API contract
|
||||
processes = Process.get_running(process_type=Process.TypeChoices.WORKER)
|
||||
# Filter by worker_type to get only workers of this specific type (crawl/snapshot/archiveresult)
|
||||
processes = Process.objects.filter(
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
worker_type=cls.name, # Filter by specific worker type
|
||||
status__in=['running', 'started']
|
||||
)
|
||||
# Note: worker_id is not stored on Process model, it's dynamically generated
|
||||
# We return process_id (UUID) and pid (OS process ID) instead
|
||||
return [
|
||||
@@ -334,7 +348,11 @@ class Worker:
|
||||
"""Get count of running workers of this type."""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
return Process.get_running_count(process_type=Process.TypeChoices.WORKER)
|
||||
return Process.objects.filter(
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
worker_type=cls.name, # Filter by specific worker type
|
||||
status__in=['running', 'started']
|
||||
).count()
|
||||
|
||||
|
||||
class CrawlWorker(Worker):
|
||||
|
||||
@@ -3,18 +3,23 @@
|
||||
#
|
||||
# All plugin tests use pytest and are located in pluginname/tests/test_*.py
|
||||
#
|
||||
# Usage: ./bin/test_plugins.sh [plugin_name] [--no-coverage]
|
||||
# Usage: ./bin/test_plugins.sh [plugin_name] [--no-coverage] [--coverage-report]
|
||||
#
|
||||
# Examples:
|
||||
# ./bin/test_plugins.sh # Run all plugin tests with coverage
|
||||
# ./bin/test_plugins.sh chrome # Run chrome plugin tests with coverage
|
||||
# ./bin/test_plugins.sh parse_* # Run all parse_* plugin tests with coverage
|
||||
# ./bin/test_plugins.sh --no-coverage # Run all tests without coverage
|
||||
# ./bin/test_plugins.sh --coverage-report # Just show coverage report without running tests
|
||||
#
|
||||
# Coverage results are saved to .coverage and can be viewed with:
|
||||
# coverage combine
|
||||
# coverage report
|
||||
# For running individual hooks with coverage:
|
||||
# NODE_V8_COVERAGE=./coverage/js node <hook>.js [args] # JS hooks
|
||||
# coverage run --parallel-mode <hook>.py [args] # Python hooks
|
||||
#
|
||||
# Coverage results are saved to .coverage (Python) and coverage/js (JavaScript):
|
||||
# coverage combine && coverage report
|
||||
# coverage json
|
||||
# ./bin/test_plugins.sh --coverage-report
|
||||
|
||||
set -e
|
||||
|
||||
@@ -30,15 +35,134 @@ ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
|
||||
# Parse arguments
|
||||
PLUGIN_FILTER=""
|
||||
ENABLE_COVERAGE=true
|
||||
COVERAGE_REPORT_ONLY=false
|
||||
|
||||
for arg in "$@"; do
|
||||
if [ "$arg" = "--no-coverage" ]; then
|
||||
ENABLE_COVERAGE=false
|
||||
elif [ "$arg" = "--coverage-report" ]; then
|
||||
COVERAGE_REPORT_ONLY=true
|
||||
else
|
||||
PLUGIN_FILTER="$arg"
|
||||
fi
|
||||
done
|
||||
|
||||
# Function to show JS coverage report (inlined from convert_v8_coverage.js)
|
||||
show_js_coverage() {
|
||||
local coverage_dir="$1"
|
||||
|
||||
if [ ! -d "$coverage_dir" ] || [ -z "$(ls -A "$coverage_dir" 2>/dev/null)" ]; then
|
||||
echo "No JavaScript coverage data collected"
|
||||
echo "(JS hooks may not have been executed during tests)"
|
||||
return
|
||||
fi
|
||||
|
||||
node - "$coverage_dir" << 'ENDJS'
|
||||
const fs = require('fs');
|
||||
const path = require('path');
|
||||
const coverageDir = process.argv[2];
|
||||
|
||||
const files = fs.readdirSync(coverageDir).filter(f => f.startsWith('coverage-') && f.endsWith('.json'));
|
||||
if (files.length === 0) {
|
||||
console.log('No coverage files found');
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
const coverageByFile = {};
|
||||
|
||||
files.forEach(file => {
|
||||
const data = JSON.parse(fs.readFileSync(path.join(coverageDir, file), 'utf8'));
|
||||
data.result.forEach(script => {
|
||||
const url = script.url;
|
||||
if (url.startsWith('node:') || url.includes('node_modules')) return;
|
||||
|
||||
if (!coverageByFile[url]) {
|
||||
coverageByFile[url] = { totalRanges: 0, executedRanges: 0 };
|
||||
}
|
||||
|
||||
script.functions.forEach(func => {
|
||||
func.ranges.forEach(range => {
|
||||
coverageByFile[url].totalRanges++;
|
||||
if (range.count > 0) coverageByFile[url].executedRanges++;
|
||||
});
|
||||
});
|
||||
});
|
||||
});
|
||||
|
||||
const allFiles = Object.keys(coverageByFile).sort();
|
||||
const pluginFiles = allFiles.filter(url => url.includes('archivebox/plugins'));
|
||||
const otherFiles = allFiles.filter(url => !url.startsWith('node:') && !url.includes('archivebox/plugins'));
|
||||
|
||||
console.log('Total files with coverage: ' + allFiles.length + '\n');
|
||||
console.log('Plugin files: ' + pluginFiles.length);
|
||||
console.log('Node internal: ' + allFiles.filter(u => u.startsWith('node:')).length);
|
||||
console.log('Other: ' + otherFiles.length + '\n');
|
||||
|
||||
console.log('JavaScript Coverage Report');
|
||||
console.log('='.repeat(80));
|
||||
console.log('');
|
||||
|
||||
if (otherFiles.length > 0) {
|
||||
console.log('Non-plugin files with coverage:');
|
||||
otherFiles.forEach(url => console.log(' ' + url));
|
||||
console.log('');
|
||||
}
|
||||
|
||||
if (pluginFiles.length === 0) {
|
||||
console.log('No plugin files covered');
|
||||
process.exit(0);
|
||||
}
|
||||
|
||||
let totalRanges = 0, totalExecuted = 0;
|
||||
|
||||
pluginFiles.forEach(url => {
|
||||
const cov = coverageByFile[url];
|
||||
const pct = cov.totalRanges > 0 ? (cov.executedRanges / cov.totalRanges * 100).toFixed(1) : '0.0';
|
||||
const match = url.match(/archivebox\/plugins\/.+/);
|
||||
const displayPath = match ? match[0] : url;
|
||||
console.log(displayPath + ': ' + pct + '% (' + cov.executedRanges + '/' + cov.totalRanges + ' ranges)');
|
||||
totalRanges += cov.totalRanges;
|
||||
totalExecuted += cov.executedRanges;
|
||||
});
|
||||
|
||||
console.log('');
|
||||
console.log('-'.repeat(80));
|
||||
const overallPct = totalRanges > 0 ? (totalExecuted / totalRanges * 100).toFixed(1) : '0.0';
|
||||
console.log('Total: ' + overallPct + '% (' + totalExecuted + '/' + totalRanges + ' ranges)');
|
||||
ENDJS
|
||||
}
|
||||
|
||||
# If --coverage-report only, just show the report and exit
|
||||
if [ "$COVERAGE_REPORT_ONLY" = true ]; then
|
||||
cd "$ROOT_DIR" || exit 1
|
||||
echo "=========================================="
|
||||
echo "Python Coverage Summary"
|
||||
echo "=========================================="
|
||||
coverage combine 2>/dev/null || true
|
||||
coverage report --include="archivebox/plugins/*" --omit="*/tests/*"
|
||||
echo ""
|
||||
|
||||
echo "=========================================="
|
||||
echo "JavaScript Coverage Summary"
|
||||
echo "=========================================="
|
||||
show_js_coverage "$ROOT_DIR/coverage/js"
|
||||
echo ""
|
||||
|
||||
echo "For detailed coverage reports:"
|
||||
echo " Python: coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'"
|
||||
echo " Python: coverage json # LLM-friendly format"
|
||||
echo " Python: coverage html # Interactive HTML report"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Set DATA_DIR for tests (required by abx_pkg and plugins)
|
||||
# Use temp dir to isolate tests from project files
|
||||
if [ -z "$DATA_DIR" ]; then
|
||||
export DATA_DIR=$(mktemp -d -t archivebox_plugin_tests.XXXXXX)
|
||||
# Clean up on exit
|
||||
trap "rm -rf '$DATA_DIR'" EXIT
|
||||
fi
|
||||
|
||||
# Reset coverage data if collecting coverage
|
||||
if [ "$ENABLE_COVERAGE" = true ]; then
|
||||
echo "Resetting coverage data..."
|
||||
@@ -161,19 +285,14 @@ elif [ $FAILED_PLUGINS -eq 0 ]; then
|
||||
echo "=========================================="
|
||||
echo "JavaScript Coverage Summary"
|
||||
echo "=========================================="
|
||||
if [ -d "$ROOT_DIR/coverage/js" ] && [ "$(ls -A "$ROOT_DIR/coverage/js" 2>/dev/null)" ]; then
|
||||
node "$ROOT_DIR/bin/convert_v8_coverage.js" "$ROOT_DIR/coverage/js"
|
||||
else
|
||||
echo "No JavaScript coverage data collected"
|
||||
echo "(JS hooks may not have been executed during tests)"
|
||||
fi
|
||||
show_js_coverage "$ROOT_DIR/coverage/js"
|
||||
echo ""
|
||||
|
||||
echo "For detailed coverage reports (from project root):"
|
||||
echo " Python: coverage report --show-missing --include='archivebox/plugins/*' --omit='*/tests/*'"
|
||||
echo " Python: coverage json # LLM-friendly format"
|
||||
echo " Python: coverage html # Interactive HTML report"
|
||||
echo " JavaScript: node bin/convert_v8_coverage.js coverage/js"
|
||||
echo " JavaScript: ./bin/test_plugins.sh --coverage-report"
|
||||
fi
|
||||
|
||||
exit 0
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
import pytest
|
||||
@@ -1,97 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for archivebox crawl command.
|
||||
Verify crawl creates snapshots with depth.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_crawl_creates_snapshots(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that crawl command works on existing snapshots."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
# First add a snapshot
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Then run crawl on it
|
||||
result = subprocess.run(
|
||||
['archivebox', 'crawl', '--depth=0', 'https://example.com'],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert result.returncode in [0, 1, 2] # May succeed or fail depending on URL
|
||||
|
||||
# Check snapshot was created
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
assert count == 1
|
||||
|
||||
|
||||
def test_crawl_with_depth_0(tmp_path, process, disable_extractors_dict):
|
||||
"""Test crawl with depth=0 works on existing snapshot."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
# First add a snapshot
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Then crawl it
|
||||
subprocess.run(
|
||||
['archivebox', 'crawl', '--depth=0', 'https://example.com'],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
count = c.execute("SELECT COUNT(*) FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
# Should have at least 1 snapshot from the add command
|
||||
assert count >= 1
|
||||
|
||||
|
||||
def test_crawl_creates_crawl_record(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that add+crawl creates Crawl records."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
# First add a snapshot (this creates a Crawl)
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Then crawl it
|
||||
subprocess.run(
|
||||
['archivebox', 'crawl', '--depth=0', 'https://example.com'],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
crawl_count = c.execute("SELECT COUNT(*) FROM crawls_crawl").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
# Should have at least 1 crawl from the add command
|
||||
assert crawl_count >= 1
|
||||
@@ -1,63 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Tests for archivebox snapshot command.
|
||||
Verify snapshot command works with snapshot IDs/URLs.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sqlite3
|
||||
|
||||
from .fixtures import *
|
||||
|
||||
|
||||
def test_snapshot_command_works_with_url(tmp_path, process, disable_extractors_dict):
|
||||
"""Test that snapshot command works with URL."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
# Add a snapshot first
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Try to view/interact with snapshot
|
||||
result = subprocess.run(
|
||||
['archivebox', 'snapshot', 'https://example.com'],
|
||||
capture_output=True,
|
||||
text=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
# Should complete (exit code depends on implementation)
|
||||
assert result.returncode in [0, 1, 2]
|
||||
|
||||
|
||||
def test_snapshot_command_with_timestamp(tmp_path, process, disable_extractors_dict):
|
||||
"""Test snapshot command with timestamp ID."""
|
||||
os.chdir(tmp_path)
|
||||
|
||||
# Add snapshot
|
||||
subprocess.run(
|
||||
['archivebox', 'add', '--index-only', '--depth=0', 'https://example.com'],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
)
|
||||
|
||||
# Get snapshot timestamp
|
||||
conn = sqlite3.connect("index.sqlite3")
|
||||
c = conn.cursor()
|
||||
timestamp = c.execute("SELECT timestamp FROM core_snapshot").fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
# Try snapshot command with timestamp
|
||||
result = subprocess.run(
|
||||
['archivebox', 'snapshot', str(timestamp)],
|
||||
capture_output=True,
|
||||
env=disable_extractors_dict,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert result.returncode in [0, 1, 2]
|
||||
Reference in New Issue
Block a user