rename archive_org to archivedotorg, add BinaryWorker, fix config pass-through

Nick Sweeting
2026-01-04 22:38:15 -08:00
parent 456aaee287
commit 7ceaeae2d9
32 changed files with 1111 additions and 110 deletions

View File

@@ -1,6 +1,9 @@
{
"permissions": {
"allow": [
"Read(**)",
"Glob(**)",
"Grep(**)",
"Bash(python -m archivebox:*)",
"Bash(ls:*)",
"Bash(xargs:*)",
@@ -29,5 +32,19 @@
"Bash(done)",
"Bash(coverage erase:*)"
]
},
"hooks": {
"PreToolUse": [
{
"matcher": "Bash",
"hooks": [
{
"type": "command",
"command": "REPO_ROOT=$(git rev-parse --show-toplevel 2>/dev/null); if [ -n \"$REPO_ROOT\" ] && [ \"$PWD\" != \"$REPO_ROOT\" ]; then echo \"ERROR: Not in repo root ($REPO_ROOT). Current dir: $PWD\" >&2; exit 1; fi",
"statusMessage": "Checking working directory..."
}
]
}
]
}
}

View File

@@ -128,4 +128,4 @@ class ModelWithOutputDir(ModelWithUUID):
@property
def output_dir(self) -> Path:
raise NotImplementedError(f'{self.__class__.__name__} must implement output_dir property')
raise NotImplementedError(f"{self.__class__.__name__} must implement output_dir property")

View File

@@ -59,10 +59,11 @@ def process_stdin_records() -> int:
"""
from django.utils import timezone
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_BINARY
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.crawls.models import Crawl
from archivebox.machine.models import Binary
from archivebox.workers.orchestrator import Orchestrator
records = list(read_stdin())
@@ -137,6 +138,26 @@ def process_stdin_records() -> int:
output_records.append(archiveresult.to_json())
queued_count += 1
elif record_type == TYPE_BINARY:
# Binary records - create or update and queue for installation
if record_id:
# Existing binary - re-queue
try:
binary = Binary.objects.get(id=record_id)
except Binary.DoesNotExist:
binary = Binary.from_json(record)
else:
# New binary - create it
binary = Binary.from_json(record)
if binary:
binary.retry_at = timezone.now()
if binary.status != Binary.StatusChoices.INSTALLED:
binary.status = Binary.StatusChoices.QUEUED
binary.save()
output_records.append(binary.to_json())
queued_count += 1
else:
# Unknown type - pass through
output_records.append(record)
@@ -222,7 +243,8 @@ def run_snapshot_worker(snapshot_id: str) -> int:
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--crawl-id', help="Run orchestrator for specific crawl only")
@click.option('--snapshot-id', help="Run worker for specific snapshot only")
def main(daemon: bool, crawl_id: str, snapshot_id: str):
@click.option('--binary-id', help="Run worker for specific binary only")
def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
"""
Process queued work.
@@ -231,11 +253,27 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str):
- No args + TTY: Run orchestrator for all work
- --crawl-id: Run orchestrator for that crawl only
- --snapshot-id: Run worker for that snapshot only
- --binary-id: Run worker for that binary only
"""
# Snapshot worker mode
if snapshot_id:
sys.exit(run_snapshot_worker(snapshot_id))
# Binary worker mode
if binary_id:
from archivebox.workers.worker import BinaryWorker
try:
worker = BinaryWorker(binary_id=binary_id, worker_id=0)
worker.runloop()
sys.exit(0)
except KeyboardInterrupt:
sys.exit(0)
except Exception as e:
rprint(f'[red]Worker error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
import traceback
traceback.print_exc()
sys.exit(1)
# Crawl worker mode
if crawl_id:
from archivebox.workers.worker import CrawlWorker

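For reference, the TYPE_BINARY branch above is driven by piping a JSONL Binary record into `archivebox run` on stdin. A minimal sketch, with field values mirroring the BinaryWorker tests added later in this commit:

```python
import json
import subprocess

# A Binary record as consumed by process_stdin_records(); the fields mirror
# the BinaryWorker integration tests added later in this commit.
record = {'type': 'Binary', 'name': 'python3', 'binproviders': 'env'}

# Piping it to `archivebox run` creates/queues the Binary, and the
# orchestrator then spawns a BinaryWorker to install it.
subprocess.run(
    ['python', '-m', 'archivebox', 'run'],
    input=json.dumps(record),
    text=True,
    timeout=30,
)
```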
View File

@@ -123,6 +123,7 @@ def get_config(
user: Any = None,
crawl: Any = None,
snapshot: Any = None,
archiveresult: Any = None,
machine: Any = None,
) -> Dict[str, Any]:
"""
@@ -145,11 +146,26 @@ def get_config(
user: User object with config JSON field
crawl: Crawl object with config JSON field
snapshot: Snapshot object with config JSON field
archiveresult: ArchiveResult object (auto-fetches snapshot)
machine: Machine object with config JSON field (defaults to Machine.current())
Note: Objects are auto-fetched from relationships if not provided:
- snapshot auto-fetched from archiveresult.snapshot
- crawl auto-fetched from snapshot.crawl
- user auto-fetched from crawl.created_by
Returns:
Merged config dict
"""
# Auto-fetch related objects from relationships
if snapshot is None and archiveresult and hasattr(archiveresult, "snapshot"):
snapshot = archiveresult.snapshot
if crawl is None and snapshot and hasattr(snapshot, "crawl"):
crawl = snapshot.crawl
if user is None and crawl and hasattr(crawl, "created_by"):
user = crawl.created_by
from archivebox.config.constants import CONSTANTS
from archivebox.config.common import (
SHELL_CONFIG,
@@ -197,12 +213,18 @@ def get_config(
if machine and hasattr(machine, "config") and machine.config:
config.update(machine.config)
# Override with environment variables
# Override with environment variables (for keys that exist in config)
for key in config:
env_val = os.environ.get(key)
if env_val is not None:
config[key] = _parse_env_value(env_val, config.get(key))
# Also add NEW environment variables (not yet in config)
# This is important for worker subprocesses that receive config via Process.env
for key, value in os.environ.items():
if key.isupper() and key not in config: # Only uppercase keys (config convention)
config[key] = _parse_env_value(value, None)
# Also check plugin config aliases in environment
try:
from archivebox.hooks import discover_plugin_configs
@@ -335,7 +357,7 @@ DEFAULT_WORKER_CONCURRENCY = {
"title": 5,
"favicon": 5,
"headers": 5,
"archive_org": 2,
"archivedotorg": 2,
"readability": 3,
"mercury": 3,
"git": 2,

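The auto-fetch chain documented above is worth isolating. A minimal sketch of the same resolution logic using plain attribute access (the real code additionally guards with hasattr()):

```python
# Sketch of the relationship auto-fetch performed by get_config():
# archiveresult -> snapshot -> crawl -> user, filling only missing arguments.
def resolve_sources(archiveresult=None, snapshot=None, crawl=None, user=None):
    if snapshot is None and archiveresult is not None:
        snapshot = getattr(archiveresult, 'snapshot', None)
    if crawl is None and snapshot is not None:
        crawl = getattr(snapshot, 'crawl', None)
    if user is None and crawl is not None:
        user = getattr(crawl, 'created_by', None)
    return snapshot, crawl, user
```

So `get_config(archiveresult=ar)` picks up `ar.snapshot.config` and `ar.snapshot.crawl.config` without the caller passing them explicitly.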
View File

@@ -147,7 +147,7 @@ class AddLinkForm(forms.Form):
'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
}
archiving = {
'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
'archivedotorg', 'favicon', 'forumdl', 'gallerydl', 'git',
'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
}
parsing = {

View File

@@ -120,7 +120,7 @@ class Migration(migrations.Migration):
('output', models.CharField(max_length=512)),
('start_ts', models.DateTimeField()),
('end_ts', models.DateTimeField()),
('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)),
('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archivedotorg', 'archivedotorg')], max_length=32)),
('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
],
),

View File

@@ -19,6 +19,6 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
),
]

View File

@@ -13,6 +13,6 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
),
]

View File

@@ -13,6 +13,6 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
),
]

View File

@@ -1973,7 +1973,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
canonical = {
'index_path': 'index.html',
'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
'archivedotorg_path': f'https://web.archive.org/web/{self.base_url}',
}
# Scan each ArchiveResult's output directory for the best file

View File

@@ -206,7 +206,10 @@ DATABASES = {
}
MIGRATION_MODULES = {"signal_webhooks": None}
# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
# Django requires DEFAULT_AUTO_FIELD to subclass AutoField (BigAutoField, SmallAutoField, etc.)
# Cannot use UUIDField here until Django 6.0 introduces DEFAULT_PK_FIELD setting
# For now: manually add `id = models.UUIDField(primary_key=True, default=uuid7, ...)` to all models
# OR inherit from ModelWithUUID base class which provides UUID primary key
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"

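The workaround described in the comment is applied to the Persona model later in this commit; as a sketch (hypothetical model name, assumes it lives inside an installed Django app):

```python
from django.db import models
from archivebox.uuid_compat import uuid7

class ExampleModel(models.Model):  # hypothetical; Persona does exactly this below
    # Manual UUID primary key, since DEFAULT_AUTO_FIELD must remain an AutoField subclass
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
```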
View File

@@ -0,0 +1,17 @@
# Generated by Django 6.0 on 2026-01-05 01:09
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('crawls', '0003_remove_crawlschedule_num_uses_failed_and_more'),
]
operations = [
migrations.RemoveField(
model_name='crawl',
name='output_dir',
),
]

View File

@@ -370,20 +370,34 @@ def run_hook(
from archivebox.machine.models import Machine
machine = Machine.current()
if machine and machine.config:
machine_path = machine.config.get('config/PATH')
machine_path = machine.config.get('PATH')
if machine_path:
# Prepend LIB_BIN_DIR to machine PATH as well
if lib_bin_dir and not machine_path.startswith(f'{lib_bin_dir}:'):
env['PATH'] = f'{lib_bin_dir}:{machine_path}'
else:
env['PATH'] = machine_path
# Also set NODE_MODULES_DIR if configured
node_modules_dir = machine.config.get('config/NODE_MODULES_DIR')
if node_modules_dir:
env['NODE_MODULES_DIR'] = node_modules_dir
except Exception:
pass # Fall back to system PATH if Machine not available
# Set NODE_PATH for Node.js module resolution
# Priority: config dict > Machine.config > derive from LIB_DIR
node_path = config.get('NODE_PATH')
if not node_path and lib_dir:
# Derive from LIB_DIR/npm/node_modules
node_modules_dir = Path(lib_dir) / 'npm' / 'node_modules'
if node_modules_dir.exists():
node_path = str(node_modules_dir)
if not node_path:
try:
# Fallback to Machine.config
node_path = machine.config.get('NODE_MODULES_DIR')
except Exception:
pass
if node_path:
env['NODE_PATH'] = node_path
env['NODE_MODULES_DIR'] = node_path # For backwards compatibility
# Export all config values to environment (already merged by get_config())
for key, value in config.items():
if value is None:
@@ -414,56 +428,8 @@ def run_hook(
timeout=timeout,
)
# Build environment from config (Process._build_env() expects self.env dict)
# We need to set env on the process before launching
process.env = {}
for key, value in config.items():
if value is None:
continue
elif isinstance(value, bool):
process.env[key] = 'true' if value else 'false'
elif isinstance(value, (list, dict)):
process.env[key] = json.dumps(value)
else:
process.env[key] = str(value)
# Add base paths to env
process.env['DATA_DIR'] = str(getattr(settings, 'DATA_DIR', Path.cwd()))
process.env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
process.env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
# Add LIB_DIR and LIB_BIN_DIR
lib_dir = config.get('LIB_DIR', getattr(settings, 'LIB_DIR', None))
lib_bin_dir = config.get('LIB_BIN_DIR', getattr(settings, 'LIB_BIN_DIR', None))
if lib_dir:
process.env['LIB_DIR'] = str(lib_dir)
if not lib_bin_dir and lib_dir:
lib_bin_dir = Path(lib_dir) / 'bin'
if lib_bin_dir:
process.env['LIB_BIN_DIR'] = str(lib_bin_dir)
# Set PATH from Machine.config if available
try:
if machine and machine.config:
machine_path = machine.config.get('config/PATH')
if machine_path:
# Prepend LIB_BIN_DIR to machine PATH as well
if lib_bin_dir and not machine_path.startswith(f'{lib_bin_dir}:'):
process.env['PATH'] = f'{lib_bin_dir}:{machine_path}'
else:
process.env['PATH'] = machine_path
elif lib_bin_dir:
# Just prepend to current PATH
current_path = os.environ.get('PATH', '')
if not current_path.startswith(f'{lib_bin_dir}:'):
process.env['PATH'] = f'{lib_bin_dir}:{current_path}' if current_path else str(lib_bin_dir)
# Also set NODE_MODULES_DIR if configured
node_modules_dir = machine.config.get('config/NODE_MODULES_DIR')
if node_modules_dir:
process.env['NODE_MODULES_DIR'] = node_modules_dir
except Exception:
pass # Fall back to system PATH if Machine not available
# Copy the env dict we already built (includes os.environ + all customizations)
process.env = env.copy()
# Save env before launching
process.save()

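The net effect of the deletion above, condensed: instead of rebuilding `process.env` from config alone (which dropped everything inherited from `os.environ`), the hook environment is built once on top of the parent environment and that same dict is copied onto the Process record. A sketch of the surviving logic:

```python
import json
import os

def build_hook_env(config: dict) -> dict:
    # Start from the full parent environment (this is what the old code lost)...
    env = os.environ.copy()
    # ...then layer the merged config on top, stringified for the subprocess.
    for key, value in config.items():
        if value is None:
            continue
        if isinstance(value, bool):
            env[key] = 'true' if value else 'false'
        elif isinstance(value, (list, dict)):
            env[key] = json.dumps(value)
        else:
            env[key] = str(value)
    return env

# run_hook() then hands this same dict to the DB record: process.env = env.copy()
```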
View File

@@ -0,0 +1,17 @@
# Generated by Django 6.0 on 2026-01-05 01:09
from django.db import migrations
class Migration(migrations.Migration):
dependencies = [
('machine', '0010_alter_process_process_type'),
]
operations = [
migrations.RemoveField(
model_name='binary',
name='output_dir',
),
]

View File

@@ -1,7 +1,7 @@
from django.apps import AppConfig
class SessionsConfig(AppConfig):
class PersonasConfig(AppConfig):
default_auto_field = "django.db.models.BigAutoField"
name = "archivebox.personas"
label = "personas"

View File

@@ -1,6 +1,7 @@
# Generated by Django 6.0 on 2025-12-31 09:06
import archivebox.base_models.models
from archivebox.uuid_compat import uuid7
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
@@ -19,7 +20,7 @@ class Migration(migrations.Migration):
migrations.CreateModel(
name='Persona',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('id', models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)),
('config', models.JSONField(blank=True, default=dict, null=True)),
('name', models.CharField(max_length=64, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),

View File

@@ -0,0 +1,19 @@
# Generated by Django 6.0 on 2026-01-05 01:09
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('personas', '0001_initial'),
]
operations = [
migrations.AlterField(
model_name='persona',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
]

View File

@@ -19,6 +19,7 @@ from django.conf import settings
from django.utils import timezone
from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
from archivebox.uuid_compat import uuid7
if TYPE_CHECKING:
from django.db.models import QuerySet
@@ -44,6 +45,7 @@ class Persona(ModelWithConfig):
persona.CHROME_USER_DATA_DIR # -> Path to chrome_user_data
"""
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
name = models.CharField(max_length=64, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)

View File

@@ -2,7 +2,7 @@
"""
Submit a URL to archive.org for archiving.
Usage: on_Snapshot__archive_org.py --url=<url> --snapshot-id=<uuid>
Usage: on_Snapshot__archivedotorg.py --url=<url> --snapshot-id=<uuid>
Output: Writes archive.org.txt to $PWD with the archived URL
Environment variables:
@@ -25,7 +25,7 @@ import rich_click as click
# Extractor metadata
PLUGIN_NAME = 'archive_org'
PLUGIN_NAME = 'archivedotorg'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'archive.org.txt'
@@ -41,7 +41,7 @@ def get_env_int(name: str, default: int = 0) -> int:
return default
def submit_to_archive_org(url: str) -> tuple[bool, str | None, str]:
def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
"""
Submit URL to archive.org Wayback Machine.
@@ -113,7 +113,7 @@ def main(url: str, snapshot_id: str):
try:
# Run extraction
success, output, error = submit_to_archive_org(url)
success, output, error = submit_to_archivedotorg(url)
if success:
# Success - emit ArchiveResult with output file

View File

@@ -1,5 +1,5 @@
"""
Integration tests for archive_org plugin
Integration tests for archivedotorg plugin
Tests verify standalone archive.org extractor execution.
"""
@@ -12,13 +12,13 @@ from pathlib import Path
import pytest
PLUGIN_DIR = Path(__file__).parent.parent
ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archive_org.*'), None)
ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archivedotorg.*'), None)
TEST_URL = 'https://example.com'
def test_hook_script_exists():
assert ARCHIVEDOTORG_HOOK.exists()
def test_submits_to_archive_org():
def test_submits_to_archivedotorg():
with tempfile.TemporaryDirectory() as tmpdir:
result = subprocess.run(
[sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
@@ -49,7 +49,7 @@ def test_submits_to_archive_org():
assert not result_json, "Should NOT emit JSONL on transient error"
assert result.stderr, "Should have error message in stderr"
def test_config_save_archive_org_false_skips():
def test_config_save_archivedotorg_false_skips():
with tempfile.TemporaryDirectory() as tmpdir:
import os
env = os.environ.copy()

View File

@@ -26,11 +26,20 @@ const {
readCdpUrl,
} = require('../chrome/chrome_utils.js');
// Flush V8 coverage before exit (needed for NODE_V8_COVERAGE to capture early exits)
function flushCoverageAndExit(exitCode) {
if (process.env.NODE_V8_COVERAGE) {
const v8 = require('v8');
v8.takeCoverage();
}
process.exit(exitCode);
}
// Check if screenshot is enabled BEFORE requiring puppeteer
if (!getEnvBool('SCREENSHOT_ENABLED', true)) {
console.error('Skipping screenshot (SCREENSHOT_ENABLED=False)');
// Temporary failure (config disabled) - NO JSONL emission
process.exit(0);
flushCoverageAndExit(0);
}
// Now safe to require puppeteer
@@ -135,7 +144,7 @@ async function main() {
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__51_screenshot.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
flushCoverageAndExit(1);
}
// Check if staticfile extractor already handled this (permanent skip)
@@ -147,7 +156,7 @@ async function main() {
status: 'skipped',
output_str: 'staticfile already handled',
}));
process.exit(0);
flushCoverageAndExit(0);
}
// Take screenshot (throws on error)
@@ -166,5 +175,5 @@ async function main() {
main().catch(e => {
// Transient error - emit NO JSONL
console.error(`ERROR: ${e.message}`);
process.exit(1);
flushCoverageAndExit(1);
});

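Why the flush helper matters: Node writes NODE_V8_COVERAGE output when the coverage machinery shuts down cleanly, and an early `process.exit()` can skip that, so `v8.takeCoverage()` forces a flush first. A sketch of how a harness might exercise this (env values and the skip path are assumptions based on the hook above):

```python
import os
import subprocess
import tempfile

cov_dir = tempfile.mkdtemp()
env = {
    **os.environ,
    'NODE_V8_COVERAGE': cov_dir,     # Node writes coverage-*.json here
    'SCREENSHOT_ENABLED': 'False',   # force the early-exit path shown above
}
subprocess.run(
    ['node', 'on_Snapshot__51_screenshot.js',
     '--url=https://example.com', '--snapshot-id=test'],
    env=env,
)
# With flushCoverageAndExit(), coverage files appear even on the early exit
print(os.listdir(cov_dir))
```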
View File

@@ -226,9 +226,6 @@ def test_config_save_screenshot_false_skips():
print(f"[DEBUG RESULT] Exit code: {result.returncode}")
print(f"[DEBUG RESULT] Stderr: {result.stderr[:200]}")
# FORCE FAILURE to verify test actually runs
assert False, f"FORCED FAILURE - NODE_V8_COVERAGE={'NODE_V8_COVERAGE' in env} value={env.get('NODE_V8_COVERAGE', 'NOTSET')}"
assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"
# Feature disabled - temporary failure, should NOT emit JSONL

View File

@@ -74,7 +74,7 @@
"--geo-bypass",
"--add-metadata",
"--no-progress",
"--remote-components ejs:github",
"--remote-components=ejs:github",
"-o",
"%(title)s.%(ext)s"
],

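The `--remote-components` fix above matters because each element of that JSON args array becomes exactly one argv entry; no shell re-splits `--remote-components ejs:github`, so yt-dlp would receive a single unknown flag containing a space. The `=` form keeps flag and value in one parseable token. A quick demonstration of the argv semantics:

```python
import subprocess

# Each list element maps to exactly one argv entry; the embedded space in the
# first form is never re-split, so the program sees one malformed token.
subprocess.run(['printf', '[%s]\n', '--remote-components ejs:github'])   # [--remote-components ejs:github]
subprocess.run(['printf', '[%s]\n', '--remote-components=ejs:github'])   # [--remote-components=ejs:github]
```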
View File

@@ -0,0 +1,256 @@
"""
Tests for BinaryWorker processing Binary queue.
Tests cover:
- BinaryWorker is spawned by Orchestrator when Binary queue has work
- Binary hooks (on_Binary__*) actually run and install binaries
- Binary status transitions from QUEUED -> INSTALLED
- BinaryWorker exits after idle timeout
"""
import json
import sqlite3
import time
from archivebox.tests.conftest import (
run_archivebox_cmd,
parse_jsonl_output,
)
class TestBinaryWorkerSpawning:
"""Tests for BinaryWorker lifecycle."""
def test_binary_worker_spawns_when_binary_queued(self, initialized_archive):
"""Orchestrator spawns BinaryWorker when Binary queue has work."""
# Create a Binary record via CLI
binary_record = {
'type': 'Binary',
'name': 'python3',
'binproviders': 'env', # Use env provider to detect system python
}
# Use `archivebox run` to create the Binary (this queues it)
stdout, stderr, code = run_archivebox_cmd(
['run'],
stdin=json.dumps(binary_record),
data_dir=initialized_archive,
timeout=30,
)
assert code == 0, f"Failed to create Binary: {stderr}"
# Verify Binary was created in DB
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
c = conn.cursor()
binaries = c.execute(
"SELECT name, status, abspath FROM machine_binary WHERE name='python3'"
).fetchall()
conn.close()
assert len(binaries) >= 1, "Binary was not created in database"
name, status, abspath = binaries[0]
assert name == 'python3'
# Status should be INSTALLED after BinaryWorker processed it
# (or QUEUED if worker timed out before installing)
assert status in ['installed', 'queued']
def test_binary_hooks_actually_run(self, initialized_archive):
"""Binary installation hooks (on_Binary__*) run and update abspath."""
# Create a Binary for python3 (guaranteed to exist on system)
binary_record = {
'type': 'Binary',
'name': 'python3',
'binproviders': 'env',
}
stdout, stderr, code = run_archivebox_cmd(
['run'],
stdin=json.dumps(binary_record),
data_dir=initialized_archive,
timeout=30,
)
assert code == 0, f"Failed to process Binary: {stderr}"
# Query database to check if hooks ran and populated abspath
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
c = conn.cursor()
result = c.execute(
"SELECT name, status, abspath, version FROM machine_binary WHERE name='python3'"
).fetchone()
conn.close()
assert result is not None, "Binary not found in database"
name, status, abspath, version = result
# If hooks ran successfully, abspath should be populated
if status == 'installed':
assert abspath, f"Binary installed but abspath is empty: {abspath}"
assert '/python3' in abspath or '\\python3' in abspath, \
f"abspath doesn't look like a python3 path: {abspath}"
# Version should also be populated
assert version, f"Binary installed but version is empty: {version}"
def test_binary_status_transitions(self, initialized_archive):
"""Binary status correctly transitions QUEUED -> INSTALLED."""
binary_record = {
'type': 'Binary',
'name': 'python3',
'binproviders': 'env',
}
# Create and process the Binary
stdout, stderr, code = run_archivebox_cmd(
['run'],
stdin=json.dumps(binary_record),
data_dir=initialized_archive,
timeout=30,
)
assert code == 0
# Check final status
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
c = conn.cursor()
status = c.execute(
"SELECT status FROM machine_binary WHERE name='python3'"
).fetchone()
conn.close()
assert status is not None
# Should be installed (or queued if worker timed out)
assert status[0] in ['installed', 'queued']
class TestBinaryWorkerHooks:
"""Tests for specific Binary hook providers."""
def test_env_provider_hook_detects_system_binary(self, initialized_archive):
"""on_Binary__15_env_install.py hook detects system binaries."""
binary_record = {
'type': 'Binary',
'name': 'python3',
'binproviders': 'env',
}
stdout, stderr, code = run_archivebox_cmd(
['run'],
stdin=json.dumps(binary_record),
data_dir=initialized_archive,
timeout=30,
)
assert code == 0
# Check that env provider hook populated the Binary
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
c = conn.cursor()
result = c.execute(
"SELECT binprovider, abspath FROM machine_binary WHERE name='python3' AND status='installed'"
).fetchone()
conn.close()
if result:
binprovider, abspath = result
assert binprovider == 'env', f"Expected env provider, got: {binprovider}"
assert abspath, "abspath should be populated by env provider"
def test_multiple_binaries_processed_in_batch(self, initialized_archive):
"""BinaryWorker processes multiple queued binaries."""
# Create multiple Binary records
binaries = [
{'type': 'Binary', 'name': 'python3', 'binproviders': 'env'},
{'type': 'Binary', 'name': 'curl', 'binproviders': 'env'},
]
stdin = '\n'.join(json.dumps(b) for b in binaries)
stdout, stderr, code = run_archivebox_cmd(
['run'],
stdin=stdin,
data_dir=initialized_archive,
timeout=45,
)
assert code == 0
# Both should be processed
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
c = conn.cursor()
installed = c.execute(
"SELECT name FROM machine_binary WHERE name IN ('python3', 'curl')"
).fetchall()
conn.close()
assert len(installed) >= 1, "At least one binary should be created"
class TestBinaryWorkerEdgeCases:
"""Tests for edge cases and error handling."""
def test_nonexistent_binary_stays_queued(self, initialized_archive):
"""Binary that doesn't exist stays queued (doesn't fail permanently)."""
binary_record = {
'type': 'Binary',
'name': 'nonexistent-binary-xyz-12345',
'binproviders': 'env',
}
stdout, stderr, code = run_archivebox_cmd(
['run'],
stdin=json.dumps(binary_record),
data_dir=initialized_archive,
timeout=30,
)
# Command should still succeed (orchestrator doesn't fail on binary install failures)
assert code == 0
# Binary should remain queued (not installed)
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
c = conn.cursor()
result = c.execute(
"SELECT status FROM machine_binary WHERE name='nonexistent-binary-xyz-12345'"
).fetchone()
conn.close()
if result:
status = result[0]
# Should stay queued since installation failed
assert status == 'queued', f"Expected queued, got: {status}"
def test_binary_worker_respects_machine_isolation(self, initialized_archive):
"""BinaryWorker only processes binaries for current machine."""
# This is implicitly tested by other tests - Binary.objects.filter(machine=current)
# ensures only current machine's binaries are processed
binary_record = {
'type': 'Binary',
'name': 'python3',
'binproviders': 'env',
}
stdout, stderr, code = run_archivebox_cmd(
['run'],
stdin=json.dumps(binary_record),
data_dir=initialized_archive,
timeout=30,
)
assert code == 0
# Check that machine_id is set correctly
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
c = conn.cursor()
result = c.execute(
"SELECT machine_id FROM machine_binary WHERE name='python3'"
).fetchone()
conn.close()
assert result is not None
machine_id = result[0]
assert machine_id, "machine_id should be set on Binary"

View File

@@ -246,6 +246,68 @@ print(snapshot.id)
"Expected debug output not found in stderr"
print("✓ Config debug output found in stderr")
# Verify precedence order: snapshot > crawl > user > persona > env > machine > file > defaults
verify_precedence_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.core.models import Snapshot
from archivebox.config.configset import get_config
snapshot = Snapshot.objects.get(id='{snapshot_id}')
# Test precedence by getting config at different levels
print("\\nTesting config precedence order:")
# 1. Just defaults (lowest priority)
config_defaults = get_config()
print(f" Defaults only: TIMEOUT={{config_defaults.get('TIMEOUT')}}")
# 2. With machine config
from archivebox.machine.models import Machine
machine = Machine.current()
config_machine = get_config(machine=machine)
custom_machine = config_machine.get('CUSTOM_MACHINE_KEY')
print(f" + Machine: CUSTOM_MACHINE_KEY={{custom_machine}}")
# 3. With crawl config
config_crawl = get_config(crawl=snapshot.crawl)
print(f" + Crawl: TIMEOUT={{config_crawl.get('TIMEOUT')}} (should be 777 from crawl.config)")
assert config_crawl.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl.get('TIMEOUT')}}"
# 4. With snapshot config (highest priority)
config_snapshot = get_config(snapshot=snapshot)
print(f" + Snapshot: TIMEOUT={{config_snapshot.get('TIMEOUT')}} (should be 555 from snapshot.config)")
assert config_snapshot.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config_snapshot.get('TIMEOUT')}}"
# Verify snapshot config overrides crawl config
assert config_snapshot.get('CUSTOM_CRAWL_KEY') == 'from_crawl_json', "Crawl config should be present"
assert config_snapshot.get('CUSTOM_SNAPSHOT_KEY') == 'from_snapshot_json', "Snapshot config should be present"
assert config_snapshot.get('CUSTOM_MACHINE_KEY') == 'from_machine_config', "Machine config should be present"
print("\\n✓ Config precedence order verified: snapshot > crawl > machine > defaults")
"""
result = subprocess.run(
['python', '-c', verify_precedence_script],
cwd=str(data_dir.parent),
env={
**os.environ,
'DATA_DIR': str(data_dir),
'USE_COLOR': 'False',
},
capture_output=True,
timeout=30,
)
print(result.stdout.decode())
if result.returncode != 0:
print("\nPrecedence verification error:")
print(result.stderr.decode())
assert result.returncode == 0, f"Precedence verification failed: {result.stderr.decode()}"
# Verify config values were actually used by checking ArchiveResults
verify_script = f"""
import os
@@ -475,7 +537,453 @@ print("\\n✓ All config values correctly parsed from environment")
print("="*80 + "\n")
def test_parent_environment_preserved_in_hooks():
"""
Test that parent environment variables are preserved in hook execution.
This test catches the bug where env was built as os.environ.copy() but then
clobbered with process.env={}, losing the entire parent environment.
Also verifies:
- NODE_PATH is correctly derived from LIB_DIR/npm/node_modules
- LIB_BIN_DIR is correctly derived and added to PATH
"""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir) / 'test_archive'
data_dir.mkdir()
print(f"\n{'='*80}")
print(f"Test: Parent Environment Preserved in Hooks")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
# Initialize archive
print("Step 1: Initialize archive")
result = subprocess.run(
['python', '-m', 'archivebox', 'init'],
cwd=str(data_dir),
env={
**os.environ,
'DATA_DIR': str(data_dir),
'USE_COLOR': 'False',
},
capture_output=True,
timeout=60,
)
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
print(f"✓ Archive initialized\n")
# Create snapshot
print("Step 2: Create Snapshot")
create_snapshot_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl
crawl = Crawl.objects.create(
urls='https://example.com',
status='queued',
retry_at=timezone.now()
)
snapshot = Snapshot.objects.create(
url='https://example.com',
crawl=crawl,
status='queued',
retry_at=timezone.now()
)
print(snapshot.id)
"""
result = subprocess.run(
['python', '-c', create_snapshot_script],
cwd=str(data_dir.parent),
env={
**os.environ,
'DATA_DIR': str(data_dir),
'USE_COLOR': 'False',
},
capture_output=True,
timeout=30,
)
assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
snapshot_id = result.stdout.decode().strip().split('\n')[-1]
print(f"✓ Created snapshot {snapshot_id}\n")
# Run SnapshotWorker with custom parent environment variable
print("Step 3: Run SnapshotWorker with TEST_PARENT_ENV_VAR in parent process")
result = subprocess.run(
['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
cwd=str(data_dir),
env={
**os.environ,
'DATA_DIR': str(data_dir),
'USE_COLOR': 'False',
'TEST_PARENT_ENV_VAR': 'preserved_from_parent', # This should reach the hook
'PLUGINS': 'favicon', # Use existing plugin (favicon is simple and fast)
},
capture_output=True,
timeout=120,
)
stdout = result.stdout.decode()
stderr = result.stderr.decode()
print("\n--- SnapshotWorker stderr (first 50 lines) ---")
print('\n'.join(stderr.split('\n')[:50]))
print("--- End stderr ---\n")
# Verify hooks ran by checking Process records
print("Step 4: Verify environment variables in hook Process records")
verify_env_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from archivebox.machine.models import Process
from archivebox.core.models import Snapshot
import json
snapshot = Snapshot.objects.get(id='{snapshot_id}')
# Find hook processes for this snapshot
hook_processes = Process.objects.filter(
process_type=Process.TypeChoices.HOOK,
pwd__contains=str(snapshot.id)
).order_by('-created_at')
print(f"Found {{hook_processes.count()}} hook processes")
if hook_processes.count() == 0:
print("ERROR: No hook processes found!")
import sys
sys.exit(1)
# Check the first hook process environment
hook_process = hook_processes.first()
print(f"\\nChecking hook: {{hook_process.cmd}}")
print(f"Hook env keys: {{len(hook_process.env)}} total")
# Verify TEST_PARENT_ENV_VAR was preserved
test_parent = hook_process.env.get('TEST_PARENT_ENV_VAR')
print(f" TEST_PARENT_ENV_VAR: {{test_parent}}")
assert test_parent == 'preserved_from_parent', f"Expected 'preserved_from_parent', got {{test_parent}}"
# Verify LIB_DIR is set
lib_dir = hook_process.env.get('LIB_DIR')
print(f" LIB_DIR: {{lib_dir}}")
assert lib_dir is not None, "LIB_DIR not set"
# Verify LIB_BIN_DIR is derived
lib_bin_dir = hook_process.env.get('LIB_BIN_DIR')
print(f" LIB_BIN_DIR: {{lib_bin_dir}}")
if lib_dir:
assert lib_bin_dir is not None, "LIB_BIN_DIR not derived from LIB_DIR"
assert lib_bin_dir.endswith('/bin'), f"LIB_BIN_DIR should end with /bin, got {{lib_bin_dir}}"
# Verify LIB_BIN_DIR is in PATH
path = hook_process.env.get('PATH')
if lib_bin_dir:
assert lib_bin_dir in path, f"LIB_BIN_DIR not in PATH. LIB_BIN_DIR={{lib_bin_dir}}, PATH={{path[:200]}}..."
# Verify NODE_PATH is set
node_path = hook_process.env.get('NODE_PATH')
node_modules_dir = hook_process.env.get('NODE_MODULES_DIR')
print(f" NODE_PATH: {{node_path}}")
print(f" NODE_MODULES_DIR: {{node_modules_dir}}")
if node_path:
# Should also have NODE_MODULES_DIR for backwards compatibility
assert node_modules_dir == node_path, f"NODE_MODULES_DIR should match NODE_PATH"
print("\\n✓ All environment checks passed")
"""
result = subprocess.run(
['python', '-c', verify_env_script],
cwd=str(data_dir.parent),
env={
**os.environ,
'DATA_DIR': str(data_dir),
'USE_COLOR': 'False',
},
capture_output=True,
timeout=30,
)
print(result.stdout.decode())
if result.returncode != 0:
print("\nVerification error:")
print(result.stderr.decode())
assert result.returncode == 0, f"Environment verification failed: {result.stderr.decode()}"
print("\n" + "="*80)
print("✓ TEST PASSED: Parent environment preserved in hooks")
print(" - Custom parent env vars reach hooks")
print(" - LIB_DIR propagated correctly")
print(" - LIB_BIN_DIR derived and added to PATH")
print(" - NODE_PATH/NODE_MODULES_DIR set when available")
print("="*80 + "\n")
def test_config_auto_fetch_relationships():
"""
Test that get_config() auto-fetches related objects from relationships.
Verifies:
- snapshot auto-fetched from archiveresult.snapshot
- crawl auto-fetched from snapshot.crawl
- user auto-fetched from crawl.created_by
"""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir) / 'test_archive'
data_dir.mkdir()
print(f"\n{'='*80}")
print(f"Test: Config Auto-Fetch Relationships")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
# Initialize archive
print("Step 1: Initialize archive")
result = subprocess.run(
['python', '-m', 'archivebox', 'init'],
cwd=str(data_dir),
env={
**os.environ,
'DATA_DIR': str(data_dir),
'USE_COLOR': 'False',
},
capture_output=True,
timeout=60,
)
assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
print(f"✓ Archive initialized\n")
# Create objects with config at each level
print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level")
create_objects_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.config.configset import get_config
# Create crawl with config
crawl = Crawl.objects.create(
urls='https://example.com',
status='queued',
retry_at=timezone.now(),
config={{
'CRAWL_KEY': 'from_crawl',
'TIMEOUT': 777,
}}
)
# Create snapshot with config
snapshot = Snapshot.objects.create(
url='https://example.com',
crawl=crawl,
status='queued',
retry_at=timezone.now(),
config={{
'SNAPSHOT_KEY': 'from_snapshot',
'TIMEOUT': 555,
}}
)
# Create ArchiveResult
ar = ArchiveResult.objects.create(
snapshot=snapshot,
plugin='test',
hook_name='test_hook',
status=ArchiveResult.StatusChoices.STARTED
)
print(f"Created: crawl={{crawl.id}}, snapshot={{snapshot.id}}, ar={{ar.id}}")
# Test 1: Auto-fetch crawl from snapshot
print("\\nTest 1: get_config(snapshot=snapshot) auto-fetches crawl")
config = get_config(snapshot=snapshot)
assert config.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config.get('TIMEOUT')}}"
assert config.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot, got {{config.get('SNAPSHOT_KEY')}}"
assert config.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl, got {{config.get('CRAWL_KEY')}}"
print("✓ Snapshot config (TIMEOUT=555) overrides crawl config (TIMEOUT=777)")
print("✓ Both snapshot.config and crawl.config values present")
# Test 2: Auto-fetch snapshot from archiveresult
print("\\nTest 2: get_config(archiveresult=ar) auto-fetches snapshot and crawl")
config_from_ar = get_config(archiveresult=ar)
assert config_from_ar.get('TIMEOUT') == 555, f"Expected 555, got {{config_from_ar.get('TIMEOUT')}}"
assert config_from_ar.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot"
assert config_from_ar.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl"
print("✓ Auto-fetched snapshot from ar.snapshot")
print("✓ Auto-fetched crawl from snapshot.crawl")
# Test 3: Precedence without auto-fetch (explicit crawl only)
print("\\nTest 3: get_config(crawl=crawl) without snapshot")
config_crawl_only = get_config(crawl=crawl)
assert config_crawl_only.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl_only.get('TIMEOUT')}}"
assert config_crawl_only.get('CRAWL_KEY') == 'from_crawl'
assert config_crawl_only.get('SNAPSHOT_KEY') is None, "Should not have snapshot config"
print("✓ Crawl-only config has TIMEOUT=777")
print("✓ No snapshot config values present")
print("\\n✓ All auto-fetch tests passed")
"""
result = subprocess.run(
['python', '-c', create_objects_script],
cwd=str(data_dir.parent),
env={
**os.environ,
'DATA_DIR': str(data_dir),
'USE_COLOR': 'False',
},
capture_output=True,
timeout=30,
)
print(result.stdout.decode())
if result.returncode != 0:
print("\nAuto-fetch test error:")
print(result.stderr.decode())
assert result.returncode == 0, f"Auto-fetch test failed: {result.stderr.decode()}"
print("\n" + "="*80)
print("✓ TEST PASSED: Config auto-fetches related objects correctly")
print(" - archiveresult → snapshot → crawl → user")
print(" - Precedence preserved during auto-fetch")
print("="*80 + "\n")
def test_config_precedence_with_environment_vars():
"""
Test that config precedence order is correct when environment vars are set.
Documented order (highest to lowest):
1. snapshot.config
2. crawl.config
3. user.config
4. persona config
5. environment variables <-- LOWER priority than snapshot/crawl
6. machine.config
7. config file
8. plugin defaults
9. core defaults
This test verifies snapshot.config overrides environment variables.
"""
with tempfile.TemporaryDirectory() as tmpdir:
data_dir = Path(tmpdir) / 'test_archive'
data_dir.mkdir()
print(f"\n{'='*80}")
print(f"Test: Config Precedence with Environment Variables")
print(f"DATA_DIR: {data_dir}")
print(f"{'='*80}\n")
# Initialize
result = subprocess.run(
['python', '-m', 'archivebox', 'init'],
cwd=str(data_dir),
env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
capture_output=True,
timeout=60,
)
assert result.returncode == 0
print("✓ Archive initialized\n")
# Test with environment variable set
print("Step 1: Test with TIMEOUT=999 in environment")
test_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
os.environ['TIMEOUT'] = '999' # Set env var
from archivebox.config.django import setup_django
setup_django()
from django.utils import timezone
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.config.configset import get_config
# Create crawl with TIMEOUT=777
crawl = Crawl.objects.create(
urls='https://example.com',
status='queued',
retry_at=timezone.now(),
config={{'TIMEOUT': 777}}
)
# Create snapshot with TIMEOUT=555
snapshot = Snapshot.objects.create(
url='https://example.com',
crawl=crawl,
status='queued',
retry_at=timezone.now(),
config={{'TIMEOUT': 555}}
)
# Get config with all sources
config = get_config(snapshot=snapshot)
print(f"Environment: TIMEOUT={{os.environ.get('TIMEOUT')}}")
print(f"Crawl config: TIMEOUT={{crawl.config.get('TIMEOUT')}}")
print(f"Snapshot config: TIMEOUT={{snapshot.config.get('TIMEOUT')}}")
print(f"Merged config: TIMEOUT={{config.get('TIMEOUT')}}")
# Snapshot should override both crawl AND environment
expected = 555
actual = config.get('TIMEOUT')
if actual != expected:
print(f"\\n❌ PRECEDENCE BUG: Expected {{expected}}, got {{actual}}")
print(f" Snapshot.config should have highest priority!")
import sys
sys.exit(1)
print(f"\\n✓ snapshot.config ({{expected}}) correctly overrides env var (999) and crawl.config (777)")
"""
result = subprocess.run(
['python', '-c', test_script],
cwd=str(data_dir.parent),
capture_output=True,
timeout=30,
)
print(result.stdout.decode())
if result.returncode != 0:
print("\nPrecedence bug detected:")
print(result.stderr.decode())
assert result.returncode == 0, f"Precedence test failed: {result.stderr.decode()}"
print("\n" + "="*80)
print("✓ TEST PASSED: Snapshot config correctly overrides environment variables")
print("="*80 + "\n")
if __name__ == '__main__':
# Run as standalone script
test_config_propagation_through_worker_hierarchy()
test_config_environment_variable_parsing()
test_parent_environment_preserved_in_hooks()
test_config_auto_fetch_relationships()
test_config_precedence_with_environment_vars()

View File

@@ -36,7 +36,7 @@ from django.utils import timezone
from rich import print
from archivebox.misc.logging_util import log_worker_event
from .worker import Worker, CrawlWorker
from .worker import Worker, BinaryWorker, CrawlWorker
def _run_orchestrator_process(exit_on_idle: bool) -> None:
@@ -63,13 +63,14 @@ class Orchestrator:
- Each SnapshotWorker runs hooks sequentially for its snapshot
"""
# Only CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator
WORKER_TYPES: list[Type[Worker]] = [CrawlWorker]
# BinaryWorker (singleton daemon) and CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator
WORKER_TYPES: list[Type[Worker]] = [BinaryWorker, CrawlWorker]
# Configuration
POLL_INTERVAL: float = 2.0 # How often to check for new work (seconds)
IDLE_TIMEOUT: int = 3 # Exit after N idle ticks (0 = never exit)
MAX_CRAWL_WORKERS: int = 8 # Max crawls processing simultaneously
MAX_BINARY_WORKERS: int = 1 # Max binaries installing simultaneously (sequential only)
def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None):
self.exit_on_idle = exit_on_idle
@@ -194,15 +195,23 @@ class Orchestrator:
return len(WorkerClass.get_running_workers())
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
"""Determine if we should spawn a new CrawlWorker."""
"""Determine if we should spawn a new worker."""
if queue_count == 0:
return False
# Check CrawlWorker limit
# Get appropriate limit based on worker type
if WorkerClass.name == 'crawl':
max_workers = self.MAX_CRAWL_WORKERS
elif WorkerClass.name == 'binary':
max_workers = self.MAX_BINARY_WORKERS # Force sequential: only 1 binary at a time
else:
max_workers = 1 # Default for unknown types
# Check worker limit
running_workers = WorkerClass.get_running_workers()
running_count = len(running_workers)
if running_count >= self.MAX_CRAWL_WORKERS:
if running_count >= max_workers:
return False
# Check if we already have enough workers for the queue size
@@ -285,14 +294,35 @@ class Orchestrator:
def check_queues_and_spawn_workers(self) -> dict[str, int]:
"""
Check Crawl queue and spawn CrawlWorkers as needed.
Check Binary and Crawl queues and spawn workers as needed.
Returns dict of queue sizes.
"""
from archivebox.crawls.models import Crawl
from archivebox.machine.models import Binary, Machine
queue_sizes = {}
# Only check Crawl queue
# Check Binary queue
machine = Machine.current()
binary_queue = Binary.objects.filter(
machine=machine,
status=Binary.StatusChoices.QUEUED,
retry_at__lte=timezone.now()
).order_by('retry_at')
binary_count = binary_queue.count()
queue_sizes['binary'] = binary_count
# Spawn BinaryWorker if needed (one worker per binary, up to MAX_BINARY_WORKERS)
if self.should_spawn_worker(BinaryWorker, binary_count):
# Get next binary to process
binary = binary_queue.first()
if binary:
BinaryWorker.start(binary_id=str(binary.id))
# Check if any BinaryWorkers are still running
running_binary_workers = len(BinaryWorker.get_running_workers())
# Check Crawl queue
crawl_queue = Crawl.objects.filter(
retry_at__lte=timezone.now()
).exclude(
@@ -307,12 +337,15 @@ class Orchestrator:
crawl_count = crawl_queue.count()
queue_sizes['crawl'] = crawl_count
# Spawn CrawlWorker if needed
if self.should_spawn_worker(CrawlWorker, crawl_count):
# Claim next crawl
crawl = crawl_queue.first()
if crawl and self._claim_crawl(crawl):
CrawlWorker.start(crawl_id=str(crawl.id))
# CRITICAL: Only spawn CrawlWorkers if binary queue is empty AND no BinaryWorkers running
# This ensures all binaries are installed before snapshots start processing
if binary_count == 0 and running_binary_workers == 0:
# Spawn CrawlWorker if needed
if self.should_spawn_worker(CrawlWorker, crawl_count):
# Claim next crawl
crawl = crawl_queue.first()
if crawl and self._claim_crawl(crawl):
CrawlWorker.start(crawl_id=str(crawl.id))
return queue_sizes
@@ -328,7 +361,7 @@ class Orchestrator:
)
return updated == 1
def has_pending_work(self, queue_sizes: dict[str, int]) -> bool:
"""Check if any queue has pending work."""
return any(count > 0 for count in queue_sizes.values())

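The CrawlWorker gating added above condenses to a single predicate; a sketch (assumed simplification of the inline checks):

```python
def can_spawn_crawl_workers(binary_queue_count: int, running_binary_workers: int) -> bool:
    # Crawls are held back until every queued Binary is installed and no
    # BinaryWorker is still running, so extractor hooks never race an install.
    return binary_queue_count == 0 and running_binary_workers == 0
```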
View File

@@ -323,6 +323,20 @@ class Worker:
pwd = Path(snapshot.output_dir) # Run in snapshot's output directory
env = get_config(snapshot=snapshot)
elif cls.name == 'binary':
# BinaryWorker processes a specific binary installation
binary_id = kwargs.get('binary_id')
if not binary_id:
raise ValueError("BinaryWorker requires binary_id")
from archivebox.machine.models import Binary
binary = Binary.objects.get(id=binary_id)
cmd = [sys.executable, '-m', 'archivebox', 'run', '--binary-id', str(binary_id)]
pwd = Path(settings.DATA_DIR) / 'machines' / str(Machine.current().id) / 'binaries' / binary.name / str(binary.id)
pwd.mkdir(parents=True, exist_ok=True)
env = get_config()
else:
raise ValueError(f"Unknown worker type: {cls.name}")
@@ -654,16 +668,8 @@ class SnapshotWorker(Worker):
hooks = discover_hooks('Snapshot', config=config)
hooks = sorted(hooks, key=lambda h: h.name) # Sort by name (includes step prefix)
import sys
print(f'[SnapshotWorker] Discovered {len(hooks)} hooks for snapshot {self.snapshot.url}', file=sys.stderr, flush=True)
if hooks:
print(f'[SnapshotWorker] First 5 hooks: {[h.name for h in hooks[:5]]}', file=sys.stderr, flush=True)
else:
print(f'[SnapshotWorker] WARNING: No hooks discovered! Config keys: {list(config.keys())[:10]}...', file=sys.stderr, flush=True)
# Execute each hook sequentially
for hook_path in hooks:
print(f'[SnapshotWorker] Running hook: {hook_path.name}', file=sys.stderr, flush=True)
hook_name = hook_path.name
plugin = self._extract_plugin_name(hook_name)
hook_step = extract_step(hook_name)
@@ -829,8 +835,96 @@ class SnapshotWorker(Worker):
return name
class BinaryWorker(Worker):
"""
Worker that processes a specific Binary installation.
Like CrawlWorker and SnapshotWorker, BinaryWorker:
- Processes one specific binary (specified by binary_id)
- Installs it via Binary.run() which runs on_Binary__* hooks
- Exits when done
Orchestrator spawns BinaryWorkers sequentially (MAX_BINARY_WORKERS=1) to avoid
conflicts during binary installations.
"""
name: ClassVar[str] = 'binary'
MAX_TICK_TIME: ClassVar[int] = 600 # 10 minutes for binary installations
MAX_CONCURRENT_TASKS: ClassVar[int] = 1 # One binary per worker
def __init__(self, binary_id: str, worker_id: int = 0):
self.binary_id = binary_id
super().__init__(worker_id=worker_id)
def get_model(self):
from archivebox.machine.models import Binary
return Binary
def get_next_item(self):
"""Get the specific binary to install."""
from archivebox.machine.models import Binary
try:
return Binary.objects.get(id=self.binary_id)
except Binary.DoesNotExist:
return None
def runloop(self) -> None:
"""Install the specified binary."""
import sys
self.on_startup()
try:
binary = self.get_next_item()
if not binary:
log_worker_event(
worker_type='BinaryWorker',
event=f'Binary {self.binary_id} not found',
indent_level=1,
pid=self.pid,
)
return
print(f'[cyan]🔧 BinaryWorker installing: {binary.name}[/cyan]', file=sys.stderr)
# Tick the state machine to trigger installation
# This calls BinaryMachine.on_install() -> Binary.run() -> on_Binary__* hooks
binary.sm.tick()
# Check result
binary.refresh_from_db()
if binary.status == Binary.StatusChoices.INSTALLED:
log_worker_event(
worker_type='BinaryWorker',
event=f'Installed: {binary.name} -> {binary.abspath}',
indent_level=1,
pid=self.pid,
)
else:
log_worker_event(
worker_type='BinaryWorker',
event=f'Installation pending: {binary.name} (status={binary.status})',
indent_level=1,
pid=self.pid,
)
except Exception as e:
log_worker_event(
worker_type='BinaryWorker',
event='Failed to install binary',
indent_level=1,
pid=self.pid,
error=e,
)
finally:
self.on_shutdown()
# Populate the registry
WORKER_TYPES.update({
'binary': BinaryWorker,
'crawl': CrawlWorker,
'snapshot': SnapshotWorker,
})

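For debugging, the same path the Orchestrator takes via `archivebox run --binary-id` can be driven directly from a configured ArchiveBox shell (the UUID below is hypothetical):

```python
from archivebox.workers.worker import BinaryWorker

# Equivalent to `python -m archivebox run --binary-id <uuid>`: constructs the
# worker for one specific Binary and runs it to completion.
worker = BinaryWorker(binary_id='00000000-0000-0000-0000-000000000000', worker_id=0)
worker.runloop()
```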
View File

@@ -239,6 +239,8 @@ for test_dir in $TEST_DIRS; do
PYTEST_CMD="python -m pytest $test_dir -p no:django -v --tb=short"
if [ "$ENABLE_COVERAGE" = true ]; then
PYTEST_CMD="$PYTEST_CMD --cov=$plugin_name --cov-append --cov-branch"
echo "[DEBUG] NODE_V8_COVERAGE before pytest: $NODE_V8_COVERAGE"
python -c "import os; print('[DEBUG BASH->PYTHON] NODE_V8_COVERAGE:', os.environ.get('NODE_V8_COVERAGE', 'NOT_SET'))"
fi
if eval "$PYTEST_CMD" 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then

View File

@@ -1878,7 +1878,7 @@ Updated `archivebox/core/statemachines.py`:
|--------|------|--------|-------|
| favicon | `on_Snapshot__11_favicon.py` | ✅ UPDATED | Now outputs clean JSONL |
| git | `on_Snapshot__12_git.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |
| archive_org | `on_Snapshot__13_archive_org.py` | ✅ UPDATED | Now outputs clean JSONL |
| archivedotorg | `on_Snapshot__13_archivedotorg.py` | ✅ UPDATED | Now outputs clean JSONL |
| title | `on_Snapshot__32_title.js` | ✅ UPDATED | Now outputs clean JSONL |
| singlefile | `on_Snapshot__37_singlefile.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |
| wget | `on_Snapshot__50_wget.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |
@@ -1930,7 +1930,7 @@ The following hooks have been renamed with `.bg.` suffix:
- `archivebox/core/migrations/0030_migrate_output_field.py` (new)
### Plugins Updated (Python Hooks)
- `archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py`
- `archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py`
- `archivebox/plugins/favicon/on_Snapshot__11_favicon.py`
- `archivebox/plugins/git/on_Snapshot__12_git.py`
- `archivebox/plugins/media/on_Snapshot__51_media.py`