mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-04-06 07:47:53 +10:00)

rename archive_org to archivedotorg, add BinaryWorker, fix config pass-through
@@ -1,6 +1,9 @@
{
  "permissions": {
    "allow": [
+      "Read(**)",
+      "Glob(**)",
+      "Grep(**)",
      "Bash(python -m archivebox:*)",
      "Bash(ls:*)",
      "Bash(xargs:*)",
@@ -29,5 +32,19 @@
      "Bash(done)",
      "Bash(coverage erase:*)"
    ]
  },
+  "hooks": {
+    "PreToolUse": [
+      {
+        "matcher": "Bash",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "REPO_ROOT=$(git rev-parse --show-toplevel 2>/dev/null); if [ -n \"$REPO_ROOT\" ] && [ \"$PWD\" != \"$REPO_ROOT\" ]; then echo \"ERROR: Not in repo root ($REPO_ROOT). Current dir: $PWD\" >&2; exit 1; fi",
+            "statusMessage": "Checking working directory..."
+          }
+        ]
+      }
+    ]
+  }
}
@@ -128,4 +128,4 @@ class ModelWithOutputDir(ModelWithUUID):

    @property
    def output_dir(self) -> Path:
-        raise NotImplementedError(f'{self.__class__.__name__} must implement output_dir property')
+        raise NotImplementedError(f"{self.__class__.__name__} must implement output_dir property")
@@ -59,10 +59,11 @@ def process_stdin_records() -> int:
    """
    from django.utils import timezone

-    from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
+    from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_BINARY
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.core.models import Snapshot, ArchiveResult
    from archivebox.crawls.models import Crawl
+    from archivebox.machine.models import Binary
    from archivebox.workers.orchestrator import Orchestrator

    records = list(read_stdin())
@@ -137,6 +138,26 @@ def process_stdin_records() -> int:
            output_records.append(archiveresult.to_json())
            queued_count += 1

+        elif record_type == TYPE_BINARY:
+            # Binary records - create or update and queue for installation
+            if record_id:
+                # Existing binary - re-queue
+                try:
+                    binary = Binary.objects.get(id=record_id)
+                except Binary.DoesNotExist:
+                    binary = Binary.from_json(record)
+            else:
+                # New binary - create it
+                binary = Binary.from_json(record)
+
+            if binary:
+                binary.retry_at = timezone.now()
+                if binary.status != Binary.StatusChoices.INSTALLED:
+                    binary.status = Binary.StatusChoices.QUEUED
+                binary.save()
+                output_records.append(binary.to_json())
+                queued_count += 1
+
        else:
            # Unknown type - pass through
            output_records.append(record)
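For context, Binary records enter this code path as JSONL on stdin of `archivebox run`; a minimal sketch of queueing one (record fields taken from the tests added in this commit, invocation assumed):

    import json
    import subprocess

    # Hypothetical example: queue a Binary install via the JSONL stdin interface
    record = {'type': 'Binary', 'name': 'python3', 'binproviders': 'env'}
    subprocess.run(
        ['python', '-m', 'archivebox', 'run'],
        input=json.dumps(record).encode(),  # one JSON object per line
        timeout=30,
        check=True,
    )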
@@ -222,7 +243,8 @@ def run_snapshot_worker(snapshot_id: str) -> int:
@click.option('--daemon', '-d', is_flag=True, help="Run forever (don't exit on idle)")
@click.option('--crawl-id', help="Run orchestrator for specific crawl only")
@click.option('--snapshot-id', help="Run worker for specific snapshot only")
-def main(daemon: bool, crawl_id: str, snapshot_id: str):
+@click.option('--binary-id', help="Run worker for specific binary only")
+def main(daemon: bool, crawl_id: str, snapshot_id: str, binary_id: str):
    """
    Process queued work.

@@ -231,11 +253,27 @@ def main(daemon: bool, crawl_id: str, snapshot_id: str):
    - No args + TTY: Run orchestrator for all work
    - --crawl-id: Run orchestrator for that crawl only
    - --snapshot-id: Run worker for that snapshot only
+    - --binary-id: Run worker for that binary only
    """
    # Snapshot worker mode
    if snapshot_id:
        sys.exit(run_snapshot_worker(snapshot_id))

+    # Binary worker mode
+    if binary_id:
+        from archivebox.workers.worker import BinaryWorker
+        try:
+            worker = BinaryWorker(binary_id=binary_id, worker_id=0)
+            worker.runloop()
+            sys.exit(0)
+        except KeyboardInterrupt:
+            sys.exit(0)
+        except Exception as e:
+            rprint(f'[red]Worker error: {type(e).__name__}: {e}[/red]', file=sys.stderr)
+            import traceback
+            traceback.print_exc()
+            sys.exit(1)
+
    # Crawl worker mode
    if crawl_id:
        from archivebox.workers.worker import CrawlWorker
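The dispatch modes above map to these invocations (a usage sketch; UUIDs are placeholders):

    python -m archivebox run                       # orchestrator: process all queued work
    python -m archivebox run --daemon              # run forever (don't exit on idle)
    python -m archivebox run --crawl-id <uuid>     # orchestrator for that crawl only
    python -m archivebox run --snapshot-id <uuid>  # worker for that snapshot only
    python -m archivebox run --binary-id <uuid>    # worker for that binary only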
@@ -123,6 +123,7 @@ def get_config(
    user: Any = None,
    crawl: Any = None,
    snapshot: Any = None,
    archiveresult: Any = None,
+    machine: Any = None,
) -> Dict[str, Any]:
    """
@@ -145,11 +146,26 @@ def get_config(
        user: User object with config JSON field
        crawl: Crawl object with config JSON field
        snapshot: Snapshot object with config JSON field
        archiveresult: ArchiveResult object (auto-fetches snapshot)
+        machine: Machine object with config JSON field (defaults to Machine.current())
+
+    Note: Objects are auto-fetched from relationships if not provided:
+        - snapshot auto-fetched from archiveresult.snapshot
+        - crawl auto-fetched from snapshot.crawl
+        - user auto-fetched from crawl.created_by

    Returns:
        Merged config dict
    """
+    # Auto-fetch related objects from relationships
+    if snapshot is None and archiveresult and hasattr(archiveresult, "snapshot"):
+        snapshot = archiveresult.snapshot
+
+    if crawl is None and snapshot and hasattr(snapshot, "crawl"):
+        crawl = snapshot.crawl
+
+    if user is None and crawl and hasattr(crawl, "created_by"):
+        user = crawl.created_by
    from archivebox.config.constants import CONSTANTS
    from archivebox.config.common import (
        SHELL_CONFIG,
@@ -197,12 +213,18 @@ def get_config(
    if machine and hasattr(machine, "config") and machine.config:
        config.update(machine.config)

-    # Override with environment variables
+    # Override with environment variables (for keys that exist in config)
    for key in config:
        env_val = os.environ.get(key)
        if env_val is not None:
            config[key] = _parse_env_value(env_val, config.get(key))

+    # Also add NEW environment variables (not yet in config)
+    # This is important for worker subprocesses that receive config via Process.env
+    for key, value in os.environ.items():
+        if key.isupper() and key not in config:  # Only uppercase keys (config convention)
+            config[key] = _parse_env_value(value, None)
+
    # Also check plugin config aliases in environment
    try:
        from archivebox.hooks import discover_plugin_configs
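`_parse_env_value` is referenced here but not shown in this diff; a rough sketch of what such a helper might do (hypothetical, inferred from the bool/list/dict serialization used for Process.env later in this commit):

    import json

    def _parse_env_value(raw: str, current_value):
        # Hypothetical sketch: coerce an env var string toward the type of the existing value
        if isinstance(current_value, bool):
            return raw.strip().lower() in ('true', '1', 'yes', 'on')
        if isinstance(current_value, int) and raw.strip().lstrip('-').isdigit():
            return int(raw)
        if isinstance(current_value, (list, dict)):
            try:
                return json.loads(raw)
            except json.JSONDecodeError:
                return current_value
        return raw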
@@ -335,7 +357,7 @@ DEFAULT_WORKER_CONCURRENCY = {
    "title": 5,
    "favicon": 5,
    "headers": 5,
-    "archive_org": 2,
+    "archivedotorg": 2,
    "readability": 3,
    "mercury": 3,
    "git": 2,
@@ -147,7 +147,7 @@ class AddLinkForm(forms.Form):
        'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
    }
    archiving = {
-        'archive_org', 'favicon', 'forumdl', 'gallerydl', 'git',
+        'archivedotorg', 'favicon', 'forumdl', 'gallerydl', 'git',
        'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
    }
    parsing = {
@@ -120,7 +120,7 @@ class Migration(migrations.Migration):
                ('output', models.CharField(max_length=512)),
                ('start_ts', models.DateTimeField()),
                ('end_ts', models.DateTimeField()),
-                ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archive_org', 'archive_org')], max_length=32)),
+                ('extractor', models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('wget', 'wget'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('headers', 'headers'), ('archivedotorg', 'archivedotorg')], max_length=32)),
                ('snapshot', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, to='core.Snapshot')),
            ],
        ),
@@ -19,6 +19,6 @@ class Migration(migrations.Migration):
        migrations.AlterField(
            model_name='archiveresult',
            name='extractor',
-            field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+            field=models.CharField(choices=[('title', 'title'), ('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
        ),
    ]
@@ -13,6 +13,6 @@ class Migration(migrations.Migration):
        migrations.AlterField(
            model_name='archiveresult',
            name='extractor',
-            field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+            field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
        ),
    ]
@@ -13,6 +13,6 @@ class Migration(migrations.Migration):
        migrations.AlterField(
            model_name='archiveresult',
            name='extractor',
-            field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+            field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('htmltotext', 'htmltotext'), ('git', 'git'), ('media', 'media'), ('archivedotorg', 'archivedotorg')], max_length=32),
        ),
    ]
@@ -1973,7 +1973,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        canonical = {
            'index_path': 'index.html',
            'google_favicon_path': FAVICON_PROVIDER.format(self.domain),
-            'archive_org_path': f'https://web.archive.org/web/{self.base_url}',
+            'archivedotorg_path': f'https://web.archive.org/web/{self.base_url}',
        }

        # Scan each ArchiveResult's output directory for the best file
@@ -206,7 +206,10 @@ DATABASES = {
}
MIGRATION_MODULES = {"signal_webhooks": None}

-# as much as I'd love this to be a UUID or ULID field, it's not supported yet as of Django 5.0
+# Django requires DEFAULT_AUTO_FIELD to subclass AutoField (BigAutoField, SmallAutoField, etc.)
+# Cannot use UUIDField here until Django 6.0 introduces DEFAULT_PK_FIELD setting
+# For now: manually add `id = models.UUIDField(primary_key=True, default=uuid7, ...)` to all models
+# OR inherit from ModelWithUUID base class which provides UUID primary key
DEFAULT_AUTO_FIELD = "django.db.models.BigAutoField"
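The ModelWithUUID base class mentioned above could look roughly like this (a sketch assuming the uuid7 helper from archivebox.uuid_compat used elsewhere in this commit; the Persona model below declares the same field inline):

    from django.db import models
    from archivebox.uuid_compat import uuid7

    class ModelWithUUID(models.Model):
        # UUID primary key in place of the BigAutoField default
        id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)

        class Meta:
            abstract = True  # mixin only; concrete models inherit the pk field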

archivebox/crawls/migrations/0004_remove_crawl_output_dir.py (new file, 17 lines)
@@ -0,0 +1,17 @@
# Generated by Django 6.0 on 2026-01-05 01:09

from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('crawls', '0003_remove_crawlschedule_num_uses_failed_and_more'),
    ]

    operations = [
        migrations.RemoveField(
            model_name='crawl',
            name='output_dir',
        ),
    ]
@@ -370,20 +370,34 @@ def run_hook(
        from archivebox.machine.models import Machine
        machine = Machine.current()
        if machine and machine.config:
-            machine_path = machine.config.get('config/PATH')
+            machine_path = machine.config.get('PATH')
            if machine_path:
                # Prepend LIB_BIN_DIR to machine PATH as well
                if lib_bin_dir and not machine_path.startswith(f'{lib_bin_dir}:'):
                    env['PATH'] = f'{lib_bin_dir}:{machine_path}'
                else:
                    env['PATH'] = machine_path
-            # Also set NODE_MODULES_DIR if configured
-            node_modules_dir = machine.config.get('config/NODE_MODULES_DIR')
-            if node_modules_dir:
-                env['NODE_MODULES_DIR'] = node_modules_dir
    except Exception:
        pass  # Fall back to system PATH if Machine not available

+    # Set NODE_PATH for Node.js module resolution
+    # Priority: config dict > Machine.config > derive from LIB_DIR
+    node_path = config.get('NODE_PATH')
+    if not node_path and lib_dir:
+        # Derive from LIB_DIR/npm/node_modules
+        node_modules_dir = Path(lib_dir) / 'npm' / 'node_modules'
+        if node_modules_dir.exists():
+            node_path = str(node_modules_dir)
+    if not node_path:
+        try:
+            # Fallback to Machine.config
+            node_path = machine.config.get('NODE_MODULES_DIR')
+        except Exception:
+            pass
+    if node_path:
+        env['NODE_PATH'] = node_path
+        env['NODE_MODULES_DIR'] = node_path  # For backwards compatibility

    # Export all config values to environment (already merged by get_config())
    for key, value in config.items():
        if value is None:
@@ -414,56 +428,8 @@
        timeout=timeout,
    )

-    # Build environment from config (Process._build_env() expects self.env dict)
-    # We need to set env on the process before launching
-    process.env = {}
-    for key, value in config.items():
-        if value is None:
-            continue
-        elif isinstance(value, bool):
-            process.env[key] = 'true' if value else 'false'
-        elif isinstance(value, (list, dict)):
-            process.env[key] = json.dumps(value)
-        else:
-            process.env[key] = str(value)
-
-    # Add base paths to env
-    process.env['DATA_DIR'] = str(getattr(settings, 'DATA_DIR', Path.cwd()))
-    process.env['ARCHIVE_DIR'] = str(getattr(settings, 'ARCHIVE_DIR', Path.cwd() / 'archive'))
-    process.env.setdefault('MACHINE_ID', getattr(settings, 'MACHINE_ID', '') or os.environ.get('MACHINE_ID', ''))
-
-    # Add LIB_DIR and LIB_BIN_DIR
-    lib_dir = config.get('LIB_DIR', getattr(settings, 'LIB_DIR', None))
-    lib_bin_dir = config.get('LIB_BIN_DIR', getattr(settings, 'LIB_BIN_DIR', None))
-    if lib_dir:
-        process.env['LIB_DIR'] = str(lib_dir)
-    if not lib_bin_dir and lib_dir:
-        lib_bin_dir = Path(lib_dir) / 'bin'
-    if lib_bin_dir:
-        process.env['LIB_BIN_DIR'] = str(lib_bin_dir)
-
-    # Set PATH from Machine.config if available
-    try:
-        if machine and machine.config:
-            machine_path = machine.config.get('config/PATH')
-            if machine_path:
-                # Prepend LIB_BIN_DIR to machine PATH as well
-                if lib_bin_dir and not machine_path.startswith(f'{lib_bin_dir}:'):
-                    process.env['PATH'] = f'{lib_bin_dir}:{machine_path}'
-                else:
-                    process.env['PATH'] = machine_path
-            elif lib_bin_dir:
-                # Just prepend to current PATH
-                current_path = os.environ.get('PATH', '')
-                if not current_path.startswith(f'{lib_bin_dir}:'):
-                    process.env['PATH'] = f'{lib_bin_dir}:{current_path}' if current_path else str(lib_bin_dir)
-
-        # Also set NODE_MODULES_DIR if configured
-        node_modules_dir = machine.config.get('config/NODE_MODULES_DIR')
-        if node_modules_dir:
-            process.env['NODE_MODULES_DIR'] = node_modules_dir
-    except Exception:
-        pass  # Fall back to system PATH if Machine not available
+    # Copy the env dict we already built (includes os.environ + all customizations)
+    process.env = env.copy()

    # Save env before launching
    process.save()
@@ -0,0 +1,17 @@
# Generated by Django 6.0 on 2026-01-05 01:09

from django.db import migrations


class Migration(migrations.Migration):

    dependencies = [
        ('machine', '0010_alter_process_process_type'),
    ]

    operations = [
        migrations.RemoveField(
            model_name='binary',
            name='output_dir',
        ),
    ]
@@ -1,7 +1,7 @@
from django.apps import AppConfig


-class SessionsConfig(AppConfig):
+class PersonasConfig(AppConfig):
    default_auto_field = "django.db.models.BigAutoField"
    name = "archivebox.personas"
    label = "personas"
@@ -1,6 +1,7 @@
# Generated by Django 6.0 on 2025-12-31 09:06

import archivebox.base_models.models
+from archivebox.uuid_compat import uuid7
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
@@ -19,7 +20,7 @@ class Migration(migrations.Migration):
        migrations.CreateModel(
            name='Persona',
            fields=[
-                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('id', models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)),
                ('config', models.JSONField(blank=True, default=dict, null=True)),
                ('name', models.CharField(max_length=64, unique=True)),
                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
archivebox/personas/migrations/0002_alter_persona_id.py (new file, 19 lines)
@@ -0,0 +1,19 @@
# Generated by Django 6.0 on 2026-01-05 01:09

import uuid
from django.db import migrations, models


class Migration(migrations.Migration):

    dependencies = [
        ('personas', '0001_initial'),
    ]

    operations = [
        migrations.AlterField(
            model_name='persona',
            name='id',
            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
        ),
    ]
@@ -19,6 +19,7 @@ from django.conf import settings
from django.utils import timezone

from archivebox.base_models.models import ModelWithConfig, get_or_create_system_user_pk
+from archivebox.uuid_compat import uuid7

if TYPE_CHECKING:
    from django.db.models import QuerySet
@@ -44,6 +45,7 @@ class Persona(ModelWithConfig):
        persona.CHROME_USER_DATA_DIR  # -> Path to chrome_user_data
    """

+    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    name = models.CharField(max_length=64, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk)
@@ -2,7 +2,7 @@
"""
Submit a URL to archive.org for archiving.

-Usage: on_Snapshot__archive_org.py --url=<url> --snapshot-id=<uuid>
+Usage: on_Snapshot__archivedotorg.py --url=<url> --snapshot-id=<uuid>
Output: Writes archive.org.txt to $PWD with the archived URL

Environment variables:
@@ -25,7 +25,7 @@ import rich_click as click


# Extractor metadata
-PLUGIN_NAME = 'archive_org'
+PLUGIN_NAME = 'archivedotorg'
OUTPUT_DIR = '.'
OUTPUT_FILE = 'archive.org.txt'

@@ -41,7 +41,7 @@ def get_env_int(name: str, default: int = 0) -> int:
    return default


-def submit_to_archive_org(url: str) -> tuple[bool, str | None, str]:
+def submit_to_archivedotorg(url: str) -> tuple[bool, str | None, str]:
    """
    Submit URL to archive.org Wayback Machine.

@@ -113,7 +113,7 @@ def main(url: str, snapshot_id: str):

    try:
        # Run extraction
-        success, output, error = submit_to_archive_org(url)
+        success, output, error = submit_to_archivedotorg(url)

        if success:
            # Success - emit ArchiveResult with output file
@@ -1,5 +1,5 @@
"""
-Integration tests for archive_org plugin
+Integration tests for archivedotorg plugin

Tests verify standalone archive.org extractor execution.
"""
@@ -12,13 +12,13 @@ from pathlib import Path
import pytest

PLUGIN_DIR = Path(__file__).parent.parent
-ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archive_org.*'), None)
+ARCHIVEDOTORG_HOOK = next(PLUGIN_DIR.glob('on_Snapshot__*_archivedotorg.*'), None)
TEST_URL = 'https://example.com'

def test_hook_script_exists():
    assert ARCHIVEDOTORG_HOOK.exists()

-def test_submits_to_archive_org():
+def test_submits_to_archivedotorg():
    with tempfile.TemporaryDirectory() as tmpdir:
        result = subprocess.run(
            [sys.executable, str(ARCHIVEDOTORG_HOOK), '--url', TEST_URL, '--snapshot-id', 'test789'],
@@ -49,7 +49,7 @@ def test_submits_to_archive_org():
        assert not result_json, "Should NOT emit JSONL on transient error"
        assert result.stderr, "Should have error message in stderr"

-def test_config_save_archive_org_false_skips():
+def test_config_save_archivedotorg_false_skips():
    with tempfile.TemporaryDirectory() as tmpdir:
        import os
        env = os.environ.copy()
@@ -26,11 +26,20 @@ const {
  readCdpUrl,
} = require('../chrome/chrome_utils.js');

+// Flush V8 coverage before exit (needed for NODE_V8_COVERAGE to capture early exits)
+function flushCoverageAndExit(exitCode) {
+    if (process.env.NODE_V8_COVERAGE) {
+        const v8 = require('v8');
+        v8.takeCoverage();
+    }
+    process.exit(exitCode);
+}
+
// Check if screenshot is enabled BEFORE requiring puppeteer
if (!getEnvBool('SCREENSHOT_ENABLED', true)) {
    console.error('Skipping screenshot (SCREENSHOT_ENABLED=False)');
    // Temporary failure (config disabled) - NO JSONL emission
-    process.exit(0);
+    flushCoverageAndExit(0);
}

// Now safe to require puppeteer
@@ -135,7 +144,7 @@ async function main() {

  if (!url || !snapshotId) {
    console.error('Usage: on_Snapshot__51_screenshot.js --url=<url> --snapshot-id=<uuid>');
-    process.exit(1);
+    flushCoverageAndExit(1);
  }

  // Check if staticfile extractor already handled this (permanent skip)
@@ -147,7 +156,7 @@ async function main() {
      status: 'skipped',
      output_str: 'staticfile already handled',
    }));
-    process.exit(0);
+    flushCoverageAndExit(0);
  }

  // Take screenshot (throws on error)
@@ -166,5 +175,5 @@ async function main() {
main().catch(e => {
  // Transient error - emit NO JSONL
  console.error(`ERROR: ${e.message}`);
-  process.exit(1);
+  flushCoverageAndExit(1);
});
@@ -226,9 +226,6 @@ def test_config_save_screenshot_false_skips():
        print(f"[DEBUG RESULT] Exit code: {result.returncode}")
        print(f"[DEBUG RESULT] Stderr: {result.stderr[:200]}")

-        # FORCE FAILURE to verify test actually runs
-        assert False, f"FORCED FAILURE - NODE_V8_COVERAGE={'NODE_V8_COVERAGE' in env} value={env.get('NODE_V8_COVERAGE', 'NOTSET')}"
-
        assert result.returncode == 0, f"Should exit 0 when feature disabled: {result.stderr}"

        # Feature disabled - temporary failure, should NOT emit JSONL
@@ -74,7 +74,7 @@
        "--geo-bypass",
        "--add-metadata",
        "--no-progress",
-        "--remote-components ejs:github",
+        "--remote-components=ejs:github",
        "-o",
        "%(title)s.%(ext)s"
    ],
archivebox/tests/test_cli_run_binary_worker.py (new file, 256 lines)
@@ -0,0 +1,256 @@
"""
Tests for BinaryWorker processing Binary queue.

Tests cover:
- BinaryWorker is spawned by Orchestrator when Binary queue has work
- Binary hooks (on_Binary__*) actually run and install binaries
- Binary status transitions from QUEUED -> INSTALLED
- BinaryWorker exits after idle timeout
"""

import json
import sqlite3
import time

from archivebox.tests.conftest import (
    run_archivebox_cmd,
    parse_jsonl_output,
)


class TestBinaryWorkerSpawning:
    """Tests for BinaryWorker lifecycle."""

    def test_binary_worker_spawns_when_binary_queued(self, initialized_archive):
        """Orchestrator spawns BinaryWorker when Binary queue has work."""
        # Create a Binary record via CLI
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',  # Use env provider to detect system python
        }

        # Use `archivebox run` to create the Binary (this queues it)
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )

        assert code == 0, f"Failed to create Binary: {stderr}"

        # Verify Binary was created in DB
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        binaries = c.execute(
            "SELECT name, status, abspath FROM machine_binary WHERE name='python3'"
        ).fetchall()
        conn.close()

        assert len(binaries) >= 1, "Binary was not created in database"
        name, status, abspath = binaries[0]
        assert name == 'python3'
        # Status should be INSTALLED after BinaryWorker processed it
        # (or QUEUED if worker timed out before installing)
        assert status in ['installed', 'queued']


    def test_binary_hooks_actually_run(self, initialized_archive):
        """Binary installation hooks (on_Binary__*) run and update abspath."""
        # Create a Binary for python3 (guaranteed to exist on system)
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )

        assert code == 0, f"Failed to process Binary: {stderr}"

        # Query database to check if hooks ran and populated abspath
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT name, status, abspath, version FROM machine_binary WHERE name='python3'"
        ).fetchone()
        conn.close()

        assert result is not None, "Binary not found in database"
        name, status, abspath, version = result

        # If hooks ran successfully, abspath should be populated
        if status == 'installed':
            assert abspath, f"Binary installed but abspath is empty: {abspath}"
            assert '/python3' in abspath or '\\python3' in abspath, \
                f"abspath doesn't look like a python3 path: {abspath}"
            # Version should also be populated
            assert version, f"Binary installed but version is empty: {version}"


    def test_binary_status_transitions(self, initialized_archive):
        """Binary status correctly transitions QUEUED -> INSTALLED."""
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }

        # Create and process the Binary
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )

        assert code == 0

        # Check final status
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        status = c.execute(
            "SELECT status FROM machine_binary WHERE name='python3'"
        ).fetchone()
        conn.close()

        assert status is not None
        # Should be installed (or queued if worker timed out)
        assert status[0] in ['installed', 'queued']


class TestBinaryWorkerHooks:
    """Tests for specific Binary hook providers."""

    def test_env_provider_hook_detects_system_binary(self, initialized_archive):
        """on_Binary__15_env_install.py hook detects system binaries."""
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )

        assert code == 0

        # Check that env provider hook populated the Binary
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT binprovider, abspath FROM machine_binary WHERE name='python3' AND status='installed'"
        ).fetchone()
        conn.close()

        if result:
            binprovider, abspath = result
            assert binprovider == 'env', f"Expected env provider, got: {binprovider}"
            assert abspath, "abspath should be populated by env provider"


    def test_multiple_binaries_processed_in_batch(self, initialized_archive):
        """BinaryWorker processes multiple queued binaries."""
        # Create multiple Binary records
        binaries = [
            {'type': 'Binary', 'name': 'python3', 'binproviders': 'env'},
            {'type': 'Binary', 'name': 'curl', 'binproviders': 'env'},
        ]

        stdin = '\n'.join(json.dumps(b) for b in binaries)

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=stdin,
            data_dir=initialized_archive,
            timeout=45,
        )

        assert code == 0

        # Both should be processed
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        installed = c.execute(
            "SELECT name FROM machine_binary WHERE name IN ('python3', 'curl')"
        ).fetchall()
        conn.close()

        assert len(installed) >= 1, "At least one binary should be created"


class TestBinaryWorkerEdgeCases:
    """Tests for edge cases and error handling."""

    def test_nonexistent_binary_stays_queued(self, initialized_archive):
        """Binary that doesn't exist stays queued (doesn't fail permanently)."""
        binary_record = {
            'type': 'Binary',
            'name': 'nonexistent-binary-xyz-12345',
            'binproviders': 'env',
        }

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )

        # Command should still succeed (orchestrator doesn't fail on binary install failures)
        assert code == 0

        # Binary should remain queued (not installed)
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT status FROM machine_binary WHERE name='nonexistent-binary-xyz-12345'"
        ).fetchone()
        conn.close()

        if result:
            status = result[0]
            # Should stay queued since installation failed
            assert status == 'queued', f"Expected queued, got: {status}"


    def test_binary_worker_respects_machine_isolation(self, initialized_archive):
        """BinaryWorker only processes binaries for current machine."""
        # This is implicitly tested by other tests - Binary.objects.filter(machine=current)
        # ensures only current machine's binaries are processed
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )

        assert code == 0

        # Check that machine_id is set correctly
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT machine_id FROM machine_binary WHERE name='python3'"
        ).fetchone()
        conn.close()

        assert result is not None
        machine_id = result[0]
        assert machine_id, "machine_id should be set on Binary"
@@ -246,6 +246,68 @@ print(snapshot.id)
            "Expected debug output not found in stderr"
        print("✓ Config debug output found in stderr")

        # Verify precedence order: snapshot > crawl > user > persona > env > machine > file > defaults
        verify_precedence_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from archivebox.core.models import Snapshot
from archivebox.config.configset import get_config

snapshot = Snapshot.objects.get(id='{snapshot_id}')

# Test precedence by getting config at different levels
print("\\nTesting config precedence order:")

# 1. Just defaults (lowest priority)
config_defaults = get_config()
print(f"  Defaults only: TIMEOUT={{config_defaults.get('TIMEOUT')}}")

# 2. With machine config
from archivebox.machine.models import Machine
machine = Machine.current()
config_machine = get_config(machine=machine)
custom_machine = config_machine.get('CUSTOM_MACHINE_KEY')
print(f"  + Machine: CUSTOM_MACHINE_KEY={{custom_machine}}")

# 3. With crawl config
config_crawl = get_config(crawl=snapshot.crawl)
print(f"  + Crawl: TIMEOUT={{config_crawl.get('TIMEOUT')}} (should be 777 from crawl.config)")
assert config_crawl.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl.get('TIMEOUT')}}"

# 4. With snapshot config (highest priority)
config_snapshot = get_config(snapshot=snapshot)
print(f"  + Snapshot: TIMEOUT={{config_snapshot.get('TIMEOUT')}} (should be 555 from snapshot.config)")
assert config_snapshot.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config_snapshot.get('TIMEOUT')}}"

# Verify snapshot config overrides crawl config
assert config_snapshot.get('CUSTOM_CRAWL_KEY') == 'from_crawl_json', "Crawl config should be present"
assert config_snapshot.get('CUSTOM_SNAPSHOT_KEY') == 'from_snapshot_json', "Snapshot config should be present"
assert config_snapshot.get('CUSTOM_MACHINE_KEY') == 'from_machine_config', "Machine config should be present"

print("\\n✓ Config precedence order verified: snapshot > crawl > machine > defaults")
"""
        result = subprocess.run(
            ['python', '-c', verify_precedence_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nPrecedence verification error:")
            print(result.stderr.decode())
        assert result.returncode == 0, f"Precedence verification failed: {result.stderr.decode()}"

        # Verify config values were actually used by checking ArchiveResults
        verify_script = f"""
import os
@@ -475,7 +537,453 @@ print("\\n✓ All config values correctly parsed from environment")
        print("="*80 + "\n")


def test_parent_environment_preserved_in_hooks():
    """
    Test that parent environment variables are preserved in hook execution.

    This test catches the bug where we built env=os.environ.copy() but then
    clobbered it with process.env={}, losing all parent environment.

    Also verifies:
    - NODE_PATH is correctly derived from LIB_DIR/npm/node_modules
    - LIB_BIN_DIR is correctly derived and added to PATH
    """

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print(f"Test: Parent Environment Preserved in Hooks")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize archive
        print("Step 1: Initialize archive")
        result = subprocess.run(
            ['python', '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print(f"✓ Archive initialized\n")

        # Create snapshot
        print("Step 2: Create Snapshot")
        create_snapshot_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.core.models import Snapshot
from archivebox.crawls.models import Crawl

crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now()
)

snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now()
)
print(snapshot.id)
"""
        result = subprocess.run(
            ['python', '-c', create_snapshot_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )
        assert result.returncode == 0, f"Create snapshot failed: {result.stderr.decode()}"
        snapshot_id = result.stdout.decode().strip().split('\n')[-1]
        print(f"✓ Created snapshot {snapshot_id}\n")

        # Run SnapshotWorker with custom parent environment variable
        print("Step 3: Run SnapshotWorker with TEST_PARENT_ENV_VAR in parent process")
        result = subprocess.run(
            ['python', '-m', 'archivebox', 'run', '--snapshot-id', snapshot_id],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
                'TEST_PARENT_ENV_VAR': 'preserved_from_parent',  # This should reach the hook
                'PLUGINS': 'favicon',  # Use existing plugin (favicon is simple and fast)
            },
            capture_output=True,
            timeout=120,
        )

        stdout = result.stdout.decode()
        stderr = result.stderr.decode()

        print("\n--- SnapshotWorker stderr (first 50 lines) ---")
        print('\n'.join(stderr.split('\n')[:50]))
        print("--- End stderr ---\n")

        # Verify hooks ran by checking Process records
        print("Step 4: Verify environment variables in hook Process records")
        verify_env_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from archivebox.machine.models import Process
from archivebox.core.models import Snapshot
import json

snapshot = Snapshot.objects.get(id='{snapshot_id}')

# Find hook processes for this snapshot
hook_processes = Process.objects.filter(
    process_type=Process.TypeChoices.HOOK,
    pwd__contains=str(snapshot.id)
).order_by('-created_at')

print(f"Found {{hook_processes.count()}} hook processes")

if hook_processes.count() == 0:
    print("ERROR: No hook processes found!")
    import sys
    sys.exit(1)

# Check the first hook process environment
hook_process = hook_processes.first()
print(f"\\nChecking hook: {{hook_process.cmd}}")
print(f"Hook env keys: {{len(hook_process.env)}} total")

# Verify TEST_PARENT_ENV_VAR was preserved
test_parent = hook_process.env.get('TEST_PARENT_ENV_VAR')
print(f"  TEST_PARENT_ENV_VAR: {{test_parent}}")
assert test_parent == 'preserved_from_parent', f"Expected 'preserved_from_parent', got {{test_parent}}"

# Verify LIB_DIR is set
lib_dir = hook_process.env.get('LIB_DIR')
print(f"  LIB_DIR: {{lib_dir}}")
assert lib_dir is not None, "LIB_DIR not set"

# Verify LIB_BIN_DIR is derived
lib_bin_dir = hook_process.env.get('LIB_BIN_DIR')
print(f"  LIB_BIN_DIR: {{lib_bin_dir}}")
if lib_dir:
    assert lib_bin_dir is not None, "LIB_BIN_DIR not derived from LIB_DIR"
    assert lib_bin_dir.endswith('/bin'), f"LIB_BIN_DIR should end with /bin, got {{lib_bin_dir}}"

# Verify LIB_BIN_DIR is in PATH
path = hook_process.env.get('PATH')
if lib_bin_dir:
    assert lib_bin_dir in path, f"LIB_BIN_DIR not in PATH. LIB_BIN_DIR={{lib_bin_dir}}, PATH={{path[:200]}}..."

# Verify NODE_PATH is set
node_path = hook_process.env.get('NODE_PATH')
node_modules_dir = hook_process.env.get('NODE_MODULES_DIR')
print(f"  NODE_PATH: {{node_path}}")
print(f"  NODE_MODULES_DIR: {{node_modules_dir}}")
if node_path:
    # Should also have NODE_MODULES_DIR for backwards compatibility
    assert node_modules_dir == node_path, f"NODE_MODULES_DIR should match NODE_PATH"

print("\\n✓ All environment checks passed")
"""
        result = subprocess.run(
            ['python', '-c', verify_env_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nVerification error:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Environment verification failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Parent environment preserved in hooks")
        print("  - Custom parent env vars reach hooks")
        print("  - LIB_DIR propagated correctly")
        print("  - LIB_BIN_DIR derived and added to PATH")
        print("  - NODE_PATH/NODE_MODULES_DIR set when available")
        print("="*80 + "\n")

def test_config_auto_fetch_relationships():
    """
    Test that get_config() auto-fetches related objects from relationships.

    Verifies:
    - snapshot auto-fetched from archiveresult.snapshot
    - crawl auto-fetched from snapshot.crawl
    - user auto-fetched from crawl.created_by
    """

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print(f"Test: Config Auto-Fetch Relationships")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize archive
        print("Step 1: Initialize archive")
        result = subprocess.run(
            ['python', '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0, f"Init failed: {result.stderr.decode()}"
        print(f"✓ Archive initialized\n")

        # Create objects with config at each level
        print("Step 2: Create Crawl -> Snapshot -> ArchiveResult with config at each level")
        create_objects_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.config.configset import get_config

# Create crawl with config
crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now(),
    config={{
        'CRAWL_KEY': 'from_crawl',
        'TIMEOUT': 777,
    }}
)

# Create snapshot with config
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{
        'SNAPSHOT_KEY': 'from_snapshot',
        'TIMEOUT': 555,
    }}
)

# Create ArchiveResult
ar = ArchiveResult.objects.create(
    snapshot=snapshot,
    plugin='test',
    hook_name='test_hook',
    status=ArchiveResult.StatusChoices.STARTED
)

print(f"Created: crawl={{crawl.id}}, snapshot={{snapshot.id}}, ar={{ar.id}}")

# Test 1: Auto-fetch crawl from snapshot
print("\\nTest 1: get_config(snapshot=snapshot) auto-fetches crawl")
config = get_config(snapshot=snapshot)
assert config.get('TIMEOUT') == 555, f"Expected 555 from snapshot, got {{config.get('TIMEOUT')}}"
assert config.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot, got {{config.get('SNAPSHOT_KEY')}}"
assert config.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl, got {{config.get('CRAWL_KEY')}}"
print("✓ Snapshot config (TIMEOUT=555) overrides crawl config (TIMEOUT=777)")
print("✓ Both snapshot.config and crawl.config values present")

# Test 2: Auto-fetch snapshot from archiveresult
print("\\nTest 2: get_config(archiveresult=ar) auto-fetches snapshot and crawl")
config_from_ar = get_config(archiveresult=ar)
assert config_from_ar.get('TIMEOUT') == 555, f"Expected 555, got {{config_from_ar.get('TIMEOUT')}}"
assert config_from_ar.get('SNAPSHOT_KEY') == 'from_snapshot', f"Expected from_snapshot"
assert config_from_ar.get('CRAWL_KEY') == 'from_crawl', f"Expected from_crawl"
print("✓ Auto-fetched snapshot from ar.snapshot")
print("✓ Auto-fetched crawl from snapshot.crawl")

# Test 3: Precedence without auto-fetch (explicit crawl only)
print("\\nTest 3: get_config(crawl=crawl) without snapshot")
config_crawl_only = get_config(crawl=crawl)
assert config_crawl_only.get('TIMEOUT') == 777, f"Expected 777 from crawl, got {{config_crawl_only.get('TIMEOUT')}}"
assert config_crawl_only.get('CRAWL_KEY') == 'from_crawl'
assert config_crawl_only.get('SNAPSHOT_KEY') is None, "Should not have snapshot config"
print("✓ Crawl-only config has TIMEOUT=777")
print("✓ No snapshot config values present")

print("\\n✓ All auto-fetch tests passed")
"""

        result = subprocess.run(
            ['python', '-c', create_objects_script],
            cwd=str(data_dir.parent),
            env={
                **os.environ,
                'DATA_DIR': str(data_dir),
                'USE_COLOR': 'False',
            },
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nAuto-fetch test error:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Auto-fetch test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Config auto-fetches related objects correctly")
        print("  - archiveresult → snapshot → crawl → user")
        print("  - Precedence preserved during auto-fetch")
        print("="*80 + "\n")

def test_config_precedence_with_environment_vars():
    """
    Test that config precedence order is correct when environment vars are set.

    Documented order (highest to lowest):
    1. snapshot.config
    2. crawl.config
    3. user.config
    4. persona config
    5. environment variables  <-- LOWER priority than snapshot/crawl
    6. machine.config
    7. config file
    8. plugin defaults
    9. core defaults

    This test verifies snapshot.config overrides environment variables.
    """

    with tempfile.TemporaryDirectory() as tmpdir:
        data_dir = Path(tmpdir) / 'test_archive'
        data_dir.mkdir()

        print(f"\n{'='*80}")
        print(f"Test: Config Precedence with Environment Variables")
        print(f"DATA_DIR: {data_dir}")
        print(f"{'='*80}\n")

        # Initialize
        result = subprocess.run(
            ['python', '-m', 'archivebox', 'init'],
            cwd=str(data_dir),
            env={**os.environ, 'DATA_DIR': str(data_dir), 'USE_COLOR': 'False'},
            capture_output=True,
            timeout=60,
        )
        assert result.returncode == 0
        print("✓ Archive initialized\n")

        # Test with environment variable set
        print("Step 1: Test with TIMEOUT=999 in environment")
        test_script = f"""
import os
os.environ['DATA_DIR'] = '{data_dir}'
os.environ['TIMEOUT'] = '999'  # Set env var

from archivebox.config.django import setup_django
setup_django()

from django.utils import timezone
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
from archivebox.config.configset import get_config

# Create crawl with TIMEOUT=777
crawl = Crawl.objects.create(
    urls='https://example.com',
    status='queued',
    retry_at=timezone.now(),
    config={{'TIMEOUT': 777}}
)

# Create snapshot with TIMEOUT=555
snapshot = Snapshot.objects.create(
    url='https://example.com',
    crawl=crawl,
    status='queued',
    retry_at=timezone.now(),
    config={{'TIMEOUT': 555}}
)

# Get config with all sources
config = get_config(snapshot=snapshot)

print(f"Environment: TIMEOUT={{os.environ.get('TIMEOUT')}}")
print(f"Crawl config: TIMEOUT={{crawl.config.get('TIMEOUT')}}")
print(f"Snapshot config: TIMEOUT={{snapshot.config.get('TIMEOUT')}}")
print(f"Merged config: TIMEOUT={{config.get('TIMEOUT')}}")

# Snapshot should override both crawl AND environment
expected = 555
actual = config.get('TIMEOUT')
if actual != expected:
    print(f"\\n❌ PRECEDENCE BUG: Expected {{expected}}, got {{actual}}")
    print(f"   Snapshot.config should have highest priority!")
    import sys
    sys.exit(1)

print(f"\\n✓ snapshot.config ({{expected}}) correctly overrides env var (999) and crawl.config (777)")
"""

        result = subprocess.run(
            ['python', '-c', test_script],
            cwd=str(data_dir.parent),
            capture_output=True,
            timeout=30,
        )

        print(result.stdout.decode())
        if result.returncode != 0:
            print("\nPrecedence bug detected:")
            print(result.stderr.decode())

        assert result.returncode == 0, f"Precedence test failed: {result.stderr.decode()}"

        print("\n" + "="*80)
        print("✓ TEST PASSED: Snapshot config correctly overrides environment variables")
        print("="*80 + "\n")


if __name__ == '__main__':
    # Run as standalone script
    test_config_propagation_through_worker_hierarchy()
    test_config_environment_variable_parsing()
    test_parent_environment_preserved_in_hooks()
    test_config_auto_fetch_relationships()
    test_config_precedence_with_environment_vars()
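The precedence order documented in the test above amounts to a layered dict merge, lowest-priority source first; a minimal sketch (hypothetical helper, not the actual get_config() implementation, which also parses env-var strings and plugin aliases):

    def merge_config_layers(core_defaults, plugin_defaults, file_cfg, machine,
                            env, persona, user, crawl, snapshot):
        # Later update() calls win, so order runs from lowest to highest priority
        merged = {}
        for layer in (core_defaults, plugin_defaults, file_cfg, machine,
                      env, persona, user, crawl, snapshot):
            merged.update(layer or {})
        return merged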
@@ -36,7 +36,7 @@ from django.utils import timezone
|
||||
from rich import print
|
||||
|
||||
from archivebox.misc.logging_util import log_worker_event
|
||||
from .worker import Worker, CrawlWorker
|
||||
from .worker import Worker, BinaryWorker, CrawlWorker
|
||||
|
||||
|
||||
def _run_orchestrator_process(exit_on_idle: bool) -> None:
|
||||
@@ -63,13 +63,14 @@ class Orchestrator:
|
||||
- Each SnapshotWorker runs hooks sequentially for its snapshot
|
||||
"""
|
||||
|
||||
# Only CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator
|
||||
WORKER_TYPES: list[Type[Worker]] = [CrawlWorker]
|
||||
# BinaryWorker (singleton daemon) and CrawlWorker - SnapshotWorkers are spawned by CrawlWorker subprocess, not by Orchestrator
|
||||
WORKER_TYPES: list[Type[Worker]] = [BinaryWorker, CrawlWorker]
|
||||
|
||||
# Configuration
|
||||
POLL_INTERVAL: float = 2.0 # How often to check for new work (seconds)
|
||||
IDLE_TIMEOUT: int = 3 # Exit after N idle ticks (0 = never exit)
|
||||
MAX_CRAWL_WORKERS: int = 8 # Max crawls processing simultaneously
|
||||
MAX_BINARY_WORKERS: int = 1 # Max binaries installing simultaneously (sequential only)
|
||||
|
||||
def __init__(self, exit_on_idle: bool = True, crawl_id: str | None = None):
|
||||
self.exit_on_idle = exit_on_idle
|
||||
@@ -194,15 +195,23 @@ class Orchestrator:
|
||||
return len(WorkerClass.get_running_workers())
|
||||
|
||||
def should_spawn_worker(self, WorkerClass: Type[Worker], queue_count: int) -> bool:
|
||||
"""Determine if we should spawn a new CrawlWorker."""
|
||||
"""Determine if we should spawn a new worker."""
|
||||
if queue_count == 0:
|
||||
return False
|
||||
|
||||
# Check CrawlWorker limit
|
||||
# Get appropriate limit based on worker type
|
||||
if WorkerClass.name == 'crawl':
|
||||
max_workers = self.MAX_CRAWL_WORKERS
|
||||
elif WorkerClass.name == 'binary':
|
||||
max_workers = self.MAX_BINARY_WORKERS # Force sequential: only 1 binary at a time
|
||||
else:
|
||||
max_workers = 1 # Default for unknown types
|
||||
|
||||
# Check worker limit
|
||||
running_workers = WorkerClass.get_running_workers()
|
||||
running_count = len(running_workers)
|
||||
|
||||
if running_count >= self.MAX_CRAWL_WORKERS:
|
||||
if running_count >= max_workers:
|
||||
return False
|
||||
|
||||
# Check if we already have enough workers for the queue size
|
||||
@@ -285,14 +294,35 @@ class Orchestrator:

    def check_queues_and_spawn_workers(self) -> dict[str, int]:
        """
        Check Crawl queue and spawn CrawlWorkers as needed.
        Check Binary and Crawl queues and spawn workers as needed.
        Returns dict of queue sizes.
        """
        from archivebox.crawls.models import Crawl
        from archivebox.machine.models import Binary, Machine

        queue_sizes = {}

        # Only check Crawl queue
        # Check Binary queue
        machine = Machine.current()
        binary_queue = Binary.objects.filter(
            machine=machine,
            status=Binary.StatusChoices.QUEUED,
            retry_at__lte=timezone.now()
        ).order_by('retry_at')
        binary_count = binary_queue.count()
        queue_sizes['binary'] = binary_count

        # Spawn BinaryWorker if needed (one worker per binary, up to MAX_BINARY_WORKERS)
        if self.should_spawn_worker(BinaryWorker, binary_count):
            # Get next binary to process
            binary = binary_queue.first()
            if binary:
                BinaryWorker.start(binary_id=str(binary.id))

        # Check if any BinaryWorkers are still running
        running_binary_workers = len(BinaryWorker.get_running_workers())

        # Check Crawl queue
        crawl_queue = Crawl.objects.filter(
            retry_at__lte=timezone.now()
        ).exclude(
@@ -307,12 +337,15 @@ class Orchestrator:
        crawl_count = crawl_queue.count()
        queue_sizes['crawl'] = crawl_count

        # Spawn CrawlWorker if needed
        if self.should_spawn_worker(CrawlWorker, crawl_count):
            # Claim next crawl
            crawl = crawl_queue.first()
            if crawl and self._claim_crawl(crawl):
                CrawlWorker.start(crawl_id=str(crawl.id))
        # CRITICAL: Only spawn CrawlWorkers if binary queue is empty AND no BinaryWorkers running
        # This ensures all binaries are installed before snapshots start processing
        if binary_count == 0 and running_binary_workers == 0:
            # Spawn CrawlWorker if needed
            if self.should_spawn_worker(CrawlWorker, crawl_count):
                # Claim next crawl
                crawl = crawl_queue.first()
                if crawl and self._claim_crawl(crawl):
                    CrawlWorker.start(crawl_id=str(crawl.id))

        return queue_sizes
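
The effect of the gate is easiest to see in isolation. A minimal model (assumed names, reduced to plain counts) of which worker types are eligible to spawn on a given tick:

```python
# Simplified model of the tick above: binaries drain first, and crawls only
# start once the binary queue is empty and no BinaryWorker is still running.
def eligible_workers(binary_count: int, running_binary_workers: int, crawl_count: int) -> list[str]:
    spawn = []
    if binary_count > 0:
        spawn.append('binary')
    if binary_count == 0 and running_binary_workers == 0 and crawl_count > 0:
        spawn.append('crawl')
    return spawn

assert eligible_workers(2, 0, 5) == ['binary']  # queued binaries block crawls
assert eligible_workers(0, 1, 5) == []          # an installer is still running
assert eligible_workers(0, 0, 5) == ['crawl']   # all binaries installed, crawls proceed
```
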
@@ -328,7 +361,7 @@ class Orchestrator:
        )

        return updated == 1


    def has_pending_work(self, queue_sizes: dict[str, int]) -> bool:
        """Check if any queue has pending work."""
        return any(count > 0 for count in queue_sizes.values())
@@ -323,6 +323,20 @@ class Worker:
            pwd = Path(snapshot.output_dir)  # Run in snapshot's output directory
            env = get_config(snapshot=snapshot)

        elif cls.name == 'binary':
            # BinaryWorker processes a specific binary installation
            binary_id = kwargs.get('binary_id')
            if not binary_id:
                raise ValueError("BinaryWorker requires binary_id")

            from archivebox.machine.models import Binary
            binary = Binary.objects.get(id=binary_id)

            cmd = [sys.executable, '-m', 'archivebox', 'run', '--binary-id', str(binary_id)]
            pwd = Path(settings.DATA_DIR) / 'machines' / str(Machine.current().id) / 'binaries' / binary.name / str(binary.id)
            pwd.mkdir(parents=True, exist_ok=True)
            env = get_config()

        else:
            raise ValueError(f"Unknown worker type: {cls.name}")
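
For a queued Binary, this branch boils down to spawning `python -m archivebox run --binary-id <id>` inside a per-install scratch directory. A rough sketch of the path layout it computes (DATA_DIR and the ids below are invented placeholders, not values from the codebase):

```python
from pathlib import Path

# Invented placeholders standing in for settings.DATA_DIR, Machine.current().id,
# and a queued Binary row:
DATA_DIR = Path('/data')
machine_id = 'machine-example-id'
binary_name, binary_id = 'wget', 'binary-example-id'

# Mirrors the pwd computed above: one directory per (machine, binary, install)
pwd = DATA_DIR / 'machines' / machine_id / 'binaries' / binary_name / binary_id
print(pwd)  # /data/machines/machine-example-id/binaries/wget/binary-example-id
```
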
@@ -654,16 +668,8 @@ class SnapshotWorker(Worker):
        hooks = discover_hooks('Snapshot', config=config)
        hooks = sorted(hooks, key=lambda h: h.name)  # Sort by name (includes step prefix)

        import sys
        print(f'[SnapshotWorker] Discovered {len(hooks)} hooks for snapshot {self.snapshot.url}', file=sys.stderr, flush=True)
        if hooks:
            print(f'[SnapshotWorker] First 5 hooks: {[h.name for h in hooks[:5]]}', file=sys.stderr, flush=True)
        else:
            print(f'[SnapshotWorker] WARNING: No hooks discovered! Config keys: {list(config.keys())[:10]}...', file=sys.stderr, flush=True)

        # Execute each hook sequentially
        for hook_path in hooks:
            print(f'[SnapshotWorker] Running hook: {hook_path.name}', file=sys.stderr, flush=True)
            hook_name = hook_path.name
            plugin = self._extract_plugin_name(hook_name)
            hook_step = extract_step(hook_name)
@@ -829,8 +835,96 @@ class SnapshotWorker(Worker):
        return name


class BinaryWorker(Worker):
    """
    Worker that processes a specific Binary installation.

    Like CrawlWorker and SnapshotWorker, BinaryWorker:
    - Processes one specific binary (specified by binary_id)
    - Installs it via Binary.run() which runs on_Binary__* hooks
    - Exits when done

    Orchestrator spawns BinaryWorkers sequentially (MAX_BINARY_WORKERS=1) to avoid
    conflicts during binary installations.
    """

    name: ClassVar[str] = 'binary'
    MAX_TICK_TIME: ClassVar[int] = 600       # 10 minutes for binary installations
    MAX_CONCURRENT_TASKS: ClassVar[int] = 1  # One binary per worker

    def __init__(self, binary_id: str, worker_id: int = 0):
        self.binary_id = binary_id
        super().__init__(worker_id=worker_id)

    def get_model(self):
        from archivebox.machine.models import Binary
        return Binary

    def get_next_item(self):
        """Get the specific binary to install."""
        from archivebox.machine.models import Binary

        try:
            return Binary.objects.get(id=self.binary_id)
        except Binary.DoesNotExist:
            return None

    def runloop(self) -> None:
        """Install the specified binary."""
        import sys

        self.on_startup()

        try:
            binary = self.get_next_item()

            if not binary:
                log_worker_event(
                    worker_type='BinaryWorker',
                    event=f'Binary {self.binary_id} not found',
                    indent_level=1,
                    pid=self.pid,
                )
                return

            print(f'[cyan]🔧 BinaryWorker installing: {binary.name}[/cyan]', file=sys.stderr)

            # Tick the state machine to trigger installation
            # This calls BinaryMachine.on_install() -> Binary.run() -> on_Binary__* hooks
            binary.sm.tick()

            # Check result
            binary.refresh_from_db()
            if binary.status == Binary.StatusChoices.INSTALLED:
                log_worker_event(
                    worker_type='BinaryWorker',
                    event=f'Installed: {binary.name} -> {binary.abspath}',
                    indent_level=1,
                    pid=self.pid,
                )
            else:
                log_worker_event(
                    worker_type='BinaryWorker',
                    event=f'Installation pending: {binary.name} (status={binary.status})',
                    indent_level=1,
                    pid=self.pid,
                )

        except Exception as e:
            log_worker_event(
                worker_type='BinaryWorker',
                event='Failed to install binary',
                indent_level=1,
                pid=self.pid,
                error=e,
            )
        finally:
            self.on_shutdown()


# Populate the registry
WORKER_TYPES.update({
    'binary': BinaryWorker,
    'crawl': CrawlWorker,
    'snapshot': SnapshotWorker,
})
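
With the registry populated, dispatching by worker name becomes a plain dict lookup. A sketch of how `archivebox run --binary-id ...` plausibly resolves to the class above (the dispatcher shown here is assumed for illustration, not quoted from the codebase):

```python
# Hypothetical dispatcher built on the WORKER_TYPES registry above:
def run_worker(worker_name: str, **kwargs) -> None:
    WorkerClass = WORKER_TYPES[worker_name]  # e.g. 'binary' -> BinaryWorker
    worker = WorkerClass(**kwargs)           # BinaryWorker(binary_id=...)
    worker.runloop()                         # installs one binary, then exits

# run_worker('binary', binary_id=str(binary.id))
```
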
@@ -239,6 +239,8 @@ for test_dir in $TEST_DIRS; do
    PYTEST_CMD="python -m pytest $test_dir -p no:django -v --tb=short"
    if [ "$ENABLE_COVERAGE" = true ]; then
        PYTEST_CMD="$PYTEST_CMD --cov=$plugin_name --cov-append --cov-branch"
        echo "[DEBUG] NODE_V8_COVERAGE before pytest: $NODE_V8_COVERAGE"
        python -c "import os; print('[DEBUG BASH->PYTHON] NODE_V8_COVERAGE:', os.environ.get('NODE_V8_COVERAGE', 'NOT_SET'))"
    fi

    if eval "$PYTEST_CMD" 2>&1 | grep -v "^platform\|^cachedir\|^rootdir\|^configfile\|^plugins:" | tail -100; then
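
The two debug lines probe the same thing from both sides of the process boundary: whether NODE_V8_COVERAGE set in the shell actually reaches the Python (and ultimately Node) children. The equivalent check in pure Python, as a self-contained sketch (the coverage path is an example value):

```python
import os
import subprocess
import sys

# Exported env vars are inherited by child processes by default; this mirrors
# the bash echo + `python -c` pair above.
os.environ['NODE_V8_COVERAGE'] = '/tmp/v8-coverage'
out = subprocess.run(
    [sys.executable, '-c',
     "import os; print(os.environ.get('NODE_V8_COVERAGE', 'NOT_SET'))"],
    capture_output=True, text=True, check=True,
)
assert out.stdout.strip() == '/tmp/v8-coverage'
```
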
@@ -1878,7 +1878,7 @@ Updated `archivebox/core/statemachines.py`:
|--------|------|--------|-------|
| favicon | `on_Snapshot__11_favicon.py` | ✅ UPDATED | Now outputs clean JSONL |
| git | `on_Snapshot__12_git.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |
| archive_org | `on_Snapshot__13_archive_org.py` | ✅ UPDATED | Now outputs clean JSONL |
| archivedotorg | `on_Snapshot__13_archivedotorg.py` | ✅ UPDATED | Now outputs clean JSONL |
| title | `on_Snapshot__32_title.js` | ✅ UPDATED | Now outputs clean JSONL |
| singlefile | `on_Snapshot__37_singlefile.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |
| wget | `on_Snapshot__50_wget.py` | ✅ UPDATED | Now outputs clean JSONL with cmd |
@@ -1930,7 +1930,7 @@ The following hooks have been renamed with `.bg.` suffix:
- `archivebox/core/migrations/0030_migrate_output_field.py` (new)

### Plugins Updated (Python Hooks)
- `archivebox/plugins/archive_org/on_Snapshot__13_archive_org.py`
- `archivebox/plugins/archivedotorg/on_Snapshot__13_archivedotorg.py`
- `archivebox/plugins/favicon/on_Snapshot__11_favicon.py`
- `archivebox/plugins/git/on_Snapshot__12_git.py`
- `archivebox/plugins/media/on_Snapshot__51_media.py`