Files
ArchiveBox/archivebox/tests/test_cli_run_binary_worker.py
Nick Sweeting 934e02695b fix lint
2026-03-15 18:45:29 -07:00

343 lines
12 KiB
Python

"""
Tests for BinaryWorker processing of the Binary queue.

Tests cover:
- BinaryWorker is spawned by Orchestrator when Binary queue has work
- Binary hooks (on_Binary__*) actually run and install binaries
- Binary status transitions from QUEUED -> INSTALLED
- BinaryWorker exits after idle timeout
"""
import json
import sqlite3
from contextlib import closing

from archivebox.tests.conftest import (
    run_archivebox_cmd,
)
class TestBinaryWorkerSpawning:
    """Tests for BinaryWorker lifecycle.

    Every test pipes a single ``Binary`` JSON record into ``archivebox run``
    and then inspects the ``machine_binary`` table of the archive's
    ``index.sqlite3`` directly.
    """

    def test_binary_worker_spawns_when_binary_queued(self, initialized_archive):
        """Orchestrator spawns BinaryWorker when Binary queue has work."""
        # Create a Binary record via CLI
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',  # Use env provider to detect system python
        }

        # Use `archivebox run` to create the Binary (this queues it)
        _stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=60,  # Increased timeout to allow for binary installation
        )
        assert code == 0, f"Failed to create Binary: {stderr}"

        # Verify Binary was created in DB. closing() releases the handle even
        # if the query raises; sqlite3's own context manager only manages
        # transactions and does not close the connection.
        with closing(sqlite3.connect(initialized_archive / 'index.sqlite3')) as conn:
            binaries = conn.execute(
                "SELECT name, status, abspath FROM machine_binary WHERE name='python3'"
            ).fetchall()

        assert len(binaries) >= 1, "Binary was not created in database"
        name, status, _abspath = binaries[0]
        assert name == 'python3'

        # Status should be INSTALLED after BinaryWorker processed it
        # (or QUEUED if worker timed out before installing)
        assert status in ['installed', 'queued']

    def test_binary_hooks_actually_run(self, initialized_archive):
        """Binary installation hooks (on_Binary__*) run and update abspath."""
        # Create a Binary for python3 (guaranteed to exist on system)
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }

        _stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0, f"Failed to process Binary: {stderr}"

        # Query database to check if hooks ran and populated abspath
        with closing(sqlite3.connect(initialized_archive / 'index.sqlite3')) as conn:
            result = conn.execute(
                "SELECT name, status, abspath, version FROM machine_binary WHERE name='python3'"
            ).fetchone()

        assert result is not None, "Binary not found in database"
        _name, status, abspath, version = result

        # If hooks ran successfully, abspath should be populated.
        # A row that is not 'installed' is accepted without further checks.
        if status == 'installed':
            assert abspath, f"Binary installed but abspath is empty: {abspath}"
            assert '/python3' in abspath or '\\python3' in abspath, \
                f"abspath doesn't look like a python3 path: {abspath}"
            # Version should also be populated
            assert version, f"Binary installed but version is empty: {version}"

    def test_binary_status_transitions(self, initialized_archive):
        """Binary status correctly transitions QUEUED -> INSTALLED."""
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }

        # Create and process the Binary
        _stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0, f"archivebox run failed: {stderr}"

        # Check final status
        with closing(sqlite3.connect(initialized_archive / 'index.sqlite3')) as conn:
            status = conn.execute(
                "SELECT status FROM machine_binary WHERE name='python3'"
            ).fetchone()

        assert status is not None
        # Should be installed (or queued if worker timed out)
        assert status[0] in ['installed', 'queued']
class TestBinaryWorkerHooks:
    """Tests for specific Binary hook providers.

    Each test feeds ``Binary`` JSON records to ``archivebox run`` on stdin and
    then reads the resulting rows straight out of ``index.sqlite3``.
    """

    def test_env_provider_hook_detects_system_binary(self, initialized_archive):
        """on_Binary__15_env_discover.py hook detects system binaries."""
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }

        _stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0, f"archivebox run failed: {stderr}"

        # Check that env provider hook populated the Binary. closing() makes
        # sure the connection is released even when the query raises.
        with closing(sqlite3.connect(initialized_archive / 'index.sqlite3')) as conn:
            result = conn.execute(
                "SELECT binprovider, abspath FROM machine_binary WHERE name='python3' AND status='installed'"
            ).fetchone()

        # Only rows that reached 'installed' are checked; when there is no
        # such row the test makes no further assertions.
        if result:
            binprovider, abspath = result
            assert binprovider == 'env', f"Expected env provider, got: {binprovider}"
            assert abspath, "abspath should be populated by env provider"

    def test_multiple_binaries_processed_in_batch(self, initialized_archive):
        """BinaryWorker processes multiple queued binaries."""
        # Create multiple Binary records
        binaries = [
            {'type': 'Binary', 'name': 'python3', 'binproviders': 'env'},
            {'type': 'Binary', 'name': 'curl', 'binproviders': 'env'},
        ]
        # One JSON document per line on stdin.
        stdin = '\n'.join(json.dumps(b) for b in binaries)

        _stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=stdin,
            data_dir=initialized_archive,
            timeout=90,  # Need more time for multiple binaries
        )
        assert code == 0, f"archivebox run failed: {stderr}"

        # The query does not filter on status, so this only verifies that the
        # records were created, not that they finished installing.
        with closing(sqlite3.connect(initialized_archive / 'index.sqlite3')) as conn:
            rows = conn.execute(
                "SELECT name FROM machine_binary WHERE name IN ('python3', 'curl')"
            ).fetchall()

        assert len(rows) >= 1, "At least one binary should be created"

    def test_puppeteer_binary_sets_skip_download_for_hooks(self, initialized_archive):
        """Puppeteer installs expose skip-download env to Binary hooks."""
        # Drop a throwaway plugin into the archive dir. Its hook prints a
        # Binary record pointing at python3 plus a Machine record echoing the
        # PUPPETEER_SKIP_* variables it saw in its environment.
        user_plugins_dir = initialized_archive / 'test_plugins'
        plugin_dir = user_plugins_dir / 'inspectnpm'
        plugin_dir.mkdir(parents=True, exist_ok=True)
        hook = plugin_dir / 'on_Binary__10_inspectnpm_install.py'
        hook.write_text(
            """#!/usr/bin/env python3
import argparse
import json
import os
import shutil
import sys
parser = argparse.ArgumentParser()
parser.add_argument('--machine-id', required=True)
parser.add_argument('--binary-id', required=True)
parser.add_argument('--name', required=True)
parser.add_argument('--binproviders', default='*')
args = parser.parse_args()
record = {
'type': 'Binary',
'name': args.name,
'abspath': shutil.which('python3') or sys.executable,
'version': '1.0.0',
'sha256': '',
'binprovider': 'inspectnpm',
'machine_id': args.machine_id,
'binary_id': args.binary_id,
}
print(json.dumps(record))
print(json.dumps({
'type': 'Machine',
'config': {
'SEEN_PUPPETEER_SKIP_DOWNLOAD': os.environ.get('PUPPETEER_SKIP_DOWNLOAD', ''),
'SEEN_PUPPETEER_SKIP_CHROMIUM_DOWNLOAD': os.environ.get('PUPPETEER_SKIP_CHROMIUM_DOWNLOAD', ''),
},
}))
"""
        )

        binary_record = {
            'type': 'Binary',
            'name': 'puppeteer',
            'binproviders': 'inspectnpm',
        }

        _stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            env={
                'ARCHIVEBOX_USER_PLUGINS_DIR': str(user_plugins_dir),
                'PLUGINS': 'inspectnpm',
            },
            timeout=60,
        )
        assert code == 0, f"Failed to process puppeteer Binary: {stderr}"

        with closing(sqlite3.connect(initialized_archive / 'index.sqlite3')) as conn:
            result = conn.execute(
                "SELECT status, binprovider FROM machine_binary WHERE name='puppeteer'"
            ).fetchone()
            hook_rows = conn.execute(
                "SELECT cmd, env FROM machine_process WHERE process_type='hook' ORDER BY created_at DESC"
            ).fetchall()

        assert result is not None, "Puppeteer binary not found in database"
        status, binprovider = result
        assert status == 'installed', f"Expected puppeteer to install, got: {status}"
        assert binprovider == 'inspectnpm', f"Expected inspectnpm provider, got: {binprovider}"

        # cmd and env are JSON-decoded from their columns. Take the env of the
        # most recent hook process whose command line mentions the plugin;
        # the generator is lazy, so env is only decoded for the matching row.
        hook_env = next(
            (
                json.loads(env_json)
                for cmd_json, env_json in hook_rows
                if any('inspectnpm' in part for part in json.loads(cmd_json))
            ),
            None,
        )

        assert hook_env is not None, "Inspectnpm hook process not found"
        assert hook_env.get('PUPPETEER_SKIP_DOWNLOAD') == 'true'
        assert hook_env.get('PUPPETEER_SKIP_CHROMIUM_DOWNLOAD') == 'true'
class TestBinaryWorkerEdgeCases:
    """Tests for edge cases and error handling.

    Each test pipes a ``Binary`` JSON record into ``archivebox run`` and then
    checks the ``machine_binary`` table of ``index.sqlite3`` directly.
    """

    def test_nonexistent_binary_stays_queued(self, initialized_archive):
        """Binary that doesn't exist stays queued (doesn't fail permanently)."""
        binary_record = {
            'type': 'Binary',
            'name': 'nonexistent-binary-xyz-12345',
            'binproviders': 'env',
        }

        _stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        # Command should still succeed (orchestrator doesn't fail on binary install failures)
        assert code == 0, f"archivebox run failed: {stderr}"

        # Binary should remain queued (not installed). closing() releases the
        # connection even if the query raises.
        with closing(sqlite3.connect(initialized_archive / 'index.sqlite3')) as conn:
            result = conn.execute(
                "SELECT status FROM machine_binary WHERE name='nonexistent-binary-xyz-12345'"
            ).fetchone()

        # A missing row is tolerated: the status is only checked when the
        # record exists.
        if result:
            status = result[0]
            # Should stay queued since installation failed
            assert status == 'queued', f"Expected queued, got: {status}"

    def test_binary_worker_respects_machine_isolation(self, initialized_archive):
        """BinaryWorker only processes binaries for current machine."""
        # This is implicitly tested by other tests - Binary.objects.filter(machine=current)
        # ensures only current machine's binaries are processed
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }

        _stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0, f"archivebox run failed: {stderr}"

        # Check that machine_id is set correctly. Only a non-empty value is
        # asserted here; it is not compared against any particular machine.
        with closing(sqlite3.connect(initialized_archive / 'index.sqlite3')) as conn:
            result = conn.execute(
                "SELECT machine_id FROM machine_binary WHERE name='python3'"
            ).fetchone()

        assert result is not None
        machine_id = result[0]
        assert machine_id, "machine_id should be set on Binary"