Refactor ArchiveBox onto abx-dl bus runner

This commit is contained in:
Nick Sweeting
2026-03-21 11:47:57 -07:00
parent ee9ed440d1
commit c87079aa0a
45 changed files with 1282 additions and 6396 deletions

View File

@@ -1,133 +0,0 @@
import os
import signal
import sqlite3
import subprocess
import sys
import time
from pathlib import Path
def _run(cmd, data_dir: Path, env: dict, timeout: int = 120):
return subprocess.run(
cmd,
cwd=data_dir,
env=env,
capture_output=True,
text=True,
timeout=timeout,
)
def _make_env(data_dir: Path) -> dict:
env = os.environ.copy()
env["DATA_DIR"] = str(data_dir)
env["USE_COLOR"] = "False"
env["SHOW_PROGRESS"] = "False"
env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
env["PLUGINS"] = "favicon"
# Keep it fast but still real hooks
env["SAVE_FAVICON"] = "True"
env["SAVE_TITLE"] = "False"
env["SAVE_WGET"] = "False"
env["SAVE_WARC"] = "False"
env["SAVE_PDF"] = "False"
env["SAVE_SCREENSHOT"] = "False"
env["SAVE_DOM"] = "False"
env["SAVE_SINGLEFILE"] = "False"
env["SAVE_READABILITY"] = "False"
env["SAVE_MERCURY"] = "False"
env["SAVE_GIT"] = "False"
env["SAVE_YTDLP"] = "False"
env["SAVE_HEADERS"] = "False"
env["SAVE_HTMLTOTEXT"] = "False"
return env
def _count_running_processes(db_path: Path, where: str) -> int:
for _ in range(50):
try:
conn = sqlite3.connect(db_path, timeout=1)
cur = conn.cursor()
count = cur.execute(
f"SELECT COUNT(*) FROM machine_process WHERE status = 'running' AND {where}"
).fetchone()[0]
conn.close()
return count
except sqlite3.OperationalError:
time.sleep(0.1)
return 0
def _wait_for_count(db_path: Path, where: str, target: int, timeout: int = 20) -> bool:
    """Poll until at least *target* matching running processes exist.

    Returns True as soon as the count reaches *target*, False if the
    timeout elapses first.
    """
    deadline = time.time() + timeout
    while time.time() < deadline:
        if _count_running_processes(db_path, where) >= target:
            return True
        time.sleep(0.1)
    return False
def test_add_parents_workers_to_orchestrator(tmp_path):
    """`archivebox add` must parent its crawl workers to the orchestrator process."""
    data_dir = tmp_path / "data"
    data_dir.mkdir()
    env = _make_env(data_dir)

    # Initialize a fresh archive, then run one real (favicon-only) add.
    init_result = _run([sys.executable, "-m", "archivebox", "init", "--quick"], data_dir, env)
    assert init_result.returncode == 0, init_result.stderr
    add_result = _run(
        [sys.executable, "-m", "archivebox", "add", "--plugins=favicon", "https://example.com"],
        data_dir,
        env,
        timeout=120,
    )
    assert add_result.returncode == 0, add_result.stderr

    # Inspect the process table: the newest orchestrator must own >=1 crawl worker.
    conn = sqlite3.connect(data_dir / "index.sqlite3")
    cursor = conn.cursor()
    orchestrator_row = cursor.execute(
        "SELECT id FROM machine_process WHERE process_type = 'orchestrator' ORDER BY created_at DESC LIMIT 1"
    ).fetchone()
    assert orchestrator_row is not None
    crawl_worker_count = cursor.execute(
        "SELECT COUNT(*) FROM machine_process WHERE process_type = 'worker' AND worker_type = 'crawl' "
        "AND parent_id = ?",
        (orchestrator_row[0],),
    ).fetchone()[0]
    conn.close()
    assert crawl_worker_count >= 1, "Expected crawl worker to be parented to orchestrator"
def test_add_interrupt_cleans_orphaned_processes(tmp_path):
    """SIGINT during `archivebox add` must not leave running workers/hooks behind."""
    data_dir = tmp_path / "data"
    data_dir.mkdir()
    env = _make_env(data_dir)

    init_result = _run([sys.executable, "-m", "archivebox", "init", "--quick"], data_dir, env)
    assert init_result.returncode == 0, init_result.stderr

    # Start the add in the background so we can interrupt it mid-crawl.
    add_proc = subprocess.Popen(
        [sys.executable, "-m", "archivebox", "add", "--plugins=favicon", "https://example.com"],
        cwd=data_dir,
        env=env,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    db_path = data_dir / "index.sqlite3"
    worker_started = _wait_for_count(db_path, "process_type = 'worker'", 1, timeout=20)
    assert worker_started, "Expected at least one worker to start before interrupt"

    add_proc.send_signal(signal.SIGINT)
    add_proc.wait(timeout=30)

    # Give the runner up to 30s to reap its worker/hook children.
    leftover_filter = "process_type IN ('worker','hook')"
    deadline = time.time() + 30
    while time.time() < deadline:
        if _count_running_processes(db_path, leftover_filter) == 0:
            break
        time.sleep(0.2)
    assert _count_running_processes(db_path, leftover_filter) == 0, (
        "Expected no running worker/hook processes after interrupt"
    )

View File

@@ -1,5 +1,5 @@
"""
Tests for JSONL piping contracts and `archivebox run` / `archivebox orchestrator`.
Tests for JSONL piping contracts and `archivebox run`.
This file covers both:
- low-level JSONL/stdin parsing behavior that makes CLI piping work
@@ -252,8 +252,8 @@ def test_snapshot_list_stdout_pipes_into_run(initialized_archive):
assert snapshot_status == "sealed"
def test_archiveresult_list_stdout_pipes_into_orchestrator_alias(initialized_archive):
"""`archivebox archiveresult list | archivebox orchestrator` should preserve clean JSONL stdout."""
def test_archiveresult_list_stdout_pipes_into_run(initialized_archive):
"""`archivebox archiveresult list | archivebox run` should preserve clean JSONL stdout."""
url = create_test_url()
snapshot_stdout, snapshot_stderr, snapshot_code = run_archivebox_cmd(
@@ -279,18 +279,17 @@ def test_archiveresult_list_stdout_pipes_into_orchestrator_alias(initialized_arc
assert list_code == 0, list_stderr
_assert_stdout_is_jsonl_only(list_stdout)
orchestrator_stdout, orchestrator_stderr, orchestrator_code = run_archivebox_cmd(
["orchestrator"],
run_stdout, run_stderr, run_code = run_archivebox_cmd(
["run"],
stdin=list_stdout,
data_dir=initialized_archive,
timeout=120,
env=PIPE_TEST_ENV,
)
assert orchestrator_code == 0, orchestrator_stderr
_assert_stdout_is_jsonl_only(orchestrator_stdout)
assert "renamed to `archivebox run`" in orchestrator_stderr
assert run_code == 0, run_stderr
_assert_stdout_is_jsonl_only(run_stdout)
run_records = parse_jsonl_output(orchestrator_stdout)
run_records = parse_jsonl_output(run_stdout)
assert any(
record.get("type") == "ArchiveResult" and record.get("id") == archiveresult["id"]
for record in run_records

View File

@@ -1,342 +0,0 @@
"""
Tests for BinaryWorker processing Binary queue.
Tests cover:
- BinaryWorker is spawned by Orchestrator when Binary queue has work
- Binary hooks (on_Binary__*) actually run and install binaries
- Binary status transitions from QUEUED -> INSTALLED
- BinaryWorker exits after idle timeout
"""
import json
import sqlite3
from archivebox.tests.conftest import (
run_archivebox_cmd,
)
class TestBinaryWorkerSpawning:
    """Tests for BinaryWorker lifecycle."""

    def test_binary_worker_spawns_when_binary_queued(self, initialized_archive):
        """Orchestrator spawns BinaryWorker when Binary queue has work."""
        # Create a Binary record via CLI
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',  # Use env provider to detect system python
        }
        # Use `archivebox run` to create the Binary (this queues it)
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=60,  # Increased timeout to allow for binary installation
        )
        assert code == 0, f"Failed to create Binary: {stderr}"
        # Verify Binary was created in DB
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        binaries = c.execute(
            "SELECT name, status, abspath FROM machine_binary WHERE name='python3'"
        ).fetchall()
        conn.close()
        assert len(binaries) >= 1, "Binary was not created in database"
        name, status, abspath = binaries[0]
        assert name == 'python3'
        # Status should be INSTALLED after BinaryWorker processed it
        # (or QUEUED if worker timed out before installing)
        assert status in ['installed', 'queued']

    def test_binary_hooks_actually_run(self, initialized_archive):
        """Binary installation hooks (on_Binary__*) run and update abspath."""
        # Create a Binary for python3 (guaranteed to exist on system)
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0, f"Failed to process Binary: {stderr}"
        # Query database to check if hooks ran and populated abspath
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT name, status, abspath, version FROM machine_binary WHERE name='python3'"
        ).fetchone()
        conn.close()
        assert result is not None, "Binary not found in database"
        name, status, abspath, version = result
        # If hooks ran successfully, abspath should be populated.
        # NOTE(review): deliberately conditional — the worker may legitimately
        # time out leaving the record queued; only assert on success.
        if status == 'installed':
            assert abspath, f"Binary installed but abspath is empty: {abspath}"
            assert '/python3' in abspath or '\\python3' in abspath, \
                f"abspath doesn't look like a python3 path: {abspath}"
            # Version should also be populated
            assert version, f"Binary installed but version is empty: {version}"

    def test_binary_status_transitions(self, initialized_archive):
        """Binary status correctly transitions QUEUED -> INSTALLED."""
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }
        # Create and process the Binary
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0
        # Check final status
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        status = c.execute(
            "SELECT status FROM machine_binary WHERE name='python3'"
        ).fetchone()
        conn.close()
        assert status is not None
        # Should be installed (or queued if worker timed out)
        assert status[0] in ['installed', 'queued']
class TestBinaryWorkerHooks:
    """Tests for specific Binary hook providers."""

    def test_env_provider_hook_detects_system_binary(self, initialized_archive):
        """on_Binary__15_env_discover.py hook detects system binaries."""
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0
        # Check that env provider hook populated the Binary
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT binprovider, abspath FROM machine_binary WHERE name='python3' AND status='installed'"
        ).fetchone()
        conn.close()
        # Only assert when the install actually completed (row may be absent if queued)
        if result:
            binprovider, abspath = result
            assert binprovider == 'env', f"Expected env provider, got: {binprovider}"
            assert abspath, "abspath should be populated by env provider"

    def test_multiple_binaries_processed_in_batch(self, initialized_archive):
        """BinaryWorker processes multiple queued binaries."""
        # Create multiple Binary records, piped as one JSONL stream
        binaries = [
            {'type': 'Binary', 'name': 'python3', 'binproviders': 'env'},
            {'type': 'Binary', 'name': 'curl', 'binproviders': 'env'},
        ]
        stdin = '\n'.join(json.dumps(b) for b in binaries)
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=stdin,
            data_dir=initialized_archive,
            timeout=90,  # Need more time for multiple binaries
        )
        assert code == 0
        # Both should be processed
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        installed = c.execute(
            "SELECT name FROM machine_binary WHERE name IN ('python3', 'curl')"
        ).fetchall()
        conn.close()
        assert len(installed) >= 1, "At least one binary should be created"

    def test_puppeteer_binary_sets_skip_download_for_hooks(self, initialized_archive):
        """Puppeteer installs expose skip-download env to Binary hooks."""
        # Install a throwaway user plugin whose hook echoes back the env it saw
        # via a Machine config record, so we can assert on the hook's environment.
        user_plugins_dir = initialized_archive / 'test_plugins'
        plugin_dir = user_plugins_dir / 'inspectnpm'
        plugin_dir.mkdir(parents=True, exist_ok=True)
        hook = plugin_dir / 'on_Binary__10_inspectnpm_install.py'
        hook.write_text(
            """#!/usr/bin/env python3
import argparse
import json
import os
import shutil
import sys
parser = argparse.ArgumentParser()
parser.add_argument('--machine-id', required=True)
parser.add_argument('--binary-id', required=True)
parser.add_argument('--name', required=True)
parser.add_argument('--binproviders', default='*')
args = parser.parse_args()
record = {
    'type': 'Binary',
    'name': args.name,
    'abspath': shutil.which('python3') or sys.executable,
    'version': '1.0.0',
    'sha256': '',
    'binprovider': 'inspectnpm',
    'machine_id': args.machine_id,
    'binary_id': args.binary_id,
}
print(json.dumps(record))
print(json.dumps({
    'type': 'Machine',
    'config': {
        'SEEN_PUPPETEER_SKIP_DOWNLOAD': os.environ.get('PUPPETEER_SKIP_DOWNLOAD', ''),
        'SEEN_PUPPETEER_SKIP_CHROMIUM_DOWNLOAD': os.environ.get('PUPPETEER_SKIP_CHROMIUM_DOWNLOAD', ''),
    },
}))
"""
        )
        binary_record = {
            'type': 'Binary',
            'name': 'puppeteer',
            'binproviders': 'inspectnpm',
        }
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            env={
                'ARCHIVEBOX_USER_PLUGINS_DIR': str(user_plugins_dir),
                'PLUGINS': 'inspectnpm',
            },
            timeout=60,
        )
        assert code == 0, f"Failed to process puppeteer Binary: {stderr}"
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT status, binprovider FROM machine_binary WHERE name='puppeteer'"
        ).fetchone()
        hook_rows = c.execute(
            "SELECT cmd, env FROM machine_process WHERE process_type='hook' ORDER BY created_at DESC"
        ).fetchall()
        conn.close()
        assert result is not None, "Puppeteer binary not found in database"
        status, binprovider = result
        assert status == 'installed', f"Expected puppeteer to install, got: {status}"
        assert binprovider == 'inspectnpm', f"Expected inspectnpm provider, got: {binprovider}"
        # Find the recorded hook process for our plugin and inspect the env it ran with
        hook_env = None
        for cmd_json, env_json in hook_rows:
            cmd = json.loads(cmd_json)
            if any('inspectnpm' in part for part in cmd):
                hook_env = json.loads(env_json)
                break
        assert hook_env is not None, "Inspectnpm hook process not found"
        assert hook_env.get('PUPPETEER_SKIP_DOWNLOAD') == 'true'
        assert hook_env.get('PUPPETEER_SKIP_CHROMIUM_DOWNLOAD') == 'true'
class TestBinaryWorkerEdgeCases:
    """Tests for edge cases and error handling."""

    def test_nonexistent_binary_stays_queued(self, initialized_archive):
        """Binary that doesn't exist stays queued (doesn't fail permanently)."""
        binary_record = {
            'type': 'Binary',
            'name': 'nonexistent-binary-xyz-12345',
            'binproviders': 'env',
        }
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        # Command should still succeed (orchestrator doesn't fail on binary install failures)
        assert code == 0
        # Binary should remain queued (not installed)
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT status FROM machine_binary WHERE name='nonexistent-binary-xyz-12345'"
        ).fetchone()
        conn.close()
        # Row may be absent entirely; only assert status when it exists
        if result:
            status = result[0]
            # Should stay queued since installation failed
            assert status == 'queued', f"Expected queued, got: {status}"

    def test_binary_worker_respects_machine_isolation(self, initialized_archive):
        """BinaryWorker only processes binaries for current machine."""
        # This is implicitly tested by other tests - Binary.objects.filter(machine=current)
        # ensures only current machine's binaries are processed
        binary_record = {
            'type': 'Binary',
            'name': 'python3',
            'binproviders': 'env',
        }
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(binary_record),
            data_dir=initialized_archive,
            timeout=30,
        )
        assert code == 0
        # Check that machine_id is set correctly
        conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
        c = conn.cursor()
        result = c.execute(
            "SELECT machine_id FROM machine_binary WHERE name='python3'"
        ).fetchone()
        conn.close()
        assert result is not None
        machine_id = result[0]
        assert machine_id, "machine_id should be set on Binary"

View File

@@ -369,9 +369,9 @@ class TestProcessCurrent(TestCase):
self.assertEqual(proc1.id, proc2.id)
def test_process_detect_type_orchestrator(self):
"""_detect_process_type should detect orchestrator."""
with patch('sys.argv', ['archivebox', 'manage', 'orchestrator']):
def test_process_detect_type_runner(self):
"""_detect_process_type should detect the background runner command."""
with patch('sys.argv', ['archivebox', 'run', '--daemon']):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)
@@ -381,11 +381,11 @@ class TestProcessCurrent(TestCase):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.CLI)
def test_process_detect_type_worker(self):
"""_detect_process_type should detect workers."""
with patch('sys.argv', ['python', '-m', 'crawl_worker']):
def test_process_detect_type_binary(self):
"""_detect_process_type should detect non-ArchiveBox subprocesses as binary processes."""
with patch('sys.argv', ['/usr/bin/wget', 'https://example.com']):
result = Process._detect_process_type()
self.assertEqual(result, Process.TypeChoices.WORKER)
self.assertEqual(result, Process.TypeChoices.BINARY)
class TestProcessHierarchy(TestCase):

View File

@@ -1,484 +0,0 @@
"""
Unit tests for the Orchestrator and Worker classes.
Tests cover:
1. Orchestrator lifecycle (startup, shutdown)
2. Queue polling and worker spawning
3. Idle detection and exit logic
4. Worker registration and management
5. Process model methods (replacing old pid_utils)
"""
import os
import time
from datetime import datetime, timedelta
from unittest.mock import patch
from typing import ClassVar
import pytest
from django.test import TestCase
from django.utils import timezone
from archivebox.workers.orchestrator import Orchestrator
from archivebox.workers.worker import Worker
class FakeWorker(Worker):
    # Minimal Worker stand-in so Orchestrator spawning logic can be unit-tested
    # without launching real worker processes.
    name: ClassVar[str] = 'crawl'
    MAX_CONCURRENT_TASKS: ClassVar[int] = 5
    # Class-level list lets each test dictate what get_running_workers() reports.
    running_workers: ClassVar[list[dict[str, object]]] = []

    @classmethod
    def get_running_workers(cls) -> list[dict[str, object]]:
        return cls.running_workers
class TestOrchestratorUnit(TestCase):
    """Unit tests for Orchestrator class (mocked dependencies)."""

    def test_orchestrator_creation(self):
        """Orchestrator should initialize with correct defaults."""
        orchestrator = Orchestrator(exit_on_idle=True)
        self.assertTrue(orchestrator.exit_on_idle)
        self.assertEqual(orchestrator.idle_count, 0)
        self.assertIsNone(orchestrator.pid_file)

    def test_orchestrator_repr(self):
        """Orchestrator __repr__ should include PID."""
        orchestrator = Orchestrator()
        repr_str = repr(orchestrator)
        self.assertIn('Orchestrator', repr_str)
        self.assertIn(str(os.getpid()), repr_str)

    def test_has_pending_work(self):
        """has_pending_work should check if any queue has items."""
        orchestrator = Orchestrator()
        self.assertFalse(orchestrator.has_pending_work({'crawl': 0, 'snapshot': 0}))
        self.assertTrue(orchestrator.has_pending_work({'crawl': 0, 'snapshot': 5}))
        self.assertTrue(orchestrator.has_pending_work({'crawl': 10, 'snapshot': 0}))

    def test_should_exit_not_exit_on_idle(self):
        """should_exit should return False when exit_on_idle is False."""
        orchestrator = Orchestrator(exit_on_idle=False)
        orchestrator.idle_count = 100  # well past any idle threshold
        self.assertFalse(orchestrator.should_exit({'crawl': 0}))

    def test_should_exit_pending_work(self):
        """should_exit should return False when there's pending work."""
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.idle_count = 100
        self.assertFalse(orchestrator.should_exit({'crawl': 5}))

    @patch.object(Orchestrator, 'has_running_workers')
    def test_should_exit_running_workers(self, mock_has_workers):
        """should_exit should return False when workers are running."""
        mock_has_workers.return_value = True
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.idle_count = 100
        self.assertFalse(orchestrator.should_exit({'crawl': 0}))

    @patch.object(Orchestrator, 'has_running_workers')
    @patch.object(Orchestrator, 'has_future_work')
    def test_should_exit_idle_timeout(self, mock_future, mock_workers):
        """should_exit should return True after idle timeout with no work."""
        mock_workers.return_value = False
        mock_future.return_value = False
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.idle_count = orchestrator.IDLE_TIMEOUT
        self.assertTrue(orchestrator.should_exit({'crawl': 0, 'snapshot': 0}))

    @patch.object(Orchestrator, 'has_running_workers')
    @patch.object(Orchestrator, 'has_future_work')
    def test_should_exit_below_idle_timeout(self, mock_future, mock_workers):
        """should_exit should return False below idle timeout."""
        mock_workers.return_value = False
        mock_future.return_value = False
        orchestrator = Orchestrator(exit_on_idle=True)
        orchestrator.idle_count = orchestrator.IDLE_TIMEOUT - 1  # one tick short
        self.assertFalse(orchestrator.should_exit({'crawl': 0}))

    def test_should_spawn_worker_no_queue(self):
        """should_spawn_worker should return False when queue is empty."""
        orchestrator = Orchestrator()
        FakeWorker.running_workers = []
        self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 0))

    def test_should_spawn_worker_at_limit(self):
        """should_spawn_worker should return False when at per-type limit."""
        orchestrator = Orchestrator()
        running_workers: list[dict[str, object]] = [{'worker_id': worker_id} for worker_id in range(orchestrator.MAX_CRAWL_WORKERS)]
        FakeWorker.running_workers = running_workers
        self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 10))

    @patch.object(Orchestrator, 'get_total_worker_count')
    def test_should_spawn_worker_at_total_limit(self, mock_total):
        """should_spawn_worker should return False when at total limit."""
        orchestrator = Orchestrator()
        mock_total.return_value = 0
        running_workers: list[dict[str, object]] = [{'worker_id': worker_id} for worker_id in range(orchestrator.MAX_CRAWL_WORKERS)]
        FakeWorker.running_workers = running_workers
        self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 10))

    @patch.object(Orchestrator, 'get_total_worker_count')
    def test_should_spawn_worker_success(self, mock_total):
        """should_spawn_worker should return True when conditions are met."""
        orchestrator = Orchestrator()
        mock_total.return_value = 0
        FakeWorker.running_workers = []
        self.assertTrue(orchestrator.should_spawn_worker(FakeWorker, 10))

    @patch.object(Orchestrator, 'get_total_worker_count')
    def test_should_spawn_worker_enough_workers(self, mock_total):
        """should_spawn_worker should return False when enough workers for queue."""
        orchestrator = Orchestrator()
        mock_total.return_value = 2
        FakeWorker.running_workers = [{}]  # 1 worker running
        self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 3))
class TestOrchestratorWithProcess(TestCase):
    """Test Orchestrator using Process model for tracking."""

    def setUp(self):
        """Reset process cache."""
        # Module-level caches would leak Process/Machine rows between tests
        import archivebox.machine.models as models
        models._CURRENT_MACHINE = None
        models._CURRENT_PROCESS = None

    def test_is_running_no_orchestrator(self):
        """is_running should return False when no orchestrator process exists."""
        from archivebox.machine.models import Process
        # Clean up any stale processes first
        Process.cleanup_stale_running()
        # Mark any running orchestrators as exited for clean test state
        Process.objects.filter(
            process_type=Process.TypeChoices.ORCHESTRATOR,
            status=Process.StatusChoices.RUNNING
        ).update(status=Process.StatusChoices.EXITED)
        self.assertFalse(Orchestrator.is_running())

    def test_is_running_with_orchestrator_process(self):
        """is_running should return True when orchestrator Process exists."""
        from archivebox.machine.models import Process, Machine
        import psutil
        machine = Machine.current()
        current_proc = psutil.Process(os.getpid())
        # Create an orchestrator Process record
        proc = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.ORCHESTRATOR,
            status=Process.StatusChoices.RUNNING,
            pid=os.getpid(),  # Use current PID so it appears alive
            started_at=datetime.fromtimestamp(current_proc.create_time(), tz=timezone.get_current_timezone()),
            cmd=current_proc.cmdline(),
        )
        try:
            # Should detect running orchestrator
            self.assertTrue(Orchestrator.is_running())
        finally:
            # Clean up
            proc.status = Process.StatusChoices.EXITED
            proc.save()

    def test_orchestrator_uses_process_for_is_running(self):
        """Orchestrator.is_running should use Process.get_running_count."""
        from archivebox.machine.models import Process
        # Verify is_running uses Process model, not pid files
        with patch.object(Process, 'get_running_count') as mock_count:
            mock_count.return_value = 1
            result = Orchestrator.is_running()
        # Should have called Process.get_running_count with orchestrator type
        mock_count.assert_called()
        self.assertTrue(result)

    def test_orchestrator_scoped_worker_count(self):
        """Orchestrator with crawl_id should count only descendant workers."""
        from archivebox.machine.models import Process, Machine
        machine = Machine.current()
        orchestrator = Orchestrator(exit_on_idle=True, crawl_id='test-crawl')
        orchestrator.db_process = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.ORCHESTRATOR,
            status=Process.StatusChoices.RUNNING,
            pid=12345,
            started_at=timezone.now(),
        )
        # Prevent cleanup from marking fake PIDs as exited
        orchestrator._last_cleanup_time = time.time()
        # Worker parented to this orchestrator — should be counted
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.WORKER,
            worker_type='crawl',
            status=Process.StatusChoices.RUNNING,
            pid=12346,
            parent=orchestrator.db_process,
            started_at=timezone.now(),
        )
        # Unparented worker on the same machine — must NOT be counted
        Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.WORKER,
            worker_type='crawl',
            status=Process.StatusChoices.RUNNING,
            pid=12347,
            started_at=timezone.now(),
        )
        self.assertEqual(orchestrator.get_total_worker_count(), 1)
class TestProcessBasedWorkerTracking(TestCase):
    """Test Process model methods that replace pid_utils functionality."""

    def setUp(self):
        """Reset caches."""
        import archivebox.machine.models as models
        models._CURRENT_MACHINE = None
        models._CURRENT_PROCESS = None

    def test_process_current_creates_record(self):
        """Process.current() should create a Process record for current PID."""
        from archivebox.machine.models import Process
        proc = Process.current()
        self.assertIsNotNone(proc)
        self.assertEqual(proc.pid, os.getpid())
        self.assertEqual(proc.status, Process.StatusChoices.RUNNING)
        self.assertIsNotNone(proc.machine)
        self.assertIsNotNone(proc.started_at)

    def test_process_current_caches_result(self):
        """Process.current() should return cached Process within interval."""
        from archivebox.machine.models import Process
        proc1 = Process.current()
        proc2 = Process.current()
        self.assertEqual(proc1.id, proc2.id)

    def test_process_get_running_count(self):
        """Process.get_running_count should count running processes by type."""
        from archivebox.machine.models import Process, Machine
        machine = Machine.current()
        # Create some worker processes
        for i in range(3):
            Process.objects.create(
                machine=machine,
                process_type=Process.TypeChoices.WORKER,
                status=Process.StatusChoices.RUNNING,
                pid=99990 + i,  # Fake PIDs
                started_at=timezone.now(),
            )
        count = Process.get_running_count(process_type=Process.TypeChoices.WORKER)
        # >= because other tests / the test runner itself may add workers
        self.assertGreaterEqual(count, 3)

    def test_process_get_next_worker_id(self):
        """Process.get_next_worker_id should return count of running workers."""
        from archivebox.machine.models import Process, Machine
        machine = Machine.current()
        # Create 2 worker processes
        for i in range(2):
            Process.objects.create(
                machine=machine,
                process_type=Process.TypeChoices.WORKER,
                status=Process.StatusChoices.RUNNING,
                pid=99980 + i,
                started_at=timezone.now(),
            )
        next_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
        self.assertGreaterEqual(next_id, 2)

    def test_process_cleanup_stale_running(self):
        """Process.cleanup_stale_running should mark stale processes as exited."""
        from archivebox.machine.models import Process, Machine, PID_REUSE_WINDOW
        machine = Machine.current()
        # Create a stale process (old started_at, fake PID)
        stale_proc = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.WORKER,
            status=Process.StatusChoices.RUNNING,
            pid=999999,  # Fake PID that doesn't exist
            started_at=timezone.now() - PID_REUSE_WINDOW - timedelta(hours=1),
        )
        cleaned = Process.cleanup_stale_running()
        self.assertGreaterEqual(cleaned, 1)
        stale_proc.refresh_from_db()
        self.assertEqual(stale_proc.status, Process.StatusChoices.EXITED)

    def test_process_get_running(self):
        """Process.get_running should return queryset of running processes."""
        from archivebox.machine.models import Process, Machine
        machine = Machine.current()
        # Create a running process
        proc = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
            pid=99970,
            started_at=timezone.now(),
        )
        running = Process.get_running(process_type=Process.TypeChoices.HOOK)
        self.assertIn(proc, running)

    def test_process_type_detection(self):
        """Process._detect_process_type should detect process type from argv."""
        from archivebox.machine.models import Process
        # Test detection logic
        with patch('sys.argv', ['archivebox', 'manage', 'orchestrator']):
            result = Process._detect_process_type()
            self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)
        with patch('sys.argv', ['archivebox', 'add', 'http://example.com']):
            result = Process._detect_process_type()
            self.assertEqual(result, Process.TypeChoices.CLI)
        with patch('sys.argv', ['supervisord', '-c', 'config.ini']):
            result = Process._detect_process_type()
            self.assertEqual(result, Process.TypeChoices.SUPERVISORD)
class TestProcessLifecycle(TestCase):
    """Test Process model lifecycle methods."""

    def setUp(self):
        """Reset caches and create a machine."""
        import archivebox.machine.models as models
        models._CURRENT_MACHINE = None
        models._CURRENT_PROCESS = None
        self.machine = models.Machine.current()

    def test_process_is_running_property(self):
        """Process.is_running should check actual OS process."""
        from archivebox.machine.models import Process
        proc = Process.current()
        # Should be running (current process exists)
        self.assertTrue(proc.is_running)
        # Create a process with fake PID
        fake_proc = Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=999999,
            started_at=timezone.now(),
        )
        # Should not be running (PID doesn't exist)
        self.assertFalse(fake_proc.is_running)

    def test_process_poll(self):
        """Process.poll should check and update exit status."""
        from archivebox.machine.models import Process
        # Create a process with fake PID (already exited)
        proc = Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=999999,
            started_at=timezone.now(),
        )
        exit_code = proc.poll()
        # Should have detected exit and updated status
        self.assertIsNotNone(exit_code)
        proc.refresh_from_db()
        self.assertEqual(proc.status, Process.StatusChoices.EXITED)

    def test_process_terminate_already_dead(self):
        """Process.terminate should handle already-dead processes."""
        from archivebox.machine.models import Process
        # Create a process with fake PID
        proc = Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=999999,
            started_at=timezone.now(),
        )
        result = proc.terminate()
        # Should return False (was already dead)
        self.assertFalse(result)
        proc.refresh_from_db()
        self.assertEqual(proc.status, Process.StatusChoices.EXITED)

    def test_process_tree_traversal(self):
        """Process parent/children relationships should work."""
        from archivebox.machine.models import Process
        # Create parent process
        parent = Process.objects.create(
            machine=self.machine,
            process_type=Process.TypeChoices.CLI,
            status=Process.StatusChoices.RUNNING,
            pid=1,
            started_at=timezone.now(),
        )
        # Create child process
        child = Process.objects.create(
            machine=self.machine,
            parent=parent,
            process_type=Process.TypeChoices.WORKER,
            status=Process.StatusChoices.RUNNING,
            pid=2,
            started_at=timezone.now(),
        )
        # Test relationships
        self.assertEqual(child.parent, parent)
        self.assertIn(child, parent.children.all())
        self.assertEqual(child.root, parent)
        self.assertEqual(child.depth, 1)
        self.assertEqual(parent.depth, 0)
if __name__ == '__main__':
pytest.main([__file__, '-v'])

View File

@@ -1,138 +0,0 @@
import os
import sqlite3
import subprocess
from pathlib import Path
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
candidates = {snapshot_id}
if len(snapshot_id) == 32:
hyphenated = f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}"
candidates.add(hyphenated)
elif len(snapshot_id) == 36 and '-' in snapshot_id:
candidates.add(snapshot_id.replace('-', ''))
for needle in candidates:
for path in data_dir.rglob(needle):
if path.is_dir():
return path
return None
def _find_html_with_text(root: Path, needle: str) -> list[Path]:
hits: list[Path] = []
for path in root.rglob("*.htm*"):
if not path.is_file():
continue
try:
if needle in path.read_text(errors="ignore"):
hits.append(path)
except Exception:
continue
return hits
def test_add_real_world_example_domain(tmp_path):
    """End-to-end smoke test: archive the live https://example.com.

    Runs real `archivebox init` + `archivebox add` subprocesses, then verifies
    the SQLite index rows, the BinaryWorker install bookkeeping, and the
    on-disk extractor outputs (title, HTML, htmltotext).
    NOTE(review): requires network access and the `archivebox` CLI on PATH.
    """
    os.chdir(tmp_path)
    # Use a short /tmp-based scratch dir -- presumably to keep unix socket
    # paths under the OS length limit; TODO confirm.
    tmp_short = Path("/tmp") / f"abx-{tmp_path.name}"
    tmp_short.mkdir(parents=True, exist_ok=True)
    env = os.environ.copy()
    env["TMP_DIR"] = str(tmp_short)
    env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
    # Enable only the extractors this test asserts on; disable slow/heavy ones.
    env["SAVE_TITLE"] = "True"
    env["SAVE_WGET"] = "True"
    env["SAVE_SINGLEFILE"] = "True"
    env["SAVE_READABILITY"] = "False"
    env["SAVE_HTMLTOTEXT"] = "True"
    env["SAVE_HEADERS"] = "True"
    env["SAVE_PDF"] = "False"
    env["SAVE_SCREENSHOT"] = "False"
    env["SAVE_ARCHIVEDOTORG"] = "False"
    env["SAVE_YTDLP"] = "False"
    env["SAVE_GIT"] = "False"
    # Create a fresh ArchiveBox data dir inside tmp_path.
    init = subprocess.run(
        ["archivebox", "init"],
        capture_output=True,
        text=True,
        timeout=120,
        env=env,
    )
    assert init.returncode == 0, f"archivebox init failed: {init.stderr}"
    # Archive the URL with an explicit plugin list (generous timeout: real network).
    result = subprocess.run(
        ["archivebox", "add", "--plugins=title,wget,singlefile,htmltotext,headers", "https://example.com"],
        capture_output=True,
        text=True,
        timeout=900,
        env=env,
    )
    assert result.returncode == 0, (
        "archivebox add failed.\n"
        f"stdout:\n{result.stdout}\n"
        f"stderr:\n{result.stderr}"
    )
    # --- Verify the SQLite index directly ---
    conn = sqlite3.connect(tmp_path / "index.sqlite3")
    c = conn.cursor()
    snapshot_row = c.execute(
        "SELECT id, url, title FROM core_snapshot WHERE url = ?",
        ("https://example.com",),
    ).fetchone()
    assert snapshot_row is not None, "Snapshot for https://example.com not found in DB"
    snapshot_id, snapshot_url, snapshot_title = snapshot_row
    assert snapshot_title and "Example Domain" in snapshot_title, (
        f"Expected title to contain Example Domain, got: {snapshot_title}"
    )
    # No extractor should have recorded a failure for this snapshot.
    failed_results = c.execute(
        "SELECT COUNT(*) FROM core_archiveresult WHERE snapshot_id = ? AND status = 'failed'",
        (snapshot_id,),
    ).fetchone()[0]
    assert failed_results == 0, "Some archive results failed for example.com snapshot"
    # machine_process rows prove binary installs ran through the worker machinery.
    binary_workers = c.execute(
        "SELECT COUNT(*) FROM machine_process WHERE process_type = 'worker' AND worker_type = 'binary'"
    ).fetchone()[0]
    assert binary_workers > 0, "Expected BinaryWorker to run installs via BinaryMachine"
    failed_binary_workers = c.execute(
        "SELECT COUNT(*) FROM machine_process WHERE process_type = 'worker' AND worker_type = 'binary' "
        "AND exit_code IS NOT NULL AND exit_code != 0"
    ).fetchone()[0]
    assert failed_binary_workers == 0, "BinaryWorker reported non-zero exit codes"
    queued_binaries = c.execute(
        "SELECT name FROM machine_binary WHERE status != 'installed'"
    ).fetchall()
    assert not queued_binaries, f"Some binaries did not install: {queued_binaries}"
    conn.close()
    # --- Verify the on-disk outputs ---
    snapshot_dir = _find_snapshot_dir(tmp_path, str(snapshot_id))
    assert snapshot_dir is not None, "Snapshot output directory not found"
    title_path = snapshot_dir / "title" / "title.txt"
    assert title_path.exists(), f"Missing title output: {title_path}"
    assert "Example Domain" in title_path.read_text(errors="ignore")
    # Collect HTML hits from both layouts: <extractor>/ and *_<extractor>/ dirs.
    html_sources = []
    for candidate in ("wget", "singlefile", "dom"):
        for candidate_dir in (snapshot_dir / candidate, *snapshot_dir.glob(f"*_{candidate}")):
            if candidate_dir.exists():
                html_sources.extend(_find_html_with_text(candidate_dir, "Example Domain"))
    assert len(html_sources) >= 2, (
        "Expected HTML outputs from multiple extractors to contain Example Domain "
        f"(found {len(html_sources)})."
    )
    # htmltotext output may likewise live in either directory layout.
    text_hits = 0
    for path in (
        *snapshot_dir.glob("*_htmltotext/htmltotext.txt"),
        snapshot_dir / "htmltotext" / "htmltotext.txt",
    ):
        if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
            text_hits += 1
    assert text_hits >= 1, (
        "Expected htmltotext output to contain Example Domain "
        f"(htmltotext hits={text_hits})."
    )

View File

@@ -1,84 +0,0 @@
from datetime import timedelta
from typing import cast
from unittest.mock import patch
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.test import TestCase
from django.utils import timezone
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.workers.orchestrator import Orchestrator
from archivebox.workers.worker import CrawlWorker
class TestScheduledCrawlMaterialization(TestCase):
    """Verify how the Orchestrator turns due CrawlSchedules into new Crawls."""

    def setUp(self):
        # cast() only narrows the manager type for the type checker.
        user_manager = cast(UserManager, get_user_model().objects)
        self.user = user_manager.create_user(
            username='schedule-user',
            password='password',
        )

    def _create_due_schedule(self) -> CrawlSchedule:
        """Create a daily CrawlSchedule whose template crawl is 2 days old (overdue)."""
        # Template crawl is SEALED with retry_at=None -- presumably so workers
        # never pick it up directly; verify against Crawl queue semantics.
        template = Crawl.objects.create(
            urls='https://example.com/feed.xml',
            max_depth=1,
            tags_str='scheduled',
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
        )
        schedule = CrawlSchedule.objects.create(
            template=template,
            schedule='daily',
            is_enabled=True,
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
        )
        # Backdate via queryset update() to bypass auto-set timestamps,
        # making the daily schedule overdue.
        past = timezone.now() - timedelta(days=2)
        Crawl.objects.filter(pk=template.pk).update(created_at=past, modified_at=past)
        template.refresh_from_db()
        schedule.refresh_from_db()
        return schedule

    def test_global_orchestrator_materializes_due_schedule(self):
        """A long-running (exit_on_idle=False) orchestrator materializes a due schedule."""
        schedule = self._create_due_schedule()
        orchestrator = Orchestrator(exit_on_idle=False)
        orchestrator._materialize_due_schedules()
        scheduled_crawls = Crawl.objects.filter(schedule=schedule).order_by('created_at')
        # Two crawls now reference the schedule: presumably the template plus
        # the newly materialized copy -- confirm against CrawlSchedule linking.
        self.assertEqual(scheduled_crawls.count(), 2)
        queued_crawl = scheduled_crawls.last()
        self.assertIsNotNone(queued_crawl)
        assert queued_crawl is not None  # narrow Optional for the type checker
        # The materialized crawl copies the template's settings and starts QUEUED.
        self.assertEqual(queued_crawl.status, Crawl.StatusChoices.QUEUED)
        self.assertEqual(queued_crawl.urls, 'https://example.com/feed.xml')
        self.assertEqual(queued_crawl.max_depth, 1)
        self.assertEqual(queued_crawl.tags_str, 'scheduled')

    def test_one_shot_orchestrator_does_not_materialize_due_schedule(self):
        """One-shot and crawl-scoped orchestrators must NOT materialize schedules."""
        schedule = self._create_due_schedule()
        Orchestrator(exit_on_idle=True)._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)
        # An orchestrator pinned to a single crawl_id must not materialize either.
        Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template.id))._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)

    @patch.object(CrawlWorker, 'start')
    def test_global_orchestrator_waits_one_tick_before_spawning_materialized_schedule(self, mock_start):
        """Materialized crawls are queued on one tick but not spawned in that same tick."""
        schedule = self._create_due_schedule()
        orchestrator = Orchestrator(exit_on_idle=False)
        # Force claiming to succeed so only the tick ordering is under test.
        with patch.object(orchestrator, '_claim_crawl', return_value=True):
            queue_sizes = orchestrator.check_queues_and_spawn_workers()
        self.assertEqual(queue_sizes['crawl'], 1)
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 2)
        # No CrawlWorker started during the tick that materialized the crawl.
        mock_start.assert_not_called()

View File

@@ -1,76 +0,0 @@
from pathlib import Path
from types import SimpleNamespace
from typing import Any, cast
from unittest.mock import patch
from django.test import SimpleTestCase
from archivebox.workers.worker import SnapshotWorker
class TestSnapshotWorkerRetryForegroundHooks(SimpleTestCase):
    """Unit tests for SnapshotWorker._retry_failed_empty_foreground_hooks."""

    def _make_worker(self):
        """Build a SnapshotWorker with its collaborators stubbed out (no DB, no subprocesses)."""
        stub = SnapshotWorker.__new__(SnapshotWorker)  # bypass __init__ entirely
        stub.pid = 12345
        cast(Any, stub).snapshot = SimpleNamespace(
            status='started',
            refresh_from_db=lambda: None,
        )
        stub._snapshot_exceeded_hard_timeout = lambda: False
        stub._seal_snapshot_due_to_timeout = lambda: None
        stub._run_hook = lambda *args, **kwargs: SimpleNamespace()
        stub._wait_for_hook = lambda process, ar: None
        return stub

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_skips_successful_hook_with_only_inline_output(self, log_mock):
        # A hook that succeeded with only inline output (no files) must not be replayed.
        stub = self._make_worker()
        succeeded_result = SimpleNamespace(
            status='succeeded',
            output_files={},
            output_str='scrolled 600px',
            output_json=None,
            refresh_from_db=lambda: None,
        )
        stub._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__45_infiniscroll.js'), succeeded_result)],
            config={},
        )
        log_mock.assert_not_called()

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_replays_failed_hook_with_no_outputs(self, log_mock):
        # A failed hook with no outputs at all must be replayed exactly once.
        stub = self._make_worker()
        recorded_runs = []
        recorded_waits = []

        def tracking_run_hook(*args, **kwargs):
            recorded_runs.append((args, kwargs))
            return SimpleNamespace()

        def tracking_wait_for_hook(process, ar):
            recorded_waits.append((process, ar))
            # Simulate the replay succeeding and producing a real output file.
            ar.status = 'succeeded'
            ar.output_files = {'singlefile.html': {}}

        failed_result = SimpleNamespace(
            status='failed',
            output_files={},
            output_str='',
            output_json=None,
            refresh_from_db=lambda: None,
        )
        stub._run_hook = tracking_run_hook
        stub._wait_for_hook = tracking_wait_for_hook
        stub._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__50_singlefile.py'), failed_result)],
            config={},
        )
        assert len(recorded_runs) == 1
        assert len(recorded_waits) == 1
        log_mock.assert_called_once()

View File

@@ -1,143 +0,0 @@
import threading
import time
import pytest
from django.db import close_old_connections
from django.utils import timezone
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.machine.models import Binary, Machine
from archivebox.workers.worker import BinaryWorker
def get_fresh_machine() -> Machine:
    """Flush the machine-module caches, then return Machine.current()."""
    import archivebox.machine.models as models_module

    # Drop the cached machine singleton and binaries so each test
    # resolves a brand-new Machine row.
    models_module._CURRENT_MACHINE = None
    models_module._CURRENT_BINARIES.clear()
    return Machine.current()
@pytest.mark.django_db
def test_claim_processing_lock_does_not_steal_future_retry_at():
    """
    retry_at doubles as both schedule and ownership lock: once one process
    claims a due row and pushes retry_at into the future, a second reader
    must not be able to re-claim that future timestamp and repeat the same
    side effects.
    """
    current_machine = get_fresh_machine()
    fresh_binary = Binary.objects.create(
        machine=current_machine,
        name='claim-test',
        binproviders='env',
        status=Binary.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )
    # Two independent in-memory copies of the same row, as two processes would see it.
    first_reader = Binary.objects.get(pk=fresh_binary.pk)
    second_reader = Binary.objects.get(pk=fresh_binary.pk)

    # First claim wins and moves retry_at into the future...
    assert first_reader.claim_processing_lock(lock_seconds=30) is True
    second_reader.refresh_from_db()
    assert second_reader.retry_at > timezone.now()
    # ...so a subsequent claim against the future timestamp must fail.
    assert second_reader.claim_processing_lock(lock_seconds=30) is False
@pytest.mark.django_db
def test_binary_worker_skips_binary_claimed_by_other_owner(monkeypatch):
    """
    BinaryWorker must never run install side effects for a Binary whose
    retry_at lock is already held by another process.
    """
    current_machine = get_fresh_machine()
    claimed_binary = Binary.objects.create(
        machine=current_machine,
        name='claimed-binary',
        binproviders='env',
        status=Binary.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )
    # A separate copy of the row claims the lock first, as another process would.
    lock_holder = Binary.objects.get(pk=claimed_binary.pk)
    assert lock_holder.claim_processing_lock(lock_seconds=30) is True

    # Record every Binary.run() invocation; the worker should make none.
    run_invocations: list[str] = []

    def recording_run(self):
        run_invocations.append(self.name)
        self.status = self.StatusChoices.INSTALLED
        self.abspath = '/tmp/fake-binary'
        self.version = '1.0'
        self.save(update_fields=['status', 'abspath', 'version', 'modified_at'])

    monkeypatch.setattr(Binary, 'run', recording_run)

    BinaryWorker(binary_id=str(claimed_binary.id))._process_single_binary()
    assert run_invocations == []
# transaction=True: commits are real, so the background thread's writes are
# visible to the main thread (no wrapping test transaction).
@pytest.mark.django_db(transaction=True)
def test_crawl_install_declared_binaries_waits_for_existing_owner(monkeypatch):
    """
    Crawl.install_declared_binaries should wait for the current owner of a Binary
    to finish instead of launching a duplicate install against shared provider
    state such as the npm tree.
    """
    machine = get_fresh_machine()
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )
    binary = Binary.objects.create(
        machine=machine,
        name='puppeteer',
        binproviders='npm',
        status=Binary.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )
    # Simulate another process already owning the install lock on the binary.
    owner = Binary.objects.get(pk=binary.pk)
    assert owner.claim_processing_lock(lock_seconds=30) is True
    # Record any direct Binary.run() calls -- there should be none.
    calls: list[str] = []
    def fake_run(self):
        calls.append(self.name)
        self.status = self.StatusChoices.INSTALLED
        self.abspath = '/tmp/should-not-run'
        self.version = '1.0'
        self.save(update_fields=['status', 'abspath', 'version', 'modified_at'])
    monkeypatch.setattr(Binary, 'run', fake_run)
    def finish_existing_install():
        # Background thread: after a short delay, finish the install the way
        # the lock owner would (INSTALLED status, lock released via retry_at=None).
        close_old_connections()  # each thread needs its own DB connection
        try:
            time.sleep(0.3)
            Binary.objects.filter(pk=binary.pk).update(
                status=Binary.StatusChoices.INSTALLED,
                retry_at=None,
                abspath='/tmp/finished-by-owner',
                version='1.0',
                modified_at=timezone.now(),
            )
        finally:
            close_old_connections()
    thread = threading.Thread(target=finish_existing_install, daemon=True)
    thread.start()
    # Should block until the owner finishes instead of installing itself.
    crawl.install_declared_binaries({'puppeteer'}, machine=machine)
    thread.join(timeout=5)
    binary.refresh_from_db()
    # The final state must be exactly what the owner thread wrote...
    assert binary.status == Binary.StatusChoices.INSTALLED
    assert binary.abspath == '/tmp/finished-by-owner'
    # ...and no duplicate install was launched by install_declared_binaries.
    assert calls == []

File diff suppressed because it is too large Load Diff