mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Refactor ArchiveBox onto abx-dl bus runner
This commit is contained in:
@@ -1,133 +0,0 @@
|
||||
import os
|
||||
import signal
|
||||
import sqlite3
|
||||
import subprocess
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _run(cmd, data_dir: Path, env: dict, timeout: int = 120):
|
||||
return subprocess.run(
|
||||
cmd,
|
||||
cwd=data_dir,
|
||||
env=env,
|
||||
capture_output=True,
|
||||
text=True,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
|
||||
def _make_env(data_dir: Path) -> dict:
|
||||
env = os.environ.copy()
|
||||
env["DATA_DIR"] = str(data_dir)
|
||||
env["USE_COLOR"] = "False"
|
||||
env["SHOW_PROGRESS"] = "False"
|
||||
env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
|
||||
env["PLUGINS"] = "favicon"
|
||||
# Keep it fast but still real hooks
|
||||
env["SAVE_FAVICON"] = "True"
|
||||
env["SAVE_TITLE"] = "False"
|
||||
env["SAVE_WGET"] = "False"
|
||||
env["SAVE_WARC"] = "False"
|
||||
env["SAVE_PDF"] = "False"
|
||||
env["SAVE_SCREENSHOT"] = "False"
|
||||
env["SAVE_DOM"] = "False"
|
||||
env["SAVE_SINGLEFILE"] = "False"
|
||||
env["SAVE_READABILITY"] = "False"
|
||||
env["SAVE_MERCURY"] = "False"
|
||||
env["SAVE_GIT"] = "False"
|
||||
env["SAVE_YTDLP"] = "False"
|
||||
env["SAVE_HEADERS"] = "False"
|
||||
env["SAVE_HTMLTOTEXT"] = "False"
|
||||
return env
|
||||
|
||||
|
||||
def _count_running_processes(db_path: Path, where: str) -> int:
|
||||
for _ in range(50):
|
||||
try:
|
||||
conn = sqlite3.connect(db_path, timeout=1)
|
||||
cur = conn.cursor()
|
||||
count = cur.execute(
|
||||
f"SELECT COUNT(*) FROM machine_process WHERE status = 'running' AND {where}"
|
||||
).fetchone()[0]
|
||||
conn.close()
|
||||
return count
|
||||
except sqlite3.OperationalError:
|
||||
time.sleep(0.1)
|
||||
return 0
|
||||
|
||||
|
||||
def _wait_for_count(db_path: Path, where: str, target: int, timeout: int = 20) -> bool:
|
||||
start = time.time()
|
||||
while time.time() - start < timeout:
|
||||
if _count_running_processes(db_path, where) >= target:
|
||||
return True
|
||||
time.sleep(0.1)
|
||||
return False
|
||||
|
||||
|
||||
def test_add_parents_workers_to_orchestrator(tmp_path):
|
||||
data_dir = tmp_path / "data"
|
||||
data_dir.mkdir()
|
||||
env = _make_env(data_dir)
|
||||
|
||||
init = _run([sys.executable, "-m", "archivebox", "init", "--quick"], data_dir, env)
|
||||
assert init.returncode == 0, init.stderr
|
||||
|
||||
add = _run([sys.executable, "-m", "archivebox", "add", "--plugins=favicon", "https://example.com"], data_dir, env, timeout=120)
|
||||
assert add.returncode == 0, add.stderr
|
||||
|
||||
conn = sqlite3.connect(data_dir / "index.sqlite3")
|
||||
cur = conn.cursor()
|
||||
orchestrator = cur.execute(
|
||||
"SELECT id FROM machine_process WHERE process_type = 'orchestrator' ORDER BY created_at DESC LIMIT 1"
|
||||
).fetchone()
|
||||
assert orchestrator is not None
|
||||
orchestrator_id = orchestrator[0]
|
||||
|
||||
worker_count = cur.execute(
|
||||
"SELECT COUNT(*) FROM machine_process WHERE process_type = 'worker' AND worker_type = 'crawl' "
|
||||
"AND parent_id = ?",
|
||||
(orchestrator_id,),
|
||||
).fetchone()[0]
|
||||
conn.close()
|
||||
|
||||
assert worker_count >= 1, "Expected crawl worker to be parented to orchestrator"
|
||||
|
||||
|
||||
def test_add_interrupt_cleans_orphaned_processes(tmp_path):
|
||||
data_dir = tmp_path / "data"
|
||||
data_dir.mkdir()
|
||||
env = _make_env(data_dir)
|
||||
|
||||
init = _run([sys.executable, "-m", "archivebox", "init", "--quick"], data_dir, env)
|
||||
assert init.returncode == 0, init.stderr
|
||||
|
||||
proc = subprocess.Popen(
|
||||
[sys.executable, "-m", "archivebox", "add", "--plugins=favicon", "https://example.com"],
|
||||
cwd=data_dir,
|
||||
env=env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
text=True,
|
||||
)
|
||||
|
||||
db_path = data_dir / "index.sqlite3"
|
||||
saw_worker = _wait_for_count(db_path, "process_type = 'worker'", 1, timeout=20)
|
||||
assert saw_worker, "Expected at least one worker to start before interrupt"
|
||||
|
||||
proc.send_signal(signal.SIGINT)
|
||||
proc.wait(timeout=30)
|
||||
|
||||
# Wait for workers/hooks to be cleaned up
|
||||
start = time.time()
|
||||
while time.time() - start < 30:
|
||||
running = _count_running_processes(db_path, "process_type IN ('worker','hook')")
|
||||
if running == 0:
|
||||
break
|
||||
time.sleep(0.2)
|
||||
|
||||
assert _count_running_processes(db_path, "process_type IN ('worker','hook')") == 0, (
|
||||
"Expected no running worker/hook processes after interrupt"
|
||||
)
|
||||
@@ -1,5 +1,5 @@
|
||||
"""
|
||||
Tests for JSONL piping contracts and `archivebox run` / `archivebox orchestrator`.
|
||||
Tests for JSONL piping contracts and `archivebox run`.
|
||||
|
||||
This file covers both:
|
||||
- low-level JSONL/stdin parsing behavior that makes CLI piping work
|
||||
@@ -252,8 +252,8 @@ def test_snapshot_list_stdout_pipes_into_run(initialized_archive):
|
||||
assert snapshot_status == "sealed"
|
||||
|
||||
|
||||
def test_archiveresult_list_stdout_pipes_into_orchestrator_alias(initialized_archive):
|
||||
"""`archivebox archiveresult list | archivebox orchestrator` should preserve clean JSONL stdout."""
|
||||
def test_archiveresult_list_stdout_pipes_into_run(initialized_archive):
|
||||
"""`archivebox archiveresult list | archivebox run` should preserve clean JSONL stdout."""
|
||||
url = create_test_url()
|
||||
|
||||
snapshot_stdout, snapshot_stderr, snapshot_code = run_archivebox_cmd(
|
||||
@@ -279,18 +279,17 @@ def test_archiveresult_list_stdout_pipes_into_orchestrator_alias(initialized_arc
|
||||
assert list_code == 0, list_stderr
|
||||
_assert_stdout_is_jsonl_only(list_stdout)
|
||||
|
||||
orchestrator_stdout, orchestrator_stderr, orchestrator_code = run_archivebox_cmd(
|
||||
["orchestrator"],
|
||||
run_stdout, run_stderr, run_code = run_archivebox_cmd(
|
||||
["run"],
|
||||
stdin=list_stdout,
|
||||
data_dir=initialized_archive,
|
||||
timeout=120,
|
||||
env=PIPE_TEST_ENV,
|
||||
)
|
||||
assert orchestrator_code == 0, orchestrator_stderr
|
||||
_assert_stdout_is_jsonl_only(orchestrator_stdout)
|
||||
assert "renamed to `archivebox run`" in orchestrator_stderr
|
||||
assert run_code == 0, run_stderr
|
||||
_assert_stdout_is_jsonl_only(run_stdout)
|
||||
|
||||
run_records = parse_jsonl_output(orchestrator_stdout)
|
||||
run_records = parse_jsonl_output(run_stdout)
|
||||
assert any(
|
||||
record.get("type") == "ArchiveResult" and record.get("id") == archiveresult["id"]
|
||||
for record in run_records
|
||||
|
||||
@@ -1,342 +0,0 @@
|
||||
"""
|
||||
Tests for BinaryWorker processing Binary queue.
|
||||
|
||||
Tests cover:
|
||||
- BinaryWorker is spawned by Orchestrator when Binary queue has work
|
||||
- Binary hooks (on_Binary__*) actually run and install binaries
|
||||
- Binary status transitions from QUEUED -> INSTALLED
|
||||
- BinaryWorker exits after idle timeout
|
||||
"""
|
||||
|
||||
import json
|
||||
import sqlite3
|
||||
|
||||
from archivebox.tests.conftest import (
|
||||
run_archivebox_cmd,
|
||||
)
|
||||
|
||||
|
||||
class TestBinaryWorkerSpawning:
|
||||
"""Tests for BinaryWorker lifecycle."""
|
||||
|
||||
def test_binary_worker_spawns_when_binary_queued(self, initialized_archive):
|
||||
"""Orchestrator spawns BinaryWorker when Binary queue has work."""
|
||||
# Create a Binary record via CLI
|
||||
binary_record = {
|
||||
'type': 'Binary',
|
||||
'name': 'python3',
|
||||
'binproviders': 'env', # Use env provider to detect system python
|
||||
}
|
||||
|
||||
# Use `archivebox run` to create the Binary (this queues it)
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(binary_record),
|
||||
data_dir=initialized_archive,
|
||||
timeout=60, # Increased timeout to allow for binary installation
|
||||
)
|
||||
|
||||
assert code == 0, f"Failed to create Binary: {stderr}"
|
||||
|
||||
# Verify Binary was created in DB
|
||||
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
|
||||
c = conn.cursor()
|
||||
binaries = c.execute(
|
||||
"SELECT name, status, abspath FROM machine_binary WHERE name='python3'"
|
||||
).fetchall()
|
||||
conn.close()
|
||||
|
||||
assert len(binaries) >= 1, "Binary was not created in database"
|
||||
name, status, abspath = binaries[0]
|
||||
assert name == 'python3'
|
||||
# Status should be INSTALLED after BinaryWorker processed it
|
||||
# (or QUEUED if worker timed out before installing)
|
||||
assert status in ['installed', 'queued']
|
||||
|
||||
|
||||
def test_binary_hooks_actually_run(self, initialized_archive):
|
||||
"""Binary installation hooks (on_Binary__*) run and update abspath."""
|
||||
# Create a Binary for python3 (guaranteed to exist on system)
|
||||
binary_record = {
|
||||
'type': 'Binary',
|
||||
'name': 'python3',
|
||||
'binproviders': 'env',
|
||||
}
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(binary_record),
|
||||
data_dir=initialized_archive,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert code == 0, f"Failed to process Binary: {stderr}"
|
||||
|
||||
# Query database to check if hooks ran and populated abspath
|
||||
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
|
||||
c = conn.cursor()
|
||||
result = c.execute(
|
||||
"SELECT name, status, abspath, version FROM machine_binary WHERE name='python3'"
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
assert result is not None, "Binary not found in database"
|
||||
name, status, abspath, version = result
|
||||
|
||||
# If hooks ran successfully, abspath should be populated
|
||||
if status == 'installed':
|
||||
assert abspath, f"Binary installed but abspath is empty: {abspath}"
|
||||
assert '/python3' in abspath or '\\python3' in abspath, \
|
||||
f"abspath doesn't look like a python3 path: {abspath}"
|
||||
# Version should also be populated
|
||||
assert version, f"Binary installed but version is empty: {version}"
|
||||
|
||||
|
||||
def test_binary_status_transitions(self, initialized_archive):
|
||||
"""Binary status correctly transitions QUEUED -> INSTALLED."""
|
||||
binary_record = {
|
||||
'type': 'Binary',
|
||||
'name': 'python3',
|
||||
'binproviders': 'env',
|
||||
}
|
||||
|
||||
# Create and process the Binary
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(binary_record),
|
||||
data_dir=initialized_archive,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
|
||||
# Check final status
|
||||
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
|
||||
c = conn.cursor()
|
||||
status = c.execute(
|
||||
"SELECT status FROM machine_binary WHERE name='python3'"
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
assert status is not None
|
||||
# Should be installed (or queued if worker timed out)
|
||||
assert status[0] in ['installed', 'queued']
|
||||
|
||||
|
||||
class TestBinaryWorkerHooks:
|
||||
"""Tests for specific Binary hook providers."""
|
||||
|
||||
def test_env_provider_hook_detects_system_binary(self, initialized_archive):
|
||||
"""on_Binary__15_env_discover.py hook detects system binaries."""
|
||||
binary_record = {
|
||||
'type': 'Binary',
|
||||
'name': 'python3',
|
||||
'binproviders': 'env',
|
||||
}
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(binary_record),
|
||||
data_dir=initialized_archive,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
|
||||
# Check that env provider hook populated the Binary
|
||||
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
|
||||
c = conn.cursor()
|
||||
result = c.execute(
|
||||
"SELECT binprovider, abspath FROM machine_binary WHERE name='python3' AND status='installed'"
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
if result:
|
||||
binprovider, abspath = result
|
||||
assert binprovider == 'env', f"Expected env provider, got: {binprovider}"
|
||||
assert abspath, "abspath should be populated by env provider"
|
||||
|
||||
|
||||
def test_multiple_binaries_processed_in_batch(self, initialized_archive):
|
||||
"""BinaryWorker processes multiple queued binaries."""
|
||||
# Create multiple Binary records
|
||||
binaries = [
|
||||
{'type': 'Binary', 'name': 'python3', 'binproviders': 'env'},
|
||||
{'type': 'Binary', 'name': 'curl', 'binproviders': 'env'},
|
||||
]
|
||||
|
||||
stdin = '\n'.join(json.dumps(b) for b in binaries)
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=stdin,
|
||||
data_dir=initialized_archive,
|
||||
timeout=90, # Need more time for multiple binaries
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
|
||||
# Both should be processed
|
||||
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
|
||||
c = conn.cursor()
|
||||
installed = c.execute(
|
||||
"SELECT name FROM machine_binary WHERE name IN ('python3', 'curl')"
|
||||
).fetchall()
|
||||
conn.close()
|
||||
|
||||
assert len(installed) >= 1, "At least one binary should be created"
|
||||
|
||||
def test_puppeteer_binary_sets_skip_download_for_hooks(self, initialized_archive):
|
||||
"""Puppeteer installs expose skip-download env to Binary hooks."""
|
||||
user_plugins_dir = initialized_archive / 'test_plugins'
|
||||
plugin_dir = user_plugins_dir / 'inspectnpm'
|
||||
plugin_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
hook = plugin_dir / 'on_Binary__10_inspectnpm_install.py'
|
||||
hook.write_text(
|
||||
"""#!/usr/bin/env python3
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import sys
|
||||
|
||||
parser = argparse.ArgumentParser()
|
||||
parser.add_argument('--machine-id', required=True)
|
||||
parser.add_argument('--binary-id', required=True)
|
||||
parser.add_argument('--name', required=True)
|
||||
parser.add_argument('--binproviders', default='*')
|
||||
args = parser.parse_args()
|
||||
|
||||
record = {
|
||||
'type': 'Binary',
|
||||
'name': args.name,
|
||||
'abspath': shutil.which('python3') or sys.executable,
|
||||
'version': '1.0.0',
|
||||
'sha256': '',
|
||||
'binprovider': 'inspectnpm',
|
||||
'machine_id': args.machine_id,
|
||||
'binary_id': args.binary_id,
|
||||
}
|
||||
print(json.dumps(record))
|
||||
print(json.dumps({
|
||||
'type': 'Machine',
|
||||
'config': {
|
||||
'SEEN_PUPPETEER_SKIP_DOWNLOAD': os.environ.get('PUPPETEER_SKIP_DOWNLOAD', ''),
|
||||
'SEEN_PUPPETEER_SKIP_CHROMIUM_DOWNLOAD': os.environ.get('PUPPETEER_SKIP_CHROMIUM_DOWNLOAD', ''),
|
||||
},
|
||||
}))
|
||||
"""
|
||||
)
|
||||
|
||||
binary_record = {
|
||||
'type': 'Binary',
|
||||
'name': 'puppeteer',
|
||||
'binproviders': 'inspectnpm',
|
||||
}
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(binary_record),
|
||||
data_dir=initialized_archive,
|
||||
env={
|
||||
'ARCHIVEBOX_USER_PLUGINS_DIR': str(user_plugins_dir),
|
||||
'PLUGINS': 'inspectnpm',
|
||||
},
|
||||
timeout=60,
|
||||
)
|
||||
|
||||
assert code == 0, f"Failed to process puppeteer Binary: {stderr}"
|
||||
|
||||
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
|
||||
c = conn.cursor()
|
||||
result = c.execute(
|
||||
"SELECT status, binprovider FROM machine_binary WHERE name='puppeteer'"
|
||||
).fetchone()
|
||||
hook_rows = c.execute(
|
||||
"SELECT cmd, env FROM machine_process WHERE process_type='hook' ORDER BY created_at DESC"
|
||||
).fetchall()
|
||||
conn.close()
|
||||
|
||||
assert result is not None, "Puppeteer binary not found in database"
|
||||
status, binprovider = result
|
||||
assert status == 'installed', f"Expected puppeteer to install, got: {status}"
|
||||
assert binprovider == 'inspectnpm', f"Expected inspectnpm provider, got: {binprovider}"
|
||||
|
||||
hook_env = None
|
||||
for cmd_json, env_json in hook_rows:
|
||||
cmd = json.loads(cmd_json)
|
||||
if any('inspectnpm' in part for part in cmd):
|
||||
hook_env = json.loads(env_json)
|
||||
break
|
||||
|
||||
assert hook_env is not None, "Inspectnpm hook process not found"
|
||||
assert hook_env.get('PUPPETEER_SKIP_DOWNLOAD') == 'true'
|
||||
assert hook_env.get('PUPPETEER_SKIP_CHROMIUM_DOWNLOAD') == 'true'
|
||||
|
||||
|
||||
class TestBinaryWorkerEdgeCases:
|
||||
"""Tests for edge cases and error handling."""
|
||||
|
||||
def test_nonexistent_binary_stays_queued(self, initialized_archive):
|
||||
"""Binary that doesn't exist stays queued (doesn't fail permanently)."""
|
||||
binary_record = {
|
||||
'type': 'Binary',
|
||||
'name': 'nonexistent-binary-xyz-12345',
|
||||
'binproviders': 'env',
|
||||
}
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(binary_record),
|
||||
data_dir=initialized_archive,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
# Command should still succeed (orchestrator doesn't fail on binary install failures)
|
||||
assert code == 0
|
||||
|
||||
# Binary should remain queued (not installed)
|
||||
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
|
||||
c = conn.cursor()
|
||||
result = c.execute(
|
||||
"SELECT status FROM machine_binary WHERE name='nonexistent-binary-xyz-12345'"
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
if result:
|
||||
status = result[0]
|
||||
# Should stay queued since installation failed
|
||||
assert status == 'queued', f"Expected queued, got: {status}"
|
||||
|
||||
|
||||
def test_binary_worker_respects_machine_isolation(self, initialized_archive):
|
||||
"""BinaryWorker only processes binaries for current machine."""
|
||||
# This is implicitly tested by other tests - Binary.objects.filter(machine=current)
|
||||
# ensures only current machine's binaries are processed
|
||||
binary_record = {
|
||||
'type': 'Binary',
|
||||
'name': 'python3',
|
||||
'binproviders': 'env',
|
||||
}
|
||||
|
||||
stdout, stderr, code = run_archivebox_cmd(
|
||||
['run'],
|
||||
stdin=json.dumps(binary_record),
|
||||
data_dir=initialized_archive,
|
||||
timeout=30,
|
||||
)
|
||||
|
||||
assert code == 0
|
||||
|
||||
# Check that machine_id is set correctly
|
||||
conn = sqlite3.connect(initialized_archive / 'index.sqlite3')
|
||||
c = conn.cursor()
|
||||
result = c.execute(
|
||||
"SELECT machine_id FROM machine_binary WHERE name='python3'"
|
||||
).fetchone()
|
||||
conn.close()
|
||||
|
||||
assert result is not None
|
||||
machine_id = result[0]
|
||||
assert machine_id, "machine_id should be set on Binary"
|
||||
@@ -369,9 +369,9 @@ class TestProcessCurrent(TestCase):
|
||||
|
||||
self.assertEqual(proc1.id, proc2.id)
|
||||
|
||||
def test_process_detect_type_orchestrator(self):
|
||||
"""_detect_process_type should detect orchestrator."""
|
||||
with patch('sys.argv', ['archivebox', 'manage', 'orchestrator']):
|
||||
def test_process_detect_type_runner(self):
|
||||
"""_detect_process_type should detect the background runner command."""
|
||||
with patch('sys.argv', ['archivebox', 'run', '--daemon']):
|
||||
result = Process._detect_process_type()
|
||||
self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)
|
||||
|
||||
@@ -381,11 +381,11 @@ class TestProcessCurrent(TestCase):
|
||||
result = Process._detect_process_type()
|
||||
self.assertEqual(result, Process.TypeChoices.CLI)
|
||||
|
||||
def test_process_detect_type_worker(self):
|
||||
"""_detect_process_type should detect workers."""
|
||||
with patch('sys.argv', ['python', '-m', 'crawl_worker']):
|
||||
def test_process_detect_type_binary(self):
|
||||
"""_detect_process_type should detect non-ArchiveBox subprocesses as binary processes."""
|
||||
with patch('sys.argv', ['/usr/bin/wget', 'https://example.com']):
|
||||
result = Process._detect_process_type()
|
||||
self.assertEqual(result, Process.TypeChoices.WORKER)
|
||||
self.assertEqual(result, Process.TypeChoices.BINARY)
|
||||
|
||||
|
||||
class TestProcessHierarchy(TestCase):
|
||||
|
||||
@@ -1,484 +0,0 @@
|
||||
"""
|
||||
Unit tests for the Orchestrator and Worker classes.
|
||||
|
||||
Tests cover:
|
||||
1. Orchestrator lifecycle (startup, shutdown)
|
||||
2. Queue polling and worker spawning
|
||||
3. Idle detection and exit logic
|
||||
4. Worker registration and management
|
||||
5. Process model methods (replacing old pid_utils)
|
||||
"""
|
||||
|
||||
import os
|
||||
import time
|
||||
from datetime import datetime, timedelta
|
||||
from unittest.mock import patch
|
||||
from typing import ClassVar
|
||||
|
||||
import pytest
|
||||
from django.test import TestCase
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
from archivebox.workers.worker import Worker
|
||||
|
||||
|
||||
class FakeWorker(Worker):
|
||||
name: ClassVar[str] = 'crawl'
|
||||
MAX_CONCURRENT_TASKS: ClassVar[int] = 5
|
||||
running_workers: ClassVar[list[dict[str, object]]] = []
|
||||
|
||||
@classmethod
|
||||
def get_running_workers(cls) -> list[dict[str, object]]:
|
||||
return cls.running_workers
|
||||
|
||||
|
||||
class TestOrchestratorUnit(TestCase):
|
||||
"""Unit tests for Orchestrator class (mocked dependencies)."""
|
||||
|
||||
def test_orchestrator_creation(self):
|
||||
"""Orchestrator should initialize with correct defaults."""
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
|
||||
self.assertTrue(orchestrator.exit_on_idle)
|
||||
self.assertEqual(orchestrator.idle_count, 0)
|
||||
self.assertIsNone(orchestrator.pid_file)
|
||||
|
||||
def test_orchestrator_repr(self):
|
||||
"""Orchestrator __repr__ should include PID."""
|
||||
orchestrator = Orchestrator()
|
||||
repr_str = repr(orchestrator)
|
||||
|
||||
self.assertIn('Orchestrator', repr_str)
|
||||
self.assertIn(str(os.getpid()), repr_str)
|
||||
|
||||
def test_has_pending_work(self):
|
||||
"""has_pending_work should check if any queue has items."""
|
||||
orchestrator = Orchestrator()
|
||||
|
||||
self.assertFalse(orchestrator.has_pending_work({'crawl': 0, 'snapshot': 0}))
|
||||
self.assertTrue(orchestrator.has_pending_work({'crawl': 0, 'snapshot': 5}))
|
||||
self.assertTrue(orchestrator.has_pending_work({'crawl': 10, 'snapshot': 0}))
|
||||
|
||||
def test_should_exit_not_exit_on_idle(self):
|
||||
"""should_exit should return False when exit_on_idle is False."""
|
||||
orchestrator = Orchestrator(exit_on_idle=False)
|
||||
orchestrator.idle_count = 100
|
||||
|
||||
self.assertFalse(orchestrator.should_exit({'crawl': 0}))
|
||||
|
||||
def test_should_exit_pending_work(self):
|
||||
"""should_exit should return False when there's pending work."""
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.idle_count = 100
|
||||
|
||||
self.assertFalse(orchestrator.should_exit({'crawl': 5}))
|
||||
|
||||
@patch.object(Orchestrator, 'has_running_workers')
|
||||
def test_should_exit_running_workers(self, mock_has_workers):
|
||||
"""should_exit should return False when workers are running."""
|
||||
mock_has_workers.return_value = True
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.idle_count = 100
|
||||
|
||||
self.assertFalse(orchestrator.should_exit({'crawl': 0}))
|
||||
|
||||
@patch.object(Orchestrator, 'has_running_workers')
|
||||
@patch.object(Orchestrator, 'has_future_work')
|
||||
def test_should_exit_idle_timeout(self, mock_future, mock_workers):
|
||||
"""should_exit should return True after idle timeout with no work."""
|
||||
mock_workers.return_value = False
|
||||
mock_future.return_value = False
|
||||
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.idle_count = orchestrator.IDLE_TIMEOUT
|
||||
|
||||
self.assertTrue(orchestrator.should_exit({'crawl': 0, 'snapshot': 0}))
|
||||
|
||||
@patch.object(Orchestrator, 'has_running_workers')
|
||||
@patch.object(Orchestrator, 'has_future_work')
|
||||
def test_should_exit_below_idle_timeout(self, mock_future, mock_workers):
|
||||
"""should_exit should return False below idle timeout."""
|
||||
mock_workers.return_value = False
|
||||
mock_future.return_value = False
|
||||
|
||||
orchestrator = Orchestrator(exit_on_idle=True)
|
||||
orchestrator.idle_count = orchestrator.IDLE_TIMEOUT - 1
|
||||
|
||||
self.assertFalse(orchestrator.should_exit({'crawl': 0}))
|
||||
|
||||
def test_should_spawn_worker_no_queue(self):
|
||||
"""should_spawn_worker should return False when queue is empty."""
|
||||
orchestrator = Orchestrator()
|
||||
|
||||
FakeWorker.running_workers = []
|
||||
self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 0))
|
||||
|
||||
def test_should_spawn_worker_at_limit(self):
|
||||
"""should_spawn_worker should return False when at per-type limit."""
|
||||
orchestrator = Orchestrator()
|
||||
|
||||
running_workers: list[dict[str, object]] = [{'worker_id': worker_id} for worker_id in range(orchestrator.MAX_CRAWL_WORKERS)]
|
||||
FakeWorker.running_workers = running_workers
|
||||
self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 10))
|
||||
|
||||
@patch.object(Orchestrator, 'get_total_worker_count')
|
||||
def test_should_spawn_worker_at_total_limit(self, mock_total):
|
||||
"""should_spawn_worker should return False when at total limit."""
|
||||
orchestrator = Orchestrator()
|
||||
mock_total.return_value = 0
|
||||
running_workers: list[dict[str, object]] = [{'worker_id': worker_id} for worker_id in range(orchestrator.MAX_CRAWL_WORKERS)]
|
||||
FakeWorker.running_workers = running_workers
|
||||
self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 10))
|
||||
|
||||
@patch.object(Orchestrator, 'get_total_worker_count')
|
||||
def test_should_spawn_worker_success(self, mock_total):
|
||||
"""should_spawn_worker should return True when conditions are met."""
|
||||
orchestrator = Orchestrator()
|
||||
mock_total.return_value = 0
|
||||
|
||||
FakeWorker.running_workers = []
|
||||
self.assertTrue(orchestrator.should_spawn_worker(FakeWorker, 10))
|
||||
|
||||
@patch.object(Orchestrator, 'get_total_worker_count')
|
||||
def test_should_spawn_worker_enough_workers(self, mock_total):
|
||||
"""should_spawn_worker should return False when enough workers for queue."""
|
||||
orchestrator = Orchestrator()
|
||||
mock_total.return_value = 2
|
||||
|
||||
FakeWorker.running_workers = [{}] # 1 worker running
|
||||
self.assertFalse(orchestrator.should_spawn_worker(FakeWorker, 3))
|
||||
|
||||
|
||||
class TestOrchestratorWithProcess(TestCase):
|
||||
"""Test Orchestrator using Process model for tracking."""
|
||||
|
||||
def setUp(self):
|
||||
"""Reset process cache."""
|
||||
import archivebox.machine.models as models
|
||||
models._CURRENT_MACHINE = None
|
||||
models._CURRENT_PROCESS = None
|
||||
|
||||
def test_is_running_no_orchestrator(self):
|
||||
"""is_running should return False when no orchestrator process exists."""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
# Clean up any stale processes first
|
||||
Process.cleanup_stale_running()
|
||||
|
||||
# Mark any running orchestrators as exited for clean test state
|
||||
Process.objects.filter(
|
||||
process_type=Process.TypeChoices.ORCHESTRATOR,
|
||||
status=Process.StatusChoices.RUNNING
|
||||
).update(status=Process.StatusChoices.EXITED)
|
||||
|
||||
self.assertFalse(Orchestrator.is_running())
|
||||
|
||||
def test_is_running_with_orchestrator_process(self):
|
||||
"""is_running should return True when orchestrator Process exists."""
|
||||
from archivebox.machine.models import Process, Machine
|
||||
import psutil
|
||||
|
||||
machine = Machine.current()
|
||||
current_proc = psutil.Process(os.getpid())
|
||||
|
||||
# Create an orchestrator Process record
|
||||
proc = Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.ORCHESTRATOR,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=os.getpid(), # Use current PID so it appears alive
|
||||
started_at=datetime.fromtimestamp(current_proc.create_time(), tz=timezone.get_current_timezone()),
|
||||
cmd=current_proc.cmdline(),
|
||||
)
|
||||
|
||||
try:
|
||||
# Should detect running orchestrator
|
||||
self.assertTrue(Orchestrator.is_running())
|
||||
finally:
|
||||
# Clean up
|
||||
proc.status = Process.StatusChoices.EXITED
|
||||
proc.save()
|
||||
|
||||
def test_orchestrator_uses_process_for_is_running(self):
|
||||
"""Orchestrator.is_running should use Process.get_running_count."""
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
# Verify is_running uses Process model, not pid files
|
||||
with patch.object(Process, 'get_running_count') as mock_count:
|
||||
mock_count.return_value = 1
|
||||
|
||||
result = Orchestrator.is_running()
|
||||
|
||||
# Should have called Process.get_running_count with orchestrator type
|
||||
mock_count.assert_called()
|
||||
self.assertTrue(result)
|
||||
|
||||
def test_orchestrator_scoped_worker_count(self):
|
||||
"""Orchestrator with crawl_id should count only descendant workers."""
|
||||
from archivebox.machine.models import Process, Machine
|
||||
|
||||
machine = Machine.current()
|
||||
orchestrator = Orchestrator(exit_on_idle=True, crawl_id='test-crawl')
|
||||
|
||||
orchestrator.db_process = Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.ORCHESTRATOR,
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=12345,
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
|
||||
# Prevent cleanup from marking fake PIDs as exited
|
||||
orchestrator._last_cleanup_time = time.time()
|
||||
|
||||
Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
worker_type='crawl',
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=12346,
|
||||
parent=orchestrator.db_process,
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
|
||||
Process.objects.create(
|
||||
machine=machine,
|
||||
process_type=Process.TypeChoices.WORKER,
|
||||
worker_type='crawl',
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
pid=12347,
|
||||
started_at=timezone.now(),
|
||||
)
|
||||
|
||||
self.assertEqual(orchestrator.get_total_worker_count(), 1)
|
||||
|
||||
|
||||
class TestProcessBasedWorkerTracking(TestCase):
    """Test Process model methods that replace pid_utils functionality.

    These tests exercise the DB-backed process registry (``machine_process``
    rows) that superseded the old pidfile-based tracking: creation/caching of
    the current process record, counting running workers, stale-row cleanup,
    and process-type detection from argv.
    """

    def setUp(self):
        """Reset module-level caches so each test resolves a fresh Machine/Process."""
        import archivebox.machine.models as models
        # These module-level singletons cache Machine.current()/Process.current()
        # results between calls; clear them so tests don't leak state.
        models._CURRENT_MACHINE = None
        models._CURRENT_PROCESS = None

    def test_process_current_creates_record(self):
        """Process.current() should create a Process record for current PID."""
        from archivebox.machine.models import Process

        proc = Process.current()

        self.assertIsNotNone(proc)
        # The record must describe *this* OS process, marked as running.
        self.assertEqual(proc.pid, os.getpid())
        self.assertEqual(proc.status, Process.StatusChoices.RUNNING)
        self.assertIsNotNone(proc.machine)
        self.assertIsNotNone(proc.started_at)

    def test_process_current_caches_result(self):
        """Process.current() should return cached Process within interval."""
        from archivebox.machine.models import Process

        proc1 = Process.current()
        proc2 = Process.current()

        # Same DB row both times -> the module-level cache was hit.
        self.assertEqual(proc1.id, proc2.id)

    def test_process_get_running_count(self):
        """Process.get_running_count should count running processes by type."""
        from archivebox.machine.models import Process, Machine

        machine = Machine.current()

        # Create some worker processes
        for i in range(3):
            Process.objects.create(
                machine=machine,
                process_type=Process.TypeChoices.WORKER,
                status=Process.StatusChoices.RUNNING,
                pid=99990 + i,  # Fake PIDs
                started_at=timezone.now(),
            )

        # >= rather than == because other tests / the current process may
        # also have running worker rows on this machine.
        count = Process.get_running_count(process_type=Process.TypeChoices.WORKER)
        self.assertGreaterEqual(count, 3)

    def test_process_get_next_worker_id(self):
        """Process.get_next_worker_id should return count of running workers."""
        from archivebox.machine.models import Process, Machine

        machine = Machine.current()

        # Create 2 worker processes
        for i in range(2):
            Process.objects.create(
                machine=machine,
                process_type=Process.TypeChoices.WORKER,
                status=Process.StatusChoices.RUNNING,
                pid=99980 + i,
                started_at=timezone.now(),
            )

        next_id = Process.get_next_worker_id(process_type=Process.TypeChoices.WORKER)
        self.assertGreaterEqual(next_id, 2)

    def test_process_cleanup_stale_running(self):
        """Process.cleanup_stale_running should mark stale processes as exited."""
        from archivebox.machine.models import Process, Machine, PID_REUSE_WINDOW

        machine = Machine.current()

        # Create a stale process (old started_at, fake PID)
        # started_at is pushed beyond PID_REUSE_WINDOW so cleanup treats the
        # row as unambiguously dead even if the PID were reused.
        stale_proc = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.WORKER,
            status=Process.StatusChoices.RUNNING,
            pid=999999,  # Fake PID that doesn't exist
            started_at=timezone.now() - PID_REUSE_WINDOW - timedelta(hours=1),
        )

        cleaned = Process.cleanup_stale_running()

        self.assertGreaterEqual(cleaned, 1)

        stale_proc.refresh_from_db()
        self.assertEqual(stale_proc.status, Process.StatusChoices.EXITED)

    def test_process_get_running(self):
        """Process.get_running should return queryset of running processes."""
        from archivebox.machine.models import Process, Machine

        machine = Machine.current()

        # Create a running process
        proc = Process.objects.create(
            machine=machine,
            process_type=Process.TypeChoices.HOOK,
            status=Process.StatusChoices.RUNNING,
            pid=99970,
            started_at=timezone.now(),
        )

        running = Process.get_running(process_type=Process.TypeChoices.HOOK)

        self.assertIn(proc, running)

    def test_process_type_detection(self):
        """Process._detect_process_type should detect process type from argv."""
        from archivebox.machine.models import Process

        # Test detection logic
        # Each case patches sys.argv to mimic a different entrypoint and
        # checks that detection maps it to the expected TypeChoices value.
        with patch('sys.argv', ['archivebox', 'manage', 'orchestrator']):
            result = Process._detect_process_type()
            self.assertEqual(result, Process.TypeChoices.ORCHESTRATOR)

        with patch('sys.argv', ['archivebox', 'add', 'http://example.com']):
            result = Process._detect_process_type()
            self.assertEqual(result, Process.TypeChoices.CLI)

        with patch('sys.argv', ['supervisord', '-c', 'config.ini']):
            result = Process._detect_process_type()
            self.assertEqual(result, Process.TypeChoices.SUPERVISORD)
||||
|
||||
class TestProcessLifecycle(TestCase):
    """Test Process model lifecycle methods.

    Covers liveness checking (``is_running``), exit detection (``poll``),
    termination of already-dead processes, and the parent/child tree
    relationships on Process rows.
    """

    def setUp(self):
        """Reset caches and create a machine."""
        import archivebox.machine.models as models
        # Clear cached singletons so Machine.current() resolves fresh state.
        models._CURRENT_MACHINE = None
        models._CURRENT_PROCESS = None
        self.machine = models.Machine.current()

    def test_process_is_running_property(self):
        """Process.is_running should check actual OS process."""
        from archivebox.machine.models import Process
        proc = Process.current()

        # Should be running (current process exists)
        self.assertTrue(proc.is_running)

        # Create a process with fake PID
        # PID 999999 is assumed not to exist on the test host.
        fake_proc = Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=999999,
            started_at=timezone.now(),
        )

        # Should not be running (PID doesn't exist)
        self.assertFalse(fake_proc.is_running)

    def test_process_poll(self):
        """Process.poll should check and update exit status."""
        from archivebox.machine.models import Process

        # Create a process with fake PID (already exited)
        proc = Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=999999,
            started_at=timezone.now(),
        )

        exit_code = proc.poll()

        # Should have detected exit and updated status
        self.assertIsNotNone(exit_code)
        proc.refresh_from_db()
        self.assertEqual(proc.status, Process.StatusChoices.EXITED)

    def test_process_terminate_already_dead(self):
        """Process.terminate should handle already-dead processes."""
        from archivebox.machine.models import Process

        # Create a process with fake PID
        proc = Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=999999,
            started_at=timezone.now(),
        )

        result = proc.terminate()

        # Should return False (was already dead)
        self.assertFalse(result)

        # Terminating a dead process should still reconcile the DB status.
        proc.refresh_from_db()
        self.assertEqual(proc.status, Process.StatusChoices.EXITED)

    def test_process_tree_traversal(self):
        """Process parent/children relationships should work."""
        from archivebox.machine.models import Process

        # Create parent process
        parent = Process.objects.create(
            machine=self.machine,
            process_type=Process.TypeChoices.CLI,
            status=Process.StatusChoices.RUNNING,
            pid=1,
            started_at=timezone.now(),
        )

        # Create child process
        child = Process.objects.create(
            machine=self.machine,
            parent=parent,
            process_type=Process.TypeChoices.WORKER,
            status=Process.StatusChoices.RUNNING,
            pid=2,
            started_at=timezone.now(),
        )

        # Test relationships
        # root walks up to the topmost ancestor; depth counts hops from root.
        self.assertEqual(child.parent, parent)
        self.assertIn(child, parent.children.all())
        self.assertEqual(child.root, parent)
        self.assertEqual(child.depth, 1)
        self.assertEqual(parent.depth, 0)
||||
|
||||
if __name__ == '__main__':
    # Allow running this test module directly (python path/to/file.py)
    # instead of only via a pytest invocation.
    pytest.main([__file__, '-v'])
|
||||
@@ -1,138 +0,0 @@
|
||||
import os
|
||||
import sqlite3
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def _find_snapshot_dir(data_dir: Path, snapshot_id: str) -> Path | None:
|
||||
candidates = {snapshot_id}
|
||||
if len(snapshot_id) == 32:
|
||||
hyphenated = f"{snapshot_id[:8]}-{snapshot_id[8:12]}-{snapshot_id[12:16]}-{snapshot_id[16:20]}-{snapshot_id[20:]}"
|
||||
candidates.add(hyphenated)
|
||||
elif len(snapshot_id) == 36 and '-' in snapshot_id:
|
||||
candidates.add(snapshot_id.replace('-', ''))
|
||||
|
||||
for needle in candidates:
|
||||
for path in data_dir.rglob(needle):
|
||||
if path.is_dir():
|
||||
return path
|
||||
return None
|
||||
|
||||
|
||||
def _find_html_with_text(root: Path, needle: str) -> list[Path]:
|
||||
hits: list[Path] = []
|
||||
for path in root.rglob("*.htm*"):
|
||||
if not path.is_file():
|
||||
continue
|
||||
try:
|
||||
if needle in path.read_text(errors="ignore"):
|
||||
hits.append(path)
|
||||
except Exception:
|
||||
continue
|
||||
return hits
|
||||
|
||||
|
||||
def test_add_real_world_example_domain(tmp_path):
    """End-to-end smoke test: `archivebox init` + `archivebox add https://example.com`.

    Runs the real CLI against a fresh data dir, then verifies via the
    index.sqlite3 DB and the on-disk snapshot outputs that the title, wget,
    singlefile, htmltotext, and headers extractors all produced results and
    that binary installs were handled by BinaryWorker rows.
    Requires network access and the archivebox CLI on PATH.
    """
    os.chdir(tmp_path)
    # Use a short /tmp path for scratch space: some tools (e.g. unix sockets)
    # break on long tmp_path prefixes.
    tmp_short = Path("/tmp") / f"abx-{tmp_path.name}"
    tmp_short.mkdir(parents=True, exist_ok=True)
    env = os.environ.copy()
    env["TMP_DIR"] = str(tmp_short)
    env["ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS"] = "true"
    # Enable only the fast, deterministic extractors; disable the heavy ones.
    env["SAVE_TITLE"] = "True"
    env["SAVE_WGET"] = "True"
    env["SAVE_SINGLEFILE"] = "True"
    env["SAVE_READABILITY"] = "False"
    env["SAVE_HTMLTOTEXT"] = "True"
    env["SAVE_HEADERS"] = "True"
    env["SAVE_PDF"] = "False"
    env["SAVE_SCREENSHOT"] = "False"
    env["SAVE_ARCHIVEDOTORG"] = "False"
    env["SAVE_YTDLP"] = "False"
    env["SAVE_GIT"] = "False"

    # Initialize a fresh collection in tmp_path (cwd).
    init = subprocess.run(
        ["archivebox", "init"],
        capture_output=True,
        text=True,
        timeout=120,
        env=env,
    )
    assert init.returncode == 0, f"archivebox init failed: {init.stderr}"

    # Archive a real page; generous timeout since this hits the network
    # and may install extractor binaries on first run.
    result = subprocess.run(
        ["archivebox", "add", "--plugins=title,wget,singlefile,htmltotext,headers", "https://example.com"],
        capture_output=True,
        text=True,
        timeout=900,
        env=env,
    )
    assert result.returncode == 0, (
        "archivebox add failed.\n"
        f"stdout:\n{result.stdout}\n"
        f"stderr:\n{result.stderr}"
    )

    # --- DB-level assertions against the collection index ---
    conn = sqlite3.connect(tmp_path / "index.sqlite3")
    c = conn.cursor()
    snapshot_row = c.execute(
        "SELECT id, url, title FROM core_snapshot WHERE url = ?",
        ("https://example.com",),
    ).fetchone()
    assert snapshot_row is not None, "Snapshot for https://example.com not found in DB"
    snapshot_id, snapshot_url, snapshot_title = snapshot_row
    assert snapshot_title and "Example Domain" in snapshot_title, (
        f"Expected title to contain Example Domain, got: {snapshot_title}"
    )

    # No extractor run for this snapshot may have ended in 'failed'.
    failed_results = c.execute(
        "SELECT COUNT(*) FROM core_archiveresult WHERE snapshot_id = ? AND status = 'failed'",
        (snapshot_id,),
    ).fetchone()[0]
    assert failed_results == 0, "Some archive results failed for example.com snapshot"

    # Binary installs must have gone through BinaryWorker process rows.
    binary_workers = c.execute(
        "SELECT COUNT(*) FROM machine_process WHERE process_type = 'worker' AND worker_type = 'binary'"
    ).fetchone()[0]
    assert binary_workers > 0, "Expected BinaryWorker to run installs via BinaryMachine"

    failed_binary_workers = c.execute(
        "SELECT COUNT(*) FROM machine_process WHERE process_type = 'worker' AND worker_type = 'binary' "
        "AND exit_code IS NOT NULL AND exit_code != 0"
    ).fetchone()[0]
    assert failed_binary_workers == 0, "BinaryWorker reported non-zero exit codes"

    # Every declared binary should have reached 'installed' status.
    queued_binaries = c.execute(
        "SELECT name FROM machine_binary WHERE status != 'installed'"
    ).fetchall()
    assert not queued_binaries, f"Some binaries did not install: {queued_binaries}"
    conn.close()

    # --- Filesystem-level assertions against the snapshot output dir ---
    snapshot_dir = _find_snapshot_dir(tmp_path, str(snapshot_id))
    assert snapshot_dir is not None, "Snapshot output directory not found"

    title_path = snapshot_dir / "title" / "title.txt"
    assert title_path.exists(), f"Missing title output: {title_path}"
    assert "Example Domain" in title_path.read_text(errors="ignore")

    # Extractor output dirs may be plain ("wget") or prefixed ("NN_wget"),
    # so check both layouts for each HTML-producing extractor.
    html_sources = []
    for candidate in ("wget", "singlefile", "dom"):
        for candidate_dir in (snapshot_dir / candidate, *snapshot_dir.glob(f"*_{candidate}")):
            if candidate_dir.exists():
                html_sources.extend(_find_html_with_text(candidate_dir, "Example Domain"))
    assert len(html_sources) >= 2, (
        "Expected HTML outputs from multiple extractors to contain Example Domain "
        f"(found {len(html_sources)})."
    )

    # htmltotext output likewise may live in either layout.
    text_hits = 0
    for path in (
        *snapshot_dir.glob("*_htmltotext/htmltotext.txt"),
        snapshot_dir / "htmltotext" / "htmltotext.txt",
    ):
        if path.exists() and "Example Domain" in path.read_text(errors="ignore"):
            text_hits += 1
    assert text_hits >= 1, (
        "Expected htmltotext output to contain Example Domain "
        f"(htmltotext hits={text_hits})."
    )
|
||||
@@ -1,84 +0,0 @@
|
||||
from datetime import timedelta
|
||||
from typing import cast
|
||||
from unittest.mock import patch
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.contrib.auth.models import UserManager
|
||||
from django.test import TestCase
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.crawls.models import Crawl, CrawlSchedule
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
from archivebox.workers.worker import CrawlWorker
|
||||
|
||||
|
||||
class TestScheduledCrawlMaterialization(TestCase):
    """Tests for how the Orchestrator turns due CrawlSchedules into Crawl rows.

    Only the long-running "global" orchestrator (exit_on_idle=False, no
    crawl_id) should materialize due schedules; one-shot or crawl-scoped
    orchestrators must leave them alone.
    """

    def setUp(self):
        """Create the user that owns the template crawl and its schedule."""
        # cast() narrows the manager type for the type checker only.
        user_manager = cast(UserManager, get_user_model().objects)
        self.user = user_manager.create_user(
            username='schedule-user',
            password='password',
        )

    def _create_due_schedule(self) -> CrawlSchedule:
        """Build a daily CrawlSchedule whose template crawl is already overdue.

        The template's created_at/modified_at are backdated two days via a
        queryset update (bypassing auto_now) so the schedule counts as due.
        """
        template = Crawl.objects.create(
            urls='https://example.com/feed.xml',
            max_depth=1,
            tags_str='scheduled',
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
        )
        schedule = CrawlSchedule.objects.create(
            template=template,
            schedule='daily',
            is_enabled=True,
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
        )
        past = timezone.now() - timedelta(days=2)
        # .update() bypasses save()/auto timestamps to force the backdate.
        Crawl.objects.filter(pk=template.pk).update(created_at=past, modified_at=past)
        template.refresh_from_db()
        schedule.refresh_from_db()
        return schedule

    def test_global_orchestrator_materializes_due_schedule(self):
        """A global orchestrator should clone the template into a QUEUED crawl."""
        schedule = self._create_due_schedule()

        orchestrator = Orchestrator(exit_on_idle=False)
        orchestrator._materialize_due_schedules()

        # Count of 2 = the original template crawl plus the new materialized
        # one (both are linked to this schedule).
        scheduled_crawls = Crawl.objects.filter(schedule=schedule).order_by('created_at')
        self.assertEqual(scheduled_crawls.count(), 2)

        queued_crawl = scheduled_crawls.last()
        self.assertIsNotNone(queued_crawl)
        assert queued_crawl is not None  # narrows Optional for the type checker
        # The clone must copy the template's settings and start QUEUED.
        self.assertEqual(queued_crawl.status, Crawl.StatusChoices.QUEUED)
        self.assertEqual(queued_crawl.urls, 'https://example.com/feed.xml')
        self.assertEqual(queued_crawl.max_depth, 1)
        self.assertEqual(queued_crawl.tags_str, 'scheduled')

    def test_one_shot_orchestrator_does_not_materialize_due_schedule(self):
        """One-shot and crawl-scoped orchestrators must not touch schedules."""
        schedule = self._create_due_schedule()

        # exit_on_idle=True -> one-shot run: no materialization.
        Orchestrator(exit_on_idle=True)._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)

        # Crawl-scoped orchestrator (crawl_id set): also no materialization.
        Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template.id))._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)

    @patch.object(CrawlWorker, 'start')
    def test_global_orchestrator_waits_one_tick_before_spawning_materialized_schedule(self, mock_start):
        """Freshly materialized crawls are queued but not spawned in the same tick."""
        schedule = self._create_due_schedule()

        orchestrator = Orchestrator(exit_on_idle=False)
        # Pretend claiming succeeds so only the spawn decision is under test.
        with patch.object(orchestrator, '_claim_crawl', return_value=True):
            queue_sizes = orchestrator.check_queues_and_spawn_workers()

        # The new crawl shows up in the queue and in the DB...
        self.assertEqual(queue_sizes['crawl'], 1)
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 2)
        # ...but no CrawlWorker is started until the next tick.
        mock_start.assert_not_called()
|
||||
@@ -1,76 +0,0 @@
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from typing import Any, cast
|
||||
from unittest.mock import patch
|
||||
|
||||
from django.test import SimpleTestCase
|
||||
|
||||
from archivebox.workers.worker import SnapshotWorker
|
||||
|
||||
|
||||
class TestSnapshotWorkerRetryForegroundHooks(SimpleTestCase):
    """Unit tests for SnapshotWorker._retry_failed_empty_foreground_hooks.

    Uses a bare SnapshotWorker built via __new__ (no DB, no __init__ side
    effects) with all collaborators stubbed out, so only the retry-decision
    logic is exercised.
    """

    def _make_worker(self):
        """Build a minimal SnapshotWorker with every collaborator stubbed."""
        # __new__ skips __init__ entirely (no DB/process setup).
        worker = SnapshotWorker.__new__(SnapshotWorker)
        worker.pid = 12345
        # cast(Any, ...) silences the type checker while we attach a fake
        # snapshot object in place of the real model instance.
        cast(Any, worker).snapshot = SimpleNamespace(
            status='started',
            refresh_from_db=lambda: None,
        )
        # Disable timeout handling so the retry path runs unconditionally.
        worker._snapshot_exceeded_hard_timeout = lambda: False
        worker._seal_snapshot_due_to_timeout = lambda: None
        # Default no-op hook runner/waiter; tests override these as needed.
        worker._run_hook = lambda *args, **kwargs: SimpleNamespace()
        worker._wait_for_hook = lambda process, ar: None
        return worker

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_skips_successful_hook_with_only_inline_output(self, mock_log):
        """A succeeded hook with inline (output_str) output must not be retried."""
        worker = self._make_worker()
        # Succeeded with no output files but a non-empty output_str: this
        # counts as real output, so no retry should happen.
        archive_result = SimpleNamespace(
            status='succeeded',
            output_files={},
            output_str='scrolled 600px',
            output_json=None,
            refresh_from_db=lambda: None,
        )

        worker._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__45_infiniscroll.js'), archive_result)],
            config={},
        )

        # No retry -> no worker event logged.
        mock_log.assert_not_called()

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_replays_failed_hook_with_no_outputs(self, mock_log):
        """A failed hook with no output at all must be re-run exactly once."""
        worker = self._make_worker()
        run_calls = []
        wait_calls = []

        def run_hook(*args, **kwargs):
            # Record each replay attempt.
            run_calls.append((args, kwargs))
            return SimpleNamespace()

        def wait_for_hook(process, ar):
            # Simulate the replayed hook succeeding with a real output file.
            wait_calls.append((process, ar))
            ar.status = 'succeeded'
            ar.output_files = {'singlefile.html': {}}

        # Failed with neither output files nor inline output: retry candidate.
        archive_result = SimpleNamespace(
            status='failed',
            output_files={},
            output_str='',
            output_json=None,
            refresh_from_db=lambda: None,
        )

        worker._run_hook = run_hook
        worker._wait_for_hook = wait_for_hook

        worker._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__50_singlefile.py'), archive_result)],
            config={},
        )

        # Exactly one replay, and the retry was logged.
        assert len(run_calls) == 1
        assert len(wait_calls) == 1
        mock_log.assert_called_once()
|
||||
@@ -1,143 +0,0 @@
|
||||
import threading
|
||||
import time
|
||||
|
||||
import pytest
|
||||
from django.db import close_old_connections
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
from archivebox.workers.worker import BinaryWorker
|
||||
|
||||
|
||||
def get_fresh_machine() -> Machine:
    """Drop the module-level machine caches and return a freshly resolved Machine.

    Ensures each test observes current DB state rather than a Machine/Binary
    singleton cached by an earlier test.
    """
    from archivebox.machine import models as machine_models

    # Invalidate the cached singletons so Machine.current() re-resolves.
    machine_models._CURRENT_MACHINE = None
    machine_models._CURRENT_BINARIES.clear()
    return Machine.current()
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_claim_processing_lock_does_not_steal_future_retry_at():
    """
    retry_at is both the schedule and the ownership lock.

    Once one process claims a due row and moves retry_at into the future, a
    fresh reader must not be able to "re-claim" that future timestamp and run
    the same side effects a second time.
    """
    machine = get_fresh_machine()
    queued_binary = Binary.objects.create(
        machine=machine,
        name='claim-test',
        binproviders='env',
        status=Binary.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )

    # Two independent views of the same row, as two processes would have.
    first_reader = Binary.objects.get(pk=queued_binary.pk)
    second_reader = Binary.objects.get(pk=queued_binary.pk)

    # The first reader wins the lock...
    assert first_reader.claim_processing_lock(lock_seconds=30) is True

    # ...which pushed retry_at into the future, so the second reader sees
    # the new timestamp and must fail to claim it.
    second_reader.refresh_from_db()
    assert second_reader.retry_at > timezone.now()
    assert second_reader.claim_processing_lock(lock_seconds=30) is False
|
||||
|
||||
|
||||
@pytest.mark.django_db
def test_binary_worker_skips_binary_claimed_by_other_owner(monkeypatch):
    """
    BinaryWorker must never run install side effects for a Binary whose retry_at
    lock has already been claimed by another process.
    """
    machine = get_fresh_machine()
    queued_binary = Binary.objects.create(
        machine=machine,
        name='claimed-binary',
        binproviders='env',
        status=Binary.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )

    # Simulate another process already holding the retry_at lock on this row.
    lock_holder = Binary.objects.get(pk=queued_binary.pk)
    assert lock_holder.claim_processing_lock(lock_seconds=30) is True

    install_calls: list[str] = []

    def fake_run(self):
        # Record the install attempt and pretend it succeeded.
        install_calls.append(self.name)
        self.status = self.StatusChoices.INSTALLED
        self.abspath = '/tmp/fake-binary'
        self.version = '1.0'
        self.save(update_fields=['status', 'abspath', 'version', 'modified_at'])

    monkeypatch.setattr(Binary, 'run', fake_run)

    worker = BinaryWorker(binary_id=str(queued_binary.id))
    worker._process_single_binary()

    # The worker must have skipped the locked row without installing anything.
    assert install_calls == []
|
||||
|
||||
|
||||
@pytest.mark.django_db(transaction=True)
def test_crawl_install_declared_binaries_waits_for_existing_owner(monkeypatch):
    """
    Crawl.install_declared_binaries should wait for the current owner of a Binary
    to finish instead of launching a duplicate install against shared provider
    state such as the npm tree.
    """
    # transaction=True is required: the background thread commits on a separate
    # DB connection, which the main thread must be able to observe.
    machine = get_fresh_machine()
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )
    binary = Binary.objects.create(
        machine=machine,
        name='puppeteer',
        binproviders='npm',
        status=Binary.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )

    # Simulate another process owning the install: claim the retry_at lock
    # through a separate model instance of the same row.
    owner = Binary.objects.get(pk=binary.pk)
    assert owner.claim_processing_lock(lock_seconds=30) is True

    calls: list[str] = []

    def fake_run(self):
        # If this ever runs, install_declared_binaries launched a duplicate
        # install instead of waiting — the abspath marker proves it.
        calls.append(self.name)
        self.status = self.StatusChoices.INSTALLED
        self.abspath = '/tmp/should-not-run'
        self.version = '1.0'
        self.save(update_fields=['status', 'abspath', 'version', 'modified_at'])

    monkeypatch.setattr(Binary, 'run', fake_run)

    def finish_existing_install():
        # Runs in a background thread: after a short delay, the "owner"
        # finishes the install and releases the lock (retry_at=None).
        close_old_connections()
        try:
            time.sleep(0.3)
            Binary.objects.filter(pk=binary.pk).update(
                status=Binary.StatusChoices.INSTALLED,
                retry_at=None,
                abspath='/tmp/finished-by-owner',
                version='1.0',
                modified_at=timezone.now(),
            )
        finally:
            # Always drop this thread's DB connection so it doesn't leak.
            close_old_connections()

    thread = threading.Thread(target=finish_existing_install, daemon=True)
    thread.start()
    # This call should block/poll until the owner thread marks the binary
    # installed, rather than running fake_run itself.
    crawl.install_declared_binaries({'puppeteer'}, machine=machine)
    thread.join(timeout=5)

    binary.refresh_from_db()
    # The owner's result won; no duplicate install was launched.
    assert binary.status == Binary.StatusChoices.INSTALLED
    assert binary.abspath == '/tmp/finished-by-owner'
    assert calls == []
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user