Files
ArchiveBox/archivebox/tests/test_machine_models.py

825 lines
28 KiB
Python

"""
Unit tests for machine module models: Machine, NetworkInterface, Binary, Process.
Tests cover:
1. Machine model creation and current() method
2. NetworkInterface model and network detection
3. Binary model lifecycle and state machine
4. Process model lifecycle, hierarchy, and state machine
5. JSONL serialization/deserialization
6. Manager methods
7. Process tracking methods (replacing pid_utils)
"""
import os
from datetime import timedelta
from typing import cast
from unittest.mock import Mock, patch
import pytest
from django.test import TestCase
from django.utils import timezone
from archivebox.machine.models import (
BinaryManager,
Machine,
NetworkInterface,
Binary,
Process,
BinaryMachine,
ProcessMachine,
MACHINE_RECHECK_INTERVAL,
PID_REUSE_WINDOW,
PROCESS_TIMEOUT_GRACE,
)
class TestMachineModel(TestCase):
    """Test the Machine model: ``Machine.current()`` and ``Machine.from_json()``."""

    def setUp(self):
        """Reset the module-level cached machine so each test starts cold."""
        import archivebox.machine.models as models

        models._CURRENT_MACHINE = None

    def test_machine_current_creates_machine(self):
        """Machine.current() should create a machine if none exists."""
        # os.uname() is Unix-only, yet "windows" is an accepted os_family
        # below. platform.node() reports the same node name wherever
        # os.uname() is available and also works on other platforms.
        import platform

        machine = Machine.current()
        self.assertIsNotNone(machine)
        self.assertIsNotNone(machine.id)
        self.assertIsNotNone(machine.guid)
        self.assertEqual(machine.hostname, platform.node())
        self.assertIn(machine.os_family, ["linux", "darwin", "windows", "freebsd"])

    def test_machine_current_returns_cached(self):
        """Two back-to-back Machine.current() calls should yield the same machine id."""
        machine1 = Machine.current()
        machine2 = Machine.current()
        self.assertEqual(machine1.id, machine2.id)

    def test_machine_current_refreshes_after_interval(self):
        """Machine.current() should resolve to the same GUID after the cached row is backdated."""
        import archivebox.machine.models as models

        machine1 = Machine.current()
        # Backdate modified_at beyond MACHINE_RECHECK_INTERVAL, then re-seed
        # the module cache with that instance so the next call sees it.
        machine1.modified_at = timezone.now() - timedelta(seconds=MACHINE_RECHECK_INTERVAL + 1)
        machine1.save()
        models._CURRENT_MACHINE = machine1
        machine2 = Machine.current()
        # Only the GUID is compared: the machine identity must be stable.
        self.assertEqual(machine1.guid, machine2.guid)

    def test_machine_from_jsonl_update(self):
        """Machine.from_json() should apply the record's config to the machine."""
        Machine.current()  # Ensure machine exists
        record = {
            "config": {
                "WGET_BINARY": "/usr/bin/wget",
            },
        }
        result = Machine.from_json(record)
        self.assertIsNotNone(result)
        assert result is not None  # narrows the Optional type for type checkers
        self.assertEqual(result.config.get("WGET_BINARY"), "/usr/bin/wget")

    def test_machine_from_jsonl_strips_legacy_chromium_version(self):
        """Machine.from_json() should drop the legacy CHROMIUM_VERSION key but keep other keys."""
        Machine.current()  # Ensure machine exists
        record = {
            "config": {
                "WGET_BINARY": "/usr/bin/wget",
                "CHROMIUM_VERSION": "123.4.5",
            },
        }
        result = Machine.from_json(record)
        self.assertIsNotNone(result)
        assert result is not None  # narrows the Optional type for type checkers
        self.assertEqual(result.config.get("WGET_BINARY"), "/usr/bin/wget")
        self.assertNotIn("CHROMIUM_VERSION", result.config)

    def test_machine_from_jsonl_invalid(self):
        """Machine.from_json() should return None for a record without usable keys."""
        result = Machine.from_json({"invalid": "record"})
        self.assertIsNone(result)

    def test_machine_current_strips_legacy_chromium_version(self):
        """Machine.current() should remove CHROMIUM_VERSION from a persisted config."""
        import archivebox.machine.models as models

        machine = Machine.current()
        machine.config = {
            "CHROME_BINARY": "/tmp/chromium",
            "CHROMIUM_VERSION": "123.4.5",
        }
        machine.save(update_fields=["config"])
        # Re-seed the cache with the instance that carries the legacy key.
        models._CURRENT_MACHINE = machine
        refreshed = Machine.current()
        self.assertEqual(refreshed.config.get("CHROME_BINARY"), "/tmp/chromium")
        self.assertNotIn("CHROMIUM_VERSION", refreshed.config)

    def test_machine_manager_current(self):
        """Repeated Machine.current() calls should resolve to the same machine id.

        NOTE(review): the test name mentions the manager, but only the
        ``Machine.current()`` classmethod is exercised here.
        """
        machine = Machine.current()
        self.assertIsNotNone(machine)
        self.assertEqual(machine.id, Machine.current().id)
class TestNetworkInterfaceModel(TestCase):
    """Tests for NetworkInterface.current(): creation, caching and refresh."""

    def setUp(self):
        """Drop the cached machine and interface so every test starts cold."""
        import archivebox.machine.models as machine_models

        machine_models._CURRENT_MACHINE = None
        machine_models._CURRENT_INTERFACE = None

    def test_networkinterface_current_creates_interface(self):
        """A first NetworkInterface.current() call returns a saved, populated row."""
        iface = NetworkInterface.current()
        self.assertIsNotNone(iface)
        self.assertIsNotNone(iface.id)
        self.assertIsNotNone(iface.machine)
        self.assertIsNotNone(iface.ip_local)

    def test_networkinterface_current_returns_cached(self):
        """Consecutive NetworkInterface.current() calls share one row id."""
        first_call = NetworkInterface.current()
        second_call = NetworkInterface.current()
        self.assertEqual(first_call.id, second_call.id)

    def test_networkinterface_manager_current(self):
        """NetworkInterface.current() returns a non-None interface."""
        iface = NetworkInterface.current()
        self.assertIsNotNone(iface)

    def test_networkinterface_current_refresh_creates_new_interface_when_properties_change(self):
        """A refresh that sees different IPs stores a second row for the same machine."""
        import archivebox.machine.models as machine_models

        before = {
            "mac_address": "aa:bb:cc:dd:ee:01",
            "ip_public": "1.1.1.1",
            "ip_local": "192.168.1.10",
            "dns_server": "8.8.8.8",
            "hostname": "host-a",
            "iface": "en0",
            "isp": "ISP A",
            "city": "City",
            "region": "Region",
            "country": "Country",
        }
        # Same host details, but with both IP addresses changed.
        after = dict(before, ip_public="2.2.2.2", ip_local="10.0.0.5")

        # Each refresh consumes the next canned network snapshot.
        with patch.object(machine_models, "get_host_network", side_effect=[before, after]):
            iface_a = NetworkInterface.current(refresh=True)
            iface_b = NetworkInterface.current(refresh=True)

        self.assertNotEqual(iface_a.id, iface_b.id)
        self.assertEqual(iface_a.machine_id, iface_b.machine_id)
        rows_for_machine = NetworkInterface.objects.filter(machine=iface_a.machine)
        self.assertEqual(rows_for_machine.count(), 2)
class TestBinaryModel(TestCase):
    """Tests for Binary rows: defaults, validity, manager lookup and from_json()."""

    def setUp(self):
        """Clear the machine/binary caches and fetch the current machine."""
        import archivebox.machine.models as machine_models

        machine_models._CURRENT_MACHINE = None
        machine_models._CURRENT_BINARIES = {}
        self.machine = Machine.current()

    def test_binary_creation(self):
        """A freshly created Binary is QUEUED and not yet valid."""
        wget = Binary.objects.create(
            machine=self.machine,
            name="wget",
            binproviders="apt,brew,env",
        )
        self.assertIsNotNone(wget.id)
        self.assertEqual(wget.name, "wget")
        self.assertEqual(wget.status, Binary.StatusChoices.QUEUED)
        self.assertFalse(wget.is_valid)

    def test_binary_is_valid(self):
        """An INSTALLED Binary with abspath and version reports is_valid."""
        installed = Binary.objects.create(
            machine=self.machine,
            name="wget",
            abspath="/usr/bin/wget",
            version="1.21",
            status=Binary.StatusChoices.INSTALLED,
        )
        self.assertTrue(installed.is_valid)

    def test_binary_manager_get_valid_binary(self):
        """get_valid_binary() returns the installed row, not the unresolved one."""
        # Row without an abspath, created first.
        Binary.objects.create(machine=self.machine, name="wget")
        # Row with a resolved path, created second.
        Binary.objects.create(
            machine=self.machine,
            name="wget",
            abspath="/usr/bin/wget",
            version="1.21",
            status=Binary.StatusChoices.INSTALLED,
        )
        manager = cast(BinaryManager, Binary.objects)
        found = manager.get_valid_binary("wget")
        self.assertIsNotNone(found)
        assert found is not None
        self.assertEqual(found.abspath, "/usr/bin/wget")

    def test_binary_update_and_requeue(self):
        """update_and_requeue() persists the status and advances modified_at."""
        binary = Binary.objects.create(machine=self.machine, name="test")
        modified_before = binary.modified_at
        next_retry = timezone.now() + timedelta(seconds=60)
        binary.update_and_requeue(
            status=Binary.StatusChoices.QUEUED,
            retry_at=next_retry,
        )
        binary.refresh_from_db()
        self.assertEqual(binary.status, Binary.StatusChoices.QUEUED)
        self.assertGreater(binary.modified_at, modified_before)

    def test_binary_from_json_preserves_install_args_overrides(self):
        """from_json() stores dict-shaped install_args overrides unchanged."""
        provider_overrides = {
            "apt": {"install_args": ["chromium"]},
            "npm": {"install_args": "puppeteer"},
            "custom": {"install_args": ["bash", "-lc", "echo ok"]},
        }
        record = {
            "name": "chrome",
            "binproviders": "apt,npm,custom",
            "overrides": provider_overrides,
        }
        chrome = Binary.from_json(record)
        self.assertIsNotNone(chrome)
        assert chrome is not None
        self.assertEqual(chrome.overrides, provider_overrides)

    def test_binary_from_json_does_not_coerce_legacy_override_shapes(self):
        """from_json() stores list/str provider overrides without translating them."""
        legacy_overrides = {
            "apt": ["chromium"],
            "npm": "puppeteer",
        }
        record = {
            "name": "chrome",
            "binproviders": "apt,npm",
            "overrides": legacy_overrides,
        }
        chrome = Binary.from_json(record)
        self.assertIsNotNone(chrome)
        assert chrome is not None
        self.assertEqual(chrome.overrides, legacy_overrides)

    def test_binary_from_json_prefers_published_readability_package(self):
        """from_json() swaps readability's npm git URL for the package name."""
        record = {
            "name": "readability-extractor",
            "binproviders": "env,npm",
            "overrides": {
                "npm": {
                    "install_args": ["https://github.com/ArchiveBox/readability-extractor"],
                },
            },
        }
        expected_overrides = {
            "npm": {
                "install_args": ["readability-extractor"],
            },
        }
        readability = Binary.from_json(record)
        self.assertIsNotNone(readability)
        assert readability is not None
        self.assertEqual(readability.overrides, expected_overrides)
class TestBinaryStateMachine(TestCase):
    """Tests for BinaryMachine, the state machine wrapping a Binary row."""

    def setUp(self):
        """Reset the machine cache and create one Binary with the ``env`` provider."""
        import archivebox.machine.models as machine_models

        machine_models._CURRENT_MACHINE = None
        self.machine = Machine.current()
        self.binary = Binary.objects.create(
            machine=self.machine,
            name="test-binary",
            binproviders="env",
        )

    def test_binary_state_machine_initial_state(self):
        """A BinaryMachine built on a fresh Binary reports the QUEUED state."""
        state_machine = BinaryMachine(self.binary)
        self.assertEqual(state_machine.current_state_value, Binary.StatusChoices.QUEUED)

    def test_binary_state_machine_can_start(self):
        """can_install() is True with binproviders set and False once they are emptied."""
        self.assertTrue(BinaryMachine(self.binary).can_install())

        # Blank out the providers and rebuild the state machine on the saved row.
        self.binary.binproviders = ""
        self.binary.save()
        self.assertFalse(BinaryMachine(self.binary).can_install())
class TestProcessModel(TestCase):
    """Tests for basic Process rows: defaults, serialization and requeueing."""

    def setUp(self):
        """Clear the machine/process caches and fetch the current machine."""
        import archivebox.machine.models as machine_models

        machine_models._CURRENT_MACHINE = None
        machine_models._CURRENT_PROCESS = None
        self.machine = Machine.current()

    def test_process_creation(self):
        """A new Process is QUEUED and has neither a pid nor an exit code."""
        echo = Process.objects.create(
            machine=self.machine,
            cmd=["echo", "hello"],
            pwd="/tmp",
        )
        self.assertIsNotNone(echo.id)
        self.assertEqual(echo.cmd, ["echo", "hello"])
        self.assertEqual(echo.status, Process.StatusChoices.QUEUED)
        self.assertIsNone(echo.pid)
        self.assertIsNone(echo.exit_code)

    def test_process_to_jsonl(self):
        """to_json() emits the type tag plus cmd, pwd and timeout."""
        echo = Process.objects.create(
            machine=self.machine,
            cmd=["echo", "hello"],
            pwd="/tmp",
            timeout=60,
        )
        serialized = echo.to_json()
        self.assertEqual(serialized["type"], "Process")
        self.assertEqual(serialized["cmd"], ["echo", "hello"])
        self.assertEqual(serialized["pwd"], "/tmp")
        self.assertEqual(serialized["timeout"], 60)

    def test_process_update_and_requeue(self):
        """update_and_requeue() persists status, pid and started_at."""
        row = Process.objects.create(machine=self.machine, cmd=["test"])
        row.update_and_requeue(
            status=Process.StatusChoices.RUNNING,
            pid=12345,
            started_at=timezone.now(),
        )
        # Re-read from the database so the assertions check persisted values.
        row.refresh_from_db()
        self.assertEqual(row.status, Process.StatusChoices.RUNNING)
        self.assertEqual(row.pid, 12345)
        self.assertIsNotNone(row.started_at)
class TestProcessCurrent(TestCase):
    """Tests for Process.current(), process-type detection and Process.proc."""

    def setUp(self):
        """Clear the cached machine and process."""
        import archivebox.machine.models as machine_models

        machine_models._CURRENT_MACHINE = None
        machine_models._CURRENT_PROCESS = None

    def test_process_current_creates_record(self):
        """Process.current() returns a RUNNING row for this interpreter's PID."""
        current = Process.current()
        self.assertIsNotNone(current)
        self.assertEqual(current.pid, os.getpid())
        self.assertEqual(current.status, Process.StatusChoices.RUNNING)
        self.assertIsNotNone(current.machine)
        self.assertIsNotNone(current.iface)
        # The interface must belong to the same machine as the process.
        self.assertEqual(current.iface.machine_id, current.machine_id)
        self.assertIsNotNone(current.started_at)

    def test_process_current_caches(self):
        """Consecutive Process.current() calls share one row id."""
        first_call = Process.current()
        second_call = Process.current()
        self.assertEqual(first_call.id, second_call.id)

    def test_process_detect_type_runner(self):
        """``archivebox run --daemon`` is classified as ORCHESTRATOR."""
        argv = ["archivebox", "run", "--daemon"]
        with patch("sys.argv", argv):
            detected = Process._detect_process_type()
            self.assertEqual(detected, Process.TypeChoices.ORCHESTRATOR)

    def test_process_detect_type_runner_watch(self):
        """``archivebox manage runner_watch`` is classified as WORKER."""
        argv = ["archivebox", "manage", "runner_watch", "--pidfile=/tmp/runserver.pid"]
        with patch("sys.argv", argv):
            detected = Process._detect_process_type()
            self.assertEqual(detected, Process.TypeChoices.WORKER)

    def test_process_detect_type_cli(self):
        """``archivebox add <url>`` is classified as CLI."""
        argv = ["archivebox", "add", "http://example.com"]
        with patch("sys.argv", argv):
            detected = Process._detect_process_type()
            self.assertEqual(detected, Process.TypeChoices.CLI)

    def test_process_detect_type_binary(self):
        """An argv that does not start with archivebox is classified as BINARY."""
        argv = ["/usr/bin/wget", "https://example.com"]
        with patch("sys.argv", argv):
            detected = Process._detect_process_type()
            self.assertEqual(detected, Process.TypeChoices.BINARY)

    def test_process_proc_allows_interpreter_wrapped_script(self):
        """Process.proc accepts an OS cmdline that prefixes the stored cmd with ``node``."""
        row = Process.objects.create(
            machine=Machine.current(),
            cmd=["/tmp/on_CrawlSetup__90_chrome_launch.daemon.bg.js", "--url=https://example.com/"],
            pid=12345,
            status=Process.StatusChoices.RUNNING,
            started_at=timezone.now(),
        )

        # Stand-in for psutil.Process: same start time as the row, and a
        # cmdline with the interpreter in front of the recorded script.
        fake_os_proc = Mock()
        fake_os_proc.create_time.return_value = row.started_at.timestamp()
        fake_os_proc.cmdline.return_value = [
            "node",
            "/tmp/on_CrawlSetup__90_chrome_launch.daemon.bg.js",
            "--url=https://example.com/",
        ]

        with patch("archivebox.machine.models.psutil.Process", return_value=fake_os_proc):
            self.assertIs(row.proc, fake_os_proc)
class TestProcessHierarchy(TestCase):
    """Tests for Process parent/child links, ``root`` and ``depth``."""

    def setUp(self):
        """Reset the machine cache and fetch the current machine."""
        import archivebox.machine.models as machine_models

        machine_models._CURRENT_MACHINE = None
        self.machine = Machine.current()

    def _running(self, **fields):
        """Create a RUNNING Process on the test machine that started now, plus ``fields``."""
        return Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            started_at=timezone.now(),
            **fields,
        )

    def test_process_parent_child(self):
        """A child points at its parent and appears in ``parent.children``."""
        cli_proc = self._running(process_type=Process.TypeChoices.CLI, pid=1)
        worker_proc = self._running(
            parent=cli_proc,
            process_type=Process.TypeChoices.WORKER,
            pid=2,
        )
        self.assertEqual(worker_proc.parent, cli_proc)
        self.assertIn(worker_proc, cli_proc.children.all())

    def test_process_root(self):
        """``root`` resolves to the top ancestor from every level of the tree."""
        top = self._running(process_type=Process.TypeChoices.CLI)
        middle = self._running(parent=top)
        leaf = self._running(parent=middle)
        self.assertEqual(leaf.root, top)
        self.assertEqual(middle.root, top)
        self.assertEqual(top.root, top)

    def test_process_depth(self):
        """``depth`` is 0 for a parentless process and 1 for its direct child."""
        top = self._running()
        direct_child = self._running(parent=top)
        self.assertEqual(top.depth, 0)
        self.assertEqual(direct_child.depth, 1)
class TestProcessLifecycle(TestCase):
    """Tests for Process.is_running, poll() and terminate()."""

    def setUp(self):
        """Reset the machine cache and fetch the current machine."""
        import archivebox.machine.models as machine_models

        machine_models._CURRENT_MACHINE = None
        self.machine = Machine.current()

    def _dead_pid_row(self, status, **fields):
        """Create a Process with pid 999999 that started now, in the given ``status``."""
        return Process.objects.create(
            machine=self.machine,
            status=status,
            pid=999999,
            started_at=timezone.now(),
            **fields,
        )

    def test_process_is_running_current_pid(self):
        """is_running is True for a row matching this interpreter's PID and start time."""
        import psutil
        from datetime import datetime

        # Use the OS-reported creation time of this very process.
        own_create_time = psutil.Process(os.getpid()).create_time()
        own_started_at = datetime.fromtimestamp(own_create_time, tz=timezone.get_current_timezone())
        row = Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            pid=os.getpid(),
            started_at=own_started_at,
        )
        self.assertTrue(row.is_running)

    def test_process_is_running_fake_pid(self):
        """is_running is False for a RUNNING row with pid 999999."""
        row = self._dead_pid_row(Process.StatusChoices.RUNNING)
        self.assertFalse(row.is_running)

    def test_process_poll_detects_exit(self):
        """poll() returns an exit code and persists EXITED for pid 999999."""
        row = self._dead_pid_row(Process.StatusChoices.RUNNING)
        returned_code = row.poll()
        self.assertIsNotNone(returned_code)
        row.refresh_from_db()
        self.assertEqual(row.status, Process.StatusChoices.EXITED)

    def test_process_poll_normalizes_negative_exit_code(self):
        """poll() turns a stored exit code of -1 into 137 and persists it."""
        row = self._dead_pid_row(Process.StatusChoices.EXITED, exit_code=-1)
        returned_code = row.poll()
        self.assertEqual(returned_code, 137)
        row.refresh_from_db()
        self.assertEqual(row.exit_code, 137)

    def test_process_terminate_dead_process(self):
        """terminate() returns a falsy value and persists EXITED for pid 999999."""
        row = self._dead_pid_row(Process.StatusChoices.RUNNING)
        terminated = row.terminate()
        self.assertFalse(terminated)
        row.refresh_from_db()
        self.assertEqual(row.status, Process.StatusChoices.EXITED)
class TestProcessClassMethods(TestCase):
    """Tests for Process class-level queries and cleanup routines."""

    def setUp(self):
        """Reset the machine cache and fetch the current machine."""
        import archivebox.machine.models as machine_models

        machine_models._CURRENT_MACHINE = None
        self.machine = Machine.current()

    def _running(self, **fields):
        """Create a RUNNING Process on the test machine with the given ``fields``."""
        return Process.objects.create(
            machine=self.machine,
            status=Process.StatusChoices.RUNNING,
            **fields,
        )

    def test_get_running(self):
        """get_running() includes a RUNNING row of the requested type."""
        hook = self._running(
            process_type=Process.TypeChoices.HOOK,
            pid=99999,
            started_at=timezone.now(),
        )
        hooks_running = Process.get_running(process_type=Process.TypeChoices.HOOK)
        self.assertIn(hook, hooks_running)

    def test_get_running_count(self):
        """get_running_count() reports at least the three rows created here."""
        for hook_pid in range(99900, 99903):
            self._running(
                process_type=Process.TypeChoices.HOOK,
                pid=hook_pid,
                started_at=timezone.now(),
            )
        hook_count = Process.get_running_count(process_type=Process.TypeChoices.HOOK)
        self.assertGreaterEqual(hook_count, 3)

    def test_cleanup_stale_running(self):
        """cleanup_stale_running() exits a row started over an hour past PID_REUSE_WINDOW ago."""
        old_row = self._running(
            pid=999999,
            started_at=timezone.now() - PID_REUSE_WINDOW - timedelta(hours=1),
        )
        cleaned_count = Process.cleanup_stale_running()
        self.assertGreaterEqual(cleaned_count, 1)
        old_row.refresh_from_db()
        self.assertEqual(old_row.status, Process.StatusChoices.EXITED)

    def test_cleanup_stale_running_marks_timed_out_rows_exited(self):
        """cleanup_stale_running() exits a RUNNING row that is past timeout + grace."""
        # A 5s timeout, started 10s before the grace period began.
        overdue_row = self._running(
            pid=999998,
            timeout=5,
            started_at=timezone.now() - PROCESS_TIMEOUT_GRACE - timedelta(seconds=10),
        )
        cleaned_count = Process.cleanup_stale_running()
        self.assertGreaterEqual(cleaned_count, 1)
        overdue_row.refresh_from_db()
        self.assertEqual(overdue_row.status, Process.StatusChoices.EXITED)

    def test_cleanup_stale_running_marks_timed_out_live_hooks_exited(self):
        """A timed-out hook row on a live PID is exited without kill_tree/terminate calls."""
        overdue_hook = self._running(
            process_type=Process.TypeChoices.HOOK,
            pid=os.getpid(),
            timeout=5,
            started_at=timezone.now() - PROCESS_TIMEOUT_GRACE - timedelta(seconds=10),
        )
        # poll() is forced to return None for the duration of the cleanup.
        with (
            patch.object(Process, "poll", return_value=None),
            patch.object(Process, "kill_tree") as kill_tree_mock,
            patch.object(Process, "terminate") as terminate_mock,
        ):
            cleaned_count = Process.cleanup_stale_running()
        self.assertGreaterEqual(cleaned_count, 1)
        overdue_hook.refresh_from_db()
        self.assertEqual(overdue_hook.status, Process.StatusChoices.EXITED)
        kill_tree_mock.assert_not_called()
        terminate_mock.assert_not_called()

    def test_cleanup_orphaned_workers_marks_dead_root_children_exited(self):
        """cleanup_orphaned_workers() exits a live-PID hook whose CLI parent has pid 999997."""
        import psutil
        from datetime import datetime

        # The child row mirrors this interpreter: real PID and real start time.
        own_create_time = psutil.Process(os.getpid()).create_time()
        own_started_at = datetime.fromtimestamp(own_create_time, tz=timezone.get_current_timezone())
        cli_parent = self._running(
            process_type=Process.TypeChoices.CLI,
            pid=999997,
            started_at=timezone.now() - timedelta(minutes=5),
        )
        hook_child = self._running(
            parent=cli_parent,
            process_type=Process.TypeChoices.HOOK,
            pid=os.getpid(),
            started_at=own_started_at,
        )
        with (
            patch.object(Process, "kill_tree") as kill_tree_mock,
            patch.object(Process, "terminate") as terminate_mock,
        ):
            cleaned_count = Process.cleanup_orphaned_workers()
        self.assertEqual(cleaned_count, 1)
        hook_child.refresh_from_db()
        self.assertEqual(hook_child.status, Process.StatusChoices.EXITED)
        kill_tree_mock.assert_not_called()
        terminate_mock.assert_not_called()
class TestProcessStateMachine(TestCase):
    """Tests for ProcessMachine, the state machine wrapping a Process row."""

    def setUp(self):
        """Reset the machine cache and create one queued ``echo test`` Process."""
        import archivebox.machine.models as machine_models

        machine_models._CURRENT_MACHINE = None
        self.machine = Machine.current()
        self.process = Process.objects.create(
            machine=self.machine,
            cmd=["echo", "test"],
            pwd="/tmp",
        )

    def test_process_state_machine_initial_state(self):
        """A ProcessMachine built on a fresh Process reports the QUEUED state."""
        state_machine = ProcessMachine(self.process)
        self.assertEqual(state_machine.current_state_value, Process.StatusChoices.QUEUED)

    def test_process_state_machine_can_start(self):
        """can_start() is True with a cmd and False once the cmd is emptied."""
        self.assertTrue(ProcessMachine(self.process).can_start())

        # Empty the command and rebuild the state machine on the saved row.
        self.process.cmd = []
        self.process.save()
        self.assertFalse(ProcessMachine(self.process).can_start())

    def test_process_state_machine_is_exited(self):
        """is_exited() is False without an exit code and True once exit_code is 0."""
        self.assertFalse(ProcessMachine(self.process).is_exited())

        # Record a zero exit code and rebuild the state machine on the saved row.
        self.process.exit_code = 0
        self.process.save()
        self.assertTrue(ProcessMachine(self.process).is_exited())
if __name__ == "__main__":
    # pytest.main() returns the session's exit code rather than exiting, so
    # pass it to SystemExit; otherwise running this file directly would
    # report success to the shell even when tests fail.
    raise SystemExit(pytest.main([__file__, "-v"]))