# Source: ArchiveBox/archivebox/tests/test_runner.py
# Snapshot taken 2026-03-25 05:36:07 -07:00 (770 lines, 30 KiB, Python)
import asyncio
import json
import subprocess
import sys
from pathlib import Path
from types import SimpleNamespace
import pytest
from django.test import RequestFactory
pytestmark = pytest.mark.django_db
class _DummyBus:
def __init__(self, name: str):
self.name = name
self.registrations = []
def on(self, event_pattern, handler):
registration = SimpleNamespace(event_pattern=event_pattern, handler=handler)
self.registrations.append(registration)
return registration
def off(self, event_pattern, registration):
self.registrations = [existing for existing in self.registrations if existing is not registration]
async def stop(self):
return None
class _DummyService:
def __init__(self, *args, **kwargs):
pass
def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
    """All snapshots belonging to one crawl must share a single event bus."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls="https://blog.sweeting.me\nhttps://sweeting.me",
        created_by_id=get_or_create_system_user_pk(),
    )
    snapshot_a = Snapshot.objects.create(
        url="https://blog.sweeting.me",
        crawl=crawl,
        status=Snapshot.StatusChoices.QUEUED,
    )
    snapshot_b = Snapshot.objects.create(
        url="https://sweeting.me",
        crawl=crawl,
        status=Snapshot.StatusChoices.QUEUED,
    )

    created_buses: list[_DummyBus] = []

    def fake_create_bus(*, name, total_timeout=3600.0, **kwargs):
        # Track every bus handed out so we can assert only one was created.
        bus = _DummyBus(name)
        created_buses.append(bus)
        return bus

    monkeypatch.setattr(runner_module, "create_bus", fake_create_bus)
    monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
    monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
    monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
    monkeypatch.setattr(runner_module, "TagService", _DummyService)
    monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
    monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
    monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
    monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)

    download_calls = []

    async def fake_download(*, url, bus, config_overrides, **kwargs):
        # Record the bus and snapshot identity each download call sees.
        extra_context = json.loads(config_overrides["EXTRA_CONTEXT"])
        download_calls.append(
            {
                "url": url,
                "bus": bus,
                "snapshot_id": extra_context["snapshot_id"],
                "source_url": url,
            },
        )
        await asyncio.sleep(0)
        return []

    monkeypatch.setattr(runner_module, "download", fake_download)

    crawl_runner = runner_module.CrawlRunner(crawl)

    def _payload_entry(snap):
        # Build the payload dict once per snapshot instead of duplicating it inline.
        # The real config is fetched before load_snapshot_payload gets patched below.
        return {
            "id": str(snap.id),
            "url": snap.url,
            "status": snap.status,
            "title": snap.title,
            "timestamp": snap.timestamp,
            "bookmarked_at": snap.bookmarked_at.isoformat() if snap.bookmarked_at else "",
            "created_at": snap.created_at.isoformat() if snap.created_at else "",
            "tags": snap.tags_str(),
            "depth": snap.depth,
            "output_dir": str(snap.output_dir),
            "config": crawl_runner.load_snapshot_payload(str(snap.id))["config"],
        }

    snapshot_data = {str(snap.id): _payload_entry(snap) for snap in (snapshot_a, snapshot_b)}
    monkeypatch.setattr(crawl_runner, "load_snapshot_payload", lambda snapshot_id: snapshot_data[snapshot_id])

    async def run_both():
        await asyncio.gather(
            crawl_runner.run_snapshot(str(snapshot_a.id)),
            crawl_runner.run_snapshot(str(snapshot_b.id)),
        )

    asyncio.run(run_both())

    assert len(download_calls) == 2
    assert {call["snapshot_id"] for call in download_calls} == {str(snapshot_a.id), str(snapshot_b.id)}
    assert {call["source_url"] for call in download_calls} == {snapshot_a.url, snapshot_b.url}
    # Both downloads must have been handed the very same bus object...
    assert len({id(call["bus"]) for call in download_calls}) == 1
    # ...and exactly one bus was ever created.
    assert len(created_buses) == 1
def test_ensure_background_runner_starts_when_none_running(monkeypatch):
    """A daemon runner process is spawned when no orchestrator is alive."""
    import archivebox.machine.models as machine_models
    from archivebox.services import runner as runner_module

    spawned: list[tuple] = []

    class _RecordingPopen:
        # Captures the args/kwargs of every attempted process spawn.
        def __init__(self, args, **kwargs):
            spawned.append((args, kwargs))

    monkeypatch.setattr(machine_models.Process, "cleanup_stale_running", classmethod(lambda cls, machine=None: 0))
    monkeypatch.setattr(machine_models.Process, "cleanup_orphaned_workers", classmethod(lambda cls: 0))
    monkeypatch.setattr(machine_models.Machine, "current", classmethod(lambda cls: SimpleNamespace(id="machine-1")))
    # Pretend no orchestrator process rows exist.
    monkeypatch.setattr(
        machine_models.Process.objects,
        "filter",
        lambda **kwargs: SimpleNamespace(exists=lambda: False),
    )
    monkeypatch.setattr(runner_module.subprocess, "Popen", _RecordingPopen)

    started = runner_module.ensure_background_runner(allow_under_pytest=True)

    assert started is True
    assert len(spawned) == 1
    args, kwargs = spawned[0]
    assert args == [runner_module.sys.executable, "-m", "archivebox", "run", "--daemon"]
    # stdin must be detached so the daemon survives parent shutdown.
    assert kwargs["stdin"] is subprocess.DEVNULL
def test_ensure_background_runner_skips_when_orchestrator_running(monkeypatch):
    """No new runner is spawned while an orchestrator process already exists."""
    import archivebox.machine.models as machine_models
    from archivebox.services import runner as runner_module

    def _forbidden_popen(*args, **kwargs):
        # Any spawn attempt is a test failure.
        raise AssertionError("runner should not be spawned")

    monkeypatch.setattr(machine_models.Process, "cleanup_stale_running", classmethod(lambda cls, machine=None: 0))
    monkeypatch.setattr(machine_models.Process, "cleanup_orphaned_workers", classmethod(lambda cls: 0))
    monkeypatch.setattr(machine_models.Machine, "current", classmethod(lambda cls: SimpleNamespace(id="machine-1")))
    # Pretend an orchestrator process row already exists.
    monkeypatch.setattr(
        machine_models.Process.objects,
        "filter",
        lambda **kwargs: SimpleNamespace(exists=lambda: True),
    )
    monkeypatch.setattr(runner_module.subprocess, "Popen", _forbidden_popen)

    assert runner_module.ensure_background_runner(allow_under_pytest=True) is False
def test_runner_prepare_refreshes_network_interface_and_attaches_current_process(monkeypatch):
    """load_run_state refreshes the NIC and binds it to the current process row."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
    )

    class _Iface:
        id = "iface-1"
        machine = SimpleNamespace(id="machine-1")
        machine_id = "machine-1"

    saved_updates: list[tuple] = []

    class _Proc:
        iface_id = None
        machine_id = "machine-1"
        iface = None
        machine = None

        def save(self, *, update_fields):
            # Record which fields the runner persists on the process row.
            saved_updates.append(tuple(update_fields))

    proc = _Proc()

    # Stub out every collaborator the runner wires up during setup.
    for attr, replacement in (
        ("discover_plugins", lambda: {}),
        ("create_bus", lambda **kwargs: _DummyBus(kwargs["name"])),
        ("ProcessService", _DummyService),
        ("BinaryService", _DummyService),
        ("TagService", _DummyService),
        ("CrawlService", _DummyService),
        ("SnapshotService", _DummyService),
        ("ArchiveResultService", _DummyService),
    ):
        monkeypatch.setattr(runner_module, attr, replacement)

    from archivebox.machine.models import NetworkInterface, Process
    from archivebox.config import configset as configset_module

    refresh_calls: list[bool] = []

    def _current_iface(cls, refresh=False):
        refresh_calls.append(refresh)
        return _Iface()

    monkeypatch.setattr(NetworkInterface, "current", classmethod(_current_iface))
    monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
    monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {"PLUGINS": "", "CHROME_BINARY": "", "TIMEOUT": 60})

    runner_module.CrawlRunner(crawl).load_run_state()

    # The NIC lookup must have been forced to refresh exactly once.
    assert refresh_calls == [True]
    assert proc.iface is not None
    assert proc.machine == proc.iface.machine
    assert saved_updates == [("iface", "machine", "modified_at")]
def test_load_run_state_uses_machine_config_as_derived_config(monkeypatch):
    """The persisted Machine.config becomes the runner's derived_config."""
    from archivebox.machine.models import Machine, NetworkInterface, Process
    from archivebox.services import runner as runner_module
    from archivebox.config import configset as configset_module
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl

    machine = Machine.objects.create(
        guid="test-guid-runner-overrides",
        hostname="runner-host",
        hw_in_docker=False,
        hw_in_vm=False,
        hw_manufacturer="Test",
        hw_product="Test Product",
        hw_uuid="test-hw-runner-overrides",
        os_arch="arm64",
        os_family="darwin",
        os_platform="macOS",
        os_release="14.0",
        os_kernel="Darwin",
        stats={},
        config={"WGET_BINARY": "/tmp/wget", "ABX_INSTALL_CACHE": {"wget": "2026-03-24T00:00:00+00:00"}},
    )
    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
    )
    proc = SimpleNamespace(
        iface_id=str(machine.id),
        machine_id=str(machine.id),
        iface=None,
        machine=machine,
        save=lambda **kwargs: None,
    )

    monkeypatch.setattr(
        NetworkInterface,
        "current",
        classmethod(lambda cls, refresh=False: SimpleNamespace(id=machine.id, machine=machine)),
    )
    monkeypatch.setattr(Process, "current", classmethod(lambda cls: proc))
    monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
    monkeypatch.setattr(configset_module, "get_config", lambda **kwargs: {"PLUGINS": "", "CHROME_BINARY": "", "TIMEOUT": 60})

    runner = runner_module.CrawlRunner(crawl)
    runner.load_run_state()

    assert runner.derived_config == machine.config
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch, tmp_path):
    """A queued child snapshot is sealed, not downloaded, once max_size is hit."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
        max_size=16,
    )

    def _forbidden_download(*args, **kwargs):
        # Reaching download means the size limit was not enforced.
        raise AssertionError("snapshot download should have been skipped")

    monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
    monkeypatch.setattr(runner_module, "create_bus", lambda **kwargs: _DummyBus(kwargs["name"]))
    monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
    monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
    monkeypatch.setattr(runner_module, "TagService", _DummyService)
    monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
    monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
    monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
    monkeypatch.setattr(runner_module, "download", _forbidden_download)

    crawl_runner = runner_module.CrawlRunner(crawl)

    # Pre-seed the crawl state dir with a limits file that already exceeds max_size.
    state_dir = tmp_path / ".abx-dl"
    state_dir.mkdir(parents=True, exist_ok=True)
    limits = {
        "admitted_snapshot_ids": ["child-1"],
        "counted_process_ids": ["proc-1"],
        "total_size": 32,
        "stop_reason": "max_size",
    }
    (state_dir / "limits.json").write_text(json.dumps(limits), encoding="utf-8")

    cancelled: list[str] = []
    crawl_runner.load_snapshot_payload = lambda snapshot_id: {
        "id": snapshot_id,
        "url": "https://example.com/child",
        "title": "",
        "timestamp": "",
        "bookmarked_at": "",
        "created_at": "",
        "tags": "",
        "depth": 1,
        "status": "queued",
        "output_dir": "/tmp/child",
        "config": {"CRAWL_DIR": str(tmp_path), "MAX_SIZE": 16},
    }
    crawl_runner.seal_snapshot_due_to_limit = lambda snapshot_id: cancelled.append(snapshot_id)

    asyncio.run(crawl_runner.run_snapshot("child-1"))

    assert cancelled == ["child-1"]
@pytest.mark.django_db(transaction=True)
def test_seal_snapshot_cancels_queued_descendants_after_max_size():
    """Sealing a root snapshot past max_size also seals its queued children."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services.snapshot_service import SnapshotService
    from abx_dl.events import SnapshotCompletedEvent
    from abx_dl.orchestrator import create_bus

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
        max_size=16,
    )
    root = Snapshot.objects.create(
        url="https://example.com",
        crawl=crawl,
        status=Snapshot.StatusChoices.STARTED,
    )
    child = Snapshot.objects.create(
        url="https://example.com/child",
        crawl=crawl,
        depth=1,
        parent_snapshot_id=root.id,
        status=Snapshot.StatusChoices.QUEUED,
    )

    # Persist a limits file showing the crawl already blew past max_size.
    state_dir = Path(crawl.output_dir) / ".abx-dl"
    state_dir.mkdir(parents=True, exist_ok=True)
    limits = {
        "admitted_snapshot_ids": [str(root.id), str(child.id)],
        "counted_process_ids": ["proc-1"],
        "total_size": 32,
        "stop_reason": "max_size",
    }
    (state_dir / "limits.json").write_text(json.dumps(limits), encoding="utf-8")

    bus = create_bus(name="test_snapshot_limit_cancel")
    service = SnapshotService(bus, crawl_id=str(crawl.id), schedule_snapshot=lambda snapshot_id: None)
    try:
        completed = SnapshotCompletedEvent(
            url=root.url,
            snapshot_id=str(root.id),
            output_dir=str(root.output_dir),
        )
        asyncio.run(service.on_SnapshotCompletedEvent(completed))
    finally:
        asyncio.run(bus.stop())

    root.refresh_from_db()
    child.refresh_from_db()
    assert root.status == Snapshot.StatusChoices.SEALED
    assert child.status == Snapshot.StatusChoices.SEALED
    assert child.retry_at is None
def test_create_crawl_api_queues_crawl_without_spawning_runner(monkeypatch):
    """The create-crawl API endpoint only queues the crawl for later pickup."""
    from django.contrib.auth import get_user_model
    from archivebox.api.v1_crawls import CrawlCreateSchema, create_crawl

    admin = get_user_model().objects.create_superuser(
        username="runner-api-admin",
        email="runner-api-admin@example.com",
        password="testpassword",
    )
    request = RequestFactory().post("/api/v1/crawls")
    request.user = admin

    payload = CrawlCreateSchema(
        urls=["https://example.com"],
        max_depth=0,
        tags=[],
        tags_str="",
        label="",
        notes="",
        config={},
    )
    crawl = create_crawl(request, payload)

    assert str(crawl.id)
    assert crawl.status == "queued"
    # A retry_at timestamp means the background scheduler will pick it up.
    assert crawl.retry_at is not None
def test_crawl_runner_does_not_seal_unfinished_crawl(monkeypatch):
    """A crawl with unfinished snapshots stays unsealed and keeps its retry_at."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.STARTED,
    )
    snapshot = Snapshot.objects.create(
        url="https://example.com",
        crawl=crawl,
        status=Snapshot.StatusChoices.STARTED,
    )

    monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
    monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
    monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
    # Replace every async phase with a trivially-awaitable no-op.
    for method in ("run_crawl_setup", "enqueue_snapshot", "run_crawl_cleanup"):
        monkeypatch.setattr(runner_module.CrawlRunner, method, lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", lambda self: None)

    asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())

    crawl.refresh_from_db()
    assert crawl.status != Crawl.StatusChoices.SEALED
    assert crawl.retry_at is not None
def test_crawl_runner_calls_load_and_finalize_run_state(monkeypatch):
    """CrawlRunner.run() calls load_run_state first and finalize_run_state last.

    Fix: the original test patched ``load_run_state`` twice — once with a
    throwaway lambda and again with the recording ``wrapped_load`` — so the
    first patch was dead code. Only the recording wrappers are installed now.
    """
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.STARTED,
    )
    snapshot = Snapshot.objects.create(
        url="https://example.com",
        crawl=crawl,
        status=Snapshot.StatusChoices.STARTED,
    )

    # Stub out all collaborators so run() exercises only its own orchestration.
    monkeypatch.setattr(runner_module, "create_bus", lambda *args, **kwargs: _DummyBus("runner"))
    monkeypatch.setattr(runner_module, "discover_plugins", lambda: {})
    monkeypatch.setattr(runner_module, "ProcessService", _DummyService)
    monkeypatch.setattr(runner_module, "BinaryService", _DummyService)
    monkeypatch.setattr(runner_module, "TagService", _DummyService)
    monkeypatch.setattr(runner_module, "CrawlService", _DummyService)
    monkeypatch.setattr(runner_module, "SnapshotService", _DummyService)
    monkeypatch.setattr(runner_module, "ArchiveResultService", _DummyService)
    monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
    monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_cleanup", lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setenv("DJANGO_ALLOW_ASYNC_UNSAFE", "true")

    method_calls: list[str] = []

    def wrapped_load(self):
        method_calls.append("load_run_state")
        return [str(snapshot.id)]

    def wrapped_finalize(self):
        method_calls.append("finalize_run_state")
        return None

    monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", wrapped_load)
    monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", wrapped_finalize)

    asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())

    crawl.refresh_from_db()
    assert crawl.status == Crawl.StatusChoices.STARTED
    assert crawl.retry_at is not None
    # Ordering matters: state must be loaded before it is finalized.
    assert method_calls == ["load_run_state", "finalize_run_state"]
def test_wait_for_snapshot_tasks_surfaces_already_failed_task():
    """An already-failed snapshot task re-raises from wait_for_snapshot_tasks."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
    )
    crawl_runner = runner_module.CrawlRunner(crawl)

    async def run_test():
        # Register a future that has already failed before the wait starts.
        failed = asyncio.get_running_loop().create_future()
        failed.set_exception(RuntimeError("snapshot failed"))
        crawl_runner.snapshot_tasks["snap-1"] = failed
        with pytest.raises(RuntimeError, match="snapshot failed"):
            await crawl_runner.wait_for_snapshot_tasks()

    asyncio.run(run_test())
def test_wait_for_snapshot_tasks_returns_after_completed_tasks_are_pruned():
    """Completed snapshot tasks are pruned and the wait returns promptly."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
    )
    crawl_runner = runner_module.CrawlRunner(crawl)

    async def run_test():
        async def finish_snapshot() -> None:
            await asyncio.sleep(0)

        crawl_runner.snapshot_tasks["snap-1"] = asyncio.create_task(finish_snapshot())
        # The timeout guards against the wait hanging on an already-done task.
        await asyncio.wait_for(crawl_runner.wait_for_snapshot_tasks(), timeout=0.5)
        assert crawl_runner.snapshot_tasks == {}

    asyncio.run(run_test())
def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
    """run_crawl_cleanup is invoked exactly once during a full run()."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.STARTED,
    )
    snapshot = Snapshot.objects.create(
        url="https://example.com",
        crawl=crawl,
        status=Snapshot.StatusChoices.STARTED,
    )

    monkeypatch.setattr(runner_module, "setup_abx_services", lambda *args, **kwargs: None)
    monkeypatch.setattr(runner_module.CrawlRunner, "load_run_state", lambda self: [str(snapshot.id)])
    monkeypatch.setattr(runner_module.CrawlRunner, "_create_live_ui", lambda self: None)
    monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_setup", lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, "enqueue_snapshot", lambda self, snapshot_id: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, "wait_for_snapshot_tasks", lambda self: asyncio.sleep(0))
    monkeypatch.setattr(runner_module.CrawlRunner, "finalize_run_state", lambda self: None)

    cleanup_calls: list[str] = []

    def _record_cleanup(self, snapshot_id):
        cleanup_calls.append("abx_cleanup")
        return asyncio.sleep(0)

    monkeypatch.setattr(runner_module.CrawlRunner, "run_crawl_cleanup", _record_cleanup)

    asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())

    assert cleanup_calls == ["abx_cleanup"]
def test_abx_process_service_background_process_finishes_after_process_exit(monkeypatch, tmp_path):
    # Purpose: when a background (.bg) hook's OS process has already exited,
    # ProcessService.on_ProcessStartedEvent should finish it: the pid file is
    # removed and a ProcessCompletedEvent is emitted, even while the stdout
    # stream is still nominally open.
    from abx_dl.models import Process as AbxProcess, now_iso
    from abx_dl.services.process_service import ProcessService
    from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
    # Build the service without __init__ so no bus/config wiring is required.
    service = object.__new__(ProcessService)
    service.emit_jsonl = False
    emitted_events = []
    class FakeBus:
        async def emit(self, event):
            emitted_events.append(event)
    service.bus = FakeBus()
    async def fake_stream_stdout(**kwargs):
        # A stdout stream that never finishes on its own; it only returns its
        # buffered output once the service cancels it.
        try:
            await asyncio.Event().wait()
        except asyncio.CancelledError:
            return ["daemon output\n"]
    monkeypatch.setattr(service, "_stream_stdout", fake_stream_stdout)
    # Lay out the on-disk artifacts a background hook would leave behind.
    plugin_output_dir = tmp_path / "chrome"
    plugin_output_dir.mkdir()
    stdout_file = plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.stdout.log"
    stderr_file = plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.stderr.log"
    stderr_file.write_text("")
    pid_file = plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.pid"
    pid_file.write_text("12345")
    proc = AbxProcess(
        cmd=["hook"],
        pwd=str(plugin_output_dir),
        timeout=60,
        started_at=now_iso(),
        plugin="chrome",
        hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
    )
    async def run_test():
        # A real subprocess that exits immediately stands in for the daemon
        # whose OS process has already terminated.
        process = await asyncio.create_subprocess_exec(
            sys.executable,
            "-c",
            "pass",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE,
        )
        event = ProcessStartedEvent(
            plugin_name="chrome",
            hook_name="on_CrawlSetup__90_chrome_launch.daemon.bg",
            hook_path="hook",
            hook_args=["--url=https://example.org/"],
            env={},
            output_dir=str(plugin_output_dir),
            timeout=60,
            pid=process.pid,
            is_background=True,
            url="https://example.org/",
            process_type="hook",
            worker_type="hook",
            start_ts=proc.started_at or "",
            subprocess=process,
            stdout_file=stdout_file,
            stderr_file=stderr_file,
            pid_file=pid_file,
            cmd_file=plugin_output_dir / "on_CrawlSetup__90_chrome_launch.daemon.bg.sh",
            files_before=set(),
        )
        # Must complete well inside the 0.5s guard since the process is dead.
        await asyncio.wait_for(
            service.on_ProcessStartedEvent(event),
            timeout=0.5,
        )
    asyncio.run(run_test())
    assert pid_file.exists() is False
    assert any(isinstance(event, ProcessCompletedEvent) for event in emitted_events)
def test_run_pending_crawls_runs_due_snapshot_in_place(monkeypatch):
    """A due queued snapshot on a sealed crawl is dispatched inline via run_crawl."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module

    crawl = Crawl.objects.create(
        urls="https://example.com",
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.SEALED,
    )
    snapshot = Snapshot.objects.create(
        url="https://example.com",
        crawl=crawl,
        status=Snapshot.StatusChoices.QUEUED,
        retry_at=runner_module.timezone.now(),
    )
    monkeypatch.setattr(type(snapshot), "claim_processing_lock", lambda self, lock_seconds=60: True)
    monkeypatch.setattr(type(crawl), "claim_processing_lock", lambda self, lock_seconds=60: True)

    run_calls: list[tuple[str, list[str] | None, bool]] = []

    def fake_run_crawl(crawl_id, snapshot_ids=None, selected_plugins=None, process_discovered_snapshots_inline=True):
        run_calls.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
        # Mark the snapshot finished so the scheduling loop terminates.
        snapshot.status = Snapshot.StatusChoices.SEALED
        snapshot.retry_at = None
        snapshot.save(update_fields=["status", "retry_at", "modified_at"])

    monkeypatch.setattr(runner_module, "run_crawl", fake_run_crawl)

    assert runner_module.run_pending_crawls(daemon=False) == 0
    assert run_calls == [(str(crawl.id), [str(snapshot.id)], False)]
def test_run_pending_crawls_prioritizes_new_queued_crawl_before_snapshot_backlog(monkeypatch):
    """A freshly queued crawl is scheduled ahead of an older snapshot backlog."""
    from archivebox.base_models.models import get_or_create_system_user_pk
    from archivebox.crawls.models import Crawl
    from archivebox.core.models import Snapshot
    from archivebox.services import runner as runner_module

    older_crawl = Crawl.objects.create(
        urls="https://older.example.com",
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.STARTED,
    )
    older_snapshot = Snapshot.objects.create(
        url="https://older.example.com",
        crawl=older_crawl,
        status=Snapshot.StatusChoices.QUEUED,
        retry_at=runner_module.timezone.now(),
    )
    newer_crawl = Crawl.objects.create(
        urls="https://newer.example.com",
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.QUEUED,
        retry_at=runner_module.timezone.now(),
    )
    monkeypatch.setattr(type(older_snapshot), "claim_processing_lock", lambda self, lock_seconds=60: True)
    monkeypatch.setattr(type(older_crawl), "claim_processing_lock", lambda self, lock_seconds=60: True)
    monkeypatch.setattr(type(newer_crawl), "claim_processing_lock", lambda self, lock_seconds=60: True)

    run_calls: list[tuple[str, list[str] | None, bool]] = []

    class _StopScheduling(Exception):
        """Raised by the fake run_crawl to halt scheduling after one dispatch."""

    def fake_run_crawl(crawl_id, snapshot_ids=None, selected_plugins=None, process_discovered_snapshots_inline=True):
        run_calls.append((crawl_id, snapshot_ids, process_discovered_snapshots_inline))
        raise _StopScheduling

    monkeypatch.setattr(runner_module, "run_crawl", fake_run_crawl)

    with pytest.raises(_StopScheduling):
        runner_module.run_pending_crawls(daemon=False)

    # The first (and only) dispatch must be the newer, queued crawl.
    assert run_calls == [(str(newer_crawl.id), None, False)]