Files
ArchiveBox/archivebox/tests/test_state_machine_claims.py
2026-03-15 19:21:41 -07:00

144 lines
4.4 KiB
Python

import threading
import time
import pytest
from django.db import close_old_connections
from django.utils import timezone
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.machine.models import Binary, Machine
from archivebox.workers.worker import BinaryWorker
def get_fresh_machine() -> Machine:
    """
    Reset the module-level machine state and return ``Machine.current()``.

    ``_CURRENT_MACHINE`` is set back to ``None`` and ``_CURRENT_BINARIES`` is
    emptied on ``archivebox.machine.models`` before the lookup (presumably
    these are per-process caches, so each test starts from a clean slate --
    confirm against the models module).
    """
    # Imported locally so the private module globals are mutated in place.
    from archivebox.machine import models as machine_state

    machine_state._CURRENT_MACHINE = None
    machine_state._CURRENT_BINARIES.clear()

    return Machine.current()
@pytest.mark.django_db
def test_claim_processing_lock_does_not_steal_future_retry_at():
    """
    A row whose retry_at was already pushed into the future cannot be re-claimed.

    retry_at doubles as the schedule and the ownership lock. After one process
    claims a due row (moving retry_at forward), a second reader that reloads
    the row must get ``False`` from claim_processing_lock, otherwise the same
    side effects would run twice.
    """
    current_machine = get_fresh_machine()
    row = Binary.objects.create(
        machine=current_machine,
        name='claim-test',
        binproviders='env',
        status=Binary.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )

    # Two independent in-memory copies of the same DB row, both loaded while
    # the row is still due.
    first_claimant = Binary.objects.get(pk=row.pk)
    second_claimant = Binary.objects.get(pk=row.pk)

    claimed = first_claimant.claim_processing_lock(lock_seconds=30)
    assert claimed is True

    # Reload so the second copy sees the retry_at stored by the first claim.
    second_claimant.refresh_from_db()
    assert second_claimant.retry_at > timezone.now()

    second_attempt = second_claimant.claim_processing_lock(lock_seconds=30)
    assert second_attempt is False
@pytest.mark.django_db
def test_binary_worker_skips_binary_claimed_by_other_owner(monkeypatch):
    """
    A BinaryWorker must not call Binary.run for a row someone else has claimed.

    The row's retry_at lock is claimed first; the worker is then pointed at
    that same Binary and must leave the (stubbed) install step untouched.
    """
    current_machine = get_fresh_machine()
    target = Binary.objects.create(
        machine=current_machine,
        name='claimed-binary',
        binproviders='env',
        status=Binary.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )

    # Another "process" takes ownership before the worker gets a chance.
    lock_holder = Binary.objects.get(pk=target.pk)
    assert lock_holder.claim_processing_lock(lock_seconds=30) is True

    run_invocations: list[str] = []

    def recording_run(self):
        # Stand-in for Binary.run: record the call and mark the row installed.
        run_invocations.append(self.name)
        self.status = self.StatusChoices.INSTALLED
        self.abspath = '/tmp/fake-binary'
        self.version = '1.0'
        self.save(update_fields=['status', 'abspath', 'version', 'modified_at'])

    monkeypatch.setattr(Binary, 'run', recording_run)

    BinaryWorker(binary_id=str(target.id))._process_single_binary()

    # The stub must never have been reached.
    assert run_invocations == []
@pytest.mark.django_db(transaction=True)
def test_crawl_install_declared_binaries_waits_for_existing_owner(monkeypatch):
    """
    Crawl.install_declared_binaries should wait for the current owner of a Binary
    to finish instead of launching a duplicate install against shared provider
    state such as the npm tree.

    Setup: a queued Binary is claimed by an "owner", Binary.run is replaced by a
    recording stub, and a helper thread marks the row INSTALLED after a short
    delay. The test passes when the final row carries the helper thread's
    values and the stub was never called.
    """
    # NOTE(review): transaction=True is presumably needed so the helper
    # thread's separate DB connection can see rows committed here -- confirm.
    machine = get_fresh_machine()
    crawl = Crawl.objects.create(
        urls='https://example.com',
        created_by_id=get_or_create_system_user_pk(),
        status=Crawl.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )
    binary = Binary.objects.create(
        machine=machine,
        name='puppeteer',
        binproviders='npm',
        status=Binary.StatusChoices.QUEUED,
        retry_at=timezone.now(),
    )

    # Simulate another process already holding the install lock.
    owner = Binary.objects.get(pk=binary.pk)
    assert owner.claim_processing_lock(lock_seconds=30) is True

    calls: list[str] = []

    def fake_run(self):
        # Stand-in for Binary.run; any call here means a duplicate install.
        calls.append(self.name)
        self.status = self.StatusChoices.INSTALLED
        self.abspath = '/tmp/should-not-run'
        self.version = '1.0'
        self.save(update_fields=['status', 'abspath', 'version', 'modified_at'])

    monkeypatch.setattr(Binary, 'run', fake_run)

    def finish_existing_install():
        # Plays the existing owner: completes the install shortly after the
        # crawl has started waiting.
        close_old_connections()
        try:
            time.sleep(0.3)
            Binary.objects.filter(pk=binary.pk).update(
                status=Binary.StatusChoices.INSTALLED,
                retry_at=None,
                abspath='/tmp/finished-by-owner',
                version='1.0',
                modified_at=timezone.now(),
            )
        finally:
            close_old_connections()

    thread = threading.Thread(target=finish_existing_install, daemon=True)
    thread.start()
    try:
        crawl.install_declared_binaries({'puppeteer'}, machine=machine)
    finally:
        # Always reap the helper, even if the call above raised, so it cannot
        # keep writing to the database after this test has finished.
        thread.join(timeout=5)

    # join() returns None on timeout too, so check liveness explicitly.
    assert not thread.is_alive(), 'helper thread did not finish within 5 seconds'

    binary.refresh_from_db()
    assert binary.status == Binary.StatusChoices.INSTALLED
    assert binary.abspath == '/tmp/finished-by-owner'
    assert calls == []