Improve scheduling, runtime paths, and API behavior

This commit is contained in:
Nick Sweeting
2026-03-15 18:31:56 -07:00
parent 7d42c6c8b5
commit 70c9358cf9
37 changed files with 1058 additions and 398 deletions

View File

@@ -0,0 +1,65 @@
from datetime import timedelta
from django.contrib.auth import get_user_model
from django.test import TestCase
from django.utils import timezone
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.workers.orchestrator import Orchestrator
class TestScheduledCrawlMaterialization(TestCase):
def setUp(self):
self.user = get_user_model().objects.create_user(
username='schedule-user',
password='password',
)
def _create_due_schedule(self) -> CrawlSchedule:
template = Crawl.objects.create(
urls='https://example.com/feed.xml',
max_depth=1,
tags_str='scheduled',
label='Scheduled Feed',
notes='template',
created_by=self.user,
status=Crawl.StatusChoices.SEALED,
retry_at=None,
)
schedule = CrawlSchedule.objects.create(
template=template,
schedule='daily',
is_enabled=True,
label='Scheduled Feed',
notes='template',
created_by=self.user,
)
past = timezone.now() - timedelta(days=2)
Crawl.objects.filter(pk=template.pk).update(created_at=past, modified_at=past)
template.refresh_from_db()
schedule.refresh_from_db()
return schedule
def test_global_orchestrator_materializes_due_schedule(self):
schedule = self._create_due_schedule()
orchestrator = Orchestrator(exit_on_idle=False)
orchestrator._materialize_due_schedules()
scheduled_crawls = Crawl.objects.filter(schedule=schedule).order_by('created_at')
self.assertEqual(scheduled_crawls.count(), 2)
queued_crawl = scheduled_crawls.last()
self.assertEqual(queued_crawl.status, Crawl.StatusChoices.QUEUED)
self.assertEqual(queued_crawl.urls, 'https://example.com/feed.xml')
self.assertEqual(queued_crawl.max_depth, 1)
self.assertEqual(queued_crawl.tags_str, 'scheduled')
def test_one_shot_orchestrator_does_not_materialize_due_schedule(self):
schedule = self._create_due_schedule()
Orchestrator(exit_on_idle=True)._materialize_due_schedules()
self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)
Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template_id))._materialize_due_schedules()
self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)

View File

@@ -0,0 +1,75 @@
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import patch
from django.test import SimpleTestCase
from archivebox.workers.worker import SnapshotWorker
class TestSnapshotWorkerRetryForegroundHooks(SimpleTestCase):
def _make_worker(self):
worker = SnapshotWorker.__new__(SnapshotWorker)
worker.pid = 12345
worker.snapshot = SimpleNamespace(
status='started',
refresh_from_db=lambda: None,
)
worker._snapshot_exceeded_hard_timeout = lambda: False
worker._seal_snapshot_due_to_timeout = lambda: None
worker._run_hook = lambda *args, **kwargs: SimpleNamespace()
worker._wait_for_hook = lambda *args, **kwargs: None
return worker
@patch('archivebox.workers.worker.log_worker_event')
def test_retry_skips_successful_hook_with_only_inline_output(self, mock_log):
worker = self._make_worker()
archive_result = SimpleNamespace(
status='succeeded',
output_files={},
output_str='scrolled 600px',
output_json=None,
refresh_from_db=lambda: None,
)
worker._retry_failed_empty_foreground_hooks(
[(Path('/tmp/on_Snapshot__45_infiniscroll.js'), archive_result)],
config={},
)
mock_log.assert_not_called()
@patch('archivebox.workers.worker.log_worker_event')
def test_retry_replays_failed_hook_with_no_outputs(self, mock_log):
worker = self._make_worker()
run_calls = []
wait_calls = []
def run_hook(*args, **kwargs):
run_calls.append((args, kwargs))
return SimpleNamespace()
def wait_for_hook(process, archive_result):
wait_calls.append((process, archive_result))
archive_result.status = 'succeeded'
archive_result.output_files = {'singlefile.html': {}}
archive_result = SimpleNamespace(
status='failed',
output_files={},
output_str='',
output_json=None,
refresh_from_db=lambda: None,
)
worker._run_hook = run_hook
worker._wait_for_hook = wait_for_hook
worker._retry_failed_empty_foreground_hooks(
[(Path('/tmp/on_Snapshot__50_singlefile.py'), archive_result)],
config={},
)
assert len(run_calls) == 1
assert len(wait_calls) == 1
mock_log.assert_called_once()