Files
ArchiveBox/archivebox/tests/test_scheduled_crawls.py
2026-03-15 22:09:56 -07:00

85 lines
3.3 KiB
Python

from datetime import timedelta
from typing import cast
from unittest.mock import patch
from django.contrib.auth import get_user_model
from django.contrib.auth.models import UserManager
from django.test import TestCase
from django.utils import timezone
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.workers.orchestrator import Orchestrator
from archivebox.workers.worker import CrawlWorker
class TestScheduledCrawlMaterialization(TestCase):
    """Check when ``Orchestrator`` creates new crawls for a ``CrawlSchedule``."""

    def setUp(self):
        """Create the user that owns every crawl and schedule built by these tests."""
        User = get_user_model()
        manager = cast(UserManager, User.objects)
        self.user = manager.create_user(username='schedule-user', password='password')

    def _create_due_schedule(self) -> CrawlSchedule:
        """Build an enabled ``'daily'`` schedule around a sealed, backdated template crawl.

        The template's ``created_at`` / ``modified_at`` are pushed two days into
        the past (presumably so the schedule counts as due -- the due-date rule
        itself lives outside this file).

        Returns:
            The ``CrawlSchedule`` after being reloaded from the database.
        """
        # Fields the template crawl and its schedule have in common.
        shared_fields = {
            'label': 'Scheduled Feed',
            'notes': 'template',
            'created_by': self.user,
        }
        template_crawl = Crawl.objects.create(
            urls='https://example.com/feed.xml',
            max_depth=1,
            tags_str='scheduled',
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
            **shared_fields,
        )
        due_schedule = CrawlSchedule.objects.create(
            template=template_crawl,
            schedule='daily',
            is_enabled=True,
            **shared_fields,
        )

        # Use a queryset-level update() to overwrite the timestamps directly in
        # the database rather than going through the model's save().
        two_days_ago = timezone.now() - timedelta(days=2)
        template_rows = Crawl.objects.filter(pk=template_crawl.pk)
        template_rows.update(created_at=two_days_ago, modified_at=two_days_ago)

        # Reload both objects so in-memory state matches what was just written.
        template_crawl.refresh_from_db()
        due_schedule.refresh_from_db()
        return due_schedule

    def test_global_orchestrator_materializes_due_schedule(self):
        """A long-running orchestrator adds one queued crawl copied from the template."""
        schedule = self._create_due_schedule()

        global_orchestrator = Orchestrator(exit_on_idle=False)
        global_orchestrator._materialize_due_schedules()

        linked_crawls = Crawl.objects.filter(schedule=schedule).order_by('created_at')
        # Two crawls are expected: presumably the template plus the new one.
        self.assertEqual(linked_crawls.count(), 2)

        newest_crawl = linked_crawls.last()
        self.assertIsNotNone(newest_crawl)
        assert newest_crawl is not None

        # Checked in this order; the first mismatch fails the test.
        expected_fields = (
            ('status', Crawl.StatusChoices.QUEUED),
            ('urls', 'https://example.com/feed.xml'),
            ('max_depth', 1),
            ('tags_str', 'scheduled'),
        )
        for field_name, expected_value in expected_fields:
            self.assertEqual(getattr(newest_crawl, field_name), expected_value)

    def test_one_shot_orchestrator_does_not_materialize_due_schedule(self):
        """Neither an exit-on-idle nor a crawl-scoped orchestrator adds a crawl."""
        schedule = self._create_due_schedule()
        # Unevaluated queryset: every .count() below issues a fresh query.
        linked_crawls = Crawl.objects.filter(schedule=schedule)

        one_shot = Orchestrator(exit_on_idle=True)
        one_shot._materialize_due_schedules()
        self.assertEqual(linked_crawls.count(), 1)

        template_id = str(schedule.template.id)
        crawl_scoped = Orchestrator(exit_on_idle=False, crawl_id=template_id)
        crawl_scoped._materialize_due_schedules()
        self.assertEqual(linked_crawls.count(), 1)

    @patch.object(CrawlWorker, 'start')
    def test_global_orchestrator_waits_one_tick_before_spawning_materialized_schedule(self, mock_start):
        """The tick that materializes a schedule reports it queued without starting a worker."""
        schedule = self._create_due_schedule()
        global_orchestrator = Orchestrator(exit_on_idle=False)

        # Force _claim_crawl to report success for the duration of this tick.
        claim_patch = patch.object(global_orchestrator, '_claim_crawl', return_value=True)
        with claim_patch:
            queue_sizes = global_orchestrator.check_queues_and_spawn_workers()

        self.assertEqual(queue_sizes['crawl'], 1)
        linked_crawls = Crawl.objects.filter(schedule=schedule)
        self.assertEqual(linked_crawls.count(), 2)
        mock_start.assert_not_called()