mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00
85 lines
3.3 KiB
Python
85 lines
3.3 KiB
Python
from datetime import timedelta
|
|
from typing import cast
|
|
from unittest.mock import patch
|
|
|
|
from django.contrib.auth import get_user_model
|
|
from django.contrib.auth.models import UserManager
|
|
from django.test import TestCase
|
|
from django.utils import timezone
|
|
|
|
from archivebox.crawls.models import Crawl, CrawlSchedule
|
|
from archivebox.workers.orchestrator import Orchestrator
|
|
from archivebox.workers.worker import CrawlWorker
|
|
|
|
|
|
class TestScheduledCrawlMaterialization(TestCase):
    """Verify how orchestrators turn due CrawlSchedules into new queued Crawls.

    A "global" orchestrator (exit_on_idle=False, no crawl_id) is expected to
    materialize overdue schedules; one-shot and crawl-scoped orchestrators are not.
    """

    def setUp(self):
        # cast() narrows the generic manager so create_user() type-checks.
        manager = cast(UserManager, get_user_model().objects)
        self.user = manager.create_user(username='schedule-user', password='password')

    def _create_due_schedule(self) -> CrawlSchedule:
        """Create an enabled daily schedule whose template crawl is backdated.

        Returns the refreshed CrawlSchedule; its sealed template crawl is the
        only Crawl attached to it at this point.
        """
        template_crawl = Crawl.objects.create(
            urls='https://example.com/feed.xml',
            max_depth=1,
            tags_str='scheduled',
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
        )
        schedule = CrawlSchedule.objects.create(
            template=template_crawl,
            schedule='daily',
            is_enabled=True,
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
        )

        # Backdate the template via update() (bypasses auto_now fields) so the
        # daily schedule reads as overdue.
        two_days_ago = timezone.now() - timedelta(days=2)
        Crawl.objects.filter(pk=template_crawl.pk).update(
            created_at=two_days_ago,
            modified_at=two_days_ago,
        )
        template_crawl.refresh_from_db()
        schedule.refresh_from_db()
        return schedule

    def test_global_orchestrator_materializes_due_schedule(self):
        """A global orchestrator creates a fresh QUEUED crawl copied from the template."""
        schedule = self._create_due_schedule()

        Orchestrator(exit_on_idle=False)._materialize_due_schedules()

        crawls_for_schedule = Crawl.objects.filter(schedule=schedule).order_by('created_at')
        self.assertEqual(crawls_for_schedule.count(), 2)

        newest = crawls_for_schedule.last()
        self.assertIsNotNone(newest)
        assert newest is not None  # narrow Optional for the attribute checks below
        self.assertEqual(newest.status, Crawl.StatusChoices.QUEUED)
        self.assertEqual(newest.urls, 'https://example.com/feed.xml')
        self.assertEqual(newest.max_depth, 1)
        self.assertEqual(newest.tags_str, 'scheduled')

    def test_one_shot_orchestrator_does_not_materialize_due_schedule(self):
        """Neither one-shot nor crawl-scoped orchestrators materialize schedules."""
        schedule = self._create_due_schedule()

        # exit_on_idle=True marks a one-shot run: only the template crawl remains.
        Orchestrator(exit_on_idle=True)._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)

        # An orchestrator pinned to a single crawl_id must not materialize either.
        Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template.id))._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)

    @patch.object(CrawlWorker, 'start')
    def test_global_orchestrator_waits_one_tick_before_spawning_materialized_schedule(self, mock_start):
        """A materialized crawl is counted in the queue but no worker starts on the same tick."""
        schedule = self._create_due_schedule()

        orchestrator = Orchestrator(exit_on_idle=False)
        # Pretend the crawl claim succeeds so check_queues_and_spawn_workers()
        # proceeds past claiming without touching real worker state.
        with patch.object(orchestrator, '_claim_crawl', return_value=True):
            queue_sizes = orchestrator.check_queues_and_spawn_workers()

        self.assertEqual(queue_sizes['crawl'], 1)
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 2)
        mock_start.assert_not_called()