mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Improve scheduling, runtime paths, and API behavior
This commit is contained in:
@@ -336,6 +336,7 @@ class Orchestrator:
|
||||
queue_sizes = {}
|
||||
|
||||
self._enforce_hard_timeouts()
|
||||
self._materialize_due_schedules()
|
||||
|
||||
# Check Binary queue
|
||||
machine = Machine.current()
|
||||
@@ -399,6 +400,24 @@ class Orchestrator:
|
||||
|
||||
return queue_sizes
|
||||
|
||||
def _should_process_schedules(self) -> bool:
|
||||
return (not self.exit_on_idle) and (self.crawl_id is None)
|
||||
|
||||
def _materialize_due_schedules(self) -> None:
    """Enqueue a new Crawl for every enabled CrawlSchedule that is due.

    No-op unless this is a global, long-running orchestrator (see
    `_should_process_schedules`). A single `now` timestamp is used both
    for the due-check and as `queued_at`, so all crawls materialized in
    one pass share the same queue time.
    """
    if not self._should_process_schedules():
        return

    # Imported lazily to avoid import cycles at module load time.
    from archivebox.crawls.models import CrawlSchedule

    now = timezone.now()
    # select_related avoids per-schedule queries when enqueue() reads the
    # template and its creator.
    enabled_schedules = (
        CrawlSchedule.objects
        .filter(is_enabled=True)
        .select_related('template', 'template__created_by')
    )

    for schedule in (s for s in enabled_schedules if s.is_due(now)):
        schedule.enqueue(queued_at=now)
|
||||
|
||||
def _enforce_hard_timeouts(self) -> None:
|
||||
"""Force-kill and seal hooks/archiveresults/snapshots that exceed hard limits."""
|
||||
import time
|
||||
|
||||
65
archivebox/workers/tests/test_scheduled_crawls.py
Normal file
65
archivebox/workers/tests/test_scheduled_crawls.py
Normal file
@@ -0,0 +1,65 @@
|
||||
from datetime import timedelta
|
||||
|
||||
from django.contrib.auth import get_user_model
|
||||
from django.test import TestCase
|
||||
from django.utils import timezone
|
||||
|
||||
from archivebox.crawls.models import Crawl, CrawlSchedule
|
||||
from archivebox.workers.orchestrator import Orchestrator
|
||||
|
||||
|
||||
class TestScheduledCrawlMaterialization(TestCase):
    """Verify Orchestrator._materialize_due_schedules() creates queued crawls
    from due CrawlSchedules, and that one-shot / crawl-scoped orchestrators
    do not."""

    def setUp(self):
        # Owner for the template crawl and its schedule.
        self.user = get_user_model().objects.create_user(
            username='schedule-user',
            password='password',
        )

    def _create_due_schedule(self) -> CrawlSchedule:
        """Create a sealed template Crawl plus a daily CrawlSchedule, then
        backdate the template 2 days so the schedule reads as due now."""
        template = Crawl.objects.create(
            urls='https://example.com/feed.xml',
            max_depth=1,
            tags_str='scheduled',
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,  # template itself must never be retried/run
        )
        schedule = CrawlSchedule.objects.create(
            template=template,
            schedule='daily',
            is_enabled=True,
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
        )
        # Backdate via queryset .update() to bypass auto_now on modified_at;
        # 2 days ago makes a 'daily' schedule due immediately.
        past = timezone.now() - timedelta(days=2)
        Crawl.objects.filter(pk=template.pk).update(created_at=past, modified_at=past)
        template.refresh_from_db()
        schedule.refresh_from_db()
        return schedule

    def test_global_orchestrator_materializes_due_schedule(self):
        schedule = self._create_due_schedule()

        # Global orchestrator (no crawl_id, not exit_on_idle) should process schedules.
        orchestrator = Orchestrator(exit_on_idle=False)
        orchestrator._materialize_due_schedules()

        # Expect 2 crawls: the original template + one newly queued copy.
        scheduled_crawls = Crawl.objects.filter(schedule=schedule).order_by('created_at')
        self.assertEqual(scheduled_crawls.count(), 2)

        # The newest crawl is the materialized one and must inherit the
        # template's urls/depth/tags while starting in QUEUED status.
        queued_crawl = scheduled_crawls.last()
        self.assertEqual(queued_crawl.status, Crawl.StatusChoices.QUEUED)
        self.assertEqual(queued_crawl.urls, 'https://example.com/feed.xml')
        self.assertEqual(queued_crawl.max_depth, 1)
        self.assertEqual(queued_crawl.tags_str, 'scheduled')

    def test_one_shot_orchestrator_does_not_materialize_due_schedule(self):
        schedule = self._create_due_schedule()

        # exit_on_idle=True (one-shot run) must not enqueue scheduled crawls.
        Orchestrator(exit_on_idle=True)._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)

        # A crawl-scoped orchestrator (crawl_id set) must not either.
        Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template_id))._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)
|
||||
75
archivebox/workers/tests/test_snapshot_worker.py
Normal file
75
archivebox/workers/tests/test_snapshot_worker.py
Normal file
@@ -0,0 +1,75 @@
|
||||
from pathlib import Path
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import patch
|
||||
|
||||
from django.test import SimpleTestCase
|
||||
|
||||
from archivebox.workers.worker import SnapshotWorker
|
||||
|
||||
|
||||
class TestSnapshotWorkerRetryForegroundHooks(SimpleTestCase):
    """Unit tests for SnapshotWorker._retry_failed_empty_foreground_hooks():
    hooks that succeeded (or produced any output) are skipped, while failed
    hooks with no outputs at all are re-run once."""

    def _make_worker(self):
        """Build a SnapshotWorker without running __init__ (which presumably
        does I/O / DB setup — bypassed via __new__), stubbing every
        collaborator the retry path touches."""
        worker = SnapshotWorker.__new__(SnapshotWorker)
        worker.pid = 12345
        # Minimal snapshot stand-in: retry logic only reads .status and
        # calls .refresh_from_db().
        worker.snapshot = SimpleNamespace(
            status='started',
            refresh_from_db=lambda: None,
        )
        # Never hit the hard-timeout path in these tests.
        worker._snapshot_exceeded_hard_timeout = lambda: False
        worker._seal_snapshot_due_to_timeout = lambda: None
        # Default no-op hook runners; individual tests override these.
        worker._run_hook = lambda *args, **kwargs: SimpleNamespace()
        worker._wait_for_hook = lambda *args, **kwargs: None
        return worker

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_skips_successful_hook_with_only_inline_output(self, mock_log):
        worker = self._make_worker()
        # Succeeded with inline output_str but no output_files: must NOT be
        # treated as "empty" and retried.
        archive_result = SimpleNamespace(
            status='succeeded',
            output_files={},
            output_str='scrolled 600px',
            output_json=None,
            refresh_from_db=lambda: None,
        )

        worker._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__45_infiniscroll.js'), archive_result)],
            config={},
        )

        # No retry happened, so nothing was logged.
        mock_log.assert_not_called()

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_replays_failed_hook_with_no_outputs(self, mock_log):
        worker = self._make_worker()
        run_calls = []
        wait_calls = []

        # Record each hook launch.
        def run_hook(*args, **kwargs):
            run_calls.append((args, kwargs))
            return SimpleNamespace()

        # Simulate the retried hook succeeding and producing an output file.
        def wait_for_hook(process, archive_result):
            wait_calls.append((process, archive_result))
            archive_result.status = 'succeeded'
            archive_result.output_files = {'singlefile.html': {}}

        # Failed hook with no outputs of any kind: eligible for retry.
        archive_result = SimpleNamespace(
            status='failed',
            output_files={},
            output_str='',
            output_json=None,
            refresh_from_db=lambda: None,
        )

        worker._run_hook = run_hook
        worker._wait_for_hook = wait_for_hook

        worker._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__50_singlefile.py'), archive_result)],
            config={},
        )

        # Exactly one replay: launched once, waited on once, logged once.
        assert len(run_calls) == 1
        assert len(wait_calls) == 1
        mock_log.assert_called_once()
|
||||
@@ -776,7 +776,7 @@ class SnapshotWorker(Worker):
|
||||
|
||||
def runloop(self) -> None:
|
||||
"""Execute all hooks sequentially."""
|
||||
from archivebox.hooks import discover_hooks, is_background_hook
|
||||
from archivebox.hooks import discover_hooks, is_background_hook, is_finite_background_hook
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
@@ -797,7 +797,7 @@ class SnapshotWorker(Worker):
|
||||
hooks = sorted(hooks, key=lambda h: h.name) # Sort by name (includes step prefix)
|
||||
|
||||
foreground_hooks: list[tuple[Path, ArchiveResult]] = []
|
||||
launched_background_hooks = False
|
||||
launched_finite_background_hooks = False
|
||||
|
||||
# Execute each hook sequentially
|
||||
for hook_path in hooks:
|
||||
@@ -835,7 +835,8 @@ class SnapshotWorker(Worker):
|
||||
process = self._run_hook(hook_path, ar, config)
|
||||
|
||||
if is_background:
|
||||
launched_background_hooks = True
|
||||
if is_finite_background_hook(hook_name):
|
||||
launched_finite_background_hooks = True
|
||||
# Track but don't wait
|
||||
self.background_processes[hook_name] = process
|
||||
log_worker_event(
|
||||
@@ -860,7 +861,7 @@ class SnapshotWorker(Worker):
|
||||
|
||||
# All hooks launched (or completed) - terminate bg hooks and seal
|
||||
self._finalize_background_hooks()
|
||||
if launched_background_hooks:
|
||||
if launched_finite_background_hooks:
|
||||
self._retry_failed_empty_foreground_hooks(foreground_hooks, config)
|
||||
if self.snapshot.status != Snapshot.StatusChoices.SEALED:
|
||||
# This triggers enter_sealed() which calls cleanup() and checks parent crawl sealing
|
||||
@@ -961,9 +962,13 @@ class SnapshotWorker(Worker):
|
||||
window before giving up.
|
||||
"""
|
||||
import time
|
||||
from archivebox.core.models import Snapshot
|
||||
from archivebox.core.models import ArchiveResult, Snapshot
|
||||
|
||||
retry_delays = (0.0, 0.25, 0.5, 1.0)
|
||||
retryable_statuses = {
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
}
|
||||
|
||||
for hook_path, ar in hooks:
|
||||
for attempt, delay in enumerate(retry_delays, start=1):
|
||||
@@ -975,7 +980,9 @@ class SnapshotWorker(Worker):
|
||||
return
|
||||
|
||||
ar.refresh_from_db()
|
||||
if ar.output_files:
|
||||
if ar.status not in retryable_statuses:
|
||||
break
|
||||
if ar.output_files or ar.output_str or ar.output_json:
|
||||
break
|
||||
|
||||
if delay:
|
||||
|
||||
Reference in New Issue
Block a user