Improve scheduling, runtime paths, and API behavior

This commit is contained in:
Nick Sweeting
2026-03-15 18:31:56 -07:00
parent 7d42c6c8b5
commit 70c9358cf9
37 changed files with 1058 additions and 398 deletions

View File

@@ -336,6 +336,7 @@ class Orchestrator:
queue_sizes = {}
self._enforce_hard_timeouts()
self._materialize_due_schedules()
# Check Binary queue
machine = Machine.current()
@@ -399,6 +400,24 @@ class Orchestrator:
return queue_sizes
def _should_process_schedules(self) -> bool:
return (not self.exit_on_idle) and (self.crawl_id is None)
def _materialize_due_schedules(self) -> None:
    """Enqueue a crawl for every enabled CrawlSchedule that is currently due.

    No-op unless this orchestrator is the global, long-running one (see
    _should_process_schedules). A single `now` timestamp is captured once so
    every schedule in this tick is evaluated and enqueued consistently.
    """
    if not self._should_process_schedules():
        return

    from archivebox.crawls.models import CrawlSchedule

    now = timezone.now()
    # select_related pre-fetches the template (and its creator) so that
    # enqueue() does not issue extra queries per schedule.
    enabled = CrawlSchedule.objects.filter(is_enabled=True).select_related(
        'template',
        'template__created_by',
    )
    for schedule in (sched for sched in enabled if sched.is_due(now)):
        schedule.enqueue(queued_at=now)
def _enforce_hard_timeouts(self) -> None:
"""Force-kill and seal hooks/archiveresults/snapshots that exceed hard limits."""
import time

View File

@@ -0,0 +1,65 @@
from datetime import timedelta
from django.contrib.auth import get_user_model
from django.test import TestCase
from django.utils import timezone
from archivebox.crawls.models import Crawl, CrawlSchedule
from archivebox.workers.orchestrator import Orchestrator
class TestScheduledCrawlMaterialization(TestCase):
    """Verify that Orchestrator._materialize_due_schedules turns due
    CrawlSchedules into queued Crawls, and that one-shot/pinned
    orchestrators leave schedules alone."""

    def setUp(self):
        # Crawl/CrawlSchedule rows require an owning user.
        self.user = get_user_model().objects.create_user(
            username='schedule-user',
            password='password',
        )

    def _create_due_schedule(self) -> CrawlSchedule:
        """Create a 'daily' schedule whose template crawl was created 2 days
        ago, so the schedule counts as due (assumes is_due() compares against
        the template/schedule timestamps — confirm against is_due())."""
        template = Crawl.objects.create(
            urls='https://example.com/feed.xml',
            max_depth=1,
            tags_str='scheduled',
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
            status=Crawl.StatusChoices.SEALED,
            retry_at=None,
        )
        schedule = CrawlSchedule.objects.create(
            template=template,
            schedule='daily',
            is_enabled=True,
            label='Scheduled Feed',
            notes='template',
            created_by=self.user,
        )
        # Backdate via queryset .update() — it bypasses save()/signals and any
        # auto-managed timestamp fields — then reload both objects so the
        # in-memory state matches the DB.
        past = timezone.now() - timedelta(days=2)
        Crawl.objects.filter(pk=template.pk).update(created_at=past, modified_at=past)
        template.refresh_from_db()
        schedule.refresh_from_db()
        return schedule

    def test_global_orchestrator_materializes_due_schedule(self):
        schedule = self._create_due_schedule()
        # A global orchestrator (not exit_on_idle, no crawl_id) should fire it.
        orchestrator = Orchestrator(exit_on_idle=False)
        orchestrator._materialize_due_schedules()
        scheduled_crawls = Crawl.objects.filter(schedule=schedule).order_by('created_at')
        self.assertEqual(scheduled_crawls.count(), 2)
        # The newest crawl is the one materialized from the template.
        queued_crawl = scheduled_crawls.last()
        self.assertEqual(queued_crawl.status, Crawl.StatusChoices.QUEUED)
        self.assertEqual(queued_crawl.urls, 'https://example.com/feed.xml')
        self.assertEqual(queued_crawl.max_depth, 1)
        self.assertEqual(queued_crawl.tags_str, 'scheduled')

    def test_one_shot_orchestrator_does_not_materialize_due_schedule(self):
        schedule = self._create_due_schedule()
        # An exit_on_idle (one-shot) orchestrator must not fire schedules...
        Orchestrator(exit_on_idle=True)._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)
        # ...nor must an orchestrator pinned to a specific crawl_id.
        Orchestrator(exit_on_idle=False, crawl_id=str(schedule.template_id))._materialize_due_schedules()
        self.assertEqual(Crawl.objects.filter(schedule=schedule).count(), 1)

View File

@@ -0,0 +1,75 @@
from pathlib import Path
from types import SimpleNamespace
from unittest.mock import patch
from django.test import SimpleTestCase
from archivebox.workers.worker import SnapshotWorker
class TestSnapshotWorkerRetryForegroundHooks(SimpleTestCase):
    """Unit tests for SnapshotWorker._retry_failed_empty_foreground_hooks.

    Builds the worker via __new__ with all collaborators stubbed out, so no
    database, filesystem, or real hook subprocesses are involved (hence
    SimpleTestCase rather than TestCase).
    """

    def _make_worker(self):
        """Return a SnapshotWorker skeleton with stubbed collaborators."""
        # __new__ bypasses __init__, so no real Snapshot/DB is needed.
        worker = SnapshotWorker.__new__(SnapshotWorker)
        worker.pid = 12345
        worker.snapshot = SimpleNamespace(
            status='started',
            refresh_from_db=lambda: None,
        )
        worker._snapshot_exceeded_hard_timeout = lambda: False
        worker._seal_snapshot_due_to_timeout = lambda: None
        worker._run_hook = lambda *args, **kwargs: SimpleNamespace()
        worker._wait_for_hook = lambda *args, **kwargs: None
        return worker

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_skips_successful_hook_with_only_inline_output(self, mock_log):
        # A hook that succeeded with only inline output (output_str, no
        # output_files) must NOT be replayed, so no retry event is logged.
        worker = self._make_worker()
        archive_result = SimpleNamespace(
            status='succeeded',
            output_files={},
            output_str='scrolled 600px',
            output_json=None,
            refresh_from_db=lambda: None,
        )
        worker._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__45_infiniscroll.js'), archive_result)],
            config={},
        )
        mock_log.assert_not_called()

    @patch('archivebox.workers.worker.log_worker_event')
    def test_retry_replays_failed_hook_with_no_outputs(self, mock_log):
        # A failed hook with no outputs at all must be re-run exactly once,
        # and the retry must be logged.
        worker = self._make_worker()
        run_calls = []
        wait_calls = []

        def run_hook(*args, **kwargs):
            run_calls.append((args, kwargs))
            return SimpleNamespace()

        def wait_for_hook(process, archive_result):
            # Simulate the replayed hook finishing successfully and
            # producing a real output file.
            wait_calls.append((process, archive_result))
            archive_result.status = 'succeeded'
            archive_result.output_files = {'singlefile.html': {}}

        archive_result = SimpleNamespace(
            status='failed',
            output_files={},
            output_str='',
            output_json=None,
            refresh_from_db=lambda: None,
        )
        worker._run_hook = run_hook
        worker._wait_for_hook = wait_for_hook
        worker._retry_failed_empty_foreground_hooks(
            [(Path('/tmp/on_Snapshot__50_singlefile.py'), archive_result)],
            config={},
        )
        assert len(run_calls) == 1
        assert len(wait_calls) == 1
        mock_log.assert_called_once()

View File

@@ -776,7 +776,7 @@ class SnapshotWorker(Worker):
def runloop(self) -> None:
"""Execute all hooks sequentially."""
from archivebox.hooks import discover_hooks, is_background_hook
from archivebox.hooks import discover_hooks, is_background_hook, is_finite_background_hook
from archivebox.core.models import ArchiveResult, Snapshot
from archivebox.config.configset import get_config
@@ -797,7 +797,7 @@ class SnapshotWorker(Worker):
hooks = sorted(hooks, key=lambda h: h.name) # Sort by name (includes step prefix)
foreground_hooks: list[tuple[Path, ArchiveResult]] = []
launched_background_hooks = False
launched_finite_background_hooks = False
# Execute each hook sequentially
for hook_path in hooks:
@@ -835,7 +835,8 @@ class SnapshotWorker(Worker):
process = self._run_hook(hook_path, ar, config)
if is_background:
launched_background_hooks = True
if is_finite_background_hook(hook_name):
launched_finite_background_hooks = True
# Track but don't wait
self.background_processes[hook_name] = process
log_worker_event(
@@ -860,7 +861,7 @@ class SnapshotWorker(Worker):
# All hooks launched (or completed) - terminate bg hooks and seal
self._finalize_background_hooks()
if launched_background_hooks:
if launched_finite_background_hooks:
self._retry_failed_empty_foreground_hooks(foreground_hooks, config)
if self.snapshot.status != Snapshot.StatusChoices.SEALED:
# This triggers enter_sealed() which calls cleanup() and checks parent crawl sealing
@@ -961,9 +962,13 @@ class SnapshotWorker(Worker):
window before giving up.
"""
import time
from archivebox.core.models import Snapshot
from archivebox.core.models import ArchiveResult, Snapshot
retry_delays = (0.0, 0.25, 0.5, 1.0)
retryable_statuses = {
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
}
for hook_path, ar in hooks:
for attempt, delay in enumerate(retry_delays, start=1):
@@ -975,7 +980,9 @@ class SnapshotWorker(Worker):
return
ar.refresh_from_db()
if ar.output_files:
if ar.status not in retryable_statuses:
break
if ar.output_files or ar.output_str or ar.output_json:
break
if delay: