Implement hook step-based concurrency system

This implements the hook concurrency plan from TODO_hook_concurrency.md:

## Schema Changes
- Add Snapshot.current_step (IntegerField 0-9, default=0)
- Create migration 0034_snapshot_current_step.py
- Fix uuid_compat imports in migrations 0032 and 0003

## Core Logic
- Add extract_step(hook_name) utility - extracts step from __XX_ pattern
- Add is_background_hook(hook_name) utility - checks for .bg. suffix
- Update Snapshot.create_pending_archiveresults() to create one AR per hook
- Update ArchiveResult.run() to handle hook_name field
- Add Snapshot.advance_step_if_ready() method for step advancement
- Integrate with SnapshotMachine.is_finished() to call advance_step_if_ready()

## Worker Coordination
- Update ArchiveResultWorker.get_queue() for step-based filtering
- ARs are only claimable when their step <= snapshot.current_step

## Hook Renumbering
- Step 5 (DOM extraction): singlefile→50, screenshot→51, pdf→52, dom→53, title→54, readability→55, headers→55, mercury→56, htmltotext→57
- Step 6 (post-DOM): wget→61, git→62, media→63.bg, gallerydl→64.bg, forumdl→65.bg, papersdl→66.bg
- Step 7 (URL extraction): parse_* hooks moved to 70-75

Background hooks (.bg suffix) don't block step advancement, enabling long-running downloads to continue while other hooks proceed.
2026-04-05 15:27:53 +10:00 · 2025-12-28 13:47:25 +00:00
parent 4ccb0863bb
commit 1b5a816022
29 changed files with 297 additions and 83 deletions
--- a/archivebox/workers/worker.py
+++ b/archivebox/workers/worker.py
@@ -354,18 +354,42 @@ class ArchiveResultWorker(Worker):
        return ArchiveResult

    def get_queue(self) -> QuerySet:
-        """Get queue of ArchiveResults ready for processing."""
+        """
+        Get queue of ArchiveResults ready for processing.
+
+        Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step.
+        This ensures hooks execute in order (step 0 → 1 → 2 ... → 9).
+        """
        from core.models import ArchiveResult
+        from archivebox.hooks import extract_step

        qs = super().get_queue()

        if self.plugin:
            qs = qs.filter(plugin=self.plugin)

-        # Note: Removed blocking logic since plugins have separate output directories
-        # and don't interfere with each other. Each plugin runs independently.
+        # Step-based filtering: only process ARs whose step <= snapshot.current_step
+        # Since step is derived from hook_name, we filter in Python after initial query
+        # This is efficient because the base query already filters by retry_at and status

-        return qs
+        # Get candidate ARs
+        candidates = list(qs[:50])  # Limit to avoid loading too many
+        ready_pks = []
+
+        for ar in candidates:
+            if not ar.hook_name:
+                # Legacy ARs without hook_name - process them
+                ready_pks.append(ar.pk)
+                continue
+
+            ar_step = extract_step(ar.hook_name)
+            snapshot_step = ar.snapshot.current_step
+
+            if ar_step <= snapshot_step:
+                ready_pks.append(ar.pk)
+
+        # Return filtered queryset ordered by hook_name (so earlier hooks run first within a step)
+        return ArchiveResult.objects.filter(pk__in=ready_pks).order_by('hook_name', 'retry_at')

    def process_item(self, obj) -> bool:
        """Process an ArchiveResult by running its plugin."""