mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
Implement hook step-based concurrency system
This implements the hook concurrency plan from TODO_hook_concurrency.md:

## Schema Changes
- Add Snapshot.current_step (IntegerField 0-9, default=0)
- Create migration 0034_snapshot_current_step.py
- Fix uuid_compat imports in migrations 0032 and 0003

## Core Logic
- Add extract_step(hook_name) utility - extracts step from __XX_ pattern
- Add is_background_hook(hook_name) utility - checks for .bg. suffix
- Update Snapshot.create_pending_archiveresults() to create one AR per hook
- Update ArchiveResult.run() to handle hook_name field
- Add Snapshot.advance_step_if_ready() method for step advancement
- Integrate with SnapshotMachine.is_finished() to call advance_step_if_ready()

## Worker Coordination
- Update ArchiveResultWorker.get_queue() for step-based filtering
- ARs are only claimable when their step <= snapshot.current_step

## Hook Renumbering
- Step 5 (DOM extraction): singlefile→50, screenshot→51, pdf→52, dom→53, title→54, readability→55, headers→55, mercury→56, htmltotext→57
- Step 6 (post-DOM): wget→61, git→62, media→63.bg, gallerydl→64.bg, forumdl→65.bg, papersdl→66.bg
- Step 7 (URL extraction): parse_* hooks moved to 70-75

Background hooks (.bg suffix) don't block step advancement, enabling long-running downloads to continue while other hooks proceed.
This commit is contained in:
@@ -354,18 +354,42 @@ class ArchiveResultWorker(Worker):
|
||||
return ArchiveResult
|
||||
|
||||
def get_queue(self) -> QuerySet:
    """
    Get queue of ArchiveResults ready for processing.

    Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step.
    This ensures hooks execute in order (step 0 → 1 → 2 ... → 9).

    Returns:
        QuerySet[ArchiveResult]: ARs whose hook step has been reached by their
        snapshot, ordered by hook_name then retry_at so earlier hooks within a
        step run first.
    """
    # Local imports to avoid circular imports at module load time
    # (core.models / archivebox.hooks import worker modules).
    from core.models import ArchiveResult
    from archivebox.hooks import extract_step

    qs = super().get_queue()

    # Restrict to this worker's plugin when one is configured.
    if self.plugin:
        qs = qs.filter(plugin=self.plugin)

    # Step-based filtering: only process ARs whose step <= snapshot.current_step.
    # Since step is derived from hook_name, we filter in Python after the initial
    # query; this is efficient because the base query already filters by
    # retry_at and status, so the candidate set is small.
    candidates = list(qs[:50])  # Limit to avoid loading too many rows at once
    ready_pks = []

    for ar in candidates:
        if not ar.hook_name:
            # Legacy ARs without hook_name - process them unconditionally
            ready_pks.append(ar.pk)
            continue

        ar_step = extract_step(ar.hook_name)
        snapshot_step = ar.snapshot.current_step

        if ar_step <= snapshot_step:
            ready_pks.append(ar.pk)

    # Return a fresh queryset (not the sliced candidates list) ordered by
    # hook_name so earlier hooks run first within a step.
    return ArchiveResult.objects.filter(pk__in=ready_pks).order_by('hook_name', 'retry_at')
|
||||
|
||||
def process_item(self, obj) -> bool:
|
||||
"""Process an ArchiveResult by running its plugin."""
|
||||
|
||||
Reference in New Issue
Block a user