mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00
Improve concurrency control between plugin hooks (#1721)
<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line length changes. -->

# Summary

<!-- e.g. This PR fixes ABC or adds the ability to do XYZ... -->

# Related issues

<!-- e.g. #123 or Roadmap goal # https://github.com/pirate/ArchiveBox/wiki/Roadmap -->

# Changes these areas

- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk
This commit is contained in:
@@ -352,18 +352,42 @@ class ArchiveResultWorker(Worker):
|
||||
return ArchiveResult
|
||||
|
||||
def get_queue(self) -> QuerySet:
|
||||
"""Get queue of ArchiveResults ready for processing."""
|
||||
"""
|
||||
Get queue of ArchiveResults ready for processing.
|
||||
|
||||
Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step.
|
||||
This ensures hooks execute in order (step 0 → 1 → 2 ... → 9).
|
||||
"""
|
||||
from core.models import ArchiveResult
|
||||
from archivebox.hooks import extract_step
|
||||
|
||||
qs = super().get_queue()
|
||||
|
||||
if self.plugin:
|
||||
qs = qs.filter(plugin=self.plugin)
|
||||
|
||||
# Note: Removed blocking logic since plugins have separate output directories
|
||||
# and don't interfere with each other. Each plugin runs independently.
|
||||
# Step-based filtering: only process ARs whose step <= snapshot.current_step
|
||||
# Since step is derived from hook_name, we filter in Python after initial query
|
||||
# This is efficient because the base query already filters by retry_at and status
|
||||
|
||||
return qs
|
||||
# Get candidate ARs
|
||||
candidates = list(qs[:50]) # Limit to avoid loading too many
|
||||
ready_pks = []
|
||||
|
||||
for ar in candidates:
|
||||
if not ar.hook_name:
|
||||
# Legacy ARs without hook_name - process them
|
||||
ready_pks.append(ar.pk)
|
||||
continue
|
||||
|
||||
ar_step = extract_step(ar.hook_name)
|
||||
snapshot_step = ar.snapshot.current_step
|
||||
|
||||
if ar_step <= snapshot_step:
|
||||
ready_pks.append(ar.pk)
|
||||
|
||||
# Return filtered queryset ordered by hook_name (so earlier hooks run first within a step)
|
||||
return ArchiveResult.objects.filter(pk__in=ready_pks).order_by('hook_name', 'retry_at')
|
||||
|
||||
def process_item(self, obj) -> bool:
|
||||
"""Process an ArchiveResult by running its plugin."""
|
||||
|
||||
Reference in New Issue
Block a user