Improve concurrency control between plugin hooks (#1721)

<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line
length changes. -->

# Summary

<!--e.g. This PR fixes ABC or adds the ability to do XYZ...-->

# Related issues

<!-- e.g. #123 or Roadmap goal #
https://github.com/pirate/ArchiveBox/wiki/Roadmap -->

# Changes in these areas

- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk
This commit is contained in:
Nick Sweeting
2025-12-28 12:48:53 -08:00
committed by GitHub
30 changed files with 325 additions and 125 deletions

View File

@@ -352,18 +352,42 @@ class ArchiveResultWorker(Worker):
return ArchiveResult
def get_queue(self) -> QuerySet:
    """
    Get queue of ArchiveResults ready for processing.

    Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step.
    This ensures hooks execute in order (step 0 → 1 → 2 ... → 9).

    ArchiveResults without a hook_name (legacy rows) are always treated as ready.

    Returns:
        QuerySet of ready ArchiveResults ordered by hook_name, then retry_at,
        so earlier hooks run first within a step.
    """
    from core.models import ArchiveResult
    from archivebox.hooks import extract_step

    qs = super().get_queue()
    if self.plugin:
        qs = qs.filter(plugin=self.plugin)

    # The step is derived from hook_name rather than stored as a column, so the
    # step comparison is done in Python on a bounded slice of the base queue.
    # select_related('snapshot') loads each candidate's snapshot in the same
    # query instead of issuing one extra query per candidate for current_step.
    # NOTE(review): only the first 50 candidates are inspected; if all of them are
    # still blocked on an earlier step, ready rows further down are not returned
    # by this call — confirm this is acceptable for large queues.
    candidates = qs.select_related('snapshot')[:50]

    ready_pks = [
        ar.pk
        for ar in candidates
        # Legacy ARs have no hook_name and therefore no step to wait for.
        if not ar.hook_name or extract_step(ar.hook_name) <= ar.snapshot.current_step
    ]

    # Return filtered queryset ordered by hook_name (so earlier hooks run first within a step)
    return ArchiveResult.objects.filter(pk__in=ready_pks).order_by('hook_name', 'retry_at')
def process_item(self, obj) -> bool:
"""Process an ArchiveResult by running its plugin."""