Improve concurrency control between plugin hooks (#1721)

<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line
length changes. -->

# Summary

<!--e.g. This PR fixes ABC or adds the ability to do XYZ...-->

# Related issues

<!-- e.g. #123 or Roadmap goal #
https://github.com/pirate/ArchiveBox/wiki/Roadmap -->

# Changes these areas

- [ ] Bugfixes
- [ ] Feature behavior
- [ ] Command line interface
- [ ] Configuration options
- [ ] Internal architecture
- [ ] Snapshot data layout on disk
This commit is contained in:
Nick Sweeting
2025-12-28 12:48:53 -08:00
committed by GitHub
30 changed files with 325 additions and 125 deletions

View File

@@ -310,37 +310,40 @@ archivebox/plugins/{plugin_name}/
## Implementation Checklist
### Phase 1: Schema Migration ✅
- [ ] Add `Snapshot.current_step` (IntegerField 0-9, default=0)
- [ ] Add `ArchiveResult.hook_name` (CharField, nullable) - just filename
- [ ] Create migration: `0033_snapshot_current_step_archiveresult_hook_name.py`
- [x] Add `Snapshot.current_step` (IntegerField 0-9, default=0)
- [x] Add `ArchiveResult.hook_name` (CharField, nullable) - just filename
- [x] Create migration: `0034_snapshot_current_step.py`
### Phase 2: Core Logic Updates
- [ ] Add `extract_step(hook_name)` utility in `archivebox/hooks.py`
### Phase 2: Core Logic Updates
- [x] Add `extract_step(hook_name)` utility in `archivebox/hooks.py`
- Extract first digit from `__XX_` pattern
- Default to 9 for unnumbered hooks
- [ ] Update `Snapshot.create_pending_archiveresults()` in `archivebox/core/models.py`:
- [x] Add `is_background_hook(hook_name)` utility in `archivebox/hooks.py`
- Check for `.bg.` in filename
- [x] Update `Snapshot.create_pending_archiveresults()` in `archivebox/core/models.py`:
- Discover all hooks (not plugins)
- Create one AR per hook with `hook_name` set
- [ ] Update `ArchiveResult.run()` in `archivebox/core/models.py`:
- [x] Update `ArchiveResult.run()` in `archivebox/core/models.py`:
- If `hook_name` set: run single hook
- If `hook_name` None: discover all plugin hooks (existing behavior)
- [ ] Add `Snapshot.advance_step_if_ready()` method:
- [x] Add `Snapshot.advance_step_if_ready()` method:
- Check if all foreground ARs in current step finished
- Increment `current_step` if ready
- Ignore background hooks (.bg) in completion check
- [ ] Integrate with `SnapshotMachine.is_finished()` in `archivebox/core/statemachines.py`:
- [x] Integrate with `SnapshotMachine.is_finished()` in `archivebox/core/statemachines.py`:
- Call `advance_step_if_ready()` before checking if done
### Phase 3: Worker Coordination
- [ ] Update worker AR claiming query in `archivebox/workers/worker.py`:
### Phase 3: Worker Coordination
- [x] Update worker AR claiming query in `archivebox/workers/worker.py`:
- Filter: `extract_step(ar.hook_name) <= snapshot.current_step`
- Note: May need to denormalize or use clever query since step is derived
- Alternative: Claim any AR in QUEUED state, check step in Python before processing
- Claims ARs in QUEUED state, checks step in Python before processing
- Orders by hook_name for deterministic execution within step
### Phase 4: Hook Renumbering
- [ ] Renumber hooks per renumbering map below
- [ ] Add `.bg` suffix to long-running hooks
- [ ] Test all hooks still work after renumbering
### Phase 4: Hook Renumbering
- [x] Renumber hooks per renumbering map below
- [x] Add `.bg` suffix to long-running hooks (media, gallerydl, forumdl, papersdl)
- [x] Move parse_* hooks to step 7 (70-79)
- [x] Test all hooks still work after renumbering
## Migration Path
@@ -353,25 +356,34 @@ No special migration needed:
### Renumbering Map
**Current → New:**
**Completed Renames:**
```
git/on_Snapshot__12_git.py → git/on_Snapshot__62_git.py
media/on_Snapshot__51_media.py → media/on_Snapshot__63_media.bg.py
gallerydl/on_Snapshot__52_gallerydl.py → gallerydl/on_Snapshot__64_gallerydl.bg.py
forumdl/on_Snapshot__53_forumdl.py → forumdl/on_Snapshot__65_forumdl.bg.py
papersdl/on_Snapshot__54_papersdl.py → papersdl/on_Snapshot__66_papersdl.bg.py
# Step 5: DOM Extraction (sequential, non-background)
singlefile/on_Snapshot__37_singlefile.py → singlefile/on_Snapshot__50_singlefile.py
screenshot/on_Snapshot__34_screenshot.js → screenshot/on_Snapshot__51_screenshot.js ✅
pdf/on_Snapshot__35_pdf.js → pdf/on_Snapshot__52_pdf.js ✅
dom/on_Snapshot__36_dom.js → dom/on_Snapshot__53_dom.js ✅
title/on_Snapshot__32_title.js → title/on_Snapshot__54_title.js ✅
readability/on_Snapshot__52_readability.py → readability/on_Snapshot__55_readability.py ✅
headers/on_Snapshot__33_headers.js → headers/on_Snapshot__55_headers.js ✅
mercury/on_Snapshot__53_mercury.py → mercury/on_Snapshot__56_mercury.py ✅
htmltotext/on_Snapshot__54_htmltotext.py → htmltotext/on_Snapshot__57_htmltotext.py ✅
readability/on_Snapshot__52_readability.py → readability/on_Snapshot__55_readability.py
mercury/on_Snapshot__53_mercury.py → mercury/on_Snapshot__56_mercury.py
# Step 6: Post-DOM Extraction (background for long-running)
wget/on_Snapshot__50_wget.py → wget/on_Snapshot__61_wget.py
git/on_Snapshot__12_git.py → git/on_Snapshot__62_git.py ✅
media/on_Snapshot__51_media.py → media/on_Snapshot__63_media.bg.py ✅
gallerydl/on_Snapshot__52_gallerydl.py → gallerydl/on_Snapshot__64_gallerydl.bg.py ✅
forumdl/on_Snapshot__53_forumdl.py → forumdl/on_Snapshot__65_forumdl.bg.py ✅
papersdl/on_Snapshot__54_papersdl.py → papersdl/on_Snapshot__66_papersdl.bg.py ✅
singlefile/on_Snapshot__37_singlefile.py → singlefile/on_Snapshot__50_singlefile.py
screenshot/on_Snapshot__34_screenshot.js → screenshot/on_Snapshot__51_screenshot.js
pdf/on_Snapshot__35_pdf.js → pdf/on_Snapshot__52_pdf.js
dom/on_Snapshot__36_dom.js → dom/on_Snapshot__53_dom.js
title/on_Snapshot__32_title.js → title/on_Snapshot__54_title.js
headers/on_Snapshot__33_headers.js → headers/on_Snapshot__55_headers.js
wget/on_Snapshot__50_wget.py → wget/on_Snapshot__61_wget.py
# Step 7: URL Extraction (parse_* hooks moved from step 6)
parse_html_urls/on_Snapshot__60_parse_html_urls.py → parse_html_urls/on_Snapshot__70_parse_html_urls.py ✅
parse_txt_urls/on_Snapshot__62_parse_txt_urls.py → parse_txt_urls/on_Snapshot__71_parse_txt_urls.py ✅
parse_rss_urls/on_Snapshot__61_parse_rss_urls.py → parse_rss_urls/on_Snapshot__72_parse_rss_urls.py ✅
parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py → parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py ✅
parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py → parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py ✅
parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js → parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js ✅
```
## Testing Strategy

View File

@@ -11,18 +11,6 @@ from archivebox.misc.util import enforce_types, docstring
from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR
from archivebox.config.common import SHELL_CONFIG
from archivebox.misc.legacy import parse_json_links_details
from archivebox.misc.folders import (
get_indexed_folders,
get_archived_folders,
get_invalid_folders,
get_unarchived_folders,
get_present_folders,
get_valid_folders,
get_duplicate_folders,
get_orphaned_folders,
get_corrupted_folders,
get_unrecognized_folders,
)
from archivebox.misc.system import get_dir_size
from archivebox.misc.logging_util import printable_filesize
@@ -55,42 +43,40 @@ def status(out_dir: Path=DATA_DIR) -> None:
size = printable_filesize(num_bytes)
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
num_archived = len(get_archived_folders(links, out_dir=out_dir))
num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
num_present = len(get_present_folders(links, out_dir=out_dir))
num_valid = len(get_valid_folders(links, out_dir=out_dir))
# Use DB as source of truth for snapshot status
num_indexed = links.count()
num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count()
num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count()
print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
# Count directories on filesystem
num_present = 0
orphaned_dirs = []
if ARCHIVE_DIR.exists():
for entry in ARCHIVE_DIR.iterdir():
if entry.is_dir():
num_present += 1
if not links.filter(timestamp=entry.name).exists():
orphaned_dirs.append(str(entry))
num_valid = min(num_present, num_indexed) # approximate
print()
print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), f' ({get_valid_folders.__doc__})')
duplicate = get_duplicate_folders(links, out_dir=out_dir)
orphaned = get_orphaned_folders(links, out_dir=out_dir)
corrupted = get_corrupted_folders(links, out_dir=out_dir)
unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
print(f' > [red]invalid:[/red] {num_invalid}'.ljust(36), f' ({get_invalid_folders.__doc__})')
print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
print(f' > present: {num_present}'.ljust(36), '(directories in archive/)')
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
num_orphaned = len(orphaned_dirs)
print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
if num_indexed:
print(' [violet]Hint:[/violet] You can list link data directories by status like so:')
print(' [green]archivebox list --status=<status> (e.g. indexed, corrupted, archived, etc.)[/green]')
print(' [violet]Hint:[/violet] You can list snapshots by status like so:')
print(' [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]')
if orphaned:
if orphaned_dirs:
print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
print(' [green]archivebox init[/green]')
if num_invalid:
print(' [violet]Hint:[/violet] You may need to manually remove or fix some invalid data directories, afterwards make sure to run:')
print(' [green]archivebox init[/green]')
print()
print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')

View File

@@ -1,7 +1,7 @@
# Generated by Django 6.0 on 2025-12-28 05:12
import django.db.models.deletion
import uuid
from archivebox import uuid_compat
from django.conf import settings
from django.db import migrations, models
@@ -49,7 +49,7 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
),
migrations.AddConstraint(
model_name='snapshot',

View File

@@ -0,0 +1,23 @@
# Generated by Django 6.0 on 2025-12-28
# Add Snapshot.current_step field for hook step-based execution
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add Snapshot.current_step (0-9) to track step-based hook execution."""

    # Depends on the migration that added ArchiveResult.hook_name, since
    # current_step is only meaningful once ARs are keyed by individual hook.
    dependencies = [
        ('core', '0033_rename_extractor_add_hook_name'),
    ]

    operations = [
        migrations.AddField(
            model_name='snapshot',
            name='current_step',
            # PositiveSmallIntegerField: step values only range 0-9.
            field=models.PositiveSmallIntegerField(
                default=0,
                db_index=True,
                help_text='Current hook step being executed (0-9). Used for sequential hook execution.'
            ),
        ),
    ]

View File

@@ -334,6 +334,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs
fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().')
current_step = models.PositiveSmallIntegerField(default=0, db_index=True, help_text='Current hook step being executed (0-9). Used for sequential hook execution.')
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
@@ -1243,23 +1244,33 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def create_pending_archiveresults(self) -> list['ArchiveResult']:
"""
Create ArchiveResult records for all enabled plugins.
Create ArchiveResult records for all enabled hooks.
Uses the hooks system to discover available plugins from:
Uses the hooks system to discover available hooks from:
- archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
- data/plugins/*/on_Snapshot__*.{py,sh,js}
"""
from archivebox.hooks import get_enabled_plugins
plugins = get_enabled_plugins()
Creates one ArchiveResult per hook (not per plugin), with hook_name set.
This enables step-based execution where all hooks in a step can run in parallel.
"""
from archivebox.hooks import discover_hooks
hooks = discover_hooks('Snapshot')
archiveresults = []
for plugin in plugins:
if ArchiveResult.objects.filter(snapshot=self, plugin=plugin).exists():
for hook_path in hooks:
hook_name = hook_path.name # e.g., 'on_Snapshot__50_wget.py'
plugin = hook_path.parent.name # e.g., 'wget'
# Check if AR already exists for this specific hook
if ArchiveResult.objects.filter(snapshot=self, hook_name=hook_name).exists():
continue
archiveresult, _ = ArchiveResult.objects.get_or_create(
snapshot=self, plugin=plugin,
archiveresult, created = ArchiveResult.objects.get_or_create(
snapshot=self,
hook_name=hook_name,
defaults={
'plugin': plugin,
'status': ArchiveResult.INITIAL_STATE,
'retry_at': timezone.now(),
'created_by_id': self.created_by_id,
@@ -1267,8 +1278,57 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
)
if archiveresult.status == ArchiveResult.INITIAL_STATE:
archiveresults.append(archiveresult)
return archiveresults
def advance_step_if_ready(self) -> bool:
    """
    Advance current_step if all foreground hooks in the current step are finished.

    Called by the state machine to check if the step can advance.
    Background hooks (.bg) don't block step advancement.

    Step advancement rules:
    - All foreground ARs in the current step must be finished (not pending/queued/started)
    - Background ARs (hook_name contains '.bg.') are ignored for advancement
    - When ready, increments current_step by 1 (up to 9)

    Returns:
        True if the step was advanced, False if not ready or already at step 9.

    NOTE(review): this read-check-then-save sequence is not atomic — two
    concurrent callers could both pass the checks and double-increment
    current_step; confirm callers serialize this (e.g. via the state machine).
    """
    # Local import to avoid a circular import between models and hooks.
    from archivebox.hooks import extract_step, is_background_hook

    if self.current_step >= 9:
        return False  # Already at final step

    # Get all ARs that have a hook_name set (legacy ARs without one are
    # excluded — they have no derivable step).
    current_step_ars = self.archiveresult_set.filter(
        hook_name__isnull=False
    ).exclude(hook_name='')

    # Check each AR; step membership is derived from hook_name in Python,
    # so filtering can't be pushed into the DB query.
    for ar in current_step_ars:
        ar_step = extract_step(ar.hook_name)
        if ar_step != self.current_step:
            continue  # Not in current step
        if is_background_hook(ar.hook_name):
            continue  # Background hooks don't block
        # Foreground hook in current step - check if finished
        if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES:
            # Still pending/queued - can't advance
            return False
        if ar.status == ArchiveResult.StatusChoices.STARTED:
            # Still running (STARTED counts as active, not finished) - can't advance
            return False

    # All foreground hooks in current step are finished - advance!
    self.current_step += 1
    self.save(update_fields=['current_step', 'modified_at'])
    return True
def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
"""
Reset failed/skipped ArchiveResults to queued for retry.
@@ -1301,11 +1361,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
end_ts=None,
)
# Also reset the snapshot so it gets re-checked
# Also reset the snapshot and current_step so it gets re-checked from the beginning
if count > 0:
self.status = self.StatusChoices.STARTED
self.retry_at = retry_at
self.save(update_fields=['status', 'retry_at', 'modified_at'])
self.current_step = 0 # Reset to step 0 for retry
self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])
return count
@@ -1841,45 +1902,63 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def run(self):
"""
Execute this ArchiveResult's plugin and update status.
Execute this ArchiveResult's hook and update status.
Discovers and runs the hook script for self.plugin,
updates status/output fields, queues discovered URLs, and triggers indexing.
If self.hook_name is set, runs only that specific hook.
If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat).
Updates status/output fields, queues discovered URLs, and triggers indexing.
"""
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
# Find ALL hooks for this plugin
# plugin = plugin name (e.g., 'chrome')
# Each plugin can have multiple hooks that run in sequence
# Determine which hook(s) to run
hooks = []
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
if not base_dir.exists():
continue
plugin_dir = base_dir / self.plugin
if plugin_dir.exists():
matches = list(plugin_dir.glob('on_Snapshot__*.*'))
if matches:
# Sort by name for deterministic order (numeric prefix controls execution order)
hooks.extend(sorted(matches))
if self.hook_name:
# SPECIFIC HOOK MODE: Find the specific hook by name
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
if not base_dir.exists():
continue
plugin_dir = base_dir / self.plugin
if plugin_dir.exists():
hook_path = plugin_dir / self.hook_name
if hook_path.exists():
hooks.append(hook_path)
break
else:
# LEGACY MODE: Discover all hooks for this plugin (backwards compatibility)
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
if not base_dir.exists():
continue
plugin_dir = base_dir / self.plugin
if plugin_dir.exists():
matches = list(plugin_dir.glob('on_Snapshot__*.*'))
if matches:
hooks.extend(sorted(matches))
if not hooks:
self.status = self.StatusChoices.FAILED
self.output_str = f'No hooks found for plugin: {self.plugin}'
if self.hook_name:
self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
else:
self.output_str = f'No hooks found for plugin: {self.plugin}'
self.retry_at = None
self.save()
return
# plugin field contains plugin name
# Output directory is plugin_dir for the hook output
plugin_dir = Path(self.snapshot.output_dir) / self.plugin
# Run ALL hooks in the plugin sequentially
start_ts = timezone.now()
has_background_hook = False
is_bg_hook = False
for hook in hooks:
# Check if this is a background hook
is_bg_hook = is_background_hook(hook.name)
result = run_hook(
hook,
output_dir=plugin_dir,
@@ -1890,20 +1969,21 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
depth=self.snapshot.depth,
)
# If any hook is background, mark this ArchiveResult as started
# Background hooks return None
if result is None:
has_background_hook = True
is_bg_hook = True
# Update status based on hook execution
if has_background_hook:
# BACKGROUND HOOK(S) - still running, return immediately
if is_bg_hook:
# BACKGROUND HOOK - still running, return immediately
# Status stays STARTED, will be finalized by Snapshot.cleanup()
self.status = self.StatusChoices.STARTED
self.start_ts = start_ts
self.pwd = str(plugin_dir)
self.save()
return
# ALL FOREGROUND HOOKS - completed, update from filesystem
# FOREGROUND HOOK - completed, update from filesystem
self.start_ts = start_ts
self.pwd = str(plugin_dir)
self.update_from_output()
@@ -1911,11 +1991,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Clean up empty output directory if no files were created
if plugin_dir.exists() and not self.output_files:
try:
# Only remove if directory is completely empty
if not any(plugin_dir.iterdir()):
plugin_dir.rmdir()
except (OSError, RuntimeError):
pass # Directory not empty or can't be removed, that's fine
pass
def update_from_output(self):
"""

View File

@@ -60,6 +60,11 @@ class SnapshotMachine(StateMachine, strict_states=True):
if not self.snapshot.archiveresult_set.exists():
return False
# Try to advance step if ready (handles step-based hook execution)
# This will increment current_step when all foreground hooks in current step are done
while self.snapshot.advance_step_if_ready():
pass # Keep advancing until we can't anymore
# if archiveresults exist but are still pending, it's not finished
if self.snapshot.pending_archiveresults().exists():
return False

View File

@@ -15,10 +15,21 @@ Hook contract:
Exit: 0 = success, non-zero = failure
Execution order:
- Extractors run sequentially within each Snapshot (ordered by numeric prefix)
- Multiple Snapshots can process in parallel
- Hooks are numbered 00-99 with first digit determining step (0-9)
- All hooks in a step can run in parallel
- Steps execute sequentially (step 0 → step 1 → ... → step 9)
- Background hooks (.bg suffix) don't block step advancement
- Failed extractors don't block subsequent extractors
Hook Naming Convention:
on_{ModelName}__{run_order}_{description}[.bg].{ext}
Examples:
on_Snapshot__00_setup.py # Step 0, runs first
on_Snapshot__20_chrome_tab.bg.js # Step 2, background (doesn't block)
on_Snapshot__50_screenshot.js # Step 5, foreground (blocks step)
on_Snapshot__63_media.bg.py # Step 6, background (long-running)
Dependency handling:
Extractor plugins that depend on other plugins' output should check at runtime:
@@ -39,11 +50,14 @@ API (all hook logic lives here):
discover_hooks(event) -> List[Path] Find hook scripts
run_hook(script, ...) -> HookResult Execute a hook script
run_hooks(event, ...) -> List[HookResult] Run all hooks for an event
extract_step(hook_name) -> int Get step number (0-9) from hook name
is_background_hook(name) -> bool Check if hook is background (.bg suffix)
"""
__package__ = 'archivebox'
import os
import re
import json
import signal
import time
@@ -60,6 +74,63 @@ BUILTIN_PLUGINS_DIR = Path(__file__).parent / 'plugins'
USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins'
# =============================================================================
# Hook Step Extraction
# =============================================================================
def extract_step(hook_name: str) -> int:
    """
    Return the execution step (0-9) encoded in a hook filename.

    Hook filenames follow on_{Model}__{XX}_{description}[.bg].{ext}, where
    XX is a two-digit number whose tens digit is the step.

    Args:
        hook_name: Hook filename (e.g., 'on_Snapshot__50_wget.py')

    Returns:
        Step number 0-9, or 9 (default) for unnumbered hooks.

    Examples:
        extract_step('on_Snapshot__05_chrome.py') -> 0
        extract_step('on_Snapshot__50_wget.py') -> 5
        extract_step('on_Snapshot__63_media.bg.py') -> 6
        extract_step('on_Snapshot__99_cleanup.sh') -> 9
        extract_step('on_Snapshot__unnumbered.py') -> 9 (default)
    """
    numbered = re.search(r'__(\d{2})_', hook_name)
    if numbered is None:
        # Unnumbered hooks run last: warn on stderr and fall back to step 9.
        import sys
        print(f"Warning: Hook '{hook_name}' has no step number (expected __XX_), defaulting to step 9", file=sys.stderr)
        return 9
    # The tens digit of the two-digit prefix selects the step (0-9).
    return int(numbered.group(1)) // 10
def is_background_hook(hook_name: str) -> bool:
"""
Check if a hook is a background hook (doesn't block step advancement).
Background hooks have '.bg.' in their filename before the extension.
Args:
hook_name: Hook filename (e.g., 'on_Snapshot__20_chrome_tab.bg.js')
Returns:
True if background hook, False if foreground.
Examples:
is_background_hook('on_Snapshot__20_chrome_tab.bg.js') -> True
is_background_hook('on_Snapshot__50_wget.py') -> False
is_background_hook('on_Snapshot__63_media.bg.py') -> True
"""
return '.bg.' in hook_name or '__background' in hook_name
class HookResult(TypedDict, total=False):
"""Raw result from run_hook()."""
returncode: int

View File

@@ -1,7 +1,7 @@
# Generated by Django 6.0 on 2025-12-28 05:12
import django.db.models.deletion
import uuid
from archivebox import uuid_compat
from django.db import migrations, models
@@ -15,7 +15,7 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='dependency',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='binary',
@@ -25,7 +25,7 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='binary',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='machine',
@@ -35,11 +35,11 @@ class Migration(migrations.Migration):
migrations.AlterField(
model_name='machine',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='networkinterface',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
]

View File

@@ -352,18 +352,42 @@ class ArchiveResultWorker(Worker):
return ArchiveResult
def get_queue(self) -> QuerySet:
"""Get queue of ArchiveResults ready for processing."""
"""
Get queue of ArchiveResults ready for processing.
Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step.
This ensures hooks execute in order (step 0 → 1 → 2 ... → 9).
"""
from core.models import ArchiveResult
from archivebox.hooks import extract_step
qs = super().get_queue()
if self.plugin:
qs = qs.filter(plugin=self.plugin)
# Note: Removed blocking logic since plugins have separate output directories
# and don't interfere with each other. Each plugin runs independently.
# Step-based filtering: only process ARs whose step <= snapshot.current_step
# Since step is derived from hook_name, we filter in Python after initial query
# This is efficient because the base query already filters by retry_at and status
return qs
# Get candidate ARs
candidates = list(qs[:50]) # Limit to avoid loading too many
ready_pks = []
for ar in candidates:
if not ar.hook_name:
# Legacy ARs without hook_name - process them
ready_pks.append(ar.pk)
continue
ar_step = extract_step(ar.hook_name)
snapshot_step = ar.snapshot.current_step
if ar_step <= snapshot_step:
ready_pks.append(ar.pk)
# Return filtered queryset ordered by hook_name (so earlier hooks run first within a step)
return ArchiveResult.objects.filter(pk__in=ready_pks).order_by('hook_name', 'retry_at')
def process_item(self, obj) -> bool:
"""Process an ArchiveResult by running its plugin."""