mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-02 17:05:38 +10:00
Improve concurrency control between plugin hooks (#1721)
<!-- IMPORTANT: Do not submit PRs with only formatting / PEP8 / line length changes. --> # Summary <!--e.g. This PR fixes ABC or adds the ability to do XYZ...--> # Related issues <!-- e.g. #123 or Roadmap goal # https://github.com/pirate/ArchiveBox/wiki/Roadmap --> # Changes these areas - [ ] Bugfixes - [ ] Feature behavior - [ ] Command line interface - [ ] Configuration options - [ ] Internal architecture - [ ] Snapshot data layout on disk
This commit is contained in:
@@ -310,37 +310,40 @@ archivebox/plugins/{plugin_name}/
|
||||
## Implementation Checklist
|
||||
|
||||
### Phase 1: Schema Migration ✅
|
||||
- [ ] Add `Snapshot.current_step` (IntegerField 0-9, default=0)
|
||||
- [ ] Add `ArchiveResult.hook_name` (CharField, nullable) - just filename
|
||||
- [ ] Create migration: `0033_snapshot_current_step_archiveresult_hook_name.py`
|
||||
- [x] Add `Snapshot.current_step` (IntegerField 0-9, default=0)
|
||||
- [x] Add `ArchiveResult.hook_name` (CharField, nullable) - just filename
|
||||
- [x] Create migration: `0034_snapshot_current_step.py`
|
||||
|
||||
### Phase 2: Core Logic Updates
|
||||
- [ ] Add `extract_step(hook_name)` utility in `archivebox/hooks.py`
|
||||
### Phase 2: Core Logic Updates ✅
|
||||
- [x] Add `extract_step(hook_name)` utility in `archivebox/hooks.py`
|
||||
- Extract first digit from `__XX_` pattern
|
||||
- Default to 9 for unnumbered hooks
|
||||
- [ ] Update `Snapshot.create_pending_archiveresults()` in `archivebox/core/models.py`:
|
||||
- [x] Add `is_background_hook(hook_name)` utility in `archivebox/hooks.py`
|
||||
- Check for `.bg.` in filename (a name containing `__background` is also treated as background)
|
||||
- [x] Update `Snapshot.create_pending_archiveresults()` in `archivebox/core/models.py`:
|
||||
- Discover all hooks (not plugins)
|
||||
- Create one AR per hook with `hook_name` set
|
||||
- [ ] Update `ArchiveResult.run()` in `archivebox/core/models.py`:
|
||||
- [x] Update `ArchiveResult.run()` in `archivebox/core/models.py`:
|
||||
- If `hook_name` set: run single hook
|
||||
- If `hook_name` None: discover all plugin hooks (existing behavior)
|
||||
- [ ] Add `Snapshot.advance_step_if_ready()` method:
|
||||
- [x] Add `Snapshot.advance_step_if_ready()` method:
|
||||
- Check if all foreground ARs in current step finished
|
||||
- Increment `current_step` if ready
|
||||
- Ignore background hooks (.bg) in completion check
|
||||
- [ ] Integrate with `SnapshotMachine.is_finished()` in `archivebox/core/statemachines.py`:
|
||||
- [x] Integrate with `SnapshotMachine.is_finished()` in `archivebox/core/statemachines.py`:
|
||||
- Call `advance_step_if_ready()` before checking if done
|
||||
|
||||
### Phase 3: Worker Coordination
|
||||
- [ ] Update worker AR claiming query in `archivebox/workers/worker.py`:
|
||||
### Phase 3: Worker Coordination ✅
|
||||
- [x] Update worker AR claiming query in `archivebox/workers/worker.py`:
|
||||
- Filter: `extract_step(ar.hook_name) <= snapshot.current_step`
|
||||
- Note: May need to denormalize or use clever query since step is derived
|
||||
- Alternative: Claim any AR in QUEUED state, check step in Python before processing
|
||||
- Claims ARs in QUEUED state, checks step in Python before processing
|
||||
- Orders by hook_name for deterministic execution within step
|
||||
|
||||
### Phase 4: Hook Renumbering
|
||||
- [ ] Renumber hooks per renumbering map below
|
||||
- [ ] Add `.bg` suffix to long-running hooks
|
||||
- [ ] Test all hooks still work after renumbering
|
||||
### Phase 4: Hook Renumbering ✅
|
||||
- [x] Renumber hooks per renumbering map below
|
||||
- [x] Add `.bg` suffix to long-running hooks (media, gallerydl, forumdl, papersdl)
|
||||
- [x] Move parse_* hooks to step 7 (70-79)
|
||||
- [x] Test all hooks still work after renumbering
|
||||
|
||||
## Migration Path
|
||||
|
||||
@@ -353,25 +356,34 @@ No special migration needed:
|
||||
|
||||
### Renumbering Map
|
||||
|
||||
**Current → New:**
|
||||
**Completed Renames:**
|
||||
```
|
||||
git/on_Snapshot__12_git.py → git/on_Snapshot__62_git.py
|
||||
media/on_Snapshot__51_media.py → media/on_Snapshot__63_media.bg.py
|
||||
gallerydl/on_Snapshot__52_gallerydl.py → gallerydl/on_Snapshot__64_gallerydl.bg.py
|
||||
forumdl/on_Snapshot__53_forumdl.py → forumdl/on_Snapshot__65_forumdl.bg.py
|
||||
papersdl/on_Snapshot__54_papersdl.py → papersdl/on_Snapshot__66_papersdl.bg.py
|
||||
# Step 5: DOM Extraction (sequential, non-background)
|
||||
singlefile/on_Snapshot__37_singlefile.py → singlefile/on_Snapshot__50_singlefile.py ✅
|
||||
screenshot/on_Snapshot__34_screenshot.js → screenshot/on_Snapshot__51_screenshot.js ✅
|
||||
pdf/on_Snapshot__35_pdf.js → pdf/on_Snapshot__52_pdf.js ✅
|
||||
dom/on_Snapshot__36_dom.js → dom/on_Snapshot__53_dom.js ✅
|
||||
title/on_Snapshot__32_title.js → title/on_Snapshot__54_title.js ✅
|
||||
readability/on_Snapshot__52_readability.py → readability/on_Snapshot__55_readability.py ✅
|
||||
headers/on_Snapshot__33_headers.js → headers/on_Snapshot__55_headers.js ✅
|
||||
mercury/on_Snapshot__53_mercury.py → mercury/on_Snapshot__56_mercury.py ✅
|
||||
htmltotext/on_Snapshot__54_htmltotext.py → htmltotext/on_Snapshot__57_htmltotext.py ✅
|
||||
|
||||
readability/on_Snapshot__52_readability.py → readability/on_Snapshot__55_readability.py
|
||||
mercury/on_Snapshot__53_mercury.py → mercury/on_Snapshot__56_mercury.py
|
||||
# Step 6: Post-DOM Extraction (background for long-running)
|
||||
wget/on_Snapshot__50_wget.py → wget/on_Snapshot__61_wget.py ✅
|
||||
git/on_Snapshot__12_git.py → git/on_Snapshot__62_git.py ✅
|
||||
media/on_Snapshot__51_media.py → media/on_Snapshot__63_media.bg.py ✅
|
||||
gallerydl/on_Snapshot__52_gallerydl.py → gallerydl/on_Snapshot__64_gallerydl.bg.py ✅
|
||||
forumdl/on_Snapshot__53_forumdl.py → forumdl/on_Snapshot__65_forumdl.bg.py ✅
|
||||
papersdl/on_Snapshot__54_papersdl.py → papersdl/on_Snapshot__66_papersdl.bg.py ✅
|
||||
|
||||
singlefile/on_Snapshot__37_singlefile.py → singlefile/on_Snapshot__50_singlefile.py
|
||||
screenshot/on_Snapshot__34_screenshot.js → screenshot/on_Snapshot__51_screenshot.js
|
||||
pdf/on_Snapshot__35_pdf.js → pdf/on_Snapshot__52_pdf.js
|
||||
dom/on_Snapshot__36_dom.js → dom/on_Snapshot__53_dom.js
|
||||
title/on_Snapshot__32_title.js → title/on_Snapshot__54_title.js
|
||||
headers/on_Snapshot__33_headers.js → headers/on_Snapshot__55_headers.js
|
||||
|
||||
wget/on_Snapshot__50_wget.py → wget/on_Snapshot__61_wget.py
|
||||
# Step 7: URL Extraction (parse_* hooks moved from step 6)
|
||||
parse_html_urls/on_Snapshot__60_parse_html_urls.py → parse_html_urls/on_Snapshot__70_parse_html_urls.py ✅
|
||||
parse_txt_urls/on_Snapshot__62_parse_txt_urls.py → parse_txt_urls/on_Snapshot__71_parse_txt_urls.py ✅
|
||||
parse_rss_urls/on_Snapshot__61_parse_rss_urls.py → parse_rss_urls/on_Snapshot__72_parse_rss_urls.py ✅
|
||||
parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py → parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py ✅
|
||||
parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py → parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py ✅
|
||||
parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js → parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js ✅
|
||||
```
|
||||
|
||||
## Testing Strategy
|
||||
|
||||
@@ -11,18 +11,6 @@ from archivebox.misc.util import enforce_types, docstring
|
||||
from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR
|
||||
from archivebox.config.common import SHELL_CONFIG
|
||||
from archivebox.misc.legacy import parse_json_links_details
|
||||
from archivebox.misc.folders import (
|
||||
get_indexed_folders,
|
||||
get_archived_folders,
|
||||
get_invalid_folders,
|
||||
get_unarchived_folders,
|
||||
get_present_folders,
|
||||
get_valid_folders,
|
||||
get_duplicate_folders,
|
||||
get_orphaned_folders,
|
||||
get_corrupted_folders,
|
||||
get_unrecognized_folders,
|
||||
)
|
||||
from archivebox.misc.system import get_dir_size
|
||||
from archivebox.misc.logging_util import printable_filesize
|
||||
|
||||
@@ -55,42 +43,40 @@ def status(out_dir: Path=DATA_DIR) -> None:
|
||||
size = printable_filesize(num_bytes)
|
||||
print(f' Size: {size} across {num_files} files in {num_dirs} directories')
|
||||
|
||||
num_indexed = len(get_indexed_folders(links, out_dir=out_dir))
|
||||
num_archived = len(get_archived_folders(links, out_dir=out_dir))
|
||||
num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir))
|
||||
print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})')
|
||||
print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})')
|
||||
print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})')
|
||||
|
||||
num_present = len(get_present_folders(links, out_dir=out_dir))
|
||||
num_valid = len(get_valid_folders(links, out_dir=out_dir))
|
||||
# Use DB as source of truth for snapshot status
|
||||
num_indexed = links.count()
|
||||
num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count()
|
||||
num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count()
|
||||
print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)')
|
||||
print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)')
|
||||
print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)')
|
||||
|
||||
# Count directories on filesystem
|
||||
num_present = 0
|
||||
orphaned_dirs = []
|
||||
if ARCHIVE_DIR.exists():
|
||||
for entry in ARCHIVE_DIR.iterdir():
|
||||
if entry.is_dir():
|
||||
num_present += 1
|
||||
if not links.filter(timestamp=entry.name).exists():
|
||||
orphaned_dirs.append(str(entry))
|
||||
|
||||
num_valid = min(num_present, num_indexed) # approximate
|
||||
print()
|
||||
print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})')
|
||||
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), f' ({get_valid_folders.__doc__})')
|
||||
|
||||
duplicate = get_duplicate_folders(links, out_dir=out_dir)
|
||||
orphaned = get_orphaned_folders(links, out_dir=out_dir)
|
||||
corrupted = get_corrupted_folders(links, out_dir=out_dir)
|
||||
unrecognized = get_unrecognized_folders(links, out_dir=out_dir)
|
||||
num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized})
|
||||
print(f' > [red]invalid:[/red] {num_invalid}'.ljust(36), f' ({get_invalid_folders.__doc__})')
|
||||
print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})')
|
||||
print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})')
|
||||
print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})')
|
||||
print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})')
|
||||
print(f' > present: {num_present}'.ljust(36), '(directories in archive/)')
|
||||
print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)')
|
||||
|
||||
num_orphaned = len(orphaned_dirs)
|
||||
print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)')
|
||||
|
||||
if num_indexed:
|
||||
print(' [violet]Hint:[/violet] You can list link data directories by status like so:')
|
||||
print(' [green]archivebox list --status=<status> (e.g. indexed, corrupted, archived, etc.)[/green]')
|
||||
print(' [violet]Hint:[/violet] You can list snapshots by status like so:')
|
||||
print(' [green]archivebox list --status=<status> (e.g. archived, queued, etc.)[/green]')
|
||||
|
||||
if orphaned:
|
||||
if orphaned_dirs:
|
||||
print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:')
|
||||
print(' [green]archivebox init[/green]')
|
||||
|
||||
if num_invalid:
|
||||
print(' [violet]Hint:[/violet] You may need to manually remove or fix some invalid data directories, afterwards make sure to run:')
|
||||
print(' [green]archivebox init[/green]')
|
||||
|
||||
print()
|
||||
print('[green]\\[*] Scanning recent archive changes and user logins:[/green]')
|
||||
print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Generated by Django 6.0 on 2025-12-28 05:12
|
||||
|
||||
import django.db.models.deletion
|
||||
import uuid
|
||||
from archivebox import uuid_compat
|
||||
from django.conf import settings
|
||||
from django.db import migrations, models
|
||||
|
||||
@@ -49,7 +49,7 @@ class Migration(migrations.Migration):
|
||||
migrations.AlterField(
|
||||
model_name='archiveresult',
|
||||
name='uuid',
|
||||
field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
|
||||
field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True),
|
||||
),
|
||||
migrations.AddConstraint(
|
||||
model_name='snapshot',
|
||||
|
||||
23
archivebox/core/migrations/0034_snapshot_current_step.py
Normal file
23
archivebox/core/migrations/0034_snapshot_current_step.py
Normal file
@@ -0,0 +1,23 @@
|
||||
# Generated by Django 6.0 on 2025-12-28
|
||||
# Add Snapshot.current_step field for hook step-based execution
|
||||
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Add the ``current_step`` column to the ``snapshot`` model."""

    # Applies on top of core migration 0033.
    dependencies = [('core', '0033_rename_extractor_add_hook_name')]

    operations = [
        migrations.AddField(
            model_name='snapshot',
            name='current_step',
            # Small non-negative integer, indexed, starting at step 0.
            field=models.PositiveSmallIntegerField(
                db_index=True,
                default=0,
                help_text='Current hook step being executed (0-9). Used for sequential hook execution.',
            ),
        ),
    ]
|
||||
@@ -334,6 +334,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
|
||||
depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs
|
||||
fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().')
|
||||
current_step = models.PositiveSmallIntegerField(default=0, db_index=True, help_text='Current hook step being executed (0-9). Used for sequential hook execution.')
|
||||
|
||||
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
|
||||
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
|
||||
@@ -1243,23 +1244,33 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
def create_pending_archiveresults(self) -> list['ArchiveResult']:
|
||||
"""
|
||||
Create ArchiveResult records for all enabled plugins.
|
||||
Create ArchiveResult records for all enabled hooks.
|
||||
|
||||
Uses the hooks system to discover available plugins from:
|
||||
Uses the hooks system to discover available hooks from:
|
||||
- archivebox/plugins/*/on_Snapshot__*.{py,sh,js}
|
||||
- data/plugins/*/on_Snapshot__*.{py,sh,js}
|
||||
"""
|
||||
from archivebox.hooks import get_enabled_plugins
|
||||
|
||||
plugins = get_enabled_plugins()
|
||||
Creates one ArchiveResult per hook (not per plugin), with hook_name set.
|
||||
This enables step-based execution where all hooks in a step can run in parallel.
|
||||
"""
|
||||
from archivebox.hooks import discover_hooks
|
||||
|
||||
hooks = discover_hooks('Snapshot')
|
||||
archiveresults = []
|
||||
|
||||
for plugin in plugins:
|
||||
if ArchiveResult.objects.filter(snapshot=self, plugin=plugin).exists():
|
||||
for hook_path in hooks:
|
||||
hook_name = hook_path.name # e.g., 'on_Snapshot__50_wget.py'
|
||||
plugin = hook_path.parent.name # e.g., 'wget'
|
||||
|
||||
# Check if AR already exists for this specific hook
|
||||
if ArchiveResult.objects.filter(snapshot=self, hook_name=hook_name).exists():
|
||||
continue
|
||||
archiveresult, _ = ArchiveResult.objects.get_or_create(
|
||||
snapshot=self, plugin=plugin,
|
||||
|
||||
archiveresult, created = ArchiveResult.objects.get_or_create(
|
||||
snapshot=self,
|
||||
hook_name=hook_name,
|
||||
defaults={
|
||||
'plugin': plugin,
|
||||
'status': ArchiveResult.INITIAL_STATE,
|
||||
'retry_at': timezone.now(),
|
||||
'created_by_id': self.created_by_id,
|
||||
@@ -1267,8 +1278,57 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
)
|
||||
if archiveresult.status == ArchiveResult.INITIAL_STATE:
|
||||
archiveresults.append(archiveresult)
|
||||
|
||||
return archiveresults
|
||||
|
||||
def advance_step_if_ready(self) -> bool:
    """
    Move this snapshot to the next hook step once the current step is done.

    Only ArchiveResults with a non-empty hook_name are inspected, and of
    those only the ones whose hook belongs to the current step and is not
    a background hook. Such a result blocks advancement while its status
    is outside ArchiveResult.FINAL_OR_ACTIVE_STATES (not picked up yet) or
    is STARTED (still running).

    Returns:
        True if current_step was incremented and saved. False if a
        foreground hook in the current step is still outstanding, or if
        the snapshot is already at step 9 (the last step).
    """
    from archivebox.hooks import extract_step, is_background_hook

    # Step 9 is the final step; there is nothing to advance to.
    if self.current_step >= 9:
        return False

    named_results = self.archiveresult_set.filter(hook_name__isnull=False).exclude(hook_name='')

    # any() stops at the first blocking result, like an early return would.
    # The step check runs before the background check for each result.
    blocked = any(
        (
            result.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES
            or result.status == ArchiveResult.StatusChoices.STARTED
        )
        for result in named_results
        if extract_step(result.hook_name) == self.current_step
        and not is_background_hook(result.hook_name)
    )
    if blocked:
        return False

    # Nothing in the current step is blocking: persist the new step.
    self.current_step += 1
    self.save(update_fields=['current_step', 'modified_at'])
    return True
|
||||
|
||||
def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int:
|
||||
"""
|
||||
Reset failed/skipped ArchiveResults to queued for retry.
|
||||
@@ -1301,11 +1361,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
end_ts=None,
|
||||
)
|
||||
|
||||
# Also reset the snapshot so it gets re-checked
|
||||
# Also reset the snapshot and current_step so it gets re-checked from the beginning
|
||||
if count > 0:
|
||||
self.status = self.StatusChoices.STARTED
|
||||
self.retry_at = retry_at
|
||||
self.save(update_fields=['status', 'retry_at', 'modified_at'])
|
||||
self.current_step = 0 # Reset to step 0 for retry
|
||||
self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at'])
|
||||
|
||||
return count
|
||||
|
||||
@@ -1841,45 +1902,63 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Execute this ArchiveResult's plugin and update status.
|
||||
Execute this ArchiveResult's hook and update status.
|
||||
|
||||
Discovers and runs the hook script for self.plugin,
|
||||
updates status/output fields, queues discovered URLs, and triggers indexing.
|
||||
If self.hook_name is set, runs only that specific hook.
|
||||
If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat).
|
||||
|
||||
Updates status/output fields, queues discovered URLs, and triggers indexing.
|
||||
"""
|
||||
from django.utils import timezone
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
|
||||
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook
|
||||
|
||||
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
|
||||
|
||||
# Find ALL hooks for this plugin
|
||||
# plugin = plugin name (e.g., 'chrome')
|
||||
# Each plugin can have multiple hooks that run in sequence
|
||||
# Determine which hook(s) to run
|
||||
hooks = []
|
||||
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||
if not base_dir.exists():
|
||||
continue
|
||||
plugin_dir = base_dir / self.plugin
|
||||
if plugin_dir.exists():
|
||||
matches = list(plugin_dir.glob('on_Snapshot__*.*'))
|
||||
if matches:
|
||||
# Sort by name for deterministic order (numeric prefix controls execution order)
|
||||
hooks.extend(sorted(matches))
|
||||
|
||||
if self.hook_name:
|
||||
# SPECIFIC HOOK MODE: Find the specific hook by name
|
||||
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||
if not base_dir.exists():
|
||||
continue
|
||||
plugin_dir = base_dir / self.plugin
|
||||
if plugin_dir.exists():
|
||||
hook_path = plugin_dir / self.hook_name
|
||||
if hook_path.exists():
|
||||
hooks.append(hook_path)
|
||||
break
|
||||
else:
|
||||
# LEGACY MODE: Discover all hooks for this plugin (backwards compatibility)
|
||||
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||
if not base_dir.exists():
|
||||
continue
|
||||
plugin_dir = base_dir / self.plugin
|
||||
if plugin_dir.exists():
|
||||
matches = list(plugin_dir.glob('on_Snapshot__*.*'))
|
||||
if matches:
|
||||
hooks.extend(sorted(matches))
|
||||
|
||||
if not hooks:
|
||||
self.status = self.StatusChoices.FAILED
|
||||
self.output_str = f'No hooks found for plugin: {self.plugin}'
|
||||
if self.hook_name:
|
||||
self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}'
|
||||
else:
|
||||
self.output_str = f'No hooks found for plugin: {self.plugin}'
|
||||
self.retry_at = None
|
||||
self.save()
|
||||
return
|
||||
|
||||
# plugin field contains plugin name
|
||||
# Output directory is plugin_dir for the hook output
|
||||
plugin_dir = Path(self.snapshot.output_dir) / self.plugin
|
||||
|
||||
# Run ALL hooks in the plugin sequentially
|
||||
start_ts = timezone.now()
|
||||
has_background_hook = False
|
||||
is_bg_hook = False
|
||||
|
||||
for hook in hooks:
|
||||
# Check if this is a background hook
|
||||
is_bg_hook = is_background_hook(hook.name)
|
||||
|
||||
result = run_hook(
|
||||
hook,
|
||||
output_dir=plugin_dir,
|
||||
@@ -1890,20 +1969,21 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
depth=self.snapshot.depth,
|
||||
)
|
||||
|
||||
# If any hook is background, mark this ArchiveResult as started
|
||||
# Background hooks return None
|
||||
if result is None:
|
||||
has_background_hook = True
|
||||
is_bg_hook = True
|
||||
|
||||
# Update status based on hook execution
|
||||
if has_background_hook:
|
||||
# BACKGROUND HOOK(S) - still running, return immediately
|
||||
if is_bg_hook:
|
||||
# BACKGROUND HOOK - still running, return immediately
|
||||
# Status stays STARTED, will be finalized by Snapshot.cleanup()
|
||||
self.status = self.StatusChoices.STARTED
|
||||
self.start_ts = start_ts
|
||||
self.pwd = str(plugin_dir)
|
||||
self.save()
|
||||
return
|
||||
|
||||
# ALL FOREGROUND HOOKS - completed, update from filesystem
|
||||
# FOREGROUND HOOK - completed, update from filesystem
|
||||
self.start_ts = start_ts
|
||||
self.pwd = str(plugin_dir)
|
||||
self.update_from_output()
|
||||
@@ -1911,11 +1991,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
# Clean up empty output directory if no files were created
|
||||
if plugin_dir.exists() and not self.output_files:
|
||||
try:
|
||||
# Only remove if directory is completely empty
|
||||
if not any(plugin_dir.iterdir()):
|
||||
plugin_dir.rmdir()
|
||||
except (OSError, RuntimeError):
|
||||
pass # Directory not empty or can't be removed, that's fine
|
||||
pass
|
||||
|
||||
def update_from_output(self):
|
||||
"""
|
||||
|
||||
@@ -60,6 +60,11 @@ class SnapshotMachine(StateMachine, strict_states=True):
|
||||
if not self.snapshot.archiveresult_set.exists():
|
||||
return False
|
||||
|
||||
# Try to advance step if ready (handles step-based hook execution)
|
||||
# This will increment current_step when all foreground hooks in current step are done
|
||||
while self.snapshot.advance_step_if_ready():
|
||||
pass # Keep advancing until we can't anymore
|
||||
|
||||
# if archiveresults exist but are still pending, it's not finished
|
||||
if self.snapshot.pending_archiveresults().exists():
|
||||
return False
|
||||
|
||||
@@ -15,10 +15,21 @@ Hook contract:
|
||||
Exit: 0 = success, non-zero = failure
|
||||
|
||||
Execution order:
|
||||
- Extractors run sequentially within each Snapshot (ordered by numeric prefix)
|
||||
- Multiple Snapshots can process in parallel
|
||||
- Hooks are numbered 00-99 with first digit determining step (0-9)
|
||||
- All hooks in a step can run in parallel
|
||||
- Steps execute sequentially (step 0 → step 1 → ... → step 9)
|
||||
- Background hooks (.bg suffix) don't block step advancement
|
||||
- Failed extractors don't block subsequent extractors
|
||||
|
||||
Hook Naming Convention:
|
||||
on_{ModelName}__{run_order}_{description}[.bg].{ext}
|
||||
|
||||
Examples:
|
||||
on_Snapshot__00_setup.py # Step 0, runs first
|
||||
on_Snapshot__20_chrome_tab.bg.js # Step 2, background (doesn't block)
|
||||
on_Snapshot__50_screenshot.js # Step 5, foreground (blocks step)
|
||||
on_Snapshot__63_media.bg.py # Step 6, background (long-running)
|
||||
|
||||
Dependency handling:
|
||||
Extractor plugins that depend on other plugins' output should check at runtime:
|
||||
|
||||
@@ -39,11 +50,14 @@ API (all hook logic lives here):
|
||||
discover_hooks(event) -> List[Path] Find hook scripts
|
||||
run_hook(script, ...) -> HookResult Execute a hook script
|
||||
run_hooks(event, ...) -> List[HookResult] Run all hooks for an event
|
||||
extract_step(hook_name) -> int Get step number (0-9) from hook name
|
||||
is_background_hook(name) -> bool Check if hook is background (.bg suffix)
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox'
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import signal
|
||||
import time
|
||||
@@ -60,6 +74,63 @@ BUILTIN_PLUGINS_DIR = Path(__file__).parent / 'plugins'
|
||||
USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins'
|
||||
|
||||
|
||||
# =============================================================================
|
||||
# Hook Step Extraction
|
||||
# =============================================================================
|
||||
|
||||
# Two-digit run-order prefix inside a hook filename, e.g. the '50' in '__50_'.
_HOOK_STEP_PATTERN = re.compile(r'__(\d{2})_')

# Hook names that already triggered the "no step number" warning.
# extract_step() is called repeatedly for the same hooks (queue polling,
# step advancement), so the warning is emitted only once per name.
_UNNUMBERED_HOOKS_WARNED = set()


def extract_step(hook_name: str) -> int:
    """
    Extract step number (0-9) from hook name.

    Hooks are numbered 00-99 with the first digit determining the step.
    Pattern: on_{Model}__{XX}_{description}[.bg].{ext}

    Args:
        hook_name: Hook filename (e.g., 'on_Snapshot__50_wget.py')

    Returns:
        Step number 0-9, or 9 (default) when the name contains no
        '__XX_' two-digit prefix. The first time a given unnumbered name
        is seen, a warning is written to stderr.

    Examples:
        extract_step('on_Snapshot__05_chrome.py') -> 0
        extract_step('on_Snapshot__50_wget.py') -> 5
        extract_step('on_Snapshot__63_media.bg.py') -> 6
        extract_step('on_Snapshot__99_cleanup.sh') -> 9
        extract_step('on_Snapshot__unnumbered.py') -> 9 (default)
    """
    match = _HOOK_STEP_PATTERN.search(hook_name)
    if match:
        # The tens digit of the two-digit prefix is the step.
        return int(match.group(1)) // 10

    # Unnumbered hook: fall back to the last step, warning once per name.
    if hook_name not in _UNNUMBERED_HOOKS_WARNED:
        _UNNUMBERED_HOOKS_WARNED.add(hook_name)
        import sys
        print(f"Warning: Hook '{hook_name}' has no step number (expected __XX_), defaulting to step 9", file=sys.stderr)
    return 9
|
||||
|
||||
|
||||
def is_background_hook(hook_name: str) -> bool:
    """
    Report whether a hook filename marks the hook as a background hook.

    A name counts as background when it contains either of two markers:
    the '.bg.' infix placed before the extension, or the substring
    '__background'. Any other name is a foreground hook.

    Args:
        hook_name: Hook filename (e.g., 'on_Snapshot__20_chrome_tab.bg.js')

    Returns:
        True if the name carries a background marker, False otherwise.

    Examples:
        is_background_hook('on_Snapshot__20_chrome_tab.bg.js') -> True
        is_background_hook('on_Snapshot__50_wget.py') -> False
        is_background_hook('on_Snapshot__63_media.bg.py') -> True
    """
    background_markers = ('.bg.', '__background')
    return any(marker in hook_name for marker in background_markers)
|
||||
|
||||
|
||||
class HookResult(TypedDict, total=False):
|
||||
"""Raw result from run_hook()."""
|
||||
returncode: int
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# Generated by Django 6.0 on 2025-12-28 05:12
|
||||
|
||||
import django.db.models.deletion
|
||||
import uuid
|
||||
from archivebox import uuid_compat
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ class Migration(migrations.Migration):
|
||||
migrations.AlterField(
|
||||
model_name='dependency',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='binary',
|
||||
@@ -25,7 +25,7 @@ class Migration(migrations.Migration):
|
||||
migrations.AlterField(
|
||||
model_name='binary',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='machine',
|
||||
@@ -35,11 +35,11 @@ class Migration(migrations.Migration):
|
||||
migrations.AlterField(
|
||||
model_name='machine',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
migrations.AlterField(
|
||||
model_name='networkinterface',
|
||||
name='id',
|
||||
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
|
||||
),
|
||||
]
|
||||
|
||||
@@ -352,18 +352,42 @@ class ArchiveResultWorker(Worker):
|
||||
return ArchiveResult
|
||||
|
||||
def get_queue(self) -> QuerySet:
|
||||
"""Get queue of ArchiveResults ready for processing."""
|
||||
"""
|
||||
Get queue of ArchiveResults ready for processing.
|
||||
|
||||
Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step.
|
||||
This ensures hooks execute in order (step 0 → 1 → 2 ... → 9).
|
||||
"""
|
||||
from core.models import ArchiveResult
|
||||
from archivebox.hooks import extract_step
|
||||
|
||||
qs = super().get_queue()
|
||||
|
||||
if self.plugin:
|
||||
qs = qs.filter(plugin=self.plugin)
|
||||
|
||||
# Note: Removed blocking logic since plugins have separate output directories
|
||||
# and don't interfere with each other. Each plugin runs independently.
|
||||
# Step-based filtering: only process ARs whose step <= snapshot.current_step
|
||||
# Since step is derived from hook_name, we filter in Python after initial query
|
||||
# This is efficient because the base query already filters by retry_at and status
|
||||
|
||||
return qs
|
||||
# Get candidate ARs
|
||||
candidates = list(qs[:50]) # Limit to avoid loading too many
|
||||
ready_pks = []
|
||||
|
||||
for ar in candidates:
|
||||
if not ar.hook_name:
|
||||
# Legacy ARs without hook_name - process them
|
||||
ready_pks.append(ar.pk)
|
||||
continue
|
||||
|
||||
ar_step = extract_step(ar.hook_name)
|
||||
snapshot_step = ar.snapshot.current_step
|
||||
|
||||
if ar_step <= snapshot_step:
|
||||
ready_pks.append(ar.pk)
|
||||
|
||||
# Return filtered queryset ordered by hook_name (so earlier hooks run first within a step)
|
||||
return ArchiveResult.objects.filter(pk__in=ready_pks).order_by('hook_name', 'retry_at')
|
||||
|
||||
def process_item(self, obj) -> bool:
|
||||
"""Process an ArchiveResult by running its plugin."""
|
||||
|
||||
Reference in New Issue
Block a user