diff --git a/TODO_hook_concurrency.md b/TODO_hook_concurrency.md index f8f1bcf7..877b295b 100644 --- a/TODO_hook_concurrency.md +++ b/TODO_hook_concurrency.md @@ -310,37 +310,40 @@ archivebox/plugins/{plugin_name}/ ## Implementation Checklist ### Phase 1: Schema Migration ✅ -- [ ] Add `Snapshot.current_step` (IntegerField 0-9, default=0) -- [ ] Add `ArchiveResult.hook_name` (CharField, nullable) - just filename -- [ ] Create migration: `0033_snapshot_current_step_archiveresult_hook_name.py` +- [x] Add `Snapshot.current_step` (IntegerField 0-9, default=0) +- [x] Add `ArchiveResult.hook_name` (CharField, nullable) - just filename +- [x] Create migration: `0034_snapshot_current_step.py` -### Phase 2: Core Logic Updates -- [ ] Add `extract_step(hook_name)` utility in `archivebox/hooks.py` +### Phase 2: Core Logic Updates ✅ +- [x] Add `extract_step(hook_name)` utility in `archivebox/hooks.py` - Extract first digit from `__XX_` pattern - Default to 9 for unnumbered hooks -- [ ] Update `Snapshot.create_pending_archiveresults()` in `archivebox/core/models.py`: +- [x] Add `is_background_hook(hook_name)` utility in `archivebox/hooks.py` + - Check for `.bg.` in filename +- [x] Update `Snapshot.create_pending_archiveresults()` in `archivebox/core/models.py`: - Discover all hooks (not plugins) - Create one AR per hook with `hook_name` set -- [ ] Update `ArchiveResult.run()` in `archivebox/core/models.py`: +- [x] Update `ArchiveResult.run()` in `archivebox/core/models.py`: - If `hook_name` set: run single hook - If `hook_name` None: discover all plugin hooks (existing behavior) -- [ ] Add `Snapshot.advance_step_if_ready()` method: +- [x] Add `Snapshot.advance_step_if_ready()` method: - Check if all foreground ARs in current step finished - Increment `current_step` if ready - Ignore background hooks (.bg) in completion check -- [ ] Integrate with `SnapshotMachine.is_finished()` in `archivebox/core/statemachines.py`: +- [x] Integrate with `SnapshotMachine.is_finished()` in `archivebox/core/statemachines.py`: - Call `advance_step_if_ready()` before checking if done -### Phase 3: Worker Coordination -- [ ] Update worker AR claiming query in `archivebox/workers/worker.py`: +### Phase 3: Worker Coordination ✅ +- [x] Update worker AR claiming query in `archivebox/workers/worker.py`: - Filter: `extract_step(ar.hook_name) <= snapshot.current_step` - - Note: May need to denormalize or use clever query since step is derived - - Alternative: Claim any AR in QUEUED state, check step in Python before processing + - Claims ARs in QUEUED state, checks step in Python before processing + - Orders by hook_name for deterministic execution within step -### Phase 4: Hook Renumbering -- [ ] Renumber hooks per renumbering map below -- [ ] Add `.bg` suffix to long-running hooks -- [ ] Test all hooks still work after renumbering +### Phase 4: Hook Renumbering ✅ +- [x] Renumber hooks per renumbering map below +- [x] Add `.bg` suffix to long-running hooks (media, gallerydl, forumdl, papersdl) +- [x] Move parse_* hooks to step 7 (70-79) +- [x] Test all hooks still work after renumbering ## Migration Path @@ -353,25 +356,34 @@ No special migration needed: ### Renumbering Map -**Current → New:** +**Completed Renames:** ``` -git/on_Snapshot__12_git.py → git/on_Snapshot__62_git.py -media/on_Snapshot__51_media.py → media/on_Snapshot__63_media.bg.py -gallerydl/on_Snapshot__52_gallerydl.py → gallerydl/on_Snapshot__64_gallerydl.bg.py -forumdl/on_Snapshot__53_forumdl.py → forumdl/on_Snapshot__65_forumdl.bg.py -papersdl/on_Snapshot__54_papersdl.py → papersdl/on_Snapshot__66_papersdl.bg.py +# Step 5: DOM Extraction (sequential, non-background) +singlefile/on_Snapshot__37_singlefile.py → singlefile/on_Snapshot__50_singlefile.py ✅ +screenshot/on_Snapshot__34_screenshot.js → screenshot/on_Snapshot__51_screenshot.js ✅ +pdf/on_Snapshot__35_pdf.js → pdf/on_Snapshot__52_pdf.js ✅ +dom/on_Snapshot__36_dom.js → dom/on_Snapshot__53_dom.js ✅ +title/on_Snapshot__32_title.js → title/on_Snapshot__54_title.js ✅ +readability/on_Snapshot__52_readability.py → readability/on_Snapshot__55_readability.py ✅ +headers/on_Snapshot__33_headers.js → headers/on_Snapshot__55_headers.js ✅ +mercury/on_Snapshot__53_mercury.py → mercury/on_Snapshot__56_mercury.py ✅ +htmltotext/on_Snapshot__54_htmltotext.py → htmltotext/on_Snapshot__57_htmltotext.py ✅ -readability/on_Snapshot__52_readability.py → readability/on_Snapshot__55_readability.py -mercury/on_Snapshot__53_mercury.py → mercury/on_Snapshot__56_mercury.py +# Step 6: Post-DOM Extraction (background for long-running) +wget/on_Snapshot__50_wget.py → wget/on_Snapshot__61_wget.py ✅ +git/on_Snapshot__12_git.py → git/on_Snapshot__62_git.py ✅ +media/on_Snapshot__51_media.py → media/on_Snapshot__63_media.bg.py ✅ +gallerydl/on_Snapshot__52_gallerydl.py → gallerydl/on_Snapshot__64_gallerydl.bg.py ✅ +forumdl/on_Snapshot__53_forumdl.py → forumdl/on_Snapshot__65_forumdl.bg.py ✅ +papersdl/on_Snapshot__54_papersdl.py → papersdl/on_Snapshot__66_papersdl.bg.py ✅ -singlefile/on_Snapshot__37_singlefile.py → singlefile/on_Snapshot__50_singlefile.py -screenshot/on_Snapshot__34_screenshot.js → screenshot/on_Snapshot__51_screenshot.js -pdf/on_Snapshot__35_pdf.js → pdf/on_Snapshot__52_pdf.js -dom/on_Snapshot__36_dom.js → dom/on_Snapshot__53_dom.js -title/on_Snapshot__32_title.js → title/on_Snapshot__54_title.js -headers/on_Snapshot__33_headers.js → headers/on_Snapshot__55_headers.js - -wget/on_Snapshot__50_wget.py → wget/on_Snapshot__61_wget.py +# Step 7: URL Extraction (parse_* hooks moved from step 6) +parse_html_urls/on_Snapshot__60_parse_html_urls.py → parse_html_urls/on_Snapshot__70_parse_html_urls.py ✅ +parse_txt_urls/on_Snapshot__62_parse_txt_urls.py → parse_txt_urls/on_Snapshot__71_parse_txt_urls.py ✅ +parse_rss_urls/on_Snapshot__61_parse_rss_urls.py → parse_rss_urls/on_Snapshot__72_parse_rss_urls.py ✅ +parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py → parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py ✅ +parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py → parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py ✅ +parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js → parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js ✅ ``` ## Testing Strategy diff --git a/archivebox/cli/archivebox_status.py b/archivebox/cli/archivebox_status.py index e37f01ba..de5ada95 100644 --- a/archivebox/cli/archivebox_status.py +++ b/archivebox/cli/archivebox_status.py @@ -11,18 +11,6 @@ from archivebox.misc.util import enforce_types, docstring from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR from archivebox.config.common import SHELL_CONFIG from archivebox.misc.legacy import parse_json_links_details -from archivebox.misc.folders import ( - get_indexed_folders, - get_archived_folders, - get_invalid_folders, - get_unarchived_folders, - get_present_folders, - get_valid_folders, - get_duplicate_folders, - get_orphaned_folders, - get_corrupted_folders, - get_unrecognized_folders, -) from archivebox.misc.system import get_dir_size from archivebox.misc.logging_util import printable_filesize @@ -55,42 +43,40 @@ def status(out_dir: Path=DATA_DIR) -> None: size = printable_filesize(num_bytes) print(f' Size: {size} across {num_files} files in {num_dirs} directories') - num_indexed = len(get_indexed_folders(links, out_dir=out_dir)) - num_archived = len(get_archived_folders(links, out_dir=out_dir)) - num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir)) - print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})') - print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})') - print(f' > unarchived: {num_unarchived}'.ljust(36), f'({get_unarchived_folders.__doc__})') - - num_present = len(get_present_folders(links, out_dir=out_dir)) - num_valid = len(get_valid_folders(links, out_dir=out_dir)) + # Use DB as source of truth for snapshot status + num_indexed = links.count() + num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count() + num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count() + print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)') + print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)') + print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)') + + # Count directories on filesystem + num_present = 0 + orphaned_dirs = [] + if ARCHIVE_DIR.exists(): + for entry in ARCHIVE_DIR.iterdir(): + if entry.is_dir(): + num_present += 1 + if not links.filter(timestamp=entry.name).exists(): + orphaned_dirs.append(str(entry)) + + num_valid = min(num_present, num_indexed) # approximate print() - print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})') - print(f' > [green]valid:[/green] {num_valid}'.ljust(36), f' ({get_valid_folders.__doc__})') - - duplicate = get_duplicate_folders(links, out_dir=out_dir) - orphaned = get_orphaned_folders(links, out_dir=out_dir) - corrupted = get_corrupted_folders(links, out_dir=out_dir) - unrecognized = get_unrecognized_folders(links, out_dir=out_dir) - num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized}) - print(f' > [red]invalid:[/red] {num_invalid}'.ljust(36), f' ({get_invalid_folders.__doc__})') - print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})') - print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})') - print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})') - print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})') + print(f' > present: {num_present}'.ljust(36), '(directories in archive/)') + print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)') + + num_orphaned = len(orphaned_dirs) + print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)') if num_indexed: - print(' [violet]Hint:[/violet] You can list link data directories by status like so:') - print(' [green]archivebox list --status= (e.g. indexed, corrupted, archived, etc.)[/green]') + print(' [violet]Hint:[/violet] You can list snapshots by status like so:') + print(' [green]archivebox list --status= (e.g. archived, queued, etc.)[/green]') - if orphaned: + if orphaned_dirs: print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:') print(' [green]archivebox init[/green]') - if num_invalid: - print(' [violet]Hint:[/violet] You may need to manually remove or fix some invalid data directories, afterwards make sure to run:') - print(' [green]archivebox init[/green]') - print() print('[green]\\[*] Scanning recent archive changes and user logins:[/green]') print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]') diff --git a/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py b/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py index 77c78472..3d3d70d2 100644 --- a/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py +++ b/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py @@ -1,7 +1,7 @@ # Generated by Django 6.0 on 2025-12-28 05:12 import django.db.models.deletion -import uuid +from archivebox import uuid_compat from django.conf import settings from django.db import migrations, models @@ -49,7 +49,7 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='archiveresult', name='uuid', - field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True), + field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True), ), migrations.AddConstraint( model_name='snapshot', diff --git a/archivebox/core/migrations/0034_snapshot_current_step.py b/archivebox/core/migrations/0034_snapshot_current_step.py new file mode 100644 index 00000000..f570230c --- /dev/null +++ b/archivebox/core/migrations/0034_snapshot_current_step.py @@ -0,0 +1,23 @@ +# Generated by Django 6.0 on 2025-12-28 +# Add Snapshot.current_step field for hook step-based execution + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', '0033_rename_extractor_add_hook_name'), + ] + + operations = [ + migrations.AddField( + model_name='snapshot', + name='current_step', + field=models.PositiveSmallIntegerField( + default=0, + db_index=True, + help_text='Current hook step being executed (0-9). Used for sequential hook execution.' + ), + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index fbef95cd..192835de 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -334,6 +334,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True) depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().') + current_step = models.PositiveSmallIntegerField(default=0, db_index=True, help_text='Current hook step being executed (0-9). Used for sequential hook execution.') retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED) @@ -1243,23 +1244,33 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def create_pending_archiveresults(self) -> list['ArchiveResult']: """ - Create ArchiveResult records for all enabled plugins. + Create ArchiveResult records for all enabled hooks. - Uses the hooks system to discover available plugins from: + Uses the hooks system to discover available hooks from: - archivebox/plugins/*/on_Snapshot__*.{py,sh,js} - data/plugins/*/on_Snapshot__*.{py,sh,js} - """ - from archivebox.hooks import get_enabled_plugins - plugins = get_enabled_plugins() + Creates one ArchiveResult per hook (not per plugin), with hook_name set. + This enables step-based execution where all hooks in a step can run in parallel. + """ + from archivebox.hooks import discover_hooks + + hooks = discover_hooks('Snapshot') archiveresults = [] - for plugin in plugins: - if ArchiveResult.objects.filter(snapshot=self, plugin=plugin).exists(): + for hook_path in hooks: + hook_name = hook_path.name # e.g., 'on_Snapshot__50_wget.py' + plugin = hook_path.parent.name # e.g., 'wget' + + # Check if AR already exists for this specific hook + if ArchiveResult.objects.filter(snapshot=self, hook_name=hook_name).exists(): continue - archiveresult, _ = ArchiveResult.objects.get_or_create( - snapshot=self, plugin=plugin, + + archiveresult, created = ArchiveResult.objects.get_or_create( + snapshot=self, + hook_name=hook_name, defaults={ + 'plugin': plugin, 'status': ArchiveResult.INITIAL_STATE, 'retry_at': timezone.now(), 'created_by_id': self.created_by_id, @@ -1267,8 +1278,57 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea ) if archiveresult.status == ArchiveResult.INITIAL_STATE: archiveresults.append(archiveresult) + return archiveresults + def advance_step_if_ready(self) -> bool: + """ + Advance current_step if all foreground hooks in current step are finished. + + Called by the state machine to check if step can advance. + Background hooks (.bg) don't block step advancement. + + Step advancement rules: + - All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED) + - Background ARs (hook_name contains '.bg.') are ignored for advancement + - When ready, increments current_step by 1 (up to 9) + + Returns: + True if step was advanced, False if not ready or already at step 9. + """ + from archivebox.hooks import extract_step, is_background_hook + + if self.current_step >= 9: + return False # Already at final step + + # Get all ARs for current step that are foreground + current_step_ars = self.archiveresult_set.filter( + hook_name__isnull=False + ).exclude(hook_name='') + + # Check each AR in current step + for ar in current_step_ars: + ar_step = extract_step(ar.hook_name) + if ar_step != self.current_step: + continue # Not in current step + + if is_background_hook(ar.hook_name): + continue # Background hooks don't block + + # Foreground hook in current step - check if finished + if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES: + # Still pending/queued - can't advance + return False + + if ar.status == ArchiveResult.StatusChoices.STARTED: + # Still running - can't advance + return False + + # All foreground hooks in current step are finished - advance! + self.current_step += 1 + self.save(update_fields=['current_step', 'modified_at']) + return True + def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int: """ Reset failed/skipped ArchiveResults to queued for retry. @@ -1301,11 +1361,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea end_ts=None, ) - # Also reset the snapshot so it gets re-checked + # Also reset the snapshot and current_step so it gets re-checked from the beginning if count > 0: self.status = self.StatusChoices.STARTED self.retry_at = retry_at - self.save(update_fields=['status', 'retry_at', 'modified_at']) + self.current_step = 0 # Reset to step 0 for retry + self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at']) return count @@ -1841,45 +1902,63 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi def run(self): """ - Execute this ArchiveResult's plugin and update status. + Execute this ArchiveResult's hook and update status. - Discovers and runs the hook script for self.plugin, - updates status/output fields, queues discovered URLs, and triggers indexing. + If self.hook_name is set, runs only that specific hook. + If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat). + + Updates status/output fields, queues discovered URLs, and triggers indexing. """ from django.utils import timezone - from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook + from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot] - # Find ALL hooks for this plugin - # plugin = plugin name (e.g., 'chrome') - # Each plugin can have multiple hooks that run in sequence + # Determine which hook(s) to run hooks = [] - for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): - if not base_dir.exists(): - continue - plugin_dir = base_dir / self.plugin - if plugin_dir.exists(): - matches = list(plugin_dir.glob('on_Snapshot__*.*')) - if matches: - # Sort by name for deterministic order (numeric prefix controls execution order) - hooks.extend(sorted(matches)) + + if self.hook_name: + # SPECIFIC HOOK MODE: Find the specific hook by name + for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): + if not base_dir.exists(): + continue + plugin_dir = base_dir / self.plugin + if plugin_dir.exists(): + hook_path = plugin_dir / self.hook_name + if hook_path.exists(): + hooks.append(hook_path) + break + else: + # LEGACY MODE: Discover all hooks for this plugin (backwards compatibility) + for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): + if not base_dir.exists(): + continue + plugin_dir = base_dir / self.plugin + if plugin_dir.exists(): + matches = list(plugin_dir.glob('on_Snapshot__*.*')) + if matches: + hooks.extend(sorted(matches)) if not hooks: self.status = self.StatusChoices.FAILED - self.output_str = f'No hooks found for plugin: {self.plugin}' + if self.hook_name: + self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}' + else: + self.output_str = f'No hooks found for plugin: {self.plugin}' self.retry_at = None self.save() return - # plugin field contains plugin name + # Output directory is plugin_dir for the hook output plugin_dir = Path(self.snapshot.output_dir) / self.plugin - # Run ALL hooks in the plugin sequentially start_ts = timezone.now() - has_background_hook = False + is_bg_hook = False for hook in hooks: + # Check if this is a background hook + is_bg_hook = is_background_hook(hook.name) + result = run_hook( hook, output_dir=plugin_dir, @@ -1890,20 +1969,21 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi depth=self.snapshot.depth, ) - # If any hook is background, mark this ArchiveResult as started + # Background hooks return None if result is None: - has_background_hook = True + is_bg_hook = True # Update status based on hook execution - if has_background_hook: - # BACKGROUND HOOK(S) - still running, return immediately + if is_bg_hook: + # BACKGROUND HOOK - still running, return immediately + # Status stays STARTED, will be finalized by Snapshot.cleanup() self.status = self.StatusChoices.STARTED self.start_ts = start_ts self.pwd = str(plugin_dir) self.save() return - # ALL FOREGROUND HOOKS - completed, update from filesystem + # FOREGROUND HOOK - completed, update from filesystem self.start_ts = start_ts self.pwd = str(plugin_dir) self.update_from_output() @@ -1911,11 +1991,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi # Clean up empty output directory if no files were created if plugin_dir.exists() and not self.output_files: try: - # Only remove if directory is completely empty if not any(plugin_dir.iterdir()): plugin_dir.rmdir() except (OSError, RuntimeError): - pass # Directory not empty or can't be removed, that's fine + pass def update_from_output(self): """ diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py index cec2b64f..9c2c295e 100644 --- a/archivebox/core/statemachines.py +++ b/archivebox/core/statemachines.py @@ -60,6 +60,11 @@ class SnapshotMachine(StateMachine, strict_states=True): if not self.snapshot.archiveresult_set.exists(): return False + # Try to advance step if ready (handles step-based hook execution) + # This will increment current_step when all foreground hooks in current step are done + while self.snapshot.advance_step_if_ready(): + pass # Keep advancing until we can't anymore + # if archiveresults exist but are still pending, it's not finished if self.snapshot.pending_archiveresults().exists(): return False diff --git a/archivebox/hooks.py b/archivebox/hooks.py index 8754053f..7bd2dab8 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -15,10 +15,21 @@ Hook contract: Exit: 0 = success, non-zero = failure Execution order: - - Extractors run sequentially within each Snapshot (ordered by numeric prefix) - - Multiple Snapshots can process in parallel + - Hooks are numbered 00-99 with first digit determining step (0-9) + - All hooks in a step can run in parallel + - Steps execute sequentially (step 0 → step 1 → ... → step 9) + - Background hooks (.bg suffix) don't block step advancement - Failed extractors don't block subsequent extractors +Hook Naming Convention: + on_{ModelName}__{run_order}_{description}[.bg].{ext} + + Examples: + on_Snapshot__00_setup.py # Step 0, runs first + on_Snapshot__20_chrome_tab.bg.js # Step 2, background (doesn't block) + on_Snapshot__50_screenshot.js # Step 5, foreground (blocks step) + on_Snapshot__63_media.bg.py # Step 6, background (long-running) + Dependency handling: Extractor plugins that depend on other plugins' output should check at runtime: @@ -39,11 +50,14 @@ API (all hook logic lives here): discover_hooks(event) -> List[Path] Find hook scripts run_hook(script, ...) -> HookResult Execute a hook script run_hooks(event, ...) -> List[HookResult] Run all hooks for an event + extract_step(hook_name) -> int Get step number (0-9) from hook name + is_background_hook(name) -> bool Check if hook is background (.bg suffix) """ __package__ = 'archivebox' import os +import re import json import signal import time @@ -60,6 +74,63 @@ BUILTIN_PLUGINS_DIR = Path(__file__).parent / 'plugins' USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins' +# ============================================================================= +# Hook Step Extraction +# ============================================================================= + +def extract_step(hook_name: str) -> int: + """ + Extract step number (0-9) from hook name. + + Hooks are numbered 00-99 with the first digit determining the step. + Pattern: on_{Model}__{XX}_{description}[.bg].{ext} + + Args: + hook_name: Hook filename (e.g., 'on_Snapshot__50_wget.py') + + Returns: + Step number 0-9, or 9 (default) for unnumbered hooks. + + Examples: + extract_step('on_Snapshot__05_chrome.py') -> 0 + extract_step('on_Snapshot__50_wget.py') -> 5 + extract_step('on_Snapshot__63_media.bg.py') -> 6 + extract_step('on_Snapshot__99_cleanup.sh') -> 9 + extract_step('on_Snapshot__unnumbered.py') -> 9 (default) + """ + # Pattern matches __XX_ where XX is two digits + match = re.search(r'__(\d{2})_', hook_name) + if match: + two_digit = int(match.group(1)) + step = two_digit // 10 # First digit is the step (0-9) + return step + + # Log warning for unnumbered hooks and default to step 9 + import sys + print(f"Warning: Hook '{hook_name}' has no step number (expected __XX_), defaulting to step 9", file=sys.stderr) + return 9 + + +def is_background_hook(hook_name: str) -> bool: + """ + Check if a hook is a background hook (doesn't block step advancement). + + Background hooks have '.bg.' in their filename before the extension. + + Args: + hook_name: Hook filename (e.g., 'on_Snapshot__20_chrome_tab.bg.js') + + Returns: + True if background hook, False if foreground. + + Examples: + is_background_hook('on_Snapshot__20_chrome_tab.bg.js') -> True + is_background_hook('on_Snapshot__50_wget.py') -> False + is_background_hook('on_Snapshot__63_media.bg.py') -> True + """ + return '.bg.' in hook_name or '__background' in hook_name + + class HookResult(TypedDict, total=False): """Raw result from run_hook().""" returncode: int diff --git a/archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py b/archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py index 16360329..aa824dc8 100644 --- a/archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py +++ b/archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py @@ -1,7 +1,7 @@ # Generated by Django 6.0 on 2025-12-28 05:12 import django.db.models.deletion -import uuid +from archivebox import uuid_compat from django.db import migrations, models @@ -15,7 +15,7 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='dependency', name='id', - field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), ), migrations.AlterField( model_name='binary', @@ -25,7 +25,7 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='binary', name='id', - field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), ), migrations.AlterField( model_name='machine', @@ -35,11 +35,11 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='machine', name='id', - field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), ), migrations.AlterField( model_name='networkinterface', name='id', - field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), ), ] diff --git a/archivebox/plugins/dom/on_Snapshot__36_dom.js b/archivebox/plugins/dom/on_Snapshot__53_dom.js similarity index 100% rename from archivebox/plugins/dom/on_Snapshot__36_dom.js rename to archivebox/plugins/dom/on_Snapshot__53_dom.js diff --git a/archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py b/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py similarity index 100% rename from archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py rename to archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py diff --git a/archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py b/archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py similarity index 100% rename from archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py rename to archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py diff --git a/archivebox/plugins/git/on_Snapshot__12_git.py b/archivebox/plugins/git/on_Snapshot__62_git.py similarity index 100% rename from archivebox/plugins/git/on_Snapshot__12_git.py rename to archivebox/plugins/git/on_Snapshot__62_git.py diff --git a/archivebox/plugins/headers/on_Snapshot__33_headers.js b/archivebox/plugins/headers/on_Snapshot__55_headers.js similarity index 100% rename from archivebox/plugins/headers/on_Snapshot__33_headers.js rename to archivebox/plugins/headers/on_Snapshot__55_headers.js diff --git a/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py b/archivebox/plugins/htmltotext/on_Snapshot__57_htmltotext.py similarity index 100% rename from archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py rename to archivebox/plugins/htmltotext/on_Snapshot__57_htmltotext.py diff --git a/archivebox/plugins/media/on_Snapshot__51_media.py b/archivebox/plugins/media/on_Snapshot__63_media.bg.py similarity index 100% rename from archivebox/plugins/media/on_Snapshot__51_media.py rename to archivebox/plugins/media/on_Snapshot__63_media.bg.py diff --git a/archivebox/plugins/mercury/on_Snapshot__53_mercury.py b/archivebox/plugins/mercury/on_Snapshot__56_mercury.py similarity index 100% rename from archivebox/plugins/mercury/on_Snapshot__53_mercury.py rename to archivebox/plugins/mercury/on_Snapshot__56_mercury.py diff --git a/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py b/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py similarity index 100% rename from archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py rename to archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py diff --git a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js similarity index 100% rename from archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js rename to archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js diff --git a/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py b/archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py similarity index 100% rename from archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py rename to archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py diff --git a/archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py b/archivebox/plugins/parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py similarity index 100% rename from archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py rename to archivebox/plugins/parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py diff --git a/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py b/archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py similarity index 100% rename from archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py rename to archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py diff --git a/archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py b/archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py similarity index 100% rename from archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py rename to archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py diff --git a/archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py b/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py similarity index 100% rename from archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py rename to archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py diff --git a/archivebox/plugins/pdf/on_Snapshot__35_pdf.js b/archivebox/plugins/pdf/on_Snapshot__52_pdf.js similarity index 100% rename from archivebox/plugins/pdf/on_Snapshot__35_pdf.js rename to archivebox/plugins/pdf/on_Snapshot__52_pdf.js diff --git a/archivebox/plugins/readability/on_Snapshot__52_readability.py b/archivebox/plugins/readability/on_Snapshot__55_readability.py similarity index 100% rename from archivebox/plugins/readability/on_Snapshot__52_readability.py rename to archivebox/plugins/readability/on_Snapshot__55_readability.py diff --git a/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js similarity index 100% rename from archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js rename to archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js diff --git a/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py similarity index 100% rename from archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py rename to archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py diff --git a/archivebox/plugins/title/on_Snapshot__32_title.js b/archivebox/plugins/title/on_Snapshot__54_title.js similarity index 100% rename from archivebox/plugins/title/on_Snapshot__32_title.js rename to archivebox/plugins/title/on_Snapshot__54_title.js diff --git a/archivebox/plugins/wget/on_Snapshot__50_wget.py b/archivebox/plugins/wget/on_Snapshot__61_wget.py similarity index 100% rename from archivebox/plugins/wget/on_Snapshot__50_wget.py rename to archivebox/plugins/wget/on_Snapshot__61_wget.py diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index 64e3e986..84626f07 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -352,18 +352,42 @@ class ArchiveResultWorker(Worker): return ArchiveResult def get_queue(self) -> QuerySet: - """Get queue of ArchiveResults ready for processing.""" + """ + Get queue of ArchiveResults ready for processing. + + Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step. + This ensures hooks execute in order (step 0 → 1 → 2 ... → 9). + """ from core.models import ArchiveResult + from archivebox.hooks import extract_step qs = super().get_queue() if self.plugin: qs = qs.filter(plugin=self.plugin) - # Note: Removed blocking logic since plugins have separate output directories - # and don't interfere with each other. Each plugin runs independently. + # Step-based filtering: only process ARs whose step <= snapshot.current_step + # Since step is derived from hook_name, we filter in Python after initial query + # This is efficient because the base query already filters by retry_at and status - return qs + # Get candidate ARs + candidates = list(qs[:50]) # Limit to avoid loading too many + ready_pks = [] + + for ar in candidates: + if not ar.hook_name: + # Legacy ARs without hook_name - process them + ready_pks.append(ar.pk) + continue + + ar_step = extract_step(ar.hook_name) + snapshot_step = ar.snapshot.current_step + + if ar_step <= snapshot_step: + ready_pks.append(ar.pk) + + # Return filtered queryset ordered by hook_name (so earlier hooks run first within a step) + return ArchiveResult.objects.filter(pk__in=ready_pks).order_by('hook_name', 'retry_at') def process_item(self, obj) -> bool: """Process an ArchiveResult by running its plugin."""