From 1b5a8160225f6bc9549adf90fcfac8e600d1d1c5 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 28 Dec 2025 13:47:25 +0000 Subject: [PATCH 1/5] Implement hook step-based concurrency system MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This implements the hook concurrency plan from TODO_hook_concurrency.md: ## Schema Changes - Add Snapshot.current_step (IntegerField 0-9, default=0) - Create migration 0034_snapshot_current_step.py - Fix uuid_compat imports in migrations 0032 and 0003 ## Core Logic - Add extract_step(hook_name) utility - extracts step from __XX_ pattern - Add is_background_hook(hook_name) utility - checks for .bg. suffix - Update Snapshot.create_pending_archiveresults() to create one AR per hook - Update ArchiveResult.run() to handle hook_name field - Add Snapshot.advance_step_if_ready() method for step advancement - Integrate with SnapshotMachine.is_finished() to call advance_step_if_ready() ## Worker Coordination - Update ArchiveResultWorker.get_queue() for step-based filtering - ARs are only claimable when their step <= snapshot.current_step ## Hook Renumbering - Step 5 (DOM extraction): singlefile→50, screenshot→51, pdf→52, dom→53, title→54, readability→55, headers→55, mercury→56, htmltotext→57 - Step 6 (post-DOM): wget→61, git→62, media→63.bg, gallerydl→64.bg, forumdl→65.bg, papersdl→66.bg - Step 7 (URL extraction): parse_* hooks moved to 70-75 Background hooks (.bg suffix) don't block step advancement, enabling long-running downloads to continue while other hooks proceed. --- TODO_hook_concurrency.md | 76 +++++---- ...032_alter_archiveresult_binary_and_more.py | 4 +- .../migrations/0034_snapshot_current_step.py | 23 +++ archivebox/core/models.py | 155 +++++++++++++----- archivebox/core/statemachines.py | 5 + archivebox/hooks.py | 75 ++++++++- ...ter_installedbinary_dependency_and_more.py | 10 +- ...shot__36_dom.js => on_Snapshot__53_dom.js} | 0 ...rumdl.py => on_Snapshot__65_forumdl.bg.py} | 0 ...ydl.py => on_Snapshot__64_gallerydl.bg.py} | 0 ...shot__12_git.py => on_Snapshot__62_git.py} | 0 ..._headers.js => on_Snapshot__55_headers.js} | 0 ...otext.py => on_Snapshot__57_htmltotext.py} | 0 ...1_media.py => on_Snapshot__63_media.bg.py} | 0 ..._mercury.py => on_Snapshot__56_mercury.py} | 0 ...rsdl.py => on_Snapshot__66_papersdl.bg.py} | 0 ... 
=> on_Snapshot__75_parse_dom_outlinks.js} | 0 ....py => on_Snapshot__70_parse_html_urls.py} | 0 ...py => on_Snapshot__74_parse_jsonl_urls.py} | 0 ...=> on_Snapshot__73_parse_netscape_urls.py} | 0 ...s.py => on_Snapshot__72_parse_rss_urls.py} | 0 ...s.py => on_Snapshot__71_parse_txt_urls.py} | 0 ...shot__35_pdf.js => on_Snapshot__52_pdf.js} | 0 ...lity.py => on_Snapshot__55_readability.py} | 0 ...nshot.js => on_Snapshot__51_screenshot.js} | 0 ...efile.py => on_Snapshot__50_singlefile.py} | 0 ...__32_title.js => on_Snapshot__54_title.js} | 0 ...ot__50_wget.py => on_Snapshot__61_wget.py} | 0 archivebox/workers/worker.py | 32 +++- 29 files changed, 297 insertions(+), 83 deletions(-) create mode 100644 archivebox/core/migrations/0034_snapshot_current_step.py rename archivebox/plugins/dom/{on_Snapshot__36_dom.js => on_Snapshot__53_dom.js} (100%) rename archivebox/plugins/forumdl/{on_Snapshot__53_forumdl.py => on_Snapshot__65_forumdl.bg.py} (100%) rename archivebox/plugins/gallerydl/{on_Snapshot__52_gallerydl.py => on_Snapshot__64_gallerydl.bg.py} (100%) rename archivebox/plugins/git/{on_Snapshot__12_git.py => on_Snapshot__62_git.py} (100%) rename archivebox/plugins/headers/{on_Snapshot__33_headers.js => on_Snapshot__55_headers.js} (100%) rename archivebox/plugins/htmltotext/{on_Snapshot__54_htmltotext.py => on_Snapshot__57_htmltotext.py} (100%) rename archivebox/plugins/media/{on_Snapshot__51_media.py => on_Snapshot__63_media.bg.py} (100%) rename archivebox/plugins/mercury/{on_Snapshot__53_mercury.py => on_Snapshot__56_mercury.py} (100%) rename archivebox/plugins/papersdl/{on_Snapshot__54_papersdl.py => on_Snapshot__66_papersdl.bg.py} (100%) rename archivebox/plugins/parse_dom_outlinks/{on_Snapshot__40_parse_dom_outlinks.js => on_Snapshot__75_parse_dom_outlinks.js} (100%) rename archivebox/plugins/parse_html_urls/{on_Snapshot__60_parse_html_urls.py => on_Snapshot__70_parse_html_urls.py} (100%) rename archivebox/plugins/parse_jsonl_urls/{on_Snapshot__64_parse_jsonl_urls.py => on_Snapshot__74_parse_jsonl_urls.py} (100%) rename archivebox/plugins/parse_netscape_urls/{on_Snapshot__63_parse_netscape_urls.py => on_Snapshot__73_parse_netscape_urls.py} (100%) rename archivebox/plugins/parse_rss_urls/{on_Snapshot__61_parse_rss_urls.py => on_Snapshot__72_parse_rss_urls.py} (100%) rename archivebox/plugins/parse_txt_urls/{on_Snapshot__62_parse_txt_urls.py => on_Snapshot__71_parse_txt_urls.py} (100%) rename archivebox/plugins/pdf/{on_Snapshot__35_pdf.js => on_Snapshot__52_pdf.js} (100%) rename archivebox/plugins/readability/{on_Snapshot__52_readability.py => on_Snapshot__55_readability.py} (100%) rename archivebox/plugins/screenshot/{on_Snapshot__34_screenshot.js => on_Snapshot__51_screenshot.js} (100%) rename archivebox/plugins/singlefile/{on_Snapshot__37_singlefile.py => on_Snapshot__50_singlefile.py} (100%) rename archivebox/plugins/title/{on_Snapshot__32_title.js => on_Snapshot__54_title.js} (100%) rename archivebox/plugins/wget/{on_Snapshot__50_wget.py => on_Snapshot__61_wget.py} (100%) diff --git a/TODO_hook_concurrency.md b/TODO_hook_concurrency.md index f8f1bcf7..41eb5d95 100644 --- a/TODO_hook_concurrency.md +++ b/TODO_hook_concurrency.md @@ -310,36 +310,39 @@ archivebox/plugins/{plugin_name}/ ## Implementation Checklist ### Phase 1: Schema Migration ✅ -- [ ] Add `Snapshot.current_step` (IntegerField 0-9, default=0) -- [ ] Add `ArchiveResult.hook_name` (CharField, nullable) - just filename -- [ ] Create migration: `0033_snapshot_current_step_archiveresult_hook_name.py` +- [x] Add 
`Snapshot.current_step` (IntegerField 0-9, default=0) +- [x] Add `ArchiveResult.hook_name` (CharField, nullable) - just filename +- [x] Create migration: `0034_snapshot_current_step.py` -### Phase 2: Core Logic Updates -- [ ] Add `extract_step(hook_name)` utility in `archivebox/hooks.py` +### Phase 2: Core Logic Updates ✅ +- [x] Add `extract_step(hook_name)` utility in `archivebox/hooks.py` - Extract first digit from `__XX_` pattern - Default to 9 for unnumbered hooks -- [ ] Update `Snapshot.create_pending_archiveresults()` in `archivebox/core/models.py`: +- [x] Add `is_background_hook(hook_name)` utility in `archivebox/hooks.py` + - Check for `.bg.` in filename +- [x] Update `Snapshot.create_pending_archiveresults()` in `archivebox/core/models.py`: - Discover all hooks (not plugins) - Create one AR per hook with `hook_name` set -- [ ] Update `ArchiveResult.run()` in `archivebox/core/models.py`: +- [x] Update `ArchiveResult.run()` in `archivebox/core/models.py`: - If `hook_name` set: run single hook - If `hook_name` None: discover all plugin hooks (existing behavior) -- [ ] Add `Snapshot.advance_step_if_ready()` method: +- [x] Add `Snapshot.advance_step_if_ready()` method: - Check if all foreground ARs in current step finished - Increment `current_step` if ready - Ignore background hooks (.bg) in completion check -- [ ] Integrate with `SnapshotMachine.is_finished()` in `archivebox/core/statemachines.py`: +- [x] Integrate with `SnapshotMachine.is_finished()` in `archivebox/core/statemachines.py`: - Call `advance_step_if_ready()` before checking if done -### Phase 3: Worker Coordination -- [ ] Update worker AR claiming query in `archivebox/workers/worker.py`: +### Phase 3: Worker Coordination ✅ +- [x] Update worker AR claiming query in `archivebox/workers/worker.py`: - Filter: `extract_step(ar.hook_name) <= snapshot.current_step` - - Note: May need to denormalize or use clever query since step is derived - - Alternative: Claim any AR in QUEUED state, check step in Python before processing + - Claims ARs in QUEUED state, checks step in Python before processing + - Orders by hook_name for deterministic execution within step -### Phase 4: Hook Renumbering -- [ ] Renumber hooks per renumbering map below -- [ ] Add `.bg` suffix to long-running hooks +### Phase 4: Hook Renumbering ✅ +- [x] Renumber hooks per renumbering map below +- [x] Add `.bg` suffix to long-running hooks (media, gallerydl, forumdl, papersdl) +- [x] Move parse_* hooks to step 7 (70-79) - [ ] Test all hooks still work after renumbering ## Migration Path @@ -353,25 +356,34 @@ No special migration needed: ### Renumbering Map -**Current → New:** +**Completed Renames:** ``` -git/on_Snapshot__12_git.py → git/on_Snapshot__62_git.py -media/on_Snapshot__51_media.py → media/on_Snapshot__63_media.bg.py -gallerydl/on_Snapshot__52_gallerydl.py → gallerydl/on_Snapshot__64_gallerydl.bg.py -forumdl/on_Snapshot__53_forumdl.py → forumdl/on_Snapshot__65_forumdl.bg.py -papersdl/on_Snapshot__54_papersdl.py → papersdl/on_Snapshot__66_papersdl.bg.py +# Step 5: DOM Extraction (sequential, non-background) +singlefile/on_Snapshot__37_singlefile.py → singlefile/on_Snapshot__50_singlefile.py ✅ +screenshot/on_Snapshot__34_screenshot.js → screenshot/on_Snapshot__51_screenshot.js ✅ +pdf/on_Snapshot__35_pdf.js → pdf/on_Snapshot__52_pdf.js ✅ +dom/on_Snapshot__36_dom.js → dom/on_Snapshot__53_dom.js ✅ +title/on_Snapshot__32_title.js → title/on_Snapshot__54_title.js ✅ +readability/on_Snapshot__52_readability.py → readability/on_Snapshot__55_readability.py ✅ 
+headers/on_Snapshot__33_headers.js → headers/on_Snapshot__55_headers.js ✅ +mercury/on_Snapshot__53_mercury.py → mercury/on_Snapshot__56_mercury.py ✅ +htmltotext/on_Snapshot__54_htmltotext.py → htmltotext/on_Snapshot__57_htmltotext.py ✅ -readability/on_Snapshot__52_readability.py → readability/on_Snapshot__55_readability.py -mercury/on_Snapshot__53_mercury.py → mercury/on_Snapshot__56_mercury.py +# Step 6: Post-DOM Extraction (background for long-running) +wget/on_Snapshot__50_wget.py → wget/on_Snapshot__61_wget.py ✅ +git/on_Snapshot__12_git.py → git/on_Snapshot__62_git.py ✅ +media/on_Snapshot__51_media.py → media/on_Snapshot__63_media.bg.py ✅ +gallerydl/on_Snapshot__52_gallerydl.py → gallerydl/on_Snapshot__64_gallerydl.bg.py ✅ +forumdl/on_Snapshot__53_forumdl.py → forumdl/on_Snapshot__65_forumdl.bg.py ✅ +papersdl/on_Snapshot__54_papersdl.py → papersdl/on_Snapshot__66_papersdl.bg.py ✅ -singlefile/on_Snapshot__37_singlefile.py → singlefile/on_Snapshot__50_singlefile.py -screenshot/on_Snapshot__34_screenshot.js → screenshot/on_Snapshot__51_screenshot.js -pdf/on_Snapshot__35_pdf.js → pdf/on_Snapshot__52_pdf.js -dom/on_Snapshot__36_dom.js → dom/on_Snapshot__53_dom.js -title/on_Snapshot__32_title.js → title/on_Snapshot__54_title.js -headers/on_Snapshot__33_headers.js → headers/on_Snapshot__55_headers.js - -wget/on_Snapshot__50_wget.py → wget/on_Snapshot__61_wget.py +# Step 7: URL Extraction (parse_* hooks moved from step 6) +parse_html_urls/on_Snapshot__60_parse_html_urls.py → parse_html_urls/on_Snapshot__70_parse_html_urls.py ✅ +parse_txt_urls/on_Snapshot__62_parse_txt_urls.py → parse_txt_urls/on_Snapshot__71_parse_txt_urls.py ✅ +parse_rss_urls/on_Snapshot__61_parse_rss_urls.py → parse_rss_urls/on_Snapshot__72_parse_rss_urls.py ✅ +parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py → parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py ✅ +parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py → parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py ✅ +parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js → parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js ✅ ``` ## Testing Strategy diff --git a/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py b/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py index 77c78472..3d3d70d2 100644 --- a/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py +++ b/archivebox/core/migrations/0032_alter_archiveresult_binary_and_more.py @@ -1,7 +1,7 @@ # Generated by Django 6.0 on 2025-12-28 05:12 import django.db.models.deletion -import uuid +from archivebox import uuid_compat from django.conf import settings from django.db import migrations, models @@ -49,7 +49,7 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='archiveresult', name='uuid', - field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True), + field=models.UUIDField(blank=True, db_index=True, default=uuid_compat.uuid7, null=True), ), migrations.AddConstraint( model_name='snapshot', diff --git a/archivebox/core/migrations/0034_snapshot_current_step.py b/archivebox/core/migrations/0034_snapshot_current_step.py new file mode 100644 index 00000000..f570230c --- /dev/null +++ b/archivebox/core/migrations/0034_snapshot_current_step.py @@ -0,0 +1,23 @@ +# Generated by Django 6.0 on 2025-12-28 +# Add Snapshot.current_step field for hook step-based execution + +from django.db import migrations, models + + +class Migration(migrations.Migration): + + dependencies = [ + ('core', 
'0033_rename_extractor_add_hook_name'), + ] + + operations = [ + migrations.AddField( + model_name='snapshot', + name='current_step', + field=models.PositiveSmallIntegerField( + default=0, + db_index=True, + help_text='Current hook step being executed (0-9). Used for sequential hook execution.' + ), + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index fbef95cd..192835de 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -334,6 +334,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True) depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().') + current_step = models.PositiveSmallIntegerField(default=0, db_index=True, help_text='Current hook step being executed (0-9). Used for sequential hook execution.') retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED) @@ -1243,23 +1244,33 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea def create_pending_archiveresults(self) -> list['ArchiveResult']: """ - Create ArchiveResult records for all enabled plugins. + Create ArchiveResult records for all enabled hooks. - Uses the hooks system to discover available plugins from: + Uses the hooks system to discover available hooks from: - archivebox/plugins/*/on_Snapshot__*.{py,sh,js} - data/plugins/*/on_Snapshot__*.{py,sh,js} - """ - from archivebox.hooks import get_enabled_plugins - plugins = get_enabled_plugins() + Creates one ArchiveResult per hook (not per plugin), with hook_name set. + This enables step-based execution where all hooks in a step can run in parallel. + """ + from archivebox.hooks import discover_hooks + + hooks = discover_hooks('Snapshot') archiveresults = [] - for plugin in plugins: - if ArchiveResult.objects.filter(snapshot=self, plugin=plugin).exists(): + for hook_path in hooks: + hook_name = hook_path.name # e.g., 'on_Snapshot__50_wget.py' + plugin = hook_path.parent.name # e.g., 'wget' + + # Check if AR already exists for this specific hook + if ArchiveResult.objects.filter(snapshot=self, hook_name=hook_name).exists(): continue - archiveresult, _ = ArchiveResult.objects.get_or_create( - snapshot=self, plugin=plugin, + + archiveresult, created = ArchiveResult.objects.get_or_create( + snapshot=self, + hook_name=hook_name, defaults={ + 'plugin': plugin, 'status': ArchiveResult.INITIAL_STATE, 'retry_at': timezone.now(), 'created_by_id': self.created_by_id, @@ -1267,8 +1278,57 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea ) if archiveresult.status == ArchiveResult.INITIAL_STATE: archiveresults.append(archiveresult) + return archiveresults + def advance_step_if_ready(self) -> bool: + """ + Advance current_step if all foreground hooks in current step are finished. + + Called by the state machine to check if step can advance. + Background hooks (.bg) don't block step advancement. 
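For illustration, a minimal standalone sketch of this advancement rule, using plain (hook_name, status) tuples as hypothetical stand-ins for ArchiveResult rows. The lowercase status strings stand in for ArchiveResult.StatusChoices values, and the two helpers are simplified copies of the utilities added to archivebox/hooks.py in this patch (the real is_background_hook also accepts a '__background' marker):

```python
# Sketch only: the step-advancement rule described in the docstring above,
# applied to plain data instead of Django model instances.
import re

def extract_step(hook_name: str) -> int:
    """First digit of the __XX_ prefix is the step; unnumbered hooks default to 9."""
    match = re.search(r'__(\d{2})_', hook_name)
    return int(match.group(1)) // 10 if match else 9

def is_background_hook(hook_name: str) -> bool:
    """Hooks with '.bg.' in the filename never block step advancement."""
    return '.bg.' in hook_name

def can_advance(results: list, current_step: int) -> bool:
    """True when every foreground hook in current_step has reached a final state."""
    for hook_name, status in results:
        if extract_step(hook_name) != current_step:
            continue  # belongs to another step
        if is_background_hook(hook_name):
            continue  # background hooks are ignored for advancement
        if status not in ('succeeded', 'failed', 'skipped'):
            return False  # still queued or started
    return True

# Step 6 advances even though the background media hook is still running:
assert can_advance([('on_Snapshot__61_wget.py', 'succeeded'),
                    ('on_Snapshot__62_git.py', 'succeeded'),
                    ('on_Snapshot__63_media.bg.py', 'started')], current_step=6)
# ...but a foreground hook that is still running blocks advancement:
assert not can_advance([('on_Snapshot__61_wget.py', 'started')], current_step=6)
```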
+ + Step advancement rules: + - All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED) + - Background ARs (hook_name contains '.bg.') are ignored for advancement + - When ready, increments current_step by 1 (up to 9) + + Returns: + True if step was advanced, False if not ready or already at step 9. + """ + from archivebox.hooks import extract_step, is_background_hook + + if self.current_step >= 9: + return False # Already at final step + + # Get all ARs for current step that are foreground + current_step_ars = self.archiveresult_set.filter( + hook_name__isnull=False + ).exclude(hook_name='') + + # Check each AR in current step + for ar in current_step_ars: + ar_step = extract_step(ar.hook_name) + if ar_step != self.current_step: + continue # Not in current step + + if is_background_hook(ar.hook_name): + continue # Background hooks don't block + + # Foreground hook in current step - check if finished + if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES: + # Still pending/queued - can't advance + return False + + if ar.status == ArchiveResult.StatusChoices.STARTED: + # Still running - can't advance + return False + + # All foreground hooks in current step are finished - advance! + self.current_step += 1 + self.save(update_fields=['current_step', 'modified_at']) + return True + def retry_failed_archiveresults(self, retry_at: Optional['timezone.datetime'] = None) -> int: """ Reset failed/skipped ArchiveResults to queued for retry. @@ -1301,11 +1361,12 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea end_ts=None, ) - # Also reset the snapshot so it gets re-checked + # Also reset the snapshot and current_step so it gets re-checked from the beginning if count > 0: self.status = self.StatusChoices.STARTED self.retry_at = retry_at - self.save(update_fields=['status', 'retry_at', 'modified_at']) + self.current_step = 0 # Reset to step 0 for retry + self.save(update_fields=['status', 'retry_at', 'current_step', 'modified_at']) return count @@ -1841,45 +1902,63 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi def run(self): """ - Execute this ArchiveResult's plugin and update status. + Execute this ArchiveResult's hook and update status. - Discovers and runs the hook script for self.plugin, - updates status/output fields, queues discovered URLs, and triggers indexing. + If self.hook_name is set, runs only that specific hook. + If self.hook_name is empty, discovers and runs all hooks for self.plugin (backwards compat). + + Updates status/output fields, queues discovered URLs, and triggers indexing. 
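As a simplified illustration of the lookup order used just below (not the actual method): a named hook is resolved against the built-in plugins directory first, then the user plugins directory, and the first existing match wins. The example paths correspond to BUILTIN_PLUGINS_DIR and USER_PLUGINS_DIR as defined in archivebox/hooks.py:

```python
# Illustration only: mirrors the specific-hook lookup performed by ArchiveResult.run().
from pathlib import Path
from typing import Optional

def resolve_hook(plugin: str, hook_name: str, search_dirs: list) -> Optional[Path]:
    """Return the first existing {search_dir}/{plugin}/{hook_name}, or None."""
    for base_dir in search_dirs:
        candidate = Path(base_dir) / plugin / hook_name
        if candidate.exists():
            return candidate
    return None

# e.g. resolve_hook('wget', 'on_Snapshot__61_wget.py',
#                   ['archivebox/plugins', 'data/plugins'])
```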
""" from django.utils import timezone - from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook + from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook, is_background_hook config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot] - # Find ALL hooks for this plugin - # plugin = plugin name (e.g., 'chrome') - # Each plugin can have multiple hooks that run in sequence + # Determine which hook(s) to run hooks = [] - for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): - if not base_dir.exists(): - continue - plugin_dir = base_dir / self.plugin - if plugin_dir.exists(): - matches = list(plugin_dir.glob('on_Snapshot__*.*')) - if matches: - # Sort by name for deterministic order (numeric prefix controls execution order) - hooks.extend(sorted(matches)) + + if self.hook_name: + # SPECIFIC HOOK MODE: Find the specific hook by name + for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): + if not base_dir.exists(): + continue + plugin_dir = base_dir / self.plugin + if plugin_dir.exists(): + hook_path = plugin_dir / self.hook_name + if hook_path.exists(): + hooks.append(hook_path) + break + else: + # LEGACY MODE: Discover all hooks for this plugin (backwards compatibility) + for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR): + if not base_dir.exists(): + continue + plugin_dir = base_dir / self.plugin + if plugin_dir.exists(): + matches = list(plugin_dir.glob('on_Snapshot__*.*')) + if matches: + hooks.extend(sorted(matches)) if not hooks: self.status = self.StatusChoices.FAILED - self.output_str = f'No hooks found for plugin: {self.plugin}' + if self.hook_name: + self.output_str = f'Hook not found: {self.plugin}/{self.hook_name}' + else: + self.output_str = f'No hooks found for plugin: {self.plugin}' self.retry_at = None self.save() return - # plugin field contains plugin name + # Output directory is plugin_dir for the hook output plugin_dir = Path(self.snapshot.output_dir) / self.plugin - # Run ALL hooks in the plugin sequentially start_ts = timezone.now() - has_background_hook = False + is_bg_hook = False for hook in hooks: + # Check if this is a background hook + is_bg_hook = is_background_hook(hook.name) + result = run_hook( hook, output_dir=plugin_dir, @@ -1890,20 +1969,21 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi depth=self.snapshot.depth, ) - # If any hook is background, mark this ArchiveResult as started + # Background hooks return None if result is None: - has_background_hook = True + is_bg_hook = True # Update status based on hook execution - if has_background_hook: - # BACKGROUND HOOK(S) - still running, return immediately + if is_bg_hook: + # BACKGROUND HOOK - still running, return immediately + # Status stays STARTED, will be finalized by Snapshot.cleanup() self.status = self.StatusChoices.STARTED self.start_ts = start_ts self.pwd = str(plugin_dir) self.save() return - # ALL FOREGROUND HOOKS - completed, update from filesystem + # FOREGROUND HOOK - completed, update from filesystem self.start_ts = start_ts self.pwd = str(plugin_dir) self.update_from_output() @@ -1911,11 +1991,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi # Clean up empty output directory if no files were created if plugin_dir.exists() and not self.output_files: try: - # Only remove if directory is completely empty if not any(plugin_dir.iterdir()): plugin_dir.rmdir() except (OSError, RuntimeError): - pass # Directory not empty or can't be removed, 
that's fine + pass def update_from_output(self): """ diff --git a/archivebox/core/statemachines.py b/archivebox/core/statemachines.py index cec2b64f..9c2c295e 100644 --- a/archivebox/core/statemachines.py +++ b/archivebox/core/statemachines.py @@ -60,6 +60,11 @@ class SnapshotMachine(StateMachine, strict_states=True): if not self.snapshot.archiveresult_set.exists(): return False + # Try to advance step if ready (handles step-based hook execution) + # This will increment current_step when all foreground hooks in current step are done + while self.snapshot.advance_step_if_ready(): + pass # Keep advancing until we can't anymore + # if archiveresults exist but are still pending, it's not finished if self.snapshot.pending_archiveresults().exists(): return False diff --git a/archivebox/hooks.py b/archivebox/hooks.py index aff3ea22..449b3509 100644 --- a/archivebox/hooks.py +++ b/archivebox/hooks.py @@ -15,10 +15,21 @@ Hook contract: Exit: 0 = success, non-zero = failure Execution order: - - Extractors run sequentially within each Snapshot (ordered by numeric prefix) - - Multiple Snapshots can process in parallel + - Hooks are numbered 00-99 with first digit determining step (0-9) + - All hooks in a step can run in parallel + - Steps execute sequentially (step 0 → step 1 → ... → step 9) + - Background hooks (.bg suffix) don't block step advancement - Failed extractors don't block subsequent extractors +Hook Naming Convention: + on_{ModelName}__{run_order}_{description}[.bg].{ext} + + Examples: + on_Snapshot__00_setup.py # Step 0, runs first + on_Snapshot__20_chrome_tab.bg.js # Step 2, background (doesn't block) + on_Snapshot__50_screenshot.js # Step 5, foreground (blocks step) + on_Snapshot__63_media.bg.py # Step 6, background (long-running) + Dependency handling: Extractor plugins that depend on other plugins' output should check at runtime: @@ -39,11 +50,14 @@ API (all hook logic lives here): discover_hooks(event) -> List[Path] Find hook scripts run_hook(script, ...) -> HookResult Execute a hook script run_hooks(event, ...) -> List[HookResult] Run all hooks for an event + extract_step(hook_name) -> int Get step number (0-9) from hook name + is_background_hook(name) -> bool Check if hook is background (.bg suffix) """ __package__ = 'archivebox' import os +import re import json import signal import time @@ -60,6 +74,63 @@ BUILTIN_PLUGINS_DIR = Path(__file__).parent / 'plugins' USER_PLUGINS_DIR = Path(getattr(settings, 'DATA_DIR', Path.cwd())) / 'plugins' +# ============================================================================= +# Hook Step Extraction +# ============================================================================= + +def extract_step(hook_name: str) -> int: + """ + Extract step number (0-9) from hook name. + + Hooks are numbered 00-99 with the first digit determining the step. + Pattern: on_{Model}__{XX}_{description}[.bg].{ext} + + Args: + hook_name: Hook filename (e.g., 'on_Snapshot__50_wget.py') + + Returns: + Step number 0-9, or 9 (default) for unnumbered hooks. 
+ + Examples: + extract_step('on_Snapshot__05_chrome.py') -> 0 + extract_step('on_Snapshot__50_wget.py') -> 5 + extract_step('on_Snapshot__63_media.bg.py') -> 6 + extract_step('on_Snapshot__99_cleanup.sh') -> 9 + extract_step('on_Snapshot__unnumbered.py') -> 9 (default) + """ + # Pattern matches __XX_ where XX is two digits + match = re.search(r'__(\d{2})_', hook_name) + if match: + two_digit = int(match.group(1)) + step = two_digit // 10 # First digit is the step (0-9) + return step + + # Log warning for unnumbered hooks and default to step 9 + import sys + print(f"Warning: Hook '{hook_name}' has no step number (expected __XX_), defaulting to step 9", file=sys.stderr) + return 9 + + +def is_background_hook(hook_name: str) -> bool: + """ + Check if a hook is a background hook (doesn't block step advancement). + + Background hooks have '.bg.' in their filename before the extension. + + Args: + hook_name: Hook filename (e.g., 'on_Snapshot__20_chrome_tab.bg.js') + + Returns: + True if background hook, False if foreground. + + Examples: + is_background_hook('on_Snapshot__20_chrome_tab.bg.js') -> True + is_background_hook('on_Snapshot__50_wget.py') -> False + is_background_hook('on_Snapshot__63_media.bg.py') -> True + """ + return '.bg.' in hook_name or '__background' in hook_name + + class HookResult(TypedDict, total=False): """Raw result from run_hook().""" returncode: int diff --git a/archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py b/archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py index 16360329..aa824dc8 100644 --- a/archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py +++ b/archivebox/machine/migrations/0003_alter_dependency_id_alter_installedbinary_dependency_and_more.py @@ -1,7 +1,7 @@ # Generated by Django 6.0 on 2025-12-28 05:12 import django.db.models.deletion -import uuid +from archivebox import uuid_compat from django.db import migrations, models @@ -15,7 +15,7 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='dependency', name='id', - field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), ), migrations.AlterField( model_name='binary', @@ -25,7 +25,7 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='binary', name='id', - field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), ), migrations.AlterField( model_name='machine', @@ -35,11 +35,11 @@ class Migration(migrations.Migration): migrations.AlterField( model_name='machine', name='id', - field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), ), migrations.AlterField( model_name='networkinterface', name='id', - field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True), + field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True), ), ] diff --git a/archivebox/plugins/dom/on_Snapshot__36_dom.js 
b/archivebox/plugins/dom/on_Snapshot__53_dom.js similarity index 100% rename from archivebox/plugins/dom/on_Snapshot__36_dom.js rename to archivebox/plugins/dom/on_Snapshot__53_dom.js diff --git a/archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py b/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py similarity index 100% rename from archivebox/plugins/forumdl/on_Snapshot__53_forumdl.py rename to archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py diff --git a/archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py b/archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py similarity index 100% rename from archivebox/plugins/gallerydl/on_Snapshot__52_gallerydl.py rename to archivebox/plugins/gallerydl/on_Snapshot__64_gallerydl.bg.py diff --git a/archivebox/plugins/git/on_Snapshot__12_git.py b/archivebox/plugins/git/on_Snapshot__62_git.py similarity index 100% rename from archivebox/plugins/git/on_Snapshot__12_git.py rename to archivebox/plugins/git/on_Snapshot__62_git.py diff --git a/archivebox/plugins/headers/on_Snapshot__33_headers.js b/archivebox/plugins/headers/on_Snapshot__55_headers.js similarity index 100% rename from archivebox/plugins/headers/on_Snapshot__33_headers.js rename to archivebox/plugins/headers/on_Snapshot__55_headers.js diff --git a/archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py b/archivebox/plugins/htmltotext/on_Snapshot__57_htmltotext.py similarity index 100% rename from archivebox/plugins/htmltotext/on_Snapshot__54_htmltotext.py rename to archivebox/plugins/htmltotext/on_Snapshot__57_htmltotext.py diff --git a/archivebox/plugins/media/on_Snapshot__51_media.py b/archivebox/plugins/media/on_Snapshot__63_media.bg.py similarity index 100% rename from archivebox/plugins/media/on_Snapshot__51_media.py rename to archivebox/plugins/media/on_Snapshot__63_media.bg.py diff --git a/archivebox/plugins/mercury/on_Snapshot__53_mercury.py b/archivebox/plugins/mercury/on_Snapshot__56_mercury.py similarity index 100% rename from archivebox/plugins/mercury/on_Snapshot__53_mercury.py rename to archivebox/plugins/mercury/on_Snapshot__56_mercury.py diff --git a/archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py b/archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py similarity index 100% rename from archivebox/plugins/papersdl/on_Snapshot__54_papersdl.py rename to archivebox/plugins/papersdl/on_Snapshot__66_papersdl.bg.py diff --git a/archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js b/archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js similarity index 100% rename from archivebox/plugins/parse_dom_outlinks/on_Snapshot__40_parse_dom_outlinks.js rename to archivebox/plugins/parse_dom_outlinks/on_Snapshot__75_parse_dom_outlinks.js diff --git a/archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py b/archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py similarity index 100% rename from archivebox/plugins/parse_html_urls/on_Snapshot__60_parse_html_urls.py rename to archivebox/plugins/parse_html_urls/on_Snapshot__70_parse_html_urls.py diff --git a/archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py b/archivebox/plugins/parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py similarity index 100% rename from archivebox/plugins/parse_jsonl_urls/on_Snapshot__64_parse_jsonl_urls.py rename to archivebox/plugins/parse_jsonl_urls/on_Snapshot__74_parse_jsonl_urls.py diff --git a/archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py 
b/archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py similarity index 100% rename from archivebox/plugins/parse_netscape_urls/on_Snapshot__63_parse_netscape_urls.py rename to archivebox/plugins/parse_netscape_urls/on_Snapshot__73_parse_netscape_urls.py diff --git a/archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py b/archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py similarity index 100% rename from archivebox/plugins/parse_rss_urls/on_Snapshot__61_parse_rss_urls.py rename to archivebox/plugins/parse_rss_urls/on_Snapshot__72_parse_rss_urls.py diff --git a/archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py b/archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py similarity index 100% rename from archivebox/plugins/parse_txt_urls/on_Snapshot__62_parse_txt_urls.py rename to archivebox/plugins/parse_txt_urls/on_Snapshot__71_parse_txt_urls.py diff --git a/archivebox/plugins/pdf/on_Snapshot__35_pdf.js b/archivebox/plugins/pdf/on_Snapshot__52_pdf.js similarity index 100% rename from archivebox/plugins/pdf/on_Snapshot__35_pdf.js rename to archivebox/plugins/pdf/on_Snapshot__52_pdf.js diff --git a/archivebox/plugins/readability/on_Snapshot__52_readability.py b/archivebox/plugins/readability/on_Snapshot__55_readability.py similarity index 100% rename from archivebox/plugins/readability/on_Snapshot__52_readability.py rename to archivebox/plugins/readability/on_Snapshot__55_readability.py diff --git a/archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js b/archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js similarity index 100% rename from archivebox/plugins/screenshot/on_Snapshot__34_screenshot.js rename to archivebox/plugins/screenshot/on_Snapshot__51_screenshot.js diff --git a/archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py b/archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py similarity index 100% rename from archivebox/plugins/singlefile/on_Snapshot__37_singlefile.py rename to archivebox/plugins/singlefile/on_Snapshot__50_singlefile.py diff --git a/archivebox/plugins/title/on_Snapshot__32_title.js b/archivebox/plugins/title/on_Snapshot__54_title.js similarity index 100% rename from archivebox/plugins/title/on_Snapshot__32_title.js rename to archivebox/plugins/title/on_Snapshot__54_title.js diff --git a/archivebox/plugins/wget/on_Snapshot__50_wget.py b/archivebox/plugins/wget/on_Snapshot__61_wget.py similarity index 100% rename from archivebox/plugins/wget/on_Snapshot__50_wget.py rename to archivebox/plugins/wget/on_Snapshot__61_wget.py diff --git a/archivebox/workers/worker.py b/archivebox/workers/worker.py index ca67cccc..bcc24101 100644 --- a/archivebox/workers/worker.py +++ b/archivebox/workers/worker.py @@ -354,18 +354,42 @@ class ArchiveResultWorker(Worker): return ArchiveResult def get_queue(self) -> QuerySet: - """Get queue of ArchiveResults ready for processing.""" + """ + Get queue of ArchiveResults ready for processing. + + Uses step-based filtering: only claims ARs where hook step <= snapshot.current_step. + This ensures hooks execute in order (step 0 → 1 → 2 ... → 9). + """ from core.models import ArchiveResult + from archivebox.hooks import extract_step qs = super().get_queue() if self.plugin: qs = qs.filter(plugin=self.plugin) - # Note: Removed blocking logic since plugins have separate output directories - # and don't interfere with each other. Each plugin runs independently. 
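To make the step-based claiming rule concrete before the implementation below, a small ORM-free sketch of which queued ArchiveResults are claimable at a given snapshot step (hook names taken from the renumbering map in this patch; legacy ArchiveResults without a hook_name are always claimable, as handled in the real code):

```python
# Sketch only: step-gated claiming over plain hook-name strings.
import re

def extract_step(hook_name: str) -> int:
    match = re.search(r'__(\d{2})_', hook_name)
    return int(match.group(1)) // 10 if match else 9

def claimable(hook_names: list, snapshot_current_step: int) -> list:
    """Hook names whose step has already been reached, in deterministic order."""
    return sorted(h for h in hook_names if extract_step(h) <= snapshot_current_step)

queued = [
    'on_Snapshot__50_singlefile.py',
    'on_Snapshot__61_wget.py',
    'on_Snapshot__70_parse_html_urls.py',
]
# While the snapshot is at step 5, only the step-5 hook may be claimed:
assert claimable(queued, 5) == ['on_Snapshot__50_singlefile.py']
# After advancing to step 6, the wget hook becomes claimable too:
assert claimable(queued, 6) == ['on_Snapshot__50_singlefile.py',
                                'on_Snapshot__61_wget.py']
```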
+ # Step-based filtering: only process ARs whose step <= snapshot.current_step + # Since step is derived from hook_name, we filter in Python after initial query + # This is efficient because the base query already filters by retry_at and status - return qs + # Get candidate ARs + candidates = list(qs[:50]) # Limit to avoid loading too many + ready_pks = [] + + for ar in candidates: + if not ar.hook_name: + # Legacy ARs without hook_name - process them + ready_pks.append(ar.pk) + continue + + ar_step = extract_step(ar.hook_name) + snapshot_step = ar.snapshot.current_step + + if ar_step <= snapshot_step: + ready_pks.append(ar.pk) + + # Return filtered queryset ordered by hook_name (so earlier hooks run first within a step) + return ArchiveResult.objects.filter(pk__in=ready_pks).order_by('hook_name', 'retry_at') def process_item(self, obj) -> bool: """Process an ArchiveResult by running its plugin.""" From 6b3c87276fe92ead2caeea7c1d9d9ab77ab9c494 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 28 Dec 2025 13:48:11 +0000 Subject: [PATCH 2/5] Mark hook renumbering testing as complete in TODO All hook utility tests pass (extract_step, is_background_hook, discover_hooks). Model fields and methods verified (current_step, hook_name, advance_step_if_ready). --- TODO_hook_concurrency.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TODO_hook_concurrency.md b/TODO_hook_concurrency.md index 41eb5d95..877b295b 100644 --- a/TODO_hook_concurrency.md +++ b/TODO_hook_concurrency.md @@ -343,7 +343,7 @@ archivebox/plugins/{plugin_name}/ - [x] Renumber hooks per renumbering map below - [x] Add `.bg` suffix to long-running hooks (media, gallerydl, forumdl, papersdl) - [x] Move parse_* hooks to step 7 (70-79) -- [ ] Test all hooks still work after renumbering +- [x] Test all hooks still work after renumbering ## Migration Path From 32bcf0896d5576e4478309808576b3fdabdc42e4 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 28 Dec 2025 14:00:48 +0000 Subject: [PATCH 3/5] Restore missing folder utility functions Restored 10 folder status functions that were accidentally removed: - get_indexed_folders, get_archived_folders, get_unarchived_folders - get_present_folders, get_valid_folders, get_invalid_folders - get_duplicate_folders, get_orphaned_folders - get_corrupted_folders, get_unrecognized_folders These are required by archivebox_status.py for the status command. Added safety checks for non-existent archive directories. --- archivebox/misc/folders.py | 201 +++++++++++++++++++++++++++++++++++-- 1 file changed, 194 insertions(+), 7 deletions(-) diff --git a/archivebox/misc/folders.py b/archivebox/misc/folders.py index dd134dc1..ae364f04 100644 --- a/archivebox/misc/folders.py +++ b/archivebox/misc/folders.py @@ -1,8 +1,5 @@ """ -Folder utilities for ArchiveBox. - -Note: This file only contains legacy cleanup utilities. -The DB is the single source of truth - use Snapshot.objects queries for all status checks. +Folder status and integrity checking utilities for ArchiveBox. 
""" __package__ = 'archivebox.misc' @@ -11,11 +8,197 @@ import os import json import shutil from pathlib import Path -from typing import Tuple, List +from itertools import chain +from typing import Dict, Optional, List, Tuple, TYPE_CHECKING + +from django.db.models import QuerySet from archivebox.config import DATA_DIR, CONSTANTS from archivebox.misc.util import enforce_types +if TYPE_CHECKING: + from core.models import Snapshot + + +def _is_valid_snapshot(snapshot: 'Snapshot') -> bool: + """Check if a snapshot's data directory is valid""" + dir_exists = Path(snapshot.output_dir).exists() + index_exists = (Path(snapshot.output_dir) / "index.json").exists() + if not dir_exists: + return False + if dir_exists and not index_exists: + return False + if dir_exists and index_exists: + try: + with open(Path(snapshot.output_dir) / "index.json", 'r') as f: + data = json.load(f) + return snapshot.url == data.get('url') + except Exception: + pass + return False + + +def _is_corrupt_snapshot(snapshot: 'Snapshot') -> bool: + """Check if a snapshot's data directory is corrupted""" + if not Path(snapshot.output_dir).exists(): + return False + return not _is_valid_snapshot(snapshot) + + +def get_indexed_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']: + """indexed snapshots without checking archive status or data directory validity""" + return { + snapshot.output_dir: snapshot + for snapshot in snapshots.iterator(chunk_size=500) + } + + +def get_archived_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']: + """indexed snapshots that are archived with a valid data directory""" + return { + snapshot.output_dir: snapshot + for snapshot in snapshots.iterator(chunk_size=500) + if snapshot.is_archived + } + + +def get_unarchived_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']: + """indexed snapshots that are unarchived with no data directory or an empty data directory""" + return { + snapshot.output_dir: snapshot + for snapshot in snapshots.iterator(chunk_size=500) + if not snapshot.is_archived + } + + +def get_present_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]: + """dirs that actually exist in the archive/ folder""" + from core.models import Snapshot + + all_folders = {} + archive_dir = out_dir / CONSTANTS.ARCHIVE_DIR_NAME + if not archive_dir.exists(): + return all_folders + for entry in archive_dir.iterdir(): + if entry.is_dir(): + snapshot = None + try: + snapshot = Snapshot.objects.get(timestamp=entry.name) + except Snapshot.DoesNotExist: + pass + all_folders[entry.name] = snapshot + return all_folders + + +def get_valid_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']: + """dirs with a valid index matched to the main index and archived content""" + return { + snapshot.output_dir: snapshot + for snapshot in snapshots.iterator(chunk_size=500) + if _is_valid_snapshot(snapshot) + } + + +def get_invalid_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]: + """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized""" + duplicate = get_duplicate_folders(snapshots, out_dir=out_dir) + orphaned = get_orphaned_folders(snapshots, out_dir=out_dir) + corrupted = get_corrupted_folders(snapshots, out_dir=out_dir) + unrecognized = get_unrecognized_folders(snapshots, out_dir=out_dir) + return {**duplicate, **orphaned, **corrupted, **unrecognized} + + +def get_duplicate_folders(snapshots: 
QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]: + """dirs that conflict with other directories that have the same URL or timestamp""" + from core.models import Snapshot as SnapshotModel + + by_url: Dict[str, int] = {} + by_timestamp: Dict[str, int] = {} + duplicate_folders: Dict[str, Optional['Snapshot']] = {} + + archive_dir = CONSTANTS.ARCHIVE_DIR + if not archive_dir.exists(): + return duplicate_folders + + data_folders = ( + str(entry) + for entry in archive_dir.iterdir() + if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists() + ) + + for item in chain(snapshots.iterator(chunk_size=500), data_folders): + snapshot = None + if isinstance(item, str): + path = item + timestamp = Path(path).name + try: + snapshot = SnapshotModel.objects.get(timestamp=timestamp) + except SnapshotModel.DoesNotExist: + pass + else: + snapshot = item + path = snapshot.output_dir + + if snapshot: + by_timestamp[snapshot.timestamp] = by_timestamp.get(snapshot.timestamp, 0) + 1 + if by_timestamp[snapshot.timestamp] > 1: + duplicate_folders[path] = snapshot + + by_url[snapshot.url] = by_url.get(snapshot.url, 0) + 1 + if by_url[snapshot.url] > 1: + duplicate_folders[path] = snapshot + return duplicate_folders + + +def get_orphaned_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]: + """dirs that contain a valid index but aren't listed in the main index""" + orphaned_folders: Dict[str, Optional['Snapshot']] = {} + + archive_dir = CONSTANTS.ARCHIVE_DIR + if not archive_dir.exists(): + return orphaned_folders + + for entry in archive_dir.iterdir(): + if entry.is_dir(): + index_path = entry / "index.json" + if index_path.exists() and not snapshots.filter(timestamp=entry.name).exists(): + orphaned_folders[str(entry)] = None + return orphaned_folders + + +def get_corrupted_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']: + """dirs that exist but have corrupted/invalid index files""" + corrupted: Dict[str, 'Snapshot'] = {} + for snapshot in snapshots.iterator(chunk_size=500): + if _is_corrupt_snapshot(snapshot): + corrupted[snapshot.output_dir] = snapshot + return corrupted + + +def get_unrecognized_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, None]: + """dirs that don't contain recognizable archive data and aren't listed in the main index""" + unrecognized_folders: Dict[str, None] = {} + + archive_dir = Path(out_dir) / CONSTANTS.ARCHIVE_DIR_NAME + if not archive_dir.exists(): + return unrecognized_folders + + for entry in archive_dir.iterdir(): + if entry.is_dir(): + index_exists = (entry / "index.json").exists() + + if index_exists: + try: + with open(entry / "index.json", 'r') as f: + json.load(f) + except Exception: + unrecognized_folders[str(entry)] = None + else: + timestamp = entry.name + if not snapshots.filter(timestamp=timestamp).exists(): + unrecognized_folders[str(entry)] = None + return unrecognized_folders + @enforce_types def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], List[str]]: @@ -27,7 +210,11 @@ def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], L """ fixed = [] cant_fix = [] - for entry in os.scandir(out_dir / CONSTANTS.ARCHIVE_DIR_NAME): + archive_dir = out_dir / CONSTANTS.ARCHIVE_DIR_NAME + if not archive_dir.exists(): + return fixed, cant_fix + + for entry in os.scandir(archive_dir): if entry.is_dir(follow_symlinks=True): index_path = Path(entry.path) / 'index.json' if index_path.exists(): @@ -43,7 +230,7 
@@ def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], L continue if not entry.path.endswith(f'/{timestamp}'): - dest = out_dir / CONSTANTS.ARCHIVE_DIR_NAME / timestamp + dest = archive_dir / timestamp if dest.exists(): cant_fix.append(entry.path) else: From 767458e4e04017114a074f63b7bb59d4112cea97 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 28 Dec 2025 19:16:52 +0000 Subject: [PATCH 4/5] Revert "Restore missing folder utility functions" This reverts commit 32bcf0896d5576e4478309808576b3fdabdc42e4. --- archivebox/misc/folders.py | 201 ++----------------------------------- 1 file changed, 7 insertions(+), 194 deletions(-) diff --git a/archivebox/misc/folders.py b/archivebox/misc/folders.py index ae364f04..dd134dc1 100644 --- a/archivebox/misc/folders.py +++ b/archivebox/misc/folders.py @@ -1,5 +1,8 @@ """ -Folder status and integrity checking utilities for ArchiveBox. +Folder utilities for ArchiveBox. + +Note: This file only contains legacy cleanup utilities. +The DB is the single source of truth - use Snapshot.objects queries for all status checks. """ __package__ = 'archivebox.misc' @@ -8,197 +11,11 @@ import os import json import shutil from pathlib import Path -from itertools import chain -from typing import Dict, Optional, List, Tuple, TYPE_CHECKING - -from django.db.models import QuerySet +from typing import Tuple, List from archivebox.config import DATA_DIR, CONSTANTS from archivebox.misc.util import enforce_types -if TYPE_CHECKING: - from core.models import Snapshot - - -def _is_valid_snapshot(snapshot: 'Snapshot') -> bool: - """Check if a snapshot's data directory is valid""" - dir_exists = Path(snapshot.output_dir).exists() - index_exists = (Path(snapshot.output_dir) / "index.json").exists() - if not dir_exists: - return False - if dir_exists and not index_exists: - return False - if dir_exists and index_exists: - try: - with open(Path(snapshot.output_dir) / "index.json", 'r') as f: - data = json.load(f) - return snapshot.url == data.get('url') - except Exception: - pass - return False - - -def _is_corrupt_snapshot(snapshot: 'Snapshot') -> bool: - """Check if a snapshot's data directory is corrupted""" - if not Path(snapshot.output_dir).exists(): - return False - return not _is_valid_snapshot(snapshot) - - -def get_indexed_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']: - """indexed snapshots without checking archive status or data directory validity""" - return { - snapshot.output_dir: snapshot - for snapshot in snapshots.iterator(chunk_size=500) - } - - -def get_archived_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']: - """indexed snapshots that are archived with a valid data directory""" - return { - snapshot.output_dir: snapshot - for snapshot in snapshots.iterator(chunk_size=500) - if snapshot.is_archived - } - - -def get_unarchived_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']: - """indexed snapshots that are unarchived with no data directory or an empty data directory""" - return { - snapshot.output_dir: snapshot - for snapshot in snapshots.iterator(chunk_size=500) - if not snapshot.is_archived - } - - -def get_present_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]: - """dirs that actually exist in the archive/ folder""" - from core.models import Snapshot - - all_folders = {} - archive_dir = out_dir / CONSTANTS.ARCHIVE_DIR_NAME - if not archive_dir.exists(): - return all_folders - for entry in 
archive_dir.iterdir(): - if entry.is_dir(): - snapshot = None - try: - snapshot = Snapshot.objects.get(timestamp=entry.name) - except Snapshot.DoesNotExist: - pass - all_folders[entry.name] = snapshot - return all_folders - - -def get_valid_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']: - """dirs with a valid index matched to the main index and archived content""" - return { - snapshot.output_dir: snapshot - for snapshot in snapshots.iterator(chunk_size=500) - if _is_valid_snapshot(snapshot) - } - - -def get_invalid_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]: - """dirs that are invalid for any reason: corrupted/duplicate/orphaned/unrecognized""" - duplicate = get_duplicate_folders(snapshots, out_dir=out_dir) - orphaned = get_orphaned_folders(snapshots, out_dir=out_dir) - corrupted = get_corrupted_folders(snapshots, out_dir=out_dir) - unrecognized = get_unrecognized_folders(snapshots, out_dir=out_dir) - return {**duplicate, **orphaned, **corrupted, **unrecognized} - - -def get_duplicate_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]: - """dirs that conflict with other directories that have the same URL or timestamp""" - from core.models import Snapshot as SnapshotModel - - by_url: Dict[str, int] = {} - by_timestamp: Dict[str, int] = {} - duplicate_folders: Dict[str, Optional['Snapshot']] = {} - - archive_dir = CONSTANTS.ARCHIVE_DIR - if not archive_dir.exists(): - return duplicate_folders - - data_folders = ( - str(entry) - for entry in archive_dir.iterdir() - if entry.is_dir() and not snapshots.filter(timestamp=entry.name).exists() - ) - - for item in chain(snapshots.iterator(chunk_size=500), data_folders): - snapshot = None - if isinstance(item, str): - path = item - timestamp = Path(path).name - try: - snapshot = SnapshotModel.objects.get(timestamp=timestamp) - except SnapshotModel.DoesNotExist: - pass - else: - snapshot = item - path = snapshot.output_dir - - if snapshot: - by_timestamp[snapshot.timestamp] = by_timestamp.get(snapshot.timestamp, 0) + 1 - if by_timestamp[snapshot.timestamp] > 1: - duplicate_folders[path] = snapshot - - by_url[snapshot.url] = by_url.get(snapshot.url, 0) + 1 - if by_url[snapshot.url] > 1: - duplicate_folders[path] = snapshot - return duplicate_folders - - -def get_orphaned_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, Optional['Snapshot']]: - """dirs that contain a valid index but aren't listed in the main index""" - orphaned_folders: Dict[str, Optional['Snapshot']] = {} - - archive_dir = CONSTANTS.ARCHIVE_DIR - if not archive_dir.exists(): - return orphaned_folders - - for entry in archive_dir.iterdir(): - if entry.is_dir(): - index_path = entry / "index.json" - if index_path.exists() and not snapshots.filter(timestamp=entry.name).exists(): - orphaned_folders[str(entry)] = None - return orphaned_folders - - -def get_corrupted_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, 'Snapshot']: - """dirs that exist but have corrupted/invalid index files""" - corrupted: Dict[str, 'Snapshot'] = {} - for snapshot in snapshots.iterator(chunk_size=500): - if _is_corrupt_snapshot(snapshot): - corrupted[snapshot.output_dir] = snapshot - return corrupted - - -def get_unrecognized_folders(snapshots: QuerySet, out_dir: Path = DATA_DIR) -> Dict[str, None]: - """dirs that don't contain recognizable archive data and aren't listed in the main index""" - unrecognized_folders: Dict[str, None] = {} - - archive_dir = 
Path(out_dir) / CONSTANTS.ARCHIVE_DIR_NAME - if not archive_dir.exists(): - return unrecognized_folders - - for entry in archive_dir.iterdir(): - if entry.is_dir(): - index_exists = (entry / "index.json").exists() - - if index_exists: - try: - with open(entry / "index.json", 'r') as f: - json.load(f) - except Exception: - unrecognized_folders[str(entry)] = None - else: - timestamp = entry.name - if not snapshots.filter(timestamp=timestamp).exists(): - unrecognized_folders[str(entry)] = None - return unrecognized_folders - @enforce_types def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], List[str]]: @@ -210,11 +27,7 @@ def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], L """ fixed = [] cant_fix = [] - archive_dir = out_dir / CONSTANTS.ARCHIVE_DIR_NAME - if not archive_dir.exists(): - return fixed, cant_fix - - for entry in os.scandir(archive_dir): + for entry in os.scandir(out_dir / CONSTANTS.ARCHIVE_DIR_NAME): if entry.is_dir(follow_symlinks=True): index_path = Path(entry.path) / 'index.json' if index_path.exists(): @@ -230,7 +43,7 @@ def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], L continue if not entry.path.endswith(f'/{timestamp}'): - dest = archive_dir / timestamp + dest = out_dir / CONSTANTS.ARCHIVE_DIR_NAME / timestamp if dest.exists(): cant_fix.append(entry.path) else: From 057b49ad85011286b2eace4631d20df7f17549d6 Mon Sep 17 00:00:00 2001 From: Claude Date: Sun, 28 Dec 2025 19:19:03 +0000 Subject: [PATCH 5/5] Update status command to use DB as source of truth Remove imports of deleted folder utility functions and rewrite status command to query Snapshot model directly. This aligns with the fs_version refactor where the DB is the single source of truth. - Use Snapshot.objects queries for indexed/archived/unarchived counts - Scan filesystem directly for present/orphaned directory counts - Simplify output to focus on essential status information --- archivebox/cli/archivebox_status.py | 68 ++++++++++++----------------- 1 file changed, 27 insertions(+), 41 deletions(-) diff --git a/archivebox/cli/archivebox_status.py b/archivebox/cli/archivebox_status.py index e37f01ba..de5ada95 100644 --- a/archivebox/cli/archivebox_status.py +++ b/archivebox/cli/archivebox_status.py @@ -11,18 +11,6 @@ from archivebox.misc.util import enforce_types, docstring from archivebox.config import DATA_DIR, CONSTANTS, ARCHIVE_DIR from archivebox.config.common import SHELL_CONFIG from archivebox.misc.legacy import parse_json_links_details -from archivebox.misc.folders import ( - get_indexed_folders, - get_archived_folders, - get_invalid_folders, - get_unarchived_folders, - get_present_folders, - get_valid_folders, - get_duplicate_folders, - get_orphaned_folders, - get_corrupted_folders, - get_unrecognized_folders, -) from archivebox.misc.system import get_dir_size from archivebox.misc.logging_util import printable_filesize @@ -55,42 +43,40 @@ def status(out_dir: Path=DATA_DIR) -> None: size = printable_filesize(num_bytes) print(f' Size: {size} across {num_files} files in {num_dirs} directories') - num_indexed = len(get_indexed_folders(links, out_dir=out_dir)) - num_archived = len(get_archived_folders(links, out_dir=out_dir)) - num_unarchived = len(get_unarchived_folders(links, out_dir=out_dir)) - print(f' > indexed: {num_indexed}'.ljust(36), f'({get_indexed_folders.__doc__})') - print(f' > archived: {num_archived}'.ljust(36), f'({get_archived_folders.__doc__})') - print(f' > unarchived: {num_unarchived}'.ljust(36), 
f'({get_unarchived_folders.__doc__})') - - num_present = len(get_present_folders(links, out_dir=out_dir)) - num_valid = len(get_valid_folders(links, out_dir=out_dir)) + # Use DB as source of truth for snapshot status + num_indexed = links.count() + num_archived = links.filter(status='archived').count() or links.exclude(downloaded_at=None).count() + num_unarchived = links.filter(status='queued').count() or links.filter(downloaded_at=None).count() + print(f' > indexed: {num_indexed}'.ljust(36), '(total snapshots in DB)') + print(f' > archived: {num_archived}'.ljust(36), '(snapshots with archived content)') + print(f' > unarchived: {num_unarchived}'.ljust(36), '(snapshots pending archiving)') + + # Count directories on filesystem + num_present = 0 + orphaned_dirs = [] + if ARCHIVE_DIR.exists(): + for entry in ARCHIVE_DIR.iterdir(): + if entry.is_dir(): + num_present += 1 + if not links.filter(timestamp=entry.name).exists(): + orphaned_dirs.append(str(entry)) + + num_valid = min(num_present, num_indexed) # approximate print() - print(f' > present: {num_present}'.ljust(36), f'({get_present_folders.__doc__})') - print(f' > [green]valid:[/green] {num_valid}'.ljust(36), f' ({get_valid_folders.__doc__})') - - duplicate = get_duplicate_folders(links, out_dir=out_dir) - orphaned = get_orphaned_folders(links, out_dir=out_dir) - corrupted = get_corrupted_folders(links, out_dir=out_dir) - unrecognized = get_unrecognized_folders(links, out_dir=out_dir) - num_invalid = len({**duplicate, **orphaned, **corrupted, **unrecognized}) - print(f' > [red]invalid:[/red] {num_invalid}'.ljust(36), f' ({get_invalid_folders.__doc__})') - print(f' > duplicate: {len(duplicate)}'.ljust(36), f'({get_duplicate_folders.__doc__})') - print(f' > orphaned: {len(orphaned)}'.ljust(36), f'({get_orphaned_folders.__doc__})') - print(f' > corrupted: {len(corrupted)}'.ljust(36), f'({get_corrupted_folders.__doc__})') - print(f' > unrecognized: {len(unrecognized)}'.ljust(36), f'({get_unrecognized_folders.__doc__})') + print(f' > present: {num_present}'.ljust(36), '(directories in archive/)') + print(f' > [green]valid:[/green] {num_valid}'.ljust(36), ' (directories with matching DB entry)') + + num_orphaned = len(orphaned_dirs) + print(f' > [red]orphaned:[/red] {num_orphaned}'.ljust(36), ' (directories without matching DB entry)') if num_indexed: - print(' [violet]Hint:[/violet] You can list link data directories by status like so:') - print(' [green]archivebox list --status= (e.g. indexed, corrupted, archived, etc.)[/green]') + print(' [violet]Hint:[/violet] You can list snapshots by status like so:') + print(' [green]archivebox list --status= (e.g. archived, queued, etc.)[/green]') - if orphaned: + if orphaned_dirs: print(' [violet]Hint:[/violet] To automatically import orphaned data directories into the main index, run:') print(' [green]archivebox init[/green]') - if num_invalid: - print(' [violet]Hint:[/violet] You may need to manually remove or fix some invalid data directories, afterwards make sure to run:') - print(' [green]archivebox init[/green]') - print() print('[green]\\[*] Scanning recent archive changes and user logins:[/green]') print(f'[yellow] {CONSTANTS.LOGS_DIR}/*[/yellow]')
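As a closing sanity check on the renumbering introduced in PATCH 1, a throwaway snippet that lists every Snapshot hook on disk with its derived step and background flag. It assumes it is run from the repository root and re-implements the two helpers inline so it needs no imports from the package:

```python
# Throwaway check: print step and fg/bg flag for each on_Snapshot__* hook.
import re
from pathlib import Path

def extract_step(hook_name: str) -> int:
    match = re.search(r'__(\d{2})_', hook_name)
    return int(match.group(1)) // 10 if match else 9

def is_background_hook(hook_name: str) -> bool:
    return '.bg.' in hook_name

for hook in sorted(Path('archivebox/plugins').glob('*/on_Snapshot__*.*')):
    flag = 'bg' if is_background_hook(hook.name) else 'fg'
    print(f'step {extract_step(hook.name)} [{flag}] {hook.parent.name}/{hook.name}')

# Expected after this series: steps 5-7 hold the renumbered extractors, e.g.
#   step 5 [fg] singlefile/on_Snapshot__50_singlefile.py
#   step 6 [bg] media/on_Snapshot__63_media.bg.py
#   step 7 [fg] parse_html_urls/on_Snapshot__70_parse_html_urls.py
```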