Resolve crawl provider dependencies lazily

Nick Sweeting
2026-03-15 10:18:49 -07:00
parent d4be507a6b
commit 47f540c094
3 changed files with 86 additions and 51 deletions

View File

@@ -352,18 +352,25 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             if not binary_names:
                 return
 
-            pending_binaries = Binary.objects.filter(
-                machine=machine,
-                name__in=binary_names,
-            ).exclude(
-                status=Binary.StatusChoices.INSTALLED,
-            ).order_by('retry_at')
+            max_attempts = max(2, len(binary_names))
 
-            for binary in pending_binaries:
-                try:
-                    binary.sm.tick()
-                except Exception:
-                    continue
+            for _ in range(max_attempts):
+                pending_binaries = list(
+                    Binary.objects.filter(
+                        machine=machine,
+                        name__in=binary_names,
+                    ).exclude(
+                        status=Binary.StatusChoices.INSTALLED,
+                    ).order_by('retry_at', 'name')
+                )
+                if not pending_binaries:
+                    return
+
+                for binary in pending_binaries:
+                    try:
+                        binary.sm.tick()
+                    except Exception:
+                        continue
 
             unresolved_binaries = list(
                 Binary.objects.filter(
@@ -382,16 +389,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                     f'Crawl dependencies failed to install before continuing: {binary_details}'
                 )
 
-        # Discover and run on_Crawl hooks
-        with open(debug_log, 'a') as f:
-            f.write(f'Discovering Crawl hooks...\n')
-            f.flush()
-        hooks = discover_hooks('Crawl', config=config)
-        with open(debug_log, 'a') as f:
-            f.write(f'Found {len(hooks)} hooks\n')
-            f.flush()
+        executed_crawl_hooks: set[str] = set()
+
+        def run_crawl_hook(hook: Path) -> set[str]:
+            executed_crawl_hooks.add(str(hook))
 
-        for hook in hooks:
             with open(debug_log, 'a') as f:
                 f.write(f'Running hook: {hook.name}\n')
                 f.flush()
@@ -400,38 +402,34 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             output_dir = self.output_dir / plugin_name
             output_dir.mkdir(parents=True, exist_ok=True)
 
             # Run hook using Process.launch() - returns Process model
             process = run_hook(
                 hook,
                 output_dir=output_dir,
                 config=config,
                 crawl_id=str(self.id),
-                source_url=self.urls, # Pass full newline-separated URLs
+                source_url=self.urls,
             )
 
             with open(debug_log, 'a') as f:
                 f.write(f'Hook {hook.name} completed with status={process.status}\n')
                 f.flush()
 
             hook_elapsed = time.time() - hook_start
-            if hook_elapsed > 0.5: # Log slow hooks
+            if hook_elapsed > 0.5:
                 print(f'[yellow]⏱️ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]')
 
             # Finite background hooks must finish before snapshots start so they can
             # emit dependency records (Binary, Machine config, etc.).
             if process.status == process.StatusChoices.RUNNING:
                 if not is_finite_background_hook(hook.name):
-                    continue
+                    return set()
                 try:
                     process.wait(timeout=process.timeout)
                 except Exception:
-                    continue
+                    return set()
 
             # Foreground hook - process JSONL records
             from archivebox.hooks import extract_records_from_process
             records = extract_records_from_process(process)
             if records:
                 print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]')
-                for record in records[:3]: # Show first 3
+                for record in records[:3]:
                     print(f' Record: type={record.get("type")}, keys={list(record.keys())[:5]}')
 
             overrides = {'crawl': self}
             stats = process_hook_records(records, overrides=overrides)
@@ -446,7 +444,60 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             hook_binary_names.discard('')
             if hook_binary_names:
                 declared_binary_names.update(hook_binary_names)
-                install_declared_binaries(hook_binary_names)
+
+            return hook_binary_names
+
+        def resolve_provider_binaries(binary_names: set[str]) -> set[str]:
+            if not binary_names:
+                return set()
+
+            resolved_binary_names = set(binary_names)
+            while True:
+                unresolved_binaries = list(
+                    Binary.objects.filter(
+                        machine=machine,
+                        name__in=resolved_binary_names,
+                    ).exclude(
+                        status=Binary.StatusChoices.INSTALLED,
+                    ).order_by('name')
+                )
+                if not unresolved_binaries:
+                    return resolved_binary_names
+
+                needed_provider_names: set[str] = set()
+                for binary in unresolved_binaries:
+                    allowed_binproviders = binary._allowed_binproviders()
+                    if allowed_binproviders is None:
+                        continue
+                    needed_provider_names.update(allowed_binproviders)
+
+                if not needed_provider_names:
+                    return resolved_binary_names
+
+                provider_hooks = [
+                    hook
+                    for hook in discover_hooks('Crawl', filter_disabled=False, config=config)
+                    if hook.parent.name in needed_provider_names and str(hook) not in executed_crawl_hooks
+                ]
+                if not provider_hooks:
+                    return resolved_binary_names
+
+                for hook in provider_hooks:
+                    resolved_binary_names.update(run_crawl_hook(hook))
+
+        # Discover and run on_Crawl hooks
+        with open(debug_log, 'a') as f:
+            f.write(f'Discovering Crawl hooks...\n')
+            f.flush()
+        hooks = discover_hooks('Crawl', config=config)
+        with open(debug_log, 'a') as f:
+            f.write(f'Found {len(hooks)} hooks\n')
+            f.flush()
+
+        for hook in hooks:
+            hook_binary_names = run_crawl_hook(hook)
+            if hook_binary_names:
+                install_declared_binaries(resolve_provider_binaries(hook_binary_names))
 
         # Safety check: don't create snapshots if any crawl-declared dependency
         # is still unresolved after all crawl hooks have run.
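
The run_crawl_hook / resolve_provider_binaries split above boils down to a small fixed-point loop: run a hook, see which of its declared binaries are still uninstalled, run only the provider plugins that could install them (each at most once, tracked via executed_crawl_hooks), and repeat until nothing is missing or no untried provider remains. The sketch below shows just that control flow, with plain sets and dicts standing in for the Binary model, Django queries, and hook Processes; every name in it is illustrative rather than an ArchiveBox API.

# Minimal standalone sketch of the lazy-resolution loop, assuming made-up data.
INSTALLED: set[str] = {'node'}                  # binaries already present on the machine
PROVIDERS: dict[str, set[str]] = {              # provider plugin -> binaries it can install
    'npm': {'single-file', 'readability-extractor'},
    'pip': {'yt-dlp'},
}
executed_hooks: set[str] = set()                # provider hooks that have already run

def run_provider_hook(provider: str) -> set[str]:
    """Pretend to run a provider plugin's on_Crawl hook: install what it can."""
    executed_hooks.add(provider)
    INSTALLED.update(PROVIDERS[provider])
    return set()                                # a real hook could declare more binaries here

def resolve_provider_binaries(binary_names: set[str]) -> set[str]:
    resolved = set(binary_names)
    while True:
        unresolved = {name for name in resolved if name not in INSTALLED}
        if not unresolved:
            return resolved                     # everything installed: done
        # Providers that could supply a still-missing binary and have not run yet.
        needed = {
            provider for provider, supplies in PROVIDERS.items()
            if supplies & unresolved and provider not in executed_hooks
        }
        if not needed:
            return resolved                     # nothing left to try: caller decides how to fail
        for provider in sorted(needed):
            resolved.update(run_provider_hook(provider))

print(resolve_provider_binaries({'single-file', 'yt-dlp'}))   # both get installed via npm + pip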

View File

@@ -622,19 +622,6 @@ def get_plugins() -> List[str]:
     return sorted(set(plugins))
 
-@lru_cache(maxsize=1)
-def get_binary_provider_plugins() -> List[str]:
-    """Get plugin names that expose Binary hooks and act as provider plugins."""
-    providers = []
-    for plugin_dir in iter_plugin_dirs():
-        has_binary_hooks = any(plugin_dir.glob('on_Binary__*.*'))
-        if has_binary_hooks:
-            providers.append(plugin_dir.name)
-    return sorted(set(providers))
-
 def get_parser_plugins():
     """
     Get list of parser plugins by discovering parse_*_urls hooks.
@@ -926,12 +913,10 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
     plugins_whitelist = config.get('PLUGINS', '')
     if plugins_whitelist:
         # PLUGINS whitelist is specified - include transitive required_plugins from
-        # config.json as well as binary provider plugins. Provider plugins may also
-        # expose early on_Crawl hooks (e.g. npm -> install node/npm) that are
-        # required before a selected extractor's Binary hooks can succeed.
+        # config.json so selecting a plugin also enables its declared plugin-level
+        # dependencies (e.g. singlefile -> chrome).
         plugin_configs = discover_plugin_configs()
         plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()}
-        plugin_names.update(provider.lower() for provider in get_binary_provider_plugins())
         pending = list(plugin_names)
         while pending:
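
The revised comment describes a plain worklist expansion: the PLUGINS whitelist now grows only by following each selected plugin's declared required_plugins from its config.json, and provider plugins such as npm are no longer added automatically. A minimal sketch of that expansion, with a made-up PLUGIN_CONFIGS mapping standing in for the parsed config.json files (not the ArchiveBox implementation):

PLUGIN_CONFIGS: dict[str, dict] = {
    'singlefile': {'required_plugins': ['chrome']},
    'chrome': {'required_plugins': []},
    'npm': {'required_plugins': []},            # provider plugin: only enabled if declared
}

def expand_whitelist(plugins_setting: str) -> set[str]:
    plugin_names = {p.strip().lower() for p in plugins_setting.split(',') if p.strip()}
    pending = list(plugin_names)
    while pending:
        name = pending.pop()
        for dep in PLUGIN_CONFIGS.get(name, {}).get('required_plugins', []):
            if dep not in plugin_names:
                plugin_names.add(dep)
                pending.append(dep)
    return plugin_names

print(expand_whitelist('singlefile'))           # {'singlefile', 'chrome'} -- npm stays out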

View File

@@ -263,8 +263,8 @@ class TestHookDiscovery(unittest.TestCase):
         hook_names = [hook.name for hook in hooks]
         self.assertIn('on_Binary__10_npm_install.py', hook_names)
 
-    def test_discover_crawl_hooks_keeps_binary_provider_dependencies_enabled(self):
-        """Provider crawl hooks should remain enabled when a whitelisted plugin depends on them transitively."""
+    def test_discover_crawl_hooks_only_include_declared_plugin_dependencies(self):
+        """Crawl hook discovery should include required_plugins without broadening to provider plugins."""
         responses_dir = self.plugins_dir / 'responses'
         responses_dir.mkdir()
         (responses_dir / 'config.json').write_text(
@@ -291,13 +291,12 @@ class TestHookDiscovery(unittest.TestCase):
         from archivebox import hooks as hooks_module
         hooks_module.get_plugins.cache_clear()
-        hooks_module.get_binary_provider_plugins.cache_clear()
 
         with patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir), patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins'):
             hooks = hooks_module.discover_hooks('Crawl', config={'PLUGINS': 'responses'})
             hook_names = [hook.name for hook in hooks]
             self.assertIn('on_Crawl__70_chrome_install.finite.bg.py', hook_names)
-            self.assertIn('on_Crawl__00_npm_install.py', hook_names)
+            self.assertNotIn('on_Crawl__00_npm_install.py', hook_names)
class TestGetExtractorName(unittest.TestCase):
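
The updated test asserts the discovery side of the same change: with PLUGINS set to 'responses', hooks from plugins outside the expanded whitelist (npm here) should no longer be returned, while hooks from declared dependencies (chrome) still are. A rough illustration of that filtering, assuming hooks live in per-plugin directories so hook.parent.name identifies the owning plugin (as the provider_hooks filter above relies on); this is not the real discover_hooks:

from pathlib import Path

def filter_hooks(hook_paths: list[Path], enabled_plugins: set[str]) -> list[Path]:
    # Keep only hooks whose plugin directory is in the expanded whitelist.
    return [hook for hook in hook_paths if hook.parent.name in enabled_plugins]

discovered = [
    Path('plugins/chrome/on_Crawl__70_chrome_install.finite.bg.py'),
    Path('plugins/npm/on_Crawl__00_npm_install.py'),
]
enabled = {'responses', 'chrome'}               # 'responses' plus its declared dependency
print([h.name for h in filter_hooks(discovered, enabled)])
# ['on_Crawl__70_chrome_install.finite.bg.py'] -- the npm hook is filtered out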