From d4be507a6bf93f0bff334d593e30317aabfb5aaf Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Sun, 15 Mar 2026 09:49:45 -0700
Subject: [PATCH] Keep provider plugins enabled under whitelists

---
Reviewer notes (free text in this section is ignored when the patch is applied):

What this patch does
* archivebox/hooks.py adds get_binary_provider_plugins(), which walks
  iter_plugin_dirs() and returns the sorted, de-duplicated names of plugin
  directories containing at least one file matching 'on_Binary__*.*'.
  The result is memoized with lru_cache(maxsize=1).
* In get_plugin_special_config(), when config['PLUGINS'] is non-empty, the
  lowercased provider names are added to the whitelist set before the
  existing "pending" loop runs over it.
* archivebox/tests/test_hooks.py makes an existing chrome_dir.mkdir() call
  tolerant of a pre-existing directory (exist_ok=True) and adds a test which
  whitelists only 'responses' (required_plugins: ["chrome"]) and asserts that
  both the chrome and the npm on_Crawl hooks are discovered.

Points to confirm
* NOTE(review): every provider plugin is seeded into the whitelist whenever a
  whitelist is set, not only providers needed by the selected plugins.
  Presumably intentional given the subject line - confirm.
* NOTE(review): because of lru_cache, each call returns the same list object
  and the directory scan is not repeated until cache_clear() is called. The
  caller visible here only iterates over the list; other callers must not
  mutate it.
* NOTE(review): the new test clears the get_plugins and
  get_binary_provider_plugins caches before patching the plugin directories
  but not afterwards. If discover_hooks() populates those caches (the hunks
  do not show this), values computed from the temporary directory could
  remain cached for later tests, unless setUp/tearDown (not visible in this
  patch) already resets them. Consider self.addCleanup(...cache_clear).

 archivebox/hooks.py            | 19 ++++++++++++++++-
 archivebox/tests/test_hooks.py | 38 +++++++++++++++++++++++++++++++++-
 2 files changed, 55 insertions(+), 2 deletions(-)

diff --git a/archivebox/hooks.py b/archivebox/hooks.py
index f20cbb23..51f1f42d 100644
--- a/archivebox/hooks.py
+++ b/archivebox/hooks.py
@@ -622,6 +622,19 @@ def get_plugins() -> List[str]:
     return sorted(set(plugins))
 
 
+@lru_cache(maxsize=1)
+def get_binary_provider_plugins() -> List[str]:
+    """Get plugin names that expose Binary hooks and act as provider plugins."""
+    providers = []
+
+    for plugin_dir in iter_plugin_dirs():
+        has_binary_hooks = any(plugin_dir.glob('on_Binary__*.*'))
+        if has_binary_hooks:
+            providers.append(plugin_dir.name)
+
+    return sorted(set(providers))
+
+
 def get_parser_plugins() -> List[str]:
     """
     Get list of parser plugins by discovering parse_*_urls hooks.
@@ -912,9 +925,13 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
     # Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon)
     plugins_whitelist = config.get('PLUGINS', '')
     if plugins_whitelist:
-        # PLUGINS whitelist is specified - include transitive required_plugins from config.json
+        # PLUGINS whitelist is specified - include transitive required_plugins from
+        # config.json as well as binary provider plugins. Provider plugins may also
+        # expose early on_Crawl hooks (e.g. npm -> install node/npm) that are
+        # required before a selected extractor's Binary hooks can succeed.
         plugin_configs = discover_plugin_configs()
         plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()}
+        plugin_names.update(provider.lower() for provider in get_binary_provider_plugins())
         pending = list(plugin_names)
 
         while pending:
diff --git a/archivebox/tests/test_hooks.py b/archivebox/tests/test_hooks.py
index fae05a27..9435d328 100755
--- a/archivebox/tests/test_hooks.py
+++ b/archivebox/tests/test_hooks.py
@@ -179,7 +179,7 @@ class TestHookDiscovery(unittest.TestCase):
         (wget_dir / 'on_Crawl__10_wget_install.finite.bg.py').write_text('# install hook')
 
         chrome_dir = self.plugins_dir / 'chrome'
-        chrome_dir.mkdir()
+        chrome_dir.mkdir(exist_ok=True)
         (chrome_dir / 'on_Snapshot__20_chrome_tab.daemon.bg.js').write_text('// background hook')
 
         consolelog_dir = self.plugins_dir / 'consolelog'
@@ -263,6 +263,42 @@ class TestHookDiscovery(unittest.TestCase):
         hook_names = [hook.name for hook in hooks]
         self.assertIn('on_Binary__10_npm_install.py', hook_names)
 
+    def test_discover_crawl_hooks_keeps_binary_provider_dependencies_enabled(self):
+        """Provider crawl hooks should remain enabled when a whitelisted plugin depends on them transitively."""
+        responses_dir = self.plugins_dir / 'responses'
+        responses_dir.mkdir()
+        (responses_dir / 'config.json').write_text(
+            json.dumps(
+                {
+                    "type": "object",
+                    "required_plugins": ["chrome"],
+                    "properties": {},
+                }
+            )
+        )
+
+        chrome_dir = self.plugins_dir / 'chrome'
+        chrome_dir.mkdir(exist_ok=True)
+        (chrome_dir / 'config.json').write_text('{"type": "object", "properties": {}}')
+        (chrome_dir / 'on_Crawl__70_chrome_install.finite.bg.py').write_text('# chrome crawl hook')
+
+        npm_dir = self.plugins_dir / 'npm'
+        npm_dir.mkdir()
+        (npm_dir / 'on_Binary__10_npm_install.py').write_text('# npm binary hook')
+        (npm_dir / 'on_Crawl__00_npm_install.py').write_text('# npm crawl hook')
+        (npm_dir / 'config.json').write_text('{"type": "object", "properties": {}}')
+
+        from archivebox import hooks as hooks_module
+
+        hooks_module.get_plugins.cache_clear()
+        hooks_module.get_binary_provider_plugins.cache_clear()
+        with patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir), patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins'):
+            hooks = hooks_module.discover_hooks('Crawl', config={'PLUGINS': 'responses'})
+
+        hook_names = [hook.name for hook in hooks]
+        self.assertIn('on_Crawl__70_chrome_install.finite.bg.py', hook_names)
+        self.assertIn('on_Crawl__00_npm_install.py', hook_names)
+
 
 class TestGetExtractorName(unittest.TestCase):
     """Test get_extractor_name() function."""