Keep provider plugins enabled under whitelists

This commit is contained in:
Nick Sweeting
2026-03-15 09:49:45 -07:00
parent 82bfd7e655
commit d4be507a6b
2 changed files with 55 additions and 2 deletions

View File

@@ -622,6 +622,19 @@ def get_plugins() -> List[str]:
return sorted(set(plugins)) return sorted(set(plugins))
@lru_cache(maxsize=1)
def get_binary_provider_plugins() -> List[str]:
    """
    List the plugins that act as binary providers.

    A plugin counts as a provider when its directory (as yielded by
    ``iter_plugin_dirs()``) contains at least one entry matching
    ``on_Binary__*.*``.

    Returns:
        Provider plugin directory names, de-duplicated and sorted.

    Note:
        The result is memoized by ``lru_cache``; callers that change the
        plugin directories need ``get_binary_provider_plugins.cache_clear()``
        to see the new state.
    """
    provider_names = {
        candidate.name
        for candidate in iter_plugin_dirs()
        # Only the existence of a first match matters, so stop at the first hit.
        if next(candidate.glob('on_Binary__*.*'), None) is not None
    }
    return sorted(provider_names)
def get_parser_plugins() -> List[str]: def get_parser_plugins() -> List[str]:
""" """
Get list of parser plugins by discovering parse_*_urls hooks. Get list of parser plugins by discovering parse_*_urls hooks.
@@ -912,9 +925,13 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
# Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon) # Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon)
plugins_whitelist = config.get('PLUGINS', '') plugins_whitelist = config.get('PLUGINS', '')
if plugins_whitelist: if plugins_whitelist:
# PLUGINS whitelist is specified - include transitive required_plugins from config.json # PLUGINS whitelist is specified - include transitive required_plugins from
# config.json as well as binary provider plugins. Provider plugins may also
# expose early on_Crawl hooks (e.g. npm -> install node/npm) that are
# required before a selected extractor's Binary hooks can succeed.
plugin_configs = discover_plugin_configs() plugin_configs = discover_plugin_configs()
plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()} plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()}
plugin_names.update(provider.lower() for provider in get_binary_provider_plugins())
pending = list(plugin_names) pending = list(plugin_names)
while pending: while pending:

View File

@@ -179,7 +179,7 @@ class TestHookDiscovery(unittest.TestCase):
(wget_dir / 'on_Crawl__10_wget_install.finite.bg.py').write_text('# install hook') (wget_dir / 'on_Crawl__10_wget_install.finite.bg.py').write_text('# install hook')
chrome_dir = self.plugins_dir / 'chrome' chrome_dir = self.plugins_dir / 'chrome'
chrome_dir.mkdir() chrome_dir.mkdir(exist_ok=True)
(chrome_dir / 'on_Snapshot__20_chrome_tab.daemon.bg.js').write_text('// background hook') (chrome_dir / 'on_Snapshot__20_chrome_tab.daemon.bg.js').write_text('// background hook')
consolelog_dir = self.plugins_dir / 'consolelog' consolelog_dir = self.plugins_dir / 'consolelog'
@@ -263,6 +263,42 @@ class TestHookDiscovery(unittest.TestCase):
hook_names = [hook.name for hook in hooks] hook_names = [hook.name for hook in hooks]
self.assertIn('on_Binary__10_npm_install.py', hook_names) self.assertIn('on_Binary__10_npm_install.py', hook_names)
def test_discover_crawl_hooks_keeps_binary_provider_dependencies_enabled(self):
    """
    Crawl hooks of required plugins and of binary providers survive a PLUGINS whitelist.

    Only 'responses' is whitelisted. Its config.json requires 'chrome', and
    'npm' ships an on_Binary hook, so the Crawl hooks of both chrome and npm
    are expected in the discovered set.
    """
    empty_schema = '{"type": "object", "properties": {}}'
    responses_schema = json.dumps(
        {
            "type": "object",
            "required_plugins": ["chrome"],
            "properties": {},
        }
    )

    # The whitelisted plugin: it has no hooks of its own, only a dependency on chrome.
    responses_path = self.plugins_dir / 'responses'
    responses_path.mkdir()
    (responses_path / 'config.json').write_text(responses_schema)

    # exist_ok: presumably the shared fixture may already have created chrome/ -- confirm in setUp.
    chrome_path = self.plugins_dir / 'chrome'
    chrome_path.mkdir(exist_ok=True)
    chrome_files = (
        ('config.json', empty_schema),
        ('on_Crawl__70_chrome_install.finite.bg.py', '# chrome crawl hook'),
    )
    for filename, body in chrome_files:
        (chrome_path / filename).write_text(body)

    # npm is never named in the whitelist or in any required_plugins list;
    # its on_Binary hook is what marks it as a provider.
    npm_path = self.plugins_dir / 'npm'
    npm_path.mkdir()
    npm_files = (
        ('on_Binary__10_npm_install.py', '# npm binary hook'),
        ('on_Crawl__00_npm_install.py', '# npm crawl hook'),
        ('config.json', empty_schema),
    )
    for filename, body in npm_files:
        (npm_path / filename).write_text(body)

    from archivebox import hooks as hooks_module

    # Drop memoized discovery results so the freshly written plugin dirs are scanned.
    hooks_module.get_plugins.cache_clear()
    hooks_module.get_binary_provider_plugins.cache_clear()

    builtin_dir_patch = patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir)
    user_dir_patch = patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins')
    with builtin_dir_patch, user_dir_patch:
        discovered = hooks_module.discover_hooks('Crawl', config={'PLUGINS': 'responses'})
        discovered_names = [hook.name for hook in discovered]

    expected_hooks = (
        'on_Crawl__70_chrome_install.finite.bg.py',
        'on_Crawl__00_npm_install.py',
    )
    for expected in expected_hooks:
        self.assertIn(expected, discovered_names)
class TestGetExtractorName(unittest.TestCase): class TestGetExtractorName(unittest.TestCase):
"""Test get_extractor_name() function.""" """Test get_extractor_name() function."""