mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Keep provider plugins enabled under whitelists
This commit is contained in:
@@ -622,6 +622,19 @@ def get_plugins() -> List[str]:
|
||||
return sorted(set(plugins))
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def get_binary_provider_plugins() -> List[str]:
    """
    Return the sorted, de-duplicated names of binary provider plugins.

    A plugin directory counts as a provider when it contains at least one
    file matching ``on_Binary__*.*``. The result is memoized by ``lru_cache``;
    call ``get_binary_provider_plugins.cache_clear()`` to force a rescan.
    """
    provider_names = {
        candidate.name
        for candidate in iter_plugin_dirs()
        # any() stops at the first matching hook file yielded by the glob
        if any(candidate.glob('on_Binary__*.*'))
    }
    return sorted(provider_names)
|
||||
|
||||
|
||||
def get_parser_plugins() -> List[str]:
|
||||
"""
|
||||
Get list of parser plugins by discovering parse_*_urls hooks.
|
||||
@@ -912,9 +925,13 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
|
||||
# Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon)
|
||||
plugins_whitelist = config.get('PLUGINS', '')
|
||||
if plugins_whitelist:
|
||||
# PLUGINS whitelist is specified - include transitive required_plugins from config.json
|
||||
# PLUGINS whitelist is specified - include transitive required_plugins from
|
||||
# config.json as well as binary provider plugins. Provider plugins may also
|
||||
# expose early on_Crawl hooks (e.g. npm -> install node/npm) that are
|
||||
# required before a selected extractor's Binary hooks can succeed.
|
||||
plugin_configs = discover_plugin_configs()
|
||||
plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()}
|
||||
plugin_names.update(provider.lower() for provider in get_binary_provider_plugins())
|
||||
pending = list(plugin_names)
|
||||
|
||||
while pending:
|
||||
|
||||
@@ -179,7 +179,7 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
(wget_dir / 'on_Crawl__10_wget_install.finite.bg.py').write_text('# install hook')
|
||||
|
||||
chrome_dir = self.plugins_dir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
chrome_dir.mkdir(exist_ok=True)
|
||||
(chrome_dir / 'on_Snapshot__20_chrome_tab.daemon.bg.js').write_text('// background hook')
|
||||
|
||||
consolelog_dir = self.plugins_dir / 'consolelog'
|
||||
@@ -263,6 +263,42 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
hook_names = [hook.name for hook in hooks]
|
||||
self.assertIn('on_Binary__10_npm_install.py', hook_names)
|
||||
|
||||
def test_discover_crawl_hooks_keeps_binary_provider_dependencies_enabled(self):
    """Provider crawl hooks should remain enabled when a whitelisted plugin depends on them transitively."""
    # 'responses' is the only whitelisted plugin; its config.json lists
    # 'chrome' under required_plugins.
    responses_dir = self.plugins_dir / 'responses'
    responses_dir.mkdir()
    (responses_dir / 'config.json').write_text(
        json.dumps(
            {
                "type": "object",
                "required_plugins": ["chrome"],
                "properties": {},
            }
        )
    )

    # 'chrome' only ships a Crawl hook (no Binary hook).
    chrome_dir = self.plugins_dir / 'chrome'
    chrome_dir.mkdir(exist_ok=True)
    (chrome_dir / 'config.json').write_text('{"type": "object", "properties": {}}')
    (chrome_dir / 'on_Crawl__70_chrome_install.finite.bg.py').write_text('# chrome crawl hook')

    # 'npm' ships an on_Binary__* hook plus an early Crawl hook; it is not
    # whitelisted and not listed in any required_plugins.
    npm_dir = self.plugins_dir / 'npm'
    npm_dir.mkdir()
    (npm_dir / 'on_Binary__10_npm_install.py').write_text('# npm binary hook')
    (npm_dir / 'on_Crawl__00_npm_install.py').write_text('# npm crawl hook')
    (npm_dir / 'config.json').write_text('{"type": "object", "properties": {}}')

    from archivebox import hooks as hooks_module

    # Both helpers are memoized. Clear them now so discovery sees the
    # temporary plugin tree, and again on cleanup so results computed
    # against the patched directories cannot leak into later tests.
    for cached_helper in (hooks_module.get_plugins, hooks_module.get_binary_provider_plugins):
        cached_helper.cache_clear()
        self.addCleanup(cached_helper.cache_clear)

    with patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir), patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins'):
        hooks = hooks_module.discover_hooks('Crawl', config={'PLUGINS': 'responses'})

    hook_names = [hook.name for hook in hooks]
    self.assertIn('on_Crawl__70_chrome_install.finite.bg.py', hook_names)
    self.assertIn('on_Crawl__00_npm_install.py', hook_names)
|
||||
|
||||
|
||||
class TestGetExtractorName(unittest.TestCase):
|
||||
"""Test get_extractor_name() function."""
|
||||
|
||||
Reference in New Issue
Block a user