mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Keep provider plugins enabled under whitelists
This commit is contained in:
@@ -622,6 +622,19 @@ def get_plugins() -> List[str]:
|
|||||||
return sorted(set(plugins))
|
return sorted(set(plugins))
|
||||||
|
|
||||||
|
|
||||||
|
@lru_cache(maxsize=1)
def get_binary_provider_plugins() -> List[str]:
    """
    List the plugins that act as binary providers.

    A plugin counts as a provider when its directory contains at least one
    file matching ``on_Binary__*.*``. The result is de-duplicated by plugin
    directory name, sorted, and memoized by ``lru_cache`` (call
    ``get_binary_provider_plugins.cache_clear()`` to force a rescan).
    """
    provider_names = {
        candidate.name
        for candidate in iter_plugin_dirs()
        # any() stops at the first matching hook file, so the glob is not exhausted
        if any(candidate.glob('on_Binary__*.*'))
    }
    return sorted(provider_names)
|
||||||
|
|
||||||
|
|
||||||
def get_parser_plugins() -> List[str]:
|
def get_parser_plugins() -> List[str]:
|
||||||
"""
|
"""
|
||||||
Get list of parser plugins by discovering parse_*_urls hooks.
|
Get list of parser plugins by discovering parse_*_urls hooks.
|
||||||
@@ -912,9 +925,13 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
|
|||||||
# Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon)
|
# Check if PLUGINS whitelist is specified (e.g., --plugins=wget,favicon)
|
||||||
plugins_whitelist = config.get('PLUGINS', '')
|
plugins_whitelist = config.get('PLUGINS', '')
|
||||||
if plugins_whitelist:
|
if plugins_whitelist:
|
||||||
# PLUGINS whitelist is specified - include transitive required_plugins from config.json
|
# PLUGINS whitelist is specified - include transitive required_plugins from
|
||||||
|
# config.json as well as binary provider plugins. Provider plugins may also
|
||||||
|
# expose early on_Crawl hooks (e.g. npm -> install node/npm) that are
|
||||||
|
# required before a selected extractor's Binary hooks can succeed.
|
||||||
plugin_configs = discover_plugin_configs()
|
plugin_configs = discover_plugin_configs()
|
||||||
plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()}
|
plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()}
|
||||||
|
plugin_names.update(provider.lower() for provider in get_binary_provider_plugins())
|
||||||
pending = list(plugin_names)
|
pending = list(plugin_names)
|
||||||
|
|
||||||
while pending:
|
while pending:
|
||||||
|
|||||||
@@ -179,7 +179,7 @@ class TestHookDiscovery(unittest.TestCase):
|
|||||||
(wget_dir / 'on_Crawl__10_wget_install.finite.bg.py').write_text('# install hook')
|
(wget_dir / 'on_Crawl__10_wget_install.finite.bg.py').write_text('# install hook')
|
||||||
|
|
||||||
chrome_dir = self.plugins_dir / 'chrome'
|
chrome_dir = self.plugins_dir / 'chrome'
|
||||||
chrome_dir.mkdir()
|
chrome_dir.mkdir(exist_ok=True)
|
||||||
(chrome_dir / 'on_Snapshot__20_chrome_tab.daemon.bg.js').write_text('// background hook')
|
(chrome_dir / 'on_Snapshot__20_chrome_tab.daemon.bg.js').write_text('// background hook')
|
||||||
|
|
||||||
consolelog_dir = self.plugins_dir / 'consolelog'
|
consolelog_dir = self.plugins_dir / 'consolelog'
|
||||||
@@ -263,6 +263,42 @@ class TestHookDiscovery(unittest.TestCase):
|
|||||||
hook_names = [hook.name for hook in hooks]
|
hook_names = [hook.name for hook in hooks]
|
||||||
self.assertIn('on_Binary__10_npm_install.py', hook_names)
|
self.assertIn('on_Binary__10_npm_install.py', hook_names)
|
||||||
|
|
||||||
|
def test_discover_crawl_hooks_keeps_binary_provider_dependencies_enabled(self):
    """Provider crawl hooks should remain enabled when a whitelisted plugin depends on them transitively.

    Fixture layout built under ``self.plugins_dir``:
      * ``responses`` - the only whitelisted plugin; its config.json lists
        ``chrome`` in ``required_plugins``.
      * ``chrome``    - has an ``on_Crawl`` install hook.
      * ``npm``       - has both an ``on_Binary`` hook and an ``on_Crawl`` hook.

    With ``PLUGINS='responses'`` the discovered Crawl hooks must include both
    the chrome hook and the npm hook.
    """
    responses_dir = self.plugins_dir / 'responses'
    responses_dir.mkdir()
    (responses_dir / 'config.json').write_text(
        json.dumps(
            {
                "type": "object",
                "required_plugins": ["chrome"],
                "properties": {},
            }
        )
    )

    chrome_dir = self.plugins_dir / 'chrome'
    # exist_ok: tolerate a chrome/ directory that already exists in the fixture tree
    chrome_dir.mkdir(exist_ok=True)
    (chrome_dir / 'config.json').write_text('{"type": "object", "properties": {}}')
    (chrome_dir / 'on_Crawl__70_chrome_install.finite.bg.py').write_text('# chrome crawl hook')

    npm_dir = self.plugins_dir / 'npm'
    npm_dir.mkdir()
    (npm_dir / 'on_Binary__10_npm_install.py').write_text('# npm binary hook')
    (npm_dir / 'on_Crawl__00_npm_install.py').write_text('# npm crawl hook')
    (npm_dir / 'config.json').write_text('{"type": "object", "properties": {}}')

    from archivebox import hooks as hooks_module

    # Drop memoized plugin lists so discovery rescans the patched directories.
    hooks_module.get_plugins.cache_clear()
    hooks_module.get_binary_provider_plugins.cache_clear()
    # Also clear them once the test finishes: whatever gets cached while the
    # plugin directories are patched refers to this test's temporary tree and
    # must not leak into tests that run afterwards.
    self.addCleanup(hooks_module.get_plugins.cache_clear)
    self.addCleanup(hooks_module.get_binary_provider_plugins.cache_clear)

    with patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir):
        with patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins'):
            hooks = hooks_module.discover_hooks('Crawl', config={'PLUGINS': 'responses'})

    hook_names = [hook.name for hook in hooks]
    self.assertIn('on_Crawl__70_chrome_install.finite.bg.py', hook_names)
    self.assertIn('on_Crawl__00_npm_install.py', hook_names)
|
||||||
|
|
||||||
|
|
||||||
class TestGetExtractorName(unittest.TestCase):
|
class TestGetExtractorName(unittest.TestCase):
|
||||||
"""Test get_extractor_name() function."""
|
"""Test get_extractor_name() function."""
|
||||||
|
|||||||
Reference in New Issue
Block a user