From 82bfd7e655540f5efd54349e1c580c477675acfc Mon Sep 17 00:00:00 2001
From: Nick Sweeting
Date: Sun, 15 Mar 2026 09:32:32 -0700
Subject: [PATCH] Filter binary hooks by allowed providers

---
Review notes (free-form text in this section is not part of the commit).

NOTE(review): the leading whitespace of the hunk lines below was
reconstructed from a whitespace-collapsed copy of this patch. Re-export the
commit with `git format-patch` before applying it.

archivebox/machine/models.py
  * Binary._allowed_binproviders() turns the comma-separated `binproviders`
    value into a set of stripped, non-empty names. An empty value or '*'
    returns None, which run() treats as "do not filter".
  * Binary.run() skips every hook whose parent directory name is not in
    that set. This check happens before the plugin output directory is
    created.
  * For the plugin named 'custom', run() passes `custom_cmd` in hook_kwargs
    instead of `overrides`, and skips the hook when no command is found.
  * Binary._get_custom_install_command() lookup order:
      1. overrides['custom_cmd'], ['cmd'], ['command']
      2. the same three keys under overrides['custom']
      3. overrides['custom']['install_args'] (a string is used stripped;
         a list is shlex-quoted and joined with spaces, blank items dropped)
    It returns None when overrides is not a dict or nothing matches.
  * NOTE(review): an install_args list whose items are all blank joins to
    '' rather than None. run() still skips it because it tests
    `if not custom_cmd`.
  * NOTE(review): the custom-plugin skip sits after
    plugin_output_dir.mkdir(), so a skipped custom hook still leaves its
    output directory behind. Confirm this is acceptable.

archivebox/tests/conftest.py
  * real_archive_with_example no longer symlinks a shared lib cache, looks
    up or registers a chromium binary, or sets any CHROME_* / DOM_ENABLED
    environment values. It now runs `archivebox add` with
    `--plugins=responses` only.
  * NOTE(review): helpers referenced only by the removed lines
    (_get_machine_type, _find_cached_chromium, _find_system_browser,
    run_python_cwd, and possibly the shutil / textwrap imports) may now be
    unused in this file. _ensure_puppeteer is still defined (it appears as
    the first hunk's header context) but is no longer called here. Verify
    against the full file.

 archivebox/machine/models.py | 47 +++++++++++++++++++++-
 archivebox/tests/conftest.py | 75 ++----------------------------------
 2 files changed, 48 insertions(+), 74 deletions(-)

diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py
index d90a98fc..4740d639 100755
--- a/archivebox/machine/models.py
+++ b/archivebox/machine/models.py
@@ -394,6 +394,40 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
         self.modified_at = timezone.now()
         self.save()
 
+    def _allowed_binproviders(self) -> set[str] | None:
+        """Return the allowed binproviders for this binary, or None for wildcard."""
+        providers = str(self.binproviders or '').strip()
+        if not providers or providers == '*':
+            return None
+        return {provider.strip() for provider in providers.split(',') if provider.strip()}
+
+    def _get_custom_install_command(self) -> str | None:
+        """Extract a custom install command from overrides when the custom provider is used."""
+        import shlex
+
+        if not isinstance(self.overrides, dict):
+            return None
+
+        for key in ('custom_cmd', 'cmd', 'command'):
+            value = self.overrides.get(key)
+            if isinstance(value, str) and value.strip():
+                return value.strip()
+
+        custom_overrides = self.overrides.get('custom')
+        if isinstance(custom_overrides, dict):
+            for key in ('custom_cmd', 'cmd', 'command'):
+                value = custom_overrides.get(key)
+                if isinstance(value, str) and value.strip():
+                    return value.strip()
+
+            install_args = custom_overrides.get('install_args')
+            if isinstance(install_args, str) and install_args.strip():
+                return install_args.strip()
+            if isinstance(install_args, list) and install_args:
+                return ' '.join(shlex.quote(str(arg)) for arg in install_args if str(arg).strip())
+
+        return None
+
     def run(self):
         """
         Execute binary installation by running on_Binary__install_* hooks.
@@ -420,9 +454,14 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
             # No hooks available - stay queued, will retry later
             return
 
+        allowed_binproviders = self._allowed_binproviders()
+
         # Run each hook - they decide if they can handle this binary
         for hook in hooks:
             plugin_name = hook.parent.name
+            if allowed_binproviders is not None and plugin_name not in allowed_binproviders:
+                continue
+
             plugin_output_dir = output_dir / plugin_name
             plugin_output_dir.mkdir(parents=True, exist_ok=True)
 
@@ -434,8 +473,12 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
                 'binproviders': self.binproviders,
             }
 
-            # Add overrides as JSON string if present
-            if self.overrides:
+            if plugin_name == 'custom':
+                custom_cmd = self._get_custom_install_command()
+                if not custom_cmd:
+                    continue
+                hook_kwargs['custom_cmd'] = custom_cmd
+            elif self.overrides:
                 hook_kwargs['overrides'] = json.dumps(self.overrides)
 
             # Run the hook
diff --git a/archivebox/tests/conftest.py b/archivebox/tests/conftest.py
index 60cee8e4..b290a37e 100644
--- a/archivebox/tests/conftest.py
+++ b/archivebox/tests/conftest.py
@@ -287,8 +287,8 @@ def _ensure_puppeteer(shared_lib: Path) -> None:
 @pytest.fixture(scope="class")
 def real_archive_with_example(tmp_path_factory, request):
     """
-    Initialize archive and add https://example.com using chrome+responses only.
-    Uses cwd for DATA_DIR and symlinks lib dir to a shared cache.
+    Initialize archive and add https://example.com using responses only.
+    Uses cwd for DATA_DIR.
     """
     tmp_path = tmp_path_factory.mktemp("archivebox_data")
     if getattr(request, "cls", None) is not None:
@@ -314,82 +314,13 @@ def real_archive_with_example(tmp_path_factory, request):
     )
     assert returncode == 0, f"archivebox config failed: {stderr}"
 
-    machine_type = _get_machine_type()
-    shared_root = Path(__file__).resolve().parents[3] / 'tmp' / 'test_lib_cache'
-    shared_lib = shared_root / machine_type
-    shared_lib.mkdir(parents=True, exist_ok=True)
-
-    lib_target = tmp_path / 'lib' / machine_type
-    if lib_target.exists() and not lib_target.is_symlink():
-        shutil.rmtree(lib_target)
-    if not lib_target.exists():
-        lib_target.parent.mkdir(parents=True, exist_ok=True)
-        lib_target.symlink_to(shared_lib, target_is_directory=True)
-
-    _ensure_puppeteer(shared_lib)
-    cached_chromium = _find_cached_chromium(shared_lib)
-    if cached_chromium:
-        browser_binary = cached_chromium
-    else:
-        browser_binary = _find_system_browser()
-        if browser_binary:
-            chromium_link = shared_lib / 'chromium-bin'
-            if not chromium_link.exists():
-                chromium_link.symlink_to(browser_binary)
-            browser_binary = chromium_link
-
-    if browser_binary:
-        stdout, stderr, returncode = run_archivebox_cmd_cwd(
-            [f'config', '--set', f'CHROME_BINARY={browser_binary}'],
-            cwd=tmp_path,
-        )
-        assert returncode == 0, f"archivebox config CHROME_BINARY failed: {stderr}"
-        script = textwrap.dedent(f"""\
-            import os
-            os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
-            import django
-            django.setup()
-            from django.utils import timezone
-            from archivebox.machine.models import Binary, Machine
-            machine = Machine.current()
-            Binary.objects.filter(machine=machine, name='chromium').update(
-                status='installed',
-                abspath='{browser_binary}',
-                binprovider='env',
-                retry_at=timezone.now(),
-            )
-            Binary.objects.update_or_create(
-                machine=machine,
-                name='chromium',
-                defaults={{
-                    'status': 'installed',
-                    'abspath': '{browser_binary}',
-                    'binprovider': 'env',
-                    'retry_at': timezone.now(),
-                }},
-            )
-            print('OK')
-            """
-        )
-        stdout, stderr, returncode = run_python_cwd(script, cwd=tmp_path, timeout=60)
-        assert returncode == 0, f"Register chromium binary failed: {stderr}"
-
     add_env = {
-        'CHROME_ENABLED': 'True',
         'RESPONSES_ENABLED': 'True',
-        'DOM_ENABLED': 'False',
         'SHOW_PROGRESS': 'False',
         'USE_COLOR': 'False',
-        'CHROME_HEADLESS': 'True',
-        'CHROME_PAGELOAD_TIMEOUT': '45',
-        'CHROME_TIMEOUT': '60',
         'RESPONSES_TIMEOUT': '30',
     }
-    if browser_binary:
-        add_env['CHROME_BINARY'] = str(browser_binary)
-    if cached_chromium:
-        add_env['PUPPETEER_CACHE_DIR'] = str(shared_lib / 'puppeteer')
-    cmd = [sys.executable, '-m', 'archivebox', 'add', '--depth=0', '--plugins=chrome,responses', 'https://example.com']
+    cmd = [sys.executable, '-m', 'archivebox', 'add', '--depth=0', '--plugins=responses', 'https://example.com']
     base_env = os.environ.copy()
     base_env.pop('DATA_DIR', None)
     base_env['USE_COLOR'] = 'False'