Resolve crawl provider dependencies lazily

Nick Sweeting
2026-03-15 10:18:49 -07:00
parent d4be507a6b
commit 47f540c094
3 changed files with 86 additions and 51 deletions

View File

@@ -352,18 +352,25 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             if not binary_names:
                 return
 
-            pending_binaries = Binary.objects.filter(
-                machine=machine,
-                name__in=binary_names,
-            ).exclude(
-                status=Binary.StatusChoices.INSTALLED,
-            ).order_by('retry_at')
+            max_attempts = max(2, len(binary_names))
 
-            for binary in pending_binaries:
-                try:
-                    binary.sm.tick()
-                except Exception:
-                    continue
+            for _ in range(max_attempts):
+                pending_binaries = list(
+                    Binary.objects.filter(
+                        machine=machine,
+                        name__in=binary_names,
+                    ).exclude(
+                        status=Binary.StatusChoices.INSTALLED,
+                    ).order_by('retry_at', 'name')
+                )
+                if not pending_binaries:
+                    return
+
+                for binary in pending_binaries:
+                    try:
+                        binary.sm.tick()
+                    except Exception:
+                        continue
 
             unresolved_binaries = list(
                 Binary.objects.filter(
@@ -382,16 +389,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                     f'Crawl dependencies failed to install before continuing: {binary_details}'
                 )
 
-        # Discover and run on_Crawl hooks
-        with open(debug_log, 'a') as f:
-            f.write(f'Discovering Crawl hooks...\n')
-            f.flush()
-        hooks = discover_hooks('Crawl', config=config)
-        with open(debug_log, 'a') as f:
-            f.write(f'Found {len(hooks)} hooks\n')
-            f.flush()
+        executed_crawl_hooks: set[str] = set()
+
+        def run_crawl_hook(hook: Path) -> set[str]:
+            executed_crawl_hooks.add(str(hook))
 
-        for hook in hooks:
             with open(debug_log, 'a') as f:
                 f.write(f'Running hook: {hook.name}\n')
                 f.flush()
@@ -400,38 +402,34 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             output_dir = self.output_dir / plugin_name
             output_dir.mkdir(parents=True, exist_ok=True)
 
             # Run hook using Process.launch() - returns Process model
             process = run_hook(
                 hook,
                 output_dir=output_dir,
                 config=config,
                 crawl_id=str(self.id),
-                source_url=self.urls, # Pass full newline-separated URLs
+                source_url=self.urls,
             )
 
             with open(debug_log, 'a') as f:
                 f.write(f'Hook {hook.name} completed with status={process.status}\n')
                 f.flush()
 
             hook_elapsed = time.time() - hook_start
-            if hook_elapsed > 0.5: # Log slow hooks
+            if hook_elapsed > 0.5:
                 print(f'[yellow]⏱️ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]')
 
             # Finite background hooks must finish before snapshots start so they can
             # emit dependency records (Binary, Machine config, etc.).
             if process.status == process.StatusChoices.RUNNING:
                 if not is_finite_background_hook(hook.name):
-                    continue
+                    return set()
                 try:
                     process.wait(timeout=process.timeout)
                 except Exception:
-                    continue
+                    return set()
 
             # Foreground hook - process JSONL records
             from archivebox.hooks import extract_records_from_process
             records = extract_records_from_process(process)
             if records:
                 print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]')
-                for record in records[:3]: # Show first 3
+                for record in records[:3]:
                     print(f' Record: type={record.get("type")}, keys={list(record.keys())[:5]}')
 
             overrides = {'crawl': self}
             stats = process_hook_records(records, overrides=overrides)
@@ -446,7 +444,60 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
             hook_binary_names.discard('')
             if hook_binary_names:
                 declared_binary_names.update(hook_binary_names)
-                install_declared_binaries(hook_binary_names)
+
+            return hook_binary_names
+
+        def resolve_provider_binaries(binary_names: set[str]) -> set[str]:
+            if not binary_names:
+                return set()
+
+            resolved_binary_names = set(binary_names)
+            while True:
+                unresolved_binaries = list(
+                    Binary.objects.filter(
+                        machine=machine,
+                        name__in=resolved_binary_names,
+                    ).exclude(
+                        status=Binary.StatusChoices.INSTALLED,
+                    ).order_by('name')
+                )
+                if not unresolved_binaries:
+                    return resolved_binary_names
+
+                needed_provider_names: set[str] = set()
+                for binary in unresolved_binaries:
+                    allowed_binproviders = binary._allowed_binproviders()
+                    if allowed_binproviders is None:
+                        continue
+                    needed_provider_names.update(allowed_binproviders)
+
+                if not needed_provider_names:
+                    return resolved_binary_names
+
+                provider_hooks = [
+                    hook
+                    for hook in discover_hooks('Crawl', filter_disabled=False, config=config)
+                    if hook.parent.name in needed_provider_names and str(hook) not in executed_crawl_hooks
+                ]
+                if not provider_hooks:
+                    return resolved_binary_names
+
+                for hook in provider_hooks:
+                    resolved_binary_names.update(run_crawl_hook(hook))
+
+        # Discover and run on_Crawl hooks
+        with open(debug_log, 'a') as f:
+            f.write(f'Discovering Crawl hooks...\n')
+            f.flush()
+        hooks = discover_hooks('Crawl', config=config)
+        with open(debug_log, 'a') as f:
+            f.write(f'Found {len(hooks)} hooks\n')
+            f.flush()
+
+        for hook in hooks:
+            hook_binary_names = run_crawl_hook(hook)
+            if hook_binary_names:
+                install_declared_binaries(resolve_provider_binaries(hook_binary_names))
 
         # Safety check: don't create snapshots if any crawl-declared dependency
         # is still unresolved after all crawl hooks have run.
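
The run_crawl_hook / resolve_provider_binaries split above boils down to a small fixed-point loop: run a hook, see which of its declared binaries are still uninstalled, run only the provider plugins that could install them (each at most once, tracked via executed_crawl_hooks), and repeat until nothing is missing or no untried provider remains. The sketch below shows just that control flow, with plain sets and dicts standing in for the Binary model, Django queries, and hook Processes; every name in it is illustrative rather than an ArchiveBox API.

# Minimal standalone sketch of the lazy-resolution loop, assuming made-up data.
INSTALLED: set[str] = {'node'}                  # binaries already present on the machine
PROVIDERS: dict[str, set[str]] = {              # provider plugin -> binaries it can install
    'npm': {'single-file', 'readability-extractor'},
    'pip': {'yt-dlp'},
}
executed_hooks: set[str] = set()                # provider hooks that have already run

def run_provider_hook(provider: str) -> set[str]:
    """Pretend to run a provider plugin's on_Crawl hook: install what it can."""
    executed_hooks.add(provider)
    INSTALLED.update(PROVIDERS[provider])
    return set()                                # a real hook could declare more binaries here

def resolve_provider_binaries(binary_names: set[str]) -> set[str]:
    resolved = set(binary_names)
    while True:
        unresolved = {name for name in resolved if name not in INSTALLED}
        if not unresolved:
            return resolved                     # everything installed: done
        # Providers that could supply a still-missing binary and have not run yet.
        needed = {
            provider for provider, supplies in PROVIDERS.items()
            if supplies & unresolved and provider not in executed_hooks
        }
        if not needed:
            return resolved                     # nothing left to try: caller decides how to fail
        for provider in sorted(needed):
            resolved.update(run_provider_hook(provider))

print(resolve_provider_binaries({'single-file', 'yt-dlp'}))   # both get installed via npm + pip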

View File

@@ -622,19 +622,6 @@ def get_plugins() -> List[str]:
     return sorted(set(plugins))
 
-@lru_cache(maxsize=1)
-def get_binary_provider_plugins() -> List[str]:
-    """Get plugin names that expose Binary hooks and act as provider plugins."""
-    providers = []
-    for plugin_dir in iter_plugin_dirs():
-        has_binary_hooks = any(plugin_dir.glob('on_Binary__*.*'))
-        if has_binary_hooks:
-            providers.append(plugin_dir.name)
-    return sorted(set(providers))
-
 def get_parser_plugins():
     """
     Get list of parser plugins by discovering parse_*_urls hooks.
@@ -926,12 +913,10 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
     plugins_whitelist = config.get('PLUGINS', '')
     if plugins_whitelist:
         # PLUGINS whitelist is specified - include transitive required_plugins from
-        # config.json as well as binary provider plugins. Provider plugins may also
-        # expose early on_Crawl hooks (e.g. npm -> install node/npm) that are
-        # required before a selected extractor's Binary hooks can succeed.
+        # config.json so selecting a plugin also enables its declared plugin-level
+        # dependencies (e.g. singlefile -> chrome).
         plugin_configs = discover_plugin_configs()
         plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()}
-        plugin_names.update(provider.lower() for provider in get_binary_provider_plugins())
         pending = list(plugin_names)
         while pending:
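
The revised comment describes a plain worklist expansion: the PLUGINS whitelist now grows only by following each selected plugin's declared required_plugins from its config.json, and provider plugins such as npm are no longer added automatically. A minimal sketch of that expansion, with a made-up PLUGIN_CONFIGS mapping standing in for the parsed config.json files (not the ArchiveBox implementation):

PLUGIN_CONFIGS: dict[str, dict] = {
    'singlefile': {'required_plugins': ['chrome']},
    'chrome': {'required_plugins': []},
    'npm': {'required_plugins': []},            # provider plugin: only enabled if declared
}

def expand_whitelist(plugins_setting: str) -> set[str]:
    plugin_names = {p.strip().lower() for p in plugins_setting.split(',') if p.strip()}
    pending = list(plugin_names)
    while pending:
        name = pending.pop()
        for dep in PLUGIN_CONFIGS.get(name, {}).get('required_plugins', []):
            if dep not in plugin_names:
                plugin_names.add(dep)
                pending.append(dep)
    return plugin_names

print(expand_whitelist('singlefile'))           # {'singlefile', 'chrome'} -- npm stays out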

View File

@@ -263,8 +263,8 @@ class TestHookDiscovery(unittest.TestCase):
         hook_names = [hook.name for hook in hooks]
         self.assertIn('on_Binary__10_npm_install.py', hook_names)
 
-    def test_discover_crawl_hooks_keeps_binary_provider_dependencies_enabled(self):
-        """Provider crawl hooks should remain enabled when a whitelisted plugin depends on them transitively."""
+    def test_discover_crawl_hooks_only_include_declared_plugin_dependencies(self):
+        """Crawl hook discovery should include required_plugins without broadening to provider plugins."""
         responses_dir = self.plugins_dir / 'responses'
         responses_dir.mkdir()
         (responses_dir / 'config.json').write_text(
@@ -291,13 +291,12 @@ class TestHookDiscovery(unittest.TestCase):
         from archivebox import hooks as hooks_module
         hooks_module.get_plugins.cache_clear()
-        hooks_module.get_binary_provider_plugins.cache_clear()
 
         with patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir), patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins'):
             hooks = hooks_module.discover_hooks('Crawl', config={'PLUGINS': 'responses'})
             hook_names = [hook.name for hook in hooks]
             self.assertIn('on_Crawl__70_chrome_install.finite.bg.py', hook_names)
-            self.assertIn('on_Crawl__00_npm_install.py', hook_names)
+            self.assertNotIn('on_Crawl__00_npm_install.py', hook_names)
class TestGetExtractorName(unittest.TestCase):
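
The updated test asserts the discovery side of the same change: with PLUGINS set to 'responses', hooks from plugins outside the expanded whitelist (npm here) should no longer be returned, while hooks from declared dependencies (chrome) still are. A rough illustration of that filtering, assuming hooks live in per-plugin directories so hook.parent.name identifies the owning plugin (as the provider_hooks filter above relies on); this is not the real discover_hooks:

from pathlib import Path

def filter_hooks(hook_paths: list[Path], enabled_plugins: set[str]) -> list[Path]:
    # Keep only hooks whose plugin directory is in the expanded whitelist.
    return [hook for hook in hook_paths if hook.parent.name in enabled_plugins]

discovered = [
    Path('plugins/chrome/on_Crawl__70_chrome_install.finite.bg.py'),
    Path('plugins/npm/on_Crawl__00_npm_install.py'),
]
enabled = {'responses', 'chrome'}               # 'responses' plus its declared dependency
print([h.name for h in filter_hooks(discovered, enabled)])
# ['on_Crawl__70_chrome_install.finite.bg.py'] -- the npm hook is filtered out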