mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Resolve crawl provider dependencies lazily
This commit is contained in:
@@ -352,18 +352,25 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
if not binary_names:
|
||||
return
|
||||
|
||||
pending_binaries = Binary.objects.filter(
|
||||
machine=machine,
|
||||
name__in=binary_names,
|
||||
).exclude(
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
).order_by('retry_at')
|
||||
max_attempts = max(2, len(binary_names))
|
||||
|
||||
for binary in pending_binaries:
|
||||
try:
|
||||
binary.sm.tick()
|
||||
except Exception:
|
||||
continue
|
||||
for _ in range(max_attempts):
|
||||
pending_binaries = list(
|
||||
Binary.objects.filter(
|
||||
machine=machine,
|
||||
name__in=binary_names,
|
||||
).exclude(
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
).order_by('retry_at', 'name')
|
||||
)
|
||||
if not pending_binaries:
|
||||
return
|
||||
|
||||
for binary in pending_binaries:
|
||||
try:
|
||||
binary.sm.tick()
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
unresolved_binaries = list(
|
||||
Binary.objects.filter(
|
||||
@@ -382,16 +389,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
f'Crawl dependencies failed to install before continuing: {binary_details}'
|
||||
)
|
||||
|
||||
# Discover and run on_Crawl hooks
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Discovering Crawl hooks...\n')
|
||||
f.flush()
|
||||
hooks = discover_hooks('Crawl', config=config)
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Found {len(hooks)} hooks\n')
|
||||
f.flush()
|
||||
executed_crawl_hooks: set[str] = set()
|
||||
|
||||
def run_crawl_hook(hook: Path) -> set[str]:
|
||||
executed_crawl_hooks.add(str(hook))
|
||||
|
||||
for hook in hooks:
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Running hook: {hook.name}\n')
|
||||
f.flush()
|
||||
@@ -400,38 +402,34 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
output_dir = self.output_dir / plugin_name
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Run hook using Process.launch() - returns Process model
|
||||
process = run_hook(
|
||||
hook,
|
||||
output_dir=output_dir,
|
||||
config=config,
|
||||
crawl_id=str(self.id),
|
||||
source_url=self.urls, # Pass full newline-separated URLs
|
||||
source_url=self.urls,
|
||||
)
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Hook {hook.name} completed with status={process.status}\n')
|
||||
f.flush()
|
||||
|
||||
hook_elapsed = time.time() - hook_start
|
||||
if hook_elapsed > 0.5: # Log slow hooks
|
||||
if hook_elapsed > 0.5:
|
||||
print(f'[yellow]⏱️ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]')
|
||||
|
||||
# Finite background hooks must finish before snapshots start so they can
|
||||
# emit dependency records (Binary, Machine config, etc.).
|
||||
if process.status == process.StatusChoices.RUNNING:
|
||||
if not is_finite_background_hook(hook.name):
|
||||
continue
|
||||
return set()
|
||||
try:
|
||||
process.wait(timeout=process.timeout)
|
||||
except Exception:
|
||||
continue
|
||||
return set()
|
||||
|
||||
# Foreground hook - process JSONL records
|
||||
from archivebox.hooks import extract_records_from_process
|
||||
records = extract_records_from_process(process)
|
||||
if records:
|
||||
print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]')
|
||||
for record in records[:3]: # Show first 3
|
||||
for record in records[:3]:
|
||||
print(f' Record: type={record.get("type")}, keys={list(record.keys())[:5]}')
|
||||
overrides = {'crawl': self}
|
||||
stats = process_hook_records(records, overrides=overrides)
|
||||
@@ -446,7 +444,60 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
hook_binary_names.discard('')
|
||||
if hook_binary_names:
|
||||
declared_binary_names.update(hook_binary_names)
|
||||
install_declared_binaries(hook_binary_names)
|
||||
return hook_binary_names
|
||||
|
||||
def resolve_provider_binaries(binary_names: set[str]) -> set[str]:
|
||||
if not binary_names:
|
||||
return set()
|
||||
|
||||
resolved_binary_names = set(binary_names)
|
||||
|
||||
while True:
|
||||
unresolved_binaries = list(
|
||||
Binary.objects.filter(
|
||||
machine=machine,
|
||||
name__in=resolved_binary_names,
|
||||
).exclude(
|
||||
status=Binary.StatusChoices.INSTALLED,
|
||||
).order_by('name')
|
||||
)
|
||||
if not unresolved_binaries:
|
||||
return resolved_binary_names
|
||||
|
||||
needed_provider_names: set[str] = set()
|
||||
for binary in unresolved_binaries:
|
||||
allowed_binproviders = binary._allowed_binproviders()
|
||||
if allowed_binproviders is None:
|
||||
continue
|
||||
needed_provider_names.update(allowed_binproviders)
|
||||
|
||||
if not needed_provider_names:
|
||||
return resolved_binary_names
|
||||
|
||||
provider_hooks = [
|
||||
hook
|
||||
for hook in discover_hooks('Crawl', filter_disabled=False, config=config)
|
||||
if hook.parent.name in needed_provider_names and str(hook) not in executed_crawl_hooks
|
||||
]
|
||||
if not provider_hooks:
|
||||
return resolved_binary_names
|
||||
|
||||
for hook in provider_hooks:
|
||||
resolved_binary_names.update(run_crawl_hook(hook))
|
||||
|
||||
# Discover and run on_Crawl hooks
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Discovering Crawl hooks...\n')
|
||||
f.flush()
|
||||
hooks = discover_hooks('Crawl', config=config)
|
||||
with open(debug_log, 'a') as f:
|
||||
f.write(f'Found {len(hooks)} hooks\n')
|
||||
f.flush()
|
||||
|
||||
for hook in hooks:
|
||||
hook_binary_names = run_crawl_hook(hook)
|
||||
if hook_binary_names:
|
||||
install_declared_binaries(resolve_provider_binaries(hook_binary_names))
|
||||
|
||||
# Safety check: don't create snapshots if any crawl-declared dependency
|
||||
# is still unresolved after all crawl hooks have run.
|
||||
|
||||
@@ -622,19 +622,6 @@ def get_plugins() -> List[str]:
|
||||
return sorted(set(plugins))
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def get_binary_provider_plugins() -> List[str]:
|
||||
"""Get plugin names that expose Binary hooks and act as provider plugins."""
|
||||
providers = []
|
||||
|
||||
for plugin_dir in iter_plugin_dirs():
|
||||
has_binary_hooks = any(plugin_dir.glob('on_Binary__*.*'))
|
||||
if has_binary_hooks:
|
||||
providers.append(plugin_dir.name)
|
||||
|
||||
return sorted(set(providers))
|
||||
|
||||
|
||||
def get_parser_plugins() -> List[str]:
|
||||
"""
|
||||
Get list of parser plugins by discovering parse_*_urls hooks.
|
||||
@@ -926,12 +913,10 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
|
||||
plugins_whitelist = config.get('PLUGINS', '')
|
||||
if plugins_whitelist:
|
||||
# PLUGINS whitelist is specified - include transitive required_plugins from
|
||||
# config.json as well as binary provider plugins. Provider plugins may also
|
||||
# expose early on_Crawl hooks (e.g. npm -> install node/npm) that are
|
||||
# required before a selected extractor's Binary hooks can succeed.
|
||||
# config.json so selecting a plugin also enables its declared plugin-level
|
||||
# dependencies (e.g. singlefile -> chrome).
|
||||
plugin_configs = discover_plugin_configs()
|
||||
plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()}
|
||||
plugin_names.update(provider.lower() for provider in get_binary_provider_plugins())
|
||||
pending = list(plugin_names)
|
||||
|
||||
while pending:
|
||||
|
||||
@@ -263,8 +263,8 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
hook_names = [hook.name for hook in hooks]
|
||||
self.assertIn('on_Binary__10_npm_install.py', hook_names)
|
||||
|
||||
def test_discover_crawl_hooks_keeps_binary_provider_dependencies_enabled(self):
|
||||
"""Provider crawl hooks should remain enabled when a whitelisted plugin depends on them transitively."""
|
||||
def test_discover_crawl_hooks_only_include_declared_plugin_dependencies(self):
|
||||
"""Crawl hook discovery should include required_plugins without broadening to provider plugins."""
|
||||
responses_dir = self.plugins_dir / 'responses'
|
||||
responses_dir.mkdir()
|
||||
(responses_dir / 'config.json').write_text(
|
||||
@@ -291,13 +291,12 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
from archivebox import hooks as hooks_module
|
||||
|
||||
hooks_module.get_plugins.cache_clear()
|
||||
hooks_module.get_binary_provider_plugins.cache_clear()
|
||||
with patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir), patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins'):
|
||||
hooks = hooks_module.discover_hooks('Crawl', config={'PLUGINS': 'responses'})
|
||||
|
||||
hook_names = [hook.name for hook in hooks]
|
||||
self.assertIn('on_Crawl__70_chrome_install.finite.bg.py', hook_names)
|
||||
self.assertIn('on_Crawl__00_npm_install.py', hook_names)
|
||||
self.assertNotIn('on_Crawl__00_npm_install.py', hook_names)
|
||||
|
||||
|
||||
class TestGetExtractorName(unittest.TestCase):
|
||||
|
||||
Reference in New Issue
Block a user