mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Resolve crawl provider dependencies lazily
This commit is contained in:
@@ -352,18 +352,25 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
if not binary_names:
|
if not binary_names:
|
||||||
return
|
return
|
||||||
|
|
||||||
pending_binaries = Binary.objects.filter(
|
max_attempts = max(2, len(binary_names))
|
||||||
machine=machine,
|
|
||||||
name__in=binary_names,
|
|
||||||
).exclude(
|
|
||||||
status=Binary.StatusChoices.INSTALLED,
|
|
||||||
).order_by('retry_at')
|
|
||||||
|
|
||||||
for binary in pending_binaries:
|
for _ in range(max_attempts):
|
||||||
try:
|
pending_binaries = list(
|
||||||
binary.sm.tick()
|
Binary.objects.filter(
|
||||||
except Exception:
|
machine=machine,
|
||||||
continue
|
name__in=binary_names,
|
||||||
|
).exclude(
|
||||||
|
status=Binary.StatusChoices.INSTALLED,
|
||||||
|
).order_by('retry_at', 'name')
|
||||||
|
)
|
||||||
|
if not pending_binaries:
|
||||||
|
return
|
||||||
|
|
||||||
|
for binary in pending_binaries:
|
||||||
|
try:
|
||||||
|
binary.sm.tick()
|
||||||
|
except Exception:
|
||||||
|
continue
|
||||||
|
|
||||||
unresolved_binaries = list(
|
unresolved_binaries = list(
|
||||||
Binary.objects.filter(
|
Binary.objects.filter(
|
||||||
@@ -382,16 +389,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
f'Crawl dependencies failed to install before continuing: {binary_details}'
|
f'Crawl dependencies failed to install before continuing: {binary_details}'
|
||||||
)
|
)
|
||||||
|
|
||||||
# Discover and run on_Crawl hooks
|
executed_crawl_hooks: set[str] = set()
|
||||||
with open(debug_log, 'a') as f:
|
|
||||||
f.write(f'Discovering Crawl hooks...\n')
|
def run_crawl_hook(hook: Path) -> set[str]:
|
||||||
f.flush()
|
executed_crawl_hooks.add(str(hook))
|
||||||
hooks = discover_hooks('Crawl', config=config)
|
|
||||||
with open(debug_log, 'a') as f:
|
|
||||||
f.write(f'Found {len(hooks)} hooks\n')
|
|
||||||
f.flush()
|
|
||||||
|
|
||||||
for hook in hooks:
|
|
||||||
with open(debug_log, 'a') as f:
|
with open(debug_log, 'a') as f:
|
||||||
f.write(f'Running hook: {hook.name}\n')
|
f.write(f'Running hook: {hook.name}\n')
|
||||||
f.flush()
|
f.flush()
|
||||||
@@ -400,38 +402,34 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
output_dir = self.output_dir / plugin_name
|
output_dir = self.output_dir / plugin_name
|
||||||
output_dir.mkdir(parents=True, exist_ok=True)
|
output_dir.mkdir(parents=True, exist_ok=True)
|
||||||
|
|
||||||
# Run hook using Process.launch() - returns Process model
|
|
||||||
process = run_hook(
|
process = run_hook(
|
||||||
hook,
|
hook,
|
||||||
output_dir=output_dir,
|
output_dir=output_dir,
|
||||||
config=config,
|
config=config,
|
||||||
crawl_id=str(self.id),
|
crawl_id=str(self.id),
|
||||||
source_url=self.urls, # Pass full newline-separated URLs
|
source_url=self.urls,
|
||||||
)
|
)
|
||||||
with open(debug_log, 'a') as f:
|
with open(debug_log, 'a') as f:
|
||||||
f.write(f'Hook {hook.name} completed with status={process.status}\n')
|
f.write(f'Hook {hook.name} completed with status={process.status}\n')
|
||||||
f.flush()
|
f.flush()
|
||||||
|
|
||||||
hook_elapsed = time.time() - hook_start
|
hook_elapsed = time.time() - hook_start
|
||||||
if hook_elapsed > 0.5: # Log slow hooks
|
if hook_elapsed > 0.5:
|
||||||
print(f'[yellow]⏱️ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]')
|
print(f'[yellow]⏱️ Hook {hook.name} took {hook_elapsed:.2f}s[/yellow]')
|
||||||
|
|
||||||
# Finite background hooks must finish before snapshots start so they can
|
|
||||||
# emit dependency records (Binary, Machine config, etc.).
|
|
||||||
if process.status == process.StatusChoices.RUNNING:
|
if process.status == process.StatusChoices.RUNNING:
|
||||||
if not is_finite_background_hook(hook.name):
|
if not is_finite_background_hook(hook.name):
|
||||||
continue
|
return set()
|
||||||
try:
|
try:
|
||||||
process.wait(timeout=process.timeout)
|
process.wait(timeout=process.timeout)
|
||||||
except Exception:
|
except Exception:
|
||||||
continue
|
return set()
|
||||||
|
|
||||||
# Foreground hook - process JSONL records
|
|
||||||
from archivebox.hooks import extract_records_from_process
|
from archivebox.hooks import extract_records_from_process
|
||||||
records = extract_records_from_process(process)
|
records = extract_records_from_process(process)
|
||||||
if records:
|
if records:
|
||||||
print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]')
|
print(f'[cyan]📝 Processing {len(records)} records from {hook.name}[/cyan]')
|
||||||
for record in records[:3]: # Show first 3
|
for record in records[:3]:
|
||||||
print(f' Record: type={record.get("type")}, keys={list(record.keys())[:5]}')
|
print(f' Record: type={record.get("type")}, keys={list(record.keys())[:5]}')
|
||||||
overrides = {'crawl': self}
|
overrides = {'crawl': self}
|
||||||
stats = process_hook_records(records, overrides=overrides)
|
stats = process_hook_records(records, overrides=overrides)
|
||||||
@@ -446,7 +444,60 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
|||||||
hook_binary_names.discard('')
|
hook_binary_names.discard('')
|
||||||
if hook_binary_names:
|
if hook_binary_names:
|
||||||
declared_binary_names.update(hook_binary_names)
|
declared_binary_names.update(hook_binary_names)
|
||||||
install_declared_binaries(hook_binary_names)
|
return hook_binary_names
|
||||||
|
|
||||||
|
def resolve_provider_binaries(binary_names: set[str]) -> set[str]:
|
||||||
|
if not binary_names:
|
||||||
|
return set()
|
||||||
|
|
||||||
|
resolved_binary_names = set(binary_names)
|
||||||
|
|
||||||
|
while True:
|
||||||
|
unresolved_binaries = list(
|
||||||
|
Binary.objects.filter(
|
||||||
|
machine=machine,
|
||||||
|
name__in=resolved_binary_names,
|
||||||
|
).exclude(
|
||||||
|
status=Binary.StatusChoices.INSTALLED,
|
||||||
|
).order_by('name')
|
||||||
|
)
|
||||||
|
if not unresolved_binaries:
|
||||||
|
return resolved_binary_names
|
||||||
|
|
||||||
|
needed_provider_names: set[str] = set()
|
||||||
|
for binary in unresolved_binaries:
|
||||||
|
allowed_binproviders = binary._allowed_binproviders()
|
||||||
|
if allowed_binproviders is None:
|
||||||
|
continue
|
||||||
|
needed_provider_names.update(allowed_binproviders)
|
||||||
|
|
||||||
|
if not needed_provider_names:
|
||||||
|
return resolved_binary_names
|
||||||
|
|
||||||
|
provider_hooks = [
|
||||||
|
hook
|
||||||
|
for hook in discover_hooks('Crawl', filter_disabled=False, config=config)
|
||||||
|
if hook.parent.name in needed_provider_names and str(hook) not in executed_crawl_hooks
|
||||||
|
]
|
||||||
|
if not provider_hooks:
|
||||||
|
return resolved_binary_names
|
||||||
|
|
||||||
|
for hook in provider_hooks:
|
||||||
|
resolved_binary_names.update(run_crawl_hook(hook))
|
||||||
|
|
||||||
|
# Discover and run on_Crawl hooks
|
||||||
|
with open(debug_log, 'a') as f:
|
||||||
|
f.write(f'Discovering Crawl hooks...\n')
|
||||||
|
f.flush()
|
||||||
|
hooks = discover_hooks('Crawl', config=config)
|
||||||
|
with open(debug_log, 'a') as f:
|
||||||
|
f.write(f'Found {len(hooks)} hooks\n')
|
||||||
|
f.flush()
|
||||||
|
|
||||||
|
for hook in hooks:
|
||||||
|
hook_binary_names = run_crawl_hook(hook)
|
||||||
|
if hook_binary_names:
|
||||||
|
install_declared_binaries(resolve_provider_binaries(hook_binary_names))
|
||||||
|
|
||||||
# Safety check: don't create snapshots if any crawl-declared dependency
|
# Safety check: don't create snapshots if any crawl-declared dependency
|
||||||
# is still unresolved after all crawl hooks have run.
|
# is still unresolved after all crawl hooks have run.
|
||||||
|
|||||||
@@ -622,19 +622,6 @@ def get_plugins() -> List[str]:
|
|||||||
return sorted(set(plugins))
|
return sorted(set(plugins))
|
||||||
|
|
||||||
|
|
||||||
@lru_cache(maxsize=1)
|
|
||||||
def get_binary_provider_plugins() -> List[str]:
|
|
||||||
"""Get plugin names that expose Binary hooks and act as provider plugins."""
|
|
||||||
providers = []
|
|
||||||
|
|
||||||
for plugin_dir in iter_plugin_dirs():
|
|
||||||
has_binary_hooks = any(plugin_dir.glob('on_Binary__*.*'))
|
|
||||||
if has_binary_hooks:
|
|
||||||
providers.append(plugin_dir.name)
|
|
||||||
|
|
||||||
return sorted(set(providers))
|
|
||||||
|
|
||||||
|
|
||||||
def get_parser_plugins() -> List[str]:
|
def get_parser_plugins() -> List[str]:
|
||||||
"""
|
"""
|
||||||
Get list of parser plugins by discovering parse_*_urls hooks.
|
Get list of parser plugins by discovering parse_*_urls hooks.
|
||||||
@@ -926,12 +913,10 @@ def get_plugin_special_config(plugin_name: str, config: Dict[str, Any]) -> Dict[
|
|||||||
plugins_whitelist = config.get('PLUGINS', '')
|
plugins_whitelist = config.get('PLUGINS', '')
|
||||||
if plugins_whitelist:
|
if plugins_whitelist:
|
||||||
# PLUGINS whitelist is specified - include transitive required_plugins from
|
# PLUGINS whitelist is specified - include transitive required_plugins from
|
||||||
# config.json as well as binary provider plugins. Provider plugins may also
|
# config.json so selecting a plugin also enables its declared plugin-level
|
||||||
# expose early on_Crawl hooks (e.g. npm -> install node/npm) that are
|
# dependencies (e.g. singlefile -> chrome).
|
||||||
# required before a selected extractor's Binary hooks can succeed.
|
|
||||||
plugin_configs = discover_plugin_configs()
|
plugin_configs = discover_plugin_configs()
|
||||||
plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()}
|
plugin_names = {p.strip().lower() for p in plugins_whitelist.split(',') if p.strip()}
|
||||||
plugin_names.update(provider.lower() for provider in get_binary_provider_plugins())
|
|
||||||
pending = list(plugin_names)
|
pending = list(plugin_names)
|
||||||
|
|
||||||
while pending:
|
while pending:
|
||||||
|
|||||||
@@ -263,8 +263,8 @@ class TestHookDiscovery(unittest.TestCase):
|
|||||||
hook_names = [hook.name for hook in hooks]
|
hook_names = [hook.name for hook in hooks]
|
||||||
self.assertIn('on_Binary__10_npm_install.py', hook_names)
|
self.assertIn('on_Binary__10_npm_install.py', hook_names)
|
||||||
|
|
||||||
def test_discover_crawl_hooks_keeps_binary_provider_dependencies_enabled(self):
|
def test_discover_crawl_hooks_only_include_declared_plugin_dependencies(self):
|
||||||
"""Provider crawl hooks should remain enabled when a whitelisted plugin depends on them transitively."""
|
"""Crawl hook discovery should include required_plugins without broadening to provider plugins."""
|
||||||
responses_dir = self.plugins_dir / 'responses'
|
responses_dir = self.plugins_dir / 'responses'
|
||||||
responses_dir.mkdir()
|
responses_dir.mkdir()
|
||||||
(responses_dir / 'config.json').write_text(
|
(responses_dir / 'config.json').write_text(
|
||||||
@@ -291,13 +291,12 @@ class TestHookDiscovery(unittest.TestCase):
|
|||||||
from archivebox import hooks as hooks_module
|
from archivebox import hooks as hooks_module
|
||||||
|
|
||||||
hooks_module.get_plugins.cache_clear()
|
hooks_module.get_plugins.cache_clear()
|
||||||
hooks_module.get_binary_provider_plugins.cache_clear()
|
|
||||||
with patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir), patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins'):
|
with patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir), patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins'):
|
||||||
hooks = hooks_module.discover_hooks('Crawl', config={'PLUGINS': 'responses'})
|
hooks = hooks_module.discover_hooks('Crawl', config={'PLUGINS': 'responses'})
|
||||||
|
|
||||||
hook_names = [hook.name for hook in hooks]
|
hook_names = [hook.name for hook in hooks]
|
||||||
self.assertIn('on_Crawl__70_chrome_install.finite.bg.py', hook_names)
|
self.assertIn('on_Crawl__70_chrome_install.finite.bg.py', hook_names)
|
||||||
self.assertIn('on_Crawl__00_npm_install.py', hook_names)
|
self.assertNotIn('on_Crawl__00_npm_install.py', hook_names)
|
||||||
|
|
||||||
|
|
||||||
class TestGetExtractorName(unittest.TestCase):
|
class TestGetExtractorName(unittest.TestCase):
|
||||||
|
|||||||
Reference in New Issue
Block a user