mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Update abx dependencies and plugin test harness
This commit is contained in:
@@ -147,8 +147,8 @@ class AddLinkForm(forms.Form):
|
||||
'screenshot', 'seo', 'singlefile', 'ssl', 'staticfile', 'title'
|
||||
}
|
||||
archiving = {
|
||||
'archivedotorg', 'favicon', 'forumdl', 'gallerydl', 'git',
|
||||
'htmltotext', 'media', 'mercury', 'papersdl', 'readability', 'wget'
|
||||
'archivedotorg', 'defuddle', 'favicon', 'forumdl', 'gallerydl', 'git',
|
||||
'htmltotext', 'mercury', 'papersdl', 'readability', 'trafilatura', 'wget', 'ytdlp'
|
||||
}
|
||||
parsing = {
|
||||
'parse_html_urls', 'parse_jsonl_urls',
|
||||
|
||||
@@ -2185,7 +2185,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
# Snapshot State Machine
|
||||
# =============================================================================
|
||||
|
||||
class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
class SnapshotMachine(BaseStateMachine):
|
||||
"""
|
||||
State machine for managing Snapshot lifecycle.
|
||||
|
||||
@@ -3074,7 +3074,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
||||
# ArchiveResult State Machine
|
||||
# =============================================================================
|
||||
|
||||
class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
class ArchiveResultMachine(BaseStateMachine):
|
||||
"""
|
||||
State machine for managing ArchiveResult (single plugin execution) lifecycle.
|
||||
|
||||
|
||||
@@ -506,7 +506,7 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
|
||||
# State Machines
|
||||
# =============================================================================
|
||||
|
||||
class CrawlMachine(BaseStateMachine, strict_states=True):
|
||||
class CrawlMachine(BaseStateMachine):
|
||||
"""
|
||||
State machine for managing Crawl lifecycle.
|
||||
|
||||
|
||||
@@ -22,13 +22,13 @@ Execution order:
|
||||
- Failed extractors don't block subsequent extractors
|
||||
|
||||
Hook Naming Convention:
|
||||
on_{ModelName}__{run_order}_{description}[.bg].{ext}
|
||||
on_{ModelName}__{run_order}_{description}[.finite.bg|.daemon.bg].{ext}
|
||||
|
||||
Examples:
|
||||
on_Snapshot__00_setup.py # runs first
|
||||
on_Snapshot__10_chrome_tab.bg.js # background (doesn't block)
|
||||
on_Snapshot__10_chrome_tab.daemon.bg.js # background (doesn't block)
|
||||
on_Snapshot__50_screenshot.js # foreground (blocks)
|
||||
on_Snapshot__63_media.bg.py # background (long-running)
|
||||
on_Snapshot__63_media.finite.bg.py # background (long-running)
|
||||
|
||||
Dependency handling:
|
||||
Extractor plugins that depend on other plugins' output should check at runtime:
|
||||
@@ -108,19 +108,34 @@ def is_background_hook(hook_name: str) -> bool:
|
||||
Background hooks have '.bg.' in their filename before the extension.
|
||||
|
||||
Args:
|
||||
hook_name: Hook filename (e.g., 'on_Snapshot__10_chrome_tab.bg.js')
|
||||
hook_name: Hook filename (e.g., 'on_Snapshot__10_chrome_tab.daemon.bg.js')
|
||||
|
||||
Returns:
|
||||
True if background hook, False if foreground.
|
||||
|
||||
Examples:
|
||||
is_background_hook('on_Snapshot__10_chrome_tab.bg.js') -> True
|
||||
is_background_hook('on_Snapshot__10_chrome_tab.daemon.bg.js') -> True
|
||||
is_background_hook('on_Snapshot__50_wget.py') -> False
|
||||
is_background_hook('on_Snapshot__63_media.bg.py') -> True
|
||||
is_background_hook('on_Snapshot__63_media.finite.bg.py') -> True
|
||||
"""
|
||||
return '.bg.' in hook_name or '__background' in hook_name
|
||||
|
||||
|
||||
def iter_plugin_dirs() -> List[Path]:
    """
    Collect every built-in and user plugin directory.

    Scans BUILTIN_PLUGINS_DIR and then USER_PLUGINS_DIR (in that order) and
    returns their immediate subdirectories, skipping any whose name starts
    with an underscore.

    Returns:
        Plugin directory paths. Built-in plugins come before user plugins, and
        entries within each base directory are sorted by path so the result
        does not depend on filesystem listing order.
    """
    plugin_dirs: List[Path] = []

    for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
        # is_dir() (rather than exists()) also skips a base path that exists
        # but is a regular file, which would otherwise make iterdir() raise.
        if not base_dir.is_dir():
            continue

        # iterdir() yields children in arbitrary order; sort for determinism.
        plugin_dirs.extend(
            sorted(
                child
                for child in base_dir.iterdir()
                if child.is_dir() and not child.name.startswith('_')
            )
        )

    return plugin_dirs
|
||||
|
||||
|
||||
class HookResult(TypedDict, total=False):
|
||||
"""Raw result from run_hook()."""
|
||||
returncode: int
|
||||
@@ -420,7 +435,7 @@ def run_hook(
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Detect if this is a background hook (long-running daemon)
|
||||
# New convention: .bg. suffix (e.g., on_Snapshot__21_consolelog.bg.js)
|
||||
# Background hooks use the .daemon.bg. or .finite.bg. filename convention.
|
||||
# Old convention: __background in stem (for backwards compatibility)
|
||||
is_background = '.bg.' in script.name or '__background' in script.stem
|
||||
|
||||
@@ -581,28 +596,20 @@ def run_hooks(
|
||||
@lru_cache(maxsize=1)
|
||||
def get_plugins() -> List[str]:
|
||||
"""
|
||||
Get list of available plugins by discovering Snapshot hooks.
|
||||
Get list of available plugins by discovering plugin directories.
|
||||
|
||||
Returns plugin names (directory names) that contain on_Snapshot hooks.
|
||||
The plugin name is the plugin directory name, not the hook script name.
|
||||
|
||||
Example:
|
||||
abx_plugins/plugins/chrome/on_Snapshot__10_chrome_tab.bg.js
|
||||
-> plugin = 'chrome'
|
||||
|
||||
Sorted alphabetically (plugins control their hook order via numeric prefixes in hook names).
|
||||
Returns plugin directory names for any plugin that exposes hooks, config.json,
|
||||
or a standardized templates/icon.html asset. This includes non-extractor
|
||||
plugins such as binary providers and shared base plugins.
|
||||
"""
|
||||
plugins = []
|
||||
|
||||
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||
if not base_dir.exists():
|
||||
continue
|
||||
|
||||
for ext in ('sh', 'py', 'js'):
|
||||
for hook_path in base_dir.glob(f'*/on_Snapshot__*.{ext}'):
|
||||
# Use plugin directory name as plugin name
|
||||
plugin_name = hook_path.parent.name
|
||||
plugins.append(plugin_name)
|
||||
for plugin_dir in iter_plugin_dirs():
|
||||
has_hooks = any(plugin_dir.glob('on_*__*.*'))
|
||||
has_config = (plugin_dir / 'config.json').exists()
|
||||
has_icon = (plugin_dir / 'templates' / 'icon.html').exists()
|
||||
if has_hooks or has_config or has_icon:
|
||||
plugins.append(plugin_dir.name)
|
||||
|
||||
return sorted(set(plugins))
|
||||
|
||||
@@ -808,37 +815,31 @@ def discover_plugin_configs() -> Dict[str, Dict[str, Any]]:
|
||||
"""
|
||||
configs = {}
|
||||
|
||||
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||
if not base_dir.exists():
|
||||
for plugin_dir in iter_plugin_dirs():
|
||||
|
||||
config_path = plugin_dir / 'config.json'
|
||||
if not config_path.exists():
|
||||
continue
|
||||
|
||||
for plugin_dir in base_dir.iterdir():
|
||||
if not plugin_dir.is_dir():
|
||||
try:
|
||||
with open(config_path, 'r') as f:
|
||||
schema = json.load(f)
|
||||
|
||||
# Basic validation: must be an object with properties
|
||||
if not isinstance(schema, dict):
|
||||
continue
|
||||
if schema.get('type') != 'object':
|
||||
continue
|
||||
if 'properties' not in schema:
|
||||
continue
|
||||
|
||||
config_path = plugin_dir / 'config.json'
|
||||
if not config_path.exists():
|
||||
continue
|
||||
configs[plugin_dir.name] = schema
|
||||
|
||||
try:
|
||||
with open(config_path, 'r') as f:
|
||||
schema = json.load(f)
|
||||
|
||||
# Basic validation: must be an object with properties
|
||||
if not isinstance(schema, dict):
|
||||
continue
|
||||
if schema.get('type') != 'object':
|
||||
continue
|
||||
if 'properties' not in schema:
|
||||
continue
|
||||
|
||||
configs[plugin_dir.name] = schema
|
||||
|
||||
except (json.JSONDecodeError, OSError) as e:
|
||||
# Log warning but continue - malformed config shouldn't break discovery
|
||||
import sys
|
||||
print(f"Warning: Failed to load config.json from {plugin_dir.name}: {e}", file=sys.stderr)
|
||||
continue
|
||||
except (json.JSONDecodeError, OSError) as e:
|
||||
# Log warning but continue - malformed config shouldn't break discovery
|
||||
import sys
|
||||
print(f"Warning: Failed to load config.json from {plugin_dir.name}: {e}", file=sys.stderr)
|
||||
continue
|
||||
|
||||
return configs
|
||||
|
||||
@@ -1002,20 +1003,13 @@ def get_plugin_template(plugin: str, template_name: str, fallback: bool = True)
|
||||
if base_name in ('yt-dlp', 'youtube-dl'):
|
||||
base_name = 'ytdlp'
|
||||
|
||||
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||
if not base_dir.exists():
|
||||
continue
|
||||
for plugin_dir in iter_plugin_dirs():
|
||||
|
||||
# Look for plugin directory matching plugin name
|
||||
for plugin_dir in base_dir.iterdir():
|
||||
if not plugin_dir.is_dir():
|
||||
continue
|
||||
|
||||
# Match by directory name (exact or partial)
|
||||
if plugin_dir.name == base_name or plugin_dir.name.endswith(f'_{base_name}'):
|
||||
template_path = plugin_dir / 'templates' / f'{template_name}.html'
|
||||
if template_path.exists():
|
||||
return template_path.read_text()
|
||||
# Match by directory name (exact or partial)
|
||||
if plugin_dir.name == base_name or plugin_dir.name.endswith(f'_{base_name}'):
|
||||
template_path = plugin_dir / 'templates' / f'{template_name}.html'
|
||||
if template_path.exists():
|
||||
return template_path.read_text()
|
||||
|
||||
# Fall back to default template if requested
|
||||
if fallback:
|
||||
@@ -1068,25 +1062,19 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
|
||||
"""
|
||||
templates: Dict[str, Dict[str, str]] = {}
|
||||
|
||||
for base_dir in (BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR):
|
||||
if not base_dir.exists():
|
||||
for plugin_dir in iter_plugin_dirs():
|
||||
|
||||
templates_dir = plugin_dir / 'templates'
|
||||
if not templates_dir.exists():
|
||||
continue
|
||||
|
||||
for plugin_dir in base_dir.iterdir():
|
||||
if not plugin_dir.is_dir():
|
||||
continue
|
||||
plugin_templates = {}
|
||||
for template_file in templates_dir.glob('*.html'):
|
||||
template_name = template_file.stem # icon, card, full
|
||||
plugin_templates[template_name] = str(template_file)
|
||||
|
||||
templates_dir = plugin_dir / 'templates'
|
||||
if not templates_dir.exists():
|
||||
continue
|
||||
|
||||
plugin_templates = {}
|
||||
for template_file in templates_dir.glob('*.html'):
|
||||
template_name = template_file.stem # icon, card, full
|
||||
plugin_templates[template_name] = str(template_file)
|
||||
|
||||
if plugin_templates:
|
||||
templates[plugin_dir.name] = plugin_templates
|
||||
if plugin_templates:
|
||||
templates[plugin_dir.name] = plugin_templates
|
||||
|
||||
return templates
|
||||
|
||||
|
||||
@@ -169,7 +169,7 @@ class Migration(migrations.Migration):
|
||||
('modified_at', models.DateTimeField(auto_now=True)),
|
||||
('name', models.CharField(blank=True, db_index=True, default='', max_length=63)),
|
||||
('binproviders', models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127)),
|
||||
('overrides', models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")),
|
||||
('overrides', models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'install_args': ['pkg']}, ...}")),
|
||||
('binprovider', models.CharField(blank=True, default='', help_text='Provider that successfully installed this binary', max_length=31)),
|
||||
('abspath', models.CharField(blank=True, default='', max_length=255)),
|
||||
('version', models.CharField(blank=True, default='', max_length=32)),
|
||||
|
||||
@@ -227,7 +227,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
binproviders = models.CharField(max_length=127, default='env', null=False, blank=True,
|
||||
help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,env")
|
||||
overrides = models.JSONField(default=dict, blank=True,
|
||||
help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")
|
||||
help_text="Provider-specific overrides: {'apt': {'install_args': ['pkg']}, ...}")
|
||||
|
||||
# Installation results (populated after installation)
|
||||
binprovider = models.CharField(max_length=31, default='', null=False, blank=True,
|
||||
@@ -2042,7 +2042,7 @@ class Process(models.Model):
|
||||
# Binary State Machine
|
||||
# =============================================================================
|
||||
|
||||
class BinaryMachine(BaseStateMachine, strict_states=True):
|
||||
class BinaryMachine(BaseStateMachine):
|
||||
"""
|
||||
State machine for managing Binary installation lifecycle.
|
||||
|
||||
@@ -2133,7 +2133,7 @@ class BinaryMachine(BaseStateMachine, strict_states=True):
|
||||
# Process State Machine
|
||||
# =============================================================================
|
||||
|
||||
class ProcessMachine(BaseStateMachine, strict_states=True):
|
||||
class ProcessMachine(BaseStateMachine):
|
||||
"""
|
||||
State machine for managing Process (OS subprocess) lifecycle.
|
||||
|
||||
|
||||
@@ -129,7 +129,7 @@ class TestBinaryWorkerHooks:
|
||||
"""Tests for specific Binary hook providers."""
|
||||
|
||||
def test_env_provider_hook_detects_system_binary(self, initialized_archive):
|
||||
"""on_Binary__15_env_install.py hook detects system binaries."""
|
||||
"""on_Binary__15_env_discover.py hook detects system binaries."""
|
||||
binary_record = {
|
||||
'type': 'Binary',
|
||||
'name': 'python3',
|
||||
|
||||
@@ -27,39 +27,33 @@ class TestBackgroundHookDetection(unittest.TestCase):
|
||||
|
||||
def test_bg_js_suffix_detected(self):
|
||||
"""Hooks with .bg.js suffix should be detected as background."""
|
||||
script = Path('/path/to/on_Snapshot__21_consolelog.bg.js')
|
||||
is_background = '.bg.' in script.name or '__background' in script.stem
|
||||
self.assertTrue(is_background)
|
||||
from archivebox.hooks import is_background_hook
|
||||
self.assertTrue(is_background_hook('on_Snapshot__21_consolelog.daemon.bg.js'))
|
||||
|
||||
def test_bg_py_suffix_detected(self):
|
||||
"""Hooks with .bg.py suffix should be detected as background."""
|
||||
script = Path('/path/to/on_Snapshot__24_responses.bg.py')
|
||||
is_background = '.bg.' in script.name or '__background' in script.stem
|
||||
self.assertTrue(is_background)
|
||||
from archivebox.hooks import is_background_hook
|
||||
self.assertTrue(is_background_hook('on_Snapshot__24_responses.finite.bg.py'))
|
||||
|
||||
def test_bg_sh_suffix_detected(self):
|
||||
"""Hooks with .bg.sh suffix should be detected as background."""
|
||||
script = Path('/path/to/on_Snapshot__23_ssl.bg.sh')
|
||||
is_background = '.bg.' in script.name or '__background' in script.stem
|
||||
self.assertTrue(is_background)
|
||||
from archivebox.hooks import is_background_hook
|
||||
self.assertTrue(is_background_hook('on_Snapshot__23_ssl.daemon.bg.sh'))
|
||||
|
||||
def test_legacy_background_suffix_detected(self):
|
||||
"""Hooks with __background in stem should be detected (backwards compat)."""
|
||||
script = Path('/path/to/on_Snapshot__21_consolelog__background.js')
|
||||
is_background = '.bg.' in script.name or '__background' in script.stem
|
||||
self.assertTrue(is_background)
|
||||
from archivebox.hooks import is_background_hook
|
||||
self.assertTrue(is_background_hook('on_Snapshot__21_consolelog__background.js'))
|
||||
|
||||
def test_foreground_hook_not_detected(self):
|
||||
"""Hooks without .bg. or __background should NOT be detected as background."""
|
||||
script = Path('/path/to/on_Snapshot__11_favicon.js')
|
||||
is_background = '.bg.' in script.name or '__background' in script.stem
|
||||
self.assertFalse(is_background)
|
||||
from archivebox.hooks import is_background_hook
|
||||
self.assertFalse(is_background_hook('on_Snapshot__11_favicon.js'))
|
||||
|
||||
def test_foreground_py_hook_not_detected(self):
|
||||
"""Python hooks without .bg. should NOT be detected as background."""
|
||||
script = Path('/path/to/on_Snapshot__50_wget.py')
|
||||
is_background = '.bg.' in script.name or '__background' in script.stem
|
||||
self.assertFalse(is_background)
|
||||
from archivebox.hooks import is_background_hook
|
||||
self.assertFalse(is_background_hook('on_Snapshot__50_wget.py'))
|
||||
|
||||
|
||||
class TestJSONLParsing(unittest.TestCase):
|
||||
@@ -182,15 +176,15 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
wget_dir = self.plugins_dir / 'wget'
|
||||
wget_dir.mkdir()
|
||||
(wget_dir / 'on_Snapshot__50_wget.py').write_text('# test hook')
|
||||
(wget_dir / 'on_Crawl__00_install_wget.py').write_text('# install hook')
|
||||
(wget_dir / 'on_Crawl__10_wget_install.finite.bg.py').write_text('# install hook')
|
||||
|
||||
chrome_dir = self.plugins_dir / 'chrome'
|
||||
chrome_dir.mkdir()
|
||||
(chrome_dir / 'on_Snapshot__20_chrome_tab.bg.js').write_text('// background hook')
|
||||
(chrome_dir / 'on_Snapshot__20_chrome_tab.daemon.bg.js').write_text('// background hook')
|
||||
|
||||
consolelog_dir = self.plugins_dir / 'consolelog'
|
||||
consolelog_dir.mkdir()
|
||||
(consolelog_dir / 'on_Snapshot__21_consolelog.bg.js').write_text('// background hook')
|
||||
(consolelog_dir / 'on_Snapshot__21_consolelog.daemon.bg.js').write_text('// background hook')
|
||||
|
||||
def tearDown(self):
|
||||
"""Clean up test directory."""
|
||||
@@ -208,8 +202,8 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
|
||||
self.assertEqual(len(hooks), 3)
|
||||
hook_names = [h.name for h in hooks]
|
||||
self.assertIn('on_Snapshot__20_chrome_tab.bg.js', hook_names)
|
||||
self.assertIn('on_Snapshot__21_consolelog.bg.js', hook_names)
|
||||
self.assertIn('on_Snapshot__20_chrome_tab.daemon.bg.js', hook_names)
|
||||
self.assertIn('on_Snapshot__21_consolelog.daemon.bg.js', hook_names)
|
||||
self.assertIn('on_Snapshot__50_wget.py', hook_names)
|
||||
|
||||
def test_discover_hooks_sorted_by_name(self):
|
||||
@@ -222,10 +216,25 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
hooks = sorted(set(hooks), key=lambda p: p.name)
|
||||
|
||||
# Check numeric ordering
|
||||
self.assertEqual(hooks[0].name, 'on_Snapshot__20_chrome_tab.bg.js')
|
||||
self.assertEqual(hooks[1].name, 'on_Snapshot__21_consolelog.bg.js')
|
||||
self.assertEqual(hooks[0].name, 'on_Snapshot__20_chrome_tab.daemon.bg.js')
|
||||
self.assertEqual(hooks[1].name, 'on_Snapshot__21_consolelog.daemon.bg.js')
|
||||
self.assertEqual(hooks[2].name, 'on_Snapshot__50_wget.py')
|
||||
|
||||
def test_get_plugins_includes_non_snapshot_plugin_dirs(self):
    """get_plugins() should include binary-only plugins with standardized metadata."""
    # A plugin with only a Binary hook and a config.json (no on_Snapshot hooks).
    env_dir = self.plugins_dir / 'env'
    env_dir.mkdir()
    (env_dir / 'on_Binary__15_env_discover.py').write_text('# binary hook')
    (env_dir / 'config.json').write_text('{"type": "object", "properties": {}}')

    from archivebox import hooks as hooks_module

    # get_plugins() is memoized with lru_cache: drop any previously cached
    # result so the patched directories are actually scanned, and clear it
    # again on cleanup so the temp-dir plugin list cannot leak into later tests.
    hooks_module.get_plugins.cache_clear()
    self.addCleanup(hooks_module.get_plugins.cache_clear)

    with patch.object(hooks_module, 'BUILTIN_PLUGINS_DIR', self.plugins_dir), patch.object(hooks_module, 'USER_PLUGINS_DIR', self.test_dir / 'user_plugins'):
        plugins = hooks_module.get_plugins()

    self.assertIn('env', plugins)
|
||||
|
||||
|
||||
class TestGetExtractorName(unittest.TestCase):
|
||||
"""Test get_extractor_name() function."""
|
||||
|
||||
@@ -338,7 +338,7 @@ class BaseStateMachine(StateMachine):
|
||||
(e.g., 'snapshot', 'archiveresult', 'crawl', 'binary').
|
||||
|
||||
Example usage:
|
||||
class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
class SnapshotMachine(BaseStateMachine):
|
||||
model_attr_name = 'snapshot'
|
||||
|
||||
# States and transitions...
|
||||
|
||||
Reference in New Issue
Block a user