diff --git a/archivebox/cli/__init__.py b/archivebox/cli/__init__.py
index c0d35a54..675baabd 100644
--- a/archivebox/cli/__init__.py
+++ b/archivebox/cli/__init__.py
@@ -48,6 +48,8 @@ class ArchiveBoxGroup(click.Group):
         'server': 'archivebox.cli.archivebox_server.main',
         'shell': 'archivebox.cli.archivebox_shell.main',
         'manage': 'archivebox.cli.archivebox_manage.main',
+        # Introspection commands
+        'pluginmap': 'archivebox.cli.archivebox_pluginmap.main',
         # Worker command
         'worker': 'archivebox.cli.archivebox_worker.main',
     }
diff --git a/archivebox/cli/archivebox_pluginmap.py b/archivebox/cli/archivebox_pluginmap.py
new file mode 100644
index 00000000..b168a480
--- /dev/null
+++ b/archivebox/cli/archivebox_pluginmap.py
@@ -0,0 +1,367 @@
+#!/usr/bin/env python3
+
+__package__ = 'archivebox.cli'
+
+from typing import Optional
+from pathlib import Path
+
+import rich_click as click
+
+from archivebox.misc.util import docstring, enforce_types
+
+
+# State Machine ASCII Art Diagrams
+CRAWL_MACHINE_DIAGRAM = """
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                                CrawlMachine                                 │
+├─────────────────────────────────────────────────────────────────────────────┤
+│                                                                             │
+│   ┌─────────────┐                                                           │
+│   │   QUEUED    │◄────────────────┐                                         │
+│   │  (initial)  │                 │                                         │
+│   └──────┬──────┘                 │                                         │
+│          │                        │  tick() unless can_start()              │
+│          │ tick() when            │                                         │
+│          │ can_start()            │                                         │
+│          ▼                        │                                         │
+│   ┌─────────────┐                 │                                         │
+│   │   STARTED   │─────────────────┘                                         │
+│   │             │◄────────────────┐                                         │
+│   │  enter:     │                 │                                         │
+│   │  crawl.run()│                 │  tick() unless is_finished()            │
+│   │  (discover  │                 │                                         │
+│   │   Crawl     │─────────────────┘                                         │
+│   │   hooks)    │                                                           │
+│   └──────┬──────┘                                                           │
+│          │                                                                  │
+│          │ tick() when is_finished()                                        │
+│          ▼                                                                  │
+│   ┌─────────────┐                                                           │
+│   │   SEALED    │                                                           │
+│   │   (final)   │                                                           │
+│   │             │                                                           │
+│   │  enter:     │                                                           │
+│   │  cleanup()  │                                                           │
+│   └─────────────┘                                                           │
+│                                                                             │
+│   Hooks triggered: on_Crawl__* (during STARTED.enter via crawl.run())       │
+│                    on_CrawlEnd__* (during SEALED.enter via cleanup())       │
+└─────────────────────────────────────────────────────────────────────────────┘
+"""
+
+SNAPSHOT_MACHINE_DIAGRAM = """
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                               SnapshotMachine                               │
+├─────────────────────────────────────────────────────────────────────────────┤
+│                                                                             │
+│   ┌─────────────┐                                                           │
+│   │   QUEUED    │◄────────────────┐                                         │
+│   │  (initial)  │                 │                                         │
+│   └──────┬──────┘                 │                                         │
+│          │                        │  tick() unless can_start()              │
+│          │ tick() when            │                                         │
+│          │ can_start()            │                                         │
+│          ▼                        │                                         │
+│   ┌─────────────┐                 │                                         │
+│   │   STARTED   │─────────────────┘                                         │
+│   │             │◄────────────────┐                                         │
+│   │  enter:     │                 │                                         │
+│   │  snapshot   │                 │  tick() unless is_finished()            │
+│   │  .run()     │                 │                                         │
+│   │  (discover  │─────────────────┘                                         │
+│   │   Snapshot  │                                                           │
+│   │   hooks,    │                                                           │
+│   │   create    │                                                           │
+│   │   pending   │                                                           │
+│   │   results)  │                                                           │
+│   └──────┬──────┘                                                           │
+│          │                                                                  │
+│          │ tick() when is_finished()                                        │
+│          ▼                                                                  │
+│   ┌─────────────┐                                                           │
+│   │   SEALED    │                                                           │
+│   │   (final)   │                                                           │
+│   │             │                                                           │
+│   │  enter:     │                                                           │
+│   │  cleanup()  │                                                           │
+│   └─────────────┘                                                           │
+│                                                                             │
+│   Hooks triggered: on_Snapshot__* (creates ArchiveResults in STARTED.enter) │
+└─────────────────────────────────────────────────────────────────────────────┘
+"""
+
+ARCHIVERESULT_MACHINE_DIAGRAM = """
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                            ArchiveResultMachine                             │
+├─────────────────────────────────────────────────────────────────────────────┤
+│                                                                             │
+│   ┌─────────────┐                                                           │
+│   │   QUEUED    │◄────────────────┐                                         │
+│   │  (initial)  │                 │                                         │
+│   └──────┬──────┘                 │                                         │
+│          │                        │  tick() unless can_start()              │
+│          │ tick() when            │                                         │
+│          │ can_start()            │                                         │
+│          ▼                        │                                         │
+│   ┌─────────────┐                 │                                         │
+│   │   STARTED   │─────────────────┘                                         │
+│   │             │◄────────────────┐                                         │
+│   │  enter:     │                 │  tick() unless is_finished()            │
+│   │ result.run()│─────────────────┘                                         │
+│   │  (execute   │                                                           │
+│   │   hook via  │                                                           │
+│   │  run_hook())│                                                           │
+│   └──────┬──────┘                                                           │
+│          │                                                                  │
+│          │ tick() checks status set by hook output                          │
+│          ├────────────────┬────────────────┬────────────────┐               │
+│          ▼                ▼                ▼                ▼               │
+│    ┌───────────┐    ┌───────────┐    ┌───────────┐    ┌───────────┐         │
+│    │ SUCCEEDED │    │  FAILED   │    │  SKIPPED  │    │  BACKOFF  │         │
+│    │  (final)  │    │  (final)  │    │  (final)  │    │           │         │
+│    └───────────┘    └───────────┘    └───────────┘    └─────┬─────┘         │
+│                                                             │               │
+│                                               can_start()───┘               │
+│                                               loops back to STARTED         │
+│                                                                             │
+│   Each ArchiveResult runs ONE specific hook (stored in .hook_name field)    │
+└─────────────────────────────────────────────────────────────────────────────┘
+"""
+
+BINARY_MACHINE_DIAGRAM = """
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                                BinaryMachine                                │
+├─────────────────────────────────────────────────────────────────────────────┤
+│                                                                             │
+│   ┌─────────────┐                                                           │
+│   │   QUEUED    │◄────────────────┐                                         │
+│   │  (initial)  │                 │                                         │
+│   └──────┬──────┘                 │                                         │
+│          │                        │  tick() unless can_start()              │
+│          │ tick() when            │                                         │
+│          │ can_start()            │                                         │
+│          ▼                        │                                         │
+│   ┌─────────────┐                 │                                         │
+│   │   STARTED   │─────────────────┘                                         │
+│   │             │◄────────────────┐                                         │
+│   │  enter:     │                 │                                         │
+│   │ binary.run()│                 │  tick() unless is_finished()            │
+│   │  (discover  │─────────────────┘                                         │
+│   │   Binary    │                                                           │
+│   │   hooks,    │                                                           │
+│   │   try each  │                                                           │
+│   │   provider) │                                                           │
+│   └──────┬──────┘                                                           │
+│          │                                                                  │
+│          │ tick() checks status set by hook output                          │
+│          ├────────────────────────────────┐                                 │
+│          ▼                                ▼                                 │
+│   ┌─────────────┐                  ┌─────────────┐                          │
+│   │  SUCCEEDED  │                  │   FAILED    │                          │
+│   │   (final)   │                  │   (final)   │                          │
+│   │             │                  │             │                          │
+│   │  abspath,   │                  │ no provider │                          │
+│   │  version set│                  │ succeeded   │                          │
+│   └─────────────┘                  └─────────────┘                          │
+│                                                                             │
+│   Hooks triggered: on_Binary__* (provider hooks during STARTED.enter)       │
+│   Providers tried in sequence until one succeeds: apt, brew, pip, npm, etc. │
+└─────────────────────────────────────────────────────────────────────────────┘
+"""
+
+
+# NOTE: pluginmap()'s docstring is reused via @docstring(...) on main() below
+# (presumably as CLI help text), so argument notes are kept in comments here:
+#   show_disabled: forwarded (negated) as discover_hooks(filter_disabled=...)
+#   model:         optional model name (any letter case) to restrict the map to
+#   quiet:         skip the diagrams, tables and summary; only build the dict
+# Returns the collected dict, or {} when the requested model is unknown.
+@enforce_types
+def pluginmap(
+    show_disabled: bool = False,
+    model: Optional[str] = None,
+    quiet: bool = False,
+) -> dict:
+    """
+    Show a map of all state machines and their associated plugin hooks.
+
+    Displays ASCII art diagrams of the core model state machines (Crawl, Snapshot,
+    ArchiveResult, Binary) and lists all auto-detected on_Modelname_xyz hooks
+    that will run for each model's transitions.
+    """
+    from rich.console import Console
+    from rich.table import Table
+    from rich.panel import Panel
+    from rich import box
+
+    from archivebox.hooks import (
+        discover_hooks,
+        extract_step,
+        is_background_hook,
+        BUILTIN_PLUGINS_DIR,
+        USER_PLUGINS_DIR,
+    )
+
+    console = Console()
+    prnt = console.print
+
+    # Model event types that can have hooks
+    model_events = {
+        'Crawl': {
+            'description': 'Hooks run when a Crawl starts (QUEUED→STARTED)',
+            'machine': 'CrawlMachine',
+            'diagram': CRAWL_MACHINE_DIAGRAM,
+        },
+        'CrawlEnd': {
+            'description': 'Hooks run when a Crawl finishes (STARTED→SEALED)',
+            'machine': 'CrawlMachine',
+            'diagram': None,  # Part of CrawlMachine
+        },
+        'Snapshot': {
+            'description': 'Hooks run for each Snapshot (creates ArchiveResults)',
+            'machine': 'SnapshotMachine',
+            'diagram': SNAPSHOT_MACHINE_DIAGRAM,
+        },
+        'Binary': {
+            'description': 'Hooks for installing binary dependencies (providers)',
+            'machine': 'BinaryMachine',
+            'diagram': BINARY_MACHINE_DIAGRAM,
+        },
+    }
+
+    # Filter to specific model if requested. Match case-insensitively against
+    # the canonical names: str.title() would turn 'CrawlEnd' into 'Crawlend',
+    # which made the documented `--model CrawlEnd` filter impossible to select.
+    if model:
+        canonical_names = {name.lower(): name for name in model_events}
+        canonical = canonical_names.get(model.strip().lower())
+        if canonical is None:
+            prnt(f'[red]Error: Unknown model "{model}". Available: {", ".join(model_events.keys())}[/red]')
+            return {}
+        model_events = {canonical: model_events[canonical]}
+
+    result = {
+        'models': {},
+        'plugins_dir': str(BUILTIN_PLUGINS_DIR),
+        'user_plugins_dir': str(USER_PLUGINS_DIR),
+    }
+
+    if not quiet:
+        prnt()
+        prnt('[bold cyan]ArchiveBox Plugin Map[/bold cyan]')
+        prnt(f'[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]')
+        prnt(f'[dim]User plugins: {USER_PLUGINS_DIR}[/dim]')
+        prnt()
+
+    # Show diagrams first (unless quiet mode)
+    if not quiet:
+        # Show ArchiveResult diagram separately since it's different
+        prnt(Panel(
+            ARCHIVERESULT_MACHINE_DIAGRAM,
+            title='[bold green]ArchiveResultMachine[/bold green]',
+            border_style='green',
+            expand=False,
+        ))
+        prnt()
+
+    for event_name, info in model_events.items():
+        # Discover hooks for this event
+        hooks = discover_hooks(event_name, filter_disabled=not show_disabled)
+
+        # Build hook info list
+        hook_infos = []
+        for hook_path in hooks:
+            # Get plugin name from parent directory (e.g., 'wget' from 'plugins/wget/on_Snapshot__61_wget.py')
+            plugin_name = hook_path.parent.name
+            step = extract_step(hook_path.name)
+            is_bg = is_background_hook(hook_path.name)
+
+            hook_infos.append({
+                'path': str(hook_path),
+                'name': hook_path.name,
+                'plugin': plugin_name,
+                'step': step,
+                'is_background': is_bg,
+                'extension': hook_path.suffix,
+            })
+
+        result['models'][event_name] = {
+            'description': info['description'],
+            'machine': info['machine'],
+            'hooks': hook_infos,
+            'hook_count': len(hook_infos),
+        }
+
+        if not quiet:
+            # Show diagram if this model has one
+            if info.get('diagram'):
+                prnt(Panel(
+                    info['diagram'],
+                    title=f'[bold green]{info["machine"]}[/bold green]',
+                    border_style='green',
+                    expand=False,
+                ))
+                prnt()
+
+            # Create hooks table
+            table = Table(
+                title=f'[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hooks)} found)',
+                box=box.ROUNDED,
+                show_header=True,
+                header_style='bold magenta',
+            )
+            table.add_column('Step', justify='center', width=6)
+            table.add_column('Plugin', style='cyan', width=20)
+            table.add_column('Hook Name', style='green')
+            table.add_column('BG', justify='center', width=4)
+            table.add_column('Type', justify='center', width=5)
+
+            # Sort by step then by name
+            sorted_hooks = sorted(hook_infos, key=lambda h: (h['step'], h['name']))
+
+            for hook in sorted_hooks:
+                bg_marker = '[yellow]bg[/yellow]' if hook['is_background'] else ''
+                ext = hook['extension'].lstrip('.')
+                table.add_row(
+                    str(hook['step']),
+                    hook['plugin'],
+                    hook['name'],
+                    bg_marker,
+                    ext,
+                )
+
+            prnt(table)
+            prnt()
+            prnt(f'[dim]{info["description"]}[/dim]')
+            prnt()
+
+    # Summary
+    if not quiet:
+        total_hooks = sum(m['hook_count'] for m in result['models'].values())
+        prnt(f'[bold]Total hooks discovered: {total_hooks}[/bold]')
+        prnt()
+        prnt('[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]')
+        prnt('[dim]  - XX: Two-digit order (first digit = step 0-9)[/dim]')
+        prnt('[dim]  - .bg: Background hook (non-blocking)[/dim]')
+        prnt('[dim]  - ext: py, sh, or js[/dim]')
+        prnt()
+
+    return result
+
+
+@click.command()
+@click.option('--show-disabled', '-a', is_flag=True, help='Show hooks from disabled plugins too')
+@click.option('--model', '-m', type=str, default=None, help='Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)')
+@click.option('--quiet', '-q', is_flag=True, help='Output JSON only, no ASCII diagrams')
+@docstring(pluginmap.__doc__)
+def main(**kwargs):
+    # pluginmap() prints the rich view itself unless --quiet is set; in quiet
+    # mode the collected result is emitted as JSON on stdout instead.
+    import json
+    result = pluginmap(**kwargs)
+    if kwargs.get('quiet'):
+        print(json.dumps(result, indent=2))
+
+
+if __name__ == '__main__':
+    main()