Files
ArchiveBox/archivebox/cli/archivebox_pluginmap.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

310 lines
18 KiB
Python

#!/usr/bin/env python3
__package__ = "archivebox.cli"
import rich_click as click
from archivebox.misc.util import docstring, enforce_types
# State Machine ASCII Art Diagrams
# CrawlMachine lifecycle (QUEUED -> STARTED -> SEALED) drawn as box art.
# pluginmap() attaches this string to the "Crawl" model entry and prints it
# inside a rich Panel, so any edit to the text changes the CLI output verbatim.
# The "CrawlEnd" model entry has no diagram of its own (it is part of this machine).
CRAWL_MACHINE_DIAGRAM = """
┌─────────────────────────────────────────────────────────────────────────────┐
│ CrawlMachine │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
│ │ QUEUED │◄────────────────┐ │
│ │ (initial) │ │ │
│ └──────┬──────┘ │ │
│ │ │ tick() unless can_start() │
│ │ tick() when │ │
│ │ can_start() │ │
│ ▼ │ │
│ ┌─────────────┐ │ │
│ │ STARTED │─────────────────┘ │
│ │ │◄────────────────┐ │
│ │ enter: │ │ │
│ │ crawl.run()│ │ tick() unless is_finished() │
│ │ (discover │ │ │
│ │ Crawl │─────────────────┘ │
│ │ hooks) │ │
│ └──────┬──────┘ │
│ │ │
│ │ tick() when is_finished() │
│ ▼ │
│ ┌─────────────┐ │
│ │ SEALED │ │
│ │ (final) │ │
│ │ │ │
│ │ enter: │ │
│ │ cleanup() │ │
│ └─────────────┘ │
│ │
│ Hooks triggered: on_Crawl__* (during STARTED.enter via crawl.run()) │
│ on_CrawlEnd__* (during SEALED.enter via cleanup()) │
└─────────────────────────────────────────────────────────────────────────────┘
"""
# SnapshotMachine lifecycle (QUEUED -> STARTED -> SEALED) drawn as box art.
# pluginmap() attaches this string to the "Snapshot" model entry and prints it
# inside a rich Panel, so any edit to the text changes the CLI output verbatim.
SNAPSHOT_MACHINE_DIAGRAM = """
┌─────────────────────────────────────────────────────────────────────────────┐
│ SnapshotMachine │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
│ │ QUEUED │◄────────────────┐ │
│ │ (initial) │ │ │
│ └──────┬──────┘ │ │
│ │ │ tick() unless can_start() │
│ │ tick() when │ │
│ │ can_start() │ │
│ ▼ │ │
│ ┌─────────────┐ │ │
│ │ STARTED │─────────────────┘ │
│ │ │◄────────────────┐ │
│ │ enter: │ │ │
│ │ snapshot │ │ tick() unless is_finished() │
│ │ .run() │ │ │
│ │ (discover │─────────────────┘ │
│ │ Snapshot │ │
│ │ hooks, │ │
│ │ create │ │
│ │ pending │ │
│ │ results) │ │
│ └──────┬──────┘ │
│ │ │
│ │ tick() when is_finished() │
│ ▼ │
│ ┌─────────────┐ │
│ │ SEALED │ │
│ │ (final) │ │
│ │ │ │
│ │ enter: │ │
│ │ cleanup() │ │
│ └─────────────┘ │
│ │
│ Hooks triggered: on_Snapshot__* (creates ArchiveResults in STARTED.enter) │
└─────────────────────────────────────────────────────────────────────────────┘
"""
# BinaryMachine lifecycle (QUEUED -> INSTALLED, no intermediate STARTED state)
# drawn as box art. pluginmap() attaches this string to the "Binary" model
# entry and prints it inside a rich Panel, so any edit to the text changes the
# CLI output verbatim.
BINARY_MACHINE_DIAGRAM = """
┌─────────────────────────────────────────────────────────────────────────────┐
│ BinaryMachine │
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ ┌─────────────┐ │
│ │ QUEUED │◄────────────────┐ │
│ │ (initial) │ │ │
│ └──────┬──────┘ │ │
│ │ │ tick() unless can_install() │
│ │ │ (stays queued if failed) │
│ │ tick() when │ │
│ │ can_install() │ │
│ │ │ │
│ │ on_install() runs │ │
│ │ during transition: │ │
│ │ • binary.run() │ │
│ │ (discover Binary │ │
│ │ hooks, try each │ │
│ │ provider until │ │
│ │ one succeeds) │ │
│ │ • Sets abspath, │ │
│ │ version, sha256 │ │
│ │ │ │
│ │ If install fails: │ │
│ │ raises exception──────┘ │
│ │ (retry_at bumped) │
│ │ │
│ ▼ │
│ ┌─────────────┐ │
│ │ INSTALLED │ │
│ │ (final) │ │
│ │ │ │
│ │ Binary is │ │
│ │ ready to │ │
│ │ use │ │
│ └─────────────┘ │
│ │
│ Hooks triggered: on_Binary__* (provider hooks during transition) │
│ Providers tried in sequence until one succeeds: apt, brew, pip, npm, etc. │
│ Installation is synchronous - no intermediate STARTED state │
└─────────────────────────────────────────────────────────────────────────────┘
"""
@enforce_types
def pluginmap(
    show_disabled: bool = False,
    model: str | None = None,
    quiet: bool = False,
) -> dict:
    """
    Show a map of all state machines and their associated plugin hooks.

    Displays ASCII art diagrams of the core queued model state machines (Crawl,
    Snapshot, Binary) and lists all auto-detected on_Modelname_xyz hooks
    that will run for each model's transitions.
    """
    # NOTE: the docstring above is reused as the CLI help text through the
    # @docstring(pluginmap.__doc__) decorator on main(), so parameter docs are
    # kept in this comment rather than in the docstring itself.
    #
    #   show_disabled: forwarded to discover_hooks() as
    #                  filter_disabled=not show_disabled.
    #   model:         optional model/event name (Crawl, CrawlEnd, Snapshot,
    #                  Binary), matched case-insensitively; None/"" = all.
    #   quiet:         when True nothing is rendered for a successful lookup;
    #                  the result dict is only built and returned.
    #
    # Returns a dict with keys "models", "plugins_dir" and "user_plugins_dir",
    # or an empty dict when `model` names an unknown model (an error line is
    # printed in that case, regardless of `quiet`).
    from rich.console import Console
    from rich.table import Table
    from rich.panel import Panel
    from rich import box
    from archivebox.hooks import (
        discover_hooks,
        is_background_hook,
        BUILTIN_PLUGINS_DIR,
        USER_PLUGINS_DIR,
    )

    console = Console()
    prnt = console.print

    # Model event types that can have hooks
    model_events = {
        "Crawl": {
            "description": "Hooks run when a Crawl starts (QUEUED→STARTED)",
            "machine": "CrawlMachine",
            "diagram": CRAWL_MACHINE_DIAGRAM,
        },
        "CrawlEnd": {
            "description": "Hooks run when a Crawl finishes (STARTED→SEALED)",
            "machine": "CrawlMachine",
            "diagram": None,  # Part of CrawlMachine
        },
        "Snapshot": {
            "description": "Hooks run for each Snapshot (creates ArchiveResults)",
            "machine": "SnapshotMachine",
            "diagram": SNAPSHOT_MACHINE_DIAGRAM,
        },
        "Binary": {
            "description": "Hooks for installing binary dependencies (providers)",
            "machine": "BinaryMachine",
            "diagram": BINARY_MACHINE_DIAGRAM,
        },
    }

    # Filter to specific model if requested
    if model:
        # Resolve the requested name case-insensitively against the canonical
        # keys. str.title() cannot be used for this: it lowercases interior
        # capitals ("CrawlEnd" -> "Crawlend"), which made the CrawlEnd model
        # impossible to select even though the CLI help advertises it.
        canonical_by_lower = {name.lower(): name for name in model_events}
        canonical = canonical_by_lower.get(model.lower())
        if canonical is None:
            prnt(f'[red]Error: Unknown model "{model}". Available: {", ".join(model_events.keys())}[/red]')
            return {}
        model_events = {canonical: model_events[canonical]}

    result = {
        "models": {},
        "plugins_dir": str(BUILTIN_PLUGINS_DIR),
        "user_plugins_dir": str(USER_PLUGINS_DIR),
    }

    if not quiet:
        prnt()
        prnt("[bold cyan]ArchiveBox Plugin Map[/bold cyan]")
        prnt(f"[dim]Built-in plugins: {BUILTIN_PLUGINS_DIR}[/dim]")
        prnt(f"[dim]User plugins: {USER_PLUGINS_DIR}[/dim]")
        prnt()

    for event_name, info in model_events.items():
        # Discover hooks for this event
        hooks = discover_hooks(event_name, filter_disabled=not show_disabled)

        # Build hook info list. The plugin name is the hook file's parent
        # directory (e.g., 'wget' from 'plugins/wget/on_Snapshot__06_wget.bg.py').
        hook_infos = [
            {
                "path": str(hook_path),
                "name": hook_path.name,
                "plugin": hook_path.parent.name,
                "is_background": is_background_hook(hook_path.name),
                "extension": hook_path.suffix,
            }
            for hook_path in hooks
        ]

        result["models"][event_name] = {
            "description": info["description"],
            "machine": info["machine"],
            "hooks": hook_infos,
            "hook_count": len(hook_infos),
        }

        if quiet:
            continue

        # Show diagram if this model has one (CrawlEnd shares CrawlMachine's
        # diagram and therefore carries None here).
        diagram = info.get("diagram")
        if diagram:
            prnt(
                Panel(
                    diagram,
                    title=f"[bold green]{info['machine']}[/bold green]",
                    border_style="green",
                    expand=False,
                ),
            )
            prnt()

        # Create hooks table; the count in the title is the same number that
        # is reported as "hook_count" in the returned dict.
        table = Table(
            title=f"[bold yellow]on_{event_name}__* Hooks[/bold yellow] ({len(hook_infos)} found)",
            box=box.ROUNDED,
            show_header=True,
            header_style="bold magenta",
        )
        table.add_column("Plugin", style="cyan", width=20)
        table.add_column("Hook Name", style="green")
        table.add_column("BG", justify="center", width=4)
        table.add_column("Type", justify="center", width=5)

        # Sort lexicographically by hook name
        for hook in sorted(hook_infos, key=lambda h: h["name"]):
            bg_marker = "[yellow]bg[/yellow]" if hook["is_background"] else ""
            table.add_row(
                hook["plugin"],
                hook["name"],
                bg_marker,
                hook["extension"].lstrip("."),
            )

        prnt(table)
        prnt()
        prnt(f"[dim]{info['description']}[/dim]")
        prnt()

    # Summary
    if not quiet:
        total_hooks = sum(m["hook_count"] for m in result["models"].values())
        prnt(f"[bold]Total hooks discovered: {total_hooks}[/bold]")
        prnt()
        prnt("[dim]Hook naming convention: on_{Model}__{XX}_{description}[.bg].{ext}[/dim]")
        prnt("[dim] - XX: Two-digit lexicographic order (00-99)[/dim]")
        prnt("[dim] - .bg: Background hook (non-blocking)[/dim]")
        prnt("[dim] - ext: py, sh, or js[/dim]")
        prnt()

    return result
@click.command()
@click.option("--show-disabled", "-a", is_flag=True, help="Show hooks from disabled plugins too")
@click.option("--model", "-m", type=str, default=None, help="Filter to specific model (Crawl, Snapshot, Binary, CrawlEnd)")
@click.option("--quiet", "-q", is_flag=True, help="Output JSON only, no ASCII diagrams")
@docstring(pluginmap.__doc__)
def main(**kwargs):
    # CLI wrapper: the parsed click options are forwarded unchanged to
    # pluginmap(). The help text comes from pluginmap's docstring via the
    # @docstring decorator above, so no docstring is defined here.
    import json

    plugin_map = pluginmap(**kwargs)

    # In normal mode pluginmap() has already rendered everything itself.
    if not kwargs.get("quiet"):
        return

    # Quiet mode: emit the collected data as indented JSON on stdout.
    print(json.dumps(plugin_map, indent=2))


# Run the click command when this file is executed directly as a script.
if __name__ == "__main__":
    main()