WIP: checkpoint working tree before rebasing onto dev

This commit is contained in:
Nick Sweeting
2026-03-22 20:23:45 -07:00
parent a6548df8d0
commit f400a2cd67
87 changed files with 12607 additions and 1808 deletions

View File

@@ -1,10 +1,13 @@
__package__ = 'archivebox.config'
import html
import json
import os
import shutil
import inspect
import re
from pathlib import Path
from typing import Any, Dict
from typing import Any, Callable, Dict
from urllib.parse import quote, urlencode
from django.http import HttpRequest
from django.utils import timezone
from django.utils.html import format_html
@@ -18,16 +21,27 @@ from archivebox.misc.util import parse_date
from archivebox.machine.models import Binary
# Base URLs used by the link-building helpers in this module.
# External plugin docs page; get_plugin_docs_url() appends "#<plugin_name>".
ABX_PLUGINS_DOCS_BASE_URL = 'https://archivebox.github.io/abx-plugins/'
# External plugin source tree; get_plugin_hook_source_url() appends "<plugin>/<hook>".
ABX_PLUGINS_GITHUB_BASE_URL = 'https://github.com/ArchiveBox/abx-plugins/tree/main/abx_plugins/plugins/'
# Site-relative admin paths; get_live_config_url() and
# get_environment_binary_url() append a URL-quoted key/name plus "/".
LIVE_CONFIG_BASE_URL = '/admin/environment/config/'
ENVIRONMENT_BINARIES_BASE_URL = '/admin/environment/binaries/'
# Fallback prefix for "<id>/change/" links built by get_installed_binary_change_url().
INSTALLED_BINARIES_BASE_URL = '/admin/machine/binary/'
# Common binaries to check for.
# NOTE(review): the two consecutive rows starting with 'node' overlap (the
# second is a subset of the first); this looks like a before/after pair left
# over from a diff — confirm which one should remain.
KNOWN_BINARIES = [
'wget', 'curl', 'chromium', 'chrome', 'google-chrome', 'google-chrome-stable',
'node', 'npm', 'npx', 'yt-dlp', 'ytdlp', 'youtube-dl',
'node', 'npm', 'npx', 'yt-dlp',
'git', 'singlefile', 'readability-extractor', 'mercury-parser',
'python3', 'python', 'bash', 'zsh',
'ffmpeg', 'ripgrep', 'rg', 'sonic', 'archivebox',
]
# Maps alternate binary names to the canonical name they are reported under.
# canonical_binary_name() looks names up here and returns any name that has
# no entry unchanged.
CANONICAL_BINARY_ALIASES = {
'youtube-dl': 'yt-dlp',
'ytdlp': 'yt-dlp',
}
def is_superuser(request: HttpRequest) -> bool:
    """Return True when the request's user has a truthy ``is_superuser`` attribute.

    A user object that lacks the attribute is treated as not a superuser.
    """
    user = request.user
    flag = getattr(user, 'is_superuser', False)
    return True if flag else False
@@ -38,6 +52,249 @@ def format_parsed_datetime(value: object) -> str:
return parsed.strftime("%Y-%m-%d %H:%M:%S") if parsed else ""
# Tokenizer for JSON text with one named group per token kind (key, string,
# boolean, null, number). The `key` alternative is listed first so that a
# quoted string followed by optional whitespace and a colon is classified as
# a key rather than as a plain string. render_code_block() picks a colour
# from the name of whichever group matched.
JSON_TOKEN_RE = re.compile(
r'(?P<key>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")(?=\s*:)'
r'|(?P<string>"(?:\\u[a-fA-F0-9]{4}|\\[^u]|[^\\"])*")'
r'|(?P<boolean>\btrue\b|\bfalse\b)'
r'|(?P<null>\bnull\b)'
r'|(?P<number>-?\d+(?:\.\d+)?(?:[eE][+-]?\d+)?)'
)
def render_code_block(text: str, *, highlighted: bool = False) -> str:
    """Wrap *text* in a styled, scrollable ``<pre><code>`` block and return the HTML.

    The text is HTML-escaped first, leaving quote characters untouched. When
    *highlighted* is true, every JSON_TOKEN_RE match in the escaped text is
    wrapped in a ``<span>`` coloured according to the kind of token matched.
    """
    escaped = html.escape(text, quote=False)

    if highlighted:
        token_styles = {
            'key': 'color: #0550ae;',
            'string': 'color: #0a7f45;',
            'boolean': 'color: #8250df; font-weight: 600;',
            'null': 'color: #6e7781; font-style: italic;',
            'number': 'color: #b35900;',
        }

        def _colorize(found: re.Match[str]) -> str:
            # Exactly one named alternative takes part in any match, so
            # lastgroup is the name of the token kind that matched.
            css = token_styles[found.lastgroup]
            return '<span style="' + css + '">' + found.group(0) + '</span>'

        escaped = JSON_TOKEN_RE.sub(_colorize, escaped)

    pre_open = (
        '<pre style="max-height: 600px; overflow: auto; background: #f6f8fa; '
        'border: 1px solid #d0d7de; border-radius: 6px; padding: 12px; margin: 0;">'
    )
    code_open = (
        '<code style="font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, '
        '\'Liberation Mono\', monospace; white-space: pre; line-height: 1.5;">'
    )
    return pre_open + code_open + escaped + '</code></pre>'
def render_highlighted_json_block(value: Any) -> str:
    """Serialize *value* as 2-space-indented JSON and render it highlighted.

    Non-ASCII characters are emitted as-is rather than escaped. The JSON text
    is passed to render_code_block() with highlighting switched on.
    """
    pretty_json = json.dumps(value, indent=2, ensure_ascii=False)
    return render_code_block(pretty_json, highlighted=True)
def get_plugin_docs_url(plugin_name: str) -> str:
    """Return the plugin docs URL for *plugin_name*.

    The name becomes the fragment appended to ABX_PLUGINS_DOCS_BASE_URL. It is
    percent-encoded, matching what the other URL helpers in this module do for
    their path components, so a name containing spaces, quotes or ``#`` cannot
    produce a malformed link. Names made only of URL-safe characters are
    returned unchanged.
    """
    return f'{ABX_PLUGINS_DOCS_BASE_URL}#{quote(plugin_name)}'
def get_plugin_hook_source_url(plugin_name: str, hook_name: str) -> str:
    """Return the URL for a plugin hook under ABX_PLUGINS_GITHUB_BASE_URL.

    Both names are percent-encoded and joined as ``<plugin>/<hook>``.
    """
    plugin_segment = quote(plugin_name)
    hook_segment = quote(hook_name)
    return ABX_PLUGINS_GITHUB_BASE_URL + plugin_segment + '/' + hook_segment
def get_live_config_url(key: str) -> str:
    """Return the live-config admin URL for *key* (percent-encoded, trailing slash)."""
    encoded_key = quote(key)
    return LIVE_CONFIG_BASE_URL + encoded_key + '/'
def get_environment_binary_url(name: str) -> str:
    """Return the environment-binaries admin URL for *name* (percent-encoded, trailing slash)."""
    encoded_name = quote(name)
    return ENVIRONMENT_BINARIES_BASE_URL + encoded_name + '/'
def get_installed_binary_change_url(name: str, binary: Any) -> str | None:
    """Return the admin change-page URL for *binary*, or None if it has no id.

    The record's own ``admin_change_url`` is used when it is set; otherwise
    the URL is built from INSTALLED_BINARIES_BASE_URL and the record id. A
    ``_changelist_filters`` query parameter carrying ``q=<canonical name>``
    is appended to the result.
    """
    record_id = getattr(binary, 'id', None)
    if not record_id:
        return None

    change_url = getattr(binary, 'admin_change_url', None)
    if not change_url:
        change_url = f'{INSTALLED_BINARIES_BASE_URL}{record_id}/change/'

    # The filter string is itself url-encoded once more as the value of
    # the outer query parameter.
    search_filter = urlencode({'q': canonical_binary_name(name)})
    query_string = urlencode({'_changelist_filters': search_filter})
    return f'{change_url}?{query_string}'
def get_machine_admin_url() -> str | None:
    """Return the admin change URL of the current Machine, or None on any failure.

    This is deliberately best-effort: if the import, the lookup of the
    current machine, or the attribute access raises, None is returned.
    """
    try:
        # Imported lazily inside the function, as in the original code.
        from archivebox.machine.models import Machine
        current_machine = Machine.current()
        admin_url = current_machine.admin_change_url
    except Exception:
        return None
    return admin_url
def render_code_tag_list(values: list[str]) -> str:
    """Render *values* as a wrapping row of pill-shaped ``<code>`` tags.

    Each value is interpolated through ``format_html``. An empty list renders
    a grey "(none)" placeholder instead of the row.
    """
    if not values:
        return '<span style="color: #6e7781;">(none)</span>'

    pill_template = (
        '<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
        'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>'
    )
    pills = [str(format_html(pill_template, item)) for item in values]
    return '<div style="display: flex; flex-wrap: wrap;">' + ''.join(pills) + '</div>'
def render_plugin_metadata_html(config: dict[str, Any]) -> str:
    """Render the summary fields of a plugin config as stacked label/value rows.

    Rows, in order: title, description, required plugins (each linked via
    get_plugin_docs_url), required binaries (each linked via
    get_environment_binary_url) and output MIME types. A missing or empty
    entry is shown as "(none)".
    """
    plugin_links = render_link_tag_list(config.get('required_plugins') or [], get_plugin_docs_url)
    binary_links = render_link_tag_list(config.get('required_binaries') or [], get_environment_binary_url)
    mimetype_tags = render_code_tag_list(config.get('output_mimetypes') or [])

    labelled_values = [
        ('Title', config.get('title') or '(none)'),
        ('Description', config.get('description') or '(none)'),
        # The list helpers already return finished HTML, so it is marked safe
        # to keep format_html from escaping it again.
        ('Required Plugins', mark_safe(plugin_links)),
        ('Required Binaries', mark_safe(binary_links)),
        ('Output MIME Types', mark_safe(mimetype_tags)),
    ]

    row_template = (
        '<div style="margin: 0 0 14px 0;">'
        '<div style="font-weight: 600; margin-bottom: 4px;">{}</div>'
        '<div>{}</div>'
        '</div>'
    )
    row_html = []
    for heading, content in labelled_values:
        row_html.append(str(format_html(row_template, heading, content)))
    return '<div style="margin: 4px 0 0 0;">' + ''.join(row_html) + '</div>'
def render_link_tag_list(values: list[str], url_resolver: Callable[[str], str] | None = None) -> str:
    """Render *values* as a wrapping row of pill-shaped ``<code>`` tags, optionally linked.

    When *url_resolver* is given, each pill is wrapped in an ``<a>`` whose
    href is ``url_resolver(value)``; otherwise the bare pill is rendered.
    An empty list renders a grey "(none)" placeholder instead of the row.
    """
    if not values:
        return '<span style="color: #6e7781;">(none)</span>'

    pill_template = (
        '<code style="display: inline-block; margin: 0 6px 6px 0; padding: 2px 6px; '
        'background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 999px;">{}</code>'
    )
    linked_pill_template = '<a href="{}" style="text-decoration: none;">' + pill_template + '</a>'

    rendered = []
    for item in values:
        if url_resolver is not None:
            rendered.append(str(format_html(linked_pill_template, url_resolver(item), item)))
            continue
        rendered.append(str(format_html(pill_template, item)))
    return '<div style="display: flex; flex-wrap: wrap;">' + ''.join(rendered) + '</div>'
def render_property_links(prop_name: str, prop_info: dict[str, Any], machine_admin_url: str | None) -> str:
    """Render the row of related links shown for a single config property.

    Always includes a "Computed value" link. Depending on the inputs it then
    adds, in this order: an "Edit override" link (when *machine_admin_url* is
    set), a fallback link (non-empty string ``x-fallback``), one link per
    non-empty string in an ``x-aliases`` list, and a binary link when the
    property name ends in ``_BINARY`` and has a non-empty string default.
    The links are joined with ``&nbsp;`` separators.
    """
    rendered: list[str] = []

    def _add(template: str, *args: Any) -> None:
        rendered.append(str(format_html(template, *args)))

    _add('<a href="{}">Computed value</a>', get_live_config_url(prop_name))

    if machine_admin_url:
        _add('<a href="{}">Edit override</a>', machine_admin_url)

    fallback_key = prop_info.get('x-fallback')
    if isinstance(fallback_key, str) and fallback_key:
        _add('<a href="{}">Fallback: <code>{}</code></a>', get_live_config_url(fallback_key), fallback_key)

    alias_keys = prop_info.get('x-aliases') or []
    if isinstance(alias_keys, list):
        for alias_key in alias_keys:
            if not (isinstance(alias_key, str) and alias_key):
                continue
            _add('<a href="{}">Alias: <code>{}</code></a>', get_live_config_url(alias_key), alias_key)

    default_value = prop_info.get('default')
    if prop_name.endswith('_BINARY') and isinstance(default_value, str) and default_value:
        _add('<a href="{}">Binary: <code>{}</code></a>', get_environment_binary_url(default_value), default_value)

    return ' &nbsp; '.join(rendered)
def render_config_properties_html(properties: dict[str, Any], machine_admin_url: str | None) -> str:
    """Render a navigation row followed by one card per config property.

    The navigation row links to the environment binaries and installed
    binaries pages, preceded by a machine config editor link when
    *machine_admin_url* is set. Each card shows the property name (linked via
    get_live_config_url), its type, its description or a "(no description)"
    placeholder, the links from render_property_links(), and the default
    value when the property declares one.
    """
    nav_links = []
    if machine_admin_url:
        nav_links.append(str(format_html('<a href="{}">Machine Config Editor</a>', machine_admin_url)))
    nav_links.append(str(format_html('<a href="{}">Dependencies</a>', ENVIRONMENT_BINARIES_BASE_URL)))
    nav_links.append(str(format_html('<a href="{}">Installed Binaries</a>', INSTALLED_BINARIES_BASE_URL)))

    card_template = (
        '<div style="margin: 0 0 14px 0; padding: 12px; background: #f6f8fa; border: 1px solid #d0d7de; border-radius: 6px;">'
        '<div style="margin-bottom: 6px;">'
        '<a href="{}" style="font-weight: 600;"><code>{}</code></a>'
        ' <span style="color: #6e7781;">({})</span>'
        '</div>'
        '<div style="margin-bottom: 6px;">{}</div>'
        '<div style="font-size: 0.95em;">{}</div>'
        '{}'
        '</div>'
    )
    default_template = '<div style="margin-top: 6px;"><b>Default:</b> <code>{}</code></div>'
    missing_description = mark_safe('<span style="color: #6e7781;">(no description)</span>')

    pieces = ['<div style="margin: 0 0 16px 0;">' + ' &nbsp; | &nbsp; '.join(nav_links) + '</div>']
    for prop_name, prop_info in properties.items():
        type_label = prop_info.get('type', 'unknown')
        if isinstance(type_label, list):
            # A list of type names is shown as "a | b".
            type_label = ' | '.join(map(str, type_label))

        description = prop_info.get('description', '') or missing_description

        if 'default' in prop_info:
            default_block = str(format_html(default_template, prop_info['default']))
        else:
            default_block = ''

        links_block = render_property_links(prop_name, prop_info, machine_admin_url)
        pieces.append(str(format_html(
            card_template,
            get_live_config_url(prop_name),
            prop_name,
            type_label,
            description,
            # Both blocks are already-rendered HTML, so they are marked safe.
            mark_safe(links_block),
            mark_safe(default_block),
        )))
    return ''.join(pieces)
def render_hook_links_html(plugin_name: str, hooks: list[str], source: str) -> str:
    """Render one line per hook name, linked when the plugin source is 'builtin'.

    For ``source == 'builtin'`` each hook name links (in a new tab) to the URL
    from get_plugin_hook_source_url(); for any other source the name is shown
    as plain ``<code>``. An empty hook list renders a grey "(none)" placeholder.
    """
    if not hooks:
        return '<span style="color: #6e7781;">(none)</span>'

    if source == 'builtin':
        linked_template = (
            '<div style="margin: 0 0 8px 0;">'
            '<a href="{}" target="_blank" rel="noopener noreferrer"><code>{}</code></a>'
            '</div>'
        )
        lines = [
            str(format_html(linked_template, get_plugin_hook_source_url(plugin_name, hook), hook))
            for hook in hooks
        ]
    else:
        plain_template = '<div style="margin: 0 0 8px 0;"><code>{}</code></div>'
        lines = [str(format_html(plain_template, hook)) for hook in hooks]
    return ''.join(lines)
def render_binary_detail_description(name: str, merged: dict[str, Any], db_binary: Any) -> str:
    """Render the description HTML for a binary detail page.

    Shows ``merged['abspath']`` in a ``<code>`` tag. When
    get_installed_binary_change_url() yields a URL for *db_binary*, a
    "View Installed Binary Record" link is added on the following line.
    """
    change_url = get_installed_binary_change_url(name, db_binary)
    if not change_url:
        return str(format_html('<code>{}</code>', merged['abspath']))

    template = (
        '<code>{}</code><br/>'
        '<a href="{}">View Installed Binary Record</a>'
    )
    return str(format_html(template, merged['abspath'], change_url))
def obj_to_yaml(obj: Any, indent: int = 0) -> str:
indent_str = " " * indent
if indent == 0:
@@ -80,21 +337,41 @@ def obj_to_yaml(obj: Any, indent: int = 0) -> str:
return f" {str(obj)}"
def get_detected_binaries() -> Dict[str, Dict[str, Any]]:
"""Detect available binaries using shutil.which."""
binaries = {}
def canonical_binary_name(name: str) -> str:
    """Return the canonical name for *name*, or *name* itself when it has no alias entry."""
    if name in CANONICAL_BINARY_ALIASES:
        return CANONICAL_BINARY_ALIASES[name]
    return name
for name in KNOWN_BINARIES:
path = shutil.which(name)
if path:
binaries[name] = {
'name': name,
'abspath': path,
'version': None, # Could add version detection later
'is_available': True,
}
return binaries
def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:
    """Return a key for ranking Binary records against each other.

    The key compares, in order: whether the status is INSTALLED, whether a
    version is set, whether an abspath is set, and finally ``modified_at``.
    A larger key ranks higher, so ``max(..., key=_binary_sort_key)`` prefers
    an installed record that has a version and a path.
    """
    installed = binary.status == Binary.StatusChoices.INSTALLED
    has_version = bool(binary.version)
    has_abspath = bool(binary.abspath)
    return (int(installed), int(has_version), int(has_abspath), binary.modified_at)
def get_db_binaries_by_name() -> Dict[str, Binary]:
    """Return the best Binary record for each canonical binary name.

    All Binary rows are grouped under canonical_binary_name(row.name), so
    aliases of the same binary share a group. The record with the greatest
    _binary_sort_key() in each group is kept.
    """
    records_by_name: Dict[str, list[Binary]] = {}
    for record in Binary.objects.all():
        canonical = canonical_binary_name(record.name)
        if canonical not in records_by_name:
            records_by_name[canonical] = []
        records_by_name[canonical].append(record)

    best_by_name: Dict[str, Binary] = {}
    for canonical, candidates in records_by_name.items():
        best_by_name[canonical] = max(candidates, key=_binary_sort_key)
    return best_by_name
def serialize_binary_record(name: str, binary: Binary | None) -> Dict[str, Any]:
    """Flatten an optional Binary record into a dict of display values.

    Every text field is a string, empty when *binary* is None or the attribute
    is unset. ``name`` is the canonical form of *name*. ``is_available`` is
    True only when the record's status is INSTALLED and it has an abspath.
    """
    def _text(attr: str) -> str:
        # A missing record, missing attribute and falsy value all become ''.
        return str(getattr(binary, attr, '') or '')

    installed = bool(binary and binary.status == Binary.StatusChoices.INSTALLED)
    has_abspath = bool(getattr(binary, 'abspath', '') or '')

    record: Dict[str, Any] = {'name': canonical_binary_name(name)}
    for field in ('version', 'binprovider', 'abspath', 'sha256', 'status'):
        record[field] = _text(field)
    record['is_available'] = installed and has_abspath
    return record
def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
@@ -150,29 +427,18 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
"Found Abspath": [],
}
# Get binaries from database (previously detected/installed)
db_binaries = {b.name: b for b in Binary.objects.all()}
# Get currently detectable binaries
detected = get_detected_binaries()
# Merge and display
all_binary_names = sorted(set(list(db_binaries.keys()) + list(detected.keys())))
db_binaries = get_db_binaries_by_name()
all_binary_names = sorted(db_binaries.keys())
for name in all_binary_names:
db_binary = db_binaries.get(name)
detected_binary = detected.get(name)
merged = serialize_binary_record(name, db_binaries.get(name))
rows['Binary Name'].append(ItemLink(name, key=name))
if db_binary:
rows['Found Version'].append(f'{db_binary.version}' if db_binary.version else '✅ found')
rows['Provided By'].append(db_binary.binprovider or 'PATH')
rows['Found Abspath'].append(str(db_binary.abspath or ''))
elif detected_binary:
rows['Found Version'].append('✅ found')
rows['Provided By'].append('PATH')
rows['Found Abspath'].append(detected_binary['abspath'])
if merged['is_available']:
rows['Found Version'].append(f"{merged['version']}" if merged['version'] else '✅ found')
rows['Provided By'].append(merged['binprovider'] or '-')
rows['Found Abspath'].append(merged['abspath'] or '-')
else:
rows['Found Version'].append('❌ missing')
rows['Provided By'].append('-')
@@ -187,41 +453,22 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
key = canonical_binary_name(key)
# Try database first
try:
binary = Binary.objects.get(name=key)
section: SectionData = {
"name": binary.name,
"description": str(binary.abspath or ''),
"fields": {
'name': binary.name,
'binprovider': binary.binprovider,
'abspath': str(binary.abspath),
'version': binary.version,
'sha256': binary.sha256,
},
"help_texts": {},
}
return ItemContext(
slug=key,
title=key,
data=[section],
)
except Binary.DoesNotExist:
pass
db_binary = get_db_binaries_by_name().get(key)
merged = serialize_binary_record(key, db_binary)
# Try to detect from PATH
path = shutil.which(key)
if path:
if merged['is_available']:
section: SectionData = {
"name": key,
"description": path,
"description": mark_safe(render_binary_detail_description(key, merged, db_binary)),
"fields": {
'name': key,
'binprovider': 'PATH',
'abspath': path,
'version': 'unknown',
'binprovider': merged['binprovider'] or '-',
'abspath': merged['abspath'] or 'not found',
'version': merged['version'] or 'unknown',
'sha256': merged['sha256'],
'status': merged['status'],
},
"help_texts": {},
}
@@ -233,12 +480,13 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
section: SectionData = {
"name": key,
"description": "Binary not found",
"description": "No persisted Binary record found",
"fields": {
'name': key,
'binprovider': 'not installed',
'abspath': 'not found',
'version': 'N/A',
'binprovider': merged['binprovider'] or 'not recorded',
'abspath': merged['abspath'] or 'not recorded',
'version': merged['version'] or 'N/A',
'status': merged['status'] or 'unrecorded',
},
"help_texts": {},
}
@@ -293,8 +541,6 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
import json
assert is_superuser(request), 'Must be a superuser to view configuration settings.'
plugins = get_filesystem_plugins()
@@ -308,45 +554,61 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
)
# Base fields that all plugins have
docs_url = get_plugin_docs_url(plugin['name'])
machine_admin_url = get_machine_admin_url()
fields = {
"id": plugin['id'],
"name": plugin['name'],
"source": plugin['source'],
"path": plugin['path'],
"hooks": ', '.join(plugin['hooks']),
}
# Add config.json data if available
if plugin.get('config'):
config_json = json.dumps(plugin['config'], indent=2)
fields["config.json"] = mark_safe(
'<pre style="max-height: 600px; overflow-y: auto; background: #f5f5f5; '
f'padding: 10px; border-radius: 4px;"><code>{config_json}</code></pre>'
)
# Also extract and display individual config properties for easier viewing
if 'properties' in plugin['config']:
config_properties = plugin['config']['properties']
properties_summary = []
for prop_name, prop_info in config_properties.items():
prop_type = prop_info.get('type', 'unknown')
prop_desc = prop_info.get('description', '')
properties_summary.append(f"{prop_name} ({prop_type}): {prop_desc}")
if properties_summary:
fields["Config Properties"] = mark_safe('<br/>'.join(properties_summary))
section: SectionData = {
sections: list[SectionData] = [{
"name": plugin['name'],
"description": plugin['path'],
"description": format_html(
'<code>{}</code><br/><a href="{}" target="_blank" rel="noopener noreferrer">ABX Plugin Docs</a>',
plugin['path'],
docs_url,
),
"fields": fields,
"help_texts": {},
}
}]
if plugin['hooks']:
sections.append({
"name": "Hooks",
"description": mark_safe(render_hook_links_html(plugin['name'], plugin['hooks'], plugin['source'])),
"fields": {},
"help_texts": {},
})
if plugin.get('config'):
sections.append({
"name": "Plugin Metadata",
"description": mark_safe(render_plugin_metadata_html(plugin['config'])),
"fields": {},
"help_texts": {},
})
sections.append({
"name": "config.json",
"description": mark_safe(render_highlighted_json_block(plugin['config'])),
"fields": {},
"help_texts": {},
})
config_properties = plugin['config'].get('properties', {})
if config_properties:
sections.append({
"name": "Config Properties",
"description": mark_safe(render_config_properties_html(config_properties, machine_admin_url)),
"fields": {},
"help_texts": {},
})
return ItemContext(
slug=key,
title=plugin['name'],
data=[section],
data=sections,
)