mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 23:37:58 +10:00
Remove 8 dead functions and 4 unused imports from hooks.py
Dead functions: extract_step, run_hooks, is_parser_plugin, get_all_plugin_icons, discover_plugin_templates, find_binary_for_cmd, create_model_record, get_parser_plugins. Dead imports: re, signal, subprocess, django.utils.timezone. Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
@@ -49,26 +49,20 @@ Dependency handling:
|
||||
API (all hook logic lives here):
|
||||
discover_hooks(event) -> List[Path] Find hook scripts
|
||||
run_hook(script, ...) -> HookResult Execute a hook script
|
||||
run_hooks(event, ...) -> List[HookResult] Run all hooks for an event
|
||||
extract_step(hook_name) -> int Deprecated: get two-digit order prefix if present
|
||||
is_background_hook(name) -> bool Check if hook is background (.bg suffix)
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox'
|
||||
|
||||
import os
|
||||
import re
|
||||
import json
|
||||
import signal
|
||||
import time
|
||||
import subprocess
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import List, Dict, Any, Optional, TypedDict
|
||||
|
||||
from abx_plugins import get_plugins_dir
|
||||
from django.conf import settings
|
||||
from django.utils import timezone
|
||||
from django.utils.safestring import mark_safe
|
||||
from archivebox.config.constants import CONSTANTS
|
||||
|
||||
@@ -86,20 +80,6 @@ USER_PLUGINS_DIR = Path(
|
||||
# Hook Step Extraction
|
||||
# =============================================================================
|
||||
|
||||
def extract_step(hook_name: str) -> int:
    """
    Deprecated: parse the two-digit order prefix out of a hook name.

    Looks for the first occurrence of a double underscore, exactly two digits,
    and a single underscore (``__XX_``) anywhere in ``hook_name``.

    Hook execution is based on lexicographic ordering of filenames; callers
    should not rely on parsed numeric steps for ordering decisions.

    Args:
        hook_name: Hook script name to inspect

    Returns:
        The prefix as an integer (0-99), or 99 when no prefix is found
        (a warning is written to stderr in that case).
    """
    prefix = re.search(r'__(\d{2})_', hook_name)

    if prefix is None:
        # No order prefix: warn on stderr and fall back to the last slot
        import sys
        print(f"Warning: Hook '{hook_name}' has no order prefix (expected __XX_), defaulting to 99", file=sys.stderr)
        return 99

    return int(prefix.group(1))
|
||||
|
||||
|
||||
def is_background_hook(hook_name: str) -> bool:
|
||||
"""
|
||||
@@ -573,51 +553,6 @@ def collect_urls_from_plugins(snapshot_dir: Path) -> List[Dict[str, Any]]:
|
||||
return urls
|
||||
|
||||
|
||||
def run_hooks(
    event_name: str,
    output_dir: Path,
    config: Dict[str, Any],
    timeout: Optional[int] = None,
    stop_on_failure: bool = False,
    **kwargs: Any
) -> List[HookResult]:
    """
    Discover every hook for an event and execute them one after another.

    Hooks are taken in the order returned by discover_hooks(). Each result is
    tagged with the path of the hook that produced it under the 'hook' key.

    Args:
        event_name: The event name to trigger (e.g., 'Snapshot', 'Crawl', 'Binary')
        output_dir: Working directory handed to each hook
        config: Merged config dict, forwarded to discover_hooks() and run_hook() - REQUIRED
        timeout: Per-hook timeout forwarded to run_hook() (None is passed through as-is)
        stop_on_failure: If True, stop after the first result with a non-zero 'returncode'
        **kwargs: Extra arguments forwarded to run_hook() for every hook

    Returns:
        List of results, one per hook that returned a result

    Example:
        from archivebox.config.configset import get_config
        config = get_config(crawl=my_crawl, snapshot=my_snapshot)
        results = run_hooks('Snapshot', output_dir, config=config, url=url, snapshot_id=id)
    """
    scripts = discover_hooks(event_name, config=config)
    collected: List[HookResult] = []

    for script in scripts:
        outcome = run_hook(script, output_dir, config=config, timeout=timeout, **kwargs)

        # A None result (background hooks, per the original note) is not recorded
        if outcome is not None:
            outcome['hook'] = str(script)
            collected.append(outcome)

            failed = outcome['returncode'] != 0
            if stop_on_failure and failed:
                break

    return collected
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
|
||||
def get_plugins() -> List[str]:
|
||||
@@ -640,15 +575,6 @@ def get_plugins() -> List[str]:
|
||||
return sorted(set(plugins))
|
||||
|
||||
|
||||
def get_parser_plugins() -> List[str]:
    """
    List the plugins whose names look like parse_*_urls parsers.

    A plugin from get_plugins() is kept when its name contains both the
    substring 'parse_' and the substring '_urls' (position is not checked).

    Returns:
        Matching plugin names in get_plugins() order,
        e.g. ['50_parse_html_urls', '51_parse_rss_urls', ...]
    """
    parser_names = []
    for plugin_name in get_plugins():
        if 'parse_' in plugin_name and '_urls' in plugin_name:
            parser_names.append(plugin_name)
    return parser_names
|
||||
|
||||
|
||||
def get_plugin_name(plugin: str) -> str:
|
||||
"""
|
||||
@@ -666,11 +592,6 @@ def get_plugin_name(plugin: str) -> str:
|
||||
return plugin
|
||||
|
||||
|
||||
def is_parser_plugin(plugin: str) -> bool:
    """
    Check if a plugin is a parser plugin (discovers URLs).

    The check is made on the name returned by get_plugin_name(plugin), which
    must start with 'parse_' and end with '_urls'.
    """
    base_name = get_plugin_name(plugin)
    if not base_name.startswith('parse_'):
        return False
    return base_name.endswith('_urls')
|
||||
|
||||
|
||||
def get_enabled_plugins(config: Optional[Dict[str, Any]] = None) -> List[str]:
|
||||
"""
|
||||
@@ -1083,45 +1004,6 @@ def get_plugin_icon(plugin: str) -> str:
|
||||
return mark_safe('📁')
|
||||
|
||||
|
||||
def get_all_plugin_icons() -> Dict[str, str]:
    """
    Build an icon lookup for every plugin returned by get_plugins().

    Returns:
        Dict mapping each plugin's base name (via get_plugin_name) to the icon
        returned by get_plugin_icon. If two plugins share a base name, the
        later one in get_plugins() order wins.
    """
    return {
        get_plugin_name(plugin): get_plugin_icon(plugin)
        for plugin in get_plugins()
    }
|
||||
|
||||
|
||||
def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
    """
    Collect the HTML templates shipped by each plugin directory.

    For every directory yielded by iter_plugin_dirs(), the *.html files
    directly inside its 'templates' subdirectory are indexed by file stem.
    Plugins with no 'templates' directory, or with no .html files in it,
    are left out of the result.

    Returns:
        Dict mapping plugin directory names to dicts of template_name -> template_path.
        e.g., {'screenshot': {'icon': '/path/to/icon.html', 'card': '/path/to/card.html'}}
    """
    discovered: Dict[str, Dict[str, str]] = {}

    for plugin_dir in iter_plugin_dirs():
        html_dir = plugin_dir / 'templates'
        if html_dir.exists():
            # Key is the file stem (e.g. icon, card, full); value is the path as a string
            by_stem = {html_file.stem: str(html_file) for html_file in html_dir.glob('*.html')}
            if by_stem:
                discovered[plugin_dir.name] = by_stem

    return discovered
|
||||
|
||||
|
||||
# =============================================================================
|
||||
@@ -1129,104 +1011,6 @@ def discover_plugin_templates() -> Dict[str, Dict[str, str]]:
|
||||
# =============================================================================
|
||||
|
||||
|
||||
def find_binary_for_cmd(cmd: List[str], machine_id: str) -> Optional[str]:
    """
    Look up the Binary row that corresponds to a command on one machine.

    The first element of ``cmd`` (or ``cmd`` itself when it is not a list) is
    matched against Binary.abspath first; if nothing matches, its final path
    component is matched against Binary.name. Both lookups are restricted to
    ``machine_id``.

    Args:
        cmd: Command list (e.g., ['/usr/bin/wget', '-p', 'url'])
        machine_id: Machine ID the Binary must belong to

    Returns:
        Binary ID as string if found, None otherwise (including for an empty cmd)
    """
    if not cmd:
        return None

    from archivebox.machine.models import Binary

    executable = cmd[0] if isinstance(cmd, list) else cmd

    # First attempt: exact match on the absolute path
    by_path = Binary.objects.filter(abspath=executable, machine_id=machine_id).first()
    if by_path:
        return str(by_path.id)

    # Second attempt: match on the bare executable name
    by_name = Binary.objects.filter(name=Path(executable).name, machine_id=machine_id).first()
    if not by_name:
        return None
    return str(by_name.id)
|
||||
|
||||
|
||||
def create_model_record(record: Dict[str, Any]) -> Any:
    """
    Generic helper to create/update model instances from hook JSONL output.

    The input dict is only read, never modified, so the caller can keep using
    (or logging) the same record afterwards. Plugin metadata keys such as
    'plugin' and 'plugin_hook' are simply ignored.

    Supported record types:
        'Binary':  upserts a Binary keyed on (Machine.current(), name), setting
                   abspath, version, sha256 and binprovider. Requires non-empty
                   'name' and 'abspath'. A 'machine' key in the record is ignored.
        'Machine': merges a non-empty dict under 'config' into the current
                   Machine's config and saves only that field.

    Args:
        record: Dict with 'type' field and model data

    Returns:
        Created/updated model instance, or None if the type is missing or
        unknown, or the record lacks the data required for its type
    """
    record_type = record.get('type')
    if not record_type:
        return None

    if record_type not in ('Binary', 'Machine'):
        # Unknown type - log warning but don't fail
        import sys
        print(f"Warning: Unknown record type '{record_type}' from hook output", file=sys.stderr)
        return None

    # Imported only once a supported type is confirmed, so unsupported records
    # are rejected without loading the models module
    from archivebox.machine.models import Binary, Machine

    if record_type == 'Binary':
        # Binary requires machine FK
        machine = Machine.current()

        # Required fields check
        name = record.get('name')
        abspath = record.get('abspath')
        if not name or not abspath:
            return None

        obj, _created = Binary.objects.update_or_create(
            machine=machine,
            name=name,
            defaults={
                'abspath': abspath,
                'version': record.get('version', ''),
                'sha256': record.get('sha256', ''),
                'binprovider': record.get('binprovider', 'env'),
            },
        )
        return obj

    # record_type == 'Machine'
    # Add more types as needed (Dependency, Snapshot, etc.)
    config_patch = record.get('config')
    if isinstance(config_patch, dict) and config_patch:
        machine = Machine.current()
        if not machine.config:
            machine.config = {}
        machine.config.update(config_patch)
        machine.save(update_fields=['config'])
        return machine
    return None
|
||||
|
||||
|
||||
def process_hook_records(records: List[Dict[str, Any]], overrides: Dict[str, Any] = None) -> Dict[str, int]:
|
||||
|
||||
Reference in New Issue
Block a user