mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Reuse cached binaries in archivebox runtime
This commit is contained in:
@@ -14,11 +14,10 @@ EVENT_FLOW_DIAGRAM = """
|
||||
├─────────────────────────────────────────────────────────────────────────────┤
|
||||
│ │
|
||||
│ InstallEvent │
|
||||
│ └─ on_Install__* │
|
||||
│ └─ BinaryRequest records │
|
||||
│ └─ BinaryRequestEvent │
|
||||
│ └─ on_BinaryRequest__* │
|
||||
│ └─ BinaryEvent / MachineEvent │
|
||||
│ └─ config.json > required_binaries │
|
||||
│ └─ BinaryRequestEvent │
|
||||
│ └─ on_BinaryRequest__* │
|
||||
│ └─ BinaryEvent │
|
||||
│ │
|
||||
│ CrawlEvent │
|
||||
│ └─ CrawlSetupEvent │
|
||||
@@ -70,15 +69,15 @@ def pluginmap(
|
||||
|
||||
event_phases = {
|
||||
"InstallEvent": {
|
||||
"description": "Pre-run dependency phase. on_Install hooks request binaries and update machine config.",
|
||||
"emits": ["BinaryRequestEvent", "BinaryEvent", "MachineEvent", "ProcessEvent"],
|
||||
"description": "Pre-run dependency phase. Enabled plugins emit BinaryRequest events from config.json required_binaries.",
|
||||
"emits": ["BinaryRequestEvent", "BinaryEvent", "ProcessEvent"],
|
||||
},
|
||||
"BinaryRequestEvent": {
|
||||
"description": "Provider phase. on_BinaryRequest hooks resolve or install requested binaries.",
|
||||
"emits": ["BinaryEvent", "MachineEvent", "ProcessEvent"],
|
||||
"emits": ["BinaryEvent", "ProcessEvent"],
|
||||
},
|
||||
"BinaryEvent": {
|
||||
"description": "Resolved binary metadata event. Projected into the DB/runtime config.",
|
||||
"description": "Resolved binary metadata event. Projected into the DB binary cache.",
|
||||
"emits": [],
|
||||
},
|
||||
"CrawlEvent": {
|
||||
@@ -87,11 +86,11 @@ def pluginmap(
|
||||
},
|
||||
"CrawlSetupEvent": {
|
||||
"description": "Crawl-scoped setup phase. on_CrawlSetup hooks launch/configure shared daemons and runtime state.",
|
||||
"emits": ["MachineEvent", "ProcessEvent"],
|
||||
"emits": ["ProcessEvent"],
|
||||
},
|
||||
"SnapshotEvent": {
|
||||
"description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, Tag, Machine, and BinaryRequest records.",
|
||||
"emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "MachineEvent", "BinaryRequestEvent", "ProcessEvent"],
|
||||
"description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, Tag, and BinaryRequest records.",
|
||||
"emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "BinaryRequestEvent", "ProcessEvent"],
|
||||
},
|
||||
"SnapshotCleanupEvent": {
|
||||
"description": "Internal snapshot cleanup phase.",
|
||||
|
||||
@@ -5,7 +5,6 @@ __package__ = "archivebox.cli"
|
||||
import sys
|
||||
import os
|
||||
import platform
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from collections.abc import Iterable
|
||||
|
||||
@@ -124,17 +123,19 @@ def version(
|
||||
setup_django()
|
||||
|
||||
from archivebox.machine.models import Machine, Binary
|
||||
from archivebox.config.views import KNOWN_BINARIES, canonical_binary_name
|
||||
from abx_dl.dependencies import load_binary
|
||||
|
||||
machine = Machine.current()
|
||||
|
||||
requested_names = {canonical_binary_name(name) for name in binaries} if binaries else set()
|
||||
if isinstance(binaries, str):
|
||||
requested_names = {name.strip() for name in binaries.split(",") if name.strip()}
|
||||
else:
|
||||
requested_names = {name for name in (binaries or ()) if name}
|
||||
|
||||
db_binaries = {
|
||||
canonical_binary_name(binary.name): binary for binary in Binary.objects.filter(machine=machine).order_by("name", "-modified_at")
|
||||
}
|
||||
all_binary_names = sorted(set(KNOWN_BINARIES) | set(db_binaries.keys()))
|
||||
db_binaries: dict[str, Binary] = {}
|
||||
for binary in Binary.objects.filter(machine=machine).order_by("name", "-modified_at"):
|
||||
db_binaries.setdefault(binary.name, binary)
|
||||
|
||||
all_binary_names = sorted(requested_names or set(db_binaries.keys()))
|
||||
|
||||
if not all_binary_names:
|
||||
prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]")
|
||||
@@ -163,37 +164,10 @@ def version(
|
||||
any_available = True
|
||||
continue
|
||||
|
||||
loaded = None
|
||||
try:
|
||||
abx_pkg_logger = logging.getLogger("abx_pkg")
|
||||
previous_level = abx_pkg_logger.level
|
||||
abx_pkg_logger.setLevel(logging.CRITICAL)
|
||||
try:
|
||||
loaded = load_binary({"name": name, "binproviders": "env,pip,npm,brew,apt"})
|
||||
finally:
|
||||
abx_pkg_logger.setLevel(previous_level)
|
||||
except Exception:
|
||||
loaded = None
|
||||
|
||||
if loaded and loaded.is_valid and loaded.loaded_abspath:
|
||||
display_path = str(loaded.loaded_abspath).replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~")
|
||||
version_str = str(loaded.loaded_version or "unknown")[:15]
|
||||
provider = str(getattr(getattr(loaded, "loaded_binprovider", None), "name", "") or "env")[:8]
|
||||
prnt(
|
||||
"",
|
||||
"[green]√[/green]",
|
||||
"",
|
||||
name.ljust(18),
|
||||
version_str.ljust(16),
|
||||
provider.ljust(8),
|
||||
display_path,
|
||||
overflow="ignore",
|
||||
crop=False,
|
||||
)
|
||||
any_available = True
|
||||
continue
|
||||
|
||||
prnt("", "[red]X[/red]", "", name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False)
|
||||
status = (
|
||||
"[grey53]not recorded[/grey53]" if name in requested_names and installed is None else "[grey53]not installed[/grey53]"
|
||||
)
|
||||
prnt("", "[red]X[/red]", "", name.ljust(18), status, overflow="ignore", crop=False)
|
||||
failures.append(name)
|
||||
|
||||
if not any_available:
|
||||
|
||||
@@ -138,10 +138,9 @@ def get_config(
|
||||
3. Per-user config (user.config JSON field)
|
||||
4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.)
|
||||
5. Environment variables
|
||||
6. Per-machine config (machine.config JSON field - resolved binary paths)
|
||||
7. Config file (ArchiveBox.conf)
|
||||
8. Plugin schema defaults (config.json)
|
||||
9. Core config defaults
|
||||
6. Config file (ArchiveBox.conf)
|
||||
7. Plugin schema defaults (config.json)
|
||||
8. Core config defaults
|
||||
|
||||
Args:
|
||||
defaults: Default values to start with
|
||||
@@ -150,7 +149,7 @@ def get_config(
|
||||
crawl: Crawl object with config JSON field
|
||||
snapshot: Snapshot object with config JSON field
|
||||
archiveresult: ArchiveResult object (auto-fetches snapshot)
|
||||
machine: Machine object with config JSON field (defaults to Machine.current())
|
||||
machine: Unused legacy argument kept for call compatibility
|
||||
|
||||
Note: Objects are auto-fetched from relationships if not provided:
|
||||
- snapshot auto-fetched from archiveresult.snapshot
|
||||
@@ -221,19 +220,6 @@ def get_config(
|
||||
file_config = BaseConfigSet.load_from_file(config_file)
|
||||
config.update(file_config)
|
||||
|
||||
# Apply machine config overrides (cached binary paths, etc.)
|
||||
if machine is None:
|
||||
# Default to current machine if not provided
|
||||
try:
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
machine = Machine.current()
|
||||
except Exception:
|
||||
pass # Machine might not be available during early init
|
||||
|
||||
if machine and hasattr(machine, "config") and machine.config:
|
||||
config.update(machine.config)
|
||||
|
||||
# Override with environment variables (for keys that exist in config)
|
||||
for key in config:
|
||||
env_val = os.environ.get(key)
|
||||
|
||||
@@ -29,42 +29,6 @@ ENVIRONMENT_BINARIES_BASE_URL = "/admin/environment/binaries/"
|
||||
INSTALLED_BINARIES_BASE_URL = "/admin/machine/binary/"
|
||||
|
||||
|
||||
# Common binaries to check for (order is preserved as written).
KNOWN_BINARIES = [
    "wget", "curl",
    "chromium", "chrome", "google-chrome", "google-chrome-stable",
    "node", "npm", "npx",
    "yt-dlp", "git",
    "singlefile", "readability-extractor", "mercury-parser",
    "python3", "python",
    "bash", "zsh",
    "ffmpeg",
    "ripgrep", "rg",
    "sonic",
    "archivebox",
]

# Alternate spellings of a binary name mapped to the single name used for
# lookups; names that are absent from this table are already canonical.
CANONICAL_BINARY_ALIASES = {
    **dict.fromkeys(("youtube-dl", "ytdlp"), "yt-dlp"),
    "ripgrep": "rg",
    "singlefile": "single-file",
    "mercury-parser": "postlight-parser",
}
|
||||
|
||||
|
||||
def is_superuser(request: HttpRequest) -> bool:
    """Report whether the user attached to *request* has a truthy ``is_superuser`` flag.

    A user object without that attribute counts as a non-superuser.
    """
    user = request.user
    flag = getattr(user, "is_superuser", False)
    return bool(flag)
|
||||
|
||||
@@ -131,13 +95,12 @@ def get_environment_binary_url(name: str) -> str:
|
||||
return f"{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/"
|
||||
|
||||
|
||||
def get_installed_binary_change_url(name: str, binary: Any) -> str | None:
|
||||
binary_id = getattr(binary, "id", None)
|
||||
if not binary_id:
|
||||
def get_installed_binary_change_url(name: str, binary: Binary | None) -> str | None:
|
||||
if binary is None or not binary.id:
|
||||
return None
|
||||
|
||||
base_url = getattr(binary, "admin_change_url", None) or f"{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/"
|
||||
changelist_filters = urlencode({"q": canonical_binary_name(name)})
|
||||
base_url = binary.admin_change_url or f"{INSTALLED_BINARIES_BASE_URL}{binary.id}/change/"
|
||||
changelist_filters = urlencode({"q": name})
|
||||
return f"{base_url}?{urlencode({'_changelist_filters': changelist_filters})}"
|
||||
|
||||
|
||||
@@ -168,11 +131,14 @@ def render_code_tag_list(values: list[str]) -> str:
|
||||
|
||||
|
||||
def render_plugin_metadata_html(config: dict[str, Any]) -> str:
|
||||
required_binaries = [
|
||||
str(item.get("name")) for item in (config.get("required_binaries") or []) if isinstance(item, dict) and item.get("name")
|
||||
]
|
||||
rows = (
|
||||
("Title", config.get("title") or "(none)"),
|
||||
("Description", config.get("description") or "(none)"),
|
||||
("Required Plugins", mark_safe(render_link_tag_list(config.get("required_plugins") or [], get_plugin_docs_url))),
|
||||
("Required Binaries", mark_safe(render_link_tag_list(config.get("required_binaries") or [], get_environment_binary_url))),
|
||||
("Required Binaries", mark_safe(render_link_tag_list(required_binaries, get_environment_binary_url))),
|
||||
("Output MIME Types", mark_safe(render_code_tag_list(config.get("output_mimetypes") or []))),
|
||||
)
|
||||
|
||||
@@ -383,10 +349,6 @@ def obj_to_yaml(obj: Any, indent: int = 0) -> str:
|
||||
return f" {str(obj)}"
|
||||
|
||||
|
||||
def canonical_binary_name(name: str) -> str:
    """Translate a known alias into its canonical binary name.

    Names with no entry in ``CANONICAL_BINARY_ALIASES`` are returned unchanged.
    """
    if name in CANONICAL_BINARY_ALIASES:
        return CANONICAL_BINARY_ALIASES[name]
    return name
|
||||
|
||||
|
||||
def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:
|
||||
return (
|
||||
int(binary.status == Binary.StatusChoices.INSTALLED),
|
||||
@@ -399,24 +361,11 @@ def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:
|
||||
def get_db_binaries_by_name() -> dict[str, Binary]:
|
||||
grouped: dict[str, list[Binary]] = {}
|
||||
for binary in Binary.objects.all():
|
||||
grouped.setdefault(canonical_binary_name(binary.name), []).append(binary)
|
||||
grouped.setdefault(binary.name, []).append(binary)
|
||||
|
||||
return {name: max(records, key=_binary_sort_key) for name, records in grouped.items()}
|
||||
|
||||
|
||||
def serialize_binary_record(name: str, binary: Binary | None) -> dict[str, Any]:
    """Flatten a ``Binary`` row, or the lack of one, into a dict of display values.

    Args:
        name: Requested binary name; stored under ``"name"`` after alias
            canonicalisation.
        binary: The persisted record, or ``None`` when nothing is recorded.

    Returns:
        A dict with string fields ``name``, ``version``, ``binprovider``,
        ``abspath``, ``sha256`` and ``status`` (empty strings when missing) and
        a boolean ``is_available`` that is true only for a record whose status
        is INSTALLED and whose ``abspath`` is non-empty.
    """

    def text(attr: str) -> str:
        # A missing attribute and a falsy value both collapse to "".
        return str(getattr(binary, attr, "") or "")

    installed = bool(binary and binary.status == Binary.StatusChoices.INSTALLED)

    record: dict[str, Any] = {"name": canonical_binary_name(name)}
    for attr in ("version", "binprovider", "abspath", "sha256", "status"):
        record[attr] = text(attr)
    record["is_available"] = installed and bool(getattr(binary, "abspath", "") or "")
    return record
|
||||
|
||||
|
||||
def get_filesystem_plugins() -> dict[str, dict[str, Any]]:
|
||||
"""Discover plugins from filesystem directories."""
|
||||
import json
|
||||
@@ -474,14 +423,14 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
all_binary_names = sorted(db_binaries.keys())
|
||||
|
||||
for name in all_binary_names:
|
||||
merged = serialize_binary_record(name, db_binaries.get(name))
|
||||
binary = db_binaries.get(name)
|
||||
|
||||
rows["Binary Name"].append(ItemLink(name, key=name))
|
||||
|
||||
if merged["is_available"]:
|
||||
rows["Found Version"].append(f"✅ {merged['version']}" if merged["version"] else "✅ found")
|
||||
rows["Provided By"].append(merged["binprovider"] or "-")
|
||||
rows["Found Abspath"].append(merged["abspath"] or "-")
|
||||
if binary and binary.is_valid:
|
||||
rows["Found Version"].append(f"✅ {binary.version}" if binary.version else "✅ found")
|
||||
rows["Provided By"].append(binary.binprovider or "-")
|
||||
rows["Found Abspath"].append(binary.abspath or "-")
|
||||
else:
|
||||
rows["Found Version"].append("❌ missing")
|
||||
rows["Provided By"].append("-")
|
||||
@@ -496,22 +445,20 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
@render_with_item_view
|
||||
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
assert is_superuser(request), "Must be a superuser to view configuration settings."
|
||||
key = canonical_binary_name(key)
|
||||
|
||||
db_binary = get_db_binaries_by_name().get(key)
|
||||
merged = serialize_binary_record(key, db_binary)
|
||||
|
||||
if merged["is_available"]:
|
||||
if db_binary and db_binary.is_valid:
|
||||
binary_data = db_binary.to_json()
|
||||
section: SectionData = {
|
||||
"name": key,
|
||||
"description": mark_safe(render_binary_detail_description(key, merged, db_binary)),
|
||||
"description": mark_safe(render_binary_detail_description(key, binary_data, db_binary)),
|
||||
"fields": {
|
||||
"name": key,
|
||||
"binprovider": merged["binprovider"] or "-",
|
||||
"abspath": merged["abspath"] or "not found",
|
||||
"version": merged["version"] or "unknown",
|
||||
"sha256": merged["sha256"],
|
||||
"status": merged["status"],
|
||||
"binprovider": db_binary.binprovider or "-",
|
||||
"abspath": db_binary.abspath or "not found",
|
||||
"version": db_binary.version or "unknown",
|
||||
"sha256": db_binary.sha256,
|
||||
"status": db_binary.status,
|
||||
},
|
||||
"help_texts": {},
|
||||
}
|
||||
@@ -526,10 +473,10 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
"description": "No persisted Binary record found",
|
||||
"fields": {
|
||||
"name": key,
|
||||
"binprovider": merged["binprovider"] or "not recorded",
|
||||
"abspath": merged["abspath"] or "not recorded",
|
||||
"version": merged["version"] or "N/A",
|
||||
"status": merged["status"] or "unrecorded",
|
||||
"binprovider": db_binary.binprovider if db_binary else "not recorded",
|
||||
"abspath": db_binary.abspath if db_binary else "not recorded",
|
||||
"version": db_binary.version if db_binary else "N/A",
|
||||
"status": db_binary.status if db_binary else "unrecorded",
|
||||
},
|
||||
"help_texts": {},
|
||||
}
|
||||
|
||||
@@ -1226,7 +1226,7 @@ def live_progress_view(request):
|
||||
return (plugin, plugin, "unknown", "")
|
||||
|
||||
phase = "unknown"
|
||||
if normalized_hook_name.startswith("on_Install__"):
|
||||
if normalized_hook_name == "InstallEvent":
|
||||
phase = "install"
|
||||
elif normalized_hook_name.startswith("on_CrawlSetup__"):
|
||||
phase = "crawl"
|
||||
@@ -1966,7 +1966,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
|
||||
Priority order (highest to lowest):
|
||||
<ol>
|
||||
<li><b style="color: blue">Environment</b> - Environment variables</li>
|
||||
<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
|
||||
<li><b style="color: purple">Machine</b> - Machine-specific overrides
|
||||
{f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ""}
|
||||
</li>
|
||||
<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>
|
||||
|
||||
@@ -9,11 +9,14 @@ ArchiveBox no longer drives plugin execution itself during normal crawls.
|
||||
- parses hook stdout JSONL records into ArchiveBox models when needed
|
||||
|
||||
Hook-backed event families are discovered from filenames like:
|
||||
on_Install__*
|
||||
on_BinaryRequest__*
|
||||
on_CrawlSetup__*
|
||||
on_Snapshot__*
|
||||
|
||||
InstallEvent itself is still part of the runtime lifecycle, but it has no
|
||||
corresponding hook family. Its dependency declarations come directly from each
|
||||
plugin's `config.json > required_binaries`.
|
||||
|
||||
Lifecycle event names like `InstallEvent` or `SnapshotCleanupEvent` are
|
||||
normalized to the corresponding `on_{EventFamily}__*` prefix by a simple
|
||||
string transform. If no scripts exist for that prefix, discovery returns `[]`.
|
||||
@@ -212,7 +215,7 @@ def discover_hooks(
|
||||
pattern_direct = f"on_{hook_event_name}__*.{ext}"
|
||||
hooks.extend(base_dir.glob(pattern_direct))
|
||||
|
||||
# Binary install hooks are provider hooks, not end-user extractors. They
|
||||
# Binary provider hooks are not end-user extractors. They
|
||||
# self-filter via `binproviders`, so applying the PLUGINS whitelist here
|
||||
# can hide the very installer needed by a selected plugin (e.g.
|
||||
# `--plugins=singlefile` still needs the `npm` BinaryRequest hook).
|
||||
@@ -394,54 +397,14 @@ def run_hook(
|
||||
# Derive LIB_BIN_DIR from LIB_DIR if not set
|
||||
lib_bin_dir = Path(lib_dir) / "bin"
|
||||
|
||||
# Build PATH with proper precedence:
|
||||
# 1. LIB_BIN_DIR (highest priority - local symlinked binaries)
|
||||
# 2. Machine.config.PATH (pip/npm bin dirs from providers)
|
||||
# 3. os.environ['PATH'] (system PATH)
|
||||
|
||||
if lib_bin_dir:
|
||||
lib_bin_dir = str(lib_bin_dir)
|
||||
env["LIB_BIN_DIR"] = lib_bin_dir
|
||||
|
||||
# Start with base PATH
|
||||
current_path = env.get("PATH", "")
|
||||
|
||||
# Prepend Machine.config.PATH if it exists (treat as extra entries, not replacement)
|
||||
try:
|
||||
from archivebox.machine.models import Machine
|
||||
|
||||
machine = Machine.current()
|
||||
if machine and machine.config:
|
||||
machine_path = machine.config.get("PATH")
|
||||
if machine_path:
|
||||
# Prepend machine_path to current PATH
|
||||
current_path = f"{machine_path}:{current_path}" if current_path else machine_path
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Finally prepend LIB_BIN_DIR to the front (highest priority)
|
||||
if lib_bin_dir:
|
||||
if not current_path.startswith(f"{lib_bin_dir}:"):
|
||||
env["PATH"] = f"{lib_bin_dir}:{current_path}" if current_path else lib_bin_dir
|
||||
else:
|
||||
env["PATH"] = current_path
|
||||
else:
|
||||
env["PATH"] = current_path
|
||||
|
||||
# Set NODE_PATH for Node.js module resolution
|
||||
# Priority: config dict > Machine.config > derive from LIB_DIR
|
||||
# Set NODE_PATH for Node.js module resolution.
|
||||
# Priority: config dict > derive from LIB_DIR
|
||||
node_path = config.get("NODE_PATH")
|
||||
if not node_path and lib_dir:
|
||||
# Derive from LIB_DIR/npm/node_modules (create if needed)
|
||||
node_modules_dir = Path(lib_dir) / "npm" / "node_modules"
|
||||
node_modules_dir.mkdir(parents=True, exist_ok=True)
|
||||
node_path = str(node_modules_dir)
|
||||
if not node_path:
|
||||
try:
|
||||
# Fallback to Machine.config
|
||||
node_path = machine.config.get("NODE_MODULES_DIR")
|
||||
except Exception:
|
||||
pass
|
||||
if node_path:
|
||||
env["NODE_PATH"] = node_path
|
||||
env["NODE_MODULES_DIR"] = node_path # For backwards compatibility
|
||||
@@ -472,6 +435,41 @@ def run_hook(
|
||||
else:
|
||||
env[key] = str(value)
|
||||
|
||||
# Build PATH with proper precedence:
|
||||
# 1. path-like *_BINARY parents (explicit binary overrides / cached abspaths)
|
||||
# 2. LIB_BIN_DIR (local symlinked binaries)
|
||||
# 3. existing PATH
|
||||
runtime_bin_dirs: list[str] = []
|
||||
if lib_bin_dir:
|
||||
lib_bin_dir = str(lib_bin_dir)
|
||||
env["LIB_BIN_DIR"] = lib_bin_dir
|
||||
for key, raw_value in env.items():
|
||||
if not key.endswith("_BINARY"):
|
||||
continue
|
||||
value = str(raw_value or "").strip()
|
||||
if not value:
|
||||
continue
|
||||
path_value = Path(value).expanduser()
|
||||
if not (path_value.is_absolute() or "/" in value or "\\" in value):
|
||||
continue
|
||||
binary_dir = str(path_value.resolve(strict=False).parent)
|
||||
if binary_dir and binary_dir not in runtime_bin_dirs:
|
||||
runtime_bin_dirs.append(binary_dir)
|
||||
if lib_bin_dir and lib_bin_dir not in runtime_bin_dirs:
|
||||
runtime_bin_dirs.append(lib_bin_dir)
|
||||
uv_value = str(env.get("UV") or "").strip()
|
||||
if uv_value:
|
||||
uv_bin_dir = str(Path(uv_value).expanduser().resolve(strict=False).parent)
|
||||
if uv_bin_dir and uv_bin_dir not in runtime_bin_dirs:
|
||||
runtime_bin_dirs.append(uv_bin_dir)
|
||||
|
||||
current_path = env.get("PATH", "")
|
||||
path_parts = [part for part in current_path.split(os.pathsep) if part]
|
||||
for extra_dir in reversed(runtime_bin_dirs):
|
||||
if extra_dir not in path_parts:
|
||||
path_parts.insert(0, extra_dir)
|
||||
env["PATH"] = os.pathsep.join(path_parts)
|
||||
|
||||
# Create output directory if needed
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
|
||||
@@ -101,8 +101,6 @@ def _get_process_binary_env_keys(plugin_name: str, hook_path: str, env: dict[str
|
||||
schema_keys.sort(
|
||||
key=lambda key: (
|
||||
key != f"{plugin_key}_BINARY",
|
||||
key.endswith("_NODE_BINARY"),
|
||||
key.endswith("_CHROME_BINARY"),
|
||||
key,
|
||||
),
|
||||
)
|
||||
@@ -117,8 +115,6 @@ def _get_process_binary_env_keys(plugin_name: str, hook_path: str, env: dict[str
|
||||
|
||||
hook_suffix = Path(hook_path).suffix.lower()
|
||||
if hook_suffix == ".js":
|
||||
if plugin_key:
|
||||
add(f"{plugin_key}_NODE_BINARY")
|
||||
add("NODE_BINARY")
|
||||
|
||||
return keys
|
||||
@@ -160,7 +156,7 @@ class Machine(ModelWithHealthStats):
|
||||
default=dict,
|
||||
null=True,
|
||||
blank=True,
|
||||
help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)",
|
||||
help_text="Machine-specific config overrides.",
|
||||
)
|
||||
num_uses_failed = models.PositiveIntegerField(default=0)
|
||||
num_uses_succeeded = models.PositiveIntegerField(default=0)
|
||||
@@ -176,24 +172,13 @@ class Machine(ModelWithHealthStats):
|
||||
global _CURRENT_MACHINE
|
||||
if _CURRENT_MACHINE:
|
||||
if timezone.now() < _CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL):
|
||||
return cls._sanitize_config(cls._hydrate_config_from_sibling(_CURRENT_MACHINE))
|
||||
return cls._sanitize_config(_CURRENT_MACHINE)
|
||||
_CURRENT_MACHINE = None
|
||||
_CURRENT_MACHINE, _ = cls.objects.update_or_create(
|
||||
guid=get_host_guid(),
|
||||
defaults={"hostname": socket.gethostname(), **get_os_info(), **get_vm_info(), "stats": get_host_stats()},
|
||||
)
|
||||
return cls._sanitize_config(cls._hydrate_config_from_sibling(_CURRENT_MACHINE))
|
||||
|
||||
@classmethod
def _hydrate_config_from_sibling(cls, machine: Machine) -> Machine:
    """Fill an empty ``machine.config`` from another row with the same hostname.

    When *machine* already has a truthy config it is returned untouched.
    Otherwise the most recently modified other row with the same hostname and
    a non-empty config is looked up; if one exists, a copy of its config is
    stored on *machine* and saved. The same *machine* object is always returned.
    """
    if not machine.config:
        candidates = (
            cls.objects.exclude(pk=machine.pk)
            .filter(hostname=machine.hostname)
            .exclude(config={})
            .order_by("-modified_at")
        )
        donor = candidates.first()
        if donor and donor.config:
            # Copy so the two rows never share one dict object.
            machine.config = dict(donor.config)
            machine.save(update_fields=["config", "modified_at"])
    return machine
|
||||
return cls._sanitize_config(_CURRENT_MACHINE)
|
||||
|
||||
@classmethod
|
||||
def _sanitize_config(cls, machine: Machine) -> Machine:
|
||||
@@ -622,12 +607,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
# Get merged config (Binary doesn't have crawl/snapshot context).
|
||||
# Binary workers can install several dependencies in one process, so
|
||||
# refresh from the latest persisted machine config before each hook run.
|
||||
config = get_config()
|
||||
current_machine = Machine.current()
|
||||
if current_machine.config:
|
||||
config.update(current_machine.config)
|
||||
|
||||
# ArchiveBox installs the puppeteer package and Chromium in separate
|
||||
# hook phases. Suppress puppeteer's bundled browser download during the
|
||||
@@ -760,6 +740,11 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
|
||||
binary_abspath = Path(self.abspath).resolve()
|
||||
lib_bin_dir = Path(lib_bin_dir).resolve()
|
||||
binary_parts = binary_abspath.parts
|
||||
try:
|
||||
app_index = next(index for index, part in enumerate(binary_parts) if part.endswith(".app"))
|
||||
except StopIteration:
|
||||
app_index = -1
|
||||
|
||||
# Create LIB_BIN_DIR if it doesn't exist
|
||||
try:
|
||||
@@ -772,6 +757,15 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
|
||||
binary_name = binary_abspath.name
|
||||
symlink_path = lib_bin_dir / binary_name
|
||||
|
||||
if app_index != -1 and len(binary_parts) > app_index + 2 and binary_parts[app_index + 1 : app_index + 3] == ("Contents", "MacOS"):
|
||||
if symlink_path.exists() or symlink_path.is_symlink():
|
||||
try:
|
||||
symlink_path.unlink()
|
||||
except (OSError, PermissionError) as e:
|
||||
print(f"Failed to remove existing file at {symlink_path}: {e}", file=sys.stderr)
|
||||
return None
|
||||
return binary_abspath
|
||||
|
||||
# Remove existing symlink/file if it exists
|
||||
if symlink_path.exists() or symlink_path.is_symlink():
|
||||
try:
|
||||
|
||||
@@ -2,7 +2,6 @@ from .archive_result_service import ArchiveResultService
|
||||
from .binary_service import BinaryService
|
||||
from .crawl_service import CrawlService
|
||||
from .machine_service import MachineService
|
||||
from .process_request_service import ProcessRequestService
|
||||
from .process_service import ProcessService
|
||||
from .runner import run_binary, run_crawl, run_install, run_pending_crawls
|
||||
from .snapshot_service import SnapshotService
|
||||
@@ -13,7 +12,6 @@ __all__ = [
|
||||
"BinaryService",
|
||||
"CrawlService",
|
||||
"MachineService",
|
||||
"ProcessRequestService",
|
||||
"ProcessService",
|
||||
"SnapshotService",
|
||||
"TagService",
|
||||
|
||||
@@ -14,6 +14,23 @@ class BinaryService(BaseService):
|
||||
|
||||
async def on_BinaryRequestEvent__Outer(self, event: BinaryRequestEvent) -> None:
    """Project a binary request, then answer it from the cache when possible.

    The request is first passed to ``_project_binary``. If ``_load_cached_binary``
    then finds a matching record, a ``BinaryEvent`` is emitted on the bus
    carrying the cached abspath, version, sha256 and binprovider. For
    ``binproviders``, ``overrides`` and ``machine_id`` the value on the request
    wins whenever it is truthy, with the cached value as the fallback.
    """
    await run_db_op(self._project_binary, event)

    hit = await run_db_op(self._load_cached_binary, event)
    if hit is None:
        return

    resolved_event = BinaryEvent(
        name=event.name,
        plugin_name=event.plugin_name,
        hook_name=event.hook_name,
        abspath=hit["abspath"],
        version=hit["version"],
        sha256=hit["sha256"],
        binproviders=event.binproviders or hit["binproviders"],
        binprovider=hit["binprovider"],
        overrides=event.overrides or hit["overrides"],
        binary_id=event.binary_id,
        machine_id=event.machine_id or hit["machine_id"],
    )
    await self.bus.emit(resolved_event)
|
||||
|
||||
async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None:
|
||||
resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
|
||||
@@ -44,6 +61,29 @@ class BinaryService(BaseService):
|
||||
},
|
||||
)
|
||||
|
||||
def _load_cached_binary(self, event: BinaryRequestEvent) -> dict[str, str] | None:
    """Look up the newest installed ``Binary`` row that satisfies *event*.

    Only rows for the current machine, with the requested name, status
    INSTALLED and a non-empty ``abspath`` are considered; the most recently
    modified one wins.

    Returns:
        ``None`` when no such row exists, otherwise a dict with ``abspath``,
        ``version``, ``sha256``, ``binproviders``, ``binprovider``,
        ``machine_id`` and ``overrides``.
    """
    from archivebox.machine.models import Binary, Machine

    candidates = Binary.objects.filter(
        machine=Machine.current(),
        name=event.name,
        status=Binary.StatusChoices.INSTALLED,
    )
    # Skip rows whose path is blank or NULL.
    candidates = candidates.exclude(abspath="").exclude(abspath__isnull=True)
    row = candidates.order_by("-modified_at").first()
    if row is None:
        return None

    return {
        "abspath": row.abspath,
        "version": row.version or "",
        "sha256": row.sha256 or "",
        "binproviders": row.binproviders or "",
        "binprovider": row.binprovider or "",
        "machine_id": str(row.machine_id),
        # NOTE: this value falls back to a dict, so the declared
        # dict[str, str] return type is not exact for this key.
        "overrides": row.overrides or {},
    }
|
||||
|
||||
def _resolve_installed_binary_metadata(self, event: BinaryEvent) -> dict[str, str]:
|
||||
resolved = {
|
||||
"abspath": event.abspath or "",
|
||||
@@ -77,12 +117,11 @@ class BinaryService(BaseService):
|
||||
"overrides": event.overrides or {},
|
||||
}
|
||||
binary = load_binary(spec)
|
||||
resolved["abspath"] = str(getattr(binary, "abspath", None) or resolved["abspath"] or "")
|
||||
resolved["version"] = str(getattr(binary, "version", None) or resolved["version"] or "")
|
||||
resolved["sha256"] = str(getattr(binary, "sha256", None) or resolved["sha256"] or "")
|
||||
provider_name = getattr(getattr(binary, "loaded_binprovider", None), "name", None)
|
||||
if provider_name:
|
||||
resolved["binprovider"] = str(provider_name)
|
||||
resolved["abspath"] = str(binary.abspath or resolved["abspath"] or "")
|
||||
resolved["version"] = str(binary.version or resolved["version"] or "")
|
||||
resolved["sha256"] = str(binary.sha256 or resolved["sha256"] or "")
|
||||
if binary.loaded_binprovider is not None and binary.loaded_binprovider.name:
|
||||
resolved["binprovider"] = str(binary.loaded_binprovider.name)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
@@ -14,13 +14,13 @@ class MachineService(BaseService):
|
||||
await run_db_op(self._project, event)
|
||||
|
||||
def _project(self, event: MachineEvent) -> None:
|
||||
from archivebox.machine.models import Machine
|
||||
from archivebox.machine.models import Machine, _sanitize_machine_config
|
||||
|
||||
machine = Machine.current()
|
||||
config = dict(machine.config or {})
|
||||
|
||||
if event.config is not None:
|
||||
config.update(event.config)
|
||||
config.update(_sanitize_machine_config(event.config))
|
||||
elif event.method == "update":
|
||||
key = event.key.replace("config/", "", 1).strip()
|
||||
if key:
|
||||
@@ -28,5 +28,5 @@ class MachineService(BaseService):
|
||||
else:
|
||||
return
|
||||
|
||||
machine.config = config
|
||||
machine.config = _sanitize_machine_config(config)
|
||||
machine.save(update_fields=["config", "modified_at"])
|
||||
|
||||
@@ -1,179 +0,0 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import asyncio
|
||||
from datetime import datetime, timezone
|
||||
import json
|
||||
from pathlib import Path
|
||||
import shlex
|
||||
import socket
|
||||
import time
|
||||
from typing import ClassVar
|
||||
|
||||
from abxbus import BaseEvent
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
|
||||
def _is_port_listening(host: str, port: int) -> bool:
|
||||
if not host or not port:
|
||||
return False
|
||||
try:
|
||||
with socket.create_connection((host, port), timeout=0.5):
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
|
||||
def _supervisor_env(env: dict[str, str]) -> str:
|
||||
pairs = []
|
||||
for key, value in env.items():
|
||||
escaped = value.replace('"', '\\"')
|
||||
pairs.append(f'{key}="{escaped}"')
|
||||
return ",".join(pairs)
|
||||
|
||||
|
||||
def _iso_from_epoch(value: object) -> str:
|
||||
if not isinstance(value, (int, float)) or value <= 0:
|
||||
return ""
|
||||
return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
    """Make sure a supervisord worker for *process_event* is up, starting it if needed.

    The worker is named after ``process_event.hook_name`` and runs
    ``hook_path`` with ``hook_args`` inside ``output_dir`` (created when
    absent), logging stdout and stderr to ``<hook_name>.stdout.log`` there.

    A worker counts as ready when supervisor reports it as ``RUNNING`` and,
    if both a startup host and port are set on the event, that port accepts
    connections. An already-ready worker is returned as-is; otherwise the
    worker is started and polled every 0.1 s until it is ready or
    ``daemon_startup_timeout`` (at least 0.5 s) has elapsed.

    Returns:
        The worker info dict from ``get_worker`` once ready; after a timeout,
        the value returned by ``start_worker`` if it is a dict, else ``{}``.
    """
    from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker

    def is_ready(state: object) -> bool:
        # Shared readiness test for both the pre-check and the polling loop.
        running = isinstance(state, dict) and state.get("statename") == "RUNNING"
        if not running:
            return False
        if not process_event.daemon_startup_host:
            return True
        if not process_event.daemon_startup_port:
            return True
        return _is_port_listening(process_event.daemon_startup_host, process_event.daemon_startup_port)

    log_dir = Path(process_event.output_dir)
    log_dir.mkdir(parents=True, exist_ok=True)
    name = process_event.hook_name
    supervisor = get_or_create_supervisord_process(daemonize=True)

    already = get_worker(supervisor, name)
    if is_ready(already):
        return already

    spec = {
        "name": name,
        "command": shlex.join([process_event.hook_path, *process_event.hook_args]),
        "directory": str(log_dir),
        "autostart": "false",
        "autorestart": "true",
        "stdout_logfile": str(log_dir / f"{name}.stdout.log"),
        "redirect_stderr": "true",
    }
    if process_event.env:
        spec["environment"] = _supervisor_env(process_event.env)

    started = start_worker(supervisor, spec)

    deadline = time.monotonic() + max(float(process_event.daemon_startup_timeout), 0.5)
    while time.monotonic() < deadline:
        snapshot = get_worker(supervisor, name)
        if is_ready(snapshot):
            return snapshot
        time.sleep(0.1)

    # Timed out: fall back to whatever start_worker handed back.
    if isinstance(started, dict):
        return started
    return {}
|
||||
|
||||
|
||||
class ProcessRequestService(BaseService):
    """Turns ``ProcessEvent`` JSON records found on hook stdout into bus events."""

    LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent]
    EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]

    async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
        """Handle one stdout line that may carry a ``ProcessEvent`` record.

        Lines that are not JSON objects with ``type == "ProcessEvent"`` are
        ignored. Non-daemon requests are re-emitted as a ``ProcessEvent``.
        Daemon requests are started through supervisord, after which either a
        ``ProcessStartedEvent`` or a ``ProcessCompletedEvent`` is emitted.

        Raises:
            RuntimeError: When the daemon worker is not RUNNING with a pid
                after ``_ensure_worker`` returns.
        """
        try:
            record = json.loads(event.line)
        except (json.JSONDecodeError, ValueError):
            return
        if not isinstance(record, dict):
            return
        if record.pop("type", "") != "ProcessEvent":
            return

        request = ProcessEvent(
            plugin_name=record.get("plugin_name") or event.plugin_name,
            hook_name=record.get("hook_name") or "process_request",
            hook_path=record["hook_path"],
            hook_args=[str(arg) for arg in record.get("hook_args", [])],
            is_background=bool(record.get("is_background", True)),
            output_dir=record.get("output_dir") or event.output_dir,
            env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
            snapshot_id=record.get("snapshot_id") or event.snapshot_id,
            timeout=int(record.get("timeout") or 60),
            daemon=bool(record.get("daemon", False)),
            daemon_startup_host=str(record.get("daemon_startup_host") or ""),
            daemon_startup_port=int(record.get("daemon_startup_port") or 0),
            daemon_startup_timeout=float(record.get("daemon_startup_timeout") or 0.0),
            process_type=str(record.get("process_type") or ""),
            worker_type=str(record.get("worker_type") or ""),
            event_timeout=float(record.get("event_timeout") or 360.0),
            event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
        )
        if not request.daemon:
            await self.bus.emit(request)
            return

        # _ensure_worker blocks while polling, so it runs off the event loop.
        worker = await asyncio.to_thread(_ensure_worker, request)
        process_id = str(record.get("process_id") or f"worker:{request.hook_name}")
        start_ts = _iso_from_epoch(worker.get("start"))
        pid = int(worker.get("pid") or 0)
        statename = str(worker.get("statename") or "")
        exitstatus = int(worker.get("exitstatus") or 0)

        # Fields shared by the started and completed events.
        shared = {
            "plugin_name": request.plugin_name,
            "hook_name": request.hook_name,
            "hook_path": request.hook_path,
            "hook_args": request.hook_args,
            "output_dir": request.output_dir,
            "env": request.env,
            "pid": pid,
            "process_id": process_id,
            "snapshot_id": request.snapshot_id,
            "is_background": True,
            "process_type": request.process_type or "worker",
            "worker_type": request.worker_type or request.plugin_name,
            "start_ts": start_ts,
        }

        if statename == "RUNNING" and pid:
            await self.bus.emit(ProcessStartedEvent(timeout=request.timeout, **shared))
            return

        if statename:
            stderr = f"Worker {request.hook_name} state={statename} exitstatus={exitstatus}"
        else:
            stderr = f"Worker {request.hook_name} failed to start"
        await self.bus.emit(
            ProcessCompletedEvent(
                stdout="",
                stderr=stderr,
                exit_code=exitstatus or 1,
                end_ts=datetime.now(tz=timezone.utc).isoformat(),
                **shared,
            ),
        )
        raise RuntimeError(stderr)
|
||||
@@ -1,11 +1,19 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import datetime
|
||||
from typing import TYPE_CHECKING
|
||||
import asyncio
|
||||
from datetime import datetime, timezone as datetime_timezone
|
||||
import json
|
||||
from pathlib import Path
|
||||
import shlex
|
||||
import socket
|
||||
import time
|
||||
from typing import TYPE_CHECKING, Any, ClassVar
|
||||
from urllib.parse import urlparse
|
||||
|
||||
from django.utils import timezone
|
||||
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
|
||||
from abxbus import BaseEvent
|
||||
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
|
||||
from abx_dl.services.base import BaseService
|
||||
|
||||
from .db import run_db_op
|
||||
@@ -14,6 +22,9 @@ if TYPE_CHECKING:
|
||||
from archivebox.machine.models import Process
|
||||
|
||||
|
||||
WORKER_READY_TIMEOUT = 10.0
|
||||
|
||||
|
||||
def parse_event_datetime(value: str | None):
|
||||
if not value:
|
||||
return None
|
||||
@@ -26,14 +37,218 @@ def parse_event_datetime(value: str | None):
|
||||
return dt
|
||||
|
||||
|
||||
def _is_port_listening(host: str, port: int) -> bool:
|
||||
if not host or not port:
|
||||
return False
|
||||
try:
|
||||
with socket.create_connection((host, port), timeout=0.5):
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
|
||||
def _worker_socket_from_url(url: str) -> tuple[str, int] | None:
|
||||
if not url:
|
||||
return None
|
||||
parsed = urlparse(url)
|
||||
if parsed.scheme != "tcp" or not parsed.hostname or not parsed.port:
|
||||
return None
|
||||
return parsed.hostname, parsed.port
|
||||
|
||||
|
||||
def _supervisor_env(env: dict[str, str]) -> str:
|
||||
pairs = []
|
||||
for key, value in env.items():
|
||||
escaped = value.replace('"', '\\"')
|
||||
pairs.append(f'{key}="{escaped}"')
|
||||
return ",".join(pairs)
|
||||
|
||||
|
||||
def _iso_from_epoch(value: object) -> str:
|
||||
if not isinstance(value, (int, float)) or value <= 0:
|
||||
return ""
|
||||
return datetime.fromtimestamp(value, tz=datetime_timezone.utc).isoformat()
|
||||
|
||||
|
||||
def _int_from_object(value: object) -> int:
|
||||
if isinstance(value, bool):
|
||||
return int(value)
|
||||
if isinstance(value, int):
|
||||
return value
|
||||
if isinstance(value, float):
|
||||
return int(value)
|
||||
if isinstance(value, str):
|
||||
try:
|
||||
return int(value)
|
||||
except ValueError:
|
||||
return 0
|
||||
return 0
|
||||
|
||||
|
||||
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
    """Make sure a supervisord worker for ``process_event`` is running.

    An already running worker is reused when it passes the readiness check.
    Otherwise a worker definition is built from the event and started, and
    the worker is polled every 0.1s for up to ``WORKER_READY_TIMEOUT``
    seconds.

    Args:
        process_event: Event describing the hook command, working directory,
            environment and optional ``tcp://`` URL to probe for readiness.

    Returns:
        The worker state dict reported by supervisord once ready; on timeout,
        the value returned by ``start_worker`` if it is a dict, else ``{}``.
    """
    from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker

    workdir = Path(process_event.output_dir)
    workdir.mkdir(parents=True, exist_ok=True)
    name = process_event.hook_name
    supervisor = get_or_create_supervisord_process(daemonize=True)
    endpoint = _worker_socket_from_url(getattr(process_event, "url", ""))

    def _is_ready(state: object) -> bool:
        # Ready means RUNNING and, when a TCP endpoint is known, that the
        # endpoint accepts connections.
        if not isinstance(state, dict):
            return False
        if state.get("statename") != "RUNNING":
            return False
        if endpoint is None:
            return True
        return _is_port_listening(*endpoint)

    existing = get_worker(supervisor, name)
    if _is_ready(existing):
        return existing

    daemon = {
        "name": name,
        "command": shlex.join([process_event.hook_path, *process_event.hook_args]),
        "directory": str(workdir),
        "autostart": "false",
        "autorestart": "true",
        "stdout_logfile": str(workdir / f"{name}.stdout.log"),
        "redirect_stderr": "true",
    }
    if process_event.env:
        daemon["environment"] = _supervisor_env(process_event.env)

    started = start_worker(supervisor, daemon)
    give_up_at = time.monotonic() + WORKER_READY_TIMEOUT
    while time.monotonic() < give_up_at:
        latest = get_worker(supervisor, name)
        if _is_ready(latest):
            return latest
        time.sleep(0.1)
    if isinstance(started, dict):
        return started
    return {}
|
||||
|
||||
|
||||
class ProcessService(BaseService):
|
||||
LISTENS_TO = [ProcessStartedEvent, ProcessCompletedEvent]
|
||||
EMITS = []
|
||||
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent, ProcessStartedEvent, ProcessCompletedEvent]
|
||||
EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]
|
||||
|
||||
def __init__(self, bus) -> None:
    """Create the service on ``bus`` with an empty process-id mapping."""
    # Assigned before delegating to the base initializer, as in the original.
    self.process_ids: dict[str, str] = dict()
    super().__init__(bus)
|
||||
|
||||
async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
    """Turn a ``ProcessEvent`` JSON record printed by a hook into bus events.

    Lines that are not JSON objects with ``type == "ProcessEvent"`` are
    ignored. Record keys that are not mapped explicitly are forwarded
    unchanged to every event built here. Non-daemon requests are re-emitted
    as a ``ProcessEvent``. Daemon requests are started through supervisord,
    after which either a ``ProcessStartedEvent`` or a
    ``ProcessCompletedEvent`` is emitted.

    Args:
        event: The stdout line plus the plugin/snapshot/output-dir context
            used as defaults for fields missing from the record.

    Raises:
        KeyError: When the record has no ``hook_path``.
        RuntimeError: When the daemon worker is not RUNNING with a pid after
            ``_ensure_worker`` returns.
    """
    try:
        record = json.loads(event.line)
    except (json.JSONDecodeError, ValueError):
        return
    if not isinstance(record, dict) or record.get("type") != "ProcessEvent":
        return

    # Keys consumed explicitly below; everything else is passed through.
    mapped_keys = {
        "type",
        "plugin_name",
        "hook_name",
        "hook_path",
        "hook_args",
        "is_background",
        "output_dir",
        "env",
        "snapshot_id",
        "process_id",
        "url",
        "timeout",
        "daemon",
        "process_type",
        "worker_type",
        "event_timeout",
        "event_handler_timeout",
    }
    passthrough_fields: dict[str, Any] = {key: value for key, value in record.items() if key not in mapped_keys}
    process_event = ProcessEvent(
        plugin_name=record.get("plugin_name") or event.plugin_name,
        hook_name=record.get("hook_name") or "process",
        hook_path=record["hook_path"],
        hook_args=[str(arg) for arg in record.get("hook_args", [])],
        is_background=bool(record.get("is_background", True)),
        output_dir=record.get("output_dir") or event.output_dir,
        env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
        snapshot_id=record.get("snapshot_id") or event.snapshot_id,
        timeout=int(record.get("timeout") or 60),
        daemon=bool(record.get("daemon", False)),
        url=str(record.get("url") or ""),
        process_type=str(record.get("process_type") or ""),
        worker_type=str(record.get("worker_type") or ""),
        event_timeout=float(record.get("event_timeout") or 360.0),
        event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
        **passthrough_fields,
    )
    if not process_event.daemon:
        await self.bus.emit(process_event)
        return

    # _ensure_worker blocks while polling, so it runs off the event loop.
    proc = await asyncio.to_thread(_ensure_worker, process_event)
    process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
    start_ts = _iso_from_epoch(proc.get("start"))
    pid = _int_from_object(proc.get("pid"))
    statename = str(proc.get("statename") or "")
    exitstatus = _int_from_object(proc.get("exitstatus"))
    process_type = process_event.process_type or "worker"
    worker_type = process_event.worker_type or process_event.plugin_name

    if statename == "RUNNING" and pid:
        started_fields: dict[str, Any] = {
            "plugin_name": process_event.plugin_name,
            "hook_name": process_event.hook_name,
            "hook_path": process_event.hook_path,
            "hook_args": process_event.hook_args,
            "output_dir": process_event.output_dir,
            "env": process_event.env,
            "timeout": process_event.timeout,
            "pid": pid,
            "process_id": process_id,
            "snapshot_id": process_event.snapshot_id,
            "is_background": True,
            "url": process_event.url,
            "process_type": process_type,
            "worker_type": worker_type,
            "start_ts": start_ts,
        }
        # Merge instead of mixing keywords with ** so a record key such as
        # "pid" or "start_ts" cannot raise "multiple values for keyword";
        # the values observed from supervisord take precedence.
        await self.bus.emit(ProcessStartedEvent(**{**passthrough_fields, **started_fields}))
        return

    stderr = (
        f"Worker {process_event.hook_name} failed to start"
        if not statename
        else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
    )
    completed_fields: dict[str, Any] = {
        "plugin_name": process_event.plugin_name,
        "hook_name": process_event.hook_name,
        "hook_path": process_event.hook_path,
        "hook_args": process_event.hook_args,
        "env": process_event.env,
        "stdout": "",
        "stderr": stderr,
        "exit_code": exitstatus or 1,
        "output_dir": process_event.output_dir,
        "is_background": True,
        "process_id": process_id,
        "snapshot_id": process_event.snapshot_id,
        "pid": pid,
        "url": process_event.url,
        "process_type": process_type,
        "worker_type": worker_type,
        "start_ts": start_ts,
        "end_ts": datetime.now(tz=datetime_timezone.utc).isoformat(),
    }
    # Same merge as above: explicit failure details win over passthrough keys.
    await self.bus.emit(ProcessCompletedEvent(**{**passthrough_fields, **completed_fields}))
    raise RuntimeError(stderr)
|
||||
|
||||
async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
|
||||
await run_db_op(self._project_started, event)
|
||||
|
||||
@@ -51,7 +266,7 @@ class ProcessService(BaseService):
|
||||
if db_process_id:
|
||||
process = Process.objects.filter(id=db_process_id).first()
|
||||
if process is not None:
|
||||
if process.iface_id != iface.id or process.machine_id != iface.machine_id:
|
||||
if getattr(process, "iface_id", None) != iface.id or process.machine_id != iface.machine_id:
|
||||
process.iface = iface
|
||||
process.machine = iface.machine
|
||||
process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
@@ -84,6 +299,7 @@ class ProcessService(BaseService):
|
||||
env=event.env,
|
||||
timeout=getattr(event, "timeout", 60),
|
||||
pid=event.pid or None,
|
||||
url=getattr(event, "url", "") or None,
|
||||
started_at=parse_event_datetime(getattr(event, "start_ts", "")),
|
||||
status=Process.StatusChoices.RUNNING,
|
||||
retry_at=None,
|
||||
@@ -98,6 +314,7 @@ class ProcessService(BaseService):
|
||||
process.env = event.env
|
||||
process.timeout = event.timeout
|
||||
process.pid = event.pid or None
|
||||
process.url = getattr(event, "url", "") or process.url
|
||||
process.process_type = getattr(event, "process_type", "") or process.process_type
|
||||
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
|
||||
@@ -113,6 +330,7 @@ class ProcessService(BaseService):
|
||||
process.cmd = [event.hook_path, *event.hook_args]
|
||||
process.env = event.env
|
||||
process.pid = event.pid or process.pid
|
||||
process.url = getattr(event, "url", "") or process.url
|
||||
process.process_type = getattr(event, "process_type", "") or process.process_type
|
||||
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
|
||||
process.started_at = parse_event_datetime(event.start_ts) or process.started_at
|
||||
|
||||
@@ -3,6 +3,7 @@ from __future__ import annotations
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
@@ -28,8 +29,6 @@ from abx_dl.orchestrator import (
|
||||
from .archive_result_service import ArchiveResultService
|
||||
from .binary_service import BinaryService
|
||||
from .crawl_service import CrawlService
|
||||
from .machine_service import MachineService
|
||||
from .process_request_service import ProcessRequestService
|
||||
from .process_service import ProcessService
|
||||
from .snapshot_service import SnapshotService
|
||||
from .tag_service import TagService
|
||||
@@ -58,28 +57,34 @@ def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str
|
||||
)
|
||||
|
||||
|
||||
def _binary_env_key(name: str) -> str:
|
||||
normalized = "".join(ch if ch.isalnum() else "_" for ch in name).upper()
|
||||
return f"{normalized}_BINARY"
|
||||
_TEMPLATE_NAME_RE = re.compile(r"^\{([A-Z0-9_]+)\}$")
|
||||
|
||||
|
||||
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str) -> list[str]:
|
||||
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str, config: dict[str, Any]) -> list[str]:
|
||||
keys: list[str] = []
|
||||
if binary_name != "postlight-parser":
|
||||
keys.append(_binary_env_key(binary_name))
|
||||
|
||||
for plugin in plugins.values():
|
||||
for spec in plugin.binaries:
|
||||
template_name = str(spec.get("name") or "").strip()
|
||||
match = _TEMPLATE_NAME_RE.fullmatch(template_name)
|
||||
if match is None:
|
||||
continue
|
||||
key = match.group(1)
|
||||
configured_value = config.get(key)
|
||||
if configured_value is not None and str(configured_value).strip() == binary_name:
|
||||
keys.append(key)
|
||||
for key, prop in plugin.config_schema.items():
|
||||
if key.endswith("_BINARY") and prop.get("default") == binary_name:
|
||||
keys.insert(0, key)
|
||||
keys.append(key)
|
||||
|
||||
return list(dict.fromkeys(keys))
|
||||
|
||||
|
||||
def _installed_binary_config_overrides(plugins: dict[str, Plugin]) -> dict[str, str]:
|
||||
def _installed_binary_config_overrides(plugins: dict[str, Plugin], config: dict[str, Any] | None = None) -> dict[str, str]:
|
||||
from archivebox.machine.models import Binary, Machine
|
||||
|
||||
machine = Machine.current()
|
||||
active_config = dict(config or {})
|
||||
overrides: dict[str, str] = {}
|
||||
shared_lib_dir: Path | None = None
|
||||
pip_home: Path | None = None
|
||||
@@ -98,7 +103,7 @@ def _installed_binary_config_overrides(plugins: dict[str, Plugin]) -> dict[str,
|
||||
continue
|
||||
if not resolved_path.is_file() or not os.access(resolved_path, os.X_OK):
|
||||
continue
|
||||
for key in _binary_config_keys_for_plugins(plugins, binary.name):
|
||||
for key in _binary_config_keys_for_plugins(plugins, binary.name, active_config):
|
||||
overrides[key] = binary.abspath
|
||||
|
||||
if resolved_path.parent.name == ".bin" and resolved_path.parent.parent.name == "node_modules":
|
||||
@@ -231,10 +236,8 @@ class CrawlRunner:
|
||||
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
|
||||
self.plugins = discover_plugins()
|
||||
self.process_service = ProcessService(self.bus)
|
||||
self.machine_service = MachineService(self.bus)
|
||||
self.binary_service = BinaryService(self.bus)
|
||||
self.tag_service = TagService(self.bus)
|
||||
self.process_request_service = ProcessRequestService(self.bus)
|
||||
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
|
||||
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
|
||||
self.snapshot_service = SnapshotService(
|
||||
@@ -250,32 +253,10 @@ class CrawlRunner:
|
||||
self.abx_services = None
|
||||
self.persona = None
|
||||
self.base_config: dict[str, Any] = {}
|
||||
self.derived_config: dict[str, Any] = {}
|
||||
self.primary_url = ""
|
||||
self._live_stream = None
|
||||
|
||||
def _create_projector_bus(self, *, identifier: str, config_overrides: dict[str, Any]):
    """Build a dedicated bus with the projector and abx services attached.

    Args:
        identifier: Suffix used to derive the bus name.
        config_overrides: Config passed to ``setup_abx_services``.

    Returns:
        A ``(bus, abx_services)`` tuple.
    """
    projector_bus = create_bus(name=_bus_name("ArchiveBox", identifier), total_timeout=3600.0)

    # Services are instantiated in the original order; only the process
    # service instance is needed again below.
    proc_service = ProcessService(projector_bus)
    MachineService(projector_bus)
    BinaryService(projector_bus)
    TagService(projector_bus)
    ProcessRequestService(projector_bus)
    CrawlService(projector_bus, crawl_id=str(self.crawl.id))

    if self.process_discovered_snapshots_inline:
        scheduler = self.enqueue_snapshot
    else:
        scheduler = self.leave_snapshot_queued
    SnapshotService(
        projector_bus,
        crawl_id=str(self.crawl.id),
        schedule_snapshot=scheduler,
    )
    ArchiveResultService(projector_bus, process_service=proc_service)

    services = setup_abx_services(
        projector_bus,
        plugins=self.plugins,
        config_overrides=config_overrides,
        auto_install=True,
        emit_jsonl=False,
    )
    return projector_bus, services
|
||||
|
||||
async def run(self) -> None:
|
||||
from asgiref.sync import sync_to_async
|
||||
from archivebox.crawls.models import Crawl
|
||||
@@ -292,6 +273,8 @@ class CrawlRunner:
|
||||
**self.base_config,
|
||||
"ABX_RUNTIME": "archivebox",
|
||||
},
|
||||
derived_config_overrides=self.derived_config,
|
||||
persist_derived=False,
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
@@ -369,7 +352,7 @@ class CrawlRunner:
|
||||
current_process.save(update_fields=["iface", "machine", "modified_at"])
|
||||
self.persona = self.crawl.resolve_persona()
|
||||
self.base_config = get_config(crawl=self.crawl)
|
||||
self.base_config.update(_installed_binary_config_overrides(self.plugins))
|
||||
self.derived_config = _installed_binary_config_overrides(self.plugins, self.base_config)
|
||||
self.base_config["ABX_RUNTIME"] = "archivebox"
|
||||
if self.selected_plugins is None:
|
||||
self.selected_plugins = _selected_plugins_from_config(self.base_config)
|
||||
@@ -473,7 +456,6 @@ class CrawlRunner:
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=setup_snapshot,
|
||||
@@ -501,7 +483,6 @@ class CrawlRunner:
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=cleanup_snapshot,
|
||||
@@ -530,31 +511,22 @@ class CrawlRunner:
|
||||
parent_snapshot_id=snapshot["parent_snapshot_id"],
|
||||
crawl_id=str(self.crawl.id),
|
||||
)
|
||||
snapshot_bus, snapshot_services = self._create_projector_bus(
|
||||
identifier=f"{self.crawl.id}_{snapshot['id']}",
|
||||
config_overrides=snapshot["config"],
|
||||
)
|
||||
try:
|
||||
_attach_bus_trace(snapshot_bus)
|
||||
await download(
|
||||
url=snapshot["url"],
|
||||
plugins=self.plugins,
|
||||
output_dir=Path(snapshot["output_dir"]),
|
||||
selected_plugins=self.selected_plugins,
|
||||
config_overrides=snapshot["config"],
|
||||
bus=snapshot_bus,
|
||||
bus=self.bus,
|
||||
emit_jsonl=False,
|
||||
snapshot=abx_snapshot,
|
||||
skip_crawl_setup=True,
|
||||
skip_crawl_cleanup=True,
|
||||
)
|
||||
await snapshot_services.process.wait_for_background_monitors()
|
||||
finally:
|
||||
current_task = asyncio.current_task()
|
||||
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
|
||||
self.snapshot_tasks.pop(snapshot_id, None)
|
||||
await _stop_bus_trace(snapshot_bus)
|
||||
await snapshot_bus.stop()
|
||||
|
||||
def _load_snapshot_run_data(self, snapshot_id: str):
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -615,19 +587,19 @@ async def _run_binary(binary_id: str) -> None:
|
||||
binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id)
|
||||
plugins = discover_plugins()
|
||||
config = get_config()
|
||||
config.update(await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins))
|
||||
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
|
||||
config["ABX_RUNTIME"] = "archivebox"
|
||||
bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
|
||||
process_service = ProcessService(bus)
|
||||
MachineService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ProcessRequestService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
setup_abx_services(
|
||||
bus,
|
||||
plugins=plugins,
|
||||
config_overrides=config,
|
||||
derived_config_overrides=derived_config,
|
||||
persist_derived=False,
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
@@ -662,19 +634,19 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
|
||||
|
||||
plugins = discover_plugins()
|
||||
config = get_config()
|
||||
config.update(await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins))
|
||||
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
|
||||
config["ABX_RUNTIME"] = "archivebox"
|
||||
bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
|
||||
process_service = ProcessService(bus)
|
||||
MachineService(bus)
|
||||
BinaryService(bus)
|
||||
TagService(bus)
|
||||
ProcessRequestService(bus)
|
||||
ArchiveResultService(bus, process_service=process_service)
|
||||
abx_services = setup_abx_services(
|
||||
bus,
|
||||
plugins=plugins,
|
||||
config_overrides=config,
|
||||
derived_config_overrides=derived_config,
|
||||
persist_derived=False,
|
||||
auto_install=True,
|
||||
emit_jsonl=False,
|
||||
)
|
||||
|
||||
@@ -518,7 +518,6 @@ def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
|
||||
event = BinaryRequestEvent(
|
||||
name="wget",
|
||||
plugin_name="wget",
|
||||
hook_name="on_Install__10_wget.finite.bg",
|
||||
output_dir="/tmp/wget",
|
||||
binproviders="provider",
|
||||
)
|
||||
|
||||
@@ -133,7 +133,13 @@ def test_plugin_detail_view_renders_config_in_dedicated_sections(monkeypatch):
|
||||
"description": "Example config used to verify plugin metadata rendering.",
|
||||
"type": "object",
|
||||
"required_plugins": ["chrome"],
|
||||
"required_binaries": ["example-cli"],
|
||||
"required_binaries": [
|
||||
{
|
||||
"name": "example-cli",
|
||||
"binproviders": "env,apt,brew",
|
||||
"min_version": None,
|
||||
},
|
||||
],
|
||||
"output_mimetypes": ["text/plain", "application/json"],
|
||||
"properties": {
|
||||
"EXAMPLE_ENABLED": {
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
Unit tests for the ArchiveBox hook architecture.
|
||||
|
||||
Tests hook discovery, execution, JSONL parsing, background hook detection,
|
||||
binary lookup, and install hook XYZ_BINARY env var handling.
|
||||
binary lookup, and required_binaries XYZ_BINARY passthrough handling.
|
||||
|
||||
Run with:
|
||||
sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_hooks.py -v'
|
||||
@@ -126,8 +126,8 @@ not json at all
|
||||
self.assertEqual(records[0]["type"], "ArchiveResult")
|
||||
|
||||
|
||||
class TestInstallHookEnvVarHandling(unittest.TestCase):
|
||||
"""Test that install hooks respect XYZ_BINARY env vars."""
|
||||
class TestRequiredBinaryConfigHandling(unittest.TestCase):
|
||||
"""Test that required_binaries keep configured XYZ_BINARY values intact."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test environment."""
|
||||
@@ -139,39 +139,28 @@ class TestInstallHookEnvVarHandling(unittest.TestCase):
|
||||
shutil.rmtree(self.work_dir, ignore_errors=True)
|
||||
|
||||
def test_binary_env_var_absolute_path_handling(self):
|
||||
"""Install hooks should handle absolute paths in XYZ_BINARY."""
|
||||
# Test the logic that install hooks use
|
||||
"""Absolute binary paths should pass through unchanged."""
|
||||
configured_binary = "/custom/path/to/wget2"
|
||||
if "/" in configured_binary:
|
||||
bin_name = Path(configured_binary).name
|
||||
else:
|
||||
bin_name = configured_binary
|
||||
binary_name = configured_binary
|
||||
|
||||
self.assertEqual(bin_name, "wget2")
|
||||
self.assertEqual(binary_name, "/custom/path/to/wget2")
|
||||
|
||||
def test_binary_env_var_name_only_handling(self):
|
||||
"""Install hooks should handle binary names in XYZ_BINARY."""
|
||||
# Test the logic that install hooks use
|
||||
"""Binary command names should pass through unchanged."""
|
||||
configured_binary = "wget2"
|
||||
if "/" in configured_binary:
|
||||
bin_name = Path(configured_binary).name
|
||||
else:
|
||||
bin_name = configured_binary
|
||||
binary_name = configured_binary
|
||||
|
||||
self.assertEqual(bin_name, "wget2")
|
||||
self.assertEqual(binary_name, "wget2")
|
||||
|
||||
def test_binary_env_var_empty_default(self):
|
||||
"""Install hooks should use default when XYZ_BINARY is empty."""
|
||||
"""Empty configured values should fall back to config defaults."""
|
||||
configured_binary = ""
|
||||
if configured_binary:
|
||||
if "/" in configured_binary:
|
||||
bin_name = Path(configured_binary).name
|
||||
else:
|
||||
bin_name = configured_binary
|
||||
binary_name = configured_binary
|
||||
else:
|
||||
bin_name = "wget" # default
|
||||
binary_name = "wget"
|
||||
|
||||
self.assertEqual(bin_name, "wget")
|
||||
self.assertEqual(binary_name, "wget")
|
||||
|
||||
|
||||
class TestHookDiscovery(unittest.TestCase):
|
||||
@@ -187,7 +176,7 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
wget_dir = self.plugins_dir / "wget"
|
||||
wget_dir.mkdir()
|
||||
(wget_dir / "on_Snapshot__50_wget.py").write_text("# test hook")
|
||||
(wget_dir / "on_Install__10_wget.finite.bg.py").write_text("# install hook")
|
||||
(wget_dir / "on_BinaryRequest__10_wget.py").write_text("# binary request hook")
|
||||
|
||||
chrome_dir = self.plugins_dir / "chrome"
|
||||
chrome_dir.mkdir(exist_ok=True)
|
||||
@@ -299,7 +288,7 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
self.assertIn("on_BinaryRequest__10_npm.py", hook_names)
|
||||
|
||||
def test_discover_hooks_accepts_event_class_names(self):
|
||||
"""discover_hooks should accept InstallEvent / SnapshotEvent class names."""
|
||||
"""discover_hooks should accept BinaryRequestEvent / SnapshotEvent class names."""
|
||||
from archivebox import hooks as hooks_module
|
||||
|
||||
hooks_module.get_plugins.cache_clear()
|
||||
@@ -307,10 +296,10 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
|
||||
patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
|
||||
):
|
||||
install_hooks = hooks_module.discover_hooks("InstallEvent", filter_disabled=False)
|
||||
binary_hooks = hooks_module.discover_hooks("BinaryRequestEvent", filter_disabled=False)
|
||||
snapshot_hooks = hooks_module.discover_hooks("SnapshotEvent", filter_disabled=False)
|
||||
|
||||
self.assertIn("on_Install__10_wget.finite.bg.py", [hook.name for hook in install_hooks])
|
||||
self.assertIn("on_BinaryRequest__10_wget.py", [hook.name for hook in binary_hooks])
|
||||
self.assertIn("on_Snapshot__50_wget.py", [hook.name for hook in snapshot_hooks])
|
||||
|
||||
def test_discover_hooks_returns_empty_for_non_hook_lifecycle_events(self):
|
||||
@@ -325,44 +314,6 @@ class TestHookDiscovery(unittest.TestCase):
|
||||
self.assertEqual(hooks_module.discover_hooks("BinaryEvent", filter_disabled=False), [])
|
||||
self.assertEqual(hooks_module.discover_hooks("CrawlCleanupEvent", filter_disabled=False), [])
|
||||
|
||||
def test_discover_install_hooks_only_include_declared_plugin_dependencies(self):
    """Install discovery for PLUGINS=responses should include chrome's hook but not npm's.

    ``responses`` lists ``chrome`` under ``required_plugins``; ``npm`` is not
    listed anywhere, so its install hook must not show up in the results.
    """
    # Plugin "responses": only a config.json that names chrome as a required plugin.
    responses_schema = {
        "type": "object",
        "required_plugins": ["chrome"],
        "properties": {},
    }
    responses_dir = self.plugins_dir / "responses"
    responses_dir.mkdir()
    (responses_dir / "config.json").write_text(json.dumps(responses_schema))

    # Plugin "chrome": may already exist from setUp, hence exist_ok=True.
    chrome_dir = self.plugins_dir / "chrome"
    chrome_dir.mkdir(exist_ok=True)
    (chrome_dir / "config.json").write_text('{"type": "object", "properties": {}}')
    (chrome_dir / "on_Install__70_chrome.finite.bg.py").write_text("# chrome install hook")

    # Plugin "npm": has hooks of its own but is not declared as a dependency of "responses".
    npm_dir = self.plugins_dir / "npm"
    npm_dir.mkdir()
    (npm_dir / "on_BinaryRequest__10_npm.py").write_text("# npm binary hook")
    (npm_dir / "on_Install__00_npm.py").write_text("# npm install hook")
    (npm_dir / "config.json").write_text('{"type": "object", "properties": {}}')

    from archivebox import hooks as hooks_module

    # Drop any cached plugin listing so the directories created above are picked up.
    hooks_module.get_plugins.cache_clear()
    builtin_dir_patch = patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir)
    user_dir_patch = patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins")
    with builtin_dir_patch, user_dir_patch:
        hooks = hooks_module.discover_hooks("Install", config={"PLUGINS": "responses"})

    hook_names = [hook.name for hook in hooks]
    self.assertIn("on_Install__70_chrome.finite.bg.py", hook_names)
    self.assertNotIn("on_Install__00_npm.py", hook_names)
|
||||
|
||||
|
||||
class TestGetExtractorName(unittest.TestCase):
|
||||
"""Test get_extractor_name() function."""
|
||||
@@ -484,8 +435,8 @@ print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "url": args.ge
|
||||
self.assertEqual(records[0]["url"], "https://example.com")
|
||||
|
||||
|
||||
class TestInstallHookOutput(unittest.TestCase):
|
||||
"""Test install hook output format compliance."""
|
||||
class TestDependencyRecordOutput(unittest.TestCase):
|
||||
"""Test dependency record output format compliance."""
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test environment."""
|
||||
@@ -495,8 +446,8 @@ class TestInstallHookOutput(unittest.TestCase):
|
||||
"""Clean up test environment."""
|
||||
shutil.rmtree(self.work_dir, ignore_errors=True)
|
||||
|
||||
def test_install_hook_outputs_binary(self):
|
||||
"""Install hook should output Binary JSONL when binary found."""
|
||||
def test_dependency_record_outputs_binary(self):
|
||||
"""Dependency resolution should output Binary JSONL when binary is found."""
|
||||
hook_output = json.dumps(
|
||||
{
|
||||
"type": "Binary",
|
||||
@@ -515,8 +466,8 @@ class TestInstallHookOutput(unittest.TestCase):
|
||||
self.assertEqual(data["name"], "wget")
|
||||
self.assertTrue(data["abspath"].startswith("/"))
|
||||
|
||||
def test_install_hook_outputs_machine_config(self):
|
||||
"""Install hook should output Machine config update JSONL."""
|
||||
def test_dependency_record_outputs_machine_config(self):
|
||||
"""Dependency resolution should output Machine config update JSONL."""
|
||||
hook_output = json.dumps(
|
||||
{
|
||||
"type": "Machine",
|
||||
|
||||
69
archivebox/tests/test_process_service.py
Normal file
69
archivebox/tests/test_process_service.py
Normal file
@@ -0,0 +1,69 @@
|
||||
import asyncio
|
||||
import json
|
||||
|
||||
import pytest
|
||||
|
||||
from abx_dl.events import ProcessStartedEvent, ProcessStdoutEvent
|
||||
from abx_dl.orchestrator import create_bus
|
||||
|
||||
|
||||
pytestmark = pytest.mark.django_db
|
||||
|
||||
|
||||
def test_process_service_emits_process_started_from_inline_process_event(monkeypatch):
    """A ``ProcessEvent`` JSON record on hook stdout should yield a ProcessStartedEvent.

    The emitted ProcessStdoutEvent belongs to the hook process ``proc-hook``,
    while the inline record describes a separate process ``worker:sonic``; the
    started event is looked up by the latter id and checked field by field.
    """
    from archivebox.services import process_service as process_service_module
    from archivebox.services.process_service import ProcessService

    bus = create_bus(name="test_process_service_inline_process_event")
    ProcessService(bus)

    def fake_ensure_worker(event):
        # Fixed status dict returned in place of the real _ensure_worker.
        return {
            "pid": 4321,
            "start": 1711111111.0,
            "statename": "RUNNING",
            "exitstatus": 0,
        }

    monkeypatch.setattr(process_service_module, "_ensure_worker", fake_ensure_worker)

    # The record the hook "prints": serialized as one stdout line below.
    inline_record = {
        "type": "ProcessEvent",
        "plugin_name": "search_backend_sonic",
        "hook_name": "worker_sonic",
        "hook_path": "/usr/bin/sonic",
        "hook_args": ["-c", "/tmp/sonic/config.cfg"],
        "is_background": True,
        "daemon": True,
        "url": "tcp://127.0.0.1:1491",
        "output_dir": "/tmp/sonic",
        "env": {},
        "process_type": "worker",
        "worker_type": "sonic",
        "process_id": "worker:sonic",
        "output_str": "127.0.0.1:1491",
    }

    async def run_test():
        stdout_event = ProcessStdoutEvent(
            line=json.dumps(inline_record),
            plugin_name="search_backend_sonic",
            hook_name="on_CrawlSetup__55_sonic_start.py",
            output_dir="/tmp/search_backend_sonic",
            snapshot_id="snap-1",
            process_id="proc-hook",
        )
        await bus.emit(stdout_event)
        found = await bus.find(ProcessStartedEvent, process_id="worker:sonic")
        await bus.stop()
        return found

    started = asyncio.run(run_test())

    assert started is not None
    assert started.hook_name == "worker_sonic"
    assert started.process_type == "worker"
    assert started.worker_type == "sonic"
    # url / output_str are read via getattr with a default, so a missing attribute fails the comparison rather than raising.
    assert getattr(started, "url", "") == "tcp://127.0.0.1:1491"
    assert getattr(started, "output_str", "") == "127.0.0.1:1491"
|
||||
@@ -46,7 +46,7 @@ async def _call_sync(func, *args, **kwargs):
|
||||
return func(*args, **kwargs)
|
||||
|
||||
|
||||
def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
|
||||
def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
|
||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||
from archivebox.crawls.models import Crawl
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -87,13 +87,13 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
|
||||
|
||||
download_calls = []
|
||||
|
||||
async def fake_download(*, url, bus, config_overrides, snapshot, **kwargs):
|
||||
async def fake_download(*, url, bus, snapshot, **kwargs):
|
||||
download_calls.append(
|
||||
{
|
||||
"url": url,
|
||||
"bus": bus,
|
||||
"snapshot_id": config_overrides["SNAPSHOT_ID"],
|
||||
"source_url": config_overrides["SOURCE_URL"],
|
||||
"snapshot_id": snapshot.id,
|
||||
"source_url": snapshot.url,
|
||||
"abx_snapshot_id": snapshot.id,
|
||||
},
|
||||
)
|
||||
@@ -146,8 +146,8 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
|
||||
assert len(download_calls) == 2
|
||||
assert {call["snapshot_id"] for call in download_calls} == {str(snapshot_a.id), str(snapshot_b.id)}
|
||||
assert {call["source_url"] for call in download_calls} == {snapshot_a.url, snapshot_b.url}
|
||||
assert len({id(call["bus"]) for call in download_calls}) == 2
|
||||
assert len(created_buses) == 3 # 1 crawl bus + 2 isolated snapshot buses
|
||||
assert len({id(call["bus"]) for call in download_calls}) == 1
|
||||
assert len(created_buses) == 1
|
||||
|
||||
|
||||
def test_ensure_background_runner_starts_when_none_running(monkeypatch):
|
||||
@@ -353,6 +353,62 @@ def test_installed_binary_config_overrides_include_valid_installed_binaries(monk
|
||||
assert overrides["NODE_PATH"] == "/tmp/shared-lib/npm/node_modules"
|
||||
|
||||
|
||||
def test_installed_binary_config_overrides_do_not_map_hardcoded_artifacts_to_configurable_binary_keys(monkeypatch):
    """A cached ``singlefile`` binary from chromewebstore must not populate SINGLEFILE_BINARY.

    The plugin declares two binaries: the configurable ``{SINGLEFILE_BINARY}``
    (env/npm) and a hard-coded ``singlefile`` (chromewebstore). Only the latter
    exists in the DB here, so the overrides are expected to omit
    SINGLEFILE_BINARY while still reporting LIB_DIR and LIB_BIN_DIR.
    """
    from archivebox.machine.models import Binary, Machine
    from archivebox.services import runner as runner_module
    from abx_dl.models import Plugin

    machine_fields = {
        "guid": "test-guid-runner-singlefile-cache",
        "hostname": "runner-host-singlefile",
        "hw_in_docker": False,
        "hw_in_vm": False,
        "hw_manufacturer": "Test",
        "hw_product": "Test Product",
        "hw_uuid": "test-hw-runner-singlefile-cache",
        "os_arch": "arm64",
        "os_family": "darwin",
        "os_platform": "macOS",
        "os_release": "14.0",
        "os_kernel": "Darwin",
        "stats": {},
        "config": {},
    }
    machine = Machine.objects.create(**machine_fields)

    binary_fields = {
        "machine": machine,
        "name": "singlefile",
        "abspath": "/tmp/shared-lib/bin/singlefile",
        "version": "1.0.0",
        "binprovider": "chromewebstore",
        "binproviders": "chromewebstore",
        "status": Binary.StatusChoices.INSTALLED,
    }
    singlefile_extension = Binary.objects.create(**binary_fields)

    def fake_current(cls):
        return machine

    def fake_is_file(self):
        # Only the cached binary's path is reported as an existing file.
        return str(self) == singlefile_extension.abspath

    def fake_access(path, mode):
        # Only the cached binary's path passes the access check, whatever the mode.
        return str(path) == singlefile_extension.abspath

    monkeypatch.setattr(Machine, "current", classmethod(fake_current))
    monkeypatch.setattr(Path, "is_file", fake_is_file)
    monkeypatch.setattr(runner_module.os, "access", fake_access)

    singlefile_plugin = Plugin(
        name="singlefile",
        path=Path("."),
        hooks=[],
        config_schema={"SINGLEFILE_BINARY": {"type": "string", "default": "single-file"}},
        binaries=[
            {"name": "{SINGLEFILE_BINARY}", "binproviders": "env,npm"},
            {"name": "singlefile", "binproviders": "chromewebstore"},
        ],
    )
    overrides = runner_module._installed_binary_config_overrides(
        {"singlefile": singlefile_plugin},
        config={"SINGLEFILE_BINARY": "single-file"},
    )

    assert "SINGLEFILE_BINARY" not in overrides
    assert overrides["LIB_DIR"] == "/tmp/shared-lib"
    assert overrides["LIB_BIN_DIR"] == "/tmp/shared-lib/bin"
|
||||
|
||||
|
||||
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch):
|
||||
import asgiref.sync
|
||||
|
||||
@@ -700,11 +756,9 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
|
||||
"_run_crawl_cleanup",
|
||||
lambda self, snapshot_id: cleanup_calls.append("abx_cleanup") or asyncio.sleep(0),
|
||||
)
|
||||
monkeypatch.setattr(crawl, "cleanup", lambda: cleanup_calls.append("crawl_cleanup"))
|
||||
|
||||
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
|
||||
|
||||
assert cleanup_calls == ["crawl_cleanup", "abx_cleanup"]
|
||||
assert cleanup_calls == ["abx_cleanup"]
|
||||
|
||||
|
||||
def test_abx_process_service_background_monitor_finishes_after_process_exit(monkeypatch, tmp_path):
|
||||
@@ -765,6 +819,9 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
|
||||
timeout=60,
|
||||
snapshot_id="snap-1",
|
||||
is_background=True,
|
||||
url="https://example.org/",
|
||||
process_type="hook",
|
||||
worker_type="hook",
|
||||
)
|
||||
|
||||
async def run_test():
|
||||
|
||||
Reference in New Issue
Block a user