Reuse cached binaries in archivebox runtime

This commit is contained in:
Nick Sweeting
2026-03-24 11:03:43 -07:00
parent 39450111dd
commit 50286d3c38
19 changed files with 714 additions and 564 deletions

View File

@@ -14,11 +14,10 @@ EVENT_FLOW_DIAGRAM = """
├─────────────────────────────────────────────────────────────────────────────┤
│ │
│ InstallEvent │
│ └─ on_Install__*
│ └─ BinaryRequest records
│ └─ BinaryRequestEvent
│ └─ on_BinaryRequest__*
│ └─ BinaryEvent / MachineEvent │
│ └─ config.json > required_binaries
│ └─ BinaryRequestEvent
│ └─ on_BinaryRequest__*
│ └─ BinaryEvent
│ │
│ CrawlEvent │
│ └─ CrawlSetupEvent │
@@ -70,15 +69,15 @@ def pluginmap(
event_phases = {
"InstallEvent": {
"description": "Pre-run dependency phase. on_Install hooks request binaries and update machine config.",
"emits": ["BinaryRequestEvent", "BinaryEvent", "MachineEvent", "ProcessEvent"],
"description": "Pre-run dependency phase. Enabled plugins emit BinaryRequest events from config.json required_binaries.",
"emits": ["BinaryRequestEvent", "BinaryEvent", "ProcessEvent"],
},
"BinaryRequestEvent": {
"description": "Provider phase. on_BinaryRequest hooks resolve or install requested binaries.",
"emits": ["BinaryEvent", "MachineEvent", "ProcessEvent"],
"emits": ["BinaryEvent", "ProcessEvent"],
},
"BinaryEvent": {
"description": "Resolved binary metadata event. Projected into the DB/runtime config.",
"description": "Resolved binary metadata event. Projected into the DB binary cache.",
"emits": [],
},
"CrawlEvent": {
@@ -87,11 +86,11 @@ def pluginmap(
},
"CrawlSetupEvent": {
"description": "Crawl-scoped setup phase. on_CrawlSetup hooks launch/configure shared daemons and runtime state.",
"emits": ["MachineEvent", "ProcessEvent"],
"emits": ["ProcessEvent"],
},
"SnapshotEvent": {
"description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, Tag, Machine, and BinaryRequest records.",
"emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "MachineEvent", "BinaryRequestEvent", "ProcessEvent"],
"description": "Per-snapshot extraction phase. on_Snapshot hooks emit ArchiveResult, Snapshot, Tag, and BinaryRequest records.",
"emits": ["ArchiveResultEvent", "SnapshotEvent", "TagEvent", "BinaryRequestEvent", "ProcessEvent"],
},
"SnapshotCleanupEvent": {
"description": "Internal snapshot cleanup phase.",

View File

@@ -5,7 +5,6 @@ __package__ = "archivebox.cli"
import sys
import os
import platform
import logging
from pathlib import Path
from collections.abc import Iterable
@@ -124,17 +123,19 @@ def version(
setup_django()
from archivebox.machine.models import Machine, Binary
from archivebox.config.views import KNOWN_BINARIES, canonical_binary_name
from abx_dl.dependencies import load_binary
machine = Machine.current()
requested_names = {canonical_binary_name(name) for name in binaries} if binaries else set()
if isinstance(binaries, str):
requested_names = {name.strip() for name in binaries.split(",") if name.strip()}
else:
requested_names = {name for name in (binaries or ()) if name}
db_binaries = {
canonical_binary_name(binary.name): binary for binary in Binary.objects.filter(machine=machine).order_by("name", "-modified_at")
}
all_binary_names = sorted(set(KNOWN_BINARIES) | set(db_binaries.keys()))
db_binaries: dict[str, Binary] = {}
for binary in Binary.objects.filter(machine=machine).order_by("name", "-modified_at"):
db_binaries.setdefault(binary.name, binary)
all_binary_names = sorted(requested_names or set(db_binaries.keys()))
if not all_binary_names:
prnt("", "[grey53]No binaries detected. Run [green]archivebox install[/green] to detect dependencies.[/grey53]")
@@ -163,37 +164,10 @@ def version(
any_available = True
continue
loaded = None
try:
abx_pkg_logger = logging.getLogger("abx_pkg")
previous_level = abx_pkg_logger.level
abx_pkg_logger.setLevel(logging.CRITICAL)
try:
loaded = load_binary({"name": name, "binproviders": "env,pip,npm,brew,apt"})
finally:
abx_pkg_logger.setLevel(previous_level)
except Exception:
loaded = None
if loaded and loaded.is_valid and loaded.loaded_abspath:
display_path = str(loaded.loaded_abspath).replace(str(DATA_DIR), ".").replace(str(Path("~").expanduser()), "~")
version_str = str(loaded.loaded_version or "unknown")[:15]
provider = str(getattr(getattr(loaded, "loaded_binprovider", None), "name", "") or "env")[:8]
prnt(
"",
"[green]√[/green]",
"",
name.ljust(18),
version_str.ljust(16),
provider.ljust(8),
display_path,
overflow="ignore",
crop=False,
)
any_available = True
continue
prnt("", "[red]X[/red]", "", name.ljust(18), "[grey53]not installed[/grey53]", overflow="ignore", crop=False)
status = (
"[grey53]not recorded[/grey53]" if name in requested_names and installed is None else "[grey53]not installed[/grey53]"
)
prnt("", "[red]X[/red]", "", name.ljust(18), status, overflow="ignore", crop=False)
failures.append(name)
if not any_available:

View File

@@ -138,10 +138,9 @@ def get_config(
3. Per-user config (user.config JSON field)
4. Per-persona config (persona.get_derived_config() - includes CHROME_USER_DATA_DIR etc.)
5. Environment variables
6. Per-machine config (machine.config JSON field - resolved binary paths)
7. Config file (ArchiveBox.conf)
8. Plugin schema defaults (config.json)
9. Core config defaults
6. Config file (ArchiveBox.conf)
7. Plugin schema defaults (config.json)
8. Core config defaults
Args:
defaults: Default values to start with
@@ -150,7 +149,7 @@ def get_config(
crawl: Crawl object with config JSON field
snapshot: Snapshot object with config JSON field
archiveresult: ArchiveResult object (auto-fetches snapshot)
machine: Machine object with config JSON field (defaults to Machine.current())
machine: Unused legacy argument kept for call compatibility
Note: Objects are auto-fetched from relationships if not provided:
- snapshot auto-fetched from archiveresult.snapshot
@@ -221,19 +220,6 @@ def get_config(
file_config = BaseConfigSet.load_from_file(config_file)
config.update(file_config)
# Apply machine config overrides (cached binary paths, etc.)
if machine is None:
# Default to current machine if not provided
try:
from archivebox.machine.models import Machine
machine = Machine.current()
except Exception:
pass # Machine might not be available during early init
if machine and hasattr(machine, "config") and machine.config:
config.update(machine.config)
# Override with environment variables (for keys that exist in config)
for key in config:
env_val = os.environ.get(key)

View File

@@ -29,42 +29,6 @@ ENVIRONMENT_BINARIES_BASE_URL = "/admin/environment/binaries/"
INSTALLED_BINARIES_BASE_URL = "/admin/machine/binary/"
# Common binaries to check for
KNOWN_BINARIES = [
    # Downloaders / fetchers
    "wget",
    "curl",
    # Chromium-family browser names (platform-dependent; any one may exist)
    "chromium",
    "chrome",
    "google-chrome",
    "google-chrome-stable",
    # Node.js toolchain
    "node",
    "npm",
    "npx",
    "yt-dlp",
    "git",
    # Extractor CLIs used by plugins
    "singlefile",
    "readability-extractor",
    "mercury-parser",
    # Interpreters and shells
    "python3",
    "python",
    "bash",
    "zsh",
    "ffmpeg",
    # Search-related binaries
    "ripgrep",
    "rg",
    "sonic",
    "archivebox",
]

# Alternate spellings mapped to the canonical name used for lookup/display.
CANONICAL_BINARY_ALIASES = {
    "youtube-dl": "yt-dlp",
    "ytdlp": "yt-dlp",
    "ripgrep": "rg",
    "singlefile": "single-file",
    "mercury-parser": "postlight-parser",
}
def is_superuser(request: HttpRequest) -> bool:
return bool(getattr(request.user, "is_superuser", False))
@@ -131,13 +95,12 @@ def get_environment_binary_url(name: str) -> str:
return f"{ENVIRONMENT_BINARIES_BASE_URL}{quote(name)}/"
def get_installed_binary_change_url(name: str, binary: Any) -> str | None:
binary_id = getattr(binary, "id", None)
if not binary_id:
def get_installed_binary_change_url(name: str, binary: Binary | None) -> str | None:
if binary is None or not binary.id:
return None
base_url = getattr(binary, "admin_change_url", None) or f"{INSTALLED_BINARIES_BASE_URL}{binary_id}/change/"
changelist_filters = urlencode({"q": canonical_binary_name(name)})
base_url = binary.admin_change_url or f"{INSTALLED_BINARIES_BASE_URL}{binary.id}/change/"
changelist_filters = urlencode({"q": name})
return f"{base_url}?{urlencode({'_changelist_filters': changelist_filters})}"
@@ -168,11 +131,14 @@ def render_code_tag_list(values: list[str]) -> str:
def render_plugin_metadata_html(config: dict[str, Any]) -> str:
required_binaries = [
str(item.get("name")) for item in (config.get("required_binaries") or []) if isinstance(item, dict) and item.get("name")
]
rows = (
("Title", config.get("title") or "(none)"),
("Description", config.get("description") or "(none)"),
("Required Plugins", mark_safe(render_link_tag_list(config.get("required_plugins") or [], get_plugin_docs_url))),
("Required Binaries", mark_safe(render_link_tag_list(config.get("required_binaries") or [], get_environment_binary_url))),
("Required Binaries", mark_safe(render_link_tag_list(required_binaries, get_environment_binary_url))),
("Output MIME Types", mark_safe(render_code_tag_list(config.get("output_mimetypes") or []))),
)
@@ -383,10 +349,6 @@ def obj_to_yaml(obj: Any, indent: int = 0) -> str:
return f" {str(obj)}"
def canonical_binary_name(name: str) -> str:
    """Map a known binary-name alias to its canonical name; unknown names pass through unchanged."""
    try:
        return CANONICAL_BINARY_ALIASES[name]
    except KeyError:
        return name
def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:
return (
int(binary.status == Binary.StatusChoices.INSTALLED),
@@ -399,24 +361,11 @@ def _binary_sort_key(binary: Binary) -> tuple[int, int, int, Any]:
def get_db_binaries_by_name() -> dict[str, Binary]:
grouped: dict[str, list[Binary]] = {}
for binary in Binary.objects.all():
grouped.setdefault(canonical_binary_name(binary.name), []).append(binary)
grouped.setdefault(binary.name, []).append(binary)
return {name: max(records, key=_binary_sort_key) for name, records in grouped.items()}
def serialize_binary_record(name: str, binary: Binary | None) -> dict[str, Any]:
    """Flatten a Binary row (or None) into a plain JSON-safe dict keyed by canonical name.

    ``is_available`` is only True when the record is INSTALLED *and* has a
    non-empty abspath.
    """
    installed = bool(binary and binary.status == Binary.StatusChoices.INSTALLED)
    abspath = str(getattr(binary, "abspath", "") or "")
    return {
        "name": canonical_binary_name(name),
        "version": str(getattr(binary, "version", "") or ""),
        "binprovider": str(getattr(binary, "binprovider", "") or ""),
        "abspath": abspath,
        "sha256": str(getattr(binary, "sha256", "") or ""),
        "status": str(getattr(binary, "status", "") or ""),
        "is_available": installed and bool(abspath),
    }
def get_filesystem_plugins() -> dict[str, dict[str, Any]]:
"""Discover plugins from filesystem directories."""
import json
@@ -474,14 +423,14 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
all_binary_names = sorted(db_binaries.keys())
for name in all_binary_names:
merged = serialize_binary_record(name, db_binaries.get(name))
binary = db_binaries.get(name)
rows["Binary Name"].append(ItemLink(name, key=name))
if merged["is_available"]:
rows["Found Version"].append(f"{merged['version']}" if merged["version"] else "✅ found")
rows["Provided By"].append(merged["binprovider"] or "-")
rows["Found Abspath"].append(merged["abspath"] or "-")
if binary and binary.is_valid:
rows["Found Version"].append(f"{binary.version}" if binary.version else "✅ found")
rows["Provided By"].append(binary.binprovider or "-")
rows["Found Abspath"].append(binary.abspath or "-")
else:
rows["Found Version"].append("❌ missing")
rows["Provided By"].append("-")
@@ -496,22 +445,20 @@ def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
@render_with_item_view
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
assert is_superuser(request), "Must be a superuser to view configuration settings."
key = canonical_binary_name(key)
db_binary = get_db_binaries_by_name().get(key)
merged = serialize_binary_record(key, db_binary)
if merged["is_available"]:
if db_binary and db_binary.is_valid:
binary_data = db_binary.to_json()
section: SectionData = {
"name": key,
"description": mark_safe(render_binary_detail_description(key, merged, db_binary)),
"description": mark_safe(render_binary_detail_description(key, binary_data, db_binary)),
"fields": {
"name": key,
"binprovider": merged["binprovider"] or "-",
"abspath": merged["abspath"] or "not found",
"version": merged["version"] or "unknown",
"sha256": merged["sha256"],
"status": merged["status"],
"binprovider": db_binary.binprovider or "-",
"abspath": db_binary.abspath or "not found",
"version": db_binary.version or "unknown",
"sha256": db_binary.sha256,
"status": db_binary.status,
},
"help_texts": {},
}
@@ -526,10 +473,10 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
"description": "No persisted Binary record found",
"fields": {
"name": key,
"binprovider": merged["binprovider"] or "not recorded",
"abspath": merged["abspath"] or "not recorded",
"version": merged["version"] or "N/A",
"status": merged["status"] or "unrecorded",
"binprovider": db_binary.binprovider if db_binary else "not recorded",
"abspath": db_binary.abspath if db_binary else "not recorded",
"version": db_binary.version if db_binary else "N/A",
"status": db_binary.status if db_binary else "unrecorded",
},
"help_texts": {},
}

View File

@@ -1226,7 +1226,7 @@ def live_progress_view(request):
return (plugin, plugin, "unknown", "")
phase = "unknown"
if normalized_hook_name.startswith("on_Install__"):
if normalized_hook_name == "InstallEvent":
phase = "install"
elif normalized_hook_name.startswith("on_CrawlSetup__"):
phase = "crawl"
@@ -1966,7 +1966,7 @@ def live_config_value_view(request: HttpRequest, key: str, **kwargs) -> ItemCont
Priority order (highest to lowest):
<ol>
<li><b style="color: blue">Environment</b> - Environment variables</li>
<li><b style="color: purple">Machine</b> - Machine-specific overrides (e.g., resolved binary paths)
<li><b style="color: purple">Machine</b> - Machine-specific overrides
{f'<br/><a href="{machine_admin_url}">→ Edit <code>{key}</code> in Machine.config for this server</a>' if machine_admin_url else ""}
</li>
<li><b style="color: green">Config File</b> - data/ArchiveBox.conf</li>

View File

@@ -9,11 +9,14 @@ ArchiveBox no longer drives plugin execution itself during normal crawls.
- parses hook stdout JSONL records into ArchiveBox models when needed
Hook-backed event families are discovered from filenames like:
on_Install__*
on_BinaryRequest__*
on_CrawlSetup__*
on_Snapshot__*
InstallEvent itself is still part of the runtime lifecycle, but it has no
corresponding hook family. Its dependency declarations come directly from each
plugin's `config.json > required_binaries`.
Lifecycle event names like `InstallEvent` or `SnapshotCleanupEvent` are
normalized to the corresponding `on_{EventFamily}__*` prefix by a simple
string transform. If no scripts exist for that prefix, discovery returns `[]`.
@@ -212,7 +215,7 @@ def discover_hooks(
pattern_direct = f"on_{hook_event_name}__*.{ext}"
hooks.extend(base_dir.glob(pattern_direct))
# Binary install hooks are provider hooks, not end-user extractors. They
# Binary provider hooks are not end-user extractors. They
# self-filter via `binproviders`, so applying the PLUGINS whitelist here
# can hide the very installer needed by a selected plugin (e.g.
# `--plugins=singlefile` still needs the `npm` BinaryRequest hook).
@@ -394,54 +397,14 @@ def run_hook(
# Derive LIB_BIN_DIR from LIB_DIR if not set
lib_bin_dir = Path(lib_dir) / "bin"
# Build PATH with proper precedence:
# 1. LIB_BIN_DIR (highest priority - local symlinked binaries)
# 2. Machine.config.PATH (pip/npm bin dirs from providers)
# 3. os.environ['PATH'] (system PATH)
if lib_bin_dir:
lib_bin_dir = str(lib_bin_dir)
env["LIB_BIN_DIR"] = lib_bin_dir
# Start with base PATH
current_path = env.get("PATH", "")
# Prepend Machine.config.PATH if it exists (treat as extra entries, not replacement)
try:
from archivebox.machine.models import Machine
machine = Machine.current()
if machine and machine.config:
machine_path = machine.config.get("PATH")
if machine_path:
# Prepend machine_path to current PATH
current_path = f"{machine_path}:{current_path}" if current_path else machine_path
except Exception:
pass
# Finally prepend LIB_BIN_DIR to the front (highest priority)
if lib_bin_dir:
if not current_path.startswith(f"{lib_bin_dir}:"):
env["PATH"] = f"{lib_bin_dir}:{current_path}" if current_path else lib_bin_dir
else:
env["PATH"] = current_path
else:
env["PATH"] = current_path
# Set NODE_PATH for Node.js module resolution
# Priority: config dict > Machine.config > derive from LIB_DIR
# Set NODE_PATH for Node.js module resolution.
# Priority: config dict > derive from LIB_DIR
node_path = config.get("NODE_PATH")
if not node_path and lib_dir:
# Derive from LIB_DIR/npm/node_modules (create if needed)
node_modules_dir = Path(lib_dir) / "npm" / "node_modules"
node_modules_dir.mkdir(parents=True, exist_ok=True)
node_path = str(node_modules_dir)
if not node_path:
try:
# Fallback to Machine.config
node_path = machine.config.get("NODE_MODULES_DIR")
except Exception:
pass
if node_path:
env["NODE_PATH"] = node_path
env["NODE_MODULES_DIR"] = node_path # For backwards compatibility
@@ -472,6 +435,41 @@ def run_hook(
else:
env[key] = str(value)
# Build PATH with proper precedence:
# 1. path-like *_BINARY parents (explicit binary overrides / cached abspaths)
# 2. LIB_BIN_DIR (local symlinked binaries)
# 3. existing PATH
runtime_bin_dirs: list[str] = []
if lib_bin_dir:
lib_bin_dir = str(lib_bin_dir)
env["LIB_BIN_DIR"] = lib_bin_dir
for key, raw_value in env.items():
if not key.endswith("_BINARY"):
continue
value = str(raw_value or "").strip()
if not value:
continue
path_value = Path(value).expanduser()
if not (path_value.is_absolute() or "/" in value or "\\" in value):
continue
binary_dir = str(path_value.resolve(strict=False).parent)
if binary_dir and binary_dir not in runtime_bin_dirs:
runtime_bin_dirs.append(binary_dir)
if lib_bin_dir and lib_bin_dir not in runtime_bin_dirs:
runtime_bin_dirs.append(lib_bin_dir)
uv_value = str(env.get("UV") or "").strip()
if uv_value:
uv_bin_dir = str(Path(uv_value).expanduser().resolve(strict=False).parent)
if uv_bin_dir and uv_bin_dir not in runtime_bin_dirs:
runtime_bin_dirs.append(uv_bin_dir)
current_path = env.get("PATH", "")
path_parts = [part for part in current_path.split(os.pathsep) if part]
for extra_dir in reversed(runtime_bin_dirs):
if extra_dir not in path_parts:
path_parts.insert(0, extra_dir)
env["PATH"] = os.pathsep.join(path_parts)
# Create output directory if needed
output_dir.mkdir(parents=True, exist_ok=True)

View File

@@ -101,8 +101,6 @@ def _get_process_binary_env_keys(plugin_name: str, hook_path: str, env: dict[str
schema_keys.sort(
key=lambda key: (
key != f"{plugin_key}_BINARY",
key.endswith("_NODE_BINARY"),
key.endswith("_CHROME_BINARY"),
key,
),
)
@@ -117,8 +115,6 @@ def _get_process_binary_env_keys(plugin_name: str, hook_path: str, env: dict[str
hook_suffix = Path(hook_path).suffix.lower()
if hook_suffix == ".js":
if plugin_key:
add(f"{plugin_key}_NODE_BINARY")
add("NODE_BINARY")
return keys
@@ -160,7 +156,7 @@ class Machine(ModelWithHealthStats):
default=dict,
null=True,
blank=True,
help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)",
help_text="Machine-specific config overrides.",
)
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
@@ -176,24 +172,13 @@ class Machine(ModelWithHealthStats):
global _CURRENT_MACHINE
if _CURRENT_MACHINE:
if timezone.now() < _CURRENT_MACHINE.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL):
return cls._sanitize_config(cls._hydrate_config_from_sibling(_CURRENT_MACHINE))
return cls._sanitize_config(_CURRENT_MACHINE)
_CURRENT_MACHINE = None
_CURRENT_MACHINE, _ = cls.objects.update_or_create(
guid=get_host_guid(),
defaults={"hostname": socket.gethostname(), **get_os_info(), **get_vm_info(), "stats": get_host_stats()},
)
return cls._sanitize_config(cls._hydrate_config_from_sibling(_CURRENT_MACHINE))
@classmethod
def _hydrate_config_from_sibling(cls, machine: Machine) -> Machine:
if machine.config:
return machine
sibling = cls.objects.exclude(pk=machine.pk).filter(hostname=machine.hostname).exclude(config={}).order_by("-modified_at").first()
if sibling and sibling.config:
machine.config = dict(sibling.config)
machine.save(update_fields=["config", "modified_at"])
return machine
return cls._sanitize_config(_CURRENT_MACHINE)
@classmethod
def _sanitize_config(cls, machine: Machine) -> Machine:
@@ -622,12 +607,7 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
from archivebox.config.configset import get_config
# Get merged config (Binary doesn't have crawl/snapshot context).
# Binary workers can install several dependencies in one process, so
# refresh from the latest persisted machine config before each hook run.
config = get_config()
current_machine = Machine.current()
if current_machine.config:
config.update(current_machine.config)
# ArchiveBox installs the puppeteer package and Chromium in separate
# hook phases. Suppress puppeteer's bundled browser download during the
@@ -760,6 +740,11 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
binary_abspath = Path(self.abspath).resolve()
lib_bin_dir = Path(lib_bin_dir).resolve()
binary_parts = binary_abspath.parts
try:
app_index = next(index for index, part in enumerate(binary_parts) if part.endswith(".app"))
except StopIteration:
app_index = -1
# Create LIB_BIN_DIR if it doesn't exist
try:
@@ -772,6 +757,15 @@ class Binary(ModelWithHealthStats, ModelWithStateMachine):
binary_name = binary_abspath.name
symlink_path = lib_bin_dir / binary_name
if app_index != -1 and len(binary_parts) > app_index + 2 and binary_parts[app_index + 1 : app_index + 3] == ("Contents", "MacOS"):
if symlink_path.exists() or symlink_path.is_symlink():
try:
symlink_path.unlink()
except (OSError, PermissionError) as e:
print(f"Failed to remove existing file at {symlink_path}: {e}", file=sys.stderr)
return None
return binary_abspath
# Remove existing symlink/file if it exists
if symlink_path.exists() or symlink_path.is_symlink():
try:

View File

@@ -2,7 +2,6 @@ from .archive_result_service import ArchiveResultService
from .binary_service import BinaryService
from .crawl_service import CrawlService
from .machine_service import MachineService
from .process_request_service import ProcessRequestService
from .process_service import ProcessService
from .runner import run_binary, run_crawl, run_install, run_pending_crawls
from .snapshot_service import SnapshotService
@@ -13,7 +12,6 @@ __all__ = [
"BinaryService",
"CrawlService",
"MachineService",
"ProcessRequestService",
"ProcessService",
"SnapshotService",
"TagService",

View File

@@ -14,6 +14,23 @@ class BinaryService(BaseService):
    async def on_BinaryRequestEvent__Outer(self, event: BinaryRequestEvent) -> None:
        """Persist the binary request, then short-circuit with a cached BinaryEvent when possible."""
        await run_db_op(self._project_binary, event)
        cached = await run_db_op(self._load_cached_binary, event)
        if cached is not None:
            # A previously-installed binary exists for this machine: re-emit
            # its cached metadata immediately instead of waiting for a
            # provider hook to resolve/install it.
            await self.bus.emit(
                BinaryEvent(
                    name=event.name,
                    plugin_name=event.plugin_name,
                    hook_name=event.hook_name,
                    abspath=cached["abspath"],
                    version=cached["version"],
                    sha256=cached["sha256"],
                    # Request-supplied values take precedence over cached ones.
                    binproviders=event.binproviders or cached["binproviders"],
                    binprovider=cached["binprovider"],
                    overrides=event.overrides or cached["overrides"],
                    binary_id=event.binary_id,
                    machine_id=event.machine_id or cached["machine_id"],
                ),
            )
async def on_BinaryEvent__Outer(self, event: BinaryEvent) -> None:
resolved = await asyncio.to_thread(self._resolve_installed_binary_metadata, event)
@@ -44,6 +61,29 @@ class BinaryService(BaseService):
},
)
def _load_cached_binary(self, event: BinaryRequestEvent) -> dict[str, str] | None:
from archivebox.machine.models import Binary, Machine
machine = Machine.current()
installed = (
Binary.objects.filter(machine=machine, name=event.name, status=Binary.StatusChoices.INSTALLED)
.exclude(abspath="")
.exclude(abspath__isnull=True)
.order_by("-modified_at")
.first()
)
if installed is None:
return None
return {
"abspath": installed.abspath,
"version": installed.version or "",
"sha256": installed.sha256 or "",
"binproviders": installed.binproviders or "",
"binprovider": installed.binprovider or "",
"machine_id": str(installed.machine_id),
"overrides": installed.overrides or {},
}
def _resolve_installed_binary_metadata(self, event: BinaryEvent) -> dict[str, str]:
resolved = {
"abspath": event.abspath or "",
@@ -77,12 +117,11 @@ class BinaryService(BaseService):
"overrides": event.overrides or {},
}
binary = load_binary(spec)
resolved["abspath"] = str(getattr(binary, "abspath", None) or resolved["abspath"] or "")
resolved["version"] = str(getattr(binary, "version", None) or resolved["version"] or "")
resolved["sha256"] = str(getattr(binary, "sha256", None) or resolved["sha256"] or "")
provider_name = getattr(getattr(binary, "loaded_binprovider", None), "name", None)
if provider_name:
resolved["binprovider"] = str(provider_name)
resolved["abspath"] = str(binary.abspath or resolved["abspath"] or "")
resolved["version"] = str(binary.version or resolved["version"] or "")
resolved["sha256"] = str(binary.sha256 or resolved["sha256"] or "")
if binary.loaded_binprovider is not None and binary.loaded_binprovider.name:
resolved["binprovider"] = str(binary.loaded_binprovider.name)
except Exception:
pass

View File

@@ -14,13 +14,13 @@ class MachineService(BaseService):
await run_db_op(self._project, event)
def _project(self, event: MachineEvent) -> None:
from archivebox.machine.models import Machine
from archivebox.machine.models import Machine, _sanitize_machine_config
machine = Machine.current()
config = dict(machine.config or {})
if event.config is not None:
config.update(event.config)
config.update(_sanitize_machine_config(event.config))
elif event.method == "update":
key = event.key.replace("config/", "", 1).strip()
if key:
@@ -28,5 +28,5 @@ class MachineService(BaseService):
else:
return
machine.config = config
machine.config = _sanitize_machine_config(config)
machine.save(update_fields=["config", "modified_at"])

View File

@@ -1,179 +0,0 @@
from __future__ import annotations
import asyncio
from datetime import datetime, timezone
import json
from pathlib import Path
import shlex
import socket
import time
from typing import ClassVar
from abxbus import BaseEvent
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
from abx_dl.services.base import BaseService
def _is_port_listening(host: str, port: int) -> bool:
if not host or not port:
return False
try:
with socket.create_connection((host, port), timeout=0.5):
return True
except OSError:
return False
def _supervisor_env(env: dict[str, str]) -> str:
pairs = []
for key, value in env.items():
escaped = value.replace('"', '\\"')
pairs.append(f'{key}="{escaped}"')
return ",".join(pairs)
def _iso_from_epoch(value: object) -> str:
if not isinstance(value, (int, float)) or value <= 0:
return ""
return datetime.fromtimestamp(value, tz=timezone.utc).isoformat()
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
    """Ensure a supervisord-managed daemon worker for *process_event* is RUNNING.

    Returns the supervisord process-info dict for the worker, or an empty dict
    if startup could not be confirmed before the timeout elapsed.
    """
    # Imported lazily to avoid pulling in supervisord machinery at module import time.
    from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker

    output_dir = Path(process_event.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    worker_name = process_event.hook_name
    supervisor = get_or_create_supervisord_process(daemonize=True)
    existing = get_worker(supervisor, worker_name)
    # Reuse an already-RUNNING worker, but only if its startup port (when one
    # is declared) is actually accepting connections.
    if (
        isinstance(existing, dict)
        and existing.get("statename") == "RUNNING"
        and (
            not process_event.daemon_startup_host
            or not process_event.daemon_startup_port
            or _is_port_listening(process_event.daemon_startup_host, process_event.daemon_startup_port)
        )
    ):
        return existing
    # Supervisord program definition for the hook command.
    daemon = {
        "name": worker_name,
        "command": shlex.join([process_event.hook_path, *process_event.hook_args]),
        "directory": str(output_dir),
        "autostart": "false",
        "autorestart": "true",
        "stdout_logfile": str(output_dir / f"{worker_name}.stdout.log"),
        "redirect_stderr": "true",
    }
    if process_event.env:
        daemon["environment"] = _supervisor_env(process_event.env)
    proc = start_worker(supervisor, daemon)
    # Poll until the worker reports RUNNING (and its declared port answers),
    # waiting at least 0.5s even when the configured timeout is smaller.
    deadline = time.monotonic() + max(float(process_event.daemon_startup_timeout), 0.5)
    while time.monotonic() < deadline:
        current = get_worker(supervisor, worker_name)
        if isinstance(current, dict) and current.get("statename") == "RUNNING":
            if (
                not process_event.daemon_startup_host
                or not process_event.daemon_startup_port
                or _is_port_listening(process_event.daemon_startup_host, process_event.daemon_startup_port)
            ):
                return current
        time.sleep(0.1)
    # Timed out: fall back to whatever start_worker returned (may not be RUNNING).
    return proc if isinstance(proc, dict) else {}
class ProcessRequestService(BaseService):
    """Bridges ProcessEvent JSONL records printed by hooks into real process events.

    Hooks request subprocesses/daemons by printing a JSON line tagged
    ``type=ProcessEvent`` on stdout; this service parses those lines, starts
    daemons under supervisord when requested, and re-emits the corresponding
    lifecycle events on the bus.
    """

    LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent]
    EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]

    async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
        """Parse one stdout line; ignore anything that isn't a ProcessEvent record."""
        try:
            record = json.loads(event.line)
        except (json.JSONDecodeError, ValueError):
            # Plain log output, not JSON — not our concern.
            return
        if not isinstance(record, dict) or record.pop("type", "") != "ProcessEvent":
            return
        # Rebuild a typed ProcessEvent from the raw record, falling back to the
        # source stdout event's metadata where fields are omitted.
        # NOTE: record["hook_path"] is required and raises KeyError if absent.
        process_event = ProcessEvent(
            plugin_name=record.get("plugin_name") or event.plugin_name,
            hook_name=record.get("hook_name") or "process_request",
            hook_path=record["hook_path"],
            hook_args=[str(arg) for arg in record.get("hook_args", [])],
            is_background=bool(record.get("is_background", True)),
            output_dir=record.get("output_dir") or event.output_dir,
            env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
            snapshot_id=record.get("snapshot_id") or event.snapshot_id,
            timeout=int(record.get("timeout") or 60),
            daemon=bool(record.get("daemon", False)),
            daemon_startup_host=str(record.get("daemon_startup_host") or ""),
            daemon_startup_port=int(record.get("daemon_startup_port") or 0),
            daemon_startup_timeout=float(record.get("daemon_startup_timeout") or 0.0),
            process_type=str(record.get("process_type") or ""),
            worker_type=str(record.get("worker_type") or ""),
            event_timeout=float(record.get("event_timeout") or 360.0),
            event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
        )
        # Non-daemon requests are simply forwarded for normal execution.
        if not process_event.daemon:
            await self.bus.emit(process_event)
            return
        # Daemon requests: start (or reuse) a supervisord worker off the event loop.
        proc = await asyncio.to_thread(_ensure_worker, process_event)
        process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
        start_ts = _iso_from_epoch(proc.get("start"))
        pid = int(proc.get("pid") or 0)
        statename = str(proc.get("statename") or "")
        exitstatus = int(proc.get("exitstatus") or 0)
        process_type = process_event.process_type or "worker"
        worker_type = process_event.worker_type or process_event.plugin_name
        if statename == "RUNNING" and pid:
            # Worker came up: report it as a started background process.
            await self.bus.emit(
                ProcessStartedEvent(
                    plugin_name=process_event.plugin_name,
                    hook_name=process_event.hook_name,
                    hook_path=process_event.hook_path,
                    hook_args=process_event.hook_args,
                    output_dir=process_event.output_dir,
                    env=process_event.env,
                    timeout=process_event.timeout,
                    pid=pid,
                    process_id=process_id,
                    snapshot_id=process_event.snapshot_id,
                    is_background=True,
                    process_type=process_type,
                    worker_type=worker_type,
                    start_ts=start_ts,
                ),
            )
            return
        # Worker failed to start (or is in an unexpected state): emit a
        # completion event describing the failure, then raise so the caller's
        # handler also sees the error.
        stderr = (
            f"Worker {process_event.hook_name} failed to start"
            if not statename
            else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
        )
        await self.bus.emit(
            ProcessCompletedEvent(
                plugin_name=process_event.plugin_name,
                hook_name=process_event.hook_name,
                hook_path=process_event.hook_path,
                hook_args=process_event.hook_args,
                env=process_event.env,
                stdout="",
                stderr=stderr,
                exit_code=exitstatus or 1,
                output_dir=process_event.output_dir,
                is_background=True,
                process_id=process_id,
                snapshot_id=process_event.snapshot_id,
                pid=pid,
                process_type=process_type,
                worker_type=worker_type,
                start_ts=start_ts,
                end_ts=datetime.now(tz=timezone.utc).isoformat(),
            ),
        )
        raise RuntimeError(stderr)

View File

@@ -1,11 +1,19 @@
from __future__ import annotations
from datetime import datetime
from typing import TYPE_CHECKING
import asyncio
from datetime import datetime, timezone as datetime_timezone
import json
from pathlib import Path
import shlex
import socket
import time
from typing import TYPE_CHECKING, Any, ClassVar
from urllib.parse import urlparse
from django.utils import timezone
from abx_dl.events import ProcessCompletedEvent, ProcessStartedEvent
from abxbus import BaseEvent
from abx_dl.events import ProcessCompletedEvent, ProcessEvent, ProcessStartedEvent, ProcessStdoutEvent
from abx_dl.services.base import BaseService
from .db import run_db_op
@@ -14,6 +22,9 @@ if TYPE_CHECKING:
from archivebox.machine.models import Process
WORKER_READY_TIMEOUT = 10.0
def parse_event_datetime(value: str | None):
if not value:
return None
@@ -26,14 +37,218 @@ def parse_event_datetime(value: str | None):
return dt
def _is_port_listening(host: str, port: int) -> bool:
    """Return True when a TCP connection to (host, port) succeeds within 0.5s.

    Falsy host/port values short-circuit to False without touching the
    network; any socket-level failure is treated as "not listening".
    """
    if not (host and port):
        return False
    try:
        conn = socket.create_connection((host, port), timeout=0.5)
        conn.close()
        return True
    except OSError:
        return False
def _worker_socket_from_url(url: str) -> tuple[str, int] | None:
if not url:
return None
parsed = urlparse(url)
if parsed.scheme != "tcp" or not parsed.hostname or not parsed.port:
return None
return parsed.hostname, parsed.port
def _supervisor_env(env: dict[str, str]) -> str:
    """Render an env mapping in supervisord's KEY="value",KEY2="v2" format.

    Double quotes inside values are backslash-escaped so the rendered
    string remains parseable by supervisord's config reader.
    """
    return ",".join(
        '{}="{}"'.format(key, value.replace('"', '\\"'))
        for key, value in env.items()
    )
def _iso_from_epoch(value: object) -> str:
    """Convert a Unix-epoch number to a UTC ISO-8601 string.

    Supervisord reports process start times as epoch seconds. Anything that
    is not a positive int/float maps to "" so callers can treat the result
    as "unknown start time". Booleans are rejected explicitly — bool is an
    int subclass in Python, so a bare isinstance(int) check would turn True
    into a meaningless epoch+1s timestamp (the sibling _int_from_object
    helper special-cases bool for the same reason).
    """
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return ""
    if value <= 0:
        return ""
    return datetime.fromtimestamp(value, tz=datetime_timezone.utc).isoformat()
def _int_from_object(value: object) -> int:
if isinstance(value, bool):
return int(value)
if isinstance(value, int):
return value
if isinstance(value, float):
return int(value)
if isinstance(value, str):
try:
return int(value)
except ValueError:
return 0
return 0
def _ensure_worker(process_event: ProcessEvent) -> dict[str, object]:
    """Start (or reuse) a supervisord-managed worker for a daemon ProcessEvent.

    Returns the supervisord process-info dict for the worker, or {} when the
    worker could not be confirmed as started within WORKER_READY_TIMEOUT.
    Runs blocking supervisord RPC and sleeps, so callers invoke it via
    asyncio.to_thread.
    """
    # Local import keeps Django/supervisord machinery out of module import time.
    from archivebox.workers.supervisord_util import get_or_create_supervisord_process, get_worker, start_worker

    output_dir = Path(process_event.output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    worker_name = process_event.hook_name
    supervisor = get_or_create_supervisord_process(daemonize=True)
    # Daemons that listen on tcp:// advertise their socket via the event url;
    # when present it is used to confirm readiness beyond supervisord's
    # RUNNING state.
    worker_socket = _worker_socket_from_url(getattr(process_event, "url", ""))
    existing = get_worker(supervisor, worker_name)
    if (
        isinstance(existing, dict)
        and existing.get("statename") == "RUNNING"
        and (worker_socket is None or _is_port_listening(*worker_socket))
    ):
        # Already running (and accepting connections, if it has a socket): reuse it.
        return existing
    daemon = {
        "name": worker_name,
        "command": shlex.join([process_event.hook_path, *process_event.hook_args]),
        "directory": str(output_dir),
        "autostart": "false",
        "autorestart": "true",
        "stdout_logfile": str(output_dir / f"{worker_name}.stdout.log"),
        "redirect_stderr": "true",
    }
    if process_event.env:
        daemon["environment"] = _supervisor_env(process_event.env)
    proc = start_worker(supervisor, daemon)
    # Poll until the worker reports RUNNING (and its socket, if any, accepts
    # connections) or the readiness window elapses.
    deadline = time.monotonic() + WORKER_READY_TIMEOUT
    while time.monotonic() < deadline:
        current = get_worker(supervisor, worker_name)
        if isinstance(current, dict) and current.get("statename") == "RUNNING":
            if worker_socket is None or _is_port_listening(*worker_socket):
                return current
        time.sleep(0.1)
    # Timed out: fall back to whatever start_worker reported (may not be a
    # RUNNING process info dict; the caller inspects statename/pid itself).
    return proc if isinstance(proc, dict) else {}
class ProcessService(BaseService):
LISTENS_TO = [ProcessStartedEvent, ProcessCompletedEvent]
EMITS = []
LISTENS_TO: ClassVar[list[type[BaseEvent]]] = [ProcessStdoutEvent, ProcessStartedEvent, ProcessCompletedEvent]
EMITS: ClassVar[list[type[BaseEvent]]] = [ProcessEvent, ProcessStartedEvent, ProcessCompletedEvent]
def __init__(self, bus):
    # process_id -> DB Process id mapping (presumably populated by the DB
    # projection handlers; not exercised in this chunk — confirm usage).
    self.process_ids: dict[str, str] = {}
    super().__init__(bus)
async def on_ProcessStdoutEvent(self, event: ProcessStdoutEvent) -> None:
    """Promote JSONL ProcessEvent records printed on hook stdout into bus events.

    Hooks emit {"type": "ProcessEvent", ...} lines. Non-daemon records are
    re-emitted as ProcessEvent; daemon records are routed through supervisord
    (via _ensure_worker) and resolved into a ProcessStartedEvent on success
    or a ProcessCompletedEvent + RuntimeError on failure.
    """
    try:
        record = json.loads(event.line)
    except (json.JSONDecodeError, ValueError):
        # Not JSON: ordinary stdout noise, ignore.
        return
    if not isinstance(record, dict) or record.get("type") != "ProcessEvent":
        return
    # Preserve any extra fields the hook attached (e.g. output_str) so they
    # can be forwarded onto the events emitted below.
    passthrough_fields: dict[str, Any] = {
        key: value
        for key, value in record.items()
        if key
        not in {
            "type",
            "plugin_name",
            "hook_name",
            "hook_path",
            "hook_args",
            "is_background",
            "output_dir",
            "env",
            "snapshot_id",
            "process_id",
            "url",
            "timeout",
            "daemon",
            "process_type",
            "worker_type",
            "event_timeout",
            "event_handler_timeout",
        }
    }
    # Rebuild a typed ProcessEvent, falling back to the carrying stdout
    # event's metadata for fields the record omits.
    # NOTE(review): record["hook_path"] raises KeyError if the hook omitted
    # it — confirm every emitter always includes hook_path.
    process_event = ProcessEvent(
        plugin_name=record.get("plugin_name") or event.plugin_name,
        hook_name=record.get("hook_name") or "process",
        hook_path=record["hook_path"],
        hook_args=[str(arg) for arg in record.get("hook_args", [])],
        is_background=bool(record.get("is_background", True)),
        output_dir=record.get("output_dir") or event.output_dir,
        env={str(key): str(value) for key, value in (record.get("env") or {}).items()},
        snapshot_id=record.get("snapshot_id") or event.snapshot_id,
        timeout=int(record.get("timeout") or 60),
        daemon=bool(record.get("daemon", False)),
        url=str(record.get("url") or ""),
        process_type=str(record.get("process_type") or ""),
        worker_type=str(record.get("worker_type") or ""),
        event_timeout=float(record.get("event_timeout") or 360.0),
        event_handler_timeout=float(record.get("event_handler_timeout") or 390.0),
        **passthrough_fields,
    )
    if not process_event.daemon:
        # Non-daemon: simply re-emit on the bus and let other handlers run it.
        await self.bus.emit(process_event)
        return
    # Daemon: supervisord interaction is blocking, so run it off-loop.
    proc = await asyncio.to_thread(_ensure_worker, process_event)
    process_id = str(record.get("process_id") or f"worker:{process_event.hook_name}")
    start_ts = _iso_from_epoch(proc.get("start"))
    pid = _int_from_object(proc.get("pid"))
    statename = str(proc.get("statename") or "")
    exitstatus = _int_from_object(proc.get("exitstatus"))
    process_type = process_event.process_type or "worker"
    worker_type = process_event.worker_type or process_event.plugin_name
    if statename == "RUNNING" and pid:
        # Worker confirmed running: announce it with supervisord's pid/start.
        await self.bus.emit(
            ProcessStartedEvent(
                plugin_name=process_event.plugin_name,
                hook_name=process_event.hook_name,
                hook_path=process_event.hook_path,
                hook_args=process_event.hook_args,
                output_dir=process_event.output_dir,
                env=process_event.env,
                timeout=process_event.timeout,
                pid=pid,
                process_id=process_id,
                snapshot_id=process_event.snapshot_id,
                is_background=True,
                url=process_event.url,
                process_type=process_type,
                worker_type=worker_type,
                start_ts=start_ts,
                **passthrough_fields,
            ),
        )
        return
    # Startup failed (or supervisord gave no state): emit a terminal
    # completed event, then raise so the hook's failure is surfaced.
    stderr = (
        f"Worker {process_event.hook_name} failed to start"
        if not statename
        else f"Worker {process_event.hook_name} state={statename} exitstatus={exitstatus}"
    )
    await self.bus.emit(
        ProcessCompletedEvent(
            plugin_name=process_event.plugin_name,
            hook_name=process_event.hook_name,
            hook_path=process_event.hook_path,
            hook_args=process_event.hook_args,
            env=process_event.env,
            stdout="",
            stderr=stderr,
            exit_code=exitstatus or 1,
            output_dir=process_event.output_dir,
            is_background=True,
            process_id=process_id,
            snapshot_id=process_event.snapshot_id,
            pid=pid,
            url=process_event.url,
            process_type=process_type,
            worker_type=worker_type,
            start_ts=start_ts,
            end_ts=datetime.now(tz=datetime_timezone.utc).isoformat(),
            **passthrough_fields,
        ),
    )
    raise RuntimeError(stderr)
async def on_ProcessStartedEvent__Outer(self, event: ProcessStartedEvent) -> None:
    # Project the started event into the database via the run_db_op helper.
    await run_db_op(self._project_started, event)
@@ -51,7 +266,7 @@ class ProcessService(BaseService):
if db_process_id:
process = Process.objects.filter(id=db_process_id).first()
if process is not None:
if process.iface_id != iface.id or process.machine_id != iface.machine_id:
if getattr(process, "iface_id", None) != iface.id or process.machine_id != iface.machine_id:
process.iface = iface
process.machine = iface.machine
process.save(update_fields=["iface", "machine", "modified_at"])
@@ -84,6 +299,7 @@ class ProcessService(BaseService):
env=event.env,
timeout=getattr(event, "timeout", 60),
pid=event.pid or None,
url=getattr(event, "url", "") or None,
started_at=parse_event_datetime(getattr(event, "start_ts", "")),
status=Process.StatusChoices.RUNNING,
retry_at=None,
@@ -98,6 +314,7 @@ class ProcessService(BaseService):
process.env = event.env
process.timeout = event.timeout
process.pid = event.pid or None
process.url = getattr(event, "url", "") or process.url
process.process_type = getattr(event, "process_type", "") or process.process_type
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
process.started_at = parse_event_datetime(event.start_ts) or process.started_at or timezone.now()
@@ -113,6 +330,7 @@ class ProcessService(BaseService):
process.cmd = [event.hook_path, *event.hook_args]
process.env = event.env
process.pid = event.pid or process.pid
process.url = getattr(event, "url", "") or process.url
process.process_type = getattr(event, "process_type", "") or process.process_type
process.worker_type = getattr(event, "worker_type", "") or process.worker_type
process.started_at = parse_event_datetime(event.start_ts) or process.started_at

View File

@@ -3,6 +3,7 @@ from __future__ import annotations
import asyncio
import json
import os
import re
import shutil
import subprocess
import sys
@@ -28,8 +29,6 @@ from abx_dl.orchestrator import (
from .archive_result_service import ArchiveResultService
from .binary_service import BinaryService
from .crawl_service import CrawlService
from .machine_service import MachineService
from .process_request_service import ProcessRequestService
from .process_service import ProcessService
from .snapshot_service import SnapshotService
from .tag_service import TagService
@@ -58,28 +57,34 @@ def _count_selected_hooks(plugins: dict[str, Plugin], selected_plugins: list[str
)
def _binary_env_key(name: str) -> str:
normalized = "".join(ch if ch.isalnum() else "_" for ch in name).upper()
return f"{normalized}_BINARY"
_TEMPLATE_NAME_RE = re.compile(r"^\{([A-Z0-9_]+)\}$")
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str) -> list[str]:
def _binary_config_keys_for_plugins(plugins: dict[str, Plugin], binary_name: str, config: dict[str, Any]) -> list[str]:
keys: list[str] = []
if binary_name != "postlight-parser":
keys.append(_binary_env_key(binary_name))
for plugin in plugins.values():
for spec in plugin.binaries:
template_name = str(spec.get("name") or "").strip()
match = _TEMPLATE_NAME_RE.fullmatch(template_name)
if match is None:
continue
key = match.group(1)
configured_value = config.get(key)
if configured_value is not None and str(configured_value).strip() == binary_name:
keys.append(key)
for key, prop in plugin.config_schema.items():
if key.endswith("_BINARY") and prop.get("default") == binary_name:
keys.insert(0, key)
keys.append(key)
return list(dict.fromkeys(keys))
def _installed_binary_config_overrides(plugins: dict[str, Plugin]) -> dict[str, str]:
def _installed_binary_config_overrides(plugins: dict[str, Plugin], config: dict[str, Any] | None = None) -> dict[str, str]:
from archivebox.machine.models import Binary, Machine
machine = Machine.current()
active_config = dict(config or {})
overrides: dict[str, str] = {}
shared_lib_dir: Path | None = None
pip_home: Path | None = None
@@ -98,7 +103,7 @@ def _installed_binary_config_overrides(plugins: dict[str, Plugin]) -> dict[str,
continue
if not resolved_path.is_file() or not os.access(resolved_path, os.X_OK):
continue
for key in _binary_config_keys_for_plugins(plugins, binary.name):
for key in _binary_config_keys_for_plugins(plugins, binary.name, active_config):
overrides[key] = binary.abspath
if resolved_path.parent.name == ".bin" and resolved_path.parent.parent.name == "node_modules":
@@ -231,10 +236,8 @@ class CrawlRunner:
self.bus = create_bus(name=_bus_name("ArchiveBox", str(crawl.id)), total_timeout=3600.0)
self.plugins = discover_plugins()
self.process_service = ProcessService(self.bus)
self.machine_service = MachineService(self.bus)
self.binary_service = BinaryService(self.bus)
self.tag_service = TagService(self.bus)
self.process_request_service = ProcessRequestService(self.bus)
self.crawl_service = CrawlService(self.bus, crawl_id=str(crawl.id))
self.process_discovered_snapshots_inline = process_discovered_snapshots_inline
self.snapshot_service = SnapshotService(
@@ -250,32 +253,10 @@ class CrawlRunner:
self.abx_services = None
self.persona = None
self.base_config: dict[str, Any] = {}
self.derived_config: dict[str, Any] = {}
self.primary_url = ""
self._live_stream = None
def _create_projector_bus(self, *, identifier: str, config_overrides: dict[str, Any]):
bus = create_bus(name=_bus_name("ArchiveBox", identifier), total_timeout=3600.0)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ProcessRequestService(bus)
CrawlService(bus, crawl_id=str(self.crawl.id))
SnapshotService(
bus,
crawl_id=str(self.crawl.id),
schedule_snapshot=self.enqueue_snapshot if self.process_discovered_snapshots_inline else self.leave_snapshot_queued,
)
ArchiveResultService(bus, process_service=process_service)
abx_services = setup_abx_services(
bus,
plugins=self.plugins,
config_overrides=config_overrides,
auto_install=True,
emit_jsonl=False,
)
return bus, abx_services
async def run(self) -> None:
from asgiref.sync import sync_to_async
from archivebox.crawls.models import Crawl
@@ -292,6 +273,8 @@ class CrawlRunner:
**self.base_config,
"ABX_RUNTIME": "archivebox",
},
derived_config_overrides=self.derived_config,
persist_derived=False,
auto_install=True,
emit_jsonl=False,
)
@@ -369,7 +352,7 @@ class CrawlRunner:
current_process.save(update_fields=["iface", "machine", "modified_at"])
self.persona = self.crawl.resolve_persona()
self.base_config = get_config(crawl=self.crawl)
self.base_config.update(_installed_binary_config_overrides(self.plugins))
self.derived_config = _installed_binary_config_overrides(self.plugins, self.base_config)
self.base_config["ABX_RUNTIME"] = "archivebox"
if self.selected_plugins is None:
self.selected_plugins = _selected_plugins_from_config(self.base_config)
@@ -473,7 +456,6 @@ class CrawlRunner:
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
bus=self.bus,
emit_jsonl=False,
snapshot=setup_snapshot,
@@ -501,7 +483,6 @@ class CrawlRunner:
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
bus=self.bus,
emit_jsonl=False,
snapshot=cleanup_snapshot,
@@ -530,31 +511,22 @@ class CrawlRunner:
parent_snapshot_id=snapshot["parent_snapshot_id"],
crawl_id=str(self.crawl.id),
)
snapshot_bus, snapshot_services = self._create_projector_bus(
identifier=f"{self.crawl.id}_{snapshot['id']}",
config_overrides=snapshot["config"],
)
try:
_attach_bus_trace(snapshot_bus)
await download(
url=snapshot["url"],
plugins=self.plugins,
output_dir=Path(snapshot["output_dir"]),
selected_plugins=self.selected_plugins,
config_overrides=snapshot["config"],
bus=snapshot_bus,
bus=self.bus,
emit_jsonl=False,
snapshot=abx_snapshot,
skip_crawl_setup=True,
skip_crawl_cleanup=True,
)
await snapshot_services.process.wait_for_background_monitors()
finally:
current_task = asyncio.current_task()
if current_task is not None and self.snapshot_tasks.get(snapshot_id) is current_task:
self.snapshot_tasks.pop(snapshot_id, None)
await _stop_bus_trace(snapshot_bus)
await snapshot_bus.stop()
def _load_snapshot_run_data(self, snapshot_id: str):
from archivebox.core.models import Snapshot
@@ -615,19 +587,19 @@ async def _run_binary(binary_id: str) -> None:
binary = await sync_to_async(Binary.objects.get, thread_sensitive=True)(id=binary_id)
plugins = discover_plugins()
config = get_config()
config.update(await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins))
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
config["ABX_RUNTIME"] = "archivebox"
bus = create_bus(name=_bus_name("ArchiveBox_binary", str(binary.id)), total_timeout=1800.0)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ProcessRequestService(bus)
ArchiveResultService(bus, process_service=process_service)
setup_abx_services(
bus,
plugins=plugins,
config_overrides=config,
derived_config_overrides=derived_config,
persist_derived=False,
auto_install=True,
emit_jsonl=False,
)
@@ -662,19 +634,19 @@ async def _run_install(plugin_names: list[str] | None = None) -> None:
plugins = discover_plugins()
config = get_config()
config.update(await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins))
derived_config = await sync_to_async(_installed_binary_config_overrides, thread_sensitive=True)(plugins, config)
config["ABX_RUNTIME"] = "archivebox"
bus = create_bus(name="ArchiveBox_install", total_timeout=3600.0)
process_service = ProcessService(bus)
MachineService(bus)
BinaryService(bus)
TagService(bus)
ProcessRequestService(bus)
ArchiveResultService(bus, process_service=process_service)
abx_services = setup_abx_services(
bus,
plugins=plugins,
config_overrides=config,
derived_config_overrides=derived_config,
persist_derived=False,
auto_install=True,
emit_jsonl=False,
)

View File

@@ -518,7 +518,6 @@ def test_binary_event_reuses_existing_installed_binary_row(monkeypatch):
event = BinaryRequestEvent(
name="wget",
plugin_name="wget",
hook_name="on_Install__10_wget.finite.bg",
output_dir="/tmp/wget",
binproviders="provider",
)

View File

@@ -133,7 +133,13 @@ def test_plugin_detail_view_renders_config_in_dedicated_sections(monkeypatch):
"description": "Example config used to verify plugin metadata rendering.",
"type": "object",
"required_plugins": ["chrome"],
"required_binaries": ["example-cli"],
"required_binaries": [
{
"name": "example-cli",
"binproviders": "env,apt,brew",
"min_version": None,
},
],
"output_mimetypes": ["text/plain", "application/json"],
"properties": {
"EXAMPLE_ENABLED": {

View File

@@ -3,7 +3,7 @@
Unit tests for the ArchiveBox hook architecture.
Tests hook discovery, execution, JSONL parsing, background hook detection,
binary lookup, and install hook XYZ_BINARY env var handling.
binary lookup, and required_binaries XYZ_BINARY passthrough handling.
Run with:
sudo -u testuser bash -c 'source .venv/bin/activate && python -m pytest archivebox/tests/test_hooks.py -v'
@@ -126,8 +126,8 @@ not json at all
self.assertEqual(records[0]["type"], "ArchiveResult")
class TestInstallHookEnvVarHandling(unittest.TestCase):
"""Test that install hooks respect XYZ_BINARY env vars."""
class TestRequiredBinaryConfigHandling(unittest.TestCase):
"""Test that required_binaries keep configured XYZ_BINARY values intact."""
def setUp(self):
"""Set up test environment."""
@@ -139,39 +139,28 @@ class TestInstallHookEnvVarHandling(unittest.TestCase):
shutil.rmtree(self.work_dir, ignore_errors=True)
def test_binary_env_var_absolute_path_handling(self):
"""Install hooks should handle absolute paths in XYZ_BINARY."""
# Test the logic that install hooks use
"""Absolute binary paths should pass through unchanged."""
configured_binary = "/custom/path/to/wget2"
if "/" in configured_binary:
bin_name = Path(configured_binary).name
else:
bin_name = configured_binary
binary_name = configured_binary
self.assertEqual(bin_name, "wget2")
self.assertEqual(binary_name, "/custom/path/to/wget2")
def test_binary_env_var_name_only_handling(self):
"""Install hooks should handle binary names in XYZ_BINARY."""
# Test the logic that install hooks use
"""Binary command names should pass through unchanged."""
configured_binary = "wget2"
if "/" in configured_binary:
bin_name = Path(configured_binary).name
else:
bin_name = configured_binary
binary_name = configured_binary
self.assertEqual(bin_name, "wget2")
self.assertEqual(binary_name, "wget2")
def test_binary_env_var_empty_default(self):
"""Install hooks should use default when XYZ_BINARY is empty."""
"""Empty configured values should fall back to config defaults."""
configured_binary = ""
if configured_binary:
if "/" in configured_binary:
bin_name = Path(configured_binary).name
else:
bin_name = configured_binary
binary_name = configured_binary
else:
bin_name = "wget" # default
binary_name = "wget"
self.assertEqual(bin_name, "wget")
self.assertEqual(binary_name, "wget")
class TestHookDiscovery(unittest.TestCase):
@@ -187,7 +176,7 @@ class TestHookDiscovery(unittest.TestCase):
wget_dir = self.plugins_dir / "wget"
wget_dir.mkdir()
(wget_dir / "on_Snapshot__50_wget.py").write_text("# test hook")
(wget_dir / "on_Install__10_wget.finite.bg.py").write_text("# install hook")
(wget_dir / "on_BinaryRequest__10_wget.py").write_text("# binary request hook")
chrome_dir = self.plugins_dir / "chrome"
chrome_dir.mkdir(exist_ok=True)
@@ -299,7 +288,7 @@ class TestHookDiscovery(unittest.TestCase):
self.assertIn("on_BinaryRequest__10_npm.py", hook_names)
def test_discover_hooks_accepts_event_class_names(self):
"""discover_hooks should accept InstallEvent / SnapshotEvent class names."""
"""discover_hooks should accept BinaryRequestEvent / SnapshotEvent class names."""
from archivebox import hooks as hooks_module
hooks_module.get_plugins.cache_clear()
@@ -307,10 +296,10 @@ class TestHookDiscovery(unittest.TestCase):
patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
):
install_hooks = hooks_module.discover_hooks("InstallEvent", filter_disabled=False)
binary_hooks = hooks_module.discover_hooks("BinaryRequestEvent", filter_disabled=False)
snapshot_hooks = hooks_module.discover_hooks("SnapshotEvent", filter_disabled=False)
self.assertIn("on_Install__10_wget.finite.bg.py", [hook.name for hook in install_hooks])
self.assertIn("on_BinaryRequest__10_wget.py", [hook.name for hook in binary_hooks])
self.assertIn("on_Snapshot__50_wget.py", [hook.name for hook in snapshot_hooks])
def test_discover_hooks_returns_empty_for_non_hook_lifecycle_events(self):
@@ -325,44 +314,6 @@ class TestHookDiscovery(unittest.TestCase):
self.assertEqual(hooks_module.discover_hooks("BinaryEvent", filter_disabled=False), [])
self.assertEqual(hooks_module.discover_hooks("CrawlCleanupEvent", filter_disabled=False), [])
def test_discover_install_hooks_only_include_declared_plugin_dependencies(self):
"""Install hook discovery should include required_plugins without broadening to provider plugins."""
responses_dir = self.plugins_dir / "responses"
responses_dir.mkdir()
(responses_dir / "config.json").write_text(
json.dumps(
{
"type": "object",
"required_plugins": ["chrome"],
"properties": {},
},
),
)
chrome_dir = self.plugins_dir / "chrome"
chrome_dir.mkdir(exist_ok=True)
(chrome_dir / "config.json").write_text('{"type": "object", "properties": {}}')
(chrome_dir / "on_Install__70_chrome.finite.bg.py").write_text("# chrome install hook")
npm_dir = self.plugins_dir / "npm"
npm_dir.mkdir()
(npm_dir / "on_BinaryRequest__10_npm.py").write_text("# npm binary hook")
(npm_dir / "on_Install__00_npm.py").write_text("# npm install hook")
(npm_dir / "config.json").write_text('{"type": "object", "properties": {}}')
from archivebox import hooks as hooks_module
hooks_module.get_plugins.cache_clear()
with (
patch.object(hooks_module, "BUILTIN_PLUGINS_DIR", self.plugins_dir),
patch.object(hooks_module, "USER_PLUGINS_DIR", self.test_dir / "user_plugins"),
):
hooks = hooks_module.discover_hooks("Install", config={"PLUGINS": "responses"})
hook_names = [hook.name for hook in hooks]
self.assertIn("on_Install__70_chrome.finite.bg.py", hook_names)
self.assertNotIn("on_Install__00_npm.py", hook_names)
class TestGetExtractorName(unittest.TestCase):
"""Test get_extractor_name() function."""
@@ -484,8 +435,8 @@ print(json.dumps({"type": "ArchiveResult", "status": "succeeded", "url": args.ge
self.assertEqual(records[0]["url"], "https://example.com")
class TestInstallHookOutput(unittest.TestCase):
"""Test install hook output format compliance."""
class TestDependencyRecordOutput(unittest.TestCase):
"""Test dependency record output format compliance."""
def setUp(self):
"""Set up test environment."""
@@ -495,8 +446,8 @@ class TestInstallHookOutput(unittest.TestCase):
"""Clean up test environment."""
shutil.rmtree(self.work_dir, ignore_errors=True)
def test_install_hook_outputs_binary(self):
"""Install hook should output Binary JSONL when binary found."""
def test_dependency_record_outputs_binary(self):
"""Dependency resolution should output Binary JSONL when binary is found."""
hook_output = json.dumps(
{
"type": "Binary",
@@ -515,8 +466,8 @@ class TestInstallHookOutput(unittest.TestCase):
self.assertEqual(data["name"], "wget")
self.assertTrue(data["abspath"].startswith("/"))
def test_install_hook_outputs_machine_config(self):
"""Install hook should output Machine config update JSONL."""
def test_dependency_record_outputs_machine_config(self):
"""Dependency resolution should output Machine config update JSONL."""
hook_output = json.dumps(
{
"type": "Machine",

View File

@@ -0,0 +1,69 @@
import asyncio
import json
import pytest
from abx_dl.events import ProcessStartedEvent, ProcessStdoutEvent
from abx_dl.orchestrator import create_bus
pytestmark = pytest.mark.django_db
def test_process_service_emits_process_started_from_inline_process_event(monkeypatch):
    """A daemon ProcessEvent JSONL line on hook stdout should surface as a ProcessStartedEvent.

    Supervisord is stubbed out so the service believes the worker started
    cleanly; the test then checks the emitted event carries the worker's
    identity, url, and passthrough fields (output_str).
    """
    from archivebox.services import process_service as process_service_module
    from archivebox.services.process_service import ProcessService

    bus = create_bus(name="test_process_service_inline_process_event")
    ProcessService(bus)
    # Pretend the supervisord worker came up immediately with pid 4321.
    monkeypatch.setattr(
        process_service_module,
        "_ensure_worker",
        lambda event: {
            "pid": 4321,
            "start": 1711111111.0,
            "statename": "RUNNING",
            "exitstatus": 0,
        },
    )

    async def run_test():
        # Simulate a hook printing a daemon ProcessEvent record on stdout.
        await bus.emit(
            ProcessStdoutEvent(
                line=json.dumps(
                    {
                        "type": "ProcessEvent",
                        "plugin_name": "search_backend_sonic",
                        "hook_name": "worker_sonic",
                        "hook_path": "/usr/bin/sonic",
                        "hook_args": ["-c", "/tmp/sonic/config.cfg"],
                        "is_background": True,
                        "daemon": True,
                        "url": "tcp://127.0.0.1:1491",
                        "output_dir": "/tmp/sonic",
                        "env": {},
                        "process_type": "worker",
                        "worker_type": "sonic",
                        "process_id": "worker:sonic",
                        "output_str": "127.0.0.1:1491",
                    },
                ),
                plugin_name="search_backend_sonic",
                hook_name="on_CrawlSetup__55_sonic_start.py",
                output_dir="/tmp/search_backend_sonic",
                snapshot_id="snap-1",
                process_id="proc-hook",
            ),
        )
        started = await bus.find(ProcessStartedEvent, process_id="worker:sonic")
        await bus.stop()
        return started

    started = asyncio.run(run_test())
    assert started is not None
    assert started.hook_name == "worker_sonic"
    assert started.process_type == "worker"
    assert started.worker_type == "sonic"
    assert getattr(started, "url", "") == "tcp://127.0.0.1:1491"
    assert getattr(started, "output_str", "") == "127.0.0.1:1491"

View File

@@ -46,7 +46,7 @@ async def _call_sync(func, *args, **kwargs):
return func(*args, **kwargs)
def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
def test_run_snapshot_reuses_crawl_bus_for_all_snapshots(monkeypatch):
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl
from archivebox.core.models import Snapshot
@@ -87,13 +87,13 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
download_calls = []
async def fake_download(*, url, bus, config_overrides, snapshot, **kwargs):
async def fake_download(*, url, bus, snapshot, **kwargs):
download_calls.append(
{
"url": url,
"bus": bus,
"snapshot_id": config_overrides["SNAPSHOT_ID"],
"source_url": config_overrides["SOURCE_URL"],
"snapshot_id": snapshot.id,
"source_url": snapshot.url,
"abx_snapshot_id": snapshot.id,
},
)
@@ -146,8 +146,8 @@ def test_run_snapshot_uses_isolated_bus_per_snapshot(monkeypatch):
assert len(download_calls) == 2
assert {call["snapshot_id"] for call in download_calls} == {str(snapshot_a.id), str(snapshot_b.id)}
assert {call["source_url"] for call in download_calls} == {snapshot_a.url, snapshot_b.url}
assert len({id(call["bus"]) for call in download_calls}) == 2
assert len(created_buses) == 3 # 1 crawl bus + 2 isolated snapshot buses
assert len({id(call["bus"]) for call in download_calls}) == 1
assert len(created_buses) == 1
def test_ensure_background_runner_starts_when_none_running(monkeypatch):
@@ -353,6 +353,62 @@ def test_installed_binary_config_overrides_include_valid_installed_binaries(monk
assert overrides["NODE_PATH"] == "/tmp/shared-lib/npm/node_modules"
def test_installed_binary_config_overrides_do_not_map_hardcoded_artifacts_to_configurable_binary_keys(monkeypatch):
    """An installed artifact whose name differs from the configured *_BINARY value must not hijack that key.

    Here SINGLEFILE_BINARY is configured as "single-file", while the cached
    Binary row is the hardcoded "singlefile" chromewebstore artifact — the
    override map should still expose LIB_DIR/LIB_BIN_DIR but leave
    SINGLEFILE_BINARY untouched.
    """
    from archivebox.machine.models import Binary, Machine
    from archivebox.services import runner as runner_module
    from abx_dl.models import Plugin

    machine = Machine.objects.create(
        guid="test-guid-runner-singlefile-cache",
        hostname="runner-host-singlefile",
        hw_in_docker=False,
        hw_in_vm=False,
        hw_manufacturer="Test",
        hw_product="Test Product",
        hw_uuid="test-hw-runner-singlefile-cache",
        os_arch="arm64",
        os_family="darwin",
        os_platform="macOS",
        os_release="14.0",
        os_kernel="Darwin",
        stats={},
        config={},
    )
    singlefile_extension = Binary.objects.create(
        machine=machine,
        name="singlefile",
        abspath="/tmp/shared-lib/bin/singlefile",
        version="1.0.0",
        binprovider="chromewebstore",
        binproviders="chromewebstore",
        status=Binary.StatusChoices.INSTALLED,
    )
    monkeypatch.setattr(Machine, "current", classmethod(lambda cls: machine))
    # Only the cached binary path passes the is_file / executable checks.
    monkeypatch.setattr(Path, "is_file", lambda self: str(self) == singlefile_extension.abspath)
    monkeypatch.setattr(runner_module.os, "access", lambda path, mode: str(path) == singlefile_extension.abspath)

    overrides = runner_module._installed_binary_config_overrides(
        {
            "singlefile": Plugin(
                name="singlefile",
                path=Path("."),
                hooks=[],
                config_schema={"SINGLEFILE_BINARY": {"type": "string", "default": "single-file"}},
                binaries=[
                    {"name": "{SINGLEFILE_BINARY}", "binproviders": "env,npm"},
                    {"name": "singlefile", "binproviders": "chromewebstore"},
                ],
            ),
        },
        config={"SINGLEFILE_BINARY": "single-file"},
    )

    assert "SINGLEFILE_BINARY" not in overrides
    assert overrides["LIB_DIR"] == "/tmp/shared-lib"
    assert overrides["LIB_BIN_DIR"] == "/tmp/shared-lib/bin"
def test_run_snapshot_skips_descendant_when_max_size_already_reached(monkeypatch):
import asgiref.sync
@@ -700,11 +756,9 @@ def test_crawl_runner_calls_crawl_cleanup_after_snapshot_phase(monkeypatch):
"_run_crawl_cleanup",
lambda self, snapshot_id: cleanup_calls.append("abx_cleanup") or asyncio.sleep(0),
)
monkeypatch.setattr(crawl, "cleanup", lambda: cleanup_calls.append("crawl_cleanup"))
asyncio.run(runner_module.CrawlRunner(crawl, snapshot_ids=[str(snapshot.id)]).run())
assert cleanup_calls == ["crawl_cleanup", "abx_cleanup"]
assert cleanup_calls == ["abx_cleanup"]
def test_abx_process_service_background_monitor_finishes_after_process_exit(monkeypatch, tmp_path):
@@ -765,6 +819,9 @@ def test_abx_process_service_background_monitor_finishes_after_process_exit(monk
timeout=60,
snapshot_id="snap-1",
is_background=True,
url="https://example.org/",
process_type="hook",
worker_type="hook",
)
async def run_test():