mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
2567 lines
94 KiB
Python
Executable File
2567 lines
94 KiB
Python
Executable File
from __future__ import annotations
|
|
|
|
__package__ = "archivebox.machine"
|
|
|
|
import os
|
|
import sys
|
|
import uuid
|
|
import socket
|
|
from pathlib import Path
|
|
from archivebox.uuid_compat import uuid7
|
|
from datetime import timedelta, datetime
|
|
from typing import TYPE_CHECKING, Any, cast
|
|
|
|
from statemachine import State, registry
|
|
|
|
from django.db import models
|
|
from django.db.models import QuerySet
|
|
from django.utils import timezone
|
|
from django.utils.functional import cached_property
|
|
from django_stubs_ext.db.models import TypedModelMeta
|
|
|
|
from archivebox.base_models.models import ModelWithHealthStats
|
|
from archivebox.workers.models import BaseStateMachine, ModelWithStateMachine
|
|
from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
|
|
|
|
_psutil: Any | None = None
|
|
try:
|
|
import psutil as _psutil_import
|
|
|
|
PSUTIL_AVAILABLE = True
|
|
except ImportError:
|
|
PSUTIL_AVAILABLE = False
|
|
else:
|
|
_psutil = _psutil_import
|
|
|
|
if TYPE_CHECKING:
|
|
import psutil
|
|
from archivebox.core.models import ArchiveResult
|
|
else:
|
|
psutil = cast(Any, _psutil)
|
|
|
|
# Module-level caches of the "current" rows. They are filled lazily by
# Machine.current(), NetworkInterface.current() and
# BinaryManager.get_from_db_or_cache() and re-validated against the
# *_RECHECK_INTERVAL values below.
_CURRENT_MACHINE: Machine | None = None
_CURRENT_INTERFACE: NetworkInterface | None = None
# Keyed by binary name (see BinaryManager.get_from_db_or_cache).
_CURRENT_BINARIES: dict[str, Binary] = {}
# NOTE(review): presumably filled by Process.current(); that code is not visible in this section.
_CURRENT_PROCESS: Process | None = None

# Cache lifetimes, in seconds (they are passed to timedelta(seconds=...)).
MACHINE_RECHECK_INTERVAL = 7 * 24 * 60 * 60  # 7 days
NETWORK_INTERFACE_RECHECK_INTERVAL = 1 * 60 * 60  # 1 hour
BINARY_RECHECK_INTERVAL = 1 * 30 * 60  # 30 minutes
PROCESS_RECHECK_INTERVAL = 60  # Re-validate every 60 seconds
PID_REUSE_WINDOW = timedelta(hours=24)  # Max age for considering a PID match valid
PROCESS_TIMEOUT_GRACE = timedelta(seconds=30)  # Extra margin before force-cleaning timed-out RUNNING rows
START_TIME_TOLERANCE = 5.0  # Seconds tolerance for start time matching
# Keys that _sanitize_machine_config() strips out of Machine.config.
LEGACY_MACHINE_CONFIG_KEYS = frozenset({"CHROMIUM_VERSION"})
|
|
|
|
|
|
def _find_existing_binary_for_reference(machine: Machine, reference: str) -> Binary | None:
|
|
reference = str(reference or "").strip()
|
|
if not reference:
|
|
return None
|
|
|
|
qs = Binary.objects.filter(machine=machine)
|
|
|
|
direct_match = qs.filter(abspath=reference).order_by("-modified_at").first()
|
|
if direct_match:
|
|
return direct_match
|
|
|
|
ref_name = Path(reference).name
|
|
if ref_name:
|
|
named_match = qs.filter(name=ref_name).order_by("-modified_at").first()
|
|
if named_match:
|
|
return named_match
|
|
|
|
return qs.filter(name=reference).order_by("-modified_at").first()
|
|
|
|
|
|
def _get_process_binary_env_keys(plugin_name: str, hook_path: str, env: dict[str, Any] | None) -> list[str]:
|
|
env = env or {}
|
|
plugin_name = str(plugin_name or "").strip()
|
|
hook_path = str(hook_path or "").strip()
|
|
plugin_key = plugin_name.upper().replace("-", "_")
|
|
keys: list[str] = []
|
|
seen: set[str] = set()
|
|
|
|
def add(key: str) -> None:
|
|
if key and key not in seen and env.get(key):
|
|
seen.add(key)
|
|
keys.append(key)
|
|
|
|
if plugin_key:
|
|
add(f"{plugin_key}_BINARY")
|
|
|
|
try:
|
|
from archivebox.hooks import discover_plugin_configs
|
|
|
|
plugin_schema = discover_plugin_configs().get(plugin_name, {})
|
|
schema_keys = [key for key in (plugin_schema.get("properties") or {}) if key.endswith("_BINARY")]
|
|
except Exception:
|
|
schema_keys = []
|
|
|
|
schema_keys.sort(
|
|
key=lambda key: (
|
|
key != f"{plugin_key}_BINARY",
|
|
key,
|
|
),
|
|
)
|
|
for key in schema_keys:
|
|
add(key)
|
|
|
|
if plugin_name.startswith("search_backend_"):
|
|
backend_name = plugin_name.removeprefix("search_backend_").upper().replace("-", "_")
|
|
configured_engine = str(env.get("SEARCH_BACKEND_ENGINE") or "").strip().upper().replace("-", "_")
|
|
if backend_name and backend_name == configured_engine:
|
|
add(f"{backend_name}_BINARY")
|
|
|
|
hook_suffix = Path(hook_path).suffix.lower()
|
|
if hook_suffix == ".js":
|
|
add("NODE_BINARY")
|
|
|
|
return keys
|
|
|
|
|
|
def _sanitize_machine_config(config: dict[str, Any] | None) -> dict[str, Any]:
|
|
if not isinstance(config, dict):
|
|
return {}
|
|
|
|
sanitized = dict(config)
|
|
for key in LEGACY_MACHINE_CONFIG_KEYS:
|
|
sanitized.pop(key, None)
|
|
return sanitized
|
|
|
|
|
|
class MachineManager(models.Manager):
    """Manager for Machine that exposes ``Machine.objects.current()``."""

    def current(self) -> Machine:
        """Return the Machine row for the current host (delegates to ``Machine.current()``)."""
        current_machine = Machine.current()
        return current_machine
|
|
|
|
|
|
class Machine(ModelWithHealthStats):
    """
    A host that ArchiveBox runs on, keyed by its host GUID.

    Stores hardware / OS details gathered by the ``.detect`` helpers plus
    per-machine config overrides.
    """

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)
    guid = models.CharField(max_length=64, default=None, null=False, unique=True, editable=False)
    hostname = models.CharField(max_length=63, default=None, null=False)
    hw_in_docker = models.BooleanField(default=False, null=False)
    hw_in_vm = models.BooleanField(default=False, null=False)
    hw_manufacturer = models.CharField(max_length=63, default=None, null=False)
    hw_product = models.CharField(max_length=63, default=None, null=False)
    hw_uuid = models.CharField(max_length=255, default=None, null=False)
    os_arch = models.CharField(max_length=15, default=None, null=False)
    os_family = models.CharField(max_length=15, default=None, null=False)
    os_platform = models.CharField(max_length=63, default=None, null=False)
    os_release = models.CharField(max_length=63, default=None, null=False)
    os_kernel = models.CharField(max_length=255, default=None, null=False)
    stats = models.JSONField(default=dict, null=True, blank=True)
    config = models.JSONField(
        default=dict,
        null=True,
        blank=True,
        help_text="Machine-specific config overrides.",
    )
    num_uses_failed = models.PositiveIntegerField(default=0)
    num_uses_succeeded = models.PositiveIntegerField(default=0)

    objects = MachineManager()  # pyright: ignore[reportIncompatibleVariableOverride]
    networkinterface_set: models.Manager[NetworkInterface]

    class Meta(ModelWithHealthStats.Meta):
        app_label = "machine"

    @classmethod
    def current(cls) -> Machine:
        """
        Return the Machine row describing the host running this code.

        The row is cached in ``_CURRENT_MACHINE`` and reused until its
        ``modified_at`` is older than MACHINE_RECHECK_INTERVAL seconds; after
        that the host is re-detected and the row is upserted by ``guid``.
        Legacy config keys are stripped from the returned instance.
        """
        global _CURRENT_MACHINE

        cached = _CURRENT_MACHINE
        if cached:
            expires_at = cached.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL)
            if timezone.now() < expires_at:
                return cls._sanitize_config(cached)
            # Cache entry is stale: drop it before re-detecting the host.
            _CURRENT_MACHINE = None

        host_guid = get_host_guid()
        detected = {
            "hostname": socket.gethostname(),
            **get_os_info(),
            **get_vm_info(),
            "stats": get_host_stats(),
        }
        _CURRENT_MACHINE, _created = cls.objects.update_or_create(guid=host_guid, defaults=detected)
        return cls._sanitize_config(_CURRENT_MACHINE)

    @classmethod
    def _sanitize_config(cls, machine: Machine) -> Machine:
        """Strip legacy keys from ``machine.config``, saving the row only if something changed."""
        cleaned = _sanitize_machine_config(machine.config)
        if cleaned != (machine.config or {}):
            machine.config = cleaned
            machine.save(update_fields=["config", "modified_at"])
        return machine

    def to_json(self) -> dict:
        """
        Serialize this Machine to a JSON-compatible dict.

        The dict carries ``type`` = "Machine", the ArchiveBox VERSION as
        ``schema_version``, the stringified ``id``, the hardware / OS fields
        verbatim, and ``config`` (an empty dict when unset).
        """
        from archivebox.config import VERSION

        payload: dict = {
            "type": "Machine",
            "schema_version": VERSION,
            "id": str(self.id),
        }
        # Copied verbatim, in the order they appear in the serialized record.
        passthrough_fields = (
            "guid",
            "hostname",
            "hw_in_docker",
            "hw_in_vm",
            "hw_manufacturer",
            "hw_product",
            "hw_uuid",
            "os_arch",
            "os_family",
            "os_platform",
            "os_kernel",
            "os_release",
            "stats",
        )
        for field_name in passthrough_fields:
            payload[field_name] = getattr(self, field_name)
        payload["config"] = self.config or {}
        return payload

    @staticmethod
    def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None):
        """
        Apply a config patch from a JSON record to the current Machine.

        Args:
            record: JSON dict whose 'config' entry is a {key: value} patch
            overrides: Not used

        Returns:
            The updated current Machine, or None when the (sanitized) patch is empty
        """
        patch = _sanitize_machine_config(record.get("config"))
        if not patch:
            return None

        machine = Machine.current()
        # Patch values win over the existing (sanitized) config.
        machine.config = {**_sanitize_machine_config(machine.config), **patch}
        machine.save(update_fields=["config"])
        return machine
|
|
|
|
|
|
class NetworkInterfaceManager(models.Manager):
    """Manager for NetworkInterface that exposes ``NetworkInterface.objects.current()``."""

    def current(self) -> NetworkInterface:
        """Return the NetworkInterface row currently in use (delegates to ``NetworkInterface.current()``)."""
        current_iface = NetworkInterface.current()
        return current_iface
|
|
|
|
|
|
class NetworkInterface(ModelWithHealthStats):
    """
    A network configuration observed on a Machine.

    A row is unique per (machine, ip_public, ip_local, mac_address, dns_server);
    the remaining columns are descriptive details reported by ``get_host_network()``.
    """

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)
    machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False)
    mac_address = models.CharField(max_length=17, default=None, null=False, editable=False)
    ip_public = models.GenericIPAddressField(default=None, null=False, editable=False)
    ip_local = models.GenericIPAddressField(default=None, null=False, editable=False)
    dns_server = models.GenericIPAddressField(default=None, null=False, editable=False)
    hostname = models.CharField(max_length=63, default=None, null=False)
    iface = models.CharField(max_length=15, default=None, null=False)
    isp = models.CharField(max_length=63, default=None, null=False)
    city = models.CharField(max_length=63, default=None, null=False)
    region = models.CharField(max_length=63, default=None, null=False)
    country = models.CharField(max_length=63, default=None, null=False)
    # num_uses_failed / num_uses_succeeded are inherited from ModelWithHealthStats:
    # num_uses_failed = models.PositiveIntegerField(default=0)
    # num_uses_succeeded = models.PositiveIntegerField(default=0)

    objects = NetworkInterfaceManager()  # pyright: ignore[reportIncompatibleVariableOverride]
    machine_id: uuid.UUID

    class Meta(ModelWithHealthStats.Meta):
        app_label = "machine"
        unique_together = (("machine", "ip_public", "ip_local", "mac_address", "dns_server"),)

    @classmethod
    def current(cls, refresh: bool = False) -> NetworkInterface:
        """
        Return the NetworkInterface row for the current machine's network setup.

        The cached row is reused only when ``refresh`` is False, it belongs to
        the current Machine, and it was modified less than
        NETWORK_INTERFACE_RECHECK_INTERVAL seconds ago. Otherwise the network
        is re-detected and the row is upserted on its identifying fields.
        """
        global _CURRENT_INTERFACE

        machine = Machine.current()
        cached = _CURRENT_INTERFACE
        if cached:
            cache_usable = (
                not refresh
                and cached.machine_id == machine.id
                and timezone.now() < cached.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL)
            )
            if cache_usable:
                return cached
            _CURRENT_INTERFACE = None

        net_info = get_host_network()
        # Pull the identifying fields out of net_info; whatever is left becomes
        # the `defaults` written onto the matched / created row.
        identity = {
            field_name: net_info.pop(field_name)
            for field_name in ("ip_public", "ip_local", "mac_address", "dns_server")
        }
        _CURRENT_INTERFACE, _created = cls.objects.update_or_create(
            machine=machine,
            **identity,
            defaults=net_info,
        )
        return _CURRENT_INTERFACE
|
|
|
|
|
|
class BinaryManager(models.Manager):
    """Manager for Binary with a per-process cache and a lookup for usable binaries."""

    def get_from_db_or_cache(self, name: str, abspath: str = "", version: str = "", sha256: str = "", binprovider: str = "env") -> Binary:
        """
        Get or create a Binary record, going through the in-process cache first.

        The cache is keyed by ``name`` only. A cached row is returned while it
        was modified less than BINARY_RECHECK_INTERVAL seconds ago; otherwise
        the row matching all given fields on the current machine is upserted
        and cached.
        """
        cached = _CURRENT_BINARIES.get(name)
        if cached:
            fresh_until = cached.modified_at + timedelta(seconds=BINARY_RECHECK_INTERVAL)
            if timezone.now() < fresh_until:
                return cached

        binary, _created = self.update_or_create(
            machine=Machine.current(),
            name=name,
            binprovider=binprovider,
            version=version,
            abspath=abspath,
            sha256=sha256,
        )
        _CURRENT_BINARIES[name] = binary
        return binary

    def get_valid_binary(self, name: str, machine: Machine | None = None) -> Binary | None:
        """
        Return the most recently modified Binary called *name* (case-insensitive)
        that has a non-empty abspath, or None if there is none.

        Looks on *machine*, defaulting to the current machine.
        """
        target_machine = machine or Machine.current()
        named = self.filter(machine=target_machine, name__iexact=name)
        with_path = named.exclude(abspath="").exclude(abspath__isnull=True)
        return with_path.order_by("-modified_at").first()
|
|
|
|
|
|
class Binary(ModelWithHealthStats, ModelWithStateMachine):
    """
    Tracks a binary on a specific machine.

    Simple state machine with 2 states:
    - queued: Binary needs to be installed
    - installed: Binary installed successfully (abspath, version, sha256 populated)

    Installation is synchronous during queued→installed transition.
    If installation fails, Binary stays in queued with retry_at set for later retry.

    State machine calls run() which executes on_BinaryRequest__* hooks
    to install the binary using the specified providers.
    """

    class StatusChoices(models.TextChoices):
        QUEUED = "queued", "Queued"
        INSTALLED = "installed", "Installed"

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)
    machine = models.ForeignKey(Machine, on_delete=models.CASCADE, null=False)

    # Binary metadata
    name = models.CharField(max_length=63, default="", null=False, blank=True, db_index=True)
    binproviders = models.CharField(
        max_length=127,
        default="env",
        null=False,
        blank=True,
        help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,env",
    )
    overrides = models.JSONField(
        default=dict,
        blank=True,
        help_text="Provider-specific overrides: {'apt': {'install_args': ['pkg']}, ...}",
    )

    # Installation results (populated after installation)
    binprovider = models.CharField(
        max_length=31,
        default="",
        null=False,
        blank=True,
        help_text="Provider that successfully installed this binary",
    )
    abspath = models.CharField(max_length=255, default="", null=False, blank=True)
    version = models.CharField(max_length=32, default="", null=False, blank=True)
    sha256 = models.CharField(max_length=64, default="", null=False, blank=True)

    # State machine fields
    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED, max_length=16)
    retry_at = ModelWithStateMachine.RetryAtField(
        default=timezone.now,
        help_text="When to retry this binary installation",
    )

    # Health stats
    num_uses_failed = models.PositiveIntegerField(default=0)
    num_uses_succeeded = models.PositiveIntegerField(default=0)

    machine_id: uuid.UUID

    state_machine_name: str | None = "archivebox.machine.models.BinaryMachine"
    active_state: str = StatusChoices.QUEUED

    objects = BinaryManager()  # pyright: ignore[reportIncompatibleVariableOverride]

    class Meta(ModelWithHealthStats.Meta, ModelWithStateMachine.Meta):
        app_label = "machine"
        verbose_name = "Binary"
        verbose_name_plural = "Binaries"
        unique_together = (("machine", "name", "abspath", "version", "sha256"),)

    def __str__(self) -> str:
        return f"{self.name}@{self.binprovider}+{self.abspath}@{self.version}"

    @property
    def is_valid(self) -> bool:
        """A binary is valid if it has a resolved path and is marked installed."""
        return bool(self.abspath) and self.status == self.StatusChoices.INSTALLED

    @cached_property
    def binary_info(self) -> dict:
        """
        Return a summary dict (name, abspath, version, binprovider, is_valid).

        NOTE: this is a cached_property, so the dict reflects the field values
        at first access and is not refreshed if the instance changes later.
        """
        return {
            "name": self.name,
            "abspath": self.abspath,
            "version": self.version,
            "binprovider": self.binprovider,
            "is_valid": self.is_valid,
        }

    @property
    def output_dir(self) -> Path:
        """
        Get output directory for this binary's hook logs.
        Path: data/machines/{machine_uuid}/binaries/{binary_name}/{binary_uuid}
        """
        from django.conf import settings

        return Path(settings.DATA_DIR) / "machines" / str(self.machine_id) / "binaries" / self.name / str(self.id)

    def to_json(self) -> dict:
        """
        Convert Binary model instance to a JSON-serializable dict.

        The record ``type`` is "Binary" when both abspath and version are set,
        otherwise "BinaryRequest".
        """
        from archivebox.config import VERSION

        is_installed = bool(self.abspath and self.version)
        return {
            "type": "Binary" if is_installed else "BinaryRequest",
            "schema_version": VERSION,
            "id": str(self.id),
            "machine_id": str(self.machine_id),
            "name": self.name,
            "binproviders": self.binproviders,
            "overrides": self.overrides,
            "binprovider": self.binprovider,
            "abspath": self.abspath,
            "version": self.version,
            "sha256": self.sha256,
            "status": self.status,
        }

    @staticmethod
    def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None):
        """
        Create/update the Binary for the current machine from a JSON dict.

        Rows are matched on (current machine, name). Three record shapes are handled:
        1. Already installed: truthy 'abspath', 'version' AND 'binproviders'
           -> INSTALLED, keeping the binproviders filter
        2. Install request: has a 'binproviders' key, or has 'overrides' and no abspath
           -> QUEUED with retry_at = now
        3. Hook install result: truthy 'abspath' and 'version' (no binproviders)
           -> INSTALLED

        Args:
            record: JSON dict with 'name' plus the fields described above
            overrides: Not used

        Returns:
            Binary instance, or None if 'name' is missing or no case matches
        """
        name = record.get("name")
        if not name:
            return None

        machine = Machine.current()
        binary_overrides = record.get("overrides", {})
        normalized_overrides = binary_overrides if isinstance(binary_overrides, dict) else {}

        # abx-plugins currently emits a GitHub install URL for readability-extractor,
        # but the package is published on npm. Prefer the registry package to avoid
        # long git-based installs in CI while still using canonical install_args.
        if (
            name == "readability-extractor"
            and isinstance(normalized_overrides.get("npm"), dict)
            and normalized_overrides["npm"].get("install_args") == ["https://github.com/ArchiveBox/readability-extractor"]
        ):
            normalized_overrides = {
                **normalized_overrides,
                "npm": {
                    **normalized_overrides["npm"],
                    "install_args": ["readability-extractor"],
                },
            }

        abspath = record.get("abspath")
        version = record.get("version")
        binproviders = record.get("binproviders")

        # Field values shared by both "installed" cases (1 and 3).
        installed_defaults = {
            "abspath": abspath,
            "version": version,
            "sha256": record.get("sha256", ""),
            "binprovider": record.get("binprovider", "env"),
            "status": Binary.StatusChoices.INSTALLED,
            "retry_at": None,
        }

        # Case 1: Already installed (from on_Crawl hooks) - has abspath AND binproviders
        # This happens when on_Crawl hooks detect already-installed binaries
        if abspath and version and binproviders:
            binary, _ = Binary.objects.update_or_create(
                machine=machine,
                name=name,
                defaults={**installed_defaults, "binproviders": binproviders},  # Preserve the filter
            )
            return binary

        # Case 2: From binaries.json - create queued binary (needs installation)
        if "binproviders" in record or ("overrides" in record and not abspath):
            binary, _ = Binary.objects.update_or_create(
                machine=machine,
                name=name,
                defaults={
                    "binproviders": record.get("binproviders", "env"),
                    "overrides": normalized_overrides,
                    "status": Binary.StatusChoices.QUEUED,
                    "retry_at": timezone.now(),
                },
            )
            return binary

        # Case 3: From on_BinaryRequest__ hook output - update with installation results
        if abspath and version:
            binary, _ = Binary.objects.update_or_create(
                machine=machine,
                name=name,
                defaults=installed_defaults,
            )
            return binary

        return None

    def update_and_requeue(self, **kwargs) -> bool:
        """
        Update binary fields and requeue for worker state machine.

        Sets every keyword argument as an attribute, bumps modified_at and
        saves the model. Always returns True.
        """
        for key, value in kwargs.items():
            setattr(self, key, value)
        self.modified_at = timezone.now()
        self.save()
        return True

    def _allowed_binproviders(self) -> set[str] | None:
        """Return the allowed binproviders for this binary, or None for wildcard."""
        providers = str(self.binproviders or "").strip()
        if not providers or providers == "*":
            return None
        return {provider.strip() for provider in providers.split(",") if provider.strip()}

    def run(self):
        """
        Execute binary installation by running on_BinaryRequest__* hooks.

        Hooks whose plugin directory name is not in the allowed binproviders
        are skipped; the others are run in discovery order. The first hook that
        exits 0 and emits a "Binary" record with an abspath wins: the record's
        abspath/version/sha256/binprovider are copied onto this row, status
        becomes INSTALLED, and the binary is symlinked into settings.LIB_BIN_DIR
        when that setting is present.

        If no hooks exist or none succeeds, the row is left untouched (still
        QUEUED) so it can be retried later.
        """
        import json

        from archivebox.hooks import discover_hooks, run_hook, extract_records_from_process, process_hook_records
        from archivebox.config.configset import get_config
        from django.conf import settings

        # Get merged config (Binary doesn't have crawl/snapshot context).
        config = get_config()

        # ArchiveBox installs the puppeteer package and Chromium in separate
        # hook phases. Suppress puppeteer's bundled browser download during the
        # package install step so the dedicated chromium hook owns that work.
        if self.name == "puppeteer":
            config.setdefault("PUPPETEER_SKIP_DOWNLOAD", "true")
            config.setdefault("PUPPETEER_SKIP_CHROMIUM_DOWNLOAD", "true")

        # Create output directory
        output_dir = self.output_dir
        output_dir.mkdir(parents=True, exist_ok=True)

        # Discover ALL on_BinaryRequest__* hooks
        hooks = discover_hooks("BinaryRequest", config=config)
        if not hooks:
            # No hooks available - stay queued, will retry later
            return

        allowed_binproviders = self._allowed_binproviders()
        # Same for every hook, so serialize once instead of once per iteration.
        overrides_json = json.dumps(self.overrides) if self.overrides else None

        # Run each hook - they decide if they can handle this binary
        for hook in hooks:
            plugin_name = hook.parent.name
            if allowed_binproviders is not None and plugin_name not in allowed_binproviders:
                continue

            plugin_output_dir = output_dir / plugin_name
            plugin_output_dir.mkdir(parents=True, exist_ok=True)

            process = run_hook(
                hook,
                output_dir=plugin_output_dir,
                config=config,
                timeout=600,  # 10 min timeout for binary installation
                binary_id=str(self.id),
                machine_id=str(self.machine_id),
                name=self.name,
                binproviders=self.binproviders,
                overrides=overrides_json,
            )

            # Background hook (unlikely for binary installation, but handle it)
            if process is None:
                continue

            # Failed or skipped hook - try next one
            if process.exit_code != 0:
                continue

            # Parse JSONL output to check for successful installation
            records = extract_records_from_process(process)
            if not records:
                continue

            process_hook_records(records, overrides={})
            binary_records = [record for record in records if record.get("type") == "Binary" and record.get("abspath")]
            if not binary_records:
                continue

            # Update self from successful installation
            record = binary_records[0]
            self.abspath = record["abspath"]
            self.version = record.get("version", "")
            self.sha256 = record.get("sha256", "")
            self.binprovider = record.get("binprovider", "env")
            self.status = self.StatusChoices.INSTALLED
            self.save()

            # Symlink binary into LIB_BIN_DIR if configured
            lib_bin_dir = getattr(settings, "LIB_BIN_DIR", None)
            if lib_bin_dir:
                self.symlink_to_lib_bin(lib_bin_dir)

            return

        # No hook succeeded - leave status as QUEUED (will retry later)
        # Don't set to FAILED since we don't have that status anymore

    def cleanup(self):
        """
        Clean up background binary installation hooks.

        Deletes every ``*.pid`` file found (recursively) under output_dir.
        Called by state machine if needed (not typically used for binaries
        since installations are foreground, but included for consistency).
        """
        output_dir = self.output_dir
        if output_dir.exists():
            for pid_file in output_dir.glob("**/*.pid"):
                pid_file.unlink(missing_ok=True)

    def symlink_to_lib_bin(self, lib_bin_dir: str | Path) -> Path | None:
        """
        Symlink this binary into LIB_BIN_DIR for unified PATH management.

        After a binary is installed by any binprovider (pip, npm, brew, apt, etc),
        we symlink it into LIB_BIN_DIR so that:
        1. All binaries can be found in a single directory
        2. PATH only needs LIB_BIN_DIR prepended (not multiple provider-specific paths)
        3. Binary priorities are clear (symlink points to the canonical install location)

        Special cases:
        - If the resolved binary already lives directly in LIB_BIN_DIR it is
          left untouched and its own path is returned.
        - Binaries inside a macOS ``*.app/Contents/MacOS`` bundle are not
          symlinked: any stale entry of the same name in LIB_BIN_DIR is removed
          and the resolved binary path is returned instead.

        Args:
            lib_bin_dir: Path to LIB_BIN_DIR (e.g., /data/lib/arm64-darwin/bin)

        Returns:
            Path to the symlink (or to the binary itself in the special cases
            above), or None if abspath is empty or a filesystem operation failed

        Example:
            >>> binary = Binary.objects.get(name='yt-dlp')
            >>> binary.symlink_to_lib_bin('/data/lib/arm64-darwin/bin')
            Path('/data/lib/arm64-darwin/bin/yt-dlp')
        """
        if not self.abspath:
            return None

        binary_abspath = Path(self.abspath).resolve()
        lib_bin_dir = Path(lib_bin_dir).resolve()

        # Index of the first "*.app" path component, or -1 when there is none.
        binary_parts = binary_abspath.parts
        app_index = next((index for index, part in enumerate(binary_parts) if part.endswith(".app")), -1)

        # Create LIB_BIN_DIR if it doesn't exist
        try:
            lib_bin_dir.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            print(f"Failed to create LIB_BIN_DIR {lib_bin_dir}: {e}", file=sys.stderr)
            return None

        # Get binary name (last component of path)
        binary_name = binary_abspath.name
        symlink_path = lib_bin_dir / binary_name

        # The binary itself already lives in LIB_BIN_DIR: there is nothing to
        # link, and "replacing" it below would delete the real file and leave a
        # symlink pointing at itself.
        if symlink_path == binary_abspath:
            return symlink_path

        in_macos_app_bundle = (
            app_index != -1
            and len(binary_parts) > app_index + 2
            and binary_parts[app_index + 1 : app_index + 3] == ("Contents", "MacOS")
        )
        if in_macos_app_bundle:
            # App-bundle binaries are used from their original location; only
            # clear out any stale entry with the same name.
            if symlink_path.exists() or symlink_path.is_symlink():
                try:
                    symlink_path.unlink()
                except OSError as e:
                    print(f"Failed to remove existing file at {symlink_path}: {e}", file=sys.stderr)
                    return None
            return binary_abspath

        # Remove existing symlink/file if it exists
        if symlink_path.exists() or symlink_path.is_symlink():
            try:
                # Check if it's already pointing to the right place
                if symlink_path.is_symlink() and symlink_path.resolve() == binary_abspath:
                    # Already correctly symlinked, nothing to do
                    return symlink_path

                # Remove old symlink/file
                symlink_path.unlink()
            except OSError as e:
                print(f"Failed to remove existing file at {symlink_path}: {e}", file=sys.stderr)
                return None

        # Create new symlink
        try:
            symlink_path.symlink_to(binary_abspath)
        except OSError as e:
            print(f"Failed to create symlink {symlink_path} -> {binary_abspath}: {e}", file=sys.stderr)
            return None

        print(f"Symlinked {binary_name} -> {symlink_path}", file=sys.stderr)
        return symlink_path
|
|
|
|
|
|
# =============================================================================
|
|
# Process Model
|
|
# =============================================================================
|
|
|
|
|
|
class ProcessManager(models.Manager):
    """Manager for Process model."""

    def current(self) -> Process:
        """Get the Process record for the current OS process."""
        return Process.current()

    def get_by_pid(self, pid: int, machine: Machine | None = None) -> Process | None:
        """
        Find a Process by PID with proper validation against PID reuse.

        IMPORTANT: PIDs are reused by the OS! This method:
        1. Filters by machine (required - PIDs are only unique per machine)
        2. Filters by time window (rows started more than PID_REUSE_WINDOW ago are ignored)
        3. Validates via psutil that start times match (within START_TIME_TOLERANCE seconds)

        Args:
            pid: OS process ID
            machine: Machine instance (defaults to current machine)

        Returns:
            The newest matching RUNNING Process, or None if psutil is not
            installed, the OS process cannot be inspected, or no row matches
        """
        if not PSUTIL_AVAILABLE:
            return None

        machine = machine or Machine.current()

        # Get the actual process start time from OS
        try:
            os_proc = psutil.Process(pid)
            os_start_time = os_proc.create_time()
        except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
            # Process doesn't exist (or can't be inspected) - treat any DB record with this PID as stale
            return None

        # Query candidates: same machine, same PID, recent, still RUNNING
        candidates = self.filter(
            machine=machine,
            pid=pid,
            status=Process.StatusChoices.RUNNING,
            started_at__gte=timezone.now() - PID_REUSE_WINDOW,
        ).order_by("-started_at")

        for candidate in candidates:
            # Validate start time matches (within tolerance)
            if candidate.started_at:
                db_start_time = candidate.started_at.timestamp()
                if abs(db_start_time - os_start_time) < START_TIME_TOLERANCE:
                    return candidate

        return None

    def create_for_archiveresult(self, archiveresult, **kwargs):
        """
        Create a Process record for an ArchiveResult.

        Called during migration and when creating new ArchiveResults.

        Keyword arguments are passed through to ``create()`` and override the
        defaults (machine = the interface's machine, status = "queued",
        timeout = 120, env = {}). ``iface``, ``pwd`` and ``cmd`` fall back to
        the current NetworkInterface, ``<snapshot output_dir>/<plugin>`` and
        ``[]`` respectively whenever they are missing OR falsy (e.g. None).

        Returns:
            The newly created Process
        """
        iface = kwargs.get("iface") or NetworkInterface.current()

        # Plain defaults that any explicitly passed kwarg may override.
        fields = {
            "machine": iface.machine,
            "status": "queued",
            "timeout": 120,
            "env": {},
            **kwargs,
        }
        # These three are applied AFTER merging kwargs so that a falsy value
        # passed by the caller (None / "" / []) cannot clobber the fallback.
        fields["iface"] = iface
        fields["pwd"] = kwargs.get("pwd") or str(archiveresult.snapshot.output_dir / archiveresult.plugin)
        fields["cmd"] = kwargs.get("cmd") or []

        return self.create(**fields)
|
|
|
|
|
|
class Process(models.Model):
|
|
"""
|
|
Tracks a single OS process execution.
|
|
|
|
Process represents the actual subprocess spawned to execute a hook.
|
|
One Process can optionally be associated with an ArchiveResult (via OneToOne),
|
|
but Process can also exist standalone for internal operations.
|
|
|
|
Follows the unified state machine pattern:
|
|
- queued: Process ready to launch
|
|
- running: Process actively executing
|
|
- exited: Process completed (check exit_code for success/failure)
|
|
|
|
State machine calls launch() to spawn the process and monitors its lifecycle.
|
|
"""
|
|
|
|
class StatusChoices(models.TextChoices):
|
|
QUEUED = "queued", "Queued"
|
|
RUNNING = "running", "Running"
|
|
EXITED = "exited", "Exited"
|
|
|
|
class TypeChoices(models.TextChoices):
    # Kinds of process a row can represent, as (stored value, label).
    # Stored in Process.process_type; _detect_process_type() picks one of
    # these for the current OS process by inspecting sys.argv.
    SUPERVISORD = "supervisord", "Supervisord"
    ORCHESTRATOR = "orchestrator", "Orchestrator"
    WORKER = "worker", "Worker"
    CLI = "cli", "CLI"
    HOOK = "hook", "Hook"
    BINARY = "binary", "Binary"
|
|
|
|
# Primary fields
|
|
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
|
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
|
modified_at = models.DateTimeField(auto_now=True)
|
|
|
|
# Machine FK - required (every process runs on a machine)
|
|
machine = models.ForeignKey(
|
|
Machine,
|
|
on_delete=models.CASCADE,
|
|
null=False,
|
|
related_name="process_set",
|
|
help_text="Machine where this process executed",
|
|
)
|
|
|
|
# Parent process (optional)
|
|
parent = models.ForeignKey(
|
|
"self",
|
|
on_delete=models.SET_NULL,
|
|
null=True,
|
|
blank=True,
|
|
related_name="children",
|
|
help_text="Parent process that spawned this process",
|
|
)
|
|
|
|
# Process type (supervisord, orchestrator, worker, cli, hook, binary)
|
|
process_type = models.CharField(
|
|
max_length=16,
|
|
choices=TypeChoices.choices,
|
|
default=TypeChoices.CLI,
|
|
db_index=True,
|
|
help_text="Type of process (cli, worker, orchestrator, binary, supervisord)",
|
|
)
|
|
|
|
# Worker type (only for WORKER processes: crawl, snapshot, archiveresult)
|
|
worker_type = models.CharField(
|
|
max_length=32,
|
|
default="",
|
|
null=False,
|
|
blank=True,
|
|
db_index=True,
|
|
help_text="Worker type name for WORKER processes (crawl, snapshot, archiveresult)",
|
|
)
|
|
|
|
# Execution metadata
|
|
pwd = models.CharField(
|
|
max_length=512,
|
|
default="",
|
|
null=False,
|
|
blank=True,
|
|
help_text="Working directory for process execution",
|
|
)
|
|
cmd = models.JSONField(
|
|
default=list,
|
|
null=False,
|
|
blank=True,
|
|
help_text="Command as array of arguments",
|
|
)
|
|
env = models.JSONField(
|
|
default=dict,
|
|
null=False,
|
|
blank=True,
|
|
help_text="Environment variables for process",
|
|
)
|
|
timeout = models.IntegerField(
|
|
default=120,
|
|
null=False,
|
|
help_text="Timeout in seconds",
|
|
)
|
|
|
|
# Process results
|
|
pid = models.IntegerField(
|
|
default=None,
|
|
null=True,
|
|
blank=True,
|
|
help_text="OS process ID",
|
|
)
|
|
exit_code = models.IntegerField(
|
|
default=None,
|
|
null=True,
|
|
blank=True,
|
|
help_text="Process exit code (0 = success)",
|
|
)
|
|
stdout = models.TextField(
|
|
default="",
|
|
null=False,
|
|
blank=True,
|
|
help_text="Standard output from process",
|
|
)
|
|
stderr = models.TextField(
|
|
default="",
|
|
null=False,
|
|
blank=True,
|
|
help_text="Standard error from process",
|
|
)
|
|
|
|
# Timing
|
|
started_at = models.DateTimeField(
|
|
default=None,
|
|
null=True,
|
|
blank=True,
|
|
help_text="When process was launched",
|
|
)
|
|
ended_at = models.DateTimeField(
|
|
default=None,
|
|
null=True,
|
|
blank=True,
|
|
help_text="When process completed/terminated",
|
|
)
|
|
|
|
# Optional FKs
|
|
binary = models.ForeignKey(
|
|
Binary,
|
|
on_delete=models.SET_NULL,
|
|
null=True,
|
|
blank=True,
|
|
related_name="process_set",
|
|
help_text="Binary used by this process",
|
|
)
|
|
iface = models.ForeignKey(
|
|
NetworkInterface,
|
|
on_delete=models.SET_NULL,
|
|
null=True,
|
|
blank=True,
|
|
related_name="process_set",
|
|
help_text="Network interface used by this process",
|
|
)
|
|
|
|
# Optional connection URL (for CDP, sonic, etc.)
|
|
url = models.URLField(
|
|
max_length=2048,
|
|
default=None,
|
|
null=True,
|
|
blank=True,
|
|
help_text="Connection URL (CDP endpoint, sonic server, etc.)",
|
|
)
|
|
|
|
# Reverse relation to ArchiveResult (OneToOne from AR side)
|
|
# archiveresult: OneToOneField defined on ArchiveResult model
|
|
|
|
# State machine fields
|
|
status = models.CharField(
|
|
max_length=16,
|
|
choices=StatusChoices.choices,
|
|
default=StatusChoices.QUEUED,
|
|
db_index=True,
|
|
)
|
|
retry_at = models.DateTimeField(
|
|
default=timezone.now,
|
|
null=True,
|
|
blank=True,
|
|
db_index=True,
|
|
help_text="When to retry this process",
|
|
)
|
|
|
|
machine_id: uuid.UUID
|
|
parent_id: uuid.UUID | None
|
|
binary_id: uuid.UUID | None
|
|
children: models.Manager[Process]
|
|
archiveresult: ArchiveResult
|
|
|
|
state_machine_name: str = "archivebox.machine.models.ProcessMachine"
|
|
|
|
objects = ProcessManager() # pyright: ignore[reportIncompatibleVariableOverride]
|
|
|
|
class Meta(TypedModelMeta):
    # Django model options for Process.
    app_label = "machine"
    verbose_name = "Process"
    verbose_name_plural = "Processes"
    indexes = [
        # Composite index over the columns used to select processes by
        # machine and status (presumably for worker polling via retry_at — confirm).
        models.Index(fields=["machine", "status", "retry_at"]),
        # Composite index for looking up results per binary by exit code.
        models.Index(fields=["binary", "exit_code"]),
    ]
|
|
|
|
def __str__(self) -> str:
    """
    Return a short human-readable label: id, first three cmd args, status.

    ``cmd`` is a JSON field, so its items are coerced with ``str()`` before
    joining; a non-string argument must not make ``str(process)`` raise.
    """
    cmd_str = " ".join(str(arg) for arg in self.cmd[:3]) if self.cmd else "(no cmd)"
    return f"Process[{self.id}] {cmd_str} ({self.status})"
|
|
|
|
# Properties that delegate to related objects
|
|
@property
def cmd_version(self) -> str:
    """Version of the linked binary, or an empty string when none is linked."""
    if not self.binary:
        return ""
    return self.binary.version
|
|
|
|
@property
def bin_abspath(self) -> str:
    """Absolute path of the linked binary, or an empty string when none is linked."""
    if not self.binary:
        return ""
    return self.binary.abspath
|
|
|
|
@property
def plugin(self) -> str:
    """Plugin name of the attached ArchiveResult, or "" when there is none."""
    # hasattr() is used because the reverse relation may not exist for this row.
    if not hasattr(self, "archiveresult"):
        return ""
    return self.archiveresult.plugin
|
|
|
|
@property
def hook_name(self) -> str:
    """Hook name of the attached ArchiveResult, or "" when there is none."""
    # hasattr() is used because the reverse relation may not exist for this row.
    if not hasattr(self, "archiveresult"):
        return ""
    return self.archiveresult.hook_name
|
|
|
|
def to_json(self) -> dict:
    """
    Serialize this Process into a JSON-compatible dict.

    The core keys are always present; ``binary_id``, ``pid`` and ``timeout``
    are only added when they hold a truthy value.
    """
    from archivebox.config import VERSION

    started = self.started_at.isoformat() if self.started_at else None
    ended = self.ended_at.isoformat() if self.ended_at else None

    payload = {
        "type": "Process",
        "schema_version": VERSION,
        "id": str(self.id),
        "machine_id": str(self.machine_id),
        "cmd": self.cmd,
        "pwd": self.pwd,
        "status": self.status,
        "exit_code": self.exit_code,
        "started_at": started,
        "ended_at": ended,
    }

    # Optional fields are appended only when set.
    if self.binary_id:
        payload["binary_id"] = str(self.binary_id)
    for optional_field in ("pid", "timeout"):
        value = getattr(self, optional_field)
        if value:
            payload[optional_field] = value
    return payload
|
|
|
|
def hydrate_binary_from_context(self, *, plugin_name: str = "", hook_path: str = "") -> Binary | None:
    """
    Try to attach an already-known Binary to this process.

    Candidate references are collected, in order, from the env values named by
    ``_get_process_binary_env_keys(...)`` and then from ``cmd[0]``. The first
    reference that ``_find_existing_binary_for_reference`` resolves is assigned
    to ``self.binary`` (not saved) and returned.

    Args:
        plugin_name: Passed through to the env-key helper.
        hook_path: Passed through to the env-key helper.

    Returns:
        The matched Binary, or None when no reference resolves.
    """
    machine = self.machine if self.machine_id else Machine.current()

    raw_references = [
        str(self.env.get(env_key) or "").strip()
        for env_key in _get_process_binary_env_keys(plugin_name, hook_path, self.env)
    ]
    if self.cmd:
        raw_references.append(str(self.cmd[0]).strip())

    # dict.fromkeys() de-duplicates while keeping first-seen order.
    candidates = [ref for ref in dict.fromkeys(raw_references) if ref]

    for candidate in candidates:
        found = _find_existing_binary_for_reference(machine, candidate)
        if found:
            self.binary = found
            return found

    return None
|
|
|
|
@classmethod
def parse_records_from_text(cls, text: str) -> list[dict]:
    """
    Parse JSONL text into records using the shared ``parse_line`` helper.

    Lines that do not parse, or whose record has no truthy ``type`` key,
    are dropped. Empty input yields an empty list.
    """
    from archivebox.misc.jsonl import parse_line

    if not text:
        return []

    parsed_lines = (parse_line(raw_line) for raw_line in text.splitlines())
    return [entry for entry in parsed_lines if entry and entry.get("type")]
|
|
|
|
def get_records(self) -> list[dict]:
    """
    Parse JSONL records emitted by this process.

    Uses the ``stdout`` stored on the row; when that is empty, falls back to
    the contents of the stdout log file if it exists.
    """
    text = self.stdout
    if not text:
        log_path = self.stdout_file
        if log_path and log_path.exists():
            text = log_path.read_text()
    return self.parse_records_from_text(text or "")
|
|
|
|
@staticmethod
def from_json(record: dict[str, Any], overrides: dict[str, Any] | None = None):
    """
    Look up an existing Process from a JSON record.

    Only a lookup by ``record["id"]`` is performed; no Process is created or
    updated here.

    Args:
        record: JSON dict; its ``id`` key (if any) identifies the Process.
        overrides: Accepted for signature parity with other ``from_json``
            helpers; currently unused.

    Returns:
        The matching Process, or None when the record has no id, the id is
        malformed, or no such Process exists.
    """
    process_id = record.get("id")
    if not process_id:
        return None

    # Imported here because it is only needed on the lookup path.
    from django.core.exceptions import ValidationError

    try:
        return Process.objects.get(id=process_id)
    except (Process.DoesNotExist, ValidationError, ValueError):
        # A malformed id (not a valid UUID) is treated like "not found"
        # instead of propagating out of a function documented to return None.
        return None
|
|
|
|
def update_and_requeue(self, **kwargs) -> bool:
    """
    Apply field updates and save the row.

    Every keyword argument is set as an attribute, ``modified_at`` is stamped
    with the current time, and the row is saved.

    Returns:
        Always True.
    """
    for field_name in kwargs:
        setattr(self, field_name, kwargs[field_name])
    self.modified_at = timezone.now()
    self.save()
    return True
|
|
|
|
# =========================================================================
|
|
# Process.current() and hierarchy methods
|
|
# =========================================================================
|
|
|
|
@classmethod
def current(cls) -> Process:
    """
    Return the Process row describing the running OS process, creating it if needed.

    Resolution order:
      1. The module-level cache, if it matches this PID and machine and its
         ``modified_at`` is younger than PROCESS_RECHECK_INTERVAL seconds.
      2. The most recent RUNNING row for this machine + PID within
         PID_REUSE_WINDOW whose ``started_at`` is within START_TIME_TOLERANCE
         of the OS-reported start time (only attempted when psutil reports one).
      3. A freshly created RUNNING row.

    On every path the row's ``iface`` is refreshed when it differs from the
    current interface (paths 1 and 2) and ``ensure_log_files()`` is called.

    NOTE(review): cache freshness keys off ``modified_at``, which path 2 only
    bumps when ``iface`` changes — confirm whether the cache is meant to stay
    expired after the first interval.
    """
    global _CURRENT_PROCESS

    my_pid = os.getpid()
    machine = Machine.current()
    iface = NetworkInterface.current()

    # --- 1. module-level cache -------------------------------------------
    cached = _CURRENT_PROCESS
    if cached:
        cache_is_valid = (
            cached.pid == my_pid
            and cached.machine_id == machine.id
            and timezone.now() < cached.modified_at + timedelta(seconds=PROCESS_RECHECK_INTERVAL)
        )
        if cache_is_valid:
            if cached.iface_id != iface.id:
                cached.iface = iface
                cached.save(update_fields=["iface", "modified_at"])
            cached.ensure_log_files()
            return cached
        _CURRENT_PROCESS = None

    # OS-reported start time is what distinguishes us from a recycled PID.
    os_start_time = None
    if PSUTIL_AVAILABLE:
        try:
            os_start_time = psutil.Process(my_pid).create_time()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    # --- 2. existing DB row ----------------------------------------------
    if os_start_time:
        recent_running = cls.objects.filter(
            machine=machine,
            pid=my_pid,
            status=cls.StatusChoices.RUNNING,
            started_at__gte=timezone.now() - PID_REUSE_WINDOW,
        )
        existing = recent_running.order_by("-started_at").first()

        if existing and existing.started_at:
            drift = abs(existing.started_at.timestamp() - os_start_time)
            if drift < START_TIME_TOLERANCE:
                _CURRENT_PROCESS = existing
                if existing.iface_id != iface.id:
                    existing.iface = iface
                    existing.save(update_fields=["iface", "modified_at"])
                _CURRENT_PROCESS.ensure_log_files()
                return existing

    # --- 3. create a new row ---------------------------------------------
    parent = cls._find_parent_process(machine)
    process_type = cls._detect_process_type()

    # Prefer the psutil cmdline (it is what .proc later compares against);
    # sys.argv is the fallback.
    cmd = sys.argv
    if PSUTIL_AVAILABLE:
        try:
            cmd = psutil.Process(my_pid).cmdline()
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            pass

    # Prefer the OS start time over "now" so later start-time matching works.
    if os_start_time:
        started_at = datetime.fromtimestamp(os_start_time, tz=timezone.get_current_timezone())
    else:
        started_at = timezone.now()

    _CURRENT_PROCESS = cls.objects.create(
        machine=machine,
        parent=parent,
        process_type=process_type,
        cmd=cmd,
        pwd=os.getcwd(),
        pid=my_pid,
        started_at=started_at,
        status=cls.StatusChoices.RUNNING,
        iface=iface,
    )
    _CURRENT_PROCESS.ensure_log_files()
    return _CURRENT_PROCESS
|
|
|
|
@classmethod
def _find_parent_process(cls, machine: Machine | None = None) -> Process | None:
    """
    Find the Process row for this process's parent (by PPID).

    A row only matches when it is on the same machine, has the parent's PID,
    is RUNNING, started within PID_REUSE_WINDOW, and its ``started_at`` is
    within START_TIME_TOLERANCE of the OS-reported start time of the parent.

    Args:
        machine: Machine to search on; defaults to ``Machine.current()``.

    Returns:
        The matching parent Process, or None when psutil is unavailable, the
        parent cannot be inspected, or no row matches.
    """
    if not PSUTIL_AVAILABLE:
        return None

    parent_pid = os.getppid()
    machine = machine or Machine.current()

    try:
        parent_started = psutil.Process(parent_pid).create_time()
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return None  # parent is gone or cannot be inspected

    recent_rows = cls.objects.filter(
        machine=machine,
        pid=parent_pid,
        status=cls.StatusChoices.RUNNING,
        started_at__gte=timezone.now() - PID_REUSE_WINDOW,
    ).order_by("-started_at")

    # Newest row whose recorded start time agrees with the OS wins.
    return next(
        (
            row
            for row in recent_rows
            if row.started_at and abs(row.started_at.timestamp() - parent_started) < START_TIME_TOLERANCE
        ),
        None,
    )
|
|
|
|
@classmethod
def _detect_process_type(cls) -> str:
    """
    Classify the current process by looking for markers in ``sys.argv``.

    The lower-cased, space-joined argv is checked against the markers below
    in order; the first hit decides the type. With no hit the process is
    classified as BINARY.
    """
    argv_text = " ".join(sys.argv).lower()
    kinds = cls.TypeChoices

    # Order matters: "archivebox" is a substring of the more specific markers.
    ordered_markers = (
        ("supervisord", kinds.SUPERVISORD),
        ("runner_watch", kinds.WORKER),
        ("archivebox run", kinds.ORCHESTRATOR),
        ("archivebox", kinds.CLI),
    )
    for marker, kind in ordered_markers:
        if marker in argv_text:
            return kind
    return kinds.BINARY
|
|
|
|
@classmethod
def cleanup_stale_running(cls, machine: Machine | None = None) -> int:
    """
    Mark stale RUNNING rows on a machine as EXITED.

    For each RUNNING row, ``poll()`` is consulted first; a non-None result
    counts as cleaned. Otherwise a row is stale when any of these holds:
      - ``started_at + timeout + PROCESS_TIMEOUT_GRACE`` has passed
      - ``started_at`` is older than PID_REUSE_WINDOW
      - psutil cannot open the PID, or its OS start time differs from
        ``started_at`` by more than START_TIME_TOLERANCE

    Stale rows get status EXITED, an ``ended_at`` (kept if already set) and an
    ``exit_code`` (kept if already set, else 0).

    Args:
        machine: Machine to clean up; defaults to ``Machine.current()``.

    Returns:
        Number of rows cleaned up.
    """
    machine = machine or Machine.current()
    cleaned = 0

    running_rows = cls.objects.filter(
        machine=machine,
        status=cls.StatusChoices.RUNNING,
    )

    for row in running_rows:
        # poll() returning an exit code counts the row as cleaned.
        if row.poll() is not None:
            cleaned += 1
            continue

        stale = False

        # Rule 1: exceeded its own timeout plus the grace margin.
        if row.started_at:
            timeout_seconds = max(int(row.timeout or 0), 0)
            deadline = row.started_at + timedelta(seconds=timeout_seconds) + PROCESS_TIMEOUT_GRACE
            if timezone.now() >= deadline:
                stale = True

        if not stale:
            if row.started_at and row.started_at < timezone.now() - PID_REUSE_WINDOW:
                # Rule 2: too old for the PID to still be ours.
                stale = True
            elif PSUTIL_AVAILABLE and row.pid is not None:
                # Rule 3: the OS process is gone or is a different process.
                try:
                    os_proc = psutil.Process(row.pid)
                    if row.started_at:
                        recorded_start = row.started_at.timestamp()
                        actual_start = os_proc.create_time()
                        if abs(recorded_start - actual_start) > START_TIME_TOLERANCE:
                            stale = True
                except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
                    stale = True

        if not stale:
            continue

        row.status = cls.StatusChoices.EXITED
        row.ended_at = row.ended_at or timezone.now()
        row.exit_code = row.exit_code if row.exit_code is not None else 0
        row.save(update_fields=["status", "ended_at", "exit_code"])
        cleaned += 1

    return cleaned
|
|
|
|
# =========================================================================
|
|
# Tree traversal properties
|
|
# =========================================================================
|
|
|
|
@property
def root(self) -> Process:
    """Topmost process of this hierarchy, found by following ``parent`` links."""
    node = self
    # Stop at the first node without a parent_id; that node is the root.
    while node.parent_id:
        node = node.parent
    return node
|
|
|
|
@property
def ancestors(self) -> list[Process]:
    """All ancestor processes, ordered from the direct parent up to the root."""
    chain = []
    node = self.parent
    while node:
        chain.append(node)
        node = node.parent
    return chain
|
|
|
|
@property
def depth(self) -> int:
    """Number of ancestors above this process (0 for a root process)."""
    lineage = self.ancestors
    return len(lineage)
|
|
|
|
def get_descendants(self, include_self: bool = False):
    """
    Return a queryset of every process below this one in the tree.

    The tree is walked level by level, collecting primary keys, and a single
    queryset filtered on those keys is returned.

    Args:
        include_self: When True, this process is part of the result too.
    """
    collected = [self.pk] if include_self else []

    # Breadth-first: each round fetches the children of the previous level.
    level = list(self.children.values_list("pk", flat=True))
    while level:
        collected.extend(level)
        level = list(Process.objects.filter(parent_id__in=level).values_list("pk", flat=True))

    return Process.objects.filter(pk__in=collected)
|
|
|
|
# =========================================================================
|
|
# Validated psutil access via .proc property
|
|
# =========================================================================
|
|
|
|
@property
def proc(self) -> psutil.Process | None:
    """
    Return a validated ``psutil.Process`` for this row, or None.

    A handle is returned only when all of these hold:
      - psutil is available and this row has a PID
      - the row belongs to the current machine
      - a process with that PID exists
      - if ``started_at`` is set, the OS start time is within
        START_TIME_TOLERANCE of it
      - if ``cmd`` is set and the OS cmdline is readable and non-empty, some
        cmdline argument equals ``cmd[0]`` or shares its basename

    Any psutil lookup failure yields None rather than an exception, so a
    process that exits while it is being validated is reported as gone.
    This guards against acting on a recycled PID.
    """
    if not PSUTIL_AVAILABLE:
        return None

    # No PID recorded: nothing to look up.
    if not self.pid:
        return None

    # PIDs are only meaningful on the machine that ran the process.
    if self.machine_id != Machine.current().id:
        return None

    try:
        os_proc = psutil.Process(self.pid)

        # Start-time check lives inside the try: create_time() can itself
        # raise if the start time could not be read when the handle was made.
        if self.started_at:
            os_start_time = os_proc.create_time()
            db_start_time = self.started_at.timestamp()
            if abs(os_start_time - db_start_time) > START_TIME_TOLERANCE:
                return None  # PID reused by a different process
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        return None

    # Extra safety: the recorded binary should appear in the live cmdline.
    if self.cmd:
        try:
            os_cmdline = os_proc.cmdline()
        except (psutil.AccessDenied, psutil.ZombieProcess):
            # Can't read the cmdline; rely on the start-time match above.
            os_cmdline = None
        except psutil.NoSuchProcess:
            # Process exited between the lookup and this check.
            return None

        db_binary = self.cmd[0]
        if os_cmdline and db_binary:
            db_binary_name = Path(db_binary).name
            cmd_matches = any(
                arg == db_binary or Path(arg).name == db_binary_name
                for arg in os_cmdline
                if arg
            )
            if not cmd_matches:
                return None  # different command: PID reused

    return os_proc
|
|
|
|
@property
def is_running(self) -> bool:
    """
    Whether the OS process behind this row is currently running.

    Relies on the validated ``proc`` handle rather than the ``status`` column.
    A process that reports zombie status counts as not running.
    """
    os_proc = self.proc
    if os_proc is None:
        return False

    try:
        is_zombie = os_proc.status() == psutil.STATUS_ZOMBIE
    except Exception:
        # Status could not be determined; defer to is_running() below.
        is_zombie = False

    if is_zombie:
        return False
    return os_proc.is_running()
|
|
|
|
def is_alive(self) -> bool:
    """Method-style alias of the ``is_running`` property."""
    running = self.is_running
    return running
|
|
|
|
def get_memory_info(self) -> dict | None:
    """
    Report memory usage of the live process.

    Returns:
        ``{"rss": ..., "vms": ...}`` taken from psutil's ``memory_info()``,
        or None when there is no validated process or it cannot be queried.
    """
    os_proc = self.proc
    if not os_proc:
        return None
    try:
        usage = os_proc.memory_info()
        return {"rss": usage.rss, "vms": usage.vms}
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        return None
|
|
|
|
def get_cpu_percent(self) -> float | None:
    """
    Report CPU usage of the live process, sampled over a 0.1s interval.

    Returns:
        The value from psutil's ``cpu_percent(interval=0.1)``, or None when
        there is no validated process or it cannot be queried.
    """
    os_proc = self.proc
    if not os_proc:
        return None
    try:
        return os_proc.cpu_percent(interval=0.1)
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        return None
|
|
|
|
def get_children_pids(self) -> list[int]:
    """
    List PIDs of all descendants of the live process, as seen by the OS.

    Returns:
        PIDs from psutil's ``children(recursive=True)``; an empty list when
        there is no validated process or it cannot be queried.
    """
    os_proc = self.proc
    if not os_proc:
        return []
    try:
        descendants = os_proc.children(recursive=True)
        return [descendant.pid for descendant in descendants]
    except (psutil.NoSuchProcess, psutil.AccessDenied):
        return []
|
|
|
|
# =========================================================================
|
|
# Lifecycle methods (launch, kill, poll, wait)
|
|
# =========================================================================
|
|
|
|
@property
def pid_file(self) -> Path | None:
    """Location of ``process.pid`` inside the runtime dir, or None without one."""
    base = self.runtime_dir
    if not base:
        return None
    return base / "process.pid"
|
|
|
|
@property
def cmd_file(self) -> Path | None:
    """Location of ``cmd.sh`` inside the runtime dir, or None without one."""
    base = self.runtime_dir
    if not base:
        return None
    return base / "cmd.sh"
|
|
|
|
@property
def stdout_file(self) -> Path | None:
    """Location of ``stdout.log`` inside the runtime dir, or None without one."""
    base = self.runtime_dir
    if not base:
        return None
    return base / "stdout.log"
|
|
|
|
@property
def stderr_file(self) -> Path | None:
    """Location of ``stderr.log`` inside the runtime dir, or None without one."""
    base = self.runtime_dir
    if not base:
        return None
    return base / "stderr.log"
|
|
|
|
@property
def hook_script_name(self) -> str | None:
    """
    Best-effort hook script filename taken from the command line.

    Only HOOK processes with a non-empty ``cmd`` are considered. The result is
    the basename of the first argument that does not start with ``-``, begins
    with ``on_`` and ends in ``.py``, ``.js`` or ``.sh``; otherwise None.
    """
    is_hook_with_cmd = self.process_type == self.TypeChoices.HOOK and self.cmd
    if not is_hook_with_cmd:
        return None

    # Option-like arguments are skipped; everything else is reduced to a basename.
    basenames = (
        Path(text).name
        for text in map(str, self.cmd)
        if not text.startswith("-")
    )
    return next(
        (
            name
            for name in basenames
            if name.startswith("on_") and Path(name).suffix in {".py", ".js", ".sh"}
        ),
        None,
    )
|
|
|
|
@property
def runtime_dir(self) -> Path | None:
    """
    Directory holding this process's log, pid and cmd files.

    Returns None when ``pwd`` is empty. Processes with a detectable hook
    script get ``<pwd>/.hooks/<hook script name>``; all others use ``pwd``.
    """
    if not self.pwd:
        return None

    working_dir = Path(self.pwd)
    script_name = self.hook_script_name
    if not script_name:
        return working_dir
    return working_dir / ".hooks" / script_name
|
|
|
|
def tail_stdout(self, lines: int = 50, follow: bool = False):
    """
    Tail the stdout log file (like ``tail -n`` or ``tail -f``).

    Args:
        lines: Number of trailing lines to show (default 50). Zero or a
            negative number shows no existing lines.
        follow: If True, keep yielding new lines as they are appended. The
            generator then never finishes on its own; the caller stops it.
            The starting point is estimated at ~100 bytes per line, so the
            backlog may be shorter or longer than ``lines``.

    Yields:
        Lines from stdout without the trailing newline. Nothing is yielded
        when the log file is missing or unreadable.
    """
    log_path = self.stdout_file
    if not log_path or not log_path.exists():
        return

    if not follow:
        # seq[-0:] is the whole sequence, so a non-positive count must be
        # handled explicitly or "show 0 lines" would dump the entire log.
        if lines <= 0:
            return
        try:
            content = log_path.read_text()
        except (OSError, UnicodeError):
            return  # best-effort: an unreadable log yields nothing
        yield from content.splitlines()[-lines:]
        return

    import time

    # errors="replace": the seek below can land inside a multi-byte
    # character, which must not abort the tail with a decode error.
    with open(log_path, errors="replace") as f:
        f.seek(0, 2)  # jump to end to learn the size
        file_size = f.tell()
        # Rough estimate: 100 bytes per line
        seek_pos = max(0, file_size - (max(lines, 0) * 100))
        f.seek(seek_pos)

        # Drop the partial line we most likely landed in.
        if seek_pos > 0:
            f.readline()

        # Existing backlog first...
        for line in f:
            yield line.rstrip("\n")

        # ...then poll for appended lines.
        while True:
            line = f.readline()
            if line:
                yield line.rstrip("\n")
            else:
                time.sleep(0.1)  # Wait before checking again
|
|
|
|
def tail_stderr(self, lines: int = 50, follow: bool = False):
    """
    Tail the stderr log file (like ``tail -n`` or ``tail -f``).

    Args:
        lines: Number of trailing lines to show (default 50). Zero or a
            negative number shows no existing lines.
        follow: If True, keep yielding new lines as they are appended. The
            generator then never finishes on its own; the caller stops it.
            The starting point is estimated at ~100 bytes per line, so the
            backlog may be shorter or longer than ``lines``.

    Yields:
        Lines from stderr without the trailing newline. Nothing is yielded
        when the log file is missing or unreadable.
    """
    log_path = self.stderr_file
    if not log_path or not log_path.exists():
        return

    if not follow:
        # seq[-0:] is the whole sequence, so a non-positive count must be
        # handled explicitly or "show 0 lines" would dump the entire log.
        if lines <= 0:
            return
        try:
            content = log_path.read_text()
        except (OSError, UnicodeError):
            return  # best-effort: an unreadable log yields nothing
        yield from content.splitlines()[-lines:]
        return

    import time

    # errors="replace": the seek below can land inside a multi-byte
    # character, which must not abort the tail with a decode error.
    with open(log_path, errors="replace") as f:
        f.seek(0, 2)  # jump to end to learn the size
        file_size = f.tell()
        # Rough estimate: 100 bytes per line
        seek_pos = max(0, file_size - (max(lines, 0) * 100))
        f.seek(seek_pos)

        # Drop the partial line we most likely landed in.
        if seek_pos > 0:
            f.readline()

        # Existing backlog first...
        for line in f:
            yield line.rstrip("\n")

        # ...then poll for appended lines.
        while True:
            line = f.readline()
            if line:
                yield line.rstrip("\n")
            else:
                time.sleep(0.1)  # Wait before checking again
|
|
|
|
def pipe_stdout(self, lines: int = 10, follow: bool = True):
    """
    Copy this process's stdout log to ``sys.stdout``, flushing after each line.

    Args:
        lines: Number of initial lines to show
        follow: If True, keep printing new lines as they appear
    """
    import sys

    log_lines = self.tail_stdout(lines=lines, follow=follow)
    for text in log_lines:
        print(text, file=sys.stdout, flush=True)
|
|
|
|
def pipe_stderr(self, lines: int = 10, follow: bool = True):
    """
    Copy this process's stderr log to ``sys.stderr``, flushing after each line.

    Args:
        lines: Number of initial lines to show
        follow: If True, keep printing new lines as they appear
    """
    import sys

    log_lines = self.tail_stderr(lines=lines, follow=follow)
    for text in log_lines:
        print(text, file=sys.stderr, flush=True)
|
|
|
|
def _write_pid_file(self) -> None:
    """
    Write the PID file and stamp its mtime with the process start time.

    Does nothing unless ``pid``, ``started_at`` and ``pid_file`` are all set.
    Failure to set the mtime is ignored; the file content is still written.
    """
    if not (self.pid and self.started_at and self.pid_file):
        return

    target = self.pid_file
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(str(self.pid))

    # The mtime doubles as a record of when the process started.
    try:
        stamp = self.started_at.timestamp()
        os.utime(target, (stamp, stamp))
    except OSError:
        pass  # mtime optional, validation degrades gracefully
|
|
|
|
def _write_cmd_file(self) -> None:
    """
    Write ``cmd.sh``, a bash script reproducing this process's command line.

    Does nothing unless both ``cmd`` and ``cmd_file`` are set. Each argument
    is converted with ``str()`` and quoted with ``shlex.quote`` so the script
    re-runs the exact same argv: spaces, quotes, ``$``, backticks,
    backslashes and empty arguments are all preserved literally. The script
    is marked executable on a best-effort basis.
    """
    import shlex

    script_path = self.cmd_file
    if not self.cmd or not script_path:
        return

    script_path.parent.mkdir(parents=True, exist_ok=True)

    # cmd is JSON data, so items may not be strings; coerce before quoting.
    command_line = " ".join(shlex.quote(str(arg)) for arg in self.cmd)
    script_path.write_text("#!/bin/bash\n" + command_line + "\n")

    try:
        script_path.chmod(0o755)
    except OSError:
        pass  # executable bit is a convenience only
|
|
|
|
def ensure_log_files(self) -> None:
    """
    Make sure the runtime dir and the stdout/stderr log files exist.

    Best-effort: does nothing without a runtime dir, and stops silently at
    the first filesystem error.
    """
    base = self.runtime_dir
    if not base:
        return

    try:
        base.mkdir(parents=True, exist_ok=True)
        for log_path in (self.stdout_file, self.stderr_file):
            if not log_path:
                continue
            log_path.parent.mkdir(parents=True, exist_ok=True)
            log_path.touch(exist_ok=True)
    except OSError:
        return
|
|
|
|
def _build_env(self) -> dict:
    """
    Build the environment for the subprocess.

    Starts from a copy of ``os.environ`` and overlays the stored ``env``.
    Values are converted to strings: strings are used as-is, booleans become
    "True"/"False", ints and floats use ``str()``, anything else is JSON
    encoded. Entries whose value is None are skipped.
    """
    import json

    merged = os.environ.copy()

    for name, raw in (self.env or {}).items():
        if raw is None:
            continue
        if isinstance(raw, str):
            text = raw
        elif isinstance(raw, bool):
            # bool is tested before int because bool is a subclass of int
            text = "True" if raw else "False"
        elif isinstance(raw, (int, float)):
            text = str(raw)
        else:
            # Lists, dicts, etc. - serialize to JSON
            text = json.dumps(raw, default=str)
        merged[name] = text

    return merged
|
|
|
|
def launch(self, background: bool = False, cwd: str | None = None) -> Process:
    """
    Spawn the subprocess described by this row and record its state.

    stdout/stderr of the child are appended to the log files in the runtime
    dir. The row is saved as RUNNING right after the spawn. In foreground
    mode the call then waits up to ``timeout`` seconds, force-kills the child
    on timeout (recording exit code 128 + SIGKILL), copies both logs into the
    row and saves it as EXITED.

    Args:
        background: If True, return right after spawning (daemons/bg hooks).
        cwd: Working directory for the subprocess (defaults to ``self.pwd``).

    Returns:
        self, updated with pid, started_at and (foreground only) the results.

    Raises:
        ValueError: ``pwd`` is not set.
        RuntimeError: The log file paths could not be determined.
    """
    import subprocess

    # pwd anchors where the log/pid/cmd files live, so it is mandatory.
    if not self.pwd:
        raise ValueError("Process.pwd must be set before calling launch()")

    run_dir = cwd or self.pwd

    # cmd.sh is written up front so it exists even if the spawn fails.
    self._write_cmd_file()

    stdout_path = self.stdout_file
    stderr_path = self.stderr_file
    for log_path in (stdout_path, stderr_path):
        if log_path:
            log_path.parent.mkdir(parents=True, exist_ok=True)
    if stdout_path is None or stderr_path is None:
        raise RuntimeError("Process log paths could not be determined")

    with open(stdout_path, "a") as out, open(stderr_path, "a") as err:
        child = subprocess.Popen(
            self.cmd,
            cwd=run_dir,
            stdout=out,
            stderr=err,
            env=self._build_env(),
        )

        # Prefer the OS-reported start time; fall back to "now".
        os_started_at = None
        if PSUTIL_AVAILABLE:
            try:
                os_started_at = datetime.fromtimestamp(
                    psutil.Process(child.pid).create_time(),
                    tz=timezone.get_current_timezone(),
                )
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                os_started_at = None
        self.started_at = os_started_at if os_started_at is not None else timezone.now()

        self.pid = child.pid
        self.status = self.StatusChoices.RUNNING
        self.save()

        self._write_pid_file()

        if background:
            return self

        try:
            child.wait(timeout=self.timeout)
            self.exit_code = child.returncode
        except subprocess.TimeoutExpired:
            import signal

            child.kill()
            child.wait()
            self.exit_code = 128 + signal.SIGKILL

        self.ended_at = timezone.now()
        if stdout_path.exists():
            self.stdout = stdout_path.read_text()
        if stderr_path.exists():
            self.stderr = stderr_path.read_text()
        self.status = self.StatusChoices.EXITED
        self.save()

    return self
|
|
|
|
def kill(self, signal_num: int = 15) -> bool:
    """
    Send a signal to this process and mark the record as EXITED.

    The signal is only delivered through ``self.proc``; when that is ``None``
    (no usable OS process handle for this record) nothing is signalled and
    the row is simply finalised if it is not already EXITED.

    Args:
        signal_num: Signal number to send (default 15, i.e. SIGTERM)

    Returns:
        True if the signal was sent, False if there was nothing to signal
        or the process could not be signalled
    """
    exited = self.StatusChoices.EXITED
    target = self.proc

    if target is None:
        # Nothing to signal - only finalise the DB row if it still looks alive
        if self.status != exited:
            self.ended_at = self.ended_at or timezone.now()
            self.status = exited
            self.save()
        return False

    try:
        target.send_signal(signal_num)

        # Record the outcome using the shell convention: 128 + signal number
        self.status = exited
        self.ended_at = timezone.now()
        self.exit_code = 128 + signal_num
        self.save()

        # The PID file is no longer meaningful once the process was signalled
        pid_path = self.pid_file
        if pid_path and pid_path.exists():
            pid_path.unlink(missing_ok=True)

        return True
    except (psutil.NoSuchProcess, psutil.AccessDenied, ProcessLookupError):
        # Process vanished between the handle lookup and the signal.
        # NOTE(review): AccessDenied is also treated as "already exited" here - confirm intended.
        self.ended_at = self.ended_at or timezone.now()
        self.status = exited
        self.save()
        return False
|
|
|
|
def poll(self) -> int | None:
    """
    Check if process has exited and update status if so.

    Cleanup when process exits:
    - Copy stdout/stderr from the log files into the DB fields
      (the files themselves are kept for debugging)
    - Delete PID file

    The exit code is resolved in this order:
    1. an exit code already recorded on this row,
    2. the status obtained while reaping the OS process (signal deaths are
       stored as 128 + signal number),
    3. 0 when nothing better is known.
    A stored value of -1 is rewritten to 137.

    Returns:
        exit_code if exited, None if still running
    """
    if self.status == self.StatusChoices.EXITED:
        if self.exit_code == -1:
            self.exit_code = 137
            self.save(update_fields=["exit_code"])
        return self.exit_code

    if self.is_running:
        return None  # Still running

    # Reap child process if it's a zombie (best-effort) and keep whatever
    # exit status the OS hands back so a failed process is not recorded as 0.
    reaped_status = None
    proc = self.proc
    if proc is not None:
        try:
            reaped_status = proc.wait(timeout=0)
        except Exception:
            # Deliberately best-effort: failing to reap must not block cleanup
            reaped_status = None

    # Process exited - read output and copy to DB
    # TODO: log files and cmd.sh are intentionally kept on disk for debugging for now
    if self.stdout_file and self.stdout_file.exists():
        self.stdout = self.stdout_file.read_text()
    if self.stderr_file and self.stderr_file.exists():
        self.stderr = self.stderr_file.read_text()

    # Clean up PID file (not needed for debugging)
    if self.pid_file and self.pid_file.exists():
        self.pid_file.unlink(missing_ok=True)

    if self.exit_code is None:
        if isinstance(reaped_status, int):
            # psutil reports "killed by signal N" as -N; normalise to 128 + N
            self.exit_code = 128 - reaped_status if reaped_status < 0 else reaped_status
        else:
            # Exit status unknown (e.g. not our child or already reaped)
            self.exit_code = 0
    if self.exit_code == -1:
        self.exit_code = 137

    self.ended_at = timezone.now()
    self.status = self.StatusChoices.EXITED
    self.save()
    return self.exit_code
|
|
|
|
def wait(self, timeout: int | None = None) -> int:
    """
    Wait for process to exit, polling every 0.1 seconds.

    Args:
        timeout: Max seconds to wait. A falsy value (None or 0) falls back
            to self.timeout. For HOOK processes the value is additionally
            capped at CONSTANTS.MAX_HOOK_RUNTIME_SECONDS.

    Returns:
        exit_code as reported by poll()

    Raises:
        TimeoutError if process doesn't exit in time
    """
    import time
    from archivebox.config.constants import CONSTANTS

    timeout = timeout or self.timeout
    if self.process_type == self.TypeChoices.HOOK:
        timeout = min(int(timeout), int(CONSTANTS.MAX_HOOK_RUNTIME_SECONDS))

    # Monotonic clock: immune to wall-clock jumps (NTP sync, manual changes)
    # that could otherwise shorten or extend the wait arbitrarily.
    start = time.monotonic()

    while True:
        exit_code = self.poll()
        if exit_code is not None:
            return exit_code

        if time.monotonic() - start > timeout:
            raise TimeoutError(f"Process {self.id} did not exit within {timeout}s")

        time.sleep(0.1)
|
|
|
|
def terminate(self, graceful_timeout: float = 5.0) -> bool:
    """
    Gracefully terminate process: SIGTERM → wait → SIGKILL.

    This consolidates the scattered SIGTERM/SIGKILL logic from:
    - crawls/models.py Crawl.cleanup()
    - workers/pid_utils.py stop_worker()
    - supervisord_util.py stop_existing_supervisord_process()

    The recorded exit_code follows the 128 + signal number convention used
    elsewhere in this class: a signal death reported by psutil as -N is
    stored as 128 + N, and a forced kill is stored as 128 + SIGKILL.

    Args:
        graceful_timeout: Seconds to wait after SIGTERM before SIGKILL

    Returns:
        True if process was terminated, False if already dead
    """
    import signal

    proc = self.proc
    if proc is None:
        # Already dead - just update status
        if self.status != self.StatusChoices.EXITED:
            self.status = self.StatusChoices.EXITED
            self.ended_at = self.ended_at or timezone.now()
            self.save()
        return False

    try:
        # Step 1: Send SIGTERM for graceful shutdown
        proc.terminate()

        # Step 2: Wait for graceful exit
        try:
            exit_status = proc.wait(timeout=graceful_timeout)
        except psutil.TimeoutExpired:
            # Step 3: Still running - force kill with SIGKILL
            proc.kill()
            try:
                proc.wait(timeout=2)
            except psutil.TimeoutExpired:
                # SIGKILL cannot be ignored, but delivery may be delayed
                # (e.g. the process is blocked in the kernel). Do not let the
                # timeout escape and leave this row stuck in RUNNING.
                pass
            recorded_exit_code = 128 + signal.SIGKILL
        else:
            if exit_status is None:
                # psutil could not obtain a status (e.g. not our child)
                recorded_exit_code = 0
            elif exit_status < 0:
                # psutil reports "killed by signal N" as -N
                recorded_exit_code = 128 - exit_status
            else:
                recorded_exit_code = exit_status
    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        # Process already dead
        self.status = self.StatusChoices.EXITED
        self.ended_at = self.ended_at or timezone.now()
        self.save()
        return False

    self.exit_code = recorded_exit_code
    self.status = self.StatusChoices.EXITED
    self.ended_at = timezone.now()
    self.save()
    return True
|
|
|
|
def kill_tree(self, graceful_timeout: float = 2.0) -> int:
    """
    Kill this process and all its children (OS children, not DB children) in parallel.

    SIGTERM is sent to every process in the tree first (children, then the
    parent), then the whole set is waited on together with a single shared
    deadline. Anything still alive afterwards receives SIGKILL.

    Signals are delivered through the psutil.Process handles rather than raw
    PIDs, and exits are detected with psutil.wait_procs(), so unreaped
    zombie children count as exited instead of being mistaken for survivors.

    This consolidates the scattered child-killing logic from:
    - crawls/models.py Crawl.cleanup() os.killpg()
    - supervisord_util.py stop_existing_supervisord_process()

    Args:
        graceful_timeout: Seconds to wait after SIGTERM before SIGKILL

    Returns:
        Number of processes killed (including self)
    """
    import signal

    killed_count = 0
    used_sigkill = False

    proc = self.proc
    if proc is None:
        # Already dead
        if self.status != self.StatusChoices.EXITED:
            self.status = self.StatusChoices.EXITED
            self.ended_at = self.ended_at or timezone.now()
            self.save()
        return 0

    try:
        # Phase 1: snapshot the tree and send SIGTERM to all of it (non-blocking)
        children = proc.children(recursive=True)
        all_procs = children + [proc]

        for target in all_procs:
            try:
                target.terminate()
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                pass

        # Phase 2: wait for the whole tree at once, bounded by graceful_timeout
        gone, alive = psutil.wait_procs(all_procs, timeout=graceful_timeout)
        killed_count += len(gone)

        # Phase 3: SIGKILL any stragglers that exceeded the timeout
        for straggler in alive:
            try:
                straggler.kill()
            except (psutil.NoSuchProcess, psutil.AccessDenied):
                continue
            killed_count += 1
            used_sigkill = True

        # Update self status
        if used_sigkill:
            self.exit_code = 128 + signal.SIGKILL
        else:
            self.exit_code = 128 + signal.SIGTERM if killed_count > 0 else 0
        self.status = self.StatusChoices.EXITED
        self.ended_at = timezone.now()
        self.save()

        return killed_count

    except (psutil.NoSuchProcess, psutil.AccessDenied, psutil.ZombieProcess):
        # Process tree already dead
        self.status = self.StatusChoices.EXITED
        self.ended_at = self.ended_at or timezone.now()
        self.save()
        return killed_count
|
|
|
|
def kill_children_db(self) -> int:
    """
    Terminate every RUNNING child Process row linked to this one.

    Works on the DB relation (``self.children``), unlike kill_tree() which
    walks the OS process tree. Each child is stopped via its own terminate().

    Returns:
        Number of children for which terminate() reported success
    """
    running_children = self.children.filter(status=self.StatusChoices.RUNNING)
    return sum(1 for child in running_children if child.terminate())
|
|
|
|
# =========================================================================
|
|
# Class methods for querying processes
|
|
# =========================================================================
|
|
|
|
@classmethod
def get_running(cls, process_type: str | None = None, machine: Machine | None = None) -> QuerySet[Process]:
    """
    Build a queryset of Process rows whose status is RUNNING.

    Replaces:
    - workers/pid_utils.py get_all_worker_pids()
    - workers/orchestrator.py get_total_worker_count()

    Args:
        process_type: Optional TypeChoices value (e.g., 'worker', 'hook');
            when falsy, no type filter is applied
        machine: Machine to scope the query to (falls back to Machine.current())

    Returns:
        QuerySet of running Process records
    """
    criteria = {
        "machine": machine or Machine.current(),
        "status": cls.StatusChoices.RUNNING,
    }
    if process_type:
        criteria["process_type"] = process_type
    return cls.objects.filter(**criteria)
|
|
|
|
@classmethod
def get_running_count(cls, process_type: str | None = None, machine: Machine | None = None) -> int:
    """
    Count the Process rows returned by get_running() for the same filters.

    Replaces:
    - workers/pid_utils.py get_running_worker_count()
    """
    running = cls.get_running(process_type=process_type, machine=machine)
    return running.count()
|
|
|
|
@classmethod
def stop_all(cls, process_type: str | None = None, machine: Machine | None = None, graceful: bool = True) -> int:
    """
    Stop every process returned by get_running() for the given filters.

    Args:
        process_type: Filter by TypeChoices
        machine: Filter by machine
        graceful: If True each process is stopped with terminate(),
            otherwise with kill()

    Returns:
        Number of processes whose stop call reported success
    """
    stopped = 0
    for proc in cls.get_running(process_type=process_type, machine=machine):
        was_stopped = proc.terminate() if graceful else proc.kill()
        if was_stopped:
            stopped += 1
    return stopped
|
|
|
|
@classmethod
def get_next_worker_id(cls, process_type: str = "worker", machine: Machine | None = None) -> int:
    """
    Return the ID to assign to the next spawned worker.

    Replaces workers/pid_utils.py get_next_worker_id().
    The value is simply the current number of running processes of the
    given type, as reported by get_running_count().

    Args:
        process_type: Worker type to count
        machine: Machine to scope query

    Returns:
        Next worker ID (0-indexed)
    """
    running_total = cls.get_running_count(process_type=process_type, machine=machine)
    return running_total
|
|
|
|
@classmethod
def cleanup_orphaned_chrome(cls) -> int:
    """
    Kill orphaned Chrome processes using chrome_utils.js killZombieChrome.

    Runs ``node <plugins/chrome/chrome_utils.js> killZombieChrome <DATA_DIR>``
    with a 30 second timeout and parses the helper's stdout as the number of
    processes it killed. Which PID files the helper inspects and how it
    decides a Chrome is orphaned is implemented in the JS helper, not here.

    Called by:
    - Orchestrator on startup (cleanup from previous crashes)
    - Orchestrator periodically (every N minutes)

    Returns:
        Number of zombie Chrome processes killed; 0 when the helper script
        is missing, exits non-zero, times out, or prints a non-integer
    """
    import subprocess
    from pathlib import Path
    from django.conf import settings

    helper_script = Path(__file__).parent.parent / "plugins" / "chrome" / "chrome_utils.js"
    if not helper_script.exists():
        return 0

    try:
        completed = subprocess.run(
            ["node", str(helper_script), "killZombieChrome", str(settings.DATA_DIR)],
            capture_output=True,
            timeout=30,
            text=True,
        )
        if completed.returncode != 0:
            return 0

        # Helper prints the kill count on stdout; int() raises ValueError otherwise
        killed = int(completed.stdout.strip())
        if killed > 0:
            print(f"[yellow]🧹 Cleaned up {killed} orphaned Chrome processes[/yellow]")
        return killed
    except (subprocess.TimeoutExpired, ValueError, FileNotFoundError) as e:
        print(f"[red]Failed to cleanup orphaned Chrome: {e}[/red]")

    return 0
|
|
|
|
@classmethod
def cleanup_orphaned_workers(cls) -> int:
    """
    Mark orphaned worker/hook processes as EXITED in the DB.

    Only rows with status RUNNING and type WORKER or HOOK are considered,
    and rows whose ``is_running`` is false are skipped. A remaining row is
    left alone when:
    - it is its own root (standalone worker/hook run, e.g.
      archivebox run --snapshot-id), or
    - its root is an orchestrator/cli process that is still running.

    Everything else is treated as orphaned: status is set to EXITED,
    ended_at is filled in if empty, and exit_code defaults to 0 if unset.

    Returns:
        Number of Process records marked as EXITED
    """
    child_types = (cls.TypeChoices.WORKER, cls.TypeChoices.HOOK)
    supervisor_types = (cls.TypeChoices.ORCHESTRATOR, cls.TypeChoices.CLI)

    candidates = cls.objects.filter(
        process_type__in=list(child_types),
        status=cls.StatusChoices.RUNNING,
    )

    cleaned = 0
    for proc in candidates:
        if not proc.is_running:
            continue

        root = proc.root

        # Standalone worker/hook process (run directly)
        is_standalone = root.id == proc.id and root.process_type in child_types
        if is_standalone:
            continue

        # Root is an active orchestrator/cli - the process is still supervised
        has_live_supervisor = root.process_type in supervisor_types and root.is_running
        if has_live_supervisor:
            continue

        if proc.exit_code is None:
            proc.exit_code = 0
        proc.ended_at = proc.ended_at or timezone.now()
        proc.status = cls.StatusChoices.EXITED
        proc.save(update_fields=["status", "ended_at", "exit_code"])
        cleaned += 1

    if cleaned:
        print(f"[yellow]🧹 Cleaned up {cleaned} orphaned worker/hook process record(s)[/yellow]")
    return cleaned
|
|
|
|
|
|
# =============================================================================
|
|
# Binary State Machine
|
|
# =============================================================================
|
|
|
|
|
|
class BinaryMachine(BaseStateMachine):
    """
    State machine driving the installation lifecycle of a Binary.

    Two states:

        QUEUED      binary still needs to be installed
           |
           |  tick() when can_install()
           |  (installation runs synchronously inside the transition)
           v
        INSTALLED   Binary.run() left the row in INSTALLED status;
                    health stats are incremented on entry

    When installation fails, on_install() re-queues the Binary with retry_at
    pushed 5 minutes into the future, records a failed health stat, and
    raises to abort the transition.
    """

    model_attr_name = "binary"
    binary: Binary

    # States
    queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
    installed = State(value=Binary.StatusChoices.INSTALLED, final=True)

    # Tick event: stay queued until installable, then install during the transition
    tick = (
        queued.to.itself(unless="can_install")
        | queued.to(installed, cond="can_install", on="on_install")
    )

    def can_install(self) -> bool:
        """Return True when the binary has both a name and binproviders set."""
        binary = self.binary
        return bool(binary.name and binary.binproviders)

    @queued.enter
    def enter_queued(self):
        """Persist the QUEUED status and make the binary due immediately."""
        self.binary.update_and_requeue(
            retry_at=timezone.now(),
            status=Binary.StatusChoices.QUEUED,
        )

    def on_install(self):
        """Run the installation synchronously as part of the queued→installed transition."""
        import sys

        print(f"[cyan] 🔄 BinaryMachine.on_install() - installing {self.binary.name}[/cyan]", file=sys.stderr)

        # Binary.run() performs the install and writes its result to the DB,
        # so reload our in-memory copy before inspecting the status.
        self.binary.run()
        self.binary.refresh_from_db()

        if self.binary.status == Binary.StatusChoices.INSTALLED:
            print(f"[cyan] ✅ BinaryMachine - {self.binary.name} installed successfully[/cyan]", file=sys.stderr)
            return

        # Installation failed - abort transition, stay in queued
        print(f"[red] ❌ BinaryMachine - {self.binary.name} installation failed, retrying later[/red]", file=sys.stderr)

        # Try again in 5 minutes, and make sure the row stays QUEUED
        self.binary.update_and_requeue(
            retry_at=timezone.now() + timedelta(seconds=300),
            status=Binary.StatusChoices.QUEUED,
        )

        # Record the failure in the health stats
        self.binary.increment_health_stats(success=False)

        # Raising here aborts the transition so the machine remains in queued
        raise Exception(f"Binary {self.binary.name} installation failed")

    @installed.enter
    def enter_installed(self):
        """Persist the INSTALLED status, clear retry_at, and record a success."""
        self.binary.update_and_requeue(
            retry_at=None,
            status=Binary.StatusChoices.INSTALLED,
        )

        # Increment health stats
        self.binary.increment_health_stats(success=True)
|
|
|
|
|
|
# =============================================================================
|
|
# Process State Machine
|
|
# =============================================================================
|
|
|
|
|
|
class ProcessMachine(BaseStateMachine):
    """
    State machine tracking the lifecycle of a Process (OS subprocess) record.

    Lifecycle:

        QUEUED    process is ready to launch
           |
           |  tick() when can_start()   (or the explicit `launch` event)
           v
        RUNNING   enter_running() locks the row until now + timeout and
                  stamps started_at; the actual subprocess spawning is not
                  done here yet (see the placeholder note in enter_running)
           |
           |  tick() when is_exited()   (or the explicit `kill` event)
           v
        EXITED    exit_code is set; retry_at cleared and ended_at stamped

    Note: This machine only covers execution lifecycle. Archival-specific
    logic (status, output parsing, etc.) belongs to ArchiveResult.
    """

    model_attr_name = "process"
    process: Process

    # States
    queued = State(value=Process.StatusChoices.QUEUED, initial=True)
    running = State(value=Process.StatusChoices.RUNNING)
    exited = State(value=Process.StatusChoices.EXITED, final=True)

    # Tick event - which transition fires depends on the guard conditions
    tick = (
        queued.to.itself(unless="can_start")
        | queued.to(running, cond="can_start")
        | running.to.itself(unless="is_exited")
        | running.to(exited, cond="is_exited")
    )

    # Explicit events for callers that want direct control
    launch = queued.to(running)
    kill = running.to(exited)

    def can_start(self) -> bool:
        """Return True when the process has both a cmd and a machine."""
        process = self.process
        return bool(process.cmd and process.machine)

    def is_exited(self) -> bool:
        """Return True once an exit_code has been recorded."""
        return self.process.exit_code is not None

    @queued.enter
    def enter_queued(self):
        """Persist the QUEUED status and make the process due immediately."""
        self.process.update_and_requeue(
            retry_at=timezone.now(),
            status=Process.StatusChoices.QUEUED,
        )

    @running.enter
    def enter_running(self):
        """Mark the process as RUNNING and lock it for the duration of its timeout."""
        # Lock the process while it runs
        locked_until = timezone.now() + timedelta(seconds=self.process.timeout)
        self.process.update_and_requeue(
            retry_at=locked_until,
            status=Process.StatusChoices.RUNNING,
            started_at=timezone.now(),
        )

        # NOTE: Placeholder. No subprocess is spawned here; Process currently
        # only tracks execution metadata and the real spawning is still done
        # by run_hook(). Until that is refactored, the process is marked as
        # immediately exited with a success code.
        self.process.exit_code = 0  # Placeholder
        self.process.save()

    @exited.enter
    def enter_exited(self):
        """Persist the EXITED status, clear retry_at, and stamp ended_at."""
        self.process.update_and_requeue(
            retry_at=None,
            status=Process.StatusChoices.EXITED,
            ended_at=timezone.now(),
        )
|
|
|
|
|
|
# =============================================================================
|
|
# State Machine Registration
|
|
# =============================================================================
|
|
|
|
# Explicitly register both state machine classes with the python-statemachine
# registry (imported above as `statemachine.registry`). This runs at import
# time, after both classes have been fully defined.
# NOTE(review): presumably needed so the machines can be looked up by name - confirm.
registry.register(BinaryMachine)
registry.register(ProcessMachine)
|