mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 17:35:45 +10:00
351 lines
15 KiB
Python
Executable File
351 lines
15 KiB
Python
Executable File
__package__ = 'archivebox.machine'
|
|
|
|
import socket
|
|
from archivebox.uuid_compat import uuid7
|
|
from datetime import timedelta
|
|
|
|
from django.db import models
|
|
from django.utils import timezone
|
|
from django.utils.functional import cached_property
|
|
|
|
from archivebox.base_models.models import ModelWithHealthStats
|
|
from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
|
|
|
|
_CURRENT_MACHINE = None
|
|
_CURRENT_INTERFACE = None
|
|
_CURRENT_BINARIES = {}
|
|
|
|
MACHINE_RECHECK_INTERVAL = 7 * 24 * 60 * 60
|
|
NETWORK_INTERFACE_RECHECK_INTERVAL = 1 * 60 * 60
|
|
INSTALLED_BINARY_RECHECK_INTERVAL = 1 * 30 * 60
|
|
|
|
|
|
class MachineManager(models.Manager):
    """Manager for Machine that adds a shortcut to the current machine's row."""

    def current(self) -> 'Machine':
        """Return the result of Machine.current() (cached lookup/refresh)."""
        machine = Machine.current()
        return machine
|
|
|
|
|
|
class Machine(ModelWithHealthStats):
    """
    One row per host, identified by the GUID returned from get_host_guid().

    Stores the hostname, hardware/virtualisation details, OS details, a JSON
    stats snapshot and machine-specific config overrides.
    """

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    # Identity of the host
    guid = models.CharField(max_length=64, default=None, null=False, unique=True, editable=False)
    hostname = models.CharField(max_length=63, default=None, null=False)

    # Hardware / virtualisation details
    hw_in_docker = models.BooleanField(default=False, null=False)
    hw_in_vm = models.BooleanField(default=False, null=False)
    hw_manufacturer = models.CharField(max_length=63, default=None, null=False)
    hw_product = models.CharField(max_length=63, default=None, null=False)
    hw_uuid = models.CharField(max_length=255, default=None, null=False)

    # Operating system details
    os_arch = models.CharField(max_length=15, default=None, null=False)
    os_family = models.CharField(max_length=15, default=None, null=False)
    os_platform = models.CharField(max_length=63, default=None, null=False)
    os_release = models.CharField(max_length=63, default=None, null=False)
    os_kernel = models.CharField(max_length=255, default=None, null=False)

    # Free-form JSON payloads
    stats = models.JSONField(default=dict, null=False)
    config = models.JSONField(
        default=dict,
        null=False,
        blank=True,
        help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)",
    )

    num_uses_failed = models.PositiveIntegerField(default=0)
    num_uses_succeeded = models.PositiveIntegerField(default=0)

    objects: MachineManager = MachineManager()
    networkinterface_set: models.Manager['NetworkInterface']

    @classmethod
    def current(cls) -> 'Machine':
        """
        Return the Machine row whose guid matches get_host_guid().

        A module-level cache is consulted first; it is trusted while the cached
        row's modified_at is younger than MACHINE_RECHECK_INTERVAL seconds.
        Otherwise the host details are re-detected and written back with
        update_or_create(), and the cache is replaced with the saved row.
        """
        global _CURRENT_MACHINE

        cached = _CURRENT_MACHINE
        if cached:
            now = timezone.now()
            if now < cached.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL):
                return cached
            # Stale: clear the cache before re-detecting so a failure below
            # does not leave the outdated row in place.
            _CURRENT_MACHINE = None

        host_guid = get_host_guid()
        detected = {
            'hostname': socket.gethostname(),
            **get_os_info(),
            **get_vm_info(),
            'stats': get_host_stats(),
        }
        _CURRENT_MACHINE, _ = cls.objects.update_or_create(guid=host_guid, defaults=detected)
        return _CURRENT_MACHINE
|
|
|
|
|
|
class NetworkInterfaceManager(models.Manager):
    """Manager for NetworkInterface that adds a shortcut to the current interface."""

    def current(self) -> 'NetworkInterface':
        """Return the result of NetworkInterface.current() (cached lookup/refresh)."""
        interface = NetworkInterface.current()
        return interface
|
|
|
|
|
|
class NetworkInterface(ModelWithHealthStats):
    """
    A network configuration observed on a Machine.

    A row is identified by the combination of machine, public IP, local IP,
    MAC address and DNS server (see Meta.unique_together); the remaining
    columns are descriptive details stored alongside that identity.
    """

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    # Identity columns (together with `machine` they form the unique key)
    machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False)
    mac_address = models.CharField(max_length=17, default=None, null=False, editable=False)
    ip_public = models.GenericIPAddressField(default=None, null=False, editable=False)
    ip_local = models.GenericIPAddressField(default=None, null=False, editable=False)
    dns_server = models.GenericIPAddressField(default=None, null=False, editable=False)

    # Descriptive columns
    hostname = models.CharField(max_length=63, default=None, null=False)
    iface = models.CharField(max_length=15, default=None, null=False)
    isp = models.CharField(max_length=63, default=None, null=False)
    city = models.CharField(max_length=63, default=None, null=False)
    region = models.CharField(max_length=63, default=None, null=False)
    country = models.CharField(max_length=63, default=None, null=False)

    num_uses_failed = models.PositiveIntegerField(default=0)
    num_uses_succeeded = models.PositiveIntegerField(default=0)

    objects: NetworkInterfaceManager = NetworkInterfaceManager()

    class Meta:
        unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),)

    @classmethod
    def current(cls) -> 'NetworkInterface':
        """
        Return the NetworkInterface row matching the output of get_host_network().

        A module-level cache is consulted first; it is trusted while the cached
        row's modified_at is younger than NETWORK_INTERFACE_RECHECK_INTERVAL
        seconds. Otherwise the network info is re-detected and written back
        with update_or_create(), and the cache is replaced with the saved row.
        """
        global _CURRENT_INTERFACE

        cached = _CURRENT_INTERFACE
        if cached:
            now = timezone.now()
            if now < cached.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL):
                return cached
            # Stale: clear the cache before re-detecting.
            _CURRENT_INTERFACE = None

        machine = Machine.objects.current()
        net_info = get_host_network()

        # Pull the identity fields out of net_info; whatever is left over is
        # passed as `defaults` and only used to create/update the row.
        identity = {
            field: net_info.pop(field)
            for field in ('ip_public', 'ip_local', 'mac_address', 'dns_server')
        }
        _CURRENT_INTERFACE, _ = cls.objects.update_or_create(
            machine=machine,
            **identity,
            defaults=net_info,
        )
        return _CURRENT_INTERFACE
|
|
|
|
|
|
class DependencyManager(models.Manager):
    """Manager for Dependency with a convenience get-or-create for extractors."""

    def get_or_create_for_extractor(
        self,
        bin_name: str,
        bin_providers: str = '*',
        overrides: dict | None = None,
        config: dict | None = None,
    ) -> 'Dependency':
        """
        Get or create a Dependency for an extractor's binary.

        Args:
            bin_name: Name of the binary; used as the lookup key.
            bin_providers: Provider list stored on a newly created row.
            overrides: Overrides stored on a newly created row ({} when omitted).
            config: Config stored on a newly created row ({} when omitted).

        Returns:
            The existing Dependency with this bin_name, or a newly created one.
            An existing row is returned as-is; the other arguments are only
            applied when the row is created.
        """
        dependency, _ = self.get_or_create(
            bin_name=bin_name,
            defaults={
                'bin_providers': bin_providers,
                # Fresh dicts per call so rows never share a mutable default.
                'overrides': overrides or {},
                'config': config or {},
            },
        )
        return dependency
|
|
|
|
|
|
class Dependency(models.Model):
    """
    Defines a binary dependency needed by an extractor.

    This model tracks what binaries need to be installed and how to install them.
    Provider hooks listen for Dependency creation events and attempt installation.

    Example:
        Dependency.objects.get_or_create(
            bin_name='wget',
            bin_providers='apt,brew,pip,env',
            overrides={
                'apt': {'packages': ['wget']},
                'brew': {'packages': ['wget']},
                'pip': {'packages': ['wget']},
            }
        )
    """

    BIN_PROVIDER_CHOICES = (
        ('*', 'Any'),
        ('apt', 'apt'),
        ('brew', 'brew'),
        ('pip', 'pip'),
        ('npm', 'npm'),
        ('gem', 'gem'),
        ('nix', 'nix'),
        ('env', 'env (already in PATH)'),
        ('custom', 'custom'),
    )

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    bin_name = models.CharField(max_length=63, unique=True, db_index=True,
                                help_text="Binary executable name (e.g., wget, yt-dlp, chromium)")
    bin_providers = models.CharField(max_length=127, default='*',
                                     help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any")
    overrides = models.JSONField(default=dict, blank=True,
                                 help_text="JSON map matching abx-pkg Binary.overrides format: {'pip': {'packages': ['pkg']}, 'apt': {'packages': ['pkg']}}")
    config = models.JSONField(default=dict, blank=True,
                              help_text="JSON map of env var config to use during install")

    objects: DependencyManager = DependencyManager()

    class Meta:
        verbose_name = 'Dependency'
        verbose_name_plural = 'Dependencies'

    def __str__(self) -> str:
        return f'{self.bin_name} (providers: {self.bin_providers})'

    def allows_provider(self, provider: str) -> bool:
        """
        Check if this dependency allows the given provider.

        Returns True when bin_providers is '*' or when `provider` is one of the
        comma-separated names in bin_providers. Whitespace around each name is
        ignored, so 'apt, brew' is treated the same as 'apt,brew'.
        """
        if self.bin_providers.strip() == '*':
            return True
        allowed = [name.strip() for name in self.bin_providers.split(',')]
        return provider in allowed

    def get_overrides_for_provider(self, provider: str) -> dict | None:
        """Get the overrides for a provider, or None if not specified."""
        return self.overrides.get(provider)

    @property
    def installed_binaries(self):
        """Get all InstalledBinary records for this dependency."""
        return InstalledBinary.objects.filter(dependency=self)

    def _valid_installed_binaries(self):
        """Return this dependency's InstalledBinary records that have a non-empty abspath."""
        return self.installed_binaries.filter(abspath__isnull=False).exclude(abspath='')

    @property
    def is_installed(self) -> bool:
        """Check if at least one valid InstalledBinary exists for this dependency."""
        return self._valid_installed_binaries().exists()

    def run(self) -> 'InstalledBinary | None':
        """
        Execute dependency installation by running all on_Dependency hooks.

        Each hook checks if it can handle this dependency and installs if possible.
        Hook stdout is read as JSONL; every object with type 'InstalledBinary'
        and non-empty name, abspath and version is saved as an InstalledBinary
        for the current machine and linked to this dependency.

        Returns:
            An already-installed valid InstalledBinary if one exists, otherwise
            the first InstalledBinary recorded from hook output that makes this
            dependency installed, or None if no hook produced one.
        """
        import json
        from pathlib import Path
        from django.conf import settings

        # Already installed: return a record that actually has an abspath.
        # (Taking the first of *all* linked records could hand back an invalid
        # one even though a valid one exists.)
        already_installed = self._valid_installed_binaries().first()
        if already_installed is not None:
            return already_installed

        # Import here to avoid circular dependency
        from archivebox.hooks import run_hooks

        # Create output directory
        DATA_DIR = getattr(settings, 'DATA_DIR', Path.cwd())
        output_dir = Path(DATA_DIR) / 'tmp' / f'dependency_{self.id}'
        output_dir.mkdir(parents=True, exist_ok=True)

        # Build kwargs for hooks - pass overrides as JSON string
        hook_kwargs = {
            'dependency_id': str(self.id),
            'bin_name': self.bin_name,
            'bin_providers': self.bin_providers,
            'overrides': json.dumps(self.overrides) if self.overrides else None,
        }

        # Run all on_Dependency hooks - each decides if it can handle this
        results = run_hooks(
            event_name='Dependency',
            output_dir=output_dir,
            timeout=600,
            **hook_kwargs
        )

        # Process results - parse JSONL and create InstalledBinary records
        for result in results:
            if result['returncode'] != 0:
                continue

            for line in result['stdout'].strip().split('\n'):
                if not line.strip():
                    continue

                # Only the parse is guarded: hooks may print non-JSON lines,
                # which are skipped. Database errors below are not swallowed.
                try:
                    obj = json.loads(line)
                except json.JSONDecodeError:
                    continue

                # Valid JSON that is not an object (list, number, string...)
                # has no .get(); skip it instead of raising AttributeError.
                if not isinstance(obj, dict) or obj.get('type') != 'InstalledBinary':
                    continue

                # Incomplete records cannot describe a usable binary.
                if not obj.get('name') or not obj.get('abspath') or not obj.get('version'):
                    continue

                machine = Machine.current()
                installed_binary, _ = InstalledBinary.objects.update_or_create(
                    machine=machine,
                    name=obj['name'],
                    defaults={
                        'abspath': obj['abspath'],
                        'version': obj['version'],
                        'sha256': obj.get('sha256') or '',
                        'binprovider': obj.get('binprovider') or 'env',
                        'dependency': self,
                    },
                )

                # Success! Return the installed binary
                if self.is_installed:
                    return installed_binary

        # Failed to install with any hook
        return None
|
|
|
|
|
|
class InstalledBinaryManager(models.Manager):
    """Manager for InstalledBinary with a cached upsert and a validity-aware lookup."""

    def get_from_db_or_cache(self, name: str, abspath: str = '', version: str = '', sha256: str = '', binprovider: str = 'env') -> 'InstalledBinary':
        """
        Get or create an InstalledBinary record from the database or cache.

        The module-level cache is keyed by `name` only. A cached record is
        returned while its modified_at is younger than
        INSTALLED_BINARY_RECHECK_INTERVAL seconds, regardless of the other
        arguments. Otherwise the record is looked up (or created) for the
        current machine and the cache entry is replaced.

        Args:
            name: Binary name; also the cache key.
            abspath: Path used as part of the lookup.
            version: Version used as part of the lookup.
            sha256: Checksum used as part of the lookup.
            binprovider: Provider stored on the record (created or updated).

        Returns:
            The cached or freshly saved InstalledBinary.
        """
        cached = _CURRENT_BINARIES.get(name)
        if cached and timezone.now() < cached.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL):
            return cached

        # Look up only by the columns in InstalledBinary's unique_together.
        # binprovider is not part of that constraint, so using it as a lookup
        # field would miss an existing row recorded under another provider and
        # then fail on insert with an IntegrityError; it is passed via
        # `defaults` instead so such a row is updated.
        _CURRENT_BINARIES[name], _ = self.update_or_create(
            machine=Machine.objects.current(),
            name=name,
            version=version,
            abspath=abspath,
            sha256=sha256,
            defaults={'binprovider': binprovider},
        )
        return _CURRENT_BINARIES[name]

    def get_valid_binary(self, name: str, machine: 'Machine | None' = None) -> 'InstalledBinary | None':
        """
        Get a valid InstalledBinary for the given name, or None if not found.

        Args:
            name: Binary name, matched case-insensitively.
            machine: Machine to search on; defaults to Machine.current().

        Returns:
            The most recently modified record with a non-empty abspath, or None.
        """
        machine = machine or Machine.current()
        return self.filter(
            machine=machine,
            name__iexact=name,
        ).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
|
|
|
|
|
|
class InstalledBinary(ModelWithHealthStats):
    """
    A binary found or installed on a particular Machine.

    A record may point at the Dependency it satisfies (the link is optional
    and is nulled if the Dependency is deleted). Use `is_valid` to check
    whether the record has both an abspath and a version.
    """

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False, blank=True)
    dependency = models.ForeignKey(
        Dependency,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='installedbinary_set',
        help_text="The Dependency this binary satisfies",
    )

    name = models.CharField(max_length=63, default=None, null=False, blank=True, db_index=True)
    binprovider = models.CharField(max_length=31, default=None, null=False, blank=True)
    abspath = models.CharField(max_length=255, default=None, null=False, blank=True)
    version = models.CharField(max_length=32, default=None, null=False, blank=True)
    sha256 = models.CharField(max_length=64, default=None, null=False, blank=True)

    num_uses_failed = models.PositiveIntegerField(default=0)
    num_uses_succeeded = models.PositiveIntegerField(default=0)

    objects: InstalledBinaryManager = InstalledBinaryManager()

    class Meta:
        verbose_name = 'Installed Binary'
        verbose_name_plural = 'Installed Binaries'
        unique_together = (('machine', 'name', 'abspath', 'version', 'sha256'),)

    def __str__(self) -> str:
        return f'{self.name}@{self.binprovider}+{self.abspath}@{self.version}'

    @property
    def is_valid(self) -> bool:
        """True when both abspath and version are non-empty."""
        return bool(self.abspath and self.version)

    @cached_property
    def binary_info(self) -> dict:
        """
        Summary dict of this record (name, abspath, version, binprovider, is_valid).

        Computed once per instance because this is a cached_property.
        """
        return dict(
            name=self.name,
            abspath=self.abspath,
            version=self.version,
            binprovider=self.binprovider,
            is_valid=self.is_valid,
        )
|
|
|
|
|