Files
ArchiveBox/archivebox/machine/models.py

351 lines
15 KiB
Python
Executable File

__package__ = 'archivebox.machine'
import socket
from archivebox.uuid_compat import uuid7
from datetime import timedelta
from django.db import models
from django.utils import timezone
from django.utils.functional import cached_property
from archivebox.base_models.models import ModelWithHealthStats
from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
# Per-process caches of the most recently resolved records. The `current()`
# helpers defined in this module read and refresh these so that repeated calls
# do not hit the database every time.
_CURRENT_MACHINE = None       # last Machine returned by Machine.current()
_CURRENT_INTERFACE = None     # last NetworkInterface returned by NetworkInterface.current()
_CURRENT_BINARIES = {}        # InstalledBinary records, keyed by binary name

# Maximum age in seconds (measured from the cached record's modified_at)
# before a cached record is treated as stale and re-resolved.
MACHINE_RECHECK_INTERVAL = 60 * 60 * 24 * 7           # 7 days
NETWORK_INTERFACE_RECHECK_INTERVAL = 60 * 60          # 1 hour
INSTALLED_BINARY_RECHECK_INTERVAL = 60 * 30           # 30 minutes
class MachineManager(models.Manager):
    """Manager for Machine that exposes `Machine.current()` as `Machine.objects.current()`."""

    def current(self) -> 'Machine':
        """Return the Machine record produced by `Machine.current()`."""
        current_machine = Machine.current()
        return current_machine
class Machine(ModelWithHealthStats):
    """
    Database record describing a host, identified by its unique `guid`.

    `Machine.current()` creates or refreshes the row whose guid matches
    `get_host_guid()` and caches it in the module-level `_CURRENT_MACHINE`.
    """

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    # Identity. `guid` is the lookup key used by current().
    guid = models.CharField(max_length=64, default=None, null=False, unique=True, editable=False)
    hostname = models.CharField(max_length=63, default=None, null=False)

    # Hardware / virtualization details.
    # NOTE(review): presumably these come from get_vm_info() -- confirm against .detect
    hw_in_docker = models.BooleanField(default=False, null=False)
    hw_in_vm = models.BooleanField(default=False, null=False)
    hw_manufacturer = models.CharField(max_length=63, default=None, null=False)
    hw_product = models.CharField(max_length=63, default=None, null=False)
    hw_uuid = models.CharField(max_length=255, default=None, null=False)

    # Operating system details.
    # NOTE(review): presumably these come from get_os_info() -- confirm against .detect
    os_arch = models.CharField(max_length=15, default=None, null=False)
    os_family = models.CharField(max_length=15, default=None, null=False)
    os_platform = models.CharField(max_length=63, default=None, null=False)
    os_release = models.CharField(max_length=63, default=None, null=False)
    os_kernel = models.CharField(max_length=255, default=None, null=False)

    # `stats` is overwritten with get_host_stats() every time current() refreshes the row.
    stats = models.JSONField(default=dict, null=False)
    config = models.JSONField(
        default=dict,
        null=False,
        blank=True,
        help_text="Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)",
    )

    # Usage counters (their semantics are defined by ModelWithHealthStats, not shown here).
    num_uses_failed = models.PositiveIntegerField(default=0)
    num_uses_succeeded = models.PositiveIntegerField(default=0)

    objects: MachineManager = MachineManager()
    networkinterface_set: models.Manager['NetworkInterface']

    @classmethod
    def current(cls) -> 'Machine':
        """
        Return the Machine row for this host, using the module-level cache.

        The cached instance is reused while it is younger than
        MACHINE_RECHECK_INTERVAL seconds (measured from its `modified_at`).
        Otherwise the host is re-detected and the row matching its guid is
        created or updated.
        """
        global _CURRENT_MACHINE

        cached = _CURRENT_MACHINE
        if cached:
            fresh_until = cached.modified_at + timedelta(seconds=MACHINE_RECHECK_INTERVAL)
            if timezone.now() < fresh_until:
                return cached
            # Stale: drop it so a failed refresh below cannot leave an expired entry behind.
            _CURRENT_MACHINE = None

        host_guid = get_host_guid()
        detected = {
            'hostname': socket.gethostname(),
            **get_os_info(),
            **get_vm_info(),
            'stats': get_host_stats(),
        }
        machine, _created = cls.objects.update_or_create(guid=host_guid, defaults=detected)
        _CURRENT_MACHINE = machine
        return machine
class NetworkInterfaceManager(models.Manager):
    """Manager for NetworkInterface that exposes `NetworkInterface.current()` via `.objects`."""

    def current(self) -> 'NetworkInterface':
        """Return the NetworkInterface record produced by `NetworkInterface.current()`."""
        current_interface = NetworkInterface.current()
        return current_interface
class NetworkInterface(ModelWithHealthStats):
    """
    Database record describing one network configuration of a Machine.

    A row is unique per (machine, ip_public, ip_local, mac_address, dns_server).
    `NetworkInterface.current()` creates or refreshes the row matching
    `get_host_network()` and caches it in the module-level `_CURRENT_INTERFACE`.
    """

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False)

    # Identity fields: together with `machine` these form the uniqueness key
    # and the lookup used by current().
    mac_address = models.CharField(max_length=17, default=None, null=False, editable=False)
    ip_public = models.GenericIPAddressField(default=None, null=False, editable=False)
    ip_local = models.GenericIPAddressField(default=None, null=False, editable=False)
    dns_server = models.GenericIPAddressField(default=None, null=False, editable=False)

    # Descriptive fields. current() passes whatever keys remain in the
    # get_host_network() result as `defaults`.
    # NOTE(review): presumably those keys are exactly these fields -- confirm against .detect
    hostname = models.CharField(max_length=63, default=None, null=False)
    iface = models.CharField(max_length=15, default=None, null=False)
    isp = models.CharField(max_length=63, default=None, null=False)
    city = models.CharField(max_length=63, default=None, null=False)
    region = models.CharField(max_length=63, default=None, null=False)
    country = models.CharField(max_length=63, default=None, null=False)

    # Usage counters (their semantics are defined by ModelWithHealthStats, not shown here).
    num_uses_failed = models.PositiveIntegerField(default=0)
    num_uses_succeeded = models.PositiveIntegerField(default=0)

    objects: NetworkInterfaceManager = NetworkInterfaceManager()

    class Meta:
        unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),)

    @classmethod
    def current(cls) -> 'NetworkInterface':
        """
        Return the NetworkInterface row for the current network, using the module-level cache.

        The cached instance is reused while it is younger than
        NETWORK_INTERFACE_RECHECK_INTERVAL seconds (measured from its
        `modified_at`). Otherwise the network is re-detected and the matching
        row is created or updated.
        """
        global _CURRENT_INTERFACE

        cached = _CURRENT_INTERFACE
        if cached:
            fresh_until = cached.modified_at + timedelta(seconds=NETWORK_INTERFACE_RECHECK_INTERVAL)
            if timezone.now() < fresh_until:
                return cached
            # Stale: drop it so a failed refresh below cannot leave an expired entry behind.
            _CURRENT_INTERFACE = None

        machine = Machine.objects.current()
        net_info = get_host_network()

        # Move the identity keys out of net_info; whatever is left becomes `defaults`.
        identity = {
            key: net_info.pop(key)
            for key in ('ip_public', 'ip_local', 'mac_address', 'dns_server')
        }
        interface, _created = cls.objects.update_or_create(
            machine=machine,
            **identity,
            defaults=net_info,
        )
        _CURRENT_INTERFACE = interface
        return interface
class DependencyManager(models.Manager):
    """Manager for Dependency with a convenience constructor for extractor binaries."""

    def get_or_create_for_extractor(
        self,
        bin_name: str,
        bin_providers: str = '*',
        overrides: dict | None = None,
        config: dict | None = None,
    ) -> 'Dependency':
        """
        Get or create the Dependency for an extractor's binary.

        The lookup is by `bin_name` only. `bin_providers`, `overrides` and
        `config` are used only when a new row is created; an existing row is
        returned unchanged.

        Args:
            bin_name: Binary executable name to look up.
            bin_providers: Allowed providers stored on a newly created row.
            overrides: Provider overrides stored on a newly created row
                (None or empty is stored as {}).
            config: Install config stored on a newly created row
                (None or empty is stored as {}).

        Returns:
            The existing or newly created Dependency.
        """
        dependency, _created = self.get_or_create(
            bin_name=bin_name,
            defaults={
                'bin_providers': bin_providers,
                'overrides': overrides or {},
                'config': config or {},
            },
        )
        return dependency
class Dependency(models.Model):
    """
    Defines a binary dependency needed by an extractor.

    This model tracks what binaries need to be installed and how to install them.
    Provider hooks listen for Dependency creation events and attempt installation.

    Example:
        Dependency.objects.get_or_create(
            bin_name='wget',
            bin_providers='apt,brew,pip,env',
            overrides={
                'apt': {'packages': ['wget']},
                'brew': {'packages': ['wget']},
                'pip': {'packages': ['wget']},
            }
        )
    """

    BIN_PROVIDER_CHOICES = (
        ('*', 'Any'),
        ('apt', 'apt'),
        ('brew', 'brew'),
        ('pip', 'pip'),
        ('npm', 'npm'),
        ('gem', 'gem'),
        ('nix', 'nix'),
        ('env', 'env (already in PATH)'),
        ('custom', 'custom'),
    )

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    bin_name = models.CharField(
        max_length=63,
        unique=True,
        db_index=True,
        help_text="Binary executable name (e.g., wget, yt-dlp, chromium)",
    )
    bin_providers = models.CharField(
        max_length=127,
        default='*',
        help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any",
    )
    overrides = models.JSONField(
        default=dict,
        blank=True,
        help_text="JSON map matching abx-pkg Binary.overrides format: {'pip': {'packages': ['pkg']}, 'apt': {'packages': ['pkg']}}",
    )
    config = models.JSONField(
        default=dict,
        blank=True,
        help_text="JSON map of env var config to use during install",
    )

    objects: DependencyManager = DependencyManager()

    class Meta:
        verbose_name = 'Dependency'
        verbose_name_plural = 'Dependencies'

    def __str__(self) -> str:
        return f'{self.bin_name} (providers: {self.bin_providers})'

    def allows_provider(self, provider: str) -> bool:
        """
        Check if this dependency allows the given provider.

        `bin_providers` is either '*' (any provider) or a comma-separated list
        of provider names. Whitespace around the whole value and around each
        entry is ignored, so 'apt, brew' allows 'brew'.
        """
        allowed = self.bin_providers.strip()
        if allowed == '*':
            return True
        return provider in {entry.strip() for entry in allowed.split(',')}

    def get_overrides_for_provider(self, provider: str) -> dict | None:
        """Get the overrides for a provider, or None if not specified."""
        return self.overrides.get(provider)

    @property
    def installed_binaries(self):
        """Get all InstalledBinary records for this dependency."""
        return InstalledBinary.objects.filter(dependency=self)

    def _valid_installed_binaries(self):
        """Return the linked InstalledBinary rows that have a non-null, non-empty abspath."""
        return self.installed_binaries.filter(abspath__isnull=False).exclude(abspath='')

    @property
    def is_installed(self) -> bool:
        """Check if at least one valid InstalledBinary exists for this dependency."""
        return self._valid_installed_binaries().exists()

    @staticmethod
    def _parse_installed_binary_records(stdout):
        """
        Yield the usable InstalledBinary records found in a hook's JSONL stdout.

        A line is skipped when it is blank, is not valid JSON, is not a JSON
        object, has a `type` other than 'InstalledBinary', or lacks a truthy
        `name`, `abspath` or `version`.
        """
        import json

        # Split on '\n' only: JSON strings may legally contain other line
        # separators (e.g. U+2028), which str.splitlines() would break on.
        for line in (stdout or '').split('\n'):
            line = line.strip()
            if not line:
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue
            # A hook may print valid JSON that is not an object (e.g. `[]` or `1`);
            # such lines carry no record and must not crash the whole run.
            if not isinstance(record, dict):
                continue
            if record.get('type') != 'InstalledBinary':
                continue
            if not (record.get('name') and record.get('abspath') and record.get('version')):
                continue
            yield record

    def run(self) -> 'InstalledBinary | None':
        """
        Execute dependency installation by running all on_Dependency hooks.

        Each hook checks if it can handle this dependency and installs if possible.
        If a valid InstalledBinary is already linked to this dependency, it is
        returned and no hooks are run.

        Returns:
            The InstalledBinary record on success, None on failure.
        """
        import json
        from pathlib import Path
        from django.conf import settings

        # Already installed: return a binary that actually passes the validity
        # filter, not just the first linked row (which may have an empty abspath).
        existing = self._valid_installed_binaries().first()
        if existing is not None:
            return existing

        # Import here to avoid circular dependency
        from archivebox.hooks import run_hooks

        # Create output directory
        data_dir = getattr(settings, 'DATA_DIR', Path.cwd())
        output_dir = Path(data_dir) / 'tmp' / f'dependency_{self.id}'
        output_dir.mkdir(parents=True, exist_ok=True)

        # Build kwargs for hooks - pass overrides as JSON string
        hook_kwargs = {
            'dependency_id': str(self.id),
            'bin_name': self.bin_name,
            'bin_providers': self.bin_providers,
            'overrides': json.dumps(self.overrides) if self.overrides else None,
        }

        # Run all on_Dependency hooks - each decides if it can handle this
        results = run_hooks(
            event_name='Dependency',
            output_dir=output_dir,
            timeout=600,
            **hook_kwargs
        )

        # Process results - hooks that exited non-zero are ignored
        for result in results:
            if result['returncode'] != 0:
                continue

            for record in self._parse_installed_binary_records(result['stdout']):
                # NOTE(review): the lookup is (machine, name) but InstalledBinary
                # is unique on (machine, name, abspath, version, sha256), so this
                # can raise MultipleObjectsReturned if several rows share a name
                # on this machine -- confirm the intended lookup key.
                installed_binary, _ = InstalledBinary.objects.update_or_create(
                    machine=Machine.current(),
                    name=record['name'],
                    defaults={
                        'abspath': record['abspath'],
                        'version': record['version'],
                        'sha256': record.get('sha256') or '',
                        'binprovider': record.get('binprovider') or 'env',
                        'dependency': self,
                    },
                )

                # Success! Return the installed binary
                if self.is_installed:
                    return installed_binary

        # Failed to install with any hook
        return None
class InstalledBinaryManager(models.Manager):
    """Manager for InstalledBinary with a cached lookup and a valid-binary query."""

    def get_from_db_or_cache(self, name: str, abspath: str = '', version: str = '', sha256: str = '', binprovider: str = 'env') -> 'InstalledBinary':
        """
        Get or create an InstalledBinary record from the database or cache.

        The module-level `_CURRENT_BINARIES` cache is keyed by `name` only. While
        a cached entry is younger than INSTALLED_BINARY_RECHECK_INTERVAL seconds
        (measured from its `modified_at`) it is returned as-is and the other
        arguments are not consulted. Otherwise the row matching all arguments on
        the current machine is fetched or created, cached, and returned.
        """
        cached = _CURRENT_BINARIES.get(name)
        if cached:
            fresh_until = cached.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL)
            if timezone.now() < fresh_until:
                return cached

        # Every field is passed as a lookup (there are no `defaults`), so a row
        # is only reused when all of them match.
        binary, _created = self.update_or_create(
            machine=Machine.objects.current(),
            name=name,
            binprovider=binprovider,
            version=version,
            abspath=abspath,
            sha256=sha256,
        )
        _CURRENT_BINARIES[name] = binary
        return binary

    def get_valid_binary(self, name: str, machine: 'Machine | None' = None) -> 'InstalledBinary | None':
        """
        Return the most recently modified InstalledBinary with a usable abspath, or None.

        `name` is matched case-insensitively. `machine` falls back to
        `Machine.current()` when not given. Rows whose abspath is empty or NULL
        are excluded.
        """
        if not machine:
            machine = Machine.current()

        candidates = self.filter(machine=machine, name__iexact=name)
        usable = candidates.exclude(abspath='').exclude(abspath__isnull=True)
        return usable.order_by('-modified_at').first()
class InstalledBinary(ModelWithHealthStats):
    """
    Tracks an installed binary on a specific machine.

    Each InstalledBinary is optionally linked to a Dependency that defines
    how the binary should be installed. The `is_valid` property indicates
    whether the binary is usable (has both abspath and version).
    """

    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)

    machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False, blank=True)
    # Nullable link: deleting the Dependency keeps this row and clears the reference.
    dependency = models.ForeignKey(
        Dependency,
        on_delete=models.SET_NULL,
        null=True,
        blank=True,
        related_name='installedbinary_set',
        help_text="The Dependency this binary satisfies",
    )

    name = models.CharField(max_length=63, default=None, null=False, blank=True, db_index=True)
    binprovider = models.CharField(max_length=31, default=None, null=False, blank=True)
    abspath = models.CharField(max_length=255, default=None, null=False, blank=True)
    version = models.CharField(max_length=32, default=None, null=False, blank=True)
    sha256 = models.CharField(max_length=64, default=None, null=False, blank=True)

    # Usage counters (their semantics are defined by ModelWithHealthStats, not shown here).
    num_uses_failed = models.PositiveIntegerField(default=0)
    num_uses_succeeded = models.PositiveIntegerField(default=0)

    objects: InstalledBinaryManager = InstalledBinaryManager()

    class Meta:
        verbose_name = 'Installed Binary'
        verbose_name_plural = 'Installed Binaries'
        unique_together = (('machine', 'name', 'abspath', 'version', 'sha256'),)

    def __str__(self) -> str:
        return f'{self.name}@{self.binprovider}+{self.abspath}@{self.version}'

    @property
    def is_valid(self) -> bool:
        """Return True only when both `abspath` and `version` are non-empty."""
        return bool(self.abspath and self.version)

    @cached_property
    def binary_info(self) -> dict:
        """
        Return a summary dict of this binary.

        Computed once per instance (cached_property), so later changes to the
        fields on the same instance are not reflected in the cached dict.
        """
        info = {
            'name': self.name,
            'abspath': self.abspath,
            'version': self.version,
            'binprovider': self.binprovider,
        }
        info['is_valid'] = self.is_valid
        return info