way better plugin hooks system wip

This commit is contained in:
Nick Sweeting
2025-12-28 03:39:59 -08:00
parent a38624a4dd
commit 50e527ec65
156 changed files with 10275 additions and 7149 deletions

View File

@@ -4,7 +4,7 @@ from django.contrib import admin
from django.utils.html import format_html
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from machine.models import Machine, NetworkInterface, InstalledBinary, Dependency
from machine.models import Machine, NetworkInterface, Binary
class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
@@ -96,62 +96,16 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
)
class DependencyAdmin(ConfigEditorMixin, BaseModelAdmin):
    """Admin configuration for Dependency rows.

    Shows the binary name, its allowed providers, whether the dependency
    reports itself as installed, and a linked count of its binaries.
    """

    list_display = ('id', 'created_at', 'bin_name', 'bin_providers', 'is_installed', 'installed_count')
    sort_fields = ('id', 'created_at', 'bin_name', 'bin_providers')
    search_fields = ('id', 'bin_name', 'bin_providers')
    readonly_fields = ('id', 'created_at', 'modified_at', 'is_installed', 'installed_count')

    fieldsets = (
        ('Binary', {
            'fields': ('bin_name', 'bin_providers', 'is_installed', 'installed_count'),
            'classes': ('card',),
        }),
        ('Commands', {
            'fields': ('custom_cmds',),
            'classes': ('card',),
        }),
        ('Configuration', {
            'fields': ('config',),
            'classes': ('card', 'wide'),
        }),
        ('Timestamps', {
            'fields': ('id', 'created_at', 'modified_at'),
            'classes': ('card',),
        }),
    )

    list_filter = ('bin_providers', 'created_at')
    ordering = ['-created_at']
    list_per_page = 100
    actions = ["delete_selected"]

    @admin.display(description='Installed', boolean=True)
    def is_installed(self, dependency):
        """Expose the dependency's own ``is_installed`` flag as a boolean column."""
        return dependency.is_installed

    @admin.display(description='# Binaries')
    def installed_count(self, dependency):
        """Return the number of binaries for this dependency, linked to a filtered list.

        A plain ``'0'`` string is returned when there are none, so no link is rendered.
        """
        num_binaries = dependency.installed_binaries.count()
        if not num_binaries:
            return '0'
        return format_html(
            '<a href="/admin/machine/installedbinary/?dependency__id__exact={}">{}</a>',
            dependency.id, num_binaries,
        )
class InstalledBinaryAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'machine_info', 'name', 'dependency_link', 'binprovider', 'version', 'abspath', 'sha256', 'health')
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256')
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'dependency__bin_name')
class BinaryAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status', 'health')
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status')
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256')
readonly_fields = ('created_at', 'modified_at')
fieldsets = (
('Binary Info', {
'fields': ('name', 'dependency', 'binprovider'),
'fields': ('name', 'binproviders', 'binprovider', 'overrides'),
'classes': ('card',),
}),
('Location', {
@@ -162,6 +116,10 @@ class InstalledBinaryAdmin(BaseModelAdmin):
'fields': ('version', 'sha256'),
'classes': ('card',),
}),
('State', {
'fields': ('status', 'retry_at', 'output_dir'),
'classes': ('card',),
}),
('Usage', {
'fields': ('num_uses_succeeded', 'num_uses_failed'),
'classes': ('card',),
@@ -172,30 +130,20 @@ class InstalledBinaryAdmin(BaseModelAdmin):
}),
)
list_filter = ('name', 'binprovider', 'machine_id', 'dependency')
list_filter = ('name', 'binprovider', 'status', 'machine_id')
ordering = ['-created_at']
list_per_page = 100
actions = ["delete_selected"]
@admin.display(description='Machine', ordering='machine__id')
def machine_info(self, installed_binary):
def machine_info(self, binary):
return format_html(
'<a href="/admin/machine/machine/{}/change"><b><code>[{}]</code></b> &nbsp; {}</a>',
installed_binary.machine.id, str(installed_binary.machine.id)[:8], installed_binary.machine.hostname,
binary.machine.id, str(binary.machine.id)[:8], binary.machine.hostname,
)
@admin.display(description='Dependency', ordering='dependency__bin_name')
def dependency_link(self, installed_binary):
if installed_binary.dependency:
return format_html(
'<a href="/admin/machine/dependency/{}/change">{}</a>',
installed_binary.dependency.id, installed_binary.dependency.bin_name,
)
return '-'
def register_admin(admin_site):
admin_site.register(Machine, MachineAdmin)
admin_site.register(NetworkInterface, NetworkInterfaceAdmin)
admin_site.register(Dependency, DependencyAdmin)
admin_site.register(InstalledBinary, InstalledBinaryAdmin)
admin_site.register(Binary, BinaryAdmin)

View File

@@ -14,9 +14,9 @@ class Migration(migrations.Migration):
replaces = [
('machine', '0001_initial'),
('machine', '0002_alter_machine_stats_installedbinary'),
('machine', '0003_alter_installedbinary_options_and_more'),
('machine', '0004_alter_installedbinary_abspath_and_more'),
('machine', '0002_alter_machine_stats_binary'),
('machine', '0003_alter_binary_options_and_more'),
('machine', '0004_alter_binary_abspath_and_more'),
]
dependencies = []
@@ -87,7 +87,7 @@ class Migration(migrations.Migration):
},
),
migrations.CreateModel(
name='InstalledBinary',
name='Binary',
fields=[
('num_uses_failed', models.PositiveIntegerField(default=0)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
@@ -100,11 +100,11 @@ class Migration(migrations.Migration):
('version', models.CharField(blank=True, default=None, max_length=32)),
('sha256', models.CharField(blank=True, default=None, max_length=64)),
('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
('dependency', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='installedbinary_set', to='machine.dependency')),
('dependency', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='binary_set', to='machine.dependency')),
],
options={
'verbose_name': 'Installed Binary',
'verbose_name_plural': 'Installed Binaries',
'verbose_name': 'Binary',
'verbose_name_plural': 'Binaries',
'unique_together': {('machine', 'name', 'abspath', 'version', 'sha256')},
},
),

View File

@@ -0,0 +1,45 @@
# Generated by Django 6.0 on 2025-12-28 05:12
import uuid

import django.db.models.deletion
from django.db import migrations, models

import archivebox.uuid_compat
class Migration(migrations.Migration):
    """Alter primary-key defaults and a few field definitions in the machine app.

    Every ``id`` column touched here gets a UUIDv7 default. The default is
    referenced through ``archivebox.uuid_compat.uuid7`` rather than
    ``uuid.uuid7``: the stdlib function only exists on Python 3.14+, so
    referencing it directly makes this migration fail to import on older
    interpreters. The default is a Python-side callable, so swapping the
    reference does not change the database schema.
    """

    dependencies = [
        ('machine', '0002_rename_custom_cmds_to_overrides'),
    ]

    operations = [
        migrations.AlterField(
            model_name='dependency',
            name='id',
            field=models.UUIDField(default=archivebox.uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
        ),
        migrations.AlterField(
            model_name='binary',
            name='dependency',
            field=models.ForeignKey(blank=True, help_text='The Dependency this binary satisfies', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='binary_set', to='machine.dependency'),
        ),
        migrations.AlterField(
            model_name='binary',
            name='id',
            field=models.UUIDField(default=archivebox.uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
        ),
        migrations.AlterField(
            model_name='machine',
            name='config',
            field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)'),
        ),
        migrations.AlterField(
            model_name='machine',
            name='id',
            field=models.UUIDField(default=archivebox.uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
        ),
        migrations.AlterField(
            model_name='networkinterface',
            name='id',
            field=models.UUIDField(default=archivebox.uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
        ),
    ]

View File

@@ -0,0 +1,56 @@
# Generated migration - Clean slate for Binary model
# Drops old InstalledBinary and Dependency tables, creates new Binary table
from django.db import migrations, models
import django.utils.timezone
import archivebox.uuid_compat
def drop_old_tables(apps, schema_editor):
    """Drop the legacy binary/dependency tables with raw SQL.

    Each statement uses ``IF EXISTS``, so a missing table is not an error.
    ``apps`` is accepted for the RunPython signature but is not used.
    """
    statements = (
        'DROP TABLE IF EXISTS machine_installedbinary',
        'DROP TABLE IF EXISTS machine_dependency',
        'DROP TABLE IF EXISTS machine_binary',  # In case rename happened
    )
    for statement in statements:
        schema_editor.execute(statement)
class Migration(migrations.Migration):
    # Clean-slate migration: drops the legacy tables with raw SQL, then
    # creates the Binary table and a composite index on it.

    dependencies = [
        ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
    ]

    operations = [
        # Drop old tables using raw SQL
        # The reverse operation is a no-op, so rolling back does not restore the dropped tables.
        migrations.RunPython(drop_old_tables, migrations.RunPython.noop),
        # Create new Binary model from scratch
        migrations.CreateModel(
            name='Binary',
            fields=[
                ('id', models.UUIDField(default=archivebox.uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
                ('modified_at', models.DateTimeField(auto_now=True)),
                # NOTE(review): several CharFields below use default=None without null=True,
                # while the Binary model shown in this commit uses default='' — confirm which is intended.
                ('name', models.CharField(blank=True, db_index=True, default=None, max_length=63)),
                ('binproviders', models.CharField(blank=True, default='env', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env', max_length=127)),
                ('overrides', models.JSONField(blank=True, default=dict, help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")),
                ('binprovider', models.CharField(blank=True, default=None, help_text='Provider that successfully installed this binary', max_length=31)),
                ('abspath', models.CharField(blank=True, default=None, max_length=255)),
                ('version', models.CharField(blank=True, default=None, max_length=32)),
                ('sha256', models.CharField(blank=True, default=None, max_length=64)),
                # State-machine columns: status starts as 'queued' and retry_at defaults to "now".
                ('status', models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('succeeded', 'Succeeded'), ('failed', 'Failed')], db_index=True, default='queued', max_length=16)),
                ('retry_at', models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, help_text='When to retry this binary installation', null=True)),
                ('output_dir', models.CharField(blank=True, default='', help_text='Directory where installation hook logs are stored', max_length=255)),
                ('num_uses_failed', models.PositiveIntegerField(default=0)),
                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
                ('machine', models.ForeignKey(blank=True, default=None, on_delete=models.deletion.CASCADE, to='machine.machine')),
            ],
            options={
                'verbose_name': 'Binary',
                'verbose_name_plural': 'Binaries',
            },
        ),
        # NOTE(review): this is a plain (non-unique) index; the Binary model's Meta declares
        # unique_together over the same five fields — confirm the difference is intentional.
        migrations.AddIndex(
            model_name='binary',
            index=models.Index(fields=['machine', 'name', 'abspath', 'version', 'sha256'], name='machine_bin_machine_idx'),
        ),
    ]

View File

@@ -17,7 +17,7 @@ _CURRENT_BINARIES = {}
MACHINE_RECHECK_INTERVAL = 7 * 24 * 60 * 60
NETWORK_INTERFACE_RECHECK_INTERVAL = 1 * 60 * 60
INSTALLED_BINARY_RECHECK_INTERVAL = 1 * 30 * 60
BINARY_RECHECK_INTERVAL = 1 * 30 * 60
class MachineManager(models.Manager):
@@ -63,6 +63,31 @@ class Machine(ModelWithHealthStats):
)
return _CURRENT_MACHINE
@staticmethod
def from_jsonl(record: dict, overrides: dict = None):
    """
    Apply a single config update from a JSONL record to the current Machine.

    Only records whose '_method' is 'update' are handled. The record's
    'key' and 'value' must both be truthy; otherwise nothing is written
    (so falsy values such as '' or 0 are ignored).

    Args:
        record: JSONL record with '_method': 'update', 'key': '...', 'value': '...'
        overrides: Not used

    Returns:
        The updated Machine instance, or None if the record was not applied
    """
    if record.get('_method') != 'update':
        return None

    config_key = record.get('key')
    config_value = record.get('value')
    if not (config_key and config_value):
        return None

    machine = Machine.current()
    if not machine.config:
        # Start from an empty mapping when no config has been stored yet
        machine.config = {}
    machine.config[config_key] = config_value
    machine.save(update_fields=['config'])
    return machine
class NetworkInterfaceManager(models.Manager):
def current(self) -> 'NetworkInterface':
@@ -108,179 +133,13 @@ class NetworkInterface(ModelWithHealthStats):
return _CURRENT_INTERFACE
class DependencyManager(models.Manager):
    """Manager with a convenience lookup for extractor binaries."""

    def get_or_create_for_extractor(self, bin_name: str, bin_providers: str = '*', overrides: dict = None, config: dict = None) -> 'Dependency':
        """Return the Dependency named ``bin_name``, creating it if it does not exist.

        ``bin_providers``, ``overrides`` and ``config`` are only used as
        defaults when a new row is created; an existing row is returned as-is.
        """
        creation_defaults = {
            'bin_providers': bin_providers,
            'overrides': overrides or {},
            'config': config or {},
        }
        dependency, _created = self.get_or_create(bin_name=bin_name, defaults=creation_defaults)
        return dependency
class Dependency(models.Model):
    """
    Defines a binary dependency needed by an extractor.
    This model tracks what binaries need to be installed and how to install them.
    Provider hooks listen for Dependency creation events and attempt installation.
    Example:
    Dependency.objects.get_or_create(
    bin_name='wget',
    bin_providers='apt,brew,pip,env',
    overrides={
    'apt': {'packages': ['wget']},
    'brew': {'packages': ['wget']},
    'pip': {'packages': ['wget']},
    }
    )
    """
    # (value, label) pairs for the known providers; '*' stands for "any provider".
    BIN_PROVIDER_CHOICES = (
        ('*', 'Any'),
        ('apt', 'apt'),
        ('brew', 'brew'),
        ('pip', 'pip'),
        ('npm', 'npm'),
        ('gem', 'gem'),
        ('nix', 'nix'),
        ('env', 'env (already in PATH)'),
        ('custom', 'custom'),
    )
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    modified_at = models.DateTimeField(auto_now=True)
    # bin_name is unique, so there is at most one Dependency row per executable name.
    bin_name = models.CharField(max_length=63, unique=True, db_index=True,
        help_text="Binary executable name (e.g., wget, yt-dlp, chromium)")
    bin_providers = models.CharField(max_length=127, default='*',
        help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any")
    overrides = models.JSONField(default=dict, blank=True,
        help_text="JSON map matching abx-pkg Binary.overrides format: {'pip': {'packages': ['pkg']}, 'apt': {'packages': ['pkg']}}")
    config = models.JSONField(default=dict, blank=True,
        help_text="JSON map of env var config to use during install")
    objects: DependencyManager = DependencyManager()

    class Meta:
        verbose_name = 'Dependency'
        verbose_name_plural = 'Dependencies'

    def __str__(self) -> str:
        return f'{self.bin_name} (providers: {self.bin_providers})'

    def allows_provider(self, provider: str) -> bool:
        """Check if this dependency allows the given provider."""
        if self.bin_providers == '*':
            return True
        # Exact match against the comma-separated entries; entries are not stripped of whitespace.
        return provider in self.bin_providers.split(',')

    def get_overrides_for_provider(self, provider: str) -> dict | None:
        """Get the overrides for a provider, or None if not specified."""
        return self.overrides.get(provider)

    @property
    def installed_binaries(self):
        """Get all InstalledBinary records for this dependency."""
        return InstalledBinary.objects.filter(dependency=self)

    @property
    def is_installed(self) -> bool:
        """Check if at least one valid InstalledBinary exists for this dependency."""
        # "Valid" here means the record has a non-null, non-empty abspath.
        return self.installed_binaries.filter(abspath__isnull=False).exclude(abspath='').exists()

    def run(self):
        """
        Execute dependency installation by running all on_Dependency hooks.
        Each hook checks if it can handle this dependency and installs if possible.
        Returns the InstalledBinary record on success, None on failure.
        """
        import json
        from pathlib import Path
        from django.conf import settings
        # Check if already installed
        if self.is_installed:
            return self.installed_binaries.first()
        # Import here to avoid circular dependency
        from archivebox.hooks import run_hooks
        # Create output directory
        # Falls back to the current working directory when settings has no DATA_DIR.
        DATA_DIR = getattr(settings, 'DATA_DIR', Path.cwd())
        output_dir = Path(DATA_DIR) / 'tmp' / f'dependency_{self.id}'
        output_dir.mkdir(parents=True, exist_ok=True)
        # Build kwargs for hooks - pass overrides as JSON string
        hook_kwargs = {
            'dependency_id': str(self.id),
            'bin_name': self.bin_name,
            'bin_providers': self.bin_providers,
            'overrides': json.dumps(self.overrides) if self.overrides else None,
        }
        # Run all on_Dependency hooks - each decides if it can handle this
        results = run_hooks(
            event_name='Dependency',
            output_dir=output_dir,
            timeout=600,
            **hook_kwargs
        )
        # Process results - parse JSONL and create InstalledBinary records
        for result in results:
            # Hooks that exited non-zero are ignored entirely.
            if result['returncode'] != 0:
                continue
            # Parse JSONL output
            for line in result['stdout'].strip().split('\n'):
                if not line.strip():
                    continue
                try:
                    obj = json.loads(line)
                    if obj.get('type') == 'InstalledBinary':
                        # Create InstalledBinary record
                        # Records missing name, abspath or version are skipped.
                        if not obj.get('name') or not obj.get('abspath') or not obj.get('version'):
                            continue
                        machine = Machine.current()
                        # Keyed on (machine, name): an existing row for this name is overwritten.
                        installed_binary, _ = InstalledBinary.objects.update_or_create(
                            machine=machine,
                            name=obj['name'],
                            defaults={
                                'abspath': obj['abspath'],
                                'version': obj['version'],
                                'sha256': obj.get('sha256') or '',
                                'binprovider': obj.get('binprovider') or 'env',
                                'dependency': self,
                            }
                        )
                        # Success! Return the installed binary
                        if self.is_installed:
                            return installed_binary
                except json.JSONDecodeError:
                    # Lines that are not valid JSON are skipped.
                    continue
        # Failed to install with any hook
        return None
class InstalledBinaryManager(models.Manager):
def get_from_db_or_cache(self, name: str, abspath: str = '', version: str = '', sha256: str = '', binprovider: str = 'env') -> 'InstalledBinary':
"""Get or create an InstalledBinary record from the database or cache."""
class BinaryManager(models.Manager):
def get_from_db_or_cache(self, name: str, abspath: str = '', version: str = '', sha256: str = '', binprovider: str = 'env') -> 'Binary':
"""Get or create an Binary record from the database or cache."""
global _CURRENT_BINARIES
cached = _CURRENT_BINARIES.get(name)
if cached and timezone.now() < cached.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL):
if cached and timezone.now() < cached.modified_at + timedelta(seconds=BINARY_RECHECK_INTERVAL):
return cached
_CURRENT_BINARIES[name], _ = self.update_or_create(
machine=Machine.objects.current(), name=name, binprovider=binprovider,
@@ -288,8 +147,8 @@ class InstalledBinaryManager(models.Manager):
)
return _CURRENT_BINARIES[name]
def get_valid_binary(self, name: str, machine: 'Machine | None' = None) -> 'InstalledBinary | None':
"""Get a valid InstalledBinary for the given name on the current machine, or None if not found."""
def get_valid_binary(self, name: str, machine: 'Machine | None' = None) -> 'Binary | None':
"""Get a valid Binary for the given name on the current machine, or None if not found."""
machine = machine or Machine.current()
return self.filter(
machine=machine,
@@ -297,35 +156,63 @@ class InstalledBinaryManager(models.Manager):
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
class InstalledBinary(ModelWithHealthStats):
class Binary(ModelWithHealthStats):
"""
Tracks an installed binary on a specific machine.
Tracks an binary on a specific machine.
Each InstalledBinary is optionally linked to a Dependency that defines
how the binary should be installed. The `is_valid` property indicates
whether the binary is usable (has both abspath and version).
Follows the unified state machine pattern:
- queued: Binary needs to be installed
- started: Installation in progress
- succeeded: Binary installed successfully (abspath, version, sha256 populated)
- failed: Installation failed
State machine calls run() which executes on_Binary__install_* hooks
to install the binary using the specified providers.
"""
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
SUCCEEDED = 'succeeded', 'Succeeded'
FAILED = 'failed', 'Failed'
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False, blank=True)
dependency = models.ForeignKey(Dependency, on_delete=models.SET_NULL, null=True, blank=True,
related_name='installedbinary_set',
help_text="The Dependency this binary satisfies")
name = models.CharField(max_length=63, default=None, null=False, blank=True, db_index=True)
binprovider = models.CharField(max_length=31, default=None, null=False, blank=True)
abspath = models.CharField(max_length=255, default=None, null=False, blank=True)
version = models.CharField(max_length=32, default=None, null=False, blank=True)
sha256 = models.CharField(max_length=64, default=None, null=False, blank=True)
machine = models.ForeignKey(Machine, on_delete=models.CASCADE, null=False)
# Binary metadata
name = models.CharField(max_length=63, default='', null=False, blank=True, db_index=True)
binproviders = models.CharField(max_length=127, default='env', null=False, blank=True,
help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,env")
overrides = models.JSONField(default=dict, blank=True,
help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")
# Installation results (populated after installation)
binprovider = models.CharField(max_length=31, default='', null=False, blank=True,
help_text="Provider that successfully installed this binary")
abspath = models.CharField(max_length=255, default='', null=False, blank=True)
version = models.CharField(max_length=32, default='', null=False, blank=True)
sha256 = models.CharField(max_length=64, default='', null=False, blank=True)
# State machine fields
status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True)
retry_at = models.DateTimeField(default=timezone.now, null=True, blank=True, db_index=True,
help_text="When to retry this binary installation")
output_dir = models.CharField(max_length=255, default='', null=False, blank=True,
help_text="Directory where installation hook logs are stored")
# Health stats
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
objects: InstalledBinaryManager = InstalledBinaryManager()
state_machine_name: str = 'machine.statemachines.BinaryMachine'
objects: BinaryManager = BinaryManager()
class Meta:
verbose_name = 'Installed Binary'
verbose_name_plural = 'Installed Binaries'
verbose_name = 'Binary'
verbose_name_plural = 'Binaries'
unique_together = (('machine', 'name', 'abspath', 'version', 'sha256'),)
def __str__(self) -> str:
@@ -347,4 +234,189 @@ class InstalledBinary(ModelWithHealthStats):
'is_valid': self.is_valid,
}
@staticmethod
def from_jsonl(record: dict, overrides: dict = None):
    """
    Create/update a Binary on the current Machine from a JSONL record.

    Handles two cases:
    1. From binaries.jsonl: creates a queued binary with name, binproviders, overrides
       (an existing row for the same machine and name is returned unchanged)
    2. From hook output: updates the binary with abspath, version, sha256, binprovider
       and marks it as succeeded

    Optional values that are missing, empty, or an explicit JSON null fall
    back to their defaults ('env', {}, ''), so a null never reaches a
    non-null column.

    Args:
        record: JSONL record with 'name' and either:
            - 'binproviders', 'overrides' (from binaries.jsonl)
            - 'abspath', 'version', 'sha256', 'binprovider' (from hook output)
        overrides: Not used

    Returns:
        Binary instance, or None when 'name' is missing or (in case 2)
        'abspath' or 'version' is missing
    """
    name = record.get('name')
    if not name:
        return None

    machine = Machine.current()

    # Case 1: From binaries.jsonl - create queued binary
    if 'binproviders' in record or ('overrides' in record and not record.get('abspath')):
        binary, _created = Binary.objects.get_or_create(
            machine=machine,
            name=name,
            defaults={
                # `or` (rather than a .get() default) also covers explicit nulls
                'binproviders': record.get('binproviders') or 'env',
                'overrides': record.get('overrides') or {},
                'status': Binary.StatusChoices.QUEUED,
                'retry_at': timezone.now(),
            },
        )
        return binary

    # Case 2: From hook output - update with installation results
    abspath = record.get('abspath')
    version = record.get('version')
    if not abspath or not version:
        return None

    binary, _created = Binary.objects.update_or_create(
        machine=machine,
        name=name,
        defaults={
            'abspath': abspath,
            'version': version,
            'sha256': record.get('sha256') or '',
            'binprovider': record.get('binprovider') or 'env',
            'status': Binary.StatusChoices.SUCCEEDED,
            'retry_at': None,
        },
    )
    return binary
@property
def OUTPUT_DIR(self):
    """Directory for this binary's installation output.

    Built as <DATA_DIR>/machines/<machine_id>/binaries/<name>/<id>, where
    DATA_DIR comes from Django settings and falls back to the current
    working directory when the setting is absent.
    """
    from pathlib import Path
    from django.conf import settings

    data_root = Path(getattr(settings, 'DATA_DIR', Path.cwd()))
    return data_root.joinpath(
        'machines',
        str(self.machine_id),
        'binaries',
        self.name,
        str(self.id),
    )
def update_for_workers(self, **kwargs):
    """
    Assign the given field values, stamp modified_at, and save.

    Every keyword argument is set as an attribute on this instance.
    modified_at is set to the current time before the (unconditional) save.
    """
    for field_name in kwargs:
        setattr(self, field_name, kwargs[field_name])
    self.modified_at = timezone.now()
    self.save()
def run(self):
    """
    Execute binary installation by running on_Binary__install_* hooks.

    Called by BinaryMachine when entering 'started' state.
    Runs the discovered 'Binary' hooks in order - each hook receives the
    binary's name/binproviders (and overrides, if any) and decides whether
    it can handle this binary. The first hook that exits 0 and emits a
    JSONL record with type 'Binary' and a non-empty 'abspath' wins.

    Side effects:
        - creates OUTPUT_DIR and one sub-directory per hook's plugin
        - stores the output directory path in self.output_dir
        - sets status to SUCCEEDED (with abspath/version/sha256/binprovider
          filled in) or FAILED, and saves the model

    Returns:
        None
    """
    import json
    from archivebox.hooks import discover_hooks, run_hook

    # Create output directory
    output_dir = self.OUTPUT_DIR
    output_dir.mkdir(parents=True, exist_ok=True)
    self.output_dir = str(output_dir)
    self.save()

    # Discover ALL on_Binary__install_* hooks
    hooks = discover_hooks('Binary')
    if not hooks:
        self.status = self.StatusChoices.FAILED
        self.save()
        return

    # The hook arguments do not depend on the hook, so build them once
    hook_kwargs = {
        'binary_id': str(self.id),
        'machine_id': str(self.machine_id),
        'name': self.name,
        'binproviders': self.binproviders,
    }
    # Add overrides as JSON string if present
    if self.overrides:
        hook_kwargs['overrides'] = json.dumps(self.overrides)

    # Run each hook - they decide if they can handle this binary
    for hook in hooks:
        plugin_output_dir = output_dir / hook.parent.name
        plugin_output_dir.mkdir(parents=True, exist_ok=True)

        result = run_hook(
            hook,
            output_dir=plugin_output_dir,
            timeout=600,  # 10 min timeout
            **hook_kwargs
        )

        # None means a background hook (unlikely for installation); a
        # non-zero return code means the hook failed or skipped - try the next one
        if result is None or result['returncode'] != 0:
            continue

        # Parse JSONL output to check for successful installation
        stdout_file = plugin_output_dir / 'stdout.log'
        if not stdout_file.exists():
            continue

        for raw_line in stdout_file.read_text().splitlines():
            line = raw_line.strip()
            if not line.startswith('{'):
                continue
            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                # Not a JSONL record (e.g. ordinary log output) - ignore it
                continue
            if record.get('type') != 'Binary' or not record.get('abspath'):
                continue

            # Update self from successful installation. `or` is used so an
            # explicit JSON null falls back to the default instead of being
            # written to a non-null column.
            self.abspath = record['abspath']
            self.version = record.get('version') or ''
            self.sha256 = record.get('sha256') or ''
            self.binprovider = record.get('binprovider') or 'env'
            self.status = self.StatusChoices.SUCCEEDED
            self.save()
            return

    # No hook succeeded
    self.status = self.StatusChoices.FAILED
    self.save()
def cleanup(self):
    """
    Kill any hook processes recorded under this binary's output directory.

    Looks at each sub-directory of OUTPUT_DIR and, where a 'hook.pid' file
    exists, passes that file to kill_process. Does nothing if OUTPUT_DIR
    does not exist.
    """
    from pathlib import Path
    from archivebox.hooks import kill_process

    install_dir = self.OUTPUT_DIR
    if install_dir.exists():
        for entry in install_dir.iterdir():
            pid_file = entry / 'hook.pid'
            # Only per-plugin directories can hold a pid file
            if entry.is_dir() and pid_file.exists():
                kill_process(pid_file)

View File

@@ -0,0 +1,112 @@
__package__ = 'archivebox.machine'
from datetime import timedelta
from django.utils import timezone
from django.db.models import F
from statemachine import State, StateMachine
from machine.models import Binary
class BinaryMachine(StateMachine, strict_states=True):
    """
    Drives a Binary through its installation lifecycle.

    States:
    - queued: Binary needs to be installed
    - started: Installation hooks are running
    - succeeded: Binary installed successfully (final)
    - failed: Installation failed (final)

    A single `tick` event moves the binary forward whenever the relevant
    condition method returns True, and otherwise leaves it where it is.
    """

    model: Binary

    # States
    queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
    started = State(value=Binary.StatusChoices.STARTED)
    succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=Binary.StatusChoices.FAILED, final=True)

    # Tick Event - transitions based on conditions
    tick = (
        queued.to.itself(unless='can_start')
        | queued.to(started, cond='can_start')
        | started.to.itself(unless='is_finished')
        | started.to(succeeded, cond='is_succeeded')
        | started.to(failed, cond='is_failed')
    )

    def __init__(self, binary, *args, **kwargs):
        # Keep a direct handle on the Binary before the base class is initialised with it
        self.binary = binary
        super().__init__(binary, *args, **kwargs)

    def __repr__(self) -> str:
        return f'Binary[{self.binary.id}]'

    def __str__(self) -> str:
        return repr(self)

    def can_start(self) -> bool:
        """Installation can start once the binary has both a name and a provider list."""
        binary = self.binary
        return bool(binary.name) and bool(binary.binproviders)

    def is_succeeded(self) -> bool:
        """True when the binary's status is SUCCEEDED (set by run())."""
        return self.binary.status == Binary.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """True when the binary's status is FAILED (set by run())."""
        return self.binary.status == Binary.StatusChoices.FAILED

    def is_finished(self) -> bool:
        """True when installation ended either way (succeeded or failed)."""
        return self.is_succeeded() or self.is_failed()

    @queued.enter
    def enter_queued(self):
        """Mark the binary as queued and due immediately."""
        self.binary.update_for_workers(
            retry_at=timezone.now(),
            status=Binary.StatusChoices.QUEUED,
        )

    @started.enter
    def enter_started(self):
        """Mark the binary as started, then run its installation hooks."""
        # Push retry_at 300 seconds into the future while installation runs
        lock_until = timezone.now() + timedelta(seconds=300)
        self.binary.update_for_workers(
            retry_at=lock_until,
            status=Binary.StatusChoices.STARTED,
        )
        # run() sets the status to succeeded/failed; persist the result afterwards
        self.binary.run()
        self.binary.save()

    @succeeded.enter
    def enter_succeeded(self):
        """Record the success and bump the success counter."""
        self.binary.update_for_workers(
            retry_at=None,
            status=Binary.StatusChoices.SUCCEEDED,
        )
        # Increment in the database with an F() expression rather than on the instance
        this_binary = Binary.objects.filter(pk=self.binary.pk)
        this_binary.update(num_uses_succeeded=F('num_uses_succeeded') + 1)

    @failed.enter
    def enter_failed(self):
        """Record the failure and bump the failure counter."""
        self.binary.update_for_workers(
            retry_at=None,
            status=Binary.StatusChoices.FAILED,
        )
        # Increment in the database with an F() expression rather than on the instance
        this_binary = Binary.objects.filter(pk=self.binary.pk)
        this_binary.update(num_uses_failed=F('num_uses_failed') + 1)