mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
way better plugin hooks system wip
This commit is contained in:
@@ -4,7 +4,7 @@ from django.contrib import admin
|
||||
from django.utils.html import format_html
|
||||
|
||||
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
|
||||
from machine.models import Machine, NetworkInterface, InstalledBinary, Dependency
|
||||
from machine.models import Machine, NetworkInterface, Binary
|
||||
|
||||
|
||||
class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
@@ -96,62 +96,16 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
|
||||
)
|
||||
|
||||
|
||||
class DependencyAdmin(ConfigEditorMixin, BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'bin_name', 'bin_providers', 'is_installed', 'installed_count')
|
||||
sort_fields = ('id', 'created_at', 'bin_name', 'bin_providers')
|
||||
search_fields = ('id', 'bin_name', 'bin_providers')
|
||||
|
||||
readonly_fields = ('id', 'created_at', 'modified_at', 'is_installed', 'installed_count')
|
||||
|
||||
fieldsets = (
|
||||
('Binary', {
|
||||
'fields': ('bin_name', 'bin_providers', 'is_installed', 'installed_count'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Commands', {
|
||||
'fields': ('custom_cmds',),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Configuration', {
|
||||
'fields': ('config',),
|
||||
'classes': ('card', 'wide'),
|
||||
}),
|
||||
('Timestamps', {
|
||||
'fields': ('id', 'created_at', 'modified_at'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('bin_providers', 'created_at')
|
||||
ordering = ['-created_at']
|
||||
list_per_page = 100
|
||||
actions = ["delete_selected"]
|
||||
|
||||
@admin.display(description='Installed', boolean=True)
|
||||
def is_installed(self, dependency):
|
||||
return dependency.is_installed
|
||||
|
||||
@admin.display(description='# Binaries')
|
||||
def installed_count(self, dependency):
|
||||
count = dependency.installed_binaries.count()
|
||||
if count:
|
||||
return format_html(
|
||||
'<a href="/admin/machine/installedbinary/?dependency__id__exact={}">{}</a>',
|
||||
dependency.id, count,
|
||||
)
|
||||
return '0'
|
||||
|
||||
|
||||
class InstalledBinaryAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'machine_info', 'name', 'dependency_link', 'binprovider', 'version', 'abspath', 'sha256', 'health')
|
||||
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256')
|
||||
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'dependency__bin_name')
|
||||
class BinaryAdmin(BaseModelAdmin):
|
||||
list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status', 'health')
|
||||
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status')
|
||||
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256')
|
||||
|
||||
readonly_fields = ('created_at', 'modified_at')
|
||||
|
||||
fieldsets = (
|
||||
('Binary Info', {
|
||||
'fields': ('name', 'dependency', 'binprovider'),
|
||||
'fields': ('name', 'binproviders', 'binprovider', 'overrides'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Location', {
|
||||
@@ -162,6 +116,10 @@ class InstalledBinaryAdmin(BaseModelAdmin):
|
||||
'fields': ('version', 'sha256'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('State', {
|
||||
'fields': ('status', 'retry_at', 'output_dir'),
|
||||
'classes': ('card',),
|
||||
}),
|
||||
('Usage', {
|
||||
'fields': ('num_uses_succeeded', 'num_uses_failed'),
|
||||
'classes': ('card',),
|
||||
@@ -172,30 +130,20 @@ class InstalledBinaryAdmin(BaseModelAdmin):
|
||||
}),
|
||||
)
|
||||
|
||||
list_filter = ('name', 'binprovider', 'machine_id', 'dependency')
|
||||
list_filter = ('name', 'binprovider', 'status', 'machine_id')
|
||||
ordering = ['-created_at']
|
||||
list_per_page = 100
|
||||
actions = ["delete_selected"]
|
||||
|
||||
@admin.display(description='Machine', ordering='machine__id')
|
||||
def machine_info(self, installed_binary):
|
||||
def machine_info(self, binary):
|
||||
return format_html(
|
||||
'<a href="/admin/machine/machine/{}/change"><b><code>[{}]</code></b> {}</a>',
|
||||
installed_binary.machine.id, str(installed_binary.machine.id)[:8], installed_binary.machine.hostname,
|
||||
binary.machine.id, str(binary.machine.id)[:8], binary.machine.hostname,
|
||||
)
|
||||
|
||||
@admin.display(description='Dependency', ordering='dependency__bin_name')
|
||||
def dependency_link(self, installed_binary):
|
||||
if installed_binary.dependency:
|
||||
return format_html(
|
||||
'<a href="/admin/machine/dependency/{}/change">{}</a>',
|
||||
installed_binary.dependency.id, installed_binary.dependency.bin_name,
|
||||
)
|
||||
return '-'
|
||||
|
||||
|
||||
def register_admin(admin_site):
|
||||
admin_site.register(Machine, MachineAdmin)
|
||||
admin_site.register(NetworkInterface, NetworkInterfaceAdmin)
|
||||
admin_site.register(Dependency, DependencyAdmin)
|
||||
admin_site.register(InstalledBinary, InstalledBinaryAdmin)
|
||||
admin_site.register(Binary, BinaryAdmin)
|
||||
|
||||
@@ -14,9 +14,9 @@ class Migration(migrations.Migration):
|
||||
|
||||
replaces = [
|
||||
('machine', '0001_initial'),
|
||||
('machine', '0002_alter_machine_stats_installedbinary'),
|
||||
('machine', '0003_alter_installedbinary_options_and_more'),
|
||||
('machine', '0004_alter_installedbinary_abspath_and_more'),
|
||||
('machine', '0002_alter_machine_stats_binary'),
|
||||
('machine', '0003_alter_binary_options_and_more'),
|
||||
('machine', '0004_alter_binary_abspath_and_more'),
|
||||
]
|
||||
|
||||
dependencies = []
|
||||
@@ -87,7 +87,7 @@ class Migration(migrations.Migration):
|
||||
},
|
||||
),
|
||||
migrations.CreateModel(
|
||||
name='InstalledBinary',
|
||||
name='Binary',
|
||||
fields=[
|
||||
('num_uses_failed', models.PositiveIntegerField(default=0)),
|
||||
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
|
||||
@@ -100,11 +100,11 @@ class Migration(migrations.Migration):
|
||||
('version', models.CharField(blank=True, default=None, max_length=32)),
|
||||
('sha256', models.CharField(blank=True, default=None, max_length=64)),
|
||||
('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
|
||||
('dependency', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='installedbinary_set', to='machine.dependency')),
|
||||
('dependency', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='binary_set', to='machine.dependency')),
|
||||
],
|
||||
options={
|
||||
'verbose_name': 'Installed Binary',
|
||||
'verbose_name_plural': 'Installed Binaries',
|
||||
'verbose_name': 'Binary',
|
||||
'verbose_name_plural': 'Binaries',
|
||||
'unique_together': {('machine', 'name', 'abspath', 'version', 'sha256')},
|
||||
},
|
||||
),
|
||||
|
||||
@@ -0,0 +1,45 @@
|
||||
# Generated by Django 6.0 on 2025-12-28 05:12
|
||||
|
||||
import django.db.models.deletion
|
||||
import uuid
|
||||
from django.db import migrations, models
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Re-declare primary keys and help texts for the ``machine`` app models.

    Every ``id`` column touched here becomes a non-editable, unique UUID
    primary key whose default is ``uuid.uuid7``. The ``binary.dependency``
    foreign key and ``machine.config`` JSON field are altered to carry
    their help texts.

    NOTE(review): ``uuid.uuid7`` only exists in the standard library from
    Python 3.14 onward, while the follow-up migration in this commit uses
    ``archivebox.uuid_compat.uuid7`` -- confirm the minimum supported
    Python version before relying on this default.
    """

    dependencies = [
        ('machine', '0002_rename_custom_cmds_to_overrides'),
    ]

    operations = [
        # --- Dependency ---------------------------------------------------
        migrations.AlterField(
            model_name='dependency',
            name='id',
            field=models.UUIDField(
                default=uuid.uuid7,
                editable=False,
                primary_key=True,
                serialize=False,
                unique=True,
            ),
        ),
        # --- Binary -------------------------------------------------------
        migrations.AlterField(
            model_name='binary',
            name='dependency',
            field=models.ForeignKey(
                blank=True,
                help_text='The Dependency this binary satisfies',
                null=True,
                on_delete=django.db.models.deletion.SET_NULL,
                related_name='binary_set',
                to='machine.dependency',
            ),
        ),
        migrations.AlterField(
            model_name='binary',
            name='id',
            field=models.UUIDField(
                default=uuid.uuid7,
                editable=False,
                primary_key=True,
                serialize=False,
                unique=True,
            ),
        ),
        # --- Machine ------------------------------------------------------
        migrations.AlterField(
            model_name='machine',
            name='config',
            field=models.JSONField(
                blank=True,
                default=dict,
                help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)',
            ),
        ),
        migrations.AlterField(
            model_name='machine',
            name='id',
            field=models.UUIDField(
                default=uuid.uuid7,
                editable=False,
                primary_key=True,
                serialize=False,
                unique=True,
            ),
        ),
        # --- NetworkInterface ---------------------------------------------
        migrations.AlterField(
            model_name='networkinterface',
            name='id',
            field=models.UUIDField(
                default=uuid.uuid7,
                editable=False,
                primary_key=True,
                serialize=False,
                unique=True,
            ),
        ),
    ]
|
||||
@@ -0,0 +1,56 @@
|
||||
# Generated migration - Clean slate for Binary model
|
||||
# Drops old InstalledBinary and Dependency tables, creates new Binary table
|
||||
|
||||
from django.db import migrations, models
|
||||
import django.utils.timezone
|
||||
import archivebox.uuid_compat
|
||||
|
||||
|
||||
def drop_old_tables(apps, schema_editor):
    """Remove the legacy binary/dependency tables with raw SQL.

    Args:
        apps: Historical app registry supplied by ``RunPython`` (unused).
        schema_editor: Object whose ``execute()`` receives each SQL statement.

    Each statement uses ``IF EXISTS`` so a missing table is not an error.
    """
    legacy_tables = (
        'machine_installedbinary',
        'machine_dependency',
        'machine_binary',  # In case rename happened
    )
    for table_name in legacy_tables:
        schema_editor.execute(f'DROP TABLE IF EXISTS {table_name}')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Clean-slate migration for the ``Binary`` model.

    First runs ``drop_old_tables`` (the reverse direction is a no-op), then
    creates the ``Binary`` model from scratch and adds a composite index
    over ``(machine, name, abspath, version, sha256)``.
    """

    dependencies = [
        ('machine', '0003_alter_dependency_id_alter_installedbinary_dependency_and_more'),
    ]

    operations = [
        # Drop old tables using raw SQL
        migrations.RunPython(drop_old_tables, migrations.RunPython.noop),

        # Create new Binary model from scratch
        migrations.CreateModel(
            name='Binary',
            fields=[
                # Identity / timestamps
                ('id', models.UUIDField(
                    default=archivebox.uuid_compat.uuid7,
                    editable=False,
                    primary_key=True,
                    serialize=False,
                    unique=True,
                )),
                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
                ('modified_at', models.DateTimeField(auto_now=True)),

                # What should be installed and by whom
                ('name', models.CharField(blank=True, db_index=True, default=None, max_length=63)),
                ('binproviders', models.CharField(
                    blank=True,
                    default='env',
                    help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env',
                    max_length=127,
                )),
                ('overrides', models.JSONField(
                    blank=True,
                    default=dict,
                    help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}",
                )),

                # Installation results
                ('binprovider', models.CharField(
                    blank=True,
                    default=None,
                    help_text='Provider that successfully installed this binary',
                    max_length=31,
                )),
                ('abspath', models.CharField(blank=True, default=None, max_length=255)),
                ('version', models.CharField(blank=True, default=None, max_length=32)),
                ('sha256', models.CharField(blank=True, default=None, max_length=64)),

                # State machine bookkeeping
                ('status', models.CharField(
                    choices=[
                        ('queued', 'Queued'),
                        ('started', 'Started'),
                        ('succeeded', 'Succeeded'),
                        ('failed', 'Failed'),
                    ],
                    db_index=True,
                    default='queued',
                    max_length=16,
                )),
                ('retry_at', models.DateTimeField(
                    blank=True,
                    db_index=True,
                    default=django.utils.timezone.now,
                    help_text='When to retry this binary installation',
                    null=True,
                )),
                ('output_dir', models.CharField(
                    blank=True,
                    default='',
                    help_text='Directory where installation hook logs are stored',
                    max_length=255,
                )),

                # Health counters
                ('num_uses_failed', models.PositiveIntegerField(default=0)),
                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),

                # Owning machine
                ('machine', models.ForeignKey(
                    blank=True,
                    default=None,
                    on_delete=models.deletion.CASCADE,
                    to='machine.machine',
                )),
            ],
            options={
                'verbose_name': 'Binary',
                'verbose_name_plural': 'Binaries',
            },
        ),
        migrations.AddIndex(
            model_name='binary',
            index=models.Index(
                fields=['machine', 'name', 'abspath', 'version', 'sha256'],
                name='machine_bin_machine_idx',
            ),
        ),
    ]
|
||||
@@ -17,7 +17,7 @@ _CURRENT_BINARIES = {}
|
||||
|
||||
MACHINE_RECHECK_INTERVAL = 7 * 24 * 60 * 60
|
||||
NETWORK_INTERFACE_RECHECK_INTERVAL = 1 * 60 * 60
|
||||
INSTALLED_BINARY_RECHECK_INTERVAL = 1 * 30 * 60
|
||||
BINARY_RECHECK_INTERVAL = 1 * 30 * 60
|
||||
|
||||
|
||||
class MachineManager(models.Manager):
|
||||
@@ -63,6 +63,31 @@ class Machine(ModelWithHealthStats):
|
||||
)
|
||||
return _CURRENT_MACHINE
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: dict, overrides: dict = None):
|
||||
"""
|
||||
Update Machine config from JSONL record.
|
||||
|
||||
Args:
|
||||
record: JSONL record with '_method': 'update', 'key': '...', 'value': '...'
|
||||
overrides: Not used
|
||||
|
||||
Returns:
|
||||
Machine instance or None
|
||||
"""
|
||||
method = record.get('_method')
|
||||
if method == 'update':
|
||||
key = record.get('key')
|
||||
value = record.get('value')
|
||||
if key and value:
|
||||
machine = Machine.current()
|
||||
if not machine.config:
|
||||
machine.config = {}
|
||||
machine.config[key] = value
|
||||
machine.save(update_fields=['config'])
|
||||
return machine
|
||||
return None
|
||||
|
||||
|
||||
class NetworkInterfaceManager(models.Manager):
|
||||
def current(self) -> 'NetworkInterface':
|
||||
@@ -108,179 +133,13 @@ class NetworkInterface(ModelWithHealthStats):
|
||||
return _CURRENT_INTERFACE
|
||||
|
||||
|
||||
class DependencyManager(models.Manager):
|
||||
def get_or_create_for_extractor(self, bin_name: str, bin_providers: str = '*', overrides: dict = None, config: dict = None) -> 'Dependency':
|
||||
"""Get or create a Dependency for an extractor's binary."""
|
||||
dependency, created = self.get_or_create(
|
||||
bin_name=bin_name,
|
||||
defaults={
|
||||
'bin_providers': bin_providers,
|
||||
'overrides': overrides or {},
|
||||
'config': config or {},
|
||||
}
|
||||
)
|
||||
return dependency
|
||||
|
||||
|
||||
class Dependency(models.Model):
|
||||
"""
|
||||
Defines a binary dependency needed by an extractor.
|
||||
|
||||
This model tracks what binaries need to be installed and how to install them.
|
||||
Provider hooks listen for Dependency creation events and attempt installation.
|
||||
|
||||
Example:
|
||||
Dependency.objects.get_or_create(
|
||||
bin_name='wget',
|
||||
bin_providers='apt,brew,pip,env',
|
||||
overrides={
|
||||
'apt': {'packages': ['wget']},
|
||||
'brew': {'packages': ['wget']},
|
||||
'pip': {'packages': ['wget']},
|
||||
}
|
||||
)
|
||||
"""
|
||||
|
||||
BIN_PROVIDER_CHOICES = (
|
||||
('*', 'Any'),
|
||||
('apt', 'apt'),
|
||||
('brew', 'brew'),
|
||||
('pip', 'pip'),
|
||||
('npm', 'npm'),
|
||||
('gem', 'gem'),
|
||||
('nix', 'nix'),
|
||||
('env', 'env (already in PATH)'),
|
||||
('custom', 'custom'),
|
||||
)
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
|
||||
bin_name = models.CharField(max_length=63, unique=True, db_index=True,
|
||||
help_text="Binary executable name (e.g., wget, yt-dlp, chromium)")
|
||||
bin_providers = models.CharField(max_length=127, default='*',
|
||||
help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any")
|
||||
overrides = models.JSONField(default=dict, blank=True,
|
||||
help_text="JSON map matching abx-pkg Binary.overrides format: {'pip': {'packages': ['pkg']}, 'apt': {'packages': ['pkg']}}")
|
||||
config = models.JSONField(default=dict, blank=True,
|
||||
help_text="JSON map of env var config to use during install")
|
||||
|
||||
objects: DependencyManager = DependencyManager()
|
||||
|
||||
class Meta:
|
||||
verbose_name = 'Dependency'
|
||||
verbose_name_plural = 'Dependencies'
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f'{self.bin_name} (providers: {self.bin_providers})'
|
||||
|
||||
def allows_provider(self, provider: str) -> bool:
|
||||
"""Check if this dependency allows the given provider."""
|
||||
if self.bin_providers == '*':
|
||||
return True
|
||||
return provider in self.bin_providers.split(',')
|
||||
|
||||
def get_overrides_for_provider(self, provider: str) -> dict | None:
|
||||
"""Get the overrides for a provider, or None if not specified."""
|
||||
return self.overrides.get(provider)
|
||||
|
||||
@property
|
||||
def installed_binaries(self):
|
||||
"""Get all InstalledBinary records for this dependency."""
|
||||
return InstalledBinary.objects.filter(dependency=self)
|
||||
|
||||
@property
|
||||
def is_installed(self) -> bool:
|
||||
"""Check if at least one valid InstalledBinary exists for this dependency."""
|
||||
return self.installed_binaries.filter(abspath__isnull=False).exclude(abspath='').exists()
|
||||
|
||||
def run(self):
|
||||
"""
|
||||
Execute dependency installation by running all on_Dependency hooks.
|
||||
|
||||
Each hook checks if it can handle this dependency and installs if possible.
|
||||
Returns the InstalledBinary record on success, None on failure.
|
||||
"""
|
||||
import json
|
||||
from pathlib import Path
|
||||
from django.conf import settings
|
||||
|
||||
# Check if already installed
|
||||
if self.is_installed:
|
||||
return self.installed_binaries.first()
|
||||
|
||||
# Import here to avoid circular dependency
|
||||
from archivebox.hooks import run_hooks
|
||||
|
||||
# Create output directory
|
||||
DATA_DIR = getattr(settings, 'DATA_DIR', Path.cwd())
|
||||
output_dir = Path(DATA_DIR) / 'tmp' / f'dependency_{self.id}'
|
||||
output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Build kwargs for hooks - pass overrides as JSON string
|
||||
hook_kwargs = {
|
||||
'dependency_id': str(self.id),
|
||||
'bin_name': self.bin_name,
|
||||
'bin_providers': self.bin_providers,
|
||||
'overrides': json.dumps(self.overrides) if self.overrides else None,
|
||||
}
|
||||
|
||||
# Run all on_Dependency hooks - each decides if it can handle this
|
||||
results = run_hooks(
|
||||
event_name='Dependency',
|
||||
output_dir=output_dir,
|
||||
timeout=600,
|
||||
**hook_kwargs
|
||||
)
|
||||
|
||||
# Process results - parse JSONL and create InstalledBinary records
|
||||
for result in results:
|
||||
if result['returncode'] != 0:
|
||||
continue
|
||||
|
||||
# Parse JSONL output
|
||||
for line in result['stdout'].strip().split('\n'):
|
||||
if not line.strip():
|
||||
continue
|
||||
|
||||
try:
|
||||
obj = json.loads(line)
|
||||
if obj.get('type') == 'InstalledBinary':
|
||||
# Create InstalledBinary record
|
||||
if not obj.get('name') or not obj.get('abspath') or not obj.get('version'):
|
||||
continue
|
||||
|
||||
machine = Machine.current()
|
||||
installed_binary, _ = InstalledBinary.objects.update_or_create(
|
||||
machine=machine,
|
||||
name=obj['name'],
|
||||
defaults={
|
||||
'abspath': obj['abspath'],
|
||||
'version': obj['version'],
|
||||
'sha256': obj.get('sha256') or '',
|
||||
'binprovider': obj.get('binprovider') or 'env',
|
||||
'dependency': self,
|
||||
}
|
||||
)
|
||||
|
||||
# Success! Return the installed binary
|
||||
if self.is_installed:
|
||||
return installed_binary
|
||||
|
||||
except json.JSONDecodeError:
|
||||
continue
|
||||
|
||||
# Failed to install with any hook
|
||||
return None
|
||||
|
||||
|
||||
class InstalledBinaryManager(models.Manager):
|
||||
def get_from_db_or_cache(self, name: str, abspath: str = '', version: str = '', sha256: str = '', binprovider: str = 'env') -> 'InstalledBinary':
|
||||
"""Get or create an InstalledBinary record from the database or cache."""
|
||||
class BinaryManager(models.Manager):
|
||||
def get_from_db_or_cache(self, name: str, abspath: str = '', version: str = '', sha256: str = '', binprovider: str = 'env') -> 'Binary':
|
||||
"""Get or create an Binary record from the database or cache."""
|
||||
global _CURRENT_BINARIES
|
||||
cached = _CURRENT_BINARIES.get(name)
|
||||
if cached and timezone.now() < cached.modified_at + timedelta(seconds=INSTALLED_BINARY_RECHECK_INTERVAL):
|
||||
if cached and timezone.now() < cached.modified_at + timedelta(seconds=BINARY_RECHECK_INTERVAL):
|
||||
return cached
|
||||
_CURRENT_BINARIES[name], _ = self.update_or_create(
|
||||
machine=Machine.objects.current(), name=name, binprovider=binprovider,
|
||||
@@ -288,8 +147,8 @@ class InstalledBinaryManager(models.Manager):
|
||||
)
|
||||
return _CURRENT_BINARIES[name]
|
||||
|
||||
def get_valid_binary(self, name: str, machine: 'Machine | None' = None) -> 'InstalledBinary | None':
|
||||
"""Get a valid InstalledBinary for the given name on the current machine, or None if not found."""
|
||||
def get_valid_binary(self, name: str, machine: 'Machine | None' = None) -> 'Binary | None':
|
||||
"""Get a valid Binary for the given name on the current machine, or None if not found."""
|
||||
machine = machine or Machine.current()
|
||||
return self.filter(
|
||||
machine=machine,
|
||||
@@ -297,35 +156,63 @@ class InstalledBinaryManager(models.Manager):
|
||||
).exclude(abspath='').exclude(abspath__isnull=True).order_by('-modified_at').first()
|
||||
|
||||
|
||||
class InstalledBinary(ModelWithHealthStats):
|
||||
class Binary(ModelWithHealthStats):
|
||||
"""
|
||||
Tracks an installed binary on a specific machine.
|
||||
Tracks a binary on a specific machine.
|
||||
|
||||
Each InstalledBinary is optionally linked to a Dependency that defines
|
||||
how the binary should be installed. The `is_valid` property indicates
|
||||
whether the binary is usable (has both abspath and version).
|
||||
Follows the unified state machine pattern:
|
||||
- queued: Binary needs to be installed
|
||||
- started: Installation in progress
|
||||
- succeeded: Binary installed successfully (abspath, version, sha256 populated)
|
||||
- failed: Installation failed
|
||||
|
||||
State machine calls run() which executes on_Binary__install_* hooks
|
||||
to install the binary using the specified providers.
|
||||
"""
|
||||
|
||||
class StatusChoices(models.TextChoices):
|
||||
QUEUED = 'queued', 'Queued'
|
||||
STARTED = 'started', 'Started'
|
||||
SUCCEEDED = 'succeeded', 'Succeeded'
|
||||
FAILED = 'failed', 'Failed'
|
||||
|
||||
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
|
||||
created_at = models.DateTimeField(default=timezone.now, db_index=True)
|
||||
modified_at = models.DateTimeField(auto_now=True)
|
||||
machine = models.ForeignKey(Machine, on_delete=models.CASCADE, default=None, null=False, blank=True)
|
||||
dependency = models.ForeignKey(Dependency, on_delete=models.SET_NULL, null=True, blank=True,
|
||||
related_name='installedbinary_set',
|
||||
help_text="The Dependency this binary satisfies")
|
||||
name = models.CharField(max_length=63, default=None, null=False, blank=True, db_index=True)
|
||||
binprovider = models.CharField(max_length=31, default=None, null=False, blank=True)
|
||||
abspath = models.CharField(max_length=255, default=None, null=False, blank=True)
|
||||
version = models.CharField(max_length=32, default=None, null=False, blank=True)
|
||||
sha256 = models.CharField(max_length=64, default=None, null=False, blank=True)
|
||||
machine = models.ForeignKey(Machine, on_delete=models.CASCADE, null=False)
|
||||
|
||||
# Binary metadata
|
||||
name = models.CharField(max_length=63, default='', null=False, blank=True, db_index=True)
|
||||
binproviders = models.CharField(max_length=127, default='env', null=False, blank=True,
|
||||
help_text="Comma-separated list of allowed providers: apt,brew,pip,npm,env")
|
||||
overrides = models.JSONField(default=dict, blank=True,
|
||||
help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}")
|
||||
|
||||
# Installation results (populated after installation)
|
||||
binprovider = models.CharField(max_length=31, default='', null=False, blank=True,
|
||||
help_text="Provider that successfully installed this binary")
|
||||
abspath = models.CharField(max_length=255, default='', null=False, blank=True)
|
||||
version = models.CharField(max_length=32, default='', null=False, blank=True)
|
||||
sha256 = models.CharField(max_length=64, default='', null=False, blank=True)
|
||||
|
||||
# State machine fields
|
||||
status = models.CharField(max_length=16, choices=StatusChoices.choices, default=StatusChoices.QUEUED, db_index=True)
|
||||
retry_at = models.DateTimeField(default=timezone.now, null=True, blank=True, db_index=True,
|
||||
help_text="When to retry this binary installation")
|
||||
output_dir = models.CharField(max_length=255, default='', null=False, blank=True,
|
||||
help_text="Directory where installation hook logs are stored")
|
||||
|
||||
# Health stats
|
||||
num_uses_failed = models.PositiveIntegerField(default=0)
|
||||
num_uses_succeeded = models.PositiveIntegerField(default=0)
|
||||
|
||||
objects: InstalledBinaryManager = InstalledBinaryManager()
|
||||
state_machine_name: str = 'machine.statemachines.BinaryMachine'
|
||||
|
||||
objects: BinaryManager = BinaryManager()
|
||||
|
||||
class Meta:
|
||||
verbose_name = 'Installed Binary'
|
||||
verbose_name_plural = 'Installed Binaries'
|
||||
verbose_name = 'Binary'
|
||||
verbose_name_plural = 'Binaries'
|
||||
unique_together = (('machine', 'name', 'abspath', 'version', 'sha256'),)
|
||||
|
||||
def __str__(self) -> str:
|
||||
@@ -347,4 +234,189 @@ class InstalledBinary(ModelWithHealthStats):
|
||||
'is_valid': self.is_valid,
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_jsonl(record: dict, overrides: dict = None):
|
||||
"""
|
||||
Create/update Binary from JSONL record.
|
||||
|
||||
Handles two cases:
|
||||
1. From binaries.jsonl: creates queued binary with name, binproviders, overrides
|
||||
2. From hook output: updates binary with abspath, version, sha256, binprovider
|
||||
|
||||
Args:
|
||||
record: JSONL record with 'name' and either:
|
||||
- 'binproviders', 'overrides' (from binaries.jsonl)
|
||||
- 'abspath', 'version', 'sha256', 'binprovider' (from hook output)
|
||||
overrides: Not used
|
||||
|
||||
Returns:
|
||||
Binary instance or None
|
||||
"""
|
||||
name = record.get('name')
|
||||
if not name:
|
||||
return None
|
||||
|
||||
machine = Machine.current()
|
||||
overrides = overrides or {}
|
||||
|
||||
# Case 1: From binaries.jsonl - create queued binary
|
||||
if 'binproviders' in record or ('overrides' in record and not record.get('abspath')):
|
||||
binary, created = Binary.objects.get_or_create(
|
||||
machine=machine,
|
||||
name=name,
|
||||
defaults={
|
||||
'binproviders': record.get('binproviders', 'env'),
|
||||
'overrides': record.get('overrides', {}),
|
||||
'status': Binary.StatusChoices.QUEUED,
|
||||
'retry_at': timezone.now(),
|
||||
}
|
||||
)
|
||||
return binary
|
||||
|
||||
# Case 2: From hook output - update with installation results
|
||||
abspath = record.get('abspath')
|
||||
version = record.get('version')
|
||||
if not abspath or not version:
|
||||
return None
|
||||
|
||||
binary, _ = Binary.objects.update_or_create(
|
||||
machine=machine,
|
||||
name=name,
|
||||
defaults={
|
||||
'abspath': abspath,
|
||||
'version': version,
|
||||
'sha256': record.get('sha256', ''),
|
||||
'binprovider': record.get('binprovider', 'env'),
|
||||
'status': Binary.StatusChoices.SUCCEEDED,
|
||||
'retry_at': None,
|
||||
}
|
||||
)
|
||||
return binary
|
||||
|
||||
@property
def OUTPUT_DIR(self):
    """Return the output directory for this binary installation.

    The path is ``<DATA_DIR>/machines/<machine_id>/binaries/<name>/<id>``,
    where ``DATA_DIR`` comes from Django settings and falls back to the
    current working directory when the setting is absent.
    """
    from pathlib import Path
    from django.conf import settings

    data_root = Path(getattr(settings, 'DATA_DIR', Path.cwd()))
    return data_root.joinpath(
        'machines',
        str(self.machine_id),
        'binaries',
        self.name,
        str(self.id),
    )
|
||||
|
||||
def update_for_workers(self, **kwargs):
    """
    Assign the given fields, bump modified_at, and save.

    Every keyword argument is set as an attribute on this instance.
    modified_at is then set to the current time (after the keyword
    arguments, so it overrides any modified_at passed in) and the model
    is always saved. Per the original author's note, the timestamp bump
    is there so workers pick up the change.
    """
    for field_name, new_value in kwargs.items():
        setattr(self, field_name, new_value)

    self.modified_at = timezone.now()
    self.save()
|
||||
|
||||
def run(self):
    """
    Execute binary installation by running on_Binary__install_* hooks.

    Called by BinaryMachine when entering 'started' state.
    Runs the hooks returned by discover_hooks('Binary') in order; each hook
    receives the binary's name and binproviders and decides whether it can
    handle this binary. The first hook that exits with returncode 0 and
    whose stdout.log contains a JSON line of type 'Binary' with a non-empty
    'abspath' wins.

    Side effects:
        - creates the output directory (plus one subdirectory per plugin)
          and stores its path in self.output_dir
        - on success sets abspath/version/sha256/binprovider and
          status=SUCCEEDED, then saves
        - otherwise sets status=FAILED and saves

    Returns:
        None
    """
    import json
    from archivebox.hooks import discover_hooks, run_hook

    # Create output directory
    output_dir = self.OUTPUT_DIR
    output_dir.mkdir(parents=True, exist_ok=True)
    self.output_dir = str(output_dir)
    self.save()

    # Discover ALL on_Binary__install_* hooks
    hooks = discover_hooks('Binary')
    if not hooks:
        self.status = self.StatusChoices.FAILED
        self.save()
        return

    # The hook arguments do not depend on the hook, so build them once.
    hook_kwargs = {
        'binary_id': str(self.id),
        'machine_id': str(self.machine_id),
        'name': self.name,
        'binproviders': self.binproviders,
    }
    # Add overrides as JSON string if present
    if self.overrides:
        hook_kwargs['overrides'] = json.dumps(self.overrides)

    # Run each hook - they decide if they can handle this binary
    for hook in hooks:
        plugin_output_dir = output_dir / hook.parent.name
        plugin_output_dir.mkdir(parents=True, exist_ok=True)

        result = run_hook(
            hook,
            output_dir=plugin_output_dir,
            timeout=600,  # 10 min timeout
            **hook_kwargs
        )

        # None means a background hook (unlikely for binary installation);
        # a non-zero returncode means failed or skipped. Try the next hook.
        if result is None or result['returncode'] != 0:
            continue

        # Parse JSONL output to check for successful installation
        stdout_file = plugin_output_dir / 'stdout.log'
        if not stdout_file.exists():
            continue

        for raw_line in stdout_file.read_text().splitlines():
            line = raw_line.strip()
            # Only lines that look like JSON objects are candidates.
            if not line.startswith('{'):
                continue

            try:
                record = json.loads(line)
            except json.JSONDecodeError:
                continue

            if record.get('type') == 'Binary' and record.get('abspath'):
                # Update self from successful installation. Keys may be
                # present with an explicit null, so coalesce with `or`
                # to keep None out of the non-nullable columns.
                self.abspath = record['abspath']
                self.version = record.get('version') or ''
                self.sha256 = record.get('sha256') or ''
                self.binprovider = record.get('binprovider') or 'env'
                self.status = self.StatusChoices.SUCCEEDED
                self.save()
                return

    # No hook succeeded
    self.status = self.StatusChoices.FAILED
    self.save()
|
||||
|
||||
def cleanup(self):
    """
    Kill any background installation hooks left under OUTPUT_DIR.

    Does nothing when the output directory does not exist. Otherwise each
    subdirectory is checked for a 'hook.pid' file, and every pid file found
    is passed to kill_process(). Per the original author's note, binary
    installations normally run in the foreground, so this mostly exists
    for consistency with the other models.
    """
    from pathlib import Path
    from archivebox.hooks import kill_process

    root: Path = self.OUTPUT_DIR
    if root.exists():
        for entry in root.iterdir():
            # Plain files at the top level are not plugin directories.
            if entry.is_dir():
                pid_file = entry / 'hook.pid'
                if pid_file.exists():
                    kill_process(pid_file)
|
||||
|
||||
|
||||
|
||||
112
archivebox/machine/statemachines.py
Normal file
112
archivebox/machine/statemachines.py
Normal file
@@ -0,0 +1,112 @@
|
||||
__package__ = 'archivebox.machine'
|
||||
|
||||
from datetime import timedelta
|
||||
from django.utils import timezone
|
||||
from django.db.models import F
|
||||
|
||||
from statemachine import State, StateMachine
|
||||
|
||||
from machine.models import Binary
|
||||
|
||||
|
||||
class BinaryMachine(StateMachine, strict_states=True):
    """
    Drives a Binary through its installation lifecycle.

    States (values come from Binary.StatusChoices):
    - queued:    initial state, binary still needs to be installed
    - started:   installation hooks are running
    - succeeded: final state, installation succeeded
    - failed:    final state, installation failed

    The single `tick` event moves the binary forward whenever the matching
    condition method returns True, and otherwise leaves it where it is.
    """

    model: Binary

    # States
    queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
    started = State(value=Binary.StatusChoices.STARTED)
    succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=Binary.StatusChoices.FAILED, final=True)

    # Tick event: the transitions are declared in the original order.
    tick = (
        queued.to.itself(unless='can_start')
        | queued.to(started, cond='can_start')
        | started.to.itself(unless='is_finished')
        | started.to(succeeded, cond='is_succeeded')
        | started.to(failed, cond='is_failed')
    )

    def __init__(self, binary, *args, **kwargs):
        # Keep a direct handle on the model; conditions and callbacks use it.
        self.binary = binary
        super().__init__(binary, *args, **kwargs)

    def __repr__(self) -> str:
        return f'Binary[{self.binary.id}]'

    def __str__(self) -> str:
        return repr(self)

    # ----- transition conditions -----------------------------------------

    def can_start(self) -> bool:
        """True when the binary has both a name and a provider list."""
        binary = self.binary
        return bool(binary.name and binary.binproviders)

    def is_succeeded(self) -> bool:
        """True when the binary's status is SUCCEEDED (run() sets it)."""
        return self.binary.status == Binary.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """True when the binary's status is FAILED (run() sets it)."""
        return self.binary.status == Binary.StatusChoices.FAILED

    def is_finished(self) -> bool:
        """True when the installation ended, whether it succeeded or failed."""
        return self.is_succeeded() or self.is_failed()

    # ----- state entry callbacks -----------------------------------------

    @queued.enter
    def enter_queued(self):
        """Mark the binary as queued and due for processing now."""
        self.binary.update_for_workers(
            retry_at=timezone.now(),
            status=Binary.StatusChoices.QUEUED,
        )

    @started.enter
    def enter_started(self):
        """Mark the binary as started, then run the installation hooks."""
        # Push retry_at five minutes out while the installation runs.
        # NOTE(review): Binary.run() gives each hook a 600 second timeout,
        # which is longer than this 300 second window -- confirm the
        # intended lock duration.
        lock_until = timezone.now() + timedelta(seconds=300)
        self.binary.update_for_workers(
            retry_at=lock_until,
            status=Binary.StatusChoices.STARTED,
        )

        # run() sets the status to succeeded/failed; save again afterwards.
        self.binary.run()
        self.binary.save()

    @succeeded.enter
    def enter_succeeded(self):
        """Record the success, clear retry_at, and bump the success counter."""
        self.binary.update_for_workers(
            retry_at=None,
            status=Binary.StatusChoices.SUCCEEDED,
        )

        # The counter is incremented with an F() expression in the database.
        this_binary = Binary.objects.filter(pk=self.binary.pk)
        this_binary.update(num_uses_succeeded=F('num_uses_succeeded') + 1)

    @failed.enter
    def enter_failed(self):
        """Record the failure, clear retry_at, and bump the failure counter."""
        self.binary.update_for_workers(
            retry_at=None,
            status=Binary.StatusChoices.FAILED,
        )

        # The counter is incremented with an F() expression in the database.
        this_binary = Binary.objects.filter(pk=self.binary.pk)
        this_binary.update(num_uses_failed=F('num_uses_failed') + 1)
|
||||
Reference in New Issue
Block a user