This commit is contained in:
Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

View File

@@ -4,7 +4,7 @@ from django.contrib import admin
from django.utils.html import format_html
from archivebox.base_models.admin import BaseModelAdmin, ConfigEditorMixin
from machine.models import Machine, NetworkInterface, Binary
from archivebox.machine.models import Machine, NetworkInterface, Binary
class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):

View File

@@ -5,11 +5,11 @@ from django.apps import AppConfig
class MachineConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'machine'
name = 'archivebox.machine'
verbose_name = 'Machine Info'
def register_admin(admin_site):
from machine.admin import register_admin
from archivebox.machine.admin import register_admin
register_admin(admin_site)

View File

@@ -14,9 +14,9 @@ class Migration(migrations.Migration):
replaces = [
('machine', '0001_initial'),
('machine', '0002_alter_machine_stats_binary'),
('machine', '0003_alter_binary_options_and_more'),
('machine', '0004_alter_binary_abspath_and_more'),
('machine', '0002_alter_machine_stats_installedbinary'),
('machine', '0003_alter_installedbinary_options_and_more'),
('machine', '0004_alter_installedbinary_abspath_and_more'),
]
dependencies = []
@@ -70,22 +70,7 @@ class Migration(migrations.Migration):
'unique_together': {('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server')},
},
),
migrations.CreateModel(
name='Dependency',
fields=[
('id', models.UUIDField(default=uuid4, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('bin_name', models.CharField(db_index=True, max_length=63, unique=True)),
('bin_providers', models.CharField(default='*', max_length=127)),
('custom_cmds', models.JSONField(blank=True, default=dict)),
('config', models.JSONField(blank=True, default=dict)),
],
options={
'verbose_name': 'Dependency',
'verbose_name_plural': 'Dependencies',
},
),
# Dependency model removed - not needed anymore
migrations.CreateModel(
name='Binary',
fields=[
@@ -100,7 +85,7 @@ class Migration(migrations.Migration):
('version', models.CharField(blank=True, default=None, max_length=32)),
('sha256', models.CharField(blank=True, default=None, max_length=64)),
('machine', models.ForeignKey(blank=True, default=None, on_delete=django.db.models.deletion.CASCADE, to='machine.machine')),
('dependency', models.ForeignKey(blank=True, null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='binary_set', to='machine.dependency')),
# dependency FK removed - Dependency model deleted
],
options={
'verbose_name': 'Binary',

View File

@@ -1,6 +1,8 @@
# Generated manually on 2025-12-26
# NOTE: This migration is intentionally empty but kept for dependency chain
# The Dependency model was removed in 0004, so all operations have been stripped
from django.db import migrations, models
from django.db import migrations
class Migration(migrations.Migration):
@@ -10,29 +12,5 @@ class Migration(migrations.Migration):
]
operations = [
migrations.RenameField(
model_name='dependency',
old_name='custom_cmds',
new_name='overrides',
),
migrations.AlterField(
model_name='dependency',
name='bin_name',
field=models.CharField(db_index=True, help_text='Binary executable name (e.g., wget, yt-dlp, chromium)', max_length=63, unique=True),
),
migrations.AlterField(
model_name='dependency',
name='bin_providers',
field=models.CharField(default='*', help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,gem,nix,custom or * for any', max_length=127),
),
migrations.AlterField(
model_name='dependency',
name='overrides',
field=models.JSONField(blank=True, default=dict, help_text="JSON map matching abx-pkg Binary.overrides format: {'pip': {'packages': ['pkg']}, 'apt': {'packages': ['pkg']}}"),
),
migrations.AlterField(
model_name='dependency',
name='config',
field=models.JSONField(blank=True, default=dict, help_text='JSON map of env var config to use during install'),
),
# All Dependency operations removed - model deleted in 0004
]

View File

@@ -1,8 +1,8 @@
# Generated by Django 6.0 on 2025-12-28 05:12
# NOTE: This migration is intentionally empty but kept for dependency chain
# The Dependency model was removed in 0004, all operations stripped
import django.db.models.deletion
from archivebox import uuid_compat
from django.db import migrations, models
from django.db import migrations
class Migration(migrations.Migration):
@@ -12,34 +12,6 @@ class Migration(migrations.Migration):
]
operations = [
migrations.AlterField(
model_name='dependency',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='binary',
name='dependency',
field=models.ForeignKey(blank=True, help_text='The Dependency this binary satisfies', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='binary_set', to='machine.dependency'),
),
migrations.AlterField(
model_name='binary',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='machine',
name='config',
field=models.JSONField(blank=True, default=dict, help_text='Machine-specific config overrides (e.g., resolved binary paths like WGET_BINARY)'),
),
migrations.AlterField(
model_name='machine',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='networkinterface',
name='id',
field=models.UUIDField(default=uuid_compat.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
# All operations removed - Dependency model deleted in 0004
# This is a stub migration for users upgrading from old dev versions
]

View File

@@ -0,0 +1,28 @@
# Generated migration - removes Dependency model entirely
# NOTE: This is a cleanup migration for users upgrading from old dev versions
# that had the Dependency model. Fresh installs never create this table.
from django.db import migrations
def drop_dependency_table(apps, schema_editor):
    """
    Remove tables left behind by old dev versions of the machine app.

    Every statement uses DROP TABLE IF EXISTS, so running this repeatedly,
    or on a database that never had these tables, is harmless.
    machine_binary (the table of the current Binary model) is deliberately
    NOT listed here and is left untouched.

    Args:
        apps: historical app registry passed in by RunPython (unused).
        schema_editor: schema editor whose execute() runs each raw SQL statement.
    """
    cleanup_statements = (
        # Table of the removed Dependency model
        'DROP TABLE IF EXISTS machine_dependency',
        # Old InstalledBinary table, in case it somehow still exists
        'DROP TABLE IF EXISTS machine_installedbinary',
    )
    for sql in cleanup_statements:
        schema_editor.execute(sql)
class Migration(migrations.Migration):
    """Cleanup migration: runs drop_dependency_table; the reverse direction is a no-op."""

    # Applied directly after the (now empty) 0003 stub migration.
    dependencies = [
        (
            'machine',
            '0003_alter_dependency_id_alter_installedbinary_dependency_and_more',
        ),
    ]

    operations = [
        migrations.RunPython(
            code=drop_dependency_table,
            # Dropped tables are not recreated when migrating backwards.
            reverse_code=migrations.RunPython.noop,
        ),
    ]

View File

@@ -1,56 +0,0 @@
# Generated migration - Clean slate for Binary model
# Drops old InstalledBinary and Dependency tables, creates new Binary table
from django.db import migrations, models
import django.utils.timezone
import archivebox.uuid_compat
def drop_old_tables(apps, schema_editor):
    """
    Drop the legacy machine tables with raw SQL.

    Args:
        apps: historical app registry passed in by RunPython (unused).
        schema_editor: schema editor whose execute() runs each raw SQL statement.
    """
    legacy_table_drops = (
        'DROP TABLE IF EXISTS machine_installedbinary',
        'DROP TABLE IF EXISTS machine_dependency',
        # In case the rename to machine_binary already happened
        'DROP TABLE IF EXISTS machine_binary',
    )
    for sql in legacy_table_drops:
        schema_editor.execute(sql)
class Migration(migrations.Migration):
    """Clean-slate migration: drop the old tables, then create the Binary table and its index."""

    dependencies = [
        (
            'machine',
            '0003_alter_dependency_id_alter_installedbinary_dependency_and_more',
        ),
    ]

    operations = [
        # Step 1: remove the old tables via raw SQL (nothing to undo on reverse)
        migrations.RunPython(
            code=drop_old_tables,
            reverse_code=migrations.RunPython.noop,
        ),

        # Step 2: build the new Binary model from scratch
        migrations.CreateModel(
            name='Binary',
            fields=[
                (
                    'id',
                    models.UUIDField(
                        default=archivebox.uuid_compat.uuid7,
                        editable=False,
                        primary_key=True,
                        serialize=False,
                        unique=True,
                    ),
                ),
                (
                    'created_at',
                    models.DateTimeField(db_index=True, default=django.utils.timezone.now),
                ),
                (
                    'modified_at',
                    models.DateTimeField(auto_now=True),
                ),
                (
                    'name',
                    models.CharField(blank=True, db_index=True, default=None, max_length=63),
                ),
                (
                    'binproviders',
                    models.CharField(
                        blank=True,
                        default='env',
                        help_text='Comma-separated list of allowed providers: apt,brew,pip,npm,env',
                        max_length=127,
                    ),
                ),
                (
                    'overrides',
                    models.JSONField(
                        blank=True,
                        default=dict,
                        help_text="Provider-specific overrides: {'apt': {'packages': ['pkg']}, ...}",
                    ),
                ),
                (
                    'binprovider',
                    models.CharField(
                        blank=True,
                        default=None,
                        help_text='Provider that successfully installed this binary',
                        max_length=31,
                    ),
                ),
                (
                    'abspath',
                    models.CharField(blank=True, default=None, max_length=255),
                ),
                (
                    'version',
                    models.CharField(blank=True, default=None, max_length=32),
                ),
                (
                    'sha256',
                    models.CharField(blank=True, default=None, max_length=64),
                ),
                (
                    'status',
                    models.CharField(
                        choices=[
                            ('queued', 'Queued'),
                            ('started', 'Started'),
                            ('succeeded', 'Succeeded'),
                            ('failed', 'Failed'),
                        ],
                        db_index=True,
                        default='queued',
                        max_length=16,
                    ),
                ),
                (
                    'retry_at',
                    models.DateTimeField(
                        blank=True,
                        db_index=True,
                        default=django.utils.timezone.now,
                        help_text='When to retry this binary installation',
                        null=True,
                    ),
                ),
                (
                    'output_dir',
                    models.CharField(
                        blank=True,
                        default='',
                        help_text='Directory where installation hook logs are stored',
                        max_length=255,
                    ),
                ),
                (
                    'num_uses_failed',
                    models.PositiveIntegerField(default=0),
                ),
                (
                    'num_uses_succeeded',
                    models.PositiveIntegerField(default=0),
                ),
                (
                    'machine',
                    models.ForeignKey(
                        blank=True,
                        default=None,
                        on_delete=models.deletion.CASCADE,
                        to='machine.machine',
                    ),
                ),
            ],
            options={
                'verbose_name': 'Binary',
                'verbose_name_plural': 'Binaries',
            },
        ),

        # Step 3: composite index over the identifying columns of a binary
        migrations.AddIndex(
            model_name='binary',
            index=models.Index(
                fields=['machine', 'name', 'abspath', 'version', 'sha256'],
                name='machine_bin_machine_idx',
            ),
        ),
    ]

View File

@@ -4,11 +4,14 @@ import socket
from archivebox.uuid_compat import uuid7
from datetime import timedelta
from statemachine import State, registry
from django.db import models
from django.utils import timezone
from django.utils.functional import cached_property
from archivebox.base_models.models import ModelWithHealthStats
from archivebox.workers.models import BaseStateMachine
from .detect import get_host_guid, get_os_info, get_vm_info, get_host_network, get_host_stats
_CURRENT_MACHINE = None
@@ -50,6 +53,9 @@ class Machine(ModelWithHealthStats):
objects: MachineManager = MachineManager()
networkinterface_set: models.Manager['NetworkInterface']
class Meta:
app_label = 'machine'
@classmethod
def current(cls) -> 'Machine':
global _CURRENT_MACHINE
@@ -115,6 +121,7 @@ class NetworkInterface(ModelWithHealthStats):
objects: NetworkInterfaceManager = NetworkInterfaceManager()
class Meta:
app_label = 'machine'
unique_together = (('machine', 'ip_public', 'ip_local', 'mac_address', 'dns_server'),)
@classmethod
@@ -206,11 +213,12 @@ class Binary(ModelWithHealthStats):
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
state_machine_name: str = 'machine.statemachines.BinaryMachine'
state_machine_name: str = 'machine.models.BinaryMachine'
objects: BinaryManager = BinaryManager()
class Meta:
app_label = 'machine'
verbose_name = 'Binary'
verbose_name_plural = 'Binaries'
unique_together = (('machine', 'name', 'abspath', 'version', 'sha256'),)
@@ -302,9 +310,9 @@ class Binary(ModelWithHealthStats):
DATA_DIR = getattr(settings, 'DATA_DIR', Path.cwd())
return Path(DATA_DIR) / 'machines' / str(self.machine_id) / 'binaries' / self.name / str(self.id)
def update_for_workers(self, **kwargs):
def update_and_requeue(self, **kwargs):
"""
Update binary fields for worker state machine.
Update binary fields and requeue for worker state machine.
Sets modified_at to ensure workers pick up changes.
Always saves the model after updating.
@@ -325,6 +333,10 @@ class Binary(ModelWithHealthStats):
"""
import json
from archivebox.hooks import discover_hooks, run_hook
from archivebox.config.configset import get_config
# Get merged config (Binary doesn't have crawl/snapshot context)
config = get_config(scope='global')
# Create output directory
output_dir = self.OUTPUT_DIR
@@ -333,7 +345,7 @@ class Binary(ModelWithHealthStats):
self.save()
# Discover ALL on_Binary__install_* hooks
hooks = discover_hooks('Binary')
hooks = discover_hooks('Binary', config=config)
if not hooks:
self.status = self.StatusChoices.FAILED
self.save()
@@ -361,7 +373,8 @@ class Binary(ModelWithHealthStats):
result = run_hook(
hook,
output_dir=plugin_output_dir,
timeout=600, # 10 min timeout
config=config,
timeout=600, # 10 min timeout for binary installation
**hook_kwargs
)
@@ -420,3 +433,128 @@ class Binary(ModelWithHealthStats):
kill_process(pid_file)
# =============================================================================
# Binary State Machine
# =============================================================================
class BinaryMachine(BaseStateMachine, strict_states=True):
    """
    Drive a Binary record through its installation lifecycle.

    The machine is advanced by repeated tick() calls:

        QUEUED
            The binary still needs to be installed.
            tick() moves it to STARTED once can_start() is true.

        STARTED  (enter_started)
            binary.run() does the actual work:
              * discover_hooks('Binary') collects all on_Binary__install_* hooks
              * each provider hook is tried in sequence through
                run_hook(script, output_dir, ...)
              * when a hook exits with returncode == 0, stdout.log is read and
                its JSONL parsed for a 'Binary' record with an abspath; the
                binary's abspath, version, sha256 and provider are updated,
                status is set to SUCCEEDED and run() returns
              * when no hook succeeds, status is set to FAILED
            The next tick() checks the status that run() left behind.

        SUCCEEDED / FAILED  (final)
            Decided by binary.run() from the hook results. Entering either
            state increments the health stats (num_uses_succeeded/failed).
    """

    model_attr_name = 'binary'

    # --- States: one per Binary.StatusChoices value ---------------------------
    queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
    started = State(value=Binary.StatusChoices.STARTED)
    succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=Binary.StatusChoices.FAILED, final=True)

    # --- tick event: the guard methods below select the transition ------------
    tick = (
        queued.to.itself(unless='can_start')
        | queued.to(started, cond='can_start')
        | started.to.itself(unless='is_finished')
        | started.to(succeeded, cond='is_succeeded')
        | started.to(failed, cond='is_failed')
    )

    # --- Guards ----------------------------------------------------------------

    def can_start(self) -> bool:
        """Return True when the binary has both a name and a binproviders value."""
        binary = self.binary
        return bool(binary.name and binary.binproviders)

    def is_succeeded(self) -> bool:
        """Return True when run() left the binary in the SUCCEEDED status."""
        return self.binary.status == Binary.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """Return True when run() left the binary in the FAILED status."""
        return self.binary.status == Binary.StatusChoices.FAILED

    def is_finished(self) -> bool:
        """Return True when the installation ended, successfully or not."""
        terminal_statuses = (
            Binary.StatusChoices.SUCCEEDED,
            Binary.StatusChoices.FAILED,
        )
        return self.binary.status in terminal_statuses

    # --- State entry hooks -----------------------------------------------------

    @queued.enter
    def enter_queued(self):
        """Mark the binary as QUEUED and make it eligible for pickup right away."""
        self.binary.update_and_requeue(
            retry_at=timezone.now(),
            status=Binary.StatusChoices.QUEUED,
        )

    @started.enter
    def enter_started(self):
        """Lock the binary, run its installation hooks, then persist the result."""
        # Pushing retry_at 5 minutes into the future acts as the lock while
        # installation runs.
        # NOTE(review): elsewhere in this file run_hook is called with
        # timeout=600, which is longer than this 300 s window — confirm the lock
        # cannot lapse while an install is still in progress.
        lock_until = timezone.now() + timedelta(seconds=300)
        self.binary.update_and_requeue(
            retry_at=lock_until,
            status=Binary.StatusChoices.STARTED,
        )

        # run() sets the status to succeeded/failed; save() persists it.
        self.binary.run()
        self.binary.save()

    @succeeded.enter
    def enter_succeeded(self):
        """Record the successful installation and bump the success counter."""
        self.binary.update_and_requeue(
            retry_at=None,
            status=Binary.StatusChoices.SUCCEEDED,
        )
        self.binary.increment_health_stats(success=True)

    @failed.enter
    def enter_failed(self):
        """Record the failed installation and bump the failure counter."""
        self.binary.update_and_requeue(
            retry_at=None,
            status=Binary.StatusChoices.FAILED,
        )
        self.binary.increment_health_stats(success=False)
# =============================================================================
# State Machine Registration
# =============================================================================
# Manually register BinaryMachine with the python-statemachine registry.
# NOTE(review): presumably this is what allows the machine to be looked up by
# name (e.g. from Binary.state_machine_name) — confirm against BaseStateMachine.
registry.register(BinaryMachine)

View File

@@ -1,112 +0,0 @@
__package__ = 'archivebox.machine'
from datetime import timedelta
from django.utils import timezone
from django.db.models import F
from statemachine import State, StateMachine
from machine.models import Binary
class BinaryMachine(StateMachine, strict_states=True):
    """
    State machine that manages the installation lifecycle of a Binary.

    Uses the same unified pattern as Crawl, Snapshot, and ArchiveResult:

        queued     the Binary needs to be installed
        started    installation hooks are running
        succeeded  the Binary was installed (abspath, version, sha256 populated)
        failed     installation failed permanently
    """

    model: Binary

    # --- States: one per Binary.StatusChoices value ---------------------------
    queued = State(value=Binary.StatusChoices.QUEUED, initial=True)
    started = State(value=Binary.StatusChoices.STARTED)
    succeeded = State(value=Binary.StatusChoices.SUCCEEDED, final=True)
    failed = State(value=Binary.StatusChoices.FAILED, final=True)

    # --- tick event: the guard methods below select the transition ------------
    tick = (
        queued.to.itself(unless='can_start')
        | queued.to(started, cond='can_start')
        | started.to.itself(unless='is_finished')
        | started.to(succeeded, cond='is_succeeded')
        | started.to(failed, cond='is_failed')
    )

    def __init__(self, binary, *args, **kwargs):
        """Keep a direct reference to the Binary, then initialise the machine with it."""
        self.binary = binary
        super().__init__(binary, *args, **kwargs)

    def __repr__(self) -> str:
        """Return 'Binary[<id>]' for the wrapped binary."""
        return f'Binary[{self.binary.id}]'

    def __str__(self) -> str:
        """Return the same text as __repr__."""
        return repr(self)

    # --- Guards ----------------------------------------------------------------

    def can_start(self) -> bool:
        """Return True when the binary has both a name and a binproviders value."""
        binary = self.binary
        return bool(binary.name and binary.binproviders)

    def is_succeeded(self) -> bool:
        """Return True when run() left the binary in the SUCCEEDED status."""
        return self.binary.status == Binary.StatusChoices.SUCCEEDED

    def is_failed(self) -> bool:
        """Return True when run() left the binary in the FAILED status."""
        return self.binary.status == Binary.StatusChoices.FAILED

    def is_finished(self) -> bool:
        """Return True when the installation ended, successfully or not."""
        terminal_statuses = (
            Binary.StatusChoices.SUCCEEDED,
            Binary.StatusChoices.FAILED,
        )
        return self.binary.status in terminal_statuses

    # --- State entry hooks -----------------------------------------------------

    @queued.enter
    def enter_queued(self):
        """Mark the binary as QUEUED and make it eligible for pickup right away."""
        self.binary.update_for_workers(
            retry_at=timezone.now(),
            status=Binary.StatusChoices.QUEUED,
        )

    @started.enter
    def enter_started(self):
        """Lock the binary, run its installation hooks, then persist the result."""
        # Pushing retry_at 5 minutes into the future acts as the lock while
        # installation runs.
        lock_until = timezone.now() + timedelta(seconds=300)
        self.binary.update_for_workers(
            retry_at=lock_until,
            status=Binary.StatusChoices.STARTED,
        )

        # run() sets the status to succeeded/failed; save() persists it.
        self.binary.run()
        self.binary.save()

    @succeeded.enter
    def enter_succeeded(self):
        """Record the successful installation and bump num_uses_succeeded."""
        self.binary.update_for_workers(
            retry_at=None,
            status=Binary.StatusChoices.SUCCEEDED,
        )
        # Health stats: the increment is expressed with F() so it is applied
        # by the UPDATE query itself.
        this_binary = Binary.objects.filter(pk=self.binary.pk)
        this_binary.update(num_uses_succeeded=F('num_uses_succeeded') + 1)

    @failed.enter
    def enter_failed(self):
        """Record the failed installation and bump num_uses_failed."""
        self.binary.update_for_workers(
            retry_at=None,
            status=Binary.StatusChoices.FAILED,
        )
        # Health stats: the increment is expressed with F() so it is applied
        # by the UPDATE query itself.
        this_binary = Binary.objects.filter(pk=self.binary.pk)
        this_binary.update(num_uses_failed=F('num_uses_failed') + 1)