actually working migration path from 0.7.2 and 0.8.6 + renames and test coverage

This commit is contained in:
Nick Sweeting
2026-01-01 15:49:56 -08:00
parent 6fadcf5168
commit 876feac522
33 changed files with 825 additions and 333 deletions

View File

@@ -83,15 +83,15 @@ class ConstantsDict(Mapping):
CRONTABS_DIR_NAME: str = 'crontabs'
CACHE_DIR_NAME: str = 'cache'
LOGS_DIR_NAME: str = 'logs'
USER_PLUGINS_DIR_NAME: str = 'user_plugins'
CUSTOM_TEMPLATES_DIR_NAME: str = 'user_templates'
CUSTOM_PLUGINS_DIR_NAME: str = 'custom_plugins'
CUSTOM_TEMPLATES_DIR_NAME: str = 'custom_templates'
ARCHIVE_DIR: Path = DATA_DIR / ARCHIVE_DIR_NAME
SOURCES_DIR: Path = DATA_DIR / SOURCES_DIR_NAME
PERSONAS_DIR: Path = DATA_DIR / PERSONAS_DIR_NAME
LOGS_DIR: Path = DATA_DIR / LOGS_DIR_NAME
CACHE_DIR: Path = DATA_DIR / CACHE_DIR_NAME
CUSTOM_TEMPLATES_DIR: Path = DATA_DIR / CUSTOM_TEMPLATES_DIR_NAME
USER_PLUGINS_DIR: Path = DATA_DIR / USER_PLUGINS_DIR_NAME
USER_PLUGINS_DIR: Path = DATA_DIR / CUSTOM_PLUGINS_DIR_NAME
# Data dir files
CONFIG_FILENAME: str = 'ArchiveBox.conf'
@@ -171,8 +171,11 @@ class ConstantsDict(Mapping):
TMP_DIR_NAME,
PERSONAS_DIR_NAME,
CUSTOM_TEMPLATES_DIR_NAME,
USER_PLUGINS_DIR_NAME,
CUSTOM_PLUGINS_DIR_NAME,
CRONTABS_DIR_NAME,
# Backwards compatibility with old directory names
"user_plugins", # old name for USER_PLUGINS_DIR (now 'custom_plugins')
"user_templates", # old name for CUSTOM_TEMPLATES_DIR (now 'custom_templates')
"static", # created by old static exports <v0.6.0
"sonic", # created by docker bind mount / sonic FTS process
".git",

View File

@@ -117,7 +117,7 @@ class SnapshotAdminForm(forms.ModelForm):
class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
form = SnapshotAdminForm
list_display = ('created_at', 'title_str', 'status_with_progress', 'files', 'size_with_stats', 'url_str')
list_display = ('created_at', 'title_str', 'status_with_progress', 'files', 'size_with_stats', 'health_display', 'url_str')
sort_fields = ('title_str', 'url_str', 'created_at', 'status', 'crawl')
readonly_fields = ('admin_actions', 'status_info', 'imported_timestamp', 'created_at', 'modified_at', 'downloaded_at', 'output_dir', 'archiveresults_list')
search_fields = ('id', 'url', 'timestamp', 'title', 'tags__name')
@@ -488,6 +488,12 @@ class SnapshotAdmin(SearchResultsAdminMixin, ConfigEditorMixin, BaseModelAdmin):
obj.url[:128],
)
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Render the snapshot's health score as a color-coded span (green/orange/red)."""
    score = obj.health
    if score >= 80:
        color = 'green'
    elif score >= 50:
        color = 'orange'
    else:
        color = 'red'
    return format_html('<span style="color: {};">{}</span>', color, score)
def grid_view(self, request, extra_context=None):
# cl = self.get_changelist_instance(request)

View File

@@ -3,6 +3,7 @@
# Handles both fresh installs and upgrades from v0.7.2/v0.8.6rc0
from django.db import migrations, models, connection
import django.utils.timezone
def get_table_columns(table_name):
@@ -95,31 +96,31 @@ def upgrade_core_tables(apps, schema_editor):
# ============================================================================
# PART 2: Upgrade core_snapshot table
# ============================================================================
# Create table with NEW field names for timestamps (bookmarked_at, created_at, modified_at)
# and all other fields needed by later migrations
cursor.execute("""
CREATE TABLE IF NOT EXISTS core_snapshot_new (
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
url TEXT NOT NULL,
timestamp VARCHAR(32) NOT NULL UNIQUE,
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
title VARCHAR(512),
crawl_id TEXT,
parent_snapshot_id TEXT,
title VARCHAR(512),
bookmarked_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
modified_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP,
downloaded_at DATETIME,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
depth INTEGER NOT NULL DEFAULT 0,
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
config TEXT NOT NULL DEFAULT '{}',
notes TEXT NOT NULL DEFAULT '',
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
status VARCHAR(15) NOT NULL DEFAULT 'queued',
retry_at DATETIME,
current_step INTEGER NOT NULL DEFAULT 0,
FOREIGN KEY (crawl_id) REFERENCES crawls_crawl(id) ON DELETE CASCADE,
@@ -141,29 +142,23 @@ def upgrade_core_tables(apps, schema_editor):
has_bookmarked_at = 'bookmarked_at' in snapshot_cols
if has_added and not has_bookmarked_at:
# Migrating from v0.7.2 (has added/updated, no bookmarked_at/created_at/modified_at)
# Migrating from v0.7.2 (has added/updated fields)
print('Migrating Snapshot from v0.7.2 schema...')
# Debug: Check what data we're about to copy
cursor.execute("SELECT id, added, updated FROM core_snapshot LIMIT 3")
sample_data = cursor.fetchall()
print(f'DEBUG 0023: Sample Snapshot data before migration: {sample_data}')
# Transform added→bookmarked_at/created_at and updated→modified_at
cursor.execute("""
INSERT OR IGNORE INTO core_snapshot_new (
id, url, timestamp, title, bookmarked_at, created_at, modified_at
id, url, timestamp, title,
bookmarked_at, created_at, modified_at,
status
)
SELECT
id, url, timestamp, title,
COALESCE(added, CURRENT_TIMESTAMP) as bookmarked_at,
COALESCE(added, CURRENT_TIMESTAMP) as created_at,
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at,
'queued' as status
FROM core_snapshot;
""")
# Debug: Check what was inserted
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot_new LIMIT 3")
inserted_data = cursor.fetchall()
print(f'DEBUG 0023: Sample Snapshot data after INSERT: {inserted_data}')
elif has_bookmarked_at and not has_added:
# Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
print('Migrating Snapshot from v0.8.6rc0 schema...')
@@ -308,14 +303,29 @@ class Migration(migrations.Migration):
),
],
state_operations=[
# NOTE: We do NOT remove extractor/output here for ArchiveResult!
# NOTE: We do NOT remove extractor/output for ArchiveResult!
# They are still in the database and will be removed by migration 0025
# after copying their data to the new field names (plugin, output_str).
# after copying their data to plugin/output_str.
# However, for Snapshot, we DO remove added/updated here because
# the database operations above already renamed them to bookmarked_at/created_at/modified_at.
# However, for Snapshot, we DO remove added/updated and ADD the new timestamp fields
# because the SQL above already transformed them.
migrations.RemoveField(model_name='snapshot', name='added'),
migrations.RemoveField(model_name='snapshot', name='updated'),
migrations.AddField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
# SnapshotTag table already exists from v0.7.2, just declare it in state
migrations.CreateModel(

View File

@@ -103,15 +103,21 @@ class Migration(migrations.Migration):
);
INSERT INTO core_snapshot_final (
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
config, notes, num_uses_succeeded, num_uses_failed,
id, url, timestamp, title,
bookmarked_at, created_at, modified_at,
crawl_id, parent_snapshot_id,
downloaded_at, depth, fs_version,
config, notes,
num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
)
SELECT
id, created_at, modified_at, url, timestamp, bookmarked_at,
crawl_id, parent_snapshot_id, title, downloaded_at, depth, fs_version,
COALESCE(config, '{}'), COALESCE(notes, ''), num_uses_succeeded, num_uses_failed,
id, url, timestamp, title,
bookmarked_at, created_at, modified_at,
crawl_id, parent_snapshot_id,
downloaded_at, depth, fs_version,
COALESCE(config, '{}'), COALESCE(notes, ''),
num_uses_succeeded, num_uses_failed,
status, retry_at, current_step
FROM core_snapshot;

View File

@@ -9,23 +9,16 @@ from django.db import migrations, models, connection
def copy_old_fields_to_new(apps, schema_editor):
"""Copy data from old field names to new field names before AddField operations."""
"""Copy data from old field names to new field names after AddField operations."""
cursor = connection.cursor()
# Check if old fields still exist
cursor.execute("PRAGMA table_info(core_archiveresult)")
cols = {row[1] for row in cursor.fetchall()}
print(f'DEBUG 0025: ArchiveResult columns: {sorted(cols)}')
if 'extractor' in cols and 'plugin' in cols:
# Copy extractor -> plugin
print('DEBUG 0025: Copying extractor -> plugin')
cursor.execute("UPDATE core_archiveresult SET plugin = COALESCE(extractor, '') WHERE plugin = '' OR plugin IS NULL")
cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE plugin != ''")
count = cursor.fetchone()[0]
print(f'DEBUG 0025: Updated {count} rows with plugin data')
else:
print(f'DEBUG 0025: NOT copying - extractor in cols: {"extractor" in cols}, plugin in cols: {"plugin" in cols}')
if 'output' in cols and 'output_str' in cols:
# Copy output -> output_str
@@ -38,16 +31,13 @@ def copy_old_fields_to_new(apps, schema_editor):
if 'end_ts' in cols and 'modified_at' in cols:
cursor.execute("UPDATE core_archiveresult SET modified_at = COALESCE(end_ts, start_ts, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
# Same for Snapshot table
cursor.execute("PRAGMA table_info(core_snapshot)")
snap_cols = {row[1] for row in cursor.fetchall()}
# NOTE: Snapshot timestamps (added→bookmarked_at, updated→modified_at) were already
# transformed by migration 0023, so we don't need to copy them here.
if 'added' in snap_cols and 'bookmarked_at' in snap_cols:
cursor.execute("UPDATE core_snapshot SET bookmarked_at = COALESCE(added, CURRENT_TIMESTAMP) WHERE bookmarked_at IS NULL OR bookmarked_at = ''")
cursor.execute("UPDATE core_snapshot SET created_at = COALESCE(added, CURRENT_TIMESTAMP) WHERE created_at IS NULL OR created_at = ''")
if 'updated' in snap_cols and 'modified_at' in snap_cols:
cursor.execute("UPDATE core_snapshot SET modified_at = COALESCE(updated, added, CURRENT_TIMESTAMP) WHERE modified_at IS NULL OR modified_at = ''")
# Debug: Check Snapshot timestamps at end of RunPython
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot LIMIT 2")
snap_after = cursor.fetchall()
print(f'DEBUG 0025: Snapshot timestamps at END of RunPython: {snap_after}')
class Migration(migrations.Migration):
@@ -149,21 +139,12 @@ class Migration(migrations.Migration):
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AddField(
model_name='snapshot',
name='bookmarked_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
# NOTE: bookmarked_at and created_at already added by migration 0023
migrations.AddField(
model_name='snapshot',
name='config',
field=models.JSONField(default=dict),
),
migrations.AddField(
model_name='snapshot',
name='created_at',
field=models.DateTimeField(db_index=True, default=django.utils.timezone.now),
),
migrations.AddField(
model_name='snapshot',
name='current_step',
@@ -184,11 +165,7 @@ class Migration(migrations.Migration):
name='fs_version',
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
),
migrations.AddField(
model_name='snapshot',
name='modified_at',
field=models.DateTimeField(auto_now=True),
),
# NOTE: modified_at already added by migration 0023
migrations.AddField(
model_name='snapshot',
name='notes',
@@ -248,7 +225,7 @@ class Migration(migrations.Migration):
model_name='archiveresult',
name='output',
),
# NOTE: Snapshot's added/updated fields were already removed by migration 0023
# NOTE: Snapshot's added/updated were already removed by migration 0023
migrations.AlterField(
model_name='archiveresult',
name='end_ts',

View File

@@ -0,0 +1,28 @@
# Generated by Django 6.0 on 2026-01-01 23:28
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
# Removes ArchiveResult's per-row num_uses_* counters and adds a OneToOne
# link to machine.Process, which records the execution details instead.
dependencies = [
# Must run after the big 0025 rename migration on core...
('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'),
# ...and after machine 0003 adds process_type/parent to Process.
('machine', '0003_add_process_type_and_parent'),
]
operations = [
# Drop the num_uses_* counter columns from ArchiveResult.
migrations.RemoveField(
model_name='archiveresult',
name='num_uses_failed',
),
migrations.RemoveField(
model_name='archiveresult',
name='num_uses_succeeded',
),
# Nullable/blank so existing rows with no recorded process stay valid;
# PROTECT prevents deleting a Process an ArchiveResult still references.
migrations.AddField(
model_name='archiveresult',
name='process',
field=models.OneToOneField(blank=True, help_text='Process execution details for this archive result', null=True, on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
),
]

View File

@@ -2285,13 +2285,14 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
# Process FK - tracks execution details (cmd, pwd, stdout, stderr, etc.)
# Added POST-v0.9.0, will be added in a separate migration
# process = models.OneToOneField(
# 'machine.Process',
# on_delete=models.PROTECT,
# null=False,
# related_name='archiveresult',
# help_text='Process execution details for this archive result'
# )
process = models.OneToOneField(
'machine.Process',
on_delete=models.PROTECT,
null=True,
blank=True,
related_name='archiveresult',
help_text='Process execution details for this archive result'
)
# New output fields (replacing old 'output' field)
output_str = models.TextField(blank=True, default='', help_text='Human-readable output summary')

View File

@@ -154,7 +154,7 @@ class CrawlAdminForm(forms.ModelForm):
class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
form = CrawlAdminForm
list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'num_snapshots')
list_display = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'urls_preview', 'schedule_str', 'status', 'retry_at', 'health_display', 'num_snapshots')
sort_fields = ('id', 'created_at', 'created_by', 'max_depth', 'label', 'notes', 'schedule_str', 'status', 'retry_at')
search_fields = ('id', 'created_by__username', 'max_depth', 'label', 'notes', 'schedule_id', 'status', 'urls')
@@ -270,6 +270,12 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
first_url = obj.get_urls_list()[0] if obj.get_urls_list() else ''
return first_url[:80] + '...' if len(first_url) > 80 else first_url
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Show the crawl's health score, colored by severity threshold."""
    score = obj.health
    thresholds = ((80, 'green'), (50, 'orange'))
    color = next((name for cutoff, name in thresholds if score >= cutoff), 'red')
    return format_html('<span style="color: {};">{}</span>', color, score)
@admin.display(description='URLs')
def urls_editor(self, obj):
"""Editor for crawl URLs."""

View File

@@ -0,0 +1,21 @@
# Generated by Django 6.0 on 2026-01-01 23:36
from django.db import migrations
class Migration(migrations.Migration):
# Drops the num_uses_succeeded/num_uses_failed counter columns from
# CrawlSchedule.
dependencies = [
('crawls', '0002_upgrade_from_0_8_6'),
]
operations = [
migrations.RemoveField(
model_name='crawlschedule',
name='num_uses_failed',
),
migrations.RemoveField(
model_name='crawlschedule',
name='num_uses_succeeded',
),
]

View File

@@ -519,12 +519,14 @@ class CrawlMachine(BaseStateMachine, strict_states=True):
def is_finished(self) -> bool:
from archivebox.core.models import Snapshot
# check that at least one snapshot exists for this crawl
# Check if any snapshots exist for this crawl
snapshots = Snapshot.objects.filter(crawl=self.crawl)
if not snapshots.exists():
return False
# check if all snapshots are sealed
# If no snapshots exist, allow finishing (e.g., archivebox://install crawls that only run hooks)
if not snapshots.exists():
return True
# If snapshots exist, check if all are sealed
# Snapshots handle their own background hooks via the step system,
# so we just need to wait for all snapshots to reach sealed state
if snapshots.filter(status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]).exists():

View File

@@ -8,7 +8,7 @@ from archivebox.machine.models import Machine, NetworkInterface, Binary, Process
class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health')
list_display = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid', 'health_display')
sort_fields = ('id', 'created_at', 'hostname', 'ips', 'os_platform', 'hw_in_docker', 'hw_in_vm', 'hw_manufacturer', 'hw_product', 'os_arch', 'os_family', 'os_release', 'hw_uuid')
readonly_fields = ('guid', 'created_at', 'modified_at', 'ips')
@@ -52,9 +52,15 @@ class MachineAdmin(ConfigEditorMixin, BaseModelAdmin):
machine.id, ', '.join(machine.networkinterface_set.values_list('ip_public', flat=True)),
)
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Color-coded health score for this machine."""
    value = obj.health
    if value >= 80:
        return format_html('<span style="color: {};">{}</span>', 'green', value)
    if value >= 50:
        return format_html('<span style="color: {};">{}</span>', 'orange', value)
    return format_html('<span style="color: {};">{}</span>', 'red', value)
class NetworkInterfaceAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health')
list_display = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address', 'health_display')
sort_fields = ('id', 'created_at', 'machine_info', 'ip_public', 'dns_server', 'isp', 'country', 'region', 'city', 'iface', 'ip_local', 'mac_address')
search_fields = ('id', 'machine__id', 'iface', 'ip_public', 'ip_local', 'mac_address', 'dns_server', 'hostname', 'isp', 'city', 'region', 'country')
@@ -95,9 +101,15 @@ class NetworkInterfaceAdmin(BaseModelAdmin):
iface.machine.id, str(iface.machine.id)[:8], iface.machine.hostname,
)
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Render the interface's health score with a severity color."""
    rating = obj.health
    # Start at the worst color and upgrade as thresholds are met.
    color = 'red'
    if rating >= 50:
        color = 'orange'
    if rating >= 80:
        color = 'green'
    return format_html('<span style="color: {};">{}</span>', color, rating)
class BinaryAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status', 'health')
list_display = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status', 'health_display')
sort_fields = ('id', 'created_at', 'machine_info', 'name', 'binprovider', 'version', 'abspath', 'sha256', 'status')
search_fields = ('id', 'machine__id', 'name', 'binprovider', 'version', 'abspath', 'sha256')
@@ -142,6 +154,12 @@ class BinaryAdmin(BaseModelAdmin):
binary.machine.id, str(binary.machine.id)[:8], binary.machine.hostname,
)
@admin.display(description='Health', ordering='health')
def health_display(self, obj):
    """Color-coded health score for this binary."""
    pct = obj.health
    if pct < 50:
        color = 'red'
    elif pct < 80:
        color = 'orange'
    else:
        color = 'green'
    return format_html('<span style="color: {};">{}</span>', color, pct)
class ProcessAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'machine_info', 'archiveresult_link', 'cmd_str', 'status', 'exit_code', 'pid', 'binary_info')

View File

@@ -0,0 +1,24 @@
# Generated by Django 6.0 on 2026-01-01 22:55
import django.db.models.deletion
from django.db import migrations, models
class Migration(migrations.Migration):
# Adds parent and process_type columns to machine.Process.
dependencies = [
('machine', '0002_process'),
]
operations = [
# Optional self-FK to the spawning process; SET_NULL detaches children
# rather than cascading when the parent row is deleted.
migrations.AddField(
model_name='process',
name='parent',
field=models.ForeignKey(blank=True, help_text='Parent process that spawned this process', null=True, on_delete=django.db.models.deletion.SET_NULL, related_name='children', to='machine.process'),
),
# Indexed category column (supervisord/orchestrator/worker/cli/binary),
# defaulting to 'cli'.
migrations.AddField(
model_name='process',
name='process_type',
field=models.CharField(choices=[('supervisord', 'Supervisord'), ('orchestrator', 'Orchestrator'), ('worker', 'Worker'), ('cli', 'CLI'), ('binary', 'Binary')], db_index=True, default='cli', help_text='Type of process (cli, worker, orchestrator, binary, supervisord)', max_length=16),
),
]

View File

@@ -153,8 +153,8 @@ class NetworkInterface(ModelWithHealthStats):
city = models.CharField(max_length=63, default=None, null=False)
region = models.CharField(max_length=63, default=None, null=False)
country = models.CharField(max_length=63, default=None, null=False)
num_uses_failed = models.PositiveIntegerField(default=0)
num_uses_succeeded = models.PositiveIntegerField(default=0)
# num_uses_failed = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
# num_uses_succeeded = models.PositiveIntegerField(default=0) # from ModelWithHealthStats
objects: NetworkInterfaceManager = NetworkInterfaceManager()
@@ -588,6 +588,13 @@ class Process(models.Model):
RUNNING = 'running', 'Running'
EXITED = 'exited', 'Exited'
class TypeChoices(models.TextChoices):
# Role of the recorded process (stored in Process.process_type, default CLI).
# First element is the db value, second the human-readable label.
SUPERVISORD = 'supervisord', 'Supervisord'
ORCHESTRATOR = 'orchestrator', 'Orchestrator'
WORKER = 'worker', 'Worker'
CLI = 'cli', 'CLI'
BINARY = 'binary', 'Binary'
# Primary fields
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
@@ -602,6 +609,24 @@ class Process(models.Model):
help_text='Machine where this process executed'
)
# Parent process (optional)
parent = models.ForeignKey(
'self',
on_delete=models.SET_NULL,
null=True, blank=True,
related_name='children',
help_text='Parent process that spawned this process'
)
# Process type (cli, worker, orchestrator, binary, supervisord)
process_type = models.CharField(
max_length=16,
choices=TypeChoices.choices,
default=TypeChoices.CLI,
db_index=True,
help_text='Type of process (cli, worker, orchestrator, binary, supervisord)'
)
# Execution metadata
pwd = models.CharField(max_length=512, default='', null=False, blank=True,
help_text='Working directory for process execution')

View File

@@ -8,7 +8,7 @@
* - Accessibility snapshot
* - ARIA labels and roles
*
* Usage: on_Snapshot__18_accessibility.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>
* Output: Writes accessibility/accessibility.json
*
* Environment variables:
@@ -203,7 +203,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__18_accessibility.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__39_accessibility.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -8,7 +8,7 @@
* NOTE: We use Chromium instead of Chrome because Chrome 137+ removed support for
* --load-extension and --disable-extensions-except flags.
*
* Usage: on_Crawl__30_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Usage: on_Crawl__20_chrome_launch.bg.js --crawl-id=<uuid> --source-url=<url>
* Output: Writes to current directory (executor creates chrome/ dir):
* - cdp_url.txt: WebSocket URL for CDP connection
* - chrome.pid: Chromium process ID (for cleanup)

View File

@@ -2,7 +2,7 @@
/**
* Create a Chrome tab for this snapshot in the shared crawl Chrome session.
*
* If a crawl-level Chrome session exists (from on_Crawl__30_chrome_launch.bg.js),
* If a crawl-level Chrome session exists (from on_Crawl__20_chrome_launch.bg.js),
* this connects to it and creates a new tab. Otherwise, falls back to launching
* its own Chrome instance.
*

View File

@@ -73,8 +73,8 @@ CHROME_PLUGIN_DIR = Path(__file__).parent.parent
PLUGINS_ROOT = CHROME_PLUGIN_DIR.parent
# Hook script locations
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__00_install_puppeteer_chromium.py'
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__30_chrome_launch.bg.js'
CHROME_INSTALL_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__01_chrome_install.py'
CHROME_LAUNCH_HOOK = CHROME_PLUGIN_DIR / 'on_Crawl__20_chrome_launch.bg.js'
CHROME_TAB_HOOK = CHROME_PLUGIN_DIR / 'on_Snapshot__20_chrome_tab.bg.js'
CHROME_NAVIGATE_HOOK = next(CHROME_PLUGIN_DIR.glob('on_Snapshot__*_chrome_navigate.*'), None)
CHROME_UTILS = CHROME_PLUGIN_DIR / 'chrome_utils.js'

View File

@@ -5,7 +5,7 @@
* If a Chrome session exists (from chrome plugin), connects to it via CDP.
* Otherwise launches a new Chrome instance.
*
* Usage: on_Snapshot__23_dom.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<uuid>
* Output: Writes dom/output.html
*
* Environment variables:
@@ -175,7 +175,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__23_dom.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__53_dom.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -6,7 +6,7 @@
* response headers from chrome plugin/response_headers.json.
* Otherwise falls back to making an HTTP HEAD request.
*
* Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__55_headers.js --url=<url> --snapshot-id=<uuid>
* Output: Writes headers/headers.json
*
* Environment variables:
@@ -116,7 +116,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__12_headers.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__55_headers.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -11,7 +11,7 @@
* - iframes: <iframe src>
* - links: <link> tags with rel/href
*
* Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__75_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>
* Output: Writes parse_dom_outlinks/outlinks.json and parse_dom_outlinks/urls.jsonl
*
* Environment variables:
@@ -216,7 +216,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__40_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__75_parse_dom_outlinks.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -5,7 +5,7 @@
* If a Chrome session exists (from chrome plugin), connects to it via CDP.
* Otherwise launches a new Chrome instance.
*
* Usage: on_Snapshot__22_pdf.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__52_pdf.js --url=<url> --snapshot-id=<uuid>
* Output: Writes pdf/output.pdf
*
* Environment variables:
@@ -184,7 +184,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__22_pdf.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__52_pdf.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -5,7 +5,7 @@
* If a Chrome session exists (from chrome plugin), connects to it via CDP.
* Otherwise launches a new Chrome instance.
*
* Usage: on_Snapshot__21_screenshot.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__51_screenshot.js --url=<url> --snapshot-id=<uuid>
* Output: Writes screenshot/screenshot.png
*
* Environment variables:
@@ -177,7 +177,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__21_screenshot.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__51_screenshot.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -8,7 +8,7 @@
* - description, keywords, author
* - Any other meta tags
*
* Usage: on_Snapshot__17_seo.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__38_seo.js --url=<url> --snapshot-id=<uuid>
* Output: Writes seo/seo.json
*
* Environment variables:
@@ -157,7 +157,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__17_seo.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__38_seo.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -6,7 +6,7 @@
* Content-Type from the initial response. If it's a static file (PDF, image, etc.),
* it downloads the content directly using CDP.
*
* Usage: on_Snapshot__31_staticfile.bg.js --url=<url> --snapshot-id=<uuid>
* Usage: on_Snapshot__32_staticfile.bg.js --url=<url> --snapshot-id=<uuid>
* Output: Downloads static file
*/
@@ -288,7 +288,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Snapshot__31_staticfile.bg.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Snapshot__32_staticfile.bg.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -28,7 +28,7 @@ const EXTENSION = {
/**
* Main entry point - install extension before archiving
*
* Note: 2captcha configuration is handled by on_Crawl__25_configure_twocaptcha_extension_options.js
* Note: 2captcha configuration is handled by on_Crawl__25_twocaptcha_config.js
* during first-time browser setup to avoid repeated configuration on every snapshot.
* The API key is injected via chrome.storage API once per browser session.
*/

View File

@@ -5,7 +5,7 @@
* Configures the 2captcha extension with API key and settings after Crawl-level Chrome session starts.
* Runs once per crawl to inject configuration into extension storage.
*
* Priority: 25 (after chrome_launch at 30, before snapshots start)
* Priority: 25 (after chrome_launch at 20, before snapshots start)
* Hook: on_Crawl (runs once per crawl, not per snapshot)
*
* Config Options (from config.json / environment):
@@ -346,7 +346,7 @@ async function main() {
const snapshotId = args.snapshot_id;
if (!url || !snapshotId) {
console.error('Usage: on_Crawl__25_configure_twocaptcha_extension_options.js --url=<url> --snapshot-id=<uuid>');
console.error('Usage: on_Crawl__25_twocaptcha_config.js --url=<url> --snapshot-id=<uuid>');
process.exit(1);
}

View File

@@ -26,8 +26,8 @@ from archivebox.plugins.chrome.tests.chrome_test_helpers import (
PLUGIN_DIR = Path(__file__).parent.parent
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__20_install_twocaptcha_extension.js'
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_configure_twocaptcha_extension_options.js'
INSTALL_SCRIPT = PLUGIN_DIR / 'on_Crawl__05_twocaptcha_install.js'
CONFIG_SCRIPT = PLUGIN_DIR / 'on_Crawl__25_twocaptcha_config.js'
TEST_URL = 'https://2captcha.com/demo/cloudflare-turnstile'

View File

@@ -269,30 +269,44 @@ class Orchestrator:
from archivebox.misc.logging import IS_TTY
import archivebox.misc.logging as logging_module
self.on_startup()
# Enable progress bars only in TTY + foreground mode
show_progress = IS_TTY and self.exit_on_idle
# Save original consoles
original_console = logging_module.CONSOLE
original_stderr = logging_module.STDERR
# Create Progress with the console it will control
progress = Progress(
TextColumn("[cyan]{task.description}"),
BarColumn(bar_width=40),
TaskProgressColumn(),
transient=False,
console=original_console, # Use the original console
) if show_progress else None
task_ids = {} # snapshot_id -> task_id
# Replace global CONSOLE with progress.console when active
original_console = logging_module.CONSOLE
original_stderr = logging_module.STDERR
# Wrapper to convert console.print() to console.log() for Rich Progress
class ConsoleLogWrapper:
    """Proxy around a console that reroutes print() calls to log().

    While a Rich Progress/Live display is active, log() cooperates with the
    live region whereas direct print() would not.
    """

    def __init__(self, wrapped):
        self._console = wrapped

    def print(self, *args, **kwargs):
        # Only positional args reach log(); keyword args are discarded
        # (log() has a different signature than print()).
        self._console.log(*args)

    def __getattr__(self, name):
        # Delegate every other attribute/method to the wrapped console.
        return getattr(self._console, name)
try:
if progress:
progress.start()
# Redirect all logging through progress.console
logging_module.CONSOLE = progress.console
logging_module.STDERR = progress.console
# Wrap progress.console so print() calls become log() calls
wrapped_console = ConsoleLogWrapper(progress.console)
logging_module.CONSOLE = wrapped_console
logging_module.STDERR = wrapped_console
# Call on_startup AFTER redirecting consoles
self.on_startup()
while True:
# Check queues and spawn workers
@@ -302,9 +316,15 @@ class Orchestrator:
if progress:
from archivebox.core.models import Snapshot
active_snapshots = Snapshot.objects.filter(status='started').iterator(chunk_size=100)
# Get all started snapshots
active_snapshots = list(Snapshot.objects.filter(status='started'))
# Track which snapshots are still active
active_ids = set()
for snapshot in active_snapshots:
active_ids.add(snapshot.id)
total = snapshot.archiveresult_set.count()
if total == 0:
continue
@@ -316,9 +336,15 @@ class Orchestrator:
# Create or update task
if snapshot.id not in task_ids:
url = snapshot.url[:60] + '...' if len(snapshot.url) > 60 else snapshot.url
task_ids[snapshot.id] = progress.add_task(url, total=total)
task_ids[snapshot.id] = progress.add_task(url, total=total, completed=completed)
else:
progress.update(task_ids[snapshot.id], completed=completed)
progress.update(task_ids[snapshot.id], completed=completed)
# Remove tasks for snapshots that are no longer active
for snapshot_id in list(task_ids.keys()):
if snapshot_id not in active_ids:
progress.remove_task(task_ids[snapshot_id])
del task_ids[snapshot_id]
# Track idle state
if self.has_pending_work(queue_sizes) or self.has_running_workers():