mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
move tests into subfolder, add missing install hooks
This commit is contained in:
@@ -116,7 +116,7 @@ def upgrade_core_tables(apps, schema_editor):
|
||||
retry_at DATETIME,
|
||||
|
||||
depth INTEGER NOT NULL DEFAULT 0,
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.9.0',
|
||||
fs_version VARCHAR(10) NOT NULL DEFAULT '0.8.0',
|
||||
config TEXT NOT NULL DEFAULT '{}',
|
||||
notes TEXT NOT NULL DEFAULT '',
|
||||
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
|
||||
@@ -326,6 +326,16 @@ class Migration(migrations.Migration):
|
||||
name='modified_at',
|
||||
field=models.DateTimeField(auto_now=True),
|
||||
),
|
||||
# Declare fs_version (already created in database with DEFAULT '0.8.0')
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='fs_version',
|
||||
field=models.CharField(
|
||||
max_length=10,
|
||||
default='0.8.0',
|
||||
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
|
||||
),
|
||||
),
|
||||
|
||||
# SnapshotTag table already exists from v0.7.2, just declare it in state
|
||||
migrations.CreateModel(
|
||||
|
||||
@@ -150,11 +150,7 @@ class Migration(migrations.Migration):
|
||||
name='downloaded_at',
|
||||
field=models.DateTimeField(blank=True, db_index=True, default=None, editable=False, null=True),
|
||||
),
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
name='fs_version',
|
||||
field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
|
||||
),
|
||||
# NOTE: fs_version already added by migration 0023 with default='0.8.0'
|
||||
# NOTE: modified_at already added by migration 0023
|
||||
migrations.AddField(
|
||||
model_name='snapshot',
|
||||
|
||||
@@ -8,7 +8,7 @@ class Migration(migrations.Migration):
|
||||
|
||||
dependencies = [
|
||||
('core', '0025_alter_archiveresult_options_alter_snapshot_options_and_more'),
|
||||
('machine', '0003_add_process_type_and_parent'),
|
||||
('machine', '0007_add_process_type_and_parent'),
|
||||
]
|
||||
|
||||
operations = [
|
||||
|
||||
388
archivebox/core/migrations/0027_copy_archiveresult_to_process.py
Normal file
388
archivebox/core/migrations/0027_copy_archiveresult_to_process.py
Normal file
@@ -0,0 +1,388 @@
|
||||
# Generated by hand on 2026-01-01
|
||||
# Copies ArchiveResult cmd/pwd/cmd_version data to Process records before removing old fields
|
||||
|
||||
from django.db import migrations, connection
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def parse_cmd_field(cmd_raw):
    """
    Parse a legacy ArchiveResult ``cmd`` field into an argv list.

    The stored value may be:
    1. JSON array string: '["wget", "-p", "url"]'
    2. Space-separated string: 'wget -p url'
    3. NULL/empty

    Args:
        cmd_raw: Raw value from the old cmd column (str or None).

    Returns:
        list of str: parsed argv ([] for NULL/empty input).
    """
    import shlex

    if not cmd_raw:
        return []

    cmd_raw = cmd_raw.strip()

    if not cmd_raw:
        return []

    # Try to parse as JSON first
    if cmd_raw.startswith('['):
        try:
            parsed = json.loads(cmd_raw)
            if isinstance(parsed, list):
                return [str(x) for x in parsed]
        except json.JSONDecodeError:
            pass

    # Fallback: shell-style tokenization so quoted arguments survive
    # (e.g. wget --header="User-Agent: x" stays one token).
    # If the string has unbalanced quotes, degrade to a plain whitespace
    # split, matching the old behavior for malformed values.
    try:
        return shlex.split(cmd_raw)
    except ValueError:
        return cmd_raw.split()
|
||||
|
||||
|
||||
def get_or_create_current_machine(cursor):
    """Get or create Machine.current() using raw SQL.

    Looks up a machine_machine row keyed by a hostname-derived guid;
    inserts a minimal placeholder row if none exists, adapting the INSERT
    to whichever schema version (0.8.x vs 0.9.x) the table currently has.

    Args:
        cursor: open DB cursor.
            NOTE(review): '?' placeholders and PRAGMA assume the SQLite
            backend — confirm this migration never runs on other databases.

    Returns:
        str: id of the existing or newly created machine_machine row.
    """
    import uuid
    import socket
    from datetime import datetime

    # Simple machine detection - get hostname as guid
    hostname = socket.gethostname()
    guid = f'host_{hostname}'  # Simple but stable identifier

    # Check if machine exists
    cursor.execute("SELECT id FROM machine_machine WHERE guid = ?", [guid])
    row = cursor.fetchone()

    if row:
        return row[0]

    # Create new machine
    machine_id = str(uuid.uuid4())
    # NOTE(review): naive local time, not timezone-aware UTC — confirm this
    # matches how Django stores created_at/modified_at elsewhere.
    now = datetime.now().isoformat()

    # Check which columns exist (schema differs between 0.8.x and 0.9.x)
    cursor.execute("PRAGMA table_info(machine_machine)")
    machine_cols = {row[1] for row in cursor.fetchall()}

    # Build INSERT statement based on available columns
    if 'config' in machine_cols:
        # 0.9.x schema with config column
        # hw_*/os_* detection is skipped: placeholder zeros/empties only
        cursor.execute("""
            INSERT INTO machine_machine (
                id, created_at, modified_at, guid, hostname,
                hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
                os_arch, os_family, os_platform, os_release, os_kernel,
                stats, config, num_uses_failed, num_uses_succeeded
            ) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
                      '', '', '', '', '', '{}', '{}', 0, 0)
        """, [machine_id, now, now, guid, hostname])
    else:
        # 0.8.x schema without config column
        cursor.execute("""
            INSERT INTO machine_machine (
                id, created_at, modified_at, guid, hostname,
                hw_in_docker, hw_in_vm, hw_manufacturer, hw_product, hw_uuid,
                os_arch, os_family, os_platform, os_release, os_kernel,
                stats, num_uses_failed, num_uses_succeeded
            ) VALUES (?, ?, ?, ?, ?, 0, 0, '', '', '',
                      '', '', '', '', '', '{}', 0, 0)
        """, [machine_id, now, now, guid, hostname])

    return machine_id
|
||||
|
||||
|
||||
def get_or_create_binary(cursor, machine_id, name, abspath, version):
    """
    Get or create a machine_binary row matching (machine, name, abspath, version).

    Args:
        cursor: DB cursor ('?' placeholders and PRAGMA assume SQLite)
        machine_id: Machine FK
        name: Binary name (basename of command)
        abspath: Absolute path to binary, or just the bare command name if
            the original cmd did not contain a path — stored as-is either way
        version: Version string ('' if unknown)

    Returns:
        binary_id (str): id of the existing or newly inserted row.
    """
    import uuid
    from datetime import datetime

    # Check if binary exists with same machine, name, abspath, version
    cursor.execute("""
        SELECT id FROM machine_binary
        WHERE machine_id = ? AND name = ? AND abspath = ? AND version = ?
    """, [machine_id, name, abspath, version])

    row = cursor.fetchone()
    if row:
        return row[0]

    # Create new binary
    binary_id = str(uuid.uuid4())
    now = datetime.now().isoformat()  # NOTE(review): naive local time — confirm UTC isn't required

    # Check which columns exist (schema differs between 0.8.x and 0.9.x)
    cursor.execute("PRAGMA table_info(machine_binary)")
    binary_cols = {row[1] for row in cursor.fetchall()}

    # Use only columns that exist in current schema
    # 0.8.x schema: id, created_at, modified_at, machine_id, name, binprovider, abspath, version, sha256, num_uses_failed, num_uses_succeeded
    # 0.9.x schema adds: binproviders, overrides, status, retry_at, output_dir
    if 'binproviders' in binary_cols:
        # 0.9.x schema
        cursor.execute("""
            INSERT INTO machine_binary (
                id, created_at, modified_at, machine_id,
                name, binproviders, overrides, binprovider, abspath, version, sha256,
                status, retry_at, output_dir,
                num_uses_failed, num_uses_succeeded
            ) VALUES (?, ?, ?, ?, ?, 'env', '{}', 'env', ?, ?, '',
                      'succeeded', NULL, '', 0, 0)
        """, [binary_id, now, now, machine_id, name, abspath, version])
    else:
        # 0.8.x schema (simpler)
        cursor.execute("""
            INSERT INTO machine_binary (
                id, created_at, modified_at, machine_id,
                name, binprovider, abspath, version, sha256,
                num_uses_failed, num_uses_succeeded
            ) VALUES (?, ?, ?, ?, ?, 'env', ?, ?, '', 0, 0)
        """, [binary_id, now, now, machine_id, name, abspath, version])

    return binary_id
|
||||
|
||||
|
||||
def map_status(old_status):
    """
    Map old ArchiveResult status to Process status and exit_code.

    Args:
        old_status: One of: queued, started, backoff, succeeded, failed, skipped

    Returns:
        (process_status, exit_code) tuple; any unrecognized status maps
        to ('queued', None).
    """
    if old_status == 'started':
        return ('running', None)
    if old_status == 'succeeded':
        return ('exited', 0)
    if old_status == 'failed':
        return ('exited', 1)
    if old_status == 'skipped':
        # Skipped = exited without error
        return ('exited', None)
    # 'queued', 'backoff', and anything unrecognized all re-queue
    return ('queued', None)
|
||||
|
||||
|
||||
def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at, ended_at, binary_id):
    """
    Create a Process record.

    Args:
        cursor: DB cursor ('?' placeholders assume SQLite)
        machine_id: Machine FK for the host that ran the command
        pwd: working directory the command ran in ('' if unknown)
        cmd: argv list (JSON-encoded before insert)
        status: Process status ('queued', 'running', or 'exited')
        exit_code: int exit code, or None if not exited / unknown
        started_at: timestamp string or None
        ended_at: timestamp string or None
        binary_id: Binary FK, or None when cmd was empty

    Returns:
        process_id (str)
    """
    import uuid
    from datetime import datetime

    process_id = str(uuid.uuid4())
    now = datetime.now().isoformat()

    # Convert cmd array to JSON
    cmd_json = json.dumps(cmd)

    # Set retry_at to now for queued processes, NULL otherwise
    # (so workers pick the re-queued process up immediately)
    retry_at = now if status == 'queued' else None

    # NOTE(review): timeout is hard-coded to 120 and process_type to 'cli' —
    # confirm these defaults match the Process model's expectations.
    cursor.execute("""
        INSERT INTO machine_process (
            id, created_at, modified_at, machine_id, parent_id, process_type,
            pwd, cmd, env, timeout,
            pid, exit_code, stdout, stderr,
            started_at, ended_at,
            binary_id, iface_id, url,
            status, retry_at
        ) VALUES (?, ?, ?, ?, NULL, 'cli',
                  ?, ?, '{}', 120,
                  NULL, ?, '', '',
                  ?, ?,
                  ?, NULL, NULL,
                  ?, ?)
    """, [
        process_id, now, now, machine_id,
        pwd, cmd_json,
        exit_code,
        started_at, ended_at,
        binary_id,
        status, retry_at
    ])

    return process_id
|
||||
|
||||
|
||||
def copy_archiveresult_data_to_process(apps, schema_editor):
    """
    Copy old ArchiveResult execution data (cmd, pwd, cmd_version) to Process records.

    For each ArchiveResult without a process_id:
    1. Parse cmd field (handle both JSON array and space-separated string)
    2. Extract binary name/path from cmd[0]
    3. Get or create Binary record with machine, name, abspath, version
    4. Create Process record with mapped fields
    5. Link ArchiveResult.process_id to new Process

    Status mapping:
    - queued → queued (exit_code=None)
    - started → running (exit_code=None)
    - backoff → queued (exit_code=None)
    - succeeded → exited (exit_code=0)
    - failed → exited (exit_code=1)
    - skipped → exited (exit_code=None)
    """
    # NOTE(review): uses django.db.connection directly rather than
    # schema_editor.connection — confirm multi-db setups aren't supported.
    # Cursor is never closed; harmless for a one-shot migration.
    cursor = connection.cursor()

    # Check if old fields still exist (skip if fresh install or already migrated)
    cursor.execute("PRAGMA table_info(core_archiveresult)")
    cols = {row[1] for row in cursor.fetchall()}

    print(f'DEBUG 0027: Columns found: {sorted(cols)}')
    print(f'DEBUG 0027: Has cmd={("cmd" in cols)}, pwd={("pwd" in cols)}, cmd_version={("cmd_version" in cols)}, process_id={("process_id" in cols)}')

    if 'cmd' not in cols or 'pwd' not in cols or 'cmd_version' not in cols:
        print('✓ Fresh install or fields already removed - skipping data copy')
        return

    # Check if process_id field exists (should exist from 0026)
    if 'process_id' not in cols:
        print('✗ ERROR: process_id field not found. Migration 0026 must run first.')
        return

    # Get or create Machine.current()
    machine_id = get_or_create_current_machine(cursor)

    # Get ArchiveResults without process_id that have cmd data
    # Use plugin (extractor was renamed to plugin in migration 0025)
    cursor.execute("""
        SELECT id, snapshot_id, plugin, cmd, pwd, cmd_version,
               status, start_ts, end_ts, created_at
        FROM core_archiveresult
        WHERE process_id IS NULL
          AND (cmd IS NOT NULL OR pwd IS NOT NULL)
    """)

    results = cursor.fetchall()

    if not results:
        print('✓ No ArchiveResults need Process migration')
        return

    print(f'Migrating {len(results)} ArchiveResults to Process records...')

    migrated_count = 0
    # NOTE(review): skipped_count is declared but never incremented, so the
    # summary below always reports 0 skipped.
    skipped_count = 0
    error_count = 0

    for i, row in enumerate(results):
        ar_id, snapshot_id, plugin, cmd_raw, pwd, cmd_version, status, start_ts, end_ts, created_at = row

        # First-row debug output only, to avoid flooding logs on big archives
        if i == 0:
            print(f'DEBUG 0027: First row: ar_id={ar_id}, plugin={plugin}, cmd={cmd_raw[:50] if cmd_raw else None}, status={status}')

        try:
            # Parse cmd field
            cmd_array = parse_cmd_field(cmd_raw)

            if i == 0:
                print(f'DEBUG 0027: Parsed cmd: {cmd_array}')

            # Extract binary info from cmd[0] if available
            binary_id = None
            if cmd_array and cmd_array[0]:
                binary_name = Path(cmd_array[0]).name or plugin  # Fallback to plugin name
                binary_abspath = cmd_array[0]
                binary_version = cmd_version or ''

                # Get or create Binary record
                binary_id = get_or_create_binary(
                    cursor, machine_id, binary_name, binary_abspath, binary_version
                )

                if i == 0:
                    print(f'DEBUG 0027: Created Binary: id={binary_id}, name={binary_name}')

            # Map status
            process_status, exit_code = map_status(status)

            # Set timestamps (fall back to created_at when start_ts is NULL)
            started_at = start_ts or created_at
            ended_at = end_ts if process_status == 'exited' else None

            # Create Process record
            process_id = create_process(
                cursor=cursor,
                machine_id=machine_id,
                pwd=pwd or '',
                cmd=cmd_array,
                status=process_status,
                exit_code=exit_code,
                started_at=started_at,
                ended_at=ended_at,
                binary_id=binary_id,
            )

            if i == 0:
                print(f'DEBUG 0027: Created Process: id={process_id}')

            # Link ArchiveResult to Process
            cursor.execute(
                "UPDATE core_archiveresult SET process_id = ? WHERE id = ?",
                [process_id, ar_id]
            )

            migrated_count += 1

            if i == 0:
                print(f'DEBUG 0027: Linked ArchiveResult to Process')

        except Exception as e:
            # Best-effort migration: log the failure and keep going so one
            # bad row doesn't abort the whole data copy.
            print(f'✗ Error migrating ArchiveResult {ar_id}: {e}')
            import traceback
            traceback.print_exc()
            error_count += 1
            continue

    print(f'✓ Migration complete: {migrated_count} migrated, {skipped_count} skipped, {error_count} errors')
|
||||
|
||||
|
||||
class Migration(migrations.Migration):
    """Copy legacy ArchiveResult cmd/pwd/cmd_version data into Process
    records, then drop the old columns.

    The data copy runs first so the RemoveField operations below never
    destroy information that hasn't been migrated yet. The copy is
    irreversible (reverse_code is a noop), so rolling back restores the
    columns but not their contents.
    """

    dependencies = [
        # process_id FK on ArchiveResult must exist before we can link to it
        ('core', '0026_add_process_to_archiveresult'),
        # machine_process table/columns used by create_process()
        ('machine', '0007_add_process_type_and_parent'),
    ]

    operations = [
        # First, copy data from old fields to Process
        migrations.RunPython(
            copy_archiveresult_data_to_process,
            reverse_code=migrations.RunPython.noop,
        ),

        # Now safe to remove old fields (moved from 0025)
        migrations.RemoveField(
            model_name='archiveresult',
            name='cmd',
        ),
        migrations.RemoveField(
            model_name='archiveresult',
            name='cmd_version',
        ),
        migrations.RemoveField(
            model_name='archiveresult',
            name='pwd',
        ),
    ]
|
||||
@@ -362,24 +362,22 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
|
||||
# Migrate filesystem if needed (happens automatically on save)
|
||||
if self.pk and self.fs_migration_needed:
|
||||
from django.db import transaction
|
||||
with transaction.atomic():
|
||||
# Walk through migration chain automatically
|
||||
current = self.fs_version
|
||||
target = self._fs_current_version()
|
||||
# Walk through migration chain automatically
|
||||
current = self.fs_version
|
||||
target = self._fs_current_version()
|
||||
|
||||
while current != target:
|
||||
next_ver = self._fs_next_version(current)
|
||||
method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
|
||||
while current != target:
|
||||
next_ver = self._fs_next_version(current)
|
||||
method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
|
||||
|
||||
# Only run if method exists (most are no-ops)
|
||||
if hasattr(self, method):
|
||||
getattr(self, method)()
|
||||
# Only run if method exists (most are no-ops)
|
||||
if hasattr(self, method):
|
||||
getattr(self, method)()
|
||||
|
||||
current = next_ver
|
||||
current = next_ver
|
||||
|
||||
# Update version (still in transaction)
|
||||
self.fs_version = target
|
||||
# Update version
|
||||
self.fs_version = target
|
||||
|
||||
super().save(*args, **kwargs)
|
||||
if self.url not in self.crawl.urls:
|
||||
@@ -486,33 +484,58 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
# Convert index.json to index.jsonl in the new directory
|
||||
self.convert_index_json_to_jsonl()
|
||||
|
||||
# Create backwards-compat symlink (INSIDE transaction)
|
||||
symlink_path = CONSTANTS.ARCHIVE_DIR / self.timestamp
|
||||
if symlink_path.is_symlink():
|
||||
symlink_path.unlink()
|
||||
# Schedule cleanup AFTER transaction commits successfully
|
||||
# This ensures DB changes are committed before we delete old files
|
||||
from django.db import transaction
|
||||
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir, new_dir))
|
||||
|
||||
if not symlink_path.exists() or symlink_path == old_dir:
|
||||
symlink_path.symlink_to(new_dir, target_is_directory=True)
|
||||
# Return cleanup info for manual cleanup if needed (when called directly)
|
||||
return (old_dir, new_dir)
|
||||
|
||||
# Schedule old directory deletion AFTER transaction commits
|
||||
transaction.on_commit(lambda: self._cleanup_old_migration_dir(old_dir))
|
||||
|
||||
def _cleanup_old_migration_dir(self, old_dir: Path):
|
||||
def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path):
|
||||
"""
|
||||
Delete old directory after successful migration.
|
||||
Delete old directory and create symlink after successful migration.
|
||||
Called via transaction.on_commit() after DB commit succeeds.
|
||||
"""
|
||||
import shutil
|
||||
import logging
|
||||
|
||||
print(f"[DEBUG] _cleanup_old_migration_dir called: old_dir={old_dir}, new_dir={new_dir}")
|
||||
|
||||
# Delete old directory
|
||||
if old_dir.exists() and not old_dir.is_symlink():
|
||||
print(f"[DEBUG] Attempting to delete old directory: {old_dir}")
|
||||
try:
|
||||
shutil.rmtree(old_dir)
|
||||
print(f"[DEBUG] Successfully deleted old directory: {old_dir}")
|
||||
except Exception as e:
|
||||
# Log but don't raise - migration succeeded, this is just cleanup
|
||||
print(f"[DEBUG] Failed to delete old directory {old_dir}: {e}")
|
||||
logging.getLogger('archivebox.migration').warning(
|
||||
f"Could not remove old migration directory {old_dir}: {e}"
|
||||
)
|
||||
return # Don't create symlink if cleanup failed
|
||||
else:
|
||||
print(f"[DEBUG] Old directory doesn't exist or is already a symlink: {old_dir}")
|
||||
|
||||
# Create backwards-compat symlink (after old dir is deleted)
|
||||
symlink_path = old_dir # Same path as old_dir
|
||||
if symlink_path.is_symlink():
|
||||
print(f"[DEBUG] Unlinking existing symlink: {symlink_path}")
|
||||
symlink_path.unlink()
|
||||
|
||||
if not symlink_path.exists():
|
||||
print(f"[DEBUG] Creating symlink: {symlink_path} -> {new_dir}")
|
||||
try:
|
||||
symlink_path.symlink_to(new_dir, target_is_directory=True)
|
||||
print(f"[DEBUG] Successfully created symlink")
|
||||
except Exception as e:
|
||||
print(f"[DEBUG] Failed to create symlink: {e}")
|
||||
logging.getLogger('archivebox.migration').warning(
|
||||
f"Could not create symlink from {symlink_path} to {new_dir}: {e}"
|
||||
)
|
||||
else:
|
||||
print(f"[DEBUG] Symlink path already exists: {symlink_path}")
|
||||
|
||||
# =========================================================================
|
||||
# Path Calculation and Migration Helpers
|
||||
@@ -1616,8 +1639,11 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
||||
This enables step-based execution where all hooks in a step can run in parallel.
|
||||
"""
|
||||
from archivebox.hooks import discover_hooks
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
hooks = discover_hooks('Snapshot')
|
||||
# Get merged config with crawl-specific PLUGINS filter
|
||||
config = get_config(crawl=self.crawl, snapshot=self)
|
||||
hooks = discover_hooks('Snapshot', config=config)
|
||||
archiveresults = []
|
||||
|
||||
for hook_path in hooks:
|
||||
@@ -2212,22 +2238,19 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
started = State(value=Snapshot.StatusChoices.STARTED)
|
||||
sealed = State(value=Snapshot.StatusChoices.SEALED, final=True)
|
||||
|
||||
# Tick Event
|
||||
# Tick Event (polled by workers)
|
||||
tick = (
|
||||
queued.to.itself(unless='can_start') |
|
||||
queued.to(started, cond='can_start') |
|
||||
started.to.itself(unless='is_finished', on='on_started_to_started') |
|
||||
started.to(sealed, cond='is_finished')
|
||||
queued.to(started, cond='can_start')
|
||||
)
|
||||
|
||||
# Manual event (triggered by last ArchiveResult finishing)
|
||||
seal = started.to(sealed)
|
||||
|
||||
def can_start(self) -> bool:
|
||||
can_start = bool(self.snapshot.url)
|
||||
return can_start
|
||||
|
||||
def is_finished(self) -> bool:
|
||||
"""Check if snapshot processing is complete - delegates to model method."""
|
||||
return self.snapshot.is_finished_processing()
|
||||
|
||||
@queued.enter
|
||||
def enter_queued(self):
|
||||
self.snapshot.update_and_requeue(
|
||||
@@ -2237,29 +2260,34 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
|
||||
@started.enter
|
||||
def enter_started(self):
|
||||
# lock the snapshot while we create the pending archiveresults
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=30), # if failed, wait 30s before retrying
|
||||
)
|
||||
import sys
|
||||
|
||||
print(f'[cyan] 🔄 SnapshotMachine.enter_started() - creating archiveresults for {self.snapshot.url}[/cyan]', file=sys.stderr)
|
||||
|
||||
# Run the snapshot - creates pending archiveresults for all enabled plugins
|
||||
self.snapshot.run()
|
||||
|
||||
# unlock the snapshot after we're done + set status = started
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=5), # check again in 5s
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
# Check if any archiveresults were created
|
||||
ar_count = self.snapshot.archiveresult_set.count()
|
||||
print(f'[cyan] 🔄 ArchiveResult count: {ar_count}[/cyan]', file=sys.stderr)
|
||||
|
||||
def on_started_to_started(self):
|
||||
"""Called when Snapshot stays in started state (archiveresults not finished yet)."""
|
||||
# Bump retry_at so we check again in a few seconds
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(seconds=5),
|
||||
)
|
||||
if ar_count == 0:
|
||||
# No archiveresults created, seal immediately
|
||||
print(f'[cyan] 🔄 No archiveresults created, sealing snapshot immediately[/cyan]', file=sys.stderr)
|
||||
self.seal()
|
||||
else:
|
||||
# Set status = started with retry_at far future (so workers don't claim us - we're waiting for ARs)
|
||||
# Last AR will manually call self.seal() when done
|
||||
self.snapshot.update_and_requeue(
|
||||
retry_at=timezone.now() + timedelta(days=365),
|
||||
status=Snapshot.StatusChoices.STARTED,
|
||||
)
|
||||
print(f'[cyan] 🔄 {ar_count} archiveresults created, waiting for them to finish[/cyan]', file=sys.stderr)
|
||||
|
||||
@sealed.enter
|
||||
def enter_sealed(self):
|
||||
import sys
|
||||
|
||||
# Clean up background hooks
|
||||
self.snapshot.cleanup()
|
||||
|
||||
@@ -2268,6 +2296,21 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
|
||||
status=Snapshot.StatusChoices.SEALED,
|
||||
)
|
||||
|
||||
print(f'[cyan] ✅ SnapshotMachine.enter_sealed() - sealed {self.snapshot.url}[/cyan]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last snapshot for the parent crawl - if so, seal the crawl
|
||||
if self.snapshot.crawl:
|
||||
crawl = self.snapshot.crawl
|
||||
remaining_active = Snapshot.objects.filter(
|
||||
crawl=crawl,
|
||||
status__in=[Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]
|
||||
).count()
|
||||
|
||||
if remaining_active == 0:
|
||||
print(f'[cyan]🔒 All snapshots sealed for crawl {crawl.id}, sealing crawl[/cyan]', file=sys.stderr)
|
||||
# Seal the parent crawl
|
||||
crawl.sm.seal()
|
||||
|
||||
|
||||
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
|
||||
class StatusChoices(models.TextChoices):
|
||||
@@ -3102,8 +3145,30 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
end_ts=None,
|
||||
)
|
||||
|
||||
def _check_and_seal_parent_snapshot(self):
|
||||
"""Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot."""
|
||||
import sys
|
||||
|
||||
snapshot = self.archiveresult.snapshot
|
||||
|
||||
# Check if all archiveresults are finished (in final states)
|
||||
remaining_active = snapshot.archiveresult_set.exclude(
|
||||
status__in=[
|
||||
ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
ArchiveResult.StatusChoices.FAILED,
|
||||
ArchiveResult.StatusChoices.SKIPPED,
|
||||
]
|
||||
).count()
|
||||
|
||||
if remaining_active == 0:
|
||||
print(f'[cyan] 🔒 All archiveresults finished for snapshot {snapshot.url}, sealing snapshot[/cyan]', file=sys.stderr)
|
||||
# Seal the parent snapshot
|
||||
snapshot.sm.seal()
|
||||
|
||||
@succeeded.enter
|
||||
def enter_succeeded(self):
|
||||
import sys
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SUCCEEDED,
|
||||
@@ -3113,8 +3178,15 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
|
||||
self.archiveresult.cascade_health_update(success=True)
|
||||
|
||||
print(f'[cyan] ✅ ArchiveResult succeeded: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/cyan]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last AR to finish - seal parent snapshot if so
|
||||
self._check_and_seal_parent_snapshot()
|
||||
|
||||
@failed.enter
|
||||
def enter_failed(self):
|
||||
import sys
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.FAILED,
|
||||
@@ -3124,16 +3196,25 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
|
||||
# Update health stats for ArchiveResult, Snapshot, and Crawl cascade
|
||||
self.archiveresult.cascade_health_update(success=False)
|
||||
|
||||
print(f'[red] ❌ ArchiveResult failed: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/red]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last AR to finish - seal parent snapshot if so
|
||||
self._check_and_seal_parent_snapshot()
|
||||
|
||||
@skipped.enter
|
||||
def enter_skipped(self):
|
||||
import sys
|
||||
|
||||
self.archiveresult.update_and_requeue(
|
||||
retry_at=None,
|
||||
status=ArchiveResult.StatusChoices.SKIPPED,
|
||||
end_ts=timezone.now(),
|
||||
)
|
||||
|
||||
def after_transition(self, event: str, source: State, target: State):
|
||||
self.archiveresult.snapshot.update_and_requeue() # bump snapshot retry time so it picks up all the new changes
|
||||
print(f'[dim] ⏭️ ArchiveResult skipped: {self.archiveresult.plugin} for {self.archiveresult.snapshot.url}[/dim]', file=sys.stderr)
|
||||
|
||||
# Check if this is the last AR to finish - seal parent snapshot if so
|
||||
self._check_and_seal_parent_snapshot()
|
||||
|
||||
|
||||
# =============================================================================
|
||||
|
||||
Reference in New Issue
Block a user