unified Process source of truth and better screenshot tests

This commit is contained in:
Nick Sweeting
2026-01-02 04:20:34 -08:00
parent 3672174dad
commit dd77511026
44 changed files with 3369 additions and 1919 deletions

View File

@@ -4,6 +4,7 @@
from django.db import migrations, connection
import json
from pathlib import Path
from archivebox.uuid_compat import uuid7
def parse_cmd_field(cmd_raw):
@@ -39,7 +40,6 @@ def parse_cmd_field(cmd_raw):
def get_or_create_current_machine(cursor):
"""Get or create Machine.current() using raw SQL."""
import uuid
import socket
from datetime import datetime
@@ -55,7 +55,8 @@ def get_or_create_current_machine(cursor):
return row[0]
# Create new machine
machine_id = str(uuid.uuid4())
# Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
machine_id = uuid7().hex
now = datetime.now().isoformat()
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
@@ -103,7 +104,6 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
Returns:
binary_id (str)
"""
import uuid
from datetime import datetime
# If abspath is just a name without slashes, it's not a full path
@@ -123,7 +123,8 @@ def get_or_create_binary(cursor, machine_id, name, abspath, version):
return row[0]
# Create new binary
binary_id = str(uuid.uuid4())
# Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
binary_id = uuid7().hex
now = datetime.now().isoformat()
# Check which columns exist (schema differs between 0.8.x and 0.9.x)
@@ -186,10 +187,10 @@ def create_process(cursor, machine_id, pwd, cmd, status, exit_code, started_at,
Returns:
process_id (str)
"""
import uuid
from datetime import datetime
process_id = str(uuid.uuid4())
# Django UUIDField stores UUIDs as 32-char hex (no dashes) in SQLite
process_id = uuid7().hex
now = datetime.now().isoformat()
# Convert cmd array to JSON

View File

@@ -0,0 +1,18 @@
# Generated by Django 6.0 on 2026-01-02 08:43
from django.db import migrations, models
class Migration(migrations.Migration):
    """Bump the default Snapshot.fs_version to '0.9.0'.

    Pure schema-state change: only the CharField's default/help_text are
    altered, so no data migration is needed. New snapshots will be stamped
    '0.9.0'; existing rows keep their stored fs_version and are lazily
    migrated on save() (per the help_text).
    """

    dependencies = [
        # Must run after ArchiveResult rows were copied into machine.Process.
        ('core', '0027_copy_archiveresult_to_process'),
    ]

    operations = [
        migrations.AlterField(
            model_name='snapshot',
            name='fs_version',
            field=models.CharField(default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().', max_length=10),
        ),
    ]

View File

@@ -0,0 +1,181 @@
# Generated by hand on 2026-01-02
# Migrate ArchiveResult from integer PK to UUID PK (matching Snapshot)
from django.db import migrations, models, connection
from uuid import UUID
from archivebox.uuid_compat import uuid7
def migrate_archiveresult_id_to_uuid(apps, schema_editor):
    """
    Migrate ArchiveResult from integer PK to UUID PK (matching Snapshot).

    Strategy:
    1. Rebuild core_archiveresult as core_archiveresult_new with a TEXT UUID
       primary key, keeping the legacy integer id in a new ``old_id`` column.
    2. Reuse any existing ``uuid`` values, normalized to 32-char hex (the
       format Django's SQLiteUUIDField stores); generate time-ordered UUIDv7s
       for rows that have none.
    3. Copy all rows, drop the old table, rename the new one into place, and
       recreate the indexes.

    Args:
        apps: historical app registry (unused — raw SQL is used because the
              schema differs between 0.8.x and 0.9.x databases).
        schema_editor: migration schema editor (unused, see NOTE below).

    NOTE(review): uses the global ``connection`` rather than
    ``schema_editor.connection``, so this always targets the default DB alias.
    NOTE(review): when the table exists but is empty we return without
    rebuilding it, leaving the integer-PK schema in place — presumably fresh
    installs create the table directly from the new model state; verify for
    upgraded-but-empty collections.
    """
    cursor = connection.cursor()

    # Bail out cleanly if the table was never created (fresh install path).
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='core_archiveresult'")
    if not cursor.fetchone():
        print('ArchiveResult table does not exist, skipping migration')
        return

    cursor.execute("SELECT COUNT(*) FROM core_archiveresult")
    row_count = cursor.fetchone()[0]
    if row_count == 0:
        print('No ArchiveResult records to migrate')
        return

    print(f'Migrating {row_count} ArchiveResult records from integer PK to UUID PK...')

    # Step 0: if machine_process doesn't exist yet, NULL out process_id values
    # so the new table's FK to machine_process(id) can't dangle.
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table' AND name='machine_process'")
    machine_process_exists = cursor.fetchone() is not None
    if not machine_process_exists:
        print('machine_process table does not exist yet, setting process_id to NULL')
        cursor.execute("UPDATE core_archiveresult SET process_id = NULL WHERE process_id IS NOT NULL")

    # Step 1: create the replacement table with the UUID as primary key.
    cursor.execute("""
        CREATE TABLE core_archiveresult_new (
            id TEXT PRIMARY KEY NOT NULL,
            old_id INTEGER,
            uuid TEXT UNIQUE,
            created_at DATETIME NOT NULL,
            modified_at DATETIME NOT NULL,
            snapshot_id TEXT NOT NULL,
            plugin VARCHAR(32) NOT NULL,
            hook_name VARCHAR(255) NOT NULL DEFAULT '',
            status VARCHAR(15) NOT NULL DEFAULT 'queued',
            retry_at DATETIME,
            start_ts DATETIME,
            end_ts DATETIME,
            output_str TEXT NOT NULL DEFAULT '',
            output_json TEXT,
            output_files TEXT NOT NULL DEFAULT '{}',
            output_size BIGINT NOT NULL DEFAULT 0,
            output_mimetypes VARCHAR(512) NOT NULL DEFAULT '',
            config TEXT NOT NULL DEFAULT '{}',
            notes TEXT NOT NULL DEFAULT '',
            num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
            num_uses_failed INTEGER NOT NULL DEFAULT 0,
            process_id TEXT,
            FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
            FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE SET NULL
        );
    """)

    # Step 2: pick (or mint) a UUID for every legacy integer id.
    cursor.execute("SELECT id, uuid FROM core_archiveresult")
    id_to_uuid = {}
    for legacy_id, existing_uuid in cursor.fetchall():
        if existing_uuid:
            # Normalize existing UUID to 32-char hex (Django SQLite UUIDField
            # format) — old schemas may have stored it with or without dashes.
            id_to_uuid[legacy_id] = UUID(existing_uuid).hex
        else:
            # Generate a new UUIDv7 (time-ordered) as 32-char hex.
            id_to_uuid[legacy_id] = uuid7().hex

    # Step 3: copy every row across, keyed by its new UUID.
    cursor.execute("PRAGMA table_info(core_archiveresult)")
    col_names = [col[1] for col in cursor.fetchall()]
    cursor.execute("SELECT * FROM core_archiveresult")
    old_records = cursor.fetchall()

    fields_to_copy = [
        'created_at', 'modified_at', 'snapshot_id', 'plugin', 'hook_name',
        'status', 'retry_at', 'start_ts', 'end_ts',
        'output_str', 'output_json', 'output_files', 'output_size', 'output_mimetypes',
        'config', 'notes', 'num_uses_succeeded', 'num_uses_failed', 'process_id'
    ]
    # Hoisted out of the per-row loop: the set of columns actually present in
    # the old table and the INSERT statement are identical for every row.
    existing_fields = [f for f in fields_to_copy if f in col_names]
    field_list = 'id, old_id, uuid, ' + ', '.join(existing_fields)
    placeholders = ', '.join(['?'] * (len(existing_fields) + 3))  # +3 for id, old_id, uuid
    insert_sql = f"INSERT INTO core_archiveresult_new ({field_list}) VALUES ({placeholders})"

    for record in old_records:
        values = dict(zip(col_names, record))
        legacy_id = values['id']
        new_uuid = id_to_uuid[legacy_id]
        # New PK and the compat `uuid` column both get the same 32-char hex.
        cursor.execute(insert_sql, [new_uuid, legacy_id, new_uuid] + [values.get(f) for f in existing_fields])

    # Step 4: swap the new table into place.
    cursor.execute("DROP TABLE core_archiveresult")
    cursor.execute("ALTER TABLE core_archiveresult_new RENAME TO core_archiveresult")

    # Step 5: recreate indexes (DROP TABLE removed the old ones).
    cursor.execute("CREATE INDEX core_archiveresult_snapshot_id_idx ON core_archiveresult(snapshot_id)")
    cursor.execute("CREATE INDEX core_archiveresult_plugin_idx ON core_archiveresult(plugin)")
    cursor.execute("CREATE INDEX core_archiveresult_status_idx ON core_archiveresult(status)")
    cursor.execute("CREATE INDEX core_archiveresult_retry_at_idx ON core_archiveresult(retry_at)")
    cursor.execute("CREATE INDEX core_archiveresult_created_at_idx ON core_archiveresult(created_at)")
    cursor.execute("CREATE INDEX core_archiveresult_hook_name_idx ON core_archiveresult(hook_name)")
    cursor.execute("CREATE INDEX core_archiveresult_process_id_idx ON core_archiveresult(process_id)")
    cursor.execute("CREATE INDEX core_archiveresult_old_id_idx ON core_archiveresult(old_id)")

    print(f'✓ Migrated {row_count} ArchiveResult records to UUID primary key')
class Migration(migrations.Migration):
    """Swap ArchiveResult's integer PK for a UUID PK.

    Uses SeparateDatabaseAndState because the database change is a raw-SQL
    table rebuild (SQLite can't alter a PK in place) while Django's model
    state must independently record the equivalent field changes.
    """

    dependencies = [
        ('core', '0028_alter_snapshot_fs_version'),
    ]

    operations = [
        migrations.SeparateDatabaseAndState(
            # Raw-SQL rebuild of core_archiveresult; irreversible (noop reverse)
            # since the legacy integer PKs can't be reconstructed after rollback
            # of subsequent migrations.
            database_operations=[
                migrations.RunPython(
                    migrate_archiveresult_id_to_uuid,
                    reverse_code=migrations.RunPython.noop,
                ),
            ],
            # State-only mirror of what the SQL above did to the table.
            state_operations=[
                # Remove old uuid field
                migrations.RemoveField(
                    model_name='archiveresult',
                    name='uuid',
                ),
                # Change id from AutoField to UUIDField
                migrations.AlterField(
                    model_name='archiveresult',
                    name='id',
                    field=models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True),
                ),
                # Add old_id field to preserve legacy integer IDs
                migrations.AddField(
                    model_name='archiveresult',
                    name='old_id',
                    field=models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions'),
                ),
            ],
        ),
    ]

View File

@@ -0,0 +1,19 @@
# Generated by Django 6.0 on 2026-01-02 10:02
import uuid

from django.db import migrations, models

from archivebox.uuid_compat import uuid7
class Migration(migrations.Migration):
    """Re-align ArchiveResult.id field options with the current model state.

    State-only tidy-up after 0029 made ``id`` a UUID primary key: adds
    ``serialize=False`` to match what Django autodetects for PK fields.

    Fix: the autogenerated version used ``default=uuid.uuid7``, but
    ``uuid.uuid7`` only exists on Python >= 3.14, so loading this migration
    crashed on older interpreters. Use ``archivebox.uuid_compat.uuid7``
    instead, which is what migration 0029 and the rest of the codebase
    already use for this field's default.
    """

    dependencies = [
        ('core', '0029_migrate_archiveresult_to_uuid_pk'),
    ]

    operations = [
        migrations.AlterField(
            model_name='archiveresult',
            name='id',
            field=models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True),
        ),
    ]

View File

@@ -362,6 +362,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Migrate filesystem if needed (happens automatically on save)
if self.pk and self.fs_migration_needed:
print(f"[DEBUG save()] Triggering filesystem migration for {str(self.id)[:8]}: {self.fs_version}{self._fs_current_version()}")
# Walk through migration chain automatically
current = self.fs_version
target = self._fs_current_version()
@@ -372,6 +373,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
# Only run if method exists (most are no-ops)
if hasattr(self, method):
print(f"[DEBUG save()] Running {method}()")
getattr(self, method)()
current = next_ver
@@ -449,10 +451,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
old_dir = self.get_storage_path_for_version('0.8.0')
new_dir = self.get_storage_path_for_version('0.9.0')
if not old_dir.exists() or old_dir == new_dir or new_dir.exists():
# Even if no directory migration needed, still convert index format
self.convert_index_json_to_jsonl()
return
print(f"[DEBUG _fs_migrate] {self.timestamp}: old_exists={old_dir.exists()}, same={old_dir == new_dir}, new_exists={new_dir.exists()}")
if not old_dir.exists() or old_dir == new_dir:
# No migration needed
print(f"[DEBUG _fs_migrate] Returning None (early return)")
return None
if new_dir.exists():
# New directory already exists (files already copied), but we still need cleanup
# Return cleanup info so old directory can be cleaned up
print(f"[DEBUG _fs_migrate] Returning cleanup info (new_dir exists)")
return (old_dir, new_dir)
new_dir.mkdir(parents=True, exist_ok=True)
@@ -495,47 +505,32 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
def _cleanup_old_migration_dir(self, old_dir: Path, new_dir: Path):
"""
Delete old directory and create symlink after successful migration.
Called via transaction.on_commit() after DB commit succeeds.
"""
import shutil
import logging
print(f"[DEBUG] _cleanup_old_migration_dir called: old_dir={old_dir}, new_dir={new_dir}")
# Delete old directory
if old_dir.exists() and not old_dir.is_symlink():
print(f"[DEBUG] Attempting to delete old directory: {old_dir}")
try:
shutil.rmtree(old_dir)
print(f"[DEBUG] Successfully deleted old directory: {old_dir}")
except Exception as e:
# Log but don't raise - migration succeeded, this is just cleanup
print(f"[DEBUG] Failed to delete old directory {old_dir}: {e}")
logging.getLogger('archivebox.migration').warning(
f"Could not remove old migration directory {old_dir}: {e}"
)
return # Don't create symlink if cleanup failed
else:
print(f"[DEBUG] Old directory doesn't exist or is already a symlink: {old_dir}")
# Create backwards-compat symlink (after old dir is deleted)
symlink_path = old_dir # Same path as old_dir
if symlink_path.is_symlink():
print(f"[DEBUG] Unlinking existing symlink: {symlink_path}")
symlink_path.unlink()
if not symlink_path.exists():
print(f"[DEBUG] Creating symlink: {symlink_path} -> {new_dir}")
try:
symlink_path.symlink_to(new_dir, target_is_directory=True)
print(f"[DEBUG] Successfully created symlink")
except Exception as e:
print(f"[DEBUG] Failed to create symlink: {e}")
logging.getLogger('archivebox.migration').warning(
f"Could not create symlink from {symlink_path} to {new_dir}: {e}"
)
else:
print(f"[DEBUG] Symlink path already exists: {symlink_path}")
# =========================================================================
# Path Calculation and Migration Helpers
@@ -660,13 +655,28 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
if not timestamp:
return None
# Look up existing
# Look up existing (try exact match first, then fuzzy match for truncated timestamps)
try:
return cls.objects.get(url=url, timestamp=timestamp)
snapshot = cls.objects.get(url=url, timestamp=timestamp)
print(f"[DEBUG load_from_directory] Found existing snapshot for {url} @ {timestamp}: {str(snapshot.id)[:8]}")
return snapshot
except cls.DoesNotExist:
print(f"[DEBUG load_from_directory] NOT FOUND (exact): {url} @ {timestamp}")
# Try fuzzy match - index.json may have truncated timestamp
# e.g., index has "1767000340" but DB has "1767000340.624737"
candidates = cls.objects.filter(url=url, timestamp__startswith=timestamp)
if candidates.count() == 1:
snapshot = candidates.first()
print(f"[DEBUG load_from_directory] Found via fuzzy match: {snapshot.timestamp}")
return snapshot
elif candidates.count() > 1:
print(f"[DEBUG load_from_directory] Multiple fuzzy matches, using first")
return candidates.first()
print(f"[DEBUG load_from_directory] NOT FOUND (fuzzy): {url} @ {timestamp}")
return None
except cls.MultipleObjectsReturned:
# Should not happen with unique constraint
print(f"[DEBUG load_from_directory] Multiple snapshots found for {url} @ {timestamp}")
return cls.objects.filter(url=url, timestamp=timestamp).first()
@classmethod
@@ -1668,83 +1678,20 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return archiveresults
def advance_step_if_ready(self) -> bool:
"""
Advance current_step if all foreground hooks in current step are finished.
Called by the state machine to check if step can advance.
Background hooks (.bg) don't block step advancement.
Step advancement rules:
- All foreground ARs in current step must be finished (SUCCEEDED/FAILED/SKIPPED)
- Background ARs (hook_name contains '.bg.') are ignored for advancement
- When ready, increments current_step by 1 (up to 9)
Returns:
True if step was advanced, False if not ready or already at step 9.
"""
from archivebox.hooks import extract_step, is_background_hook
if self.current_step >= 9:
return False # Already at final step
# Get all ARs for current step that are foreground
current_step_ars = self.archiveresult_set.filter(
hook_name__isnull=False
).exclude(hook_name='')
# Check each AR in current step
for ar in current_step_ars:
ar_step = extract_step(ar.hook_name)
if ar_step != self.current_step:
continue # Not in current step
if is_background_hook(ar.hook_name):
continue # Background hooks don't block
# Foreground hook in current step - check if finished
if ar.status not in ArchiveResult.FINAL_OR_ACTIVE_STATES:
# Still pending/queued - can't advance
return False
if ar.status == ArchiveResult.StatusChoices.STARTED:
# Still running - can't advance
return False
# All foreground hooks in current step are finished - advance!
self.current_step += 1
self.save(update_fields=['current_step', 'modified_at'])
return True
def is_finished_processing(self) -> bool:
"""
Check if this snapshot has finished processing.
Check if all ArchiveResults are finished.
Used by SnapshotMachine.is_finished() to determine if snapshot is complete.
Returns:
True if all archiveresults are finished (or no work to do), False otherwise.
Note: This is only called for observability/progress tracking.
SnapshotWorker owns the execution and doesn't poll this.
"""
# if no archiveresults exist yet, it's not finished
if not self.archiveresult_set.exists():
return False
# Check if any ARs are still pending/started
pending = self.archiveresult_set.exclude(
status__in=ArchiveResult.FINAL_OR_ACTIVE_STATES
).exists()
# Try to advance step if ready (handles step-based hook execution)
# This will increment current_step when all foreground hooks in current step are done
while self.advance_step_if_ready():
pass # Keep advancing until we can't anymore
# if archiveresults exist but are still pending, it's not finished
if self.pending_archiveresults().exists():
return False
# Don't wait for background hooks - they'll be cleaned up on entering sealed state
# Background hooks in STARTED state are excluded by pending_archiveresults()
# (STARTED is in FINAL_OR_ACTIVE_STATES) so once all results are FINAL or ACTIVE,
# we can transition to sealed and cleanup() will kill the background hooks
# otherwise archiveresults exist and are all finished, so it's finished
return True
return not pending
def get_progress_stats(self) -> dict:
"""
@@ -2242,7 +2189,6 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
tick = (
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(sealed, cond='is_finished')
)
@@ -2253,6 +2199,10 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
can_start = bool(self.snapshot.url)
return can_start
def is_finished(self) -> bool:
"""Check if all ArchiveResults for this snapshot are finished."""
return self.snapshot.is_finished_processing()
@queued.enter
def enter_queued(self):
self.snapshot.update_and_requeue(
@@ -2262,29 +2212,10 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
@started.enter
def enter_started(self):
import sys
print(f'[cyan] 🔄 SnapshotMachine.enter_started() - creating archiveresults for {self.snapshot.url}[/cyan]', file=sys.stderr)
# Run the snapshot - creates pending archiveresults for all enabled plugins
self.snapshot.run()
# Check if any archiveresults were created
ar_count = self.snapshot.archiveresult_set.count()
print(f'[cyan] 🔄 ArchiveResult count: {ar_count}[/cyan]', file=sys.stderr)
if ar_count == 0:
# No archiveresults created, seal immediately
print(f'[cyan] 🔄 No archiveresults created, sealing snapshot immediately[/cyan]', file=sys.stderr)
self.seal()
else:
# Set status = started with retry_at far future (so workers don't claim us - we're waiting for ARs)
# Last AR will manually call self.seal() when done
self.snapshot.update_and_requeue(
retry_at=timezone.now() + timedelta(days=365),
status=Snapshot.StatusChoices.STARTED,
)
print(f'[cyan] 🔄 {ar_count} archiveresults created, waiting for them to finish[/cyan]', file=sys.stderr)
"""Just mark as started - SnapshotWorker will create ARs and run hooks."""
self.snapshot.status = Snapshot.StatusChoices.STARTED
self.snapshot.retry_at = None # No more polling
self.snapshot.save(update_fields=['status', 'retry_at', 'modified_at'])
@sealed.enter
def enter_sealed(self):
@@ -2329,12 +2260,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
plugins = [get_plugin_name(e) for e in get_plugins()]
return tuple((e, e) for e in plugins)
# Keep AutoField for backward compatibility with 0.7.x databases
# UUID field is added separately by migration for new records
id = models.AutoField(primary_key=True, editable=False)
# Note: unique constraint is added by migration 0027 - don't set unique=True here
# or SQLite table recreation in earlier migrations will fail
uuid = models.UUIDField(default=uuid7, null=True, blank=True, db_index=True)
# UUID primary key (migrated from integer in 0029)
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
# old_id preserves the legacy integer ID for backward compatibility
old_id = models.IntegerField(null=True, blank=True, db_index=True, help_text='Legacy integer ID from pre-0.9.0 versions')
# Note: uuid field was removed in migration 0029 when id became UUID
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -2684,13 +2614,11 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
plugin_dir = Path(self.snapshot.output_dir) / self.plugin
start_ts = timezone.now()
is_bg_hook = False
process = None
for hook in hooks:
# Check if this is a background hook
is_bg_hook = is_background_hook(hook.name)
result = run_hook(
# Run hook using Process.launch() - returns Process model
process = run_hook(
hook,
output_dir=plugin_dir,
config=config,
@@ -2700,27 +2628,25 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
depth=self.snapshot.depth,
)
# Background hooks return None
if result is None:
is_bg_hook = True
# Update status based on hook execution
if is_bg_hook:
# BACKGROUND HOOK - still running, return immediately
# Status stays STARTED, will be finalized by Snapshot.cleanup()
self.status = self.StatusChoices.STARTED
# Link ArchiveResult to Process
self.process = process
self.start_ts = start_ts
if self.process_id:
self.process.pwd = str(plugin_dir)
self.process.save()
self.save(update_fields=['process_id', 'start_ts', 'modified_at'])
if not process:
# No hooks ran
self.status = self.StatusChoices.FAILED
self.output_str = 'No hooks executed'
self.save()
return
# Update status based on hook execution
if process.status == process.StatusChoices.RUNNING:
# BACKGROUND HOOK - still running, return immediately
# Status is already STARTED from enter_started(), will be finalized by Snapshot.cleanup()
return
# FOREGROUND HOOK - completed, update from filesystem
self.start_ts = start_ts
if self.process_id:
self.process.pwd = str(plugin_dir)
self.process.save()
self.update_from_output()
# Clean up empty output directory if no files were created
@@ -3037,26 +2963,30 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
skipped = State(value=ArchiveResult.StatusChoices.SKIPPED, final=True)
# Tick Event - transitions based on conditions
# Flow: queued → started → (succeeded|failed|skipped)
# queued → skipped (if exceeded max attempts)
# started → backoff → started (retry)
tick = (
queued.to(skipped, cond='is_exceeded_max_attempts') | # Check skip first
queued.to.itself(unless='can_start') |
queued.to(started, cond='can_start') |
started.to.itself(unless='is_finished') |
started.to(succeeded, cond='is_succeeded') |
started.to(failed, cond='is_failed') |
started.to(skipped, cond='is_skipped') |
started.to(backoff, cond='is_backoff') |
backoff.to(skipped, cond='is_exceeded_max_attempts') | # Check skip from backoff too
backoff.to.itself(unless='can_start') |
backoff.to(started, cond='can_start') |
backoff.to(succeeded, cond='is_succeeded') |
backoff.to(failed, cond='is_failed') |
backoff.to(skipped, cond='is_skipped')
backoff.to(started, cond='can_start')
# Removed redundant transitions: backoff.to(succeeded/failed/skipped)
# Reason: backoff should always retry→started, then started→final states
)
def can_start(self) -> bool:
if not self.archiveresult.snapshot.url:
return False
"""Pure function - check if AR can start (has valid URL)."""
return bool(self.archiveresult.snapshot.url)
# Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results
def is_exceeded_max_attempts(self) -> bool:
"""Check if snapshot has exceeded MAX_URL_ATTEMPTS failed results."""
from archivebox.config.configset import get_config
config = get_config(
@@ -3070,15 +3000,7 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
status=ArchiveResult.StatusChoices.FAILED
).count()
if failed_count >= max_attempts:
# Mark this result as skipped since we've hit the limit
self.archiveresult.status = ArchiveResult.StatusChoices.SKIPPED
self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
self.archiveresult.retry_at = None
self.archiveresult.save()
return False
return True
return failed_count >= max_attempts
def is_succeeded(self) -> bool:
"""Check if extractor plugin succeeded (status was set by run())."""
@@ -3101,12 +3023,35 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
)
def is_finished(self) -> bool:
"""Check if extraction has completed (success, failure, or skipped)."""
return self.archiveresult.status in (
"""
Check if extraction has completed (success, failure, or skipped).
For background hooks in STARTED state, checks if their Process has finished and reaps them.
"""
# If already in final state, return True
if self.archiveresult.status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
)
):
return True
# If in STARTED state with a Process, check if Process has finished running
if self.archiveresult.status == ArchiveResult.StatusChoices.STARTED:
if self.archiveresult.process_id:
process = self.archiveresult.process
# If process is NOT running anymore, reap the background hook
if not process.is_running():
self.archiveresult.update_from_output()
# Check if now in final state after reaping
return self.archiveresult.status in (
ArchiveResult.StatusChoices.SUCCEEDED,
ArchiveResult.StatusChoices.FAILED,
ArchiveResult.StatusChoices.SKIPPED,
)
return False
@queued.enter
def enter_queued(self):
@@ -3148,7 +3093,12 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
)
def _check_and_seal_parent_snapshot(self):
"""Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot."""
"""
Check if this is the last ArchiveResult to finish - if so, seal the parent Snapshot.
Note: In the new architecture, SnapshotWorker handles step advancement and sealing.
This method is kept for backwards compatibility with manual CLI commands.
"""
import sys
snapshot = self.archiveresult.snapshot
@@ -3189,6 +3139,8 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
def enter_failed(self):
import sys
print(f'[red] ❌ ArchiveResult.enter_failed() called for {self.archiveresult.plugin}[/red]', file=sys.stderr)
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.FAILED,
@@ -3207,6 +3159,16 @@ class ArchiveResultMachine(BaseStateMachine, strict_states=True):
def enter_skipped(self):
import sys
# Set output_str if not already set (e.g., when skipped due to max attempts)
if not self.archiveresult.output_str and self.is_exceeded_max_attempts():
from archivebox.config.configset import get_config
config = get_config(
crawl=self.archiveresult.snapshot.crawl,
snapshot=self.archiveresult.snapshot,
)
max_attempts = config.get('MAX_URL_ATTEMPTS', 50)
self.archiveresult.output_str = f'Skipped: snapshot exceeded MAX_URL_ATTEMPTS ({max_attempts} failures)'
self.archiveresult.update_and_requeue(
retry_at=None,
status=ArchiveResult.StatusChoices.SKIPPED,