more fixes

2026-01-03 09:25:42 +10:00 · 2025-12-29 21:03:05 -08:00
parent 147d567d3f
commit 80f75126c6
12 changed files with 339 additions and 135 deletions
--- a/archivebox/api/migrations/0001_initial.py
+++ b/archivebox/api/migrations/0001_initial.py
@@ -21,12 +21,8 @@ class Migration(migrations.Migration):
                    id TEXT PRIMARY KEY NOT NULL,
                    created_at DATETIME NOT NULL,
                    modified_at DATETIME NOT NULL,
-                    num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
-                    num_uses_failed INTEGER NOT NULL DEFAULT 0,

                    token VARCHAR(32) NOT NULL UNIQUE,
-                    label VARCHAR(64) NOT NULL DEFAULT '',
-                    notes TEXT NOT NULL DEFAULT '',
                    expires DATETIME,

                    created_by_id INTEGER NOT NULL,
@@ -41,19 +37,20 @@ class Migration(migrations.Migration):
                    id TEXT PRIMARY KEY NOT NULL,
                    created_at DATETIME NOT NULL,
                    modified_at DATETIME NOT NULL,
-                    num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
-                    num_uses_failed INTEGER NOT NULL DEFAULT 0,

                    name VARCHAR(255) NOT NULL UNIQUE,
                    signal VARCHAR(255) NOT NULL,
                    ref VARCHAR(1024) NOT NULL,
                    endpoint VARCHAR(2048) NOT NULL,
                    headers TEXT NOT NULL DEFAULT '{}',
+                    auth_token TEXT NOT NULL DEFAULT '',
                    enabled BOOLEAN NOT NULL DEFAULT 1,
                    keep_last_response BOOLEAN NOT NULL DEFAULT 0,
-                    last_response TEXT,
+                    created DATETIME NOT NULL,
+                    updated DATETIME NOT NULL,
+                    last_response TEXT NOT NULL DEFAULT '',
                    last_success DATETIME,
-                    last_error DATETIME,
+                    last_failure DATETIME,

                    created_by_id INTEGER NOT NULL,

--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -52,20 +52,21 @@ def update(filter_patterns: Iterable[str] = (),
            )
            print_stats(stats)
        else:
-            # Full mode: import orphans + process DB + deduplicate
-            stats_combined = {'phase1': {}, 'phase2': {}, 'deduplicated': 0}
+            # Full mode: drain old dirs + process DB
+            stats_combined = {'phase1': {}, 'phase2': {}}

-            print('[*] Phase 1: Scanning archive/ for orphaned snapshots...')
-            stats_combined['phase1'] = import_orphans_from_archive(
+            print('[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...')
+            stats_combined['phase1'] = drain_old_archive_dirs(
                resume_from=resume,
                batch_size=batch_size
            )

-            print('[*] Phase 2: Processing all database snapshots...')
+            print('[*] Phase 2: Processing all database snapshots (most recent first)...')
            stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size)

-            print('[*] Phase 3: Deduplicating...')
-            stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates()
+            # Phase 3: Deduplication (disabled for now)
+            # print('[*] Phase 3: Deduplicating...')
+            # stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates()

            print_combined_stats(stats_combined)

@@ -77,33 +78,39 @@ def update(filter_patterns: Iterable[str] = (),
        resume = None


-def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100) -> dict:
+def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> dict:
    """
-    Scan archive/ for orphaned snapshots.
-    Skip symlinks (already migrated).
-    Create DB records and trigger migration on save().
+    Drain old archive/ directories (0.8.x → 0.9.x migration).
+
+    Only processes real directories (skips symlinks - those are already migrated).
+    For each old dir found in archive/:
+      1. Load or create DB snapshot
+      2. Trigger fs migration on save() to move to data/users/{user}/...
+      3. Leave symlink in archive/ pointing to new location
+
+    After this drains, archive/ should only contain symlinks and we can trust
+    1:1 mapping between DB and filesystem.
    """
    from archivebox.core.models import Snapshot
    from archivebox.config import CONSTANTS
    from django.db import transaction

-    stats = {'processed': 0, 'imported': 0, 'migrated': 0, 'invalid': 0}
+    stats = {'processed': 0, 'migrated': 0, 'skipped': 0, 'invalid': 0}

    archive_dir = CONSTANTS.ARCHIVE_DIR
    if not archive_dir.exists():
        return stats

-    print('[*] Scanning and sorting by modification time...')
+    print('[*] Scanning for old directories in archive/...')

-    # Scan and sort by mtime (newest first)
-    # Loading (mtime, path) tuples is fine even for millions (~100MB for 1M entries)
+    # Scan for real directories only (skip symlinks - they're already migrated)
    entries = [
        (e.stat().st_mtime, e.path)
        for e in os.scandir(archive_dir)
        if e.is_dir(follow_symlinks=False)  # Skip symlinks
    ]
    entries.sort(reverse=True)  # Newest first
-    print(f'[*] Found {len(entries)} directories to check')
+    print(f'[*] Found {len(entries)} old directories to drain')

    for mtime, entry_path in entries:
        entry_path = Path(entry_path)
@@ -114,30 +121,26 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)

        stats['processed'] += 1

-        # Check if already in DB
+        # Try to load existing snapshot from DB
        snapshot = Snapshot.load_from_directory(entry_path)
-        if snapshot:
-            continue  # Already in DB, skip

-        # Not in DB - create orphaned snapshot
-        snapshot = Snapshot.create_from_directory(entry_path)
        if not snapshot:
-            # Invalid directory
-            Snapshot.move_directory_to_invalid(entry_path)
-            stats['invalid'] += 1
-            print(f"    [{stats['processed']}] Invalid: {entry_path.name}")
-            continue
+            # Not in DB - create new snapshot record
+            snapshot = Snapshot.create_from_directory(entry_path)
+            if not snapshot:
+                # Invalid directory - move to invalid/
+                Snapshot.move_directory_to_invalid(entry_path)
+                stats['invalid'] += 1
+                print(f"    [{stats['processed']}] Invalid: {entry_path.name}")
+                continue

-        needs_migration = snapshot.fs_migration_needed
-
-        snapshot.save()  # Creates DB record + triggers migration
-
-        stats['imported'] += 1
-        if needs_migration:
+        # Check if needs migration (0.8.x → 0.9.x)
+        if snapshot.fs_migration_needed:
+            snapshot.save()  # Triggers migration + creates symlink
            stats['migrated'] += 1
-            print(f"    [{stats['processed']}] Imported + migrated: {entry_path.name}")
+            print(f"    [{stats['processed']}] Migrated: {entry_path.name}")
        else:
-            print(f"    [{stats['processed']}] Imported: {entry_path.name}")
+            stats['skipped'] += 1

        if stats['processed'] % batch_size == 0:
            transaction.commit()
@@ -148,8 +151,14 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)

 def process_all_db_snapshots(batch_size: int = 100) -> dict:
    """
-    Process all snapshots in DB.
-    Reconcile index.json and queue for archiving.
+    O(n) scan over entire DB from most recent to least recent.
+
+    For each snapshot:
+      1. Reconcile index.json with DB (merge titles, tags, archive results)
+      2. Queue for archiving (state machine will handle it)
+
+    No orphan detection needed - we trust 1:1 mapping between DB and filesystem
+    after Phase 1 has drained all old archive/ directories.
    """
    from archivebox.core.models import Snapshot
    from django.db import transaction
@@ -158,9 +167,10 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
    stats = {'processed': 0, 'reconciled': 0, 'queued': 0}

    total = Snapshot.objects.count()
-    print(f'[*] Processing {total} snapshots from database...')
+    print(f'[*] Processing {total} snapshots from database (most recent first)...')

-    for snapshot in Snapshot.objects.iterator(chunk_size=batch_size):
+    # Process from most recent to least recent
+    for snapshot in Snapshot.objects.order_by('-bookmarked_at').iterator(chunk_size=batch_size):
        # Reconcile index.json with DB
        snapshot.reconcile_with_index_json()

@@ -252,19 +262,16 @@ def print_combined_stats(stats_combined: dict):
    print(f"""
 [green]Archive Update Complete[/green]

-Phase 1 (Import Orphans):
+Phase 1 (Drain Old Dirs):
  Checked:     {s1.get('processed', 0)}
-  Imported:    {s1.get('imported', 0)}
  Migrated:    {s1.get('migrated', 0)}
+  Skipped:     {s1.get('skipped', 0)}
  Invalid:     {s1.get('invalid', 0)}

 Phase 2 (Process DB):
  Processed:   {s2.get('processed', 0)}
  Reconciled:  {s2.get('reconciled', 0)}
  Queued:      {s2.get('queued', 0)}
-
-Phase 3 (Deduplicate):
-  Merged:      {stats_combined['deduplicated']}
 """)


--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -297,7 +297,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
    status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
    config = models.JSONField(default=dict, null=False, blank=False, editable=True)
    notes = models.TextField(blank=True, null=False, default='')
-    output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True)
+    # output_dir is computed via @cached_property from fs_version and get_storage_path_for_version()

    tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))

@@ -1981,7 +1981,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
    status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
    notes = models.TextField(blank=True, null=False, default='')
-    output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
+    # output_dir is computed via @property from snapshot.output_dir / plugin

    state_machine_name = 'archivebox.core.models.ArchiveResultMachine'
    retry_at_field_name = 'retry_at'
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -358,10 +358,19 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
        """Clean up background hooks and run on_CrawlEnd hooks."""
        import os
        import signal
+        import time
        from pathlib import Path
        from archivebox.hooks import run_hook, discover_hooks
        from archivebox.misc.process_utils import validate_pid_file

+        def is_process_alive(pid):
+            """Check if a process exists."""
+            try:
+                os.kill(pid, 0)  # Signal 0 checks existence without killing
+                return True
+            except (OSError, ProcessLookupError):
+                return False
+
        # Kill any background processes by scanning for all .pid files
        if self.OUTPUT_DIR.exists():
            for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
@@ -371,9 +380,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                    # PID reused by different process or process dead
                    pid_file.unlink(missing_ok=True)
                    continue
-                
+
                try:
                    pid = int(pid_file.read_text().strip())
+
+                    # Step 1: Send SIGTERM for graceful shutdown
                    try:
                        # Try to kill process group first (handles detached processes like Chrome)
                        try:
@@ -382,8 +393,46 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                            # Fall back to killing just the process
                            os.kill(pid, signal.SIGTERM)
                    except ProcessLookupError:
-                        pass  # Already dead
-                except (ValueError, OSError):
+                        # Already dead
+                        pid_file.unlink(missing_ok=True)
+                        continue
+
+                    # Step 2: Wait for graceful shutdown
+                    time.sleep(2)
+
+                    # Step 3: Check if still alive
+                    if not is_process_alive(pid):
+                        # Process terminated gracefully
+                        pid_file.unlink(missing_ok=True)
+                        continue
+
+                    # Step 4: Process still alive, force kill ENTIRE process group with SIGKILL
+                    try:
+                        try:
+                            # Always kill entire process group with SIGKILL (not individual processes)
+                            os.killpg(pid, signal.SIGKILL)
+                        except (OSError, ProcessLookupError) as e:
+                            # Process group kill failed, try single process as fallback
+                            os.kill(pid, signal.SIGKILL)
+                    except ProcessLookupError:
+                        # Process died between check and kill
+                        pid_file.unlink(missing_ok=True)
+                        continue
+
+                    # Step 5: Wait and verify death
+                    time.sleep(1)
+
+                    if is_process_alive(pid):
+                        # Process is unkillable (likely in UNE state on macOS)
+                        # This happens when Chrome crashes in kernel syscall (IOSurface)
+                        # Log but don't block cleanup - process will remain until reboot
+                        print(f'[yellow]⚠️ Process {pid} is unkillable (likely crashed in kernel). Will remain until reboot.[/yellow]')
+                    else:
+                        # Successfully killed
+                        pid_file.unlink(missing_ok=True)
+
+                except (ValueError, OSError) as e:
+                    # Invalid PID file or permission error
                    pass

        # Run on_CrawlEnd hooks
--- a/archivebox/plugins/chrome/chrome_utils.js
+++ b/archivebox/plugins/chrome/chrome_utils.js
@@ -397,8 +397,53 @@ async function launchChromium(options = {}) {
    }
 }

+/**
+ * Check if a process is still running.
+ * @param {number} pid - Process ID to check
+ * @returns {boolean} - True if process exists
+ */
+function isProcessAlive(pid) {
+    try {
+        process.kill(pid, 0);  // Signal 0 checks existence without killing
+        return true;
+    } catch (e) {
+        return false;
+    }
+}
+
+/**
+ * Find all Chrome child processes for a given debug port.
+ * @param {number} port - Debug port number
+ * @returns {Array<number>} - Array of PIDs
+ */
+function findChromeProcessesByPort(port) {
+    const { execSync } = require('child_process');
+    const pids = [];
+
+    try {
+        // Find all Chrome processes using this debug port
+        const output = execSync(
+            `ps aux | grep -i "chrome.*--remote-debugging-port=${port}" | grep -v grep | awk '{print $2}'`,
+            { encoding: 'utf8', timeout: 5000 }
+        );
+
+        for (const line of output.split('\n')) {
+            const pid = parseInt(line.trim(), 10);
+            if (!isNaN(pid) && pid > 0) {
+                pids.push(pid);
+            }
+        }
+    } catch (e) {
+        // Command failed or no processes found
+    }
+
+    return pids;
+}
+
 /**
 * Kill a Chrome process by PID.
+ * Always sends SIGTERM before SIGKILL, then verifies death.
+ *
 * @param {number} pid - Process ID to kill
 * @param {string} [outputDir] - Directory containing PID files to clean up
 */
@@ -407,30 +452,93 @@ async function killChrome(pid, outputDir = null) {

    console.error(`[*] Killing Chrome process tree (PID ${pid})...`);

-    // Try to kill process group first
+    // Get debug port for finding child processes
+    let debugPort = null;
+    if (outputDir) {
+        try {
+            const portFile = path.join(outputDir, 'port.txt');
+            if (fs.existsSync(portFile)) {
+                debugPort = parseInt(fs.readFileSync(portFile, 'utf8').trim(), 10);
+            }
+        } catch (e) {}
+    }
+
+    // Step 1: SIGTERM to process group (graceful shutdown)
+    console.error(`[*] Sending SIGTERM to process group -${pid}...`);
    try {
        process.kill(-pid, 'SIGTERM');
    } catch (e) {
-        try { process.kill(pid, 'SIGTERM'); } catch (e2) {}
+        try {
+            console.error(`[*] Process group kill failed, trying single process...`);
+            process.kill(pid, 'SIGTERM');
+        } catch (e2) {
+            console.error(`[!] SIGTERM failed: ${e2.message}`);
+        }
    }

-    // Wait for graceful shutdown
+    // Step 2: Wait for graceful shutdown
    await new Promise(resolve => setTimeout(resolve, 2000));

-    // Force kill
-    try {
-        process.kill(-pid, 'SIGKILL');
-    } catch (e) {
-        try { process.kill(pid, 'SIGKILL'); } catch (e2) {}
+    // Step 3: Check if still alive
+    if (!isProcessAlive(pid)) {
+        console.error('[+] Chrome process terminated gracefully');
+    } else {
+        // Step 4: Force kill ENTIRE process group with SIGKILL
+        console.error(`[*] Process still alive, sending SIGKILL to process group -${pid}...`);
+        try {
+            process.kill(-pid, 'SIGKILL');  // Kill entire process group
+        } catch (e) {
+            console.error(`[!] Process group SIGKILL failed, trying single process: ${e.message}`);
+            try {
+                process.kill(pid, 'SIGKILL');
+            } catch (e2) {
+                console.error(`[!] SIGKILL failed: ${e2.message}`);
+            }
+        }
+
+        // Step 5: Wait briefly and verify death
+        await new Promise(resolve => setTimeout(resolve, 1000));
+
+        if (isProcessAlive(pid)) {
+            console.error(`[!] WARNING: Process ${pid} is unkillable (likely in UNE state)`);
+            console.error(`[!] This typically happens when Chrome crashes in kernel syscall`);
+            console.error(`[!] Process will remain as zombie until system reboot`);
+            console.error(`[!] macOS IOSurface crash creates unkillable processes in UNE state`);
+
+            // Try one more time to kill the entire process group
+            if (debugPort) {
+                const relatedPids = findChromeProcessesByPort(debugPort);
+                if (relatedPids.length > 1) {
+                    console.error(`[*] Found ${relatedPids.length} Chrome processes still running on port ${debugPort}`);
+                    console.error(`[*] Attempting final process group SIGKILL...`);
+
+                    // Try to kill each unique process group we find
+                    const processGroups = new Set();
+                    for (const relatedPid of relatedPids) {
+                        if (relatedPid !== pid) {
+                            processGroups.add(relatedPid);
+                        }
+                    }
+
+                    for (const groupPid of processGroups) {
+                        try {
+                            process.kill(-groupPid, 'SIGKILL');
+                        } catch (e) {}
+                    }
+                }
+            }
+        } else {
+            console.error('[+] Chrome process group killed successfully');
+        }
    }

-    // Clean up PID files
+    // Step 8: Clean up PID files
    if (outputDir) {
        try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {}
        try { fs.unlinkSync(path.join(outputDir, 'hook.pid')); } catch (e) {}
    }

-    console.error('[*] Chrome process killed');
+    console.error('[*] Chrome cleanup completed');
 }

 /**
--- a/archivebox/plugins/chrome/tests/test_chrome.py
+++ b/archivebox/plugins/chrome/tests/test_chrome.py
@@ -594,36 +594,57 @@ def test_zombie_prevention_hook_killed():
        except OSError:
            pytest.fail("Chrome should still be running after hook SIGKILL")

-        # Simulate Crawl.cleanup() - kill all .pid files
+        # Simulate Crawl.cleanup() using the actual cleanup logic
+        def is_process_alive(pid):
+            """Check if a process exists."""
+            try:
+                os.kill(pid, 0)
+                return True
+            except (OSError, ProcessLookupError):
+                return False
+
        for pid_file in chrome_dir.glob('**/*.pid'):
            try:
                pid = int(pid_file.read_text().strip())
+
+                # Step 1: SIGTERM for graceful shutdown
                try:
-                    # Try to kill process group first (for detached processes like Chrome)
                    try:
                        os.killpg(pid, signal.SIGTERM)
                    except (OSError, ProcessLookupError):
-                        # Fall back to killing just the process
                        os.kill(pid, signal.SIGTERM)
+                except ProcessLookupError:
+                    pid_file.unlink(missing_ok=True)
+                    continue

-                    time.sleep(0.5)
+                # Step 2: Wait for graceful shutdown
+                time.sleep(2)

-                    # Force kill if still alive
+                # Step 3: Check if still alive
+                if not is_process_alive(pid):
+                    pid_file.unlink(missing_ok=True)
+                    continue
+
+                # Step 4: Force kill ENTIRE process group with SIGKILL
+                try:
                    try:
+                        # Always kill entire process group with SIGKILL
                        os.killpg(pid, signal.SIGKILL)
                    except (OSError, ProcessLookupError):
-                        try:
-                            os.kill(pid, signal.SIGKILL)
-                        except OSError:
-                            pass
+                        os.kill(pid, signal.SIGKILL)
                except ProcessLookupError:
-                    pass
+                    pid_file.unlink(missing_ok=True)
+                    continue
+
+                # Step 5: Wait and verify death
+                time.sleep(1)
+
+                if not is_process_alive(pid):
+                    pid_file.unlink(missing_ok=True)
+
            except (ValueError, OSError):
                pass

-        # Wait a moment for cleanup
-        time.sleep(1)
-
        # Chrome should now be dead
        try:
            os.kill(chrome_pid, 0)
--- a/archivebox/plugins/forumdl/forum-dl-wrapper.py
+++ b/archivebox/plugins/forumdl/forum-dl-wrapper.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+"""
+Wrapper for forum-dl that applies Pydantic v2 compatibility patches.
+
+This wrapper fixes forum-dl 0.3.0's incompatibility with Pydantic v2 by monkey-patching
+the JsonlWriter class to use model_dump_json() instead of the deprecated json(models_as_dict=False).
+"""
+
+import sys
+
+# Apply Pydantic v2 compatibility patch BEFORE importing forum_dl
+try:
+    from forum_dl.writers.jsonl import JsonlWriter
+    from pydantic import BaseModel
+
+    # Check if we're using Pydantic v2
+    if hasattr(BaseModel, 'model_dump_json'):
+        def _patched_serialize_entry(self, entry):
+            """Use Pydantic v2's model_dump_json() instead of deprecated json(models_as_dict=False)"""
+            return entry.model_dump_json()
+
+        JsonlWriter._serialize_entry = _patched_serialize_entry
+except (ImportError, AttributeError):
+    # forum-dl not installed or already compatible - no patch needed
+    pass
+
+# Now import and run forum-dl's main function
+from forum_dl import main
+
+if __name__ == '__main__':
+    sys.exit(main())
--- a/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py
+++ b/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py
@@ -115,8 +115,12 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
    else:
        output_file = output_dir / f'forum.{output_format}'

-    # Build command
-    cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)]
+    # Use our Pydantic v2 compatible wrapper if available, otherwise fall back to binary
+    wrapper_path = Path(__file__).parent / 'forum-dl-wrapper.py'
+    if wrapper_path.exists():
+        cmd = [sys.executable, str(wrapper_path), *forumdl_args, '-f', output_format, '-o', str(output_file)]
+    else:
+        cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)]

    if not check_ssl:
        cmd.append('--no-check-certificate')
--- a/archivebox/plugins/forumdl/tests/test_forumdl.py
+++ b/archivebox/plugins/forumdl/tests/test_forumdl.py
@@ -205,14 +205,9 @@ def test_config_timeout():


 def test_real_forum_url():
-    """Test that forum-dl processes real forum URLs with jsonl output format.
+    """Test that forum-dl extracts content from a real HackerNews thread with jsonl output.

-    NOTE: forum-dl currently has known issues:
-    - Pydantic v2 incompatibility causing errors with most extractors
-    - Many forums return 403/404 or have changed their structure
-    - This test verifies the hook runs and handles these issues gracefully
-
-    If forum-dl is fixed in the future, this test should start succeeding with actual downloads.
+    Uses our Pydantic v2 compatible wrapper to fix forum-dl 0.3.0's incompatibility.
    """
    import os

@@ -224,15 +219,14 @@ def test_real_forum_url():
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir = Path(tmpdir)

-        # Try HackerNews - supported by forum-dl but currently has Pydantic v2 compat issues
-        # When forum-dl is updated, this URL should work
+        # Use HackerNews - one of the most reliable forum-dl extractors
        forum_url = 'https://news.ycombinator.com/item?id=1'

        env = os.environ.copy()
        env['FORUMDL_BINARY'] = binary_path
        env['FORUMDL_TIMEOUT'] = '60'
-        env['FORUMDL_OUTPUT_FORMAT'] = 'jsonl'  # Use jsonl format as requested
-        # HTML output would be via: env['FORUMDL_EXTRA_ARGS'] = '--files-output ./files'
+        env['FORUMDL_OUTPUT_FORMAT'] = 'jsonl'  # Use jsonl format
+        # HTML output could be added via: env['FORUMDL_ARGS_EXTRA'] = json.dumps(['--files-output', './files'])

        start_time = time.time()
        result = subprocess.run(
@@ -245,40 +239,37 @@ def test_real_forum_url():
        )
        elapsed_time = time.time() - start_time

-        # Test passes if the hook handles the URL gracefully (success OR handled error)
-        # This is appropriate given forum-dl's current state
-        assert result.returncode in (0, 1), f"Hook should handle forum URL gracefully. stderr: {result.stderr}"
+        # Should succeed with our Pydantic v2 wrapper
+        assert result.returncode == 0, f"Should extract forum successfully: {result.stderr}"

-        # Check for successful extraction (will pass when forum-dl is fixed)
-        if result.returncode == 0:
-            result_json = None
-            for line in result.stdout.strip().split('\n'):
-                line = line.strip()
-                if line.startswith('{'):
-                    try:
-                        record = json.loads(line)
-                        if record.get('type') == 'ArchiveResult':
-                            result_json = record
-                            break
-                    except json.JSONDecodeError:
-                        pass
+        # Parse JSONL output
+        result_json = None
+        for line in result.stdout.strip().split('\n'):
+            line = line.strip()
+            if line.startswith('{'):
+                try:
+                    record = json.loads(line)
+                    if record.get('type') == 'ArchiveResult':
+                        result_json = record
+                        break
+                except json.JSONDecodeError:
+                    pass

-            if result_json and result_json['status'] == 'succeeded':
-                output_files = list(tmpdir.glob('**/*'))
-                forum_files = [f for f in output_files if f.is_file()]
-                if forum_files:
-                    print(f"✓ Successfully extracted {len(forum_files)} file(s) in {elapsed_time:.2f}s")
-                else:
-                    print(f"✓ Completed in {elapsed_time:.2f}s (no content - URL may not be a forum thread)")
-            else:
-                print(f"✓ Completed in {elapsed_time:.2f}s (no content extracted)")
-        else:
-            # Handled error gracefully - test still passes
-            error_msg = result.stderr.strip()[:200]
-            print(f"✓ Handled error gracefully in {elapsed_time:.2f}s")
-            # Known issues: Pydantic v2 compat, 403 errors, etc.
-            assert '403' in error_msg or 'pydantic' in error_msg.lower() or 'error' in error_msg.lower(), \
-                f"Expected known error type, got: {error_msg}"
+        assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}"
+        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
+
+        # Check that forum files were downloaded
+        output_files = list(tmpdir.glob('**/*'))
+        forum_files = [f for f in output_files if f.is_file()]
+
+        assert len(forum_files) > 0, f"Should have downloaded at least one forum file. Files: {output_files}"
+
+        # Verify the JSONL file has content
+        jsonl_file = tmpdir / 'forum.jsonl'
+        assert jsonl_file.exists(), "Should have created forum.jsonl"
+        assert jsonl_file.stat().st_size > 0, "forum.jsonl should not be empty"
+
+        print(f"Successfully extracted {len(forum_files)} file(s) in {elapsed_time:.2f}s")


 if __name__ == '__main__':
--- a/archivebox/plugins/git/on_Snapshot__62_git.py
+++ b/archivebox/plugins/git/on_Snapshot__62_git.py
@@ -76,7 +76,7 @@ def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
    Returns: (success, output_path, error_message)
    """
    timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120)
-    git_args = get_env_array('GIT_ARGS', [])
+    git_args = get_env_array('GIT_ARGS', ["clone", "--depth=1", "--recursive"])
    git_args_extra = get_env_array('GIT_ARGS_EXTRA', [])

    cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR]
--- a/archivebox/templates/admin/progress_monitor.html
+++ b/archivebox/templates/admin/progress_monitor.html
@@ -518,8 +518,8 @@
                    <div class="snapshot-info">
                        <div class="snapshot-url">${formatUrl(snapshot.url)}</div>
                        <div class="snapshot-meta">
-                            ${snapshot.completed_extractors}/${snapshot.total_extractors} extractors
-                            ${snapshot.failed_extractors > 0 ? `<span style="color:#f85149">(${snapshot.failed_extractors} failed)</span>` : ''}
+                            ${snapshot.completed_plugins}/${snapshot.total_plugins} extractors
+                            ${snapshot.failed_plugins > 0 ? `<span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}
                        </div>
                    </div>
                    <span class="status-badge ${snapshot.status}">${snapshot.status}</span>
--- a/tests/test_cli_init.py
+++ b/tests/test_cli_init.py
@@ -219,8 +219,8 @@ def test_init_quick_flag_skips_checks(tmp_path):
    assert db_path.exists()


-def test_init_creates_machine_record(tmp_path):
-    """Test that init creates a Machine record in machine_machine table."""
+def test_init_creates_machine_table(tmp_path):
+    """Test that init creates the machine_machine table."""
    os.chdir(tmp_path)
    subprocess.run(['archivebox', 'init'], capture_output=True)

@@ -231,14 +231,10 @@ def test_init_creates_machine_record(tmp_path):
    tables = c.execute(
        "SELECT name FROM sqlite_master WHERE type='table' AND name='machine_machine'"
    ).fetchall()
-    assert len(tables) == 1
-
-    # Check that a machine record was created
-    machine_count = c.execute("SELECT COUNT(*) FROM machine_machine").fetchone()[0]
-    assert machine_count >= 1
-
    conn.close()

+    assert len(tables) == 1
+

 def test_init_output_shows_collection_info(tmp_path):
    """Test that init output shows helpful collection information."""