From 80f75126c67bf08d18620e6d3cc3d4dd0d82e740 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Mon, 29 Dec 2025 21:03:05 -0800
Subject: [PATCH] more fixes

---
 archivebox/api/migrations/0001_initial.py     |  13 +-
 archivebox/cli/archivebox_update.py           |  93 +++++++------
 archivebox/core/models.py                     |   4 +-
 archivebox/crawls/models.py                   |  55 +++++++-
 archivebox/plugins/chrome/chrome_utils.js     | 128 ++++++++++++++++--
 .../plugins/chrome/tests/test_chrome.py       |  47 +++++--
 .../plugins/forumdl/forum-dl-wrapper.py       |  31 +++++
 .../forumdl/on_Snapshot__65_forumdl.bg.py     |   8 +-
 .../plugins/forumdl/tests/test_forumdl.py     |  77 +++++------
 archivebox/plugins/git/on_Snapshot__62_git.py |   2 +-
 .../templates/admin/progress_monitor.html     |   4 +-
 tests/test_cli_init.py                        |  12 +-
 12 files changed, 339 insertions(+), 135 deletions(-)
 create mode 100755 archivebox/plugins/forumdl/forum-dl-wrapper.py

diff --git a/archivebox/api/migrations/0001_initial.py b/archivebox/api/migrations/0001_initial.py
index 037ea575..fc3ce8a1 100644
--- a/archivebox/api/migrations/0001_initial.py
+++ b/archivebox/api/migrations/0001_initial.py
@@ -21,12 +21,8 @@ class Migration(migrations.Migration):
                     id TEXT PRIMARY KEY NOT NULL,
                     created_at DATETIME NOT NULL,
                     modified_at DATETIME NOT NULL,
-                    num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
-                    num_uses_failed INTEGER NOT NULL DEFAULT 0,
 
                     token VARCHAR(32) NOT NULL UNIQUE,
-                    label VARCHAR(64) NOT NULL DEFAULT '',
-                    notes TEXT NOT NULL DEFAULT '',
                     expires DATETIME,
 
                     created_by_id INTEGER NOT NULL,
@@ -41,19 +37,20 @@ class Migration(migrations.Migration):
                     id TEXT PRIMARY KEY NOT NULL,
                     created_at DATETIME NOT NULL,
                     modified_at DATETIME NOT NULL,
-                    num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
-                    num_uses_failed INTEGER NOT NULL DEFAULT 0,
 
                     name VARCHAR(255) NOT NULL UNIQUE,
                     signal VARCHAR(255) NOT NULL,
                     ref VARCHAR(1024) NOT NULL,
                     endpoint VARCHAR(2048) NOT NULL,
                     headers TEXT NOT NULL DEFAULT '{}',
+                    auth_token TEXT NOT NULL DEFAULT '',
                     enabled BOOLEAN NOT NULL DEFAULT 1,
                     keep_last_response BOOLEAN NOT NULL DEFAULT 0,
-                    last_response TEXT,
+                    created DATETIME NOT NULL,
+                    updated DATETIME NOT NULL,
+                    last_response TEXT NOT NULL DEFAULT '',
                     last_success DATETIME,
-                    last_error DATETIME,
+                    last_failure DATETIME,
 
                     created_by_id INTEGER NOT NULL,
 
diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py
index b0e29be9..d5ebc622 100644
--- a/archivebox/cli/archivebox_update.py
+++ b/archivebox/cli/archivebox_update.py
@@ -52,20 +52,21 @@ def update(filter_patterns: Iterable[str] = (),
             )
             print_stats(stats)
         else:
-            # Full mode: import orphans + process DB + deduplicate
-            stats_combined = {'phase1': {}, 'phase2': {}, 'deduplicated': 0}
+            # Full mode: drain old dirs + process DB
+            stats_combined = {'phase1': {}, 'phase2': {}}
 
-            print('[*] Phase 1: Scanning archive/ for orphaned snapshots...')
-            stats_combined['phase1'] = import_orphans_from_archive(
+            print('[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...')
+            stats_combined['phase1'] = drain_old_archive_dirs(
                 resume_from=resume,
                 batch_size=batch_size
             )
 
-            print('[*] Phase 2: Processing all database snapshots...')
+            print('[*] Phase 2: Processing all database snapshots (most recent first)...')
             stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size)
 
-            print('[*] Phase 3: Deduplicating...')
-            stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates()
+            # Phase 3: Deduplication (disabled for now)
+            # print('[*] Phase 3: Deduplicating...')
+            # stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates()
 
             print_combined_stats(stats_combined)
 
@@ -77,33 +78,39 @@ def update(filter_patterns: Iterable[str] = (),
         resume = None
 
 
-def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100) -> dict:
+def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> dict:
     """
-    Scan archive/ for orphaned snapshots.
-    Skip symlinks (already migrated).
-    Create DB records and trigger migration on save().
+    Drain old archive/ directories (0.8.x → 0.9.x migration).
+
+    Only processes real directories (skips symlinks - those are already migrated).
+    For each old dir found in archive/:
+      1. Load or create DB snapshot
+      2. Trigger fs migration on save() to move to data/users/{user}/...
+      3. Leave symlink in archive/ pointing to new location
+
+    After this drains, archive/ should only contain symlinks and we can trust
+    1:1 mapping between DB and filesystem.
     """
     from archivebox.core.models import Snapshot
     from archivebox.config import CONSTANTS
     from django.db import transaction
 
-    stats = {'processed': 0, 'imported': 0, 'migrated': 0, 'invalid': 0}
+    stats = {'processed': 0, 'migrated': 0, 'skipped': 0, 'invalid': 0}
 
     archive_dir = CONSTANTS.ARCHIVE_DIR
     if not archive_dir.exists():
         return stats
 
-    print('[*] Scanning and sorting by modification time...')
+    print('[*] Scanning for old directories in archive/...')
 
-    # Scan and sort by mtime (newest first)
-    # Loading (mtime, path) tuples is fine even for millions (~100MB for 1M entries)
+    # Scan for real directories only (skip symlinks - they're already migrated)
     entries = [
         (e.stat().st_mtime, e.path)
         for e in os.scandir(archive_dir)
         if e.is_dir(follow_symlinks=False)  # Skip symlinks
     ]
     entries.sort(reverse=True)  # Newest first
-    print(f'[*] Found {len(entries)} directories to check')
+    print(f'[*] Found {len(entries)} old directories to drain')
 
     for mtime, entry_path in entries:
         entry_path = Path(entry_path)
@@ -114,30 +121,26 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)
 
         stats['processed'] += 1
 
-        # Check if already in DB
+        # Try to load existing snapshot from DB
         snapshot = Snapshot.load_from_directory(entry_path)
-        if snapshot:
-            continue  # Already in DB, skip
 
-        # Not in DB - create orphaned snapshot
-        snapshot = Snapshot.create_from_directory(entry_path)
         if not snapshot:
-            # Invalid directory
-            Snapshot.move_directory_to_invalid(entry_path)
-            stats['invalid'] += 1
-            print(f"    [{stats['processed']}] Invalid: {entry_path.name}")
-            continue
+            # Not in DB - create new snapshot record
+            snapshot = Snapshot.create_from_directory(entry_path)
+            if not snapshot:
+                # Invalid directory - move to invalid/
+                Snapshot.move_directory_to_invalid(entry_path)
+                stats['invalid'] += 1
+                print(f"    [{stats['processed']}] Invalid: {entry_path.name}")
+                continue
 
-        needs_migration = snapshot.fs_migration_needed
-
-        snapshot.save()  # Creates DB record + triggers migration
-
-        stats['imported'] += 1
-        if needs_migration:
+        # Check if needs migration (0.8.x → 0.9.x)
+        if snapshot.fs_migration_needed:
+            snapshot.save()  # Triggers migration + creates symlink
             stats['migrated'] += 1
-            print(f"    [{stats['processed']}] Imported + migrated: {entry_path.name}")
+            print(f"    [{stats['processed']}] Migrated: {entry_path.name}")
         else:
-            print(f"    [{stats['processed']}] Imported: {entry_path.name}")
+            stats['skipped'] += 1
 
         if stats['processed'] % batch_size == 0:
             transaction.commit()
@@ -148,8 +151,14 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)
 
 def process_all_db_snapshots(batch_size: int = 100) -> dict:
     """
-    Process all snapshots in DB.
-    Reconcile index.json and queue for archiving.
+    O(n) scan over entire DB from most recent to least recent.
+
+    For each snapshot:
+      1. Reconcile index.json with DB (merge titles, tags, archive results)
+      2. Queue for archiving (state machine will handle it)
+
+    No orphan detection needed - we trust 1:1 mapping between DB and filesystem
+    after Phase 1 has drained all old archive/ directories.
     """
     from archivebox.core.models import Snapshot
     from django.db import transaction
@@ -158,9 +167,10 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
     stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
 
     total = Snapshot.objects.count()
-    print(f'[*] Processing {total} snapshots from database...')
+    print(f'[*] Processing {total} snapshots from database (most recent first)...')
 
-    for snapshot in Snapshot.objects.iterator(chunk_size=batch_size):
+    # Process from most recent to least recent
+    for snapshot in Snapshot.objects.order_by('-bookmarked_at').iterator(chunk_size=batch_size):
         # Reconcile index.json with DB
         snapshot.reconcile_with_index_json()
 
@@ -252,19 +262,16 @@ def print_combined_stats(stats_combined: dict):
     print(f"""
 [green]Archive Update Complete[/green]
 
-Phase 1 (Import Orphans):
+Phase 1 (Drain Old Dirs):
   Checked:     {s1.get('processed', 0)}
-  Imported:    {s1.get('imported', 0)}
   Migrated:    {s1.get('migrated', 0)}
+  Skipped:     {s1.get('skipped', 0)}
   Invalid:     {s1.get('invalid', 0)}
 
 Phase 2 (Process DB):
   Processed:   {s2.get('processed', 0)}
   Reconciled:  {s2.get('reconciled', 0)}
   Queued:      {s2.get('queued', 0)}
-
-Phase 3 (Deduplicate):
-  Merged:      {stats_combined['deduplicated']}
 """)
 
 
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index c30061c2..0a94df61 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -297,7 +297,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
     config = models.JSONField(default=dict, null=False, blank=False, editable=True)
     notes = models.TextField(blank=True, null=False, default='')
-    output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True)
+    # output_dir is computed via @cached_property from fs_version and get_storage_path_for_version()
 
     tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
 
@@ -1981,7 +1981,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
     status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
     notes = models.TextField(blank=True, null=False, default='')
-    output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
+    # output_dir is computed via @property from snapshot.output_dir / plugin
 
     state_machine_name = 'archivebox.core.models.ArchiveResultMachine'
     retry_at_field_name = 'retry_at'
diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py
index 818c59a4..1f0c880f 100755
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -358,10 +358,19 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
         """Clean up background hooks and run on_CrawlEnd hooks."""
         import os
         import signal
+        import time
         from pathlib import Path
         from archivebox.hooks import run_hook, discover_hooks
         from archivebox.misc.process_utils import validate_pid_file
 
+        def is_process_alive(pid):
+            """Check if a process exists."""
+            try:
+                os.kill(pid, 0)  # Signal 0 checks existence without killing
+                return True
+            except (OSError, ProcessLookupError):
+                return False
+
         # Kill any background processes by scanning for all .pid files
         if self.OUTPUT_DIR.exists():
             for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
@@ -371,9 +380,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                     # PID reused by different process or process dead
                     pid_file.unlink(missing_ok=True)
                     continue
-                
+
                 try:
                     pid = int(pid_file.read_text().strip())
+
+                    # Step 1: Send SIGTERM for graceful shutdown
                     try:
                         # Try to kill process group first (handles detached processes like Chrome)
                         try:
@@ -382,8 +393,46 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
                             # Fall back to killing just the process
                             os.kill(pid, signal.SIGTERM)
                     except ProcessLookupError:
-                        pass  # Already dead
-                except (ValueError, OSError):
+                        # Already dead
+                        pid_file.unlink(missing_ok=True)
+                        continue
+
+                    # Step 2: Wait for graceful shutdown
+                    time.sleep(2)
+
+                    # Step 3: Check if still alive
+                    if not is_process_alive(pid):
+                        # Process terminated gracefully
+                        pid_file.unlink(missing_ok=True)
+                        continue
+
+                    # Step 4: Process still alive, force kill ENTIRE process group with SIGKILL
+                    try:
+                        try:
+                            # Always kill entire process group with SIGKILL (not individual processes)
+                            os.killpg(pid, signal.SIGKILL)
+                        except (OSError, ProcessLookupError) as e:
+                            # Process group kill failed, try single process as fallback
+                            os.kill(pid, signal.SIGKILL)
+                    except ProcessLookupError:
+                        # Process died between check and kill
+                        pid_file.unlink(missing_ok=True)
+                        continue
+
+                    # Step 5: Wait and verify death
+                    time.sleep(1)
+
+                    if is_process_alive(pid):
+                        # Process is unkillable (likely in UNE state on macOS)
+                        # This happens when Chrome crashes in kernel syscall (IOSurface)
+                        # Log but don't block cleanup - process will remain until reboot
+                        print(f'[yellow]⚠️ Process {pid} is unkillable (likely crashed in kernel). Will remain until reboot.[/yellow]')
+                    else:
+                        # Successfully killed
+                        pid_file.unlink(missing_ok=True)
+
+                except (ValueError, OSError) as e:
+                    # Invalid PID file or permission error
                     pass
 
         # Run on_CrawlEnd hooks
diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js
index fd09fbb3..d448923b 100755
--- a/archivebox/plugins/chrome/chrome_utils.js
+++ b/archivebox/plugins/chrome/chrome_utils.js
@@ -397,8 +397,53 @@ async function launchChromium(options = {}) {
     }
 }
 
+/**
+ * Check if a process is still running.
+ * @param {number} pid - Process ID to check
+ * @returns {boolean} - True if process exists
+ */
+function isProcessAlive(pid) {
+    try {
+        process.kill(pid, 0);  // Signal 0 checks existence without killing
+        return true;
+    } catch (e) {
+        return false;
+    }
+}
+
+/**
+ * Find all Chrome child processes for a given debug port.
+ * @param {number} port - Debug port number
+ * @returns {Array<number>} - Array of PIDs
+ */
+function findChromeProcessesByPort(port) {
+    const { execSync } = require('child_process');
+    const pids = [];
+
+    try {
+        // Find all Chrome processes using this debug port
+        const output = execSync(
+            `ps aux | grep -i "chrome.*--remote-debugging-port=${port}" | grep -v grep | awk '{print $2}'`,
+            { encoding: 'utf8', timeout: 5000 }
+        );
+
+        for (const line of output.split('\n')) {
+            const pid = parseInt(line.trim(), 10);
+            if (!isNaN(pid) && pid > 0) {
+                pids.push(pid);
+            }
+        }
+    } catch (e) {
+        // Command failed or no processes found
+    }
+
+    return pids;
+}
+
 /**
  * Kill a Chrome process by PID.
+ * Always sends SIGTERM before SIGKILL, then verifies death.
+ *
  * @param {number} pid - Process ID to kill
  * @param {string} [outputDir] - Directory containing PID files to clean up
  */
@@ -407,30 +452,93 @@ async function killChrome(pid, outputDir = null) {
 
     console.error(`[*] Killing Chrome process tree (PID ${pid})...`);
 
-    // Try to kill process group first
+    // Get debug port for finding child processes
+    let debugPort = null;
+    if (outputDir) {
+        try {
+            const portFile = path.join(outputDir, 'port.txt');
+            if (fs.existsSync(portFile)) {
+                debugPort = parseInt(fs.readFileSync(portFile, 'utf8').trim(), 10);
+            }
+        } catch (e) {}
+    }
+
+    // Step 1: SIGTERM to process group (graceful shutdown)
+    console.error(`[*] Sending SIGTERM to process group -${pid}...`);
     try {
         process.kill(-pid, 'SIGTERM');
     } catch (e) {
-        try { process.kill(pid, 'SIGTERM'); } catch (e2) {}
+        try {
+            console.error(`[*] Process group kill failed, trying single process...`);
+            process.kill(pid, 'SIGTERM');
+        } catch (e2) {
+            console.error(`[!] SIGTERM failed: ${e2.message}`);
+        }
     }
 
-    // Wait for graceful shutdown
+    // Step 2: Wait for graceful shutdown
     await new Promise(resolve => setTimeout(resolve, 2000));
 
-    // Force kill
-    try {
-        process.kill(-pid, 'SIGKILL');
-    } catch (e) {
-        try { process.kill(pid, 'SIGKILL'); } catch (e2) {}
+    // Step 3: Check if still alive
+    if (!isProcessAlive(pid)) {
+        console.error('[+] Chrome process terminated gracefully');
+    } else {
+        // Step 4: Force kill ENTIRE process group with SIGKILL
+        console.error(`[*] Process still alive, sending SIGKILL to process group -${pid}...`);
+        try {
+            process.kill(-pid, 'SIGKILL');  // Kill entire process group
+        } catch (e) {
+            console.error(`[!] Process group SIGKILL failed, trying single process: ${e.message}`);
+            try {
+                process.kill(pid, 'SIGKILL');
+            } catch (e2) {
+                console.error(`[!] SIGKILL failed: ${e2.message}`);
+            }
+        }
+
+        // Step 5: Wait briefly and verify death
+        await new Promise(resolve => setTimeout(resolve, 1000));
+
+        if (isProcessAlive(pid)) {
+            console.error(`[!] WARNING: Process ${pid} is unkillable (likely in UNE state)`);
+            console.error(`[!] This typically happens when Chrome crashes in kernel syscall`);
+            console.error(`[!] Process will remain as zombie until system reboot`);
+            console.error(`[!] macOS IOSurface crash creates unkillable processes in UNE state`);
+
+            // Try one more time to kill the entire process group
+            if (debugPort) {
+                const relatedPids = findChromeProcessesByPort(debugPort);
+                if (relatedPids.length > 1) {
+                    console.error(`[*] Found ${relatedPids.length} Chrome processes still running on port ${debugPort}`);
+                    console.error(`[*] Attempting final process group SIGKILL...`);
+
+                    // Try to kill each unique process group we find
+                    const processGroups = new Set();
+                    for (const relatedPid of relatedPids) {
+                        if (relatedPid !== pid) {
+                            processGroups.add(relatedPid);
+                        }
+                    }
+
+                    for (const groupPid of processGroups) {
+                        try {
+                            process.kill(-groupPid, 'SIGKILL');
+                        } catch (e) {}
+                    }
+                }
+            }
+        } else {
+            console.error('[+] Chrome process group killed successfully');
+        }
     }
 
-    // Clean up PID files
+    // Step 8: Clean up PID files
     if (outputDir) {
         try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {}
         try { fs.unlinkSync(path.join(outputDir, 'hook.pid')); } catch (e) {}
     }
 
-    console.error('[*] Chrome process killed');
+    console.error('[*] Chrome cleanup completed');
 }
 
 /**
diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py
index 699dad70..3aa7f2be 100644
--- a/archivebox/plugins/chrome/tests/test_chrome.py
+++ b/archivebox/plugins/chrome/tests/test_chrome.py
@@ -594,36 +594,57 @@ def test_zombie_prevention_hook_killed():
         except OSError:
             pytest.fail("Chrome should still be running after hook SIGKILL")
 
-        # Simulate Crawl.cleanup() - kill all .pid files
+        # Simulate Crawl.cleanup() using the actual cleanup logic
+        def is_process_alive(pid):
+            """Check if a process exists."""
+            try:
+                os.kill(pid, 0)
+                return True
+            except (OSError, ProcessLookupError):
+                return False
+
         for pid_file in chrome_dir.glob('**/*.pid'):
             try:
                 pid = int(pid_file.read_text().strip())
+
+                # Step 1: SIGTERM for graceful shutdown
                 try:
-                    # Try to kill process group first (for detached processes like Chrome)
                     try:
                         os.killpg(pid, signal.SIGTERM)
                     except (OSError, ProcessLookupError):
-                        # Fall back to killing just the process
                         os.kill(pid, signal.SIGTERM)
+                except ProcessLookupError:
+                    pid_file.unlink(missing_ok=True)
+                    continue
 
-                    time.sleep(0.5)
+                # Step 2: Wait for graceful shutdown
+                time.sleep(2)
 
-                    # Force kill if still alive
+                # Step 3: Check if still alive
+                if not is_process_alive(pid):
+                    pid_file.unlink(missing_ok=True)
+                    continue
+
+                # Step 4: Force kill ENTIRE process group with SIGKILL
+                try:
                     try:
+                        # Always kill entire process group with SIGKILL
                         os.killpg(pid, signal.SIGKILL)
                     except (OSError, ProcessLookupError):
-                        try:
-                            os.kill(pid, signal.SIGKILL)
-                        except OSError:
-                            pass
+                        os.kill(pid, signal.SIGKILL)
                 except ProcessLookupError:
-                    pass
+                    pid_file.unlink(missing_ok=True)
+                    continue
+
+                # Step 5: Wait and verify death
+                time.sleep(1)
+
+                if not is_process_alive(pid):
+                    pid_file.unlink(missing_ok=True)
+
             except (ValueError, OSError):
                 pass
 
-        # Wait a moment for cleanup
-        time.sleep(1)
-
         # Chrome should now be dead
         try:
             os.kill(chrome_pid, 0)
diff --git a/archivebox/plugins/forumdl/forum-dl-wrapper.py b/archivebox/plugins/forumdl/forum-dl-wrapper.py
new file mode 100755
index 00000000..2b53ca99
--- /dev/null
+++ b/archivebox/plugins/forumdl/forum-dl-wrapper.py
@@ -0,0 +1,31 @@
+#!/usr/bin/env python3
+"""
+Wrapper for forum-dl that applies Pydantic v2 compatibility patches.
+
+This wrapper fixes forum-dl 0.3.0's incompatibility with Pydantic v2 by monkey-patching
+the JsonlWriter class to use model_dump_json() instead of the deprecated json(models_as_dict=False).
+"""
+
+import sys
+
+# Apply Pydantic v2 compatibility patch BEFORE importing forum_dl
+try:
+    from forum_dl.writers.jsonl import JsonlWriter
+    from pydantic import BaseModel
+
+    # Check if we're using Pydantic v2
+    if hasattr(BaseModel, 'model_dump_json'):
+        def _patched_serialize_entry(self, entry):
+            """Use Pydantic v2's model_dump_json() instead of deprecated json(models_as_dict=False)"""
+            return entry.model_dump_json()
+
+        JsonlWriter._serialize_entry = _patched_serialize_entry
+except (ImportError, AttributeError):
+    # forum-dl not installed or already compatible - no patch needed
+    pass
+
+# Now import and run forum-dl's main function
+from forum_dl import main
+
+if __name__ == '__main__':
+    sys.exit(main())
diff --git a/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py b/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py
index 3fe7a94a..8cb97d54 100755
--- a/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py
+++ b/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py
@@ -115,8 +115,12 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
     else:
         output_file = output_dir / f'forum.{output_format}'
 
-    # Build command
-    cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)]
+    # Use our Pydantic v2 compatible wrapper if available, otherwise fall back to binary
+    wrapper_path = Path(__file__).parent / 'forum-dl-wrapper.py'
+    if wrapper_path.exists():
+        cmd = [sys.executable, str(wrapper_path), *forumdl_args, '-f', output_format, '-o', str(output_file)]
+    else:
+        cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)]
 
     if not check_ssl:
         cmd.append('--no-check-certificate')
diff --git a/archivebox/plugins/forumdl/tests/test_forumdl.py b/archivebox/plugins/forumdl/tests/test_forumdl.py
index f976d44c..f965d898 100644
--- a/archivebox/plugins/forumdl/tests/test_forumdl.py
+++ b/archivebox/plugins/forumdl/tests/test_forumdl.py
@@ -205,14 +205,9 @@ def test_config_timeout():
 
 
 def test_real_forum_url():
-    """Test that forum-dl processes real forum URLs with jsonl output format.
+    """Test that forum-dl extracts content from a real HackerNews thread with jsonl output.
 
-    NOTE: forum-dl currently has known issues:
-    - Pydantic v2 incompatibility causing errors with most extractors
-    - Many forums return 403/404 or have changed their structure
-    - This test verifies the hook runs and handles these issues gracefully
-
-    If forum-dl is fixed in the future, this test should start succeeding with actual downloads.
+    Uses our Pydantic v2 compatible wrapper to fix forum-dl 0.3.0's incompatibility.
     """
     import os
 
@@ -224,15 +219,14 @@ def test_real_forum_url():
     with tempfile.TemporaryDirectory() as tmpdir:
         tmpdir = Path(tmpdir)
 
-        # Try HackerNews - supported by forum-dl but currently has Pydantic v2 compat issues
-        # When forum-dl is updated, this URL should work
+        # Use HackerNews - one of the most reliable forum-dl extractors
         forum_url = 'https://news.ycombinator.com/item?id=1'
 
         env = os.environ.copy()
         env['FORUMDL_BINARY'] = binary_path
         env['FORUMDL_TIMEOUT'] = '60'
-        env['FORUMDL_OUTPUT_FORMAT'] = 'jsonl'  # Use jsonl format as requested
-        # HTML output would be via: env['FORUMDL_EXTRA_ARGS'] = '--files-output ./files'
+        env['FORUMDL_OUTPUT_FORMAT'] = 'jsonl'  # Use jsonl format
+        # HTML output could be added via: env['FORUMDL_ARGS_EXTRA'] = json.dumps(['--files-output', './files'])
 
         start_time = time.time()
         result = subprocess.run(
@@ -245,40 +239,37 @@ def test_real_forum_url():
         )
         elapsed_time = time.time() - start_time
 
-        # Test passes if the hook handles the URL gracefully (success OR handled error)
-        # This is appropriate given forum-dl's current state
-        assert result.returncode in (0, 1), f"Hook should handle forum URL gracefully. stderr: {result.stderr}"
+        # Should succeed with our Pydantic v2 wrapper
+        assert result.returncode == 0, f"Should extract forum successfully: {result.stderr}"
 
-        # Check for successful extraction (will pass when forum-dl is fixed)
-        if result.returncode == 0:
-            result_json = None
-            for line in result.stdout.strip().split('\n'):
-                line = line.strip()
-                if line.startswith('{'):
-                    try:
-                        record = json.loads(line)
-                        if record.get('type') == 'ArchiveResult':
-                            result_json = record
-                            break
-                    except json.JSONDecodeError:
-                        pass
+        # Parse JSONL output
+        result_json = None
+        for line in result.stdout.strip().split('\n'):
+            line = line.strip()
+            if line.startswith('{'):
+                try:
+                    record = json.loads(line)
+                    if record.get('type') == 'ArchiveResult':
+                        result_json = record
+                        break
+                except json.JSONDecodeError:
+                    pass
 
-            if result_json and result_json['status'] == 'succeeded':
-                output_files = list(tmpdir.glob('**/*'))
-                forum_files = [f for f in output_files if f.is_file()]
-                if forum_files:
-                    print(f"✓ Successfully extracted {len(forum_files)} file(s) in {elapsed_time:.2f}s")
-                else:
-                    print(f"✓ Completed in {elapsed_time:.2f}s (no content - URL may not be a forum thread)")
-            else:
-                print(f"✓ Completed in {elapsed_time:.2f}s (no content extracted)")
-        else:
-            # Handled error gracefully - test still passes
-            error_msg = result.stderr.strip()[:200]
-            print(f"✓ Handled error gracefully in {elapsed_time:.2f}s")
-            # Known issues: Pydantic v2 compat, 403 errors, etc.
-            assert '403' in error_msg or 'pydantic' in error_msg.lower() or 'error' in error_msg.lower(), \
-                f"Expected known error type, got: {error_msg}"
+        assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}"
+        assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
+
+        # Check that forum files were downloaded
+        output_files = list(tmpdir.glob('**/*'))
+        forum_files = [f for f in output_files if f.is_file()]
+
+        assert len(forum_files) > 0, f"Should have downloaded at least one forum file. Files: {output_files}"
+
+        # Verify the JSONL file has content
+        jsonl_file = tmpdir / 'forum.jsonl'
+        assert jsonl_file.exists(), "Should have created forum.jsonl"
+        assert jsonl_file.stat().st_size > 0, "forum.jsonl should not be empty"
+
+        print(f"Successfully extracted {len(forum_files)} file(s) in {elapsed_time:.2f}s")
 
 
 if __name__ == '__main__':
diff --git a/archivebox/plugins/git/on_Snapshot__62_git.py b/archivebox/plugins/git/on_Snapshot__62_git.py
index 943be861..04dbbd70 100644
--- a/archivebox/plugins/git/on_Snapshot__62_git.py
+++ b/archivebox/plugins/git/on_Snapshot__62_git.py
@@ -76,7 +76,7 @@ def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
     Returns: (success, output_path, error_message)
     """
     timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120)
-    git_args = get_env_array('GIT_ARGS', [])
+    git_args = get_env_array('GIT_ARGS', ["clone", "--depth=1", "--recursive"])
     git_args_extra = get_env_array('GIT_ARGS_EXTRA', [])
 
     cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR]
diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html
index a2be9eda..bbc65663 100644
--- a/archivebox/templates/admin/progress_monitor.html
+++ b/archivebox/templates/admin/progress_monitor.html
@@ -518,8 +518,8 @@
                     <div class="snapshot-info">
                         <div class="snapshot-url">${formatUrl(snapshot.url)}</div>
                         <div class="snapshot-meta">
-                            ${snapshot.completed_extractors}/${snapshot.total_extractors} extractors
-                            ${snapshot.failed_extractors > 0 ? `<span style="color:#f85149">(${snapshot.failed_extractors} failed)</span>` : ''}
+                            ${snapshot.completed_plugins}/${snapshot.total_plugins} extractors
+                            ${snapshot.failed_plugins > 0 ? `<span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}
                         </div>
                     </div>
                     <span class="status-badge ${snapshot.status}">${snapshot.status}</span>
diff --git a/tests/test_cli_init.py b/tests/test_cli_init.py
index c086182e..5761ce5b 100644
--- a/tests/test_cli_init.py
+++ b/tests/test_cli_init.py
@@ -219,8 +219,8 @@ def test_init_quick_flag_skips_checks(tmp_path):
     assert db_path.exists()
 
 
-def test_init_creates_machine_record(tmp_path):
-    """Test that init creates a Machine record in machine_machine table."""
+def test_init_creates_machine_table(tmp_path):
+    """Test that init creates the machine_machine table."""
     os.chdir(tmp_path)
     subprocess.run(['archivebox', 'init'], capture_output=True)
 
@@ -231,14 +231,10 @@ def test_init_creates_machine_record(tmp_path):
     tables = c.execute(
         "SELECT name FROM sqlite_master WHERE type='table' AND name='machine_machine'"
     ).fetchall()
-    assert len(tables) == 1
-
-    # Check that a machine record was created
-    machine_count = c.execute("SELECT COUNT(*) FROM machine_machine").fetchone()[0]
-    assert machine_count >= 1
-
     conn.close()
 
+    assert len(tables) == 1
+
 
 def test_init_output_shows_collection_info(tmp_path):
     """Test that init output shows helpful collection information."""