fix extensions dir paths, add personas migration

This commit is contained in:
Nick Sweeting
2025-12-31 01:12:29 -08:00
parent 1bbb9b45a7
commit 3d8c62ffb1
10 changed files with 300 additions and 85 deletions

View File

@@ -27,6 +27,17 @@ uv sync --dev --all-extras # Always use uv, never pip directly
source .venv/bin/activate
```
### Generate and Apply Migrations
```bash
# Generate migrations (run from archivebox subdirectory)
cd archivebox
./manage.py makemigrations
# Apply migrations to test database
cd data/
archivebox init
```
## Running Tests
### CRITICAL: Never Run as Root

View File

@@ -10,8 +10,8 @@ import archivebox.base_models.models
def cleanup_extra_columns(apps, schema_editor):
"""
Remove extra columns that were needed for v0.7.2/v0.8.6rc0 migration but don't exist in final models.
The actual models use @property methods to access these values from the process FK.
Create Process records from old cmd/pwd/cmd_version columns and remove those columns.
This preserves the execution details by moving them to the Process model.
"""
with schema_editor.connection.cursor() as cursor:
# Check if cmd column exists (means we came from v0.7.2/v0.8.6rc0)
@@ -19,8 +19,41 @@ def cleanup_extra_columns(apps, schema_editor):
has_cmd = cursor.fetchone()[0] > 0
if has_cmd:
print(" Cleaning up temporary columns from core_archiveresult...")
# Rebuild table without the extra columns
print(" Migrating cmd/pwd/cmd_version data to Process records...")
# For each ArchiveResult, create a Process record with cmd/pwd data
# Note: cmd_version from old schema is not preserved (it's now derived from Binary)
cursor.execute("""
SELECT id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status
FROM core_archiveresult
""")
archive_results = cursor.fetchall()
from archivebox.uuid_compat import uuid7
from archivebox.base_models.models import get_or_create_system_user_pk
machine_id = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()[0]
for ar_id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status in archive_results:
# Create Process record
process_id = str(uuid7())
cursor.execute("""
INSERT INTO machine_process (
id, created_at, modified_at,
machine_id, binary_id, iface_id,
pwd, cmd, env, timeout,
pid, exit_code, stdout, stderr,
started_at, ended_at, url, status, retry_at
) VALUES (?, datetime('now'), datetime('now'), ?, ?, ?, ?, ?, '{}', 120, NULL, NULL, '', '', ?, ?, '', ?, NULL)
""", (process_id, machine_id, binary_id, iface_id, pwd or '', cmd or '[]', start_ts, end_ts, status or 'queued'))
# Update ArchiveResult to point to new Process
cursor.execute("UPDATE core_archiveresult SET process_id = ? WHERE id = ?", (process_id, ar_id))
print(f" ✓ Created {len(archive_results)} Process records from ArchiveResult data")
# Now rebuild table without the extra columns
print(" Rebuilding core_archiveresult table...")
cursor.execute("""
CREATE TABLE core_archiveresult_final (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -48,14 +81,14 @@ def cleanup_extra_columns(apps, schema_editor):
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
process_id TEXT,
process_id TEXT NOT NULL,
FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
)
""")
# Copy data (cmd, pwd, etc. are now accessed via process FK)
# Copy data (cmd, pwd, etc. are now in Process records)
cursor.execute("""
INSERT INTO core_archiveresult_final SELECT
id, uuid, created_at, modified_at,

View File

@@ -0,0 +1,108 @@
# Generated by Django 6.0 on 2025-12-31 09:04
import django.db.models.deletion
import django.utils.timezone
import uuid
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0026_final_field_adjustments'),
('crawls', '0002_upgrade_to_0_9_0'),
('machine', '0001_initial'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='hook_name',
field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255),
),
migrations.AlterField(
model_name='archiveresult',
name='id',
field=models.AutoField(editable=False, primary_key=True, serialize=False),
),
migrations.AlterField(
model_name='archiveresult',
name='output_files',
field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_json',
field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='output_mimetypes',
field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
),
migrations.AlterField(
model_name='archiveresult',
name='output_size',
field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
),
migrations.AlterField(
model_name='archiveresult',
name='output_str',
field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
),
migrations.AlterField(
model_name='archiveresult',
name='plugin',
field=models.CharField(db_index=True, default='', max_length=32),
),
migrations.AlterField(
model_name='archiveresult',
name='process',
field=models.OneToOneField(help_text='Process execution details for this archive result', on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
),
migrations.AlterField(
model_name='archiveresult',
name='retry_at',
field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
),
migrations.AlterField(
model_name='archiveresult',
name='status',
field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
),
migrations.AlterField(
model_name='archiveresult',
name='uuid',
field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
),
migrations.AlterField(
model_name='snapshot',
name='config',
field=models.JSONField(default=dict),
),
migrations.AlterField(
model_name='snapshot',
name='crawl',
field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
),
migrations.AlterField(
model_name='snapshot',
name='current_step',
field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'),
),
migrations.AlterField(
model_name='snapshot',
name='depth',
field=models.PositiveSmallIntegerField(db_index=True, default=0),
),
migrations.AlterField(
model_name='snapshot',
name='id',
field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
),
migrations.AlterField(
model_name='snapshottag',
name='id',
field=models.AutoField(primary_key=True, serialize=False),
),
]

View File

@@ -2321,7 +2321,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
process = models.OneToOneField(
'machine.Process',
on_delete=models.PROTECT,
null=False, # Required after migration 4
null=False,
related_name='archiveresult',
help_text='Process execution details for this archive result'
)

View File

@@ -144,7 +144,7 @@ class BinaryAdmin(BaseModelAdmin):
class ProcessAdmin(BaseModelAdmin):
list_display = ('id', 'created_at', 'machine_info', 'archiveresult_link', 'cmd_str', 'status', 'exit_code', 'pid', 'binary_info', 'health')
list_display = ('id', 'created_at', 'machine_info', 'archiveresult_link', 'cmd_str', 'status', 'exit_code', 'pid', 'binary_info')
sort_fields = ('id', 'created_at', 'status', 'exit_code', 'pid')
search_fields = ('id', 'machine__id', 'binary__name', 'cmd', 'pwd', 'stdout', 'stderr')
@@ -171,10 +171,6 @@ class ProcessAdmin(BaseModelAdmin):
'fields': ('stdout', 'stderr'),
'classes': ('card', 'wide', 'collapse'),
}),
('Usage', {
'fields': ('num_uses_succeeded', 'num_uses_failed'),
'classes': ('card',),
}),
('Timestamps', {
'fields': ('created_at', 'modified_at'),
'classes': ('card',),

View File

@@ -234,8 +234,6 @@ class Migration(migrations.Migration):
('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('modified_at', models.DateTimeField(auto_now=True)),
('num_uses_succeeded', models.PositiveIntegerField(default=0)),
('num_uses_failed', models.PositiveIntegerField(default=0)),
('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)),
('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')),
('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')),

View File

@@ -625,7 +625,7 @@ class ProcessManager(models.Manager):
return process
class Process(ModelWithHealthStats):
class Process(models.Model):
"""
Tracks a single OS process execution.

View File

@@ -0,0 +1,29 @@
# Generated by Django 6.0 on 2025-12-31 09:06
import archivebox.base_models.models
import django.db.models.deletion
import django.utils.timezone
from django.conf import settings
from django.db import migrations, models
class Migration(migrations.Migration):
initial = True
dependencies = [
migrations.swappable_dependency(settings.AUTH_USER_MODEL),
]
operations = [
migrations.CreateModel(
name='Persona',
fields=[
('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
('config', models.JSONField(blank=True, default=dict, null=True)),
('name', models.CharField(max_length=64, unique=True)),
('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
('created_by', models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
],
),
]

View File

@@ -203,86 +203,115 @@ function waitForDebugPort(port, timeout = 30000) {
/**
* Kill zombie Chrome processes from stale crawls.
* Scans DATA_DIR/crawls/<crawl_id>/chrome/<name>.pid for stale processes.
* Recursively scans DATA_DIR for any */chrome/*.pid files from stale crawls.
* Does not assume specific directory structure - works with nested paths.
* @param {string} [dataDir] - Data directory (defaults to DATA_DIR env or '.')
* @returns {number} - Number of zombies killed
*/
function killZombieChrome(dataDir = null) {
dataDir = dataDir || getEnv('DATA_DIR', '.');
const crawlsDir = path.join(dataDir, 'crawls');
const now = Date.now();
const fiveMinutesAgo = now - 300000;
let killed = 0;
console.error('[*] Checking for zombie Chrome processes...');
if (!fs.existsSync(crawlsDir)) {
console.error('[+] No crawls directory found');
if (!fs.existsSync(dataDir)) {
console.error('[+] No data directory found');
return 0;
}
/**
* Recursively find all chrome/.pid files in directory tree
* @param {string} dir - Directory to search
* @param {number} depth - Current recursion depth (limit to 10)
* @returns {Array<{pidFile: string, crawlDir: string}>} - Array of PID file info
*/
function findChromePidFiles(dir, depth = 0) {
if (depth > 10) return []; // Prevent infinite recursion
const results = [];
try {
const entries = fs.readdirSync(dir, { withFileTypes: true });
for (const entry of entries) {
if (!entry.isDirectory()) continue;
const fullPath = path.join(dir, entry.name);
// Found a chrome directory - check for .pid files
if (entry.name === 'chrome') {
try {
const pidFiles = fs.readdirSync(fullPath).filter(f => f.endsWith('.pid'));
const crawlDir = dir; // Parent of chrome/ is the crawl dir
for (const pidFileName of pidFiles) {
results.push({
pidFile: path.join(fullPath, pidFileName),
crawlDir: crawlDir,
});
}
} catch (e) {
// Skip if can't read chrome dir
}
} else {
// Recurse into subdirectory (skip hidden dirs and node_modules)
if (!entry.name.startsWith('.') && entry.name !== 'node_modules') {
results.push(...findChromePidFiles(fullPath, depth + 1));
}
}
}
} catch (e) {
// Skip if can't read directory
}
return results;
}
try {
const crawls = fs.readdirSync(crawlsDir, { withFileTypes: true });
for (const crawl of crawls) {
if (!crawl.isDirectory()) continue;
const crawlDir = path.join(crawlsDir, crawl.name);
const chromeDir = path.join(crawlDir, 'chrome');
if (!fs.existsSync(chromeDir)) continue;
const chromePids = findChromePidFiles(dataDir);
for (const {pidFile, crawlDir} of chromePids) {
// Check if crawl was modified recently (still active)
try {
const crawlStats = fs.statSync(crawlDir);
if (crawlStats.mtimeMs > fiveMinutesAgo) {
continue;
continue; // Crawl is active, skip
}
} catch (e) {
continue;
}
// Crawl is stale, check for PIDs
// Crawl is stale, check PID
try {
const pidFiles = fs.readdirSync(chromeDir).filter(f => f.endsWith('.pid'));
const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
if (isNaN(pid) || pid <= 0) continue;
for (const pidFileName of pidFiles) {
const pidFile = path.join(chromeDir, pidFileName);
// Check if process exists
try {
process.kill(pid, 0);
} catch (e) {
// Process dead, remove stale PID file
try { fs.unlinkSync(pidFile); } catch (e) {}
continue;
}
try {
const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
if (isNaN(pid) || pid <= 0) continue;
// Process alive and crawl is stale - zombie!
console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${path.basename(crawlDir)}`);
// Check if process exists
try {
process.kill(pid, 0);
} catch (e) {
// Process dead, remove stale PID file
try { fs.unlinkSync(pidFile); } catch (e) {}
continue;
}
// Process alive and crawl is stale - zombie!
console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`);
try {
try { process.kill(-pid, 'SIGKILL'); } catch (e) { process.kill(pid, 'SIGKILL'); }
killed++;
console.error(`[+] Killed zombie (PID ${pid})`);
try { fs.unlinkSync(pidFile); } catch (e) {}
} catch (e) {
console.error(`[!] Failed to kill PID ${pid}: ${e.message}`);
}
} catch (e) {
// Skip invalid PID files
}
try {
try { process.kill(-pid, 'SIGKILL'); } catch (e) { process.kill(pid, 'SIGKILL'); }
killed++;
console.error(`[+] Killed zombie (PID ${pid})`);
try { fs.unlinkSync(pidFile); } catch (e) {}
} catch (e) {
console.error(`[!] Failed to kill PID ${pid}: ${e.message}`);
}
} catch (e) {
// Skip if can't read chrome dir
// Skip invalid PID files
}
}
} catch (e) {
console.error(`[!] Error scanning crawls: ${e.message}`);
console.error(`[!] Error scanning for Chrome processes: ${e.message}`);
}
if (killed > 0) {
@@ -1327,7 +1356,7 @@ function findChromium() {
* @returns {string} - Absolute path to extensions directory
*/
function getExtensionsDir() {
const dataDir = getEnv('DATA_DIR', './data');
const dataDir = getEnv('DATA_DIR', '.');
const persona = getEnv('ACTIVE_PERSONA', 'Default');
return getEnv('CHROME_EXTENSIONS_DIR') ||
path.join(dataDir, 'personas', persona, 'chrome_extensions');
@@ -1459,7 +1488,7 @@ async function installExtensionWithCache(extension, options = {}) {
const installedExt = await loadOrInstallExtension(extension, extensionsDir);
if (!installedExt) {
if (!installedExt?.version) {
console.error(`[❌] Failed to install ${extension.name} extension`);
return null;
}

View File

@@ -214,12 +214,15 @@ def get_extensions_dir() -> str:
Tries chrome_utils.js first, falls back to Python computation.
"""
returncode, stdout, stderr = _call_chrome_utils('getExtensionsDir')
if returncode == 0 and stdout.strip():
return stdout.strip()
try:
returncode, stdout, stderr = _call_chrome_utils('getExtensionsDir')
if returncode == 0 and stdout.strip():
return stdout.strip()
except subprocess.TimeoutExpired:
pass # Fall through to default computation
# Fallback to default computation if JS call fails
data_dir = os.environ.get('DATA_DIR', './data')
data_dir = os.environ.get('DATA_DIR', '.')
persona = os.environ.get('ACTIVE_PERSONA', 'Default')
return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions')
@@ -760,31 +763,39 @@ def setup_chrome_session(
# Create tab
tab_env = env.copy()
tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir)
result = subprocess.run(
['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=60,
env=tab_env
)
if result.returncode != 0:
cleanup_chrome(chrome_launch_process, chrome_pid)
raise RuntimeError(f"Tab creation failed: {result.stderr}")
# Navigate to URL if requested
if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank':
try:
result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
env=env
timeout=60,
env=tab_env
)
if result.returncode != 0:
cleanup_chrome(chrome_launch_process, chrome_pid)
raise RuntimeError(f"Navigation failed: {result.stderr}")
raise RuntimeError(f"Tab creation failed: {result.stderr}")
except subprocess.TimeoutExpired:
cleanup_chrome(chrome_launch_process, chrome_pid)
raise RuntimeError("Tab creation timed out after 60s")
# Navigate to URL if requested
if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank':
try:
result = subprocess.run(
['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'],
cwd=str(snapshot_chrome_dir),
capture_output=True,
text=True,
timeout=120,
env=env
)
if result.returncode != 0:
cleanup_chrome(chrome_launch_process, chrome_pid)
raise RuntimeError(f"Navigation failed: {result.stderr}")
except subprocess.TimeoutExpired:
cleanup_chrome(chrome_launch_process, chrome_pid)
raise RuntimeError("Navigation timed out after 120s")
return chrome_launch_process, chrome_pid, snapshot_chrome_dir