diff --git a/CLAUDE.md b/CLAUDE.md
index ae17cc52..35a58346 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -27,6 +27,17 @@
 uv sync --dev --all-extras  # Always use uv, never pip directly
 source .venv/bin/activate
 ```
+### Generate and Apply Migrations
+```bash
+# Generate migrations (run from archivebox subdirectory)
+cd archivebox
+./manage.py makemigrations
+
+# Apply migrations to test database
+cd data/
+archivebox init
+```
+
 ## Running Tests
 
 ### CRITICAL: Never Run as Root
diff --git a/archivebox/core/migrations/0025_cleanup_schema.py b/archivebox/core/migrations/0025_cleanup_schema.py
index 78057e4b..f4b13fd2 100644
--- a/archivebox/core/migrations/0025_cleanup_schema.py
+++ b/archivebox/core/migrations/0025_cleanup_schema.py
@@ -10,8 +10,8 @@ import archivebox.base_models.models
 
 def cleanup_extra_columns(apps, schema_editor):
     """
-    Remove extra columns that were needed for v0.7.2/v0.8.6rc0 migration but don't exist in final models.
-    The actual models use @property methods to access these values from the process FK.
+    Create Process records from old cmd/pwd/cmd_version columns and remove those columns.
+    This preserves the execution details by moving them to the Process model.
     """
     with schema_editor.connection.cursor() as cursor:
         # Check if cmd column exists (means we came from v0.7.2/v0.8.6rc0)
@@ -19,8 +19,41 @@ def cleanup_extra_columns(apps, schema_editor):
         has_cmd = cursor.fetchone()[0] > 0
 
         if has_cmd:
-            print(" Cleaning up temporary columns from core_archiveresult...")
-            # Rebuild table without the extra columns
+            print(" Migrating cmd/pwd/cmd_version data to Process records...")
+
+            # For each ArchiveResult, create a Process record with cmd/pwd data
+            # Note: cmd_version from old schema is not preserved (it's now derived from Binary)
+            cursor.execute("""
+                SELECT id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status
+                FROM core_archiveresult
+            """)
+            archive_results = cursor.fetchall()
+
+            from archivebox.uuid_compat import uuid7
+            from archivebox.base_models.models import get_or_create_system_user_pk
+
+            machine_id = cursor.execute("SELECT id FROM machine_machine LIMIT 1").fetchone()[0]
+
+            for ar_id, cmd, pwd, binary_id, iface_id, start_ts, end_ts, status in archive_results:
+                # Create Process record
+                process_id = str(uuid7())
+                cursor.execute("""
+                    INSERT INTO machine_process (
+                        id, created_at, modified_at,
+                        machine_id, binary_id, iface_id,
+                        pwd, cmd, env, timeout,
+                        pid, exit_code, stdout, stderr,
+                        started_at, ended_at, url, status, retry_at
+                    ) VALUES (?, datetime('now'), datetime('now'), ?, ?, ?, ?, ?, '{}', 120, NULL, NULL, '', '', ?, ?, '', ?, NULL)
+                """, (process_id, machine_id, binary_id, iface_id, pwd or '', cmd or '[]', start_ts, end_ts, status or 'queued'))
+
+                # Update ArchiveResult to point to new Process
+                cursor.execute("UPDATE core_archiveresult SET process_id = ? WHERE id = ?", (process_id, ar_id))
+
+            print(f" ✓ Created {len(archive_results)} Process records from ArchiveResult data")
+
+            # Now rebuild table without the extra columns
+            print(" Rebuilding core_archiveresult table...")
             cursor.execute("""
                 CREATE TABLE core_archiveresult_final (
                     id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -48,14 +81,14 @@ def cleanup_extra_columns(apps, schema_editor):
                     num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
                     num_uses_failed INTEGER NOT NULL DEFAULT 0,
 
-                    process_id TEXT,
+                    process_id TEXT NOT NULL,
 
                     FOREIGN KEY (snapshot_id) REFERENCES core_snapshot(id) ON DELETE CASCADE,
                     FOREIGN KEY (process_id) REFERENCES machine_process(id) ON DELETE RESTRICT
                 )
             """)
 
-            # Copy data (cmd, pwd, etc. are now accessed via process FK)
+            # Copy data (cmd, pwd, etc. are now in Process records)
            cursor.execute("""
                 INSERT INTO core_archiveresult_final
                 SELECT id, uuid, created_at, modified_at,
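The data migration above changes where execution metadata lives: cmd, pwd, start_ts and end_ts move from columns on core_archiveresult into machine_process rows referenced by the new process FK. A minimal usage sketch of what that implies for ORM code, assuming the Process field names used in the INSERT above (cmd, pwd, started_at, ended_at); any convenience @property wrappers that ArchiveResult may expose on top of this are not shown:

```python
# Post-migration access sketch -- field names taken from cleanup_extra_columns() above.
from archivebox.core.models import ArchiveResult

ar = ArchiveResult.objects.select_related('process').first()
if ar is not None:
    print(ar.process.cmd)         # was core_archiveresult.cmd
    print(ar.process.pwd)         # was core_archiveresult.pwd
    print(ar.process.started_at)  # was core_archiveresult.start_ts
    print(ar.process.ended_at)    # was core_archiveresult.end_ts
```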
diff --git a/archivebox/core/migrations/0027_alter_archiveresult_hook_name_alter_archiveresult_id_and_more.py b/archivebox/core/migrations/0027_alter_archiveresult_hook_name_alter_archiveresult_id_and_more.py
new file mode 100644
index 00000000..4f4ed92b
--- /dev/null
+++ b/archivebox/core/migrations/0027_alter_archiveresult_hook_name_alter_archiveresult_id_and_more.py
@@ -0,0 +1,108 @@
+# Generated by Django 6.0 on 2025-12-31 09:04
+
+import django.db.models.deletion
+import django.utils.timezone
+import uuid
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0026_final_field_adjustments'),
+        ('crawls', '0002_upgrade_to_0_9_0'),
+        ('machine', '0001_initial'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='hook_name',
+            field=models.CharField(blank=True, db_index=True, default='', help_text='Full filename of the hook that executed (e.g., on_Snapshot__50_wget.py)', max_length=255),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='id',
+            field=models.AutoField(editable=False, primary_key=True, serialize=False),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='output_files',
+            field=models.JSONField(default=dict, help_text='Dict of {relative_path: {metadata}}'),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='output_json',
+            field=models.JSONField(blank=True, default=None, help_text='Structured metadata (headers, redirects, etc.)', null=True),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='output_mimetypes',
+            field=models.CharField(blank=True, default='', help_text='CSV of mimetypes sorted by size', max_length=512),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='output_size',
+            field=models.BigIntegerField(default=0, help_text='Total bytes of all output files'),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='output_str',
+            field=models.TextField(blank=True, default='', help_text='Human-readable output summary'),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='plugin',
+            field=models.CharField(db_index=True, default='', max_length=32),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='process',
+            field=models.OneToOneField(help_text='Process execution details for this archive result', on_delete=django.db.models.deletion.PROTECT, related_name='archiveresult', to='machine.process'),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='retry_at',
+            field=models.DateTimeField(blank=True, db_index=True, default=django.utils.timezone.now, null=True),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='status',
+            field=models.CharField(choices=[('queued', 'Queued'), ('started', 'Started'), ('backoff', 'Waiting to retry'), ('succeeded', 'Succeeded'), ('failed', 'Failed'), ('skipped', 'Skipped')], db_index=True, default='queued', max_length=15),
+        ),
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='uuid',
+            field=models.UUIDField(blank=True, db_index=True, default=uuid.uuid7, null=True),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='config',
+            field=models.JSONField(default=dict),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='crawl',
+            field=models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='snapshot_set', to='crawls.crawl'),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='current_step',
+            field=models.PositiveSmallIntegerField(db_index=True, default=0, help_text='Current hook step being executed (0-9). Used for sequential hook execution.'),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='depth',
+            field=models.PositiveSmallIntegerField(db_index=True, default=0),
+        ),
+        migrations.AlterField(
+            model_name='snapshot',
+            name='id',
+            field=models.UUIDField(default=uuid.uuid7, editable=False, primary_key=True, serialize=False, unique=True),
+        ),
+        migrations.AlterField(
+            model_name='snapshottag',
+            name='id',
+            field=models.AutoField(primary_key=True, serialize=False),
+        ),
+    ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index ef3c3a6e..d36216d0 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -2321,7 +2321,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
     process = models.OneToOneField(
         'machine.Process',
         on_delete=models.PROTECT,
-        null=False,  # Required after migration 4
+        null=False,
         related_name='archiveresult',
         help_text='Process execution details for this archive result'
     )
diff --git a/archivebox/machine/admin.py b/archivebox/machine/admin.py
index 3fbaa5b1..13834ced 100644
--- a/archivebox/machine/admin.py
+++ b/archivebox/machine/admin.py
@@ -144,7 +144,7 @@ class BinaryAdmin(BaseModelAdmin):
 
 
 class ProcessAdmin(BaseModelAdmin):
-    list_display = ('id', 'created_at', 'machine_info', 'archiveresult_link', 'cmd_str', 'status', 'exit_code', 'pid', 'binary_info', 'health')
+    list_display = ('id', 'created_at', 'machine_info', 'archiveresult_link', 'cmd_str', 'status', 'exit_code', 'pid', 'binary_info')
     sort_fields = ('id', 'created_at', 'status', 'exit_code', 'pid')
     search_fields = ('id', 'machine__id', 'binary__name', 'cmd', 'pwd', 'stdout', 'stderr')
 
@@ -171,10 +171,6 @@ class ProcessAdmin(BaseModelAdmin):
             'fields': ('stdout', 'stderr'),
             'classes': ('card', 'wide', 'collapse'),
         }),
-        ('Usage', {
-            'fields': ('num_uses_succeeded', 'num_uses_failed'),
-            'classes': ('card',),
-        }),
         ('Timestamps', {
             'fields': ('created_at', 'modified_at'),
             'classes': ('card',),
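Both the 0027 migration and the model definition make ArchiveResult.process a required OneToOneField with on_delete=PROTECT, so a Process row cannot be deleted while an ArchiveResult still references it. A short sketch of the resulting behavior — this is plain Django PROTECT semantics, shown only for illustration:

```python
# Deleting a Process that an ArchiveResult still points at raises ProtectedError.
from django.db.models import ProtectedError
from archivebox.core.models import ArchiveResult

ar = ArchiveResult.objects.select_related('process').first()
if ar is not None:
    try:
        ar.process.delete()   # refused: an ArchiveResult still references this Process
    except ProtectedError:
        pass
    ar.delete()               # delete the referencing ArchiveResult first...
    ar.process.delete()       # ...then the orphaned Process can be removed
```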
diff --git a/archivebox/machine/migrations/0001_initial.py b/archivebox/machine/migrations/0001_initial.py
index e032b76d..e82e7f60 100644
--- a/archivebox/machine/migrations/0001_initial.py
+++ b/archivebox/machine/migrations/0001_initial.py
@@ -234,8 +234,6 @@ class Migration(migrations.Migration):
                 ('id', models.UUIDField(default=uuid7, editable=False, primary_key=True, serialize=False, unique=True)),
                 ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
                 ('modified_at', models.DateTimeField(auto_now=True)),
-                ('num_uses_succeeded', models.PositiveIntegerField(default=0)),
-                ('num_uses_failed', models.PositiveIntegerField(default=0)),
                 ('pwd', models.CharField(blank=True, default='', help_text='Working directory for process execution', max_length=512)),
                 ('cmd', models.JSONField(blank=True, default=list, help_text='Command as array of arguments')),
                 ('env', models.JSONField(blank=True, default=dict, help_text='Environment variables for process')),
diff --git a/archivebox/machine/models.py b/archivebox/machine/models.py
index c0659afd..feb9bc88 100755
--- a/archivebox/machine/models.py
+++ b/archivebox/machine/models.py
@@ -625,7 +625,7 @@ class ProcessManager(models.Manager):
         return process
 
 
-class Process(ModelWithHealthStats):
+class Process(models.Model):
     """
     Tracks a single OS process execution.
 
diff --git a/archivebox/personas/migrations/0001_initial.py b/archivebox/personas/migrations/0001_initial.py
new file mode 100644
index 00000000..d85613c3
--- /dev/null
+++ b/archivebox/personas/migrations/0001_initial.py
@@ -0,0 +1,29 @@
+# Generated by Django 6.0 on 2025-12-31 09:06
+
+import archivebox.base_models.models
+import django.db.models.deletion
+import django.utils.timezone
+from django.conf import settings
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    initial = True
+
+    dependencies = [
+        migrations.swappable_dependency(settings.AUTH_USER_MODEL),
+    ]
+
+    operations = [
+        migrations.CreateModel(
+            name='Persona',
+            fields=[
+                ('id', models.BigAutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')),
+                ('config', models.JSONField(blank=True, default=dict, null=True)),
+                ('name', models.CharField(max_length=64, unique=True)),
+                ('created_at', models.DateTimeField(db_index=True, default=django.utils.timezone.now)),
+                ('created_by', models.ForeignKey(default=archivebox.base_models.models.get_or_create_system_user_pk, on_delete=django.db.models.deletion.CASCADE, to=settings.AUTH_USER_MODEL)),
+            ],
+        ),
+    ]
diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js
index 9dac6599..d840e0f6 100755
--- a/archivebox/plugins/chrome/chrome_utils.js
+++ b/archivebox/plugins/chrome/chrome_utils.js
@@ -203,86 +203,115 @@ function waitForDebugPort(port, timeout = 30000) {
 
 /**
  * Kill zombie Chrome processes from stale crawls.
- * Scans DATA_DIR/crawls/*/chrome/*.pid for stale processes.
+ * Recursively scans DATA_DIR for any */chrome/*.pid files from stale crawls.
+ * Does not assume specific directory structure - works with nested paths.
  * @param {string} [dataDir] - Data directory (defaults to DATA_DIR env or '.')
  * @returns {number} - Number of zombies killed
  */
 function killZombieChrome(dataDir = null) {
     dataDir = dataDir || getEnv('DATA_DIR', '.');
-    const crawlsDir = path.join(dataDir, 'crawls');
     const now = Date.now();
     const fiveMinutesAgo = now - 300000;
     let killed = 0;
 
     console.error('[*] Checking for zombie Chrome processes...');
 
-    if (!fs.existsSync(crawlsDir)) {
-        console.error('[+] No crawls directory found');
+    if (!fs.existsSync(dataDir)) {
+        console.error('[+] No data directory found');
         return 0;
     }
 
+    /**
+     * Recursively find all chrome/*.pid files in directory tree
+     * @param {string} dir - Directory to search
+     * @param {number} depth - Current recursion depth (limit to 10)
+     * @returns {Array<{pidFile: string, crawlDir: string}>} - Array of PID file info
+     */
+    function findChromePidFiles(dir, depth = 0) {
+        if (depth > 10) return []; // Prevent infinite recursion
+
+        const results = [];
+        try {
+            const entries = fs.readdirSync(dir, { withFileTypes: true });
+
+            for (const entry of entries) {
+                if (!entry.isDirectory()) continue;
+
+                const fullPath = path.join(dir, entry.name);
+
+                // Found a chrome directory - check for .pid files
+                if (entry.name === 'chrome') {
+                    try {
+                        const pidFiles = fs.readdirSync(fullPath).filter(f => f.endsWith('.pid'));
+                        const crawlDir = dir; // Parent of chrome/ is the crawl dir
+
+                        for (const pidFileName of pidFiles) {
+                            results.push({
+                                pidFile: path.join(fullPath, pidFileName),
+                                crawlDir: crawlDir,
+                            });
+                        }
+                    } catch (e) {
+                        // Skip if can't read chrome dir
+                    }
+                } else {
+                    // Recurse into subdirectory (skip hidden dirs and node_modules)
+                    if (!entry.name.startsWith('.') && entry.name !== 'node_modules') {
+                        results.push(...findChromePidFiles(fullPath, depth + 1));
+                    }
+                }
+            }
+        } catch (e) {
+            // Skip if can't read directory
+        }
+        return results;
+    }
+
     try {
-        const crawls = fs.readdirSync(crawlsDir, { withFileTypes: true });
-
-        for (const crawl of crawls) {
-            if (!crawl.isDirectory()) continue;
-
-            const crawlDir = path.join(crawlsDir, crawl.name);
-            const chromeDir = path.join(crawlDir, 'chrome');
-
-            if (!fs.existsSync(chromeDir)) continue;
+        const chromePids = findChromePidFiles(dataDir);
+        for (const {pidFile, crawlDir} of chromePids) {
 
             // Check if crawl was modified recently (still active)
             try {
                 const crawlStats = fs.statSync(crawlDir);
                 if (crawlStats.mtimeMs > fiveMinutesAgo) {
-                    continue;
+                    continue; // Crawl is active, skip
                 }
             } catch (e) {
                 continue;
             }
 
-            // Crawl is stale, check for PIDs
+            // Crawl is stale, check PID
             try {
-                const pidFiles = fs.readdirSync(chromeDir).filter(f => f.endsWith('.pid'));
+                const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
+                if (isNaN(pid) || pid <= 0) continue;
 
-                for (const pidFileName of pidFiles) {
-                    const pidFile = path.join(chromeDir, pidFileName);
+                // Check if process exists
+                try {
+                    process.kill(pid, 0);
+                } catch (e) {
+                    // Process dead, remove stale PID file
+                    try { fs.unlinkSync(pidFile); } catch (e) {}
+                    continue;
+                }
 
-                    try {
-                        const pid = parseInt(fs.readFileSync(pidFile, 'utf8').trim(), 10);
-                        if (isNaN(pid) || pid <= 0) continue;
+                // Process alive and crawl is stale - zombie!
+                console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${path.basename(crawlDir)}`);
 
-                        // Check if process exists
-                        try {
-                            process.kill(pid, 0);
-                        } catch (e) {
-                            // Process dead, remove stale PID file
-                            try { fs.unlinkSync(pidFile); } catch (e) {}
-                            continue;
-                        }
-
-                        // Process alive and crawl is stale - zombie!
-                        console.error(`[!] Found zombie (PID ${pid}) from stale crawl ${crawl.name}`);
-
-                        try {
-                            try { process.kill(-pid, 'SIGKILL'); } catch (e) { process.kill(pid, 'SIGKILL'); }
-                            killed++;
-                            console.error(`[+] Killed zombie (PID ${pid})`);
-                            try { fs.unlinkSync(pidFile); } catch (e) {}
-                        } catch (e) {
-                            console.error(`[!] Failed to kill PID ${pid}: ${e.message}`);
-                        }
-                    } catch (e) {
-                        // Skip invalid PID files
-                    }
+                try {
+                    try { process.kill(-pid, 'SIGKILL'); } catch (e) { process.kill(pid, 'SIGKILL'); }
+                    killed++;
+                    console.error(`[+] Killed zombie (PID ${pid})`);
+                    try { fs.unlinkSync(pidFile); } catch (e) {}
+                } catch (e) {
+                    console.error(`[!] Failed to kill PID ${pid}: ${e.message}`);
                 }
             } catch (e) {
-                // Skip if can't read chrome dir
+                // Skip invalid PID files
             }
         }
     } catch (e) {
-        console.error(`[!] Error scanning crawls: ${e.message}`);
+        console.error(`[!] Error scanning for Chrome processes: ${e.message}`);
     }
 
     if (killed > 0) {
@@ -1327,7 +1356,7 @@ function findChromium()
  * @returns {string} - Absolute path to extensions directory
  */
 function getExtensionsDir() {
-    const dataDir = getEnv('DATA_DIR', './data');
+    const dataDir = getEnv('DATA_DIR', '.');
     const persona = getEnv('ACTIVE_PERSONA', 'Default');
 
     return getEnv('CHROME_EXTENSIONS_DIR') || path.join(dataDir, 'personas', persona, 'chrome_extensions');
@@ -1459,7 +1488,7 @@ async function installExtensionWithCache(extension, options = {}) {
 
     const installedExt = await loadOrInstallExtension(extension, extensionsDir);
 
-    if (!installedExt) {
+    if (!installedExt?.version) {
         console.error(`[❌] Failed to install ${extension.name} extension`);
         return null;
     }
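The rewritten killZombieChrome() leans on two conventions worth calling out: signal 0 is used as a pure liveness probe, and the kill is attempted against the process group (-pid) before falling back to the single PID. A rough Python equivalent of just that logic, for illustration only — the shipped implementation is the JavaScript above:

```python
import os
import signal

def pid_is_alive(pid: int) -> bool:
    """Signal 0 checks existence/permissions without actually delivering a signal."""
    try:
        os.kill(pid, 0)
        return True
    except ProcessLookupError:
        return False
    except PermissionError:
        return True  # process exists but belongs to another user

def kill_zombie(pid: int) -> None:
    """Prefer killing the whole process group, fall back to the single PID."""
    try:
        os.killpg(pid, signal.SIGKILL)  # assumes Chrome was launched as a group leader
    except (ProcessLookupError, PermissionError):
        os.kill(pid, signal.SIGKILL)
```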
""" - returncode, stdout, stderr = _call_chrome_utils('getExtensionsDir') - if returncode == 0 and stdout.strip(): - return stdout.strip() + try: + returncode, stdout, stderr = _call_chrome_utils('getExtensionsDir') + if returncode == 0 and stdout.strip(): + return stdout.strip() + except subprocess.TimeoutExpired: + pass # Fall through to default computation # Fallback to default computation if JS call fails - data_dir = os.environ.get('DATA_DIR', './data') + data_dir = os.environ.get('DATA_DIR', '.') persona = os.environ.get('ACTIVE_PERSONA', 'Default') return str(Path(data_dir) / 'personas' / persona / 'chrome_extensions') @@ -760,31 +763,39 @@ def setup_chrome_session( # Create tab tab_env = env.copy() tab_env['CRAWL_OUTPUT_DIR'] = str(crawl_dir) - result = subprocess.run( - ['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'], - cwd=str(snapshot_chrome_dir), - capture_output=True, - text=True, - timeout=60, - env=tab_env - ) - if result.returncode != 0: - cleanup_chrome(chrome_launch_process, chrome_pid) - raise RuntimeError(f"Tab creation failed: {result.stderr}") - - # Navigate to URL if requested - if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank': + try: result = subprocess.run( - ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + ['node', str(CHROME_TAB_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}', f'--crawl-id={crawl_id}'], cwd=str(snapshot_chrome_dir), capture_output=True, text=True, - timeout=120, - env=env + timeout=60, + env=tab_env ) if result.returncode != 0: cleanup_chrome(chrome_launch_process, chrome_pid) - raise RuntimeError(f"Navigation failed: {result.stderr}") + raise RuntimeError(f"Tab creation failed: {result.stderr}") + except subprocess.TimeoutExpired: + cleanup_chrome(chrome_launch_process, chrome_pid) + raise RuntimeError("Tab creation timed out after 60s") + + # Navigate to URL if requested + if navigate and CHROME_NAVIGATE_HOOK and test_url != 'about:blank': + try: + result = subprocess.run( + ['node', str(CHROME_NAVIGATE_HOOK), f'--url={test_url}', f'--snapshot-id={snapshot_id}'], + cwd=str(snapshot_chrome_dir), + capture_output=True, + text=True, + timeout=120, + env=env + ) + if result.returncode != 0: + cleanup_chrome(chrome_launch_process, chrome_pid) + raise RuntimeError(f"Navigation failed: {result.stderr}") + except subprocess.TimeoutExpired: + cleanup_chrome(chrome_launch_process, chrome_pid) + raise RuntimeError("Navigation timed out after 120s") return chrome_launch_process, chrome_pid, snapshot_chrome_dir