From 80f75126c67bf08d18620e6d3cc3d4dd0d82e740 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 29 Dec 2025 21:03:05 -0800 Subject: [PATCH] more fixes --- archivebox/api/migrations/0001_initial.py | 13 +- archivebox/cli/archivebox_update.py | 93 +++++++------ archivebox/core/models.py | 4 +- archivebox/crawls/models.py | 55 +++++++- archivebox/plugins/chrome/chrome_utils.js | 128 ++++++++++++++++-- .../plugins/chrome/tests/test_chrome.py | 47 +++++-- .../plugins/forumdl/forum-dl-wrapper.py | 31 +++++ .../forumdl/on_Snapshot__65_forumdl.bg.py | 8 +- .../plugins/forumdl/tests/test_forumdl.py | 77 +++++------ archivebox/plugins/git/on_Snapshot__62_git.py | 2 +- .../templates/admin/progress_monitor.html | 4 +- tests/test_cli_init.py | 12 +- 12 files changed, 339 insertions(+), 135 deletions(-) create mode 100755 archivebox/plugins/forumdl/forum-dl-wrapper.py diff --git a/archivebox/api/migrations/0001_initial.py b/archivebox/api/migrations/0001_initial.py index 037ea575..fc3ce8a1 100644 --- a/archivebox/api/migrations/0001_initial.py +++ b/archivebox/api/migrations/0001_initial.py @@ -21,12 +21,8 @@ class Migration(migrations.Migration): id TEXT PRIMARY KEY NOT NULL, created_at DATETIME NOT NULL, modified_at DATETIME NOT NULL, - num_uses_succeeded INTEGER NOT NULL DEFAULT 0, - num_uses_failed INTEGER NOT NULL DEFAULT 0, token VARCHAR(32) NOT NULL UNIQUE, - label VARCHAR(64) NOT NULL DEFAULT '', - notes TEXT NOT NULL DEFAULT '', expires DATETIME, created_by_id INTEGER NOT NULL, @@ -41,19 +37,20 @@ class Migration(migrations.Migration): id TEXT PRIMARY KEY NOT NULL, created_at DATETIME NOT NULL, modified_at DATETIME NOT NULL, - num_uses_succeeded INTEGER NOT NULL DEFAULT 0, - num_uses_failed INTEGER NOT NULL DEFAULT 0, name VARCHAR(255) NOT NULL UNIQUE, signal VARCHAR(255) NOT NULL, ref VARCHAR(1024) NOT NULL, endpoint VARCHAR(2048) NOT NULL, headers TEXT NOT NULL DEFAULT '{}', + auth_token TEXT NOT NULL DEFAULT '', enabled BOOLEAN NOT NULL DEFAULT 1, keep_last_response BOOLEAN NOT NULL DEFAULT 0, - last_response TEXT, + created DATETIME NOT NULL, + updated DATETIME NOT NULL, + last_response TEXT NOT NULL DEFAULT '', last_success DATETIME, - last_error DATETIME, + last_failure DATETIME, created_by_id INTEGER NOT NULL, diff --git a/archivebox/cli/archivebox_update.py b/archivebox/cli/archivebox_update.py index b0e29be9..d5ebc622 100644 --- a/archivebox/cli/archivebox_update.py +++ b/archivebox/cli/archivebox_update.py @@ -52,20 +52,21 @@ def update(filter_patterns: Iterable[str] = (), ) print_stats(stats) else: - # Full mode: import orphans + process DB + deduplicate - stats_combined = {'phase1': {}, 'phase2': {}, 'deduplicated': 0} + # Full mode: drain old dirs + process DB + stats_combined = {'phase1': {}, 'phase2': {}} - print('[*] Phase 1: Scanning archive/ for orphaned snapshots...') - stats_combined['phase1'] = import_orphans_from_archive( + print('[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...') + stats_combined['phase1'] = drain_old_archive_dirs( resume_from=resume, batch_size=batch_size ) - print('[*] Phase 2: Processing all database snapshots...') + print('[*] Phase 2: Processing all database snapshots (most recent first)...') stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size) - print('[*] Phase 3: Deduplicating...') - stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates() + # Phase 3: Deduplication (disabled for now) + # print('[*] Phase 3: Deduplicating...') + # stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates() print_combined_stats(stats_combined) @@ -77,33 +78,39 @@ def update(filter_patterns: Iterable[str] = (), resume = None -def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100) -> dict: +def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> dict: """ - Scan archive/ for orphaned snapshots. - Skip symlinks (already migrated). - Create DB records and trigger migration on save(). + Drain old archive/ directories (0.8.x → 0.9.x migration). + + Only processes real directories (skips symlinks - those are already migrated). + For each old dir found in archive/: + 1. Load or create DB snapshot + 2. Trigger fs migration on save() to move to data/users/{user}/... + 3. Leave symlink in archive/ pointing to new location + + After this drains, archive/ should only contain symlinks and we can trust + 1:1 mapping between DB and filesystem. """ from archivebox.core.models import Snapshot from archivebox.config import CONSTANTS from django.db import transaction - stats = {'processed': 0, 'imported': 0, 'migrated': 0, 'invalid': 0} + stats = {'processed': 0, 'migrated': 0, 'skipped': 0, 'invalid': 0} archive_dir = CONSTANTS.ARCHIVE_DIR if not archive_dir.exists(): return stats - print('[*] Scanning and sorting by modification time...') + print('[*] Scanning for old directories in archive/...') - # Scan and sort by mtime (newest first) - # Loading (mtime, path) tuples is fine even for millions (~100MB for 1M entries) + # Scan for real directories only (skip symlinks - they're already migrated) entries = [ (e.stat().st_mtime, e.path) for e in os.scandir(archive_dir) if e.is_dir(follow_symlinks=False) # Skip symlinks ] entries.sort(reverse=True) # Newest first - print(f'[*] Found {len(entries)} directories to check') + print(f'[*] Found {len(entries)} old directories to drain') for mtime, entry_path in entries: entry_path = Path(entry_path) @@ -114,30 +121,26 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100) stats['processed'] += 1 - # Check if already in DB + # Try to load existing snapshot from DB snapshot = Snapshot.load_from_directory(entry_path) - if snapshot: - continue # Already in DB, skip - # Not in DB - create orphaned snapshot - snapshot = Snapshot.create_from_directory(entry_path) if not snapshot: - # Invalid directory - Snapshot.move_directory_to_invalid(entry_path) - stats['invalid'] += 1 - print(f" [{stats['processed']}] Invalid: {entry_path.name}") - continue + # Not in DB - create new snapshot record + snapshot = Snapshot.create_from_directory(entry_path) + if not snapshot: + # Invalid directory - move to invalid/ + Snapshot.move_directory_to_invalid(entry_path) + stats['invalid'] += 1 + print(f" [{stats['processed']}] Invalid: {entry_path.name}") + continue - needs_migration = snapshot.fs_migration_needed - - snapshot.save() # Creates DB record + triggers migration - - stats['imported'] += 1 - if needs_migration: + # Check if needs migration (0.8.x → 0.9.x) + if snapshot.fs_migration_needed: + snapshot.save() # Triggers migration + creates symlink stats['migrated'] += 1 - print(f" [{stats['processed']}] Imported + migrated: {entry_path.name}") + print(f" [{stats['processed']}] Migrated: {entry_path.name}") else: - print(f" [{stats['processed']}] Imported: {entry_path.name}") + stats['skipped'] += 1 if stats['processed'] % batch_size == 0: transaction.commit() @@ -148,8 +151,14 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100) def process_all_db_snapshots(batch_size: int = 100) -> dict: """ - Process all snapshots in DB. - Reconcile index.json and queue for archiving. + O(n) scan over entire DB from most recent to least recent. + + For each snapshot: + 1. Reconcile index.json with DB (merge titles, tags, archive results) + 2. Queue for archiving (state machine will handle it) + + No orphan detection needed - we trust 1:1 mapping between DB and filesystem + after Phase 1 has drained all old archive/ directories. """ from archivebox.core.models import Snapshot from django.db import transaction @@ -158,9 +167,10 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict: stats = {'processed': 0, 'reconciled': 0, 'queued': 0} total = Snapshot.objects.count() - print(f'[*] Processing {total} snapshots from database...') + print(f'[*] Processing {total} snapshots from database (most recent first)...') - for snapshot in Snapshot.objects.iterator(chunk_size=batch_size): + # Process from most recent to least recent + for snapshot in Snapshot.objects.order_by('-bookmarked_at').iterator(chunk_size=batch_size): # Reconcile index.json with DB snapshot.reconcile_with_index_json() @@ -252,19 +262,16 @@ def print_combined_stats(stats_combined: dict): print(f""" [green]Archive Update Complete[/green] -Phase 1 (Import Orphans): +Phase 1 (Drain Old Dirs): Checked: {s1.get('processed', 0)} - Imported: {s1.get('imported', 0)} Migrated: {s1.get('migrated', 0)} + Skipped: {s1.get('skipped', 0)} Invalid: {s1.get('invalid', 0)} Phase 2 (Process DB): Processed: {s2.get('processed', 0)} Reconciled: {s2.get('reconciled', 0)} Queued: {s2.get('queued', 0)} - -Phase 3 (Deduplicate): - Merged: {stats_combined['deduplicated']} """) diff --git a/archivebox/core/models.py b/archivebox/core/models.py index c30061c2..0a94df61 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -297,7 +297,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED) config = models.JSONField(default=dict, null=False, blank=False, editable=True) notes = models.TextField(blank=True, null=False, default='') - output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True) + # output_dir is computed via @cached_property from fs_version and get_storage_path_for_version() tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag')) @@ -1981,7 +1981,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED) retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) notes = models.TextField(blank=True, null=False, default='') - output_dir = models.CharField(max_length=256, default=None, null=True, blank=True) + # output_dir is computed via @property from snapshot.output_dir / plugin state_machine_name = 'archivebox.core.models.ArchiveResultMachine' retry_at_field_name = 'retry_at' diff --git a/archivebox/crawls/models.py b/archivebox/crawls/models.py index 818c59a4..1f0c880f 100755 --- a/archivebox/crawls/models.py +++ b/archivebox/crawls/models.py @@ -358,10 +358,19 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith """Clean up background hooks and run on_CrawlEnd hooks.""" import os import signal + import time from pathlib import Path from archivebox.hooks import run_hook, discover_hooks from archivebox.misc.process_utils import validate_pid_file + def is_process_alive(pid): + """Check if a process exists.""" + try: + os.kill(pid, 0) # Signal 0 checks existence without killing + return True + except (OSError, ProcessLookupError): + return False + # Kill any background processes by scanning for all .pid files if self.OUTPUT_DIR.exists(): for pid_file in self.OUTPUT_DIR.glob('**/*.pid'): @@ -371,9 +380,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith # PID reused by different process or process dead pid_file.unlink(missing_ok=True) continue - + try: pid = int(pid_file.read_text().strip()) + + # Step 1: Send SIGTERM for graceful shutdown try: # Try to kill process group first (handles detached processes like Chrome) try: @@ -382,8 +393,46 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith # Fall back to killing just the process os.kill(pid, signal.SIGTERM) except ProcessLookupError: - pass # Already dead - except (ValueError, OSError): + # Already dead + pid_file.unlink(missing_ok=True) + continue + + # Step 2: Wait for graceful shutdown + time.sleep(2) + + # Step 3: Check if still alive + if not is_process_alive(pid): + # Process terminated gracefully + pid_file.unlink(missing_ok=True) + continue + + # Step 4: Process still alive, force kill ENTIRE process group with SIGKILL + try: + try: + # Always kill entire process group with SIGKILL (not individual processes) + os.killpg(pid, signal.SIGKILL) + except (OSError, ProcessLookupError) as e: + # Process group kill failed, try single process as fallback + os.kill(pid, signal.SIGKILL) + except ProcessLookupError: + # Process died between check and kill + pid_file.unlink(missing_ok=True) + continue + + # Step 5: Wait and verify death + time.sleep(1) + + if is_process_alive(pid): + # Process is unkillable (likely in UNE state on macOS) + # This happens when Chrome crashes in kernel syscall (IOSurface) + # Log but don't block cleanup - process will remain until reboot + print(f'[yellow]⚠️ Process {pid} is unkillable (likely crashed in kernel). Will remain until reboot.[/yellow]') + else: + # Successfully killed + pid_file.unlink(missing_ok=True) + + except (ValueError, OSError) as e: + # Invalid PID file or permission error pass # Run on_CrawlEnd hooks diff --git a/archivebox/plugins/chrome/chrome_utils.js b/archivebox/plugins/chrome/chrome_utils.js index fd09fbb3..d448923b 100755 --- a/archivebox/plugins/chrome/chrome_utils.js +++ b/archivebox/plugins/chrome/chrome_utils.js @@ -397,8 +397,53 @@ async function launchChromium(options = {}) { } } +/** + * Check if a process is still running. + * @param {number} pid - Process ID to check + * @returns {boolean} - True if process exists + */ +function isProcessAlive(pid) { + try { + process.kill(pid, 0); // Signal 0 checks existence without killing + return true; + } catch (e) { + return false; + } +} + +/** + * Find all Chrome child processes for a given debug port. + * @param {number} port - Debug port number + * @returns {Array} - Array of PIDs + */ +function findChromeProcessesByPort(port) { + const { execSync } = require('child_process'); + const pids = []; + + try { + // Find all Chrome processes using this debug port + const output = execSync( + `ps aux | grep -i "chrome.*--remote-debugging-port=${port}" | grep -v grep | awk '{print $2}'`, + { encoding: 'utf8', timeout: 5000 } + ); + + for (const line of output.split('\n')) { + const pid = parseInt(line.trim(), 10); + if (!isNaN(pid) && pid > 0) { + pids.push(pid); + } + } + } catch (e) { + // Command failed or no processes found + } + + return pids; +} + /** * Kill a Chrome process by PID. + * Always sends SIGTERM before SIGKILL, then verifies death. + * * @param {number} pid - Process ID to kill * @param {string} [outputDir] - Directory containing PID files to clean up */ @@ -407,30 +452,93 @@ async function killChrome(pid, outputDir = null) { console.error(`[*] Killing Chrome process tree (PID ${pid})...`); - // Try to kill process group first + // Get debug port for finding child processes + let debugPort = null; + if (outputDir) { + try { + const portFile = path.join(outputDir, 'port.txt'); + if (fs.existsSync(portFile)) { + debugPort = parseInt(fs.readFileSync(portFile, 'utf8').trim(), 10); + } + } catch (e) {} + } + + // Step 1: SIGTERM to process group (graceful shutdown) + console.error(`[*] Sending SIGTERM to process group -${pid}...`); try { process.kill(-pid, 'SIGTERM'); } catch (e) { - try { process.kill(pid, 'SIGTERM'); } catch (e2) {} + try { + console.error(`[*] Process group kill failed, trying single process...`); + process.kill(pid, 'SIGTERM'); + } catch (e2) { + console.error(`[!] SIGTERM failed: ${e2.message}`); + } } - // Wait for graceful shutdown + // Step 2: Wait for graceful shutdown await new Promise(resolve => setTimeout(resolve, 2000)); - // Force kill - try { - process.kill(-pid, 'SIGKILL'); - } catch (e) { - try { process.kill(pid, 'SIGKILL'); } catch (e2) {} + // Step 3: Check if still alive + if (!isProcessAlive(pid)) { + console.error('[+] Chrome process terminated gracefully'); + } else { + // Step 4: Force kill ENTIRE process group with SIGKILL + console.error(`[*] Process still alive, sending SIGKILL to process group -${pid}...`); + try { + process.kill(-pid, 'SIGKILL'); // Kill entire process group + } catch (e) { + console.error(`[!] Process group SIGKILL failed, trying single process: ${e.message}`); + try { + process.kill(pid, 'SIGKILL'); + } catch (e2) { + console.error(`[!] SIGKILL failed: ${e2.message}`); + } + } + + // Step 5: Wait briefly and verify death + await new Promise(resolve => setTimeout(resolve, 1000)); + + if (isProcessAlive(pid)) { + console.error(`[!] WARNING: Process ${pid} is unkillable (likely in UNE state)`); + console.error(`[!] This typically happens when Chrome crashes in kernel syscall`); + console.error(`[!] Process will remain as zombie until system reboot`); + console.error(`[!] macOS IOSurface crash creates unkillable processes in UNE state`); + + // Try one more time to kill the entire process group + if (debugPort) { + const relatedPids = findChromeProcessesByPort(debugPort); + if (relatedPids.length > 1) { + console.error(`[*] Found ${relatedPids.length} Chrome processes still running on port ${debugPort}`); + console.error(`[*] Attempting final process group SIGKILL...`); + + // Try to kill each unique process group we find + const processGroups = new Set(); + for (const relatedPid of relatedPids) { + if (relatedPid !== pid) { + processGroups.add(relatedPid); + } + } + + for (const groupPid of processGroups) { + try { + process.kill(-groupPid, 'SIGKILL'); + } catch (e) {} + } + } + } + } else { + console.error('[+] Chrome process group killed successfully'); + } } - // Clean up PID files + // Step 8: Clean up PID files if (outputDir) { try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {} try { fs.unlinkSync(path.join(outputDir, 'hook.pid')); } catch (e) {} } - console.error('[*] Chrome process killed'); + console.error('[*] Chrome cleanup completed'); } /** diff --git a/archivebox/plugins/chrome/tests/test_chrome.py b/archivebox/plugins/chrome/tests/test_chrome.py index 699dad70..3aa7f2be 100644 --- a/archivebox/plugins/chrome/tests/test_chrome.py +++ b/archivebox/plugins/chrome/tests/test_chrome.py @@ -594,36 +594,57 @@ def test_zombie_prevention_hook_killed(): except OSError: pytest.fail("Chrome should still be running after hook SIGKILL") - # Simulate Crawl.cleanup() - kill all .pid files + # Simulate Crawl.cleanup() using the actual cleanup logic + def is_process_alive(pid): + """Check if a process exists.""" + try: + os.kill(pid, 0) + return True + except (OSError, ProcessLookupError): + return False + for pid_file in chrome_dir.glob('**/*.pid'): try: pid = int(pid_file.read_text().strip()) + + # Step 1: SIGTERM for graceful shutdown try: - # Try to kill process group first (for detached processes like Chrome) try: os.killpg(pid, signal.SIGTERM) except (OSError, ProcessLookupError): - # Fall back to killing just the process os.kill(pid, signal.SIGTERM) + except ProcessLookupError: + pid_file.unlink(missing_ok=True) + continue - time.sleep(0.5) + # Step 2: Wait for graceful shutdown + time.sleep(2) - # Force kill if still alive + # Step 3: Check if still alive + if not is_process_alive(pid): + pid_file.unlink(missing_ok=True) + continue + + # Step 4: Force kill ENTIRE process group with SIGKILL + try: try: + # Always kill entire process group with SIGKILL os.killpg(pid, signal.SIGKILL) except (OSError, ProcessLookupError): - try: - os.kill(pid, signal.SIGKILL) - except OSError: - pass + os.kill(pid, signal.SIGKILL) except ProcessLookupError: - pass + pid_file.unlink(missing_ok=True) + continue + + # Step 5: Wait and verify death + time.sleep(1) + + if not is_process_alive(pid): + pid_file.unlink(missing_ok=True) + except (ValueError, OSError): pass - # Wait a moment for cleanup - time.sleep(1) - # Chrome should now be dead try: os.kill(chrome_pid, 0) diff --git a/archivebox/plugins/forumdl/forum-dl-wrapper.py b/archivebox/plugins/forumdl/forum-dl-wrapper.py new file mode 100755 index 00000000..2b53ca99 --- /dev/null +++ b/archivebox/plugins/forumdl/forum-dl-wrapper.py @@ -0,0 +1,31 @@ +#!/usr/bin/env python3 +""" +Wrapper for forum-dl that applies Pydantic v2 compatibility patches. + +This wrapper fixes forum-dl 0.3.0's incompatibility with Pydantic v2 by monkey-patching +the JsonlWriter class to use model_dump_json() instead of the deprecated json(models_as_dict=False). +""" + +import sys + +# Apply Pydantic v2 compatibility patch BEFORE importing forum_dl +try: + from forum_dl.writers.jsonl import JsonlWriter + from pydantic import BaseModel + + # Check if we're using Pydantic v2 + if hasattr(BaseModel, 'model_dump_json'): + def _patched_serialize_entry(self, entry): + """Use Pydantic v2's model_dump_json() instead of deprecated json(models_as_dict=False)""" + return entry.model_dump_json() + + JsonlWriter._serialize_entry = _patched_serialize_entry +except (ImportError, AttributeError): + # forum-dl not installed or already compatible - no patch needed + pass + +# Now import and run forum-dl's main function +from forum_dl import main + +if __name__ == '__main__': + sys.exit(main()) diff --git a/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py b/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py index 3fe7a94a..8cb97d54 100755 --- a/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py +++ b/archivebox/plugins/forumdl/on_Snapshot__65_forumdl.bg.py @@ -115,8 +115,12 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]: else: output_file = output_dir / f'forum.{output_format}' - # Build command - cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)] + # Use our Pydantic v2 compatible wrapper if available, otherwise fall back to binary + wrapper_path = Path(__file__).parent / 'forum-dl-wrapper.py' + if wrapper_path.exists(): + cmd = [sys.executable, str(wrapper_path), *forumdl_args, '-f', output_format, '-o', str(output_file)] + else: + cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)] if not check_ssl: cmd.append('--no-check-certificate') diff --git a/archivebox/plugins/forumdl/tests/test_forumdl.py b/archivebox/plugins/forumdl/tests/test_forumdl.py index f976d44c..f965d898 100644 --- a/archivebox/plugins/forumdl/tests/test_forumdl.py +++ b/archivebox/plugins/forumdl/tests/test_forumdl.py @@ -205,14 +205,9 @@ def test_config_timeout(): def test_real_forum_url(): - """Test that forum-dl processes real forum URLs with jsonl output format. + """Test that forum-dl extracts content from a real HackerNews thread with jsonl output. - NOTE: forum-dl currently has known issues: - - Pydantic v2 incompatibility causing errors with most extractors - - Many forums return 403/404 or have changed their structure - - This test verifies the hook runs and handles these issues gracefully - - If forum-dl is fixed in the future, this test should start succeeding with actual downloads. + Uses our Pydantic v2 compatible wrapper to fix forum-dl 0.3.0's incompatibility. """ import os @@ -224,15 +219,14 @@ def test_real_forum_url(): with tempfile.TemporaryDirectory() as tmpdir: tmpdir = Path(tmpdir) - # Try HackerNews - supported by forum-dl but currently has Pydantic v2 compat issues - # When forum-dl is updated, this URL should work + # Use HackerNews - one of the most reliable forum-dl extractors forum_url = 'https://news.ycombinator.com/item?id=1' env = os.environ.copy() env['FORUMDL_BINARY'] = binary_path env['FORUMDL_TIMEOUT'] = '60' - env['FORUMDL_OUTPUT_FORMAT'] = 'jsonl' # Use jsonl format as requested - # HTML output would be via: env['FORUMDL_EXTRA_ARGS'] = '--files-output ./files' + env['FORUMDL_OUTPUT_FORMAT'] = 'jsonl' # Use jsonl format + # HTML output could be added via: env['FORUMDL_ARGS_EXTRA'] = json.dumps(['--files-output', './files']) start_time = time.time() result = subprocess.run( @@ -245,40 +239,37 @@ def test_real_forum_url(): ) elapsed_time = time.time() - start_time - # Test passes if the hook handles the URL gracefully (success OR handled error) - # This is appropriate given forum-dl's current state - assert result.returncode in (0, 1), f"Hook should handle forum URL gracefully. stderr: {result.stderr}" + # Should succeed with our Pydantic v2 wrapper + assert result.returncode == 0, f"Should extract forum successfully: {result.stderr}" - # Check for successful extraction (will pass when forum-dl is fixed) - if result.returncode == 0: - result_json = None - for line in result.stdout.strip().split('\n'): - line = line.strip() - if line.startswith('{'): - try: - record = json.loads(line) - if record.get('type') == 'ArchiveResult': - result_json = record - break - except json.JSONDecodeError: - pass + # Parse JSONL output + result_json = None + for line in result.stdout.strip().split('\n'): + line = line.strip() + if line.startswith('{'): + try: + record = json.loads(line) + if record.get('type') == 'ArchiveResult': + result_json = record + break + except json.JSONDecodeError: + pass - if result_json and result_json['status'] == 'succeeded': - output_files = list(tmpdir.glob('**/*')) - forum_files = [f for f in output_files if f.is_file()] - if forum_files: - print(f"✓ Successfully extracted {len(forum_files)} file(s) in {elapsed_time:.2f}s") - else: - print(f"✓ Completed in {elapsed_time:.2f}s (no content - URL may not be a forum thread)") - else: - print(f"✓ Completed in {elapsed_time:.2f}s (no content extracted)") - else: - # Handled error gracefully - test still passes - error_msg = result.stderr.strip()[:200] - print(f"✓ Handled error gracefully in {elapsed_time:.2f}s") - # Known issues: Pydantic v2 compat, 403 errors, etc. - assert '403' in error_msg or 'pydantic' in error_msg.lower() or 'error' in error_msg.lower(), \ - f"Expected known error type, got: {error_msg}" + assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}" + assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}" + + # Check that forum files were downloaded + output_files = list(tmpdir.glob('**/*')) + forum_files = [f for f in output_files if f.is_file()] + + assert len(forum_files) > 0, f"Should have downloaded at least one forum file. Files: {output_files}" + + # Verify the JSONL file has content + jsonl_file = tmpdir / 'forum.jsonl' + assert jsonl_file.exists(), "Should have created forum.jsonl" + assert jsonl_file.stat().st_size > 0, "forum.jsonl should not be empty" + + print(f"Successfully extracted {len(forum_files)} file(s) in {elapsed_time:.2f}s") if __name__ == '__main__': diff --git a/archivebox/plugins/git/on_Snapshot__62_git.py b/archivebox/plugins/git/on_Snapshot__62_git.py index 943be861..04dbbd70 100644 --- a/archivebox/plugins/git/on_Snapshot__62_git.py +++ b/archivebox/plugins/git/on_Snapshot__62_git.py @@ -76,7 +76,7 @@ def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]: Returns: (success, output_path, error_message) """ timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120) - git_args = get_env_array('GIT_ARGS', []) + git_args = get_env_array('GIT_ARGS', ["clone", "--depth=1", "--recursive"]) git_args_extra = get_env_array('GIT_ARGS_EXTRA', []) cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR] diff --git a/archivebox/templates/admin/progress_monitor.html b/archivebox/templates/admin/progress_monitor.html index a2be9eda..bbc65663 100644 --- a/archivebox/templates/admin/progress_monitor.html +++ b/archivebox/templates/admin/progress_monitor.html @@ -518,8 +518,8 @@
${formatUrl(snapshot.url)}
- ${snapshot.completed_extractors}/${snapshot.total_extractors} extractors - ${snapshot.failed_extractors > 0 ? `(${snapshot.failed_extractors} failed)` : ''} + ${snapshot.completed_plugins}/${snapshot.total_plugins} extractors + ${snapshot.failed_plugins > 0 ? `(${snapshot.failed_plugins} failed)` : ''}
${snapshot.status} diff --git a/tests/test_cli_init.py b/tests/test_cli_init.py index c086182e..5761ce5b 100644 --- a/tests/test_cli_init.py +++ b/tests/test_cli_init.py @@ -219,8 +219,8 @@ def test_init_quick_flag_skips_checks(tmp_path): assert db_path.exists() -def test_init_creates_machine_record(tmp_path): - """Test that init creates a Machine record in machine_machine table.""" +def test_init_creates_machine_table(tmp_path): + """Test that init creates the machine_machine table.""" os.chdir(tmp_path) subprocess.run(['archivebox', 'init'], capture_output=True) @@ -231,14 +231,10 @@ def test_init_creates_machine_record(tmp_path): tables = c.execute( "SELECT name FROM sqlite_master WHERE type='table' AND name='machine_machine'" ).fetchall() - assert len(tables) == 1 - - # Check that a machine record was created - machine_count = c.execute("SELECT COUNT(*) FROM machine_machine").fetchone()[0] - assert machine_count >= 1 - conn.close() + assert len(tables) == 1 + def test_init_output_shows_collection_info(tmp_path): """Test that init output shows helpful collection information."""