more fixes

Nick Sweeting
2025-12-29 21:03:05 -08:00
parent 147d567d3f
commit 80f75126c6
12 changed files with 339 additions and 135 deletions

View File

@@ -21,12 +21,8 @@ class Migration(migrations.Migration):
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL,
modified_at DATETIME NOT NULL,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
token VARCHAR(32) NOT NULL UNIQUE,
label VARCHAR(64) NOT NULL DEFAULT '',
notes TEXT NOT NULL DEFAULT '',
expires DATETIME,
created_by_id INTEGER NOT NULL,
@@ -41,19 +37,20 @@ class Migration(migrations.Migration):
id TEXT PRIMARY KEY NOT NULL,
created_at DATETIME NOT NULL,
modified_at DATETIME NOT NULL,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0,
num_uses_failed INTEGER NOT NULL DEFAULT 0,
name VARCHAR(255) NOT NULL UNIQUE,
signal VARCHAR(255) NOT NULL,
ref VARCHAR(1024) NOT NULL,
endpoint VARCHAR(2048) NOT NULL,
headers TEXT NOT NULL DEFAULT '{}',
auth_token TEXT NOT NULL DEFAULT '',
enabled BOOLEAN NOT NULL DEFAULT 1,
keep_last_response BOOLEAN NOT NULL DEFAULT 0,
last_response TEXT,
created DATETIME NOT NULL,
updated DATETIME NOT NULL,
last_response TEXT NOT NULL DEFAULT '',
last_success DATETIME,
last_error DATETIME,
last_failure DATETIME,
created_by_id INTEGER NOT NULL,

View File

@@ -52,20 +52,21 @@ def update(filter_patterns: Iterable[str] = (),
)
print_stats(stats)
else:
# Full mode: import orphans + process DB + deduplicate
stats_combined = {'phase1': {}, 'phase2': {}, 'deduplicated': 0}
# Full mode: drain old dirs + process DB
stats_combined = {'phase1': {}, 'phase2': {}}
print('[*] Phase 1: Scanning archive/ for orphaned snapshots...')
stats_combined['phase1'] = import_orphans_from_archive(
print('[*] Phase 1: Draining old archive/ directories (0.8.x → 0.9.x migration)...')
stats_combined['phase1'] = drain_old_archive_dirs(
resume_from=resume,
batch_size=batch_size
)
print('[*] Phase 2: Processing all database snapshots...')
print('[*] Phase 2: Processing all database snapshots (most recent first)...')
stats_combined['phase2'] = process_all_db_snapshots(batch_size=batch_size)
print('[*] Phase 3: Deduplicating...')
stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates()
# Phase 3: Deduplication (disabled for now)
# print('[*] Phase 3: Deduplicating...')
# stats_combined['deduplicated'] = Snapshot.find_and_merge_duplicates()
print_combined_stats(stats_combined)
@@ -77,33 +78,39 @@ def update(filter_patterns: Iterable[str] = (),
resume = None
def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100) -> dict:
def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> dict:
"""
Scan archive/ for orphaned snapshots.
Skip symlinks (already migrated).
Create DB records and trigger migration on save().
Drain old archive/ directories (0.8.x → 0.9.x migration).
Only processes real directories (skips symlinks - those are already migrated).
For each old dir found in archive/:
1. Load or create DB snapshot
2. Trigger fs migration on save() to move to data/users/{user}/...
3. Leave symlink in archive/ pointing to new location
After this phase drains them, archive/ should only contain symlinks, and we can trust
a 1:1 mapping between the DB and the filesystem.
"""
from archivebox.core.models import Snapshot
from archivebox.config import CONSTANTS
from django.db import transaction
stats = {'processed': 0, 'imported': 0, 'migrated': 0, 'invalid': 0}
stats = {'processed': 0, 'migrated': 0, 'skipped': 0, 'invalid': 0}
archive_dir = CONSTANTS.ARCHIVE_DIR
if not archive_dir.exists():
return stats
print('[*] Scanning and sorting by modification time...')
print('[*] Scanning for old directories in archive/...')
# Scan and sort by mtime (newest first)
# Loading (mtime, path) tuples is fine even for millions (~100MB for 1M entries)
# Scan for real directories only (skip symlinks - they're already migrated)
entries = [
(e.stat().st_mtime, e.path)
for e in os.scandir(archive_dir)
if e.is_dir(follow_symlinks=False) # Skip symlinks
]
entries.sort(reverse=True) # Newest first
print(f'[*] Found {len(entries)} directories to check')
print(f'[*] Found {len(entries)} old directories to drain')
for mtime, entry_path in entries:
entry_path = Path(entry_path)
@@ -114,30 +121,26 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)
stats['processed'] += 1
# Check if already in DB
# Try to load existing snapshot from DB
snapshot = Snapshot.load_from_directory(entry_path)
if snapshot:
continue # Already in DB, skip
# Not in DB - create orphaned snapshot
snapshot = Snapshot.create_from_directory(entry_path)
if not snapshot:
# Invalid directory
Snapshot.move_directory_to_invalid(entry_path)
stats['invalid'] += 1
print(f" [{stats['processed']}] Invalid: {entry_path.name}")
continue
# Not in DB - create new snapshot record
snapshot = Snapshot.create_from_directory(entry_path)
if not snapshot:
# Invalid directory - move to invalid/
Snapshot.move_directory_to_invalid(entry_path)
stats['invalid'] += 1
print(f" [{stats['processed']}] Invalid: {entry_path.name}")
continue
needs_migration = snapshot.fs_migration_needed
snapshot.save() # Creates DB record + triggers migration
stats['imported'] += 1
if needs_migration:
# Check if needs migration (0.8.x → 0.9.x)
if snapshot.fs_migration_needed:
snapshot.save() # Triggers migration + creates symlink
stats['migrated'] += 1
print(f" [{stats['processed']}] Imported + migrated: {entry_path.name}")
print(f" [{stats['processed']}] Migrated: {entry_path.name}")
else:
print(f" [{stats['processed']}] Imported: {entry_path.name}")
stats['skipped'] += 1
if stats['processed'] % batch_size == 0:
transaction.commit()
@@ -148,8 +151,14 @@ def import_orphans_from_archive(resume_from: str = None, batch_size: int = 100)
def process_all_db_snapshots(batch_size: int = 100) -> dict:
"""
Process all snapshots in DB.
Reconcile index.json and queue for archiving.
O(n) scan over entire DB from most recent to least recent.
For each snapshot:
1. Reconcile index.json with DB (merge titles, tags, archive results)
2. Queue for archiving (state machine will handle it)
No orphan detection is needed - we can trust a 1:1 mapping between the DB and the filesystem
after Phase 1 has drained all old archive/ directories.
"""
from archivebox.core.models import Snapshot
from django.db import transaction
@@ -158,9 +167,10 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
stats = {'processed': 0, 'reconciled': 0, 'queued': 0}
total = Snapshot.objects.count()
print(f'[*] Processing {total} snapshots from database...')
print(f'[*] Processing {total} snapshots from database (most recent first)...')
for snapshot in Snapshot.objects.iterator(chunk_size=batch_size):
# Process from most recent to least recent
for snapshot in Snapshot.objects.order_by('-bookmarked_at').iterator(chunk_size=batch_size):
# Reconcile index.json with DB
snapshot.reconcile_with_index_json()
@@ -252,19 +262,16 @@ def print_combined_stats(stats_combined: dict):
print(f"""
[green]Archive Update Complete[/green]
Phase 1 (Import Orphans):
Phase 1 (Drain Old Dirs):
Checked: {s1.get('processed', 0)}
Imported: {s1.get('imported', 0)}
Migrated: {s1.get('migrated', 0)}
Skipped: {s1.get('skipped', 0)}
Invalid: {s1.get('invalid', 0)}
Phase 2 (Process DB):
Processed: {s2.get('processed', 0)}
Reconciled: {s2.get('reconciled', 0)}
Queued: {s2.get('queued', 0)}
Phase 3 (Deduplicate):
Merged: {stats_combined['deduplicated']}
""")

View File

@@ -297,7 +297,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
config = models.JSONField(default=dict, null=False, blank=False, editable=True)
notes = models.TextField(blank=True, null=False, default='')
output_dir = models.FilePathField(path=CONSTANTS.ARCHIVE_DIR, recursive=True, match='.*', default=None, null=True, blank=True, editable=True)
# output_dir is computed via @cached_property from fs_version and get_storage_path_for_version()
tags = models.ManyToManyField(Tag, blank=True, through=SnapshotTag, related_name='snapshot_set', through_fields=('snapshot', 'tag'))
@@ -1981,7 +1981,7 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
status = ModelWithStateMachine.StatusField(choices=StatusChoices.choices, default=StatusChoices.QUEUED)
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
notes = models.TextField(blank=True, null=False, default='')
output_dir = models.CharField(max_length=256, default=None, null=True, blank=True)
# output_dir is computed via @property from snapshot.output_dir / plugin
state_machine_name = 'archivebox.core.models.ArchiveResultMachine'
retry_at_field_name = 'retry_at'
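
The replacement properties are not shown in this hunk; going only by the comments above, they presumably resolve to something like the following sketch (the bodies are a guess - only the names fs_version, get_storage_path_for_version, and plugin come from the comments):

    from pathlib import Path

    def snapshot_output_dir(snapshot) -> Path:
        # assumed shape: path depends on the snapshot's filesystem layout version
        return snapshot.get_storage_path_for_version(snapshot.fs_version)

    def archiveresult_output_dir(result) -> Path:
        # assumed shape: each plugin writes into a subdirectory of its snapshot's dir
        return snapshot_output_dir(result.snapshot) / result.plugin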

View File

@@ -358,10 +358,19 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
"""Clean up background hooks and run on_CrawlEnd hooks."""
import os
import signal
import time
from pathlib import Path
from archivebox.hooks import run_hook, discover_hooks
from archivebox.misc.process_utils import validate_pid_file
def is_process_alive(pid):
"""Check if a process exists."""
try:
os.kill(pid, 0) # Signal 0 checks existence without killing
return True
except (OSError, ProcessLookupError):
return False
# Kill any background processes by scanning for all .pid files
if self.OUTPUT_DIR.exists():
for pid_file in self.OUTPUT_DIR.glob('**/*.pid'):
@@ -371,9 +380,11 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# PID reused by different process or process dead
pid_file.unlink(missing_ok=True)
continue
try:
pid = int(pid_file.read_text().strip())
# Step 1: Send SIGTERM for graceful shutdown
try:
# Try to kill process group first (handles detached processes like Chrome)
try:
@@ -382,8 +393,46 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
# Fall back to killing just the process
os.kill(pid, signal.SIGTERM)
except ProcessLookupError:
pass # Already dead
except (ValueError, OSError):
# Already dead
pid_file.unlink(missing_ok=True)
continue
# Step 2: Wait for graceful shutdown
time.sleep(2)
# Step 3: Check if still alive
if not is_process_alive(pid):
# Process terminated gracefully
pid_file.unlink(missing_ok=True)
continue
# Step 4: Process still alive, force kill ENTIRE process group with SIGKILL
try:
try:
# Always kill entire process group with SIGKILL (not individual processes)
os.killpg(pid, signal.SIGKILL)
except (OSError, ProcessLookupError) as e:
# Process group kill failed, try single process as fallback
os.kill(pid, signal.SIGKILL)
except ProcessLookupError:
# Process died between check and kill
pid_file.unlink(missing_ok=True)
continue
# Step 5: Wait and verify death
time.sleep(1)
if is_process_alive(pid):
# Process is unkillable (likely in UNE state on macOS)
# This happens when Chrome crashes in kernel syscall (IOSurface)
# Log but don't block cleanup - process will remain until reboot
print(f'[yellow]⚠️ Process {pid} is unkillable (likely crashed in kernel). Will remain until reboot.[/yellow]')
else:
# Successfully killed
pid_file.unlink(missing_ok=True)
except (ValueError, OSError) as e:
# Invalid PID file or permission error
pass
# Run on_CrawlEnd hooks
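
The kill-and-verify sequence above condenses to the pattern below (a minimal standalone sketch of the same steps; PID-file cleanup and logging are left out):

    import os
    import signal
    import time

    def terminate_process_group(pid: int, grace: float = 2.0) -> bool:
        """Return True if the process is gone afterwards, False if it is unkillable."""
        def alive(p: int) -> bool:
            try:
                os.kill(p, 0)  # signal 0 checks existence without killing
                return True
            except (OSError, ProcessLookupError):
                return False

        # Step 1: SIGTERM to the whole group (handles detached children like Chrome)
        try:
            try:
                os.killpg(pid, signal.SIGTERM)
            except (OSError, ProcessLookupError):
                os.kill(pid, signal.SIGTERM)
        except ProcessLookupError:
            return True  # already dead

        time.sleep(grace)      # Step 2: wait for graceful shutdown
        if not alive(pid):     # Step 3: terminated gracefully
            return True

        # Step 4: escalate to SIGKILL on the whole group
        try:
            try:
                os.killpg(pid, signal.SIGKILL)
            except (OSError, ProcessLookupError):
                os.kill(pid, signal.SIGKILL)
        except ProcessLookupError:
            return True

        time.sleep(1)          # Step 5: verify death
        return not alive(pid)  # False => stuck in the kernel, persists until reboot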

View File

@@ -397,8 +397,53 @@ async function launchChromium(options = {}) {
}
}
/**
* Check if a process is still running.
* @param {number} pid - Process ID to check
* @returns {boolean} - True if process exists
*/
function isProcessAlive(pid) {
try {
process.kill(pid, 0); // Signal 0 checks existence without killing
return true;
} catch (e) {
return false;
}
}
/**
* Find all Chrome child processes for a given debug port.
* @param {number} port - Debug port number
* @returns {Array<number>} - Array of PIDs
*/
function findChromeProcessesByPort(port) {
const { execSync } = require('child_process');
const pids = [];
try {
// Find all Chrome processes using this debug port
const output = execSync(
`ps aux | grep -i "chrome.*--remote-debugging-port=${port}" | grep -v grep | awk '{print $2}'`,
{ encoding: 'utf8', timeout: 5000 }
);
for (const line of output.split('\n')) {
const pid = parseInt(line.trim(), 10);
if (!isNaN(pid) && pid > 0) {
pids.push(pid);
}
}
} catch (e) {
// Command failed or no processes found
}
return pids;
}
/**
* Kill a Chrome process by PID.
* Always sends SIGTERM before SIGKILL, then verifies death.
*
* @param {number} pid - Process ID to kill
* @param {string} [outputDir] - Directory containing PID files to clean up
*/
@@ -407,30 +452,93 @@ async function killChrome(pid, outputDir = null) {
console.error(`[*] Killing Chrome process tree (PID ${pid})...`);
// Try to kill process group first
// Get debug port for finding child processes
let debugPort = null;
if (outputDir) {
try {
const portFile = path.join(outputDir, 'port.txt');
if (fs.existsSync(portFile)) {
debugPort = parseInt(fs.readFileSync(portFile, 'utf8').trim(), 10);
}
} catch (e) {}
}
// Step 1: SIGTERM to process group (graceful shutdown)
console.error(`[*] Sending SIGTERM to process group -${pid}...`);
try {
process.kill(-pid, 'SIGTERM');
} catch (e) {
try { process.kill(pid, 'SIGTERM'); } catch (e2) {}
try {
console.error(`[*] Process group kill failed, trying single process...`);
process.kill(pid, 'SIGTERM');
} catch (e2) {
console.error(`[!] SIGTERM failed: ${e2.message}`);
}
}
// Wait for graceful shutdown
// Step 2: Wait for graceful shutdown
await new Promise(resolve => setTimeout(resolve, 2000));
// Force kill
try {
process.kill(-pid, 'SIGKILL');
} catch (e) {
try { process.kill(pid, 'SIGKILL'); } catch (e2) {}
// Step 3: Check if still alive
if (!isProcessAlive(pid)) {
console.error('[+] Chrome process terminated gracefully');
} else {
// Step 4: Force kill ENTIRE process group with SIGKILL
console.error(`[*] Process still alive, sending SIGKILL to process group -${pid}...`);
try {
process.kill(-pid, 'SIGKILL'); // Kill entire process group
} catch (e) {
console.error(`[!] Process group SIGKILL failed, trying single process: ${e.message}`);
try {
process.kill(pid, 'SIGKILL');
} catch (e2) {
console.error(`[!] SIGKILL failed: ${e2.message}`);
}
}
// Step 5: Wait briefly and verify death
await new Promise(resolve => setTimeout(resolve, 1000));
if (isProcessAlive(pid)) {
console.error(`[!] WARNING: Process ${pid} is unkillable (likely in UNE state)`);
console.error(`[!] This typically happens when Chrome crashes in kernel syscall`);
console.error(`[!] Process will remain as zombie until system reboot`);
console.error(`[!] macOS IOSurface crash creates unkillable processes in UNE state`);
// Try one more time to kill the entire process group
if (debugPort) {
const relatedPids = findChromeProcessesByPort(debugPort);
if (relatedPids.length > 1) {
console.error(`[*] Found ${relatedPids.length} Chrome processes still running on port ${debugPort}`);
console.error(`[*] Attempting final process group SIGKILL...`);
// Try to kill each unique process group we find
const processGroups = new Set();
for (const relatedPid of relatedPids) {
if (relatedPid !== pid) {
processGroups.add(relatedPid);
}
}
for (const groupPid of processGroups) {
try {
process.kill(-groupPid, 'SIGKILL');
} catch (e) {}
}
}
}
} else {
console.error('[+] Chrome process group killed successfully');
}
}
// Clean up PID files
// Step 6: Clean up PID files
if (outputDir) {
try { fs.unlinkSync(path.join(outputDir, 'chrome.pid')); } catch (e) {}
try { fs.unlinkSync(path.join(outputDir, 'hook.pid')); } catch (e) {}
}
console.error('[*] Chrome process killed');
console.error('[*] Chrome cleanup completed');
}
/**

View File

@@ -594,36 +594,57 @@ def test_zombie_prevention_hook_killed():
except OSError:
pytest.fail("Chrome should still be running after hook SIGKILL")
# Simulate Crawl.cleanup() - kill all .pid files
# Simulate Crawl.cleanup() using the actual cleanup logic
def is_process_alive(pid):
"""Check if a process exists."""
try:
os.kill(pid, 0)
return True
except (OSError, ProcessLookupError):
return False
for pid_file in chrome_dir.glob('**/*.pid'):
try:
pid = int(pid_file.read_text().strip())
# Step 1: SIGTERM for graceful shutdown
try:
# Try to kill process group first (for detached processes like Chrome)
try:
os.killpg(pid, signal.SIGTERM)
except (OSError, ProcessLookupError):
# Fall back to killing just the process
os.kill(pid, signal.SIGTERM)
except ProcessLookupError:
pid_file.unlink(missing_ok=True)
continue
time.sleep(0.5)
# Step 2: Wait for graceful shutdown
time.sleep(2)
# Force kill if still alive
# Step 3: Check if still alive
if not is_process_alive(pid):
pid_file.unlink(missing_ok=True)
continue
# Step 4: Force kill ENTIRE process group with SIGKILL
try:
try:
# Always kill entire process group with SIGKILL
os.killpg(pid, signal.SIGKILL)
except (OSError, ProcessLookupError):
try:
os.kill(pid, signal.SIGKILL)
except OSError:
pass
os.kill(pid, signal.SIGKILL)
except ProcessLookupError:
pass
pid_file.unlink(missing_ok=True)
continue
# Step 5: Wait and verify death
time.sleep(1)
if not is_process_alive(pid):
pid_file.unlink(missing_ok=True)
except (ValueError, OSError):
pass
# Wait a moment for cleanup
time.sleep(1)
# Chrome should now be dead
try:
os.kill(chrome_pid, 0)

View File

@@ -0,0 +1,31 @@
#!/usr/bin/env python3
"""
Wrapper for forum-dl that applies Pydantic v2 compatibility patches.
This wrapper fixes forum-dl 0.3.0's incompatibility with Pydantic v2 by monkey-patching
the JsonlWriter class to use model_dump_json() instead of the deprecated json(models_as_dict=False).
"""
import sys
# Apply Pydantic v2 compatibility patch BEFORE importing forum_dl
try:
from forum_dl.writers.jsonl import JsonlWriter
from pydantic import BaseModel
# Check if we're using Pydantic v2
if hasattr(BaseModel, 'model_dump_json'):
def _patched_serialize_entry(self, entry):
"""Use Pydantic v2's model_dump_json() instead of deprecated json(models_as_dict=False)"""
return entry.model_dump_json()
JsonlWriter._serialize_entry = _patched_serialize_entry
except (ImportError, AttributeError):
# forum-dl not installed or already compatible - no patch needed
pass
# Now import and run forum-dl's main function
from forum_dl import main
if __name__ == '__main__':
sys.exit(main())
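
For context on what the patch swaps out: Pydantic v1 serialized models via .json(...), which v2 replaces with .model_dump_json(). A minimal illustration with a throwaway model (not one of forum-dl's own classes):

    from pydantic import BaseModel

    class Post(BaseModel):
        author: str
        body: str

    post = Post(author='alice', body='hello')

    # Pydantic v1 style (what forum-dl 0.3.0 calls): post.json(models_as_dict=False)
    # Pydantic v2 style (what the patched _serialize_entry uses):
    print(post.model_dump_json())  # -> {"author":"alice","body":"hello"}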

View File

@@ -115,8 +115,12 @@ def save_forum(url: str, binary: str) -> tuple[bool, str | None, str]:
else:
output_file = output_dir / f'forum.{output_format}'
# Build command
cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)]
# Use our Pydantic v2 compatible wrapper if available, otherwise fall back to binary
wrapper_path = Path(__file__).parent / 'forum-dl-wrapper.py'
if wrapper_path.exists():
cmd = [sys.executable, str(wrapper_path), *forumdl_args, '-f', output_format, '-o', str(output_file)]
else:
cmd = [binary, *forumdl_args, '-f', output_format, '-o', str(output_file)]
if not check_ssl:
cmd.append('--no-check-certificate')

View File

@@ -205,14 +205,9 @@ def test_config_timeout():
def test_real_forum_url():
"""Test that forum-dl processes real forum URLs with jsonl output format.
"""Test that forum-dl extracts content from a real HackerNews thread with jsonl output.
NOTE: forum-dl currently has known issues:
- Pydantic v2 incompatibility causing errors with most extractors
- Many forums return 403/404 or have changed their structure
- This test verifies the hook runs and handles these issues gracefully
If forum-dl is fixed in the future, this test should start succeeding with actual downloads.
Uses our Pydantic v2 compatible wrapper to fix forum-dl 0.3.0's incompatibility.
"""
import os
@@ -224,15 +219,14 @@ def test_real_forum_url():
with tempfile.TemporaryDirectory() as tmpdir:
tmpdir = Path(tmpdir)
# Try HackerNews - supported by forum-dl but currently has Pydantic v2 compat issues
# When forum-dl is updated, this URL should work
# Use HackerNews - one of the most reliable forum-dl extractors
forum_url = 'https://news.ycombinator.com/item?id=1'
env = os.environ.copy()
env['FORUMDL_BINARY'] = binary_path
env['FORUMDL_TIMEOUT'] = '60'
env['FORUMDL_OUTPUT_FORMAT'] = 'jsonl' # Use jsonl format as requested
# HTML output would be via: env['FORUMDL_EXTRA_ARGS'] = '--files-output ./files'
env['FORUMDL_OUTPUT_FORMAT'] = 'jsonl' # Use jsonl format
# HTML output could be added via: env['FORUMDL_ARGS_EXTRA'] = json.dumps(['--files-output', './files'])
start_time = time.time()
result = subprocess.run(
@@ -245,40 +239,37 @@ def test_real_forum_url():
)
elapsed_time = time.time() - start_time
# Test passes if the hook handles the URL gracefully (success OR handled error)
# This is appropriate given forum-dl's current state
assert result.returncode in (0, 1), f"Hook should handle forum URL gracefully. stderr: {result.stderr}"
# Should succeed with our Pydantic v2 wrapper
assert result.returncode == 0, f"Should extract forum successfully: {result.stderr}"
# Check for successful extraction (will pass when forum-dl is fixed)
if result.returncode == 0:
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
# Parse JSONL output
result_json = None
for line in result.stdout.strip().split('\n'):
line = line.strip()
if line.startswith('{'):
try:
record = json.loads(line)
if record.get('type') == 'ArchiveResult':
result_json = record
break
except json.JSONDecodeError:
pass
if result_json and result_json['status'] == 'succeeded':
output_files = list(tmpdir.glob('**/*'))
forum_files = [f for f in output_files if f.is_file()]
if forum_files:
print(f"✓ Successfully extracted {len(forum_files)} file(s) in {elapsed_time:.2f}s")
else:
print(f"✓ Completed in {elapsed_time:.2f}s (no content - URL may not be a forum thread)")
else:
print(f"✓ Completed in {elapsed_time:.2f}s (no content extracted)")
else:
# Handled error gracefully - test still passes
error_msg = result.stderr.strip()[:200]
print(f"✓ Handled error gracefully in {elapsed_time:.2f}s")
# Known issues: Pydantic v2 compat, 403 errors, etc.
assert '403' in error_msg or 'pydantic' in error_msg.lower() or 'error' in error_msg.lower(), \
f"Expected known error type, got: {error_msg}"
assert result_json, f"Should have ArchiveResult JSONL output. stdout: {result.stdout}"
assert result_json['status'] == 'succeeded', f"Should succeed: {result_json}"
# Check that forum files were downloaded
output_files = list(tmpdir.glob('**/*'))
forum_files = [f for f in output_files if f.is_file()]
assert len(forum_files) > 0, f"Should have downloaded at least one forum file. Files: {output_files}"
# Verify the JSONL file has content
jsonl_file = tmpdir / 'forum.jsonl'
assert jsonl_file.exists(), "Should have created forum.jsonl"
assert jsonl_file.stat().st_size > 0, "forum.jsonl should not be empty"
print(f"Successfully extracted {len(forum_files)} file(s) in {elapsed_time:.2f}s")
if __name__ == '__main__':

View File

@@ -76,7 +76,7 @@ def clone_git(url: str, binary: str) -> tuple[bool, str | None, str]:
Returns: (success, output_path, error_message)
"""
timeout = get_env_int('GIT_TIMEOUT') or get_env_int('TIMEOUT', 120)
git_args = get_env_array('GIT_ARGS', [])
git_args = get_env_array('GIT_ARGS', ["clone", "--depth=1", "--recursive"])
git_args_extra = get_env_array('GIT_ARGS_EXTRA', [])
cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR]
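
With the new default (and no GIT_ARGS/GIT_ARGS_EXTRA overrides), the assembled command becomes a shallow recursive clone; the old empty default would have produced a git invocation with no subcommand at all. Roughly, with a placeholder URL and output dir:

    binary = 'git'
    git_args = ['clone', '--depth=1', '--recursive']  # new GIT_ARGS default
    git_args_extra = []                               # GIT_ARGS_EXTRA
    url = 'https://github.com/example/repo.git'       # placeholder
    OUTPUT_DIR = '.'                                  # placeholder
    cmd = [binary, *git_args, *git_args_extra, url, OUTPUT_DIR]
    # -> ['git', 'clone', '--depth=1', '--recursive', 'https://github.com/example/repo.git', '.']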

View File

@@ -518,8 +518,8 @@
<div class="snapshot-info">
<div class="snapshot-url">${formatUrl(snapshot.url)}</div>
<div class="snapshot-meta">
${snapshot.completed_extractors}/${snapshot.total_extractors} extractors
${snapshot.failed_extractors > 0 ? `<span style="color:#f85149">(${snapshot.failed_extractors} failed)</span>` : ''}
${snapshot.completed_plugins}/${snapshot.total_plugins} extractors
${snapshot.failed_plugins > 0 ? `<span style="color:#f85149">(${snapshot.failed_plugins} failed)</span>` : ''}
</div>
</div>
<span class="status-badge ${snapshot.status}">${snapshot.status}</span>

View File

@@ -219,8 +219,8 @@ def test_init_quick_flag_skips_checks(tmp_path):
assert db_path.exists()
def test_init_creates_machine_record(tmp_path):
"""Test that init creates a Machine record in machine_machine table."""
def test_init_creates_machine_table(tmp_path):
"""Test that init creates the machine_machine table."""
os.chdir(tmp_path)
subprocess.run(['archivebox', 'init'], capture_output=True)
@@ -231,14 +231,10 @@ def test_init_creates_machine_record(tmp_path):
tables = c.execute(
"SELECT name FROM sqlite_master WHERE type='table' AND name='machine_machine'"
).fetchall()
assert len(tables) == 1
# Check that a machine record was created
machine_count = c.execute("SELECT COUNT(*) FROM machine_machine").fetchone()[0]
assert machine_count >= 1
conn.close()
assert len(tables) == 1
def test_init_output_shows_collection_info(tmp_path):
"""Test that init output shows helpful collection information."""