unified Process source of truth and better screenshot tests

This commit is contained in:
Nick Sweeting
2026-01-02 04:20:34 -08:00
parent 3672174dad
commit dd77511026
44 changed files with 3369 additions and 1919 deletions

View File

@@ -96,10 +96,9 @@ def add(urls: str | list[str],
     first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
     print(f' [dim]First URL: {first_url}[/dim]')
-    # 3. The CrawlMachine will create the root Snapshot when started
-    #    If URLs are from a file: first URL = file:///path/to/sources/...txt
-    #    Parser extractors will run on it and discover more URLs
-    #    Those URLs become child Snapshots (depth=1)
+    # 3. The CrawlMachine will create Snapshots from all URLs when started
+    #    Parser extractors run on snapshots and discover more URLs
+    #    Discovered URLs become child Snapshots (depth+1)
     if index_only:
         # Just create the crawl but don't start processing
@@ -119,10 +118,9 @@ def add(urls: str | list[str],
     # 5. Start the orchestrator to process the queue
     # The orchestrator will:
-    #   - Process Crawl -> create root Snapshot
-    #   - Process root Snapshot -> run parser extractors -> discover URLs
-    #   - Create child Snapshots from discovered URLs
-    #   - Process child Snapshots -> run extractors
+    #   - Process Crawl -> create Snapshots from all URLs
+    #   - Process Snapshots -> run extractors
+    #   - Parser extractors discover new URLs -> create child Snapshots
     #   - Repeat until max_depth reached
     if bg:
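
Roughly, the flow those comments describe looks like this (a minimal sketch; run_crawl/run_extractors and the in-memory queue are hypothetical stand-ins for the real DB-backed orchestrator):

# Hypothetical sketch only -- the real orchestrator processes Crawl/Snapshot
# rows in the database, but the depth+1 expansion works like this:
from collections import deque

def run_crawl(urls: list[str], max_depth: int = 1) -> None:
    queue: deque[tuple[str, int]] = deque((url, 0) for url in urls)  # Crawl -> Snapshots for all URLs
    seen = set(urls)
    while queue:
        url, depth = queue.popleft()
        discovered = run_extractors(url)   # parser extractors return any URLs they find
        if depth >= max_depth:
            continue                       # stop expanding once max_depth is reached
        for child_url in discovered:
            if child_url not in seen:      # dedupe, then enqueue as a child Snapshot
                seen.add(child_url)
                queue.append((child_url, depth + 1))

def run_extractors(url: str) -> list[str]:
    return []  # placeholder: run archiving + parser hooks for one Snapshot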

View File

@@ -160,10 +160,12 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
     CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
     CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
     CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
+    (CONSTANTS.DEFAULT_LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
     from archivebox.config.common import STORAGE_CONFIG
     STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
     STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
+    (STORAGE_CONFIG.LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
     if install:
         from archivebox.cli.archivebox_install import install as install_method

View File

@@ -96,33 +96,45 @@ ARCHIVERESULT_MACHINE_DIAGRAM = """
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────┐                                                            │
│  │   QUEUED    │◄────────────────┐                                          │
│  │  (initial)  │                 │                                          │
│  └──┬───────┬──┘                 │ tick() unless can_start()                │
│     │       │                    │                                          │
│     │       │ exceeded_max_      │                                          │
│     │       │ attempts           │                                          │
│     │       ▼                    │                                          │
│     │  ┌──────────┐              │                                          │
│     │  │ SKIPPED  │              │                                          │
│     │  │ (final)  │              │                                          │
│     │  └──────────┘              │                                          │
│     │ tick() when                │                                          │
│     │ can_start()                │                                          │
│     ▼                            │                                          │
│  ┌─────────────┐                 │                                          │
│  │   STARTED   │─────────────────┘                                          │
│  │             │◄───────────────────────────────────────────┐               │
│  │ enter:      │                                            │               │
│  │ result.run()│ tick() unless is_finished()                │               │
│  │ (execute    │                                            │               │
│  │  hook via   │                                            │               │
│  │  run_hook())│                                            │               │
│  └──────┬──────┘                                            │               │
│         │                                                   │               │
│         │ tick() checks status set by hook output           │               │
│         ├─────────────┬─────────────┬─────────────┐         │               │
│         ▼             ▼             ▼             ▼         │               │
│   ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐   │               │
│   │ SUCCEEDED │ │  FAILED   │ │  SKIPPED  │ │  BACKOFF  │   │               │
│   │  (final)  │ │  (final)  │ │  (final)  │ │           │   │               │
│   └───────────┘ └───────────┘ └───────────┘ └──┬─────┬──┘   │               │
│                                  exceeded_max_ │     │      │               │
│                                  attempts      │     └──────┘               │
│                                                ▼       can_start()          │
│                                           ┌──────────┐ loops back           │
│                                           │ SKIPPED  │ to STARTED           │
│                                           │ (final)  │                      │
│                                           └──────────┘                      │
│                                                                             │
│  Each ArchiveResult runs ONE specific hook (stored in .hook_name field)     │
└─────────────────────────────────────────────────────────────────────────────┘
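
As a reading aid, the tick() flow in the diagram above, sketched in Python (status and method names are taken from the diagram; the attribute access is an assumption, not the real model API):

# Hedged sketch of the ArchiveResult state machine above -- attribute and
# method names are assumptions based on the diagram, not the real API.
def tick(result) -> None:
    if result.status == 'queued':
        if result.exceeded_max_attempts():
            result.status = 'skipped'            # QUEUED -> SKIPPED (final)
        elif result.can_start():
            result.status = 'started'
            result.run()                         # enter: execute hook via run_hook()
        # else: stay QUEUED until can_start()
    elif result.status == 'started':
        if result.is_finished():
            result.status = result.hook_status   # SUCCEEDED / FAILED / SKIPPED / BACKOFF
    elif result.status == 'backoff':
        if result.exceeded_max_attempts():
            result.status = 'skipped'            # BACKOFF -> SKIPPED (final)
        elif result.can_start():
            result.status = 'started'            # can_start() loops back to STARTED
            result.run()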
@@ -137,35 +149,38 @@ BINARY_MACHINE_DIAGRAM = """
│  ┌─────────────┐                                                            │
│  │   QUEUED    │◄────────────────┐                                          │
│  │  (initial)  │                 │                                          │
│  └──────┬──────┘                 │ tick() unless can_install()              │
│         │                        │ (stays queued if failed)                 │
│         │ can_install()          │                                          │
│         │                        │                                          │
│         │ on_install() runs      │                                          │
│         │ during transition:     │                                          │
│         │ • binary.run()         │                                          │
│         │   (discover Binary     │                                          │
│         │   hooks, try each      │                                          │
│         │   provider until       │                                          │
│         │   one succeeds)        │                                          │
│         │ • Sets abspath,        │                                          │
│         │   version, sha256      │                                          │
│         │                        │                                          │
│         │ If install fails:      │                                          │
│         │ raises exception───────┘                                          │
│         │ (retry_at bumped)                                                 │
│         ▼                                                                   │
│  ┌─────────────┐                                                            │
│  │  INSTALLED  │                                                            │
│  │   (final)   │                                                            │
│  │             │                                                            │
│  │ Binary is   │                                                            │
│  │ ready to    │                                                            │
│  │ use         │                                                            │
│  └─────────────┘                                                            │
│                                                                             │
│  Hooks triggered: on_Binary__* (provider hooks during transition)           │
│  Providers tried in sequence until one succeeds: apt, brew, pip, npm, etc.  │
│  Installation is synchronous - no intermediate STARTED state                │
└─────────────────────────────────────────────────────────────────────────────┘
"""

View File

@@ -109,15 +109,18 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> dict:
     if not archive_dir.exists():
         return stats
-    print('[*] Scanning for old directories in archive/...')
+    print('[DEBUG Phase1] Scanning for old directories in archive/...')
     # Scan for real directories only (skip symlinks - they're already migrated)
+    all_entries = list(os.scandir(archive_dir))
+    print(f'[DEBUG Phase1] Total entries in archive/: {len(all_entries)}')
     entries = [
         (e.stat().st_mtime, e.path)
-        for e in os.scandir(archive_dir)
+        for e in all_entries
         if e.is_dir(follow_symlinks=False)  # Skip symlinks
     ]
     entries.sort(reverse=True)  # Newest first
+    print(f'[DEBUG Phase1] Real directories (not symlinks): {len(entries)}')
     print(f'[*] Found {len(entries)} old directories to drain')
     for mtime, entry_path in entries:
@@ -142,14 +145,48 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> dict:
             print(f" [{stats['processed']}] Invalid: {entry_path.name}")
             continue
+        # Ensure snapshot has a valid crawl (migration 0024 may have failed)
+        from archivebox.crawls.models import Crawl
+        has_valid_crawl = False
+        if snapshot.crawl_id:
+            # Check if the crawl actually exists
+            has_valid_crawl = Crawl.objects.filter(id=snapshot.crawl_id).exists()
+        if not has_valid_crawl:
+            # Create a new crawl (created_by will default to system user)
+            crawl = Crawl.objects.create(urls=snapshot.url)
+            # Use queryset update to avoid triggering save() hooks
+            from archivebox.core.models import Snapshot as SnapshotModel
+            SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl)
+            # Refresh the instance
+            snapshot.crawl = crawl
+            snapshot.crawl_id = crawl.id
+            print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}")
         # Check if needs migration (0.8.x → 0.9.x)
+        print(f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
         if snapshot.fs_migration_needed:
             try:
-                # Manually trigger filesystem migration without full save()
-                # This avoids UNIQUE constraint issues while still migrating files
-                cleanup_info = None
-                if hasattr(snapshot, '_fs_migrate_from_0_8_0_to_0_9_0'):
-                    cleanup_info = snapshot._fs_migrate_from_0_8_0_to_0_9_0()
+                # Calculate paths using actual directory (entry_path), not snapshot.timestamp
+                # because snapshot.timestamp might be truncated
+                old_dir = entry_path
+                new_dir = snapshot.get_storage_path_for_version('0.9.0')
+                print(f"[DEBUG Phase1] Migrating {old_dir.name} -> {new_dir}")
+                # Manually migrate files
+                if not new_dir.exists() and old_dir.exists():
+                    new_dir.mkdir(parents=True, exist_ok=True)
+                    import shutil
+                    file_count = 0
+                    for old_file in old_dir.rglob('*'):
+                        if old_file.is_file():
+                            rel_path = old_file.relative_to(old_dir)
+                            new_file = new_dir / rel_path
+                            if not new_file.exists():
+                                new_file.parent.mkdir(parents=True, exist_ok=True)
+                                shutil.copy2(old_file, new_file)
+                                file_count += 1
+                    print(f"[DEBUG Phase1] Copied {file_count} files")
                 # Update only fs_version field using queryset update (bypasses validation)
                 from archivebox.core.models import Snapshot as SnapshotModel
@@ -158,9 +195,8 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> dict:
                 # Commit the transaction
                 transaction.commit()
-                # Manually call cleanup since we bypassed normal save() flow
-                if cleanup_info:
-                    old_dir, new_dir = cleanup_info
+                # Cleanup: delete old dir and create symlink
+                if old_dir.exists() and old_dir != new_dir:
                     snapshot._cleanup_old_migration_dir(old_dir, new_dir)
             stats['migrated'] += 1
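
For context, the cleanup call above roughly amounts to swapping the drained 0.8.x directory for a symlink (a sketch of the general shape, not the actual _cleanup_old_migration_dir implementation):

import shutil
from pathlib import Path

# Sketch only -- approximates what a cleanup like this typically does after the
# files have already been copied over with shutil.copy2().
def cleanup_old_migration_dir(old_dir: Path, new_dir: Path) -> None:
    if not old_dir.exists() or old_dir.resolve() == new_dir.resolve():
        return                                              # nothing to clean up
    shutil.rmtree(old_dir)                                  # drop the old 0.8.x directory
    old_dir.symlink_to(new_dir, target_is_directory=True)   # leave a symlink behind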
@@ -207,19 +243,39 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
             continue
         try:
-            # Reconcile index.json with DB
-            snapshot.reconcile_with_index_json()
+            print(f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
+            # Check if snapshot has a directory on disk
+            from pathlib import Path
+            output_dir = Path(snapshot.output_dir)
+            has_directory = output_dir.exists() and output_dir.is_dir()
+            # Only reconcile if directory exists (don't create empty directories for orphans)
+            if has_directory:
+                snapshot.reconcile_with_index_json()
             # Clean up invalid field values from old migrations
             if not isinstance(snapshot.current_step, int):
                 snapshot.current_step = 0
+            # If still needs migration, it's an orphan (no directory on disk)
+            # Mark it as migrated to prevent save() from triggering filesystem migration
+            if snapshot.fs_migration_needed:
+                if has_directory:
+                    print(f"[DEBUG Phase2] WARNING: Snapshot {str(snapshot.id)[:8]} has directory but still needs migration")
+                else:
+                    print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation")
+                    # Use queryset update to set fs_version without triggering save() hooks
+                    from archivebox.core.models import Snapshot as SnapshotModel
+                    SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
+                    snapshot.fs_version = '0.9.0'
             # Queue for archiving (state machine will handle it)
             snapshot.status = Snapshot.StatusChoices.QUEUED
             snapshot.retry_at = timezone.now()
             snapshot.save()
-            stats['reconciled'] += 1
+            stats['reconciled'] += 1 if has_directory else 0
             stats['queued'] += 1
         except Exception as e:
             # Skip snapshots that can't be processed (e.g., missing crawl)