unified Process source of truth and better screenshot tests

This commit is contained in:
Nick Sweeting
2026-01-02 04:20:34 -08:00
parent 3672174dad
commit dd77511026
44 changed files with 3369 additions and 1919 deletions

View File

@@ -96,10 +96,9 @@ def add(urls: str | list[str],
     first_url = crawl.get_urls_list()[0] if crawl.get_urls_list() else ''
     print(f' [dim]First URL: {first_url}[/dim]')
-    # 3. The CrawlMachine will create the root Snapshot when started
-    #    If URLs are from a file: first URL = file:///path/to/sources/...txt
-    #    Parser extractors will run on it and discover more URLs
-    #    Those URLs become child Snapshots (depth=1)
+    # 3. The CrawlMachine will create Snapshots from all URLs when started
+    #    Parser extractors run on snapshots and discover more URLs
+    #    Discovered URLs become child Snapshots (depth+1)
     if index_only:
         # Just create the crawl but don't start processing
@@ -119,10 +118,9 @@ def add(urls: str | list[str],
     # 5. Start the orchestrator to process the queue
     # The orchestrator will:
-    #   - Process Crawl -> create root Snapshot
-    #   - Process root Snapshot -> run parser extractors -> discover URLs
-    #   - Create child Snapshots from discovered URLs
-    #   - Process child Snapshots -> run extractors
+    #   - Process Crawl -> create Snapshots from all URLs
+    #   - Process Snapshots -> run extractors
+    #   - Parser extractors discover new URLs -> create child Snapshots
     #   - Repeat until max_depth reached
     if bg:
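
Roughly, the flow those comments describe looks like this (a minimal sketch; run_crawl/run_extractors and the in-memory queue are hypothetical stand-ins for the real DB-backed orchestrator):

# Hypothetical sketch only -- the real orchestrator processes Crawl/Snapshot
# rows in the database, but the depth+1 expansion works like this:
from collections import deque

def run_crawl(urls: list[str], max_depth: int = 1) -> None:
    queue: deque[tuple[str, int]] = deque((url, 0) for url in urls)  # Crawl -> Snapshots for all URLs
    seen = set(urls)
    while queue:
        url, depth = queue.popleft()
        discovered = run_extractors(url)   # parser extractors return any URLs they find
        if depth >= max_depth:
            continue                       # stop expanding once max_depth is reached
        for child_url in discovered:
            if child_url not in seen:      # dedupe, then enqueue as a child Snapshot
                seen.add(child_url)
                queue.append((child_url, depth + 1))

def run_extractors(url: str) -> list[str]:
    return []  # placeholder: run archiving + parser hooks for one Snapshot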

View File

@@ -160,10 +160,12 @@ def init(force: bool=False, quick: bool=False, install: bool=False) -> None:
     CONSTANTS.PERSONAS_DIR.mkdir(parents=True, exist_ok=True)
     CONSTANTS.DEFAULT_TMP_DIR.mkdir(parents=True, exist_ok=True)
     CONSTANTS.DEFAULT_LIB_DIR.mkdir(parents=True, exist_ok=True)
+    (CONSTANTS.DEFAULT_LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
     from archivebox.config.common import STORAGE_CONFIG
     STORAGE_CONFIG.TMP_DIR.mkdir(parents=True, exist_ok=True)
     STORAGE_CONFIG.LIB_DIR.mkdir(parents=True, exist_ok=True)
+    (STORAGE_CONFIG.LIB_DIR / 'bin').mkdir(parents=True, exist_ok=True)
     if install:
         from archivebox.cli.archivebox_install import install as install_method

View File

@@ -96,33 +96,45 @@ ARCHIVERESULT_MACHINE_DIAGRAM = """
├─────────────────────────────────────────────────────────────────────────────┤
│                                                                             │
│  ┌─────────────┐                                                            │
│  │   QUEUED    │◄────────────────┐                                          │
│  │  (initial)  │                 │                                          │
│  └──┬───────┬──┘                 │ tick() unless can_start()                │
│     │       │                    │                                          │
│     │       │ exceeded_max_      │                                          │
│     │       │ attempts           │                                          │
│     │       ▼                    │                                          │
│     │  ┌──────────┐              │                                          │
│     │  │ SKIPPED  │              │                                          │
│     │  │ (final)  │              │                                          │
│     │  └──────────┘              │                                          │
│     │ tick() when                │                                          │
│     │ can_start()                │                                          │
│     ▼                            │                                          │
│  ┌─────────────┐                 │                                          │
│  │   STARTED   │─────────────────┘                                          │
│  │             │◄───────────────────────────────────────────┐               │
│  │ enter:      │                                            │               │
│  │ result.run()│ tick() unless is_finished()                │               │
│  │ (execute    │                                            │               │
│  │  hook via   │                                            │               │
│  │  run_hook())│                                            │               │
│  └──────┬──────┘                                            │               │
│         │                                                   │               │
│         │ tick() checks status set by hook output           │               │
│         ├─────────────┬─────────────┬─────────────┐         │               │
│         ▼             ▼             ▼             ▼         │               │
│   ┌───────────┐ ┌───────────┐ ┌───────────┐ ┌───────────┐   │               │
│   │ SUCCEEDED │ │  FAILED   │ │  SKIPPED  │ │  BACKOFF  │   │               │
│   │  (final)  │ │  (final)  │ │  (final)  │ │           │   │               │
│   └───────────┘ └───────────┘ └───────────┘ └──┬─────┬──┘   │               │
│                                  exceeded_max_ │     │      │               │
│                                  attempts      │     └──────┘               │
│                                                ▼       can_start()          │
│                                           ┌──────────┐ loops back           │
│                                           │ SKIPPED  │ to STARTED           │
│                                           │ (final)  │                      │
│                                           └──────────┘                      │
│                                                                             │
│  Each ArchiveResult runs ONE specific hook (stored in .hook_name field)     │
└─────────────────────────────────────────────────────────────────────────────┘
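
As a reading aid, the tick() flow in the diagram above, sketched in Python (status and method names are taken from the diagram; the attribute access is an assumption, not the real model API):

# Hedged sketch of the ArchiveResult state machine above -- attribute and
# method names are assumptions based on the diagram, not the real API.
def tick(result) -> None:
    if result.status == 'queued':
        if result.exceeded_max_attempts():
            result.status = 'skipped'            # QUEUED -> SKIPPED (final)
        elif result.can_start():
            result.status = 'started'
            result.run()                         # enter: execute hook via run_hook()
        # else: stay QUEUED until can_start()
    elif result.status == 'started':
        if result.is_finished():
            result.status = result.hook_status   # SUCCEEDED / FAILED / SKIPPED / BACKOFF
    elif result.status == 'backoff':
        if result.exceeded_max_attempts():
            result.status = 'skipped'            # BACKOFF -> SKIPPED (final)
        elif result.can_start():
            result.status = 'started'            # can_start() loops back to STARTED
            result.run()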
@@ -137,35 +149,38 @@ BINARY_MACHINE_DIAGRAM = """
│  ┌─────────────┐                                                            │
│  │   QUEUED    │◄────────────────┐                                          │
│  │  (initial)  │                 │                                          │
│  └──────┬──────┘                 │ tick() unless can_install()              │
│         │                        │ (stays queued if failed)                 │
│         │ can_install()          │                                          │
│         │                        │                                          │
│         │ on_install() runs      │                                          │
│         │ during transition:     │                                          │
│         │ • binary.run()         │                                          │
│         │   (discover Binary     │                                          │
│         │   hooks, try each      │                                          │
│         │   provider until       │                                          │
│         │   one succeeds)        │                                          │
│         │ • Sets abspath,        │                                          │
│         │   version, sha256      │                                          │
│         │                        │                                          │
│         │ If install fails:      │                                          │
│         │ raises exception───────┘                                          │
│         │ (retry_at bumped)                                                 │
│         ▼                                                                   │
│  ┌─────────────┐                                                            │
│  │  INSTALLED  │                                                            │
│  │   (final)   │                                                            │
│  │             │                                                            │
│  │ Binary is   │                                                            │
│  │ ready to    │                                                            │
│  │ use         │                                                            │
│  └─────────────┘                                                            │
│                                                                             │
│  Hooks triggered: on_Binary__* (provider hooks during transition)           │
│  Providers tried in sequence until one succeeds: apt, brew, pip, npm, etc.  │
│  Installation is synchronous - no intermediate STARTED state                │
└─────────────────────────────────────────────────────────────────────────────┘
"""

View File

@@ -109,15 +109,18 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> dict:
     if not archive_dir.exists():
         return stats
-    print('[*] Scanning for old directories in archive/...')
+    print('[DEBUG Phase1] Scanning for old directories in archive/...')
     # Scan for real directories only (skip symlinks - they're already migrated)
+    all_entries = list(os.scandir(archive_dir))
+    print(f'[DEBUG Phase1] Total entries in archive/: {len(all_entries)}')
     entries = [
         (e.stat().st_mtime, e.path)
-        for e in os.scandir(archive_dir)
+        for e in all_entries
         if e.is_dir(follow_symlinks=False)  # Skip symlinks
     ]
     entries.sort(reverse=True)  # Newest first
+    print(f'[DEBUG Phase1] Real directories (not symlinks): {len(entries)}')
     print(f'[*] Found {len(entries)} old directories to drain')
     for mtime, entry_path in entries:
@@ -142,14 +145,48 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> dict:
             print(f" [{stats['processed']}] Invalid: {entry_path.name}")
             continue
+        # Ensure snapshot has a valid crawl (migration 0024 may have failed)
+        from archivebox.crawls.models import Crawl
+        has_valid_crawl = False
+        if snapshot.crawl_id:
+            # Check if the crawl actually exists
+            has_valid_crawl = Crawl.objects.filter(id=snapshot.crawl_id).exists()
+        if not has_valid_crawl:
+            # Create a new crawl (created_by will default to system user)
+            crawl = Crawl.objects.create(urls=snapshot.url)
+            # Use queryset update to avoid triggering save() hooks
+            from archivebox.core.models import Snapshot as SnapshotModel
+            SnapshotModel.objects.filter(pk=snapshot.pk).update(crawl=crawl)
+            # Refresh the instance
+            snapshot.crawl = crawl
+            snapshot.crawl_id = crawl.id
+            print(f"[DEBUG Phase1] Created missing crawl for snapshot {str(snapshot.id)[:8]}")
         # Check if needs migration (0.8.x → 0.9.x)
+        print(f"[DEBUG Phase1] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
         if snapshot.fs_migration_needed:
             try:
-                # Manually trigger filesystem migration without full save()
-                # This avoids UNIQUE constraint issues while still migrating files
-                cleanup_info = None
-                if hasattr(snapshot, '_fs_migrate_from_0_8_0_to_0_9_0'):
-                    cleanup_info = snapshot._fs_migrate_from_0_8_0_to_0_9_0()
+                # Calculate paths using actual directory (entry_path), not snapshot.timestamp
+                # because snapshot.timestamp might be truncated
+                old_dir = entry_path
+                new_dir = snapshot.get_storage_path_for_version('0.9.0')
+                print(f"[DEBUG Phase1] Migrating {old_dir.name} -> {new_dir}")
+                # Manually migrate files
+                if not new_dir.exists() and old_dir.exists():
+                    new_dir.mkdir(parents=True, exist_ok=True)
+                    import shutil
+                    file_count = 0
+                    for old_file in old_dir.rglob('*'):
+                        if old_file.is_file():
+                            rel_path = old_file.relative_to(old_dir)
+                            new_file = new_dir / rel_path
+                            if not new_file.exists():
+                                new_file.parent.mkdir(parents=True, exist_ok=True)
+                                shutil.copy2(old_file, new_file)
+                                file_count += 1
+                    print(f"[DEBUG Phase1] Copied {file_count} files")
                 # Update only fs_version field using queryset update (bypasses validation)
                 from archivebox.core.models import Snapshot as SnapshotModel
@@ -158,9 +195,8 @@ def drain_old_archive_dirs(resume_from: str = None, batch_size: int = 100) -> dict:
                 # Commit the transaction
                 transaction.commit()
-                # Manually call cleanup since we bypassed normal save() flow
-                if cleanup_info:
-                    old_dir, new_dir = cleanup_info
+                # Cleanup: delete old dir and create symlink
+                if old_dir.exists() and old_dir != new_dir:
                     snapshot._cleanup_old_migration_dir(old_dir, new_dir)
             stats['migrated'] += 1
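
For context, the cleanup call above roughly amounts to swapping the drained 0.8.x directory for a symlink (a sketch of the general shape, not the actual _cleanup_old_migration_dir implementation):

import shutil
from pathlib import Path

# Sketch only -- approximates what a cleanup like this typically does after the
# files have already been copied over with shutil.copy2().
def cleanup_old_migration_dir(old_dir: Path, new_dir: Path) -> None:
    if not old_dir.exists() or old_dir.resolve() == new_dir.resolve():
        return                                              # nothing to clean up
    shutil.rmtree(old_dir)                                  # drop the old 0.8.x directory
    old_dir.symlink_to(new_dir, target_is_directory=True)   # leave a symlink behind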
@@ -207,19 +243,39 @@ def process_all_db_snapshots(batch_size: int = 100) -> dict:
             continue
         try:
-            # Reconcile index.json with DB
-            snapshot.reconcile_with_index_json()
+            print(f"[DEBUG Phase2] Snapshot {str(snapshot.id)[:8]}: fs_version={snapshot.fs_version}, needs_migration={snapshot.fs_migration_needed}")
+            # Check if snapshot has a directory on disk
+            from pathlib import Path
+            output_dir = Path(snapshot.output_dir)
+            has_directory = output_dir.exists() and output_dir.is_dir()
+            # Only reconcile if directory exists (don't create empty directories for orphans)
+            if has_directory:
+                snapshot.reconcile_with_index_json()
             # Clean up invalid field values from old migrations
             if not isinstance(snapshot.current_step, int):
                 snapshot.current_step = 0
+            # If still needs migration, it's an orphan (no directory on disk)
+            # Mark it as migrated to prevent save() from triggering filesystem migration
+            if snapshot.fs_migration_needed:
+                if has_directory:
+                    print(f"[DEBUG Phase2] WARNING: Snapshot {str(snapshot.id)[:8]} has directory but still needs migration")
+                else:
+                    print(f"[DEBUG Phase2] Orphan snapshot {str(snapshot.id)[:8]} - marking as migrated without filesystem operation")
+                    # Use queryset update to set fs_version without triggering save() hooks
+                    from archivebox.core.models import Snapshot as SnapshotModel
+                    SnapshotModel.objects.filter(pk=snapshot.pk).update(fs_version='0.9.0')
+                    snapshot.fs_version = '0.9.0'
             # Queue for archiving (state machine will handle it)
             snapshot.status = Snapshot.StatusChoices.QUEUED
             snapshot.retry_at = timezone.now()
             snapshot.save()
-            stats['reconciled'] += 1
+            stats['reconciled'] += 1 if has_directory else 0
             stats['queued'] += 1
         except Exception as e:
             # Skip snapshots that can't be processed (e.g., missing crawl)