fix orchestrator state machine and Process-from-ArchiveResult migrations

Nick Sweeting
2026-01-01 16:43:02 -08:00
parent 876feac522
commit 60422adc87
13 changed files with 378 additions and 96 deletions

View File

@@ -30,6 +30,7 @@ from .test_migrations_helpers import (
     verify_foreign_keys,
     verify_all_snapshots_in_output,
     verify_crawl_count,
+    verify_process_migration,
 )
@@ -260,6 +261,54 @@ class TestMigrationFrom08x(unittest.TestCase):
         self.assertTrue('ArchiveBox' in output or 'version' in output.lower(),
                         f"Version output missing expected content: {output[:500]}")
+
+    def test_migration_creates_process_records(self):
+        """Migration should create Process records for all ArchiveResults."""
+        result = run_archivebox(self.work_dir, ['init'], timeout=45)
+        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
+
+        # Verify Process records created
+        expected_count = len(self.original_data['archiveresults'])
+        ok, msg = verify_process_migration(self.db_path, expected_count)
+        self.assertTrue(ok, msg)
+
+    def test_migration_creates_binary_records(self):
+        """Migration should create Binary records from cmd_version data."""
+        result = run_archivebox(self.work_dir, ['init'], timeout=45)
+        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
+
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        # Check Binary records exist
+        cursor.execute("SELECT COUNT(*) FROM machine_binary")
+        binary_count = cursor.fetchone()[0]
+
+        # Should have at least one binary per unique extractor
+        extractors = set(ar['extractor'] for ar in self.original_data['archiveresults'])
+        self.assertGreaterEqual(binary_count, len(extractors),
+                                f"Expected at least {len(extractors)} Binaries, got {binary_count}")
+        conn.close()
+
+    def test_migration_preserves_cmd_data(self):
+        """Migration should preserve cmd data in Process.cmd field."""
+        result = run_archivebox(self.work_dir, ['init'], timeout=45)
+        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
+
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        # Check that Process records have cmd arrays
+        cursor.execute("SELECT cmd FROM machine_process WHERE cmd != '[]'")
+        cmd_records = cursor.fetchall()
+
+        # All Processes should have non-empty cmd (test data has json.dumps([extractor, '--version']))
+        expected_count = len(self.original_data['archiveresults'])
+        self.assertEqual(len(cmd_records), expected_count,
+                         f"Expected {expected_count} Processes with cmd, got {len(cmd_records)}")
+        conn.close()
+
+
 class TestMigrationDataIntegrity08x(unittest.TestCase):
     """Comprehensive data integrity tests for 0.8.x migrations."""

View File

@@ -730,44 +730,26 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
         tag_id = cursor.lastrowid
         created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
 
-    # Create Seeds first (required for 0.8.x Crawls)
-    test_seeds = [
-        ('https://example.com', 'auto', 'Example Seed'),
-        ('https://github.com/ArchiveBox', 'auto', 'GitHub Seed'),
-    ]
-    created_data['seeds'] = []
-    for uri, extractor, label in test_seeds:
-        seed_id = generate_uuid()
-        cursor.execute("""
-            INSERT INTO crawls_seed (id, created_at, created_by_id, modified_at, uri,
-                                     extractor, tags_str, label, config, output_dir, notes,
-                                     num_uses_failed, num_uses_succeeded)
-            VALUES (?, datetime('now'), ?, datetime('now'), ?, ?, '', ?, '{}', '', '', 0, 0)
-        """, (seed_id, user_id, uri, extractor, label))
-        created_data['seeds'].append({'id': seed_id, 'uri': uri, 'label': label})
-
-    # Create 2 Crawls (linked to Seeds)
+    # Create 2 Crawls (0.9.0 schema - no seeds)
     test_crawls = [
-        ('https://example.com\nhttps://example.org', 0, 'Example Crawl', created_data['seeds'][0]['id']),
-        ('https://github.com/ArchiveBox', 1, 'GitHub Crawl', created_data['seeds'][1]['id']),
+        ('https://example.com\nhttps://example.org', 0, 'Example Crawl'),
+        ('https://github.com/ArchiveBox', 1, 'GitHub Crawl'),
     ]
-    for i, (urls, max_depth, label, seed_id) in enumerate(test_crawls):
+    for i, (urls, max_depth, label) in enumerate(test_crawls):
         crawl_id = generate_uuid()
         cursor.execute("""
-            INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, seed_id, urls,
+            INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, urls,
                                       config, max_depth, tags_str, label, status, retry_at,
                                       num_uses_failed, num_uses_succeeded)
-            VALUES (?, datetime('now'), ?, datetime('now'), ?, ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
-        """, (crawl_id, user_id, seed_id, urls, max_depth, label))
+            VALUES (?, datetime('now'), ?, datetime('now'), ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
+        """, (crawl_id, user_id, urls, max_depth, label))
         created_data['crawls'].append({
             'id': crawl_id,
             'urls': urls,
             'max_depth': max_depth,
             'label': label,
-            'seed_id': seed_id,
         })
 
     # Create 5 snapshots linked to crawls
@@ -1146,3 +1128,64 @@ def verify_crawl_count(db_path: Path, expected: int) -> Tuple[bool, str]:
     if count == expected:
         return True, f"Crawl count OK: {count}"
     return False, f"Crawl count mismatch: expected {expected}, got {count}"
+
+
+def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -> Tuple[bool, str]:
+    """
+    Verify that ArchiveResults were properly migrated to Process records.
+    Checks:
+    1. All ArchiveResults have process_id set
+    2. Process count matches ArchiveResult count
+    3. Binary records created for unique cmd_version values
+    4. Status mapping is correct
+    """
+    conn = sqlite3.connect(str(db_path))
+    cursor = conn.cursor()
+
+    # Check all ArchiveResults have process_id
+    cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NULL")
+    null_count = cursor.fetchone()[0]
+    if null_count > 0:
+        conn.close()
+        return False, f"Found {null_count} ArchiveResults without process_id"
+
+    # Check Process count
+    cursor.execute("SELECT COUNT(*) FROM machine_process")
+    process_count = cursor.fetchone()[0]
+    if process_count != expected_archiveresult_count:
+        conn.close()
+        return False, f"Expected {expected_archiveresult_count} Processes, got {process_count}"
+
+    # Check status mapping
+    cursor.execute("""
+        SELECT ar.status, p.status, p.exit_code
+        FROM core_archiveresult ar
+        JOIN machine_process p ON ar.process_id = p.id
+    """)
+    status_errors = []
+    for ar_status, p_status, p_exit_code in cursor.fetchall():
+        expected_p_status, expected_exit_code = {
+            'queued': ('queued', None),
+            'started': ('running', None),
+            'backoff': ('queued', None),
+            'succeeded': ('exited', 0),
+            'failed': ('exited', 1),
+            'skipped': ('exited', None),
+        }.get(ar_status, ('queued', None))
+        if p_status != expected_p_status:
+            status_errors.append(f"AR status {ar_status} → Process {p_status}, expected {expected_p_status}")
+        if p_exit_code != expected_exit_code:
+            status_errors.append(f"AR status {ar_status} → exit_code {p_exit_code}, expected {expected_exit_code}")
+
+    if status_errors:
+        conn.close()
+        return False, f"Status mapping errors: {'; '.join(status_errors[:5])}"
+
+    conn.close()
+    return True, f"Process migration verified: {process_count} Processes created"
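
Because the helper returns an (ok, message) pair, it can also be run ad hoc against an existing collection's database outside the test suite. A small usage sketch; the path and expected count below are illustrative:

from pathlib import Path

ok, msg = verify_process_migration(Path('data/index.sqlite3'), expected_archiveresult_count=10)
assert ok, msg  # e.g. "Process migration verified: 10 Processes created"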