fix orchestrator state machine and Process-from-ArchiveResult migrations

Nick Sweeting
2026-01-01 16:43:02 -08:00
parent 876feac522
commit 60422adc87
13 changed files with 378 additions and 96 deletions

View File

@@ -30,6 +30,7 @@ from .test_migrations_helpers import (
     verify_foreign_keys,
     verify_all_snapshots_in_output,
     verify_crawl_count,
+    verify_process_migration,
 )
@@ -260,6 +261,54 @@ class TestMigrationFrom08x(unittest.TestCase):
         self.assertTrue('ArchiveBox' in output or 'version' in output.lower(),
                         f"Version output missing expected content: {output[:500]}")
+
+    def test_migration_creates_process_records(self):
+        """Migration should create Process records for all ArchiveResults."""
+        result = run_archivebox(self.work_dir, ['init'], timeout=45)
+        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
+
+        # Verify Process records created
+        expected_count = len(self.original_data['archiveresults'])
+        ok, msg = verify_process_migration(self.db_path, expected_count)
+        self.assertTrue(ok, msg)
+
+    def test_migration_creates_binary_records(self):
+        """Migration should create Binary records from cmd_version data."""
+        result = run_archivebox(self.work_dir, ['init'], timeout=45)
+        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
+
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        # Check Binary records exist
+        cursor.execute("SELECT COUNT(*) FROM machine_binary")
+        binary_count = cursor.fetchone()[0]
+
+        # Should have at least one binary per unique extractor
+        extractors = set(ar['extractor'] for ar in self.original_data['archiveresults'])
+        self.assertGreaterEqual(binary_count, len(extractors),
+                                f"Expected at least {len(extractors)} Binaries, got {binary_count}")
+        conn.close()
+
+    def test_migration_preserves_cmd_data(self):
+        """Migration should preserve cmd data in Process.cmd field."""
+        result = run_archivebox(self.work_dir, ['init'], timeout=45)
+        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
+
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+
+        # Check that Process records have cmd arrays
+        cursor.execute("SELECT cmd FROM machine_process WHERE cmd != '[]'")
+        cmd_records = cursor.fetchall()
+
+        # All Processes should have non-empty cmd (test data has json.dumps([extractor, '--version']))
+        expected_count = len(self.original_data['archiveresults'])
+        self.assertEqual(len(cmd_records), expected_count,
+                         f"Expected {expected_count} Processes with cmd, got {len(cmd_records)}")
+        conn.close()
+
+
 class TestMigrationDataIntegrity08x(unittest.TestCase):
     """Comprehensive data integrity tests for 0.8.x migrations."""

View File

@@ -730,44 +730,26 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
         tag_id = cursor.lastrowid
         created_data['tags'].append({'id': tag_id, 'name': name, 'slug': name.lower()})
 
-    # Create Seeds first (required for 0.8.x Crawls)
-    test_seeds = [
-        ('https://example.com', 'auto', 'Example Seed'),
-        ('https://github.com/ArchiveBox', 'auto', 'GitHub Seed'),
-    ]
-    created_data['seeds'] = []
-    for uri, extractor, label in test_seeds:
-        seed_id = generate_uuid()
-        cursor.execute("""
-            INSERT INTO crawls_seed (id, created_at, created_by_id, modified_at, uri,
-                                     extractor, tags_str, label, config, output_dir, notes,
-                                     num_uses_failed, num_uses_succeeded)
-            VALUES (?, datetime('now'), ?, datetime('now'), ?, ?, '', ?, '{}', '', '', 0, 0)
-        """, (seed_id, user_id, uri, extractor, label))
-        created_data['seeds'].append({'id': seed_id, 'uri': uri, 'label': label})
-
-    # Create 2 Crawls (linked to Seeds)
+    # Create 2 Crawls (0.9.0 schema - no seeds)
     test_crawls = [
-        ('https://example.com\nhttps://example.org', 0, 'Example Crawl', created_data['seeds'][0]['id']),
-        ('https://github.com/ArchiveBox', 1, 'GitHub Crawl', created_data['seeds'][1]['id']),
+        ('https://example.com\nhttps://example.org', 0, 'Example Crawl'),
+        ('https://github.com/ArchiveBox', 1, 'GitHub Crawl'),
     ]
-    for i, (urls, max_depth, label, seed_id) in enumerate(test_crawls):
+    for i, (urls, max_depth, label) in enumerate(test_crawls):
         crawl_id = generate_uuid()
         cursor.execute("""
-            INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, seed_id, urls,
+            INSERT INTO crawls_crawl (id, created_at, created_by_id, modified_at, urls,
                                       config, max_depth, tags_str, label, status, retry_at,
                                       num_uses_failed, num_uses_succeeded)
-            VALUES (?, datetime('now'), ?, datetime('now'), ?, ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
-        """, (crawl_id, user_id, seed_id, urls, max_depth, label))
+            VALUES (?, datetime('now'), ?, datetime('now'), ?, '{}', ?, '', ?, 'queued', datetime('now'), 0, 0)
+        """, (crawl_id, user_id, urls, max_depth, label))
         created_data['crawls'].append({
             'id': crawl_id,
             'urls': urls,
             'max_depth': max_depth,
             'label': label,
-            'seed_id': seed_id,
         })
 
     # Create 5 snapshots linked to crawls
@@ -1146,3 +1128,64 @@ def verify_crawl_count(db_path: Path, expected: int) -> Tuple[bool, str]:
     if count == expected:
         return True, f"Crawl count OK: {count}"
     return False, f"Crawl count mismatch: expected {expected}, got {count}"
+
+
+def verify_process_migration(db_path: Path, expected_archiveresult_count: int) -> Tuple[bool, str]:
+    """
+    Verify that ArchiveResults were properly migrated to Process records.
+    Checks:
+    1. All ArchiveResults have process_id set
+    2. Process count matches ArchiveResult count
+    3. Binary records created for unique cmd_version values
+    4. Status mapping is correct
+    """
+    conn = sqlite3.connect(str(db_path))
+    cursor = conn.cursor()
+
+    # Check all ArchiveResults have process_id
+    cursor.execute("SELECT COUNT(*) FROM core_archiveresult WHERE process_id IS NULL")
+    null_count = cursor.fetchone()[0]
+    if null_count > 0:
+        conn.close()
+        return False, f"Found {null_count} ArchiveResults without process_id"
+
+    # Check Process count
+    cursor.execute("SELECT COUNT(*) FROM machine_process")
+    process_count = cursor.fetchone()[0]
+    if process_count != expected_archiveresult_count:
+        conn.close()
+        return False, f"Expected {expected_archiveresult_count} Processes, got {process_count}"
+
+    # Check status mapping
+    cursor.execute("""
+        SELECT ar.status, p.status, p.exit_code
+        FROM core_archiveresult ar
+        JOIN machine_process p ON ar.process_id = p.id
+    """)
+    status_errors = []
+    for ar_status, p_status, p_exit_code in cursor.fetchall():
+        expected_p_status, expected_exit_code = {
+            'queued': ('queued', None),
+            'started': ('running', None),
+            'backoff': ('queued', None),
+            'succeeded': ('exited', 0),
+            'failed': ('exited', 1),
+            'skipped': ('exited', None),
+        }.get(ar_status, ('queued', None))
+        if p_status != expected_p_status:
+            status_errors.append(f"AR status {ar_status} → Process {p_status}, expected {expected_p_status}")
+        if p_exit_code != expected_exit_code:
+            status_errors.append(f"AR status {ar_status} → exit_code {p_exit_code}, expected {expected_exit_code}")
+
+    if status_errors:
+        conn.close()
+        return False, f"Status mapping errors: {'; '.join(status_errors[:5])}"
+
+    conn.close()
+    return True, f"Process migration verified: {process_count} Processes created"
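
Because the helper returns an (ok, message) pair, it can also be run ad hoc against an existing collection's database outside the test suite. A small usage sketch; the path and expected count below are illustrative:

from pathlib import Path

ok, msg = verify_process_migration(Path('data/index.sqlite3'), expected_archiveresult_count=10)
assert ok, msg  # e.g. "Process migration verified: 10 Processes created"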