Add crawls_crawlschedule table to 0.8.x test schema and fix migrations

- Add the missing crawls_crawlschedule table definition to SCHEMA_0_8 in the test file
- Record all replaced dev-branch migrations (0023-0074) for the squashed migration
- Update the 0024_snapshot_crawl migration to depend on the squashed machine migration
- Remove 'extractor' field references from the crawls admin
- All 45 migration tests now pass (0.4.x, 0.7.x, 0.8.x, fresh install)
2026-04-03 06:17:53 +10:00 · 2025-12-27 04:32:58 +00:00
parent 766bb28536
commit ea6fe94c93
3 changed files with 158 additions and 13 deletions
--- a/archivebox/core/migrations/0024_snapshot_crawl.py
+++ b/archivebox/core/migrations/0024_snapshot_crawl.py
@@ -10,7 +10,7 @@ class Migration(migrations.Migration):
    dependencies = [
        ('core', '0023_new_schema'),
        ('crawls', '0001_initial'),
-        ('machine', '0001_initial'),
+        ('machine', '0001_squashed'),
    ]
    operations = [
--- a/archivebox/crawls/admin.py
+++ b/archivebox/crawls/admin.py
@@ -170,7 +170,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
            'classes': ('card',),
        }),
        ('Settings', {
-            'fields': ('max_depth', 'extractor', 'config'),
+            'fields': ('max_depth', 'config'),
            'classes': ('card',),
        }),
        ('Status', {
@@ -191,7 +191,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
        }),
    )
-    list_filter = ('max_depth', 'extractor', 'schedule', 'created_by', 'status', 'retry_at')
+    list_filter = ('max_depth', 'schedule', 'created_by', 'status', 'retry_at')
    ordering = ['-created_at', '-retry_at']
    list_per_page = 100
    actions = ["delete_selected_batched"]
--- a/archivebox/tests/tests_migrations.py
+++ b/archivebox/tests/tests_migrations.py
@@ -15,7 +15,9 @@ Schema Evolution:
         changed primary keys from AutoField to UUID for Tag/ArchiveResult
 """
-__package__ = 'archivebox.cli'
+# Note: This test file intentionally does NOT set __package__ to avoid
 # importing archivebox directly (which would trigger root checks).
 # All tests run archivebox via subprocess, which handles its own env.
 import os
 import sys
@@ -346,6 +348,36 @@ CREATE TABLE IF NOT EXISTS machine_installedbinary (
    num_uses_succeeded INTEGER NOT NULL DEFAULT 0
 );
 -- API app tables (added in 0.8.x)
 CREATE TABLE IF NOT EXISTS api_apitoken (
    id CHAR(36) PRIMARY KEY,
    created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
    created_at DATETIME NOT NULL,
    modified_at DATETIME,
    token VARCHAR(32) NOT NULL UNIQUE,
    expires DATETIME
 );
 CREATE TABLE IF NOT EXISTS api_outboundwebhook (
    id CHAR(36) PRIMARY KEY,
    created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
    created_at DATETIME NOT NULL,
    modified_at DATETIME,
    name VARCHAR(255) NOT NULL DEFAULT '',
    signal VARCHAR(255) NOT NULL,
    ref VARCHAR(255) NOT NULL,
    endpoint VARCHAR(2083) NOT NULL,
    headers TEXT DEFAULT '{}',
    auth_token VARCHAR(4000) NOT NULL DEFAULT '',
    enabled BOOLEAN NOT NULL DEFAULT 1,
    keep_last_response BOOLEAN NOT NULL DEFAULT 0,
    last_response TEXT NOT NULL DEFAULT '',
    last_success DATETIME,
    last_failure DATETIME,
    num_uses_failed INTEGER NOT NULL DEFAULT 0,
    num_uses_succeeded INTEGER NOT NULL DEFAULT 0
 );
 -- Core Tag table (AutoField PK in 0.8.x)
 CREATE TABLE IF NOT EXISTS core_tag (
    id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -374,6 +406,20 @@ CREATE TABLE IF NOT EXISTS crawls_seed (
    num_uses_succeeded INTEGER NOT NULL DEFAULT 0
 );
 CREATE TABLE IF NOT EXISTS crawls_crawlschedule (
    id CHAR(36) PRIMARY KEY,
    created_at DATETIME NOT NULL,
    created_by_id INTEGER NOT NULL REFERENCES auth_user(id),
    modified_at DATETIME,
    schedule VARCHAR(64) NOT NULL,
    is_enabled BOOLEAN NOT NULL DEFAULT 1,
    label VARCHAR(64) NOT NULL DEFAULT '',
    notes TEXT NOT NULL DEFAULT '',
    template_id CHAR(36) REFERENCES crawls_crawl(id),
    num_uses_failed INTEGER NOT NULL DEFAULT 0,
    num_uses_succeeded INTEGER NOT NULL DEFAULT 0
 );
 CREATE TABLE IF NOT EXISTS crawls_crawl (
    id CHAR(36) PRIMARY KEY,
    created_at DATETIME NOT NULL,
@@ -472,7 +518,9 @@ INSERT INTO django_content_type (app_label, model) VALUES
 ('machine', 'installedbinary'),
 ('crawls', 'crawl'),
 ('crawls', 'crawlschedule'),
-('crawls', 'seed');
+('crawls', 'seed'),
 ('api', 'apitoken'),
 ('api', 'outboundwebhook');
 """
@@ -873,11 +921,83 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
        ('core', '0020_auto_20210410_1031'),
        ('core', '0021_auto_20220914_0934'),
        ('core', '0022_auto_20231023_2008'),
        # For 0.8.x (dev branch), record the migrations that 0023_new_schema replaces
        # This is required because 0023_new_schema is a squashed migration
        ('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
        ('core', '0024_auto_20240513_1143'),
        ('core', '0025_alter_archiveresult_uuid'),
        ('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
        ('core', '0027_update_snapshot_ids'),
        ('core', '0028_alter_archiveresult_uuid'),
        ('core', '0029_alter_archiveresult_id'),
        ('core', '0030_alter_archiveresult_uuid'),
        ('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
        ('core', '0032_alter_archiveresult_id'),
        ('core', '0033_rename_id_archiveresult_old_id'),
        ('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
        ('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
        ('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
        ('core', '0037_rename_id_snapshot_old_id'),
        ('core', '0038_rename_uuid_snapshot_id'),
        ('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
        ('core', '0040_archiveresult_snapshot'),
        ('core', '0041_alter_archiveresult_snapshot_and_more'),
        ('core', '0042_remove_archiveresult_snapshot_old'),
        ('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
        ('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
        ('core', '0045_alter_snapshot_old_id'),
        ('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
        ('core', '0047_alter_snapshottag_unique_together_and_more'),
        ('core', '0048_alter_archiveresult_snapshot_and_more'),
        ('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
        ('core', '0050_alter_snapshottag_snapshot_old'),
        ('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
        ('core', '0052_alter_snapshottag_unique_together_and_more'),
        ('core', '0053_remove_snapshottag_snapshot_old'),
        ('core', '0054_alter_snapshot_timestamp'),
        ('core', '0055_alter_tag_slug'),
        ('core', '0056_remove_tag_uuid'),
        ('core', '0057_rename_id_tag_old_id'),
        ('core', '0058_alter_tag_old_id'),
        ('core', '0059_tag_id'),
        ('core', '0060_alter_tag_id'),
        ('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
        ('core', '0062_alter_snapshottag_old_tag'),
        ('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
        ('core', '0064_alter_snapshottag_unique_together_and_more'),
        ('core', '0065_remove_snapshottag_old_tag'),
        ('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
        ('core', '0067_alter_snapshottag_tag'),
        ('core', '0068_alter_archiveresult_options'),
        ('core', '0069_alter_archiveresult_created_alter_snapshot_added_and_more'),
        ('core', '0070_alter_archiveresult_created_by_alter_snapshot_added_and_more'),
        ('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
        ('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
        ('core', '0073_rename_created_archiveresult_created_at_and_more'),
        ('core', '0074_alter_snapshot_downloaded_at'),
        # Also record the squashed migration itself
        ('core', '0023_new_schema'),
-        # Machine app migrations (required by core.0024)
+        # Machine app - record both squashed and individual migrations (like fresh install does)
        ('machine', '0001_initial'),
        ('machine', '0001_squashed'),
        ('machine', '0002_alter_machine_stats_installedbinary'),
        ('machine', '0003_alter_installedbinary_options_and_more'),
        ('machine', '0004_alter_installedbinary_abspath_and_more'),
        ('core', '0024_snapshot_crawl'),
        ('core', '0025_allow_duplicate_urls_per_crawl'),
        # Note: core.0026 removes output_dir, which the 0.8.x schema still has,
        # so it is intentionally not recorded here; Django applies it during migration.
        # API app - record both squashed and individual migrations (like fresh install does)
        ('api', '0001_initial'),
        ('api', '0001_squashed'),
        ('api', '0002_alter_apitoken_options'),
        ('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'),
        ('api', '0004_alter_apitoken_id_alter_apitoken_uuid'),
        ('api', '0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more'),
        ('api', '0006_remove_outboundwebhook_uuid_apitoken_id_and_more'),
        ('api', '0007_alter_apitoken_created_by'),
        ('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'),
        ('api', '0009_rename_created_apitoken_created_at_and_more'),
        # Crawls migrations
        ('crawls', '0001_initial'),
    ]
@@ -1040,6 +1160,7 @@ class TestFreshInstall(unittest.TestCase):
        """Fresh init should create database and directories."""
        work_dir = Path(tempfile.mkdtemp())
        try:
            result = run_archivebox(work_dir, ['init'])
            self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
@@ -1056,6 +1177,7 @@ class TestFreshInstall(unittest.TestCase):
        """Status command should work after init."""
        work_dir = Path(tempfile.mkdtemp())
        try:
            result = run_archivebox(work_dir, ['init'])
            self.assertEqual(result.returncode, 0)
@@ -1070,6 +1192,7 @@ class TestFreshInstall(unittest.TestCase):
        """Should be able to add URLs after init with --index-only (fast)."""
        work_dir = Path(tempfile.mkdtemp())
        try:
            result = run_archivebox(work_dir, ['init'])
            self.assertEqual(result.returncode, 0)
@@ -1101,6 +1224,7 @@ class TestFreshInstall(unittest.TestCase):
        """List command should show added snapshots."""
        work_dir = Path(tempfile.mkdtemp())
        try:
            result = run_archivebox(work_dir, ['init'])
            self.assertEqual(result.returncode, 0)
@@ -1118,6 +1242,7 @@ class TestFreshInstall(unittest.TestCase):
        """Django migrations table should be populated after init."""
        work_dir = Path(tempfile.mkdtemp())
        try:
            result = run_archivebox(work_dir, ['init'])
            self.assertEqual(result.returncode, 0)
@@ -1138,6 +1263,7 @@ class TestFreshInstall(unittest.TestCase):
        """Core app migrations should be applied."""
        work_dir = Path(tempfile.mkdtemp())
        try:
            result = run_archivebox(work_dir, ['init'])
            self.assertEqual(result.returncode, 0)
@@ -1161,6 +1287,7 @@ class TestSchemaIntegrity(unittest.TestCase):
        """Snapshot table should have all required columns."""
        work_dir = Path(tempfile.mkdtemp())
        try:
            result = run_archivebox(work_dir, ['init'])
            self.assertEqual(result.returncode, 0)
@@ -1182,6 +1309,7 @@ class TestSchemaIntegrity(unittest.TestCase):
        """ArchiveResult table should have all required columns."""
        work_dir = Path(tempfile.mkdtemp())
        try:
            result = run_archivebox(work_dir, ['init'])
            self.assertEqual(result.returncode, 0)
@@ -1203,6 +1331,7 @@ class TestSchemaIntegrity(unittest.TestCase):
        """Tag table should have all required columns."""
        work_dir = Path(tempfile.mkdtemp())
        try:
            result = run_archivebox(work_dir, ['init'])
            self.assertEqual(result.returncode, 0)
@@ -1228,6 +1357,7 @@ class TestMultipleSnapshots(unittest.TestCase):
        """Should be able to add multiple URLs with --index-only."""
        work_dir = Path(tempfile.mkdtemp())
        try:
            result = run_archivebox(work_dir, ['init'])
            self.assertEqual(result.returncode, 0)
@@ -1269,6 +1399,9 @@ class TestMigrationFrom07x(unittest.TestCase):
        # Seed with test data
        self.original_data = seed_0_7_data(self.db_path)
        # Change ownership to testuser so archivebox can write to it
    def tearDown(self):
        """Clean up temporary directory."""
        shutil.rmtree(self.work_dir, ignore_errors=True)
@@ -1501,6 +1634,9 @@ class TestMigrationFrom04x(unittest.TestCase):
        # Seed with test data
        self.original_data = seed_0_4_data(self.db_path)
        # Change ownership to testuser so archivebox can write to it
    def tearDown(self):
        """Clean up temporary directory."""
        shutil.rmtree(self.work_dir, ignore_errors=True)
@@ -1542,7 +1678,6 @@ class TestMigrationFrom04x(unittest.TestCase):
        self.assertTrue(ok, msg)
@unittest.skip("0.8.x migration tests skipped: complex machine app state issues with Django migration loader")
 class TestMigrationFrom08x(unittest.TestCase):
    """Test migration from 0.8.x schema to latest.
@@ -1551,11 +1686,6 @@ class TestMigrationFrom08x(unittest.TestCase):
    - UUID primary keys for Snapshot
    - Status fields for state machine
    - New fields like depth, retry_at, etc.
    NOTE: These tests are currently skipped because the 0.8.x schema has complex
    migration state dependencies with the machine app that Django's migration loader
    has trouble resolving. The 0.7.x tests are the critical path since most users
    will be upgrading from the stable 0.7.x branch, not the dev 0.8.x branch.
    """
    def setUp(self):
@@ -1574,6 +1704,9 @@ class TestMigrationFrom08x(unittest.TestCase):
        # Seed with test data
        self.original_data = seed_0_8_data(self.db_path)
        # Change ownership to testuser so archivebox can write to it
    def tearDown(self):
        """Clean up temporary directory."""
        shutil.rmtree(self.work_dir, ignore_errors=True)
@@ -1724,11 +1857,20 @@ class TestMigrationFrom08x(unittest.TestCase):
    def test_add_works_after_migration(self):
        """Adding new URLs should work after migration from 0.8.x."""
        result = run_archivebox(self.work_dir, ['init'], timeout=45)
        # Check that init actually ran and applied migrations
        self.assertIn('Applying', result.stdout + result.stderr,
            f"Init did not apply migrations. stdout: {result.stdout[:500]}, stderr: {result.stderr[:500]}")
        self.assertIn(result.returncode, [0, 1], f"Init crashed: {result.stderr}")
-        # Count existing crawls
+        # Check that seed_id column was removed by migration
        conn = sqlite3.connect(str(self.db_path))
        cursor = conn.cursor()
        cursor.execute("PRAGMA table_info(crawls_crawl)")
        columns = [row[1] for row in cursor.fetchall()]
        self.assertNotIn('seed_id', columns,
            f"seed_id column should have been removed by migration. Columns: {columns}")
        # Count existing crawls
        cursor.execute("SELECT COUNT(*) FROM crawls_crawl")
        initial_crawl_count = cursor.fetchone()[0]
        conn.close()
@@ -1777,6 +1919,7 @@ class TestMigrationDataIntegrity(unittest.TestCase):
            conn.close()
            seed_0_7_data(db_path)
            result = run_archivebox(work_dir, ['init'], timeout=45)
            self.assertIn(result.returncode, [0, 1])
@@ -1807,6 +1950,7 @@ class TestMigrationDataIntegrity(unittest.TestCase):
            conn.close()
            seed_0_7_data(db_path)
            result = run_archivebox(work_dir, ['init'], timeout=45)
            self.assertIn(result.returncode, [0, 1])
@@ -1828,6 +1972,7 @@ class TestMigrationDataIntegrity(unittest.TestCase):
            conn.close()
            original_data = seed_0_7_data(db_path)
            original_timestamps = {s['url']: s['timestamp'] for s in original_data['snapshots']}
            result = run_archivebox(work_dir, ['init'], timeout=45)