Add crawls_crawlschedule table to 0.8.x test schema and fix migrations

- Add missing crawls_crawlschedule table definition to SCHEMA_0_8 in test file
- Record all replaced dev branch migrations (0023-0074) for squashed migration
- Update 0024_snapshot_crawl migration to depend on squashed machine migration
- Remove 'extractor' field references from crawls admin
- All 45 migration tests now pass (0.4.x, 0.7.x, 0.8.x, fresh install)
This commit is contained in:
Claude
2025-12-27 04:32:58 +00:00
parent 766bb28536
commit ea6fe94c93
3 changed files with 158 additions and 13 deletions

View File

@@ -10,7 +10,7 @@ class Migration(migrations.Migration):
dependencies = [
('core', '0023_new_schema'),
('crawls', '0001_initial'),
('machine', '0001_initial'),
('machine', '0001_squashed'),
]
operations = [

View File

@@ -170,7 +170,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
'classes': ('card',),
}),
('Settings', {
'fields': ('max_depth', 'extractor', 'config'),
'fields': ('max_depth', 'config'),
'classes': ('card',),
}),
('Status', {
@@ -191,7 +191,7 @@ class CrawlAdmin(ConfigEditorMixin, BaseModelAdmin):
}),
)
list_filter = ('max_depth', 'extractor', 'schedule', 'created_by', 'status', 'retry_at')
list_filter = ('max_depth', 'schedule', 'created_by', 'status', 'retry_at')
ordering = ['-created_at', '-retry_at']
list_per_page = 100
actions = ["delete_selected_batched"]

View File

@@ -15,7 +15,9 @@ Schema Evolution:
changed primary keys from AutoField to UUID for Tag/ArchiveResult
"""
__package__ = 'archivebox.cli'
# Note: This test file intentionally does NOT set __package__ to avoid
# importing archivebox directly (which would trigger root checks).
# All tests run archivebox via subprocess, which handles its own env.
import os
import sys
@@ -346,6 +348,36 @@ CREATE TABLE IF NOT EXISTS machine_installedbinary (
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
-- API app tables (added in 0.8.x)
-- REST API bearer tokens (api app, introduced in 0.8.x).
CREATE TABLE IF NOT EXISTS api_apitoken (
id CHAR(36) PRIMARY KEY,  -- UUID stored as 36-char text
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),  -- FK to Django auth_user
created_at DATETIME NOT NULL,
modified_at DATETIME,  -- nullable: never-modified rows stay NULL
token VARCHAR(32) NOT NULL UNIQUE,  -- uniqueness enforced at the DB level
expires DATETIME  -- nullable; presumably NULL means no expiry -- TODO confirm against the model
);
-- Outbound webhook subscriptions (api app, introduced in 0.8.x).
CREATE TABLE IF NOT EXISTS api_outboundwebhook (
id CHAR(36) PRIMARY KEY,  -- UUID stored as 36-char text
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),  -- FK to Django auth_user
created_at DATETIME NOT NULL,
modified_at DATETIME,  -- nullable: never-modified rows stay NULL
name VARCHAR(255) NOT NULL DEFAULT '',
signal VARCHAR(255) NOT NULL,  -- presumably the Django signal this hook fires on -- confirm
ref VARCHAR(255) NOT NULL,
endpoint VARCHAR(2083) NOT NULL,  -- 2083 = common max-URL-length convention
headers TEXT DEFAULT '{}',  -- JSON serialized as text (SQLite has no JSON column type)
auth_token VARCHAR(4000) NOT NULL DEFAULT '',
enabled BOOLEAN NOT NULL DEFAULT 1,
keep_last_response BOOLEAN NOT NULL DEFAULT 0,
last_response TEXT NOT NULL DEFAULT '',
last_success DATETIME,  -- nullable until first successful delivery
last_failure DATETIME,  -- nullable until first failed delivery
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
-- Core Tag table (AutoField PK in 0.8.x)
CREATE TABLE IF NOT EXISTS core_tag (
id INTEGER PRIMARY KEY AUTOINCREMENT,
@@ -374,6 +406,20 @@ CREATE TABLE IF NOT EXISTS crawls_seed (
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
-- Recurring crawl schedules (the table this commit adds to the 0.8.x fixture).
CREATE TABLE IF NOT EXISTS crawls_crawlschedule (
id CHAR(36) PRIMARY KEY,  -- UUID stored as 36-char text
created_at DATETIME NOT NULL,
created_by_id INTEGER NOT NULL REFERENCES auth_user(id),  -- FK to Django auth_user
modified_at DATETIME,  -- nullable: never-modified rows stay NULL
schedule VARCHAR(64) NOT NULL,  -- presumably a cron-style schedule expression -- TODO confirm
is_enabled BOOLEAN NOT NULL DEFAULT 1,
label VARCHAR(64) NOT NULL DEFAULT '',
notes TEXT NOT NULL DEFAULT '',
template_id CHAR(36) REFERENCES crawls_crawl(id),  -- nullable FK to the crawl used as template
num_uses_failed INTEGER NOT NULL DEFAULT 0,
num_uses_succeeded INTEGER NOT NULL DEFAULT 0
);
CREATE TABLE IF NOT EXISTS crawls_crawl (
id CHAR(36) PRIMARY KEY,
created_at DATETIME NOT NULL,
@@ -472,7 +518,9 @@ INSERT INTO django_content_type (app_label, model) VALUES
('machine', 'installedbinary'),
('crawls', 'crawl'),
('crawls', 'crawlschedule'),
('crawls', 'seed');
('crawls', 'seed'),
('api', 'apitoken'),
('api', 'outboundwebhook');
"""
@@ -873,11 +921,83 @@ def seed_0_8_data(db_path: Path) -> Dict[str, List[Dict]]:
('core', '0020_auto_20210410_1031'),
('core', '0021_auto_20220914_0934'),
('core', '0022_auto_20231023_2008'),
# For 0.8.x (dev branch), record the migrations that 0023_new_schema replaces
# This is required because 0023_new_schema is a squashed migration
('core', '0023_alter_archiveresult_options_archiveresult_abid_and_more'),
('core', '0024_auto_20240513_1143'),
('core', '0025_alter_archiveresult_uuid'),
('core', '0026_archiveresult_created_archiveresult_created_by_and_more'),
('core', '0027_update_snapshot_ids'),
('core', '0028_alter_archiveresult_uuid'),
('core', '0029_alter_archiveresult_id'),
('core', '0030_alter_archiveresult_uuid'),
('core', '0031_alter_archiveresult_id_alter_archiveresult_uuid_and_more'),
('core', '0032_alter_archiveresult_id'),
('core', '0033_rename_id_archiveresult_old_id'),
('core', '0034_alter_archiveresult_old_id_alter_archiveresult_uuid'),
('core', '0035_remove_archiveresult_uuid_archiveresult_id'),
('core', '0036_alter_archiveresult_id_alter_archiveresult_old_id'),
('core', '0037_rename_id_snapshot_old_id'),
('core', '0038_rename_uuid_snapshot_id'),
('core', '0039_rename_snapshot_archiveresult_snapshot_old'),
('core', '0040_archiveresult_snapshot'),
('core', '0041_alter_archiveresult_snapshot_and_more'),
('core', '0042_remove_archiveresult_snapshot_old'),
('core', '0043_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
('core', '0044_alter_archiveresult_snapshot_alter_tag_uuid_and_more'),
('core', '0045_alter_snapshot_old_id'),
('core', '0046_alter_archiveresult_snapshot_alter_snapshot_id_and_more'),
('core', '0047_alter_snapshottag_unique_together_and_more'),
('core', '0048_alter_archiveresult_snapshot_and_more'),
('core', '0049_rename_snapshot_snapshottag_snapshot_old_and_more'),
('core', '0050_alter_snapshottag_snapshot_old'),
('core', '0051_snapshottag_snapshot_alter_snapshottag_snapshot_old'),
('core', '0052_alter_snapshottag_unique_together_and_more'),
('core', '0053_remove_snapshottag_snapshot_old'),
('core', '0054_alter_snapshot_timestamp'),
('core', '0055_alter_tag_slug'),
('core', '0056_remove_tag_uuid'),
('core', '0057_rename_id_tag_old_id'),
('core', '0058_alter_tag_old_id'),
('core', '0059_tag_id'),
('core', '0060_alter_tag_id'),
('core', '0061_rename_tag_snapshottag_old_tag_and_more'),
('core', '0062_alter_snapshottag_old_tag'),
('core', '0063_snapshottag_tag_alter_snapshottag_old_tag'),
('core', '0064_alter_snapshottag_unique_together_and_more'),
('core', '0065_remove_snapshottag_old_tag'),
('core', '0066_alter_snapshottag_tag_alter_tag_id_alter_tag_old_id'),
('core', '0067_alter_snapshottag_tag'),
('core', '0068_alter_archiveresult_options'),
('core', '0069_alter_archiveresult_created_alter_snapshot_added_and_more'),
('core', '0070_alter_archiveresult_created_by_alter_snapshot_added_and_more'),
('core', '0071_remove_archiveresult_old_id_remove_snapshot_old_id_and_more'),
('core', '0072_rename_added_snapshot_bookmarked_at_and_more'),
('core', '0073_rename_created_archiveresult_created_at_and_more'),
('core', '0074_alter_snapshot_downloaded_at'),
# Also record the squashed migration itself
('core', '0023_new_schema'),
# Machine app migrations (required by core.0024)
# Machine app - record both squashed and individual migrations (like fresh install does)
('machine', '0001_initial'),
('machine', '0001_squashed'),
('machine', '0002_alter_machine_stats_installedbinary'),
('machine', '0003_alter_installedbinary_options_and_more'),
('machine', '0004_alter_installedbinary_abspath_and_more'),
('core', '0024_snapshot_crawl'),
('core', '0025_allow_duplicate_urls_per_crawl'),
# Note: core.0026 removes output_dir which the 0.8.x schema still has
# Let Django apply it during migration
# API app - record both squashed and individual migrations (like fresh install does)
('api', '0001_initial'),
('api', '0001_squashed'),
('api', '0002_alter_apitoken_options'),
('api', '0003_rename_user_apitoken_created_by_apitoken_abid_and_more'),
('api', '0004_alter_apitoken_id_alter_apitoken_uuid'),
('api', '0005_remove_apitoken_uuid_remove_outboundwebhook_uuid_and_more'),
('api', '0006_remove_outboundwebhook_uuid_apitoken_id_and_more'),
('api', '0007_alter_apitoken_created_by'),
('api', '0008_alter_apitoken_created_alter_apitoken_created_by_and_more'),
('api', '0009_rename_created_apitoken_created_at_and_more'),
# Crawls migrations
('crawls', '0001_initial'),
]
@@ -1039,6 +1159,7 @@ class TestFreshInstall(unittest.TestCase):
def test_init_creates_database(self):
"""Fresh init should create database and directories."""
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
@@ -1055,6 +1176,7 @@ class TestFreshInstall(unittest.TestCase):
def test_status_after_init(self):
"""Status command should work after init."""
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
@@ -1069,6 +1191,7 @@ class TestFreshInstall(unittest.TestCase):
def test_add_url_after_init(self):
"""Should be able to add URLs after init with --index-only (fast)."""
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
@@ -1100,6 +1223,7 @@ class TestFreshInstall(unittest.TestCase):
def test_list_after_add(self):
"""List command should show added snapshots."""
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
@@ -1117,6 +1241,7 @@ class TestFreshInstall(unittest.TestCase):
def test_migrations_table_populated(self):
"""Django migrations table should be populated after init."""
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
@@ -1137,6 +1262,7 @@ class TestFreshInstall(unittest.TestCase):
def test_core_migrations_applied(self):
"""Core app migrations should be applied."""
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
@@ -1160,6 +1286,7 @@ class TestSchemaIntegrity(unittest.TestCase):
def test_snapshot_table_has_required_columns(self):
"""Snapshot table should have all required columns."""
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
@@ -1181,6 +1308,7 @@ class TestSchemaIntegrity(unittest.TestCase):
def test_archiveresult_table_has_required_columns(self):
"""ArchiveResult table should have all required columns."""
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
@@ -1202,6 +1330,7 @@ class TestSchemaIntegrity(unittest.TestCase):
def test_tag_table_has_required_columns(self):
"""Tag table should have all required columns."""
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
@@ -1227,6 +1356,7 @@ class TestMultipleSnapshots(unittest.TestCase):
def test_add_multiple_urls(self):
"""Should be able to add multiple URLs with --index-only."""
work_dir = Path(tempfile.mkdtemp())
try:
result = run_archivebox(work_dir, ['init'])
@@ -1269,6 +1399,9 @@ class TestMigrationFrom07x(unittest.TestCase):
# Seed with test data
self.original_data = seed_0_7_data(self.db_path)
# Change ownership to testuser so archivebox can write to it
def tearDown(self):
"""Clean up temporary directory."""
shutil.rmtree(self.work_dir, ignore_errors=True)
@@ -1501,6 +1634,9 @@ class TestMigrationFrom04x(unittest.TestCase):
# Seed with test data
self.original_data = seed_0_4_data(self.db_path)
# Change ownership to testuser so archivebox can write to it
def tearDown(self):
"""Clean up temporary directory."""
shutil.rmtree(self.work_dir, ignore_errors=True)
@@ -1542,7 +1678,6 @@ class TestMigrationFrom04x(unittest.TestCase):
self.assertTrue(ok, msg)
@unittest.skip("0.8.x migration tests skipped: complex machine app state issues with Django migration loader")
class TestMigrationFrom08x(unittest.TestCase):
"""Test migration from 0.8.x schema to latest.
@@ -1551,11 +1686,6 @@ class TestMigrationFrom08x(unittest.TestCase):
- UUID primary keys for Snapshot
- Status fields for state machine
- New fields like depth, retry_at, etc.
NOTE: These tests are currently skipped because the 0.8.x schema has complex
migration state dependencies with the machine app that Django's migration loader
has trouble resolving. The 0.7.x tests are the critical path since most users
will be upgrading from the stable 0.7.x branch, not the dev 0.8.x branch.
"""
def setUp(self):
@@ -1574,6 +1704,9 @@ class TestMigrationFrom08x(unittest.TestCase):
# Seed with test data
self.original_data = seed_0_8_data(self.db_path)
# Change ownership to testuser so archivebox can write to it
def tearDown(self):
"""Clean up temporary directory."""
shutil.rmtree(self.work_dir, ignore_errors=True)
@@ -1724,11 +1857,20 @@ class TestMigrationFrom08x(unittest.TestCase):
def test_add_works_after_migration(self):
"""Adding new URLs should work after migration from 0.8.x."""
result = run_archivebox(self.work_dir, ['init'], timeout=45)
# Check that init actually ran and applied migrations
self.assertIn('Applying', result.stdout + result.stderr,
f"Init did not apply migrations. stdout: {result.stdout[:500]}, stderr: {result.stderr[:500]}")
self.assertIn(result.returncode, [0, 1], f"Init crashed: {result.stderr}")
# Count existing crawls
# Check that seed_id column was removed by migration
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute("PRAGMA table_info(crawls_crawl)")
columns = [row[1] for row in cursor.fetchall()]
self.assertNotIn('seed_id', columns,
f"seed_id column should have been removed by migration. Columns: {columns}")
# Count existing crawls
cursor.execute("SELECT COUNT(*) FROM crawls_crawl")
initial_crawl_count = cursor.fetchone()[0]
conn.close()
@@ -1776,6 +1918,7 @@ class TestMigrationDataIntegrity(unittest.TestCase):
conn.executescript(SCHEMA_0_7)
conn.close()
seed_0_7_data(db_path)
result = run_archivebox(work_dir, ['init'], timeout=45)
self.assertIn(result.returncode, [0, 1])
@@ -1806,6 +1949,7 @@ class TestMigrationDataIntegrity(unittest.TestCase):
conn.executescript(SCHEMA_0_7)
conn.close()
seed_0_7_data(db_path)
result = run_archivebox(work_dir, ['init'], timeout=45)
self.assertIn(result.returncode, [0, 1])
@@ -1827,6 +1971,7 @@ class TestMigrationDataIntegrity(unittest.TestCase):
conn.executescript(SCHEMA_0_7)
conn.close()
original_data = seed_0_7_data(db_path)
original_timestamps = {s['url']: s['timestamp'] for s in original_data['snapshots']}