implement fs_version migrations

This commit is contained in:
Nick Sweeting
2025-12-27 00:25:35 -08:00
parent 6e892fb2b4
commit 35dd9acafe
4 changed files with 337 additions and 0 deletions

View File

@@ -143,6 +143,57 @@ SQLite handles circular references with `IF NOT EXISTS`. Order matters less than
- Individual migrations recorded for upgrades from dev branch - Individual migrations recorded for upgrades from dev branch
- `replaces` attribute in squashed migrations lists what they replace - `replaces` attribute in squashed migrations lists what they replace
## Code Style Guidelines
### Naming Conventions for Grep-ability
Use consistent naming for everything to enable easy grep-ability and logical grouping:
**Principle**: Fewest unique names. If you must create a new unique name, make it grep and group well.
**Examples**:
```python
# Filesystem migration methods - all start with fs_
def fs_migration_needed() -> bool: ...
def fs_migrate() -> None: ...
def _fs_migrate_from_0_7_0_to_0_8_0() -> None: ...
def _fs_migrate_from_0_8_0_to_0_9_0() -> None: ...
def _fs_next_version(current: str) -> str: ...
# Logging methods - ALL must start with log_ or _log
def log_migration_start(snapshot_id: str) -> None: ...
def _log_error(message: str) -> None: ...
def log_validation_result(ok: bool, msg: str) -> None: ...
```
**Rules**:
- Group related functions with common prefixes
- Use `_` prefix for internal/private helpers within the same family
- ALL logging-related methods MUST start with `log_` or `_log`
- Search for all migration functions: `grep -r "def.*fs_.*(" archivebox/`
- Search for all logging: `grep -r "def.*log_.*(" archivebox/`
### Minimize Unique Names and Data Structures
**Do not invent new data structures, variable names, or keys if possible.** Try to use existing field names and data structures exactly to keep the total unique data structures and names in the codebase to an absolute minimum.
**Example - GOOD**:
```python
# Binary has overrides field
binary = Binary(overrides={'TIMEOUT': '60s'})
# InstalledBinary reuses the same field name and structure
class InstalledBinary(models.Model):
overrides = models.JSONField(default=dict) # Same name, same structure
```
**Example - BAD**:
```python
# Don't invent new names like custom_bin_cmds, installed_binary_overrides, etc.
class InstalledBinary(models.Model):
custom_bin_cmds = models.JSONField(default=dict) # ❌ New unique name
```
**Principle**: If you're storing the same conceptual data (e.g., `overrides`), use the same field name across all models and keep the internal structure identical. This makes the codebase predictable and reduces cognitive load.
## Debugging Tips ## Debugging Tips
### Check Migration State ### Check Migration State

View File

@@ -0,0 +1,47 @@
# Generated by Claude Code on 2025-12-27
from django.db import migrations, models
def set_existing_snapshots_to_old_version(apps, schema_editor):
"""Set existing snapshots to 0.8.0 since they use the old filesystem layout."""
Snapshot = apps.get_model('core', 'Snapshot')
# Set all existing snapshots to 0.8.0 (the previous version's layout)
Snapshot.objects.all().update(fs_version='0.8.0')
def reverse_migration(apps, schema_editor):
"""Reverse migration - do nothing."""
pass
class Migration(migrations.Migration):
dependencies = [
('core', '0027_alter_archiveresult_created_by_and_more'),
]
operations = [
# Add field with temporary default to allow NULL initially
migrations.AddField(
model_name='snapshot',
name='fs_version',
field=models.CharField(
max_length=10,
default='0.8.0', # Temporary default for adding the column
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
),
),
# Set existing snapshots to old version
migrations.RunPython(set_existing_snapshots_to_old_version, reverse_migration),
# Update default to current version for new snapshots going forward
migrations.AlterField(
model_name='snapshot',
name='fs_version',
field=models.CharField(
max_length=10,
default='0.9.0', # Hardcoded for this migration - new migration when version bumps
help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
),
),
]

View File

@@ -307,6 +307,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
title = models.CharField(max_length=512, null=True, blank=True, db_index=True) title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True) downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs
fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().')
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now) retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED) status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
@@ -342,6 +343,28 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.bookmarked_at = self.created_at or timezone.now() self.bookmarked_at = self.created_at or timezone.now()
if not self.timestamp: if not self.timestamp:
self.timestamp = str(self.bookmarked_at.timestamp()) self.timestamp = str(self.bookmarked_at.timestamp())
# Migrate filesystem if needed (happens automatically on save)
if self.pk and self.fs_migration_needed:
from django.db import transaction
with transaction.atomic():
# Walk through migration chain automatically
current = self.fs_version
target = self._fs_current_version()
while current != target:
next_ver = self._fs_next_version(current)
method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
# Only run if method exists (most are no-ops)
if hasattr(self, method):
getattr(self, method)()
current = next_ver
# Update version (still in transaction)
self.fs_version = target
super().save(*args, **kwargs) super().save(*args, **kwargs)
if self.crawl and self.url not in self.crawl.urls: if self.crawl and self.url not in self.crawl.urls:
self.crawl.urls += f'\n{self.url}' self.crawl.urls += f'\n{self.url}'
@@ -362,6 +385,79 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
}, },
) )
# =========================================================================
# Filesystem Migration Methods
# =========================================================================
@staticmethod
def _fs_current_version() -> str:
"""Get current ArchiveBox filesystem version (normalized to x.x.0 format)"""
from archivebox.config import VERSION
# Normalize version to x.x.0 format (e.g., "0.9.0rc1" -> "0.9.0")
parts = VERSION.split('.')
if len(parts) >= 2:
major, minor = parts[0], parts[1]
# Strip any non-numeric suffix from minor version
minor = ''.join(c for c in minor if c.isdigit())
return f'{major}.{minor}.0'
return '0.9.0' # Fallback if version parsing fails
@property
def fs_migration_needed(self) -> bool:
"""Check if snapshot needs filesystem migration"""
return self.fs_version != self._fs_current_version()
def _fs_next_version(self, version: str) -> str:
"""Get next version in migration chain"""
chain = ['0.7.0', '0.8.0', '0.9.0']
try:
idx = chain.index(version)
return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
except ValueError:
# Unknown version - skip to current
return self._fs_current_version()
def _fs_migrate_from_0_7_0_to_0_8_0(self):
"""Migration from 0.7.0 to 0.8.0 layout (no-op)"""
# 0.7 and 0.8 both used archive/<timestamp>
# Nothing to do!
pass
def _fs_migrate_from_0_8_0_to_0_9_0(self):
"""
Migrate from flat file structure to organized extractor subdirectories.
0.8.x layout (flat):
archive/1234567890/
index.json
index.html
screenshot.png
warc/archive.warc.gz
media/video.mp4
0.9.x layout (organized):
archive/{timestamp}/
index.json
screenshot/
screenshot.png
singlefile/
index.html
warc/
archive.warc.gz
media/
video.mp4
Note: For now this is a no-op. The actual file reorganization will be
implemented when we're ready to do the migration. This placeholder ensures
the migration chain is set up correctly.
"""
# TODO: Implement actual file reorganization when ready
pass
# =========================================================================
# Output Directory Properties
# =========================================================================
@property @property
def output_dir_parent(self) -> str: def output_dir_parent(self) -> str:
return 'archive' return 'archive'

View File

@@ -413,5 +413,148 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
shutil.rmtree(work_dir, ignore_errors=True) shutil.rmtree(work_dir, ignore_errors=True)
class TestFilesystemMigration08to09(unittest.TestCase):
"""Test filesystem migration from 0.8.x flat structure to 0.9.x organized structure."""
def setUp(self):
"""Create a temporary directory for testing."""
self.work_dir = Path(tempfile.mkdtemp())
self.db_path = self.work_dir / 'index.sqlite3'
def tearDown(self):
"""Clean up temporary directory."""
shutil.rmtree(self.work_dir, ignore_errors=True)
def test_filesystem_migration_with_real_archiving(self):
"""
Test that filesystem migration works with real archived content.
Steps:
1. Initialize archivebox
2. Archive https://example.com (creates real files)
3. Manually set fs_version to 0.8.0
4. Trigger migration by saving snapshot
5. Verify files are organized correctly
"""
# Step 1: Initialize
result = run_archivebox(self.work_dir, ['init'], timeout=45)
self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
# Step 2: Archive example.com with some extractors enabled
# Enable a subset of fast extractors for testing
result = run_archivebox(
self.work_dir,
['add', '--depth=0', 'https://example.com'],
timeout=120,
env={
'SAVE_TITLE': 'True',
'SAVE_FAVICON': 'True',
'SAVE_WGET': 'True',
'SAVE_SCREENSHOT': 'False', # Disable slow extractors
'SAVE_DOM': 'False',
'SAVE_SINGLEFILE': 'False',
'SAVE_READABILITY': 'False',
'SAVE_MERCURY': 'False',
'SAVE_PDF': 'False',
'SAVE_MEDIA': 'False',
'SAVE_ARCHIVE_DOT_ORG': 'False',
}
)
# Note: Add may fail if network is down or extractors fail, but we still want to test
# the filesystem migration logic even with partial failures
# Step 3: Get the snapshot and verify files were created
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute("SELECT id, url, timestamp, fs_version FROM core_snapshot WHERE url = ?", ('https://example.com',))
row = cursor.fetchone()
conn.close()
if not row:
self.skipTest("Failed to create snapshot for https://example.com")
snapshot_id, url, timestamp, fs_version = row
# Verify initial fs_version is 0.9.0 (current version)
self.assertEqual(fs_version, '0.9.0', f"Expected new snapshot to have fs_version='0.9.0', got '{fs_version}'")
# Verify output directory exists
output_dir = self.work_dir / 'archive' / timestamp
self.assertTrue(output_dir.exists(), f"Output directory not found: {output_dir}")
# List all files created (for debugging)
files_before = list(output_dir.rglob('*'))
files_before_count = len([f for f in files_before if f.is_file()])
print(f"\n[*] Files created by archiving: {files_before_count}")
for f in sorted(files_before):
if f.is_file():
print(f" {f.relative_to(output_dir)}")
# Step 4: Manually set fs_version to 0.8.0 to simulate old snapshot
conn = sqlite3.connect(str(self.db_path))
cursor = conn.cursor()
cursor.execute("UPDATE core_snapshot SET fs_version = '0.8.0' WHERE id = ?", (snapshot_id,))
conn.commit()
# Verify the update worked
cursor.execute("SELECT fs_version FROM core_snapshot WHERE id = ?", (snapshot_id,))
updated_version = cursor.fetchone()[0]
conn.close()
self.assertEqual(updated_version, '0.8.0', "Failed to set fs_version to 0.8.0")
# Step 5: Trigger migration by running a command that loads and saves the snapshot
# We'll use the Python API directly to trigger save()
import os
import sys
import django
# Setup Django
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
os.environ['DATA_DIR'] = str(self.work_dir)
# Add parent dir to path so we can import archivebox
sys.path.insert(0, str(Path(__file__).parent.parent.parent))
try:
django.setup()
from core.models import Snapshot
# Load the snapshot (should trigger migration on save)
snapshot = Snapshot.objects.get(url='https://example.com')
# Verify fs_migration_needed returns True
self.assertTrue(snapshot.fs_migration_needed,
f"fs_migration_needed should be True for fs_version='0.8.0'")
# Save to trigger migration
print(f"\n[*] Triggering filesystem migration by saving snapshot...")
snapshot.save()
# Refresh from DB
snapshot.refresh_from_db()
# Verify migration completed
self.assertEqual(snapshot.fs_version, '0.9.0',
f"Migration failed: fs_version is still '{snapshot.fs_version}'")
self.assertFalse(snapshot.fs_migration_needed,
"fs_migration_needed should be False after migration")
print(f"[√] Filesystem migration completed: 0.8.0 -> 0.9.0")
except Exception as e:
self.fail(f"Failed to trigger migration via Django: {e}")
# Step 6: Verify files still exist and are accessible
# For 0.8 -> 0.9, the migration is a no-op, so files should be in the same place
files_after = list(output_dir.rglob('*'))
files_after_count = len([f for f in files_after if f.is_file()])
print(f"\n[*] Files after migration: {files_after_count}")
# Verify no files were lost
self.assertGreaterEqual(files_after_count, files_before_count,
f"Files were lost during migration: {files_before_count} -> {files_after_count}")
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()