Mirror of https://github.com/ArchiveBox/ArchiveBox.git (synced 2026-01-03 01:15:57 +10:00) — commit: implement fs_version migrations
This commit is contained in:
51
CLAUDE.md
51
CLAUDE.md
@@ -143,6 +143,57 @@ SQLite handles circular references with `IF NOT EXISTS`. Order matters less than
|
|||||||
- Individual migrations recorded for upgrades from dev branch
|
- Individual migrations recorded for upgrades from dev branch
|
||||||
- `replaces` attribute in squashed migrations lists what they replace
|
- `replaces` attribute in squashed migrations lists what they replace
|
||||||
|
|
||||||
|
## Code Style Guidelines
|
||||||
|
|
||||||
|
### Naming Conventions for Grep-ability
|
||||||
|
Use consistent naming for everything to enable easy grep-ability and logical grouping:
|
||||||
|
|
||||||
|
**Principle**: Fewest unique names. If you must create a new unique name, make it grep and group well.
|
||||||
|
|
||||||
|
**Examples**:
|
||||||
|
```python
|
||||||
|
# Filesystem migration methods - all start with fs_
|
||||||
|
def fs_migration_needed() -> bool: ...
|
||||||
|
def fs_migrate() -> None: ...
|
||||||
|
def _fs_migrate_from_0_7_0_to_0_8_0() -> None: ...
|
||||||
|
def _fs_migrate_from_0_8_0_to_0_9_0() -> None: ...
|
||||||
|
def _fs_next_version(current: str) -> str: ...
|
||||||
|
|
||||||
|
# Logging methods - ALL must start with log_ or _log
|
||||||
|
def log_migration_start(snapshot_id: str) -> None: ...
|
||||||
|
def _log_error(message: str) -> None: ...
|
||||||
|
def log_validation_result(ok: bool, msg: str) -> None: ...
|
||||||
|
```
|
||||||
|
|
||||||
|
**Rules**:
|
||||||
|
- Group related functions with common prefixes
|
||||||
|
- Use `_` prefix for internal/private helpers within the same family
|
||||||
|
- ALL logging-related methods MUST start with `log_` or `_log`
|
||||||
|
- Search for all migration functions: `grep -r "def.*fs_.*(" archivebox/`
|
||||||
|
- Search for all logging: `grep -r "def.*log_.*(" archivebox/`
|
||||||
|
|
||||||
|
### Minimize Unique Names and Data Structures
|
||||||
|
**Do not invent new data structures, variable names, or keys if possible.** Try to use existing field names and data structures exactly to keep the total unique data structures and names in the codebase to an absolute minimum.
|
||||||
|
|
||||||
|
**Example - GOOD**:
|
||||||
|
```python
|
||||||
|
# Binary has overrides field
|
||||||
|
binary = Binary(overrides={'TIMEOUT': '60s'})
|
||||||
|
|
||||||
|
# InstalledBinary reuses the same field name and structure
|
||||||
|
class InstalledBinary(models.Model):
|
||||||
|
overrides = models.JSONField(default=dict) # Same name, same structure
|
||||||
|
```
|
||||||
|
|
||||||
|
**Example - BAD**:
|
||||||
|
```python
|
||||||
|
# Don't invent new names like custom_bin_cmds, installed_binary_overrides, etc.
|
||||||
|
class InstalledBinary(models.Model):
|
||||||
|
custom_bin_cmds = models.JSONField(default=dict) # ❌ New unique name
|
||||||
|
```
|
||||||
|
|
||||||
|
**Principle**: If you're storing the same conceptual data (e.g., `overrides`), use the same field name across all models and keep the internal structure identical. This makes the codebase predictable and reduces cognitive load.
|
||||||
|
|
||||||
## Debugging Tips
|
## Debugging Tips
|
||||||
|
|
||||||
### Check Migration State
|
### Check Migration State
|
||||||
|
|||||||
47
archivebox/core/migrations/0028_snapshot_fs_version.py
Normal file
47
archivebox/core/migrations/0028_snapshot_fs_version.py
Normal file
@@ -0,0 +1,47 @@
|
|||||||
|
# Generated by Claude Code on 2025-12-27
|
||||||
|
|
||||||
|
from django.db import migrations, models
|
||||||
|
|
||||||
|
|
||||||
|
def set_existing_snapshots_to_old_version(apps, schema_editor):
    """Set existing snapshots to 0.8.0 since they use the old filesystem layout.

    Runs as a RunPython data migration: rows that exist when this migration
    is applied were created under the pre-0.9 layout, so they must not keep
    the AddField default.
    """
    # Use the historical model from the app registry, not core.models.Snapshot,
    # so this migration stays valid as the model evolves.
    Snapshot = apps.get_model('core', 'Snapshot')
    # Set all existing snapshots to 0.8.0 (the previous version's layout)
    Snapshot.objects.all().update(fs_version='0.8.0')
|
||||||
|
|
||||||
|
|
||||||
|
def reverse_migration(apps, schema_editor):
    """Reverse data migration: intentionally a no-op.

    Unapplying this migration only needs to drop the column, which reversing
    the AddField operation already handles; there is no row data to restore.
    """
|
||||||
|
|
||||||
|
|
||||||
|
class Migration(migrations.Migration):
    """Add Snapshot.fs_version and backfill pre-existing rows to '0.8.0'."""

    dependencies = [
        ('core', '0027_alter_archiveresult_created_by_and_more'),
    ]

    operations = [
        # Add the column with a temporary default of '0.8.0' so existing rows
        # get a value without requiring NULLs during the schema change.
        migrations.AddField(
            model_name='snapshot',
            name='fs_version',
            field=models.CharField(
                max_length=10,
                default='0.8.0',  # Temporary default for adding the column
                help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
            ),
        ),
        # Set existing snapshots to old version
        migrations.RunPython(set_existing_snapshots_to_old_version, reverse_migration),
        # Update default to current version for new snapshots going forward
        migrations.AlterField(
            model_name='snapshot',
            name='fs_version',
            field=models.CharField(
                max_length=10,
                default='0.9.0',  # Hardcoded for this migration - new migration when version bumps
                help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
            ),
        ),
    ]
|
||||||
@@ -307,6 +307,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
|
title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
|
||||||
downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
|
downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
|
||||||
depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs
|
depth = models.PositiveSmallIntegerField(default=0, db_index=True) # 0 for root snapshot, 1+ for discovered URLs
|
||||||
|
fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().')
|
||||||
|
|
||||||
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
|
retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
|
||||||
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
|
status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
|
||||||
@@ -342,6 +343,28 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
self.bookmarked_at = self.created_at or timezone.now()
|
self.bookmarked_at = self.created_at or timezone.now()
|
||||||
if not self.timestamp:
|
if not self.timestamp:
|
||||||
self.timestamp = str(self.bookmarked_at.timestamp())
|
self.timestamp = str(self.bookmarked_at.timestamp())
|
||||||
|
|
||||||
|
# Migrate filesystem if needed (happens automatically on save)
|
||||||
|
if self.pk and self.fs_migration_needed:
|
||||||
|
from django.db import transaction
|
||||||
|
with transaction.atomic():
|
||||||
|
# Walk through migration chain automatically
|
||||||
|
current = self.fs_version
|
||||||
|
target = self._fs_current_version()
|
||||||
|
|
||||||
|
while current != target:
|
||||||
|
next_ver = self._fs_next_version(current)
|
||||||
|
method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
|
||||||
|
|
||||||
|
# Only run if method exists (most are no-ops)
|
||||||
|
if hasattr(self, method):
|
||||||
|
getattr(self, method)()
|
||||||
|
|
||||||
|
current = next_ver
|
||||||
|
|
||||||
|
# Update version (still in transaction)
|
||||||
|
self.fs_version = target
|
||||||
|
|
||||||
super().save(*args, **kwargs)
|
super().save(*args, **kwargs)
|
||||||
if self.crawl and self.url not in self.crawl.urls:
|
if self.crawl and self.url not in self.crawl.urls:
|
||||||
self.crawl.urls += f'\n{self.url}'
|
self.crawl.urls += f'\n{self.url}'
|
||||||
@@ -362,6 +385,79 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Filesystem Migration Methods
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def _fs_current_version() -> str:
|
||||||
|
"""Get current ArchiveBox filesystem version (normalized to x.x.0 format)"""
|
||||||
|
from archivebox.config import VERSION
|
||||||
|
# Normalize version to x.x.0 format (e.g., "0.9.0rc1" -> "0.9.0")
|
||||||
|
parts = VERSION.split('.')
|
||||||
|
if len(parts) >= 2:
|
||||||
|
major, minor = parts[0], parts[1]
|
||||||
|
# Strip any non-numeric suffix from minor version
|
||||||
|
minor = ''.join(c for c in minor if c.isdigit())
|
||||||
|
return f'{major}.{minor}.0'
|
||||||
|
return '0.9.0' # Fallback if version parsing fails
|
||||||
|
|
||||||
|
@property
|
||||||
|
def fs_migration_needed(self) -> bool:
|
||||||
|
"""Check if snapshot needs filesystem migration"""
|
||||||
|
return self.fs_version != self._fs_current_version()
|
||||||
|
|
||||||
|
def _fs_next_version(self, version: str) -> str:
|
||||||
|
"""Get next version in migration chain"""
|
||||||
|
chain = ['0.7.0', '0.8.0', '0.9.0']
|
||||||
|
try:
|
||||||
|
idx = chain.index(version)
|
||||||
|
return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
|
||||||
|
except ValueError:
|
||||||
|
# Unknown version - skip to current
|
||||||
|
return self._fs_current_version()
|
||||||
|
|
||||||
|
def _fs_migrate_from_0_7_0_to_0_8_0(self):
|
||||||
|
"""Migration from 0.7.0 to 0.8.0 layout (no-op)"""
|
||||||
|
# 0.7 and 0.8 both used archive/<timestamp>
|
||||||
|
# Nothing to do!
|
||||||
|
pass
|
||||||
|
|
||||||
|
def _fs_migrate_from_0_8_0_to_0_9_0(self):
|
||||||
|
"""
|
||||||
|
Migrate from flat file structure to organized extractor subdirectories.
|
||||||
|
|
||||||
|
0.8.x layout (flat):
|
||||||
|
archive/1234567890/
|
||||||
|
index.json
|
||||||
|
index.html
|
||||||
|
screenshot.png
|
||||||
|
warc/archive.warc.gz
|
||||||
|
media/video.mp4
|
||||||
|
|
||||||
|
0.9.x layout (organized):
|
||||||
|
archive/{timestamp}/
|
||||||
|
index.json
|
||||||
|
screenshot/
|
||||||
|
screenshot.png
|
||||||
|
singlefile/
|
||||||
|
index.html
|
||||||
|
warc/
|
||||||
|
archive.warc.gz
|
||||||
|
media/
|
||||||
|
video.mp4
|
||||||
|
|
||||||
|
Note: For now this is a no-op. The actual file reorganization will be
|
||||||
|
implemented when we're ready to do the migration. This placeholder ensures
|
||||||
|
the migration chain is set up correctly.
|
||||||
|
"""
|
||||||
|
# TODO: Implement actual file reorganization when ready
|
||||||
|
pass
|
||||||
|
|
||||||
|
# =========================================================================
|
||||||
|
# Output Directory Properties
|
||||||
|
# =========================================================================
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def output_dir_parent(self) -> str:
|
def output_dir_parent(self) -> str:
|
||||||
return 'archive'
|
return 'archive'
|
||||||
|
|||||||
@@ -413,5 +413,148 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
|
|||||||
shutil.rmtree(work_dir, ignore_errors=True)
|
shutil.rmtree(work_dir, ignore_errors=True)
|
||||||
|
|
||||||
|
|
||||||
|
class TestFilesystemMigration08to09(unittest.TestCase):
    """Test filesystem migration from 0.8.x flat structure to 0.9.x organized structure."""

    def setUp(self):
        """Create a temporary directory for testing."""
        self.work_dir = Path(tempfile.mkdtemp())
        self.db_path = self.work_dir / 'index.sqlite3'

    def tearDown(self):
        """Clean up temporary directory."""
        shutil.rmtree(self.work_dir, ignore_errors=True)

    def test_filesystem_migration_with_real_archiving(self):
        """
        Test that filesystem migration works with real archived content.

        Steps:
        1. Initialize archivebox
        2. Archive https://example.com (creates real files)
        3. Manually set fs_version to 0.8.0
        4. Trigger migration by saving snapshot
        5. Verify files are organized correctly
        """
        # Step 1: Initialize
        result = run_archivebox(self.work_dir, ['init'], timeout=45)
        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")

        # Step 2: Archive example.com with a subset of fast extractors enabled
        result = run_archivebox(
            self.work_dir,
            ['add', '--depth=0', 'https://example.com'],
            timeout=120,
            env={
                'SAVE_TITLE': 'True',
                'SAVE_FAVICON': 'True',
                'SAVE_WGET': 'True',
                'SAVE_SCREENSHOT': 'False',  # Disable slow extractors
                'SAVE_DOM': 'False',
                'SAVE_SINGLEFILE': 'False',
                'SAVE_READABILITY': 'False',
                'SAVE_MERCURY': 'False',
                'SAVE_PDF': 'False',
                'SAVE_MEDIA': 'False',
                'SAVE_ARCHIVE_DOT_ORG': 'False',
            }
        )
        # Note: Add may fail if network is down or extractors fail, but we still want to test
        # the filesystem migration logic even with partial failures

        # Step 3: Get the snapshot and verify files were created
        conn = sqlite3.connect(str(self.db_path))
        cursor = conn.cursor()
        cursor.execute("SELECT id, url, timestamp, fs_version FROM core_snapshot WHERE url = ?", ('https://example.com',))
        row = cursor.fetchone()
        conn.close()

        if not row:
            self.skipTest("Failed to create snapshot for https://example.com")

        snapshot_id, url, timestamp, fs_version = row

        # Verify initial fs_version is 0.9.0 (current version)
        self.assertEqual(fs_version, '0.9.0', f"Expected new snapshot to have fs_version='0.9.0', got '{fs_version}'")

        # Verify output directory exists
        output_dir = self.work_dir / 'archive' / timestamp
        self.assertTrue(output_dir.exists(), f"Output directory not found: {output_dir}")

        # List all files created (for debugging)
        files_before = list(output_dir.rglob('*'))
        files_before_count = len([f for f in files_before if f.is_file()])
        print(f"\n[*] Files created by archiving: {files_before_count}")
        for f in sorted(files_before):
            if f.is_file():
                print(f"    {f.relative_to(output_dir)}")

        # Step 4: Manually set fs_version to 0.8.0 to simulate old snapshot
        conn = sqlite3.connect(str(self.db_path))
        cursor = conn.cursor()
        cursor.execute("UPDATE core_snapshot SET fs_version = '0.8.0' WHERE id = ?", (snapshot_id,))
        conn.commit()

        # Verify the update worked
        cursor.execute("SELECT fs_version FROM core_snapshot WHERE id = ?", (snapshot_id,))
        updated_version = cursor.fetchone()[0]
        conn.close()
        self.assertEqual(updated_version, '0.8.0', "Failed to set fs_version to 0.8.0")

        # Step 5: Trigger migration by running a command that loads and saves the snapshot
        # We'll use the Python API directly to trigger save()
        import os
        import sys
        import django

        # Setup Django
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        os.environ['DATA_DIR'] = str(self.work_dir)

        # Add parent dir to path so we can import archivebox
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))

        try:
            django.setup()
            from core.models import Snapshot

            # Load the snapshot (migration is triggered by save() below)
            snapshot = Snapshot.objects.get(url='https://example.com')

            # Verify fs_migration_needed returns True
            self.assertTrue(snapshot.fs_migration_needed,
                            "fs_migration_needed should be True for fs_version='0.8.0'")

            # Save to trigger migration
            print("\n[*] Triggering filesystem migration by saving snapshot...")
            snapshot.save()

            # Refresh from DB
            snapshot.refresh_from_db()

            # Verify migration completed
            self.assertEqual(snapshot.fs_version, '0.9.0',
                             f"Migration failed: fs_version is still '{snapshot.fs_version}'")
            self.assertFalse(snapshot.fs_migration_needed,
                             "fs_migration_needed should be False after migration")

            print("[√] Filesystem migration completed: 0.8.0 -> 0.9.0")

        except AssertionError:
            # BUGFIX: the broad `except Exception` below previously swallowed
            # the AssertionErrors raised by the assertions in this try block and
            # re-reported them as "Failed to trigger migration via Django",
            # masking the real failure. Let test assertions propagate unchanged.
            raise
        except Exception as e:
            self.fail(f"Failed to trigger migration via Django: {e}")

        # Step 6: Verify files still exist and are accessible
        # For 0.8 -> 0.9, the migration is a no-op, so files should be in the same place
        files_after = list(output_dir.rglob('*'))
        files_after_count = len([f for f in files_after if f.is_file()])

        print(f"\n[*] Files after migration: {files_after_count}")

        # Verify no files were lost
        self.assertGreaterEqual(files_after_count, files_before_count,
                                f"Files were lost during migration: {files_before_count} -> {files_after_count}")
|
||||||
# Allow running this test module directly, in addition to via a test runner.
if __name__ == '__main__':
    unittest.main()
|
||||||
|
|||||||
Reference in New Issue
Block a user