implement fs_version migrations

2026-01-03 01:15:57 +10:00 · 2025-12-27 00:25:35 -08:00
parent 6e892fb2b4
commit 35dd9acafe
4 changed files with 337 additions and 0 deletions
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -143,6 +143,57 @@ SQLite handles circular references with `IF NOT EXISTS`. Order matters less than
 - Individual migrations recorded for upgrades from dev branch
 - `replaces` attribute in squashed migrations lists what they replace
 ## Code Style Guidelines
 ### Naming Conventions for Grep-ability
 Use consistent naming for everything to enable easy grep-ability and logical grouping:
 **Principle**: Fewest unique names. If you must create a new unique name, make it grep and group well.
 **Examples**:
 ```python
 # Filesystem migration methods - all start with fs_
 def fs_migration_needed() -> bool: ...
 def fs_migrate() -> None: ...
 def _fs_migrate_from_0_7_0_to_0_8_0() -> None: ...
 def _fs_migrate_from_0_8_0_to_0_9_0() -> None: ...
 def _fs_next_version(current: str) -> str: ...
 # Logging methods - ALL must start with log_ or _log
 def log_migration_start(snapshot_id: str) -> None: ...
 def _log_error(message: str) -> None: ...
 def log_validation_result(ok: bool, msg: str) -> None: ...
 ```
 **Rules**:
 - Group related functions with common prefixes
 - Use `_` prefix for internal/private helpers within the same family
 - ALL logging-related methods MUST start with `log_` or `_log`
 - Search for all migration functions: `grep -r "def.*fs_.*(" archivebox/`
 - Search for all logging: `grep -r "def.*log_.*(" archivebox/`
 ### Minimize Unique Names and Data Structures
 **Do not invent new data structures, variable names, or keys if possible.** Try to use existing field names and data structures exactly to keep the total unique data structures and names in the codebase to an absolute minimum.
 **Example - GOOD**:
 ```python
 # Binary has overrides field
 binary = Binary(overrides={'TIMEOUT': '60s'})
 # InstalledBinary reuses the same field name and structure
 class InstalledBinary(models.Model):
    overrides = models.JSONField(default=dict)  # Same name, same structure
 ```
 **Example - BAD**:
 ```python
 # Don't invent new names like custom_bin_cmds, installed_binary_overrides, etc.
 class InstalledBinary(models.Model):
    custom_bin_cmds = models.JSONField(default=dict)  # ❌ New unique name
 ```
 **Principle**: If you're storing the same conceptual data (e.g., `overrides`), use the same field name across all models and keep the internal structure identical. This makes the codebase predictable and reduces cognitive load.
 ## Debugging Tips
 ### Check Migration State
--- a/archivebox/core/migrations/0028_snapshot_fs_version.py
+++ b/archivebox/core/migrations/0028_snapshot_fs_version.py
@@ -0,0 +1,47 @@
 # Generated by Claude Code on 2025-12-27
 from django.db import migrations, models
 def set_existing_snapshots_to_old_version(apps, schema_editor):
    """Set existing snapshots to 0.8.0 since they use the old filesystem layout."""
    Snapshot = apps.get_model('core', 'Snapshot')
    # Set all existing snapshots to 0.8.0 (the previous version's layout)
    Snapshot.objects.all().update(fs_version='0.8.0')
 def reverse_migration(apps, schema_editor):
    """Reverse migration - do nothing."""
    pass
 class Migration(migrations.Migration):
    dependencies = [
        ('core', '0027_alter_archiveresult_created_by_and_more'),
    ]
    operations = [
        # Add field with temporary default to allow NULL initially
        migrations.AddField(
            model_name='snapshot',
            name='fs_version',
            field=models.CharField(
                max_length=10,
                default='0.8.0',  # Temporary default for adding the column
                help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
            ),
        ),
        # Set existing snapshots to old version
        migrations.RunPython(set_existing_snapshots_to_old_version, reverse_migration),
        # Update default to current version for new snapshots going forward
        migrations.AlterField(
            model_name='snapshot',
            name='fs_version',
            field=models.CharField(
                max_length=10,
                default='0.9.0',  # Hardcoded for this migration - new migration when version bumps
                help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
            ),
        ),
    ]
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -307,6 +307,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
    title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
    downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
    depth = models.PositiveSmallIntegerField(default=0, db_index=True)  # 0 for root snapshot, 1+ for discovered URLs
    fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().')
    retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
    status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
@@ -342,6 +343,28 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
            self.bookmarked_at = self.created_at or timezone.now()
        if not self.timestamp:
            self.timestamp = str(self.bookmarked_at.timestamp())
        # Migrate filesystem if needed (happens automatically on save)
        if self.pk and self.fs_migration_needed:
            from django.db import transaction
            with transaction.atomic():
                # Walk through migration chain automatically
                current = self.fs_version
                target = self._fs_current_version()
                while current != target:
                    next_ver = self._fs_next_version(current)
                    method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
                    # Only run if method exists (most are no-ops)
                    if hasattr(self, method):
                        getattr(self, method)()
                    current = next_ver
                # Update version (still in transaction)
                self.fs_version = target
        super().save(*args, **kwargs)
        if self.crawl and self.url not in self.crawl.urls:
            self.crawl.urls += f'\n{self.url}'
@@ -362,6 +385,79 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                },
            )
    # =========================================================================
    # Filesystem Migration Methods
    # =========================================================================
    @staticmethod
    def _fs_current_version() -> str:
        """Get current ArchiveBox filesystem version (normalized to x.x.0 format)"""
        from archivebox.config import VERSION
        # Normalize version to x.x.0 format (e.g., "0.9.0rc1" -> "0.9.0")
        parts = VERSION.split('.')
        if len(parts) >= 2:
            major, minor = parts[0], parts[1]
            # Strip any non-numeric suffix from minor version
            minor = ''.join(c for c in minor if c.isdigit())
            return f'{major}.{minor}.0'
        return '0.9.0'  # Fallback if version parsing fails
    @property
    def fs_migration_needed(self) -> bool:
        """Check if snapshot needs filesystem migration"""
        return self.fs_version != self._fs_current_version()
    def _fs_next_version(self, version: str) -> str:
        """Get next version in migration chain"""
        chain = ['0.7.0', '0.8.0', '0.9.0']
        try:
            idx = chain.index(version)
            return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
        except ValueError:
            # Unknown version - skip to current
            return self._fs_current_version()
    def _fs_migrate_from_0_7_0_to_0_8_0(self):
        """Migration from 0.7.0 to 0.8.0 layout (no-op)"""
        # 0.7 and 0.8 both used archive/<timestamp>
        # Nothing to do!
        pass
    def _fs_migrate_from_0_8_0_to_0_9_0(self):
        """
        Migrate from flat file structure to organized extractor subdirectories.
        0.8.x layout (flat):
            archive/1234567890/
                index.json
                index.html
                screenshot.png
                warc/archive.warc.gz
                media/video.mp4
        0.9.x layout (organized):
            archive/{timestamp}/
                index.json
                screenshot/
                    screenshot.png
                singlefile/
                    index.html
                warc/
                    archive.warc.gz
                media/
                    video.mp4
        Note: For now this is a no-op. The actual file reorganization will be
        implemented when we're ready to do the migration. This placeholder ensures
        the migration chain is set up correctly.
        """
        # TODO: Implement actual file reorganization when ready
        pass
    # =========================================================================
    # Output Directory Properties
    # =========================================================================
    @property
    def output_dir_parent(self) -> str:
        return 'archive'
--- a/archivebox/tests/test_migrations_08_to_09.py
+++ b/archivebox/tests/test_migrations_08_to_09.py
@@ -413,5 +413,148 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
            shutil.rmtree(work_dir, ignore_errors=True)
 class TestFilesystemMigration08to09(unittest.TestCase):
    """Test filesystem migration from 0.8.x flat structure to 0.9.x organized structure."""
    def setUp(self):
        """Create a temporary directory for testing."""
        self.work_dir = Path(tempfile.mkdtemp())
        self.db_path = self.work_dir / 'index.sqlite3'
    def tearDown(self):
        """Clean up temporary directory."""
        shutil.rmtree(self.work_dir, ignore_errors=True)
    def test_filesystem_migration_with_real_archiving(self):
        """
        Test that filesystem migration works with real archived content.
        Steps:
        1. Initialize archivebox
        2. Archive https://example.com (creates real files)
        3. Manually set fs_version to 0.8.0
        4. Trigger migration by saving snapshot
        5. Verify files are organized correctly
        """
        # Step 1: Initialize
        result = run_archivebox(self.work_dir, ['init'], timeout=45)
        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
        # Step 2: Archive example.com with some extractors enabled
        # Enable a subset of fast extractors for testing
        result = run_archivebox(
            self.work_dir,
            ['add', '--depth=0', 'https://example.com'],
            timeout=120,
            env={
                'SAVE_TITLE': 'True',
                'SAVE_FAVICON': 'True',
                'SAVE_WGET': 'True',
                'SAVE_SCREENSHOT': 'False',  # Disable slow extractors
                'SAVE_DOM': 'False',
                'SAVE_SINGLEFILE': 'False',
                'SAVE_READABILITY': 'False',
                'SAVE_MERCURY': 'False',
                'SAVE_PDF': 'False',
                'SAVE_MEDIA': 'False',
                'SAVE_ARCHIVE_DOT_ORG': 'False',
            }
        )
        # Note: Add may fail if network is down or extractors fail, but we still want to test
        # the filesystem migration logic even with partial failures
        # Step 3: Get the snapshot and verify files were created
        conn = sqlite3.connect(str(self.db_path))
        cursor = conn.cursor()
        cursor.execute("SELECT id, url, timestamp, fs_version FROM core_snapshot WHERE url = ?", ('https://example.com',))
        row = cursor.fetchone()
        conn.close()
        if not row:
            self.skipTest("Failed to create snapshot for https://example.com")
        snapshot_id, url, timestamp, fs_version = row
        # Verify initial fs_version is 0.9.0 (current version)
        self.assertEqual(fs_version, '0.9.0', f"Expected new snapshot to have fs_version='0.9.0', got '{fs_version}'")
        # Verify output directory exists
        output_dir = self.work_dir / 'archive' / timestamp
        self.assertTrue(output_dir.exists(), f"Output directory not found: {output_dir}")
        # List all files created (for debugging)
        files_before = list(output_dir.rglob('*'))
        files_before_count = len([f for f in files_before if f.is_file()])
        print(f"\n[*] Files created by archiving: {files_before_count}")
        for f in sorted(files_before):
            if f.is_file():
                print(f"    {f.relative_to(output_dir)}")
        # Step 4: Manually set fs_version to 0.8.0 to simulate old snapshot
        conn = sqlite3.connect(str(self.db_path))
        cursor = conn.cursor()
        cursor.execute("UPDATE core_snapshot SET fs_version = '0.8.0' WHERE id = ?", (snapshot_id,))
        conn.commit()
        # Verify the update worked
        cursor.execute("SELECT fs_version FROM core_snapshot WHERE id = ?", (snapshot_id,))
        updated_version = cursor.fetchone()[0]
        conn.close()
        self.assertEqual(updated_version, '0.8.0', "Failed to set fs_version to 0.8.0")
        # Step 5: Trigger migration by running a command that loads and saves the snapshot
        # We'll use the Python API directly to trigger save()
        import os
        import sys
        import django
        # Setup Django
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
        os.environ['DATA_DIR'] = str(self.work_dir)
        # Add parent dir to path so we can import archivebox
        sys.path.insert(0, str(Path(__file__).parent.parent.parent))
        try:
            django.setup()
            from core.models import Snapshot
            # Load the snapshot (should trigger migration on save)
            snapshot = Snapshot.objects.get(url='https://example.com')
            # Verify fs_migration_needed returns True
            self.assertTrue(snapshot.fs_migration_needed,
                          f"fs_migration_needed should be True for fs_version='0.8.0'")
            # Save to trigger migration
            print(f"\n[*] Triggering filesystem migration by saving snapshot...")
            snapshot.save()
            # Refresh from DB
            snapshot.refresh_from_db()
            # Verify migration completed
            self.assertEqual(snapshot.fs_version, '0.9.0',
                           f"Migration failed: fs_version is still '{snapshot.fs_version}'")
            self.assertFalse(snapshot.fs_migration_needed,
                           "fs_migration_needed should be False after migration")
            print(f"[√] Filesystem migration completed: 0.8.0 -> 0.9.0")
        except Exception as e:
            self.fail(f"Failed to trigger migration via Django: {e}")
        # Step 6: Verify files still exist and are accessible
        # For 0.8 -> 0.9, the migration is a no-op, so files should be in the same place
        files_after = list(output_dir.rglob('*'))
        files_after_count = len([f for f in files_after if f.is_file()])
        print(f"\n[*] Files after migration: {files_after_count}")
        # Verify no files were lost
        self.assertGreaterEqual(files_after_count, files_before_count,
                               f"Files were lost during migration: {files_before_count} -> {files_after_count}")
 if __name__ == '__main__':
    unittest.main()