From 35dd9acafe2ef19a0cf622163e51dc61ea360848 Mon Sep 17 00:00:00 2001
From: Nick Sweeting <git@sweeting.me>
Date: Sat, 27 Dec 2025 00:25:35 -0800
Subject: [PATCH] implement fs_version migrations

---
 CLAUDE.md                                     |  51 +++++++
 .../migrations/0028_snapshot_fs_version.py    |  47 ++++++
 archivebox/core/models.py                     |  96 ++++++++++++
 archivebox/tests/test_migrations_08_to_09.py  | 143 ++++++++++++++++++
 4 files changed, 337 insertions(+)
 create mode 100644 archivebox/core/migrations/0028_snapshot_fs_version.py

diff --git a/CLAUDE.md b/CLAUDE.md
index 9e8b1a54..b8af1059 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -143,6 +143,57 @@ SQLite handles circular references with `IF NOT EXISTS`. Order matters less than
 - Individual migrations recorded for upgrades from dev branch
 - `replaces` attribute in squashed migrations lists what they replace
 
+## Code Style Guidelines
+
+### Naming Conventions for Grep-ability
+Use consistent naming for everything to enable easy grep-ability and logical grouping:
+
+**Principle**: Fewest unique names. If you must create a new unique name, make it grep and group well.
+
+**Examples**:
+```python
+# Filesystem migration methods - all start with fs_
+def fs_migration_needed() -> bool: ...
+def fs_migrate() -> None: ...
+def _fs_migrate_from_0_7_0_to_0_8_0() -> None: ...
+def _fs_migrate_from_0_8_0_to_0_9_0() -> None: ...
+def _fs_next_version(current: str) -> str: ...
+
+# Logging methods - ALL must start with log_ or _log
+def log_migration_start(snapshot_id: str) -> None: ...
+def _log_error(message: str) -> None: ...
+def log_validation_result(ok: bool, msg: str) -> None: ...
+```
+
+**Rules**:
+- Group related functions with common prefixes
+- Use `_` prefix for internal/private helpers within the same family
+- ALL logging-related methods MUST start with `log_` or `_log`
+- Search for all migration functions: `grep -r "def.*fs_.*(" archivebox/`
+- Search for all logging: `grep -r "def.*log_.*(" archivebox/`
+
+### Minimize Unique Names and Data Structures
+**Do not invent new data structures, variable names, or keys if possible.** Try to use existing field names and data structures exactly to keep the total unique data structures and names in the codebase to an absolute minimum.
+
+**Example - GOOD**:
+```python
+# Binary has overrides field
+binary = Binary(overrides={'TIMEOUT': '60s'})
+
+# InstalledBinary reuses the same field name and structure
+class InstalledBinary(models.Model):
+    overrides = models.JSONField(default=dict)  # Same name, same structure
+```
+
+**Example - BAD**:
+```python
+# Don't invent new names like custom_bin_cmds, installed_binary_overrides, etc.
+class InstalledBinary(models.Model):
+    custom_bin_cmds = models.JSONField(default=dict)  # ❌ New unique name
+```
+
+**Principle**: If you're storing the same conceptual data (e.g., `overrides`), use the same field name across all models and keep the internal structure identical. This makes the codebase predictable and reduces cognitive load.
+
 ## Debugging Tips
 
 ### Check Migration State
diff --git a/archivebox/core/migrations/0028_snapshot_fs_version.py b/archivebox/core/migrations/0028_snapshot_fs_version.py
new file mode 100644
index 00000000..29c2a588
--- /dev/null
+++ b/archivebox/core/migrations/0028_snapshot_fs_version.py
@@ -0,0 +1,47 @@
+# Generated by Claude Code on 2025-12-27
+
+from django.db import migrations, models
+
+
+def set_existing_snapshots_to_old_version(apps, schema_editor):
+    """Set existing snapshots to 0.8.0 since they use the old filesystem layout."""
+    Snapshot = apps.get_model('core', 'Snapshot')
+    # Set all existing snapshots to 0.8.0 (the previous version's layout)
+    Snapshot.objects.all().update(fs_version='0.8.0')
+
+
+def reverse_migration(apps, schema_editor):
+    """Reverse migration - do nothing."""
+    pass
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0027_alter_archiveresult_created_by_and_more'),
+    ]
+
+    operations = [
+        # Add field with temporary default to allow NULL initially
+        migrations.AddField(
+            model_name='snapshot',
+            name='fs_version',
+            field=models.CharField(
+                max_length=10,
+                default='0.8.0',  # Temporary default for adding the column
+                help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
+            ),
+        ),
+        # Set existing snapshots to old version
+        migrations.RunPython(set_existing_snapshots_to_old_version, reverse_migration),
+        # Update default to current version for new snapshots going forward
+        migrations.AlterField(
+            model_name='snapshot',
+            name='fs_version',
+            field=models.CharField(
+                max_length=10,
+                default='0.9.0',  # Hardcoded for this migration - new migration when version bumps
+                help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().'
+            ),
+        ),
+    ]
diff --git a/archivebox/core/models.py b/archivebox/core/models.py
index 806367e3..6bac5679 100755
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -307,6 +307,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
     title = models.CharField(max_length=512, null=True, blank=True, db_index=True)
     downloaded_at = models.DateTimeField(default=None, null=True, editable=False, db_index=True, blank=True)
     depth = models.PositiveSmallIntegerField(default=0, db_index=True)  # 0 for root snapshot, 1+ for discovered URLs
+    fs_version = models.CharField(max_length=10, default='0.9.0', help_text='Filesystem version of this snapshot (e.g., "0.7.0", "0.8.0", "0.9.0"). Used to trigger lazy migration on save().')
 
     retry_at = ModelWithStateMachine.RetryAtField(default=timezone.now)
     status = ModelWithStateMachine.StatusField(choices=ModelWithStateMachine.StatusChoices, default=ModelWithStateMachine.StatusChoices.QUEUED)
@@ -342,6 +343,28 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
             self.bookmarked_at = self.created_at or timezone.now()
         if not self.timestamp:
             self.timestamp = str(self.bookmarked_at.timestamp())
+
+        # Migrate filesystem if needed (happens automatically on save)
+        if self.pk and self.fs_migration_needed:
+            from django.db import transaction
+            with transaction.atomic():
+                # Walk through migration chain automatically
+                current = self.fs_version
+                target = self._fs_current_version()
+
+                while current != target:
+                    next_ver = self._fs_next_version(current)
+                    method = f'_fs_migrate_from_{current.replace(".", "_")}_to_{next_ver.replace(".", "_")}'
+
+                    # Only run if method exists (most are no-ops)
+                    if hasattr(self, method):
+                        getattr(self, method)()
+
+                    current = next_ver
+
+                # Update version (still in transaction)
+                self.fs_version = target
+
         super().save(*args, **kwargs)
         if self.crawl and self.url not in self.crawl.urls:
             self.crawl.urls += f'\n{self.url}'
@@ -362,6 +385,79 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
                 },
             )
 
+    # =========================================================================
+    # Filesystem Migration Methods
+    # =========================================================================
+
+    @staticmethod
+    def _fs_current_version() -> str:
+        """Get current ArchiveBox filesystem version (normalized to x.x.0 format)"""
+        from archivebox.config import VERSION
+        # Normalize version to x.x.0 format (e.g., "0.9.0rc1" -> "0.9.0")
+        parts = VERSION.split('.')
+        if len(parts) >= 2:
+            major, minor = parts[0], parts[1]
+            # Strip any non-numeric suffix from minor version
+            minor = ''.join(c for c in minor if c.isdigit())
+            return f'{major}.{minor}.0'
+        return '0.9.0'  # Fallback if version parsing fails
+
+    @property
+    def fs_migration_needed(self) -> bool:
+        """Check if snapshot needs filesystem migration"""
+        return self.fs_version != self._fs_current_version()
+
+    def _fs_next_version(self, version: str) -> str:
+        """Get next version in migration chain"""
+        chain = ['0.7.0', '0.8.0', '0.9.0']
+        try:
+            idx = chain.index(version)
+            return chain[idx + 1] if idx + 1 < len(chain) else self._fs_current_version()
+        except ValueError:
+            # Unknown version - skip to current
+            return self._fs_current_version()
+
+    def _fs_migrate_from_0_7_0_to_0_8_0(self):
+        """Migration from 0.7.0 to 0.8.0 layout (no-op)"""
+        # 0.7 and 0.8 both used archive/<timestamp>
+        # Nothing to do!
+        pass
+
+    def _fs_migrate_from_0_8_0_to_0_9_0(self):
+        """
+        Migrate from flat file structure to organized extractor subdirectories.
+
+        0.8.x layout (flat):
+            archive/1234567890/
+                index.json
+                index.html
+                screenshot.png
+                warc/archive.warc.gz
+                media/video.mp4
+
+        0.9.x layout (organized):
+            archive/{timestamp}/
+                index.json
+                screenshot/
+                    screenshot.png
+                singlefile/
+                    index.html
+                warc/
+                    archive.warc.gz
+                media/
+                    video.mp4
+
+        Note: For now this is a no-op. The actual file reorganization will be
+        implemented when we're ready to do the migration. This placeholder ensures
+        the migration chain is set up correctly.
+        """
+        # TODO: Implement actual file reorganization when ready
+        pass
+
+    # =========================================================================
+    # Output Directory Properties
+    # =========================================================================
+
     @property
     def output_dir_parent(self) -> str:
         return 'archive'
diff --git a/archivebox/tests/test_migrations_08_to_09.py b/archivebox/tests/test_migrations_08_to_09.py
index d28d4744..09a1e65a 100644
--- a/archivebox/tests/test_migrations_08_to_09.py
+++ b/archivebox/tests/test_migrations_08_to_09.py
@@ -413,5 +413,148 @@ class TestMigrationDataIntegrity08x(unittest.TestCase):
             shutil.rmtree(work_dir, ignore_errors=True)
 
 
+class TestFilesystemMigration08to09(unittest.TestCase):
+    """Test filesystem migration from 0.8.x flat structure to 0.9.x organized structure."""
+
+    def setUp(self):
+        """Create a temporary directory for testing."""
+        self.work_dir = Path(tempfile.mkdtemp())
+        self.db_path = self.work_dir / 'index.sqlite3'
+
+    def tearDown(self):
+        """Clean up temporary directory."""
+        shutil.rmtree(self.work_dir, ignore_errors=True)
+
+    def test_filesystem_migration_with_real_archiving(self):
+        """
+        Test that filesystem migration works with real archived content.
+
+        Steps:
+        1. Initialize archivebox
+        2. Archive https://example.com (creates real files)
+        3. Manually set fs_version to 0.8.0
+        4. Trigger migration by saving snapshot
+        5. Verify files are organized correctly
+        """
+        # Step 1: Initialize
+        result = run_archivebox(self.work_dir, ['init'], timeout=45)
+        self.assertEqual(result.returncode, 0, f"Init failed: {result.stderr}")
+
+        # Step 2: Archive example.com with some extractors enabled
+        # Enable a subset of fast extractors for testing
+        result = run_archivebox(
+            self.work_dir,
+            ['add', '--depth=0', 'https://example.com'],
+            timeout=120,
+            env={
+                'SAVE_TITLE': 'True',
+                'SAVE_FAVICON': 'True',
+                'SAVE_WGET': 'True',
+                'SAVE_SCREENSHOT': 'False',  # Disable slow extractors
+                'SAVE_DOM': 'False',
+                'SAVE_SINGLEFILE': 'False',
+                'SAVE_READABILITY': 'False',
+                'SAVE_MERCURY': 'False',
+                'SAVE_PDF': 'False',
+                'SAVE_MEDIA': 'False',
+                'SAVE_ARCHIVE_DOT_ORG': 'False',
+            }
+        )
+        # Note: Add may fail if network is down or extractors fail, but we still want to test
+        # the filesystem migration logic even with partial failures
+
+        # Step 3: Get the snapshot and verify files were created
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+        cursor.execute("SELECT id, url, timestamp, fs_version FROM core_snapshot WHERE url = ?", ('https://example.com',))
+        row = cursor.fetchone()
+        conn.close()
+
+        if not row:
+            self.skipTest("Failed to create snapshot for https://example.com")
+
+        snapshot_id, url, timestamp, fs_version = row
+
+        # Verify initial fs_version is 0.9.0 (current version)
+        self.assertEqual(fs_version, '0.9.0', f"Expected new snapshot to have fs_version='0.9.0', got '{fs_version}'")
+
+        # Verify output directory exists
+        output_dir = self.work_dir / 'archive' / timestamp
+        self.assertTrue(output_dir.exists(), f"Output directory not found: {output_dir}")
+
+        # List all files created (for debugging)
+        files_before = list(output_dir.rglob('*'))
+        files_before_count = len([f for f in files_before if f.is_file()])
+        print(f"\n[*] Files created by archiving: {files_before_count}")
+        for f in sorted(files_before):
+            if f.is_file():
+                print(f"    {f.relative_to(output_dir)}")
+
+        # Step 4: Manually set fs_version to 0.8.0 to simulate old snapshot
+        conn = sqlite3.connect(str(self.db_path))
+        cursor = conn.cursor()
+        cursor.execute("UPDATE core_snapshot SET fs_version = '0.8.0' WHERE id = ?", (snapshot_id,))
+        conn.commit()
+
+        # Verify the update worked
+        cursor.execute("SELECT fs_version FROM core_snapshot WHERE id = ?", (snapshot_id,))
+        updated_version = cursor.fetchone()[0]
+        conn.close()
+        self.assertEqual(updated_version, '0.8.0', "Failed to set fs_version to 0.8.0")
+
+        # Step 5: Trigger migration by running a command that loads and saves the snapshot
+        # We'll use the Python API directly to trigger save()
+        import os
+        import sys
+        import django
+
+        # Setup Django
+        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.settings')
+        os.environ['DATA_DIR'] = str(self.work_dir)
+
+        # Add parent dir to path so we can import archivebox
+        sys.path.insert(0, str(Path(__file__).parent.parent.parent))
+
+        try:
+            django.setup()
+            from core.models import Snapshot
+
+            # Load the snapshot (should trigger migration on save)
+            snapshot = Snapshot.objects.get(url='https://example.com')
+
+            # Verify fs_migration_needed returns True
+            self.assertTrue(snapshot.fs_migration_needed,
+                          f"fs_migration_needed should be True for fs_version='0.8.0'")
+
+            # Save to trigger migration
+            print(f"\n[*] Triggering filesystem migration by saving snapshot...")
+            snapshot.save()
+
+            # Refresh from DB
+            snapshot.refresh_from_db()
+
+            # Verify migration completed
+            self.assertEqual(snapshot.fs_version, '0.9.0',
+                           f"Migration failed: fs_version is still '{snapshot.fs_version}'")
+            self.assertFalse(snapshot.fs_migration_needed,
+                           "fs_migration_needed should be False after migration")
+
+            print(f"[√] Filesystem migration completed: 0.8.0 -> 0.9.0")
+
+        except Exception as e:
+            self.fail(f"Failed to trigger migration via Django: {e}")
+
+        # Step 6: Verify files still exist and are accessible
+        # For 0.8 -> 0.9, the migration is a no-op, so files should be in the same place
+        files_after = list(output_dir.rglob('*'))
+        files_after_count = len([f for f in files_after if f.is_file()])
+
+        print(f"\n[*] Files after migration: {files_after_count}")
+
+        # Verify no files were lost
+        self.assertGreaterEqual(files_after_count, files_before_count,
+                               f"Files were lost during migration: {files_before_count} -> {files_after_count}")
+
+
 if __name__ == '__main__':
     unittest.main()