remove model health stats from models that don't need them

2026-01-03 01:15:57 +10:00 · 2025-12-31 18:01:53 -08:00
parent e903fa1d2b
commit 6fadcf5168
5 changed files with 46 additions and 1016 deletions
--- a/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
+++ b/archivebox/core/migrations/0023_upgrade_to_0_9_0.py
@@ -143,6 +143,11 @@ def upgrade_core_tables(apps, schema_editor):
            if has_added and not has_bookmarked_at:
                # Migrating from v0.7.2 (has added/updated, no bookmarked_at/created_at/modified_at)
                print('Migrating Snapshot from v0.7.2 schema...')
+                # Debug: Check what data we're about to copy
+                cursor.execute("SELECT id, added, updated FROM core_snapshot LIMIT 3")
+                sample_data = cursor.fetchall()
+                print(f'DEBUG 0023: Sample Snapshot data before migration: {sample_data}')
+
                cursor.execute("""
                    INSERT OR IGNORE INTO core_snapshot_new (
                        id, url, timestamp, title, bookmarked_at, created_at, modified_at
@@ -154,6 +159,11 @@ def upgrade_core_tables(apps, schema_editor):
                        COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at
                    FROM core_snapshot;
                """)
+
+                # Debug: Check what was inserted
+                cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot_new LIMIT 3")
+                inserted_data = cursor.fetchall()
+                print(f'DEBUG 0023: Sample Snapshot data after INSERT: {inserted_data}')
            elif has_bookmarked_at and not has_added:
                # Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
                print('Migrating Snapshot from v0.8.6rc0 schema...')
@@ -298,12 +308,15 @@ class Migration(migrations.Migration):
                ),
            ],
            state_operations=[
-                # Remove old ArchiveResult fields
-                migrations.RemoveField(model_name='archiveresult', name='extractor'),
-                migrations.RemoveField(model_name='archiveresult', name='output'),
-                # Remove old Snapshot fields
+                # NOTE: We do NOT remove extractor/output here for ArchiveResult!
+                # They are still in the database and will be removed by migration 0025
+                # after copying their data to the new field names (plugin, output_str).
+
+                # However, for Snapshot, we DO remove added/updated here because
+                # the database operations above already copied their values into bookmarked_at/created_at/modified_at.
                migrations.RemoveField(model_name='snapshot', name='added'),
                migrations.RemoveField(model_name='snapshot', name='updated'),
+
                # SnapshotTag table already exists from v0.7.2, just declare it in state
                migrations.CreateModel(
                    name='SnapshotTag',
--- a/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
+++ b/archivebox/core/migrations/0025_alter_archiveresult_options_alter_snapshot_options_and_more.py
@@ -25,7 +25,7 @@ def copy_old_fields_to_new(apps, schema_editor):
        count = cursor.fetchone()[0]
        print(f'DEBUG 0025: Updated {count} rows with plugin data')
    else:
-        print(f'DEBUG 0025: NOT copying - extractor in cols: {extractor" in cols}, plugin in cols: {"plugin" in cols}')
+        print(f'DEBUG 0025: NOT copying - extractor in cols: {"extractor" in cols}, plugin in cols: {"plugin" in cols}')

    if 'output' in cols and 'output_str' in cols:
        # Copy output -> output_str
@@ -239,6 +239,16 @@ class Migration(migrations.Migration):
            copy_old_fields_to_new,
            reverse_code=migrations.RunPython.noop,
        ),
+        # Now remove the old ArchiveResult fields after data has been copied
+        migrations.RemoveField(
+            model_name='archiveresult',
+            name='extractor',
+        ),
+        migrations.RemoveField(
+            model_name='archiveresult',
+            name='output',
+        ),
+        # NOTE: Snapshot's added/updated fields were already removed by migration 0023
        migrations.AlterField(
            model_name='archiveresult',
            name='end_ts',
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -29,7 +29,7 @@ from archivebox.hooks import (
    get_plugins, get_plugin_name, get_plugin_icon,
 )
 from archivebox.base_models.models import (
-    ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
+    ModelWithUUID, ModelWithOutputDir,
    ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
    get_or_create_system_user_pk,
 )
@@ -40,7 +40,7 @@ from archivebox.machine.models import NetworkInterface, Binary



-class Tag(ModelWithSerializers):
+class Tag(ModelWithUUID):
    # Keep AutoField for compatibility with main branch migrations
    # Don't use UUIDField here - requires complex FK transformation
    id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
@@ -2254,7 +2254,7 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
        )


-class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
+class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
    class StatusChoices(models.TextChoices):
        QUEUED = 'queued', 'Queued'
        STARTED = 'started', 'Started'
@@ -2551,11 +2551,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
        pass

    def cascade_health_update(self, success: bool):
-        """Update health stats for self, parent Snapshot, and grandparent Crawl."""
-        self.increment_health_stats(success)
+        """Update health stats for parent Snapshot, Crawl, and execution infrastructure (Binary, Machine, NetworkInterface)."""
+        # Update archival hierarchy
        self.snapshot.increment_health_stats(success)
        self.snapshot.crawl.increment_health_stats(success)

+        # Update execution infrastructure
+        if self.binary:
+            self.binary.increment_health_stats(success)
+            if self.binary.machine:
+                self.binary.machine.increment_health_stats(success)
+
+        if self.iface:
+            self.iface.increment_health_stats(success)
+
    def run(self):
        """
        Execute this ArchiveResult's hook and update status.
--- a/archivebox/crawls/models.py
+++ b/archivebox/crawls/models.py
@@ -16,14 +16,14 @@ from statemachine import State, registry
 from rich import print

 from archivebox.config import CONSTANTS
-from archivebox.base_models.models import ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
+from archivebox.base_models.models import ModelWithUUID, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
 from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine

 if TYPE_CHECKING:
    from archivebox.core.models import Snapshot, ArchiveResult


-class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
+class CrawlSchedule(ModelWithUUID, ModelWithNotes):
    id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
    created_at = models.DateTimeField(default=timezone.now, db_index=True)
    created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
@@ -197,9 +197,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith

    @property
    def output_dir_parent(self) -> str:
-        """Construct parent directory: users/{user_id}/crawls/{YYYYMMDD}"""
+        """Construct parent directory: users/{username}/crawls/{YYYYMMDD}"""
        date_str = self.created_at.strftime('%Y%m%d')
-        return f'users/{self.created_by_id}/crawls/{date_str}'
+        return f'users/{self.created_by.username}/crawls/{date_str}'

    @property
    def output_dir_name(self) -> str:
--- a/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py.bak
+++ b/archivebox/plugins/parse_rss_urls/tests/test_parse_rss_urls_comprehensive.py.bak