remove model health stats from models that don't need them

This commit is contained in:
Nick Sweeting
2025-12-31 18:01:53 -08:00
parent e903fa1d2b
commit 6fadcf5168
5 changed files with 46 additions and 1016 deletions

View File

@@ -143,6 +143,11 @@ def upgrade_core_tables(apps, schema_editor):
if has_added and not has_bookmarked_at:
# Migrating from v0.7.2 (has added/updated, no bookmarked_at/created_at/modified_at)
print('Migrating Snapshot from v0.7.2 schema...')
# Debug: Check what data we're about to copy
cursor.execute("SELECT id, added, updated FROM core_snapshot LIMIT 3")
sample_data = cursor.fetchall()
print(f'DEBUG 0023: Sample Snapshot data before migration: {sample_data}')
cursor.execute("""
INSERT OR IGNORE INTO core_snapshot_new (
id, url, timestamp, title, bookmarked_at, created_at, modified_at
@@ -154,6 +159,11 @@ def upgrade_core_tables(apps, schema_editor):
COALESCE(updated, added, CURRENT_TIMESTAMP) as modified_at
FROM core_snapshot;
""")
# Debug: Check what was inserted
cursor.execute("SELECT id, bookmarked_at, modified_at FROM core_snapshot_new LIMIT 3")
inserted_data = cursor.fetchall()
print(f'DEBUG 0023: Sample Snapshot data after INSERT: {inserted_data}')
elif has_bookmarked_at and not has_added:
# Migrating from v0.8.6rc0 (already has bookmarked_at/created_at/modified_at)
print('Migrating Snapshot from v0.8.6rc0 schema...')
@@ -298,12 +308,15 @@ class Migration(migrations.Migration):
),
],
state_operations=[
# Remove old ArchiveResult fields
migrations.RemoveField(model_name='archiveresult', name='extractor'),
migrations.RemoveField(model_name='archiveresult', name='output'),
# Remove old Snapshot fields
# NOTE: We do NOT remove extractor/output here for ArchiveResult!
# They are still in the database and will be removed by migration 0025
# after copying their data to the new field names (plugin, output_str).
# However, for Snapshot, we DO remove added/updated here because
# the database operations above already renamed them to bookmarked_at/created_at/modified_at.
migrations.RemoveField(model_name='snapshot', name='added'),
migrations.RemoveField(model_name='snapshot', name='updated'),
# SnapshotTag table already exists from v0.7.2, just declare it in state
migrations.CreateModel(
name='SnapshotTag',

View File

@@ -25,7 +25,7 @@ def copy_old_fields_to_new(apps, schema_editor):
count = cursor.fetchone()[0]
print(f'DEBUG 0025: Updated {count} rows with plugin data')
else:
print(f'DEBUG 0025: NOT copying - extractor in cols: {extractor" in cols}, plugin in cols: {"plugin" in cols}')
print(f'DEBUG 0025: NOT copying - extractor in cols: {"extractor" in cols}, plugin in cols: {"plugin" in cols}')
if 'output' in cols and 'output_str' in cols:
# Copy output -> output_str
@@ -239,6 +239,16 @@ class Migration(migrations.Migration):
copy_old_fields_to_new,
reverse_code=migrations.RunPython.noop,
),
# Now remove the old ArchiveResult fields after data has been copied
migrations.RemoveField(
model_name='archiveresult',
name='extractor',
),
migrations.RemoveField(
model_name='archiveresult',
name='output',
),
# NOTE: Snapshot's added/updated fields were already removed by migration 0023
migrations.AlterField(
model_name='archiveresult',
name='end_ts',

View File

@@ -29,7 +29,7 @@ from archivebox.hooks import (
get_plugins, get_plugin_name, get_plugin_icon,
)
from archivebox.base_models.models import (
ModelWithUUID, ModelWithSerializers, ModelWithOutputDir,
ModelWithUUID, ModelWithOutputDir,
ModelWithConfig, ModelWithNotes, ModelWithHealthStats,
get_or_create_system_user_pk,
)
@@ -40,7 +40,7 @@ from archivebox.machine.models import NetworkInterface, Binary
class Tag(ModelWithSerializers):
class Tag(ModelWithUUID):
# Keep AutoField for compatibility with main branch migrations
# Don't use UUIDField here - requires complex FK transformation
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
@@ -2254,7 +2254,7 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
)
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithStateMachine):
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
@@ -2551,11 +2551,20 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
pass
def cascade_health_update(self, success: bool):
"""Update health stats for self, parent Snapshot, and grandparent Crawl."""
self.increment_health_stats(success)
"""Update health stats for parent Snapshot, Crawl, and execution infrastructure (Binary, Machine, NetworkInterface)."""
# Update archival hierarchy
self.snapshot.increment_health_stats(success)
self.snapshot.crawl.increment_health_stats(success)
# Update execution infrastructure
if self.binary:
self.binary.increment_health_stats(success)
if self.binary.machine:
self.binary.machine.increment_health_stats(success)
if self.iface:
self.iface.increment_health_stats(success)
def run(self):
"""
Execute this ArchiveResult's hook and update status.

View File

@@ -16,14 +16,14 @@ from statemachine import State, registry
from rich import print
from archivebox.config import CONSTANTS
from archivebox.base_models.models import ModelWithSerializers, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
from archivebox.base_models.models import ModelWithUUID, ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, get_or_create_system_user_pk
from archivebox.workers.models import ModelWithStateMachine, BaseStateMachine
if TYPE_CHECKING:
from archivebox.core.models import Snapshot, ArchiveResult
class CrawlSchedule(ModelWithSerializers, ModelWithNotes, ModelWithHealthStats):
class CrawlSchedule(ModelWithUUID, ModelWithNotes):
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
created_by = models.ForeignKey(settings.AUTH_USER_MODEL, on_delete=models.CASCADE, default=get_or_create_system_user_pk, null=False)
@@ -197,9 +197,9 @@ class Crawl(ModelWithOutputDir, ModelWithConfig, ModelWithHealthStats, ModelWith
@property
def output_dir_parent(self) -> str:
"""Construct parent directory: users/{user_id}/crawls/{YYYYMMDD}"""
"""Construct parent directory: users/{username}/crawls/{YYYYMMDD}"""
date_str = self.created_at.strftime('%Y%m%d')
return f'users/{self.created_by_id}/crawls/{date_str}'
return f'users/{self.created_by.username}/crawls/{date_str}'
@property
def output_dir_name(self) -> str: