fix abid generation migrations to be historically consistent

Nick Sweeting
2024-08-20 01:58:19 -07:00
parent 506b3d28d4
commit 9273db528e
6 changed files with 99 additions and 31 deletions

View File

@@ -2,7 +2,7 @@
from django.db import migrations
from datetime import datetime
from abid_utils.abid import abid_from_values
from abid_utils.abid import abid_from_values, DEFAULT_ABID_URI_SALT
def calculate_abid(self):
@@ -41,6 +41,7 @@ def calculate_abid(self):
        uri=uri,
        subtype=subtype,
        rand=rand,
        salt=DEFAULT_ABID_URI_SALT,
    )
    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
    return abid
@@ -65,8 +66,7 @@ def generate_snapshot_abids(apps, schema_editor):
        snapshot.abid = calculate_abid(snapshot)
        snapshot.uuid = snapshot.abid.uuid
        snapshot.id = snapshot.abid.uuid
        snapshot.save(update_fields=["abid", "uuid", "id"])
        snapshot.save(update_fields=["abid", "uuid"])

def generate_archiveresult_abids(apps, schema_editor):
    print(' Generating ArchiveResult.abid values... (may take an hour or longer for large collections...)')
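
Note on this first file: the ABID is now computed with salt=DEFAULT_ABID_URI_SALT passed explicitly, and snapshot.id is no longer overwritten here (only abid and uuid are saved). A minimal sketch of the generation call, assuming abid_from_values() takes the keyword arguments shown above; every field value below is made up:

    from datetime import datetime
    from abid_utils.abid import abid_from_values, DEFAULT_ABID_URI_SALT

    abid = abid_from_values(
        prefix='snp_',
        ts=datetime(2024, 8, 20),                     # stands in for snapshot.added
        uri='https://example.com',                    # stands in for snapshot.url
        subtype='01',
        rand='00000000-0000-0000-0000-000000000000',  # stands in for snapshot.uuid (hypothetical)
        salt=DEFAULT_ABID_URI_SALT,                   # passed explicitly so regenerated ABIDs match the originals
    )
    assert abid.ulid and abid.uuid and abid.typeid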

View File

@@ -4,29 +4,89 @@ from django.db import migrations
from django.db import migrations
from datetime import datetime
from abid_utils.abid import ABID
from abid_utils.abid import ABID, abid_from_values, DEFAULT_ABID_URI_SALT
def calculate_abid(self):
    """
    Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
    """
    prefix = self.abid_prefix
    ts = eval(self.abid_ts_src)
    uri = eval(self.abid_uri_src)
    subtype = eval(self.abid_subtype_src)
    rand = eval(self.abid_rand_src)

    if (not prefix) or prefix == 'obj_':
        suggested_abid = self.__class__.__name__[:3].lower()
        raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')

    if not ts:
        ts = datetime.utcfromtimestamp(0)
        print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())

    if not uri:
        uri = str(self)
        print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)

    if not subtype:
        subtype = self.__class__.__name__
        print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)

    if not rand:
        rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
        print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)

    abid = abid_from_values(
        prefix=prefix,
        ts=ts,
        uri=uri,
        subtype=subtype,
        rand=rand,
        salt=DEFAULT_ABID_URI_SALT,
    )
    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
    return abid

def update_snapshot_ids(apps, schema_editor):
    Snapshot = apps.get_model("core", "Snapshot")
    num_total = Snapshot.objects.all().count()
    print(f' Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()):
        assert snapshot.abid
        snapshot.uuid = ABID.parse(snapshot.abid).uuid
        snapshot.save(update_fields=["uuid"])
        snapshot.abid_prefix = 'snp_'
        snapshot.abid_ts_src = 'self.added'
        snapshot.abid_uri_src = 'self.url'
        snapshot.abid_subtype_src = '"01"'
        snapshot.abid_rand_src = 'self.uuid'
        snapshot.abid = calculate_abid(snapshot)
        snapshot.uuid = snapshot.abid.uuid
        snapshot.save(update_fields=["abid", "uuid"])
        assert str(ABID.parse(snapshot.abid).uuid) == str(snapshot.uuid)
        if idx % 1000 == 0:
            print(f'Migrated {idx}/{num_total} Snapshot objects...')

def update_archiveresult_ids(apps, schema_editor):
    Snapshot = apps.get_model("core", "Snapshot")
    ArchiveResult = apps.get_model("core", "ArchiveResult")
    num_total = ArchiveResult.objects.all().count()
    print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()):
    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()):
        assert result.abid
        result.abid_prefix = 'res_'
        result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
        result.snapshot_added = result.snapshot.added
        result.snapshot_url = result.snapshot.url
        result.abid_ts_src = 'self.snapshot_added'
        result.abid_uri_src = 'self.snapshot_url'
        result.abid_subtype_src = 'self.extractor'
        result.abid_rand_src = 'self.id'
        result.abid = calculate_abid(result)
        result.uuid = result.abid.uuid
        result.uuid = ABID.parse(result.abid).uuid
        result.save(update_fields=["uuid"])
        result.save(update_fields=["abid", "uuid"])
        assert str(ABID.parse(result.abid).uuid) == str(result.uuid)
        if idx % 5000 == 0:
            print(f'Migrated {idx}/{num_total} ArchiveResult objects...')
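
For context, helpers that accept (apps, schema_editor), like update_snapshot_ids and update_archiveresult_ids above, are standard Django data-migration entry points. They are wired into the migration roughly like this (a sketch only; the dependency tuple is a placeholder, not copied from the real file):

    from django.db import migrations

    class Migration(migrations.Migration):

        dependencies = [
            ('core', '0000_previous_migration'),   # placeholder dependency
        ]

        operations = [
            migrations.RunPython(update_snapshot_ids, reverse_code=migrations.RunPython.noop),
            migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop),
        ]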

View File

@@ -8,9 +8,9 @@ def update_archiveresult_snapshot_ids(apps, schema_editor):
Snapshot = apps.get_model("core", "Snapshot")
num_total = ArchiveResult.objects.all().count()
print(f' Updating {num_total} ArchiveResult.snapshot_id values in place... (may take an hour or longer for large collections...)')
for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator()):
for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator(chunk_size=5000)):
assert result.snapshot_old_id
snapshot = Snapshot.objects.get(old_id=result.snapshot_old_id)
snapshot = Snapshot.objects.only('id').get(old_id=result.snapshot_old_id)
result.snapshot_id = snapshot.id
result.save(update_fields=["snapshot_id"])
assert str(result.snapshot_id) == str(snapshot.id)
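
The last hunk is purely a performance tweak for large collections: .only('snapshot_old_id') defers every other column, .iterator(chunk_size=5000) streams rows from the database cursor instead of caching the whole queryset in memory, and .only('id') keeps the per-row Snapshot lookup narrow. A rough sketch of what the iterator changes (illustrative, using the same models as above):

    qs = ArchiveResult.objects.only('snapshot_old_id')

    rows = list(qs)                            # loads and caches every row at once; heavy for millions of results

    for row in qs.iterator(chunk_size=5000):   # streams 5000 rows at a time without caching the queryset
        pass                                   # process each row with bounded memory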