fix abid generation migrations to be historically consistent

Nick Sweeting
2024-08-20 01:58:19 -07:00
parent 506b3d28d4
commit 9273db528e
6 changed files with 99 additions and 31 deletions

View File

@@ -2,7 +2,7 @@
from django.db import migrations
from datetime import datetime
from abid_utils.abid import abid_from_values
from abid_utils.abid import abid_from_values, DEFAULT_ABID_URI_SALT
def calculate_abid(self):
@@ -41,6 +41,7 @@ def calculate_abid(self):
        uri=uri,
        subtype=subtype,
        rand=rand,
        salt=DEFAULT_ABID_URI_SALT,
    )
    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
    return abid
@@ -65,8 +66,7 @@ def generate_snapshot_abids(apps, schema_editor):
        snapshot.abid = calculate_abid(snapshot)
        snapshot.uuid = snapshot.abid.uuid
        snapshot.id = snapshot.abid.uuid
        snapshot.save(update_fields=["abid", "uuid", "id"])
        snapshot.save(update_fields=["abid", "uuid"])

def generate_archiveresult_abids(apps, schema_editor):
    print(' Generating ArchiveResult.abid values... (may take an hour or longer for large collections...)')
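
Note on this first file: the ABID is now computed with salt=DEFAULT_ABID_URI_SALT passed explicitly, and snapshot.id is no longer overwritten here (only abid and uuid are saved). A minimal sketch of the generation call, assuming abid_from_values() takes the keyword arguments shown above; every field value below is made up:

    from datetime import datetime
    from abid_utils.abid import abid_from_values, DEFAULT_ABID_URI_SALT

    abid = abid_from_values(
        prefix='snp_',
        ts=datetime(2024, 8, 20),                     # stands in for snapshot.added
        uri='https://example.com',                    # stands in for snapshot.url
        subtype='01',
        rand='00000000-0000-0000-0000-000000000000',  # stands in for snapshot.uuid (hypothetical)
        salt=DEFAULT_ABID_URI_SALT,                   # passed explicitly so regenerated ABIDs match the originals
    )
    assert abid.ulid and abid.uuid and abid.typeid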

View File

@@ -4,29 +4,89 @@ from django.db import migrations
from django.db import migrations
from datetime import datetime
from abid_utils.abid import ABID
from abid_utils.abid import ABID, abid_from_values, DEFAULT_ABID_URI_SALT
def calculate_abid(self):
    """
    Return a freshly derived ABID (assembled from attrs defined in ABIDModel.abid_*_src).
    """
    prefix = self.abid_prefix
    ts = eval(self.abid_ts_src)
    uri = eval(self.abid_uri_src)
    subtype = eval(self.abid_subtype_src)
    rand = eval(self.abid_rand_src)

    if (not prefix) or prefix == 'obj_':
        suggested_abid = self.__class__.__name__[:3].lower()
        raise Exception(f'{self.__class__.__name__}.abid_prefix must be defined to calculate ABIDs (suggested: {suggested_abid})')

    if not ts:
        ts = datetime.utcfromtimestamp(0)
        print(f'[!] WARNING: Generating ABID with ts=0000000000 placeholder because {self.__class__.__name__}.abid_ts_src={self.abid_ts_src} is unset!', ts.isoformat())

    if not uri:
        uri = str(self)
        print(f'[!] WARNING: Generating ABID with uri=str(self) placeholder because {self.__class__.__name__}.abid_uri_src={self.abid_uri_src} is unset!', uri)

    if not subtype:
        subtype = self.__class__.__name__
        print(f'[!] WARNING: Generating ABID with subtype={subtype} placeholder because {self.__class__.__name__}.abid_subtype_src={self.abid_subtype_src} is unset!', subtype)

    if not rand:
        rand = getattr(self, 'uuid', None) or getattr(self, 'id', None) or getattr(self, 'pk')
        print(f'[!] WARNING: Generating ABID with rand=self.id placeholder because {self.__class__.__name__}.abid_rand_src={self.abid_rand_src} is unset!', rand)

    abid = abid_from_values(
        prefix=prefix,
        ts=ts,
        uri=uri,
        subtype=subtype,
        rand=rand,
        salt=DEFAULT_ABID_URI_SALT,
    )
    assert abid.ulid and abid.uuid and abid.typeid, f'Failed to calculate {prefix}_ABID for {self.__class__.__name__}'
    return abid

def update_snapshot_ids(apps, schema_editor):
    Snapshot = apps.get_model("core", "Snapshot")
    num_total = Snapshot.objects.all().count()
    print(f' Updating {num_total} Snapshot.id, Snapshot.uuid values in place...')
    for idx, snapshot in enumerate(Snapshot.objects.all().only('abid').iterator()):
        assert snapshot.abid
        snapshot.uuid = ABID.parse(snapshot.abid).uuid
        snapshot.save(update_fields=["uuid"])
        snapshot.abid_prefix = 'snp_'
        snapshot.abid_ts_src = 'self.added'
        snapshot.abid_uri_src = 'self.url'
        snapshot.abid_subtype_src = '"01"'
        snapshot.abid_rand_src = 'self.uuid'
        snapshot.abid = calculate_abid(snapshot)
        snapshot.uuid = snapshot.abid.uuid
        snapshot.save(update_fields=["abid", "uuid"])
        assert str(ABID.parse(snapshot.abid).uuid) == str(snapshot.uuid)
        if idx % 1000 == 0:
            print(f'Migrated {idx}/{num_total} Snapshot objects...')

def update_archiveresult_ids(apps, schema_editor):
    Snapshot = apps.get_model("core", "Snapshot")
    ArchiveResult = apps.get_model("core", "ArchiveResult")
    num_total = ArchiveResult.objects.all().count()
    print(f' Updating {num_total} ArchiveResult.id, ArchiveResult.uuid values in place... (may take an hour or longer for large collections...)')
    for idx, result in enumerate(ArchiveResult.objects.all().only('abid').iterator()):
    for idx, result in enumerate(ArchiveResult.objects.all().only('abid', 'snapshot_id').iterator()):
        assert result.abid
        result.abid_prefix = 'res_'
        result.snapshot = Snapshot.objects.get(pk=result.snapshot_id)
        result.snapshot_added = result.snapshot.added
        result.snapshot_url = result.snapshot.url
        result.abid_ts_src = 'self.snapshot_added'
        result.abid_uri_src = 'self.snapshot_url'
        result.abid_subtype_src = 'self.extractor'
        result.abid_rand_src = 'self.id'
        result.abid = calculate_abid(result)
        result.uuid = result.abid.uuid
        result.uuid = ABID.parse(result.abid).uuid
        result.save(update_fields=["uuid"])
        result.save(update_fields=["abid", "uuid"])
        assert str(ABID.parse(result.abid).uuid) == str(result.uuid)
        if idx % 5000 == 0:
            print(f'Migrated {idx}/{num_total} ArchiveResult objects...')
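
For context, helpers that accept (apps, schema_editor), like update_snapshot_ids and update_archiveresult_ids above, are standard Django data-migration entry points. They are wired into the migration roughly like this (a sketch only; the dependency tuple is a placeholder, not copied from the real file):

    from django.db import migrations

    class Migration(migrations.Migration):

        dependencies = [
            ('core', '0000_previous_migration'),   # placeholder dependency
        ]

        operations = [
            migrations.RunPython(update_snapshot_ids, reverse_code=migrations.RunPython.noop),
            migrations.RunPython(update_archiveresult_ids, reverse_code=migrations.RunPython.noop),
        ]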

View File

@@ -8,9 +8,9 @@ def update_archiveresult_snapshot_ids(apps, schema_editor):
Snapshot = apps.get_model("core", "Snapshot")
num_total = ArchiveResult.objects.all().count()
print(f' Updating {num_total} ArchiveResult.snapshot_id values in place... (may take an hour or longer for large collections...)')
for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator()):
for idx, result in enumerate(ArchiveResult.objects.all().only('snapshot_old_id').iterator(chunk_size=5000)):
assert result.snapshot_old_id
snapshot = Snapshot.objects.get(old_id=result.snapshot_old_id)
snapshot = Snapshot.objects.only('id').get(old_id=result.snapshot_old_id)
result.snapshot_id = snapshot.id
result.save(update_fields=["snapshot_id"])
assert str(result.snapshot_id) == str(snapshot.id)
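
The last hunk is purely a performance tweak for large collections: .only('snapshot_old_id') defers every other column, .iterator(chunk_size=5000) streams rows from the database cursor instead of caching the whole queryset in memory, and .only('id') keeps the per-row Snapshot lookup narrow. A rough sketch of what the iterator changes (illustrative, using the same models as above):

    qs = ArchiveResult.objects.only('snapshot_old_id')

    rows = list(qs)                            # loads and caches every row at once; heavy for millions of results

    for row in qs.iterator(chunk_size=5000):   # streams 5000 rows at a time without caching the queryset
        pass                                   # process each row with bounded memory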