Logging and admin UI improvements

This commit is contained in:
Nick Sweeting
2025-12-25 01:10:41 -08:00
parent 8218675ed4
commit 866f993f26
60 changed files with 2932 additions and 497 deletions

View File

@@ -60,7 +60,8 @@ class Tag(ModelWithSerializers):
return self.name
def save(self, *args, **kwargs):
if self._state.adding:
is_new = self._state.adding
if is_new:
self.slug = slugify(self.name)
existing = set(Tag.objects.filter(slug__startswith=self.slug).values_list("slug", flat=True))
i = None
@@ -72,6 +73,19 @@ class Tag(ModelWithSerializers):
i = (i or 0) + 1
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Tag',
indent_level=0,
metadata={
'id': self.id,
'name': self.name,
'slug': self.slug,
},
)
@property
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
@@ -241,12 +255,13 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
if tag.strip()
))
try:
snapshot = self.get(url=url)
# Get most recent snapshot with this URL (URLs can exist in multiple crawls)
snapshot = self.filter(url=url).order_by('-created_at').first()
if snapshot:
if title and (not snapshot.title or len(title) > len(snapshot.title or '')):
snapshot.title = title
snapshot.save(update_fields=['title', 'modified_at'])
except self.model.DoesNotExist:
else:
if timestamp:
while self.filter(timestamp=timestamp).exists():
timestamp = str(float(timestamp) + 1.0)
@@ -284,7 +299,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
url = models.URLField(unique=True, db_index=True)
url = models.URLField(unique=False, db_index=True) # URLs can appear in multiple crawls
timestamp = models.CharField(max_length=32, unique=True, db_index=True, editable=False)
bookmarked_at = models.DateTimeField(default=timezone.now, db_index=True)
crawl: Crawl = models.ForeignKey(Crawl, on_delete=models.CASCADE, default=None, null=True, blank=True, related_name='snapshot_set', db_index=True) # type: ignore
@@ -313,11 +328,16 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
class Meta(TypedModelMeta):
verbose_name = "Snapshot"
verbose_name_plural = "Snapshots"
constraints = [
# Allow same URL in different crawls, but not duplicates within same crawl
models.UniqueConstraint(fields=['url', 'crawl'], name='unique_url_per_crawl'),
]
def __str__(self):
return f'[{self.id}] {self.url[:64]}'
def save(self, *args, **kwargs):
is_new = self._state.adding
if not self.bookmarked_at:
self.bookmarked_at = self.created_at or timezone.now()
if not self.timestamp:
@@ -327,6 +347,21 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
self.crawl.urls += f'\n{self.url}'
self.crawl.save()
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created Snapshot',
indent_level=2,
url=self.url,
metadata={
'id': str(self.id),
'crawl_id': str(self.crawl_id) if self.crawl_id else None,
'depth': self.depth,
'status': self.status,
},
)
def output_dir_parent(self) -> str:
return 'archive'
@@ -807,6 +842,24 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
def __str__(self):
return f'[{self.id}] {self.snapshot.url[:64]} -> {self.extractor}'
def save(self, *args, **kwargs):
is_new = self._state.adding
super().save(*args, **kwargs)
if is_new:
from archivebox.misc.logging_util import log_worker_event
log_worker_event(
worker_type='DB',
event='Created ArchiveResult',
indent_level=3,
extractor=self.extractor,
metadata={
'id': str(self.id),
'snapshot_id': str(self.snapshot_id),
'snapshot_url': str(self.snapshot.url)[:64],
'status': self.status,
},
)
@cached_property
def snapshot_dir(self):
return Path(self.snapshot.output_dir)
@@ -879,7 +932,6 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
from django.utils import timezone
from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR, run_hook
extractor_dir = Path(self.snapshot.output_dir) / self.extractor
config_objects = [self.snapshot.crawl, self.snapshot] if self.snapshot.crawl else [self.snapshot]
# Find hook for this extractor
@@ -899,6 +951,10 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
self.save()
return
# Use plugin directory name instead of extractor name (removes numeric prefix)
plugin_name = hook.parent.name
extractor_dir = Path(self.snapshot.output_dir) / plugin_name
# Run the hook
start_ts = timezone.now()
result = run_hook(