improve jsonl logic

2026-01-04 01:46:54 +10:00 · 2025-12-30 12:43:36 -08:00
parent 08366cfa46
commit 1b49ea9a0e
1 changed files with 23 additions and 10 deletions
--- a/archivebox/core/models.py
+++ b/archivebox/core/models.py
@@ -341,6 +341,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        """Convenience property to access the user who created this snapshot via its crawl."""
        return self.crawl.created_by

+    @property
+    def process_set(self):
+        """Return a QuerySet of Process rows whose ArchiveResult belongs to this snapshot (matched on snapshot_id)."""
+        from archivebox.machine.models import Process
+        return Process.objects.filter(archiveresult__snapshot_id=self.id)
+
+    @property
+    def binary_set(self):
+        """Return a de-duplicated QuerySet of Binary rows used by processes behind this snapshot's ArchiveResults."""
+        from archivebox.machine.models import Binary
+        return Binary.objects.filter(process__archiveresult__snapshot_id=self.id).distinct()
+
    def save(self, *args, **kwargs):
        is_new = self._state.adding
        if not self.bookmarked_at:
@@ -965,19 +977,17 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
        index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
        index_path.parent.mkdir(parents=True, exist_ok=True)

-        # Collect unique binaries and processes from archive results
+        # Track unique binaries and processes to avoid duplicates
        binaries_seen = set()
        processes_seen = set()

        with open(index_path, 'w') as f:
-            # Write Snapshot record first
-            snapshot_record = self.to_jsonl()
-            snapshot_record['crawl_id'] = str(self.crawl_id) if self.crawl_id else None
-            snapshot_record['fs_version'] = self.fs_version
-            f.write(json.dumps(snapshot_record) + '\n')
+            # Write Snapshot record first (to_jsonl includes crawl_id, fs_version)
+            f.write(json.dumps(self.to_jsonl()) + '\n')

            # Write ArchiveResult records with their associated Binary and Process
-            for ar in ArchiveResult.objects.filter(snapshot=self).order_by('start_ts'):
+            # Use select_related to optimize queries
+            for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'):
                # Write Binary record if not already written
                if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen:
                    binaries_seen.add(ar.process.binary_id)
@@ -1413,20 +1423,23 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
    def to_jsonl(self) -> dict:
        """
        Convert Snapshot model instance to a JSONL record.
+        Includes crawl_id and fs_version; crawl_id is None (JSON null) when unset, never the string 'None'.
        """
        from archivebox.config import VERSION
        return {
            'type': 'Snapshot',
            'schema_version': VERSION,
            'id': str(self.id),
+            'crawl_id': str(self.crawl_id) if self.crawl_id else None,  # same guard the index writer used before
            'url': self.url,
            'title': self.title,
-            'tags': self.tags_str() if hasattr(self, 'tags_str') else '',
+            'tags': self.tags_str(),
            'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
            'created_at': self.created_at.isoformat() if self.created_at else None,
            'timestamp': self.timestamp,
-            'depth': getattr(self, 'depth', 0),
-            'status': self.status if hasattr(self, 'status') else None,
+            'depth': self.depth,
+            'status': self.status,
+            'fs_version': self.fs_version,
        }

    @staticmethod