Add new JSONL CLI interface

This commit is contained in:
Nick Sweeting
2025-12-30 16:12:53 -08:00
parent ba8c28a866
commit dd2302ad92
37 changed files with 2919 additions and 1602 deletions

View File

@@ -158,7 +158,7 @@ class AddLinkForm(forms.Form):
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
}
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
extensions = {'twocaptcha', 'istilldontcareaboutcookies', 'ublock'}
# Populate plugin field choices
self.fields['chrome_plugins'].choices = [

View File

@@ -1,6 +1,6 @@
__package__ = 'archivebox.core'
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING, Iterator, Set
from archivebox.uuid_compat import uuid7
from datetime import datetime, timedelta
from django_stubs_ext.db.models import TypedModelMeta
@@ -41,6 +41,8 @@ from archivebox.machine.models import NetworkInterface, Binary
class Tag(ModelWithSerializers):
JSONL_TYPE = 'Tag'
# Keep AutoField for compatibility with main branch migrations
# Don't use UUIDField here - requires complex FK transformation
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
@@ -91,26 +93,66 @@ class Tag(ModelWithSerializers):
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Tag model instance to a JSONL record.
Convert Tag model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
'type': 'Tag',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'name': self.name,
'slug': self.slug,
}
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
def to_jsonl(self, seen: Optional[Set[tuple]] = None, **kwargs) -> Iterator[dict]:
    """
    Yield this Tag as a single JSON-serializable record.

    Tag is a leaf node in the export graph: it emits no child records,
    and **kwargs is accepted only for signature parity with the
    to_jsonl() methods on Snapshot/ArchiveResult.

    Note: unlike Snapshot/ArchiveResult, this method does NOT create a
    `seen` set when one is not passed — dedup only happens when the
    caller supplies one.

    Args:
        seen: Set of (type, id) tuples already emitted (for deduplication);
            mutated in place when provided.
        **kwargs: Ignored (Tag has no children to pass options to).

    Yields:
        dict: JSON-serializable record for this tag (see to_json()).
    """
    if seen is not None:
        # Skip emitting if an earlier caller in the walk already wrote us out.
        key = (self.JSONL_TYPE, str(self.id))
        if key in seen:
            return
        seen.add(key)
    yield self.to_json()
@classmethod
def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['Tag']:
    """
    Build/update Tag instances from an iterable of JSONL records.

    Records whose 'type' field is present but not 'Tag' are ignored;
    records with no 'type' field are treated as Tag records.

    Args:
        records: Iterable of dicts (parsed JSONL records).
        overrides: Optional dict with 'snapshot' to auto-attach tags.

    Returns:
        List of Tag instances (records that produced no instance are dropped).
    """
    # Keep only records addressed to this model (missing type defaults to Tag).
    matching = (
        rec for rec in records
        if rec.get('type', cls.JSONL_TYPE) == cls.JSONL_TYPE
    )
    candidates = (cls.from_json(rec, overrides=overrides) for rec in matching)
    return [tag for tag in candidates if tag]
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'Tag | None':
"""
Create/update a single Tag from a JSON record dict.
Args:
record: Dict with 'name' field
overrides: Optional dict with 'snapshot' to auto-attach tag
Returns:
@@ -289,6 +331,8 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
JSONL_TYPE = 'Snapshot'
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -968,38 +1012,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
Each line is a JSON record with a 'type' field:
- Snapshot: snapshot metadata (crawl_id, url, tags, etc.)
- ArchiveResult: extractor results (plugin, status, output, etc.)
- Binary: binary info used for the extraction
- Process: process execution details (cmd, exit_code, timing, etc.)
- ArchiveResult: extractor results (plugin, status, output, etc.)
"""
import json
index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
index_path.parent.mkdir(parents=True, exist_ok=True)
# Track unique binaries and processes to avoid duplicates
binaries_seen = set()
processes_seen = set()
with open(index_path, 'w') as f:
# Write Snapshot record first (to_jsonl includes crawl_id, fs_version)
f.write(json.dumps(self.to_jsonl()) + '\n')
# Write ArchiveResult records with their associated Binary and Process
# Use select_related to optimize queries
for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'):
# Write Binary record if not already written
if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen:
binaries_seen.add(ar.process.binary_id)
f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n')
# Write Process record if not already written
if ar.process and ar.process_id not in processes_seen:
processes_seen.add(ar.process_id)
f.write(json.dumps(ar.process.to_jsonl()) + '\n')
# Write ArchiveResult record
f.write(json.dumps(ar.to_jsonl()) + '\n')
for record in self.to_jsonl():
f.write(json.dumps(record) + '\n')
def read_index_jsonl(self) -> dict:
"""
@@ -1420,14 +1444,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return False
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Snapshot model instance to a JSONL record.
Convert Snapshot model instance to a JSON-serializable dict.
Includes all fields needed to fully reconstruct/identify this snapshot.
"""
from archivebox.config import VERSION
return {
'type': 'Snapshot',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'crawl_id': str(self.crawl_id),
@@ -1442,12 +1466,68 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'fs_version': self.fs_version,
}
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
def to_jsonl(self, seen: Optional[Set[tuple]] = None, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]:
    """
    Yield this Snapshot and optionally related objects as JSON records.

    Uses select_related for efficient querying of each ArchiveResult's
    Process/Binary. Deduplicates automatically via the shared `seen` set,
    which is threaded down through children.

    Args:
        seen: Set of (type, id) tuples already emitted (for deduplication);
            created here if not provided, and mutated in place.
        archiveresult: Include related ArchiveResults (default: True).
        process: Include Process for each ArchiveResult (default: True).
        binary: Include Binary for each Process (default: True).
        machine: Include Machine for each Process (default: False).
        iface: Include NetworkInterface for each Process (default: False).
        **kwargs: Additional options passed down to children.

    Yields:
        dict: JSON-serializable records (this Snapshot first, then children).
    """
    if seen is None:
        seen = set()
    key = (self.JSONL_TYPE, str(self.id))
    if key in seen:
        # Already emitted by an earlier caller in the walk — emit nothing.
        return
    seen.add(key)
    yield self.to_json()
    if archiveresult:
        # Use select_related to optimize queries
        for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'):
            yield from ar.to_jsonl(seen=seen, process=process, binary=binary, machine=machine, iface=iface, **kwargs)
@classmethod
def from_jsonl(cls, records, overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> list['Snapshot']:
    """
    Build/update Snapshot instances from an iterable of JSONL records.

    Records whose 'type' field is present but not 'Snapshot' are skipped;
    records with no 'type' field are treated as Snapshot records.

    Args:
        records: Iterable of dicts (parsed JSONL records).
        overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'.
        queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True).

    Returns:
        List of Snapshot instances (records that produced no instance are dropped).
    """
    snapshots: list['Snapshot'] = []
    for rec in records:
        # Missing 'type' defaults to Snapshot for backwards compatibility.
        if rec.get('type', cls.JSONL_TYPE) != cls.JSONL_TYPE:
            continue
        snap = cls.from_json(rec, overrides=overrides, queue_for_extraction=queue_for_extraction)
        if snap:
            snapshots.append(snap)
    return snapshots
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> 'Snapshot | None':
"""
Create/update a single Snapshot from a JSON record dict.
Handles:
- ID-based patching: {"id": "...", "title": "new title"}
- URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
- Auto-creates Crawl if not provided
@@ -2054,8 +2134,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
result['canonical'] = self.canonical_outputs()
return result
def to_json(self, indent: int = 4) -> str:
"""Convert to JSON string"""
def to_json_str(self, indent: int = 4) -> str:
    """Convert to JSON string for file output.

    Serializes self.to_dict(extended=True) via the module-level to_json()
    helper. Renamed from to_json() to avoid clashing with the dict-returning
    to_json() used by the JSONL export path.

    Args:
        indent: Indentation width passed through to the serializer (default: 4).

    Returns:
        str: JSON text of the extended dict representation.
    """
    return to_json(self.to_dict(extended=True), indent=indent)
def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
@@ -2203,6 +2283,8 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
JSONL_TYPE = 'ArchiveResult'
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
@@ -2274,13 +2356,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
return self.snapshot.crawl.created_by
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert ArchiveResult model instance to a JSONL record.
Convert ArchiveResult model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
record = {
'type': 'ArchiveResult',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'snapshot_id': str(self.snapshot_id),
@@ -2308,6 +2390,31 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
record['process_id'] = str(self.process_id)
return record
def to_jsonl(self, seen: Set[tuple] = None, process: bool = True, **kwargs) -> Iterator[dict]:
    """
    Yield this ArchiveResult and optionally related objects as JSON records.

    Args:
        seen: Set of (type, id) tuples already emitted (for deduplication);
            created here if not provided, and mutated in place.
        process: Include related Process and its children (default: True).
        **kwargs: Passed to Process.to_jsonl() (e.g., binary=True, machine=False).

    Yields:
        dict: JSON-serializable records (this ArchiveResult first, then its Process).
    """
    emitted = set() if seen is None else seen
    dedupe_key = (self.JSONL_TYPE, str(self.id))
    if dedupe_key in emitted:
        # Already written out earlier in this export walk.
        return
    emitted.add(dedupe_key)
    yield self.to_json()
    if process and self.process:
        yield from self.process.to_jsonl(seen=emitted, **kwargs)
def save(self, *args, **kwargs):
is_new = self._state.adding