Add new JSONL CLI interface

This commit is contained in:
Nick Sweeting
2025-12-30 16:12:53 -08:00
parent ba8c28a866
commit dd2302ad92
37 changed files with 2919 additions and 1602 deletions

View File

@@ -158,7 +158,7 @@ class AddLinkForm(forms.Form):
'search_backend_ripgrep', 'search_backend_sonic', 'search_backend_sqlite'
}
binary = {'apt', 'brew', 'custom', 'env', 'npm', 'pip'}
extensions = {'captcha2', 'istilldontcareaboutcookies', 'ublock'}
extensions = {'twocaptcha', 'istilldontcareaboutcookies', 'ublock'}
# Populate plugin field choices
self.fields['chrome_plugins'].choices = [

View File

@@ -1,6 +1,6 @@
__package__ = 'archivebox.core'
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING
from typing import Optional, Dict, Iterable, Any, List, TYPE_CHECKING, Iterator, Set
from archivebox.uuid_compat import uuid7
from datetime import datetime, timedelta
from django_stubs_ext.db.models import TypedModelMeta
@@ -41,6 +41,8 @@ from archivebox.machine.models import NetworkInterface, Binary
class Tag(ModelWithSerializers):
JSONL_TYPE = 'Tag'
# Keep AutoField for compatibility with main branch migrations
# Don't use UUIDField here - requires complex FK transformation
id = models.AutoField(primary_key=True, serialize=False, verbose_name='ID')
@@ -91,26 +93,66 @@ class Tag(ModelWithSerializers):
def api_url(self) -> str:
return reverse_lazy('api-1:get_tag', args=[self.id])
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Tag model instance to a JSONL record.
Convert Tag model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
return {
'type': 'Tag',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'name': self.name,
'slug': self.slug,
}
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None):
def to_jsonl(self, seen: Optional[Set[tuple]] = None, **kwargs) -> Iterator[dict]:
    """
    Yield this Tag as a single JSON-serializable record.

    Tag is a leaf node in the export graph: it emits no child records,
    and **kwargs is accepted only for signature parity with the
    to_jsonl() methods on Snapshot/ArchiveResult.

    Note: unlike Snapshot/ArchiveResult, this method does NOT create a
    `seen` set when one is not passed — dedup only happens when the
    caller supplies one.

    Args:
        seen: Set of (type, id) tuples already emitted (for deduplication);
            mutated in place when provided.
        **kwargs: Ignored (Tag has no children to pass options to).

    Yields:
        dict: JSON-serializable record for this tag (see to_json()).
    """
    if seen is not None:
        # Skip emitting if an earlier caller in the walk already wrote us out.
        key = (self.JSONL_TYPE, str(self.id))
        if key in seen:
            return
        seen.add(key)
    yield self.to_json()
@classmethod
def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['Tag']:
    """
    Build/update Tag instances from an iterable of JSONL records.

    Records whose 'type' field is present but not 'Tag' are ignored;
    records with no 'type' field are treated as Tag records.

    Args:
        records: Iterable of dicts (parsed JSONL records).
        overrides: Optional dict with 'snapshot' to auto-attach tags.

    Returns:
        List of Tag instances (records that produced no instance are dropped).
    """
    # Keep only records addressed to this model (missing type defaults to Tag).
    matching = (
        rec for rec in records
        if rec.get('type', cls.JSONL_TYPE) == cls.JSONL_TYPE
    )
    candidates = (cls.from_json(rec, overrides=overrides) for rec in matching)
    return [tag for tag in candidates if tag]
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'Tag | None':
"""
Create/update a single Tag from a JSON record dict.
Args:
record: Dict with 'name' field
overrides: Optional dict with 'snapshot' to auto-attach tag
Returns:
@@ -289,6 +331,8 @@ class SnapshotManager(models.Manager.from_queryset(SnapshotQuerySet)):
class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
JSONL_TYPE = 'Snapshot'
id = models.UUIDField(primary_key=True, default=uuid7, editable=False, unique=True)
created_at = models.DateTimeField(default=timezone.now, db_index=True)
modified_at = models.DateTimeField(auto_now=True)
@@ -968,38 +1012,18 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
Each line is a JSON record with a 'type' field:
- Snapshot: snapshot metadata (crawl_id, url, tags, etc.)
- ArchiveResult: extractor results (plugin, status, output, etc.)
- Binary: binary info used for the extraction
- Process: process execution details (cmd, exit_code, timing, etc.)
- ArchiveResult: extractor results (plugin, status, output, etc.)
"""
import json
index_path = Path(self.output_dir) / CONSTANTS.JSONL_INDEX_FILENAME
index_path.parent.mkdir(parents=True, exist_ok=True)
# Track unique binaries and processes to avoid duplicates
binaries_seen = set()
processes_seen = set()
with open(index_path, 'w') as f:
# Write Snapshot record first (to_jsonl includes crawl_id, fs_version)
f.write(json.dumps(self.to_jsonl()) + '\n')
# Write ArchiveResult records with their associated Binary and Process
# Use select_related to optimize queries
for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'):
# Write Binary record if not already written
if ar.process and ar.process.binary and ar.process.binary_id not in binaries_seen:
binaries_seen.add(ar.process.binary_id)
f.write(json.dumps(ar.process.binary.to_jsonl()) + '\n')
# Write Process record if not already written
if ar.process and ar.process_id not in processes_seen:
processes_seen.add(ar.process_id)
f.write(json.dumps(ar.process.to_jsonl()) + '\n')
# Write ArchiveResult record
f.write(json.dumps(ar.to_jsonl()) + '\n')
for record in self.to_jsonl():
f.write(json.dumps(record) + '\n')
def read_index_jsonl(self) -> dict:
"""
@@ -1420,14 +1444,14 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
return False
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert Snapshot model instance to a JSONL record.
Convert Snapshot model instance to a JSON-serializable dict.
Includes all fields needed to fully reconstruct/identify this snapshot.
"""
from archivebox.config import VERSION
return {
'type': 'Snapshot',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'crawl_id': str(self.crawl_id),
@@ -1442,12 +1466,68 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'fs_version': self.fs_version,
}
@staticmethod
def from_jsonl(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True):
def to_jsonl(self, seen: Optional[Set[tuple]] = None, archiveresult: bool = True, process: bool = True, binary: bool = True, machine: bool = False, iface: bool = False, **kwargs) -> Iterator[dict]:
    """
    Yield this Snapshot and optionally related objects as JSON records.

    Uses select_related for efficient querying of each ArchiveResult's
    Process/Binary. Deduplicates automatically via the shared `seen` set,
    which is threaded down through children.

    Args:
        seen: Set of (type, id) tuples already emitted (for deduplication);
            created here if not provided, and mutated in place.
        archiveresult: Include related ArchiveResults (default: True).
        process: Include Process for each ArchiveResult (default: True).
        binary: Include Binary for each Process (default: True).
        machine: Include Machine for each Process (default: False).
        iface: Include NetworkInterface for each Process (default: False).
        **kwargs: Additional options passed down to children.

    Yields:
        dict: JSON-serializable records (this Snapshot first, then children).
    """
    if seen is None:
        seen = set()
    key = (self.JSONL_TYPE, str(self.id))
    if key in seen:
        # Already emitted by an earlier caller in the walk — emit nothing.
        return
    seen.add(key)
    yield self.to_json()
    if archiveresult:
        # Use select_related to optimize queries
        for ar in self.archiveresult_set.select_related('process__binary').order_by('start_ts'):
            yield from ar.to_jsonl(seen=seen, process=process, binary=binary, machine=machine, iface=iface, **kwargs)
@classmethod
def from_jsonl(cls, records, overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> list['Snapshot']:
    """
    Build/update Snapshot instances from an iterable of JSONL records.

    Records whose 'type' field is present but not 'Snapshot' are skipped;
    records with no 'type' field are treated as Snapshot records.

    Args:
        records: Iterable of dicts (parsed JSONL records).
        overrides: Dict with 'crawl', 'snapshot' (parent), 'created_by_id'.
        queue_for_extraction: If True, sets status=QUEUED and retry_at (default: True).

    Returns:
        List of Snapshot instances (records that produced no instance are dropped).
    """
    snapshots: list['Snapshot'] = []
    for rec in records:
        # Missing 'type' defaults to Snapshot for backwards compatibility.
        if rec.get('type', cls.JSONL_TYPE) != cls.JSONL_TYPE:
            continue
        snap = cls.from_json(rec, overrides=overrides, queue_for_extraction=queue_for_extraction)
        if snap:
            snapshots.append(snap)
    return snapshots
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None, queue_for_extraction: bool = True) -> 'Snapshot | None':
"""
Create/update a single Snapshot from a JSON record dict.
Handles:
- ID-based patching: {"id": "...", "title": "new title"}
- URL-based create/update: {"url": "...", "title": "...", "tags": "..."}
- Auto-creates Crawl if not provided
@@ -2054,8 +2134,8 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
result['canonical'] = self.canonical_outputs()
return result
def to_json(self, indent: int = 4) -> str:
"""Convert to JSON string"""
def to_json_str(self, indent: int = 4) -> str:
    """Convert to JSON string for file output.

    Serializes self.to_dict(extended=True) via the module-level to_json()
    helper. Renamed from to_json() to avoid clashing with the dict-returning
    to_json() used by the JSONL export path.

    Args:
        indent: Indentation width passed through to the serializer (default: 4).

    Returns:
        str: JSON text of the extended dict representation.
    """
    return to_json(self.to_dict(extended=True), indent=indent)
def to_csv(self, cols: Optional[List[str]] = None, separator: str = ',', ljust: int = 0) -> str:
@@ -2203,6 +2283,8 @@ class SnapshotMachine(BaseStateMachine, strict_states=True):
class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHealthStats, ModelWithStateMachine):
JSONL_TYPE = 'ArchiveResult'
class StatusChoices(models.TextChoices):
QUEUED = 'queued', 'Queued'
STARTED = 'started', 'Started'
@@ -2274,13 +2356,13 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
"""Convenience property to access the user who created this archive result via its snapshot's crawl."""
return self.snapshot.crawl.created_by
def to_jsonl(self) -> dict:
def to_json(self) -> dict:
"""
Convert ArchiveResult model instance to a JSONL record.
Convert ArchiveResult model instance to a JSON-serializable dict.
"""
from archivebox.config import VERSION
record = {
'type': 'ArchiveResult',
'type': self.JSONL_TYPE,
'schema_version': VERSION,
'id': str(self.id),
'snapshot_id': str(self.snapshot_id),
@@ -2308,6 +2390,31 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
record['process_id'] = str(self.process_id)
return record
def to_jsonl(self, seen: Set[tuple] = None, process: bool = True, **kwargs) -> Iterator[dict]:
    """
    Yield this ArchiveResult and optionally related objects as JSON records.

    Args:
        seen: Set of (type, id) tuples already emitted (for deduplication);
            created here if not provided, and mutated in place.
        process: Include related Process and its children (default: True).
        **kwargs: Passed to Process.to_jsonl() (e.g., binary=True, machine=False).

    Yields:
        dict: JSON-serializable records (this ArchiveResult first, then its Process).
    """
    emitted = set() if seen is None else seen
    dedupe_key = (self.JSONL_TYPE, str(self.id))
    if dedupe_key in emitted:
        # Already written out earlier in this export walk.
        return
    emitted.add(dedupe_key)
    yield self.to_json()
    if process and self.process:
        yield from self.process.to_jsonl(seen=emitted, **kwargs)
def save(self, *args, **kwargs):
is_new = self._state.adding