Implement JSONL CLI pipeline architecture (Phases 1-4, 6)

Phase 1: Model Prerequisites
- Add ArchiveResult.from_json() and from_jsonl() methods
- Fix Snapshot.to_json() to use tags_str (consistent with Crawl)

Phase 2: Shared Utilities
- Create archivebox/cli/cli_utils.py with shared apply_filters()
- Update 7 CLI files to import from cli_utils.py instead of duplicating

Phase 3: Pass-Through Behavior
- Add pass-through to crawl create (non-Crawl records pass unchanged)
- Add pass-through to snapshot create (Crawl records + others pass through)
- Add pass-through to archiveresult create (Snapshot records + others)
- Add create-or-update behavior to run command:
  - Records WITHOUT id: Create via Model.from_json()
  - Records WITH id: Look up existing, re-queue
  - Outputs JSONL of all processed records for chaining

Phase 4: Test Infrastructure
- Create archivebox/tests/conftest.py with pytest-django fixtures
- Include CLI helpers, output assertions, database assertions

Phase 6: Config Update
- Update supervisord_util.py: orchestrator -> run command

This enables Unix-style piping:
  archivebox crawl create URL | archivebox run
  archivebox archiveresult list --status=failed | archivebox run
  curl API | jq transform | archivebox crawl create | archivebox run
This commit is contained in:
Claude
2025-12-31 10:07:14 +00:00
parent 1c85b4daa3
commit f3e11b61fd
13 changed files with 529 additions and 145 deletions

View File

@@ -39,21 +39,7 @@ from typing import Optional
import rich_click as click
from rich import print as rprint
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
    """Apply Django-style filters from CLI kwargs to a QuerySet.

    Args:
        queryset: Object exposing ``.filter(**kwargs)`` and slice support
            (presumably a Django QuerySet -- only those two operations
            are used here).
        filter_kwargs: Mapping of filter lookups to values. Entries whose
            value is ``None`` are skipped, as are the ``'limit'`` and
            ``'offset'`` keys, which are not filter lookups.
        limit: Maximum number of rows to keep. A falsy value (``None`` or
            ``0``) leaves the queryset unsliced.

    Returns:
        The queryset, filtered and/or sliced as requested.

    Note:
        ``'offset'`` is only excluded from the filters; no offset is
        applied by this function.
    """
    # Keep only real filter lookups: drop unset options and pagination keys.
    filters = {
        key: value
        for key, value in filter_kwargs.items()
        if value is not None and key not in ('limit', 'offset')
    }

    # Skip the .filter() call entirely when there is nothing to filter on.
    if filters:
        queryset = queryset.filter(**filters)

    if limit:
        queryset = queryset[:limit]

    return queryset
from archivebox.cli.cli_utils import apply_filters
# =============================================================================
@@ -69,6 +55,7 @@ def create_archiveresults(
Create ArchiveResults for Snapshots.
Reads Snapshot records from stdin and creates ArchiveResult entries.
Pass-through: Non-Snapshot/ArchiveResult records are output unchanged.
If --plugin is specified, only creates results for that plugin.
Otherwise, creates results for all pending plugins.
@@ -78,7 +65,7 @@ def create_archiveresults(
"""
from django.utils import timezone
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
from archivebox.core.models import Snapshot, ArchiveResult
is_tty = sys.stdout.isatty()
@@ -87,6 +74,7 @@ def create_archiveresults(
if snapshot_id:
try:
snapshots = [Snapshot.objects.get(id=snapshot_id)]
pass_through_records = []
except Snapshot.DoesNotExist:
rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
return 1
@@ -97,17 +85,44 @@ def create_archiveresults(
rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
return 1
# Filter to only Snapshot records
# Separate snapshot records from pass-through records
snapshot_ids = []
pass_through_records = []
for record in records:
if record.get('type') == TYPE_SNAPSHOT:
record_type = record.get('type', '')
if record_type == TYPE_SNAPSHOT:
# Pass through the Snapshot record itself
pass_through_records.append(record)
if record.get('id'):
snapshot_ids.append(record['id'])
elif record_type == TYPE_ARCHIVERESULT:
# ArchiveResult records: pass through if they have an id
if record.get('id'):
pass_through_records.append(record)
# If no id, we could create it, but for now just pass through
else:
pass_through_records.append(record)
elif record_type:
# Other typed records (Crawl, Tag, etc): pass through
pass_through_records.append(record)
elif record.get('id'):
# Assume it's a snapshot ID if no type specified
# Untyped record with id - assume it's a snapshot ID
snapshot_ids.append(record['id'])
# Output pass-through records first
if not is_tty:
for record in pass_through_records:
write_record(record)
if not snapshot_ids:
if pass_through_records:
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]', file=sys.stderr)
return 0
rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
return 1
@@ -115,7 +130,7 @@ def create_archiveresults(
if not snapshots:
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
return 1
return 0 if pass_through_records else 1
created_count = 0
for snapshot in snapshots: