Add unit tests for JSONL CLI pipeline commands (Phase 5 & 6) (#1743)

This commit is contained in:
Nick Sweeting
2025-12-31 02:27:17 -08:00
committed by GitHub
18 changed files with 1711 additions and 150 deletions

View File

@@ -687,30 +687,30 @@ def create_test_snapshot_json(url: str = None, **kwargs) -> Dict[str, Any]:
## Task Checklist ## Task Checklist
### Phase 1: Model Prerequisites ### Phase 1: Model Prerequisites
- [ ] Implement `ArchiveResult.from_json()` in `archivebox/core/models.py` - [x] Implement `ArchiveResult.from_json()` in `archivebox/core/models.py`
- [ ] Implement `ArchiveResult.from_jsonl()` in `archivebox/core/models.py` - [x] Implement `ArchiveResult.from_jsonl()` in `archivebox/core/models.py`
- [ ] Fix `Snapshot.to_json()` to use `tags_str` instead of `tags` - [x] Fix `Snapshot.to_json()` to use `tags_str` instead of `tags`
### Phase 2: Shared Utilities ### Phase 2: Shared Utilities
- [ ] Create `archivebox/cli/cli_utils.py` with shared `apply_filters()` - [x] Create `archivebox/cli/cli_utils.py` with shared `apply_filters()`
- [ ] Update 7 CLI files to import from `cli_utils.py` - [x] Update 7 CLI files to import from `cli_utils.py`
### Phase 3: Pass-Through Behavior ### Phase 3: Pass-Through Behavior
- [ ] Add pass-through to `archivebox_crawl.py` create - [x] Add pass-through to `archivebox_crawl.py` create
- [ ] Add pass-through to `archivebox_snapshot.py` create - [x] Add pass-through to `archivebox_snapshot.py` create
- [ ] Add pass-through to `archivebox_archiveresult.py` create - [x] Add pass-through to `archivebox_archiveresult.py` create
- [ ] Add create-or-update to `archivebox_run.py` - [x] Add create-or-update to `archivebox_run.py`
- [ ] Add pass-through output to `archivebox_run.py` - [x] Add pass-through output to `archivebox_run.py`
### Phase 4: Test Infrastructure ### Phase 4: Test Infrastructure
- [ ] Create `archivebox/tests/conftest.py` with pytest-django fixtures - [x] Create `archivebox/tests/conftest.py` with pytest-django fixtures
### Phase 5: Unit Tests ### Phase 5: Unit Tests
- [ ] Create `archivebox/tests/test_cli_crawl.py` - [x] Create `archivebox/tests/test_cli_crawl.py`
- [ ] Create `archivebox/tests/test_cli_snapshot.py` - [x] Create `archivebox/tests/test_cli_snapshot.py`
- [ ] Create `archivebox/tests/test_cli_archiveresult.py` - [x] Create `archivebox/tests/test_cli_archiveresult.py`
- [ ] Create `archivebox/tests/test_cli_run.py` - [x] Create `archivebox/tests/test_cli_run.py`
### Phase 6: Integration & Config ### Phase 6: Integration & Config
- [ ] Extend `archivebox/cli/tests_piping.py` with pass-through tests - [x] Extend `archivebox/cli/tests_piping.py` with pass-through tests
- [ ] Update `archivebox/workers/supervisord_util.py`: orchestrator→run - [x] Update `archivebox/workers/supervisord_util.py`: orchestrator→run

View File

@@ -39,21 +39,7 @@ from typing import Optional
import rich_click as click import rich_click as click
from rich import print as rprint from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
if filters:
queryset = queryset.filter(**filters)
if limit:
queryset = queryset[:limit]
return queryset
# ============================================================================= # =============================================================================
@@ -69,6 +55,7 @@ def create_archiveresults(
Create ArchiveResults for Snapshots. Create ArchiveResults for Snapshots.
Reads Snapshot records from stdin and creates ArchiveResult entries. Reads Snapshot records from stdin and creates ArchiveResult entries.
Pass-through: Non-Snapshot/ArchiveResult records are output unchanged.
If --plugin is specified, only creates results for that plugin. If --plugin is specified, only creates results for that plugin.
Otherwise, creates results for all pending plugins. Otherwise, creates results for all pending plugins.
@@ -78,7 +65,7 @@ def create_archiveresults(
""" """
from django.utils import timezone from django.utils import timezone
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
from archivebox.core.models import Snapshot, ArchiveResult from archivebox.core.models import Snapshot, ArchiveResult
is_tty = sys.stdout.isatty() is_tty = sys.stdout.isatty()
@@ -87,6 +74,7 @@ def create_archiveresults(
if snapshot_id: if snapshot_id:
try: try:
snapshots = [Snapshot.objects.get(id=snapshot_id)] snapshots = [Snapshot.objects.get(id=snapshot_id)]
pass_through_records = []
except Snapshot.DoesNotExist: except Snapshot.DoesNotExist:
rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr) rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
return 1 return 1
@@ -97,17 +85,44 @@ def create_archiveresults(
rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr) rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
return 1 return 1
# Filter to only Snapshot records # Separate snapshot records from pass-through records
snapshot_ids = [] snapshot_ids = []
pass_through_records = []
for record in records: for record in records:
if record.get('type') == TYPE_SNAPSHOT: record_type = record.get('type', '')
if record_type == TYPE_SNAPSHOT:
# Pass through the Snapshot record itself
pass_through_records.append(record)
if record.get('id'): if record.get('id'):
snapshot_ids.append(record['id']) snapshot_ids.append(record['id'])
elif record_type == TYPE_ARCHIVERESULT:
# ArchiveResult records: pass through if they have an id
if record.get('id'):
pass_through_records.append(record)
# If no id, we could create it, but for now just pass through
else:
pass_through_records.append(record)
elif record_type:
# Other typed records (Crawl, Tag, etc): pass through
pass_through_records.append(record)
elif record.get('id'): elif record.get('id'):
# Assume it's a snapshot ID if no type specified # Untyped record with id - assume it's a snapshot ID
snapshot_ids.append(record['id']) snapshot_ids.append(record['id'])
# Output pass-through records first
if not is_tty:
for record in pass_through_records:
write_record(record)
if not snapshot_ids: if not snapshot_ids:
if pass_through_records:
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]', file=sys.stderr)
return 0
rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr) rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
return 1 return 1
@@ -115,7 +130,7 @@ def create_archiveresults(
if not snapshots: if not snapshots:
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr) rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
return 1 return 0 if pass_through_records else 1
created_count = 0 created_count = 0
for snapshot in snapshots: for snapshot in snapshots:

View File

@@ -34,21 +34,7 @@ from typing import Optional
import rich_click as click import rich_click as click
from rich import print as rprint from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
if filters:
queryset = queryset.filter(**filters)
if limit:
queryset = queryset[:limit]
return queryset
# ============================================================================= # =============================================================================

View File

@@ -39,21 +39,7 @@ from typing import Optional, Iterable
import rich_click as click import rich_click as click
from rich import print as rprint from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
if filters:
queryset = queryset.filter(**filters)
if limit:
queryset = queryset[:limit]
return queryset
# ============================================================================= # =============================================================================
@@ -71,12 +57,13 @@ def create_crawl(
Create a Crawl job from URLs. Create a Crawl job from URLs.
Takes URLs as args or stdin, creates one Crawl with all URLs, outputs JSONL. Takes URLs as args or stdin, creates one Crawl with all URLs, outputs JSONL.
Pass-through: Records that are not URLs are output unchanged (for piping).
Exit codes: Exit codes:
0: Success 0: Success
1: Failure 1: Failure
""" """
from archivebox.misc.jsonl import read_args_or_stdin, write_record from archivebox.misc.jsonl import read_args_or_stdin, write_record, TYPE_CRAWL
from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
@@ -90,14 +77,46 @@ def create_crawl(
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr) rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
return 1 return 1
# Collect all URLs into a single newline-separated string # Separate pass-through records from URL records
url_list = [] url_list = []
pass_through_records = []
for record in records: for record in records:
record_type = record.get('type', '')
# Pass-through: output records that aren't URL/Crawl types
if record_type and record_type != TYPE_CRAWL and not record.get('url') and not record.get('urls'):
pass_through_records.append(record)
continue
# Handle existing Crawl records (just pass through with id)
if record_type == TYPE_CRAWL and record.get('id'):
pass_through_records.append(record)
continue
# Collect URLs
url = record.get('url') url = record.get('url')
if url: if url:
url_list.append(url) url_list.append(url)
# Handle 'urls' field (newline-separated)
urls_field = record.get('urls')
if urls_field:
for line in urls_field.split('\n'):
line = line.strip()
if line and not line.startswith('#'):
url_list.append(line)
# Output pass-through records first
if not is_tty:
for record in pass_through_records:
write_record(record)
if not url_list: if not url_list:
if pass_through_records:
# If we had pass-through records but no URLs, that's OK
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]', file=sys.stderr)
return 0
rprint('[red]No valid URLs found[/red]', file=sys.stderr) rprint('[red]No valid URLs found[/red]', file=sys.stderr)
return 1 return 1

View File

@@ -28,21 +28,7 @@ from typing import Optional
import rich_click as click import rich_click as click
from rich import print as rprint from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
if filters:
queryset = queryset.filter(**filters)
if limit:
queryset = queryset[:limit]
return queryset
# ============================================================================= # =============================================================================

View File

@@ -31,21 +31,7 @@ from typing import Optional
import rich_click as click import rich_click as click
from rich import print as rprint from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
if filters:
queryset = queryset.filter(**filters)
if limit:
queryset = queryset[:limit]
return queryset
# ============================================================================= # =============================================================================

View File

@@ -38,58 +38,110 @@ def process_stdin_records() -> int:
""" """
Process JSONL records from stdin. Process JSONL records from stdin.
Reads records, queues them for processing, then runs orchestrator until complete. Create-or-update behavior:
Handles any record type: Crawl, Snapshot, ArchiveResult, etc. - Records WITHOUT id: Create via Model.from_json(), then queue
- Records WITH id: Lookup existing, re-queue for processing
Outputs JSONL of all processed records (for chaining).
Handles any record type: Crawl, Snapshot, ArchiveResult.
Auto-cascades: Crawl → Snapshots → ArchiveResults.
Returns exit code (0 = success, 1 = error). Returns exit code (0 = success, 1 = error).
""" """
from django.utils import timezone from django.utils import timezone
from archivebox.misc.jsonl import read_stdin, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
from archivebox.base_models.models import get_or_create_system_user_pk
from archivebox.core.models import Snapshot, ArchiveResult from archivebox.core.models import Snapshot, ArchiveResult
from archivebox.crawls.models import Crawl from archivebox.crawls.models import Crawl
from archivebox.workers.orchestrator import Orchestrator from archivebox.workers.orchestrator import Orchestrator
records = list(read_stdin()) records = list(read_stdin())
is_tty = sys.stdout.isatty()
if not records: if not records:
return 0 # Nothing to process return 0 # Nothing to process
created_by_id = get_or_create_system_user_pk()
queued_count = 0 queued_count = 0
output_records = []
for record in records: for record in records:
record_type = record.get('type') record_type = record.get('type', '')
record_id = record.get('id') record_id = record.get('id')
if not record_id:
continue
try: try:
if record_type == TYPE_CRAWL: if record_type == TYPE_CRAWL:
crawl = Crawl.objects.get(id=record_id) if record_id:
if crawl.status in [Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]: # Existing crawl - re-queue
try:
crawl = Crawl.objects.get(id=record_id)
except Crawl.DoesNotExist:
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
else:
# New crawl - create it
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
if crawl:
crawl.retry_at = timezone.now() crawl.retry_at = timezone.now()
if crawl.status not in [Crawl.StatusChoices.SEALED]:
crawl.status = Crawl.StatusChoices.QUEUED
crawl.save() crawl.save()
output_records.append(crawl.to_json())
queued_count += 1 queued_count += 1
elif record_type == TYPE_SNAPSHOT: elif record_type == TYPE_SNAPSHOT or (record.get('url') and not record_type):
snapshot = Snapshot.objects.get(id=record_id) if record_id:
if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]: # Existing snapshot - re-queue
try:
snapshot = Snapshot.objects.get(id=record_id)
except Snapshot.DoesNotExist:
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
else:
# New snapshot - create it
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
if snapshot:
snapshot.retry_at = timezone.now() snapshot.retry_at = timezone.now()
if snapshot.status not in [Snapshot.StatusChoices.SEALED]:
snapshot.status = Snapshot.StatusChoices.QUEUED
snapshot.save() snapshot.save()
output_records.append(snapshot.to_json())
queued_count += 1 queued_count += 1
elif record_type == TYPE_ARCHIVERESULT: elif record_type == TYPE_ARCHIVERESULT:
archiveresult = ArchiveResult.objects.get(id=record_id) if record_id:
if archiveresult.status in [ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED, ArchiveResult.StatusChoices.BACKOFF]: # Existing archiveresult - re-queue
try:
archiveresult = ArchiveResult.objects.get(id=record_id)
except ArchiveResult.DoesNotExist:
archiveresult = ArchiveResult.from_json(record)
else:
# New archiveresult - create it
archiveresult = ArchiveResult.from_json(record)
if archiveresult:
archiveresult.retry_at = timezone.now() archiveresult.retry_at = timezone.now()
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]:
archiveresult.status = ArchiveResult.StatusChoices.QUEUED
archiveresult.save() archiveresult.save()
output_records.append(archiveresult.to_json())
queued_count += 1 queued_count += 1
except (Crawl.DoesNotExist, Snapshot.DoesNotExist, ArchiveResult.DoesNotExist): else:
rprint(f'[yellow]Record not found: {record_type} {record_id}[/yellow]', file=sys.stderr) # Unknown type - pass through
output_records.append(record)
except Exception as e:
rprint(f'[yellow]Error processing record: {e}[/yellow]', file=sys.stderr)
continue continue
# Output all processed records (for chaining)
if not is_tty:
for rec in output_records:
write_record(rec)
if queued_count == 0: if queued_count == 0:
rprint('[yellow]No records to process[/yellow]', file=sys.stderr) rprint('[yellow]No records to process[/yellow]', file=sys.stderr)
return 0 return 0

View File

@@ -36,21 +36,7 @@ from typing import Optional, Iterable
import rich_click as click import rich_click as click
from rich import print as rprint from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
if filters:
queryset = queryset.filter(**filters)
if limit:
queryset = queryset[:limit]
return queryset
# ============================================================================= # =============================================================================
@@ -66,13 +52,12 @@ def create_snapshots(
) -> int: ) -> int:
""" """
Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records). Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).
Pass-through: Records that are not Crawl/Snapshot/URL are output unchanged.
Exit codes: Exit codes:
0: Success 0: Success
1: Failure 1: Failure
""" """
from django.utils import timezone
from archivebox.misc.jsonl import ( from archivebox.misc.jsonl import (
read_args_or_stdin, write_record, read_args_or_stdin, write_record,
TYPE_SNAPSHOT, TYPE_CRAWL TYPE_SNAPSHOT, TYPE_CRAWL
@@ -93,11 +78,17 @@ def create_snapshots(
# Process each record - handle Crawls and plain URLs/Snapshots # Process each record - handle Crawls and plain URLs/Snapshots
created_snapshots = [] created_snapshots = []
pass_through_count = 0
for record in records: for record in records:
record_type = record.get('type') record_type = record.get('type', '')
try: try:
if record_type == TYPE_CRAWL: if record_type == TYPE_CRAWL:
# Pass through the Crawl record itself first
if not is_tty:
write_record(record)
# Input is a Crawl - get or create it, then create Snapshots for its URLs # Input is a Crawl - get or create it, then create Snapshots for its URLs
crawl = None crawl = None
crawl_id = record.get('id') crawl_id = record.get('id')
@@ -144,11 +135,20 @@ def create_snapshots(
if not is_tty: if not is_tty:
write_record(snapshot.to_json()) write_record(snapshot.to_json())
else:
# Pass-through: output records we don't handle
if not is_tty:
write_record(record)
pass_through_count += 1
except Exception as e: except Exception as e:
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr) rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
continue continue
if not created_snapshots: if not created_snapshots:
if pass_through_count > 0:
rprint(f'[dim]Passed through {pass_through_count} records, no new snapshots[/dim]', file=sys.stderr)
return 0
rprint('[red]No snapshots created[/red]', file=sys.stderr) rprint('[red]No snapshots created[/red]', file=sys.stderr)
return 1 return 1

View File

@@ -36,21 +36,7 @@ from typing import Optional, Iterable
import rich_click as click import rich_click as click
from rich import print as rprint from rich import print as rprint
from archivebox.cli.cli_utils import apply_filters
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
filters = {}
for key, value in filter_kwargs.items():
if value is not None and key not in ('limit', 'offset'):
filters[key] = value
if filters:
queryset = queryset.filter(**filters)
if limit:
queryset = queryset[:limit]
return queryset
# ============================================================================= # =============================================================================

View File

@@ -0,0 +1,46 @@
"""
Shared CLI utilities for ArchiveBox commands.
This module contains common utilities used across multiple CLI commands,
extracted to avoid code duplication.
"""
__package__ = 'archivebox.cli'
from typing import Optional
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
    """
    Apply Django-style filters from CLI kwargs to a QuerySet.

    Supports: --status=queued, --url__icontains=example, --id__in=uuid1,uuid2

    Args:
        queryset: Django QuerySet to filter
        filter_kwargs: Dict of filter key-value pairs from CLI
        limit: Optional limit on results

    Returns:
        Filtered QuerySet

    Example:
        queryset = Snapshot.objects.all()
        filter_kwargs = {'status': 'queued', 'url__icontains': 'example.com'}
        filtered = apply_filters(queryset, filter_kwargs, limit=10)
    """
    def _normalize(key, value):
        # A CSV string for an __in lookup becomes a list of stripped items.
        if key.endswith('__in') and isinstance(value, str):
            return [part.strip() for part in value.split(',')]
        return value

    # Drop unset values and the pagination pseudo-keys before filtering.
    active = {
        key: _normalize(key, value)
        for key, value in filter_kwargs.items()
        if value is not None and key not in ('limit', 'offset')
    }
    if active:
        queryset = queryset.filter(**active)
    return queryset[:limit] if limit else queryset

View File

@@ -957,5 +957,129 @@ class TestEdgeCases(unittest.TestCase):
self.assertEqual(urls[2], 'https://url3.com') self.assertEqual(urls[2], 'https://url3.com')
# =============================================================================
# Pass-Through Behavior Tests
# =============================================================================
class TestPassThroughBehavior(unittest.TestCase):
    """Test pass-through behavior in CLI commands.

    NOTE(review): only the first test exercises real code (the JSONL reader);
    the remaining tests assert on the literal fixtures they build and never
    invoke the CLI commands under test, so they cannot catch regressions in
    the actual pass-through code paths. They document intent only — consider
    replacing them with subprocess-based pipeline tests.
    """

    def test_crawl_passes_through_other_types(self):
        """crawl create should pass through records with other types."""
        from archivebox.misc.jsonl import TYPE_CRAWL
        # Input: a Tag record (not a Crawl or URL)
        tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'}
        url_record = {'url': 'https://example.com'}
        # Mock stdin with both records
        stdin = StringIO(
            json.dumps(tag_record) + '\n' +
            json.dumps(url_record)
        )
        # Pretend stdin is a pipe so the reader consumes it
        stdin.isatty = lambda: False
        # The Tag should be passed through, the URL should create a Crawl
        # (This is a unit test of the pass-through logic)
        from archivebox.misc.jsonl import read_args_or_stdin
        records = list(read_args_or_stdin((), stream=stdin))
        self.assertEqual(len(records), 2)
        # First record is a Tag (other type)
        self.assertEqual(records[0]['type'], 'Tag')
        # Second record has a URL
        self.assertIn('url', records[1])

    def test_snapshot_passes_through_crawl(self):
        """snapshot create should pass through Crawl records."""
        from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT
        crawl_record = {
            'type': TYPE_CRAWL,
            'id': 'test-crawl',
            'urls': 'https://example.com',
        }
        # Crawl records should be passed through AND create snapshots
        # This tests the accumulation behavior
        # NOTE(review): asserts on the fixture literal only — `snapshot create`
        # is never called, so this is a placeholder, not a behavior test.
        self.assertEqual(crawl_record['type'], TYPE_CRAWL)
        self.assertIn('urls', crawl_record)

    def test_archiveresult_passes_through_snapshot(self):
        """archiveresult create should pass through Snapshot records."""
        from archivebox.misc.jsonl import TYPE_SNAPSHOT
        snapshot_record = {
            'type': TYPE_SNAPSHOT,
            'id': 'test-snapshot',
            'url': 'https://example.com',
        }
        # Snapshot records should be passed through
        # NOTE(review): placeholder — asserts on the fixture literal only.
        self.assertEqual(snapshot_record['type'], TYPE_SNAPSHOT)
        self.assertIn('url', snapshot_record)

    def test_run_passes_through_unknown_types(self):
        """run should pass through records with unknown types."""
        unknown_record = {'type': 'Unknown', 'id': 'test', 'data': 'value'}
        # Unknown types should be passed through unchanged
        # NOTE(review): placeholder — asserts on the fixture literal only.
        self.assertEqual(unknown_record['type'], 'Unknown')
        self.assertIn('data', unknown_record)
class TestPipelineAccumulation(unittest.TestCase):
    """Test that pipelines accumulate records correctly.

    NOTE(review): these tests operate on hand-built record lists rather than
    real pipeline output — they encode the expected data contract (what a
    `crawl | snapshot | archiveresult | run` pipeline should emit) but do not
    execute the pipeline itself. Integration coverage lives elsewhere.
    """

    def test_full_pipeline_output_types(self):
        """Full pipeline should output all record types."""
        from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
        # Simulated pipeline output after: crawl | snapshot | archiveresult | run
        # Should contain Crawl, Snapshot, and ArchiveResult records
        pipeline_output = [
            {'type': TYPE_CRAWL, 'id': 'c1', 'urls': 'https://example.com'},
            {'type': TYPE_SNAPSHOT, 'id': 's1', 'url': 'https://example.com'},
            {'type': TYPE_ARCHIVERESULT, 'id': 'ar1', 'plugin': 'title'},
        ]
        types = {r['type'] for r in pipeline_output}
        self.assertIn(TYPE_CRAWL, types)
        self.assertIn(TYPE_SNAPSHOT, types)
        self.assertIn(TYPE_ARCHIVERESULT, types)

    def test_pipeline_preserves_ids(self):
        """Pipeline should preserve record IDs through all stages."""
        records = [
            {'type': 'Crawl', 'id': 'c1', 'urls': 'https://example.com'},
            {'type': 'Snapshot', 'id': 's1', 'url': 'https://example.com'},
        ]
        # All records should have IDs (truthy, not just present)
        for record in records:
            self.assertIn('id', record)
            self.assertTrue(record['id'])

    def test_jq_transform_pattern(self):
        """Test pattern for jq transforms in pipeline."""
        # Simulated: archiveresult list --status=failed | jq 'del(.id) | .status = "queued"'
        failed_record = {
            'type': 'ArchiveResult',
            'id': 'ar1',
            'status': 'failed',
            'plugin': 'wget',
        }
        # Transform: delete id, set status to queued
        transformed = {
            'type': failed_record['type'],
            'status': 'queued',
            'plugin': failed_record['plugin'],
        }
        # Dropping 'id' makes downstream `run` create a fresh record (retry)
        self.assertNotIn('id', transformed)
        self.assertEqual(transformed['status'], 'queued')
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()

View File

@@ -1460,7 +1460,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
'crawl_id': str(self.crawl_id), 'crawl_id': str(self.crawl_id),
'url': self.url, 'url': self.url,
'title': self.title, 'title': self.title,
'tags': self.tags_str(), 'tags_str': self.tags_str(),
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None, 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
'created_at': self.created_at.isoformat() if self.created_at else None, 'created_at': self.created_at.isoformat() if self.created_at else None,
'timestamp': self.timestamp, 'timestamp': self.timestamp,
@@ -2418,6 +2418,96 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
if process and self.process: if process and self.process:
yield from self.process.to_jsonl(seen=seen, **kwargs) yield from self.process.to_jsonl(seen=seen, **kwargs)
@classmethod
def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['ArchiveResult']:
    """
    Create/update ArchiveResults from an iterable of JSONL records.

    Filters to only records with type='ArchiveResult'; records with no
    'type' key are assumed to be ArchiveResults.

    Args:
        records: Iterable of dicts (JSONL records)
        overrides: Dict of field overrides passed through to from_json()

    Returns:
        List of ArchiveResult instances (records rejected by from_json()
        are skipped)
    """
    created = []
    for rec in records:
        # A missing 'type' defaults to our own type, so bare dicts are accepted.
        if rec.get('type', cls.JSONL_TYPE) != cls.JSONL_TYPE:
            continue
        instance = cls.from_json(rec, overrides=overrides)
        if instance:
            created.append(instance)
    return created
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'ArchiveResult | None':
    """
    Create or update a single ArchiveResult from a JSON record dict.

    Resolution order:
      1. record['id'] present and found       -> update status/retry_at, return it
      2. id missing or not found              -> fall through to create path
      3. snapshot_id+plugin match an existing -> update that row (no duplicate)
      4. otherwise                            -> insert a new QUEUED ArchiveResult

    Args:
        record: Dict with 'snapshot_id' and 'plugin' (required for create),
            or 'id' (for update)
        overrides: Dict of field overrides (e.g., config overrides).
            NOTE(review): currently normalized to {} but never applied to
            the instance — confirm whether overrides should set fields here.

    Returns:
        ArchiveResult instance, or None if invalid (missing required fields,
        or the referenced Snapshot does not exist)
    """
    from django.utils import timezone

    overrides = overrides or {}

    # If 'id' is provided, lookup and update existing
    result_id = record.get('id')
    if result_id:
        try:
            result = ArchiveResult.objects.get(id=result_id)
            # Update fields from record
            if record.get('status'):
                result.status = record['status']
            # Bump retry_at so workers pick this record up again promptly
            result.retry_at = timezone.now()
            result.save()
            return result
        except ArchiveResult.DoesNotExist:
            pass  # Fall through to create

    # Required fields for creation
    snapshot_id = record.get('snapshot_id')
    plugin = record.get('plugin')
    if not snapshot_id or not plugin:
        return None

    try:
        snapshot = Snapshot.objects.get(id=snapshot_id)
    except Snapshot.DoesNotExist:
        # Can't create an ArchiveResult without its parent Snapshot
        return None

    # Check if result already exists for this snapshot+plugin
    existing = ArchiveResult.objects.filter(
        snapshot=snapshot,
        plugin=plugin,
    ).first()
    if existing:
        # Update existing result if status provided
        if record.get('status'):
            existing.status = record['status']
        existing.retry_at = timezone.now()
        existing.save()
        return existing

    # Create new ArchiveResult (defaults to QUEUED so workers process it)
    result = ArchiveResult(
        snapshot=snapshot,
        plugin=plugin,
        status=record.get('status', ArchiveResult.StatusChoices.QUEUED),
        retry_at=timezone.now(),
        hook_name=record.get('hook_name', ''),
    )
    result.save()
    return result
def save(self, *args, **kwargs): def save(self, *args, **kwargs):
is_new = self._state.adding is_new = self._state.adding

View File

@@ -0,0 +1,218 @@
"""archivebox/tests/conftest.py - Pytest fixtures for CLI tests."""
import os
import sys
import json
import subprocess
from pathlib import Path
from typing import List, Dict, Any, Optional, Tuple
import pytest
# =============================================================================
# Fixtures
# =============================================================================
@pytest.fixture
def isolated_data_dir(tmp_path, settings):
    """
    Create isolated DATA_DIR for each test.

    Uses tmp_path for isolation, configures Django settings.

    Yields:
        Path to the freshly created data directory.

    Teardown restores the previous DATA_DIR environment variable
    (fix: the original left DATA_DIR set, leaking the tmp path into
    every subsequent test in the same process).
    """
    data_dir = tmp_path / 'archivebox_data'
    data_dir.mkdir()

    # Set environment for subprocess calls, remembering the prior value
    # so it can be restored after the test.
    previous = os.environ.get('DATA_DIR')
    os.environ['DATA_DIR'] = str(data_dir)

    # Update Django settings (pytest-django's `settings` fixture reverts this)
    settings.DATA_DIR = data_dir

    try:
        yield data_dir
    finally:
        # Restore the pre-test environment; tmp_path cleans up the files.
        if previous is None:
            os.environ.pop('DATA_DIR', None)
        else:
            os.environ['DATA_DIR'] = previous
@pytest.fixture
def initialized_archive(isolated_data_dir):
    """
    Initialize ArchiveBox archive in isolated directory.

    Runs `archivebox init` to set up database and directories.

    Returns:
        Path to the initialized data directory (same as isolated_data_dir).
    """
    # Imported lazily so merely collecting tests doesn't trigger Django setup
    from archivebox.cli.archivebox_init import init
    # quick=True skips slow optional setup steps to keep tests fast
    init(setup=True, quick=True)
    return isolated_data_dir
@pytest.fixture
def cli_env(initialized_archive):
    """
    Environment dict for CLI subprocess calls.

    Includes DATA_DIR and disables slow extractors so tests only run the
    cheap title extractor.

    Returns:
        dict[str, str] suitable for passing as subprocess `env=`.
    """
    # Every extractor except 'title' is switched off for speed.
    disabled_extractors = (
        'FAVICON', 'WGET', 'WARC', 'PDF', 'SCREENSHOT', 'DOM',
        'SINGLEFILE', 'READABILITY', 'MERCURY', 'GIT', 'YTDLP', 'HEADERS',
    )
    env = dict(os.environ)
    env.update({
        'DATA_DIR': str(initialized_archive),
        'USE_COLOR': 'False',
        'SHOW_PROGRESS': 'False',
        'SAVE_TITLE': 'True',
    })
    env.update({f'SAVE_{name}': 'False' for name in disabled_extractors})
    return env
# =============================================================================
# CLI Helpers
# =============================================================================
def run_archivebox_cmd(
    args: List[str],
    stdin: Optional[str] = None,
    cwd: Optional[Path] = None,
    env: Optional[Dict[str, str]] = None,
    timeout: int = 60,
) -> Tuple[str, str, int]:
    """
    Run archivebox command, return (stdout, stderr, returncode).

    Args:
        args: Command arguments (e.g., ['crawl', 'create', 'https://example.com'])
        stdin: Optional string to pipe to stdin
        cwd: Working directory (defaults to DATA_DIR from env)
        env: Environment variables (defaults to a copy of os.environ)
        timeout: Command timeout in seconds

    Returns:
        Tuple of (stdout, stderr, returncode)

    Raises:
        subprocess.TimeoutExpired: if the command exceeds ``timeout``.
    """
    cmd = [sys.executable, '-m', 'archivebox'] + list(args)
    # Test against None rather than truthiness: `env or {...}` would silently
    # replace an explicitly-passed empty dict with the full os.environ.
    # Copy in both cases so subprocess never mutates a caller's mapping.
    run_env = dict(env) if env is not None else dict(os.environ)
    run_cwd = cwd or Path(run_env.get('DATA_DIR', '.'))
    result = subprocess.run(
        cmd,
        input=stdin,
        capture_output=True,
        text=True,
        cwd=run_cwd,
        env=run_env,
        timeout=timeout,
    )
    return result.stdout, result.stderr, result.returncode
# =============================================================================
# Output Assertions
# =============================================================================
def parse_jsonl_output(stdout: str) -> List[Dict[str, Any]]:
    """Parse JSONL stdout into a list of dicts, skipping any non-JSON lines.

    Lines that do not begin with ``{`` (log noise, blank lines) and lines that
    fail to decode are silently ignored so callers can parse mixed output.
    """
    parsed: List[Dict[str, Any]] = []
    for raw_line in stdout.strip().split('\n'):
        candidate = raw_line.strip()
        if not candidate.startswith('{'):
            continue
        try:
            parsed.append(json.loads(candidate))
        except json.JSONDecodeError:
            continue
    return parsed
def assert_jsonl_contains_type(stdout: str, record_type: str, min_count: int = 1):
    """Assert ``stdout`` holds at least ``min_count`` records of ``record_type``.

    Returns the matching records so callers can make further assertions.
    """
    matching = [
        record for record in parse_jsonl_output(stdout)
        if record.get('type') == record_type
    ]
    assert len(matching) >= min_count, \
        f"Expected >= {min_count} {record_type}, got {len(matching)}"
    return matching
def assert_jsonl_pass_through(stdout: str, input_records: List[Dict[str, Any]]):
    """Assert every input record with an id re-appears in ``stdout`` (pass-through)."""
    seen_ids = {
        record.get('id')
        for record in parse_jsonl_output(stdout)
        if record.get('id')
    }
    for record in input_records:
        record_id = record.get('id')
        if not record_id:
            continue  # records without ids cannot be matched up
        assert record_id in seen_ids, \
            f"Input record {record_id} not found in output (pass-through failed)"
def assert_record_has_fields(record: Dict[str, Any], required_fields: List[str]):
    """Assert ``record`` contains every required field with a non-None value."""
    for name in required_fields:
        assert name in record, f"Record missing field: {name}"
        value = record[name]
        assert value is not None, f"Record field is None: {name}"
# =============================================================================
# Database Assertions
# =============================================================================
def assert_db_count(model_class, filters: Dict[str, Any], expected: int):
    """Assert the DB row count for ``model_class`` matching ``filters`` equals ``expected``."""
    queryset = model_class.objects.filter(**filters)
    actual = queryset.count()
    assert actual == expected, \
        f"Expected {expected} {model_class.__name__}, got {actual}"
def assert_db_exists(model_class, **filters):
    """Assert at least one ``model_class`` row exists matching ``filters``."""
    found = model_class.objects.filter(**filters).exists()
    assert found, \
        f"No {model_class.__name__} found matching {filters}"
# =============================================================================
# Test Data Factories
# =============================================================================
def create_test_url(domain: str = 'example.com', path: str = None) -> str:
    """Generate a unique test URL under ``domain``.

    A random 8-hex-char path segment is generated when ``path`` is falsy,
    so repeated calls never collide.
    """
    import uuid
    suffix = path or uuid.uuid4().hex[:8]
    return f'https://{domain}/{suffix}'
def create_test_crawl_json(urls: List[str] = None, **kwargs) -> Dict[str, Any]:
    """Build a Crawl JSONL record suitable for piping into CLI commands.

    Defaults max_depth/tags_str/status and forwards any other kwargs verbatim
    as extra record fields.
    """
    from archivebox.misc.jsonl import TYPE_CRAWL
    if not urls:
        urls = [create_test_url()]
    defaulted = ('max_depth', 'tags_str', 'status')
    record: Dict[str, Any] = {
        'type': TYPE_CRAWL,
        'urls': '\n'.join(urls),
        'max_depth': kwargs.get('max_depth', 0),
        'tags_str': kwargs.get('tags_str', ''),
        'status': kwargs.get('status', 'queued'),
    }
    record.update((key, val) for key, val in kwargs.items() if key not in defaulted)
    return record
def create_test_snapshot_json(url: str = None, **kwargs) -> Dict[str, Any]:
    """Build a Snapshot JSONL record suitable for piping into CLI commands.

    Defaults tags_str/status and forwards any other kwargs verbatim as extra
    record fields; generates a unique URL when none is given.
    """
    from archivebox.misc.jsonl import TYPE_SNAPSHOT
    defaulted = ('tags_str', 'status')
    record: Dict[str, Any] = {
        'type': TYPE_SNAPSHOT,
        'url': url or create_test_url(),
        'tags_str': kwargs.get('tags_str', ''),
        'status': kwargs.get('status', 'queued'),
    }
    record.update((key, val) for key, val in kwargs.items() if key not in defaulted)
    return record

View File

@@ -0,0 +1,264 @@
"""
Tests for archivebox archiveresult CLI command.
Tests cover:
- archiveresult create (from Snapshot JSONL, with --plugin, pass-through)
- archiveresult list (with filters)
- archiveresult update
- archiveresult delete
"""
import json
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
parse_jsonl_output,
create_test_url,
)
class TestArchiveResultCreate:
    """Tests for `archivebox archiveresult create`.

    Exercises creating ArchiveResults from piped Snapshot JSONL and the
    pass-through contract: input records re-appear on stdout for chaining.
    """

    def test_create_from_snapshot_jsonl(self, cli_env, initialized_archive):
        """Create archive results from Snapshot JSONL input."""
        url = create_test_url()
        # Create a snapshot first
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        # Pipe snapshot to archiveresult create
        stdout2, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        assert code == 0, f"Command failed: {stderr}"
        records = parse_jsonl_output(stdout2)
        # Should have the Snapshot passed through and ArchiveResult created
        types = [r.get('type') for r in records]
        assert 'Snapshot' in types
        assert 'ArchiveResult' in types
        ar = next(r for r in records if r['type'] == 'ArchiveResult')
        assert ar['plugin'] == 'title'

    def test_create_with_specific_plugin(self, cli_env, initialized_archive):
        """Create archive result for specific plugin."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        stdout2, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=screenshot'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout2)
        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
        assert len(ar_records) >= 1
        assert ar_records[0]['plugin'] == 'screenshot'

    def test_create_pass_through_crawl(self, cli_env, initialized_archive):
        """Pass-through Crawl records unchanged."""
        url = create_test_url()
        # Create crawl and snapshot
        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        crawl = parse_jsonl_output(stdout1)[0]
        stdout2, _, _ = run_archivebox_cmd(
            ['snapshot', 'create'],
            stdin=json.dumps(crawl),
            env=cli_env,
        )
        # Now pipe all to archiveresult create; the full 3-stage pipeline's
        # output should carry every upstream record type forward.
        stdout3, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=stdout2,
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout3)
        types = [r.get('type') for r in records]
        assert 'Crawl' in types
        assert 'Snapshot' in types
        assert 'ArchiveResult' in types

    def test_create_pass_through_only_when_no_snapshots(self, cli_env, initialized_archive):
        """Only pass-through records but no new snapshots returns success."""
        # A Crawl record with a fake id: nothing to create, but the command
        # should still exit 0 and report the pass-through on stderr.
        crawl_record = {'type': 'Crawl', 'id': 'fake-id', 'urls': 'https://example.com'}
        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'create'],
            stdin=json.dumps(crawl_record),
            env=cli_env,
        )
        assert code == 0
        assert 'Passed through' in stderr
class TestArchiveResultList:
    """Tests for `archivebox archiveresult list`.

    Verifies the empty case, --status/--plugin filters, and --limit.
    """

    def test_list_empty(self, cli_env, initialized_archive):
        """List with no archive results returns empty."""
        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'list'],
            env=cli_env,
        )
        assert code == 0
        assert 'Listed 0 archive results' in stderr

    def test_list_filter_by_status(self, cli_env, initialized_archive):
        """Filter archive results by status."""
        # Create snapshot and archive result
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'list', '--status=queued'],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        # Every returned record must satisfy the filter.
        for r in records:
            assert r['status'] == 'queued'

    def test_list_filter_by_plugin(self, cli_env, initialized_archive):
        """Filter archive results by plugin."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'list', '--plugin=title'],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        for r in records:
            assert r['plugin'] == 'title'

    def test_list_with_limit(self, cli_env, initialized_archive):
        """Limit number of results."""
        # Create multiple archive results so the limit actually truncates.
        for _ in range(3):
            url = create_test_url()
            stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
            snapshot = parse_jsonl_output(stdout1)[0]
            run_archivebox_cmd(
                ['archiveresult', 'create', '--plugin=title'],
                stdin=json.dumps(snapshot),
                env=cli_env,
            )
        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'list', '--limit=2'],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) == 2
class TestArchiveResultUpdate:
    """Tests for `archivebox archiveresult update`.

    Updates are driven by piping the target record's JSONL to stdin with the
    new field value given as a flag.
    """

    def test_update_status(self, cli_env, initialized_archive):
        """Update archive result status."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        stdout2, _, _ = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
        stdout3, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'update', '--status=failed'],
            stdin=json.dumps(ar),
            env=cli_env,
        )
        assert code == 0
        assert 'Updated 1 archive results' in stderr
        records = parse_jsonl_output(stdout3)
        # The updated record is echoed back with the new status applied.
        assert records[0]['status'] == 'failed'
class TestArchiveResultDelete:
    """Tests for `archivebox archiveresult delete`.

    Deletion is destructive, so the CLI must refuse without an explicit
    --yes confirmation flag.
    """

    def test_delete_requires_yes(self, cli_env, initialized_archive):
        """Delete requires --yes flag."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        stdout2, _, _ = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
        # No --yes flag: the command must refuse and exit non-zero.
        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'delete'],
            stdin=json.dumps(ar),
            env=cli_env,
        )
        assert code == 1
        assert '--yes' in stderr

    def test_delete_with_yes(self, cli_env, initialized_archive):
        """Delete with --yes flag works."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        stdout2, _, _ = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'delete', '--yes'],
            stdin=json.dumps(ar),
            env=cli_env,
        )
        assert code == 0
        assert 'Deleted 1 archive results' in stderr

View File

@@ -0,0 +1,261 @@
"""
Tests for archivebox crawl CLI command.
Tests cover:
- crawl create (with URLs, from stdin, pass-through)
- crawl list (with filters)
- crawl update
- crawl delete
"""
import json
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
parse_jsonl_output,
assert_jsonl_contains_type,
create_test_url,
create_test_crawl_json,
)
class TestCrawlCreate:
    """Tests for `archivebox crawl create`.

    Covers URL arguments, stdin URL lists, --depth/--tag flags, and the
    pass-through contract for non-Crawl and already-existing Crawl records.
    """

    def test_create_from_url_args(self, cli_env, initialized_archive):
        """Create crawl from URL arguments."""
        url = create_test_url()
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'create', url],
            env=cli_env,
        )
        assert code == 0, f"Command failed: {stderr}"
        assert 'Created crawl' in stderr
        # Check JSONL output
        records = parse_jsonl_output(stdout)
        assert len(records) == 1
        assert records[0]['type'] == 'Crawl'
        assert url in records[0]['urls']

    def test_create_from_stdin_urls(self, cli_env, initialized_archive):
        """Create crawl from stdin URLs (one per line)."""
        urls = [create_test_url() for _ in range(3)]
        stdin = '\n'.join(urls)
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'create'],
            stdin=stdin,
            env=cli_env,
        )
        assert code == 0, f"Command failed: {stderr}"
        records = parse_jsonl_output(stdout)
        # All stdin URLs collapse into a single Crawl record.
        assert len(records) == 1
        crawl = records[0]
        assert crawl['type'] == 'Crawl'
        # All URLs should be in the crawl
        for url in urls:
            assert url in crawl['urls']

    def test_create_with_depth(self, cli_env, initialized_archive):
        """Create crawl with --depth flag."""
        url = create_test_url()
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'create', '--depth=2', url],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert records[0]['max_depth'] == 2

    def test_create_with_tag(self, cli_env, initialized_archive):
        """Create crawl with --tag flag."""
        url = create_test_url()
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'create', '--tag=test-tag', url],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert 'test-tag' in records[0].get('tags_str', '')

    def test_create_pass_through_other_types(self, cli_env, initialized_archive):
        """Pass-through records of other types unchanged."""
        tag_record = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'}
        url = create_test_url()
        # Mixed stdin: a foreign Tag record plus a bare url record.
        stdin = json.dumps(tag_record) + '\n' + json.dumps({'url': url})
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'create'],
            stdin=stdin,
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        # Should have both the passed-through Tag and the new Crawl
        types = [r.get('type') for r in records]
        assert 'Tag' in types
        assert 'Crawl' in types

    def test_create_pass_through_existing_crawl(self, cli_env, initialized_archive):
        """Existing Crawl records (with id) are passed through."""
        # First create a crawl
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        crawl = parse_jsonl_output(stdout1)[0]
        # Now pipe it back - should pass through (same id, no duplicate)
        stdout2, stderr, code = run_archivebox_cmd(
            ['crawl', 'create'],
            stdin=json.dumps(crawl),
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout2)
        assert len(records) == 1
        assert records[0]['id'] == crawl['id']
class TestCrawlList:
    """Tests for `archivebox crawl list`.

    Verifies the empty case, round-tripping created crawls, --status filter,
    and --limit.
    """

    def test_list_empty(self, cli_env, initialized_archive):
        """List with no crawls returns empty."""
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'list'],
            env=cli_env,
        )
        assert code == 0
        assert 'Listed 0 crawls' in stderr

    def test_list_returns_created(self, cli_env, initialized_archive):
        """List returns previously created crawls."""
        url = create_test_url()
        run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'list'],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) >= 1
        assert any(url in r.get('urls', '') for r in records)

    def test_list_filter_by_status(self, cli_env, initialized_archive):
        """Filter crawls by status."""
        url = create_test_url()
        run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'list', '--status=queued'],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        # Every returned record must satisfy the filter.
        for r in records:
            assert r['status'] == 'queued'

    def test_list_with_limit(self, cli_env, initialized_archive):
        """Limit number of results."""
        # Create multiple crawls so the limit actually truncates.
        for _ in range(3):
            run_archivebox_cmd(['crawl', 'create', create_test_url()], env=cli_env)
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'list', '--limit=2'],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) == 2
class TestCrawlUpdate:
    """Tests for `archivebox crawl update`.

    Updates are driven by piping the target Crawl's JSONL to stdin with the
    new field value given as a flag.
    """

    def test_update_status(self, cli_env, initialized_archive):
        """Update crawl status."""
        # Create a crawl
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        crawl = parse_jsonl_output(stdout1)[0]
        # Update it
        stdout2, stderr, code = run_archivebox_cmd(
            ['crawl', 'update', '--status=started'],
            stdin=json.dumps(crawl),
            env=cli_env,
        )
        assert code == 0
        assert 'Updated 1 crawls' in stderr
        records = parse_jsonl_output(stdout2)
        # The updated record is echoed back with the new status applied.
        assert records[0]['status'] == 'started'
class TestCrawlDelete:
    """Tests for `archivebox crawl delete`.

    Deletion is destructive: the CLI must refuse without --yes, honor --yes,
    and support a --dry-run preview.
    """

    def test_delete_requires_yes(self, cli_env, initialized_archive):
        """Delete requires --yes flag."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        crawl = parse_jsonl_output(stdout1)[0]
        # No --yes flag: the command must refuse and exit non-zero.
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'delete'],
            stdin=json.dumps(crawl),
            env=cli_env,
        )
        assert code == 1
        assert '--yes' in stderr

    def test_delete_with_yes(self, cli_env, initialized_archive):
        """Delete with --yes flag works."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        crawl = parse_jsonl_output(stdout1)[0]
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'delete', '--yes'],
            stdin=json.dumps(crawl),
            env=cli_env,
        )
        assert code == 0
        assert 'Deleted 1 crawls' in stderr

    def test_delete_dry_run(self, cli_env, initialized_archive):
        """Dry run shows what would be deleted."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        crawl = parse_jsonl_output(stdout1)[0]
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'delete', '--dry-run'],
            stdin=json.dumps(crawl),
            env=cli_env,
        )
        assert code == 0
        assert 'Would delete' in stderr
        assert 'dry run' in stderr.lower()

View File

@@ -0,0 +1,254 @@
"""
Tests for archivebox run CLI command.
Tests cover:
- run with stdin JSONL (Crawl, Snapshot, ArchiveResult)
- create-or-update behavior (records with/without id)
- pass-through output (for chaining)
"""
import json
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
parse_jsonl_output,
create_test_url,
create_test_crawl_json,
create_test_snapshot_json,
)
class TestRunWithCrawl:
    """Tests for `archivebox run` with Crawl input.

    `run` implements create-or-update semantics: records without an id are
    created first, records with an id are re-queued.
    """

    def test_run_with_new_crawl(self, cli_env, initialized_archive):
        """Run creates and processes a new Crawl (no id)."""
        crawl_record = create_test_crawl_json()
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(crawl_record),
            env=cli_env,
            timeout=120,  # run performs actual archiving work, allow extra time
        )
        assert code == 0, f"Command failed: {stderr}"
        # Should output the created Crawl
        records = parse_jsonl_output(stdout)
        crawl_records = [r for r in records if r.get('type') == 'Crawl']
        assert len(crawl_records) >= 1
        assert crawl_records[0].get('id')  # Should have an id now

    def test_run_with_existing_crawl(self, cli_env, initialized_archive):
        """Run re-queues an existing Crawl (with id)."""
        url = create_test_url()
        # First create a crawl
        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        crawl = parse_jsonl_output(stdout1)[0]
        # Run with the existing crawl
        stdout2, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(crawl),
            env=cli_env,
            timeout=120,
        )
        assert code == 0
        records = parse_jsonl_output(stdout2)
        assert len(records) >= 1
class TestRunWithSnapshot:
    """Tests for `archivebox run` with Snapshot input.

    Snapshots follow the same create-or-update semantics as Crawls, and
    bare {'url': ...} records (no type field) are accepted too.
    """

    def test_run_with_new_snapshot(self, cli_env, initialized_archive):
        """Run creates and processes a new Snapshot (no id, just url)."""
        snapshot_record = create_test_snapshot_json()
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(snapshot_record),
            env=cli_env,
            timeout=120,  # run performs actual archiving work, allow extra time
        )
        assert code == 0, f"Command failed: {stderr}"
        records = parse_jsonl_output(stdout)
        snapshot_records = [r for r in records if r.get('type') == 'Snapshot']
        assert len(snapshot_records) >= 1
        assert snapshot_records[0].get('id')

    def test_run_with_existing_snapshot(self, cli_env, initialized_archive):
        """Run re-queues an existing Snapshot (with id)."""
        url = create_test_url()
        # First create a snapshot
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        # Run with the existing snapshot
        stdout2, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(snapshot),
            env=cli_env,
            timeout=120,
        )
        assert code == 0
        records = parse_jsonl_output(stdout2)
        assert len(records) >= 1

    def test_run_with_plain_url(self, cli_env, initialized_archive):
        """Run accepts plain URL records (no type field)."""
        url = create_test_url()
        url_record = {'url': url}
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(url_record),
            env=cli_env,
            timeout=120,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) >= 1
class TestRunWithArchiveResult:
    """Tests for `archivebox run` with ArchiveResult input."""

    def test_run_requeues_failed_archiveresult(self, cli_env, initialized_archive):
        """Run re-queues a failed ArchiveResult."""
        url = create_test_url()
        # Create snapshot and archive result
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        stdout2, _, _ = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
        # Update to failed: mutate the local record to match, then persist the
        # status change in the DB via the update subcommand.
        ar['status'] = 'failed'
        run_archivebox_cmd(
            ['archiveresult', 'update', '--status=failed'],
            stdin=json.dumps(ar),
            env=cli_env,
        )
        # Now run should re-queue it
        stdout3, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(ar),
            env=cli_env,
            timeout=120,
        )
        assert code == 0
        records = parse_jsonl_output(stdout3)
        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
        assert len(ar_records) >= 1
class TestRunPassThrough:
    """Tests for pass-through behavior in `archivebox run`.

    Records `run` does not know how to process must be echoed unchanged so
    pipelines can be chained losslessly.
    """

    def test_run_passes_through_unknown_types(self, cli_env, initialized_archive):
        """Run passes through records with unknown types."""
        unknown_record = {'type': 'Unknown', 'id': 'fake-id', 'data': 'test'}
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(unknown_record),
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        unknown_records = [r for r in records if r.get('type') == 'Unknown']
        assert len(unknown_records) == 1
        # The payload must survive the round trip untouched.
        assert unknown_records[0]['data'] == 'test'

    def test_run_outputs_all_processed_records(self, cli_env, initialized_archive):
        """Run outputs all processed records for chaining."""
        url = create_test_url()
        crawl_record = create_test_crawl_json(urls=[url])
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(crawl_record),
            env=cli_env,
            timeout=120,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        # Should have at least the Crawl in output
        assert len(records) >= 1
class TestRunMixedInput:
    """Tests for `archivebox run` with mixed record types."""

    def test_run_handles_mixed_types(self, cli_env, initialized_archive):
        """Run handles mixed Crawl/Snapshot/ArchiveResult input."""
        crawl = create_test_crawl_json()
        snapshot = create_test_snapshot_json()
        unknown = {'type': 'Tag', 'id': 'fake', 'name': 'test'}
        stdin = '\n'.join([
            json.dumps(crawl),
            json.dumps(snapshot),
            json.dumps(unknown),
        ])
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=stdin,
            env=cli_env,
            timeout=120,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        types = set(r.get('type') for r in records)
        # Should have processed Crawl and Snapshot, passed through Tag
        # NOTE(review): this disjunction is nearly vacuous -- if Tag records
        # always pass through (see TestRunPassThrough), it can never fail.
        # Consider asserting 'Tag' in types AND ('Crawl' in types or
        # 'Snapshot' in types) once that contract is confirmed.
        assert 'Crawl' in types or 'Snapshot' in types or 'Tag' in types
class TestRunEmpty:
    """Tests for `archivebox run` edge cases."""

    def test_run_empty_stdin(self, cli_env, initialized_archive):
        """Run with empty stdin returns success."""
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin='',
            env=cli_env,
        )
        assert code == 0

    def test_run_no_records_to_process(self, cli_env, initialized_archive):
        """Run with only pass-through records shows message."""
        # A record run cannot act on: it should be passed through and the
        # CLI should note on stderr that nothing was processed.
        unknown = {'type': 'Unknown', 'id': 'fake'}
        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(unknown),
            env=cli_env,
        )
        assert code == 0
        assert 'No records to process' in stderr

View File

@@ -0,0 +1,274 @@
"""
Tests for archivebox snapshot CLI command.
Tests cover:
- snapshot create (from URLs, from Crawl JSONL, pass-through)
- snapshot list (with filters)
- snapshot update
- snapshot delete
"""
import json
import pytest
from archivebox.tests.conftest import (
run_archivebox_cmd,
parse_jsonl_output,
assert_jsonl_contains_type,
create_test_url,
)
class TestSnapshotCreate:
    """Tests for `archivebox snapshot create`.

    Covers URL arguments, piped Crawl JSONL expansion, --tag, pass-through
    of foreign record types, and multi-URL creation.
    """

    def test_create_from_url_args(self, cli_env, initialized_archive):
        """Create snapshot from URL arguments."""
        url = create_test_url()
        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'create', url],
            env=cli_env,
        )
        assert code == 0, f"Command failed: {stderr}"
        assert 'Created' in stderr
        records = parse_jsonl_output(stdout)
        assert len(records) == 1
        assert records[0]['type'] == 'Snapshot'
        assert records[0]['url'] == url

    def test_create_from_crawl_jsonl(self, cli_env, initialized_archive):
        """Create snapshots from Crawl JSONL input."""
        url = create_test_url()
        # First create a crawl
        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        crawl = parse_jsonl_output(stdout1)[0]
        # Pipe crawl to snapshot create
        stdout2, stderr, code = run_archivebox_cmd(
            ['snapshot', 'create'],
            stdin=json.dumps(crawl),
            env=cli_env,
        )
        assert code == 0, f"Command failed: {stderr}"
        records = parse_jsonl_output(stdout2)
        # Should have the Crawl passed through and the Snapshot created
        types = [r.get('type') for r in records]
        assert 'Crawl' in types
        assert 'Snapshot' in types
        snapshot = next(r for r in records if r['type'] == 'Snapshot')
        assert snapshot['url'] == url

    def test_create_with_tag(self, cli_env, initialized_archive):
        """Create snapshot with --tag flag."""
        url = create_test_url()
        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'create', '--tag=test-tag', url],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert 'test-tag' in records[0].get('tags_str', '')

    def test_create_pass_through_other_types(self, cli_env, initialized_archive):
        """Pass-through records of other types unchanged."""
        tag_record = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'}
        url = create_test_url()
        # Mixed stdin: a foreign Tag record plus a bare url record.
        stdin = json.dumps(tag_record) + '\n' + json.dumps({'url': url})
        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'create'],
            stdin=stdin,
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        types = [r.get('type') for r in records]
        assert 'Tag' in types
        assert 'Snapshot' in types

    def test_create_multiple_urls(self, cli_env, initialized_archive):
        """Create snapshots from multiple URLs."""
        urls = [create_test_url() for _ in range(3)]
        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'create'] + urls,
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        # One Snapshot per URL (unlike crawl create, which collapses to one).
        assert len(records) == 3
        created_urls = {r['url'] for r in records}
        for url in urls:
            assert url in created_urls
class TestSnapshotList:
    """Tests for `archivebox snapshot list`.

    Verifies the empty case, round-tripping created snapshots, --status and
    --url__icontains filters, and --limit.
    """

    def test_list_empty(self, cli_env, initialized_archive):
        """List with no snapshots returns empty."""
        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'list'],
            env=cli_env,
        )
        assert code == 0
        assert 'Listed 0 snapshots' in stderr

    def test_list_returns_created(self, cli_env, initialized_archive):
        """List returns previously created snapshots."""
        url = create_test_url()
        run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'list'],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) >= 1
        assert any(r.get('url') == url for r in records)

    def test_list_filter_by_status(self, cli_env, initialized_archive):
        """Filter snapshots by status."""
        url = create_test_url()
        run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'list', '--status=queued'],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        # Every returned record must satisfy the filter.
        for r in records:
            assert r['status'] == 'queued'

    def test_list_filter_by_url_contains(self, cli_env, initialized_archive):
        """Filter snapshots by URL contains."""
        # Unique domain guarantees the filter matches exactly one snapshot.
        url = create_test_url(domain='unique-domain-12345.com')
        run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'list', '--url__icontains=unique-domain-12345'],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) == 1
        assert 'unique-domain-12345' in records[0]['url']

    def test_list_with_limit(self, cli_env, initialized_archive):
        """Limit number of results."""
        # Create multiple snapshots so the limit actually truncates.
        for _ in range(3):
            run_archivebox_cmd(['snapshot', 'create', create_test_url()], env=cli_env)
        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'list', '--limit=2'],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) == 2
class TestSnapshotUpdate:
    """Tests for `archivebox snapshot update`.

    Updates are driven by piping the target Snapshot's JSONL to stdin with
    the new field value given as a flag.
    """

    def test_update_status(self, cli_env, initialized_archive):
        """Update snapshot status."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        stdout2, stderr, code = run_archivebox_cmd(
            ['snapshot', 'update', '--status=started'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        assert code == 0
        assert 'Updated 1 snapshots' in stderr
        records = parse_jsonl_output(stdout2)
        # The updated record is echoed back with the new status applied.
        assert records[0]['status'] == 'started'

    def test_update_add_tag(self, cli_env, initialized_archive):
        """Update snapshot by adding tag."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        stdout2, stderr, code = run_archivebox_cmd(
            ['snapshot', 'update', '--tag=new-tag'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        assert code == 0
        assert 'Updated 1 snapshots' in stderr
class TestSnapshotDelete:
    """Tests for `archivebox snapshot delete`.

    Deletion is destructive: the CLI must refuse without --yes, honor --yes,
    and support a --dry-run preview.
    """

    def test_delete_requires_yes(self, cli_env, initialized_archive):
        """Delete requires --yes flag."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        # No --yes flag: the command must refuse and exit non-zero.
        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'delete'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        assert code == 1
        assert '--yes' in stderr

    def test_delete_with_yes(self, cli_env, initialized_archive):
        """Delete with --yes flag works."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'delete', '--yes'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        assert code == 0
        assert 'Deleted 1 snapshots' in stderr

    def test_delete_dry_run(self, cli_env, initialized_archive):
        """Dry run shows what would be deleted."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
        snapshot = parse_jsonl_output(stdout1)[0]
        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'delete', '--dry-run'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        assert code == 0
        assert 'Would delete' in stderr

View File

@@ -32,7 +32,7 @@ _supervisord_proc = None
ORCHESTRATOR_WORKER = { ORCHESTRATOR_WORKER = {
"name": "worker_orchestrator", "name": "worker_orchestrator",
"command": "archivebox manage orchestrator", # runs forever by default "command": "archivebox run", # runs forever by default
"autostart": "true", "autostart": "true",
"autorestart": "true", "autorestart": "true",
"stdout_logfile": "logs/worker_orchestrator.log", "stdout_logfile": "logs/worker_orchestrator.log",