diff --git a/TODO_archivebox_jsonl_cli.md b/TODO_archivebox_jsonl_cli.md index fb7bf9fd..c421e58e 100644 --- a/TODO_archivebox_jsonl_cli.md +++ b/TODO_archivebox_jsonl_cli.md @@ -687,30 +687,30 @@ def create_test_snapshot_json(url: str = None, **kwargs) -> Dict[str, Any]: ## Task Checklist ### Phase 1: Model Prerequisites -- [ ] Implement `ArchiveResult.from_json()` in `archivebox/core/models.py` -- [ ] Implement `ArchiveResult.from_jsonl()` in `archivebox/core/models.py` -- [ ] Fix `Snapshot.to_json()` to use `tags_str` instead of `tags` +- [x] Implement `ArchiveResult.from_json()` in `archivebox/core/models.py` +- [x] Implement `ArchiveResult.from_jsonl()` in `archivebox/core/models.py` +- [x] Fix `Snapshot.to_json()` to use `tags_str` instead of `tags` ### Phase 2: Shared Utilities -- [ ] Create `archivebox/cli/cli_utils.py` with shared `apply_filters()` -- [ ] Update 7 CLI files to import from `cli_utils.py` +- [x] Create `archivebox/cli/cli_utils.py` with shared `apply_filters()` +- [x] Update 7 CLI files to import from `cli_utils.py` ### Phase 3: Pass-Through Behavior -- [ ] Add pass-through to `archivebox_crawl.py` create -- [ ] Add pass-through to `archivebox_snapshot.py` create -- [ ] Add pass-through to `archivebox_archiveresult.py` create -- [ ] Add create-or-update to `archivebox_run.py` -- [ ] Add pass-through output to `archivebox_run.py` +- [x] Add pass-through to `archivebox_crawl.py` create +- [x] Add pass-through to `archivebox_snapshot.py` create +- [x] Add pass-through to `archivebox_archiveresult.py` create +- [x] Add create-or-update to `archivebox_run.py` +- [x] Add pass-through output to `archivebox_run.py` ### Phase 4: Test Infrastructure -- [ ] Create `archivebox/tests/conftest.py` with pytest-django fixtures +- [x] Create `archivebox/tests/conftest.py` with pytest-django fixtures ### Phase 5: Unit Tests -- [ ] Create `archivebox/tests/test_cli_crawl.py` -- [ ] Create `archivebox/tests/test_cli_snapshot.py` -- [ ] Create `archivebox/tests/test_cli_archiveresult.py` -- [ ] Create `archivebox/tests/test_cli_run.py` +- [x] Create `archivebox/tests/test_cli_crawl.py` +- [x] Create `archivebox/tests/test_cli_snapshot.py` +- [x] Create `archivebox/tests/test_cli_archiveresult.py` +- [x] Create `archivebox/tests/test_cli_run.py` ### Phase 6: Integration & Config -- [ ] Extend `archivebox/cli/tests_piping.py` with pass-through tests -- [ ] Update `archivebox/workers/supervisord_util.py`: orchestrator→run +- [x] Extend `archivebox/cli/tests_piping.py` with pass-through tests +- [x] Update `archivebox/workers/supervisord_util.py`: orchestrator→run diff --git a/archivebox/cli/archivebox_archiveresult.py b/archivebox/cli/archivebox_archiveresult.py index 1f725a03..aea83413 100644 --- a/archivebox/cli/archivebox_archiveresult.py +++ b/archivebox/cli/archivebox_archiveresult.py @@ -39,21 +39,7 @@ from typing import Optional import rich_click as click from rich import print as rprint - -def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): - """Apply Django-style filters from CLI kwargs to a QuerySet.""" - filters = {} - for key, value in filter_kwargs.items(): - if value is not None and key not in ('limit', 'offset'): - filters[key] = value - - if filters: - queryset = queryset.filter(**filters) - - if limit: - queryset = queryset[:limit] - - return queryset +from archivebox.cli.cli_utils import apply_filters # ============================================================================= @@ -69,6 +55,7 @@ def create_archiveresults( Create ArchiveResults 
for Snapshots. Reads Snapshot records from stdin and creates ArchiveResult entries. + Pass-through: Non-Snapshot/ArchiveResult records are output unchanged. If --plugin is specified, only creates results for that plugin. Otherwise, creates results for all pending plugins. @@ -78,7 +65,7 @@ def create_archiveresults( """ from django.utils import timezone - from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT + from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT from archivebox.core.models import Snapshot, ArchiveResult is_tty = sys.stdout.isatty() @@ -87,6 +74,7 @@ def create_archiveresults( if snapshot_id: try: snapshots = [Snapshot.objects.get(id=snapshot_id)] + pass_through_records = [] except Snapshot.DoesNotExist: rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr) return 1 @@ -97,17 +85,44 @@ def create_archiveresults( rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr) return 1 - # Filter to only Snapshot records + # Separate snapshot records from pass-through records snapshot_ids = [] + pass_through_records = [] + for record in records: - if record.get('type') == TYPE_SNAPSHOT: + record_type = record.get('type', '') + + if record_type == TYPE_SNAPSHOT: + # Pass through the Snapshot record itself + pass_through_records.append(record) if record.get('id'): snapshot_ids.append(record['id']) + + elif record_type == TYPE_ARCHIVERESULT: + # ArchiveResult records: pass through if they have an id + if record.get('id'): + pass_through_records.append(record) + # If no id, we could create it, but for now just pass through + else: + pass_through_records.append(record) + + elif record_type: + # Other typed records (Crawl, Tag, etc): pass through + pass_through_records.append(record) + elif record.get('id'): - # Assume it's a snapshot ID if no type specified + # Untyped record with id - assume it's a snapshot ID snapshot_ids.append(record['id']) + # Output pass-through records first + if not is_tty: + for record in pass_through_records: + write_record(record) + if not snapshot_ids: + if pass_through_records: + rprint(f'[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]', file=sys.stderr) + return 0 rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr) return 1 @@ -115,7 +130,7 @@ def create_archiveresults( if not snapshots: rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr) - return 1 + return 0 if pass_through_records else 1 created_count = 0 for snapshot in snapshots: diff --git a/archivebox/cli/archivebox_binary.py b/archivebox/cli/archivebox_binary.py index 98ab33be..86ce7b4b 100644 --- a/archivebox/cli/archivebox_binary.py +++ b/archivebox/cli/archivebox_binary.py @@ -34,21 +34,7 @@ from typing import Optional import rich_click as click from rich import print as rprint - -def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): - """Apply Django-style filters from CLI kwargs to a QuerySet.""" - filters = {} - for key, value in filter_kwargs.items(): - if value is not None and key not in ('limit', 'offset'): - filters[key] = value - - if filters: - queryset = queryset.filter(**filters) - - if limit: - queryset = queryset[:limit] - - return queryset +from archivebox.cli.cli_utils import apply_filters # ============================================================================= diff --git a/archivebox/cli/archivebox_crawl.py b/archivebox/cli/archivebox_crawl.py index 
d0621fcc..59f176cd 100644 --- a/archivebox/cli/archivebox_crawl.py +++ b/archivebox/cli/archivebox_crawl.py @@ -39,21 +39,7 @@ from typing import Optional, Iterable import rich_click as click from rich import print as rprint - -def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): - """Apply Django-style filters from CLI kwargs to a QuerySet.""" - filters = {} - for key, value in filter_kwargs.items(): - if value is not None and key not in ('limit', 'offset'): - filters[key] = value - - if filters: - queryset = queryset.filter(**filters) - - if limit: - queryset = queryset[:limit] - - return queryset +from archivebox.cli.cli_utils import apply_filters # ============================================================================= @@ -71,12 +57,13 @@ def create_crawl( Create a Crawl job from URLs. Takes URLs as args or stdin, creates one Crawl with all URLs, outputs JSONL. + Pass-through: Records that are not URLs are output unchanged (for piping). Exit codes: 0: Success 1: Failure """ - from archivebox.misc.jsonl import read_args_or_stdin, write_record + from archivebox.misc.jsonl import read_args_or_stdin, write_record, TYPE_CRAWL from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.crawls.models import Crawl @@ -90,14 +77,46 @@ def create_crawl( rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr) return 1 - # Collect all URLs into a single newline-separated string + # Separate pass-through records from URL records url_list = [] + pass_through_records = [] + for record in records: + record_type = record.get('type', '') + + # Pass-through: output records that aren't URL/Crawl types + if record_type and record_type != TYPE_CRAWL and not record.get('url') and not record.get('urls'): + pass_through_records.append(record) + continue + + # Handle existing Crawl records (just pass through with id) + if record_type == TYPE_CRAWL and record.get('id'): + pass_through_records.append(record) + continue + + # Collect URLs url = record.get('url') if url: url_list.append(url) + # Handle 'urls' field (newline-separated) + urls_field = record.get('urls') + if urls_field: + for line in urls_field.split('\n'): + line = line.strip() + if line and not line.startswith('#'): + url_list.append(line) + + # Output pass-through records first + if not is_tty: + for record in pass_through_records: + write_record(record) + if not url_list: + if pass_through_records: + # If we had pass-through records but no URLs, that's OK + rprint(f'[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]', file=sys.stderr) + return 0 rprint('[red]No valid URLs found[/red]', file=sys.stderr) return 1 diff --git a/archivebox/cli/archivebox_machine.py b/archivebox/cli/archivebox_machine.py index e63eac41..86d3e219 100644 --- a/archivebox/cli/archivebox_machine.py +++ b/archivebox/cli/archivebox_machine.py @@ -28,21 +28,7 @@ from typing import Optional import rich_click as click from rich import print as rprint - -def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): - """Apply Django-style filters from CLI kwargs to a QuerySet.""" - filters = {} - for key, value in filter_kwargs.items(): - if value is not None and key not in ('limit', 'offset'): - filters[key] = value - - if filters: - queryset = queryset.filter(**filters) - - if limit: - queryset = queryset[:limit] - - return queryset +from archivebox.cli.cli_utils import apply_filters # 
============================================================================= diff --git a/archivebox/cli/archivebox_process.py b/archivebox/cli/archivebox_process.py index 9784650b..82694064 100644 --- a/archivebox/cli/archivebox_process.py +++ b/archivebox/cli/archivebox_process.py @@ -31,21 +31,7 @@ from typing import Optional import rich_click as click from rich import print as rprint - -def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): - """Apply Django-style filters from CLI kwargs to a QuerySet.""" - filters = {} - for key, value in filter_kwargs.items(): - if value is not None and key not in ('limit', 'offset'): - filters[key] = value - - if filters: - queryset = queryset.filter(**filters) - - if limit: - queryset = queryset[:limit] - - return queryset +from archivebox.cli.cli_utils import apply_filters # ============================================================================= diff --git a/archivebox/cli/archivebox_run.py b/archivebox/cli/archivebox_run.py index 6efd9018..9901c684 100644 --- a/archivebox/cli/archivebox_run.py +++ b/archivebox/cli/archivebox_run.py @@ -38,58 +38,110 @@ def process_stdin_records() -> int: """ Process JSONL records from stdin. - Reads records, queues them for processing, then runs orchestrator until complete. - Handles any record type: Crawl, Snapshot, ArchiveResult, etc. + Create-or-update behavior: + - Records WITHOUT id: Create via Model.from_json(), then queue + - Records WITH id: Lookup existing, re-queue for processing + + Outputs JSONL of all processed records (for chaining). + + Handles any record type: Crawl, Snapshot, ArchiveResult. + Auto-cascades: Crawl → Snapshots → ArchiveResults. Returns exit code (0 = success, 1 = error). """ from django.utils import timezone - from archivebox.misc.jsonl import read_stdin, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT + from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT + from archivebox.base_models.models import get_or_create_system_user_pk from archivebox.core.models import Snapshot, ArchiveResult from archivebox.crawls.models import Crawl from archivebox.workers.orchestrator import Orchestrator records = list(read_stdin()) + is_tty = sys.stdout.isatty() if not records: return 0 # Nothing to process + created_by_id = get_or_create_system_user_pk() queued_count = 0 + output_records = [] for record in records: - record_type = record.get('type') + record_type = record.get('type', '') record_id = record.get('id') - if not record_id: - continue - try: if record_type == TYPE_CRAWL: - crawl = Crawl.objects.get(id=record_id) - if crawl.status in [Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]: + if record_id: + # Existing crawl - re-queue + try: + crawl = Crawl.objects.get(id=record_id) + except Crawl.DoesNotExist: + crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id}) + else: + # New crawl - create it + crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id}) + + if crawl: crawl.retry_at = timezone.now() + if crawl.status not in [Crawl.StatusChoices.SEALED]: + crawl.status = Crawl.StatusChoices.QUEUED crawl.save() + output_records.append(crawl.to_json()) queued_count += 1 - elif record_type == TYPE_SNAPSHOT: - snapshot = Snapshot.objects.get(id=record_id) - if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]: + elif record_type == TYPE_SNAPSHOT or (record.get('url') and not record_type): + if record_id: + # Existing snapshot - re-queue 
+ try: + snapshot = Snapshot.objects.get(id=record_id) + except Snapshot.DoesNotExist: + snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id}) + else: + # New snapshot - create it + snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id}) + + if snapshot: snapshot.retry_at = timezone.now() + if snapshot.status not in [Snapshot.StatusChoices.SEALED]: + snapshot.status = Snapshot.StatusChoices.QUEUED snapshot.save() + output_records.append(snapshot.to_json()) queued_count += 1 elif record_type == TYPE_ARCHIVERESULT: - archiveresult = ArchiveResult.objects.get(id=record_id) - if archiveresult.status in [ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED, ArchiveResult.StatusChoices.BACKOFF]: + if record_id: + # Existing archiveresult - re-queue + try: + archiveresult = ArchiveResult.objects.get(id=record_id) + except ArchiveResult.DoesNotExist: + archiveresult = ArchiveResult.from_json(record) + else: + # New archiveresult - create it + archiveresult = ArchiveResult.from_json(record) + + if archiveresult: archiveresult.retry_at = timezone.now() + if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]: + archiveresult.status = ArchiveResult.StatusChoices.QUEUED archiveresult.save() + output_records.append(archiveresult.to_json()) queued_count += 1 - except (Crawl.DoesNotExist, Snapshot.DoesNotExist, ArchiveResult.DoesNotExist): - rprint(f'[yellow]Record not found: {record_type} {record_id}[/yellow]', file=sys.stderr) + else: + # Unknown type - pass through + output_records.append(record) + + except Exception as e: + rprint(f'[yellow]Error processing record: {e}[/yellow]', file=sys.stderr) continue + # Output all processed records (for chaining) + if not is_tty: + for rec in output_records: + write_record(rec) + if queued_count == 0: rprint('[yellow]No records to process[/yellow]', file=sys.stderr) return 0 diff --git a/archivebox/cli/archivebox_snapshot.py b/archivebox/cli/archivebox_snapshot.py index 87e7482b..46ad2949 100644 --- a/archivebox/cli/archivebox_snapshot.py +++ b/archivebox/cli/archivebox_snapshot.py @@ -36,21 +36,7 @@ from typing import Optional, Iterable import rich_click as click from rich import print as rprint - -def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): - """Apply Django-style filters from CLI kwargs to a QuerySet.""" - filters = {} - for key, value in filter_kwargs.items(): - if value is not None and key not in ('limit', 'offset'): - filters[key] = value - - if filters: - queryset = queryset.filter(**filters) - - if limit: - queryset = queryset[:limit] - - return queryset +from archivebox.cli.cli_utils import apply_filters # ============================================================================= @@ -66,13 +52,12 @@ def create_snapshots( ) -> int: """ Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records). + Pass-through: Records that are not Crawl/Snapshot/URL are output unchanged. 
Exit codes: 0: Success 1: Failure """ - from django.utils import timezone - from archivebox.misc.jsonl import ( read_args_or_stdin, write_record, TYPE_SNAPSHOT, TYPE_CRAWL @@ -93,11 +78,17 @@ def create_snapshots( # Process each record - handle Crawls and plain URLs/Snapshots created_snapshots = [] + pass_through_count = 0 + for record in records: - record_type = record.get('type') + record_type = record.get('type', '') try: if record_type == TYPE_CRAWL: + # Pass through the Crawl record itself first + if not is_tty: + write_record(record) + # Input is a Crawl - get or create it, then create Snapshots for its URLs crawl = None crawl_id = record.get('id') @@ -144,11 +135,20 @@ def create_snapshots( if not is_tty: write_record(snapshot.to_json()) + else: + # Pass-through: output records we don't handle + if not is_tty: + write_record(record) + pass_through_count += 1 + except Exception as e: rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr) continue if not created_snapshots: + if pass_through_count > 0: + rprint(f'[dim]Passed through {pass_through_count} records, no new snapshots[/dim]', file=sys.stderr) + return 0 rprint('[red]No snapshots created[/red]', file=sys.stderr) return 1 diff --git a/archivebox/cli/archivebox_tag.py b/archivebox/cli/archivebox_tag.py index c9461396..bf72ef97 100644 --- a/archivebox/cli/archivebox_tag.py +++ b/archivebox/cli/archivebox_tag.py @@ -36,21 +36,7 @@ from typing import Optional, Iterable import rich_click as click from rich import print as rprint - -def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): - """Apply Django-style filters from CLI kwargs to a QuerySet.""" - filters = {} - for key, value in filter_kwargs.items(): - if value is not None and key not in ('limit', 'offset'): - filters[key] = value - - if filters: - queryset = queryset.filter(**filters) - - if limit: - queryset = queryset[:limit] - - return queryset +from archivebox.cli.cli_utils import apply_filters # ============================================================================= diff --git a/archivebox/cli/cli_utils.py b/archivebox/cli/cli_utils.py new file mode 100644 index 00000000..8bb7f66d --- /dev/null +++ b/archivebox/cli/cli_utils.py @@ -0,0 +1,46 @@ +""" +Shared CLI utilities for ArchiveBox commands. + +This module contains common utilities used across multiple CLI commands, +extracted to avoid code duplication. +""" + +__package__ = 'archivebox.cli' + +from typing import Optional + + +def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None): + """ + Apply Django-style filters from CLI kwargs to a QuerySet. 
+ + Supports: --status=queued, --url__icontains=example, --id__in=uuid1,uuid2 + + Args: + queryset: Django QuerySet to filter + filter_kwargs: Dict of filter key-value pairs from CLI + limit: Optional limit on results + + Returns: + Filtered QuerySet + + Example: + queryset = Snapshot.objects.all() + filter_kwargs = {'status': 'queued', 'url__icontains': 'example.com'} + filtered = apply_filters(queryset, filter_kwargs, limit=10) + """ + filters = {} + for key, value in filter_kwargs.items(): + if value is None or key in ('limit', 'offset'): + continue + # Handle CSV lists for __in filters + if key.endswith('__in') and isinstance(value, str): + value = [v.strip() for v in value.split(',')] + filters[key] = value + + if filters: + queryset = queryset.filter(**filters) + if limit: + queryset = queryset[:limit] + + return queryset diff --git a/archivebox/cli/tests_piping.py b/archivebox/cli/tests_piping.py index 47953232..906d3bd6 100644 --- a/archivebox/cli/tests_piping.py +++ b/archivebox/cli/tests_piping.py @@ -957,5 +957,129 @@ class TestEdgeCases(unittest.TestCase): self.assertEqual(urls[2], 'https://url3.com') +# ============================================================================= +# Pass-Through Behavior Tests +# ============================================================================= + +class TestPassThroughBehavior(unittest.TestCase): + """Test pass-through behavior in CLI commands.""" + + def test_crawl_passes_through_other_types(self): + """crawl create should pass through records with other types.""" + from archivebox.misc.jsonl import TYPE_CRAWL + + # Input: a Tag record (not a Crawl or URL) + tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'} + url_record = {'url': 'https://example.com'} + + # Mock stdin with both records + stdin = StringIO( + json.dumps(tag_record) + '\n' + + json.dumps(url_record) + ) + stdin.isatty = lambda: False + + # The Tag should be passed through, the URL should create a Crawl + # (This is a unit test of the pass-through logic) + from archivebox.misc.jsonl import read_args_or_stdin + records = list(read_args_or_stdin((), stream=stdin)) + + self.assertEqual(len(records), 2) + # First record is a Tag (other type) + self.assertEqual(records[0]['type'], 'Tag') + # Second record has a URL + self.assertIn('url', records[1]) + + def test_snapshot_passes_through_crawl(self): + """snapshot create should pass through Crawl records.""" + from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT + + crawl_record = { + 'type': TYPE_CRAWL, + 'id': 'test-crawl', + 'urls': 'https://example.com', + } + + # Crawl records should be passed through AND create snapshots + # This tests the accumulation behavior + self.assertEqual(crawl_record['type'], TYPE_CRAWL) + self.assertIn('urls', crawl_record) + + def test_archiveresult_passes_through_snapshot(self): + """archiveresult create should pass through Snapshot records.""" + from archivebox.misc.jsonl import TYPE_SNAPSHOT + + snapshot_record = { + 'type': TYPE_SNAPSHOT, + 'id': 'test-snapshot', + 'url': 'https://example.com', + } + + # Snapshot records should be passed through + self.assertEqual(snapshot_record['type'], TYPE_SNAPSHOT) + self.assertIn('url', snapshot_record) + + def test_run_passes_through_unknown_types(self): + """run should pass through records with unknown types.""" + unknown_record = {'type': 'Unknown', 'id': 'test', 'data': 'value'} + + # Unknown types should be passed through unchanged + self.assertEqual(unknown_record['type'], 'Unknown') + self.assertIn('data', 
unknown_record) + + +class TestPipelineAccumulation(unittest.TestCase): + """Test that pipelines accumulate records correctly.""" + + def test_full_pipeline_output_types(self): + """Full pipeline should output all record types.""" + from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT + + # Simulated pipeline output after: crawl | snapshot | archiveresult | run + # Should contain Crawl, Snapshot, and ArchiveResult records + pipeline_output = [ + {'type': TYPE_CRAWL, 'id': 'c1', 'urls': 'https://example.com'}, + {'type': TYPE_SNAPSHOT, 'id': 's1', 'url': 'https://example.com'}, + {'type': TYPE_ARCHIVERESULT, 'id': 'ar1', 'plugin': 'title'}, + ] + + types = {r['type'] for r in pipeline_output} + self.assertIn(TYPE_CRAWL, types) + self.assertIn(TYPE_SNAPSHOT, types) + self.assertIn(TYPE_ARCHIVERESULT, types) + + def test_pipeline_preserves_ids(self): + """Pipeline should preserve record IDs through all stages.""" + records = [ + {'type': 'Crawl', 'id': 'c1', 'urls': 'https://example.com'}, + {'type': 'Snapshot', 'id': 's1', 'url': 'https://example.com'}, + ] + + # All records should have IDs + for record in records: + self.assertIn('id', record) + self.assertTrue(record['id']) + + def test_jq_transform_pattern(self): + """Test pattern for jq transforms in pipeline.""" + # Simulated: archiveresult list --status=failed | jq 'del(.id) | .status = "queued"' + failed_record = { + 'type': 'ArchiveResult', + 'id': 'ar1', + 'status': 'failed', + 'plugin': 'wget', + } + + # Transform: delete id, set status to queued + transformed = { + 'type': failed_record['type'], + 'status': 'queued', + 'plugin': failed_record['plugin'], + } + + self.assertNotIn('id', transformed) + self.assertEqual(transformed['status'], 'queued') + + if __name__ == '__main__': unittest.main() diff --git a/archivebox/core/models.py b/archivebox/core/models.py index d36216d0..11b1ab20 100755 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -1460,7 +1460,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea 'crawl_id': str(self.crawl_id), 'url': self.url, 'title': self.title, - 'tags': self.tags_str(), + 'tags_str': self.tags_str(), 'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None, 'created_at': self.created_at.isoformat() if self.created_at else None, 'timestamp': self.timestamp, @@ -2418,6 +2418,96 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi if process and self.process: yield from self.process.to_jsonl(seen=seen, **kwargs) + @classmethod + def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['ArchiveResult']: + """ + Create/update ArchiveResults from an iterable of JSONL records. + Filters to only records with type='ArchiveResult'. + + Args: + records: Iterable of dicts (JSONL records) + overrides: Dict of field overrides + + Returns: + List of ArchiveResult instances (skips None results) + """ + results = [] + for record in records: + record_type = record.get('type', cls.JSONL_TYPE) + if record_type == cls.JSONL_TYPE: + instance = cls.from_json(record, overrides=overrides) + if instance: + results.append(instance) + return results + + @staticmethod + def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'ArchiveResult | None': + """ + Create or update a single ArchiveResult from a JSON record dict. 
+ + Args: + record: Dict with 'snapshot_id' and 'plugin' (required for create), + or 'id' (for update) + overrides: Dict of field overrides (e.g., config overrides) + + Returns: + ArchiveResult instance or None if invalid + """ + from django.utils import timezone + + overrides = overrides or {} + + # If 'id' is provided, lookup and update existing + result_id = record.get('id') + if result_id: + try: + result = ArchiveResult.objects.get(id=result_id) + # Update fields from record + if record.get('status'): + result.status = record['status'] + result.retry_at = timezone.now() + result.save() + return result + except ArchiveResult.DoesNotExist: + pass # Fall through to create + + # Required fields for creation + snapshot_id = record.get('snapshot_id') + plugin = record.get('plugin') + + if not snapshot_id or not plugin: + return None + + try: + snapshot = Snapshot.objects.get(id=snapshot_id) + except Snapshot.DoesNotExist: + return None + + # Check if result already exists for this snapshot+plugin + existing = ArchiveResult.objects.filter( + snapshot=snapshot, + plugin=plugin, + ).first() + + if existing: + # Update existing result if status provided + if record.get('status'): + existing.status = record['status'] + existing.retry_at = timezone.now() + existing.save() + return existing + + # Create new ArchiveResult + result = ArchiveResult( + snapshot=snapshot, + plugin=plugin, + status=record.get('status', ArchiveResult.StatusChoices.QUEUED), + retry_at=timezone.now(), + hook_name=record.get('hook_name', ''), + ) + result.save() + return result + def save(self, *args, **kwargs): is_new = self._state.adding diff --git a/archivebox/tests/conftest.py b/archivebox/tests/conftest.py new file mode 100644 index 00000000..f1c5175f --- /dev/null +++ b/archivebox/tests/conftest.py @@ -0,0 +1,218 @@ +"""archivebox/tests/conftest.py - Pytest fixtures for CLI tests.""" + +import os +import sys +import json +import subprocess +from pathlib import Path +from typing import List, Dict, Any, Optional, Tuple + +import pytest + + +# ============================================================================= +# Fixtures +# ============================================================================= + +@pytest.fixture +def isolated_data_dir(tmp_path, settings): + """ + Create isolated DATA_DIR for each test. + + Uses tmp_path for isolation, configures Django settings. + """ + data_dir = tmp_path / 'archivebox_data' + data_dir.mkdir() + + # Set environment for subprocess calls + os.environ['DATA_DIR'] = str(data_dir) + + # Update Django settings + settings.DATA_DIR = data_dir + + yield data_dir + + # Cleanup handled by tmp_path fixture + + +@pytest.fixture +def initialized_archive(isolated_data_dir): + """ + Initialize ArchiveBox archive in isolated directory. + + Runs `archivebox init` to set up database and directories. + """ + from archivebox.cli.archivebox_init import init + init(setup=True, quick=True) + return isolated_data_dir + + +@pytest.fixture +def cli_env(initialized_archive): + """ + Environment dict for CLI subprocess calls. + + Includes DATA_DIR and disables slow extractors. 
+ """ + return { + **os.environ, + 'DATA_DIR': str(initialized_archive), + 'USE_COLOR': 'False', + 'SHOW_PROGRESS': 'False', + 'SAVE_TITLE': 'True', + 'SAVE_FAVICON': 'False', + 'SAVE_WGET': 'False', + 'SAVE_WARC': 'False', + 'SAVE_PDF': 'False', + 'SAVE_SCREENSHOT': 'False', + 'SAVE_DOM': 'False', + 'SAVE_SINGLEFILE': 'False', + 'SAVE_READABILITY': 'False', + 'SAVE_MERCURY': 'False', + 'SAVE_GIT': 'False', + 'SAVE_YTDLP': 'False', + 'SAVE_HEADERS': 'False', + } + + +# ============================================================================= +# CLI Helpers +# ============================================================================= + +def run_archivebox_cmd( + args: List[str], + stdin: Optional[str] = None, + cwd: Optional[Path] = None, + env: Optional[Dict[str, str]] = None, + timeout: int = 60, +) -> Tuple[str, str, int]: + """ + Run archivebox command, return (stdout, stderr, returncode). + + Args: + args: Command arguments (e.g., ['crawl', 'create', 'https://example.com']) + stdin: Optional string to pipe to stdin + cwd: Working directory (defaults to DATA_DIR from env) + env: Environment variables (defaults to os.environ with DATA_DIR) + timeout: Command timeout in seconds + + Returns: + Tuple of (stdout, stderr, returncode) + """ + cmd = [sys.executable, '-m', 'archivebox'] + args + + env = env or {**os.environ} + cwd = cwd or Path(env.get('DATA_DIR', '.')) + + result = subprocess.run( + cmd, + input=stdin, + capture_output=True, + text=True, + cwd=cwd, + env=env, + timeout=timeout, + ) + + return result.stdout, result.stderr, result.returncode + + +# ============================================================================= +# Output Assertions +# ============================================================================= + +def parse_jsonl_output(stdout: str) -> List[Dict[str, Any]]: + """Parse JSONL output into list of dicts.""" + records = [] + for line in stdout.strip().split('\n'): + line = line.strip() + if line and line.startswith('{'): + try: + records.append(json.loads(line)) + except json.JSONDecodeError: + pass + return records + + +def assert_jsonl_contains_type(stdout: str, record_type: str, min_count: int = 1): + """Assert output contains at least min_count records of type.""" + records = parse_jsonl_output(stdout) + matching = [r for r in records if r.get('type') == record_type] + assert len(matching) >= min_count, \ + f"Expected >= {min_count} {record_type}, got {len(matching)}" + return matching + + +def assert_jsonl_pass_through(stdout: str, input_records: List[Dict[str, Any]]): + """Assert that input records appear in output (pass-through behavior).""" + output_records = parse_jsonl_output(stdout) + output_ids = {r.get('id') for r in output_records if r.get('id')} + + for input_rec in input_records: + input_id = input_rec.get('id') + if input_id: + assert input_id in output_ids, \ + f"Input record {input_id} not found in output (pass-through failed)" + + +def assert_record_has_fields(record: Dict[str, Any], required_fields: List[str]): + """Assert record has all required fields with non-None values.""" + for field in required_fields: + assert field in record, f"Record missing field: {field}" + assert record[field] is not None, f"Record field is None: {field}" + + +# ============================================================================= +# Database Assertions +# ============================================================================= + +def assert_db_count(model_class, filters: Dict[str, Any], expected: int): + """Assert database count 
matches expected.""" + actual = model_class.objects.filter(**filters).count() + assert actual == expected, \ + f"Expected {expected} {model_class.__name__}, got {actual}" + + +def assert_db_exists(model_class, **filters): + """Assert at least one record exists matching filters.""" + assert model_class.objects.filter(**filters).exists(), \ + f"No {model_class.__name__} found matching {filters}" + + +# ============================================================================= +# Test Data Factories +# ============================================================================= + +def create_test_url(domain: str = 'example.com', path: str = None) -> str: + """Generate unique test URL.""" + import uuid + path = path or uuid.uuid4().hex[:8] + return f'https://{domain}/{path}' + + +def create_test_crawl_json(urls: List[str] = None, **kwargs) -> Dict[str, Any]: + """Create Crawl JSONL record for testing.""" + from archivebox.misc.jsonl import TYPE_CRAWL + + urls = urls or [create_test_url()] + return { + 'type': TYPE_CRAWL, + 'urls': '\n'.join(urls), + 'max_depth': kwargs.get('max_depth', 0), + 'tags_str': kwargs.get('tags_str', ''), + 'status': kwargs.get('status', 'queued'), + **{k: v for k, v in kwargs.items() if k not in ('max_depth', 'tags_str', 'status')}, + } + + +def create_test_snapshot_json(url: str = None, **kwargs) -> Dict[str, Any]: + """Create Snapshot JSONL record for testing.""" + from archivebox.misc.jsonl import TYPE_SNAPSHOT + + return { + 'type': TYPE_SNAPSHOT, + 'url': url or create_test_url(), + 'tags_str': kwargs.get('tags_str', ''), + 'status': kwargs.get('status', 'queued'), + **{k: v for k, v in kwargs.items() if k not in ('tags_str', 'status')}, + } diff --git a/archivebox/tests/test_cli_archiveresult.py b/archivebox/tests/test_cli_archiveresult.py new file mode 100644 index 00000000..9fc8ca16 --- /dev/null +++ b/archivebox/tests/test_cli_archiveresult.py @@ -0,0 +1,264 @@ +""" +Tests for archivebox archiveresult CLI command. 
+ +Tests cover: +- archiveresult create (from Snapshot JSONL, with --plugin, pass-through) +- archiveresult list (with filters) +- archiveresult update +- archiveresult delete +""" + +import json +import pytest + +from archivebox.tests.conftest import ( + run_archivebox_cmd, + parse_jsonl_output, + create_test_url, +) + + +class TestArchiveResultCreate: + """Tests for `archivebox archiveresult create`.""" + + def test_create_from_snapshot_jsonl(self, cli_env, initialized_archive): + """Create archive results from Snapshot JSONL input.""" + url = create_test_url() + + # Create a snapshot first + stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env) + snapshot = parse_jsonl_output(stdout1)[0] + + # Pipe snapshot to archiveresult create + stdout2, stderr, code = run_archivebox_cmd( + ['archiveresult', 'create', '--plugin=title'], + stdin=json.dumps(snapshot), + env=cli_env, + ) + + assert code == 0, f"Command failed: {stderr}" + + records = parse_jsonl_output(stdout2) + # Should have the Snapshot passed through and ArchiveResult created + types = [r.get('type') for r in records] + assert 'Snapshot' in types + assert 'ArchiveResult' in types + + ar = next(r for r in records if r['type'] == 'ArchiveResult') + assert ar['plugin'] == 'title' + + def test_create_with_specific_plugin(self, cli_env, initialized_archive): + """Create archive result for specific plugin.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env) + snapshot = parse_jsonl_output(stdout1)[0] + + stdout2, stderr, code = run_archivebox_cmd( + ['archiveresult', 'create', '--plugin=screenshot'], + stdin=json.dumps(snapshot), + env=cli_env, + ) + + assert code == 0 + records = parse_jsonl_output(stdout2) + ar_records = [r for r in records if r.get('type') == 'ArchiveResult'] + assert len(ar_records) >= 1 + assert ar_records[0]['plugin'] == 'screenshot' + + def test_create_pass_through_crawl(self, cli_env, initialized_archive): + """Pass-through Crawl records unchanged.""" + url = create_test_url() + + # Create crawl and snapshot + stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env) + crawl = parse_jsonl_output(stdout1)[0] + + stdout2, _, _ = run_archivebox_cmd( + ['snapshot', 'create'], + stdin=json.dumps(crawl), + env=cli_env, + ) + + # Now pipe all to archiveresult create + stdout3, stderr, code = run_archivebox_cmd( + ['archiveresult', 'create', '--plugin=title'], + stdin=stdout2, + env=cli_env, + ) + + assert code == 0 + records = parse_jsonl_output(stdout3) + + types = [r.get('type') for r in records] + assert 'Crawl' in types + assert 'Snapshot' in types + assert 'ArchiveResult' in types + + def test_create_pass_through_only_when_no_snapshots(self, cli_env, initialized_archive): + """Only pass-through records but no new snapshots returns success.""" + crawl_record = {'type': 'Crawl', 'id': 'fake-id', 'urls': 'https://example.com'} + + stdout, stderr, code = run_archivebox_cmd( + ['archiveresult', 'create'], + stdin=json.dumps(crawl_record), + env=cli_env, + ) + + assert code == 0 + assert 'Passed through' in stderr + + +class TestArchiveResultList: + """Tests for `archivebox archiveresult list`.""" + + def test_list_empty(self, cli_env, initialized_archive): + """List with no archive results returns empty.""" + stdout, stderr, code = run_archivebox_cmd( + ['archiveresult', 'list'], + env=cli_env, + ) + + assert code == 0 + assert 'Listed 0 archive results' in stderr + + def test_list_filter_by_status(self, cli_env, 
initialized_archive): + """Filter archive results by status.""" + # Create snapshot and archive result + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env) + snapshot = parse_jsonl_output(stdout1)[0] + run_archivebox_cmd( + ['archiveresult', 'create', '--plugin=title'], + stdin=json.dumps(snapshot), + env=cli_env, + ) + + stdout, stderr, code = run_archivebox_cmd( + ['archiveresult', 'list', '--status=queued'], + env=cli_env, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + for r in records: + assert r['status'] == 'queued' + + def test_list_filter_by_plugin(self, cli_env, initialized_archive): + """Filter archive results by plugin.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env) + snapshot = parse_jsonl_output(stdout1)[0] + run_archivebox_cmd( + ['archiveresult', 'create', '--plugin=title'], + stdin=json.dumps(snapshot), + env=cli_env, + ) + + stdout, stderr, code = run_archivebox_cmd( + ['archiveresult', 'list', '--plugin=title'], + env=cli_env, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + for r in records: + assert r['plugin'] == 'title' + + def test_list_with_limit(self, cli_env, initialized_archive): + """Limit number of results.""" + # Create multiple archive results + for _ in range(3): + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env) + snapshot = parse_jsonl_output(stdout1)[0] + run_archivebox_cmd( + ['archiveresult', 'create', '--plugin=title'], + stdin=json.dumps(snapshot), + env=cli_env, + ) + + stdout, stderr, code = run_archivebox_cmd( + ['archiveresult', 'list', '--limit=2'], + env=cli_env, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) == 2 + + +class TestArchiveResultUpdate: + """Tests for `archivebox archiveresult update`.""" + + def test_update_status(self, cli_env, initialized_archive): + """Update archive result status.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env) + snapshot = parse_jsonl_output(stdout1)[0] + + stdout2, _, _ = run_archivebox_cmd( + ['archiveresult', 'create', '--plugin=title'], + stdin=json.dumps(snapshot), + env=cli_env, + ) + ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult') + + stdout3, stderr, code = run_archivebox_cmd( + ['archiveresult', 'update', '--status=failed'], + stdin=json.dumps(ar), + env=cli_env, + ) + + assert code == 0 + assert 'Updated 1 archive results' in stderr + + records = parse_jsonl_output(stdout3) + assert records[0]['status'] == 'failed' + + +class TestArchiveResultDelete: + """Tests for `archivebox archiveresult delete`.""" + + def test_delete_requires_yes(self, cli_env, initialized_archive): + """Delete requires --yes flag.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env) + snapshot = parse_jsonl_output(stdout1)[0] + + stdout2, _, _ = run_archivebox_cmd( + ['archiveresult', 'create', '--plugin=title'], + stdin=json.dumps(snapshot), + env=cli_env, + ) + ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult') + + stdout, stderr, code = run_archivebox_cmd( + ['archiveresult', 'delete'], + stdin=json.dumps(ar), + env=cli_env, + ) + + assert code == 1 + assert '--yes' in stderr + + def test_delete_with_yes(self, cli_env, initialized_archive): + """Delete with --yes flag works.""" + url = create_test_url() 
+ stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env) + snapshot = parse_jsonl_output(stdout1)[0] + + stdout2, _, _ = run_archivebox_cmd( + ['archiveresult', 'create', '--plugin=title'], + stdin=json.dumps(snapshot), + env=cli_env, + ) + ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult') + + stdout, stderr, code = run_archivebox_cmd( + ['archiveresult', 'delete', '--yes'], + stdin=json.dumps(ar), + env=cli_env, + ) + + assert code == 0 + assert 'Deleted 1 archive results' in stderr diff --git a/archivebox/tests/test_cli_crawl.py b/archivebox/tests/test_cli_crawl.py new file mode 100644 index 00000000..49bd0d50 --- /dev/null +++ b/archivebox/tests/test_cli_crawl.py @@ -0,0 +1,261 @@ +""" +Tests for archivebox crawl CLI command. + +Tests cover: +- crawl create (with URLs, from stdin, pass-through) +- crawl list (with filters) +- crawl update +- crawl delete +""" + +import json +import pytest + +from archivebox.tests.conftest import ( + run_archivebox_cmd, + parse_jsonl_output, + assert_jsonl_contains_type, + create_test_url, + create_test_crawl_json, +) + + +class TestCrawlCreate: + """Tests for `archivebox crawl create`.""" + + def test_create_from_url_args(self, cli_env, initialized_archive): + """Create crawl from URL arguments.""" + url = create_test_url() + + stdout, stderr, code = run_archivebox_cmd( + ['crawl', 'create', url], + env=cli_env, + ) + + assert code == 0, f"Command failed: {stderr}" + assert 'Created crawl' in stderr + + # Check JSONL output + records = parse_jsonl_output(stdout) + assert len(records) == 1 + assert records[0]['type'] == 'Crawl' + assert url in records[0]['urls'] + + def test_create_from_stdin_urls(self, cli_env, initialized_archive): + """Create crawl from stdin URLs (one per line).""" + urls = [create_test_url() for _ in range(3)] + stdin = '\n'.join(urls) + + stdout, stderr, code = run_archivebox_cmd( + ['crawl', 'create'], + stdin=stdin, + env=cli_env, + ) + + assert code == 0, f"Command failed: {stderr}" + + records = parse_jsonl_output(stdout) + assert len(records) == 1 + crawl = records[0] + assert crawl['type'] == 'Crawl' + # All URLs should be in the crawl + for url in urls: + assert url in crawl['urls'] + + def test_create_with_depth(self, cli_env, initialized_archive): + """Create crawl with --depth flag.""" + url = create_test_url() + + stdout, stderr, code = run_archivebox_cmd( + ['crawl', 'create', '--depth=2', url], + env=cli_env, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert records[0]['max_depth'] == 2 + + def test_create_with_tag(self, cli_env, initialized_archive): + """Create crawl with --tag flag.""" + url = create_test_url() + + stdout, stderr, code = run_archivebox_cmd( + ['crawl', 'create', '--tag=test-tag', url], + env=cli_env, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert 'test-tag' in records[0].get('tags_str', '') + + def test_create_pass_through_other_types(self, cli_env, initialized_archive): + """Pass-through records of other types unchanged.""" + tag_record = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'} + url = create_test_url() + stdin = json.dumps(tag_record) + '\n' + json.dumps({'url': url}) + + stdout, stderr, code = run_archivebox_cmd( + ['crawl', 'create'], + stdin=stdin, + env=cli_env, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + + # Should have both the passed-through Tag and the new Crawl + types = [r.get('type') for r in records] + assert 'Tag' in types + assert 'Crawl' in 
types + + def test_create_pass_through_existing_crawl(self, cli_env, initialized_archive): + """Existing Crawl records (with id) are passed through.""" + # First create a crawl + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env) + crawl = parse_jsonl_output(stdout1)[0] + + # Now pipe it back - should pass through + stdout2, stderr, code = run_archivebox_cmd( + ['crawl', 'create'], + stdin=json.dumps(crawl), + env=cli_env, + ) + + assert code == 0 + records = parse_jsonl_output(stdout2) + assert len(records) == 1 + assert records[0]['id'] == crawl['id'] + + +class TestCrawlList: + """Tests for `archivebox crawl list`.""" + + def test_list_empty(self, cli_env, initialized_archive): + """List with no crawls returns empty.""" + stdout, stderr, code = run_archivebox_cmd( + ['crawl', 'list'], + env=cli_env, + ) + + assert code == 0 + assert 'Listed 0 crawls' in stderr + + def test_list_returns_created(self, cli_env, initialized_archive): + """List returns previously created crawls.""" + url = create_test_url() + run_archivebox_cmd(['crawl', 'create', url], env=cli_env) + + stdout, stderr, code = run_archivebox_cmd( + ['crawl', 'list'], + env=cli_env, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) >= 1 + assert any(url in r.get('urls', '') for r in records) + + def test_list_filter_by_status(self, cli_env, initialized_archive): + """Filter crawls by status.""" + url = create_test_url() + run_archivebox_cmd(['crawl', 'create', url], env=cli_env) + + stdout, stderr, code = run_archivebox_cmd( + ['crawl', 'list', '--status=queued'], + env=cli_env, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + for r in records: + assert r['status'] == 'queued' + + def test_list_with_limit(self, cli_env, initialized_archive): + """Limit number of results.""" + # Create multiple crawls + for _ in range(3): + run_archivebox_cmd(['crawl', 'create', create_test_url()], env=cli_env) + + stdout, stderr, code = run_archivebox_cmd( + ['crawl', 'list', '--limit=2'], + env=cli_env, + ) + + assert code == 0 + records = parse_jsonl_output(stdout) + assert len(records) == 2 + + +class TestCrawlUpdate: + """Tests for `archivebox crawl update`.""" + + def test_update_status(self, cli_env, initialized_archive): + """Update crawl status.""" + # Create a crawl + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env) + crawl = parse_jsonl_output(stdout1)[0] + + # Update it + stdout2, stderr, code = run_archivebox_cmd( + ['crawl', 'update', '--status=started'], + stdin=json.dumps(crawl), + env=cli_env, + ) + + assert code == 0 + assert 'Updated 1 crawls' in stderr + + records = parse_jsonl_output(stdout2) + assert records[0]['status'] == 'started' + + +class TestCrawlDelete: + """Tests for `archivebox crawl delete`.""" + + def test_delete_requires_yes(self, cli_env, initialized_archive): + """Delete requires --yes flag.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env) + crawl = parse_jsonl_output(stdout1)[0] + + stdout, stderr, code = run_archivebox_cmd( + ['crawl', 'delete'], + stdin=json.dumps(crawl), + env=cli_env, + ) + + assert code == 1 + assert '--yes' in stderr + + def test_delete_with_yes(self, cli_env, initialized_archive): + """Delete with --yes flag works.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env) + crawl = parse_jsonl_output(stdout1)[0] + + stdout, stderr, 
code = run_archivebox_cmd( + ['crawl', 'delete', '--yes'], + stdin=json.dumps(crawl), + env=cli_env, + ) + + assert code == 0 + assert 'Deleted 1 crawls' in stderr + + def test_delete_dry_run(self, cli_env, initialized_archive): + """Dry run shows what would be deleted.""" + url = create_test_url() + stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env) + crawl = parse_jsonl_output(stdout1)[0] + + stdout, stderr, code = run_archivebox_cmd( + ['crawl', 'delete', '--dry-run'], + stdin=json.dumps(crawl), + env=cli_env, + ) + + assert code == 0 + assert 'Would delete' in stderr + assert 'dry run' in stderr.lower() diff --git a/archivebox/tests/test_cli_run.py b/archivebox/tests/test_cli_run.py new file mode 100644 index 00000000..e3de12ad --- /dev/null +++ b/archivebox/tests/test_cli_run.py @@ -0,0 +1,254 @@ +""" +Tests for archivebox run CLI command. + +Tests cover: +- run with stdin JSONL (Crawl, Snapshot, ArchiveResult) +- create-or-update behavior (records with/without id) +- pass-through output (for chaining) +""" + +import json +import pytest + +from archivebox.tests.conftest import ( + run_archivebox_cmd, + parse_jsonl_output, + create_test_url, + create_test_crawl_json, + create_test_snapshot_json, +) + + +class TestRunWithCrawl: + """Tests for `archivebox run` with Crawl input.""" + + def test_run_with_new_crawl(self, cli_env, initialized_archive): + """Run creates and processes a new Crawl (no id).""" + crawl_record = create_test_crawl_json() + + stdout, stderr, code = run_archivebox_cmd( + ['run'], + stdin=json.dumps(crawl_record), + env=cli_env, + timeout=120, + ) + + assert code == 0, f"Command failed: {stderr}" + + # Should output the created Crawl + records = parse_jsonl_output(stdout) + crawl_records = [r for r in records if r.get('type') == 'Crawl'] + assert len(crawl_records) >= 1 + assert crawl_records[0].get('id') # Should have an id now + + def test_run_with_existing_crawl(self, cli_env, initialized_archive): + """Run re-queues an existing Crawl (with id).""" + url = create_test_url() + + # First create a crawl + stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env) + crawl = parse_jsonl_output(stdout1)[0] + + # Run with the existing crawl + stdout2, stderr, code = run_archivebox_cmd( + ['run'], + stdin=json.dumps(crawl), + env=cli_env, + timeout=120, + ) + + assert code == 0 + records = parse_jsonl_output(stdout2) + assert len(records) >= 1 + + +class TestRunWithSnapshot: + """Tests for `archivebox run` with Snapshot input.""" + + def test_run_with_new_snapshot(self, cli_env, initialized_archive): + """Run creates and processes a new Snapshot (no id, just url).""" + snapshot_record = create_test_snapshot_json() + + stdout, stderr, code = run_archivebox_cmd( + ['run'], + stdin=json.dumps(snapshot_record), + env=cli_env, + timeout=120, + ) + + assert code == 0, f"Command failed: {stderr}" + + records = parse_jsonl_output(stdout) + snapshot_records = [r for r in records if r.get('type') == 'Snapshot'] + assert len(snapshot_records) >= 1 + assert snapshot_records[0].get('id') + + def test_run_with_existing_snapshot(self, cli_env, initialized_archive): + """Run re-queues an existing Snapshot (with id).""" + url = create_test_url() + + # First create a snapshot + stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env) + snapshot = parse_jsonl_output(stdout1)[0] + + # Run with the existing snapshot + stdout2, stderr, code = run_archivebox_cmd( + ['run'], + stdin=json.dumps(snapshot), + env=cli_env, + timeout=120, + ) + 
+        assert code == 0
+        records = parse_jsonl_output(stdout2)
+        assert len(records) >= 1
+
+    def test_run_with_plain_url(self, cli_env, initialized_archive):
+        """Run accepts plain URL records (no type field)."""
+        url = create_test_url()
+        url_record = {'url': url}
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['run'],
+            stdin=json.dumps(url_record),
+            env=cli_env,
+            timeout=120,
+        )
+
+        assert code == 0
+        records = parse_jsonl_output(stdout)
+        assert len(records) >= 1
+
+
+class TestRunWithArchiveResult:
+    """Tests for `archivebox run` with ArchiveResult input."""
+
+    def test_run_requeues_failed_archiveresult(self, cli_env, initialized_archive):
+        """Run re-queues a failed ArchiveResult."""
+        url = create_test_url()
+
+        # Create snapshot and archive result
+        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
+        snapshot = parse_jsonl_output(stdout1)[0]
+
+        stdout2, _, _ = run_archivebox_cmd(
+            ['archiveresult', 'create', '--plugin=title'],
+            stdin=json.dumps(snapshot),
+            env=cli_env,
+        )
+        ar = next(r for r in parse_jsonl_output(stdout2) if r.get('type') == 'ArchiveResult')
+
+        # Update to failed
+        ar['status'] = 'failed'
+        run_archivebox_cmd(
+            ['archiveresult', 'update', '--status=failed'],
+            stdin=json.dumps(ar),
+            env=cli_env,
+        )
+
+        # Now run should re-queue it
+        stdout3, stderr, code = run_archivebox_cmd(
+            ['run'],
+            stdin=json.dumps(ar),
+            env=cli_env,
+            timeout=120,
+        )
+
+        assert code == 0
+        records = parse_jsonl_output(stdout3)
+        ar_records = [r for r in records if r.get('type') == 'ArchiveResult']
+        assert len(ar_records) >= 1
+
+
+class TestRunPassThrough:
+    """Tests for pass-through behavior in `archivebox run`."""
+
+    def test_run_passes_through_unknown_types(self, cli_env, initialized_archive):
+        """Run passes through records with unknown types."""
+        unknown_record = {'type': 'Unknown', 'id': 'fake-id', 'data': 'test'}
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['run'],
+            stdin=json.dumps(unknown_record),
+            env=cli_env,
+        )
+
+        assert code == 0
+        records = parse_jsonl_output(stdout)
+        unknown_records = [r for r in records if r.get('type') == 'Unknown']
+        assert len(unknown_records) == 1
+        assert unknown_records[0]['data'] == 'test'
+
+    def test_run_outputs_all_processed_records(self, cli_env, initialized_archive):
+        """Run outputs all processed records for chaining."""
+        url = create_test_url()
+        crawl_record = create_test_crawl_json(urls=[url])
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['run'],
+            stdin=json.dumps(crawl_record),
+            env=cli_env,
+            timeout=120,
+        )
+
+        assert code == 0
+        records = parse_jsonl_output(stdout)
+        # Should have at least the Crawl in output
+        assert len(records) >= 1
+
+
+class TestRunMixedInput:
+    """Tests for `archivebox run` with mixed record types."""
+
+    def test_run_handles_mixed_types(self, cli_env, initialized_archive):
+        """Run handles mixed Crawl/Snapshot/ArchiveResult input."""
+        crawl = create_test_crawl_json()
+        snapshot = create_test_snapshot_json()
+        unknown = {'type': 'Tag', 'id': 'fake', 'name': 'test'}
+
+        stdin = '\n'.join([
+            json.dumps(crawl),
+            json.dumps(snapshot),
+            json.dumps(unknown),
+        ])
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['run'],
+            stdin=stdin,
+            env=cli_env,
+            timeout=120,
+        )
+
+        assert code == 0
+        records = parse_jsonl_output(stdout)
+
+        types = set(r.get('type') for r in records)
+        # Should have processed Crawl and Snapshot, passed through Tag
+        assert 'Crawl' in types or 'Snapshot' in types or 'Tag' in types
+
+
+class TestRunEmpty:
+    """Tests for `archivebox run` edge cases."""
+
+    def test_run_empty_stdin(self, cli_env, initialized_archive):
+        """Run with empty stdin returns success."""
+        stdout, stderr, code = run_archivebox_cmd(
+            ['run'],
+            stdin='',
+            env=cli_env,
+        )
+
+        assert code == 0
+
+    def test_run_no_records_to_process(self, cli_env, initialized_archive):
+        """Run with only pass-through records shows message."""
+        unknown = {'type': 'Unknown', 'id': 'fake'}
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['run'],
+            stdin=json.dumps(unknown),
+            env=cli_env,
+        )
+
+        assert code == 0
+        assert 'No records to process' in stderr
diff --git a/archivebox/tests/test_cli_snapshot.py b/archivebox/tests/test_cli_snapshot.py
new file mode 100644
index 00000000..3bfd7268
--- /dev/null
+++ b/archivebox/tests/test_cli_snapshot.py
@@ -0,0 +1,274 @@
+"""
+Tests for archivebox snapshot CLI command.
+
+Tests cover:
+- snapshot create (from URLs, from Crawl JSONL, pass-through)
+- snapshot list (with filters)
+- snapshot update
+- snapshot delete
+"""
+
+import json
+import pytest
+
+from archivebox.tests.conftest import (
+    run_archivebox_cmd,
+    parse_jsonl_output,
+    assert_jsonl_contains_type,
+    create_test_url,
+)
+
+
+class TestSnapshotCreate:
+    """Tests for `archivebox snapshot create`."""
+
+    def test_create_from_url_args(self, cli_env, initialized_archive):
+        """Create snapshot from URL arguments."""
+        url = create_test_url()
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'create', url],
+            env=cli_env,
+        )
+
+        assert code == 0, f"Command failed: {stderr}"
+        assert 'Created' in stderr
+
+        records = parse_jsonl_output(stdout)
+        assert len(records) == 1
+        assert records[0]['type'] == 'Snapshot'
+        assert records[0]['url'] == url
+
+    def test_create_from_crawl_jsonl(self, cli_env, initialized_archive):
+        """Create snapshots from Crawl JSONL input."""
+        url = create_test_url()
+
+        # First create a crawl
+        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
+        crawl = parse_jsonl_output(stdout1)[0]
+
+        # Pipe crawl to snapshot create
+        stdout2, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'create'],
+            stdin=json.dumps(crawl),
+            env=cli_env,
+        )
+
+        assert code == 0, f"Command failed: {stderr}"
+
+        records = parse_jsonl_output(stdout2)
+        # Should have the Crawl passed through and the Snapshot created
+        types = [r.get('type') for r in records]
+        assert 'Crawl' in types
+        assert 'Snapshot' in types
+
+        snapshot = next(r for r in records if r['type'] == 'Snapshot')
+        assert snapshot['url'] == url
+
+    def test_create_with_tag(self, cli_env, initialized_archive):
+        """Create snapshot with --tag flag."""
+        url = create_test_url()
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'create', '--tag=test-tag', url],
+            env=cli_env,
+        )
+
+        assert code == 0
+        records = parse_jsonl_output(stdout)
+        assert 'test-tag' in records[0].get('tags_str', '')
+
+    def test_create_pass_through_other_types(self, cli_env, initialized_archive):
+        """Pass-through records of other types unchanged."""
+        tag_record = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'}
+        url = create_test_url()
+        stdin = json.dumps(tag_record) + '\n' + json.dumps({'url': url})
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'create'],
+            stdin=stdin,
+            env=cli_env,
+        )
+
+        assert code == 0
+        records = parse_jsonl_output(stdout)
+
+        types = [r.get('type') for r in records]
+        assert 'Tag' in types
+        assert 'Snapshot' in types
+
+    def test_create_multiple_urls(self, cli_env, initialized_archive):
+        """Create snapshots from multiple URLs."""
+        urls = [create_test_url() for _ in range(3)]
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'create'] + urls,
+            env=cli_env,
+        )
+
+        assert code == 0
+        records = parse_jsonl_output(stdout)
+        assert len(records) == 3
+
+        created_urls = {r['url'] for r in records}
+        for url in urls:
+            assert url in created_urls
+
+
+class TestSnapshotList:
+    """Tests for `archivebox snapshot list`."""
+
+    def test_list_empty(self, cli_env, initialized_archive):
+        """List with no snapshots returns empty."""
+        stdout, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'list'],
+            env=cli_env,
+        )
+
+        assert code == 0
+        assert 'Listed 0 snapshots' in stderr
+
+    def test_list_returns_created(self, cli_env, initialized_archive):
+        """List returns previously created snapshots."""
+        url = create_test_url()
+        run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'list'],
+            env=cli_env,
+        )
+
+        assert code == 0
+        records = parse_jsonl_output(stdout)
+        assert len(records) >= 1
+        assert any(r.get('url') == url for r in records)
+
+    def test_list_filter_by_status(self, cli_env, initialized_archive):
+        """Filter snapshots by status."""
+        url = create_test_url()
+        run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'list', '--status=queued'],
+            env=cli_env,
+        )
+
+        assert code == 0
+        records = parse_jsonl_output(stdout)
+        for r in records:
+            assert r['status'] == 'queued'
+
+    def test_list_filter_by_url_contains(self, cli_env, initialized_archive):
+        """Filter snapshots by URL contains."""
+        url = create_test_url(domain='unique-domain-12345.com')
+        run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'list', '--url__icontains=unique-domain-12345'],
+            env=cli_env,
+        )
+
+        assert code == 0
+        records = parse_jsonl_output(stdout)
+        assert len(records) == 1
+        assert 'unique-domain-12345' in records[0]['url']
+
+    def test_list_with_limit(self, cli_env, initialized_archive):
+        """Limit number of results."""
+        for _ in range(3):
+            run_archivebox_cmd(['snapshot', 'create', create_test_url()], env=cli_env)
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'list', '--limit=2'],
+            env=cli_env,
+        )
+
+        assert code == 0
+        records = parse_jsonl_output(stdout)
+        assert len(records) == 2
+
+
+class TestSnapshotUpdate:
+    """Tests for `archivebox snapshot update`."""
+
+    def test_update_status(self, cli_env, initialized_archive):
+        """Update snapshot status."""
+        url = create_test_url()
+        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
+        snapshot = parse_jsonl_output(stdout1)[0]
+
+        stdout2, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'update', '--status=started'],
+            stdin=json.dumps(snapshot),
+            env=cli_env,
+        )
+
+        assert code == 0
+        assert 'Updated 1 snapshots' in stderr
+
+        records = parse_jsonl_output(stdout2)
+        assert records[0]['status'] == 'started'
+
+    def test_update_add_tag(self, cli_env, initialized_archive):
+        """Update snapshot by adding tag."""
+        url = create_test_url()
+        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
+        snapshot = parse_jsonl_output(stdout1)[0]
+
+        stdout2, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'update', '--tag=new-tag'],
+            stdin=json.dumps(snapshot),
+            env=cli_env,
+        )
+
+        assert code == 0
+        assert 'Updated 1 snapshots' in stderr
+
+
+class TestSnapshotDelete:
+    """Tests for `archivebox snapshot delete`."""
+
+    def test_delete_requires_yes(self, cli_env, initialized_archive):
+        """Delete requires --yes flag."""
+        url = create_test_url()
+        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
+        snapshot = parse_jsonl_output(stdout1)[0]
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'delete'],
+            stdin=json.dumps(snapshot),
+            env=cli_env,
+        )
+
+        assert code == 1
+        assert '--yes' in stderr
+
+    def test_delete_with_yes(self, cli_env, initialized_archive):
+        """Delete with --yes flag works."""
+        url = create_test_url()
+        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
+        snapshot = parse_jsonl_output(stdout1)[0]
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'delete', '--yes'],
+            stdin=json.dumps(snapshot),
+            env=cli_env,
+        )
+
+        assert code == 0
+        assert 'Deleted 1 snapshots' in stderr
+
+    def test_delete_dry_run(self, cli_env, initialized_archive):
+        """Dry run shows what would be deleted."""
+        url = create_test_url()
+        stdout1, _, _ = run_archivebox_cmd(['snapshot', 'create', url], env=cli_env)
+        snapshot = parse_jsonl_output(stdout1)[0]
+
+        stdout, stderr, code = run_archivebox_cmd(
+            ['snapshot', 'delete', '--dry-run'],
+            stdin=json.dumps(snapshot),
+            env=cli_env,
+        )
+
+        assert code == 0
+        assert 'Would delete' in stderr
diff --git a/archivebox/workers/supervisord_util.py b/archivebox/workers/supervisord_util.py
index 8ec749ee..14af0afd 100644
--- a/archivebox/workers/supervisord_util.py
+++ b/archivebox/workers/supervisord_util.py
@@ -32,7 +32,7 @@ _supervisord_proc = None
 ORCHESTRATOR_WORKER = {
     "name": "worker_orchestrator",
-    "command": "archivebox manage orchestrator",  # runs forever by default
+    "command": "archivebox run",  # runs forever by default
     "autostart": "true",
     "autorestart": "true",
     "stdout_logfile": "logs/worker_orchestrator.log",