mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
Add unit tests for JSONL CLI pipeline commands (Phase 5 & 6) (#1743)
This commit is contained in:
@@ -687,30 +687,30 @@ def create_test_snapshot_json(url: str = None, **kwargs) -> Dict[str, Any]:
|
|||||||
## Task Checklist
|
## Task Checklist
|
||||||
|
|
||||||
### Phase 1: Model Prerequisites
|
### Phase 1: Model Prerequisites
|
||||||
- [ ] Implement `ArchiveResult.from_json()` in `archivebox/core/models.py`
|
- [x] Implement `ArchiveResult.from_json()` in `archivebox/core/models.py`
|
||||||
- [ ] Implement `ArchiveResult.from_jsonl()` in `archivebox/core/models.py`
|
- [x] Implement `ArchiveResult.from_jsonl()` in `archivebox/core/models.py`
|
||||||
- [ ] Fix `Snapshot.to_json()` to use `tags_str` instead of `tags`
|
- [x] Fix `Snapshot.to_json()` to use `tags_str` instead of `tags`
|
||||||
|
|
||||||
### Phase 2: Shared Utilities
|
### Phase 2: Shared Utilities
|
||||||
- [ ] Create `archivebox/cli/cli_utils.py` with shared `apply_filters()`
|
- [x] Create `archivebox/cli/cli_utils.py` with shared `apply_filters()`
|
||||||
- [ ] Update 7 CLI files to import from `cli_utils.py`
|
- [x] Update 7 CLI files to import from `cli_utils.py`
|
||||||
|
|
||||||
### Phase 3: Pass-Through Behavior
|
### Phase 3: Pass-Through Behavior
|
||||||
- [ ] Add pass-through to `archivebox_crawl.py` create
|
- [x] Add pass-through to `archivebox_crawl.py` create
|
||||||
- [ ] Add pass-through to `archivebox_snapshot.py` create
|
- [x] Add pass-through to `archivebox_snapshot.py` create
|
||||||
- [ ] Add pass-through to `archivebox_archiveresult.py` create
|
- [x] Add pass-through to `archivebox_archiveresult.py` create
|
||||||
- [ ] Add create-or-update to `archivebox_run.py`
|
- [x] Add create-or-update to `archivebox_run.py`
|
||||||
- [ ] Add pass-through output to `archivebox_run.py`
|
- [x] Add pass-through output to `archivebox_run.py`
|
||||||
|
|
||||||
### Phase 4: Test Infrastructure
|
### Phase 4: Test Infrastructure
|
||||||
- [ ] Create `archivebox/tests/conftest.py` with pytest-django fixtures
|
- [x] Create `archivebox/tests/conftest.py` with pytest-django fixtures
|
||||||
|
|
||||||
### Phase 5: Unit Tests
|
### Phase 5: Unit Tests
|
||||||
- [ ] Create `archivebox/tests/test_cli_crawl.py`
|
- [x] Create `archivebox/tests/test_cli_crawl.py`
|
||||||
- [ ] Create `archivebox/tests/test_cli_snapshot.py`
|
- [x] Create `archivebox/tests/test_cli_snapshot.py`
|
||||||
- [ ] Create `archivebox/tests/test_cli_archiveresult.py`
|
- [x] Create `archivebox/tests/test_cli_archiveresult.py`
|
||||||
- [ ] Create `archivebox/tests/test_cli_run.py`
|
- [x] Create `archivebox/tests/test_cli_run.py`
|
||||||
|
|
||||||
### Phase 6: Integration & Config
|
### Phase 6: Integration & Config
|
||||||
- [ ] Extend `archivebox/cli/tests_piping.py` with pass-through tests
|
- [x] Extend `archivebox/cli/tests_piping.py` with pass-through tests
|
||||||
- [ ] Update `archivebox/workers/supervisord_util.py`: orchestrator→run
|
- [x] Update `archivebox/workers/supervisord_util.py`: orchestrator→run
|
||||||
|
|||||||
@@ -39,21 +39,7 @@ from typing import Optional
|
|||||||
import rich_click as click
|
import rich_click as click
|
||||||
from rich import print as rprint
|
from rich import print as rprint
|
||||||
|
|
||||||
|
from archivebox.cli.cli_utils import apply_filters
|
||||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
|
||||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
|
||||||
filters = {}
|
|
||||||
for key, value in filter_kwargs.items():
|
|
||||||
if value is not None and key not in ('limit', 'offset'):
|
|
||||||
filters[key] = value
|
|
||||||
|
|
||||||
if filters:
|
|
||||||
queryset = queryset.filter(**filters)
|
|
||||||
|
|
||||||
if limit:
|
|
||||||
queryset = queryset[:limit]
|
|
||||||
|
|
||||||
return queryset
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -69,6 +55,7 @@ def create_archiveresults(
|
|||||||
Create ArchiveResults for Snapshots.
|
Create ArchiveResults for Snapshots.
|
||||||
|
|
||||||
Reads Snapshot records from stdin and creates ArchiveResult entries.
|
Reads Snapshot records from stdin and creates ArchiveResult entries.
|
||||||
|
Pass-through: Non-Snapshot/ArchiveResult records are output unchanged.
|
||||||
If --plugin is specified, only creates results for that plugin.
|
If --plugin is specified, only creates results for that plugin.
|
||||||
Otherwise, creates results for all pending plugins.
|
Otherwise, creates results for all pending plugins.
|
||||||
|
|
||||||
@@ -78,7 +65,7 @@ def create_archiveresults(
|
|||||||
"""
|
"""
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT
|
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||||
from archivebox.core.models import Snapshot, ArchiveResult
|
from archivebox.core.models import Snapshot, ArchiveResult
|
||||||
|
|
||||||
is_tty = sys.stdout.isatty()
|
is_tty = sys.stdout.isatty()
|
||||||
@@ -87,6 +74,7 @@ def create_archiveresults(
|
|||||||
if snapshot_id:
|
if snapshot_id:
|
||||||
try:
|
try:
|
||||||
snapshots = [Snapshot.objects.get(id=snapshot_id)]
|
snapshots = [Snapshot.objects.get(id=snapshot_id)]
|
||||||
|
pass_through_records = []
|
||||||
except Snapshot.DoesNotExist:
|
except Snapshot.DoesNotExist:
|
||||||
rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
|
rprint(f'[red]Snapshot not found: {snapshot_id}[/red]', file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
@@ -97,17 +85,44 @@ def create_archiveresults(
|
|||||||
rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
|
rprint('[yellow]No Snapshot records provided via stdin[/yellow]', file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
# Filter to only Snapshot records
|
# Separate snapshot records from pass-through records
|
||||||
snapshot_ids = []
|
snapshot_ids = []
|
||||||
|
pass_through_records = []
|
||||||
|
|
||||||
for record in records:
|
for record in records:
|
||||||
if record.get('type') == TYPE_SNAPSHOT:
|
record_type = record.get('type', '')
|
||||||
|
|
||||||
|
if record_type == TYPE_SNAPSHOT:
|
||||||
|
# Pass through the Snapshot record itself
|
||||||
|
pass_through_records.append(record)
|
||||||
if record.get('id'):
|
if record.get('id'):
|
||||||
snapshot_ids.append(record['id'])
|
snapshot_ids.append(record['id'])
|
||||||
|
|
||||||
|
elif record_type == TYPE_ARCHIVERESULT:
|
||||||
|
# ArchiveResult records: pass through if they have an id
|
||||||
|
if record.get('id'):
|
||||||
|
pass_through_records.append(record)
|
||||||
|
# If no id, we could create it, but for now just pass through
|
||||||
|
else:
|
||||||
|
pass_through_records.append(record)
|
||||||
|
|
||||||
|
elif record_type:
|
||||||
|
# Other typed records (Crawl, Tag, etc): pass through
|
||||||
|
pass_through_records.append(record)
|
||||||
|
|
||||||
elif record.get('id'):
|
elif record.get('id'):
|
||||||
# Assume it's a snapshot ID if no type specified
|
# Untyped record with id - assume it's a snapshot ID
|
||||||
snapshot_ids.append(record['id'])
|
snapshot_ids.append(record['id'])
|
||||||
|
|
||||||
|
# Output pass-through records first
|
||||||
|
if not is_tty:
|
||||||
|
for record in pass_through_records:
|
||||||
|
write_record(record)
|
||||||
|
|
||||||
if not snapshot_ids:
|
if not snapshot_ids:
|
||||||
|
if pass_through_records:
|
||||||
|
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new snapshots to process[/dim]', file=sys.stderr)
|
||||||
|
return 0
|
||||||
rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
|
rprint('[yellow]No valid Snapshot IDs in input[/yellow]', file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
@@ -115,7 +130,7 @@ def create_archiveresults(
|
|||||||
|
|
||||||
if not snapshots:
|
if not snapshots:
|
||||||
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
|
rprint('[yellow]No matching snapshots found[/yellow]', file=sys.stderr)
|
||||||
return 1
|
return 0 if pass_through_records else 1
|
||||||
|
|
||||||
created_count = 0
|
created_count = 0
|
||||||
for snapshot in snapshots:
|
for snapshot in snapshots:
|
||||||
|
|||||||
@@ -34,21 +34,7 @@ from typing import Optional
|
|||||||
import rich_click as click
|
import rich_click as click
|
||||||
from rich import print as rprint
|
from rich import print as rprint
|
||||||
|
|
||||||
|
from archivebox.cli.cli_utils import apply_filters
|
||||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
|
||||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
|
||||||
filters = {}
|
|
||||||
for key, value in filter_kwargs.items():
|
|
||||||
if value is not None and key not in ('limit', 'offset'):
|
|
||||||
filters[key] = value
|
|
||||||
|
|
||||||
if filters:
|
|
||||||
queryset = queryset.filter(**filters)
|
|
||||||
|
|
||||||
if limit:
|
|
||||||
queryset = queryset[:limit]
|
|
||||||
|
|
||||||
return queryset
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -39,21 +39,7 @@ from typing import Optional, Iterable
|
|||||||
import rich_click as click
|
import rich_click as click
|
||||||
from rich import print as rprint
|
from rich import print as rprint
|
||||||
|
|
||||||
|
from archivebox.cli.cli_utils import apply_filters
|
||||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
|
||||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
|
||||||
filters = {}
|
|
||||||
for key, value in filter_kwargs.items():
|
|
||||||
if value is not None and key not in ('limit', 'offset'):
|
|
||||||
filters[key] = value
|
|
||||||
|
|
||||||
if filters:
|
|
||||||
queryset = queryset.filter(**filters)
|
|
||||||
|
|
||||||
if limit:
|
|
||||||
queryset = queryset[:limit]
|
|
||||||
|
|
||||||
return queryset
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -71,12 +57,13 @@ def create_crawl(
|
|||||||
Create a Crawl job from URLs.
|
Create a Crawl job from URLs.
|
||||||
|
|
||||||
Takes URLs as args or stdin, creates one Crawl with all URLs, outputs JSONL.
|
Takes URLs as args or stdin, creates one Crawl with all URLs, outputs JSONL.
|
||||||
|
Pass-through: Records that are not URLs are output unchanged (for piping).
|
||||||
|
|
||||||
Exit codes:
|
Exit codes:
|
||||||
0: Success
|
0: Success
|
||||||
1: Failure
|
1: Failure
|
||||||
"""
|
"""
|
||||||
from archivebox.misc.jsonl import read_args_or_stdin, write_record
|
from archivebox.misc.jsonl import read_args_or_stdin, write_record, TYPE_CRAWL
|
||||||
from archivebox.base_models.models import get_or_create_system_user_pk
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
from archivebox.crawls.models import Crawl
|
from archivebox.crawls.models import Crawl
|
||||||
|
|
||||||
@@ -90,14 +77,46 @@ def create_crawl(
|
|||||||
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
rprint('[yellow]No URLs provided. Pass URLs as arguments or via stdin.[/yellow]', file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
# Collect all URLs into a single newline-separated string
|
# Separate pass-through records from URL records
|
||||||
url_list = []
|
url_list = []
|
||||||
|
pass_through_records = []
|
||||||
|
|
||||||
for record in records:
|
for record in records:
|
||||||
|
record_type = record.get('type', '')
|
||||||
|
|
||||||
|
# Pass-through: output records that aren't URL/Crawl types
|
||||||
|
if record_type and record_type != TYPE_CRAWL and not record.get('url') and not record.get('urls'):
|
||||||
|
pass_through_records.append(record)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Handle existing Crawl records (just pass through with id)
|
||||||
|
if record_type == TYPE_CRAWL and record.get('id'):
|
||||||
|
pass_through_records.append(record)
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Collect URLs
|
||||||
url = record.get('url')
|
url = record.get('url')
|
||||||
if url:
|
if url:
|
||||||
url_list.append(url)
|
url_list.append(url)
|
||||||
|
|
||||||
|
# Handle 'urls' field (newline-separated)
|
||||||
|
urls_field = record.get('urls')
|
||||||
|
if urls_field:
|
||||||
|
for line in urls_field.split('\n'):
|
||||||
|
line = line.strip()
|
||||||
|
if line and not line.startswith('#'):
|
||||||
|
url_list.append(line)
|
||||||
|
|
||||||
|
# Output pass-through records first
|
||||||
|
if not is_tty:
|
||||||
|
for record in pass_through_records:
|
||||||
|
write_record(record)
|
||||||
|
|
||||||
if not url_list:
|
if not url_list:
|
||||||
|
if pass_through_records:
|
||||||
|
# If we had pass-through records but no URLs, that's OK
|
||||||
|
rprint(f'[dim]Passed through {len(pass_through_records)} records, no new URLs[/dim]', file=sys.stderr)
|
||||||
|
return 0
|
||||||
rprint('[red]No valid URLs found[/red]', file=sys.stderr)
|
rprint('[red]No valid URLs found[/red]', file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|||||||
@@ -28,21 +28,7 @@ from typing import Optional
|
|||||||
import rich_click as click
|
import rich_click as click
|
||||||
from rich import print as rprint
|
from rich import print as rprint
|
||||||
|
|
||||||
|
from archivebox.cli.cli_utils import apply_filters
|
||||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
|
||||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
|
||||||
filters = {}
|
|
||||||
for key, value in filter_kwargs.items():
|
|
||||||
if value is not None and key not in ('limit', 'offset'):
|
|
||||||
filters[key] = value
|
|
||||||
|
|
||||||
if filters:
|
|
||||||
queryset = queryset.filter(**filters)
|
|
||||||
|
|
||||||
if limit:
|
|
||||||
queryset = queryset[:limit]
|
|
||||||
|
|
||||||
return queryset
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -31,21 +31,7 @@ from typing import Optional
|
|||||||
import rich_click as click
|
import rich_click as click
|
||||||
from rich import print as rprint
|
from rich import print as rprint
|
||||||
|
|
||||||
|
from archivebox.cli.cli_utils import apply_filters
|
||||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
|
||||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
|
||||||
filters = {}
|
|
||||||
for key, value in filter_kwargs.items():
|
|
||||||
if value is not None and key not in ('limit', 'offset'):
|
|
||||||
filters[key] = value
|
|
||||||
|
|
||||||
if filters:
|
|
||||||
queryset = queryset.filter(**filters)
|
|
||||||
|
|
||||||
if limit:
|
|
||||||
queryset = queryset[:limit]
|
|
||||||
|
|
||||||
return queryset
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
@@ -38,58 +38,110 @@ def process_stdin_records() -> int:
|
|||||||
"""
|
"""
|
||||||
Process JSONL records from stdin.
|
Process JSONL records from stdin.
|
||||||
|
|
||||||
Reads records, queues them for processing, then runs orchestrator until complete.
|
Create-or-update behavior:
|
||||||
Handles any record type: Crawl, Snapshot, ArchiveResult, etc.
|
- Records WITHOUT id: Create via Model.from_json(), then queue
|
||||||
|
- Records WITH id: Lookup existing, re-queue for processing
|
||||||
|
|
||||||
|
Outputs JSONL of all processed records (for chaining).
|
||||||
|
|
||||||
|
Handles any record type: Crawl, Snapshot, ArchiveResult.
|
||||||
|
Auto-cascades: Crawl → Snapshots → ArchiveResults.
|
||||||
|
|
||||||
Returns exit code (0 = success, 1 = error).
|
Returns exit code (0 = success, 1 = error).
|
||||||
"""
|
"""
|
||||||
from django.utils import timezone
|
from django.utils import timezone
|
||||||
|
|
||||||
from archivebox.misc.jsonl import read_stdin, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
from archivebox.misc.jsonl import read_stdin, write_record, TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT
|
||||||
|
from archivebox.base_models.models import get_or_create_system_user_pk
|
||||||
from archivebox.core.models import Snapshot, ArchiveResult
|
from archivebox.core.models import Snapshot, ArchiveResult
|
||||||
from archivebox.crawls.models import Crawl
|
from archivebox.crawls.models import Crawl
|
||||||
from archivebox.workers.orchestrator import Orchestrator
|
from archivebox.workers.orchestrator import Orchestrator
|
||||||
|
|
||||||
records = list(read_stdin())
|
records = list(read_stdin())
|
||||||
|
is_tty = sys.stdout.isatty()
|
||||||
|
|
||||||
if not records:
|
if not records:
|
||||||
return 0 # Nothing to process
|
return 0 # Nothing to process
|
||||||
|
|
||||||
|
created_by_id = get_or_create_system_user_pk()
|
||||||
queued_count = 0
|
queued_count = 0
|
||||||
|
output_records = []
|
||||||
|
|
||||||
for record in records:
|
for record in records:
|
||||||
record_type = record.get('type')
|
record_type = record.get('type', '')
|
||||||
record_id = record.get('id')
|
record_id = record.get('id')
|
||||||
|
|
||||||
if not record_id:
|
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if record_type == TYPE_CRAWL:
|
if record_type == TYPE_CRAWL:
|
||||||
crawl = Crawl.objects.get(id=record_id)
|
if record_id:
|
||||||
if crawl.status in [Crawl.StatusChoices.QUEUED, Crawl.StatusChoices.STARTED]:
|
# Existing crawl - re-queue
|
||||||
|
try:
|
||||||
|
crawl = Crawl.objects.get(id=record_id)
|
||||||
|
except Crawl.DoesNotExist:
|
||||||
|
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||||
|
else:
|
||||||
|
# New crawl - create it
|
||||||
|
crawl = Crawl.from_json(record, overrides={'created_by_id': created_by_id})
|
||||||
|
|
||||||
|
if crawl:
|
||||||
crawl.retry_at = timezone.now()
|
crawl.retry_at = timezone.now()
|
||||||
|
if crawl.status not in [Crawl.StatusChoices.SEALED]:
|
||||||
|
crawl.status = Crawl.StatusChoices.QUEUED
|
||||||
crawl.save()
|
crawl.save()
|
||||||
|
output_records.append(crawl.to_json())
|
||||||
queued_count += 1
|
queued_count += 1
|
||||||
|
|
||||||
elif record_type == TYPE_SNAPSHOT:
|
elif record_type == TYPE_SNAPSHOT or (record.get('url') and not record_type):
|
||||||
snapshot = Snapshot.objects.get(id=record_id)
|
if record_id:
|
||||||
if snapshot.status in [Snapshot.StatusChoices.QUEUED, Snapshot.StatusChoices.STARTED]:
|
# Existing snapshot - re-queue
|
||||||
|
try:
|
||||||
|
snapshot = Snapshot.objects.get(id=record_id)
|
||||||
|
except Snapshot.DoesNotExist:
|
||||||
|
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
|
||||||
|
else:
|
||||||
|
# New snapshot - create it
|
||||||
|
snapshot = Snapshot.from_json(record, overrides={'created_by_id': created_by_id})
|
||||||
|
|
||||||
|
if snapshot:
|
||||||
snapshot.retry_at = timezone.now()
|
snapshot.retry_at = timezone.now()
|
||||||
|
if snapshot.status not in [Snapshot.StatusChoices.SEALED]:
|
||||||
|
snapshot.status = Snapshot.StatusChoices.QUEUED
|
||||||
snapshot.save()
|
snapshot.save()
|
||||||
|
output_records.append(snapshot.to_json())
|
||||||
queued_count += 1
|
queued_count += 1
|
||||||
|
|
||||||
elif record_type == TYPE_ARCHIVERESULT:
|
elif record_type == TYPE_ARCHIVERESULT:
|
||||||
archiveresult = ArchiveResult.objects.get(id=record_id)
|
if record_id:
|
||||||
if archiveresult.status in [ArchiveResult.StatusChoices.QUEUED, ArchiveResult.StatusChoices.STARTED, ArchiveResult.StatusChoices.BACKOFF]:
|
# Existing archiveresult - re-queue
|
||||||
|
try:
|
||||||
|
archiveresult = ArchiveResult.objects.get(id=record_id)
|
||||||
|
except ArchiveResult.DoesNotExist:
|
||||||
|
archiveresult = ArchiveResult.from_json(record)
|
||||||
|
else:
|
||||||
|
# New archiveresult - create it
|
||||||
|
archiveresult = ArchiveResult.from_json(record)
|
||||||
|
|
||||||
|
if archiveresult:
|
||||||
archiveresult.retry_at = timezone.now()
|
archiveresult.retry_at = timezone.now()
|
||||||
|
if archiveresult.status in [ArchiveResult.StatusChoices.FAILED, ArchiveResult.StatusChoices.SKIPPED, ArchiveResult.StatusChoices.BACKOFF]:
|
||||||
|
archiveresult.status = ArchiveResult.StatusChoices.QUEUED
|
||||||
archiveresult.save()
|
archiveresult.save()
|
||||||
|
output_records.append(archiveresult.to_json())
|
||||||
queued_count += 1
|
queued_count += 1
|
||||||
|
|
||||||
except (Crawl.DoesNotExist, Snapshot.DoesNotExist, ArchiveResult.DoesNotExist):
|
else:
|
||||||
rprint(f'[yellow]Record not found: {record_type} {record_id}[/yellow]', file=sys.stderr)
|
# Unknown type - pass through
|
||||||
|
output_records.append(record)
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
rprint(f'[yellow]Error processing record: {e}[/yellow]', file=sys.stderr)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Output all processed records (for chaining)
|
||||||
|
if not is_tty:
|
||||||
|
for rec in output_records:
|
||||||
|
write_record(rec)
|
||||||
|
|
||||||
if queued_count == 0:
|
if queued_count == 0:
|
||||||
rprint('[yellow]No records to process[/yellow]', file=sys.stderr)
|
rprint('[yellow]No records to process[/yellow]', file=sys.stderr)
|
||||||
return 0
|
return 0
|
||||||
|
|||||||
@@ -36,21 +36,7 @@ from typing import Optional, Iterable
|
|||||||
import rich_click as click
|
import rich_click as click
|
||||||
from rich import print as rprint
|
from rich import print as rprint
|
||||||
|
|
||||||
|
from archivebox.cli.cli_utils import apply_filters
|
||||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
|
||||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
|
||||||
filters = {}
|
|
||||||
for key, value in filter_kwargs.items():
|
|
||||||
if value is not None and key not in ('limit', 'offset'):
|
|
||||||
filters[key] = value
|
|
||||||
|
|
||||||
if filters:
|
|
||||||
queryset = queryset.filter(**filters)
|
|
||||||
|
|
||||||
if limit:
|
|
||||||
queryset = queryset[:limit]
|
|
||||||
|
|
||||||
return queryset
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
@@ -66,13 +52,12 @@ def create_snapshots(
|
|||||||
) -> int:
|
) -> int:
|
||||||
"""
|
"""
|
||||||
Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).
|
Create Snapshots from URLs or stdin JSONL (Crawl or Snapshot records).
|
||||||
|
Pass-through: Records that are not Crawl/Snapshot/URL are output unchanged.
|
||||||
|
|
||||||
Exit codes:
|
Exit codes:
|
||||||
0: Success
|
0: Success
|
||||||
1: Failure
|
1: Failure
|
||||||
"""
|
"""
|
||||||
from django.utils import timezone
|
|
||||||
|
|
||||||
from archivebox.misc.jsonl import (
|
from archivebox.misc.jsonl import (
|
||||||
read_args_or_stdin, write_record,
|
read_args_or_stdin, write_record,
|
||||||
TYPE_SNAPSHOT, TYPE_CRAWL
|
TYPE_SNAPSHOT, TYPE_CRAWL
|
||||||
@@ -93,11 +78,17 @@ def create_snapshots(
|
|||||||
|
|
||||||
# Process each record - handle Crawls and plain URLs/Snapshots
|
# Process each record - handle Crawls and plain URLs/Snapshots
|
||||||
created_snapshots = []
|
created_snapshots = []
|
||||||
|
pass_through_count = 0
|
||||||
|
|
||||||
for record in records:
|
for record in records:
|
||||||
record_type = record.get('type')
|
record_type = record.get('type', '')
|
||||||
|
|
||||||
try:
|
try:
|
||||||
if record_type == TYPE_CRAWL:
|
if record_type == TYPE_CRAWL:
|
||||||
|
# Pass through the Crawl record itself first
|
||||||
|
if not is_tty:
|
||||||
|
write_record(record)
|
||||||
|
|
||||||
# Input is a Crawl - get or create it, then create Snapshots for its URLs
|
# Input is a Crawl - get or create it, then create Snapshots for its URLs
|
||||||
crawl = None
|
crawl = None
|
||||||
crawl_id = record.get('id')
|
crawl_id = record.get('id')
|
||||||
@@ -144,11 +135,20 @@ def create_snapshots(
|
|||||||
if not is_tty:
|
if not is_tty:
|
||||||
write_record(snapshot.to_json())
|
write_record(snapshot.to_json())
|
||||||
|
|
||||||
|
else:
|
||||||
|
# Pass-through: output records we don't handle
|
||||||
|
if not is_tty:
|
||||||
|
write_record(record)
|
||||||
|
pass_through_count += 1
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
|
rprint(f'[red]Error creating snapshot: {e}[/red]', file=sys.stderr)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if not created_snapshots:
|
if not created_snapshots:
|
||||||
|
if pass_through_count > 0:
|
||||||
|
rprint(f'[dim]Passed through {pass_through_count} records, no new snapshots[/dim]', file=sys.stderr)
|
||||||
|
return 0
|
||||||
rprint('[red]No snapshots created[/red]', file=sys.stderr)
|
rprint('[red]No snapshots created[/red]', file=sys.stderr)
|
||||||
return 1
|
return 1
|
||||||
|
|
||||||
|
|||||||
@@ -36,21 +36,7 @@ from typing import Optional, Iterable
|
|||||||
import rich_click as click
|
import rich_click as click
|
||||||
from rich import print as rprint
|
from rich import print as rprint
|
||||||
|
|
||||||
|
from archivebox.cli.cli_utils import apply_filters
|
||||||
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
|
|
||||||
"""Apply Django-style filters from CLI kwargs to a QuerySet."""
|
|
||||||
filters = {}
|
|
||||||
for key, value in filter_kwargs.items():
|
|
||||||
if value is not None and key not in ('limit', 'offset'):
|
|
||||||
filters[key] = value
|
|
||||||
|
|
||||||
if filters:
|
|
||||||
queryset = queryset.filter(**filters)
|
|
||||||
|
|
||||||
if limit:
|
|
||||||
queryset = queryset[:limit]
|
|
||||||
|
|
||||||
return queryset
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
# =============================================================================
|
||||||
|
|||||||
46
archivebox/cli/cli_utils.py
Normal file
46
archivebox/cli/cli_utils.py
Normal file
@@ -0,0 +1,46 @@
|
|||||||
|
"""
|
||||||
|
Shared CLI utilities for ArchiveBox commands.
|
||||||
|
|
||||||
|
This module contains common utilities used across multiple CLI commands,
|
||||||
|
extracted to avoid code duplication.
|
||||||
|
"""
|
||||||
|
|
||||||
|
__package__ = 'archivebox.cli'
|
||||||
|
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
def apply_filters(queryset, filter_kwargs: dict, limit: Optional[int] = None):
    """
    Apply Django-style filters from CLI kwargs to a QuerySet.

    Supports: --status=queued, --url__icontains=example, --id__in=uuid1,uuid2

    Args:
        queryset: Django QuerySet to filter (anything exposing
            ``.filter(**kwargs)`` and slice indexing works)
        filter_kwargs: Dict of filter key-value pairs from CLI. Entries whose
            value is None and the pagination keys 'limit' / 'offset' are
            skipped and never forwarded to ``.filter()``.
        limit: Optional limit on results; a falsy value (None or 0) leaves
            the result unsliced

    Returns:
        Filtered QuerySet

    Example:
        queryset = Snapshot.objects.all()
        filter_kwargs = {'status': 'queued', 'url__icontains': 'example.com'}
        filtered = apply_filters(queryset, filter_kwargs, limit=10)
    """
    filters = {}
    for key, value in filter_kwargs.items():
        if value is None or key in ('limit', 'offset'):
            continue
        # Handle CSV lists for __in filters
        if key.endswith('__in') and isinstance(value, str):
            # Drop blank tokens so a stray or trailing comma ("a,b," or "a,,b")
            # does not inject an empty string into the __in lookup.
            tokens = (token.strip() for token in value.split(','))
            value = [token for token in tokens if token]
        filters[key] = value

    # Only call .filter() when something survived the pruning above.
    if filters:
        queryset = queryset.filter(**filters)

    if limit:
        queryset = queryset[:limit]

    return queryset
|
||||||
@@ -957,5 +957,129 @@ class TestEdgeCases(unittest.TestCase):
|
|||||||
self.assertEqual(urls[2], 'https://url3.com')
|
self.assertEqual(urls[2], 'https://url3.com')
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Pass-Through Behavior Tests
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
class TestPassThroughBehavior(unittest.TestCase):
    """Test pass-through behavior in CLI commands.

    NOTE(review): apart from the first test, which runs the JSONL reader,
    these tests only assert on the fixture dicts they build themselves; none
    of them invokes a CLI command. They document the expected record shapes
    rather than verify the pass-through code paths.
    """

    def test_crawl_passes_through_other_types(self):
        """crawl create should pass through records with other types."""
        from archivebox.misc.jsonl import read_args_or_stdin

        # Input: a Tag record (not a Crawl or URL)
        tag_record = {'type': 'Tag', 'id': 'test-tag', 'name': 'example'}
        url_record = {'url': 'https://example.com'}

        # Mock stdin with both records
        stdin = StringIO(
            json.dumps(tag_record) + '\n' +
            json.dumps(url_record)
        )
        stdin.isatty = lambda: False

        # The Tag should be passed through, the URL should create a Crawl.
        # Only the reader is exercised here: both records must come back from
        # read_args_or_stdin in input order.
        records = list(read_args_or_stdin((), stream=stdin))

        self.assertEqual(len(records), 2)
        # First record is a Tag (other type)
        self.assertEqual(records[0]['type'], 'Tag')
        # Second record has a URL
        self.assertIn('url', records[1])

    def test_snapshot_passes_through_crawl(self):
        """snapshot create should pass through Crawl records."""
        from archivebox.misc.jsonl import TYPE_CRAWL

        crawl_record = {
            'type': TYPE_CRAWL,
            'id': 'test-crawl',
            'urls': 'https://example.com',
        }

        # Crawl records should be passed through AND create snapshots.
        # Only the fixture's shape is asserted; no command is run.
        self.assertEqual(crawl_record['type'], TYPE_CRAWL)
        self.assertIn('urls', crawl_record)

    def test_archiveresult_passes_through_snapshot(self):
        """archiveresult create should pass through Snapshot records."""
        from archivebox.misc.jsonl import TYPE_SNAPSHOT

        snapshot_record = {
            'type': TYPE_SNAPSHOT,
            'id': 'test-snapshot',
            'url': 'https://example.com',
        }

        # Snapshot records should be passed through
        self.assertEqual(snapshot_record['type'], TYPE_SNAPSHOT)
        self.assertIn('url', snapshot_record)

    def test_run_passes_through_unknown_types(self):
        """run should pass through records with unknown types."""
        unknown_record = {'type': 'Unknown', 'id': 'test', 'data': 'value'}

        # Unknown types should be passed through unchanged
        self.assertEqual(unknown_record['type'], 'Unknown')
        self.assertIn('data', unknown_record)
|
||||||
|
|
||||||
|
|
||||||
|
class TestPipelineAccumulation(unittest.TestCase):
    """Test that pipelines accumulate records correctly.

    These tests assert on hand-built record lists that stand in for pipeline
    output; they do not execute any CLI command.
    """

    def test_full_pipeline_output_types(self):
        """Full pipeline should output all record types."""
        from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT

        # Stand-in for the output of: crawl | snapshot | archiveresult | run
        pipeline_output = [
            dict(type=TYPE_CRAWL, id='c1', urls='https://example.com'),
            dict(type=TYPE_SNAPSHOT, id='s1', url='https://example.com'),
            dict(type=TYPE_ARCHIVERESULT, id='ar1', plugin='title'),
        ]

        emitted_types = set()
        for rec in pipeline_output:
            emitted_types.add(rec['type'])

        # Every stage's record type must still be present at the end.
        for expected in (TYPE_CRAWL, TYPE_SNAPSHOT, TYPE_ARCHIVERESULT):
            self.assertIn(expected, emitted_types)

    def test_pipeline_preserves_ids(self):
        """Pipeline should preserve record IDs through all stages."""
        records = (
            dict(type='Crawl', id='c1', urls='https://example.com'),
            dict(type='Snapshot', id='s1', url='https://example.com'),
        )

        # Each record needs an 'id' key holding a truthy value.
        for rec in records:
            self.assertIn('id', rec)
            self.assertTrue(rec['id'])

    def test_jq_transform_pattern(self):
        """Test pattern for jq transforms in pipeline."""
        # Stand-in for: archiveresult list --status=failed | jq 'del(.id) | .status = "queued"'
        failed_record = dict(
            type='ArchiveResult',
            id='ar1',
            status='failed',
            plugin='wget',
        )

        # del(.id), then .status = "queued"
        transformed = {key: value for key, value in failed_record.items() if key != 'id'}
        transformed['status'] = 'queued'

        self.assertNotIn('id', transformed)
        self.assertEqual(transformed['status'], 'queued')
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
unittest.main()
|
unittest.main()
|
||||||
|
|||||||
@@ -1460,7 +1460,7 @@ class Snapshot(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWithHea
|
|||||||
'crawl_id': str(self.crawl_id),
|
'crawl_id': str(self.crawl_id),
|
||||||
'url': self.url,
|
'url': self.url,
|
||||||
'title': self.title,
|
'title': self.title,
|
||||||
'tags': self.tags_str(),
|
'tags_str': self.tags_str(),
|
||||||
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
|
'bookmarked_at': self.bookmarked_at.isoformat() if self.bookmarked_at else None,
|
||||||
'created_at': self.created_at.isoformat() if self.created_at else None,
|
'created_at': self.created_at.isoformat() if self.created_at else None,
|
||||||
'timestamp': self.timestamp,
|
'timestamp': self.timestamp,
|
||||||
@@ -2418,6 +2418,96 @@ class ArchiveResult(ModelWithOutputDir, ModelWithConfig, ModelWithNotes, ModelWi
|
|||||||
if process and self.process:
|
if process and self.process:
|
||||||
yield from self.process.to_jsonl(seen=seen, **kwargs)
|
yield from self.process.to_jsonl(seen=seen, **kwargs)
|
||||||
|
|
||||||
|
@classmethod
def from_jsonl(cls, records, overrides: Dict[str, Any] = None) -> list['ArchiveResult']:
    """
    Build ArchiveResults from a stream of JSONL record dicts.

    A record is handled when its 'type' equals cls.JSONL_TYPE; a record with
    no 'type' key is treated as that type. Every other record is ignored.

    Args:
        records: Iterable of dicts (JSONL records)
        overrides: Dict of field overrides, forwarded to from_json()

    Returns:
        List of the truthy values returned by from_json(), in input order
    """
    own_type = cls.JSONL_TYPE
    # Lazy generator: from_json() runs once per matching record, in order.
    loaded = (
        cls.from_json(rec, overrides=overrides)
        for rec in records
        if rec.get('type', own_type) == own_type
    )
    return [instance for instance in loaded if instance]
|
||||||
|
|
||||||
|
@staticmethod
def from_json(record: Dict[str, Any], overrides: Dict[str, Any] = None) -> 'ArchiveResult | None':
    """
    Create or update a single ArchiveResult from a JSON record dict.

    Resolution order:
      1. If record['id'] matches an existing row, update and return it.
      2. Otherwise, if a row already exists for (snapshot_id, plugin),
         update and return that row.
      3. Otherwise create a new row for that snapshot and plugin.

    A falsy 'status' (missing, None, or empty) is treated as "not provided"
    on every path: updates leave the stored status untouched and creation
    falls back to QUEUED.

    Args:
        record: Dict with 'snapshot_id' and 'plugin' (required for create),
                or 'id' (for update). Optional keys: 'status', 'hook_name'.
        overrides: Passed through by from_jsonl(); not consulted by this
                   implementation.

    Returns:
        ArchiveResult instance, or None when snapshot_id/plugin is missing
        or the referenced Snapshot does not exist.
    """
    from django.utils import timezone

    new_status = record.get('status')

    # 1. Lookup by primary key.
    result_id = record.get('id')
    if result_id:
        # Keep the try body to the lookup only, so a DoesNotExist raised
        # while saving is not mistaken for "unknown id".
        try:
            result = ArchiveResult.objects.get(id=result_id)
        except ArchiveResult.DoesNotExist:
            pass  # Unknown id: fall through to the snapshot+plugin path
        else:
            if new_status:
                result.status = new_status
                result.retry_at = timezone.now()
            result.save()
            return result

    # Required fields for creation
    snapshot_id = record.get('snapshot_id')
    plugin = record.get('plugin')
    if not snapshot_id or not plugin:
        return None

    try:
        snapshot = Snapshot.objects.get(id=snapshot_id)
    except Snapshot.DoesNotExist:
        return None

    # 2. Reuse the row for this snapshot+plugin when there is one.
    existing = ArchiveResult.objects.filter(snapshot=snapshot, plugin=plugin).first()
    if existing:
        # NOTE(review): saved only when a status was supplied; nesting was
        # reconstructed from the surrounding comment - confirm.
        if new_status:
            existing.status = new_status
            existing.retry_at = timezone.now()
            existing.save()
        return existing

    # 3. Create a new ArchiveResult. `or` (rather than a .get() default)
    # keeps an explicit null/empty status from being stored.
    result = ArchiveResult(
        snapshot=snapshot,
        plugin=plugin,
        status=new_status or ArchiveResult.StatusChoices.QUEUED,
        retry_at=timezone.now(),
        hook_name=record.get('hook_name', ''),
    )
    result.save()
    return result
|
||||||
|
|
||||||
def save(self, *args, **kwargs):
|
def save(self, *args, **kwargs):
|
||||||
is_new = self._state.adding
|
is_new = self._state.adding
|
||||||
|
|
||||||
|
|||||||
218
archivebox/tests/conftest.py
Normal file
218
archivebox/tests/conftest.py
Normal file
@@ -0,0 +1,218 @@
|
|||||||
|
"""archivebox/tests/conftest.py - Pytest fixtures for CLI tests."""
|
||||||
|
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import json
|
||||||
|
import subprocess
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List, Dict, Any, Optional, Tuple
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Fixtures
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
@pytest.fixture
def isolated_data_dir(tmp_path, settings):
    """
    Create isolated DATA_DIR for each test.

    Makes `archivebox_data` under tmp_path, exports it as the DATA_DIR
    environment variable (for subprocess calls) and sets it on the Django
    settings object. On teardown the previous DATA_DIR environment value is
    restored, so the temporary path does not leak into later tests.

    Yields:
        Path of the freshly created data directory.
    """
    data_dir = tmp_path / 'archivebox_data'
    data_dir.mkdir()

    # Remember what was there so teardown can put it back.
    previous_data_dir = os.environ.get('DATA_DIR')

    # Set environment for subprocess calls
    os.environ['DATA_DIR'] = str(data_dir)

    # Update Django settings
    settings.DATA_DIR = data_dir

    try:
        yield data_dir
    finally:
        # Directory cleanup is handled by tmp_path; only the env var is ours.
        if previous_data_dir is None:
            os.environ.pop('DATA_DIR', None)
        else:
            os.environ['DATA_DIR'] = previous_data_dir
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def initialized_archive(isolated_data_dir):
    """
    Provide an isolated data directory with `archivebox init` already run.

    Calls init(setup=True, quick=True) after isolated_data_dir has prepared
    the directory, then hands that same directory back to the test.
    """
    from archivebox.cli.archivebox_init import init

    init_options = {'setup': True, 'quick': True}
    init(**init_options)
    return isolated_data_dir
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def cli_env(initialized_archive):
    """
    Build the environment mapping handed to CLI subprocess calls.

    Starts from a copy of os.environ, points DATA_DIR at the initialized
    archive, turns off colour and progress output, and sets every SAVE_*
    flag listed below to 'False' while SAVE_TITLE is 'True'.
    """
    disabled_flags = (
        'SAVE_FAVICON',
        'SAVE_WGET',
        'SAVE_WARC',
        'SAVE_PDF',
        'SAVE_SCREENSHOT',
        'SAVE_DOM',
        'SAVE_SINGLEFILE',
        'SAVE_READABILITY',
        'SAVE_MERCURY',
        'SAVE_GIT',
        'SAVE_YTDLP',
        'SAVE_HEADERS',
    )

    env = dict(os.environ)
    env['DATA_DIR'] = str(initialized_archive)
    env['USE_COLOR'] = 'False'
    env['SHOW_PROGRESS'] = 'False'
    env['SAVE_TITLE'] = 'True'
    env.update(dict.fromkeys(disabled_flags, 'False'))
    return env
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# CLI Helpers
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def run_archivebox_cmd(
    args: List[str],
    stdin: Optional[str] = None,
    cwd: Optional[Path] = None,
    env: Optional[Dict[str, str]] = None,
    timeout: int = 60,
) -> Tuple[str, str, int]:
    """
    Invoke `python -m archivebox <args>` in a subprocess and capture its output.

    Args:
        args: Command arguments (e.g., ['crawl', 'create', 'https://example.com'])
        stdin: Optional text piped to the process's stdin
        cwd: Working directory; when falsy, env['DATA_DIR'] is used, else '.'
        env: Environment variables; when falsy, a copy of os.environ is used
        timeout: Seconds passed to subprocess.run as its timeout

    Returns:
        Tuple of (stdout, stderr, returncode)
    """
    if not env:
        env = dict(os.environ)
    if not cwd:
        cwd = Path(env.get('DATA_DIR', '.'))

    # Run under the current interpreter so the same installation is exercised.
    command = [sys.executable, '-m', 'archivebox'] + args
    completed = subprocess.run(
        command,
        input=stdin,
        capture_output=True,
        text=True,
        cwd=cwd,
        env=env,
        timeout=timeout,
    )
    return completed.stdout, completed.stderr, completed.returncode
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Output Assertions
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def parse_jsonl_output(stdout: str) -> List[Dict[str, Any]]:
    """Parse JSONL output into a list of decoded records.

    Only lines that, once stripped, begin with '{' are decoded. Blank lines,
    other text, and lines that fail to decode are skipped silently.
    """
    parsed = []
    for raw_line in stdout.strip().split('\n'):
        candidate = raw_line.strip()
        if not candidate.startswith('{'):
            continue
        try:
            decoded = json.loads(candidate)
        except json.JSONDecodeError:
            # Best effort: ignore output that merely looks like JSON.
            continue
        parsed.append(decoded)
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def assert_jsonl_contains_type(stdout: str, record_type: str, min_count: int = 1):
    """Assert the output holds at least min_count records of record_type.

    Returns:
        The list of matching records, in output order.
    """
    matching = []
    for rec in parse_jsonl_output(stdout):
        if rec.get('type') == record_type:
            matching.append(rec)

    found = len(matching)
    assert found >= min_count, \
        f"Expected >= {min_count} {record_type}, got {found}"
    return matching
|
||||||
|
|
||||||
|
|
||||||
|
def assert_jsonl_pass_through(stdout: str, input_records: List[Dict[str, Any]]):
    """Assert that input records appear in output (pass-through behavior).

    Records are matched on their 'id' value only. Input records with a
    missing or falsy id are not checked.
    """
    seen_ids = set()
    for out_rec in parse_jsonl_output(stdout):
        out_id = out_rec.get('id')
        if out_id:
            seen_ids.add(out_id)

    for in_rec in input_records:
        expected_id = in_rec.get('id')
        if not expected_id:
            continue
        assert expected_id in seen_ids, \
            f"Input record {expected_id} not found in output (pass-through failed)"
|
||||||
|
|
||||||
|
|
||||||
|
def assert_record_has_fields(record: Dict[str, Any], required_fields: List[str]):
    """Assert every required field is present in record and is not None.

    Fields are checked in the order given; the first failure raises.
    """
    for name in required_fields:
        is_present = name in record
        assert is_present, f"Record missing field: {name}"
        value = record[name]
        assert value is not None, f"Record field is None: {name}"
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Database Assertions
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def assert_db_count(model_class, filters: Dict[str, Any], expected: int):
    """Assert that the number of rows matching filters equals expected.

    The count comes from model_class.objects.filter(**filters).count().
    """
    queryset = model_class.objects.filter(**filters)
    actual = queryset.count()
    assert actual == expected, \
        f"Expected {expected} {model_class.__name__}, got {actual}"
|
||||||
|
|
||||||
|
|
||||||
|
def assert_db_exists(model_class, **filters):
    """Assert that at least one row of model_class matches the given filters."""
    found = model_class.objects.filter(**filters).exists()
    assert found, \
        f"No {model_class.__name__} found matching {filters}"
|
||||||
|
|
||||||
|
|
||||||
|
# =============================================================================
|
||||||
|
# Test Data Factories
|
||||||
|
# =============================================================================
|
||||||
|
|
||||||
|
def create_test_url(domain: str = 'example.com', path: str = None) -> str:
    """Build an https URL for tests.

    When path is falsy, the first 8 hex characters of a fresh uuid4 are used
    as the path.
    """
    import uuid

    if not path:
        path = uuid.uuid4().hex[:8]
    return f'https://{domain}/{path}'
|
||||||
|
|
||||||
|
|
||||||
|
def create_test_crawl_json(urls: List[str] = None, **kwargs) -> Dict[str, Any]:
    """Create Crawl JSONL record for testing.

    urls are joined with newlines into the 'urls' field; when falsy, a single
    generated test URL is used. max_depth, tags_str and status default to
    0, '' and 'queued'. Any keyword argument overrides or extends the record.
    """
    from archivebox.misc.jsonl import TYPE_CRAWL

    if not urls:
        urls = [create_test_url()]

    record = {
        'type': TYPE_CRAWL,
        'urls': '\n'.join(urls),
        'max_depth': 0,
        'tags_str': '',
        'status': 'queued',
    }
    # Overriding an existing key keeps its position; new keys are appended.
    record.update(kwargs)
    return record
|
||||||
|
|
||||||
|
|
||||||
|
def create_test_snapshot_json(url: str = None, **kwargs) -> Dict[str, Any]:
    """Create Snapshot JSONL record for testing.

    A falsy url is replaced by a generated test URL. tags_str and status
    default to '' and 'queued'. Any keyword argument overrides or extends
    the record.
    """
    from archivebox.misc.jsonl import TYPE_SNAPSHOT

    record = {
        'type': TYPE_SNAPSHOT,
        'url': url or create_test_url(),
        'tags_str': '',
        'status': 'queued',
    }
    # Overriding an existing key keeps its position; new keys are appended.
    record.update(kwargs)
    return record
|
||||||
264
archivebox/tests/test_cli_archiveresult.py
Normal file
264
archivebox/tests/test_cli_archiveresult.py
Normal file
@@ -0,0 +1,264 @@
|
|||||||
|
"""
|
||||||
|
Tests for archivebox archiveresult CLI command.
|
||||||
|
|
||||||
|
Tests cover:
|
||||||
|
- archiveresult create (from Snapshot JSONL, with --plugin, pass-through)
|
||||||
|
- archiveresult list (with filters)
|
||||||
|
- archiveresult update
|
||||||
|
- archiveresult delete
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.tests.conftest import (
|
||||||
|
run_archivebox_cmd,
|
||||||
|
parse_jsonl_output,
|
||||||
|
create_test_url,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestArchiveResultCreate:
    """Tests for `archivebox archiveresult create`."""

    @staticmethod
    def _snapshot_record(env):
        """Run `snapshot create` on a fresh URL and return its first JSONL record."""
        out, _err, _code = run_archivebox_cmd(
            ['snapshot', 'create', create_test_url()],
            env=env,
        )
        return parse_jsonl_output(out)[0]

    def test_create_from_snapshot_jsonl(self, cli_env, initialized_archive):
        """Create archive results from Snapshot JSONL input."""
        snapshot = self._snapshot_record(cli_env)

        # Pipe the snapshot record into archiveresult create
        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        assert code == 0, f"Command failed: {stderr}"

        # Expect the Snapshot passed through plus the new ArchiveResult
        records = parse_jsonl_output(stdout)
        seen_types = [rec.get('type') for rec in records]
        assert 'Snapshot' in seen_types
        assert 'ArchiveResult' in seen_types

        ar = next(rec for rec in records if rec['type'] == 'ArchiveResult')
        assert ar['plugin'] == 'title'

    def test_create_with_specific_plugin(self, cli_env, initialized_archive):
        """Create archive result for specific plugin."""
        snapshot = self._snapshot_record(cli_env)

        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=screenshot'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        assert code == 0

        ar_records = [
            rec for rec in parse_jsonl_output(stdout)
            if rec.get('type') == 'ArchiveResult'
        ]
        assert len(ar_records) >= 1
        assert ar_records[0]['plugin'] == 'screenshot'

    def test_create_pass_through_crawl(self, cli_env, initialized_archive):
        """Pass-through Crawl records unchanged."""
        # Stage 1: crawl create
        crawl_out, _, _ = run_archivebox_cmd(
            ['crawl', 'create', create_test_url()],
            env=cli_env,
        )
        crawl = parse_jsonl_output(crawl_out)[0]

        # Stage 2: snapshot create, fed the crawl record
        snapshot_out, _, _ = run_archivebox_cmd(
            ['snapshot', 'create'],
            stdin=json.dumps(crawl),
            env=cli_env,
        )

        # Stage 3: archiveresult create, fed everything stage 2 printed
        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=snapshot_out,
            env=cli_env,
        )
        assert code == 0

        seen_types = [rec.get('type') for rec in parse_jsonl_output(stdout)]
        assert 'Crawl' in seen_types
        assert 'Snapshot' in seen_types
        assert 'ArchiveResult' in seen_types

    def test_create_pass_through_only_when_no_snapshots(self, cli_env, initialized_archive):
        """Only pass-through records but no new snapshots returns success."""
        crawl_record = dict(type='Crawl', id='fake-id', urls='https://example.com')

        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'create'],
            stdin=json.dumps(crawl_record),
            env=cli_env,
        )

        assert code == 0
        assert 'Passed through' in stderr
|
||||||
|
|
||||||
|
|
||||||
|
class TestArchiveResultList:
    """Tests for `archivebox archiveresult list`."""

    @staticmethod
    def _seed_title_result(env):
        """Create one snapshot for a fresh URL and a 'title' archive result for it."""
        out, _, _ = run_archivebox_cmd(['snapshot', 'create', create_test_url()], env=env)
        snapshot = parse_jsonl_output(out)[0]
        run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            env=env,
        )

    def test_list_empty(self, cli_env, initialized_archive):
        """List with no archive results returns empty."""
        stdout, stderr, code = run_archivebox_cmd(['archiveresult', 'list'], env=cli_env)

        assert code == 0
        assert 'Listed 0 archive results' in stderr

    def test_list_filter_by_status(self, cli_env, initialized_archive):
        """Filter archive results by status."""
        self._seed_title_result(cli_env)

        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'list', '--status=queued'],
            env=cli_env,
        )

        assert code == 0
        # Every listed record must carry the requested status.
        assert all(rec['status'] == 'queued' for rec in parse_jsonl_output(stdout))

    def test_list_filter_by_plugin(self, cli_env, initialized_archive):
        """Filter archive results by plugin."""
        self._seed_title_result(cli_env)

        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'list', '--plugin=title'],
            env=cli_env,
        )

        assert code == 0
        # Every listed record must belong to the requested plugin.
        assert all(rec['plugin'] == 'title' for rec in parse_jsonl_output(stdout))

    def test_list_with_limit(self, cli_env, initialized_archive):
        """Limit number of results."""
        # Seed more results than the limit asks for
        for _ in range(3):
            self._seed_title_result(cli_env)

        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'list', '--limit=2'],
            env=cli_env,
        )

        assert code == 0
        assert len(parse_jsonl_output(stdout)) == 2
|
||||||
|
|
||||||
|
|
||||||
|
class TestArchiveResultUpdate:
    """Tests for `archivebox archiveresult update`."""

    def test_update_status(self, cli_env, initialized_archive):
        """Update archive result status."""
        # Arrange: one snapshot with one 'title' archive result
        snapshot_out, _, _ = run_archivebox_cmd(
            ['snapshot', 'create', create_test_url()],
            env=cli_env,
        )
        snapshot = parse_jsonl_output(snapshot_out)[0]

        create_out, _, _ = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            env=cli_env,
        )
        ar = next(
            rec for rec in parse_jsonl_output(create_out)
            if rec.get('type') == 'ArchiveResult'
        )

        # Act: pipe the record into `update --status=failed`
        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'update', '--status=failed'],
            stdin=json.dumps(ar),
            env=cli_env,
        )

        # Assert: success, one update reported, new status echoed back
        assert code == 0
        assert 'Updated 1 archive results' in stderr

        updated = parse_jsonl_output(stdout)
        assert updated[0]['status'] == 'failed'
|
||||||
|
|
||||||
|
|
||||||
|
class TestArchiveResultDelete:
    """Tests for `archivebox archiveresult delete`."""

    @staticmethod
    def _archiveresult_record(env):
        """Create a snapshot plus a 'title' archive result; return the ArchiveResult record."""
        snapshot_out, _, _ = run_archivebox_cmd(
            ['snapshot', 'create', create_test_url()],
            env=env,
        )
        snapshot = parse_jsonl_output(snapshot_out)[0]

        create_out, _, _ = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            env=env,
        )
        return next(
            rec for rec in parse_jsonl_output(create_out)
            if rec.get('type') == 'ArchiveResult'
        )

    def test_delete_requires_yes(self, cli_env, initialized_archive):
        """Delete requires --yes flag."""
        ar = self._archiveresult_record(cli_env)

        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'delete'],
            stdin=json.dumps(ar),
            env=cli_env,
        )

        # Without confirmation the command exits 1 and mentions the flag.
        assert code == 1
        assert '--yes' in stderr

    def test_delete_with_yes(self, cli_env, initialized_archive):
        """Delete with --yes flag works."""
        ar = self._archiveresult_record(cli_env)

        stdout, stderr, code = run_archivebox_cmd(
            ['archiveresult', 'delete', '--yes'],
            stdin=json.dumps(ar),
            env=cli_env,
        )

        assert code == 0
        assert 'Deleted 1 archive results' in stderr
|
||||||
261
archivebox/tests/test_cli_crawl.py
Normal file
261
archivebox/tests/test_cli_crawl.py
Normal file
@@ -0,0 +1,261 @@
|
|||||||
|
"""
|
||||||
|
Tests for archivebox crawl CLI command.
|
||||||
|
|
||||||
|
Tests cover:
|
||||||
|
- crawl create (with URLs, from stdin, pass-through)
|
||||||
|
- crawl list (with filters)
|
||||||
|
- crawl update
|
||||||
|
- crawl delete
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.tests.conftest import (
|
||||||
|
run_archivebox_cmd,
|
||||||
|
parse_jsonl_output,
|
||||||
|
assert_jsonl_contains_type,
|
||||||
|
create_test_url,
|
||||||
|
create_test_crawl_json,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCrawlCreate:
    """Tests for `archivebox crawl create`."""

    @staticmethod
    def _create(env, *cli_args, stdin=None):
        """Run `crawl create` with extra CLI args; return (stdout, stderr, code)."""
        return run_archivebox_cmd(['crawl', 'create', *cli_args], stdin=stdin, env=env)

    def test_create_from_url_args(self, cli_env, initialized_archive):
        """Create crawl from URL arguments."""
        url = create_test_url()

        stdout, stderr, code = self._create(cli_env, url)

        assert code == 0, f"Command failed: {stderr}"
        assert 'Created crawl' in stderr

        # Exactly one Crawl record is printed and it contains the URL
        records = parse_jsonl_output(stdout)
        assert len(records) == 1
        crawl = records[0]
        assert crawl['type'] == 'Crawl'
        assert url in crawl['urls']

    def test_create_from_stdin_urls(self, cli_env, initialized_archive):
        """Create crawl from stdin URLs (one per line)."""
        urls = [create_test_url() for _ in range(3)]

        stdout, stderr, code = self._create(cli_env, stdin='\n'.join(urls))

        assert code == 0, f"Command failed: {stderr}"

        records = parse_jsonl_output(stdout)
        assert len(records) == 1
        crawl = records[0]
        assert crawl['type'] == 'Crawl'
        # A single crawl must hold every URL that was piped in
        for url in urls:
            assert url in crawl['urls']

    def test_create_with_depth(self, cli_env, initialized_archive):
        """Create crawl with --depth flag."""
        stdout, stderr, code = self._create(cli_env, '--depth=2', create_test_url())

        assert code == 0
        first = parse_jsonl_output(stdout)[0]
        assert first['max_depth'] == 2

    def test_create_with_tag(self, cli_env, initialized_archive):
        """Create crawl with --tag flag."""
        stdout, stderr, code = self._create(cli_env, '--tag=test-tag', create_test_url())

        assert code == 0
        first = parse_jsonl_output(stdout)[0]
        assert 'test-tag' in first.get('tags_str', '')

    def test_create_pass_through_other_types(self, cli_env, initialized_archive):
        """Pass-through records of other types unchanged."""
        tag_record = dict(type='Tag', id='fake-tag-id', name='test')
        url_record = {'url': create_test_url()}
        piped = json.dumps(tag_record) + '\n' + json.dumps(url_record)

        stdout, stderr, code = self._create(cli_env, stdin=piped)

        assert code == 0

        # Both the passed-through Tag and the new Crawl must be printed
        seen_types = [rec.get('type') for rec in parse_jsonl_output(stdout)]
        assert 'Tag' in seen_types
        assert 'Crawl' in seen_types

    def test_create_pass_through_existing_crawl(self, cli_env, initialized_archive):
        """Existing Crawl records (with id) are passed through."""
        # Create a crawl, then feed its own record back into `crawl create`
        first_out, _, _ = self._create(cli_env, create_test_url())
        crawl = parse_jsonl_output(first_out)[0]

        stdout, stderr, code = self._create(cli_env, stdin=json.dumps(crawl))

        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) == 1
        assert records[0]['id'] == crawl['id']
|
||||||
|
|
||||||
|
|
||||||
|
class TestCrawlList:
    """Tests for `archivebox crawl list`."""

    @staticmethod
    def _list(env, *cli_args):
        """Run `crawl list` with extra CLI args; return (stdout, stderr, code)."""
        return run_archivebox_cmd(['crawl', 'list', *cli_args], env=env)

    def test_list_empty(self, cli_env, initialized_archive):
        """List with no crawls returns empty."""
        stdout, stderr, code = self._list(cli_env)

        assert code == 0
        assert 'Listed 0 crawls' in stderr

    def test_list_returns_created(self, cli_env, initialized_archive):
        """List returns previously created crawls."""
        url = create_test_url()
        run_archivebox_cmd(['crawl', 'create', url], env=cli_env)

        stdout, stderr, code = self._list(cli_env)

        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) >= 1
        assert any(url in rec.get('urls', '') for rec in records)

    def test_list_filter_by_status(self, cli_env, initialized_archive):
        """Filter crawls by status."""
        run_archivebox_cmd(['crawl', 'create', create_test_url()], env=cli_env)

        stdout, stderr, code = self._list(cli_env, '--status=queued')

        assert code == 0
        # Every listed record must carry the requested status.
        assert all(rec['status'] == 'queued' for rec in parse_jsonl_output(stdout))

    def test_list_with_limit(self, cli_env, initialized_archive):
        """Limit number of results."""
        # Create more crawls than the limit asks for
        for _ in range(3):
            run_archivebox_cmd(['crawl', 'create', create_test_url()], env=cli_env)

        stdout, stderr, code = self._list(cli_env, '--limit=2')

        assert code == 0
        assert len(parse_jsonl_output(stdout)) == 2
|
||||||
|
|
||||||
|
|
||||||
|
class TestCrawlUpdate:
    """Exercise `archivebox crawl update`."""

    def test_update_status(self, cli_env, initialized_archive):
        """Piping a crawl record into `update --status=started` changes its status."""
        # Seed the archive with one crawl and keep its JSONL record.
        target_url = create_test_url()
        create_out, _, _ = run_archivebox_cmd(['crawl', 'create', target_url], env=cli_env)
        created = parse_jsonl_output(create_out)[0]

        # Feed the record back on stdin together with the new status.
        update_out, update_err, exit_code = run_archivebox_cmd(
            ['crawl', 'update', '--status=started'],
            stdin=json.dumps(created),
            env=cli_env,
        )

        assert exit_code == 0
        assert 'Updated 1 crawls' in update_err

        updated = parse_jsonl_output(update_out)
        assert updated[0]['status'] == 'started'
|
||||||
|
|
||||||
|
|
||||||
|
class TestCrawlDelete:
    """Exercise `archivebox crawl delete`."""

    def test_delete_requires_yes(self, cli_env, initialized_archive):
        """Without --yes the command exits 1 and mentions the flag on stderr."""
        target_url = create_test_url()
        create_out, _, _ = run_archivebox_cmd(['crawl', 'create', target_url], env=cli_env)
        payload = json.dumps(parse_jsonl_output(create_out)[0])

        _, err, exit_code = run_archivebox_cmd(
            ['crawl', 'delete'],
            stdin=payload,
            env=cli_env,
        )

        assert exit_code == 1
        assert '--yes' in err

    def test_delete_with_yes(self, cli_env, initialized_archive):
        """With --yes the command succeeds and reports one deleted crawl."""
        target_url = create_test_url()
        create_out, _, _ = run_archivebox_cmd(['crawl', 'create', target_url], env=cli_env)
        payload = json.dumps(parse_jsonl_output(create_out)[0])

        _, err, exit_code = run_archivebox_cmd(
            ['crawl', 'delete', '--yes'],
            stdin=payload,
            env=cli_env,
        )

        assert exit_code == 0
        assert 'Deleted 1 crawls' in err

    def test_delete_dry_run(self, cli_env, initialized_archive):
        """With --dry-run the command succeeds and announces what it would delete."""
        target_url = create_test_url()
        create_out, _, _ = run_archivebox_cmd(['crawl', 'create', target_url], env=cli_env)
        payload = json.dumps(parse_jsonl_output(create_out)[0])

        _, err, exit_code = run_archivebox_cmd(
            ['crawl', 'delete', '--dry-run'],
            stdin=payload,
            env=cli_env,
        )

        assert exit_code == 0
        assert 'Would delete' in err
        # Case-insensitive: only the wording matters, not its capitalisation.
        assert 'dry run' in err.lower()
|
||||||
254
archivebox/tests/test_cli_run.py
Normal file
254
archivebox/tests/test_cli_run.py
Normal file
@@ -0,0 +1,254 @@
|
|||||||
|
"""
|
||||||
|
Tests for archivebox run CLI command.
|
||||||
|
|
||||||
|
Tests cover:
|
||||||
|
- run with stdin JSONL (Crawl, Snapshot, ArchiveResult)
|
||||||
|
- create-or-update behavior (records with/without id)
|
||||||
|
- pass-through output (for chaining)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.tests.conftest import (
|
||||||
|
run_archivebox_cmd,
|
||||||
|
parse_jsonl_output,
|
||||||
|
create_test_url,
|
||||||
|
create_test_crawl_json,
|
||||||
|
create_test_snapshot_json,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRunWithCrawl:
    """Exercise `archivebox run` when fed Crawl records on stdin."""

    def test_run_with_new_crawl(self, cli_env, initialized_archive):
        """A Crawl record built by the test helper is emitted back with an id."""
        payload = json.dumps(create_test_crawl_json())

        out, err, exit_code = run_archivebox_cmd(
            ['run'],
            stdin=payload,
            env=cli_env,
            timeout=120,
        )

        assert exit_code == 0, f"Command failed: {err}"

        # The output stream must contain at least one Crawl with a truthy id.
        crawls = [rec for rec in parse_jsonl_output(out) if rec.get('type') == 'Crawl']
        assert len(crawls) >= 1
        assert crawls[0].get('id')  # Should have an id now

    def test_run_with_existing_crawl(self, cli_env, initialized_archive):
        """A crawl created beforehand can be piped through `run` successfully."""
        target_url = create_test_url()

        # Create the crawl first so the record sent to `run` already exists.
        create_out, _, _ = run_archivebox_cmd(['crawl', 'create', target_url], env=cli_env)
        existing = parse_jsonl_output(create_out)[0]

        run_out, _, exit_code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(existing),
            env=cli_env,
            timeout=120,
        )

        assert exit_code == 0
        emitted = parse_jsonl_output(run_out)
        assert len(emitted) >= 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestRunWithSnapshot:
    """Exercise `archivebox run` when fed Snapshot or bare-URL records on stdin."""

    def test_run_with_new_snapshot(self, cli_env, initialized_archive):
        """A Snapshot record built by the test helper is emitted back with an id."""
        payload = json.dumps(create_test_snapshot_json())

        out, err, exit_code = run_archivebox_cmd(
            ['run'],
            stdin=payload,
            env=cli_env,
            timeout=120,
        )

        assert exit_code == 0, f"Command failed: {err}"

        snapshots = [rec for rec in parse_jsonl_output(out) if rec.get('type') == 'Snapshot']
        assert len(snapshots) >= 1
        assert snapshots[0].get('id')

    def test_run_with_existing_snapshot(self, cli_env, initialized_archive):
        """A snapshot created beforehand can be piped through `run` successfully."""
        target_url = create_test_url()

        # Create the snapshot first so the record sent to `run` already exists.
        create_out, _, _ = run_archivebox_cmd(['snapshot', 'create', target_url], env=cli_env)
        existing = parse_jsonl_output(create_out)[0]

        run_out, _, exit_code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(existing),
            env=cli_env,
            timeout=120,
        )

        assert exit_code == 0
        emitted = parse_jsonl_output(run_out)
        assert len(emitted) >= 1

    def test_run_with_plain_url(self, cli_env, initialized_archive):
        """A record carrying only a `url` key (no `type`) is accepted."""
        target_url = create_test_url()
        payload = json.dumps({'url': target_url})

        out, _, exit_code = run_archivebox_cmd(
            ['run'],
            stdin=payload,
            env=cli_env,
            timeout=120,
        )

        assert exit_code == 0
        emitted = parse_jsonl_output(out)
        assert len(emitted) >= 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestRunWithArchiveResult:
    """Exercise `archivebox run` when fed ArchiveResult records on stdin."""

    def test_run_requeues_failed_archiveresult(self, cli_env, initialized_archive):
        """An ArchiveResult marked failed can be piped through `run` and is emitted."""
        target_url = create_test_url()

        # Build the chain snapshot -> archive result via the CLI.
        snap_out, _, _ = run_archivebox_cmd(['snapshot', 'create', target_url], env=cli_env)
        snap = parse_jsonl_output(snap_out)[0]

        ar_out, _, _ = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snap),
            env=cli_env,
        )
        result = next(
            rec for rec in parse_jsonl_output(ar_out)
            if rec.get('type') == 'ArchiveResult'
        )

        # Mark it failed both in the local record and through the CLI.
        result['status'] = 'failed'
        failed_payload = json.dumps(result)
        run_archivebox_cmd(
            ['archiveresult', 'update', '--status=failed'],
            stdin=failed_payload,
            env=cli_env,
        )

        # Send the failed record to `run`.
        run_out, _, exit_code = run_archivebox_cmd(
            ['run'],
            stdin=failed_payload,
            env=cli_env,
            timeout=120,
        )

        assert exit_code == 0
        emitted_results = [
            rec for rec in parse_jsonl_output(run_out)
            if rec.get('type') == 'ArchiveResult'
        ]
        assert len(emitted_results) >= 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestRunPassThrough:
    """Exercise the pass-through output of `archivebox run`."""

    def test_run_passes_through_unknown_types(self, cli_env, initialized_archive):
        """A record with an unrecognised `type` is echoed exactly once with its data."""
        payload = json.dumps({'type': 'Unknown', 'id': 'fake-id', 'data': 'test'})

        out, _, exit_code = run_archivebox_cmd(
            ['run'],
            stdin=payload,
            env=cli_env,
        )

        assert exit_code == 0
        echoed = [rec for rec in parse_jsonl_output(out) if rec.get('type') == 'Unknown']
        assert len(echoed) == 1
        assert echoed[0]['data'] == 'test'

    def test_run_outputs_all_processed_records(self, cli_env, initialized_archive):
        """Running a Crawl record yields at least one output record for chaining."""
        target_url = create_test_url()
        payload = json.dumps(create_test_crawl_json(urls=[target_url]))

        out, _, exit_code = run_archivebox_cmd(
            ['run'],
            stdin=payload,
            env=cli_env,
            timeout=120,
        )

        assert exit_code == 0
        # At minimum the Crawl itself is expected in the output.
        emitted = parse_jsonl_output(out)
        assert len(emitted) >= 1
|
||||||
|
|
||||||
|
|
||||||
|
class TestRunMixedInput:
    """Exercise `archivebox run` with several record types in one stream."""

    def test_run_handles_mixed_types(self, cli_env, initialized_archive):
        """A Crawl, a Snapshot and a Tag record sent together are handled without error."""
        crawl_rec = create_test_crawl_json()
        snapshot_rec = create_test_snapshot_json()
        tag_rec = {'type': 'Tag', 'id': 'fake', 'name': 'test'}

        # One JSON document per line, in the order Crawl, Snapshot, Tag.
        jsonl_input = '\n'.join(
            json.dumps(rec) for rec in (crawl_rec, snapshot_rec, tag_rec)
        )

        out, _, exit_code = run_archivebox_cmd(
            ['run'],
            stdin=jsonl_input,
            env=cli_env,
            timeout=120,
        )

        assert exit_code == 0
        seen_types = {rec.get('type') for rec in parse_jsonl_output(out)}

        # Crawl and Snapshot are expected to be processed and Tag passed
        # through; the check only requires that at least one of them appears.
        assert seen_types & {'Crawl', 'Snapshot', 'Tag'}
|
||||||
|
|
||||||
|
|
||||||
|
class TestRunEmpty:
    """Edge cases for `archivebox run`."""

    def test_run_empty_stdin(self, cli_env, initialized_archive):
        """An empty stdin stream still exits with status 0."""
        _, _, exit_code = run_archivebox_cmd(['run'], stdin='', env=cli_env)

        assert exit_code == 0

    def test_run_no_records_to_process(self, cli_env, initialized_archive):
        """Input holding only an unrecognised record reports nothing to process."""
        payload = json.dumps({'type': 'Unknown', 'id': 'fake'})

        _, err, exit_code = run_archivebox_cmd(['run'], stdin=payload, env=cli_env)

        assert exit_code == 0
        assert 'No records to process' in err
|
||||||
274
archivebox/tests/test_cli_snapshot.py
Normal file
274
archivebox/tests/test_cli_snapshot.py
Normal file
@@ -0,0 +1,274 @@
|
|||||||
|
"""
|
||||||
|
Tests for archivebox snapshot CLI command.
|
||||||
|
|
||||||
|
Tests cover:
|
||||||
|
- snapshot create (from URLs, from Crawl JSONL, pass-through)
|
||||||
|
- snapshot list (with filters)
|
||||||
|
- snapshot update
|
||||||
|
- snapshot delete
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from archivebox.tests.conftest import (
|
||||||
|
run_archivebox_cmd,
|
||||||
|
parse_jsonl_output,
|
||||||
|
assert_jsonl_contains_type,
|
||||||
|
create_test_url,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestSnapshotCreate:
    """Exercise `archivebox snapshot create`."""

    def test_create_from_url_args(self, cli_env, initialized_archive):
        """A URL given as an argument yields exactly one Snapshot record for it."""
        target_url = create_test_url()

        out, err, exit_code = run_archivebox_cmd(
            ['snapshot', 'create', target_url],
            env=cli_env,
        )

        assert exit_code == 0, f"Command failed: {err}"
        assert 'Created' in err

        emitted = parse_jsonl_output(out)
        assert len(emitted) == 1
        only = emitted[0]
        assert only['type'] == 'Snapshot'
        assert only['url'] == target_url

    def test_create_from_crawl_jsonl(self, cli_env, initialized_archive):
        """A Crawl record on stdin is echoed and a Snapshot for its URL is added."""
        target_url = create_test_url()

        # Produce the Crawl record that will be piped in.
        crawl_out, _, _ = run_archivebox_cmd(['crawl', 'create', target_url], env=cli_env)
        crawl_rec = parse_jsonl_output(crawl_out)[0]

        out, err, exit_code = run_archivebox_cmd(
            ['snapshot', 'create'],
            stdin=json.dumps(crawl_rec),
            env=cli_env,
        )

        assert exit_code == 0, f"Command failed: {err}"

        emitted = parse_jsonl_output(out)
        # Both the passed-through Crawl and the new Snapshot must be present.
        seen_types = [rec.get('type') for rec in emitted]
        assert 'Crawl' in seen_types
        assert 'Snapshot' in seen_types

        created = next(rec for rec in emitted if rec['type'] == 'Snapshot')
        assert created['url'] == target_url

    def test_create_with_tag(self, cli_env, initialized_archive):
        """The --tag value shows up in the first record's `tags_str`."""
        target_url = create_test_url()

        out, _, exit_code = run_archivebox_cmd(
            ['snapshot', 'create', '--tag=test-tag', target_url],
            env=cli_env,
        )

        assert exit_code == 0
        emitted = parse_jsonl_output(out)
        assert 'test-tag' in emitted[0].get('tags_str', '')

    def test_create_pass_through_other_types(self, cli_env, initialized_archive):
        """A Tag record mixed into the input is still present in the output."""
        tag_rec = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'}
        target_url = create_test_url()
        # Two JSONL lines: the foreign Tag record, then a bare URL record.
        jsonl_input = '\n'.join([json.dumps(tag_rec), json.dumps({'url': target_url})])

        out, _, exit_code = run_archivebox_cmd(
            ['snapshot', 'create'],
            stdin=jsonl_input,
            env=cli_env,
        )

        assert exit_code == 0
        seen_types = [rec.get('type') for rec in parse_jsonl_output(out)]
        assert 'Tag' in seen_types
        assert 'Snapshot' in seen_types

    def test_create_multiple_urls(self, cli_env, initialized_archive):
        """Three URL arguments yield three records covering every URL."""
        target_urls = [create_test_url() for _ in range(3)]

        out, _, exit_code = run_archivebox_cmd(
            ['snapshot', 'create'] + target_urls,
            env=cli_env,
        )

        assert exit_code == 0
        emitted = parse_jsonl_output(out)
        assert len(emitted) == 3

        emitted_urls = {rec['url'] for rec in emitted}
        for expected in target_urls:
            assert expected in emitted_urls
|
||||||
|
|
||||||
|
|
||||||
|
class TestSnapshotList:
    """Tests for `archivebox snapshot list`.

    Each test drives the CLI through ``run_archivebox_cmd`` (which yields a
    ``(stdout, stderr, exit_code)`` triple) using the ``cli_env`` and
    ``initialized_archive`` fixtures supplied by the test suite.
    """

    def test_list_empty(self, cli_env, initialized_archive):
        """List with no snapshots returns empty."""
        _, stderr, code = run_archivebox_cmd(
            ['snapshot', 'list'],
            env=cli_env,
        )

        assert code == 0
        assert 'Listed 0 snapshots' in stderr

    def test_list_returns_created(self, cli_env, initialized_archive):
        """List returns previously created snapshots."""
        url = create_test_url()
        _, setup_err, setup_code = run_archivebox_cmd(
            ['snapshot', 'create', url],
            env=cli_env,
        )
        # Fail loudly if the fixture data could not be created, instead of
        # letting the list assertions below report a misleading failure.
        assert setup_code == 0, f"Setup failed: {setup_err}"

        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'list'],
            env=cli_env,
        )

        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) >= 1
        assert any(r.get('url') == url for r in records)

    def test_list_filter_by_status(self, cli_env, initialized_archive):
        """Filter snapshots by status."""
        url = create_test_url()
        _, setup_err, setup_code = run_archivebox_cmd(
            ['snapshot', 'create', url],
            env=cli_env,
        )
        assert setup_code == 0, f"Setup failed: {setup_err}"

        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'list', '--status=queued'],
            env=cli_env,
        )

        assert code == 0
        records = parse_jsonl_output(stdout)
        # Without this guard the loop below passes vacuously when the filter
        # returns nothing.
        # NOTE(review): assumes a freshly created snapshot starts out as
        # 'queued' -- confirm against the Snapshot model default.
        assert records, "expected at least one queued snapshot to be listed"
        for r in records:
            assert r['status'] == 'queued'

    def test_list_filter_by_url_contains(self, cli_env, initialized_archive):
        """Filter snapshots by URL contains."""
        # A distinctive domain so the substring filter can match only this one.
        url = create_test_url(domain='unique-domain-12345.com')
        _, setup_err, setup_code = run_archivebox_cmd(
            ['snapshot', 'create', url],
            env=cli_env,
        )
        assert setup_code == 0, f"Setup failed: {setup_err}"

        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'list', '--url__icontains=unique-domain-12345'],
            env=cli_env,
        )

        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) == 1
        assert 'unique-domain-12345' in records[0]['url']

    def test_list_with_limit(self, cli_env, initialized_archive):
        """Limit number of results."""
        # Create more snapshots than the limit so truncation is observable.
        for _ in range(3):
            _, setup_err, setup_code = run_archivebox_cmd(
                ['snapshot', 'create', create_test_url()],
                env=cli_env,
            )
            assert setup_code == 0, f"Setup failed: {setup_err}"

        stdout, stderr, code = run_archivebox_cmd(
            ['snapshot', 'list', '--limit=2'],
            env=cli_env,
        )

        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) == 2
|
||||||
|
|
||||||
|
|
||||||
|
class TestSnapshotUpdate:
    """Exercise `archivebox snapshot update`."""

    def test_update_status(self, cli_env, initialized_archive):
        """Piping a snapshot into `update --status=started` changes its status."""
        target_url = create_test_url()
        create_out, _, _ = run_archivebox_cmd(['snapshot', 'create', target_url], env=cli_env)
        created = parse_jsonl_output(create_out)[0]

        update_out, update_err, exit_code = run_archivebox_cmd(
            ['snapshot', 'update', '--status=started'],
            stdin=json.dumps(created),
            env=cli_env,
        )

        assert exit_code == 0
        assert 'Updated 1 snapshots' in update_err

        updated = parse_jsonl_output(update_out)
        assert updated[0]['status'] == 'started'

    def test_update_add_tag(self, cli_env, initialized_archive):
        """Piping a snapshot into `update --tag=new-tag` reports one updated snapshot."""
        target_url = create_test_url()
        create_out, _, _ = run_archivebox_cmd(['snapshot', 'create', target_url], env=cli_env)
        created = parse_jsonl_output(create_out)[0]

        _, update_err, exit_code = run_archivebox_cmd(
            ['snapshot', 'update', '--tag=new-tag'],
            stdin=json.dumps(created),
            env=cli_env,
        )

        # Only the exit status and summary line are checked here; the emitted
        # record's tags are not inspected.
        assert exit_code == 0
        assert 'Updated 1 snapshots' in update_err
|
||||||
|
|
||||||
|
|
||||||
|
class TestSnapshotDelete:
    """Exercise `archivebox snapshot delete`."""

    def test_delete_requires_yes(self, cli_env, initialized_archive):
        """Without --yes the command exits 1 and mentions the flag on stderr."""
        target_url = create_test_url()
        create_out, _, _ = run_archivebox_cmd(['snapshot', 'create', target_url], env=cli_env)
        payload = json.dumps(parse_jsonl_output(create_out)[0])

        _, err, exit_code = run_archivebox_cmd(
            ['snapshot', 'delete'],
            stdin=payload,
            env=cli_env,
        )

        assert exit_code == 1
        assert '--yes' in err

    def test_delete_with_yes(self, cli_env, initialized_archive):
        """With --yes the command succeeds and reports one deleted snapshot."""
        target_url = create_test_url()
        create_out, _, _ = run_archivebox_cmd(['snapshot', 'create', target_url], env=cli_env)
        payload = json.dumps(parse_jsonl_output(create_out)[0])

        _, err, exit_code = run_archivebox_cmd(
            ['snapshot', 'delete', '--yes'],
            stdin=payload,
            env=cli_env,
        )

        assert exit_code == 0
        assert 'Deleted 1 snapshots' in err

    def test_delete_dry_run(self, cli_env, initialized_archive):
        """With --dry-run the command succeeds and announces what it would delete."""
        target_url = create_test_url()
        create_out, _, _ = run_archivebox_cmd(['snapshot', 'create', target_url], env=cli_env)
        payload = json.dumps(parse_jsonl_output(create_out)[0])

        _, err, exit_code = run_archivebox_cmd(
            ['snapshot', 'delete', '--dry-run'],
            stdin=payload,
            env=cli_env,
        )

        assert exit_code == 0
        assert 'Would delete' in err
|
||||||
@@ -32,7 +32,7 @@ _supervisord_proc = None
|
|||||||
|
|
||||||
ORCHESTRATOR_WORKER = {
|
ORCHESTRATOR_WORKER = {
|
||||||
"name": "worker_orchestrator",
|
"name": "worker_orchestrator",
|
||||||
"command": "archivebox manage orchestrator", # runs forever by default
|
"command": "archivebox run", # runs forever by default
|
||||||
"autostart": "true",
|
"autostart": "true",
|
||||||
"autorestart": "true",
|
"autorestart": "true",
|
||||||
"stdout_logfile": "logs/worker_orchestrator.log",
|
"stdout_logfile": "logs/worker_orchestrator.log",
|
||||||
|
|||||||
Reference in New Issue
Block a user