ArchiveBox/archivebox/tests/test_cli_crawl.py
Commit bb52b5902a by Claude: Add unit tests for JSONL CLI pipeline commands (Phase 5 & 6)
Add comprehensive unit tests for the CLI piping architecture:
- test_cli_crawl.py: crawl create/list/update/delete tests
- test_cli_snapshot.py: snapshot create/list/update/delete tests
- test_cli_archiveresult.py: archiveresult create/list/update/delete tests
- test_cli_run.py: run command create-or-update and pass-through tests

Extend tests_piping.py with:
- TestPassThroughBehavior: tests for pass-through behavior in all commands
- TestPipelineAccumulation: tests for accumulating records through pipeline

All tests use pytest fixtures from conftest.py with isolated DATA_DIR.
2025-12-31 10:21:05 +00:00
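
A minimal sketch of the piping pattern these tests exercise (command names
come from the test modules above; the exact flags and record fields are
illustrative, not authoritative):

    import subprocess

    # Stage 1: create a Crawl; the command emits it as one JSONL record on stdout.
    crawl = subprocess.run(
        ['archivebox', 'crawl', 'create', 'https://example.com'],
        capture_output=True, text=True,
    )
    # Stage 2: pipe the Crawl record onward. Records a command does not handle
    # pass through unchanged, so output accumulates down the pipeline.
    snapshot = subprocess.run(
        ['archivebox', 'snapshot', 'create'],
        input=crawl.stdout, capture_output=True, text=True,
    )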


"""
Tests for archivebox crawl CLI command.
Tests cover:
- crawl create (with URLs, from stdin, pass-through)
- crawl list (with filters)
- crawl update
- crawl delete
"""
import json

import pytest

from archivebox.tests.conftest import (
    run_archivebox_cmd,
    parse_jsonl_output,
    assert_jsonl_contains_type,
    create_test_url,
    create_test_crawl_json,
)

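# run_archivebox_cmd returns (stdout, stderr, exit_code); stdin text and an
# env mapping can be passed as keyword arguments (contract inferred from the
# calls below, not from conftest.py itself).
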
class TestCrawlCreate:
    """Tests for `archivebox crawl create`."""

    def test_create_from_url_args(self, cli_env, initialized_archive):
        """Create crawl from URL arguments."""
        url = create_test_url()
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'create', url],
            env=cli_env,
        )
        assert code == 0, f"Command failed: {stderr}"
        assert 'Created crawl' in stderr
        # Check JSONL output
        records = parse_jsonl_output(stdout)
        assert len(records) == 1
        assert records[0]['type'] == 'Crawl'
        assert url in records[0]['urls']

    def test_create_from_stdin_urls(self, cli_env, initialized_archive):
        """Create crawl from stdin URLs (one per line)."""
        urls = [create_test_url() for _ in range(3)]
        stdin = '\n'.join(urls)
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'create'],
            stdin=stdin,
            env=cli_env,
        )
        assert code == 0, f"Command failed: {stderr}"
        records = parse_jsonl_output(stdout)
        assert len(records) == 1
        crawl = records[0]
        assert crawl['type'] == 'Crawl'
        # All URLs should be in the crawl
        for url in urls:
            assert url in crawl['urls']

    def test_create_with_depth(self, cli_env, initialized_archive):
        """Create crawl with --depth flag."""
        url = create_test_url()
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'create', '--depth=2', url],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert records[0]['max_depth'] == 2

    def test_create_with_tag(self, cli_env, initialized_archive):
        """Create crawl with --tag flag."""
        url = create_test_url()
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'create', '--tag=test-tag', url],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert 'test-tag' in records[0].get('tags_str', '')

    def test_create_pass_through_other_types(self, cli_env, initialized_archive):
        """Pass-through records of other types unchanged."""
        tag_record = {'type': 'Tag', 'id': 'fake-tag-id', 'name': 'test'}
        url = create_test_url()
        stdin = json.dumps(tag_record) + '\n' + json.dumps({'url': url})
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'create'],
            stdin=stdin,
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        # Should have both the passed-through Tag and the new Crawl
        types = [r.get('type') for r in records]
        assert 'Tag' in types
        assert 'Crawl' in types

    def test_create_pass_through_existing_crawl(self, cli_env, initialized_archive):
        """Existing Crawl records (with id) are passed through."""
        # First create a crawl
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        crawl = parse_jsonl_output(stdout1)[0]
        # Now pipe it back - should pass through
        stdout2, stderr, code = run_archivebox_cmd(
            ['crawl', 'create'],
            stdin=json.dumps(crawl),
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout2)
        assert len(records) == 1
        assert records[0]['id'] == crawl['id']


class TestCrawlList:
    """Tests for `archivebox crawl list`."""

    def test_list_empty(self, cli_env, initialized_archive):
        """List with no crawls returns empty."""
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'list'],
            env=cli_env,
        )
        assert code == 0
        assert 'Listed 0 crawls' in stderr

    def test_list_returns_created(self, cli_env, initialized_archive):
        """List returns previously created crawls."""
        url = create_test_url()
        run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'list'],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) >= 1
        assert any(url in r.get('urls', '') for r in records)

    def test_list_filter_by_status(self, cli_env, initialized_archive):
        """Filter crawls by status."""
        url = create_test_url()
        run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'list', '--status=queued'],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        for r in records:
            assert r['status'] == 'queued'

    def test_list_with_limit(self, cli_env, initialized_archive):
        """Limit number of results."""
        # Create multiple crawls
        for _ in range(3):
            run_archivebox_cmd(['crawl', 'create', create_test_url()], env=cli_env)
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'list', '--limit=2'],
            env=cli_env,
        )
        assert code == 0
        records = parse_jsonl_output(stdout)
        assert len(records) == 2


class TestCrawlUpdate:
    """Tests for `archivebox crawl update`."""

    def test_update_status(self, cli_env, initialized_archive):
        """Update crawl status."""
        # Create a crawl
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        crawl = parse_jsonl_output(stdout1)[0]
        # Update it
        stdout2, stderr, code = run_archivebox_cmd(
            ['crawl', 'update', '--status=started'],
            stdin=json.dumps(crawl),
            env=cli_env,
        )
        assert code == 0
        assert 'Updated 1 crawls' in stderr
        records = parse_jsonl_output(stdout2)
        assert records[0]['status'] == 'started'


class TestCrawlDelete:
    """Tests for `archivebox crawl delete`."""

    def test_delete_requires_yes(self, cli_env, initialized_archive):
        """Delete requires --yes flag."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        crawl = parse_jsonl_output(stdout1)[0]
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'delete'],
            stdin=json.dumps(crawl),
            env=cli_env,
        )
        assert code == 1
        assert '--yes' in stderr

    def test_delete_with_yes(self, cli_env, initialized_archive):
        """Delete with --yes flag works."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        crawl = parse_jsonl_output(stdout1)[0]
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'delete', '--yes'],
            stdin=json.dumps(crawl),
            env=cli_env,
        )
        assert code == 0
        assert 'Deleted 1 crawls' in stderr

    def test_delete_dry_run(self, cli_env, initialized_archive):
        """Dry run shows what would be deleted."""
        url = create_test_url()
        stdout1, _, _ = run_archivebox_cmd(['crawl', 'create', url], env=cli_env)
        crawl = parse_jsonl_output(stdout1)[0]
        stdout, stderr, code = run_archivebox_cmd(
            ['crawl', 'delete', '--dry-run'],
            stdin=json.dumps(crawl),
            env=cli_env,
        )
        assert code == 0
        assert 'Would delete' in stderr
        assert 'dry run' in stderr.lower()