# mirror of https://github.com/ArchiveBox/ArchiveBox.git
# synced 2026-01-03 01:15:57 +10:00
#
# - Fix conftest.py: use subprocess for init, remove unused cli_env fixture
# - Update all test files to use data_dir parameter instead of env
# - Remove mock-based TestJSONLOutput class from tests_piping.py
# - Remove unused imports (MagicMock, patch)
# - Fix file permissions for cli_utils.py
# All tests now use real subprocess calls per CLAUDE.md guidelines:
# - NO MOCKS - tests exercise real code paths
# - NO SKIPS - every test runs
"""
|
|
Tests for archivebox run CLI command.
|
|
|
|
Tests cover:
|
|
- run with stdin JSONL (Crawl, Snapshot, ArchiveResult)
|
|
- create-or-update behavior (records with/without id)
|
|
- pass-through output (for chaining)
|
|
"""
|
|
|
|
import json

import pytest

from archivebox.tests.conftest import (
    run_archivebox_cmd,
    parse_jsonl_output,
    create_test_url,
    create_test_crawl_json,
    create_test_snapshot_json,
)


class TestRunWithCrawl:
    """Tests for `archivebox run` with Crawl input."""

    def test_run_with_new_crawl(self, initialized_archive):
        """Run creates and processes a new Crawl (no id)."""
        record = create_test_crawl_json()

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(record),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0, f"Command failed: {stderr}"

        # The created Crawl should be echoed back on stdout with an id assigned.
        crawls = [
            rec for rec in parse_jsonl_output(stdout)
            if rec.get('type') == 'Crawl'
        ]
        assert len(crawls) >= 1
        assert crawls[0].get('id')  # Should have an id now

    def test_run_with_existing_crawl(self, initialized_archive):
        """Run re-queues an existing Crawl (with id)."""
        target_url = create_test_url()

        # Seed the archive with a crawl so `run` receives a record that has an id.
        out1, _, _ = run_archivebox_cmd(['crawl', 'create', target_url], data_dir=initialized_archive)
        existing = parse_jsonl_output(out1)[0]

        # Feed the existing crawl back through `run`.
        out2, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(existing),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0
        assert len(parse_jsonl_output(out2)) >= 1


class TestRunWithSnapshot:
    """Tests for `archivebox run` with Snapshot input."""

    def test_run_with_new_snapshot(self, initialized_archive):
        """Run creates and processes a new Snapshot (no id, just url)."""
        record = create_test_snapshot_json()

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(record),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0, f"Command failed: {stderr}"

        # The new Snapshot should come back out with an id assigned.
        snapshots = [
            rec for rec in parse_jsonl_output(stdout)
            if rec.get('type') == 'Snapshot'
        ]
        assert len(snapshots) >= 1
        assert snapshots[0].get('id')

    def test_run_with_existing_snapshot(self, initialized_archive):
        """Run re-queues an existing Snapshot (with id)."""
        target_url = create_test_url()

        # Seed an existing snapshot so `run` gets a record that has an id.
        out1, _, _ = run_archivebox_cmd(['snapshot', 'create', target_url], data_dir=initialized_archive)
        existing = parse_jsonl_output(out1)[0]

        # Feed the existing snapshot back through `run`.
        out2, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(existing),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0
        assert len(parse_jsonl_output(out2)) >= 1

    def test_run_with_plain_url(self, initialized_archive):
        """Run accepts plain URL records (no type field)."""
        bare_record = {'url': create_test_url()}

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(bare_record),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0
        assert len(parse_jsonl_output(stdout)) >= 1


class TestRunWithArchiveResult:
    """Tests for `archivebox run` with ArchiveResult input."""

    def test_run_requeues_failed_archiveresult(self, initialized_archive):
        """Run re-queues a failed ArchiveResult."""
        target_url = create_test_url()

        # Build a snapshot, then attach a title ArchiveResult to it.
        out1, _, _ = run_archivebox_cmd(['snapshot', 'create', target_url], data_dir=initialized_archive)
        snapshot = parse_jsonl_output(out1)[0]

        out2, _, _ = run_archivebox_cmd(
            ['archiveresult', 'create', '--plugin=title'],
            stdin=json.dumps(snapshot),
            data_dir=initialized_archive,
        )
        result = next(
            rec for rec in parse_jsonl_output(out2)
            if rec.get('type') == 'ArchiveResult'
        )

        # Mark the result as failed so `run` has something to retry.
        result['status'] = 'failed'
        run_archivebox_cmd(
            ['archiveresult', 'update', '--status=failed'],
            stdin=json.dumps(result),
            data_dir=initialized_archive,
        )

        # Feeding the failed result back into `run` should re-queue it.
        out3, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(result),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0
        retried = [
            rec for rec in parse_jsonl_output(out3)
            if rec.get('type') == 'ArchiveResult'
        ]
        assert len(retried) >= 1


class TestRunPassThrough:
    """Tests for pass-through behavior in `archivebox run`."""

    def test_run_passes_through_unknown_types(self, initialized_archive):
        """Run passes through records with unknown types."""
        mystery = {'type': 'Unknown', 'id': 'fake-id', 'data': 'test'}

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(mystery),
            data_dir=initialized_archive,
        )

        assert code == 0

        # The unknown record should be echoed back untouched, exactly once.
        echoed = [
            rec for rec in parse_jsonl_output(stdout)
            if rec.get('type') == 'Unknown'
        ]
        assert len(echoed) == 1
        assert echoed[0]['data'] == 'test'

    def test_run_outputs_all_processed_records(self, initialized_archive):
        """Run outputs all processed records for chaining."""
        record = create_test_crawl_json(urls=[create_test_url()])

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(record),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0
        # At minimum the Crawl itself should appear in the output stream.
        assert len(parse_jsonl_output(stdout)) >= 1


class TestRunMixedInput:
    """Tests for `archivebox run` with mixed record types."""

    def test_run_handles_mixed_types(self, initialized_archive):
        """Run handles mixed Crawl/Snapshot/ArchiveResult input."""
        # One processable Crawl, one processable Snapshot, one pass-through Tag.
        lines = [
            json.dumps(create_test_crawl_json()),
            json.dumps(create_test_snapshot_json()),
            json.dumps({'type': 'Tag', 'id': 'fake', 'name': 'test'}),
        ]

        stdout, stderr, code = run_archivebox_cmd(
            ['run'],
            stdin='\n'.join(lines),
            data_dir=initialized_archive,
            timeout=120,
        )

        assert code == 0
        seen_types = {rec.get('type') for rec in parse_jsonl_output(stdout)}
        # Should have processed Crawl and Snapshot, passed through Tag
        assert 'Crawl' in seen_types or 'Snapshot' in seen_types or 'Tag' in seen_types


class TestRunEmpty:
    """Tests for `archivebox run` edge cases."""

    def test_run_empty_stdin(self, initialized_archive):
        """Run with empty stdin returns success."""
        out, err, code = run_archivebox_cmd(
            ['run'],
            stdin='',
            data_dir=initialized_archive,
        )

        assert code == 0

    def test_run_no_records_to_process(self, initialized_archive):
        """Run with only pass-through records shows message."""
        passthrough_only = {'type': 'Unknown', 'id': 'fake'}

        out, err, code = run_archivebox_cmd(
            ['run'],
            stdin=json.dumps(passthrough_only),
            data_dir=initialized_archive,
        )

        assert code == 0
        # The CLI reports on stderr when nothing was actually processable.
        assert 'No records to process' in err