# Source: ArchiveBox/archivebox/tests/test_cli_piping.py
# (file-listing artifact preserved as a comment: 2026-03-15 22:09:56 -07:00,
#  378 lines, 13 KiB, Python)
"""
Tests for JSONL piping contracts and `archivebox run` / `archivebox orchestrator`.
This file covers both:
- low-level JSONL/stdin parsing behavior that makes CLI piping work
- subprocess integration for the supported records `archivebox run` consumes
"""
import sqlite3
import sys
import uuid
from io import StringIO
from pathlib import Path
from archivebox.tests.conftest import (
create_test_url,
parse_jsonl_output,
run_archivebox_cmd,
)
# Environment overrides shared by the subprocess piping tests below: run only
# the lightweight favicon plugin, and disable color/progress decorations so
# stdout stays machine-readable JSONL.
PIPE_TEST_ENV = {
    "PLUGINS": "favicon",
    "SAVE_FAVICON": "True",
    "USE_COLOR": "False",
    "SHOW_PROGRESS": "False",
}
class MockTTYStringIO(StringIO):
    """In-memory text stream whose ``isatty()`` answer is fixed at construction.

    Lets tests simulate both piped stdin (``is_tty=False``) and interactive
    terminal stdin (``is_tty=True``) without a real TTY.
    """

    def __init__(self, initial_value: str = "", *, is_tty: bool):
        super().__init__(initial_value)
        # Remember the flag so isatty() can report it later.
        self._is_tty = is_tty

    def isatty(self) -> bool:
        """Return the TTY flag supplied at construction time."""
        return self._is_tty
def _stdout_lines(stdout: str) -> list[str]:
return [line for line in stdout.splitlines() if line.strip()]
def _assert_stdout_is_jsonl_only(stdout: str) -> None:
lines = _stdout_lines(stdout)
assert lines, "Expected stdout to contain JSONL records"
assert all(line.lstrip().startswith("{") for line in lines), stdout
def _sqlite_param(value: object) -> object:
if not isinstance(value, str):
return value
try:
return uuid.UUID(value).hex
except ValueError:
return value
def _db_value(data_dir: Path, sql: str, params: tuple[object, ...] = ()) -> object | None:
conn = sqlite3.connect(data_dir / "index.sqlite3")
try:
row = conn.execute(sql, tuple(_sqlite_param(param) for param in params)).fetchone()
finally:
conn.close()
return row[0] if row else None
def test_parse_line_accepts_supported_piping_inputs():
    """The JSONL parser should normalize the input forms CLI pipes accept."""
    from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, parse_line

    # Blank lines, comments, and non-archivable inputs are dropped entirely.
    for ignored in ("", " ", "# comment", "not-a-url", "ftp://example.com"):
        assert parse_line(ignored) is None

    # Bare http(s)/file URLs become Snapshot records.
    assert parse_line("https://example.com") == {"type": TYPE_SNAPSHOT, "url": "https://example.com"}
    assert parse_line("file:///tmp/example.txt") == {"type": TYPE_SNAPSHOT, "url": "file:///tmp/example.txt"}

    # Structured JSONL Snapshot records pass through with extra fields intact.
    snapshot_record = parse_line('{"type":"Snapshot","url":"https://example.com","tags":"tag1,tag2"}')
    assert snapshot_record is not None
    assert snapshot_record["type"] == TYPE_SNAPSHOT
    assert snapshot_record["tags"] == "tag1,tag2"

    # Structured Crawl records keep their id and depth fields.
    crawl_record = parse_line('{"type":"Crawl","id":"abc123","urls":"https://example.com","max_depth":1}')
    assert crawl_record is not None
    assert crawl_record["type"] == TYPE_CRAWL
    assert crawl_record["id"] == "abc123"
    assert crawl_record["max_depth"] == 1

    # Bare snapshot ids (hyphenated or compact UUID hex) become Snapshot refs.
    hyphenated_id = "01234567-89ab-cdef-0123-456789abcdef"
    assert parse_line(hyphenated_id) == {"type": TYPE_SNAPSHOT, "id": hyphenated_id}
    compact_id = "0123456789abcdef0123456789abcdef"
    assert parse_line(compact_id) == {"type": TYPE_SNAPSHOT, "id": compact_id}
def test_read_args_or_stdin_handles_args_stdin_and_mixed_jsonl():
    """Piping helpers should consume args, structured JSONL, and pass-through records."""
    from archivebox.misc.jsonl import TYPE_CRAWL, read_args_or_stdin

    # Positional CLI args are consumed directly as URLs.
    arg_records = list(read_args_or_stdin(("https://example1.com", "https://example2.com")))
    assert [record["url"] for record in arg_records] == ["https://example1.com", "https://example2.com"]

    # Piped stdin mixing plain URLs, JSONL records, unknown record types,
    # bare ids, and junk: the junk line is dropped, everything else kept.
    mixed_stream = MockTTYStringIO(
        'https://plain-url.com\n'
        '{"type":"Snapshot","url":"https://jsonl-url.com","tags":"test"}\n'
        '{"type":"Tag","id":"tag-1","name":"example"}\n'
        '01234567-89ab-cdef-0123-456789abcdef\n'
        'not valid json\n',
        is_tty=False,
    )
    mixed_records = list(read_args_or_stdin((), stream=mixed_stream))
    assert len(mixed_records) == 4
    assert mixed_records[0]["url"] == "https://plain-url.com"
    assert mixed_records[1]["url"] == "https://jsonl-url.com"
    assert mixed_records[1]["tags"] == "test"
    assert mixed_records[2]["type"] == "Tag"
    assert mixed_records[2]["name"] == "example"
    assert mixed_records[3]["id"] == "01234567-89ab-cdef-0123-456789abcdef"

    # Crawl records with embedded newline-separated URL lists survive intact.
    crawl_stream = MockTTYStringIO(
        '{"type":"Crawl","id":"crawl-1","urls":"https://example.com\\nhttps://foo.com"}\n',
        is_tty=False,
    )
    crawl_records = list(read_args_or_stdin((), stream=crawl_stream))
    assert len(crawl_records) == 1
    assert crawl_records[0]["type"] == TYPE_CRAWL
    assert crawl_records[0]["id"] == "crawl-1"

    # With no args and an interactive TTY stream, nothing is read.
    tty_stream = MockTTYStringIO("https://example.com", is_tty=True)
    assert list(read_args_or_stdin((), stream=tty_stream)) == []
def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path):
    """Parser extractor `urls.jsonl` outputs should be discoverable for recursive piping."""
    from archivebox.hooks import collect_urls_from_plugins

    # Two plugin output dirs that wrote a urls.jsonl file...
    wget_dir = tmp_path / "wget"
    wget_dir.mkdir()
    (wget_dir / "urls.jsonl").write_text(
        '{"url":"https://wget-link-1.com"}\n'
        '{"url":"https://wget-link-2.com"}\n',
        encoding="utf-8",
    )
    html_dir = tmp_path / "parse_html_urls"
    html_dir.mkdir()
    (html_dir / "urls.jsonl").write_text(
        '{"url":"https://html-link-1.com"}\n'
        '{"url":"https://html-link-2.com","title":"HTML Link 2"}\n',
        encoding="utf-8",
    )
    # ...and one without, which must contribute nothing.
    (tmp_path / "screenshot").mkdir()

    urls = collect_urls_from_plugins(tmp_path)
    assert len(urls) == 4
    assert {url["plugin"] for url in urls} == {"wget", "parse_html_urls"}

    # Optional per-URL metadata (title) is carried through.
    titled = [url for url in urls if url.get("title") == "HTML Link 2"]
    assert len(titled) == 1
    assert titled[0]["url"] == "https://html-link-2.com"

    # A missing directory yields an empty result rather than an error.
    assert collect_urls_from_plugins(tmp_path / "nonexistent") == []
def test_crawl_create_stdout_pipes_into_run(initialized_archive):
    """`archivebox crawl create | archivebox run` should queue and materialize snapshots."""
    url = create_test_url()

    # Stage 1: create a crawl; its stdout must be clean JSONL.
    crawl_stdout, crawl_stderr, crawl_code = run_archivebox_cmd(
        ["crawl", "create", url],
        data_dir=initialized_archive,
    )
    assert crawl_code == 0, crawl_stderr
    _assert_stdout_is_jsonl_only(crawl_stdout)
    crawl = next(record for record in parse_jsonl_output(crawl_stdout) if record.get("type") == "Crawl")

    # Stage 2: pipe the crawl record into `archivebox run` and expect it echoed back.
    run_stdout, run_stderr, run_code = run_archivebox_cmd(
        ["run"],
        stdin=crawl_stdout,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert run_code == 0, run_stderr
    _assert_stdout_is_jsonl_only(run_stdout)
    run_records = parse_jsonl_output(run_stdout)
    assert any(record.get("type") == "Crawl" and record.get("id") == crawl["id"] for record in run_records)

    # The crawl should have materialized at least one snapshot row in the index.
    snapshot_count = _db_value(
        initialized_archive,
        "SELECT COUNT(*) FROM core_snapshot WHERE crawl_id = ?",
        (crawl["id"],),
    )
    assert isinstance(snapshot_count, int)
    assert snapshot_count >= 1
def test_snapshot_list_stdout_pipes_into_run(initialized_archive):
    """`archivebox snapshot list | archivebox run` should requeue listed snapshots."""
    url = create_test_url()
    # Create a snapshot and capture its JSONL record from stdout.
    create_stdout, create_stderr, create_code = run_archivebox_cmd(
        ["snapshot", "create", url],
        data_dir=initialized_archive,
    )
    assert create_code == 0, create_stderr
    snapshot = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Snapshot")
    # Preferred listing: filter by queued status and the snapshot's id...
    list_stdout, list_stderr, list_code = run_archivebox_cmd(
        ["snapshot", "list", "--status=queued", f"--url__icontains={snapshot['id']}"],
        data_dir=initialized_archive,
    )
    # ...but fall back to a plain URL filter when that yields nothing
    # (presumably the snapshot can leave the queued state quickly — confirm).
    if list_code != 0 or not parse_jsonl_output(list_stdout):
        list_stdout, list_stderr, list_code = run_archivebox_cmd(
            ["snapshot", "list", f"--url__icontains={url}"],
            data_dir=initialized_archive,
        )
    assert list_code == 0, list_stderr
    _assert_stdout_is_jsonl_only(list_stdout)
    # Pipe the listing into `archivebox run`; the snapshot record must be echoed back.
    run_stdout, run_stderr, run_code = run_archivebox_cmd(
        ["run"],
        stdin=list_stdout,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert run_code == 0, run_stderr
    _assert_stdout_is_jsonl_only(run_stdout)
    run_records = parse_jsonl_output(run_stdout)
    assert any(record.get("type") == "Snapshot" and record.get("id") == snapshot["id"] for record in run_records)
    # After the run, the snapshot's DB status must be "sealed".
    snapshot_status = _db_value(
        initialized_archive,
        "SELECT status FROM core_snapshot WHERE id = ?",
        (snapshot["id"],),
    )
    assert snapshot_status == "sealed"
def test_archiveresult_list_stdout_pipes_into_orchestrator_alias(initialized_archive):
    """`archivebox archiveresult list | archivebox orchestrator` should preserve clean JSONL stdout."""
    url = create_test_url()
    # Create a snapshot, then pipe its record into `archiveresult create`
    # to queue a favicon extraction for it.
    snapshot_stdout, snapshot_stderr, snapshot_code = run_archivebox_cmd(
        ["snapshot", "create", url],
        data_dir=initialized_archive,
    )
    assert snapshot_code == 0, snapshot_stderr
    ar_create_stdout, ar_create_stderr, ar_create_code = run_archivebox_cmd(
        ["archiveresult", "create", "--plugin=favicon"],
        stdin=snapshot_stdout,
        data_dir=initialized_archive,
    )
    assert ar_create_code == 0, ar_create_stderr
    created_records = parse_jsonl_output(ar_create_stdout)
    archiveresult = next(record for record in created_records if record.get("type") == "ArchiveResult")
    # List the favicon archiveresults; stdout must stay pure JSONL for piping.
    list_stdout, list_stderr, list_code = run_archivebox_cmd(
        ["archiveresult", "list", "--plugin=favicon"],
        data_dir=initialized_archive,
    )
    assert list_code == 0, list_stderr
    _assert_stdout_is_jsonl_only(list_stdout)
    # Pipe into the deprecated `orchestrator` alias: it should behave like
    # `run` while keeping stdout clean and emitting its rename notice on stderr.
    orchestrator_stdout, orchestrator_stderr, orchestrator_code = run_archivebox_cmd(
        ["orchestrator"],
        stdin=list_stdout,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert orchestrator_code == 0, orchestrator_stderr
    _assert_stdout_is_jsonl_only(orchestrator_stdout)
    assert "renamed to `archivebox run`" in orchestrator_stderr
    run_records = parse_jsonl_output(orchestrator_stdout)
    assert any(
        record.get("type") == "ArchiveResult" and record.get("id") == archiveresult["id"]
        for record in run_records
    )
def test_binary_create_stdout_pipes_into_run(initialized_archive):
    """`archivebox binary create | archivebox run` should queue the binary record for processing."""
    # Register a Binary record pointing at the current interpreter.
    binary_stdout, binary_stderr, binary_code = run_archivebox_cmd(
        ["binary", "create", "--name=python3", f"--abspath={sys.executable}", "--version=test"],
        data_dir=initialized_archive,
    )
    assert binary_code == 0, binary_stderr
    _assert_stdout_is_jsonl_only(binary_stdout)
    binary = next(record for record in parse_jsonl_output(binary_stdout) if record.get("type") == "Binary")

    # Feed the Binary record into `archivebox run` and expect it echoed back.
    run_stdout, run_stderr, run_code = run_archivebox_cmd(
        ["run"],
        stdin=binary_stdout,
        data_dir=initialized_archive,
        timeout=120,
    )
    assert run_code == 0, run_stderr
    _assert_stdout_is_jsonl_only(run_stdout)
    run_records = parse_jsonl_output(run_stdout)
    assert any(record.get("type") == "Binary" and record.get("id") == binary["id"] for record in run_records)

    # The binary row should exist in the DB, either still queued or installed.
    status = _db_value(
        initialized_archive,
        "SELECT status FROM machine_binary WHERE id = ?",
        (binary["id"],),
    )
    assert status in {"queued", "installed"}
def test_multi_stage_pipeline_into_run(initialized_archive):
    """`crawl create | snapshot create | archiveresult create | run` should preserve JSONL and finish work."""
    url = create_test_url()
    # Stage 1: create the crawl.
    crawl_stdout, crawl_stderr, crawl_code = run_archivebox_cmd(
        ["crawl", "create", url],
        data_dir=initialized_archive,
    )
    assert crawl_code == 0, crawl_stderr
    _assert_stdout_is_jsonl_only(crawl_stdout)
    # Stage 2: pipe the crawl into `snapshot create`.
    snapshot_stdout, snapshot_stderr, snapshot_code = run_archivebox_cmd(
        ["snapshot", "create"],
        stdin=crawl_stdout,
        data_dir=initialized_archive,
    )
    assert snapshot_code == 0, snapshot_stderr
    _assert_stdout_is_jsonl_only(snapshot_stdout)
    # Stage 3: pipe the snapshots into `archiveresult create` for favicon.
    archiveresult_stdout, archiveresult_stderr, archiveresult_code = run_archivebox_cmd(
        ["archiveresult", "create", "--plugin=favicon"],
        stdin=snapshot_stdout,
        data_dir=initialized_archive,
    )
    assert archiveresult_code == 0, archiveresult_stderr
    _assert_stdout_is_jsonl_only(archiveresult_stdout)
    # Stage 4: pipe the archiveresults into `archivebox run` to execute them.
    run_stdout, run_stderr, run_code = run_archivebox_cmd(
        ["run"],
        stdin=archiveresult_stdout,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert run_code == 0, run_stderr
    _assert_stdout_is_jsonl_only(run_stdout)
    # The run output must include the Snapshot and at least one ArchiveResult.
    run_records = parse_jsonl_output(run_stdout)
    snapshot = next(record for record in run_records if record.get("type") == "Snapshot")
    assert any(record.get("type") == "ArchiveResult" for record in run_records)
    # And the snapshot must end up fully archived ("sealed") in the index DB.
    snapshot_status = _db_value(
        initialized_archive,
        "SELECT status FROM core_snapshot WHERE id = ?",
        (snapshot["id"],),
    )
    assert snapshot_status == "sealed"