"""
|
|
Tests for JSONL piping contracts and `archivebox run` / `archivebox orchestrator`.
|
|
|
|
This file covers both:
|
|
- low-level JSONL/stdin parsing behavior that makes CLI piping work
|
|
- subprocess integration for the supported records `archivebox run` consumes
|
|
"""
|
|
|
|
import sqlite3
import sys
import uuid
from io import StringIO
from pathlib import Path

from archivebox.tests.conftest import (
    create_test_url,
    parse_jsonl_output,
    run_archivebox_cmd,
)


# Minimal environment for the piping tests: run only the cheap favicon plugin
# and disable interactive output so stdout stays pure JSONL.
PIPE_TEST_ENV = {
    "PLUGINS": "favicon",
    "SAVE_FAVICON": "True",
    "USE_COLOR": "False",
    "SHOW_PROGRESS": "False",
}


class MockTTYStringIO(StringIO):
    """In-memory stream whose isatty() answer is fixed at construction time.

    Lets tests exercise both the interactive-terminal and piped-stdin code
    paths without needing a real TTY.
    """

    def __init__(self, initial_value: str = "", *, is_tty: bool):
        super().__init__(initial_value)
        self._is_tty = is_tty

    def isatty(self) -> bool:
        # Report the configured answer instead of the real stream state.
        return self._is_tty


def _stdout_lines(stdout: str) -> list[str]:
|
|
return [line for line in stdout.splitlines() if line.strip()]
|
|
|
|
|
|
def _assert_stdout_is_jsonl_only(stdout: str) -> None:
|
|
lines = _stdout_lines(stdout)
|
|
assert lines, "Expected stdout to contain JSONL records"
|
|
assert all(line.lstrip().startswith("{") for line in lines), stdout
|
|
|
|
|
|
def _sqlite_param(value: object) -> object:
|
|
if not isinstance(value, str):
|
|
return value
|
|
try:
|
|
return uuid.UUID(value).hex
|
|
except ValueError:
|
|
return value
|
|
|
|
|
|
def _db_value(data_dir: Path, sql: str, params: tuple[object, ...] = ()) -> object | None:
|
|
conn = sqlite3.connect(data_dir / "index.sqlite3")
|
|
try:
|
|
row = conn.execute(sql, tuple(_sqlite_param(param) for param in params)).fetchone()
|
|
finally:
|
|
conn.close()
|
|
return row[0] if row else None
|
|
|
|
|
|
def test_parse_line_accepts_supported_piping_inputs():
    """The JSONL parser should normalize the input forms CLI pipes accept."""
    from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, parse_line

    # Blank lines, comments, non-URLs, and unsupported schemes are skipped.
    for ignored in ("", " ", "# comment", "not-a-url", "ftp://example.com"):
        assert parse_line(ignored) is None

    # Bare URLs (http(s) and file) become Snapshot records.
    assert parse_line("https://example.com") == {"type": TYPE_SNAPSHOT, "url": "https://example.com"}
    assert parse_line("file:///tmp/example.txt") == {"type": TYPE_SNAPSHOT, "url": "file:///tmp/example.txt"}

    # Structured Snapshot JSON keeps its extra fields.
    snapshot_record = parse_line('{"type":"Snapshot","url":"https://example.com","tags":"tag1,tag2"}')
    assert snapshot_record is not None
    assert snapshot_record["type"] == TYPE_SNAPSHOT
    assert snapshot_record["tags"] == "tag1,tag2"

    # Structured Crawl JSON keeps its id and depth.
    crawl_record = parse_line('{"type":"Crawl","id":"abc123","urls":"https://example.com","max_depth":1}')
    assert crawl_record is not None
    assert crawl_record["type"] == TYPE_CRAWL
    assert crawl_record["id"] == "abc123"
    assert crawl_record["max_depth"] == 1

    # Bare snapshot ids — dashed UUID or compact 32-char hex — are accepted too.
    dashed_id = "01234567-89ab-cdef-0123-456789abcdef"
    assert parse_line(dashed_id) == {"type": TYPE_SNAPSHOT, "id": dashed_id}
    compact_id = "0123456789abcdef0123456789abcdef"
    assert parse_line(compact_id) == {"type": TYPE_SNAPSHOT, "id": compact_id}


def test_read_args_or_stdin_handles_args_stdin_and_mixed_jsonl():
    """Piping helpers should consume args, structured JSONL, and pass-through records."""
    from archivebox.misc.jsonl import TYPE_CRAWL, read_args_or_stdin

    # Positional args are yielded as URL records in order.
    arg_records = list(read_args_or_stdin(("https://example1.com", "https://example2.com")))
    assert [rec["url"] for rec in arg_records] == ["https://example1.com", "https://example2.com"]

    # A piped stdin stream mixing plain URLs, JSONL, bare ids, and junk lines.
    mixed_stream = MockTTYStringIO(
        'https://plain-url.com\n'
        '{"type":"Snapshot","url":"https://jsonl-url.com","tags":"test"}\n'
        '{"type":"Tag","id":"tag-1","name":"example"}\n'
        '01234567-89ab-cdef-0123-456789abcdef\n'
        'not valid json\n',
        is_tty=False,
    )
    stdin_records = list(read_args_or_stdin((), stream=mixed_stream))
    assert len(stdin_records) == 4
    assert stdin_records[0]["url"] == "https://plain-url.com"
    assert stdin_records[1]["url"] == "https://jsonl-url.com"
    assert stdin_records[1]["tags"] == "test"
    assert stdin_records[2]["type"] == "Tag"
    assert stdin_records[2]["name"] == "example"
    assert stdin_records[3]["id"] == "01234567-89ab-cdef-0123-456789abcdef"

    # Crawl records with embedded newline-separated URLs pass through intact.
    crawl_stream = MockTTYStringIO(
        '{"type":"Crawl","id":"crawl-1","urls":"https://example.com\\nhttps://foo.com"}\n',
        is_tty=False,
    )
    crawl_records = list(read_args_or_stdin((), stream=crawl_stream))
    assert len(crawl_records) == 1
    assert crawl_records[0]["type"] == TYPE_CRAWL
    assert crawl_records[0]["id"] == "crawl-1"

    # An interactive TTY stdin is never consumed.
    assert list(read_args_or_stdin((), stream=MockTTYStringIO("https://example.com", is_tty=True))) == []


def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path):
    """Parser extractor `urls.jsonl` outputs should be discoverable for recursive piping."""
    from archivebox.hooks import collect_urls_from_plugins

    # Two plugin output dirs that each wrote a urls.jsonl file.
    fixtures = {
        "wget": (
            '{"url":"https://wget-link-1.com"}\n'
            '{"url":"https://wget-link-2.com"}\n'
        ),
        "parse_html_urls": (
            '{"url":"https://html-link-1.com"}\n'
            '{"url":"https://html-link-2.com","title":"HTML Link 2"}\n'
        ),
    }
    for plugin_name, jsonl_body in fixtures.items():
        plugin_dir = tmp_path / plugin_name
        plugin_dir.mkdir()
        (plugin_dir / "urls.jsonl").write_text(jsonl_body, encoding="utf-8")
    # A plugin dir with no urls.jsonl should contribute nothing.
    (tmp_path / "screenshot").mkdir()

    urls = collect_urls_from_plugins(tmp_path)
    assert len(urls) == 4
    assert {entry["plugin"] for entry in urls} == {"wget", "parse_html_urls"}

    titled = [entry for entry in urls if entry.get("title") == "HTML Link 2"]
    assert len(titled) == 1
    assert titled[0]["url"] == "https://html-link-2.com"

    # A missing directory yields an empty list rather than raising.
    assert collect_urls_from_plugins(tmp_path / "nonexistent") == []


def test_crawl_create_stdout_pipes_into_run(initialized_archive):
    """`archivebox crawl create | archivebox run` should queue and materialize snapshots."""
    url = create_test_url()

    create_out, create_err, create_rc = run_archivebox_cmd(
        ["crawl", "create", url],
        data_dir=initialized_archive,
    )
    assert create_rc == 0, create_err
    _assert_stdout_is_jsonl_only(create_out)

    crawl = next(rec for rec in parse_jsonl_output(create_out) if rec.get("type") == "Crawl")

    # Feed the crawl-create stdout straight into `archivebox run`.
    run_out, run_err, run_rc = run_archivebox_cmd(
        ["run"],
        stdin=create_out,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert run_rc == 0, run_err
    _assert_stdout_is_jsonl_only(run_out)

    run_records = parse_jsonl_output(run_out)
    assert any(rec.get("type") == "Crawl" and rec.get("id") == crawl["id"] for rec in run_records)

    # The crawl should have materialized at least one snapshot row in the DB.
    snapshot_count = _db_value(
        initialized_archive,
        "SELECT COUNT(*) FROM core_snapshot WHERE crawl_id = ?",
        (crawl["id"],),
    )
    assert isinstance(snapshot_count, int)
    assert snapshot_count >= 1


def test_snapshot_list_stdout_pipes_into_run(initialized_archive):
    """`archivebox snapshot list | archivebox run` should requeue listed snapshots."""
    url = create_test_url()

    create_out, create_err, create_rc = run_archivebox_cmd(
        ["snapshot", "create", url],
        data_dir=initialized_archive,
    )
    assert create_rc == 0, create_err
    snapshot = next(rec for rec in parse_jsonl_output(create_out) if rec.get("type") == "Snapshot")

    list_out, list_err, list_rc = run_archivebox_cmd(
        ["snapshot", "list", "--status=queued", f"--url__icontains={snapshot['id']}"],
        data_dir=initialized_archive,
    )
    # Fall back to matching by URL if the id-based filter found nothing.
    if list_rc != 0 or not parse_jsonl_output(list_out):
        list_out, list_err, list_rc = run_archivebox_cmd(
            ["snapshot", "list", f"--url__icontains={url}"],
            data_dir=initialized_archive,
        )
    assert list_rc == 0, list_err
    _assert_stdout_is_jsonl_only(list_out)

    # Pipe the listed snapshots back into `archivebox run`.
    run_out, run_err, run_rc = run_archivebox_cmd(
        ["run"],
        stdin=list_out,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert run_rc == 0, run_err
    _assert_stdout_is_jsonl_only(run_out)

    run_records = parse_jsonl_output(run_out)
    assert any(rec.get("type") == "Snapshot" and rec.get("id") == snapshot["id"] for rec in run_records)

    snapshot_status = _db_value(
        initialized_archive,
        "SELECT status FROM core_snapshot WHERE id = ?",
        (snapshot["id"],),
    )
    assert snapshot_status == "sealed"


def test_archiveresult_list_stdout_pipes_into_orchestrator_alias(initialized_archive):
    """`archivebox archiveresult list | archivebox orchestrator` should preserve clean JSONL stdout."""
    url = create_test_url()

    snap_out, snap_err, snap_rc = run_archivebox_cmd(
        ["snapshot", "create", url],
        data_dir=initialized_archive,
    )
    assert snap_rc == 0, snap_err

    ar_out, ar_err, ar_rc = run_archivebox_cmd(
        ["archiveresult", "create", "--plugin=favicon"],
        stdin=snap_out,
        data_dir=initialized_archive,
    )
    assert ar_rc == 0, ar_err
    archiveresult = next(rec for rec in parse_jsonl_output(ar_out) if rec.get("type") == "ArchiveResult")

    list_out, list_err, list_rc = run_archivebox_cmd(
        ["archiveresult", "list", "--plugin=favicon"],
        data_dir=initialized_archive,
    )
    assert list_rc == 0, list_err
    _assert_stdout_is_jsonl_only(list_out)

    # `orchestrator` is a deprecated alias for `run`; it must still work.
    orch_out, orch_err, orch_rc = run_archivebox_cmd(
        ["orchestrator"],
        stdin=list_out,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert orch_rc == 0, orch_err
    _assert_stdout_is_jsonl_only(orch_out)
    # The rename warning must go to stderr, never stdout.
    assert "renamed to `archivebox run`" in orch_err

    run_records = parse_jsonl_output(orch_out)
    assert any(
        rec.get("type") == "ArchiveResult" and rec.get("id") == archiveresult["id"]
        for rec in run_records
    )


def test_binary_create_stdout_pipes_into_run(initialized_archive):
    """`archivebox binary create | archivebox run` should queue the binary record for processing."""
    # Register the running Python interpreter as a known-good binary.
    create_out, create_err, create_rc = run_archivebox_cmd(
        ["binary", "create", "--name=python3", f"--abspath={sys.executable}", "--version=test"],
        data_dir=initialized_archive,
    )
    assert create_rc == 0, create_err
    _assert_stdout_is_jsonl_only(create_out)

    binary = next(rec for rec in parse_jsonl_output(create_out) if rec.get("type") == "Binary")

    run_out, run_err, run_rc = run_archivebox_cmd(
        ["run"],
        stdin=create_out,
        data_dir=initialized_archive,
        timeout=120,
    )
    assert run_rc == 0, run_err
    _assert_stdout_is_jsonl_only(run_out)

    run_records = parse_jsonl_output(run_out)
    assert any(rec.get("type") == "Binary" and rec.get("id") == binary["id"] for rec in run_records)

    status = _db_value(
        initialized_archive,
        "SELECT status FROM machine_binary WHERE id = ?",
        (binary["id"],),
    )
    assert status in {"queued", "installed"}


def test_multi_stage_pipeline_into_run(initialized_archive):
    """`crawl create | snapshot create | archiveresult create | run` should preserve JSONL and finish work."""
    url = create_test_url()

    # Stage 1: create the crawl.
    crawl_out, crawl_err, crawl_rc = run_archivebox_cmd(
        ["crawl", "create", url],
        data_dir=initialized_archive,
    )
    assert crawl_rc == 0, crawl_err
    _assert_stdout_is_jsonl_only(crawl_out)

    # Stage 2: pipe the crawl into snapshot creation.
    snap_out, snap_err, snap_rc = run_archivebox_cmd(
        ["snapshot", "create"],
        stdin=crawl_out,
        data_dir=initialized_archive,
    )
    assert snap_rc == 0, snap_err
    _assert_stdout_is_jsonl_only(snap_out)

    # Stage 3: pipe the snapshots into archiveresult creation.
    ar_out, ar_err, ar_rc = run_archivebox_cmd(
        ["archiveresult", "create", "--plugin=favicon"],
        stdin=snap_out,
        data_dir=initialized_archive,
    )
    assert ar_rc == 0, ar_err
    _assert_stdout_is_jsonl_only(ar_out)

    # Stage 4: run everything to completion.
    run_out, run_err, run_rc = run_archivebox_cmd(
        ["run"],
        stdin=ar_out,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert run_rc == 0, run_err
    _assert_stdout_is_jsonl_only(run_out)

    run_records = parse_jsonl_output(run_out)
    snapshot = next(rec for rec in run_records if rec.get("type") == "Snapshot")
    assert any(rec.get("type") == "ArchiveResult" for rec in run_records)

    snapshot_status = _db_value(
        initialized_archive,
        "SELECT status FROM core_snapshot WHERE id = ?",
        (snapshot["id"],),
    )
    assert snapshot_status == "sealed"