""" Tests for JSONL piping contracts and `archivebox run` / `archivebox orchestrator`. This file covers both: - low-level JSONL/stdin parsing behavior that makes CLI piping work - subprocess integration for the supported records `archivebox run` consumes """ import sqlite3 import sys import uuid from io import StringIO from pathlib import Path from archivebox.tests.conftest import ( create_test_url, parse_jsonl_output, run_archivebox_cmd, ) PIPE_TEST_ENV = { "PLUGINS": "favicon", "SAVE_FAVICON": "True", "USE_COLOR": "False", "SHOW_PROGRESS": "False", } class MockTTYStringIO(StringIO): def __init__(self, initial_value: str = "", *, is_tty: bool): super().__init__(initial_value) self._is_tty = is_tty def isatty(self) -> bool: return self._is_tty def _stdout_lines(stdout: str) -> list[str]: return [line for line in stdout.splitlines() if line.strip()] def _assert_stdout_is_jsonl_only(stdout: str) -> None: lines = _stdout_lines(stdout) assert lines, "Expected stdout to contain JSONL records" assert all(line.lstrip().startswith("{") for line in lines), stdout def _sqlite_param(value: object) -> object: if not isinstance(value, str): return value try: return uuid.UUID(value).hex except ValueError: return value def _db_value(data_dir: Path, sql: str, params: tuple[object, ...] = ()) -> object | None: conn = sqlite3.connect(data_dir / "index.sqlite3") try: row = conn.execute(sql, tuple(_sqlite_param(param) for param in params)).fetchone() finally: conn.close() return row[0] if row else None def test_parse_line_accepts_supported_piping_inputs(): """The JSONL parser should normalize the input forms CLI pipes accept.""" from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, parse_line assert parse_line("") is None assert parse_line(" ") is None assert parse_line("# comment") is None assert parse_line("not-a-url") is None assert parse_line("ftp://example.com") is None plain_url = parse_line("https://example.com") assert plain_url == {"type": TYPE_SNAPSHOT, "url": "https://example.com"} file_url = parse_line("file:///tmp/example.txt") assert file_url == {"type": TYPE_SNAPSHOT, "url": "file:///tmp/example.txt"} snapshot_json = parse_line('{"type":"Snapshot","url":"https://example.com","tags":"tag1,tag2"}') assert snapshot_json is not None assert snapshot_json["type"] == TYPE_SNAPSHOT assert snapshot_json["tags"] == "tag1,tag2" crawl_json = parse_line('{"type":"Crawl","id":"abc123","urls":"https://example.com","max_depth":1}') assert crawl_json is not None assert crawl_json["type"] == TYPE_CRAWL assert crawl_json["id"] == "abc123" assert crawl_json["max_depth"] == 1 snapshot_id = "01234567-89ab-cdef-0123-456789abcdef" parsed_id = parse_line(snapshot_id) assert parsed_id == {"type": TYPE_SNAPSHOT, "id": snapshot_id} compact_snapshot_id = "0123456789abcdef0123456789abcdef" compact_parsed_id = parse_line(compact_snapshot_id) assert compact_parsed_id == {"type": TYPE_SNAPSHOT, "id": compact_snapshot_id} def test_read_args_or_stdin_handles_args_stdin_and_mixed_jsonl(): """Piping helpers should consume args, structured JSONL, and pass-through records.""" from archivebox.misc.jsonl import TYPE_CRAWL, read_args_or_stdin records = list(read_args_or_stdin(("https://example1.com", "https://example2.com"))) assert [record["url"] for record in records] == ["https://example1.com", "https://example2.com"] stdin_records = list( read_args_or_stdin( (), stream=MockTTYStringIO( 'https://plain-url.com\n' '{"type":"Snapshot","url":"https://jsonl-url.com","tags":"test"}\n' '{"type":"Tag","id":"tag-1","name":"example"}\n' '01234567-89ab-cdef-0123-456789abcdef\n' 'not valid json\n', is_tty=False, ), ) ) assert len(stdin_records) == 4 assert stdin_records[0]["url"] == "https://plain-url.com" assert stdin_records[1]["url"] == "https://jsonl-url.com" assert stdin_records[1]["tags"] == "test" assert stdin_records[2]["type"] == "Tag" assert stdin_records[2]["name"] == "example" assert stdin_records[3]["id"] == "01234567-89ab-cdef-0123-456789abcdef" crawl_records = list( read_args_or_stdin( (), stream=MockTTYStringIO( '{"type":"Crawl","id":"crawl-1","urls":"https://example.com\\nhttps://foo.com"}\n', is_tty=False, ), ) ) assert len(crawl_records) == 1 assert crawl_records[0]["type"] == TYPE_CRAWL assert crawl_records[0]["id"] == "crawl-1" tty_records = list(read_args_or_stdin((), stream=MockTTYStringIO("https://example.com", is_tty=True))) assert tty_records == [] def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path): """Parser extractor `urls.jsonl` outputs should be discoverable for recursive piping.""" from archivebox.hooks import collect_urls_from_plugins (tmp_path / "wget").mkdir() (tmp_path / "wget" / "urls.jsonl").write_text( '{"url":"https://wget-link-1.com"}\n' '{"url":"https://wget-link-2.com"}\n', encoding="utf-8", ) (tmp_path / "parse_html_urls").mkdir() (tmp_path / "parse_html_urls" / "urls.jsonl").write_text( '{"url":"https://html-link-1.com"}\n' '{"url":"https://html-link-2.com","title":"HTML Link 2"}\n', encoding="utf-8", ) (tmp_path / "screenshot").mkdir() urls = collect_urls_from_plugins(tmp_path) assert len(urls) == 4 assert {url["plugin"] for url in urls} == {"wget", "parse_html_urls"} titled = [url for url in urls if url.get("title") == "HTML Link 2"] assert len(titled) == 1 assert titled[0]["url"] == "https://html-link-2.com" assert collect_urls_from_plugins(tmp_path / "nonexistent") == [] def test_crawl_create_stdout_pipes_into_run(initialized_archive): """`archivebox crawl create | archivebox run` should queue and materialize snapshots.""" url = create_test_url() create_stdout, create_stderr, create_code = run_archivebox_cmd( ["crawl", "create", url], data_dir=initialized_archive, ) assert create_code == 0, create_stderr _assert_stdout_is_jsonl_only(create_stdout) crawl = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Crawl") run_stdout, run_stderr, run_code = run_archivebox_cmd( ["run"], stdin=create_stdout, data_dir=initialized_archive, timeout=120, env=PIPE_TEST_ENV, ) assert run_code == 0, run_stderr _assert_stdout_is_jsonl_only(run_stdout) run_records = parse_jsonl_output(run_stdout) assert any(record.get("type") == "Crawl" and record.get("id") == crawl["id"] for record in run_records) snapshot_count = _db_value( initialized_archive, "SELECT COUNT(*) FROM core_snapshot WHERE crawl_id = ?", (crawl["id"],), ) assert isinstance(snapshot_count, int) assert snapshot_count >= 1 def test_snapshot_list_stdout_pipes_into_run(initialized_archive): """`archivebox snapshot list | archivebox run` should requeue listed snapshots.""" url = create_test_url() create_stdout, create_stderr, create_code = run_archivebox_cmd( ["snapshot", "create", url], data_dir=initialized_archive, ) assert create_code == 0, create_stderr snapshot = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Snapshot") list_stdout, list_stderr, list_code = run_archivebox_cmd( ["snapshot", "list", "--status=queued", f"--url__icontains={snapshot['id']}"], data_dir=initialized_archive, ) if list_code != 0 or not parse_jsonl_output(list_stdout): list_stdout, list_stderr, list_code = run_archivebox_cmd( ["snapshot", "list", f"--url__icontains={url}"], data_dir=initialized_archive, ) assert list_code == 0, list_stderr _assert_stdout_is_jsonl_only(list_stdout) run_stdout, run_stderr, run_code = run_archivebox_cmd( ["run"], stdin=list_stdout, data_dir=initialized_archive, timeout=120, env=PIPE_TEST_ENV, ) assert run_code == 0, run_stderr _assert_stdout_is_jsonl_only(run_stdout) run_records = parse_jsonl_output(run_stdout) assert any(record.get("type") == "Snapshot" and record.get("id") == snapshot["id"] for record in run_records) snapshot_status = _db_value( initialized_archive, "SELECT status FROM core_snapshot WHERE id = ?", (snapshot["id"],), ) assert snapshot_status == "sealed" def test_archiveresult_list_stdout_pipes_into_orchestrator_alias(initialized_archive): """`archivebox archiveresult list | archivebox orchestrator` should preserve clean JSONL stdout.""" url = create_test_url() snapshot_stdout, snapshot_stderr, snapshot_code = run_archivebox_cmd( ["snapshot", "create", url], data_dir=initialized_archive, ) assert snapshot_code == 0, snapshot_stderr ar_create_stdout, ar_create_stderr, ar_create_code = run_archivebox_cmd( ["archiveresult", "create", "--plugin=favicon"], stdin=snapshot_stdout, data_dir=initialized_archive, ) assert ar_create_code == 0, ar_create_stderr created_records = parse_jsonl_output(ar_create_stdout) archiveresult = next(record for record in created_records if record.get("type") == "ArchiveResult") list_stdout, list_stderr, list_code = run_archivebox_cmd( ["archiveresult", "list", "--plugin=favicon"], data_dir=initialized_archive, ) assert list_code == 0, list_stderr _assert_stdout_is_jsonl_only(list_stdout) orchestrator_stdout, orchestrator_stderr, orchestrator_code = run_archivebox_cmd( ["orchestrator"], stdin=list_stdout, data_dir=initialized_archive, timeout=120, env=PIPE_TEST_ENV, ) assert orchestrator_code == 0, orchestrator_stderr _assert_stdout_is_jsonl_only(orchestrator_stdout) assert "renamed to `archivebox run`" in orchestrator_stderr run_records = parse_jsonl_output(orchestrator_stdout) assert any( record.get("type") == "ArchiveResult" and record.get("id") == archiveresult["id"] for record in run_records ) def test_binary_create_stdout_pipes_into_run(initialized_archive): """`archivebox binary create | archivebox run` should queue the binary record for processing.""" create_stdout, create_stderr, create_code = run_archivebox_cmd( ["binary", "create", "--name=python3", f"--abspath={sys.executable}", "--version=test"], data_dir=initialized_archive, ) assert create_code == 0, create_stderr _assert_stdout_is_jsonl_only(create_stdout) binary = next(record for record in parse_jsonl_output(create_stdout) if record.get("type") == "Binary") run_stdout, run_stderr, run_code = run_archivebox_cmd( ["run"], stdin=create_stdout, data_dir=initialized_archive, timeout=120, ) assert run_code == 0, run_stderr _assert_stdout_is_jsonl_only(run_stdout) run_records = parse_jsonl_output(run_stdout) assert any(record.get("type") == "Binary" and record.get("id") == binary["id"] for record in run_records) status = _db_value( initialized_archive, "SELECT status FROM machine_binary WHERE id = ?", (binary["id"],), ) assert status in {"queued", "installed"} def test_multi_stage_pipeline_into_run(initialized_archive): """`crawl create | snapshot create | archiveresult create | run` should preserve JSONL and finish work.""" url = create_test_url() crawl_stdout, crawl_stderr, crawl_code = run_archivebox_cmd( ["crawl", "create", url], data_dir=initialized_archive, ) assert crawl_code == 0, crawl_stderr _assert_stdout_is_jsonl_only(crawl_stdout) snapshot_stdout, snapshot_stderr, snapshot_code = run_archivebox_cmd( ["snapshot", "create"], stdin=crawl_stdout, data_dir=initialized_archive, ) assert snapshot_code == 0, snapshot_stderr _assert_stdout_is_jsonl_only(snapshot_stdout) archiveresult_stdout, archiveresult_stderr, archiveresult_code = run_archivebox_cmd( ["archiveresult", "create", "--plugin=favicon"], stdin=snapshot_stdout, data_dir=initialized_archive, ) assert archiveresult_code == 0, archiveresult_stderr _assert_stdout_is_jsonl_only(archiveresult_stdout) run_stdout, run_stderr, run_code = run_archivebox_cmd( ["run"], stdin=archiveresult_stdout, data_dir=initialized_archive, timeout=120, env=PIPE_TEST_ENV, ) assert run_code == 0, run_stderr _assert_stdout_is_jsonl_only(run_stdout) run_records = parse_jsonl_output(run_stdout) snapshot = next(record for record in run_records if record.get("type") == "Snapshot") assert any(record.get("type") == "ArchiveResult" for record in run_records) snapshot_status = _db_value( initialized_archive, "SELECT status FROM core_snapshot WHERE id = ?", (snapshot["id"],), ) assert snapshot_status == "sealed"