"""
|
|
Tests for JSONL piping contracts and `archivebox run` / `archivebox orchestrator`.
|
|
|
|
This file covers both:
|
|
- low-level JSONL/stdin parsing behavior that makes CLI piping work
|
|
- subprocess integration for the supported records `archivebox run` consumes
|
|
"""
|
|
|
|
import sqlite3
import sys
import uuid
from io import StringIO
from pathlib import Path

from archivebox.tests.conftest import (
    create_test_url,
    parse_jsonl_output,
    run_archivebox_cmd,
)


# Minimal environment for the piping tests: run only the cheap favicon plugin
# and disable interactive output so stdout stays pure JSONL.
PIPE_TEST_ENV = {
    "PLUGINS": "favicon",
    "SAVE_FAVICON": "True",
    "USE_COLOR": "False",
    "SHOW_PROGRESS": "False",
}


class MockTTYStringIO(StringIO):
    """In-memory stream whose isatty() answer is fixed at construction time.

    Lets tests exercise both the interactive-terminal and piped-stdin code
    paths without needing a real TTY.
    """

    def __init__(self, initial_value: str = "", *, is_tty: bool):
        super().__init__(initial_value)
        self._is_tty = is_tty

    def isatty(self) -> bool:
        # Report the configured answer instead of the real stream state.
        return self._is_tty


def _stdout_lines(stdout: str) -> list[str]:
|
|
return [line for line in stdout.splitlines() if line.strip()]
|
|
|
|
|
|
def _assert_stdout_is_jsonl_only(stdout: str) -> None:
|
|
lines = _stdout_lines(stdout)
|
|
assert lines, "Expected stdout to contain JSONL records"
|
|
assert all(line.lstrip().startswith("{") for line in lines), stdout
|
|
|
|
|
|
def _sqlite_param(value: object) -> object:
|
|
if not isinstance(value, str):
|
|
return value
|
|
try:
|
|
return uuid.UUID(value).hex
|
|
except ValueError:
|
|
return value
|
|
|
|
|
|
def _db_value(data_dir: Path, sql: str, params: tuple[object, ...] = ()) -> object | None:
|
|
conn = sqlite3.connect(data_dir / "index.sqlite3")
|
|
try:
|
|
row = conn.execute(sql, tuple(_sqlite_param(param) for param in params)).fetchone()
|
|
finally:
|
|
conn.close()
|
|
return row[0] if row else None
|
|
|
|
|
|
def test_parse_line_accepts_supported_piping_inputs():
    """The JSONL parser should normalize the input forms CLI pipes accept."""
    from archivebox.misc.jsonl import TYPE_CRAWL, TYPE_SNAPSHOT, parse_line

    # Blank lines, comments, non-URLs, and unsupported schemes are skipped.
    for ignored in ("", " ", "# comment", "not-a-url", "ftp://example.com"):
        assert parse_line(ignored) is None

    # Bare URLs (http(s) and file) become Snapshot records.
    assert parse_line("https://example.com") == {"type": TYPE_SNAPSHOT, "url": "https://example.com"}
    assert parse_line("file:///tmp/example.txt") == {"type": TYPE_SNAPSHOT, "url": "file:///tmp/example.txt"}

    # Structured Snapshot JSON keeps its extra fields.
    snapshot_record = parse_line('{"type":"Snapshot","url":"https://example.com","tags":"tag1,tag2"}')
    assert snapshot_record is not None
    assert snapshot_record["type"] == TYPE_SNAPSHOT
    assert snapshot_record["tags"] == "tag1,tag2"

    # Structured Crawl JSON keeps its id and depth.
    crawl_record = parse_line('{"type":"Crawl","id":"abc123","urls":"https://example.com","max_depth":1}')
    assert crawl_record is not None
    assert crawl_record["type"] == TYPE_CRAWL
    assert crawl_record["id"] == "abc123"
    assert crawl_record["max_depth"] == 1

    # Bare snapshot ids — dashed UUID or compact 32-char hex — are accepted too.
    dashed_id = "01234567-89ab-cdef-0123-456789abcdef"
    assert parse_line(dashed_id) == {"type": TYPE_SNAPSHOT, "id": dashed_id}
    compact_id = "0123456789abcdef0123456789abcdef"
    assert parse_line(compact_id) == {"type": TYPE_SNAPSHOT, "id": compact_id}


def test_read_args_or_stdin_handles_args_stdin_and_mixed_jsonl():
    """Piping helpers should consume args, structured JSONL, and pass-through records."""
    from archivebox.misc.jsonl import TYPE_CRAWL, read_args_or_stdin

    # Positional args are yielded as URL records in order.
    arg_records = list(read_args_or_stdin(("https://example1.com", "https://example2.com")))
    assert [rec["url"] for rec in arg_records] == ["https://example1.com", "https://example2.com"]

    # A piped stdin stream mixing plain URLs, JSONL, bare ids, and junk lines.
    mixed_stream = MockTTYStringIO(
        'https://plain-url.com\n'
        '{"type":"Snapshot","url":"https://jsonl-url.com","tags":"test"}\n'
        '{"type":"Tag","id":"tag-1","name":"example"}\n'
        '01234567-89ab-cdef-0123-456789abcdef\n'
        'not valid json\n',
        is_tty=False,
    )
    stdin_records = list(read_args_or_stdin((), stream=mixed_stream))
    assert len(stdin_records) == 4
    assert stdin_records[0]["url"] == "https://plain-url.com"
    assert stdin_records[1]["url"] == "https://jsonl-url.com"
    assert stdin_records[1]["tags"] == "test"
    assert stdin_records[2]["type"] == "Tag"
    assert stdin_records[2]["name"] == "example"
    assert stdin_records[3]["id"] == "01234567-89ab-cdef-0123-456789abcdef"

    # Crawl records with embedded newline-separated URLs pass through intact.
    crawl_stream = MockTTYStringIO(
        '{"type":"Crawl","id":"crawl-1","urls":"https://example.com\\nhttps://foo.com"}\n',
        is_tty=False,
    )
    crawl_records = list(read_args_or_stdin((), stream=crawl_stream))
    assert len(crawl_records) == 1
    assert crawl_records[0]["type"] == TYPE_CRAWL
    assert crawl_records[0]["id"] == "crawl-1"

    # An interactive TTY stdin is never consumed.
    assert list(read_args_or_stdin((), stream=MockTTYStringIO("https://example.com", is_tty=True))) == []


def test_collect_urls_from_plugins_reads_only_parser_outputs(tmp_path):
    """Parser extractor `urls.jsonl` outputs should be discoverable for recursive piping."""
    from archivebox.hooks import collect_urls_from_plugins

    # Two plugin output dirs that each wrote a urls.jsonl file.
    fixtures = {
        "wget": (
            '{"url":"https://wget-link-1.com"}\n'
            '{"url":"https://wget-link-2.com"}\n'
        ),
        "parse_html_urls": (
            '{"url":"https://html-link-1.com"}\n'
            '{"url":"https://html-link-2.com","title":"HTML Link 2"}\n'
        ),
    }
    for plugin_name, jsonl_body in fixtures.items():
        plugin_dir = tmp_path / plugin_name
        plugin_dir.mkdir()
        (plugin_dir / "urls.jsonl").write_text(jsonl_body, encoding="utf-8")
    # A plugin dir with no urls.jsonl should contribute nothing.
    (tmp_path / "screenshot").mkdir()

    urls = collect_urls_from_plugins(tmp_path)
    assert len(urls) == 4
    assert {entry["plugin"] for entry in urls} == {"wget", "parse_html_urls"}

    titled = [entry for entry in urls if entry.get("title") == "HTML Link 2"]
    assert len(titled) == 1
    assert titled[0]["url"] == "https://html-link-2.com"

    # A missing directory yields an empty list rather than raising.
    assert collect_urls_from_plugins(tmp_path / "nonexistent") == []


def test_crawl_create_stdout_pipes_into_run(initialized_archive):
    """`archivebox crawl create | archivebox run` should queue and materialize snapshots."""
    url = create_test_url()

    create_out, create_err, create_rc = run_archivebox_cmd(
        ["crawl", "create", url],
        data_dir=initialized_archive,
    )
    assert create_rc == 0, create_err
    _assert_stdout_is_jsonl_only(create_out)

    crawl = next(rec for rec in parse_jsonl_output(create_out) if rec.get("type") == "Crawl")

    # Feed the crawl-create stdout straight into `archivebox run`.
    run_out, run_err, run_rc = run_archivebox_cmd(
        ["run"],
        stdin=create_out,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert run_rc == 0, run_err
    _assert_stdout_is_jsonl_only(run_out)

    run_records = parse_jsonl_output(run_out)
    assert any(rec.get("type") == "Crawl" and rec.get("id") == crawl["id"] for rec in run_records)

    # The crawl should have materialized at least one snapshot row in the DB.
    snapshot_count = _db_value(
        initialized_archive,
        "SELECT COUNT(*) FROM core_snapshot WHERE crawl_id = ?",
        (crawl["id"],),
    )
    assert isinstance(snapshot_count, int)
    assert snapshot_count >= 1


def test_snapshot_list_stdout_pipes_into_run(initialized_archive):
    """`archivebox snapshot list | archivebox run` should requeue listed snapshots."""
    url = create_test_url()

    create_out, create_err, create_rc = run_archivebox_cmd(
        ["snapshot", "create", url],
        data_dir=initialized_archive,
    )
    assert create_rc == 0, create_err
    snapshot = next(rec for rec in parse_jsonl_output(create_out) if rec.get("type") == "Snapshot")

    list_out, list_err, list_rc = run_archivebox_cmd(
        ["snapshot", "list", "--status=queued", f"--url__icontains={snapshot['id']}"],
        data_dir=initialized_archive,
    )
    # Fall back to matching by URL if the id-based filter found nothing.
    if list_rc != 0 or not parse_jsonl_output(list_out):
        list_out, list_err, list_rc = run_archivebox_cmd(
            ["snapshot", "list", f"--url__icontains={url}"],
            data_dir=initialized_archive,
        )
    assert list_rc == 0, list_err
    _assert_stdout_is_jsonl_only(list_out)

    # Pipe the listed snapshots back into `archivebox run`.
    run_out, run_err, run_rc = run_archivebox_cmd(
        ["run"],
        stdin=list_out,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert run_rc == 0, run_err
    _assert_stdout_is_jsonl_only(run_out)

    run_records = parse_jsonl_output(run_out)
    assert any(rec.get("type") == "Snapshot" and rec.get("id") == snapshot["id"] for rec in run_records)

    snapshot_status = _db_value(
        initialized_archive,
        "SELECT status FROM core_snapshot WHERE id = ?",
        (snapshot["id"],),
    )
    assert snapshot_status == "sealed"


def test_archiveresult_list_stdout_pipes_into_orchestrator_alias(initialized_archive):
    """`archivebox archiveresult list | archivebox orchestrator` should preserve clean JSONL stdout."""
    url = create_test_url()

    snap_out, snap_err, snap_rc = run_archivebox_cmd(
        ["snapshot", "create", url],
        data_dir=initialized_archive,
    )
    assert snap_rc == 0, snap_err

    ar_out, ar_err, ar_rc = run_archivebox_cmd(
        ["archiveresult", "create", "--plugin=favicon"],
        stdin=snap_out,
        data_dir=initialized_archive,
    )
    assert ar_rc == 0, ar_err
    archiveresult = next(rec for rec in parse_jsonl_output(ar_out) if rec.get("type") == "ArchiveResult")

    list_out, list_err, list_rc = run_archivebox_cmd(
        ["archiveresult", "list", "--plugin=favicon"],
        data_dir=initialized_archive,
    )
    assert list_rc == 0, list_err
    _assert_stdout_is_jsonl_only(list_out)

    # `orchestrator` is a deprecated alias for `run`; it must still work.
    orch_out, orch_err, orch_rc = run_archivebox_cmd(
        ["orchestrator"],
        stdin=list_out,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert orch_rc == 0, orch_err
    _assert_stdout_is_jsonl_only(orch_out)
    # The rename warning must go to stderr, never stdout.
    assert "renamed to `archivebox run`" in orch_err

    run_records = parse_jsonl_output(orch_out)
    assert any(
        rec.get("type") == "ArchiveResult" and rec.get("id") == archiveresult["id"]
        for rec in run_records
    )


def test_binary_create_stdout_pipes_into_run(initialized_archive):
    """`archivebox binary create | archivebox run` should queue the binary record for processing."""
    # Register the running Python interpreter as a known-good binary.
    create_out, create_err, create_rc = run_archivebox_cmd(
        ["binary", "create", "--name=python3", f"--abspath={sys.executable}", "--version=test"],
        data_dir=initialized_archive,
    )
    assert create_rc == 0, create_err
    _assert_stdout_is_jsonl_only(create_out)

    binary = next(rec for rec in parse_jsonl_output(create_out) if rec.get("type") == "Binary")

    run_out, run_err, run_rc = run_archivebox_cmd(
        ["run"],
        stdin=create_out,
        data_dir=initialized_archive,
        timeout=120,
    )
    assert run_rc == 0, run_err
    _assert_stdout_is_jsonl_only(run_out)

    run_records = parse_jsonl_output(run_out)
    assert any(rec.get("type") == "Binary" and rec.get("id") == binary["id"] for rec in run_records)

    status = _db_value(
        initialized_archive,
        "SELECT status FROM machine_binary WHERE id = ?",
        (binary["id"],),
    )
    assert status in {"queued", "installed"}


def test_multi_stage_pipeline_into_run(initialized_archive):
    """`crawl create | snapshot create | archiveresult create | run` should preserve JSONL and finish work."""
    url = create_test_url()

    # Stage 1: create the crawl.
    crawl_out, crawl_err, crawl_rc = run_archivebox_cmd(
        ["crawl", "create", url],
        data_dir=initialized_archive,
    )
    assert crawl_rc == 0, crawl_err
    _assert_stdout_is_jsonl_only(crawl_out)

    # Stage 2: pipe the crawl into snapshot creation.
    snap_out, snap_err, snap_rc = run_archivebox_cmd(
        ["snapshot", "create"],
        stdin=crawl_out,
        data_dir=initialized_archive,
    )
    assert snap_rc == 0, snap_err
    _assert_stdout_is_jsonl_only(snap_out)

    # Stage 3: pipe the snapshots into archiveresult creation.
    ar_out, ar_err, ar_rc = run_archivebox_cmd(
        ["archiveresult", "create", "--plugin=favicon"],
        stdin=snap_out,
        data_dir=initialized_archive,
    )
    assert ar_rc == 0, ar_err
    _assert_stdout_is_jsonl_only(ar_out)

    # Stage 4: run everything to completion.
    run_out, run_err, run_rc = run_archivebox_cmd(
        ["run"],
        stdin=ar_out,
        data_dir=initialized_archive,
        timeout=120,
        env=PIPE_TEST_ENV,
    )
    assert run_rc == 0, run_err
    _assert_stdout_is_jsonl_only(run_out)

    run_records = parse_jsonl_output(run_out)
    snapshot = next(rec for rec in run_records if rec.get("type") == "Snapshot")
    assert any(rec.get("type") == "ArchiveResult" for rec in run_records)

    snapshot_status = _db_value(
        initialized_archive,
        "SELECT status FROM core_snapshot WHERE id = ?",
        (snapshot["id"],),
    )
    assert snapshot_status == "sealed"