# Files
# ArchiveBox/archivebox/tests/conftest.py
# Nick Sweeting 1d94645abd test fixes
# 2026-03-23 04:12:31 -07:00
#
# 530 lines
# 17 KiB
# Python
"""archivebox/tests/conftest.py - Pytest fixtures for CLI tests."""
import os
import secrets
import subprocess
import sys
import tempfile
import textwrap
import time
import shutil
from pathlib import Path
from typing import Any
import pytest
pytest_plugins = ["archivebox.tests.fixtures"]
REPO_ROOT = Path(__file__).resolve().parents[2]
SESSION_DATA_DIR = Path(tempfile.mkdtemp(prefix="archivebox-pytest-session-")).resolve()
# Force ArchiveBox imports to see a temp DATA_DIR and cwd during test collection.
os.environ["DATA_DIR"] = str(SESSION_DATA_DIR)
os.environ.pop("CRAWL_DIR", None)
os.environ.pop("SNAP_DIR", None)
os.chdir(SESSION_DATA_DIR)
def _is_repo_path(path: Path) -> bool:
    """Return True when *path* resolves to the repo root or anywhere inside it."""
    target = path.expanduser().resolve(strict=False)
    return target == REPO_ROOT or REPO_ROOT in target.parents
def _assert_not_repo_path(path: Path, *, label: str) -> None:
    """Fail loudly (AssertionError) if *path* falls inside the repo checkout."""
    if not _is_repo_path(path):
        return
    raise AssertionError(f"{label} must not point inside the repo root during tests: {path}")
def _assert_safe_runtime_paths(*, cwd: Path | None = None, env: dict[str, str] | None = None) -> None:
    """Verify that neither the cwd nor any DATA_DIR-style env var targets the repo root."""
    if cwd is not None:
        _assert_not_repo_path(cwd, label="cwd")
    env_map = env or {}
    for key in ("DATA_DIR", "CRAWL_DIR", "SNAP_DIR"):
        value = env_map.get(key)
        if value:
            _assert_not_repo_path(Path(value), label=key)
# =============================================================================
# CLI Helpers (defined before fixtures that use them)
# =============================================================================
def run_archivebox_cmd(
    args: list[str],
    data_dir: Path,
    stdin: str | None = None,
    timeout: int = 60,
    env: dict[str, str] | None = None,
) -> tuple[str, str, int]:
    """
    Run archivebox command via subprocess, return (stdout, stderr, returncode).

    Args:
        args: Command arguments (e.g., ['crawl', 'create', 'https://example.com'])
        data_dir: The DATA_DIR to use
        stdin: Optional string to pipe to stdin
        timeout: Command timeout in seconds
        env: Additional environment variables

    Returns:
        Tuple of (stdout, stderr, returncode)
    """
    _assert_not_repo_path(data_dir, label="DATA_DIR")

    run_env = os.environ.copy()
    run_env.update({
        "DATA_DIR": str(data_dir),
        "USE_COLOR": "False",
        "SHOW_PROGRESS": "False",
        # Disable slow extractors for faster tests.
        "SAVE_ARCHIVEDOTORG": "False",
        "SAVE_TITLE": "False",
        "SAVE_FAVICON": "False",
        "SAVE_WGET": "False",
        "SAVE_WARC": "False",
        "SAVE_PDF": "False",
        "SAVE_SCREENSHOT": "False",
        "SAVE_DOM": "False",
        "SAVE_SINGLEFILE": "False",
        "SAVE_READABILITY": "False",
        "SAVE_MERCURY": "False",
        "SAVE_GIT": "False",
        "SAVE_YTDLP": "False",
        "SAVE_HEADERS": "False",
        "SAVE_HTMLTOTEXT": "False",
    })
    if env:
        run_env.update(env)
    _assert_safe_runtime_paths(cwd=data_dir, env=run_env)

    completed = subprocess.run(
        [sys.executable, "-m", "archivebox", *args],
        input=stdin,
        capture_output=True,
        text=True,
        cwd=data_dir,
        env=run_env,
        timeout=timeout,
    )
    return completed.stdout, completed.stderr, completed.returncode
# =============================================================================
# Fixtures
# =============================================================================
@pytest.fixture(autouse=True)
def isolate_test_runtime(tmp_path, monkeypatch):
    """
    Run each pytest test from an isolated temp cwd and restore env mutations.

    The maintained pytest suite lives under ``archivebox/tests``. Many of those
    CLI tests shell out without passing ``cwd=`` explicitly, so the safest
    contract is that every test starts in its own temp directory and any
    in-process ``os.environ`` edits are rolled back afterwards.

    We intentionally clear ``DATA_DIR`` for the body of each test so subprocess
    tests that rely on cwd keep working. During collection/import time we still
    seed a separate session-scoped temp ``DATA_DIR`` above so any ArchiveBox
    config imported before this fixture runs never points at the repo root.
    """
    _assert_not_repo_path(tmp_path, label="tmp_path")
    # Snapshot everything this fixture mutates so the finally block can undo it.
    original_cwd = Path.cwd()
    original_env = os.environ.copy()
    original_chdir = os.chdir
    original_popen = subprocess.Popen
    os.chdir(tmp_path)
    os.environ.pop("DATA_DIR", None)
    os.environ.pop("CRAWL_DIR", None)
    os.environ.pop("SNAP_DIR", None)

    def guarded_chdir(path: os.PathLike[str] | str) -> None:
        # Wrapper for os.chdir that refuses to enter the repo checkout.
        _assert_not_repo_path(Path(path), label="cwd")
        original_chdir(path)

    def guarded_popen(*args: Any, **kwargs: Any):
        # Wrapper for subprocess.Popen that validates cwd/env before spawning.
        cwd = kwargs.get("cwd")
        env = kwargs.get("env")
        if cwd is not None:
            _assert_not_repo_path(Path(cwd), label="cwd")
        _assert_safe_runtime_paths(cwd=Path(cwd) if cwd is not None else None, env=env)
        return original_popen(*args, **kwargs)

    # monkeypatch undoes both setattr calls automatically after the test.
    monkeypatch.setattr(os, "chdir", guarded_chdir)
    monkeypatch.setattr(subprocess, "Popen", guarded_popen)
    try:
        # Sanity-check the state this fixture just established before the test body runs.
        _assert_safe_runtime_paths(cwd=Path.cwd(), env=os.environ)
        yield
    finally:
        # Use the saved, unguarded chdir: the original cwd may legitimately be the repo root.
        original_chdir(original_cwd)
        os.environ.clear()
        os.environ.update(original_env)
def pytest_sessionfinish(session, exitstatus):
    """Delete the session-scoped temp DATA_DIR once the whole pytest run ends."""
    if SESSION_DATA_DIR.exists():
        shutil.rmtree(SESSION_DATA_DIR, ignore_errors=True)
@pytest.fixture
def isolated_data_dir(tmp_path):
    """
    Create isolated DATA_DIR for each test.

    Uses tmp_path for complete isolation.
    """
    target = tmp_path / "archivebox_data"
    target.mkdir()
    return target
@pytest.fixture
def initialized_archive(isolated_data_dir):
    """
    Initialize ArchiveBox archive in isolated directory.

    Runs `archivebox init` via subprocess to set up database and directories.
    """
    _stdout, stderr, returncode = run_archivebox_cmd(
        ["init", "--quick"],
        data_dir=isolated_data_dir,
        timeout=60,
    )
    assert returncode == 0, f"archivebox init failed: {stderr}"
    return isolated_data_dir
# =============================================================================
# CWD-based CLI Helpers (no DATA_DIR env)
# =============================================================================
def run_archivebox_cmd_cwd(
    args: list[str],
    cwd: Path,
    stdin: str | None = None,
    timeout: int = 60,
    env: dict[str, str] | None = None,
) -> tuple[str, str, int]:
    """
    Run archivebox command via subprocess using cwd as DATA_DIR (no DATA_DIR env).

    Returns (stdout, stderr, returncode).
    """
    _assert_not_repo_path(cwd, label="cwd")

    child_env = os.environ.copy()
    # Strip any inherited data-dir overrides so cwd alone determines DATA_DIR.
    for key in ("DATA_DIR", "CRAWL_DIR", "SNAP_DIR"):
        child_env.pop(key, None)
    child_env.update({"USE_COLOR": "False", "SHOW_PROGRESS": "False"})
    if env:
        child_env.update(env)
    _assert_safe_runtime_paths(cwd=cwd, env=child_env)

    completed = subprocess.run(
        [sys.executable, "-m", "archivebox", *args],
        input=stdin,
        capture_output=True,
        text=True,
        cwd=cwd,
        env=child_env,
        timeout=timeout,
    )
    return completed.stdout, completed.stderr, completed.returncode
def stop_process(proc: subprocess.Popen[str]) -> tuple[str, str]:
    """Gracefully stop *proc* (terminate, then kill on timeout); return (stdout, stderr)."""
    still_running = proc.poll() is None
    if still_running:
        proc.terminate()
    try:
        output = proc.communicate(timeout=5)
    except subprocess.TimeoutExpired:
        proc.kill()
        output = proc.communicate()
    return output
def run_python_cwd(
    script: str,
    cwd: Path,
    timeout: int = 60,
) -> tuple[str, str, int]:
    """Pipe *script* into a fresh python interpreter running in *cwd* (no DATA_DIR env)."""
    _assert_not_repo_path(cwd, label="cwd")

    child_env = os.environ.copy()
    for key in ("DATA_DIR", "CRAWL_DIR", "SNAP_DIR"):
        child_env.pop(key, None)
    _assert_safe_runtime_paths(cwd=cwd, env=child_env)

    completed = subprocess.run(
        [sys.executable, "-"],
        input=script,
        capture_output=True,
        text=True,
        cwd=cwd,
        env=child_env,
        timeout=timeout,
    )
    return completed.stdout, completed.stderr, completed.returncode
def wait_for_archive_outputs(
    cwd: Path,
    url: str,
    timeout: int = 120,
    interval: float = 1.0,
) -> bool:
    """
    Poll the archive in *cwd* until the Snapshot for *url* is sealed with outputs.

    Once per *interval* seconds, spawns a short Django-aware python subprocess
    (via run_python_cwd) that exits 0 and prints READY only when the snapshot is
    sealed, has at least one non-log output file, and has captured at least one
    file under responses/<domain>/.

    Args:
        cwd: DATA_DIR of the archive being polled.
        url: Exact Snapshot URL to wait for.
        timeout: Max seconds to keep polling.
        interval: Seconds to sleep between polls.

    Returns:
        True once the outputs appear, False if *timeout* seconds elapse first.
    """
    # NOTE: the braces around the skip-list below are escaped ({{ }}) so the
    # generated script contains the literal set; previously the f-string
    # evaluated the unescaped braces at definition time and interpolated a
    # tuple repr instead (same membership semantics, but unintended).
    script = textwrap.dedent(
        f"""\
        from pathlib import Path
        import os
        os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'archivebox.core.settings')
        import django
        django.setup()
        from archivebox.core.models import Snapshot
        snapshot = Snapshot.objects.filter(url={url!r}).order_by('-created_at').first()
        if snapshot is None or snapshot.status != 'sealed':
            raise SystemExit(1)
        output_rel = None
        for output in snapshot.discover_outputs():
            candidate = output.get('path')
            if not candidate or candidate.startswith('responses/'):
                continue
            if Path(snapshot.output_dir, candidate).is_file():
                output_rel = candidate
                break
        if output_rel is None:
            fallback = Path(snapshot.output_dir, 'index.jsonl')
            if fallback.exists():
                output_rel = 'index.jsonl'
        if output_rel is None:
            snapshot_dir = Path(snapshot.output_dir)
            for candidate in snapshot_dir.rglob('*'):
                if not candidate.is_file():
                    continue
                rel_path = candidate.relative_to(snapshot_dir)
                if rel_path.parts and rel_path.parts[0] == 'responses':
                    continue
                if rel_path.name in {{"stdout.log", "stderr.log", "cmd.sh"}}:
                    continue
                output_rel = str(rel_path)
                break
        if output_rel is None:
            raise SystemExit(1)
        responses_root = Path(snapshot.output_dir) / 'responses' / snapshot.domain
        if not responses_root.exists():
            raise SystemExit(1)
        if not any(candidate.is_file() for candidate in responses_root.rglob('*')):
            raise SystemExit(1)
        print('READY')
        """,
    )
    # Use a monotonic clock for the deadline so wall-clock adjustments
    # (NTP steps, DST) cannot shorten or extend the wait.
    deadline = time.monotonic() + timeout
    while time.monotonic() < deadline:
        stdout, _stderr, returncode = run_python_cwd(script, cwd=cwd, timeout=30)
        if returncode == 0 and "READY" in stdout:
            return True
        time.sleep(interval)
    return False
def _get_machine_type() -> str:
import platform
os_name = platform.system().lower()
arch = platform.machine().lower()
in_docker = os.environ.get("IN_DOCKER", "").lower() in ("1", "true", "yes")
suffix = "-docker" if in_docker else ""
return f"{arch}-{os_name}{suffix}"
def _find_cached_chromium(lib_dir: Path) -> Path | None:
candidates = [
lib_dir / "puppeteer",
lib_dir / "npm" / "node_modules" / "puppeteer" / ".local-chromium",
]
for base in candidates:
if not base.exists():
continue
for path in base.rglob("Chromium.app/Contents/MacOS/Chromium"):
return path
for path in base.rglob("chrome-linux/chrome"):
return path
for path in base.rglob("chrome-linux64/chrome"):
return path
return None
def _find_system_browser() -> Path | None:
candidates = [
Path("/Applications/Chromium.app/Contents/MacOS/Chromium"),
Path("/usr/bin/chromium"),
Path("/usr/bin/chromium-browser"),
]
for candidate in candidates:
if candidate.exists():
return candidate
return None
def _ensure_puppeteer(shared_lib: Path) -> None:
npm_prefix = shared_lib / "npm"
node_modules = npm_prefix / "node_modules"
puppeteer_dir = node_modules / "puppeteer"
if puppeteer_dir.exists():
return
npm_prefix.mkdir(parents=True, exist_ok=True)
env = os.environ.copy()
env["PUPPETEER_SKIP_DOWNLOAD"] = "1"
subprocess.run(
["npm", "install", "puppeteer"],
cwd=str(npm_prefix),
env=env,
check=True,
capture_output=True,
text=True,
timeout=600,
)
@pytest.fixture(scope="class")
def real_archive_with_example(tmp_path_factory, request):
    """
    Initialize archive and add https://example.com using responses only.

    Uses cwd for DATA_DIR. Class-scoped: the archive is built once and shared by
    every test in the class; the data dir is also exposed as request.cls.data_dir.
    """
    tmp_path = tmp_path_factory.mktemp("archivebox_data")
    # Expose the data dir on the test class for class-based suites (no-op for
    # function-style usage where request.cls is None).
    if getattr(request, "cls", None) is not None:
        request.cls.data_dir = tmp_path
    stdout, stderr, returncode = run_archivebox_cmd_cwd(
        ["init", "--quick"],
        cwd=tmp_path,
        timeout=120,
    )
    assert returncode == 0, f"archivebox init failed: {stderr}"
    # Open up the web UI so tests can hit index/snapshot/add views unauthenticated.
    stdout, stderr, returncode = run_archivebox_cmd_cwd(
        [
            "config",
            "--set",
            "LISTEN_HOST=archivebox.localhost:8000",
            "PUBLIC_INDEX=True",
            "PUBLIC_SNAPSHOTS=True",
            "PUBLIC_ADD_VIEW=True",
        ],
        cwd=tmp_path,
    )
    assert returncode == 0, f"archivebox config failed: {stderr}"
    add_env = {
        "RESPONSES_ENABLED": "True",
        "SHOW_PROGRESS": "False",
        "USE_COLOR": "False",
        "RESPONSES_TIMEOUT": "30",
    }
    # Launch `archivebox add` directly with Popen (not run_archivebox_cmd_cwd) so
    # we can poll for outputs while it runs and stop it once they appear.
    cmd = [sys.executable, "-m", "archivebox", "add", "--depth=0", "--plugins=responses", "https://example.com"]
    base_env = os.environ.copy()
    base_env.pop("DATA_DIR", None)
    # NOTE(review): unlike run_archivebox_cmd_cwd, this does not pop
    # CRAWL_DIR/SNAP_DIR from the inherited environment — confirm intentional.
    base_env["USE_COLOR"] = "False"
    base_env["SHOW_PROGRESS"] = "False"
    base_env.update(add_env)
    proc = subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        cwd=tmp_path,
        env=base_env,
    )
    # Block until the snapshot is sealed with real outputs (or 600s passes),
    # then shut the add process down and collect its output for diagnostics.
    ready = wait_for_archive_outputs(tmp_path, "https://example.com", timeout=600)
    stdout, stderr = stop_process(proc)
    assert ready, f"archivebox add did not produce required outputs within timeout:\nSTDOUT:\n{stdout}\nSTDERR:\n{stderr}"
    return tmp_path
# =============================================================================
# Output Assertions
# =============================================================================
def parse_jsonl_output(stdout: str) -> list[dict[str, Any]]:
    """Parse JSONL output into list of dicts via Process parser."""
    from archivebox.machine.models import Process

    text = stdout if stdout else ""
    return Process.parse_records_from_text(text)
def assert_jsonl_contains_type(stdout: str, record_type: str, min_count: int = 1):
    """Assert output contains at least min_count records of the given type; return the matches."""
    matching = [rec for rec in parse_jsonl_output(stdout) if rec.get("type") == record_type]
    assert len(matching) >= min_count, f"Expected >= {min_count} {record_type}, got {len(matching)}"
    return matching
def assert_jsonl_pass_through(stdout: str, input_records: list[dict[str, Any]]):
    """Assert that every input record's id appears in the output (pass-through behavior)."""
    seen_ids = {rec.get("id") for rec in parse_jsonl_output(stdout) if rec.get("id")}
    for record in input_records:
        rec_id = record.get("id")
        if not rec_id:
            continue  # records without ids can't be tracked through the pipeline
        assert rec_id in seen_ids, f"Input record {rec_id} not found in output (pass-through failed)"
def assert_record_has_fields(record: dict[str, Any], required_fields: list[str]):
    """Assert record contains every required field with a non-None value."""
    for name in required_fields:
        assert name in record, f"Record missing field: {name}"
        assert record[name] is not None, f"Record field is None: {name}"
# =============================================================================
# Test Data Factories
# =============================================================================
def create_test_url(domain: str = "example.com", path: str | None = None) -> str:
"""Generate unique test URL."""
path = path or secrets.token_hex(4)
return f"https://{domain}/{path}"
def create_test_crawl_json(urls: list[str] | None = None, **kwargs) -> dict[str, Any]:
"""Create Crawl JSONL record for testing."""
urls = urls or [create_test_url()]
return {
"type": "Crawl",
"urls": "\n".join(urls),
"max_depth": kwargs.get("max_depth", 0),
"tags_str": kwargs.get("tags_str", ""),
"status": kwargs.get("status", "queued"),
**{k: v for k, v in kwargs.items() if k not in ("max_depth", "tags_str", "status")},
}
def create_test_snapshot_json(url: str | None = None, **kwargs) -> dict[str, Any]:
"""Create Snapshot JSONL record for testing."""
return {
"type": "Snapshot",
"url": url or create_test_url(),
"tags_str": kwargs.get("tags_str", ""),
"status": kwargs.get("status", "queued"),
**{k: v for k, v in kwargs.items() if k not in ("tags_str", "status")},
}