From 1d94645abd43cfa7f22bf6e1578d89e0bd3fc583 Mon Sep 17 00:00:00 2001 From: Nick Sweeting Date: Mon, 23 Mar 2026 04:12:31 -0700 Subject: [PATCH] test fixes --- archivebox/machine/env_utils.py | 18 ++------- archivebox/tests/conftest.py | 69 +++++++++++++++++++++++++++++---- bin/release.sh | 54 +++++++++++++++++++------- docker-compose.yml | 25 ++++++------ uv.lock | 2 +- 5 files changed, 120 insertions(+), 48 deletions(-) diff --git a/archivebox/machine/env_utils.py b/archivebox/machine/env_utils.py index 06a42563..9ab2d5f8 100644 --- a/archivebox/machine/env_utils.py +++ b/archivebox/machine/env_utils.py @@ -26,26 +26,14 @@ def is_redacted_env_key(key: str) -> bool: def redact_env(env: dict[str, Any] | None) -> dict[str, Any]: if not isinstance(env, dict): return {} - return { - str(key): value - for key, value in env.items() - if key is not None and not is_redacted_env_key(str(key)) - } + return {str(key): value for key, value in env.items() if key is not None and not is_redacted_env_key(str(key))} def env_to_dotenv_text(env: dict[str, Any] | None) -> str: redacted_env = redact_env(env) - return "\n".join( - f"{key}={shlex.quote(stringify_env_value(value))}" - for key, value in sorted(redacted_env.items()) - if value is not None - ) + return "\n".join(f"{key}={shlex.quote(stringify_env_value(value))}" for key, value in sorted(redacted_env.items()) if value is not None) def env_to_shell_exports(env: dict[str, Any] | None) -> str: redacted_env = redact_env(env) - return " ".join( - f"{key}={shlex.quote(stringify_env_value(value))}" - for key, value in sorted(redacted_env.items()) - if value is not None - ) + return " ".join(f"{key}={shlex.quote(stringify_env_value(value))}" for key, value in sorted(redacted_env.items()) if value is not None) diff --git a/archivebox/tests/conftest.py b/archivebox/tests/conftest.py index 2c25b1ff..b87818c2 100644 --- a/archivebox/tests/conftest.py +++ b/archivebox/tests/conftest.py @@ -1,8 +1,9 @@ """archivebox/tests/conftest.py - Pytest fixtures for CLI tests.""" import os -import sys +import secrets import subprocess +import sys import tempfile import textwrap import time @@ -12,12 +13,35 @@ from typing import Any import pytest -from archivebox.uuid_compat import uuid7 - pytest_plugins = ["archivebox.tests.fixtures"] +REPO_ROOT = Path(__file__).resolve().parents[2] SESSION_DATA_DIR = Path(tempfile.mkdtemp(prefix="archivebox-pytest-session-")).resolve() -os.environ.setdefault("DATA_DIR", str(SESSION_DATA_DIR)) +# Force ArchiveBox imports to see a temp DATA_DIR and cwd during test collection. +os.environ["DATA_DIR"] = str(SESSION_DATA_DIR) +os.environ.pop("CRAWL_DIR", None) +os.environ.pop("SNAP_DIR", None) +os.chdir(SESSION_DATA_DIR) + + +def _is_repo_path(path: Path) -> bool: + resolved = path.expanduser().resolve(strict=False) + return resolved == REPO_ROOT or REPO_ROOT in resolved.parents + + +def _assert_not_repo_path(path: Path, *, label: str) -> None: + if _is_repo_path(path): + raise AssertionError(f"{label} must not point inside the repo root during tests: {path}") + + +def _assert_safe_runtime_paths(*, cwd: Path | None = None, env: dict[str, str] | None = None) -> None: + if cwd is not None: + _assert_not_repo_path(cwd, label="cwd") + + for key in ("DATA_DIR", "CRAWL_DIR", "SNAP_DIR"): + value = (env or {}).get(key) + if value: + _assert_not_repo_path(Path(value), label=key) # ============================================================================= @@ -47,6 +71,7 @@ def run_archivebox_cmd( """ cmd = [sys.executable, "-m", "archivebox"] + args + _assert_not_repo_path(data_dir, label="DATA_DIR") base_env = os.environ.copy() base_env["DATA_DIR"] = str(data_dir) base_env["USE_COLOR"] = "False" @@ -71,6 +96,7 @@ def run_archivebox_cmd( if env: base_env.update(env) + _assert_safe_runtime_paths(cwd=data_dir, env=base_env) result = subprocess.run( cmd, input=stdin, @@ -90,7 +116,7 @@ def run_archivebox_cmd( @pytest.fixture(autouse=True) -def isolate_test_runtime(tmp_path): +def isolate_test_runtime(tmp_path, monkeypatch): """ Run each pytest test from an isolated temp cwd and restore env mutations. @@ -104,14 +130,35 @@ def isolate_test_runtime(tmp_path): seed a separate session-scoped temp ``DATA_DIR`` above so any ArchiveBox config imported before this fixture runs never points at the repo root. """ + _assert_not_repo_path(tmp_path, label="tmp_path") original_cwd = Path.cwd() original_env = os.environ.copy() + original_chdir = os.chdir + original_popen = subprocess.Popen os.chdir(tmp_path) os.environ.pop("DATA_DIR", None) + os.environ.pop("CRAWL_DIR", None) + os.environ.pop("SNAP_DIR", None) + + def guarded_chdir(path: os.PathLike[str] | str) -> None: + _assert_not_repo_path(Path(path), label="cwd") + original_chdir(path) + + def guarded_popen(*args: Any, **kwargs: Any): + cwd = kwargs.get("cwd") + env = kwargs.get("env") + if cwd is not None: + _assert_not_repo_path(Path(cwd), label="cwd") + _assert_safe_runtime_paths(cwd=Path(cwd) if cwd is not None else None, env=env) + return original_popen(*args, **kwargs) + + monkeypatch.setattr(os, "chdir", guarded_chdir) + monkeypatch.setattr(subprocess, "Popen", guarded_popen) try: + _assert_safe_runtime_paths(cwd=Path.cwd(), env=os.environ) yield finally: - os.chdir(original_cwd) + original_chdir(original_cwd) os.environ.clear() os.environ.update(original_env) @@ -166,14 +213,18 @@ def run_archivebox_cmd_cwd( """ cmd = [sys.executable, "-m", "archivebox"] + args + _assert_not_repo_path(cwd, label="cwd") base_env = os.environ.copy() base_env.pop("DATA_DIR", None) + base_env.pop("CRAWL_DIR", None) + base_env.pop("SNAP_DIR", None) base_env["USE_COLOR"] = "False" base_env["SHOW_PROGRESS"] = "False" if env: base_env.update(env) + _assert_safe_runtime_paths(cwd=cwd, env=base_env) result = subprocess.run( cmd, input=stdin, @@ -202,8 +253,12 @@ def run_python_cwd( cwd: Path, timeout: int = 60, ) -> tuple[str, str, int]: + _assert_not_repo_path(cwd, label="cwd") base_env = os.environ.copy() base_env.pop("DATA_DIR", None) + base_env.pop("CRAWL_DIR", None) + base_env.pop("SNAP_DIR", None) + _assert_safe_runtime_paths(cwd=cwd, env=base_env) result = subprocess.run( [sys.executable, "-"], input=script, @@ -446,7 +501,7 @@ def assert_record_has_fields(record: dict[str, Any], required_fields: list[str]) def create_test_url(domain: str = "example.com", path: str | None = None) -> str: """Generate unique test URL.""" - path = path or uuid7().hex[:8] + path = path or secrets.token_hex(4) return f"https://{domain}/{path}" diff --git a/bin/release.sh b/bin/release.sh index c53b0aaa..59996cbf 100755 --- a/bin/release.sh +++ b/bin/release.sh @@ -305,6 +305,10 @@ create_release() { if [[ "${version}" == *rc* ]]; then prerelease_args+=(--prerelease) fi + if gh release view "${TAG_PREFIX}${version}" --repo "${slug}" >/dev/null 2>&1; then + echo "GitHub release ${TAG_PREFIX}${version} already exists" + return 0 + fi gh release create "${TAG_PREFIX}${version}" \ --repo "${slug}" \ @@ -318,13 +322,17 @@ publish_artifacts() { local version="$1" local pypi_token="${UV_PUBLISH_TOKEN:-${PYPI_TOKEN:-${PYPI_PAT_SECRET:-}}}" - if [[ -n "${pypi_token}" ]]; then - UV_PUBLISH_TOKEN="${pypi_token}" uv publish --username=__token__ dist/* - elif [[ -n "${GITHUB_ACTIONS:-}" ]]; then - uv publish --trusted-publishing always dist/* + if curl -fsSL "https://pypi.org/pypi/${PYPI_PACKAGE}/json" | jq -e --arg version "${version}" '.releases[$version] | length > 0' >/dev/null 2>&1; then + echo "${PYPI_PACKAGE} ${version} already published on PyPI" else - echo "Missing PyPI credentials: set UV_PUBLISH_TOKEN or PYPI_TOKEN" >&2 - return 1 + if [[ -n "${pypi_token}" ]]; then + UV_PUBLISH_TOKEN="${pypi_token}" uv publish --username=__token__ dist/* + elif [[ -n "${GITHUB_ACTIONS:-}" ]]; then + uv publish --trusted-publishing always dist/* + else + echo "Missing PyPI credentials: set UV_PUBLISH_TOKEN or PYPI_TOKEN" >&2 + return 1 + fi fi wait_for_pypi "${PYPI_PACKAGE}" "${version}" @@ -347,15 +355,35 @@ main() { return 1 fi - update_internal_dependencies - version="$(bump_version)" - run_checks + version="$(current_version)" + latest="$(latest_release_version "${slug}")" + if [[ -z "${latest}" ]]; then + relation="gt" + else + relation="$(compare_versions "${version}" "${latest}")" + fi - git add -A - git commit -m "release: ${TAG_PREFIX}${version}" - git push origin "${branch}" + if [[ "${relation}" == "eq" ]]; then + update_internal_dependencies + version="$(bump_version)" + run_checks - wait_for_runs "${slug}" push "$(git rev-parse HEAD)" "push" + git add -A + git commit -m "release: ${TAG_PREFIX}${version}" + git push origin "${branch}" + + wait_for_runs "${slug}" push "$(git rev-parse HEAD)" "push" + elif [[ "${relation}" == "gt" ]]; then + if [[ -n "$(git status --short)" ]]; then + echo "Refusing to publish existing unreleased version ${version} with a dirty worktree" >&2 + return 1 + fi + run_checks + wait_for_runs "${slug}" push "$(git rev-parse HEAD)" "push" + else + echo "Current version ${version} is behind latest GitHub release ${latest}" >&2 + return 1 + fi publish_artifacts "${version}" create_release "${slug}" "${version}" diff --git a/docker-compose.yml b/docker-compose.yml index 975f5064..f7066fb3 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -26,8 +26,8 @@ services: - PUBLIC_INDEX=True # set to False to prevent anonymous users from viewing snapshot list - PUBLIC_SNAPSHOTS=True # set to False to prevent anonymous users from viewing snapshot content - PUBLIC_ADD_VIEW=False # set to True to allow anonymous users to submit new URLs to archive - - SEARCH_BACKEND_ENGINE=sonic # tells ArchiveBox to use sonic container below for fast full-text search - - SEARCH_BACKEND_HOST_NAME=sonic + - SEARCH_BACKEND_ENGINE=sonic # tells ArchiveBox to use its built-in Sonic worker for fast full-text search + # - SEARCH_BACKEND_HOST_NAME=127.0.0.1 - SEARCH_BACKEND_PASSWORD=SomeSecretPassword # - PUID=911 # set to your host user's UID & GID if you encounter permissions issues # - PGID=911 # UID/GIDs lower than 500 may clash with system uids and are not recommended @@ -54,20 +54,21 @@ services: # https://github.com/ArchiveBox/ArchiveBox/wiki/Scheduled-Archiving - ### This runs the optional Sonic full-text search backend (much faster than default rg backend). + ### ArchiveBox now starts and uses Sonic automatically when SEARCH_BACKEND_ENGINE=sonic, + # so the old standalone docker sidecar below is no longer necessary. # If Sonic is ever started after not running for a while, update its full-text index by running: # $ docker compose run archivebox update --index-only # https://github.com/ArchiveBox/ArchiveBox/wiki/Setting-up-Search - sonic: - image: archivebox/sonic:latest - expose: - - 1491 - environment: - - SEARCH_BACKEND_PASSWORD=SomeSecretPassword - volumes: - #- ./sonic.cfg:/etc/sonic.cfg:ro # mount to customize: https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/etc/sonic.cfg - - ./data/sonic:/var/lib/sonic/store + # sonic: + # image: archivebox/sonic:latest + # expose: + # - 1491 + # environment: + # - SEARCH_BACKEND_PASSWORD=SomeSecretPassword + # volumes: + # #- ./sonic.cfg:/etc/sonic.cfg:ro # mount to customize: https://raw.githubusercontent.com/ArchiveBox/ArchiveBox/stable/etc/sonic.cfg + # - ./data/sonic:/var/lib/sonic/store ### This optional container runs xvfb+noVNC so you can watch the ArchiveBox browser as it archives things, diff --git a/uv.lock b/uv.lock index c14c79f7..fe4c1893 100644 --- a/uv.lock +++ b/uv.lock @@ -130,7 +130,7 @@ dev = [{ name = "prek", specifier = ">=0.3.6" }] [[package]] name = "abxbus" -version = "2.4.7" +version = "2.4.8" source = { editable = "../abxbus" } dependencies = [ { name = "aiofiles", marker = "sys_platform == 'darwin' or sys_platform == 'linux'" },