Files
ArchiveBox/archivebox/tests/test_server_security_browser.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

605 lines
20 KiB
Python

#!/usr/bin/env python3
"""Browser-level security mode tests using the existing Node/Puppeteer runtime."""
from __future__ import annotations
import json
import os
import shutil
import signal
import socket
import subprocess
import sys
import textwrap
import time
from pathlib import Path
from urllib.parse import urlencode
import pytest
import requests
from .conftest import _ensure_puppeteer, _find_cached_chromium, _find_system_browser, run_python_cwd
# Node.js/Puppeteer probe, kept as source text so it can be written to disk and
# executed with `node`.
#
# What the script does, in order:
#   1. Reads a JSON config object from stdin (fd 0). Keys it uses: chromePath,
#      adminLoginUrl, username, password, dangerousUrl, mode.
#   2. Launches headless Chrome from config.chromePath.
#   3. login(): opens adminLoginUrl; if both the username and password inputs
#      exist it types the credentials and submits the form. Login counts as
#      succeeded when the final URL no longer contains "/admin/login/".
#      Navigation errors are recorded in the result object, not thrown.
#   4. Opens dangerousUrl in a second page of the same browser, collecting
#      console messages, page errors and failed requests.
#   5. Waits until the page either never set window.__dangerousScriptRan
#      (script was blocked) or has published window.__probeResults.
#   6. Prints a single JSON report to stdout: mode, login, dangerousPage
#      (status, finalUrl, Content-Security-Policy and
#      X-ArchiveBox-Security-Mode response headers), pageState,
#      consoleMessages, requestFailures.
#   7. Any uncaught error is printed to stderr and the process exits with 1.
#
# The backslash after the opening quotes suppresses the leading newline.
# NOTE(review): browser.close() is only reached on the success path; confirm
# that Puppeteer's own exit handling is enough when main() rejects.
PUPPETEER_PROBE_SCRIPT = """\
const fs = require("node:fs");
const puppeteer = require("puppeteer");
async function login(page, config) {
const result = {
reachable: false,
succeeded: false,
finalUrl: null,
status: null,
error: null,
};
try {
const response = await page.goto(config.adminLoginUrl, {
waitUntil: "networkidle2",
timeout: 15000,
});
result.reachable = true;
result.status = response ? response.status() : null;
const usernameInput = await page.$('input[name="username"]');
const passwordInput = await page.$('input[name="password"]');
if (!usernameInput || !passwordInput) {
result.finalUrl = page.url();
return result;
}
await usernameInput.type(config.username);
await passwordInput.type(config.password);
await Promise.all([
page.waitForNavigation({waitUntil: "networkidle2", timeout: 15000}),
page.click('button[type="submit"], input[type="submit"]'),
]);
result.finalUrl = page.url();
result.succeeded = !page.url().includes("/admin/login/");
return result;
} catch (error) {
result.error = String(error);
result.finalUrl = page.url();
return result;
}
}
async function main() {
const config = JSON.parse(fs.readFileSync(0, "utf8"));
const browser = await puppeteer.launch({
executablePath: config.chromePath,
headless: true,
args: [
"--no-sandbox",
"--disable-dev-shm-usage",
"--disable-background-networking",
],
});
const loginPage = await browser.newPage();
const loginResult = await login(loginPage, config);
await loginPage.close();
const page = await browser.newPage();
const consoleMessages = [];
const requestFailures = [];
page.on("console", (message) => {
consoleMessages.push({type: message.type(), text: message.text()});
});
page.on("pageerror", (error) => {
consoleMessages.push({type: "pageerror", text: String(error)});
});
page.on("requestfailed", (request) => {
requestFailures.push({
url: request.url(),
error: request.failure() ? request.failure().errorText : "unknown",
});
});
const response = await page.goto(config.dangerousUrl, {
waitUntil: "networkidle2",
timeout: 15000,
});
await page.waitForFunction(
() => window.__dangerousScriptRan !== true || window.__probeResults !== undefined,
{timeout: 15000},
);
const pageState = await page.evaluate(() => ({
href: location.href,
scriptRan: window.__dangerousScriptRan === true,
probeResults: window.__probeResults || null,
bodyText: document.body ? document.body.innerText.slice(0, 600) : "",
}));
const output = {
mode: config.mode,
login: loginResult,
dangerousPage: {
status: response ? response.status() : null,
finalUrl: page.url(),
contentSecurityPolicy: response ? response.headers()["content-security-policy"] || null : null,
archiveboxSecurityMode: response ? response.headers()["x-archivebox-security-mode"] || null : null,
},
pageState,
consoleMessages,
requestFailures,
};
console.log(JSON.stringify(output));
await browser.close();
}
main().catch((error) => {
console.error(String(error));
process.exit(1);
});
"""
def _resolve_browser(shared_lib: Path) -> Path | None:
    """Locate a Chrome/Chromium executable, or return ``None`` if none is found.

    Lookup order (first hit wins):
      1. ``CHROME_BINARY`` / ``CHROME_BIN`` environment variable, if the path exists.
      2. Whatever ``_find_cached_chromium(shared_lib)`` reports, if it exists.
      3. Whatever ``_find_system_browser()`` reports, if it exists.
      4. The first well-known binary name found on ``PATH``.
      5. The standard macOS application bundle locations.
    """
    configured = os.environ.get("CHROME_BINARY") or os.environ.get("CHROME_BIN")
    if configured:
        env_path = Path(configured).expanduser()
        if env_path.exists():
            return env_path

    # Locators are wrapped in callables so the second one only runs when the
    # first one yields nothing usable.
    locators = (lambda: _find_cached_chromium(shared_lib), _find_system_browser)
    for locate in locators:
        found = locate()
        if found and found.exists():
            return found

    binary_names = ("chromium", "chromium-browser", "google-chrome", "google-chrome-stable", "chrome")
    # map/filter are lazy, so shutil.which stops being called after the first hit.
    on_path = next(filter(None, map(shutil.which, binary_names)), None)
    if on_path:
        return Path(on_path)

    app_bundles = (
        Path("/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"),
        Path("/Applications/Chromium.app/Contents/MacOS/Chromium"),
    )
    return next((bundle for bundle in app_bundles if bundle.exists()), None)
@pytest.fixture(scope="session")
def browser_runtime(tmp_path_factory):
    """Session fixture describing the Node/Puppeteer runtime used by the probe.

    Skips the requesting test when ``node``/``npm`` are not on ``PATH`` or when
    no Chrome/Chromium binary can be resolved. Otherwise returns a dict with
    ``node_modules_dir`` and ``chrome_binary``.
    """
    if any(shutil.which(tool) is None for tool in ("node", "npm")):
        pytest.skip("Node.js and npm are required for browser security tests")

    lib_dir = tmp_path_factory.mktemp("archivebox_browser_lib")
    _ensure_puppeteer(lib_dir)

    chrome = _resolve_browser(lib_dir)
    if chrome is None:
        pytest.skip("No Chrome/Chromium binary available for browser security tests")

    runtime = {
        "node_modules_dir": lib_dir / "npm" / "node_modules",
        "chrome_binary": chrome,
    }
    return runtime
def _seed_archive(data_dir: Path) -> dict[str, object]:
    """Populate the archive in *data_dir* with the fixtures the browser probe needs.

    The work is done by a Django script handed to ``run_python_cwd`` (presumably
    executed in a separate interpreter with *data_dir* as its working directory
    -- confirm against conftest). The script:

    * ensures a ``testadmin`` user exists and sets its password to ``testpassword``;
    * creates one sealed Crawl + Snapshot for each of ``attacker`` and ``victim``;
    * writes ``safe.json`` (slug + secret marker) into each snapshot's output dir;
    * writes ``dangerous.html`` into the attacker snapshot. That page sets
      ``window.__dangerousScriptRan``, fetches the URLs named by its ``own``,
      ``victim``, ``admin`` and ``api`` query parameters with credentials
      included, and publishes the outcomes as ``window.__probeResults``.

    Returns:
        The decoded JSON printed by the script: ``username``, ``password`` and
        ``snapshots`` (slug -> ``{"id", "domain"}``).

    Raises:
        AssertionError: if the script exits non-zero or prints nothing.
    """
    # Every line of the embedded script (including the HTML payload) must stay
    # indented at least as far as the script's own margin, otherwise
    # textwrap.dedent would strip less than intended and break the script.
    script = textwrap.dedent(
        """
        import json
        import os
        from pathlib import Path
        from django.utils import timezone
        os.environ.setdefault("DJANGO_SETTINGS_MODULE", "archivebox.core.settings")
        import django
        django.setup()
        from django.contrib.auth import get_user_model
        from archivebox.core.models import Snapshot
        from archivebox.crawls.models import Crawl
        User = get_user_model()
        admin, _ = User.objects.get_or_create(
            username="testadmin",
            defaults={"email": "admin@example.com", "is_staff": True, "is_superuser": True},
        )
        admin.set_password("testpassword")
        admin.save()
        snapshots = {}
        fixture_specs = (
            ("attacker", "https://attacker.example/entry", "Attacker Snapshot", "ATTACKER_SECRET"),
            ("victim", "https://victim.example/private", "Victim Snapshot", "VICTIM_SECRET"),
        )
        for slug, url, title, secret in fixture_specs:
            crawl = Crawl.objects.create(
                urls=url,
                created_by=admin,
                status=Crawl.StatusChoices.SEALED,
                retry_at=timezone.now(),
            )
            snapshot = Snapshot.objects.create(
                url=url,
                title=title,
                crawl=crawl,
                status=Snapshot.StatusChoices.SEALED,
                downloaded_at=timezone.now(),
            )
            output_dir = Path(snapshot.output_dir)
            output_dir.mkdir(parents=True, exist_ok=True)
            (output_dir / "safe.json").write_text(
                json.dumps({"slug": slug, "secret": secret}),
                encoding="utf-8",
            )
            if slug == "attacker":
                (output_dir / "dangerous.html").write_text(
                    '''
                    <!doctype html>
                    <html>
                    <body>
                    <h1>Dangerous Replay Fixture</h1>
                    <script>
                    window.__dangerousScriptRan = true;
                    (async () => {
                    const params = new URLSearchParams(location.search);
                    const targets = {
                    own: params.get("own") || "safe.json",
                    victim: params.get("victim"),
                    admin: params.get("admin"),
                    api: params.get("api"),
                    };
                    const results = {};
                    for (const [label, url] of Object.entries(targets)) {
                    if (!url) continue;
                    try {
                    const response = await fetch(url, {credentials: "include"});
                    const text = await response.text();
                    results[label] = {
                    ok: true,
                    status: response.status,
                    url: response.url,
                    sample: text.slice(0, 120),
                    };
                    } catch (error) {
                    results[label] = {
                    ok: false,
                    error: String(error),
                    };
                    }
                    }
                    window.__probeResults = results;
                    const pre = document.createElement("pre");
                    pre.id = "probe-results";
                    pre.textContent = JSON.stringify(results);
                    document.body.appendChild(pre);
                    })().catch((error) => {
                    window.__probeResults = {fatal: String(error)};
                    });
                    </script>
                    </body>
                    </html>
                    ''',
                    encoding="utf-8",
                )
            snapshots[slug] = {
                "id": str(snapshot.id),
                "domain": snapshot.domain,
            }
        print(json.dumps({
            "username": "testadmin",
            "password": "testpassword",
            "snapshots": snapshots,
        }))
        """,
    )
    stdout, stderr, returncode = run_python_cwd(script, cwd=data_dir, timeout=120)
    assert returncode == 0, stderr

    # The script prints its JSON payload last, on a single line. Decode only
    # that line so anything written to stdout earlier cannot break parsing.
    output_lines = stdout.strip().splitlines()
    assert output_lines, f"seed script produced no output\n\nSTDERR:\n{stderr}"
    return json.loads(output_lines[-1])
def _get_free_port() -> int:
    """Return a TCP port number the OS reports as free on 127.0.0.1.

    The probing socket is closed before returning, so the port is only free
    at the moment of the call; another process may take it before it is used.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as probe:
        # Port 0 asks the kernel to pick any unused port.
        probe.bind(("127.0.0.1", 0))
        _address, chosen_port = probe.getsockname()
        return chosen_port
def _wait_for_http(
    port: int,
    host: str,
    timeout: float = 30.0,
    process: subprocess.Popen[str] | None = None,
) -> None:
    """Block until the server on ``127.0.0.1:port`` answers with a non-5xx status.

    Args:
        port: Local TCP port to poll.
        host: Value sent in the ``Host`` header (also used in error messages).
        timeout: Maximum number of seconds to keep polling.
        process: Optional server process; if it exits while we are waiting the
            wait is aborted immediately instead of running out the timeout.

    Raises:
        AssertionError: if *process* exits early, or if no acceptable response
            arrives before *timeout* elapses. The message carries the last
            observed error.
    """
    # Use the monotonic clock for the deadline: wall-clock adjustments
    # (NTP, DST, manual changes) must not stretch or cut short the wait.
    deadline = time.monotonic() + timeout
    last_error = "server did not answer"
    while time.monotonic() < deadline:
        if process is not None and process.poll() is not None:
            raise AssertionError(f"Server exited before becoming ready with code {process.returncode}")
        try:
            response = requests.get(
                f"http://127.0.0.1:{port}/",
                headers={"Host": host},
                timeout=2,
                # A redirect already proves the server is up; don't follow it.
                allow_redirects=False,
            )
            if response.status_code < 500:
                return
            last_error = f"HTTP {response.status_code}"
        except requests.RequestException as exc:
            last_error = str(exc)
        time.sleep(0.5)
    raise AssertionError(f"Timed out waiting for {host}: {last_error}")
def _start_server(data_dir: Path, *, mode: str, port: int) -> subprocess.Popen[str]:
    """Launch ``archivebox server`` for *data_dir* and wait until it answers HTTP.

    The child inherits the current environment minus ``DATA_DIR``, with the
    requested ``SERVER_SECURITY_MODE`` and the listed feature flags set to
    ``"False"``. It runs in its own session so the whole process group can be
    signalled later.

    NOTE(review): stdout/stderr go to a pipe that is only read when the
    process is stopped; confirm the server never logs enough to fill it.

    Raises:
        AssertionError: if the server does not become ready; the message
            includes the captured server log.
    """
    listen_host = f"archivebox.localhost:{port}"
    disabled_flags = (
        "USE_COLOR",
        "SHOW_PROGRESS",
        "SAVE_ARCHIVEDOTORG",
        "SAVE_TITLE",
        "SAVE_FAVICON",
        "SAVE_WGET",
        "SAVE_WARC",
        "SAVE_PDF",
        "SAVE_SCREENSHOT",
        "SAVE_DOM",
        "SAVE_SINGLEFILE",
        "SAVE_READABILITY",
        "SAVE_MERCURY",
        "SAVE_GIT",
        "SAVE_YTDLP",
        "SAVE_HEADERS",
        "SAVE_HTMLTOTEXT",
        "USE_CHROME",
    )

    # Drop any inherited DATA_DIR; the server is started with cwd=data_dir instead.
    server_env = {name: value for name, value in os.environ.items() if name != "DATA_DIR"}
    server_env["PYTHONPATH"] = str(Path(__file__).resolve().parents[2])
    server_env["LISTEN_HOST"] = listen_host
    server_env["ALLOWED_HOSTS"] = "*"
    server_env["CSRF_TRUSTED_ORIGINS"] = f"http://{listen_host},http://admin.{listen_host}"
    server_env["SERVER_SECURITY_MODE"] = mode
    server_env.update(dict.fromkeys(disabled_flags, "False"))

    command = [sys.executable, "-m", "archivebox", "server", "--debug", "--nothreading", f"127.0.0.1:{port}"]
    server = subprocess.Popen(
        command,
        cwd=data_dir,
        env=server_env,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        start_new_session=True,
    )
    try:
        _wait_for_http(port, listen_host, process=server)
    except AssertionError as exc:
        # Tear the server down and surface its log alongside the failure.
        server_log = _stop_server(server)
        raise AssertionError(f"{exc}\n\nSERVER LOG:\n{server_log}") from exc
    return server
def _stop_server(process: subprocess.Popen[str]) -> str:
    """Terminate the server's process group and return its captured output.

    A still-running process gets SIGTERM and 3 seconds to exit, then SIGKILL.
    An already-exited process is simply drained. If the process group has
    vanished by the time a signal is sent, the output is collected anyway.
    """
    try:
        if process.poll() is not None:
            # Already gone: just collect whatever it wrote.
            captured, _ = process.communicate(timeout=5)
            return captured

        os.killpg(process.pid, signal.SIGTERM)
        try:
            captured, _ = process.communicate(timeout=3)
        except subprocess.TimeoutExpired:
            # Graceful shutdown took too long; force it.
            os.killpg(process.pid, signal.SIGKILL)
            captured, _ = process.communicate(timeout=5)
    except ProcessLookupError:
        captured, _ = process.communicate(timeout=5)
    return captured
def _build_probe_config(mode: str, port: int, fixture: dict[str, object], runtime: dict[str, Path]) -> dict[str, str]:
    """Assemble the JSON-serialisable config consumed by the Puppeteer probe.

    In ``safe-subdomains-fullreplay`` mode each snapshot and the admin UI get
    their own ``*.archivebox.localhost`` subdomain; in every other mode all
    URLs share the single ``archivebox.localhost`` origin and snapshots are
    addressed by ``/snapshot/<id>/`` paths.

    Returns:
        A dict with ``mode``, ``chromePath``, ``adminLoginUrl``,
        ``dangerousUrl`` (the attacker page plus the fetch targets as query
        parameters), ``username`` and ``password``.
    """
    snapshot_index = fixture["snapshots"]
    attacker_id = snapshot_index["attacker"]["id"]
    victim_id = snapshot_index["victim"]["id"]
    root_host = f"archivebox.localhost:{port}"

    if mode == "safe-subdomains-fullreplay":
        control_plane = f"http://admin.{root_host}"
        victim_json = f"http://{victim_id}.{root_host}/safe.json"
        replay_page = f"http://{attacker_id}.{root_host}/dangerous.html"
    else:
        control_plane = f"http://{root_host}"
        victim_json = f"{control_plane}/snapshot/{victim_id}/safe.json"
        replay_page = f"{control_plane}/snapshot/{attacker_id}/dangerous.html"

    # Targets the attacker page will try to fetch, in the order it reports them.
    fetch_targets = [
        ("own", "safe.json"),
        ("victim", victim_json),
        ("admin", f"{control_plane}/admin/"),
        ("api", f"{control_plane}/api/v1/docs"),
    ]
    encoded_targets = urlencode(fetch_targets)

    config = {
        "mode": mode,
        "chromePath": str(runtime["chrome_binary"]),
        "adminLoginUrl": f"{control_plane}/admin/login/",
        "dangerousUrl": f"{replay_page}?{encoded_targets}",
        "username": fixture["username"],
        "password": fixture["password"],
    }
    return config
def _run_browser_probe(
    data_dir: Path,
    runtime: dict[str, Path],
    mode: str,
    fixture: dict[str, object],
    tmp_path: Path,
) -> dict[str, object]:
    """Run the Puppeteer probe against a freshly started server and return its report.

    Args:
        data_dir: Archive directory the server is started in.
        runtime: Mapping providing ``node_modules_dir`` and ``chrome_binary``.
        mode: Value for ``SERVER_SECURITY_MODE``.
        fixture: Seed data providing ``snapshots``, ``username`` and ``password``.
        tmp_path: Directory the probe script is written to.

    Returns:
        The JSON object the probe printed to stdout, decoded.

    Raises:
        AssertionError: if the probe exits non-zero; the message contains the
            probe's stderr and the server log.
    """
    port = _get_free_port()

    # Do all fallible preparation *before* the server exists, so that every
    # statement executed after _start_server() is covered by the try/finally
    # below and a failure here cannot leave an orphaned server process.
    probe_path = tmp_path / "server_security_probe.js"
    probe_path.write_text(PUPPETEER_PROBE_SCRIPT, encoding="utf-8")
    probe_config = _build_probe_config(mode, port, fixture, runtime)
    env = os.environ.copy()
    env["NODE_PATH"] = str(runtime["node_modules_dir"])
    env["NODE_MODULES_DIR"] = str(runtime["node_modules_dir"])
    env["CHROME_BINARY"] = str(runtime["chrome_binary"])
    env["USE_COLOR"] = "False"

    process = _start_server(data_dir, mode=mode, port=port)
    try:
        result = subprocess.run(
            ["node", str(probe_path)],
            cwd=data_dir,
            env=env,
            # The probe reads its configuration as JSON from stdin.
            input=json.dumps(probe_config),
            capture_output=True,
            text=True,
            timeout=120,
        )
    finally:
        # Always stop the server, even if the probe timed out or failed to launch.
        server_log = _stop_server(process)
    assert result.returncode == 0, f"{result.stderr}\n\nSERVER LOG:\n{server_log}"
    return json.loads(result.stdout.strip())
@pytest.mark.parametrize(
    ("mode", "expected"),
    [
        (
            "safe-subdomains-fullreplay",
            {
                "login_succeeds": True,
                "script_ran": True,
                "victim_ok": False,
                "admin_ok": False,
                "admin_status": None,
                "api_ok": False,
                "api_status": None,
                "csp_contains": None,
            },
        ),
        (
            "safe-onedomain-nojsreplay",
            {
                "login_succeeds": True,
                "script_ran": False,
                "victim_ok": None,
                "admin_ok": None,
                "admin_status": None,
                "api_ok": None,
                "api_status": None,
                "csp_contains": "sandbox",
            },
        ),
        (
            "unsafe-onedomain-noadmin",
            {
                "login_succeeds": False,
                "login_status": 403,
                "script_ran": True,
                "victim_ok": True,
                "victim_status": 200,
                "admin_ok": True,
                "admin_status": 403,
                "api_ok": True,
                "api_status": 403,
                "csp_contains": None,
            },
        ),
        (
            "danger-onedomain-fullreplay",
            {
                "login_succeeds": True,
                "script_ran": True,
                "victim_ok": True,
                "victim_status": 200,
                "admin_ok": True,
                "admin_status": 200,
                "api_ok": True,
                "api_status": 200,
                "csp_contains": None,
            },
        ),
    ],
)
def test_server_security_modes_in_chrome(
    initialized_archive: Path,
    browser_runtime,
    tmp_path: Path,
    mode: str,
    expected: dict[str, object],
) -> None:
    """Check, in a real browser, what an archived attacker page can reach in each security mode.

    For every mode the attacker snapshot's ``dangerous.html`` is loaded after an
    admin login attempt, and the page's own fetch results (own snapshot,
    victim snapshot, admin UI, API docs) are compared with ``expected``.
    """
    fixture = _seed_archive(initialized_archive)
    report = _run_browser_probe(initialized_archive, browser_runtime, mode, fixture, tmp_path)

    login = report["login"]
    dangerous_page = report["dangerousPage"]
    page_state = report["pageState"]
    fetches = page_state["probeResults"] or {}
    console_texts = [entry["text"] for entry in report["consoleMessages"]]

    # --- Checks shared by every mode -------------------------------------
    assert dangerous_page["status"] == 200
    assert dangerous_page["archiveboxSecurityMode"] == mode
    assert page_state["scriptRan"] is expected["script_ran"]
    assert login["succeeded"] is expected["login_succeeds"]

    wanted_login_status = expected.get("login_status")
    if wanted_login_status is not None:
        assert login["status"] == wanted_login_status

    required_csp = expected.get("csp_contains")
    if required_csp:
        assert required_csp in (dangerous_page["contentSecurityPolicy"] or "")
    else:
        assert dangerous_page["contentSecurityPolicy"] is None

    # --- No-JS mode: the script must never have run ----------------------
    if mode == "safe-onedomain-nojsreplay":
        assert fetches == {}
        assert "Dangerous Replay Fixture" in page_state["bodyText"]
        assert any("Blocked script execution" in text for text in console_texts)
        return

    # --- Every remaining mode runs the script and can read its own snapshot
    own_fetch = fetches["own"]
    assert own_fetch["ok"] is True
    assert own_fetch["status"] == 200
    assert "ATTACKER_SECRET" in own_fetch["sample"]

    # --- Subdomain isolation: only the ok flags are compared -------------
    if mode == "safe-subdomains-fullreplay":
        for label in ("victim", "admin", "api"):
            assert fetches[label]["ok"] is expected[f"{label}_ok"]
        assert any("CORS policy" in text for text in console_texts)
        return

    # --- Single-origin modes: victim data is readable --------------------
    victim_fetch = fetches["victim"]
    assert victim_fetch["ok"] is expected["victim_ok"]
    assert victim_fetch["status"] == expected["victim_status"]
    assert "VICTIM_SECRET" in victim_fetch["sample"]

    for label in ("admin", "api"):
        assert fetches[label]["ok"] is expected[f"{label}_ok"]
        assert fetches[label]["status"] == expected[f"{label}_status"]

    if mode == "unsafe-onedomain-noadmin":
        for label in ("admin", "api"):
            assert "control plane disabled" in fetches[label]["sample"].lower()
    elif mode == "danger-onedomain-fullreplay":
        assert "ArchiveBox" in fetches["admin"]["sample"]
        assert "swagger" in fetches["api"]["sample"].lower()