Add configurable server security modes (#1773)

Fixes https://github.com/ArchiveBox/ArchiveBox/issues/239

## Summary
- add `SERVER_SECURITY_MODE` presets for safe subdomain replay, safe
one-domain no-JS replay, unsafe one-domain no-admin, and dangerous
one-domain full replay
- make host routing, replay URLs, static serving, and control-plane
access mode-aware
- add strict routing/header coverage plus a browser-backed
Chrome/Puppeteer test that verifies real same-origin behavior in all
four modes

## Testing
- `uv run pytest archivebox/tests/test_urls.py -v`
- `uv run pytest archivebox/tests/test_admin_views.py -v`
- `uv run pytest archivebox/tests/test_server_security_browser.py -v`

<!-- devin-review-badge-begin -->

---

<a href="https://app.devin.ai/review/archivebox/archivebox/pull/1773"
target="_blank">
  <picture>
<source media="(prefers-color-scheme: dark)"
srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1">
<img
src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1"
alt="Open with Devin">
  </picture>
</a>
<!-- devin-review-badge-end -->


<!-- This is an auto-generated description by cubic. -->
---
## Summary by cubic
Adds configurable server security modes to isolate admin/API from
archived content, with a safe subdomain default and single-domain
fallbacks. Routing, replay endpoints, headers, and middleware are
mode-aware, with browser tests validating same-origin behavior.

- New Features
- Introduced SERVER_SECURITY_MODE with presets:
safe-subdomains-fullreplay (default), safe-onedomain-nojsreplay,
unsafe-onedomain-noadmin, danger-onedomain-fullreplay.
- Mode-aware routing and base URLs; one-domain modes use path-based
replay: /snapshot/<id>/... and /original/<domain>/....
- Control plane gate: block admin/API and non-GET methods in
unsafe-onedomain-noadmin; allow full access in
danger-onedomain-fullreplay.
- Safer replay: detect risky HTML/SVG and apply CSP sandbox (no scripts)
in safe-onedomain-nojsreplay; add X-ArchiveBox-Security-Mode and
X-Content-Type-Options: nosniff on replay responses.
- Middleware and serving: added ServerSecurityModeMiddleware, improved
HostRouting, and static server byte-range/CSP handling.
- Tests: added Chrome/Puppeteer browser tests and stricter URL routing
tests covering all modes.

- Migration
- Default requires wildcard subdomains for full isolation (admin., web.,
api., and snapshot-id.<base>).
- To run on one domain, set SERVER_SECURITY_MODE to a one-domain preset;
URLs switch to /snapshot/<id>/ and /original/<domain>/ paths.
- For production, prefer safe-subdomains-fullreplay; lower-security
modes print a startup warning.

<sup>Written for commit ad41b15581.
Summary will update on new commits.</sup>

<!-- End of auto-generated description by cubic. -->
This commit is contained in:
Nick Sweeting
2026-03-22 20:17:21 -07:00
committed by GitHub
9 changed files with 1139 additions and 82 deletions

View File

@@ -81,6 +81,17 @@ MARKDOWN_BOLD_RE = re.compile(r'\*\*([^*]+)\*\*')
MARKDOWN_ITALIC_RE = re.compile(r'(?<!\*)\*([^*]+)\*(?!\*)')
HTML_TAG_RE = re.compile(r'<[A-Za-z][^>]*>')
HTML_BODY_RE = re.compile(r'<body[^>]*>(.*)</body>', flags=re.IGNORECASE | re.DOTALL)
RISKY_REPLAY_MIMETYPES = {
"text/html",
"application/xhtml+xml",
"image/svg+xml",
}
RISKY_REPLAY_EXTENSIONS = {".html", ".htm", ".xhtml", ".svg", ".svgz"}
RISKY_REPLAY_MARKERS = (
"<!doctype html",
"<html",
"<svg",
)
def _extract_markdown_candidate(text: str) -> str:
@@ -278,7 +289,56 @@ def _render_markdown_document(markdown_text: str) -> str:
return wrapped
def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False):
def _content_type_base(content_type: str) -> str:
return (content_type or "").split(";", 1)[0].strip().lower()
def _is_risky_replay_document(fullpath: Path, content_type: str) -> bool:
if fullpath.suffix.lower() in RISKY_REPLAY_EXTENSIONS:
return True
if _content_type_base(content_type) in RISKY_REPLAY_MIMETYPES:
return True
# Unknown archived response paths often have no extension. Sniff a small prefix
# so one-domain no-JS mode still catches HTML/SVG documents.
try:
head = fullpath.read_bytes()[:4096].decode("utf-8", errors="ignore").lower()
except Exception:
return False
return any(marker in head for marker in RISKY_REPLAY_MARKERS)
def _apply_archive_replay_headers(response: HttpResponse, *, fullpath: Path, content_type: str, is_archive_replay: bool) -> HttpResponse:
if not is_archive_replay:
return response
response.headers.setdefault("X-Content-Type-Options", "nosniff")
response.headers.setdefault("X-ArchiveBox-Security-Mode", SERVER_CONFIG.SERVER_SECURITY_MODE)
if SERVER_CONFIG.SHOULD_NEUTER_RISKY_REPLAY and _is_risky_replay_document(fullpath, content_type):
response.headers["Content-Security-Policy"] = (
"sandbox; "
"default-src 'self' data: blob:; "
"script-src 'none'; "
"object-src 'none'; "
"base-uri 'none'; "
"form-action 'none'; "
"connect-src 'none'; "
"worker-src 'none'; "
"frame-ancestors 'self'; "
"style-src 'self' 'unsafe-inline' data: blob:; "
"img-src 'self' data: blob:; "
"media-src 'self' data: blob:; "
"font-src 'self' data: blob:;"
)
response.headers.setdefault("Referrer-Policy", "no-referrer")
return response
def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False, is_archive_replay: bool=False):
"""
Overrides Django's built-in django.views.static.serve function to support byte range requests.
This allows you to do things like seek into the middle of a huge mp4 or WACZ without downloading the whole file.
@@ -289,7 +349,8 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
fullpath = Path(safe_join(document_root, path))
if os.access(fullpath, os.R_OK) and fullpath.is_dir():
if show_indexes:
return static.directory_index(path, fullpath)
response = static.directory_index(path, fullpath)
return _apply_archive_replay_headers(response, fullpath=fullpath, content_type="text/html", is_archive_replay=is_archive_replay)
raise Http404(_("Directory indexes are not allowed here."))
if not os.access(fullpath, os.R_OK):
raise Http404(_("%(path)s” does not exist") % {"path": fullpath})
@@ -312,7 +373,7 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
not_modified.headers["ETag"] = etag
not_modified.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
not_modified.headers["Last-Modified"] = http_date(statobj.st_mtime)
return not_modified
return _apply_archive_replay_headers(not_modified, fullpath=fullpath, content_type="", is_archive_replay=is_archive_replay)
content_type, encoding = mimetypes.guess_type(str(fullpath))
content_type = content_type or "application/octet-stream"
@@ -333,7 +394,7 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
# Respect the If-Modified-Since header for non-markdown responses.
if not (content_type.startswith("text/plain") or content_type.startswith("text/html")):
if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
return HttpResponseNotModified()
return _apply_archive_replay_headers(HttpResponseNotModified(), fullpath=fullpath, content_type=content_type, is_archive_replay=is_archive_replay)
# Heuristic fix: some archived HTML outputs (e.g. mercury content.html)
# are stored with HTML-escaped markup or markdown sources. If so, render sensibly.
@@ -360,7 +421,7 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return response
return _apply_archive_replay_headers(response, fullpath=fullpath, content_type="text/html; charset=utf-8", is_archive_replay=is_archive_replay)
if escaped_count and escaped_count > tag_count * 2:
response = HttpResponse(decoded, content_type=content_type)
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
@@ -372,7 +433,7 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return response
return _apply_archive_replay_headers(response, fullpath=fullpath, content_type=content_type, is_archive_replay=is_archive_replay)
except Exception:
pass
@@ -416,7 +477,7 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
response.status_code = 206
if encoding:
response.headers["Content-Encoding"] = encoding
return response
return _apply_archive_replay_headers(response, fullpath=fullpath, content_type=content_type, is_archive_replay=is_archive_replay)
def serve_static(request, path, **kwargs):