Add configurable server security modes (#1773)

Fixes https://github.com/ArchiveBox/ArchiveBox/issues/239

## Summary
- add `SERVER_SECURITY_MODE` presets for safe subdomain replay, safe
one-domain no-JS replay, unsafe one-domain no-admin, and dangerous
one-domain full replay
- make host routing, replay URLs, static serving, and control-plane
access mode-aware
- add strict routing/header coverage plus a browser-backed
Chrome/Puppeteer test that verifies real same-origin behavior in all
four modes

## Testing
- `uv run pytest archivebox/tests/test_urls.py -v`
- `uv run pytest archivebox/tests/test_admin_views.py -v`
- `uv run pytest archivebox/tests/test_server_security_browser.py -v`

<!-- devin-review-badge-begin -->

---

<a href="https://app.devin.ai/review/archivebox/archivebox/pull/1773"
target="_blank">
  <picture>
<source media="(prefers-color-scheme: dark)"
srcset="https://static.devin.ai/assets/gh-open-in-devin-review-dark.svg?v=1">
<img
src="https://static.devin.ai/assets/gh-open-in-devin-review-light.svg?v=1"
alt="Open with Devin">
  </picture>
</a>
<!-- devin-review-badge-end -->


<!-- This is an auto-generated description by cubic. -->
---
## Summary by cubic
Adds configurable server security modes to isolate admin/API from
archived content, with a safe subdomain default and single-domain
fallbacks. Routing, replay endpoints, headers, and middleware are
mode-aware, with browser tests validating same-origin behavior.

- New Features
  - Introduced `SERVER_SECURITY_MODE` with presets:
    `safe-subdomains-fullreplay` (default), `safe-onedomain-nojsreplay`,
    `unsafe-onedomain-noadmin`, `danger-onedomain-fullreplay`.
  - Mode-aware routing and base URLs; one-domain modes use path-based
    replay: `/snapshot/<id>/...` and `/original/<domain>/...`.
  - Control plane gate: block admin/API and non-GET methods in
    `unsafe-onedomain-noadmin`; allow full access in
    `danger-onedomain-fullreplay`.
  - Safer replay: detect risky HTML/SVG and apply CSP sandbox (no scripts)
    in `safe-onedomain-nojsreplay`; add `X-ArchiveBox-Security-Mode` and
    `X-Content-Type-Options: nosniff` on replay responses.
  - Middleware and serving: added `ServerSecurityModeMiddleware`, improved
    `HostRouting`, and static server byte-range/CSP handling.
  - Tests: added Chrome/Puppeteer browser tests and stricter URL routing
    tests covering all modes.

- Migration
  - Default requires wildcard subdomains for full isolation (`admin.`, `web.`,
    `api.`, and `snapshot-id.<base>`).
  - To run on one domain, set `SERVER_SECURITY_MODE` to a one-domain preset;
    URLs switch to `/snapshot/<id>/` and `/original/<domain>/` paths.
  - For production, prefer `safe-subdomains-fullreplay`; lower-security
    modes print a startup warning.

<sup>Written for commit ad41b15581.
Summary will update on new commits.</sup>

<!-- End of auto-generated description by cubic. -->
This commit is contained in:
Nick Sweeting
2026-03-22 20:17:21 -07:00
committed by GitHub
9 changed files with 1139 additions and 82 deletions

View File

@@ -55,6 +55,8 @@ def _build_listen_host(subdomain: str | None) -> str:
def get_admin_host() -> str:
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
return get_listen_host().lower()
override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL)
if override:
return urlparse(override).netloc.lower()
@@ -62,23 +64,33 @@ def get_admin_host() -> str:
def get_web_host() -> str:
    """Return the host that serves the web UI.

    Without subdomain routing this is the lowercased listen host. With
    subdomain routing, the lowercased netloc of ``ARCHIVE_BASE_URL`` is used
    when that setting normalizes to a non-empty value; otherwise the ``web``
    subdomain of the listen host is built.
    """
    if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
        archive_override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL)
        if not archive_override:
            return _build_listen_host("web")
        return urlparse(archive_override).netloc.lower()
    return get_listen_host().lower()
def get_api_host() -> str:
    """Return the host that serves the API.

    With subdomain routing this is the ``api`` subdomain of the listen host;
    otherwise it is the lowercased listen host itself.
    """
    if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
        return _build_listen_host("api")
    return get_listen_host().lower()
def get_public_host() -> str:
    """Return the host that serves the public index.

    With subdomain routing this is the ``public`` subdomain of the listen
    host; otherwise it is the lowercased listen host itself.
    """
    if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
        return _build_listen_host("public")
    return get_listen_host().lower()
def get_snapshot_host(snapshot_id: str) -> str:
    """Return the host on which the given snapshot is replayed.

    With subdomain routing the snapshot id is used as the subdomain label of
    the listen host; otherwise every snapshot shares the lowercased listen
    host.
    """
    if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
        return _build_listen_host(snapshot_id)
    return get_listen_host().lower()
def get_original_host(domain: str) -> str:
    """Return the host on which content for an original domain is replayed.

    With subdomain routing the original domain is used as the subdomain part
    of the listen host; otherwise the lowercased listen host is returned.
    """
    if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
        return _build_listen_host(domain)
    return get_listen_host().lower()
@@ -87,6 +99,8 @@ def is_snapshot_subdomain(subdomain: str) -> bool:
def get_listen_subdomain(request_host: str) -> str:
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
return ""
req_host, req_port = split_host_port(request_host)
listen_host, listen_port = get_listen_parts()
if not listen_host:
@@ -127,6 +141,8 @@ def _build_base_url_for_host(host: str, request=None) -> str:
def get_admin_base_url(request=None) -> str:
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
return _build_base_url_for_host(get_listen_host(), request=request)
override = _normalize_base_url(SERVER_CONFIG.ADMIN_BASE_URL)
if override:
return override
@@ -134,12 +150,16 @@ def get_admin_base_url(request=None) -> str:
def get_web_base_url(request=None) -> str:
    """Return the base URL of the web UI.

    Without subdomain routing the URL is built from the listen host. With
    subdomain routing, a non-empty normalized ``ARCHIVE_BASE_URL`` is returned
    as-is; otherwise the URL is built from ``get_web_host()``.
    """
    if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
        archive_override = _normalize_base_url(SERVER_CONFIG.ARCHIVE_BASE_URL)
        if archive_override:
            return archive_override
        web_host = get_web_host()
    else:
        web_host = get_listen_host()
    return _build_base_url_for_host(web_host, request=request)
def get_api_base_url(request=None) -> str:
    """Return the base URL of the API.

    The URL is built from ``get_api_host()`` when subdomain routing is in
    use, and from the listen host otherwise.
    """
    if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
        target_host = get_api_host()
    else:
        target_host = get_listen_host()
    return _build_base_url_for_host(target_host, request=request)
@@ -149,10 +169,14 @@ def get_archive_base_url(request=None) -> str:
def get_snapshot_base_url(snapshot_id: str, request=None) -> str:
    """Return the base URL under which the given snapshot is replayed.

    With subdomain routing the URL is built from the snapshot's own host.
    Otherwise the snapshot lives at the ``/snapshot/<snapshot_id>`` path under
    the web base URL.
    """
    if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
        snapshot_host = get_snapshot_host(snapshot_id)
        return _build_base_url_for_host(snapshot_host, request=request)
    web_base = get_web_base_url(request=request)
    return _build_url(web_base, f"/snapshot/{snapshot_id}")
def get_original_base_url(domain: str, request=None) -> str:
    """Return the base URL under which an original domain is replayed.

    With subdomain routing the URL is built from the domain's own replay
    host. Otherwise the content lives at the ``/original/<domain>`` path
    under the web base URL.
    """
    if SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
        original_host = get_original_host(domain)
        return _build_base_url_for_host(original_host, request=request)
    web_base = get_web_base_url(request=request)
    return _build_url(web_base, f"/original/{domain}")

View File

@@ -10,7 +10,7 @@ from django.core.exceptions import ImproperlyConfigured
from django.shortcuts import redirect
from django.contrib.staticfiles import finders
from django.utils.http import http_date
from django.http import HttpResponseNotModified
from django.http import HttpResponseForbidden, HttpResponseNotModified
from archivebox.config.common import SERVER_CONFIG
from archivebox.config import VERSION
@@ -26,6 +26,7 @@ from archivebox.core.host_utils import (
get_web_host,
host_matches,
is_snapshot_subdomain,
split_host_port,
)
from archivebox.core.views import SnapshotHostView, OriginalDomainHostView
@@ -90,6 +91,29 @@ def CacheControlMiddleware(get_response):
return middleware
def ServerSecurityModeMiddleware(get_response):
    """Build a middleware that locks down requests when the control plane is off.

    While ``SERVER_CONFIG.CONTROL_PLANE_ENABLED`` is truthy, requests pass
    through untouched. Otherwise each request is forced to an anonymous user,
    and a 403 is returned for any method other than GET/HEAD/OPTIONS and for
    any path equal to, or nested under, one of the control-plane prefixes.
    """
    forbidden_message = "ArchiveBox is running with the control plane disabled in this security mode."
    control_plane_roots = ("/admin", "/accounts", "/api", "/add", "/web")
    # Matching "<root>/" keeps unrelated paths such as "/additional" allowed.
    control_plane_subtrees = tuple(f"{root}/" for root in control_plane_roots)
    read_only_methods = frozenset({"GET", "HEAD", "OPTIONS"})

    def middleware(request):
        if not SERVER_CONFIG.CONTROL_PLANE_ENABLED:
            # Overwrite whatever identity earlier middleware attached.
            anonymous = AnonymousUser()
            request.user = anonymous
            request._cached_user = anonymous

            if request.method.upper() not in read_only_methods:
                return HttpResponseForbidden(forbidden_message)

            requested_path = request.path
            if requested_path in control_plane_roots or requested_path.startswith(control_plane_subtrees):
                return HttpResponseForbidden(forbidden_message)

        return get_response(request)

    return middleware
def HostRoutingMiddleware(get_response):
def middleware(request):
request_host = (request.get_host() or "").lower()
@@ -100,6 +124,21 @@ def HostRoutingMiddleware(get_response):
listen_host = get_listen_host()
subdomain = get_listen_subdomain(request_host)
if not SERVER_CONFIG.USES_SUBDOMAIN_ROUTING:
if host_matches(request_host, listen_host):
return get_response(request)
req_host, req_port = split_host_port(request_host)
listen_host_only, listen_port = split_host_port(listen_host)
if req_host.endswith(f".{listen_host_only}"):
if not listen_port or not req_port or listen_port == req_port:
target = build_web_url(request.path, request=request)
if request.META.get("QUERY_STRING"):
target = f"{target}?{request.META['QUERY_STRING']}"
return redirect(target)
return get_response(request)
if host_matches(request_host, admin_host):
return get_response(request)

View File

@@ -86,6 +86,7 @@ MIDDLEWARE = [
"django.middleware.csrf.CsrfViewMiddleware",
"django.contrib.auth.middleware.AuthenticationMiddleware",
"archivebox.core.middleware.ReverseProxyAuthMiddleware",
"archivebox.core.middleware.ServerSecurityModeMiddleware",
"archivebox.core.middleware.HostRoutingMiddleware",
"django.contrib.messages.middleware.MessageMiddleware",
"archivebox.core.middleware.CacheControlMiddleware",

View File

@@ -9,7 +9,7 @@ from django.http import HttpRequest
from archivebox.misc.serve_static import serve_static
from archivebox.core.admin_site import archivebox_admin
from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, PublicIndexView, AddView, WebAddView, HealthCheckView, live_progress_view
from archivebox.core.views import HomepageView, SnapshotView, SnapshotPathView, SnapshotReplayView, OriginalDomainReplayView, PublicIndexView, AddView, WebAddView, HealthCheckView, live_progress_view
# GLOBAL_CONTEXT doesn't work as-is, disabled for now: https://github.com/ArchiveBox/ArchiveBox/discussions/1306
@@ -33,6 +33,8 @@ urlpatterns = [
path('archive/', RedirectView.as_view(url='/')),
path('archive/<path:path>', SnapshotView.as_view(), name='Snapshot'),
re_path(r'^snapshot\/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:\/(?P<path>.*))?$', SnapshotReplayView.as_view(), name='snapshot-replay'),
re_path(r'^original\/(?P<domain>[^/]+)(?:\/(?P<path>.*))?$', OriginalDomainReplayView.as_view(), name='original-replay'),
re_path(r'^web/(?P<url>(?!\d{4}(?:\d{2})?(?:\d{2})?(?:/|$)).+)$', WebAddView.as_view(), name='web-add'),
re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<url>https?://.*)$', SnapshotPathView.as_view(), name='snapshot-path-url'),
re_path(r'^(?P<username>[^/]+)/(?P<date>\d{4}(?:\d{2})?(?:\d{2})?)/(?P<domain>[^/]+)(?:/(?P<snapshot_id>[0-9a-fA-F-]{8,36})(?:/(?P<path>.*))?)?$', SnapshotPathView.as_view(), name='snapshot-path'),

View File

@@ -52,15 +52,21 @@ def _files_index_target(snapshot: Snapshot, archivefile: str | None) -> str:
return target
def _admin_login_redirect_or_forbidden(request: HttpRequest):
    """Return the response for a request that is not allowed to see a page.

    When the control plane is enabled, redirect to the admin login page with
    the current path as the ``next`` target. When it is disabled, return a
    403 response instead.

    The path is percent-encoded before being placed in the query string, so
    that characters such as ``&``, ``#``, ``+``, ``%`` or spaces in
    ``request.path`` cannot truncate or corrupt the ``next`` value. Paths
    made only of unreserved characters and ``/`` produce the same URL as
    before.
    """
    from urllib.parse import quote

    if not SERVER_CONFIG.CONTROL_PLANE_ENABLED:
        return HttpResponseForbidden("ArchiveBox is running with the control plane disabled in this security mode.")
    return redirect(f'/admin/login/?next={quote(request.path)}')
class HomepageView(View):
def get(self, request):
if request.user.is_authenticated:
if request.user.is_authenticated and SERVER_CONFIG.CONTROL_PLANE_ENABLED:
return redirect('/admin/core/snapshot/')
if SERVER_CONFIG.PUBLIC_INDEX:
return redirect('/public')
return redirect(f'/admin/login/?next={request.path}')
return _admin_login_redirect_or_forbidden(request)
class SnapshotView(View):
@@ -277,7 +283,7 @@ class SnapshotView(View):
def get(self, request, path):
if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
return redirect(f'/admin/login/?next={request.path}')
return _admin_login_redirect_or_forbidden(request)
snapshot = None
@@ -308,7 +314,7 @@ class SnapshotView(View):
if request.GET.get('files'):
target_path = _files_index_target(snapshot, archivefile)
response = serve_static_with_byterange_support(
request, target_path, document_root=snapshot.output_dir, show_indexes=True,
request, target_path, document_root=snapshot.output_dir, show_indexes=True, is_archive_replay=True,
)
elif archivefile == 'index.html':
# if they requested snapshot index, serve live rendered template instead of static html
@@ -474,7 +480,7 @@ class SnapshotPathView(View):
def get(self, request, username: str, date: str | None = None, domain: str | None = None, snapshot_id: str | None = None, path: str = "", url: str | None = None):
if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
return redirect(f'/admin/login/?next={request.path}')
return _admin_login_redirect_or_forbidden(request)
if username == 'system':
return redirect(request.path.replace('/system/', '/web/', 1))
@@ -573,14 +579,14 @@ class SnapshotPathView(View):
if request.GET.get('files'):
target_path = _files_index_target(snapshot, archivefile)
return serve_static_with_byterange_support(
request, target_path, document_root=snapshot.output_dir, show_indexes=True,
request, target_path, document_root=snapshot.output_dir, show_indexes=True, is_archive_replay=True,
)
if archivefile == "index.html":
return SnapshotView.render_live_index(request, snapshot)
return serve_static_with_byterange_support(
request, archivefile, document_root=snapshot.output_dir, show_indexes=True,
request, archivefile, document_root=snapshot.output_dir, show_indexes=True, is_archive_replay=True,
)
@@ -670,6 +676,7 @@ def _serve_responses_path(request, responses_root: Path, rel_path: str, show_ind
candidate,
document_root=str(responses_root),
show_indexes=show_indexes,
is_archive_replay=True,
)
except Http404:
pass
@@ -682,18 +689,85 @@ def _serve_responses_path(request, responses_root: Path, rel_path: str, show_ind
rel_dir,
document_root=str(responses_root),
show_indexes=True,
is_archive_replay=True,
)
except Http404:
return None
return None
def _serve_snapshot_replay(request: HttpRequest, snapshot: Snapshot, path: str = ""):
    """Serve a file or listing from a snapshot's output directory.

    An empty path or one ending in ``/`` resolves to ``index.html`` inside it,
    unless the ``files`` query parameter is set, in which case the trailing
    slash is stripped and directory indexes are enabled. If the snapshot
    directory does not contain the path, ``responses/<host>/`` inside the
    snapshot directory is tried as a fallback.

    Raises:
        Http404: if the path is rejected by ``_safe_archive_relpath`` or
            neither location can serve it.
    """
    want_listing = bool(request.GET.get("files"))
    requested = path or ""
    if not requested or requested.endswith("/"):
        requested = requested.rstrip("/") if want_listing else f"{requested}index.html"

    safe_path = _safe_archive_relpath(requested)
    if safe_path is None:
        raise Http404

    try:
        return serve_static_with_byterange_support(
            request,
            safe_path,
            document_root=snapshot.output_dir,
            show_indexes=want_listing,
            is_archive_replay=True,
        )
    except Http404:
        # Not in the snapshot root; try the saved responses below.
        pass

    original_host = urlparse(snapshot.url).hostname or snapshot.domain
    fallback_root = Path(snapshot.output_dir) / "responses" / original_host
    if fallback_root.exists():
        fallback = _serve_responses_path(request, fallback_root, safe_path, want_listing)
        if fallback is not None:
            return fallback

    raise Http404
def _serve_original_domain_replay(request: HttpRequest, domain: str, path: str = ""):
    """Serve saved response content for an original domain.

    An empty path or one ending in ``/`` resolves to ``index.html`` inside it.
    The lowercased domain and path are looked up with
    ``_latest_response_match``. If the final path component has no ``.`` and
    nothing matched, ``<path>/index.html`` and then ``<path>.html`` are tried.
    When no match can be served, the root returned by
    ``_latest_responses_root`` is tried with the plain path.

    Raises:
        Http404: if the path is rejected by ``_safe_archive_relpath`` or
            nothing can be served.
    """
    requested = path or ""
    if not requested or requested.endswith("/"):
        requested = f"{requested}index.html"

    safe_path = _safe_archive_relpath(requested)
    if safe_path is None:
        raise Http404

    domain = domain.lower()
    found = _latest_response_match(domain, safe_path)
    if not found and "." not in Path(safe_path).name:
        # Extension-less path: try it as a directory, then as an .html page.
        for candidate in (f"{safe_path.rstrip('/')}/index.html", f"{safe_path}.html"):
            found = _latest_response_match(domain, candidate)
            if found:
                break

    want_listing = bool(request.GET.get("files"))
    if found:
        matched_root, matched_rel = found
        served = _serve_responses_path(request, matched_root, str(matched_rel), want_listing)
        if served is not None:
            return served

    latest_root = _latest_responses_root(domain)
    if latest_root:
        served = _serve_responses_path(request, latest_root, safe_path, want_listing)
        if served is not None:
            return served

    raise Http404
class SnapshotHostView(View):
"""Serve snapshot directory contents on <snapshot_id>.<listen_host>/<path>."""
def get(self, request, snapshot_id: str, path: str = ""):
if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
return HttpResponseForbidden("Public snapshots are disabled.")
return _admin_login_redirect_or_forbidden(request)
snapshot = None
if snapshot_id:
try:
@@ -708,37 +782,30 @@ class SnapshotHostView(View):
if not snapshot:
raise Http404
return _serve_snapshot_replay(request, snapshot, path)
rel_path = path or ""
show_indexes = bool(request.GET.get("files"))
if not rel_path or rel_path.endswith("/"):
if show_indexes:
rel_path = rel_path.rstrip("/")
else:
rel_path = f"{rel_path}index.html"
rel_path = _safe_archive_relpath(rel_path)
if rel_path is None:
raise Http404
class SnapshotReplayView(View):
"""Serve snapshot directory contents on a one-domain replay path."""
def get(self, request, snapshot_id: str, path: str = ""):
if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
return _admin_login_redirect_or_forbidden(request)
try:
return serve_static_with_byterange_support(
request,
rel_path,
document_root=snapshot.output_dir,
show_indexes=show_indexes,
)
except Http404:
pass
snapshot = Snapshot.objects.get(pk=snapshot_id)
except Snapshot.DoesNotExist:
try:
snapshot = Snapshot.objects.get(id__startswith=snapshot_id)
except Snapshot.DoesNotExist:
raise Http404
except Snapshot.MultipleObjectsReturned:
snapshot = Snapshot.objects.filter(id__startswith=snapshot_id).first()
# Fallback to responses/<domain>/<path>
host = urlparse(snapshot.url).hostname or snapshot.domain
responses_root = Path(snapshot.output_dir) / "responses" / host
if responses_root.exists():
response = _serve_responses_path(request, responses_root, rel_path, show_indexes)
if response is not None:
return response
if snapshot is None:
raise Http404
raise Http404
return _serve_snapshot_replay(request, snapshot, path)
class OriginalDomainHostView(View):
@@ -746,38 +813,17 @@ class OriginalDomainHostView(View):
def get(self, request, domain: str, path: str = ""):
if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
return HttpResponseForbidden("Public snapshots are disabled.")
rel_path = path or ""
if not rel_path or rel_path.endswith("/"):
rel_path = f"{rel_path}index.html"
rel_path = _safe_archive_relpath(rel_path)
if rel_path is None:
raise Http404
return _admin_login_redirect_or_forbidden(request)
return _serve_original_domain_replay(request, domain, path)
domain = domain.lower()
match = _latest_response_match(domain, rel_path)
if not match and "." not in Path(rel_path).name:
index_path = f"{rel_path.rstrip('/')}/index.html"
match = _latest_response_match(domain, index_path)
if not match and "." not in Path(rel_path).name:
html_path = f"{rel_path}.html"
match = _latest_response_match(domain, html_path)
show_indexes = bool(request.GET.get("files"))
if match:
responses_root, rel_to_root = match
response = _serve_responses_path(request, responses_root, str(rel_to_root), show_indexes)
if response is not None:
return response
class OriginalDomainReplayView(View):
"""Serve original-domain replay content on a one-domain replay path."""
# If no direct match, try serving directory index from latest responses root
responses_root = _latest_responses_root(domain)
if responses_root:
response = _serve_responses_path(request, responses_root, rel_path, show_indexes)
if response is not None:
return response
raise Http404
def get(self, request, domain: str, path: str = ""):
if not request.user.is_authenticated and not SERVER_CONFIG.PUBLIC_SNAPSHOTS:
return _admin_login_redirect_or_forbidden(request)
return _serve_original_domain_replay(request, domain, path)
class PublicIndexView(ListView):
@@ -834,7 +880,7 @@ class PublicIndexView(ListView):
response = super().get(*args, **kwargs)
return response
else:
return redirect(f'/admin/login/?next={self.request.path}')
return _admin_login_redirect_or_forbidden(self.request)
@method_decorator(csrf_exempt, name='dispatch')
class AddView(UserPassesTestMixin, FormView):