Files
ArchiveBox/archivebox/config/common.py
Nick Sweeting b749b26c5d wip
2026-03-23 03:58:32 -07:00

316 lines
11 KiB
Python

__package__ = "archivebox.config"
import re
import secrets
import sys
import shutil
from typing import ClassVar
from pathlib import Path
from rich.console import Console
from pydantic import Field, field_validator
from archivebox.config.configset import BaseConfigSet
from .constants import CONSTANTS
from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION
from .permissions import IN_DOCKER
###################### Config ##########################
# Shared rich consoles, created once at import: one for stdout, one for stderr.
_STDOUT_CONSOLE = Console()
_STDERR_CONSOLE = Console(stderr=True)


def rprint(*args, file=None, **kwargs):
    """Print via rich, using the stderr console only when ``file`` is ``sys.stderr``.

    Any other ``file`` value (including the default ``None``) selects the
    stdout console. All remaining positional and keyword arguments are
    forwarded unchanged to ``Console.print``.
    """
    if file is sys.stderr:
        target = _STDERR_CONSOLE
    else:
        target = _STDOUT_CONSOLE
    target.print(*args, **kwargs)
class ShellConfig(BaseConfigSet):
    """Shell/terminal related settings whose defaults are probed from the running process."""

    toml_section_header: str = "SHELL_CONFIG"

    # defaults to True when a literal "--debug" argument is present in sys.argv
    DEBUG: bool = Field(default="--debug" in sys.argv)

    # the next three defaults are each taken from sys.stdout.isatty(),
    # evaluated once when this class body is executed
    IS_TTY: bool = Field(default=sys.stdout.isatty())
    USE_COLOR: bool = Field(default=sys.stdout.isatty())
    SHOW_PROGRESS: bool = Field(default=sys.stdout.isatty())

    IN_DOCKER: bool = Field(default=IN_DOCKER)
    IN_QEMU: bool = Field(default=False)

    # color table: disabled palette when stdout is not a TTY, default palette otherwise
    # (re-checked each time the factory runs, rather than at class-definition time)
    ANSI: dict[str, str] = Field(
        default_factory=lambda: CONSTANTS.DISABLED_CLI_COLORS if not sys.stdout.isatty() else CONSTANTS.DEFAULT_CLI_COLORS,
    )

    @property
    def TERM_WIDTH(self) -> int:
        """Width in columns: the detected terminal width on a TTY, otherwise a fixed 200."""
        if self.IS_TTY:
            # (140, 10) is the fallback size passed to shutil for when detection fails
            return shutil.get_terminal_size((140, 10)).columns
        return 200

    @property
    def COMMIT_HASH(self) -> str | None:
        """Value returned by ``get_COMMIT_HASH()`` (may be ``None``)."""
        return get_COMMIT_HASH()

    @property
    def BUILD_TIME(self) -> str:
        """Value returned by ``get_BUILD_TIME()``."""
        return get_BUILD_TIME()


SHELL_CONFIG = ShellConfig()
class StorageConfig(BaseConfigSet):
    """Filesystem locations and file-permission settings."""

    toml_section_header: str = "STORAGE_CONFIG"

    # TMP_DIR requirements:
    #   - local, fast, and readable/writable by the archivebox user
    #   - a short path, because unix socket file paths are length-restricted (<100 chars)
    #   - on a local SSD/tmpfs, both for speed and because bind mounts, network
    #     mounts, and FUSE don't support unix sockets
    TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)

    # LIB_DIR requirements:
    #   - local, fast, and readable/writable by the archivebox user
    #   - able to hold executable binaries (up to 5GB in size)
    #   - should not be a remote/network/FUSE mount, otherwise extractors will be slow
    LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)

    # LIB_BIN_DIR is where every installed binary is symlinked for easy PATH management.
    # Derived from LIB_DIR / 'bin'; should be prepended to PATH for all hook executions.
    LIB_BIN_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_BIN_DIR)

    # CUSTOM_TEMPLATES_DIR lets users override the default templates;
    # defaults to DATA_DIR / 'user_templates' but can be configured.
    CUSTOM_TEMPLATES_DIR: Path = Field(default=CONSTANTS.CUSTOM_TEMPLATES_DIR)

    OUTPUT_PERMISSIONS: str = Field(default="644")
    RESTRICT_FILE_NAMES: str = Field(default="windows")
    ENFORCE_ATOMIC_WRITES: bool = Field(default=True)

    # Not supposed to be user settable. Described as computed from OUTPUT_PERMISSIONS;
    # NOTE(review): only a static "755" default is declared here - confirm where the derivation happens.
    DIR_OUTPUT_PERMISSIONS: str = Field(default="755")


STORAGE_CONFIG = StorageConfig()
class GeneralConfig(BaseConfigSet):
    """General-purpose settings that do not belong to a more specific section."""

    toml_section_header: str = "GENERAL_CONFIG"

    # regex source string; the default character class matches a single comma
    TAG_SEPARATOR_PATTERN: str = Field(default=r"[,]")


GENERAL_CONFIG = GeneralConfig()
class ServerConfig(BaseConfigSet):
    """Web server settings, including the security mode that the helper properties below read."""

    toml_section_header: str = "SERVER_CONFIG"

    # Closed set of values accepted for SERVER_SECURITY_MODE (ClassVar, so not a config field).
    SERVER_SECURITY_MODES: ClassVar[tuple[str, ...]] = (
        "safe-subdomains-fullreplay",
        "safe-onedomain-nojsreplay",
        "unsafe-onedomain-noadmin",
        "danger-onedomain-fullreplay",
    )

    # Default is a fresh 50-character random key drawn with the `secrets` module.
    SECRET_KEY: str = Field(default_factory=lambda: "".join(secrets.choice("abcdefghijklmnopqrstuvwxyz0123456789_") for _ in range(50)))
    BIND_ADDR: str = Field(default="127.0.0.1:8000")
    LISTEN_HOST: str = Field(default="archivebox.localhost:8000")
    ADMIN_BASE_URL: str = Field(default="")
    ARCHIVE_BASE_URL: str = Field(default="")
    ALLOWED_HOSTS: str = Field(default="*")
    CSRF_TRUSTED_ORIGINS: str = Field(default="http://admin.archivebox.localhost:8000")
    SERVER_SECURITY_MODE: str = Field(default="safe-subdomains-fullreplay")

    SNAPSHOTS_PER_PAGE: int = Field(default=40)
    PREVIEW_ORIGINALS: bool = Field(default=True)
    FOOTER_INFO: str = Field(
        default="Content is hosted for personal archiving purposes only.  Contact server owner for any takedown requests.",
    )
    # NOTE: CUSTOM_TEMPLATES_DIR used to be declared here; it is no longer a ServerConfig field.

    PUBLIC_INDEX: bool = Field(default=True)
    PUBLIC_SNAPSHOTS: bool = Field(default=True)
    PUBLIC_ADD_VIEW: bool = Field(default=False)

    ADMIN_USERNAME: str | None = Field(default=None)
    ADMIN_PASSWORD: str | None = Field(default=None)

    REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User")
    REVERSE_PROXY_WHITELIST: str = Field(default="")
    LOGOUT_REDIRECT_URL: str = Field(default="/")

    @field_validator("SERVER_SECURITY_MODE", mode="after")
    @classmethod
    def validate_server_security_mode(cls, v: str) -> str:
        """Normalize SERVER_SECURITY_MODE and reject values outside SERVER_SECURITY_MODES.

        The value is stripped of surrounding whitespace and lowercased before
        the membership check; the normalized value is what gets stored.

        Raises:
            ValueError: if the normalized value is not one of SERVER_SECURITY_MODES.
        """
        mode = (v or "").strip().lower()
        if mode not in cls.SERVER_SECURITY_MODES:
            raise ValueError(f"SERVER_SECURITY_MODE must be one of: {', '.join(cls.SERVER_SECURITY_MODES)}")
        return mode

    @property
    def USES_SUBDOMAIN_ROUTING(self) -> bool:
        """True only in "safe-subdomains-fullreplay" mode."""
        return self.SERVER_SECURITY_MODE == "safe-subdomains-fullreplay"

    @property
    def ENABLES_FULL_JS_REPLAY(self) -> bool:
        """True in every mode except "safe-onedomain-nojsreplay"."""
        return self.SERVER_SECURITY_MODE in (
            "safe-subdomains-fullreplay",
            "unsafe-onedomain-noadmin",
            "danger-onedomain-fullreplay",
        )

    @property
    def CONTROL_PLANE_ENABLED(self) -> bool:
        """False only in "unsafe-onedomain-noadmin" mode."""
        return self.SERVER_SECURITY_MODE != "unsafe-onedomain-noadmin"

    @property
    def BLOCK_UNSAFE_METHODS(self) -> bool:
        """True only in "unsafe-onedomain-noadmin" mode."""
        return self.SERVER_SECURITY_MODE == "unsafe-onedomain-noadmin"

    @property
    def SHOULD_NEUTER_RISKY_REPLAY(self) -> bool:
        """True only in "safe-onedomain-nojsreplay" mode."""
        return self.SERVER_SECURITY_MODE == "safe-onedomain-nojsreplay"

    @property
    def IS_UNSAFE_MODE(self) -> bool:
        """True only in "unsafe-onedomain-noadmin" mode."""
        return self.SERVER_SECURITY_MODE == "unsafe-onedomain-noadmin"

    @property
    def IS_DANGEROUS_MODE(self) -> bool:
        """True only in "danger-onedomain-fullreplay" mode."""
        return self.SERVER_SECURITY_MODE == "danger-onedomain-fullreplay"

    @property
    def IS_LOWER_SECURITY_MODE(self) -> bool:
        """True in either the "unsafe-onedomain-noadmin" or "danger-onedomain-fullreplay" mode."""
        return self.SERVER_SECURITY_MODE in (
            "unsafe-onedomain-noadmin",
            "danger-onedomain-fullreplay",
        )


SERVER_CONFIG = ServerConfig()
def _print_server_security_mode_warning() -> None:
    """Print a multi-line yellow warning to stderr when a lower-security server mode is active.

    Does nothing unless ``SERVER_CONFIG.IS_LOWER_SECURITY_MODE`` is true.
    Each line is emitted with its own ``rprint`` call, in order.
    """
    if not SERVER_CONFIG.IS_LOWER_SECURITY_MODE:
        return

    warning_lines = (
        f"[yellow][!] WARNING: ArchiveBox is running with SERVER_SECURITY_MODE={SERVER_CONFIG.SERVER_SECURITY_MODE}[/yellow]",
        "[yellow] Archived pages may share an origin with privileged app routes in this mode.[/yellow]",
        "[yellow] To switch to the safer isolated setup:[/yellow]",
        "[yellow] 1. Set SERVER_SECURITY_MODE=safe-subdomains-fullreplay[/yellow]",
        "[yellow] 2. Point *.archivebox.localhost (or your chosen base domain) at this server[/yellow]",
        "[yellow] 3. Configure wildcard DNS/TLS or your reverse proxy so admin., web., api., and snapshot subdomains resolve[/yellow]",
    )
    for line in warning_lines:
        rprint(line, file=sys.stderr)


# runs once at import time, right after SERVER_CONFIG has been built
_print_server_security_mode_warning()
class ArchivingConfig(BaseConfigSet):
    """Settings controlling how URLs are fetched and which URLs/methods are allowed or denied."""

    toml_section_header: str = "ARCHIVING_CONFIG"

    ONLY_NEW: bool = Field(default=True)
    OVERWRITE: bool = Field(default=False)

    # seconds; warn_if_invalid() complains when this is below 5
    TIMEOUT: int = Field(default=60)
    MAX_URL_ATTEMPTS: int = Field(default=50)

    RESOLUTION: str = Field(default="1440,2000")
    CHECK_SSL_VALIDITY: bool = Field(default=True)
    USER_AGENT: str = Field(
        default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)",
    )
    COOKIES_FILE: Path | None = Field(default=None)

    # regex source strings; the aliases keep the older *_BLACKLIST / *_WHITELIST names accepted
    URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", alias="URL_BLACKLIST")
    URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST")

    # mapping of regex patterns to list of archive methods
    SAVE_ALLOWLIST: dict[str, list[str]] = Field(default={})
    SAVE_DENYLIST: dict[str, list[str]] = Field(default={})

    DEFAULT_PERSONA: str = Field(default="Default")

    def warn_if_invalid(self) -> None:
        """Print a red warning block to stderr when TIMEOUT is below 5 seconds; otherwise do nothing."""
        if int(self.TIMEOUT) >= 5:
            return

        rprint(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
        rprint(" You must allow *at least* 5 seconds for indexing and archive methods to run successfully.", file=sys.stderr)
        rprint(" (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr)
        rprint(file=sys.stderr)
        rprint(" If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr)
        rprint(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles", file=sys.stderr)
        rprint(file=sys.stderr)

    @field_validator("CHECK_SSL_VALIDITY", mode="after")
    @classmethod
    def validate_check_ssl_validity(cls, v):
        """Return ``v`` unchanged.

        SIDE EFFECT: when ``v`` is falsy, urllib3's ``InsecureRequestWarning``
        is disabled process-wide, silencing the "you really shouldn't disable
        SSL verification" warnings.
        """
        if not v:
            # imported lazily so urllib3 is only loaded when SSL checks are turned off
            import urllib3

            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        return v

    @property
    def URL_ALLOWLIST_PTN(self) -> re.Pattern | None:
        """URL_ALLOWLIST compiled with the allow/deny regex flags, or ``None`` when it is unset/empty."""
        if not self.URL_ALLOWLIST:
            return None
        return re.compile(self.URL_ALLOWLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)

    @property
    def URL_DENYLIST_PTN(self) -> re.Pattern:
        """URL_DENYLIST compiled with the allow/deny regex flags."""
        return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)

    @property
    def SAVE_ALLOWLIST_PTNS(self) -> dict[re.Pattern, list[str]]:
        """SAVE_ALLOWLIST with each key compiled to a regex; ``{}`` when the setting is empty."""
        # `or {}` keeps the empty-result behavior for any falsy value without a separate branch
        return {
            re.compile(pattern, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): methods
            for pattern, methods in (self.SAVE_ALLOWLIST or {}).items()
        }

    @property
    def SAVE_DENYLIST_PTNS(self) -> dict[re.Pattern, list[str]]:
        """SAVE_DENYLIST with each key compiled to a regex; ``{}`` when the setting is empty."""
        # `or {}` keeps the empty-result behavior for any falsy value without a separate branch
        return {
            re.compile(pattern, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): methods
            for pattern, methods in (self.SAVE_DENYLIST or {}).items()
        }


ARCHIVING_CONFIG = ArchivingConfig()
# emits the low-TIMEOUT warning (if applicable) once, at import time
ARCHIVING_CONFIG.warn_if_invalid()
class SearchBackendConfig(BaseConfigSet):
    """Settings for the search backend: indexing/searching toggles and engine selection."""

    toml_section_header: str = "SEARCH_BACKEND_CONFIG"

    # independent on/off switches for the indexing side and the searching side
    USE_INDEXING_BACKEND: bool = Field(default=True)
    USE_SEARCHING_BACKEND: bool = Field(default=True)

    # name of the engine to use; defaults to "ripgrep"
    SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep")
    SEARCH_PROCESS_HTML: bool = Field(default=True)


SEARCH_BACKEND_CONFIG = SearchBackendConfig()