__package__ = "archivebox.config"

# Config sections defined in this module. Each section class is instantiated
# once at import time and exposed as a module-level singleton:
#   SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG, SERVER_CONFIG,
#   ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG

import re
import sys
import shutil

from typing import Dict, Optional, List
from pathlib import Path

from rich import print
from pydantic import Field, field_validator
from django.utils.crypto import get_random_string

from archivebox.config.configset import BaseConfigSet

from .constants import CONSTANTS
from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION
from .permissions import IN_DOCKER

###################### Config ##########################


class ShellConfig(BaseConfigSet):
    """Settings describing the shell/terminal the process is running in.

    The plain ``Field(default=...)`` values below are probed from ``sys.argv``
    and ``sys.stdout.isatty()`` when this module is imported; ``ANSI`` uses a
    ``default_factory`` and is therefore probed when the class is instantiated.
    """

    # presumably the section name used for this group in the TOML config file;
    # the consumer lives in BaseConfigSet, which is not visible here
    toml_section_header: str = "SHELL_CONFIG"

    DEBUG: bool = Field(default="--debug" in sys.argv)

    IS_TTY: bool = Field(default=sys.stdout.isatty())
    USE_COLOR: bool = Field(default=sys.stdout.isatty())
    SHOW_PROGRESS: bool = Field(default=sys.stdout.isatty())

    IN_DOCKER: bool = Field(default=IN_DOCKER)
    IN_QEMU: bool = Field(default=False)

    # color-name -> escape-code mapping; the "disabled" palette is used when stdout is not a TTY
    ANSI: Dict[str, str] = Field(
        default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS
    )

    @property
    def TERM_WIDTH(self) -> int:
        """Return the output width in columns.

        Returns a fixed 200 when ``IS_TTY`` is false; otherwise asks
        ``shutil.get_terminal_size`` with a ``(140, 10)`` fallback.
        """
        if not self.IS_TTY:
            return 200
        return shutil.get_terminal_size((140, 10)).columns

    @property
    def COMMIT_HASH(self) -> Optional[str]:
        """Return whatever ``get_COMMIT_HASH()`` reports (may be ``None``)."""
        return get_COMMIT_HASH()

    @property
    def BUILD_TIME(self) -> str:
        """Return whatever ``get_BUILD_TIME()`` reports."""
        return get_BUILD_TIME()


SHELL_CONFIG = ShellConfig()


class StorageConfig(BaseConfigSet):
    """Filesystem locations and file-permission settings."""

    toml_section_header: str = "STORAGE_CONFIG"

    # TMP_DIR must be a local, fast, readable/writable dir by archivebox user,
    # must be a short path due to unix path length restrictions for socket files (<100 chars)
    # must be a local SSD/tmpfs for speed and because bind mounts/network mounts/FUSE don't support unix sockets
    TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)

    # LIB_DIR must be a local, fast, readable/writable dir by archivebox user,
    # must be able to contain executable binaries (up to 5GB size)
    # should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow
    LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)

    # LIB_BIN_DIR is where all installed binaries are symlinked for easy PATH management
    # Derived from LIB_DIR / 'bin', should be prepended to PATH for all hook executions
    LIB_BIN_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_BIN_DIR)

    # CUSTOM_TEMPLATES_DIR allows users to override default templates
    # defaults to DATA_DIR / 'user_templates' but can be configured
    CUSTOM_TEMPLATES_DIR: Path = Field(default=CONSTANTS.CUSTOM_TEMPLATES_DIR)

    OUTPUT_PERMISSIONS: str = Field(default="644")
    RESTRICT_FILE_NAMES: str = Field(default="windows")
    ENFORCE_ATOMIC_WRITES: bool = Field(default=True)

    # not supposed to be user settable:
    # NOTE(review): described as "computed from OUTPUT_PERMISSIONS", but in this
    # module it is only a static default -- confirm where the computation happens
    DIR_OUTPUT_PERMISSIONS: str = Field(default="755")  # computed from OUTPUT_PERMISSIONS


STORAGE_CONFIG = StorageConfig()


class GeneralConfig(BaseConfigSet):
    """General settings that do not belong to a more specific section."""

    toml_section_header: str = "GENERAL_CONFIG"

    # regex source string; by default it matches a single comma
    TAG_SEPARATOR_PATTERN: str = Field(default=r"[,]")


GENERAL_CONFIG = GeneralConfig()


class ServerConfig(BaseConfigSet):
    """Settings for the web server: binding, hosts, access flags and admin credentials."""

    toml_section_header: str = "SERVER_CONFIG"

    # when no value is supplied, a fresh 50-char key (lowercase letters, digits, "_")
    # is generated each time the class is instantiated
    SECRET_KEY: str = Field(default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_"))
    BIND_ADDR: str = Field(default="127.0.0.1:8000")
    ALLOWED_HOSTS: str = Field(default="*")
    CSRF_TRUSTED_ORIGINS: str = Field(default="http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000")

    SNAPSHOTS_PER_PAGE: int = Field(default=40)
    PREVIEW_ORIGINALS: bool = Field(default=True)
    FOOTER_INFO: str = Field(
        default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests."
    )
    # CUSTOM_TEMPLATES_DIR: Path = Field(default=None) # this is now a constant

    PUBLIC_INDEX: bool = Field(default=True)
    PUBLIC_SNAPSHOTS: bool = Field(default=True)
    PUBLIC_ADD_VIEW: bool = Field(default=False)

    ADMIN_USERNAME: Optional[str] = Field(default=None)
    ADMIN_PASSWORD: Optional[str] = Field(default=None)

    REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User")
    REVERSE_PROXY_WHITELIST: str = Field(default="")
    LOGOUT_REDIRECT_URL: str = Field(default="/")


SERVER_CONFIG = ServerConfig()


def _compile_method_patterns(patterns: Dict[str, List[str]]) -> Dict[re.Pattern, List[str]]:
    """Compile every regex-string key of *patterns*, keeping the values untouched.

    Keys are compiled with ``CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS``.
    A falsy mapping (empty or ``None``) yields an empty dict.
    """
    if not patterns:
        return {}
    return {
        # regexp: methods list
        re.compile(pattern, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): methods
        for pattern, methods in patterns.items()
    }


class ArchivingConfig(BaseConfigSet):
    """Settings controlling how URLs are archived: timeouts, user agent, allow/deny lists."""

    toml_section_header: str = "ARCHIVING_CONFIG"

    ONLY_NEW: bool = Field(default=True)
    OVERWRITE: bool = Field(default=False)

    TIMEOUT: int = Field(default=60)
    MAX_URL_ATTEMPTS: int = Field(default=50)

    RESOLUTION: str = Field(default="1440,2000")
    CHECK_SSL_VALIDITY: bool = Field(default=True)
    USER_AGENT: str = Field(
        default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)"
    )
    COOKIES_FILE: Path | None = Field(default=None)

    # regex source strings; the *_BLACKLIST / *_WHITELIST names are declared as field aliases
    URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", alias="URL_BLACKLIST")
    URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST")

    SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={})  # mapping of regex patterns to list of archive methods
    SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})

    DEFAULT_PERSONA: str = Field(default="Default")

    # NOTE(review): this instance method shares its name with pydantic's deprecated
    # ``BaseModel.validate`` classmethod -- presumably intentional and invoked by
    # BaseConfigSet; confirm against the base class.
    def validate(self):
        """Print a warning to stderr when ``TIMEOUT`` is below 5 seconds.

        This only warns: it neither raises nor changes the configured value.
        Output goes through ``rich.print``, so ``[red]...[/red]`` is markup.
        """
        if int(self.TIMEOUT) < 5:
            print(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
            print(" You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.", file=sys.stderr)
            print(" (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr)
            print(file=sys.stderr)
            print(" If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr)
            print(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles", file=sys.stderr)
            print(file=sys.stderr)

    @field_validator("CHECK_SSL_VALIDITY", mode="after")
    @classmethod
    def validate_check_ssl_validity(cls, v):
        """SIDE EFFECT: disable "you really shouldnt disable ssl" warnings emitted by requests

        When the validated value is falsy, ``InsecureRequestWarning`` is silenced
        both via ``requests.packages.urllib3`` and via ``urllib3`` directly.
        The value itself is returned unchanged.
        """
        if not v:
            # imported lazily so these packages are only loaded when SSL checks are disabled
            import requests
            import urllib3

            requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        return v

    @property
    def URL_ALLOWLIST_PTN(self) -> re.Pattern | None:
        """Return ``URL_ALLOWLIST`` compiled to a regex, or ``None`` when it is unset/empty."""
        return re.compile(self.URL_ALLOWLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS) if self.URL_ALLOWLIST else None

    @property
    def URL_DENYLIST_PTN(self) -> re.Pattern:
        """Return ``URL_DENYLIST`` compiled to a regex."""
        return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)

    @property
    def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
        """Return ``SAVE_ALLOWLIST`` with its regex-string keys compiled (``{}`` when empty)."""
        return _compile_method_patterns(self.SAVE_ALLOWLIST)

    @property
    def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
        """Return ``SAVE_DENYLIST`` with its regex-string keys compiled (``{}`` when empty)."""
        return _compile_method_patterns(self.SAVE_DENYLIST)


ARCHIVING_CONFIG = ArchivingConfig()


class SearchBackendConfig(BaseConfigSet):
    """Settings selecting the search backend and toggling indexing/searching."""

    toml_section_header: str = "SEARCH_BACKEND_CONFIG"

    USE_INDEXING_BACKEND: bool = Field(default=True)
    USE_SEARCHING_BACKEND: bool = Field(default=True)

    SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep")
    SEARCH_PROCESS_HTML: bool = Field(default=True)


SEARCH_BACKEND_CONFIG = SearchBackendConfig()