mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-03 09:25:42 +10:00
219 lines
8.0 KiB
Python
219 lines
8.0 KiB
Python
__package__ = "archivebox.config"
|
|
|
|
import re
|
|
import sys
|
|
import shutil
|
|
from typing import Dict, Optional, List
|
|
from pathlib import Path
|
|
|
|
from rich import print
|
|
from pydantic import Field, field_validator
|
|
from django.utils.crypto import get_random_string
|
|
|
|
from archivebox.config.configset import BaseConfigSet
|
|
|
|
from .constants import CONSTANTS
|
|
from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION
|
|
from .permissions import IN_DOCKER
|
|
|
|
###################### Config ##########################
|
|
|
|
|
|
class ShellConfig(BaseConfigSet):
    # Settings describing the shell/terminal the process is attached to.
    toml_section_header: str = "SHELL_CONFIG"

    # True when the literal argument "--debug" is present in sys.argv.
    DEBUG: bool = Field(default="--debug" in sys.argv)

    # Each of these defaults is sampled from sys.stdout.isatty() once,
    # at the moment this class body is executed.
    IS_TTY: bool = Field(default=sys.stdout.isatty())
    USE_COLOR: bool = Field(default=sys.stdout.isatty())
    SHOW_PROGRESS: bool = Field(default=sys.stdout.isatty())

    IN_DOCKER: bool = Field(default=IN_DOCKER)
    IN_QEMU: bool = Field(default=False)

    # Color table: the disabled palette unless stdout is a TTY
    # (checked each time the default factory is invoked).
    ANSI: Dict[str, str] = Field(
        default_factory=lambda: CONSTANTS.DISABLED_CLI_COLORS if not sys.stdout.isatty() else CONSTANTS.DEFAULT_CLI_COLORS
    )

    @property
    def TERM_WIDTH(self) -> int:
        """Return the terminal width in columns, or a fixed 200 when IS_TTY is false.

        When the terminal size cannot be determined, shutil falls back to 140 columns.
        """
        if self.IS_TTY:
            size = shutil.get_terminal_size((140, 10))
            return size.columns
        return 200

    @property
    def COMMIT_HASH(self) -> Optional[str]:
        """Return the value reported by get_COMMIT_HASH() (may be None)."""
        return get_COMMIT_HASH()

    @property
    def BUILD_TIME(self) -> str:
        """Return the value reported by get_BUILD_TIME()."""
        return get_BUILD_TIME()
|
|
|
|
|
|
# Module-level ShellConfig instance, constructed with no arguments when this module is imported.
SHELL_CONFIG = ShellConfig()
|
|
|
|
|
|
class StorageConfig(BaseConfigSet):
    # Filesystem locations and file-permission settings.
    toml_section_header: str = "STORAGE_CONFIG"

    # TMP_DIR requirements:
    #   - local, fast, and readable/writable by the archivebox user
    #   - a short path, because unix socket file paths are length-limited (<100 chars)
    #   - on a local SSD/tmpfs: bind mounts, network mounts and FUSE don't support unix sockets
    TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)

    # LIB_DIR requirements:
    #   - local, fast, and readable/writable by the archivebox user
    #   - able to hold executable binaries (up to 5GB in size)
    #   - not a remote/network/FUSE mount, otherwise extractors will be slow
    LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)

    # LIB_BIN_DIR is the directory where installed binaries are symlinked so that
    # PATH management stays simple; it should be prepended to PATH for all hook executions.
    # Described as LIB_DIR / 'bin', but the default here is a separate constant
    # NOTE(review): confirm it is kept in sync when LIB_DIR is overridden.
    LIB_BIN_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_BIN_DIR)

    # CUSTOM_TEMPLATES_DIR lets users override the default templates.
    # Configurable; the default is CONSTANTS.CUSTOM_TEMPLATES_DIR
    # (described as DATA_DIR / 'user_templates').
    CUSTOM_TEMPLATES_DIR: Path = Field(default=CONSTANTS.CUSTOM_TEMPLATES_DIR)

    OUTPUT_PERMISSIONS: str = Field(default="644")
    RESTRICT_FILE_NAMES: str = Field(default="windows")
    ENFORCE_ATOMIC_WRITES: bool = Field(default=True)

    # Not meant to be set by users. Described as computed from OUTPUT_PERMISSIONS,
    # but only a static default is declared in this class
    # NOTE(review): confirm where the derivation happens.
    DIR_OUTPUT_PERMISSIONS: str = Field(default="755")
|
|
|
|
|
|
# Module-level StorageConfig instance, constructed with no arguments when this module is imported.
STORAGE_CONFIG = StorageConfig()
|
|
|
|
|
|
class GeneralConfig(BaseConfigSet):
    # General settings that do not belong to a more specific section.
    toml_section_header: str = "GENERAL_CONFIG"

    # Regex source string; the default character class matches a single comma.
    TAG_SEPARATOR_PATTERN: str = Field(default=r"[,]")
|
|
|
|
|
|
# Module-level GeneralConfig instance, constructed with no arguments when this module is imported.
GENERAL_CONFIG = GeneralConfig()
|
|
|
|
|
|
class ServerConfig(BaseConfigSet):
    # Web server, access-control and admin-account settings.
    toml_section_header: str = "SERVER_CONFIG"

    # Default is produced by get_random_string(50, <lowercase letters, digits, underscore>)
    # each time the default factory runs.
    SECRET_KEY: str = Field(default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_"))
    BIND_ADDR: str = Field(default="127.0.0.1:8000")
    ALLOWED_HOSTS: str = Field(default="*")
    # Comma-separated list of origins, stored as a single string.
    CSRF_TRUSTED_ORIGINS: str = Field(default="http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000")

    SNAPSHOTS_PER_PAGE: int = Field(default=40)
    PREVIEW_ORIGINALS: bool = Field(default=True)
    FOOTER_INFO: str = Field(
        default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests."
    )
    # CUSTOM_TEMPLATES_DIR is no longer declared in this section
    # (StorageConfig declares it, defaulting to CONSTANTS.CUSTOM_TEMPLATES_DIR).

    PUBLIC_INDEX: bool = Field(default=True)
    PUBLIC_SNAPSHOTS: bool = Field(default=True)
    PUBLIC_ADD_VIEW: bool = Field(default=False)

    # Both admin credentials are unset (None) by default.
    ADMIN_USERNAME: Optional[str] = Field(default=None)
    ADMIN_PASSWORD: Optional[str] = Field(default=None)

    REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User")
    REVERSE_PROXY_WHITELIST: str = Field(default="")
    LOGOUT_REDIRECT_URL: str = Field(default="/")
|
|
|
|
|
|
# Module-level ServerConfig instance, constructed with no arguments when this module is imported.
SERVER_CONFIG = ServerConfig()
|
|
|
|
|
|
class ArchivingConfig(BaseConfigSet):
    # Settings controlling how URLs are fetched and which URLs/methods are allowed.
    toml_section_header: str = "ARCHIVING_CONFIG"

    ONLY_NEW: bool = Field(default=True)
    OVERWRITE: bool = Field(default=False)

    # Seconds; validate() warns when this is below 5.
    TIMEOUT: int = Field(default=60)
    MAX_URL_ATTEMPTS: int = Field(default=50)

    RESOLUTION: str = Field(default="1440,2000")
    # Setting this to False also silences InsecureRequestWarning (see the validator below).
    CHECK_SSL_VALIDITY: bool = Field(default=True)
    USER_AGENT: str = Field(
        default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)"
    )
    COOKIES_FILE: Path | None = Field(default=None)

    # Regex source strings, compiled on demand by the *_PTN properties below.
    # Each is declared with an alias carrying the older BLACKLIST/WHITELIST name.
    URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", alias="URL_BLACKLIST")
    URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST")

    # Mappings of regex pattern -> list of archive methods.
    SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={})
    SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})

    DEFAULT_PERSONA: str = Field(default="Default")

    def validate(self):
        """Print a warning to stderr when TIMEOUT is below 5 seconds.

        This only warns: it neither raises nor changes TIMEOUT, and returns None.
        """
        if int(self.TIMEOUT) >= 5:
            return

        print(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
        print(" You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.", file=sys.stderr)
        print(" (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr)
        print(file=sys.stderr)
        print(" If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr)
        print(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles", file=sys.stderr)
        print(file=sys.stderr)

    @field_validator("CHECK_SSL_VALIDITY", mode="after")
    @classmethod
    def validate_check_ssl_validity(cls, v):
        """Pass the value through unchanged; when it is falsy, silence InsecureRequestWarning.

        SIDE EFFECT: disables the insecure-request warnings through both
        requests.packages.urllib3 and urllib3.
        """
        if not v:
            # Imported lazily so these packages are only loaded when SSL checking is turned off.
            import requests
            import urllib3

            requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        return v

    @property
    def URL_ALLOWLIST_PTN(self) -> re.Pattern | None:
        """Return URL_ALLOWLIST compiled to a regex, or None when it is unset/empty."""
        if not self.URL_ALLOWLIST:
            return None
        return re.compile(self.URL_ALLOWLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)

    @property
    def URL_DENYLIST_PTN(self) -> re.Pattern:
        """Return URL_DENYLIST compiled to a regex."""
        return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)

    @property
    def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
        """Return SAVE_ALLOWLIST with each key compiled to a regex ({} when empty)."""
        flags = CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS
        # `or {}` keeps the original behavior of returning {} for any falsy value.
        return {
            re.compile(pattern, flags): methods
            for pattern, methods in (self.SAVE_ALLOWLIST or {}).items()
        }

    @property
    def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
        """Return SAVE_DENYLIST with each key compiled to a regex ({} when empty)."""
        flags = CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS
        # `or {}` keeps the original behavior of returning {} for any falsy value.
        return {
            re.compile(pattern, flags): methods
            for pattern, methods in (self.SAVE_DENYLIST or {}).items()
        }
|
|
|
|
|
|
# Module-level ArchivingConfig instance, constructed with no arguments when this module is imported.
ARCHIVING_CONFIG = ArchivingConfig()
|
|
|
|
|
|
class SearchBackendConfig(BaseConfigSet):
    # Search-backend toggles and engine selection.
    toml_section_header: str = "SEARCH_BACKEND_CONFIG"

    # Separate toggles for indexing and for searching; both default to on.
    USE_INDEXING_BACKEND: bool = Field(default=True)
    USE_SEARCHING_BACKEND: bool = Field(default=True)

    # Name of the engine to use; defaults to "ripgrep".
    SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep")
    SEARCH_PROCESS_HTML: bool = Field(default=True)
|
|
|
|
|
|
# Module-level SearchBackendConfig instance, constructed with no arguments when this module is imported.
SEARCH_BACKEND_CONFIG = SearchBackendConfig()
|