mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-01-06 19:06:08 +10:00
Adds a new MAX_URL_ATTEMPTS configuration option (default: 50) that stops retrying ArchiveResult hooks for a snapshot once that many failures have been recorded. This prevents infinite retry loops for problematic URLs. When the limit is reached, any pending ArchiveResults for that snapshot are marked as SKIPPED with an explanatory message.
211 lines
7.5 KiB
Python
211 lines
7.5 KiB
Python
__package__ = "archivebox.config"
|
|
|
|
import re
|
|
import sys
|
|
import shutil
|
|
from typing import Dict, Optional, List
|
|
from pathlib import Path
|
|
|
|
from rich import print
|
|
from pydantic import Field, field_validator
|
|
from django.utils.crypto import get_random_string
|
|
|
|
from archivebox.config.configset import BaseConfigSet
|
|
|
|
from .constants import CONSTANTS
|
|
from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION
|
|
from .permissions import IN_DOCKER
|
|
|
|
###################### Config ##########################
|
|
|
|
|
|
class ShellConfig(BaseConfigSet):
    """Settings describing the shell/terminal the current process is attached to."""

    toml_section_header: str = "SHELL_CONFIG"

    # on by default only when the literal argument "--debug" is present in sys.argv
    DEBUG: bool = Field(default="--debug" in sys.argv)

    # each of these defaults samples sys.stdout.isatty() once, while the class body is evaluated
    IS_TTY: bool = Field(default=sys.stdout.isatty())
    USE_COLOR: bool = Field(default=sys.stdout.isatty())
    SHOW_PROGRESS: bool = Field(default=sys.stdout.isatty())

    # the right-hand side resolves to the module-level IN_DOCKER imported from .permissions
    IN_DOCKER: bool = Field(default=IN_DOCKER)
    IN_QEMU: bool = Field(default=False)

    # color table default: built by a factory, so stdout is re-checked whenever the default is produced
    ANSI: Dict[str, str] = Field(
        default_factory=lambda: CONSTANTS.DISABLED_CLI_COLORS if not sys.stdout.isatty() else CONSTANTS.DEFAULT_CLI_COLORS
    )

    @property
    def TERM_WIDTH(self) -> int:
        """Return the terminal width in columns, or a fixed 200 when IS_TTY is false."""
        if self.IS_TTY:
            # (140, 10) is the fallback size shutil uses when the real size cannot be determined
            return shutil.get_terminal_size((140, 10)).columns
        return 200

    @property
    def COMMIT_HASH(self) -> Optional[str]:
        """Return whatever get_COMMIT_HASH() reports (may be None)."""
        return get_COMMIT_HASH()

    @property
    def BUILD_TIME(self) -> str:
        """Return whatever get_BUILD_TIME() reports."""
        return get_BUILD_TIME()


# module-level instance, created at import time
SHELL_CONFIG = ShellConfig()
|
|
|
|
|
|
class StorageConfig(BaseConfigSet):
    """Filesystem locations and output file-permission settings."""

    toml_section_header: str = "STORAGE_CONFIG"

    # TMP_DIR requirements:
    #   - local, fast, and readable/writable by the archivebox user
    #   - a short path, because unix socket file paths are length-restricted (<100 chars)
    #   - a local SSD/tmpfs, both for speed and because bind mounts/network mounts/FUSE
    #     don't support unix sockets
    TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)

    # LIB_DIR requirements:
    #   - local, fast, and readable/writable by the archivebox user
    #   - able to hold executable binaries (up to 5GB in size)
    #   - preferably not a remote/network/FUSE mount, otherwise extractors will be slow
    LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)

    OUTPUT_PERMISSIONS: str = Field(default="644")
    RESTRICT_FILE_NAMES: str = Field(default="windows")
    ENFORCE_ATOMIC_WRITES: bool = Field(default=True)

    # Not meant to be set by users; intended to be derived from OUTPUT_PERMISSIONS.
    # NOTE(review): no derivation is visible in this class, only the static default below.
    DIR_OUTPUT_PERMISSIONS: str = Field(default="755")


# module-level instance, created at import time
STORAGE_CONFIG = StorageConfig()
|
|
|
|
|
|
class GeneralConfig(BaseConfigSet):
    """Miscellaneous settings that do not belong to a more specific section."""

    toml_section_header: str = "GENERAL_CONFIG"

    # regex source string; the default character class matches a single comma
    # (presumably used to split tag lists -- confirm against callers)
    TAG_SEPARATOR_PATTERN: str = Field(default=r"[,]")


# module-level instance, created at import time
GENERAL_CONFIG = GeneralConfig()
|
|
|
|
|
|
class ServerConfig(BaseConfigSet):
    """Settings for the web server: binding, access control, and UI defaults."""

    toml_section_header: str = "SERVER_CONFIG"

    # when no value is supplied, a fresh 50-character key is generated from
    # lowercase letters, digits and underscore
    SECRET_KEY: str = Field(
        default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_"),
    )
    BIND_ADDR: str = Field(default="127.0.0.1:8000")
    ALLOWED_HOSTS: str = Field(default="*")
    # stored as one comma-separated string, not a list
    CSRF_TRUSTED_ORIGINS: str = Field(default="http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000")

    SNAPSHOTS_PER_PAGE: int = Field(default=40)
    PREVIEW_ORIGINALS: bool = Field(default=True)
    FOOTER_INFO: str = Field(
        default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests.",
    )
    # CUSTOM_TEMPLATES_DIR is intentionally not declared here anymore: it is now a constant

    PUBLIC_INDEX: bool = Field(default=True)
    PUBLIC_SNAPSHOTS: bool = Field(default=True)
    PUBLIC_ADD_VIEW: bool = Field(default=False)

    # both default to None, i.e. unset
    ADMIN_USERNAME: Optional[str] = Field(default=None)
    ADMIN_PASSWORD: Optional[str] = Field(default=None)

    REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User")
    REVERSE_PROXY_WHITELIST: str = Field(default="")
    LOGOUT_REDIRECT_URL: str = Field(default="/")


# module-level instance, created at import time
SERVER_CONFIG = ServerConfig()
|
|
|
|
|
|
class ArchivingConfig(BaseConfigSet):
    """Settings controlling how URLs are fetched and which URLs/methods are archived."""

    toml_section_header: str = "ARCHIVING_CONFIG"

    ONLY_NEW: bool = Field(default=True)
    OVERWRITE: bool = Field(default=False)

    # seconds; validate() warns when this is below 5
    TIMEOUT: int = Field(default=60)
    # number of recorded failures after which retries for a snapshot are meant to stop
    # (the enforcement logic lives outside this class)
    MAX_URL_ATTEMPTS: int = Field(default=50)

    RESOLUTION: str = Field(default="1440,2000")
    CHECK_SSL_VALIDITY: bool = Field(default=True)
    USER_AGENT: str = Field(
        default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)"
    )
    COOKIES_FILE: Path | None = Field(default=None)

    # regex source strings; the aliases keep the older BLACKLIST/WHITELIST names usable
    URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", alias="URL_BLACKLIST")
    URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST")

    SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={})  # mapping of regex patterns to list of archive methods
    SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})

    DEFAULT_PERSONA: str = Field(default="Default")

    # NOTE(review): if BaseConfigSet derives from pydantic.BaseModel, this instance method
    # shadows pydantic's deprecated `BaseModel.validate` classmethod -- the name is kept
    # because callers outside this file may rely on it.
    def validate(self):
        """Print a warning to stderr when TIMEOUT is below 5 seconds.

        Only warns: nothing is raised and TIMEOUT is left unchanged.
        """
        if int(self.TIMEOUT) >= 5:
            return

        print(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
        print(" You must allow *at least* 5 seconds for indexing and archive methods to run successfully.", file=sys.stderr)
        print(" (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr)
        print(file=sys.stderr)
        print(" If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr)
        print(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles", file=sys.stderr)
        print(file=sys.stderr)

    @field_validator("CHECK_SSL_VALIDITY", mode="after")
    @classmethod
    def validate_check_ssl_validity(cls, v):
        """Pass the value through unchanged, silencing InsecureRequestWarning when it is falsy.

        SIDE EFFECT: when SSL checking is disabled, the "you really shouldn't disable ssl"
        warnings are turned off process-wide, both via requests' urllib3 alias and via
        urllib3 directly.
        """
        if not v:
            # imported lazily so neither package is loaded unless the warnings must be silenced
            import requests
            import urllib3

            requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
        return v

    @property
    def URL_ALLOWLIST_PTN(self) -> re.Pattern | None:
        """Compiled URL_ALLOWLIST regex, or None when URL_ALLOWLIST is unset/empty."""
        if not self.URL_ALLOWLIST:
            return None
        return re.compile(self.URL_ALLOWLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)

    @property
    def URL_DENYLIST_PTN(self) -> re.Pattern:
        """Compiled URL_DENYLIST regex."""
        return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)

    @property
    def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
        """SAVE_ALLOWLIST with each regex-string key compiled; {} when it is empty/unset."""
        return {
            # regexp: methods list
            re.compile(pattern, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): methods
            for pattern, methods in (self.SAVE_ALLOWLIST or {}).items()
        }

    @property
    def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
        """SAVE_DENYLIST with each regex-string key compiled; {} when it is empty/unset."""
        return {
            # regexp: methods list
            re.compile(pattern, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): methods
            for pattern, methods in (self.SAVE_DENYLIST or {}).items()
        }


# module-level instance, created at import time
ARCHIVING_CONFIG = ArchivingConfig()
|
|
|
|
|
|
class SearchBackendConfig(BaseConfigSet):
    """Settings for the full-text search backend."""

    toml_section_header: str = "SEARCH_BACKEND_CONFIG"

    # independent on/off switches for the indexing side and the searching side
    USE_INDEXING_BACKEND: bool = Field(default=True)
    USE_SEARCHING_BACKEND: bool = Field(default=True)

    # name of the backend to use; "ripgrep" unless overridden
    SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep")
    SEARCH_PROCESS_HTML: bool = Field(default=True)


# module-level instance, created at import time
SEARCH_BACKEND_CONFIG = SearchBackendConfig()
|