Files
ArchiveBox/archivebox/config/common.py
Claude 88d7906033 Add MAX_URL_ATTEMPTS config option to stop retries after too many failures
Adds a new MAX_URL_ATTEMPTS configuration option (default: 50) that stops
retrying ArchiveResult hooks for a snapshot once that many failures have
been recorded. This prevents infinite retry loops for problematic URLs.

When the limit is reached, any pending ArchiveResults for that snapshot
are marked as SKIPPED with an explanatory message.
2025-12-29 20:20:50 +00:00

211 lines
7.5 KiB
Python

__package__ = "archivebox.config"
import re
import sys
import shutil
from typing import Dict, Optional, List
from pathlib import Path
from rich import print
from pydantic import Field, field_validator
from django.utils.crypto import get_random_string
from archivebox.config.configset import BaseConfigSet
from .constants import CONSTANTS
from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION
from .permissions import IN_DOCKER
###################### Config ##########################
class ShellConfig(BaseConfigSet):
toml_section_header: str = "SHELL_CONFIG"
DEBUG: bool = Field(default="--debug" in sys.argv)
IS_TTY: bool = Field(default=sys.stdout.isatty())
USE_COLOR: bool = Field(default=sys.stdout.isatty())
SHOW_PROGRESS: bool = Field(default=sys.stdout.isatty())
IN_DOCKER: bool = Field(default=IN_DOCKER)
IN_QEMU: bool = Field(default=False)
ANSI: Dict[str, str] = Field(
default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS
)
@property
def TERM_WIDTH(self) -> int:
if not self.IS_TTY:
return 200
return shutil.get_terminal_size((140, 10)).columns
@property
def COMMIT_HASH(self) -> Optional[str]:
return get_COMMIT_HASH()
@property
def BUILD_TIME(self) -> str:
return get_BUILD_TIME()
SHELL_CONFIG = ShellConfig()
class StorageConfig(BaseConfigSet):
toml_section_header: str = "STORAGE_CONFIG"
# TMP_DIR must be a local, fast, readable/writable dir by archivebox user,
# must be a short path due to unix path length restrictions for socket files (<100 chars)
# must be a local SSD/tmpfs for speed and because bind mounts/network mounts/FUSE dont support unix sockets
TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)
# LIB_DIR must be a local, fast, readable/writable dir by archivebox user,
# must be able to contain executable binaries (up to 5GB size)
# should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow
LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)
OUTPUT_PERMISSIONS: str = Field(default="644")
RESTRICT_FILE_NAMES: str = Field(default="windows")
ENFORCE_ATOMIC_WRITES: bool = Field(default=True)
# not supposed to be user settable:
DIR_OUTPUT_PERMISSIONS: str = Field(default="755") # computed from OUTPUT_PERMISSIONS
STORAGE_CONFIG = StorageConfig()
class GeneralConfig(BaseConfigSet):
toml_section_header: str = "GENERAL_CONFIG"
TAG_SEPARATOR_PATTERN: str = Field(default=r"[,]")
GENERAL_CONFIG = GeneralConfig()
class ServerConfig(BaseConfigSet):
toml_section_header: str = "SERVER_CONFIG"
SECRET_KEY: str = Field(default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_"))
BIND_ADDR: str = Field(default="127.0.0.1:8000")
ALLOWED_HOSTS: str = Field(default="*")
CSRF_TRUSTED_ORIGINS: str = Field(default="http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000")
SNAPSHOTS_PER_PAGE: int = Field(default=40)
PREVIEW_ORIGINALS: bool = Field(default=True)
FOOTER_INFO: str = Field(
default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests."
)
# CUSTOM_TEMPLATES_DIR: Path = Field(default=None) # this is now a constant
PUBLIC_INDEX: bool = Field(default=True)
PUBLIC_SNAPSHOTS: bool = Field(default=True)
PUBLIC_ADD_VIEW: bool = Field(default=False)
ADMIN_USERNAME: Optional[str] = Field(default=None)
ADMIN_PASSWORD: Optional[str] = Field(default=None)
REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User")
REVERSE_PROXY_WHITELIST: str = Field(default="")
LOGOUT_REDIRECT_URL: str = Field(default="/")
SERVER_CONFIG = ServerConfig()
class ArchivingConfig(BaseConfigSet):
toml_section_header: str = "ARCHIVING_CONFIG"
ONLY_NEW: bool = Field(default=True)
OVERWRITE: bool = Field(default=False)
TIMEOUT: int = Field(default=60)
MAX_URL_ATTEMPTS: int = Field(default=50)
RESOLUTION: str = Field(default="1440,2000")
CHECK_SSL_VALIDITY: bool = Field(default=True)
USER_AGENT: str = Field(
default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)"
)
COOKIES_FILE: Path | None = Field(default=None)
URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", alias="URL_BLACKLIST")
URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST")
SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={}) # mapping of regex patterns to list of archive methods
SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})
DEFAULT_PERSONA: str = Field(default="Default")
def validate(self):
if int(self.TIMEOUT) < 5:
print(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
print(" You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.", file=sys.stderr)
print(" (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr)
print(file=sys.stderr)
print(" If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr)
print(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles", file=sys.stderr)
print(file=sys.stderr)
@field_validator("CHECK_SSL_VALIDITY", mode="after")
def validate_check_ssl_validity(cls, v):
"""SIDE EFFECT: disable "you really shouldnt disable ssl" warnings emitted by requests"""
if not v:
import requests
import urllib3
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
return v
@property
def URL_ALLOWLIST_PTN(self) -> re.Pattern | None:
return re.compile(self.URL_ALLOWLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS) if self.URL_ALLOWLIST else None
@property
def URL_DENYLIST_PTN(self) -> re.Pattern:
return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)
@property
def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
return (
{
# regexp: methods list
re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
for key, val in self.SAVE_ALLOWLIST.items()
}
if self.SAVE_ALLOWLIST
else {}
)
@property
def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
return (
{
# regexp: methods list
re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
for key, val in self.SAVE_DENYLIST.items()
}
if self.SAVE_DENYLIST
else {}
)
ARCHIVING_CONFIG = ArchivingConfig()
class SearchBackendConfig(BaseConfigSet):
toml_section_header: str = "SEARCH_BACKEND_CONFIG"
USE_INDEXING_BACKEND: bool = Field(default=True)
USE_SEARCHING_BACKEND: bool = Field(default=True)
SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep")
SEARCH_PROCESS_HTML: bool = Field(default=True)
SEARCH_BACKEND_CONFIG = SearchBackendConfig()