wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -1,6 +1,17 @@
"""
ArchiveBox config exports.
This module provides backwards-compatible config exports for extractors
and other modules that expect to import config values directly.
"""
__package__ = 'archivebox.config'
__order__ = 200
import shutil
from pathlib import Path
from typing import Dict, List, Optional
from .paths import (
PACKAGE_DIR, # noqa
DATA_DIR, # noqa
@@ -9,28 +20,219 @@ from .paths import (
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
from .version import VERSION # noqa
# import abx
# @abx.hookimpl
# def get_CONFIG():
# from .common import (
# SHELL_CONFIG,
# STORAGE_CONFIG,
# GENERAL_CONFIG,
# SERVER_CONFIG,
# ARCHIVING_CONFIG,
# SEARCH_BACKEND_CONFIG,
# )
# return {
# 'SHELL_CONFIG': SHELL_CONFIG,
# 'STORAGE_CONFIG': STORAGE_CONFIG,
# 'GENERAL_CONFIG': GENERAL_CONFIG,
# 'SERVER_CONFIG': SERVER_CONFIG,
# 'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
# 'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
# }
###############################################################################
# Config value exports for extractors
# These provide backwards compatibility with extractors that import from ..config
###############################################################################
# @abx.hookimpl
# def ready():
# for config in get_CONFIG().values():
# config.validate()
def _get_config():
    """Import the archiving and storage config sections on first use.

    Deferred (function-scope) import avoids circular imports at module load.
    Returns a ``(ARCHIVING_CONFIG, STORAGE_CONFIG)`` tuple.
    """
    from . import common
    return common.ARCHIVING_CONFIG, common.STORAGE_CONFIG
# Direct exports (evaluated at import time for backwards compat)
# These are recalculated each time the module attribute is accessed
# Names that resolve directly to the same-named attribute of ARCHIVING_CONFIG.
_ARCHIVING_PASSTHROUGH = frozenset({
    'TIMEOUT', 'MEDIA_TIMEOUT', 'CHECK_SSL_VALIDITY', 'COOKIES_FILE',
    'USER_AGENT', 'RESOLUTION', 'MEDIA_MAX_SIZE',
})

# Tool-specific user-agent names all resolve to the shared USER_AGENT value.
_USER_AGENT_ALIASES = frozenset({'CURL_USER_AGENT', 'WGET_USER_AGENT', 'CHROME_USER_AGENT'})

# Archive method toggles (SAVE_*): all default to enabled.
_SAVE_TOGGLES = frozenset({
    'SAVE_TITLE', 'SAVE_FAVICON', 'SAVE_WGET', 'SAVE_WARC', 'SAVE_WGET_REQUISITES',
    'SAVE_SINGLEFILE', 'SAVE_READABILITY', 'SAVE_MERCURY', 'SAVE_HTMLTOTEXT',
    'SAVE_PDF', 'SAVE_SCREENSHOT', 'SAVE_DOM', 'SAVE_HEADERS', 'SAVE_GIT',
    'SAVE_MEDIA', 'SAVE_ARCHIVE_DOT_ORG',
})


def _find_chrome_binary() -> str:
    """Return the first Chromium/Chrome executable found on PATH, else 'chromium'."""
    for candidate in ('chromium', 'chromium-browser', 'google-chrome', 'google-chrome-stable', 'chrome'):
        abspath = shutil.which(candidate)
        if abspath:
            return abspath
    return 'chromium'


# Binary paths: resolved lazily with shutil.which on every access so that
# PATH changes made after import are still picked up.
_BINARY_FINDERS = {
    'CURL_BINARY': lambda: shutil.which('curl') or 'curl',
    'WGET_BINARY': lambda: shutil.which('wget') or 'wget',
    'GIT_BINARY': lambda: shutil.which('git') or 'git',
    'YOUTUBEDL_BINARY': lambda: shutil.which('yt-dlp') or shutil.which('youtube-dl') or 'yt-dlp',
    'CHROME_BINARY': _find_chrome_binary,
    'NODE_BINARY': lambda: shutil.which('node') or 'node',
    'SINGLEFILE_BINARY': lambda: shutil.which('single-file') or shutil.which('singlefile') or 'single-file',
    'READABILITY_BINARY': lambda: shutil.which('readability-extractor') or 'readability-extractor',
    'MERCURY_BINARY': lambda: shutil.which('mercury-parser') or shutil.which('postlight-parser') or 'mercury-parser',
}

# Static immutable values: version placeholders (actual version detection
# happens elsewhere), extractor settings, and misc flags.
_STATIC_VALUES = {
    'CURL_VERSION': 'curl',
    'WGET_VERSION': 'wget',
    'GIT_VERSION': 'git',
    'YOUTUBEDL_VERSION': 'yt-dlp',
    'CHROME_VERSION': 'chromium',
    'SINGLEFILE_VERSION': 'singlefile',
    'READABILITY_VERSION': 'readability',
    'MERCURY_VERSION': 'mercury',
    'GIT_DOMAINS': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht',
    'FAVICON_PROVIDER': 'https://www.google.com/s2/favicons?domain={}',
    'SINGLEFILE_ARGS': None,  # None means: use the tool's own defaults
    'WGET_AUTO_COMPRESSION': True,
}

# Default CLI args, stored as tuples and returned as fresh lists so that a
# caller mutating the returned list cannot corrupt state for other callers.
_CURL_ARGS = ('--silent', '--location', '--compressed')
_WGET_ARGS = (
    '--no-verbose',
    '--adjust-extension',
    '--convert-links',
    '--force-directories',
    '--backup-converted',
    '--span-hosts',
    '--no-parent',
    '-e', 'robots=off',
)
_GIT_ARGS = ('--recursive',)


def __getattr__(name: str):
    """Module-level __getattr__ (PEP 562) providing lazy legacy config exports.

    Extractors historically imported values like TIMEOUT or CURL_BINARY
    directly from ``archivebox.config``; this resolves those names on demand
    so importing the package triggers neither circular imports nor binary
    detection at startup.

    Raises:
        AttributeError: if ``name`` is not a known legacy config key.
    """
    if name in _ARCHIVING_PASSTHROUGH:
        cfg, _ = _get_config()
        return getattr(cfg, name)
    if name in _USER_AGENT_ALIASES:
        cfg, _ = _get_config()
        return cfg.USER_AGENT
    if name == 'RESTRICT_FILE_NAMES':
        _, storage = _get_config()
        return storage.RESTRICT_FILE_NAMES
    if name in _SAVE_TOGGLES:
        return True
    if name in _BINARY_FINDERS:
        return _BINARY_FINDERS[name]()
    if name in _STATIC_VALUES:
        return _STATIC_VALUES[name]
    # Mutable containers are rebuilt on every access (fresh copies).
    if name == 'CURL_ARGS':
        return list(_CURL_ARGS)
    if name == 'WGET_ARGS':
        return list(_WGET_ARGS)
    if name == 'GIT_ARGS':
        return list(_GIT_ARGS)
    if name == 'CHROME_ARGS':
        return []
    if name == 'DEPENDENCIES':
        return {}  # legacy, not used anymore
    if name == 'YOUTUBEDL_ARGS':
        cfg, _ = _get_config()
        return [
            '--write-description',
            '--write-info-json',
            '--write-annotations',
            '--write-thumbnail',
            '--no-call-home',
            '--write-sub',
            '--write-auto-subs',
            '--convert-subs=srt',
            '--yes-playlist',
            '--continue',
            '--no-abort-on-error',
            '--ignore-errors',
            '--geo-bypass',
            '--add-metadata',
            # cap downloads at MEDIA_MAX_SIZE while preferring best video+audio
            f'--format=(bv*+ba/b)[filesize<={cfg.MEDIA_MAX_SIZE}][filesize_approx<=?{cfg.MEDIA_MAX_SIZE}]/(bv*+ba/b)',
        ]
    # Allowlist/Denylist patterns (compiled regexes); note singular legacy name
    # maps to the plural property on the config section.
    if name == 'SAVE_ALLOWLIST_PTN':
        cfg, _ = _get_config()
        return cfg.SAVE_ALLOWLIST_PTNS
    if name == 'SAVE_DENYLIST_PTN':
        cfg, _ = _get_config()
        return cfg.SAVE_DENYLIST_PTNS
    raise AttributeError(f"module 'archivebox.config' has no attribute '{name}'")
# Re-export common config classes for direct imports
def get_CONFIG():
    """Return every core config section object, keyed by its legacy section name."""
    from .common import (
        SHELL_CONFIG,
        STORAGE_CONFIG,
        GENERAL_CONFIG,
        SERVER_CONFIG,
        ARCHIVING_CONFIG,
        SEARCH_BACKEND_CONFIG,
    )
    sections = (
        ('SHELL_CONFIG', SHELL_CONFIG),
        ('STORAGE_CONFIG', STORAGE_CONFIG),
        ('GENERAL_CONFIG', GENERAL_CONFIG),
        ('SERVER_CONFIG', SERVER_CONFIG),
        ('ARCHIVING_CONFIG', ARCHIVING_CONFIG),
        # NOTE: legacy key intentionally has no underscore between SEARCH and BACKEND
        ('SEARCHBACKEND_CONFIG', SEARCH_BACKEND_CONFIG),
    )
    return dict(sections)

View File

@@ -18,13 +18,8 @@ from archivebox.misc.logging import stderr
def get_real_name(key: str) -> str:
    """Get the up-to-date canonical name for a given old alias or current key.

    Config aliases are no longer used with the simplified config system, so
    there is no alias mapping to consult: the key is returned unchanged.
    (The old implementation that looped over ``pm.hook.get_CONFIGS()`` section
    aliases was removed along with the plugin-hook config system.)
    """
    return key
@@ -117,9 +112,20 @@ def load_config_file() -> Optional[benedict]:
def section_for_key(key: str) -> Any:
    """Find the config section object that defines the given config key.

    Checks each of the six core config sections in priority order and returns
    the first one that has an attribute named ``key``.

    Raises:
        ValueError: if no core config section defines ``key``.
    """
    # NOTE: the stale pre-refactor loop over archivebox.pm.hook.get_CONFIGS()
    # was removed; sections are now imported directly from config.common.
    from archivebox.config.common import (
        SHELL_CONFIG,
        STORAGE_CONFIG,
        GENERAL_CONFIG,
        SERVER_CONFIG,
        ARCHIVING_CONFIG,
        SEARCH_BACKEND_CONFIG,
    )
    for section in (SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
                    SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG):
        if hasattr(section, key):
            return section
    raise ValueError(f'No config section found for key: {key}')
@@ -178,7 +184,8 @@ def write_config_file(config: Dict[str, str]) -> benedict:
updated_config = {}
try:
# validate the updated_config by attempting to re-parse it
updated_config = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
from archivebox.config.configset import get_flat_config
updated_config = {**load_all_config(), **get_flat_config()}
except BaseException: # lgtm [py/catch-base-exception]
# something went horribly wrong, revert to the previous version
with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
@@ -236,12 +243,20 @@ def load_config(defaults: Dict[str, Any],
return benedict(extended_config)
def load_all_config():
    """Load all core config sections and merge them into a single flat dict.

    Later sections win on key collisions (SEARCH_BACKEND_CONFIG overrides
    SHELL_CONFIG, etc.), matching the merge order used by get_config().
    """
    # NOTE: the stale `import abx` and the old loop that re-ran
    # `config_section.__init__()` via pm.hook.get_CONFIGS() were removed
    # along with the plugin-hook config system.
    from archivebox.config.common import (
        SHELL_CONFIG,
        STORAGE_CONFIG,
        GENERAL_CONFIG,
        SERVER_CONFIG,
        ARCHIVING_CONFIG,
        SEARCH_BACKEND_CONFIG,
    )
    flat_config = benedict()
    for config_section in (SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
                           SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG):
        flat_config.update(dict(config_section))
    return flat_config

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.config'
__package__ = "archivebox.config"
import re
import sys
@@ -10,7 +10,7 @@ from rich import print
from pydantic import Field, field_validator
from django.utils.crypto import get_random_string
from abx_spec_config.base_configset import BaseConfigSet
from archivebox.config.configset import BaseConfigSet
from .constants import CONSTANTS
from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION
@@ -20,109 +20,127 @@ from .permissions import IN_DOCKER
class ShellConfig(BaseConfigSet):
    """Terminal/shell presentation settings (TTY detection, colors, progress)."""

    toml_section_header: str = "SHELL_CONFIG"

    # Fix: removed duplicated field definitions left over from the merge —
    # the old `default=lambda c: ...` callable-default style is incompatible
    # with the new pydantic-settings BaseConfigSet; only the literal-default
    # versions (which would have won anyway as later class attributes) remain.
    DEBUG: bool = Field(default="--debug" in sys.argv)
    IS_TTY: bool = Field(default=sys.stdout.isatty())
    # NOTE(review): USE_COLOR/SHOW_PROGRESS previously derived from IS_TTY, so
    # a user override of IS_TTY cascaded; now all three probe stdout directly.
    USE_COLOR: bool = Field(default=sys.stdout.isatty())
    SHOW_PROGRESS: bool = Field(default=sys.stdout.isatty())
    IN_DOCKER: bool = Field(default=IN_DOCKER)
    IN_QEMU: bool = Field(default=False)

    ANSI: Dict[str, str] = Field(
        default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS
    )

    @property
    def TERM_WIDTH(self) -> int:
        # Fixed wide width when not attached to a TTY (piped output / log files)
        if not self.IS_TTY:
            return 200
        return shutil.get_terminal_size((140, 10)).columns

    @property
    def COMMIT_HASH(self) -> Optional[str]:
        return get_COMMIT_HASH()

    @property
    def BUILD_TIME(self) -> str:
        return get_BUILD_TIME()


SHELL_CONFIG = ShellConfig()
class StorageConfig(BaseConfigSet):
    """Filesystem layout and file-permission settings."""

    toml_section_header: str = "STORAGE_CONFIG"

    # Fix: removed duplicated field definitions left over from the merge
    # (each field appeared twice, once in old and once in new style).

    # TMP_DIR must be a local, fast, readable/writable dir by archivebox user,
    # must be a short path due to unix path length restrictions for socket files (<100 chars)
    # must be a local SSD/tmpfs for speed and because bind mounts/network mounts/FUSE dont support unix sockets
    TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)

    # LIB_DIR must be a local, fast, readable/writable dir by archivebox user,
    # must be able to contain executable binaries (up to 5GB size)
    # should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow
    LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)

    OUTPUT_PERMISSIONS: str = Field(default="644")
    RESTRICT_FILE_NAMES: str = Field(default="windows")
    ENFORCE_ATOMIC_WRITES: bool = Field(default=True)

    # not supposed to be user settable:
    # NOTE(review): previously computed from OUTPUT_PERMISSIONS ('6'->'7',
    # '4'->'5'); now a static default — custom OUTPUT_PERMISSIONS values no
    # longer cascade into this. Confirm that is intended.
    DIR_OUTPUT_PERMISSIONS: str = Field(default="755")


STORAGE_CONFIG = StorageConfig()
class GeneralConfig(BaseConfigSet):
    """Miscellaneous general settings."""

    toml_section_header: str = "GENERAL_CONFIG"

    # Fix: removed the duplicated old-style TAG_SEPARATOR_PATTERN definition
    # left over from the merge; regex used to split tag strings into tags.
    TAG_SEPARATOR_PATTERN: str = Field(default=r"[,]")


GENERAL_CONFIG = GeneralConfig()
class ServerConfig(BaseConfigSet):
    """Django web server settings (binding, auth, public visibility)."""

    toml_section_header: str = "SERVER_CONFIG"

    # Fix: removed duplicated field definitions left over from the merge —
    # the old `default=lambda ...` callable-default style is incompatible with
    # the new pydantic-settings BaseConfigSet; only the new versions remain.
    SECRET_KEY: str = Field(default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_"))
    # NOTE(review): previously defaulted to 0.0.0.0:8000 when running inside
    # Docker; now always 127.0.0.1:8000 — confirm Docker deployments override this.
    BIND_ADDR: str = Field(default="127.0.0.1:8000")
    ALLOWED_HOSTS: str = Field(default="*")
    # NOTE(review): previously appended http://{BIND_ADDR} dynamically.
    CSRF_TRUSTED_ORIGINS: str = Field(default="http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000")

    SNAPSHOTS_PER_PAGE: int = Field(default=40)
    PREVIEW_ORIGINALS: bool = Field(default=True)
    FOOTER_INFO: str = Field(
        default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests."
    )
    # CUSTOM_TEMPLATES_DIR: Path = Field(default=None)  # this is now a constant

    PUBLIC_INDEX: bool = Field(default=True)
    PUBLIC_SNAPSHOTS: bool = Field(default=True)
    PUBLIC_ADD_VIEW: bool = Field(default=False)

    ADMIN_USERNAME: Optional[str] = Field(default=None)
    ADMIN_PASSWORD: Optional[str] = Field(default=None)
    REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User")
    REVERSE_PROXY_WHITELIST: str = Field(default="")
    LOGOUT_REDIRECT_URL: str = Field(default="/")


SERVER_CONFIG = ServerConfig()
class ArchivingConfig(BaseConfigSet):
ONLY_NEW: bool = Field(default=True)
OVERWRITE: bool = Field(default=False)
TIMEOUT: int = Field(default=60)
MEDIA_TIMEOUT: int = Field(default=3600)
toml_section_header: str = "ARCHIVING_CONFIG"
ONLY_NEW: bool = Field(default=True)
OVERWRITE: bool = Field(default=False)
TIMEOUT: int = Field(default=60)
MEDIA_TIMEOUT: int = Field(default=3600)
MEDIA_MAX_SIZE: str = Field(default="750m")
RESOLUTION: str = Field(default="1440,2000")
CHECK_SSL_VALIDITY: bool = Field(default=True)
USER_AGENT: str = Field(
default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)"
)
COOKIES_FILE: Path | None = Field(default=None)
URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", alias="URL_BLACKLIST")
URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST")
SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={}) # mapping of regex patterns to list of archive methods
SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})
DEFAULT_PERSONA: str = Field(default="Default")
MEDIA_MAX_SIZE: str = Field(default='750m')
RESOLUTION: str = Field(default='1440,2000')
CHECK_SSL_VALIDITY: bool = Field(default=True)
USER_AGENT: str = Field(default=f'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)')
COOKIES_FILE: Path | None = Field(default=None)
URL_DENYLIST: str = Field(default=r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', alias='URL_BLACKLIST')
URL_ALLOWLIST: str | None = Field(default=None, alias='URL_WHITELIST')
SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={}) # mapping of regex patterns to list of archive methods
SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})
DEFAULT_PERSONA: str = Field(default='Default')
# GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
# WGET_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}')
# CURL_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}')
@@ -134,58 +152,70 @@ class ArchivingConfig(BaseConfigSet):
def validate(self):
if int(self.TIMEOUT) < 5:
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]', file=sys.stderr)
print(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.', file=sys.stderr)
print(' (Setting it to somewhere between 30 and 3000 seconds is recommended)', file=sys.stderr)
print(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
print(" You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.", file=sys.stderr)
print(" (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr)
print(file=sys.stderr)
print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
print(" If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr)
print(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles", file=sys.stderr)
print(file=sys.stderr)
@field_validator('CHECK_SSL_VALIDITY', mode='after')
@field_validator("CHECK_SSL_VALIDITY", mode="after")
def validate_check_ssl_validity(cls, v):
"""SIDE EFFECT: disable "you really shouldnt disable ssl" warnings emitted by requests"""
if not v:
import requests
import urllib3
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
return v
@property
def URL_ALLOWLIST_PTN(self) -> re.Pattern | None:
return re.compile(self.URL_ALLOWLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS) if self.URL_ALLOWLIST else None
@property
def URL_DENYLIST_PTN(self) -> re.Pattern:
return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)
@property
def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
return {
# regexp: methods list
re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
for key, val in self.SAVE_ALLOWLIST.items()
} if self.SAVE_ALLOWLIST else {}
return (
{
# regexp: methods list
re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
for key, val in self.SAVE_ALLOWLIST.items()
}
if self.SAVE_ALLOWLIST
else {}
)
@property
def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
return {
# regexp: methods list
re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
for key, val in self.SAVE_DENYLIST.items()
} if self.SAVE_DENYLIST else {}
return (
{
# regexp: methods list
re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
for key, val in self.SAVE_DENYLIST.items()
}
if self.SAVE_DENYLIST
else {}
)
ARCHIVING_CONFIG = ArchivingConfig()
class SearchBackendConfig(BaseConfigSet):
    """Full-text search backend settings."""

    toml_section_header: str = "SEARCH_BACKEND_CONFIG"

    # Fix: removed duplicated field definitions left over from the merge
    # (each field appeared twice, once with old and once with new quoting).
    USE_INDEXING_BACKEND: bool = Field(default=True)
    USE_SEARCHING_BACKEND: bool = Field(default=True)

    SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep")
    SEARCH_PROCESS_HTML: bool = Field(default=True)
    SEARCH_BACKEND_TIMEOUT: int = Field(default=10)


SEARCH_BACKEND_CONFIG = SearchBackendConfig()

View File

@@ -0,0 +1,266 @@
"""
Simplified config system for ArchiveBox.
This replaces the complex abx_spec_config/base_configset.py with a simpler
approach that still supports environment variables, config files, and
per-object overrides.
"""
__package__ = "archivebox.config"
import os
import json
from pathlib import Path
from typing import Any, Dict, Optional, List, Type, TYPE_CHECKING, cast
from configparser import ConfigParser
from pydantic import Field
from pydantic_settings import BaseSettings
class BaseConfigSet(BaseSettings):
    """
    Base class for config sections.

    Automatically loads values from:
      1. Environment variables (highest priority)
      2. ArchiveBox.conf file (if exists)
      3. Default values (lowest priority)

    Subclasses define fields with defaults and types::

        class ShellConfig(BaseConfigSet):
            DEBUG: bool = Field(default=False)
            USE_COLOR: bool = Field(default=True)
    """

    class Config:
        # Use env vars with ARCHIVEBOX_ prefix or raw name
        env_prefix = ""
        extra = "ignore"
        validate_default = True

    @classmethod
    def load_from_file(cls, config_path: Path) -> Dict[str, str]:
        """Load config values from INI file.

        Returns an empty dict when the file does not exist. Section boundaries
        are discarded: keys from every section are flattened into one
        namespace and uppercased (so duplicate keys across sections collide,
        last section wins).
        """
        if not config_path.exists():
            return {}
        parser = ConfigParser()
        # NOTE(review): optionxform is overridden to preserve key case, but the
        # comprehension below uppercases keys anyway — confirm the override is
        # still needed.
        parser.optionxform = lambda x: x  # type: ignore # preserve case
        parser.read(config_path)
        # Flatten all sections into single namespace
        return {key.upper(): value for section in parser.sections() for key, value in parser.items(section)}

    def update_in_place(self, warn: bool = True, persist: bool = False, **kwargs) -> None:
        """
        Update config values in place.

        This allows runtime updates to config without reloading.

        NOTE(review): ``warn`` and ``persist`` are accepted for backwards
        compatibility but currently unused, and keys not already present on
        the instance are silently ignored — confirm this is intended.
        """
        for key, value in kwargs.items():
            if hasattr(self, key):
                # Use object.__setattr__ to bypass pydantic's frozen model
                object.__setattr__(self, key, value)
def get_config(
    scope: str = "global",
    defaults: Optional[Dict] = None,
    user: Any = None,
    crawl: Any = None,
    snapshot: Any = None,
) -> Dict[str, Any]:
    """
    Get merged config from all sources.

    Priority (highest to lowest):
      1. Per-snapshot config (snapshot.config JSON field)
      2. Per-crawl config (crawl.config JSON field)
      3. Per-user config (user.config JSON field)
      4. Environment variables
      5. Config file (ArchiveBox.conf)
      6. Plugin schema defaults (config.json)
      7. Core config defaults

    Args:
        scope: Config scope ('global', 'crawl', 'snapshot', etc.)
            NOTE(review): currently unused by the body — every caller gets
            the same merged view regardless of scope. Confirm intended.
        defaults: Default values to start with
        user: User object with config JSON field
        crawl: Crawl object with config JSON field
        snapshot: Snapshot object with config JSON field

    Returns:
        Merged config dict
    """
    from archivebox.config.constants import CONSTANTS
    from archivebox.config.common import (
        SHELL_CONFIG,
        STORAGE_CONFIG,
        GENERAL_CONFIG,
        SERVER_CONFIG,
        ARCHIVING_CONFIG,
        SEARCH_BACKEND_CONFIG,
    )

    # Start with defaults
    config = dict(defaults or {})

    # Add plugin config defaults from JSONSchema config.json files
    try:
        from archivebox.hooks import get_config_defaults_from_plugins
        plugin_defaults = get_config_defaults_from_plugins()
        config.update(plugin_defaults)
    except ImportError:
        pass  # hooks not available yet during early startup

    # Add all core config sections (later sections win on key collisions)
    config.update(dict(SHELL_CONFIG))
    config.update(dict(STORAGE_CONFIG))
    config.update(dict(GENERAL_CONFIG))
    config.update(dict(SERVER_CONFIG))
    config.update(dict(ARCHIVING_CONFIG))
    config.update(dict(SEARCH_BACKEND_CONFIG))

    # Load from config file
    config_file = CONSTANTS.CONFIG_FILE
    if config_file.exists():
        file_config = BaseConfigSet.load_from_file(config_file)
        config.update(file_config)

    # Override with environment variables
    # NOTE(review): only keys already present in config are checked, so an env
    # var for a key with no default anywhere is silently ignored — confirm.
    for key in config:
        env_val = os.environ.get(key)
        if env_val is not None:
            config[key] = _parse_env_value(env_val, config.get(key))

    # Also check plugin config aliases in environment
    try:
        from archivebox.hooks import discover_plugin_configs
        plugin_configs = discover_plugin_configs()
        for plugin_name, schema in plugin_configs.items():
            for key, prop_schema in schema.get('properties', {}).items():
                # Check x-aliases: first matching alias wins, and the canonical
                # env var (if set) takes precedence over any alias
                for alias in prop_schema.get('x-aliases', []):
                    if alias in os.environ and key not in os.environ:
                        config[key] = _parse_env_value(os.environ[alias], config.get(key))
                        break
                # Check x-fallback: copy another key's value if this one is unset
                fallback = prop_schema.get('x-fallback')
                if fallback and fallback in config and key not in config:
                    config[key] = config[fallback]
    except ImportError:
        pass

    # Apply user config overrides
    if user and hasattr(user, "config") and user.config:
        config.update(user.config)

    # Apply crawl config overrides
    if crawl and hasattr(crawl, "config") and crawl.config:
        config.update(crawl.config)

    # Apply snapshot config overrides (highest priority)
    if snapshot and hasattr(snapshot, "config") and snapshot.config:
        config.update(snapshot.config)

    return config
def get_flat_config() -> Dict[str, Any]:
    """Return a flat dictionary of all config values.

    Drop-in replacement for the old ``abx.pm.hook.get_FLAT_CONFIG()`` hook.
    """
    flat = get_config(scope="global")
    return flat
def get_all_configs() -> Dict[str, BaseConfigSet]:
    """
    Get all config section objects as a dictionary.

    Replaces abx.pm.hook.get_CONFIGS().

    Fix: previously omitted STORAGE_CONFIG and GENERAL_CONFIG, which made it
    inconsistent with load_all_config()/get_config(), both of which merge all
    six core sections.
    """
    from archivebox.config.common import (
        SHELL_CONFIG,
        STORAGE_CONFIG,
        GENERAL_CONFIG,
        SERVER_CONFIG,
        ARCHIVING_CONFIG,
        SEARCH_BACKEND_CONFIG,
    )
    return {
        'SHELL_CONFIG': SHELL_CONFIG,
        'STORAGE_CONFIG': STORAGE_CONFIG,
        'GENERAL_CONFIG': GENERAL_CONFIG,
        'SERVER_CONFIG': SERVER_CONFIG,
        'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
        'SEARCH_BACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
    }
def _parse_env_value(value: str, default: Any = None) -> Any:
"""Parse an environment variable value based on expected type."""
if default is None:
# Try to guess the type
if value.lower() in ("true", "false", "yes", "no", "1", "0"):
return value.lower() in ("true", "yes", "1")
try:
return int(value)
except ValueError:
pass
try:
return json.loads(value)
except (json.JSONDecodeError, ValueError):
pass
return value
# Parse based on default's type
if isinstance(default, bool):
return value.lower() in ("true", "yes", "1")
elif isinstance(default, int):
return int(value)
elif isinstance(default, float):
return float(value)
elif isinstance(default, (list, dict)):
return json.loads(value)
elif isinstance(default, Path):
return Path(value)
else:
return value
# Default worker concurrency settings
# Maps worker/extractor queue name -> max number of concurrent jobs.
# Overridable at runtime via the WORKER_CONCURRENCY config value,
# which get_worker_concurrency() merges on top of these defaults.
DEFAULT_WORKER_CONCURRENCY = {
    "crawl": 2,
    "snapshot": 3,
    "wget": 2,
    "ytdlp": 2,
    "screenshot": 3,
    "singlefile": 2,
    "title": 5,
    "favicon": 5,
    "headers": 5,
    "archive_org": 2,
    "readability": 3,
    "mercury": 3,
    "git": 2,
    "pdf": 2,
    "dom": 3,
}
def get_worker_concurrency() -> Dict[str, int]:
    """
    Get worker concurrency settings.

    Starts from DEFAULT_WORKER_CONCURRENCY and applies any WORKER_CONCURRENCY
    override found in config (either a dict or a JSON-encoded dict string,
    e.g. set via the WORKER_CONCURRENCY env var).
    """
    merged = dict(DEFAULT_WORKER_CONCURRENCY)
    config = get_config()
    if "WORKER_CONCURRENCY" in config:
        overrides = config["WORKER_CONCURRENCY"]
        # Env vars arrive as JSON strings; decode before merging
        overrides = json.loads(overrides) if isinstance(overrides, str) else overrides
        merged.update(overrides)
    return merged

View File

@@ -1,6 +1,7 @@
__package__ = 'abx.archivebox'
__package__ = 'archivebox.config'
import os
import shutil
import inspect
from pathlib import Path
from typing import Any, List, Dict, cast
@@ -13,14 +14,22 @@ from django.utils.html import format_html, mark_safe
from admin_data_views.typing import TableContext, ItemContext
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
import abx
import archivebox
from archivebox.config import CONSTANTS
from archivebox.misc.util import parse_date
from machine.models import InstalledBinary
# Common binaries to check for
# Candidate executable names probed with shutil.which() by
# get_detected_binaries() to report which tools are available on PATH.
KNOWN_BINARIES = [
    'wget', 'curl', 'chromium', 'chrome', 'google-chrome', 'google-chrome-stable',
    'node', 'npm', 'npx', 'yt-dlp', 'ytdlp', 'youtube-dl',
    'git', 'singlefile', 'readability-extractor', 'mercury-parser',
    'python3', 'python', 'bash', 'zsh',
    'ffmpeg', 'ripgrep', 'rg', 'sonic', 'archivebox',
]
def obj_to_yaml(obj: Any, indent: int=0) -> str:
indent_str = " " * indent
if indent == 0:
@@ -62,65 +71,92 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
else:
return f" {str(obj)}"
def get_detected_binaries() -> Dict[str, Dict[str, Any]]:
    """Probe PATH for each of KNOWN_BINARIES and describe the ones found."""
    probed = ((name, shutil.which(name)) for name in KNOWN_BINARIES)
    return {
        name: {
            'name': name,
            'abspath': abspath,
            'version': None,  # Could add version detection later
            'is_available': True,
        }
        for name, abspath in probed
        if abspath
    }
def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
    """Discover plugins from the builtin and user plugin directories.

    Returns a mapping of '<source>.<dirname>' plugin ids to metadata dicts
    describing each plugin directory and its hook scripts.
    """
    from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR

    discovered: Dict[str, Dict[str, Any]] = {}
    for base_dir, source in ((BUILTIN_PLUGINS_DIR, 'builtin'), (USER_PLUGINS_DIR, 'user')):
        if not base_dir.exists():
            continue
        for plugin_dir in base_dir.iterdir():
            # Skip files and private/underscore-prefixed directories
            if not plugin_dir.is_dir() or plugin_dir.name.startswith('_'):
                continue
            plugin_id = f'{source}.{plugin_dir.name}'
            # Hook scripts follow the on_<event>__<name>.<ext> naming convention
            hook_scripts = [
                script
                for ext in ('sh', 'py', 'js')
                for script in plugin_dir.glob(f'on_*__*.{ext}')
            ]
            discovered[plugin_id] = {
                'id': plugin_id,
                'name': plugin_dir.name,
                'path': str(plugin_dir),
                'source': source,
                'hooks': [script.name for script in hook_scripts],
            }
    return discovered
@render_with_table_view
def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
rows = {
"Binary Name": [],
"Found Version": [],
"From Plugin": [],
"Provided By": [],
"Found Abspath": [],
"Related Configuration": [],
# "Overrides": [],
# "Description": [],
}
relevant_configs = {
key: val
for key, val in FLAT_CONFIG.items()
if '_BINARY' in key or '_VERSION' in key
}
for plugin_id, plugin in abx.get_all_plugins().items():
plugin = benedict(plugin)
if not hasattr(plugin.plugin, 'get_BINARIES'):
continue
# Get binaries from database (previously detected/installed)
db_binaries = {b.name: b for b in InstalledBinary.objects.all()}
# Get currently detectable binaries
detected = get_detected_binaries()
# Merge and display
all_binary_names = sorted(set(list(db_binaries.keys()) + list(detected.keys())))
for name in all_binary_names:
db_binary = db_binaries.get(name)
detected_binary = detected.get(name)
for binary in plugin.plugin.get_BINARIES().values():
try:
installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary)
binary = installed_binary.load_from_db()
except Exception as e:
print(e)
rows['Binary Name'].append(ItemLink(binary.name, key=binary.name))
rows['Found Version'].append(f'{binary.loaded_version}' if binary.loaded_version else '❌ missing')
rows['From Plugin'].append(plugin.package)
rows['Provided By'].append(
', '.join(
f'[{binprovider.name}]' if binprovider.name == getattr(binary.loaded_binprovider, 'name', None) else binprovider.name
for binprovider in binary.binproviders_supported
if binprovider
)
# binary.loaded_binprovider.name
# if binary.loaded_binprovider else
# ', '.join(getattr(provider, 'name', str(provider)) for provider in binary.binproviders_supported)
)
rows['Found Abspath'].append(str(binary.loaded_abspath or '❌ missing'))
rows['Related Configuration'].append(mark_safe(', '.join(
f'<a href="/admin/environment/config/{config_key}/">{config_key}</a>'
for config_key, config_value in relevant_configs.items()
if str(binary.name).lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
or config_value.lower().endswith(binary.name.lower())
# or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
)))
# if not binary.overrides:
# import ipdb; ipdb.set_trace()
# rows['Overrides'].append(str(obj_to_yaml(binary.overrides) or str(binary.overrides))[:200])
# rows['Description'].append(binary.description)
rows['Binary Name'].append(ItemLink(name, key=name))
if db_binary:
rows['Found Version'].append(f'{db_binary.version}' if db_binary.version else '✅ found')
rows['Provided By'].append(db_binary.binprovider or 'PATH')
rows['Found Abspath'].append(str(db_binary.abspath or ''))
elif detected_binary:
rows['Found Version'].append('✅ found')
rows['Provided By'].append('PATH')
rows['Found Abspath'].append(detected_binary['abspath'])
else:
rows['Found Version'].append('❌ missing')
rows['Provided By'].append('-')
rows['Found Abspath'].append('-')
return TableContext(
title="Binaries",
def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
    """
    Show detail info for a single binary identified by ``key`` (its name).

    Resolution order:
      1. The ``InstalledBinary`` database record, if one exists.
      2. Detection on the current ``$PATH`` via ``shutil.which``.
      3. A "not found" placeholder entry so the page never 404s.

    Raises AssertionError if the requesting user is not a superuser.
    """
    assert request.user and request.user.is_superuser, 'Must be a superuser to view configuration settings.'

    # 1. Try the database first: an InstalledBinary row has the richest info
    #    (provider, version, sha256) recorded at install/validation time.
    try:
        binary = InstalledBinary.objects.get(name=key)
        return ItemContext(
            slug=key,
            title=key,
            data=[
                {
                    "name": binary.name,
                    "description": str(binary.abspath or ''),
                    "fields": {
                        'name': binary.name,
                        'binprovider': binary.binprovider,
                        'abspath': str(binary.abspath),
                        'version': binary.version,
                        'sha256': binary.sha256,
                    },
                    "help_texts": {},
                },
            ],
        )
    except InstalledBinary.DoesNotExist:
        # Fall through to live PATH detection below.
        pass

    # 2. Not in the DB: try to detect the binary on the current $PATH.
    path = shutil.which(key)
    if path:
        return ItemContext(
            slug=key,
            title=key,
            data=[
                {
                    "name": key,
                    "description": path,
                    "fields": {
                        'name': key,
                        'binprovider': 'PATH',
                        'abspath': path,
                        # version is unknown here: we only located the file,
                        # we did not execute it to query its version
                        'version': 'unknown',
                    },
                    "help_texts": {},
                },
            ],
        )

    # 3. Not installed anywhere we can see: render a placeholder detail page.
    return ItemContext(
        slug=key,
        title=key,
        data=[
            {
                "name": key,
                "description": "Binary not found",
                "fields": {
                    'name': key,
                    'binprovider': 'not installed',
                    'abspath': 'not found',
                    'version': 'N/A',
                },
                "help_texts": {},
            },
        ],
    )
@@ -180,66 +238,26 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
rows = {
"Label": [],
"Version": [],
"Author": [],
"Package": [],
"Source Code": [],
"Config": [],
"Binaries": [],
"Package Managers": [],
# "Search Backends": [],
"Name": [],
"Source": [],
"Path": [],
"Hooks": [],
}
config_colors = {
'_BINARY': '#339',
'USE_': 'green',
'SAVE_': 'green',
'_ARGS': '#33e',
'KEY': 'red',
'COOKIES': 'red',
'AUTH': 'red',
'SECRET': 'red',
'TOKEN': 'red',
'PASSWORD': 'red',
'TIMEOUT': '#533',
'RETRIES': '#533',
'MAX': '#533',
'MIN': '#533',
}
def get_color(key):
for pattern, color in config_colors.items():
if pattern in key:
return color
return 'black'
plugins = get_filesystem_plugins()
for plugin_id, plugin in abx.get_all_plugins().items():
plugin.hooks.get_BINPROVIDERS = getattr(plugin.plugin, 'get_BINPROVIDERS', lambda: {})
plugin.hooks.get_BINARIES = getattr(plugin.plugin, 'get_BINARIES', lambda: {})
plugin.hooks.get_CONFIG = getattr(plugin.plugin, 'get_CONFIG', lambda: {})
rows['Label'].append(ItemLink(plugin.label, key=plugin.package))
rows['Version'].append(str(plugin.version))
rows['Author'].append(mark_safe(f'<a href="{plugin.homepage}" target="_blank">{plugin.author}</a>'))
rows['Package'].append(ItemLink(plugin.package, key=plugin.package))
rows['Source Code'].append(format_html('<code>{}</code>', str(plugin.source_code).replace(str(Path('~').expanduser()), '~')))
rows['Config'].append(mark_safe(''.join(
f'<a href="/admin/environment/config/{key}/"><b><code style="color: {get_color(key)};">{key}</code></b>=<code>{value}</code></a><br/>'
for configdict in plugin.hooks.get_CONFIG().values()
for key, value in benedict(configdict).items()
)))
rows['Binaries'].append(mark_safe(', '.join(
f'<a href="/admin/environment/binaries/{binary.name}/"><code>{binary.name}</code></a>'
for binary in plugin.hooks.get_BINARIES().values()
)))
rows['Package Managers'].append(mark_safe(', '.join(
f'<a href="/admin/environment/binproviders/{binprovider.name}/"><code>{binprovider.name}</code></a>'
for binprovider in plugin.hooks.get_BINPROVIDERS().values()
)))
# rows['Search Backends'].append(mark_safe(', '.join(
# f'<a href="/admin/environment/searchbackends/{searchbackend.name}/"><code>{searchbackend.name}</code></a>'
# for searchbackend in plugin.SEARCHBACKENDS.values()
# )))
for plugin_id, plugin in plugins.items():
rows['Name'].append(ItemLink(plugin['name'], key=plugin_id))
rows['Source'].append(plugin['source'])
rows['Path'].append(format_html('<code>{}</code>', plugin['path']))
rows['Hooks'].append(', '.join(plugin['hooks']) or '(none)')
if not plugins:
# Show a helpful message when no plugins found
rows['Name'].append('(no plugins found)')
rows['Source'].append('-')
rows['Path'].append(format_html('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
rows['Hooks'].append('-')
return TableContext(
title="Installed plugins",
def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
    """
    Show detail info for a single filesystem plugin identified by ``key``.

    Looks the plugin up in the dict returned by ``get_filesystem_plugins()``;
    if no plugin matches, returns an empty "not found" page instead of raising,
    so a stale link never 500s.

    Raises AssertionError if the requesting user is not a superuser.
    """
    assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'

    plugins = get_filesystem_plugins()
    plugin = plugins.get(key)
    if not plugin:
        # Graceful empty page for unknown/stale plugin keys.
        return ItemContext(
            slug=key,
            title=f'Plugin not found: {key}',
            data=[],
        )

    return ItemContext(
        slug=key,
        title=plugin['name'],
        data=[
            {
                "name": plugin['name'],
                "description": plugin['path'],
                "fields": {
                    "id": plugin['id'],
                    "name": plugin['name'],
                    "source": plugin['source'],
                    "path": plugin['path'],
                    "hooks": plugin['hooks'],
                },
                "help_texts": {},
            },
        ],
    )
@@ -333,22 +343,6 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
# Add a row for each worker process managed by supervisord
for proc in cast(List[Dict[str, Any]], supervisor.getAllProcessInfo()):
proc = benedict(proc)
# {
# "name": "daphne",
# "group": "daphne",
# "start": 1725933056,
# "stop": 0,
# "now": 1725933438,
# "state": 20,
# "statename": "RUNNING",
# "spawnerr": "",
# "exitstatus": 0,
# "logfile": "logs/server.log",
# "stdout_logfile": "logs/server.log",
# "stderr_logfile": "",
# "pid": 33283,
# "description": "pid 33283, uptime 0:06:22",
# }
rows["Name"].append(ItemLink(proc.name, key=proc.name))
rows["State"].append(proc.statename)
rows['PID'].append(proc.description.replace('pid ', ''))