mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 15:27:53 +10:00
wip major changes
This commit is contained in:
@@ -1,6 +1,17 @@
|
||||
"""
|
||||
ArchiveBox config exports.
|
||||
|
||||
This module provides backwards-compatible config exports for extractors
|
||||
and other modules that expect to import config values directly.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.config'
|
||||
__order__ = 200
|
||||
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
from .paths import (
|
||||
PACKAGE_DIR, # noqa
|
||||
DATA_DIR, # noqa
|
||||
@@ -9,28 +20,219 @@ from .paths import (
|
||||
from .constants import CONSTANTS, CONSTANTS_CONFIG, PACKAGE_DIR, DATA_DIR, ARCHIVE_DIR # noqa
|
||||
from .version import VERSION # noqa
|
||||
|
||||
# import abx
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_CONFIG():
|
||||
# from .common import (
|
||||
# SHELL_CONFIG,
|
||||
# STORAGE_CONFIG,
|
||||
# GENERAL_CONFIG,
|
||||
# SERVER_CONFIG,
|
||||
# ARCHIVING_CONFIG,
|
||||
# SEARCH_BACKEND_CONFIG,
|
||||
# )
|
||||
# return {
|
||||
# 'SHELL_CONFIG': SHELL_CONFIG,
|
||||
# 'STORAGE_CONFIG': STORAGE_CONFIG,
|
||||
# 'GENERAL_CONFIG': GENERAL_CONFIG,
|
||||
# 'SERVER_CONFIG': SERVER_CONFIG,
|
||||
# 'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
|
||||
# 'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
|
||||
# }
|
||||
###############################################################################
|
||||
# Config value exports for extractors
|
||||
# These provide backwards compatibility with extractors that import from ..config
|
||||
###############################################################################
|
||||
|
||||
# @abx.hookimpl
|
||||
# def ready():
|
||||
# for config in get_CONFIG().values():
|
||||
# config.validate()
|
||||
def _get_config():
    """Return the (ARCHIVING_CONFIG, STORAGE_CONFIG) singletons.

    The import happens inside the function body on purpose: importing
    .common at module load time would create a circular import.
    """
    from .common import ARCHIVING_CONFIG, STORAGE_CONFIG

    return ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||
|
||||
# Direct exports (evaluated at import time for backwards compat)
|
||||
# These are recalculated each time the module attribute is accessed
|
||||
|
||||
def __getattr__(name: str):
    """Module-level __getattr__ (PEP 562) providing lazy, backwards-compatible exports.

    Extractors historically did ``from ..config import TIMEOUT, CURL_BINARY, ...``;
    this hook resolves those names on demand so that importing this module stays
    cheap and avoids circular imports with .common.

    Raises:
        AttributeError: for any name this module does not export.
    """
    # --- Values read lazily off ARCHIVING_CONFIG (export name -> attr name). ---
    # Note the *_USER_AGENT aliases all map to the single USER_AGENT setting.
    archiving_attrs = {
        'TIMEOUT': 'TIMEOUT',
        'MEDIA_TIMEOUT': 'MEDIA_TIMEOUT',
        'CHECK_SSL_VALIDITY': 'CHECK_SSL_VALIDITY',
        'COOKIES_FILE': 'COOKIES_FILE',
        'USER_AGENT': 'USER_AGENT',
        'CURL_USER_AGENT': 'USER_AGENT',
        'WGET_USER_AGENT': 'USER_AGENT',
        'CHROME_USER_AGENT': 'USER_AGENT',
        'RESOLUTION': 'RESOLUTION',
        'MEDIA_MAX_SIZE': 'MEDIA_MAX_SIZE',
        'SAVE_ALLOWLIST_PTN': 'SAVE_ALLOWLIST_PTNS',
        'SAVE_DENYLIST_PTN': 'SAVE_DENYLIST_PTNS',
    }
    if name in archiving_attrs:
        archiving, _ = _get_config()
        return getattr(archiving, archiving_attrs[name])

    # --- Storage settings. ---
    if name == 'RESTRICT_FILE_NAMES':
        _, storage = _get_config()
        return storage.RESTRICT_FILE_NAMES

    # --- Archive method toggles (SAVE_*): all default to enabled. ---
    save_toggles = frozenset({
        'SAVE_TITLE', 'SAVE_FAVICON', 'SAVE_WGET', 'SAVE_WARC',
        'SAVE_WGET_REQUISITES', 'SAVE_SINGLEFILE', 'SAVE_READABILITY',
        'SAVE_MERCURY', 'SAVE_HTMLTOTEXT', 'SAVE_PDF', 'SAVE_SCREENSHOT',
        'SAVE_DOM', 'SAVE_HEADERS', 'SAVE_GIT', 'SAVE_MEDIA',
        'SAVE_ARCHIVE_DOT_ORG',
    })
    if name in save_toggles:
        return True

    # --- Immutable scalar exports (safe to share one object across calls). ---
    static_values = {
        'GIT_DOMAINS': 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht',
        'FAVICON_PROVIDER': 'https://www.google.com/s2/favicons?domain={}',
        'WGET_AUTO_COMPRESSION': True,
        # Binary versions (placeholders; actual version detection happens elsewhere)
        'CURL_VERSION': 'curl',
        'WGET_VERSION': 'wget',
        'GIT_VERSION': 'git',
        'YOUTUBEDL_VERSION': 'yt-dlp',
        'CHROME_VERSION': 'chromium',
        'SINGLEFILE_VERSION': 'singlefile',
        'READABILITY_VERSION': 'readability',
        'MERCURY_VERSION': 'mercury',
    }
    if name in static_values:
        return static_values[name]

    # --- Binary paths: first candidate found on $PATH, else the first name. ---
    binary_candidates = {
        'CURL_BINARY': ['curl'],
        'WGET_BINARY': ['wget'],
        'GIT_BINARY': ['git'],
        'YOUTUBEDL_BINARY': ['yt-dlp', 'youtube-dl'],
        'CHROME_BINARY': ['chromium', 'chromium-browser', 'google-chrome', 'google-chrome-stable', 'chrome'],
        'NODE_BINARY': ['node'],
        'SINGLEFILE_BINARY': ['single-file', 'singlefile'],
        'READABILITY_BINARY': ['readability-extractor'],
        'MERCURY_BINARY': ['mercury-parser', 'postlight-parser'],
    }
    if name in binary_candidates:
        candidates = binary_candidates[name]
        for candidate in candidates:
            found = shutil.which(candidate)
            if found:
                return found
        return candidates[0]

    # --- Binary arguments: mutable containers, so build a fresh one per call. ---
    if name == 'CURL_ARGS':
        return ['--silent', '--location', '--compressed']
    if name == 'WGET_ARGS':
        return [
            '--no-verbose',
            '--adjust-extension',
            '--convert-links',
            '--force-directories',
            '--backup-converted',
            '--span-hosts',
            '--no-parent',
            '-e', 'robots=off',
        ]
    if name == 'GIT_ARGS':
        return ['--recursive']
    if name == 'YOUTUBEDL_ARGS':
        archiving, _ = _get_config()
        max_size = archiving.MEDIA_MAX_SIZE
        return [
            '--write-description',
            '--write-info-json',
            '--write-annotations',
            '--write-thumbnail',
            '--no-call-home',
            '--write-sub',
            '--write-auto-subs',
            '--convert-subs=srt',
            '--yes-playlist',
            '--continue',
            '--no-abort-on-error',
            '--ignore-errors',
            '--geo-bypass',
            '--add-metadata',
            f'--format=(bv*+ba/b)[filesize<={max_size}][filesize_approx<=?{max_size}]/(bv*+ba/b)',
        ]
    if name == 'SINGLEFILE_ARGS':
        return None  # Uses defaults
    if name == 'CHROME_ARGS':
        return []
    if name == 'DEPENDENCIES':
        return {}  # Legacy, not used anymore

    raise AttributeError(f"module 'archivebox.config' has no attribute '{name}'")
|
||||
|
||||
|
||||
# Re-export common config classes for direct imports
|
||||
def get_CONFIG():
    """Return every core config section object, keyed by its legacy section name."""
    from .common import (
        SHELL_CONFIG,
        STORAGE_CONFIG,
        GENERAL_CONFIG,
        SERVER_CONFIG,
        ARCHIVING_CONFIG,
        SEARCH_BACKEND_CONFIG,
    )

    sections = {
        'SHELL_CONFIG': SHELL_CONFIG,
        'STORAGE_CONFIG': STORAGE_CONFIG,
        'GENERAL_CONFIG': GENERAL_CONFIG,
        'SERVER_CONFIG': SERVER_CONFIG,
        'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
        # NOTE: legacy key intentionally has no underscore between SEARCH and BACKEND
        'SEARCHBACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
    }
    return sections
|
||||
|
||||
@@ -18,13 +18,8 @@ from archivebox.misc.logging import stderr
|
||||
|
||||
def get_real_name(key: str) -> str:
    """get the up-to-date canonical name for a given old alias or current key

    Config aliases are no longer used with the simplified config system, so
    the key is returned unchanged.  (The previous implementation iterated
    section.aliases via the abx plugin hook system, which has been removed;
    that dead code referenced an undefined `archivebox` name and is deleted.)
    """
    return key
|
||||
|
||||
|
||||
@@ -117,9 +112,20 @@ def load_config_file() -> Optional[benedict]:
|
||||
|
||||
|
||||
def section_for_key(key: str) -> Any:
    """Find the config section object that defines the given config key.

    Args:
        key: a config attribute name, e.g. 'TIMEOUT'.

    Returns:
        The first core config section that has an attribute named *key*.

    Raises:
        ValueError: if no core config section defines *key*.
    """
    # (Removed dead merge residue that looped over archivebox.pm.hook.get_CONFIGS()
    #  before this docstring; `archivebox` was not defined in this scope.)
    from archivebox.config.common import (
        SHELL_CONFIG,
        STORAGE_CONFIG,
        GENERAL_CONFIG,
        SERVER_CONFIG,
        ARCHIVING_CONFIG,
        SEARCH_BACKEND_CONFIG,
    )

    for section in (SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
                    SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG):
        if hasattr(section, key):
            return section
    raise ValueError(f'No config section found for key: {key}')
|
||||
|
||||
|
||||
@@ -178,7 +184,8 @@ def write_config_file(config: Dict[str, str]) -> benedict:
|
||||
updated_config = {}
|
||||
try:
|
||||
# validate the updated_config by attempting to re-parse it
|
||||
updated_config = {**load_all_config(), **archivebox.pm.hook.get_FLAT_CONFIG()}
|
||||
from archivebox.config.configset import get_flat_config
|
||||
updated_config = {**load_all_config(), **get_flat_config()}
|
||||
except BaseException: # lgtm [py/catch-base-exception]
|
||||
# something went horribly wrong, revert to the previous version
|
||||
with open(f'{config_path}.bak', 'r', encoding='utf-8') as old:
|
||||
@@ -236,12 +243,20 @@ def load_config(defaults: Dict[str, Any],
|
||||
return benedict(extended_config)
|
||||
|
||||
def load_all_config():
    """Load all core config sections and merge them into one flat benedict.

    Later sections override earlier ones on key collisions.  (Removed the dead
    `import abx` / abx.pm.hook loop left over from the old plugin system.)
    """
    from archivebox.config.common import (
        SHELL_CONFIG,
        STORAGE_CONFIG,
        GENERAL_CONFIG,
        SERVER_CONFIG,
        ARCHIVING_CONFIG,
        SEARCH_BACKEND_CONFIG,
    )

    flat_config = benedict()

    for config_section in (SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
                           SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG):
        flat_config.update(dict(config_section))

    return flat_config
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.config'
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import re
|
||||
import sys
|
||||
@@ -10,7 +10,7 @@ from rich import print
|
||||
from pydantic import Field, field_validator
|
||||
from django.utils.crypto import get_random_string
|
||||
|
||||
from abx_spec_config.base_configset import BaseConfigSet
|
||||
from archivebox.config.configset import BaseConfigSet
|
||||
|
||||
from .constants import CONSTANTS
|
||||
from .version import get_COMMIT_HASH, get_BUILD_TIME, VERSION
|
||||
@@ -20,109 +20,127 @@ from .permissions import IN_DOCKER
|
||||
|
||||
|
||||
class ShellConfig(BaseConfigSet):
    """Terminal/shell runtime settings: TTY detection, colors, container flags.

    NOTE: the botched merge left both the old lambda-style defaults and the new
    literal defaults in this class body; the new-style literal defaults (which
    match the simplified BaseConfigSet) are kept.
    """

    toml_section_header: str = "SHELL_CONFIG"

    # passing --debug on the CLI forces debug mode regardless of env/config file
    DEBUG: bool = Field(default="--debug" in sys.argv)

    IS_TTY: bool = Field(default=sys.stdout.isatty())
    USE_COLOR: bool = Field(default=sys.stdout.isatty())
    SHOW_PROGRESS: bool = Field(default=sys.stdout.isatty())

    IN_DOCKER: bool = Field(default=IN_DOCKER)
    IN_QEMU: bool = Field(default=False)

    # full color palette only when stdout is an interactive terminal
    ANSI: Dict[str, str] = Field(
        default_factory=lambda: CONSTANTS.DEFAULT_CLI_COLORS if sys.stdout.isatty() else CONSTANTS.DISABLED_CLI_COLORS
    )

    @property
    def TERM_WIDTH(self) -> int:
        # fixed wide width for non-interactive output (pipes, logs)
        if not self.IS_TTY:
            return 200
        return shutil.get_terminal_size((140, 10)).columns

    @property
    def COMMIT_HASH(self) -> Optional[str]:
        return get_COMMIT_HASH()

    @property
    def BUILD_TIME(self) -> str:
        return get_BUILD_TIME()
|
||||
|
||||
|
||||
|
||||
SHELL_CONFIG = ShellConfig()
|
||||
|
||||
|
||||
class StorageConfig(BaseConfigSet):
    """Filesystem layout and permission settings.

    NOTE: merge residue duplicated TMP_DIR/LIB_DIR/permission fields with both
    old and new defaults; the new-style literal defaults are kept.
    """

    toml_section_header: str = "STORAGE_CONFIG"

    # TMP_DIR must be a local, fast, readable/writable dir by archivebox user,
    # must be a short path due to unix path length restrictions for socket files (<100 chars)
    # must be a local SSD/tmpfs for speed and because bind mounts/network mounts/FUSE dont support unix sockets
    TMP_DIR: Path = Field(default=CONSTANTS.DEFAULT_TMP_DIR)

    # LIB_DIR must be a local, fast, readable/writable dir by archivebox user,
    # must be able to contain executable binaries (up to 5GB size)
    # should not be a remote/network/FUSE mount for speed reasons, otherwise extractors will be slow
    LIB_DIR: Path = Field(default=CONSTANTS.DEFAULT_LIB_DIR)

    OUTPUT_PERMISSIONS: str = Field(default="644")
    RESTRICT_FILE_NAMES: str = Field(default="windows")
    ENFORCE_ATOMIC_WRITES: bool = Field(default=True)

    # not supposed to be user settable:
    DIR_OUTPUT_PERMISSIONS: str = Field(default="755")  # computed from OUTPUT_PERMISSIONS
|
||||
|
||||
|
||||
STORAGE_CONFIG = StorageConfig()
|
||||
|
||||
|
||||
class GeneralConfig(BaseConfigSet):
    """Miscellaneous general settings.

    NOTE: merge residue duplicated TAG_SEPARATOR_PATTERN; single definition kept.
    """

    toml_section_header: str = "GENERAL_CONFIG"

    # regex used to split tag strings into individual tags
    TAG_SEPARATOR_PATTERN: str = Field(default=r"[,]")
|
||||
|
||||
|
||||
GENERAL_CONFIG = GeneralConfig()
|
||||
|
||||
|
||||
class ServerConfig(BaseConfigSet):
    """Django web server settings (auth, hosts, pagination, reverse proxy).

    NOTE: merge residue left both old lambda-style defaults (SECRET_KEY,
    BIND_ADDR, CSRF_TRUSTED_ORIGINS, ADMIN_* without Optional) and the new
    literal defaults; the new-style definitions are kept.
    """

    toml_section_header: str = "SERVER_CONFIG"

    SECRET_KEY: str = Field(default_factory=lambda: get_random_string(50, "abcdefghijklmnopqrstuvwxyz0123456789_"))
    BIND_ADDR: str = Field(default="127.0.0.1:8000")
    ALLOWED_HOSTS: str = Field(default="*")
    CSRF_TRUSTED_ORIGINS: str = Field(default="http://localhost:8000,http://127.0.0.1:8000,http://0.0.0.0:8000")

    SNAPSHOTS_PER_PAGE: int = Field(default=40)
    PREVIEW_ORIGINALS: bool = Field(default=True)
    FOOTER_INFO: str = Field(
        default="Content is hosted for personal archiving purposes only. Contact server owner for any takedown requests."
    )
    # CUSTOM_TEMPLATES_DIR: Path = Field(default=None)  # this is now a constant

    PUBLIC_INDEX: bool = Field(default=True)
    PUBLIC_SNAPSHOTS: bool = Field(default=True)
    PUBLIC_ADD_VIEW: bool = Field(default=False)

    ADMIN_USERNAME: Optional[str] = Field(default=None)
    ADMIN_PASSWORD: Optional[str] = Field(default=None)

    REVERSE_PROXY_USER_HEADER: str = Field(default="Remote-User")
    REVERSE_PROXY_WHITELIST: str = Field(default="")
    LOGOUT_REDIRECT_URL: str = Field(default="/")
|
||||
|
||||
|
||||
SERVER_CONFIG = ServerConfig()
|
||||
|
||||
|
||||
class ArchivingConfig(BaseConfigSet):
|
||||
ONLY_NEW: bool = Field(default=True)
|
||||
OVERWRITE: bool = Field(default=False)
|
||||
|
||||
TIMEOUT: int = Field(default=60)
|
||||
MEDIA_TIMEOUT: int = Field(default=3600)
|
||||
toml_section_header: str = "ARCHIVING_CONFIG"
|
||||
|
||||
ONLY_NEW: bool = Field(default=True)
|
||||
OVERWRITE: bool = Field(default=False)
|
||||
|
||||
TIMEOUT: int = Field(default=60)
|
||||
MEDIA_TIMEOUT: int = Field(default=3600)
|
||||
|
||||
MEDIA_MAX_SIZE: str = Field(default="750m")
|
||||
RESOLUTION: str = Field(default="1440,2000")
|
||||
CHECK_SSL_VALIDITY: bool = Field(default=True)
|
||||
USER_AGENT: str = Field(
|
||||
default=f"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)"
|
||||
)
|
||||
COOKIES_FILE: Path | None = Field(default=None)
|
||||
|
||||
URL_DENYLIST: str = Field(default=r"\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$", alias="URL_BLACKLIST")
|
||||
URL_ALLOWLIST: str | None = Field(default=None, alias="URL_WHITELIST")
|
||||
|
||||
SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={}) # mapping of regex patterns to list of archive methods
|
||||
SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})
|
||||
|
||||
DEFAULT_PERSONA: str = Field(default="Default")
|
||||
|
||||
MEDIA_MAX_SIZE: str = Field(default='750m')
|
||||
RESOLUTION: str = Field(default='1440,2000')
|
||||
CHECK_SSL_VALIDITY: bool = Field(default=True)
|
||||
USER_AGENT: str = Field(default=f'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)')
|
||||
COOKIES_FILE: Path | None = Field(default=None)
|
||||
|
||||
URL_DENYLIST: str = Field(default=r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$', alias='URL_BLACKLIST')
|
||||
URL_ALLOWLIST: str | None = Field(default=None, alias='URL_WHITELIST')
|
||||
|
||||
SAVE_ALLOWLIST: Dict[str, List[str]] = Field(default={}) # mapping of regex patterns to list of archive methods
|
||||
SAVE_DENYLIST: Dict[str, List[str]] = Field(default={})
|
||||
|
||||
DEFAULT_PERSONA: str = Field(default='Default')
|
||||
|
||||
# GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
|
||||
# WGET_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}')
|
||||
# CURL_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}')
|
||||
@@ -134,58 +152,70 @@ class ArchivingConfig(BaseConfigSet):
|
||||
|
||||
def validate(self):
|
||||
if int(self.TIMEOUT) < 5:
|
||||
print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]', file=sys.stderr)
|
||||
print(' You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.', file=sys.stderr)
|
||||
print(' (Setting it to somewhere between 30 and 3000 seconds is recommended)', file=sys.stderr)
|
||||
print(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
|
||||
print(" You must allow *at least* 5 seconds for indexing and archive methods to run succesfully.", file=sys.stderr)
|
||||
print(" (Setting it to somewhere between 30 and 3000 seconds is recommended)", file=sys.stderr)
|
||||
print(file=sys.stderr)
|
||||
print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:', file=sys.stderr)
|
||||
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles', file=sys.stderr)
|
||||
print(" If you want to make ArchiveBox run faster, disable specific archive methods instead:", file=sys.stderr)
|
||||
print(" https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles", file=sys.stderr)
|
||||
print(file=sys.stderr)
|
||||
|
||||
@field_validator('CHECK_SSL_VALIDITY', mode='after')
|
||||
|
||||
@field_validator("CHECK_SSL_VALIDITY", mode="after")
|
||||
def validate_check_ssl_validity(cls, v):
|
||||
"""SIDE EFFECT: disable "you really shouldnt disable ssl" warnings emitted by requests"""
|
||||
if not v:
|
||||
import requests
|
||||
import urllib3
|
||||
|
||||
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
return v
|
||||
|
||||
|
||||
@property
|
||||
def URL_ALLOWLIST_PTN(self) -> re.Pattern | None:
|
||||
return re.compile(self.URL_ALLOWLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS) if self.URL_ALLOWLIST else None
|
||||
|
||||
|
||||
@property
|
||||
def URL_DENYLIST_PTN(self) -> re.Pattern:
|
||||
return re.compile(self.URL_DENYLIST, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS)
|
||||
|
||||
|
||||
@property
|
||||
def SAVE_ALLOWLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
|
||||
return {
|
||||
# regexp: methods list
|
||||
re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
|
||||
for key, val in self.SAVE_ALLOWLIST.items()
|
||||
} if self.SAVE_ALLOWLIST else {}
|
||||
|
||||
return (
|
||||
{
|
||||
# regexp: methods list
|
||||
re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
|
||||
for key, val in self.SAVE_ALLOWLIST.items()
|
||||
}
|
||||
if self.SAVE_ALLOWLIST
|
||||
else {}
|
||||
)
|
||||
|
||||
@property
|
||||
def SAVE_DENYLIST_PTNS(self) -> Dict[re.Pattern, List[str]]:
|
||||
return {
|
||||
# regexp: methods list
|
||||
re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
|
||||
for key, val in self.SAVE_DENYLIST.items()
|
||||
} if self.SAVE_DENYLIST else {}
|
||||
return (
|
||||
{
|
||||
# regexp: methods list
|
||||
re.compile(key, CONSTANTS.ALLOWDENYLIST_REGEX_FLAGS): val
|
||||
for key, val in self.SAVE_DENYLIST.items()
|
||||
}
|
||||
if self.SAVE_DENYLIST
|
||||
else {}
|
||||
)
|
||||
|
||||
|
||||
ARCHIVING_CONFIG = ArchivingConfig()
|
||||
|
||||
|
||||
class SearchBackendConfig(BaseConfigSet):
    """Full-text search indexing/searching backend settings.

    NOTE: merge residue duplicated every field; single new-style definitions kept.
    """

    toml_section_header: str = "SEARCH_BACKEND_CONFIG"

    USE_INDEXING_BACKEND: bool = Field(default=True)
    USE_SEARCHING_BACKEND: bool = Field(default=True)

    SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep")
    SEARCH_PROCESS_HTML: bool = Field(default=True)
    SEARCH_BACKEND_TIMEOUT: int = Field(default=10)
|
||||
|
||||
|
||||
SEARCH_BACKEND_CONFIG = SearchBackendConfig()
|
||||
|
||||
|
||||
266
archivebox/config/configset.py
Normal file
266
archivebox/config/configset.py
Normal file
@@ -0,0 +1,266 @@
|
||||
"""
|
||||
Simplified config system for ArchiveBox.
|
||||
|
||||
This replaces the complex abx_spec_config/base_configset.py with a simpler
|
||||
approach that still supports environment variables, config files, and
|
||||
per-object overrides.
|
||||
"""
|
||||
|
||||
__package__ = "archivebox.config"
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from typing import Any, Dict, Optional, List, Type, TYPE_CHECKING, cast
|
||||
from configparser import ConfigParser
|
||||
|
||||
from pydantic import Field
|
||||
from pydantic_settings import BaseSettings
|
||||
|
||||
|
||||
class BaseConfigSet(BaseSettings):
    """
    Base class for config sections.

    Automatically loads values from:
      1. Environment variables (highest priority)
      2. ArchiveBox.conf file (if exists)
      3. Default values (lowest priority)

    Subclasses declare typed fields with defaults:

        class ShellConfig(BaseConfigSet):
            DEBUG: bool = Field(default=False)
            USE_COLOR: bool = Field(default=True)
    """

    class Config:
        # Use env vars with ARCHIVEBOX_ prefix or raw name
        env_prefix = ""
        extra = "ignore"
        validate_default = True

    @classmethod
    def load_from_file(cls, config_path: Path) -> Dict[str, str]:
        """Read an INI-style config file into a flat {UPPER_KEY: raw_value} dict.

        Returns {} when the file does not exist.  Keys from later sections
        override identical keys from earlier sections.
        """
        if not config_path.exists():
            return {}

        parser = ConfigParser()
        parser.optionxform = lambda option: option  # type: ignore  # keep key case as written
        parser.read(config_path)

        # collapse every [section] into a single upper-cased namespace
        flattened: Dict[str, str] = {}
        for section_name in parser.sections():
            for option, raw_value in parser.items(section_name):
                flattened[option.upper()] = raw_value
        return flattened

    def update_in_place(self, warn: bool = True, persist: bool = False, **kwargs) -> None:
        """Update config values on this instance at runtime without a reload.

        Only keys that already exist on the instance are applied; unknown keys
        are silently ignored.  `warn` and `persist` are accepted for API
        compatibility but currently unused.
        """
        for field_name, new_value in kwargs.items():
            if hasattr(self, field_name):
                # bypass pydantic's frozen-model protection
                object.__setattr__(self, field_name, new_value)
|
||||
|
||||
|
||||
def get_config(
    scope: str = "global",
    defaults: Optional[Dict] = None,
    user: Any = None,
    crawl: Any = None,
    snapshot: Any = None,
) -> Dict[str, Any]:
    """
    Get merged config from all sources.

    Priority (highest to lowest):
      1. Per-snapshot config (snapshot.config JSON field)
      2. Per-crawl config (crawl.config JSON field)
      3. Per-user config (user.config JSON field)
      4. Environment variables
      5. Config file (ArchiveBox.conf)
      6. Plugin schema defaults (config.json)
      7. Core config defaults

    Args:
        scope: Config scope ('global', 'crawl', 'snapshot', etc.)
        defaults: Default values to start with
        user: User object with config JSON field
        crawl: Crawl object with config JSON field
        snapshot: Snapshot object with config JSON field

    Returns:
        Merged config dict
    """
    from archivebox.config.constants import CONSTANTS
    from archivebox.config.common import (
        SHELL_CONFIG,
        STORAGE_CONFIG,
        GENERAL_CONFIG,
        SERVER_CONFIG,
        ARCHIVING_CONFIG,
        SEARCH_BACKEND_CONFIG,
    )

    merged: Dict[str, Any] = dict(defaults or {})

    # Plugin-declared defaults (from each plugin's config.json JSONSchema)
    try:
        from archivebox.hooks import get_config_defaults_from_plugins
        merged.update(get_config_defaults_from_plugins())
    except ImportError:
        pass  # hooks not available yet during early startup

    # Core config sections (later sections win on key collisions)
    for section in (SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
                    SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG):
        merged.update(dict(section))

    # Values from ArchiveBox.conf override the section defaults
    conf_file = CONSTANTS.CONFIG_FILE
    if conf_file.exists():
        merged.update(BaseConfigSet.load_from_file(conf_file))

    # Environment variables override anything file-based
    for key in merged:
        raw = os.environ.get(key)
        if raw is not None:
            merged[key] = _parse_env_value(raw, merged.get(key))

    # Plugin-declared aliases and fallbacks
    try:
        from archivebox.hooks import discover_plugin_configs
        for plugin_name, schema in discover_plugin_configs().items():
            for key, prop_schema in schema.get('properties', {}).items():
                # x-aliases: first alias present in env wins (unless key itself is set)
                for alias in prop_schema.get('x-aliases', []):
                    if alias in os.environ and key not in os.environ:
                        merged[key] = _parse_env_value(os.environ[alias], merged.get(key))
                        break
                # x-fallback: copy another key's value when this key is absent
                fallback = prop_schema.get('x-fallback')
                if fallback and fallback in merged and key not in merged:
                    merged[key] = merged[fallback]
    except ImportError:
        pass

    # Per-object overrides, applied lowest to highest priority: user < crawl < snapshot
    for obj in (user, crawl, snapshot):
        if obj and getattr(obj, "config", None):
            merged.update(obj.config)

    return merged
|
||||
|
||||
|
||||
def get_flat_config() -> Dict[str, Any]:
    """Return the merged global config as a single flat dict.

    Drop-in replacement for the old abx.pm.hook.get_FLAT_CONFIG() hook call.
    """
    return get_config(scope="global")
|
||||
|
||||
|
||||
def get_all_configs() -> Dict[str, "BaseConfigSet"]:
    """
    Get all config section objects as a dictionary.

    Replaces abx.pm.hook.get_CONFIGS().  Previously STORAGE_CONFIG and
    GENERAL_CONFIG were omitted despite the "all configs" contract (and in
    contrast to get_CONFIG() elsewhere in this package); they are now included.
    """
    from archivebox.config.common import (
        SHELL_CONFIG,
        STORAGE_CONFIG,
        GENERAL_CONFIG,
        SERVER_CONFIG,
        ARCHIVING_CONFIG,
        SEARCH_BACKEND_CONFIG,
    )
    return {
        'SHELL_CONFIG': SHELL_CONFIG,
        'STORAGE_CONFIG': STORAGE_CONFIG,
        'GENERAL_CONFIG': GENERAL_CONFIG,
        'SERVER_CONFIG': SERVER_CONFIG,
        'ARCHIVING_CONFIG': ARCHIVING_CONFIG,
        'SEARCH_BACKEND_CONFIG': SEARCH_BACKEND_CONFIG,
    }
|
||||
|
||||
|
||||
def _parse_env_value(value: str, default: Any = None) -> Any:
|
||||
"""Parse an environment variable value based on expected type."""
|
||||
if default is None:
|
||||
# Try to guess the type
|
||||
if value.lower() in ("true", "false", "yes", "no", "1", "0"):
|
||||
return value.lower() in ("true", "yes", "1")
|
||||
try:
|
||||
return int(value)
|
||||
except ValueError:
|
||||
pass
|
||||
try:
|
||||
return json.loads(value)
|
||||
except (json.JSONDecodeError, ValueError):
|
||||
pass
|
||||
return value
|
||||
|
||||
# Parse based on default's type
|
||||
if isinstance(default, bool):
|
||||
return value.lower() in ("true", "yes", "1")
|
||||
elif isinstance(default, int):
|
||||
return int(value)
|
||||
elif isinstance(default, float):
|
||||
return float(value)
|
||||
elif isinstance(default, (list, dict)):
|
||||
return json.loads(value)
|
||||
elif isinstance(default, Path):
|
||||
return Path(value)
|
||||
else:
|
||||
return value
|
||||
|
||||
|
||||
# Default number of concurrent tasks allowed per worker/extractor type.
# Overridable at runtime via the WORKER_CONCURRENCY config key.
DEFAULT_WORKER_CONCURRENCY = dict(
    crawl=2,
    snapshot=3,
    wget=2,
    ytdlp=2,
    screenshot=3,
    singlefile=2,
    title=5,
    favicon=5,
    headers=5,
    archive_org=2,
    readability=3,
    mercury=3,
    git=2,
    pdf=2,
    dom=3,
)
|
||||
|
||||
|
||||
def get_worker_concurrency() -> Dict[str, int]:
    """
    Return worker concurrency settings, merged with any user overrides.

    Starts from DEFAULT_WORKER_CONCURRENCY and layers the optional
    WORKER_CONCURRENCY config value (a dict, or a JSON string that
    decodes to one) on top of it.
    """
    settings = dict(DEFAULT_WORKER_CONCURRENCY)

    config = get_config()
    if "WORKER_CONCURRENCY" in config:
        overrides = config["WORKER_CONCURRENCY"]
        # Env-supplied values arrive as a JSON-encoded string.
        if isinstance(overrides, str):
            overrides = json.loads(overrides)
        settings.update(overrides)

    return settings
|
||||
@@ -1,6 +1,7 @@
|
||||
__package__ = 'abx.archivebox'
|
||||
__package__ = 'archivebox.config'
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import inspect
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Dict, cast
|
||||
@@ -13,14 +14,22 @@ from django.utils.html import format_html, mark_safe
|
||||
from admin_data_views.typing import TableContext, ItemContext
|
||||
from admin_data_views.utils import render_with_table_view, render_with_item_view, ItemLink
|
||||
|
||||
import abx
|
||||
import archivebox
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.misc.util import parse_date
|
||||
|
||||
from machine.models import InstalledBinary
|
||||
|
||||
|
||||
# Common binaries to probe for on $PATH when auto-detecting dependencies.
KNOWN_BINARIES = [
    # downloaders
    'wget', 'curl',
    # browsers
    'chromium', 'chrome', 'google-chrome', 'google-chrome-stable',
    # node toolchain
    'node', 'npm', 'npx',
    # media downloaders
    'yt-dlp', 'ytdlp', 'youtube-dl',
    # archiving/extraction tools
    'git', 'singlefile', 'readability-extractor', 'mercury-parser',
    # interpreters/shells
    'python3', 'python', 'bash', 'zsh',
    # misc tools
    'ffmpeg', 'ripgrep', 'rg', 'sonic', 'archivebox',
]
|
||||
|
||||
|
||||
def obj_to_yaml(obj: Any, indent: int=0) -> str:
|
||||
indent_str = " " * indent
|
||||
if indent == 0:
|
||||
@@ -62,65 +71,92 @@ def obj_to_yaml(obj: Any, indent: int=0) -> str:
|
||||
else:
|
||||
return f" {str(obj)}"
|
||||
|
||||
|
||||
def get_detected_binaries() -> Dict[str, Dict[str, Any]]:
    """
    Probe $PATH (via shutil.which) for each entry in KNOWN_BINARIES.

    Returns a dict keyed by binary name; only binaries actually found
    on $PATH are included.
    """
    found: Dict[str, Dict[str, Any]] = {}

    for binary_name in KNOWN_BINARIES:
        abspath = shutil.which(binary_name)
        if not abspath:
            continue
        found[binary_name] = {
            'name': binary_name,
            'abspath': abspath,
            'version': None,  # version probing not implemented yet
            'is_available': True,
        }

    return found
|
||||
|
||||
|
||||
def get_filesystem_plugins() -> Dict[str, Dict[str, Any]]:
    """
    Discover plugins by scanning the builtin and user plugin directories.

    A plugin is any non-underscore-prefixed subdirectory of
    BUILTIN_PLUGINS_DIR or USER_PLUGINS_DIR; its hook scripts are files
    named ``on_<event>__<name>.{sh,py,js}`` inside that directory.
    Plugin ids are namespaced as ``builtin.<name>`` / ``user.<name>``.
    """
    # Imported lazily to avoid circular imports at module load time.
    from archivebox.hooks import BUILTIN_PLUGINS_DIR, USER_PLUGINS_DIR

    discovered: Dict[str, Dict[str, Any]] = {}

    search_roots = ((BUILTIN_PLUGINS_DIR, 'builtin'), (USER_PLUGINS_DIR, 'user'))
    for base_dir, source in search_roots:
        if not base_dir.exists():
            continue

        for plugin_dir in base_dir.iterdir():
            if not plugin_dir.is_dir() or plugin_dir.name.startswith('_'):
                continue

            # Collect hook scripts of every supported extension.
            hook_scripts = [
                script
                for ext in ('sh', 'py', 'js')
                for script in plugin_dir.glob(f'on_*__*.{ext}')
            ]

            plugin_id = f'{source}.{plugin_dir.name}'
            discovered[plugin_id] = {
                'id': plugin_id,
                'name': plugin_dir.name,
                'path': str(plugin_dir),
                'source': source,
                'hooks': [script.name for script in hook_scripts],
            }

    return discovered
|
||||
|
||||
|
||||
@render_with_table_view
|
||||
def binaries_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
FLAT_CONFIG = archivebox.pm.hook.get_FLAT_CONFIG()
|
||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
|
||||
rows = {
|
||||
"Binary Name": [],
|
||||
"Found Version": [],
|
||||
"From Plugin": [],
|
||||
"Provided By": [],
|
||||
"Found Abspath": [],
|
||||
"Related Configuration": [],
|
||||
# "Overrides": [],
|
||||
# "Description": [],
|
||||
}
|
||||
|
||||
relevant_configs = {
|
||||
key: val
|
||||
for key, val in FLAT_CONFIG.items()
|
||||
if '_BINARY' in key or '_VERSION' in key
|
||||
}
|
||||
|
||||
for plugin_id, plugin in abx.get_all_plugins().items():
|
||||
plugin = benedict(plugin)
|
||||
if not hasattr(plugin.plugin, 'get_BINARIES'):
|
||||
continue
|
||||
# Get binaries from database (previously detected/installed)
|
||||
db_binaries = {b.name: b for b in InstalledBinary.objects.all()}
|
||||
|
||||
# Get currently detectable binaries
|
||||
detected = get_detected_binaries()
|
||||
|
||||
# Merge and display
|
||||
all_binary_names = sorted(set(list(db_binaries.keys()) + list(detected.keys())))
|
||||
|
||||
for name in all_binary_names:
|
||||
db_binary = db_binaries.get(name)
|
||||
detected_binary = detected.get(name)
|
||||
|
||||
for binary in plugin.plugin.get_BINARIES().values():
|
||||
try:
|
||||
installed_binary = InstalledBinary.objects.get_from_db_or_cache(binary)
|
||||
binary = installed_binary.load_from_db()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
rows['Binary Name'].append(ItemLink(binary.name, key=binary.name))
|
||||
rows['Found Version'].append(f'✅ {binary.loaded_version}' if binary.loaded_version else '❌ missing')
|
||||
rows['From Plugin'].append(plugin.package)
|
||||
rows['Provided By'].append(
|
||||
', '.join(
|
||||
f'[{binprovider.name}]' if binprovider.name == getattr(binary.loaded_binprovider, 'name', None) else binprovider.name
|
||||
for binprovider in binary.binproviders_supported
|
||||
if binprovider
|
||||
)
|
||||
# binary.loaded_binprovider.name
|
||||
# if binary.loaded_binprovider else
|
||||
# ', '.join(getattr(provider, 'name', str(provider)) for provider in binary.binproviders_supported)
|
||||
)
|
||||
rows['Found Abspath'].append(str(binary.loaded_abspath or '❌ missing'))
|
||||
rows['Related Configuration'].append(mark_safe(', '.join(
|
||||
f'<a href="/admin/environment/config/{config_key}/">{config_key}</a>'
|
||||
for config_key, config_value in relevant_configs.items()
|
||||
if str(binary.name).lower().replace('-', '').replace('_', '').replace('ytdlp', 'youtubedl') in config_key.lower()
|
||||
or config_value.lower().endswith(binary.name.lower())
|
||||
# or binary.name.lower().replace('-', '').replace('_', '') in str(config_value).lower()
|
||||
)))
|
||||
# if not binary.overrides:
|
||||
# import ipdb; ipdb.set_trace()
|
||||
# rows['Overrides'].append(str(obj_to_yaml(binary.overrides) or str(binary.overrides))[:200])
|
||||
# rows['Description'].append(binary.description)
|
||||
rows['Binary Name'].append(ItemLink(name, key=name))
|
||||
|
||||
if db_binary:
|
||||
rows['Found Version'].append(f'✅ {db_binary.version}' if db_binary.version else '✅ found')
|
||||
rows['Provided By'].append(db_binary.binprovider or 'PATH')
|
||||
rows['Found Abspath'].append(str(db_binary.abspath or ''))
|
||||
elif detected_binary:
|
||||
rows['Found Version'].append('✅ found')
|
||||
rows['Provided By'].append('PATH')
|
||||
rows['Found Abspath'].append(detected_binary['abspath'])
|
||||
else:
|
||||
rows['Found Version'].append('❌ missing')
|
||||
rows['Provided By'].append('-')
|
||||
rows['Found Abspath'].append('-')
|
||||
|
||||
return TableContext(
|
||||
title="Binaries",
|
||||
@@ -132,43 +168,65 @@ def binary_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
|
||||
assert request.user and request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
|
||||
binary = None
|
||||
plugin = None
|
||||
for plugin_id, plugin in abx.get_all_plugins().items():
|
||||
try:
|
||||
for loaded_binary in plugin['hooks'].get_BINARIES().values():
|
||||
if loaded_binary.name == key:
|
||||
binary = loaded_binary
|
||||
plugin = plugin
|
||||
# break # last write wins
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
assert plugin and binary, f'Could not find a binary matching the specified name: {key}'
|
||||
|
||||
# Try database first
|
||||
try:
|
||||
binary = binary.load()
|
||||
except Exception as e:
|
||||
print(e)
|
||||
|
||||
binary = InstalledBinary.objects.get(name=key)
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=key,
|
||||
data=[
|
||||
{
|
||||
"name": binary.name,
|
||||
"description": str(binary.abspath or ''),
|
||||
"fields": {
|
||||
'name': binary.name,
|
||||
'binprovider': binary.binprovider,
|
||||
'abspath': str(binary.abspath),
|
||||
'version': binary.version,
|
||||
'sha256': binary.sha256,
|
||||
},
|
||||
"help_texts": {},
|
||||
},
|
||||
],
|
||||
)
|
||||
except InstalledBinary.DoesNotExist:
|
||||
pass
|
||||
|
||||
# Try to detect from PATH
|
||||
path = shutil.which(key)
|
||||
if path:
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=key,
|
||||
data=[
|
||||
{
|
||||
"name": key,
|
||||
"description": path,
|
||||
"fields": {
|
||||
'name': key,
|
||||
'binprovider': 'PATH',
|
||||
'abspath': path,
|
||||
'version': 'unknown',
|
||||
},
|
||||
"help_texts": {},
|
||||
},
|
||||
],
|
||||
)
|
||||
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=key,
|
||||
data=[
|
||||
{
|
||||
"name": binary.name,
|
||||
"description": binary.abspath,
|
||||
"name": key,
|
||||
"description": "Binary not found",
|
||||
"fields": {
|
||||
'plugin': plugin['package'],
|
||||
'binprovider': binary.loaded_binprovider,
|
||||
'abspath': binary.loaded_abspath,
|
||||
'version': binary.loaded_version,
|
||||
'overrides': obj_to_yaml(binary.overrides),
|
||||
'providers': obj_to_yaml(binary.binproviders_supported),
|
||||
},
|
||||
"help_texts": {
|
||||
# TODO
|
||||
'name': key,
|
||||
'binprovider': 'not installed',
|
||||
'abspath': 'not found',
|
||||
'version': 'N/A',
|
||||
},
|
||||
"help_texts": {},
|
||||
},
|
||||
],
|
||||
)
|
||||
@@ -180,66 +238,26 @@ def plugins_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
|
||||
rows = {
|
||||
"Label": [],
|
||||
"Version": [],
|
||||
"Author": [],
|
||||
"Package": [],
|
||||
"Source Code": [],
|
||||
"Config": [],
|
||||
"Binaries": [],
|
||||
"Package Managers": [],
|
||||
# "Search Backends": [],
|
||||
"Name": [],
|
||||
"Source": [],
|
||||
"Path": [],
|
||||
"Hooks": [],
|
||||
}
|
||||
|
||||
config_colors = {
|
||||
'_BINARY': '#339',
|
||||
'USE_': 'green',
|
||||
'SAVE_': 'green',
|
||||
'_ARGS': '#33e',
|
||||
'KEY': 'red',
|
||||
'COOKIES': 'red',
|
||||
'AUTH': 'red',
|
||||
'SECRET': 'red',
|
||||
'TOKEN': 'red',
|
||||
'PASSWORD': 'red',
|
||||
'TIMEOUT': '#533',
|
||||
'RETRIES': '#533',
|
||||
'MAX': '#533',
|
||||
'MIN': '#533',
|
||||
}
|
||||
def get_color(key):
|
||||
for pattern, color in config_colors.items():
|
||||
if pattern in key:
|
||||
return color
|
||||
return 'black'
|
||||
plugins = get_filesystem_plugins()
|
||||
|
||||
for plugin_id, plugin in abx.get_all_plugins().items():
|
||||
plugin.hooks.get_BINPROVIDERS = getattr(plugin.plugin, 'get_BINPROVIDERS', lambda: {})
|
||||
plugin.hooks.get_BINARIES = getattr(plugin.plugin, 'get_BINARIES', lambda: {})
|
||||
plugin.hooks.get_CONFIG = getattr(plugin.plugin, 'get_CONFIG', lambda: {})
|
||||
|
||||
rows['Label'].append(ItemLink(plugin.label, key=plugin.package))
|
||||
rows['Version'].append(str(plugin.version))
|
||||
rows['Author'].append(mark_safe(f'<a href="{plugin.homepage}" target="_blank">{plugin.author}</a>'))
|
||||
rows['Package'].append(ItemLink(plugin.package, key=plugin.package))
|
||||
rows['Source Code'].append(format_html('<code>{}</code>', str(plugin.source_code).replace(str(Path('~').expanduser()), '~')))
|
||||
rows['Config'].append(mark_safe(''.join(
|
||||
f'<a href="/admin/environment/config/{key}/"><b><code style="color: {get_color(key)};">{key}</code></b>=<code>{value}</code></a><br/>'
|
||||
for configdict in plugin.hooks.get_CONFIG().values()
|
||||
for key, value in benedict(configdict).items()
|
||||
)))
|
||||
rows['Binaries'].append(mark_safe(', '.join(
|
||||
f'<a href="/admin/environment/binaries/{binary.name}/"><code>{binary.name}</code></a>'
|
||||
for binary in plugin.hooks.get_BINARIES().values()
|
||||
)))
|
||||
rows['Package Managers'].append(mark_safe(', '.join(
|
||||
f'<a href="/admin/environment/binproviders/{binprovider.name}/"><code>{binprovider.name}</code></a>'
|
||||
for binprovider in plugin.hooks.get_BINPROVIDERS().values()
|
||||
)))
|
||||
# rows['Search Backends'].append(mark_safe(', '.join(
|
||||
# f'<a href="/admin/environment/searchbackends/{searchbackend.name}/"><code>{searchbackend.name}</code></a>'
|
||||
# for searchbackend in plugin.SEARCHBACKENDS.values()
|
||||
# )))
|
||||
for plugin_id, plugin in plugins.items():
|
||||
rows['Name'].append(ItemLink(plugin['name'], key=plugin_id))
|
||||
rows['Source'].append(plugin['source'])
|
||||
rows['Path'].append(format_html('<code>{}</code>', plugin['path']))
|
||||
rows['Hooks'].append(', '.join(plugin['hooks']) or '(none)')
|
||||
|
||||
if not plugins:
|
||||
# Show a helpful message when no plugins found
|
||||
rows['Name'].append('(no plugins found)')
|
||||
rows['Source'].append('-')
|
||||
rows['Path'].append(format_html('<code>archivebox/plugins/</code> or <code>data/plugins/</code>'))
|
||||
rows['Hooks'].append('-')
|
||||
|
||||
return TableContext(
|
||||
title="Installed plugins",
|
||||
@@ -251,39 +269,31 @@ def plugin_detail_view(request: HttpRequest, key: str, **kwargs) -> ItemContext:
|
||||
|
||||
assert request.user.is_superuser, 'Must be a superuser to view configuration settings.'
|
||||
|
||||
plugins = abx.get_all_plugins()
|
||||
|
||||
plugin_id = None
|
||||
for check_plugin_id, loaded_plugin in plugins.items():
|
||||
if check_plugin_id.split('.')[-1] == key.split('.')[-1]:
|
||||
plugin_id = check_plugin_id
|
||||
break
|
||||
|
||||
assert plugin_id, f'Could not find a plugin matching the specified name: {key}'
|
||||
|
||||
plugin = abx.get_plugin(plugin_id)
|
||||
plugins = get_filesystem_plugins()
|
||||
|
||||
plugin = plugins.get(key)
|
||||
if not plugin:
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=f'Plugin not found: {key}',
|
||||
data=[],
|
||||
)
|
||||
|
||||
return ItemContext(
|
||||
slug=key,
|
||||
title=key,
|
||||
title=plugin['name'],
|
||||
data=[
|
||||
{
|
||||
"name": plugin.package,
|
||||
"description": plugin.label,
|
||||
"name": plugin['name'],
|
||||
"description": plugin['path'],
|
||||
"fields": {
|
||||
"id": plugin.id,
|
||||
"package": plugin.package,
|
||||
"label": plugin.label,
|
||||
"version": plugin.version,
|
||||
"author": plugin.author,
|
||||
"homepage": plugin.homepage,
|
||||
"dependencies": getattr(plugin, 'DEPENDENCIES', []),
|
||||
"source_code": plugin.source_code,
|
||||
"hooks": plugin.hooks,
|
||||
},
|
||||
"help_texts": {
|
||||
# TODO
|
||||
"id": plugin['id'],
|
||||
"name": plugin['name'],
|
||||
"source": plugin['source'],
|
||||
"path": plugin['path'],
|
||||
"hooks": plugin['hooks'],
|
||||
},
|
||||
"help_texts": {},
|
||||
},
|
||||
],
|
||||
)
|
||||
@@ -333,22 +343,6 @@ def worker_list_view(request: HttpRequest, **kwargs) -> TableContext:
|
||||
# Add a row for each worker process managed by supervisord
|
||||
for proc in cast(List[Dict[str, Any]], supervisor.getAllProcessInfo()):
|
||||
proc = benedict(proc)
|
||||
# {
|
||||
# "name": "daphne",
|
||||
# "group": "daphne",
|
||||
# "start": 1725933056,
|
||||
# "stop": 0,
|
||||
# "now": 1725933438,
|
||||
# "state": 20,
|
||||
# "statename": "RUNNING",
|
||||
# "spawnerr": "",
|
||||
# "exitstatus": 0,
|
||||
# "logfile": "logs/server.log",
|
||||
# "stdout_logfile": "logs/server.log",
|
||||
# "stderr_logfile": "",
|
||||
# "pid": 33283,
|
||||
# "description": "pid 33283, uptime 0:06:22",
|
||||
# }
|
||||
rows["Name"].append(ItemLink(proc.name, key=proc.name))
|
||||
rows["State"].append(proc.statename)
|
||||
rows['PID'].append(proc.description.replace('pid ', ''))
|
||||
|
||||
Reference in New Issue
Block a user