mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 07:17:52 +10:00
wip
This commit is contained in:
@@ -35,177 +35,41 @@ def _get_config():
|
||||
# These are recalculated each time the module attribute is accessed
|
||||
|
||||
def __getattr__(name: str):
|
||||
"""Module-level __getattr__ for lazy config loading."""
|
||||
|
||||
# Timeout settings
|
||||
"""
|
||||
Module-level __getattr__ for lazy config loading.
|
||||
|
||||
Only provides backwards compatibility for GENERIC/SHARED config.
|
||||
Plugin-specific config (binaries, args, toggles) should come from plugin config.json files.
|
||||
"""
|
||||
|
||||
# Generic timeout settings (used by multiple plugins)
|
||||
if name == 'TIMEOUT':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.TIMEOUT
|
||||
if name == 'MEDIA_TIMEOUT':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.MEDIA_TIMEOUT
|
||||
|
||||
# SSL/Security settings
|
||||
|
||||
# Generic SSL/Security settings (used by multiple plugins)
|
||||
if name == 'CHECK_SSL_VALIDITY':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.CHECK_SSL_VALIDITY
|
||||
|
||||
# Storage settings
|
||||
|
||||
# Generic storage settings (used by multiple plugins)
|
||||
if name == 'RESTRICT_FILE_NAMES':
|
||||
_, storage = _get_config()
|
||||
return storage.RESTRICT_FILE_NAMES
|
||||
|
||||
# User agent / cookies
|
||||
|
||||
# Generic user agent / cookies (used by multiple plugins)
|
||||
if name == 'COOKIES_FILE':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.COOKIES_FILE
|
||||
if name == 'USER_AGENT':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.USER_AGENT
|
||||
if name == 'CURL_USER_AGENT':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.USER_AGENT
|
||||
if name == 'WGET_USER_AGENT':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.USER_AGENT
|
||||
if name == 'CHROME_USER_AGENT':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.USER_AGENT
|
||||
|
||||
# Archive method toggles (SAVE_*)
|
||||
if name == 'SAVE_TITLE':
|
||||
return True
|
||||
if name == 'SAVE_FAVICON':
|
||||
return True
|
||||
if name == 'SAVE_WGET':
|
||||
return True
|
||||
if name == 'SAVE_WARC':
|
||||
return True
|
||||
if name == 'SAVE_WGET_REQUISITES':
|
||||
return True
|
||||
if name == 'SAVE_SINGLEFILE':
|
||||
return True
|
||||
if name == 'SAVE_READABILITY':
|
||||
return True
|
||||
if name == 'SAVE_MERCURY':
|
||||
return True
|
||||
if name == 'SAVE_HTMLTOTEXT':
|
||||
return True
|
||||
if name == 'SAVE_PDF':
|
||||
return True
|
||||
if name == 'SAVE_SCREENSHOT':
|
||||
return True
|
||||
if name == 'SAVE_DOM':
|
||||
return True
|
||||
if name == 'SAVE_HEADERS':
|
||||
return True
|
||||
if name == 'SAVE_GIT':
|
||||
return True
|
||||
if name == 'SAVE_MEDIA':
|
||||
return True
|
||||
if name == 'SAVE_ARCHIVE_DOT_ORG':
|
||||
return True
|
||||
|
||||
# Extractor-specific settings
|
||||
|
||||
# Generic resolution settings (used by multiple plugins)
|
||||
if name == 'RESOLUTION':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.RESOLUTION
|
||||
if name == 'GIT_DOMAINS':
|
||||
return 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'
|
||||
if name == 'MEDIA_MAX_SIZE':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.MEDIA_MAX_SIZE
|
||||
if name == 'FAVICON_PROVIDER':
|
||||
return 'https://www.google.com/s2/favicons?domain={}'
|
||||
|
||||
# Binary paths (use shutil.which for detection)
|
||||
if name == 'CURL_BINARY':
|
||||
return shutil.which('curl') or 'curl'
|
||||
if name == 'WGET_BINARY':
|
||||
return shutil.which('wget') or 'wget'
|
||||
if name == 'GIT_BINARY':
|
||||
return shutil.which('git') or 'git'
|
||||
if name == 'YOUTUBEDL_BINARY':
|
||||
return shutil.which('yt-dlp') or shutil.which('youtube-dl') or 'yt-dlp'
|
||||
if name == 'CHROME_BINARY':
|
||||
for chrome in ['chromium', 'chromium-browser', 'google-chrome', 'google-chrome-stable', 'chrome']:
|
||||
path = shutil.which(chrome)
|
||||
if path:
|
||||
return path
|
||||
return 'chromium'
|
||||
if name == 'NODE_BINARY':
|
||||
return shutil.which('node') or 'node'
|
||||
if name == 'SINGLEFILE_BINARY':
|
||||
return shutil.which('single-file') or shutil.which('singlefile') or 'single-file'
|
||||
if name == 'READABILITY_BINARY':
|
||||
return shutil.which('readability-extractor') or 'readability-extractor'
|
||||
if name == 'MERCURY_BINARY':
|
||||
return shutil.which('mercury-parser') or shutil.which('postlight-parser') or 'mercury-parser'
|
||||
|
||||
# Binary versions (return placeholder, actual version detection happens elsewhere)
|
||||
if name == 'CURL_VERSION':
|
||||
return 'curl'
|
||||
if name == 'WGET_VERSION':
|
||||
return 'wget'
|
||||
if name == 'GIT_VERSION':
|
||||
return 'git'
|
||||
if name == 'YOUTUBEDL_VERSION':
|
||||
return 'yt-dlp'
|
||||
if name == 'CHROME_VERSION':
|
||||
return 'chromium'
|
||||
if name == 'SINGLEFILE_VERSION':
|
||||
return 'singlefile'
|
||||
if name == 'READABILITY_VERSION':
|
||||
return 'readability'
|
||||
if name == 'MERCURY_VERSION':
|
||||
return 'mercury'
|
||||
|
||||
# Binary arguments
|
||||
if name == 'CURL_ARGS':
|
||||
return ['--silent', '--location', '--compressed']
|
||||
if name == 'WGET_ARGS':
|
||||
return [
|
||||
'--no-verbose',
|
||||
'--adjust-extension',
|
||||
'--convert-links',
|
||||
'--force-directories',
|
||||
'--backup-converted',
|
||||
'--span-hosts',
|
||||
'--no-parent',
|
||||
'-e', 'robots=off',
|
||||
]
|
||||
if name == 'GIT_ARGS':
|
||||
return ['--recursive']
|
||||
if name == 'YOUTUBEDL_ARGS':
|
||||
cfg, _ = _get_config()
|
||||
return [
|
||||
'--write-description',
|
||||
'--write-info-json',
|
||||
'--write-annotations',
|
||||
'--write-thumbnail',
|
||||
'--no-call-home',
|
||||
'--write-sub',
|
||||
'--write-auto-subs',
|
||||
'--convert-subs=srt',
|
||||
'--yes-playlist',
|
||||
'--continue',
|
||||
'--no-abort-on-error',
|
||||
'--ignore-errors',
|
||||
'--geo-bypass',
|
||||
'--add-metadata',
|
||||
f'--format=(bv*+ba/b)[filesize<={cfg.MEDIA_MAX_SIZE}][filesize_approx<=?{cfg.MEDIA_MAX_SIZE}]/(bv*+ba/b)',
|
||||
]
|
||||
if name == 'SINGLEFILE_ARGS':
|
||||
return None # Uses defaults
|
||||
if name == 'CHROME_ARGS':
|
||||
return []
|
||||
|
||||
# Other settings
|
||||
if name == 'WGET_AUTO_COMPRESSION':
|
||||
return True
|
||||
if name == 'DEPENDENCIES':
|
||||
return {} # Legacy, not used anymore
|
||||
|
||||
|
||||
# Allowlist/Denylist patterns (compiled regexes)
|
||||
if name == 'SAVE_ALLOWLIST_PTN':
|
||||
cfg, _ = _get_config()
|
||||
@@ -213,7 +77,7 @@ def __getattr__(name: str):
|
||||
if name == 'SAVE_DENYLIST_PTN':
|
||||
cfg, _ = _get_config()
|
||||
return cfg.SAVE_DENYLIST_PTNS
|
||||
|
||||
|
||||
raise AttributeError(f"module 'archivebox.config' has no attribute '{name}'")
|
||||
|
||||
|
||||
|
||||
@@ -111,6 +111,24 @@ def load_config_file() -> Optional[benedict]:
|
||||
return None
|
||||
|
||||
|
||||
class PluginConfigSection:
    """Pseudo-section for all plugin config keys written to [PLUGINS] section in ArchiveBox.conf.

    An instance stands in for a real config section for exactly one plugin
    config key: ``hasattr(section, key)`` is true for that key only, and
    reading it yields ``None``.
    """

    # Name of the section header that plugin keys are grouped under.
    toml_section_header = "PLUGINS"

    def __init__(self, key: str):
        """Remember the single plugin config key this pseudo-section represents."""
        self._key = key

    def __getattr__(self, name: str) -> Any:
        """Return ``None`` for the wrapped key so hasattr checks pass for it.

        Raises:
            AttributeError: for every other attribute name.
        """
        # __getattr__ only runs after normal lookup has failed. Read `_key`
        # straight from the instance dict rather than via `self._key`: on an
        # instance whose __init__ has not run (e.g. one created by
        # copy/pickle through __new__), `self._key` would re-enter
        # __getattr__ and recurse until RecursionError instead of raising
        # AttributeError.
        if name == self.__dict__.get('_key'):
            return None
        raise AttributeError(f"PluginConfigSection has no attribute '{name}'")

    def update_in_place(self, warn: bool = True, persist: bool = False, **kwargs):
        """No-op update since plugins read config dynamically via get_config()."""
        # Intentionally does nothing; all arguments are accepted and ignored.
        return None
|
||||
|
||||
|
||||
def section_for_key(key: str) -> Any:
|
||||
"""Find the config section containing a given key."""
|
||||
from archivebox.config.common import (
|
||||
@@ -121,11 +139,22 @@ def section_for_key(key: str) -> Any:
|
||||
ARCHIVING_CONFIG,
|
||||
SEARCH_BACKEND_CONFIG,
|
||||
)
|
||||
|
||||
for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
|
||||
|
||||
# First check core config sections
|
||||
for section in [SHELL_CONFIG, STORAGE_CONFIG, GENERAL_CONFIG,
|
||||
SERVER_CONFIG, ARCHIVING_CONFIG, SEARCH_BACKEND_CONFIG]:
|
||||
if hasattr(section, key):
|
||||
return section
|
||||
|
||||
# Check if this is a plugin config key
|
||||
from archivebox.hooks import discover_plugin_configs
|
||||
|
||||
plugin_configs = discover_plugin_configs()
|
||||
for plugin_name, schema in plugin_configs.items():
|
||||
if 'properties' in schema and key in schema['properties']:
|
||||
# All plugin config goes to [PLUGINS] section
|
||||
return PluginConfigSection(key)
|
||||
|
||||
raise ValueError(f'No config section found for key: {key}')
|
||||
|
||||
|
||||
|
||||
@@ -123,9 +123,7 @@ class ArchivingConfig(BaseConfigSet):
|
||||
OVERWRITE: bool = Field(default=False)
|
||||
|
||||
TIMEOUT: int = Field(default=60)
|
||||
MEDIA_TIMEOUT: int = Field(default=3600)
|
||||
|
||||
MEDIA_MAX_SIZE: str = Field(default="750m")
|
||||
RESOLUTION: str = Field(default="1440,2000")
|
||||
CHECK_SSL_VALIDITY: bool = Field(default=True)
|
||||
USER_AGENT: str = Field(
|
||||
@@ -141,15 +139,6 @@ class ArchivingConfig(BaseConfigSet):
|
||||
|
||||
DEFAULT_PERSONA: str = Field(default="Default")
|
||||
|
||||
# GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
|
||||
# WGET_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}')
|
||||
# CURL_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}')
|
||||
# CHROME_USER_AGENT: str = Field(default=lambda c: c['USER_AGENT'])
|
||||
# CHROME_USER_DATA_DIR: str | None = Field(default=None)
|
||||
# CHROME_TIMEOUT: int = Field(default=0)
|
||||
# CHROME_HEADLESS: bool = Field(default=True)
|
||||
# CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
|
||||
|
||||
def validate(self):
|
||||
if int(self.TIMEOUT) < 5:
|
||||
print(f"[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.TIMEOUT} seconds)[/red]", file=sys.stderr)
|
||||
@@ -215,7 +204,6 @@ class SearchBackendConfig(BaseConfigSet):
|
||||
|
||||
SEARCH_BACKEND_ENGINE: str = Field(default="ripgrep")
|
||||
SEARCH_PROCESS_HTML: bool = Field(default=True)
|
||||
SEARCH_BACKEND_TIMEOUT: int = Field(default=10)
|
||||
|
||||
|
||||
SEARCH_BACKEND_CONFIG = SearchBackendConfig()
|
||||
|
||||
@@ -174,7 +174,7 @@ def get_config(
|
||||
config.update(dict(ARCHIVING_CONFIG))
|
||||
config.update(dict(SEARCH_BACKEND_CONFIG))
|
||||
|
||||
# Load from config file
|
||||
# Load from archivebox.config.file
|
||||
config_file = CONSTANTS.CONFIG_FILE
|
||||
if config_file.exists():
|
||||
file_config = BaseConfigSet.load_from_file(config_file)
|
||||
|
||||
@@ -17,7 +17,7 @@ from admin_data_views.utils import render_with_table_view, render_with_item_view
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.misc.util import parse_date
|
||||
|
||||
from machine.models import Binary
|
||||
from archivebox.machine.models import Binary
|
||||
|
||||
|
||||
# Common binaries to check for
|
||||
|
||||
Reference in New Issue
Block a user