This commit is contained in:
Nick Sweeting
2025-12-28 17:51:54 -08:00
parent 54f91c1339
commit f0aa19fa7d
157 changed files with 6774 additions and 5061 deletions

View File

@@ -35,177 +35,41 @@ def _get_config():
# These are recalculated each time the module attribute is accessed
def __getattr__(name: str):
"""Module-level __getattr__ for lazy config loading."""
# Timeout settings
"""
Module-level __getattr__ for lazy config loading.
Only provides backwards compatibility for GENERIC/SHARED config.
Plugin-specific config (binaries, args, toggles) should come from plugin config.json files.
"""
# Generic timeout settings (used by multiple plugins)
if name == 'TIMEOUT':
cfg, _ = _get_config()
return cfg.TIMEOUT
if name == 'MEDIA_TIMEOUT':
cfg, _ = _get_config()
return cfg.MEDIA_TIMEOUT
# SSL/Security settings
# Generic SSL/Security settings (used by multiple plugins)
if name == 'CHECK_SSL_VALIDITY':
cfg, _ = _get_config()
return cfg.CHECK_SSL_VALIDITY
# Storage settings
# Generic storage settings (used by multiple plugins)
if name == 'RESTRICT_FILE_NAMES':
_, storage = _get_config()
return storage.RESTRICT_FILE_NAMES
# User agent / cookies
# Generic user agent / cookies (used by multiple plugins)
if name == 'COOKIES_FILE':
cfg, _ = _get_config()
return cfg.COOKIES_FILE
if name == 'USER_AGENT':
cfg, _ = _get_config()
return cfg.USER_AGENT
if name == 'CURL_USER_AGENT':
cfg, _ = _get_config()
return cfg.USER_AGENT
if name == 'WGET_USER_AGENT':
cfg, _ = _get_config()
return cfg.USER_AGENT
if name == 'CHROME_USER_AGENT':
cfg, _ = _get_config()
return cfg.USER_AGENT
# Archive method toggles (SAVE_*)
if name == 'SAVE_TITLE':
return True
if name == 'SAVE_FAVICON':
return True
if name == 'SAVE_WGET':
return True
if name == 'SAVE_WARC':
return True
if name == 'SAVE_WGET_REQUISITES':
return True
if name == 'SAVE_SINGLEFILE':
return True
if name == 'SAVE_READABILITY':
return True
if name == 'SAVE_MERCURY':
return True
if name == 'SAVE_HTMLTOTEXT':
return True
if name == 'SAVE_PDF':
return True
if name == 'SAVE_SCREENSHOT':
return True
if name == 'SAVE_DOM':
return True
if name == 'SAVE_HEADERS':
return True
if name == 'SAVE_GIT':
return True
if name == 'SAVE_MEDIA':
return True
if name == 'SAVE_ARCHIVE_DOT_ORG':
return True
# Extractor-specific settings
# Generic resolution settings (used by multiple plugins)
if name == 'RESOLUTION':
cfg, _ = _get_config()
return cfg.RESOLUTION
if name == 'GIT_DOMAINS':
return 'github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht'
if name == 'MEDIA_MAX_SIZE':
cfg, _ = _get_config()
return cfg.MEDIA_MAX_SIZE
if name == 'FAVICON_PROVIDER':
return 'https://www.google.com/s2/favicons?domain={}'
# Binary paths (use shutil.which for detection)
if name == 'CURL_BINARY':
return shutil.which('curl') or 'curl'
if name == 'WGET_BINARY':
return shutil.which('wget') or 'wget'
if name == 'GIT_BINARY':
return shutil.which('git') or 'git'
if name == 'YOUTUBEDL_BINARY':
return shutil.which('yt-dlp') or shutil.which('youtube-dl') or 'yt-dlp'
if name == 'CHROME_BINARY':
for chrome in ['chromium', 'chromium-browser', 'google-chrome', 'google-chrome-stable', 'chrome']:
path = shutil.which(chrome)
if path:
return path
return 'chromium'
if name == 'NODE_BINARY':
return shutil.which('node') or 'node'
if name == 'SINGLEFILE_BINARY':
return shutil.which('single-file') or shutil.which('singlefile') or 'single-file'
if name == 'READABILITY_BINARY':
return shutil.which('readability-extractor') or 'readability-extractor'
if name == 'MERCURY_BINARY':
return shutil.which('mercury-parser') or shutil.which('postlight-parser') or 'mercury-parser'
# Binary versions (return placeholder, actual version detection happens elsewhere)
if name == 'CURL_VERSION':
return 'curl'
if name == 'WGET_VERSION':
return 'wget'
if name == 'GIT_VERSION':
return 'git'
if name == 'YOUTUBEDL_VERSION':
return 'yt-dlp'
if name == 'CHROME_VERSION':
return 'chromium'
if name == 'SINGLEFILE_VERSION':
return 'singlefile'
if name == 'READABILITY_VERSION':
return 'readability'
if name == 'MERCURY_VERSION':
return 'mercury'
# Binary arguments
if name == 'CURL_ARGS':
return ['--silent', '--location', '--compressed']
if name == 'WGET_ARGS':
return [
'--no-verbose',
'--adjust-extension',
'--convert-links',
'--force-directories',
'--backup-converted',
'--span-hosts',
'--no-parent',
'-e', 'robots=off',
]
if name == 'GIT_ARGS':
return ['--recursive']
if name == 'YOUTUBEDL_ARGS':
cfg, _ = _get_config()
return [
'--write-description',
'--write-info-json',
'--write-annotations',
'--write-thumbnail',
'--no-call-home',
'--write-sub',
'--write-auto-subs',
'--convert-subs=srt',
'--yes-playlist',
'--continue',
'--no-abort-on-error',
'--ignore-errors',
'--geo-bypass',
'--add-metadata',
f'--format=(bv*+ba/b)[filesize<={cfg.MEDIA_MAX_SIZE}][filesize_approx<=?{cfg.MEDIA_MAX_SIZE}]/(bv*+ba/b)',
]
if name == 'SINGLEFILE_ARGS':
return None # Uses defaults
if name == 'CHROME_ARGS':
return []
# Other settings
if name == 'WGET_AUTO_COMPRESSION':
return True
if name == 'DEPENDENCIES':
return {} # Legacy, not used anymore
# Allowlist/Denylist patterns (compiled regexes)
if name == 'SAVE_ALLOWLIST_PTN':
cfg, _ = _get_config()
@@ -213,7 +77,7 @@ def __getattr__(name: str):
if name == 'SAVE_DENYLIST_PTN':
cfg, _ = _get_config()
return cfg.SAVE_DENYLIST_PTNS
raise AttributeError(f"module 'archivebox.config' has no attribute '{name}'")