move almost all config into new archivebox.CONSTANTS

This commit is contained in:
Nick Sweeting
2024-09-25 05:10:09 -07:00
parent f5e8d99fdf
commit bb65b2dbec
32 changed files with 982 additions and 840 deletions

View File

@@ -26,10 +26,7 @@ import io
import re
import sys
import json
import inspect
import getpass
import shutil
import requests
import archivebox
from hashlib import md5
@@ -38,7 +35,6 @@ from datetime import datetime, timezone
from typing import Optional, Type, Tuple, Dict
from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
from configparser import ConfigParser
import importlib.metadata
from pydantic_pkgr import SemVer
from rich.progress import Progress
@@ -49,7 +45,6 @@ from django.db.backends.sqlite3.base import Database as sqlite3
from .config_stubs import (
AttrDict,
SimpleConfigValueDict,
ConfigValue,
ConfigDict,
ConfigDefaultValue,
@@ -61,7 +56,7 @@ from .misc.logging import (
ANSI,
COLOR_DICT,
stderr,
hint,
hint, # noqa
)
# print('STARTING CONFIG LOADING')
@@ -165,8 +160,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'},
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']}, # + ' curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']}, # + ' wget/{WGET_VERSION}'},
'COOKIES_FILE': {'type': str, 'default': None},
@@ -254,12 +249,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CURL_BINARY': {'type': str, 'default': 'curl'},
'GIT_BINARY': {'type': str, 'default': 'git'},
'WGET_BINARY': {'type': str, 'default': 'wget'}, # also can accept wget2
'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
'NODE_BINARY': {'type': str, 'default': 'node'},
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
# 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
# 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
# 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
# 'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
@@ -308,212 +303,16 @@ CONFIG_FILENAME = 'ArchiveBox.conf'
STATICFILE_EXTENSIONS = {
# 99.999% of the time, URLs ending in these extensions are static files
# that can be downloaded as-is, not html pages that need to be rendered
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
'atom', 'rss', 'css', 'js', 'json',
'dmg', 'iso', 'img',
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
# Less common extensions to consider adding later
# jar, swf, bin, com, exe, dll, deb
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
# These are always treated as pages, not as static files, never add them:
# html, htm, shtml, xhtml, xml, aspx, php, cgi
}
# When initializing archivebox in a new directory, we check to make sure the dir is
actually empty so that we don't clobber someone's home directory or desktop by accident.
# These files are exceptions to the is_empty check when we're trying to init a new dir,
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
ALLOWED_IN_OUTPUT_DIR = {
".gitignore",
"lost+found",
".DS_Store",
".venv",
"venv",
"virtualenv",
".virtualenv",
"node_modules",
"package.json",
"package-lock.json",
"yarn.lock",
"static",
"sonic",
"search.sqlite3",
CRONTABS_DIR_NAME,
ARCHIVE_DIR_NAME,
SOURCES_DIR_NAME,
LOGS_DIR_NAME,
CACHE_DIR_NAME,
LIB_DIR_NAME,
PERSONAS_DIR_NAME,
SQL_INDEX_FILENAME,
f"{SQL_INDEX_FILENAME}-wal",
f"{SQL_INDEX_FILENAME}-shm",
"queue.sqlite3",
"queue.sqlite3-wal",
"queue.sqlite3-shm",
JSON_INDEX_FILENAME,
HTML_INDEX_FILENAME,
ROBOTS_TXT_FILENAME,
FAVICON_FILENAME,
CONFIG_FILENAME,
f"{CONFIG_FILENAME}.bak",
"static_index.json",
}
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
CONSTANTS = {
"PACKAGE_DIR_NAME": {'default': lambda c: PACKAGE_DIR_NAME},
"LIB_DIR_NAME": {'default': lambda c: LIB_DIR_NAME},
"TEMPLATES_DIR_NAME": {'default': lambda c: TEMPLATES_DIR_NAME},
"ARCHIVE_DIR_NAME": {'default': lambda c: ARCHIVE_DIR_NAME},
"SOURCES_DIR_NAME": {'default': lambda c: SOURCES_DIR_NAME},
"LOGS_DIR_NAME": {'default': lambda c: LOGS_DIR_NAME},
"CACHE_DIR_NAME": {'default': lambda c: CACHE_DIR_NAME},
"PERSONAS_DIR_NAME": {'default': lambda c: PERSONAS_DIR_NAME},
"CRONTABS_DIR_NAME": {'default': lambda c: CRONTABS_DIR_NAME},
"SQL_INDEX_FILENAME": {'default': lambda c: SQL_INDEX_FILENAME},
"JSON_INDEX_FILENAME": {'default': lambda c: JSON_INDEX_FILENAME},
"HTML_INDEX_FILENAME": {'default': lambda c: HTML_INDEX_FILENAME},
"ROBOTS_TXT_FILENAME": {'default': lambda c: ROBOTS_TXT_FILENAME},
"FAVICON_FILENAME": {'default': lambda c: FAVICON_FILENAME},
"CONFIG_FILENAME": {'default': lambda c: CONFIG_FILENAME},
"DEFAULT_CLI_COLORS": {'default': lambda c: DEFAULT_CLI_COLORS},
"ANSI": {'default': lambda c: ANSI},
"COLOR_DICT": {'default': lambda c: COLOR_DICT},
"STATICFILE_EXTENSIONS": {'default': lambda c: STATICFILE_EXTENSIONS},
"ALLOWED_IN_OUTPUT_DIR": {'default': lambda c: ALLOWED_IN_OUTPUT_DIR},
# "ALLOWDENYLIST_REGEX_FLAGS": {'default': lambda c: ALLOWDENYLIST_REGEX_FLAGS},
}
CONSTANTS = archivebox.CONSTANTS._asdict()
############################## Version Config ##################################
def get_system_user() -> str:
    """Return the username of the current OS user, or a uid-based placeholder.

    Some host OS's are unable to provide a username (k3s, Windows), making this
    complicated; uid 999 is especially problematic and breaks many attempts,
    so every lookup below is best-effort.
    """
    SYSTEM_USER = None
    FALLBACK_USER_PLACEHOLDER = f'user_{os.getuid()}'

    # Option 1: the pwd database (POSIX only; the import itself fails on Windows)
    try:
        import pwd
        SYSTEM_USER = SYSTEM_USER or pwd.getpwuid(os.geteuid()).pw_name
    except Exception:
        # best-effort: fall through to the next lookup method
        pass

    # Option 2: getpass (checks LOGNAME/USER/LNAME/USERNAME env vars first)
    try:
        SYSTEM_USER = SYSTEM_USER or getpass.getuser()
    except Exception:
        pass

    # Option 3: the controlling terminal's login name
    try:
        SYSTEM_USER = SYSTEM_USER or os.getlogin()
    except Exception:
        pass

    return SYSTEM_USER or FALLBACK_USER_PLACEHOLDER
def get_version(config):
    """Return the installed ArchiveBox version string.

    Tries the installed package metadata first, then falls back to parsing
    pyproject.toml next to the package (e.g. when running from a source
    checkout). Raises if no version can be determined.
    """
    try:
        return importlib.metadata.version(__package__ or 'archivebox')
    except importlib.metadata.PackageNotFoundError:
        try:
            pyproject_config = (config['PACKAGE_DIR'] / 'pyproject.toml').read_text()
            # iterate over lines, not characters: read_text() returns one big str
            for line in pyproject_config.splitlines():
                if line.startswith('version = '):
                    return line.split(' = ', 1)[-1].strip('"')
        except FileNotFoundError:
            # building docs, pyproject.toml is not available
            return 'dev'

    raise Exception('Failed to detect installed archivebox version!')
def get_commit_hash(config) -> Optional[str]:
    """Best-effort lookup of the git commit hash of the source checkout.

    Returns None when the package is not running from a git checkout (or the
    repo layout is unexpected).
    """
    # Attempt 1: follow the ref named in .git/HEAD to its hash file
    try:
        repo_git_dir = config['PACKAGE_DIR'] / '../.git'
        head_ref = (repo_git_dir / 'HEAD').read_text().strip().split(' ')[-1]
        return repo_git_dir.joinpath(head_ref).read_text().strip()
    except Exception:
        pass

    # Attempt 2: grab the first branch head file we can find
    try:
        branch_heads = (config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*')
        return list(branch_heads)[0].read_text().strip()
    except Exception:
        pass

    return None
def get_build_time(config) -> str:
    """Return a human-readable build timestamp for this install.

    Inside Docker this is the BUILD_END_TIME recorded in /VERSION.txt by the
    image build; otherwise it falls back to the mtime of the package's config.py.
    """
    if config['IN_DOCKER']:
        version_txt = Path('/VERSION.txt').read_text()
        return version_txt.rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]

    # NOTE(review): '%s' (epoch seconds) is a glibc strftime extension — not portable
    config_py_mtime = (config['PACKAGE_DIR'] / 'config.py').stat().st_mtime
    return datetime.fromtimestamp(config_py_mtime).strftime('%Y-%m-%d %H:%M:%S %s')
def get_versions_available_on_github(config):
    """
    returns a dictionary containing the ArchiveBox GitHub release info for
    the recommended upgrade version and the currently installed version,
    or None when the check is skipped or fails
    """
    # we only want to perform the (relatively expensive) check for new versions
    # when its most relevant, e.g. when the user runs a long-running command
    subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
    long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
    if subcommand_run_by_user not in long_running_commands:
        return None

    github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
    try:
        # timeout so a slow/unreachable GitHub can never hang a long-running command
        response = requests.get(github_releases_api, timeout=10)
    except Exception as err:
        stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! ({err})', color='lightyellow', config=config)
        return None
    if response.status_code != 200:
        stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
        return None
    all_releases = response.json()
    if not all_releases:
        # no releases published at all: nothing to compare against (also keeps
        # `idx` below from being unbound and all_releases[-1] from raising)
        return None

    installed_version = parse_version_string(config['VERSION'])

    # find current version or nearest older version (to link to)
    current_version = None
    for idx, release in enumerate(all_releases):
        release_version = parse_version_string(release['tag_name'])
        if release_version <= installed_version:
            current_version = release
            break
    current_version = current_version or all_releases[-1]

    # recommended version is whatever comes after current_version in the release list
    # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
    try:
        recommended_version = all_releases[idx+1]
    except IndexError:
        recommended_version = None

    return {'recommended_version': recommended_version, 'current_version': current_version}
def can_upgrade(config):
    """Return True if a newer ArchiveBox release than the installed one is available."""
    available = config['VERSIONS_AVAILABLE']
    # no release info fetched (or no newer release known): nothing to upgrade to
    if not (available and available['recommended_version']):
        return False
    newest = parse_version_string(available['recommended_version']['tag_name'])
    installed = parse_version_string(available['current_version']['tag_name'])
    return newest > installed
############################## Derived Config ##################################
@@ -523,55 +322,25 @@ def can_upgrade(config):
# These are derived/computed values calculated *after* all user-provided config values are ingested
# they appear in `archivebox config` output and are intended to be read-only for the user
DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
**CONSTANTS,
**{
key: {'default': lambda c: val}
for key, val in archivebox.CONSTANTS.items()
},
'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
'USER': {'default': lambda c: get_system_user()},
'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else AttrDict({k: '' for k in DEFAULT_CLI_COLORS.keys()})},
'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent},
'PACKAGE_DIR': {'default': lambda c: archivebox.PACKAGE_DIR.resolve()},
'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
'CUSTOM_TEMPLATES_DIR': {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])},
'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
'CACHE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME},
'LIB_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME},
'BIN_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME / 'bin'},
'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
'NODE_BIN_PATH': {'default': lambda c: str((Path(c["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))},
'VERSION': {'default': lambda c: get_version(c).split('+', 1)[0]}, # remove +editable from user-displayed version string
'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)}, # short git commit hash of codebase HEAD commit
'BUILD_TIME': {'default': lambda c: get_build_time(c)}, # docker build completed time or python src last modified time
'VERSIONS_AVAILABLE': {'default': lambda c: False}, # get_versions_available_on_github(c)},
'CAN_UPGRADE': {'default': lambda c: False}, # can_upgrade(c)},
'PYTHON_BINARY': {'default': lambda c: sys.executable},
'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
'DJANGO_VERSION': {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])},
'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
#'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'}, # set at runtime below, interesting if changed later but unused for now because its always expected to be wal
#'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']}, # set at runtime below
'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
# 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
@@ -580,23 +349,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
# 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
# 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
@@ -605,21 +365,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
'USE_NODE': {'default': lambda c: True},
'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
# 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
# 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
@@ -696,12 +447,10 @@ def load_config_val(key: str,
raise Exception('Config values can only be str, bool, int, or json')
def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
def load_config_file(out_dir: str | None=archivebox.DATA_DIR) -> Optional[ConfigDict]:
"""load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
assert out_dir and out_dir.is_dir()
config_path = Path(out_dir) / CONFIG_FILENAME
config_path = archivebox.CONSTANTS.CONFIG_FILE
if config_path.exists():
config_file = ConfigParser()
config_file.optionxform = str
@@ -718,7 +467,7 @@ def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
return None
def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> ConfigDict:
def write_config_file(config: Dict[str, str], out_dir: str | None=archivebox.DATA_DIR) -> ConfigDict:
"""load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
from .system import atomic_write
@@ -737,8 +486,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> Confi
""")
out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
config_path = Path(out_dir) / CONFIG_FILENAME
config_path = archivebox.CONSTANTS.CONFIG_FILE
if not config_path.exists():
atomic_write(config_path, CONFIG_HEADER)
@@ -833,7 +581,7 @@ def load_config(defaults: ConfigDefaultDict,
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
stderr()
# raise
raise SystemExit(2)
# raise SystemExit(2)
return AttrDict(extended_config)
@@ -984,98 +732,6 @@ def wget_supports_compression(config):
except (FileNotFoundError, OSError):
return False
def get_code_locations(config: 'ConfigDict') -> 'SimpleConfigValueDict':
    """Describe where the ArchiveBox code directories live and whether each looks valid."""
    package_dir = config['PACKAGE_DIR']
    templates_dir = config['TEMPLATES_DIR']
    lib_dir = config['LIB_DIR']
    return {
        'PACKAGE_DIR': {
            'path': package_dir.resolve(),
            'enabled': True,
            # a real package dir must contain the CLI entrypoint
            'is_valid': (package_dir / '__main__.py').exists(),
        },
        'TEMPLATES_DIR': {
            'path': templates_dir.resolve(),
            'enabled': True,
            # templates ship with a static/ assets subdir
            'is_valid': (templates_dir / 'static').exists(),
        },
        'LIB_DIR': {
            'path': lib_dir.resolve(),
            'enabled': True,
            'is_valid': lib_dir.is_dir(),
        },
        # 'NODE_MODULES_DIR': {
        #     'path': ,
        #     'enabled': ,
        #     'is_valid': (...).exists(),
        # },
    }
def get_data_locations(config: 'ConfigDict') -> 'ConfigValue':
    """Report each user-data location: its path, whether it is enabled, and whether it looks valid.

    Entries that carry an "is_mount" key also record whether the resolved path is a
    filesystem mount point (via os.path.ismount).
    """
    return {
        # OLD: migrating to personas
        # 'CHROME_USER_DATA_DIR': {
        #     'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
        #     'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
        #     'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
        # },
        # 'COOKIES_FILE': {
        #     'path': os.path.abspath(config['COOKIES_FILE']),
        #     'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
        #     'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
        # },
        "OUTPUT_DIR": {
            "path": config["OUTPUT_DIR"].resolve(),
            "enabled": True,
            # the collection only counts as valid once the SQLite index exists inside it
            "is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(),
            "is_mount": os.path.ismount(config["OUTPUT_DIR"].resolve()),
        },
        "CONFIG_FILE": {
            "path": config["CONFIG_FILE"].resolve(),
            "enabled": True,
            "is_valid": config["CONFIG_FILE"].exists(),
        },
        "SQL_INDEX": {
            "path": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve(),
            "enabled": True,
            "is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(),
            "is_mount": os.path.ismount((config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve()),
        },
        "ARCHIVE_DIR": {
            "path": config["ARCHIVE_DIR"].resolve(),
            "enabled": True,
            "is_valid": config["ARCHIVE_DIR"].exists(),
            "is_mount": os.path.ismount(config["ARCHIVE_DIR"].resolve()),
        },
        "SOURCES_DIR": {
            "path": config["SOURCES_DIR"].resolve(),
            "enabled": True,
            "is_valid": config["SOURCES_DIR"].exists(),
        },
        "PERSONAS_DIR": {
            "path": config["PERSONAS_DIR"].resolve(),
            "enabled": True,
            "is_valid": config["PERSONAS_DIR"].exists(),
        },
        "LOGS_DIR": {
            "path": config["LOGS_DIR"].resolve(),
            "enabled": True,
            "is_valid": config["LOGS_DIR"].exists(),
        },
        "CACHE_DIR": {
            "path": config["CACHE_DIR"].resolve(),
            "enabled": True,
            "is_valid": config["CACHE_DIR"].exists(),
        },
        # CUSTOM_TEMPLATES_DIR is optional, so path/is_valid stay None/falsy when unset
        "CUSTOM_TEMPLATES_DIR": {
            "path": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).resolve(),
            "enabled": bool(config["CUSTOM_TEMPLATES_DIR"]),
            "is_valid": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).exists(),
        },
        # managed by bin/docker_entrypoint.sh and python-crontab:
        # 'CRONTABS_DIR': {
        #     'path': config['CRONTABS_DIR'].resolve(),
        #     'enabled': True,
        #     'is_valid': config['CRONTABS_DIR'].exists(),
        # },
    }
def get_dependency_info(config: ConfigDict) -> ConfigValue:
return {
@@ -1129,20 +785,6 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
'enabled': config['USE_NODE'],
'is_valid': bool(config['NODE_VERSION']),
},
'SINGLEFILE_BINARY': {
'path': bin_path(config['SINGLEFILE_BINARY']),
'version': config['SINGLEFILE_VERSION'],
'hash': bin_hash(config['SINGLEFILE_BINARY']),
'enabled': config['USE_SINGLEFILE'],
'is_valid': bool(config['SINGLEFILE_VERSION']),
},
'READABILITY_BINARY': {
'path': bin_path(config['READABILITY_BINARY']),
'version': config['READABILITY_VERSION'],
'hash': bin_hash(config['READABILITY_BINARY']),
'enabled': config['USE_READABILITY'],
'is_valid': bool(config['READABILITY_VERSION']),
},
'MERCURY_BINARY': {
'path': bin_path(config['MERCURY_BINARY']),
'version': config['MERCURY_VERSION'],
@@ -1157,13 +799,27 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
'enabled': config['USE_GIT'],
'is_valid': bool(config['GIT_VERSION']),
},
'YOUTUBEDL_BINARY': {
'path': bin_path(config['YOUTUBEDL_BINARY']),
'version': config['YOUTUBEDL_VERSION'],
'hash': bin_hash(config['YOUTUBEDL_BINARY']),
'enabled': config['USE_YOUTUBEDL'],
'is_valid': bool(config['YOUTUBEDL_VERSION']),
},
# 'SINGLEFILE_BINARY': {
# 'path': bin_path(config['SINGLEFILE_BINARY']),
# 'version': config['SINGLEFILE_VERSION'],
# 'hash': bin_hash(config['SINGLEFILE_BINARY']),
# 'enabled': config['USE_SINGLEFILE'],
# 'is_valid': bool(config['SINGLEFILE_VERSION']),
# },
# 'READABILITY_BINARY': {
# 'path': bin_path(config['READABILITY_BINARY']),
# 'version': config['READABILITY_VERSION'],
# 'hash': bin_hash(config['READABILITY_BINARY']),
# 'enabled': config['USE_READABILITY'],
# 'is_valid': bool(config['READABILITY_VERSION']),
# },
# 'YOUTUBEDL_BINARY': {
# 'path': bin_path(config['YOUTUBEDL_BINARY']),
# 'version': config['YOUTUBEDL_VERSION'],
# 'hash': bin_hash(config['YOUTUBEDL_BINARY']),
# 'enabled': config['USE_YOUTUBEDL'],
# 'is_valid': bool(config['YOUTUBEDL_VERSION']),
# },
# 'CHROME_BINARY': {
# 'path': bin_path(config['CHROME_BINARY']),
# 'version': config['CHROME_VERSION'],
@@ -1227,10 +883,6 @@ assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC' # n
os.environ["TZ"] = TIMEZONE # noqa: F821
os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
sys.path.append(CONFIG.NODE_BIN_PATH)
########################### Config Validity Checkers ###########################
if not CONFIG.USE_COLOR:
@@ -1256,6 +908,7 @@ def bump_startup_progress_bar():
def setup_django_minimal():
sys.path.append(str(archivebox.PACKAGE_DIR))
os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
django.setup()
@@ -1267,29 +920,18 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
output_dir = out_dir or Path(config['OUTPUT_DIR'])
output_dir = out_dir or archivebox.DATA_DIR
assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)
assert isinstance(output_dir, Path) and isinstance(archivebox.PACKAGE_DIR, Path)
bump_startup_progress_bar()
try:
from django.core.management import call_command
sys.path.append(str(config['PACKAGE_DIR']))
os.environ.setdefault('OUTPUT_DIR', str(output_dir))
assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
sys.path.append(str(archivebox.PACKAGE_DIR))
os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
# Check to make sure JSON extension is available in our Sqlite3 instance
try:
cursor = sqlite3.connect(':memory:').cursor()
cursor.execute('SELECT JSON(\'{"a": "b"}\')')
except sqlite3.OperationalError as exc:
stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
hint([
'Upgrade your Python version or install the extension manually:',
'https://code.djangoproject.com/wiki/JSON1Extension'
])
bump_startup_progress_bar()
@@ -1310,28 +952,16 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
bump_startup_progress_bar()
from django.conf import settings
from plugins_sys.config.apps import SHELL_CONFIG
# log startup message to the error log
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")
f.write(f"\n> {command}; TS={ts} VERSION={archivebox.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
if check_db:
# Enable WAL mode in sqlite3
from django.db import connection
with connection.cursor() as cursor:
# Set Journal mode to WAL to allow for multiple writers
current_mode = cursor.execute("PRAGMA journal_mode")
if current_mode != 'wal':
cursor.execute("PRAGMA journal_mode=wal;")
# Set max blocking delay for concurrent writes and write sync mode
# https://litestream.io/tips/#busy-timeout
cursor.execute("PRAGMA busy_timeout = 5000;")
cursor.execute("PRAGMA synchronous = NORMAL;")
# Create cache table in DB if needed
try:
from django.core.cache import cache
@@ -1348,9 +978,9 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
for conn in connections.all():
conn.close_if_unusable_or_obsolete()
sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
sql_index_path = archivebox.CONSTANTS.DATABASE_FILE
assert sql_index_path.exists(), (
f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
f'No database file {sql_index_path} found in: {archivebox.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
bump_startup_progress_bar()
@@ -1363,7 +993,7 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
logfire.configure()
logfire.instrument_django(is_sql_commentor_enabled=True)
logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
logfire.info(f'Started ArchiveBox v{archivebox.VERSION}', argv=sys.argv)
except KeyboardInterrupt:
raise SystemExit(2)