mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-04 14:57:56 +10:00
move almost all config into new archivebox.CONSTANTS
This commit is contained in:
@@ -26,10 +26,7 @@ import io
|
||||
import re
|
||||
import sys
|
||||
import json
|
||||
import inspect
|
||||
import getpass
|
||||
import shutil
|
||||
import requests
|
||||
import archivebox
|
||||
|
||||
from hashlib import md5
|
||||
@@ -38,7 +35,6 @@ from datetime import datetime, timezone
|
||||
from typing import Optional, Type, Tuple, Dict
|
||||
from subprocess import run, PIPE, DEVNULL, STDOUT, TimeoutExpired
|
||||
from configparser import ConfigParser
|
||||
import importlib.metadata
|
||||
|
||||
from pydantic_pkgr import SemVer
|
||||
from rich.progress import Progress
|
||||
@@ -49,7 +45,6 @@ from django.db.backends.sqlite3.base import Database as sqlite3
|
||||
|
||||
from .config_stubs import (
|
||||
AttrDict,
|
||||
SimpleConfigValueDict,
|
||||
ConfigValue,
|
||||
ConfigDict,
|
||||
ConfigDefaultValue,
|
||||
@@ -61,7 +56,7 @@ from .misc.logging import (
|
||||
ANSI,
|
||||
COLOR_DICT,
|
||||
stderr,
|
||||
hint,
|
||||
hint, # noqa
|
||||
)
|
||||
|
||||
# print('STARTING CONFIG LOADING')
|
||||
@@ -165,8 +160,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
|
||||
|
||||
'USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/128.0.0.0 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
|
||||
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' curl/{CURL_VERSION}'},
|
||||
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT'] + ' wget/{WGET_VERSION}'},
|
||||
'CURL_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']}, # + ' curl/{CURL_VERSION}'},
|
||||
'WGET_USER_AGENT': {'type': str, 'default': lambda c: c['USER_AGENT']}, # + ' wget/{WGET_VERSION}'},
|
||||
|
||||
'COOKIES_FILE': {'type': str, 'default': None},
|
||||
|
||||
@@ -254,12 +249,12 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
|
||||
'CURL_BINARY': {'type': str, 'default': 'curl'},
|
||||
'GIT_BINARY': {'type': str, 'default': 'git'},
|
||||
'WGET_BINARY': {'type': str, 'default': 'wget'}, # also can accept wget2
|
||||
'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
|
||||
'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
|
||||
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('postlight-parser')},
|
||||
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
|
||||
'NODE_BINARY': {'type': str, 'default': 'node'},
|
||||
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
|
||||
# 'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'}, # also can accept youtube-dl
|
||||
# 'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
|
||||
# 'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
|
||||
# 'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
|
||||
|
||||
'POCKET_CONSUMER_KEY': {'type': str, 'default': None},
|
||||
'POCKET_ACCESS_TOKENS': {'type': dict, 'default': {}},
|
||||
@@ -308,212 +303,16 @@ CONFIG_FILENAME = 'ArchiveBox.conf'
|
||||
|
||||
|
||||
|
||||
STATICFILE_EXTENSIONS = {
|
||||
# 99.999% of the time, URLs ending in these extensions are static files
|
||||
# that can be downloaded as-is, not html pages that need to be rendered
|
||||
'gif', 'jpeg', 'jpg', 'png', 'tif', 'tiff', 'wbmp', 'ico', 'jng', 'bmp',
|
||||
'svg', 'svgz', 'webp', 'ps', 'eps', 'ai',
|
||||
'mp3', 'mp4', 'm4a', 'mpeg', 'mpg', 'mkv', 'mov', 'webm', 'm4v',
|
||||
'flv', 'wmv', 'avi', 'ogg', 'ts', 'm3u8',
|
||||
'pdf', 'txt', 'rtf', 'rtfd', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx',
|
||||
'atom', 'rss', 'css', 'js', 'json',
|
||||
'dmg', 'iso', 'img',
|
||||
'rar', 'war', 'hqx', 'zip', 'gz', 'bz2', '7z',
|
||||
|
||||
# Less common extensions to consider adding later
|
||||
# jar, swf, bin, com, exe, dll, deb
|
||||
# ear, hqx, eot, wmlc, kml, kmz, cco, jardiff, jnlp, run, msi, msp, msm,
|
||||
# pl pm, prc pdb, rar, rpm, sea, sit, tcl tk, der, pem, crt, xpi, xspf,
|
||||
# ra, mng, asx, asf, 3gpp, 3gp, mid, midi, kar, jad, wml, htc, mml
|
||||
|
||||
# These are always treated as pages, not as static files, never add them:
|
||||
# html, htm, shtml, xhtml, xml, aspx, php, cgi
|
||||
}
|
||||
|
||||
# When initializing archivebox in a new directory, we check to make sure the dir is
|
||||
# actually empty so that we dont clobber someone's home directory or desktop by accident.
|
||||
# These files are exceptions to the is_empty check when we're trying to init a new dir,
|
||||
# as they could be from a previous archivebox version, system artifacts, dependencies, etc.
|
||||
ALLOWED_IN_OUTPUT_DIR = {
|
||||
".gitignore",
|
||||
"lost+found",
|
||||
".DS_Store",
|
||||
".venv",
|
||||
"venv",
|
||||
"virtualenv",
|
||||
".virtualenv",
|
||||
"node_modules",
|
||||
"package.json",
|
||||
"package-lock.json",
|
||||
"yarn.lock",
|
||||
"static",
|
||||
"sonic",
|
||||
"search.sqlite3",
|
||||
CRONTABS_DIR_NAME,
|
||||
ARCHIVE_DIR_NAME,
|
||||
SOURCES_DIR_NAME,
|
||||
LOGS_DIR_NAME,
|
||||
CACHE_DIR_NAME,
|
||||
LIB_DIR_NAME,
|
||||
PERSONAS_DIR_NAME,
|
||||
SQL_INDEX_FILENAME,
|
||||
f"{SQL_INDEX_FILENAME}-wal",
|
||||
f"{SQL_INDEX_FILENAME}-shm",
|
||||
"queue.sqlite3",
|
||||
"queue.sqlite3-wal",
|
||||
"queue.sqlite3-shm",
|
||||
JSON_INDEX_FILENAME,
|
||||
HTML_INDEX_FILENAME,
|
||||
ROBOTS_TXT_FILENAME,
|
||||
FAVICON_FILENAME,
|
||||
CONFIG_FILENAME,
|
||||
f"{CONFIG_FILENAME}.bak",
|
||||
"static_index.json",
|
||||
}
|
||||
|
||||
|
||||
ALLOWDENYLIST_REGEX_FLAGS: int = re.IGNORECASE | re.UNICODE | re.MULTILINE
|
||||
|
||||
|
||||
CONSTANTS = {
|
||||
"PACKAGE_DIR_NAME": {'default': lambda c: PACKAGE_DIR_NAME},
|
||||
"LIB_DIR_NAME": {'default': lambda c: LIB_DIR_NAME},
|
||||
"TEMPLATES_DIR_NAME": {'default': lambda c: TEMPLATES_DIR_NAME},
|
||||
"ARCHIVE_DIR_NAME": {'default': lambda c: ARCHIVE_DIR_NAME},
|
||||
"SOURCES_DIR_NAME": {'default': lambda c: SOURCES_DIR_NAME},
|
||||
"LOGS_DIR_NAME": {'default': lambda c: LOGS_DIR_NAME},
|
||||
"CACHE_DIR_NAME": {'default': lambda c: CACHE_DIR_NAME},
|
||||
"PERSONAS_DIR_NAME": {'default': lambda c: PERSONAS_DIR_NAME},
|
||||
"CRONTABS_DIR_NAME": {'default': lambda c: CRONTABS_DIR_NAME},
|
||||
"SQL_INDEX_FILENAME": {'default': lambda c: SQL_INDEX_FILENAME},
|
||||
"JSON_INDEX_FILENAME": {'default': lambda c: JSON_INDEX_FILENAME},
|
||||
"HTML_INDEX_FILENAME": {'default': lambda c: HTML_INDEX_FILENAME},
|
||||
"ROBOTS_TXT_FILENAME": {'default': lambda c: ROBOTS_TXT_FILENAME},
|
||||
"FAVICON_FILENAME": {'default': lambda c: FAVICON_FILENAME},
|
||||
"CONFIG_FILENAME": {'default': lambda c: CONFIG_FILENAME},
|
||||
"DEFAULT_CLI_COLORS": {'default': lambda c: DEFAULT_CLI_COLORS},
|
||||
"ANSI": {'default': lambda c: ANSI},
|
||||
"COLOR_DICT": {'default': lambda c: COLOR_DICT},
|
||||
"STATICFILE_EXTENSIONS": {'default': lambda c: STATICFILE_EXTENSIONS},
|
||||
"ALLOWED_IN_OUTPUT_DIR": {'default': lambda c: ALLOWED_IN_OUTPUT_DIR},
|
||||
# "ALLOWDENYLIST_REGEX_FLAGS": {'default': lambda c: ALLOWDENYLIST_REGEX_FLAGS},
|
||||
}
|
||||
CONSTANTS = archivebox.CONSTANTS._asdict()
|
||||
|
||||
############################## Version Config ##################################
|
||||
|
||||
def get_system_user() -> str:
    """Return the username of the current system user.

    Some host OSes are unable to provide a username at all (k3s, Windows),
    and uid 999 in particular breaks many of the usual lookups, so several
    strategies are tried in order before falling back to a placeholder
    derived from the numeric uid.
    """
    system_user = None
    # used only when every lookup below fails (e.g. a uid with no passwd entry)
    fallback_user_placeholder = f'user_{os.getuid()}'

    # Option 1: passwd database lookup (pwd is POSIX-only, hence the guarded import)
    try:
        import pwd
        system_user = system_user or pwd.getpwuid(os.geteuid()).pw_name
    except Exception:
        # catches ModuleNotFoundError (no pwd on Windows) and KeyError (no passwd entry)
        pass

    # Option 2: getpass checks env vars (LOGNAME, USER, ...) before the passwd db
    try:
        system_user = system_user or getpass.getuser()
    except Exception:
        pass

    # Option 3: controlling-terminal login name (often fails in containers/daemons)
    try:
        system_user = system_user or os.getlogin()
    except Exception:
        pass

    return system_user or fallback_user_placeholder
|
||||
|
||||
def get_version(config):
    """Return the installed ArchiveBox version string.

    Tries installed package metadata first, then falls back to parsing
    pyproject.toml (e.g. for editable/source checkouts), then 'dev' when
    pyproject.toml is absent (e.g. while building docs).

    Raises:
        Exception: if a pyproject.toml exists but contains no version line.
    """
    try:
        return importlib.metadata.version(__package__ or 'archivebox')
    except importlib.metadata.PackageNotFoundError:
        try:
            pyproject_config = (config['PACKAGE_DIR'] / 'pyproject.toml').read_text()
            # BUGFIX: iterate lines, not characters — iterating the raw string
            # yielded single chars so the startswith() check could never match
            for line in pyproject_config.splitlines():
                if line.startswith('version = '):
                    return line.split(' = ', 1)[-1].strip('"')
        except FileNotFoundError:
            # building docs, pyproject.toml is not available
            return 'dev'

    raise Exception('Failed to detect installed archivebox version!')
|
||||
|
||||
def get_commit_hash(config) -> Optional[str]:
    """Return the git commit hash of the checked-out codebase, or None.

    Reads .git/HEAD next to the package dir and resolves the ref it points
    at; failing that, falls back to the first branch ref file found under
    refs/heads/. Returns None when not running from a git checkout.
    """
    git_dir = config['PACKAGE_DIR'] / '../.git'

    # Preferred: HEAD -> ref path -> hash stored in the ref file
    try:
        head_ref = (git_dir / 'HEAD').read_text().strip().split(' ')[-1]
        return git_dir.joinpath(head_ref).read_text().strip()
    except Exception:
        pass

    # Fallback: hash from whichever branch ref file is found first
    try:
        branch_refs = (config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*')
        return next(iter(branch_refs)).read_text().strip()
    except Exception:
        pass

    return None
|
||||
|
||||
def get_build_time(config) -> str:
    """Return a human-readable build timestamp for this install.

    In Docker this is the BUILD_END_TIME value recorded in /VERSION.txt at
    image build time; otherwise it falls back to the last-modified time of
    the package's config.py source file.
    """
    if config['IN_DOCKER']:
        # /VERSION.txt is written during the docker image build
        version_txt = Path('/VERSION.txt').read_text()
        return version_txt.rsplit('BUILD_END_TIME=')[-1].split('\n', 1)[0]

    config_src_mtime = (config['PACKAGE_DIR'] / 'config.py').stat().st_mtime
    build_time = datetime.fromtimestamp(config_src_mtime)
    return build_time.strftime('%Y-%m-%d %H:%M:%S %s')
|
||||
|
||||
def get_versions_available_on_github(config):
    """
    returns a dictionary containing the ArchiveBox GitHub release info for
    the recommended upgrade version and the currently installed version,
    or None when the check is skipped or the API call fails
    """

    # we only want to perform the (relatively expensive) check for new versions
    # when its most relevant, e.g. when the user runs a long-running command
    # NOTE(review): sys.argv[3] assumes the subcommand is always the 4th CLI
    # token — confirm against the CLI entrypoint's argv layout
    subcommand_run_by_user = sys.argv[3] if len(sys.argv) > 3 else 'help'
    long_running_commands = ('add', 'schedule', 'update', 'status', 'server')
    if subcommand_run_by_user not in long_running_commands:
        return None

    github_releases_api = "https://api.github.com/repos/ArchiveBox/ArchiveBox/releases"
    response = requests.get(github_releases_api)
    if response.status_code != 200:
        stderr(f'[!] Warning: GitHub API call to check for new ArchiveBox version failed! (status={response.status_code})', color='lightyellow', config=config)
        return None
    all_releases = response.json()
    # BUGFIX: an empty release list previously crashed below with
    # IndexError / unbound loop index
    if not all_releases:
        return None

    installed_version = parse_version_string(config['VERSION'])

    # find current version or nearest older version (to link to)
    current_version = None
    current_version_idx = len(all_releases) - 1  # default matches old fall-through behavior
    for idx, release in enumerate(all_releases):
        release_version = parse_version_string(release['tag_name'])
        if release_version <= installed_version:
            current_version = release
            current_version_idx = idx
            break

    current_version = current_version or all_releases[-1]

    # recommended version is whatever comes after current_version in the release list
    # (perhaps too conservative to only recommend upgrading one version at a time, but it's safest)
    try:
        recommended_version = all_releases[current_version_idx + 1]
    except IndexError:
        recommended_version = None

    return {'recommended_version': recommended_version, 'current_version': current_version}
|
||||
|
||||
def can_upgrade(config):
    """Return True when a newer ArchiveBox release than the current one is available."""
    versions = config['VERSIONS_AVAILABLE']
    if not versions or not versions['recommended_version']:
        # no release info fetched, or already on the newest known release
        return False

    recommended = parse_version_string(versions['recommended_version']['tag_name'])
    current = parse_version_string(versions['current_version']['tag_name'])
    return recommended > current
|
||||
|
||||
|
||||
############################## Derived Config ##################################
|
||||
@@ -523,55 +322,25 @@ def can_upgrade(config):
|
||||
# These are derived/computed values calculated *after* all user-provided config values are ingested
|
||||
# they appear in `archivebox config` output and are intended to be read-only for the user
|
||||
DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||
**CONSTANTS,
|
||||
**{
|
||||
key: {'default': lambda c: val}
|
||||
for key, val in archivebox.CONSTANTS.items()
|
||||
},
|
||||
|
||||
'TERM_WIDTH': {'default': lambda c: lambda: shutil.get_terminal_size((100, 10)).columns},
|
||||
'USER': {'default': lambda c: get_system_user()},
|
||||
'ANSI': {'default': lambda c: DEFAULT_CLI_COLORS if c['USE_COLOR'] else AttrDict({k: '' for k in DEFAULT_CLI_COLORS.keys()})},
|
||||
|
||||
'PACKAGE_DIR': {'default': lambda c: Path(__file__).resolve().parent},
|
||||
'PACKAGE_DIR': {'default': lambda c: archivebox.PACKAGE_DIR.resolve()},
|
||||
'TEMPLATES_DIR': {'default': lambda c: c['PACKAGE_DIR'] / TEMPLATES_DIR_NAME},
|
||||
'CUSTOM_TEMPLATES_DIR': {'default': lambda c: c['CUSTOM_TEMPLATES_DIR'] and Path(c['CUSTOM_TEMPLATES_DIR'])},
|
||||
|
||||
'OUTPUT_DIR': {'default': lambda c: Path(c['OUTPUT_DIR']).resolve() if c['OUTPUT_DIR'] else Path(os.curdir).resolve()},
|
||||
'ARCHIVE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / ARCHIVE_DIR_NAME},
|
||||
'SOURCES_DIR': {'default': lambda c: c['OUTPUT_DIR'] / SOURCES_DIR_NAME},
|
||||
'LOGS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LOGS_DIR_NAME},
|
||||
'CACHE_DIR': {'default': lambda c: c['OUTPUT_DIR'] / CACHE_DIR_NAME},
|
||||
'LIB_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME},
|
||||
'BIN_DIR': {'default': lambda c: c['OUTPUT_DIR'] / LIB_DIR_NAME / 'bin'},
|
||||
'PERSONAS_DIR': {'default': lambda c: c['OUTPUT_DIR'] / PERSONAS_DIR_NAME},
|
||||
'CONFIG_FILE': {'default': lambda c: Path(c['CONFIG_FILE']).resolve() if c['CONFIG_FILE'] else c['OUTPUT_DIR'] / CONFIG_FILENAME},
|
||||
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and Path(c['COOKIES_FILE']).resolve()},
|
||||
|
||||
'URL_DENYLIST_PTN': {'default': lambda c: c['URL_DENYLIST'] and re.compile(c['URL_DENYLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
|
||||
'URL_ALLOWLIST_PTN': {'default': lambda c: c['URL_ALLOWLIST'] and re.compile(c['URL_ALLOWLIST'] or '', ALLOWDENYLIST_REGEX_FLAGS)},
|
||||
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')}, # exec is always needed to list directories
|
||||
|
||||
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
|
||||
'NODE_BIN_PATH': {'default': lambda c: str((Path(c["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))},
|
||||
|
||||
'VERSION': {'default': lambda c: get_version(c).split('+', 1)[0]}, # remove +editable from user-displayed version string
|
||||
'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)}, # short git commit hash of codebase HEAD commit
|
||||
'BUILD_TIME': {'default': lambda c: get_build_time(c)}, # docker build completed time or python src last modified time
|
||||
|
||||
'VERSIONS_AVAILABLE': {'default': lambda c: False}, # get_versions_available_on_github(c)},
|
||||
'CAN_UPGRADE': {'default': lambda c: False}, # can_upgrade(c)},
|
||||
|
||||
'PYTHON_BINARY': {'default': lambda c: sys.executable},
|
||||
'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
|
||||
|
||||
'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
|
||||
'DJANGO_VERSION': {'default': lambda c: '{}.{}.{}'.format(*django.VERSION[:3])},
|
||||
|
||||
'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
|
||||
'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
|
||||
#'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'}, # set at runtime below, interesting if changed later but unused for now because its always expected to be wal
|
||||
#'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']}, # set at runtime below
|
||||
|
||||
'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
|
||||
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
|
||||
'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
|
||||
# 'CURL_USER_AGENT': {'default': lambda c: c['CURL_USER_AGENT'].format(**c)},
|
||||
'CURL_ARGS': {'default': lambda c: c['CURL_ARGS'] or []},
|
||||
'CURL_EXTRA_ARGS': {'default': lambda c: c['CURL_EXTRA_ARGS'] or []},
|
||||
'SAVE_FAVICON': {'default': lambda c: c['USE_CURL'] and c['SAVE_FAVICON']},
|
||||
@@ -580,23 +349,14 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||
'USE_WGET': {'default': lambda c: c['USE_WGET'] and (c['SAVE_WGET'] or c['SAVE_WARC'])},
|
||||
'WGET_VERSION': {'default': lambda c: bin_version(c['WGET_BINARY']) if c['USE_WGET'] else None},
|
||||
'WGET_AUTO_COMPRESSION': {'default': lambda c: wget_supports_compression(c) if c['USE_WGET'] else False},
|
||||
'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
|
||||
# 'WGET_USER_AGENT': {'default': lambda c: c['WGET_USER_AGENT'].format(**c)},
|
||||
'SAVE_WGET': {'default': lambda c: c['USE_WGET'] and c['SAVE_WGET']},
|
||||
'SAVE_WARC': {'default': lambda c: c['USE_WGET'] and c['SAVE_WARC']},
|
||||
'WGET_ARGS': {'default': lambda c: c['WGET_ARGS'] or []},
|
||||
'WGET_EXTRA_ARGS': {'default': lambda c: c['WGET_EXTRA_ARGS'] or []},
|
||||
|
||||
# 'RIPGREP_VERSION': {'default': lambda c: bin_version(c['RIPGREP_BINARY']) if c['USE_RIPGREP'] else None},
|
||||
|
||||
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
|
||||
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
|
||||
'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
|
||||
'SINGLEFILE_EXTRA_ARGS': {'default': lambda c: c['SINGLEFILE_EXTRA_ARGS'] or []},
|
||||
|
||||
'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
|
||||
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
|
||||
|
||||
'USE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['SAVE_MERCURY']},
|
||||
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
|
||||
'MERCURY_VERSION': {'default': lambda c: '1.0.0' if shutil.which(str(bin_path(c['MERCURY_BINARY']))) else None}, # mercury doesnt expose version info until this is merged https://github.com/postlight/parser/pull/750
|
||||
'MERCURY_ARGS': {'default': lambda c: c['MERCURY_ARGS'] or []},
|
||||
'MERCURY_EXTRA_ARGS': {'default': lambda c: c['MERCURY_EXTRA_ARGS'] or []},
|
||||
@@ -605,21 +365,12 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
|
||||
'GIT_VERSION': {'default': lambda c: bin_version(c['GIT_BINARY']) if c['USE_GIT'] else None},
|
||||
'SAVE_GIT': {'default': lambda c: c['USE_GIT'] and c['SAVE_GIT']},
|
||||
|
||||
'USE_YOUTUBEDL': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
|
||||
'YOUTUBEDL_VERSION': {'default': lambda c: bin_version(c['YOUTUBEDL_BINARY']) if c['USE_YOUTUBEDL'] else None},
|
||||
'SAVE_MEDIA': {'default': lambda c: c['USE_YOUTUBEDL'] and c['SAVE_MEDIA']},
|
||||
'YOUTUBEDL_ARGS': {'default': lambda c: c['YOUTUBEDL_ARGS'] or []},
|
||||
'YOUTUBEDL_EXTRA_ARGS': {'default': lambda c: c['YOUTUBEDL_EXTRA_ARGS'] or []},
|
||||
|
||||
'SAVE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['USE_NODE']},
|
||||
'SAVE_MERCURY': {'default': lambda c: c['USE_MERCURY'] and c['USE_NODE']},
|
||||
|
||||
'USE_NODE': {'default': lambda c: c['USE_NODE'] and (c['SAVE_READABILITY'] or c['SAVE_SINGLEFILE'] or c['SAVE_MERCURY'])},
|
||||
'USE_NODE': {'default': lambda c: True},
|
||||
'NODE_VERSION': {'default': lambda c: bin_version(c['NODE_BINARY']) if c['USE_NODE'] else None},
|
||||
|
||||
'DEPENDENCIES': {'default': lambda c: get_dependency_info(c)},
|
||||
'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
|
||||
'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
|
||||
# 'CODE_LOCATIONS': {'default': lambda c: get_code_locations(c)},
|
||||
# 'DATA_LOCATIONS': {'default': lambda c: get_data_locations(c)},
|
||||
|
||||
'SAVE_ALLOWLIST_PTN': {'default': lambda c: c['SAVE_ALLOWLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_ALLOWLIST'].items()}},
|
||||
'SAVE_DENYLIST_PTN': {'default': lambda c: c['SAVE_DENYLIST'] and {re.compile(k, ALLOWDENYLIST_REGEX_FLAGS): v for k, v in c['SAVE_DENYLIST'].items()}},
|
||||
@@ -696,12 +447,10 @@ def load_config_val(key: str,
|
||||
raise Exception('Config values can only be str, bool, int, or json')
|
||||
|
||||
|
||||
def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
|
||||
def load_config_file(out_dir: str | None=archivebox.DATA_DIR) -> Optional[ConfigDict]:
|
||||
"""load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
|
||||
|
||||
out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
|
||||
assert out_dir and out_dir.is_dir()
|
||||
config_path = Path(out_dir) / CONFIG_FILENAME
|
||||
config_path = archivebox.CONSTANTS.CONFIG_FILE
|
||||
if config_path.exists():
|
||||
config_file = ConfigParser()
|
||||
config_file.optionxform = str
|
||||
@@ -718,7 +467,7 @@ def load_config_file(out_dir: str | None=None) -> Optional[ConfigDict]:
|
||||
return None
|
||||
|
||||
|
||||
def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> ConfigDict:
|
||||
def write_config_file(config: Dict[str, str], out_dir: str | None=archivebox.DATA_DIR) -> ConfigDict:
|
||||
"""load the ini-formatted config file from OUTPUT_DIR/Archivebox.conf"""
|
||||
|
||||
from .system import atomic_write
|
||||
@@ -737,8 +486,7 @@ def write_config_file(config: Dict[str, str], out_dir: str | None=None) -> Confi
|
||||
|
||||
""")
|
||||
|
||||
out_dir = out_dir or Path(os.getenv('OUTPUT_DIR', '.')).resolve()
|
||||
config_path = Path(out_dir) / CONFIG_FILENAME
|
||||
config_path = archivebox.CONSTANTS.CONFIG_FILE
|
||||
|
||||
if not config_path.exists():
|
||||
atomic_write(config_path, CONFIG_HEADER)
|
||||
@@ -833,7 +581,7 @@ def load_config(defaults: ConfigDefaultDict,
|
||||
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration')
|
||||
stderr()
|
||||
# raise
|
||||
raise SystemExit(2)
|
||||
# raise SystemExit(2)
|
||||
|
||||
return AttrDict(extended_config)
|
||||
|
||||
@@ -984,98 +732,6 @@ def wget_supports_compression(config):
|
||||
except (FileNotFoundError, OSError):
|
||||
return False
|
||||
|
||||
def get_code_locations(config: ConfigDict) -> SimpleConfigValueDict:
    """Return the locations of ArchiveBox's own code/assets, with validity checks."""
    package_dir = config['PACKAGE_DIR']
    templates_dir = config['TEMPLATES_DIR']
    lib_dir = config['LIB_DIR']

    return {
        'PACKAGE_DIR': {
            'path': package_dir.resolve(),
            'enabled': True,
            # the installed package must contain the CLI entrypoint
            'is_valid': (package_dir / '__main__.py').exists(),
        },
        'TEMPLATES_DIR': {
            'path': templates_dir.resolve(),
            'enabled': True,
            'is_valid': (templates_dir / 'static').exists(),
        },
        'LIB_DIR': {
            'path': lib_dir.resolve(),
            'enabled': True,
            'is_valid': lib_dir.is_dir(),
        },
        # NODE_MODULES_DIR intentionally omitted for now (was a stubbed-out TODO)
    }
|
||||
|
||||
def get_data_locations(config: ConfigDict) -> ConfigValue:
    """Return the on-disk locations of this collection's data, with per-path
    enabled/validity flags (and mount-point info for the paths that matter
    for durability: OUTPUT_DIR, the SQL index, and ARCHIVE_DIR)."""
    return {
        # OLD: migrating to personas
        # 'CHROME_USER_DATA_DIR': {
        #     'path': os.path.abspath(config['CHROME_USER_DATA_DIR']),
        #     'enabled': config['USE_CHROME'] and config['CHROME_USER_DATA_DIR'],
        #     'is_valid': False if config['CHROME_USER_DATA_DIR'] is None else (Path(config['CHROME_USER_DATA_DIR']) / 'Default').exists(),
        # },
        # 'COOKIES_FILE': {
        #     'path': os.path.abspath(config['COOKIES_FILE']),
        #     'enabled': config['USE_WGET'] and config['COOKIES_FILE'],
        #     'is_valid': False if config['COOKIES_FILE'] is None else Path(config['COOKIES_FILE']).exists(),
        # },
        "OUTPUT_DIR": {
            "path": config["OUTPUT_DIR"].resolve(),
            "enabled": True,
            # a data dir only counts as valid once the sqlite index exists in it
            "is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(),
            "is_mount": os.path.ismount(config["OUTPUT_DIR"].resolve()),
        },
        "CONFIG_FILE": {
            "path": config["CONFIG_FILE"].resolve(),
            "enabled": True,
            "is_valid": config["CONFIG_FILE"].exists(),
        },
        "SQL_INDEX": {
            "path": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve(),
            "enabled": True,
            "is_valid": (config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).exists(),
            "is_mount": os.path.ismount((config["OUTPUT_DIR"] / SQL_INDEX_FILENAME).resolve()),
        },
        "ARCHIVE_DIR": {
            "path": config["ARCHIVE_DIR"].resolve(),
            "enabled": True,
            "is_valid": config["ARCHIVE_DIR"].exists(),
            "is_mount": os.path.ismount(config["ARCHIVE_DIR"].resolve()),
        },
        "SOURCES_DIR": {
            "path": config["SOURCES_DIR"].resolve(),
            "enabled": True,
            "is_valid": config["SOURCES_DIR"].exists(),
        },
        "PERSONAS_DIR": {
            "path": config["PERSONAS_DIR"].resolve(),
            "enabled": True,
            "is_valid": config["PERSONAS_DIR"].exists(),
        },
        "LOGS_DIR": {
            "path": config["LOGS_DIR"].resolve(),
            "enabled": True,
            "is_valid": config["LOGS_DIR"].exists(),
        },
        "CACHE_DIR": {
            "path": config["CACHE_DIR"].resolve(),
            "enabled": True,
            "is_valid": config["CACHE_DIR"].exists(),
        },
        "CUSTOM_TEMPLATES_DIR": {
            # CUSTOM_TEMPLATES_DIR is optional and may be None/empty, hence the guards
            "path": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).resolve(),
            "enabled": bool(config["CUSTOM_TEMPLATES_DIR"]),
            "is_valid": config["CUSTOM_TEMPLATES_DIR"] and Path(config["CUSTOM_TEMPLATES_DIR"]).exists(),
        },
        # managed by bin/docker_entrypoint.sh and python-crontab:
        # 'CRONTABS_DIR': {
        #     'path': config['CRONTABS_DIR'].resolve(),
        #     'enabled': True,
        #     'is_valid': config['CRONTABS_DIR'].exists(),
        # },
    }
|
||||
|
||||
def get_dependency_info(config: ConfigDict) -> ConfigValue:
|
||||
return {
|
||||
@@ -1129,20 +785,6 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
|
||||
'enabled': config['USE_NODE'],
|
||||
'is_valid': bool(config['NODE_VERSION']),
|
||||
},
|
||||
'SINGLEFILE_BINARY': {
|
||||
'path': bin_path(config['SINGLEFILE_BINARY']),
|
||||
'version': config['SINGLEFILE_VERSION'],
|
||||
'hash': bin_hash(config['SINGLEFILE_BINARY']),
|
||||
'enabled': config['USE_SINGLEFILE'],
|
||||
'is_valid': bool(config['SINGLEFILE_VERSION']),
|
||||
},
|
||||
'READABILITY_BINARY': {
|
||||
'path': bin_path(config['READABILITY_BINARY']),
|
||||
'version': config['READABILITY_VERSION'],
|
||||
'hash': bin_hash(config['READABILITY_BINARY']),
|
||||
'enabled': config['USE_READABILITY'],
|
||||
'is_valid': bool(config['READABILITY_VERSION']),
|
||||
},
|
||||
'MERCURY_BINARY': {
|
||||
'path': bin_path(config['MERCURY_BINARY']),
|
||||
'version': config['MERCURY_VERSION'],
|
||||
@@ -1157,13 +799,27 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
|
||||
'enabled': config['USE_GIT'],
|
||||
'is_valid': bool(config['GIT_VERSION']),
|
||||
},
|
||||
'YOUTUBEDL_BINARY': {
|
||||
'path': bin_path(config['YOUTUBEDL_BINARY']),
|
||||
'version': config['YOUTUBEDL_VERSION'],
|
||||
'hash': bin_hash(config['YOUTUBEDL_BINARY']),
|
||||
'enabled': config['USE_YOUTUBEDL'],
|
||||
'is_valid': bool(config['YOUTUBEDL_VERSION']),
|
||||
},
|
||||
# 'SINGLEFILE_BINARY': {
|
||||
# 'path': bin_path(config['SINGLEFILE_BINARY']),
|
||||
# 'version': config['SINGLEFILE_VERSION'],
|
||||
# 'hash': bin_hash(config['SINGLEFILE_BINARY']),
|
||||
# 'enabled': config['USE_SINGLEFILE'],
|
||||
# 'is_valid': bool(config['SINGLEFILE_VERSION']),
|
||||
# },
|
||||
# 'READABILITY_BINARY': {
|
||||
# 'path': bin_path(config['READABILITY_BINARY']),
|
||||
# 'version': config['READABILITY_VERSION'],
|
||||
# 'hash': bin_hash(config['READABILITY_BINARY']),
|
||||
# 'enabled': config['USE_READABILITY'],
|
||||
# 'is_valid': bool(config['READABILITY_VERSION']),
|
||||
# },
|
||||
# 'YOUTUBEDL_BINARY': {
|
||||
# 'path': bin_path(config['YOUTUBEDL_BINARY']),
|
||||
# 'version': config['YOUTUBEDL_VERSION'],
|
||||
# 'hash': bin_hash(config['YOUTUBEDL_BINARY']),
|
||||
# 'enabled': config['USE_YOUTUBEDL'],
|
||||
# 'is_valid': bool(config['YOUTUBEDL_VERSION']),
|
||||
# },
|
||||
# 'CHROME_BINARY': {
|
||||
# 'path': bin_path(config['CHROME_BINARY']),
|
||||
# 'version': config['CHROME_VERSION'],
|
||||
@@ -1227,10 +883,6 @@ assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC' # n
|
||||
os.environ["TZ"] = TIMEZONE # noqa: F821
|
||||
os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
|
||||
|
||||
# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
|
||||
sys.path.append(CONFIG.NODE_BIN_PATH)
|
||||
|
||||
|
||||
########################### Config Validity Checkers ###########################
|
||||
|
||||
if not CONFIG.USE_COLOR:
|
||||
@@ -1256,6 +908,7 @@ def bump_startup_progress_bar():
|
||||
|
||||
def setup_django_minimal():
|
||||
sys.path.append(str(archivebox.PACKAGE_DIR))
|
||||
os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
|
||||
django.setup()
|
||||
|
||||
@@ -1267,29 +920,18 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
|
||||
with Progress(transient=True, expand=True, console=CONSOLE) as INITIAL_STARTUP_PROGRESS:
|
||||
INITIAL_STARTUP_PROGRESS_TASK = INITIAL_STARTUP_PROGRESS.add_task("[green]Loading modules...", total=25)
|
||||
|
||||
output_dir = out_dir or Path(config['OUTPUT_DIR'])
|
||||
output_dir = out_dir or archivebox.DATA_DIR
|
||||
|
||||
assert isinstance(output_dir, Path) and isinstance(config['PACKAGE_DIR'], Path)
|
||||
assert isinstance(output_dir, Path) and isinstance(archivebox.PACKAGE_DIR, Path)
|
||||
|
||||
bump_startup_progress_bar()
|
||||
try:
|
||||
from django.core.management import call_command
|
||||
|
||||
sys.path.append(str(config['PACKAGE_DIR']))
|
||||
os.environ.setdefault('OUTPUT_DIR', str(output_dir))
|
||||
assert (config['PACKAGE_DIR'] / 'core' / 'settings.py').exists(), 'settings.py was not found at archivebox/core/settings.py'
|
||||
sys.path.append(str(archivebox.PACKAGE_DIR))
|
||||
os.environ.setdefault('OUTPUT_DIR', str(archivebox.DATA_DIR))
|
||||
os.environ.setdefault("ARCHIVEBOX_DATABASE_NAME", ":memory:")
|
||||
os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'core.settings')
|
||||
|
||||
# Check to make sure JSON extension is available in our Sqlite3 instance
|
||||
try:
|
||||
cursor = sqlite3.connect(':memory:').cursor()
|
||||
cursor.execute('SELECT JSON(\'{"a": "b"}\')')
|
||||
except sqlite3.OperationalError as exc:
|
||||
stderr(f'[X] Your SQLite3 version is missing the required JSON1 extension: {exc}', color='red')
|
||||
hint([
|
||||
'Upgrade your Python version or install the extension manually:',
|
||||
'https://code.djangoproject.com/wiki/JSON1Extension'
|
||||
])
|
||||
|
||||
bump_startup_progress_bar()
|
||||
|
||||
@@ -1310,28 +952,16 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
|
||||
bump_startup_progress_bar()
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from plugins_sys.config.apps import SHELL_CONFIG
|
||||
|
||||
# log startup message to the error log
|
||||
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
|
||||
command = ' '.join(sys.argv)
|
||||
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
|
||||
f.write(f"\n> {command}; TS={ts} VERSION={config['VERSION']} IN_DOCKER={config['IN_DOCKER']} IS_TTY={config['IS_TTY']}\n")
|
||||
f.write(f"\n> {command}; TS={ts} VERSION={archivebox.VERSION} IN_DOCKER={SHELL_CONFIG.IN_DOCKER} IS_TTY={SHELL_CONFIG.IS_TTY}\n")
|
||||
|
||||
if check_db:
|
||||
# Enable WAL mode in sqlite3
|
||||
from django.db import connection
|
||||
with connection.cursor() as cursor:
|
||||
|
||||
# Set Journal mode to WAL to allow for multiple writers
|
||||
current_mode = cursor.execute("PRAGMA journal_mode")
|
||||
if current_mode != 'wal':
|
||||
cursor.execute("PRAGMA journal_mode=wal;")
|
||||
|
||||
# Set max blocking delay for concurrent writes and write sync mode
|
||||
# https://litestream.io/tips/#busy-timeout
|
||||
cursor.execute("PRAGMA busy_timeout = 5000;")
|
||||
cursor.execute("PRAGMA synchronous = NORMAL;")
|
||||
|
||||
# Create cache table in DB if needed
|
||||
try:
|
||||
from django.core.cache import cache
|
||||
@@ -1348,9 +978,9 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
|
||||
for conn in connections.all():
|
||||
conn.close_if_unusable_or_obsolete()
|
||||
|
||||
sql_index_path = Path(output_dir) / SQL_INDEX_FILENAME
|
||||
sql_index_path = archivebox.CONSTANTS.DATABASE_FILE
|
||||
assert sql_index_path.exists(), (
|
||||
f'No database file {SQL_INDEX_FILENAME} found in: {config["OUTPUT_DIR"]} (Are you in an ArchiveBox collection directory?)')
|
||||
f'No database file {sql_index_path} found in: {archivebox.DATA_DIR} (Are you in an ArchiveBox collection directory?)')
|
||||
|
||||
bump_startup_progress_bar()
|
||||
|
||||
@@ -1363,7 +993,7 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
|
||||
|
||||
logfire.configure()
|
||||
logfire.instrument_django(is_sql_commentor_enabled=True)
|
||||
logfire.info(f'Started ArchiveBox v{CONFIG.VERSION}', argv=sys.argv)
|
||||
logfire.info(f'Started ArchiveBox v{archivebox.VERSION}', argv=sys.argv)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
raise SystemExit(2)
|
||||
|
||||
Reference in New Issue
Block a user