mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-03 06:17:53 +10:00
Merge branch 'master' into node_config
This commit is contained in:
1
archivebox/LICENSE
Symbolic link
1
archivebox/LICENSE
Symbolic link
@@ -0,0 +1 @@
|
||||
../LICENSE
|
||||
1
archivebox/README.md
Symbolic link
1
archivebox/README.md
Symbolic link
@@ -0,0 +1 @@
|
||||
../README.md
|
||||
@@ -1 +0,0 @@
|
||||
0.4.17
|
||||
@@ -104,11 +104,11 @@ def main(args: Optional[List[str]]=NotProvided, stdin: Optional[IO]=NotProvided,
|
||||
)
|
||||
command = parser.parse_args(args or ())
|
||||
|
||||
if command.help or command.subcommand is None:
|
||||
command.subcommand = 'help'
|
||||
elif command.version:
|
||||
if command.version:
|
||||
command.subcommand = 'version'
|
||||
|
||||
elif command.help or command.subcommand is None:
|
||||
command.subcommand = 'help'
|
||||
|
||||
if command.subcommand not in ('help', 'version', 'status'):
|
||||
from ..logging_util import log_cli_command
|
||||
|
||||
|
||||
@@ -4,10 +4,11 @@ import os
|
||||
import io
|
||||
import re
|
||||
import sys
|
||||
import django
|
||||
import json
|
||||
import getpass
|
||||
import shutil
|
||||
import platform
|
||||
import django
|
||||
|
||||
from hashlib import md5
|
||||
from pathlib import Path
|
||||
@@ -58,7 +59,7 @@ CONFIG_DEFAULTS: Dict[str, ConfigDefaultDict] = {
|
||||
'MEDIA_TIMEOUT': {'type': int, 'default': 3600},
|
||||
'OUTPUT_PERMISSIONS': {'type': str, 'default': '755'},
|
||||
'RESTRICT_FILE_NAMES': {'type': str, 'default': 'windows'},
|
||||
'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2)(\?.*)?$'}, # to avoid downloading code assets as their own pages
|
||||
'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages
|
||||
},
|
||||
|
||||
'SERVER_CONFIG': {
|
||||
@@ -186,7 +187,6 @@ STATICFILE_EXTENSIONS = {
|
||||
# html, htm, shtml, xhtml, xml, aspx, php, cgi
|
||||
}
|
||||
|
||||
VERSION_FILENAME = 'VERSION'
|
||||
PYTHON_DIR_NAME = 'archivebox'
|
||||
TEMPLATES_DIR_NAME = 'themes'
|
||||
|
||||
@@ -232,10 +232,10 @@ DERIVED_CONFIG_DEFAULTS: ConfigDefaultDict = {
|
||||
'CONFIG_FILE': {'default': lambda c: os.path.abspath(os.path.expanduser(c['CONFIG_FILE'])) if c['CONFIG_FILE'] else os.path.join(c['OUTPUT_DIR'], CONFIG_FILENAME)},
|
||||
'COOKIES_FILE': {'default': lambda c: c['COOKIES_FILE'] and os.path.abspath(os.path.expanduser(c['COOKIES_FILE']))},
|
||||
'CHROME_USER_DATA_DIR': {'default': lambda c: find_chrome_data_dir() if c['CHROME_USER_DATA_DIR'] is None else (os.path.abspath(os.path.expanduser(c['CHROME_USER_DATA_DIR'])) or None)},
|
||||
'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'], re.IGNORECASE | re.UNICODE | re.MULTILINE)},
|
||||
'URL_BLACKLIST_PTN': {'default': lambda c: c['URL_BLACKLIST'] and re.compile(c['URL_BLACKLIST'] or '', re.IGNORECASE | re.UNICODE | re.MULTILINE)},
|
||||
|
||||
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0]},
|
||||
'VERSION': {'default': lambda c: open(os.path.join(c['PYTHON_DIR'], VERSION_FILENAME), 'r').read().strip()},
|
||||
'VERSION': {'default': lambda c: json.loads((Path(c['PYTHON_DIR']) / 'package.json').read_text().strip())['version']},
|
||||
'GIT_SHA': {'default': lambda c: c['VERSION'].split('+')[-1] or 'unknown'},
|
||||
|
||||
'PYTHON_BINARY': {'default': lambda c: sys.executable},
|
||||
@@ -510,16 +510,9 @@ def bin_version(binary: Optional[str]) -> Optional[str]:
|
||||
return None
|
||||
|
||||
try:
|
||||
if binary.split('/')[-1] in ('single-file',):
|
||||
# these dependencies dont support the --version flag, but are valid still
|
||||
if run([abspath, "--help"], stdout=PIPE).returncode == 0:
|
||||
return '0.0.0'
|
||||
else:
|
||||
return None
|
||||
else:
|
||||
version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
|
||||
# take first 3 columns of first line of version info
|
||||
return ' '.join(version_str.split('\n')[0].strip().split()[:3])
|
||||
version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
|
||||
# take first 3 columns of first line of version info
|
||||
return ' '.join(version_str.split('\n')[0].strip().split()[:3])
|
||||
except OSError:
|
||||
pass
|
||||
# stderr(f'[X] Unable to find working version of dependency: {binary}', color='red')
|
||||
@@ -534,6 +527,10 @@ def bin_path(binary: Optional[str]) -> Optional[str]:
|
||||
if binary is None:
|
||||
return None
|
||||
|
||||
node_modules_bin = Path('.') / 'node_modules' / '.bin' / binary
|
||||
if node_modules_bin.exists():
|
||||
return str(node_modules_bin.resolve())
|
||||
|
||||
return shutil.which(os.path.expanduser(binary)) or binary
|
||||
|
||||
def bin_hash(binary: Optional[str]) -> Optional[str]:
|
||||
@@ -784,6 +781,10 @@ globals().update(CONFIG)
|
||||
# Timezone set as UTC
|
||||
os.environ["TZ"] = 'UTC'
|
||||
|
||||
# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
|
||||
NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
|
||||
sys.path.append(NODE_BIN_PATH)
|
||||
|
||||
|
||||
############################## Importable Checkers #############################
|
||||
|
||||
@@ -825,16 +826,6 @@ def check_system_config(config: ConfigDict=CONFIG) -> None:
|
||||
stderr(' CHROME_USER_DATA_DIR="{}"'.format(config['CHROME_USER_DATA_DIR'].split('/Default')[0]))
|
||||
raise SystemExit(2)
|
||||
|
||||
def print_dependency_additional_info(dependency: str) -> None:
|
||||
if dependency == "SINGLEFILE_BINARY":
|
||||
hint(('npm install -g git+https://github.com/gildas-lormeau/SingleFile.git"',
|
||||
'or set SAVE_SINGLEFILE=False to silence this warning',
|
||||
''))
|
||||
if dependency == "READABILITY_BINARY":
|
||||
hint(('npm install -g git+https://github.com/pirate/readability-extractor.git"',
|
||||
'or set SAVE_READABILITY=False to silence this warning',
|
||||
''))
|
||||
|
||||
|
||||
def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
|
||||
invalid_dependencies = [
|
||||
@@ -851,9 +842,10 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
|
||||
info['version'] or 'unable to detect version',
|
||||
)
|
||||
)
|
||||
print_dependency_additional_info(dependency)
|
||||
stderr(' {lightred}Hint:{reset} To get more info on dependencies run:'.format(**ANSI))
|
||||
stderr(' archivebox --version')
|
||||
if dependency in ('SINGLEFILE_BINARY', 'READABILITY_BINARY'):
|
||||
hint(('npm install --prefix . "git+https://github.com/pirate/ArchiveBox.git"',
|
||||
f'or archivebox config --set SAVE_{dependency.rsplit("_", 1)[0]}=False to silence this warning',
|
||||
''), prefix=' ')
|
||||
stderr('')
|
||||
|
||||
if config['TIMEOUT'] < 5:
|
||||
|
||||
@@ -31,15 +31,16 @@ class ConfigDict(BaseConfig, total=False):
|
||||
SHOW_PROGRESS: bool
|
||||
IN_DOCKER: bool
|
||||
|
||||
OUTPUT_DIR: str
|
||||
CONFIG_FILE: str
|
||||
OUTPUT_DIR: Optional[str]
|
||||
CONFIG_FILE: Optional[str]
|
||||
ONLY_NEW: bool
|
||||
TIMEOUT: int
|
||||
MEDIA_TIMEOUT: int
|
||||
OUTPUT_PERMISSIONS: str
|
||||
URL_BLACKLIST: Optional[str]
|
||||
RESTRICT_FILE_NAMES: str
|
||||
URL_BLACKLIST: str
|
||||
|
||||
SECRET_KEY: str
|
||||
SECRET_KEY: Optional[str]
|
||||
BIND_ADDR: str
|
||||
ALLOWED_HOSTS: str
|
||||
DEBUG: bool
|
||||
@@ -52,10 +53,11 @@ class ConfigDict(BaseConfig, total=False):
|
||||
SAVE_FAVICON: bool
|
||||
SAVE_WGET: bool
|
||||
SAVE_WGET_REQUISITES: bool
|
||||
SAVE_SINGLEFILE: bool
|
||||
SAVE_READABILITY: bool
|
||||
SAVE_PDF: bool
|
||||
SAVE_SCREENSHOT: bool
|
||||
SAVE_DOM: bool
|
||||
SAVE_SINGLEFILE: bool
|
||||
SAVE_WARC: bool
|
||||
SAVE_GIT: bool
|
||||
SAVE_MEDIA: bool
|
||||
@@ -75,53 +77,18 @@ class ConfigDict(BaseConfig, total=False):
|
||||
|
||||
USE_CURL: bool
|
||||
USE_WGET: bool
|
||||
USE_SINGLEFILE: bool
|
||||
USE_READABILITY: bool
|
||||
USE_GIT: bool
|
||||
USE_CHROME: bool
|
||||
USE_YOUTUBEDL: bool
|
||||
USE_SINGLEFILE: bool
|
||||
|
||||
CURL_BINARY: Optional[str]
|
||||
GIT_BINARY: Optional[str]
|
||||
WGET_BINARY: Optional[str]
|
||||
YOUTUBEDL_BINARY: Optional[str]
|
||||
CURL_BINARY: str
|
||||
GIT_BINARY: str
|
||||
WGET_BINARY: str
|
||||
SINGLEFILE_BINARY: str
|
||||
READABILITY_BINARY: str
|
||||
YOUTUBEDL_BINARY: str
|
||||
CHROME_BINARY: Optional[str]
|
||||
SINGLEFILE_BINARY: Optional[str]
|
||||
|
||||
TERM_WIDTH: Callable[[], int]
|
||||
USER: str
|
||||
ANSI: Dict[str, str]
|
||||
REPO_DIR: str
|
||||
PYTHON_DIR: str
|
||||
TEMPLATES_DIR: str
|
||||
ARCHIVE_DIR: str
|
||||
SOURCES_DIR: str
|
||||
LOGS_DIR: str
|
||||
|
||||
URL_BLACKLIST_PTN: Optional[Pattern]
|
||||
WGET_AUTO_COMPRESSION: bool
|
||||
|
||||
ARCHIVEBOX_BINARY: str
|
||||
VERSION: str
|
||||
GIT_SHA: str
|
||||
|
||||
PYTHON_BINARY: str
|
||||
PYTHON_ENCODING: str
|
||||
PYTHON_VERSION: str
|
||||
|
||||
DJANGO_BINARY: str
|
||||
DJANGO_VERSION: str
|
||||
|
||||
CURL_VERSION: str
|
||||
WGET_VERSION: str
|
||||
YOUTUBEDL_VERSION: str
|
||||
GIT_VERSION: str
|
||||
CHROME_VERSION: str
|
||||
|
||||
DEPENDENCIES: Dict[str, SimpleConfigValueDict]
|
||||
CODE_LOCATIONS: Dict[str, SimpleConfigValueDict]
|
||||
CONFIG_LOCATIONS: Dict[str, SimpleConfigValueDict]
|
||||
DATA_LOCATIONS: Dict[str, SimpleConfigValueDict]
|
||||
CHROME_OPTIONS: Dict[str, SimpleConfigValue]
|
||||
|
||||
|
||||
ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]
|
||||
|
||||
@@ -17,7 +17,7 @@ from ..util import (
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_READABILITY,
|
||||
READABILITY_BINARY,
|
||||
DEPENDENCIES,
|
||||
READABILITY_VERSION,
|
||||
)
|
||||
from ..logging_util import TimedProgress
|
||||
@@ -73,7 +73,7 @@ def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEO
|
||||
temp_doc.close()
|
||||
|
||||
cmd = [
|
||||
READABILITY_BINARY,
|
||||
DEPENDENCIES['READABILITY_BINARY']['path'],
|
||||
temp_doc.name
|
||||
]
|
||||
|
||||
|
||||
@@ -15,7 +15,7 @@ from ..util import (
|
||||
from ..config import (
|
||||
TIMEOUT,
|
||||
SAVE_SINGLEFILE,
|
||||
SINGLEFILE_BINARY,
|
||||
DEPENDENCIES,
|
||||
SINGLEFILE_VERSION,
|
||||
CHROME_BINARY,
|
||||
)
|
||||
@@ -43,7 +43,7 @@ def save_singlefile(link: Link, out_dir: Optional[str]=None, timeout: int=TIMEOU
|
||||
|
||||
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
|
||||
cmd = [
|
||||
SINGLEFILE_BINARY,
|
||||
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
|
||||
'--browser-executable-path={}'.format(CHROME_BINARY),
|
||||
'--browser-args="{}"'.format(json.dumps(browser_args[1:])),
|
||||
link.url,
|
||||
|
||||
@@ -127,7 +127,7 @@ class Link:
|
||||
|
||||
|
||||
def __str__(self) -> str:
|
||||
return f'[{self.timestamp}] {self.base_url} "{self.title}"'
|
||||
return f'[{self.timestamp}] {self.url} "{self.title}"'
|
||||
|
||||
def __post_init__(self):
|
||||
self.typecheck()
|
||||
|
||||
@@ -99,15 +99,18 @@ class TimedProgress:
|
||||
|
||||
if self.SHOW_PROGRESS:
|
||||
# terminate if we havent already terminated
|
||||
self.p.terminate()
|
||||
self.p.join()
|
||||
self.p.close()
|
||||
|
||||
# clear whole terminal line
|
||||
try:
|
||||
sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))
|
||||
except (IOError, BrokenPipeError):
|
||||
# ignore when the parent proc has stopped listening to our stdout
|
||||
self.p.terminate()
|
||||
self.p.join()
|
||||
self.p.close()
|
||||
|
||||
# clear whole terminal line
|
||||
try:
|
||||
sys.stdout.write('\r{}{}\r'.format((' ' * TERM_WIDTH()), ANSI['reset']))
|
||||
except (IOError, BrokenPipeError):
|
||||
# ignore when the parent proc has stopped listening to our stdout
|
||||
pass
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
|
||||
@@ -466,7 +469,10 @@ def printable_folders(folders: Dict[str, Optional["Link"]],
|
||||
from .index.csv import links_to_csv
|
||||
return links_to_csv(folders.values(), cols=csv.split(','), header=True)
|
||||
|
||||
return '\n'.join(f'{folder} {link}' for folder, link in folders.items())
|
||||
return '\n'.join(
|
||||
f'{folder} {link and link.url} "{link and link.title}"'
|
||||
for folder, link in folders.items()
|
||||
)
|
||||
|
||||
|
||||
|
||||
|
||||
@@ -114,6 +114,8 @@ ALLOWED_IN_OUTPUT_DIR = {
|
||||
'venv',
|
||||
'virtualenv',
|
||||
'.virtualenv',
|
||||
'node_modules',
|
||||
'package-lock.json',
|
||||
ARCHIVE_DIR_NAME,
|
||||
SOURCES_DIR_NAME,
|
||||
LOGS_DIR_NAME,
|
||||
|
||||
1
archivebox/package.json
Symbolic link
1
archivebox/package.json
Symbolic link
@@ -0,0 +1 @@
|
||||
../package.json
|
||||
Reference in New Issue
Block a user