Merge branch 'dev' into feat/reverse-proxy-auth
@@ -30,11 +30,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
 )
 parser.add_argument(
-    '--update-all', #'-n',
+    '--update', #'-u',
     action='store_true',
     default=not ONLY_NEW,  # when ONLY_NEW=True we skip updating old links
     help="Also retry previously skipped/failed links when adding new links",
 )
+parser.add_argument(
+    '--update-all', #'-n',
+    action='store_true',
+    default=False,
+    help="Also update ALL links in index when finished adding new links",
+)
 parser.add_argument(
     '--index-only', #'-o',
     action='store_true',
@@ -104,6 +110,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         urls=stdin_urls or urls,
         depth=command.depth,
         tag=command.tag,
+        update=command.update,
         update_all=command.update_all,
         index_only=command.index_only,
         overwrite=command.overwrite,

@@ -51,6 +51,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     action='store_true',
     help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots',
 )
+parser.add_argument(
+    '--update',
+    action='store_true',
+    help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults',
+)
 group.add_argument(
     '--clear', # '-c'
     action='store_true',
@@ -94,6 +99,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         every=command.every,
         depth=command.depth,
         overwrite=command.overwrite,
+        update=command.update,
         import_path=command.import_path,
         out_dir=pwd or OUTPUT_DIR,
     )
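The CLI hunks above split one flag into two: --update keeps the old not ONLY_NEW default and re-pulls only the just-added set, while --update-all now defaults to off and touches the whole index. A minimal standalone argparse sketch of that split (flag names and defaults copied from the diff, everything else omitted):

    import argparse

    ONLY_NEW = True  # mirrors the ONLY_NEW config default
    parser = argparse.ArgumentParser()
    parser.add_argument('--update', action='store_true', default=not ONLY_NEW,
                        help='Also retry previously skipped/failed links when adding new links')
    parser.add_argument('--update-all', action='store_true', default=False,
                        help='Also update ALL links in index when finished adding new links')
    print(parser.parse_args(['--update']))  # Namespace(update=True, update_all=False)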
@@ -26,11 +26,12 @@ import io
 import re
 import sys
 import json
+import inspect
 import getpass
 import platform
 import shutil
-import sqlite3
 import django
+from sqlite3 import dbapi2 as sqlite3

 from hashlib import md5
 from pathlib import Path
@@ -48,6 +49,9 @@ from .config_stubs import (
     ConfigDefaultDict,
 )

+### Pre-Fetch Minimal System Config
+
+SYSTEM_USER = getpass.getuser() or os.getlogin()

 try:
@@ -65,6 +69,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
     'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')},  # progress bars are buggy on mac, disable for now
     'IN_DOCKER': {'type': bool, 'default': False},
+    'PUID': {'type': int, 'default': os.getuid()},
+    'PGID': {'type': int, 'default': os.getgid()},
     # TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
 },

@@ -79,6 +85,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'},  # to avoid downloading code assets as their own pages
     'URL_WHITELIST': {'type': str, 'default': None},
     'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
+    'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
 },

 'SERVER_CONFIG': {
@@ -93,9 +100,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
     'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
-    'TIME_ZONE': {'type': str, 'default': 'UTC'},
+    'TIMEZONE': {'type': str, 'default': 'UTC'},
     'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
     'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
     'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
+    'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
 },

 'ARCHIVE_METHOD_TOGGLES': {
@@ -122,9 +131,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
     'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},

-    'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
-    'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
-    'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
+    'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
+    'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
+    'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},

     'COOKIES_FILE': {'type': str, 'default': None},
     'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
@@ -139,10 +148,18 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     '--no-call-home',
     '--write-sub',
-    '--all-subs',
-    '--write-auto-sub',
+    # There are too many of these and youtube
+    # throttles you with HTTP error 429
+    #'--write-auto-subs',
     '--convert-subs=srt',
     '--yes-playlist',
     '--continue',
+    # This flag doesn't exist in youtube-dl
+    # only in yt-dlp
+    '--no-abort-on-error',
+    # --ignore-errors must come AFTER
+    # --no-abort-on-error
+    # https://github.com/yt-dlp/yt-dlp/issues/4914
     '--ignore-errors',
     '--geo-bypass',
     '--add-metadata',
@@ -164,6 +181,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         '--compressed'
     ]},
     'GIT_ARGS': {'type': list, 'default': ['--recursive']},
+    'SINGLEFILE_ARGS': {'type': list, 'default' : None}
 },

 'SEARCH_BACKEND_CONFIG' : {
@@ -197,7 +215,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
     'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
     'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
-    'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
+    #'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
+    'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
     'NODE_BINARY': {'type': str, 'default': 'node'},
     'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
     'CHROME_BINARY': {'type': str, 'default': None},
@@ -321,6 +340,15 @@ ALLOWED_IN_OUTPUT_DIR = {
     'static_index.json',
 }

+def get_version(config):
+    return json.loads((Path(config['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']
+
+def get_commit_hash(config):
+    try:
+        return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
+    except Exception:
+        return None
+
 ############################## Derived Config ##################################

@@ -345,14 +373,20 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},

     'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
-    'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']},
+    'VERSION': {'default': lambda c: get_version(c)},
+    'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)},

     'PYTHON_BINARY': {'default': lambda c: sys.executable},
     'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
     'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},

-    'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
+    'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
     'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},

+    'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
+    'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
+    #'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'},  # set at runtime below, interesting but unused for now
+    #'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']},  # set at runtime below
+
     'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
     'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
@@ -373,6 +407,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
     'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
+    'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},

     'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
     'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
@@ -652,7 +687,9 @@ def bin_version(binary: Optional[str]) -> Optional[str]:
         return None

     try:
-        version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
+        version_str = run([abspath, "--version"], stdout=PIPE, env={'LANG': 'C'}).stdout.strip().decode()
+        if not version_str:
+            version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
         # take first 3 columns of first line of version info
         return ' '.join(version_str.split('\n')[0].strip().split()[:3])
     except OSError:
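The env={'LANG': 'C'} change above pins the child process to an unlocalized locale so --version output parses the same way everywhere, with a second un-pinned run as a fallback for tools that print nothing under LANG=C. The same logic in isolation (a sketch, not ArchiveBox code; curl is only an example binary):

    from subprocess import run, PIPE

    def first_version_line(binary='curl'):
        # pin the locale so the output format is stable across systems
        out = run([binary, '--version'], stdout=PIPE, env={'LANG': 'C'}).stdout.strip().decode()
        if not out:
            # some tools misbehave with a stripped-down environment, retry without it
            out = run([binary, '--version'], stdout=PIPE).stdout.strip().decode()
        # take first 3 columns of the first line, as bin_version() does
        return ' '.join(out.split('\n')[0].strip().split()[:3])

    print(first_version_line())  # e.g. 'curl 7.88.1 (x86_64-pc-linux-gnu)'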
@@ -795,6 +832,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
         'path': config['OUTPUT_DIR'].resolve(),
         'enabled': True,
         'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
+        'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()),
     },
     'SOURCES_DIR': {
         'path': config['SOURCES_DIR'].resolve(),
@@ -810,6 +848,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
         'path': config['ARCHIVE_DIR'].resolve(),
         'enabled': True,
         'is_valid': config['ARCHIVE_DIR'].exists(),
+        'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
     },
     'CONFIG_FILE': {
         'path': config['CONFIG_FILE'].resolve(),
@@ -820,18 +859,12 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
         'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(),
         'enabled': True,
         'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
+        'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()),
     },
 }

 def get_dependency_info(config: ConfigDict) -> ConfigValue:
     return {
-        'ARCHIVEBOX_BINARY': {
-            'path': bin_path(config['ARCHIVEBOX_BINARY']),
-            'version': config['VERSION'],
-            'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
-            'enabled': True,
-            'is_valid': True,
-        },
         'PYTHON_BINARY': {
             'path': bin_path(config['PYTHON_BINARY']),
             'version': config['PYTHON_VERSION'],
@@ -839,6 +872,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': True,
             'is_valid': bool(config['PYTHON_VERSION']),
         },
+        'SQLITE_BINARY': {
+            'path': bin_path(config['SQLITE_BINARY']),
+            'version': config['SQLITE_VERSION'],
+            'hash': bin_hash(config['SQLITE_BINARY']),
+            'enabled': True,
+            'is_valid': bool(config['SQLITE_VERSION']),
+        },
         'DJANGO_BINARY': {
             'path': bin_path(config['DJANGO_BINARY']),
             'version': config['DJANGO_VERSION'],
@@ -846,6 +886,14 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': True,
             'is_valid': bool(config['DJANGO_VERSION']),
         },
+        'ARCHIVEBOX_BINARY': {
+            'path': bin_path(config['ARCHIVEBOX_BINARY']),
+            'version': config['VERSION'],
+            'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
+            'enabled': True,
+            'is_valid': True,
+        },
+
         'CURL_BINARY': {
             'path': bin_path(config['CURL_BINARY']),
             'version': config['CURL_VERSION'],
@@ -931,7 +979,7 @@ def get_chrome_info(config: ConfigDict) -> ConfigValue:
         'TIMEOUT': config['TIMEOUT'],
         'RESOLUTION': config['RESOLUTION'],
         'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
-        'CHROME_BINARY': config['CHROME_BINARY'],
+        'CHROME_BINARY': bin_path(config['CHROME_BINARY']),
         'CHROME_HEADLESS': config['CHROME_HEADLESS'],
         'CHROME_SANDBOX': config['CHROME_SANDBOX'],
         'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
@@ -972,13 +1020,22 @@ globals().update(CONFIG)
 # Set timezone to UTC and umask to OUTPUT_PERMISSIONS
-os.environ["TZ"] = 'UTC'
+assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC'  # we may allow this to change later
+os.environ["TZ"] = TIMEZONE
 os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8))  # noqa: F821

 # add ./node_modules/.bin to $PATH so we can use node scripts in extractors
 NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
 sys.path.append(NODE_BIN_PATH)

+# OPTIONAL: also look around the host system for node modules to use
+# avoid enabling this unless absolutely needed,
+# having overlapping potential sources of libs is a big source of bugs/confusing to users
+# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
+# sys.path.append(DEV_NODE_BIN_PATH)
+# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
+# sys.path.append(USER_NODE_BIN_PATH)
+
 # disable stderr "you really shouldnt disable ssl" warnings with library config
 if not CONFIG['CHECK_SSL_VALIDITY']:
     import urllib3
@@ -986,6 +1043,13 @@ if not CONFIG['CHECK_SSL_VALIDITY']:
     requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
     urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

+# get SQLite database version, compile options, and runtime options
+# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
+#cursor = sqlite3.connect(':memory:').cursor()
+#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
+#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
+#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
+#cursor.close()
+
 ########################### Config Validity Checkers ###########################
@@ -1082,6 +1146,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
         stderr('    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
         stderr()


 def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None:
     output_dir = out_dir or config['OUTPUT_DIR']
     assert isinstance(output_dir, (str, Path))
@@ -1156,11 +1221,10 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
         # without running migrations automatically (user runs them manually by calling init)
         django.setup()

         from django.conf import settings

         # log startup message to the error log
-        with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f:
+        with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
             command = ' '.join(sys.argv)
             ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
             f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
@@ -1170,10 +1234,17 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
             # Enable WAL mode in sqlite3
             from django.db import connection
             with connection.cursor() as cursor:
+
+                # Set Journal mode to WAL to allow for multiple writers
+                current_mode = cursor.execute("PRAGMA journal_mode")
+                if current_mode != 'wal':
+                    cursor.execute("PRAGMA journal_mode=wal;")
+
+                # Set max blocking delay for concurrent writes and write sync mode
+                # https://litestream.io/tips/#busy-timeout
+                cursor.execute("PRAGMA busy_timeout = 5000;")
+                cursor.execute("PRAGMA synchronous = NORMAL;")

             # Create cache table in DB if needed
             try:
                 from django.core.cache import cache
@@ -1181,7 +1252,6 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
         except django.db.utils.OperationalError:
             call_command("createcachetable", verbosity=0)

         # if archivebox gets imported multiple times, we have to close
         # the sqlite3 whenever we init from scratch to avoid multiple threads
         # sharing the same connection by accident
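For reference, the same PRAGMAs applied to a plain sqlite3 connection outside Django (a sketch; the database filename is illustrative). With the stdlib driver the pragma's current value comes back from fetchone():

    import sqlite3

    conn = sqlite3.connect('index.sqlite3')
    cursor = conn.cursor()
    current_mode = cursor.execute('PRAGMA journal_mode').fetchone()[0]
    if current_mode != 'wal':
        cursor.execute('PRAGMA journal_mode=wal;')   # readers no longer block writers
    cursor.execute('PRAGMA busy_timeout = 5000;')    # wait up to 5s on a locked db instead of erroring
    cursor.execute('PRAGMA synchronous = NORMAL;')   # safe in WAL mode, fewer fsyncs than FULL
    conn.close()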
@@ -98,6 +98,7 @@ class ConfigDict(BaseConfig, total=False):
     WGET_ARGS: List[str]
     CURL_ARGS: List[str]
     GIT_ARGS: List[str]
+    TAG_SEPARATOR_PATTERN: str


 ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]
archivebox/core/migrations/0021_auto_20220914_0934.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.14 on 2022-09-14 09:34
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0020_auto_20210410_1031'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='extractor',
+            field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+        ),
+    ]
@@ -19,7 +19,7 @@ from ..config import (
     SQL_INDEX_FILENAME,
     OUTPUT_DIR,
     LOGS_DIR,
-    TIME_ZONE,
+    TIMEZONE,
 )

 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
@@ -157,7 +157,7 @@ DATABASES = {
             'timeout': 60,
             'check_same_thread': False,
         },
-        'TIME_ZONE': 'UTC',
+        'TIME_ZONE': TIMEZONE,
         # DB setup is sometimes modified at runtime by setup_django() in config.py
     }
 }
@@ -227,7 +227,8 @@ USE_L10N = True
 USE_TZ = True
 DATETIME_FORMAT = 'Y-m-d g:iA'
 SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
-TIME_ZONE = TIME_ZONE  # noqa
+TIME_ZONE = TIMEZONE  # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent


 from django.conf.locale.en import formats as en_formats
@@ -6,7 +6,7 @@ from django.contrib.staticfiles.urls import staticfiles_urlpatterns
 from django.conf import settings
 from django.views.generic.base import RedirectView

-from core.views import HomepageView, SnapshotView, PublicIndexView, AddView
+from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView


 # print('DEBUG', settings.DEBUG)
@@ -24,14 +24,16 @@ urlpatterns = [

     path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
     path('add/', AddView.as_view(), name='add'),

     path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
     path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),

     path('accounts/', include('django.contrib.auth.urls')),
     path('admin/', admin.site.urls),

+    path('health/', HealthCheckView.as_view(), name='healthcheck'),
+
     path('index.html', RedirectView.as_view(url='/')),
     path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
     path('', HomepageView.as_view(), name='Home'),
@@ -38,7 +38,7 @@ class HomepageView(View):
     if PUBLIC_INDEX:
         return redirect('/public')

     return redirect(f'/admin/login/?next={request.path}')

@@ -205,7 +205,7 @@ class SnapshotView(View):
             content_type="text/html",
             status=404,
         )


 class PublicIndexView(ListView):
     template_name = 'public_index.html'
@@ -220,7 +220,7 @@ class PublicIndexView(ListView):
         'FOOTER_INFO': FOOTER_INFO,
     }

     def get_queryset(self, **kwargs):
         qs = super().get_queryset(**kwargs)
         query = self.request.GET.get('q')
         if query and query.strip():
@@ -249,7 +249,7 @@ class AddView(UserPassesTestMixin, FormView):
         url = self.request.GET.get('url', None)
         if url:
             return {'url': url if '://' in url else f'https://{url}'}

         return super().get_initial()

     def test_func(self):
@@ -295,3 +295,18 @@ class AddView(UserPassesTestMixin, FormView):
                 "form": AddLinkForm()
             })
             return render(template_name=self.template_name, request=self.request, context=context)
+
+
+class HealthCheckView(View):
+    """
+    A Django view that renders plain text "OK" for service discovery tools
+    """
+    def get(self, request):
+        """
+        Handle a GET request
+        """
+        return HttpResponse(
+            'OK',
+            content_type='text/plain',
+            status=200
+        )
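What the new endpoint looks like to a load balancer or uptime checker (a sketch assuming a server on the default dev address; requests is already an ArchiveBox dependency):

    import requests

    r = requests.get('http://127.0.0.1:8000/health/')
    assert r.status_code == 200 and r.text == 'OK'

Because the view skips authentication and templates entirely, it stays cheap enough to poll frequently.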
@@ -1,12 +1,14 @@
 __package__ = 'archivebox.extractors'

 import os
+import sys
 from pathlib import Path

 from typing import Optional, List, Iterable, Union
 from datetime import datetime, timezone
 from django.db.models import QuerySet

+from ..core.settings import ERROR_LOG
 from ..index.schema import Link
 from ..index.sql import write_link_to_sql_index
 from ..index import (
@@ -42,7 +44,6 @@ from .headers import should_save_headers, save_headers
 def get_default_archive_methods():
     return [
-        ('title', should_save_title, save_title),
         ('favicon', should_save_favicon, save_favicon),
         ('headers', should_save_headers, save_headers),
         ('singlefile', should_save_singlefile, save_singlefile),
@@ -50,7 +51,8 @@ def get_default_archive_methods():
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
         ('wget', should_save_wget, save_wget),
-        ('readability', should_save_readability, save_readability),  # keep readability below wget and singlefile, as it depends on them
+        ('title', should_save_title, save_title),  # keep title and readability below wget and singlefile, as it depends on them
+        ('readability', should_save_readability, save_readability),
         ('mercury', should_save_mercury, save_mercury),
         ('git', should_save_git, save_git),
         ('media', should_save_media, save_media),
@@ -127,10 +129,27 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
             # print('{black} X {}{reset}'.format(method_name, **ANSI))
             stats['skipped'] += 1
         except Exception as e:
+            # Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
+            # and https://github.com/ArchiveBox/ArchiveBox/issues/1014
+            # are fixed.
+            """
             raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                 method_name,
                 link.url,
             )) from e
+            """
+            # Instead, use the kludgy workaround from
+            # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
+            with open(ERROR_LOG, "a", encoding='utf-8') as f:
+                command = ' '.join(sys.argv)
+                ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
+                f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
+                    method_name,
+                    link.url,
+                    command,
+                    ts
+                ) + "\n"))
+                #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")

     # print(' ', stats)
@@ -182,7 +201,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
         except KeyboardInterrupt:
             log_archiving_paused(num_links, idx, link.timestamp)
             raise SystemExit(0)
-        except BaseException:  # lgtm [py/catch-base-exception]
+        except BaseException:
             print()
             raise
@@ -33,7 +33,7 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
 @enforce_types
 def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
-    """Download playlists or individual video, audio, and subtitles using youtube-dl"""
+    """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""

     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = 'media'
@@ -43,6 +43,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
         YOUTUBEDL_BINARY,
         *YOUTUBEDL_ARGS,
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
+        # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
         link.url,
     ]
     status = 'succeeded'
@@ -60,7 +61,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
             pass
         else:
             hints = (
-                'Got youtube-dl response code: {}.'.format(result.returncode),
+                'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
                 *result.stderr.decode().split('\n'),
             )
             raise ArchiveError('Failed to save media', hints)
@@ -71,8 +72,18 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
         timer.end()

     # add video description and subtitles to full-text index
+    # Let's try a few different
     index_texts = [
-        text_file.read_text(encoding='utf-8').strip()
+        # errors:
+        # * 'strict' to raise a ValueError exception if there is an
+        #   encoding error. The default value of None has the same effect.
+        # * 'ignore' ignores errors. Note that ignoring encoding errors
+        #   can lead to data loss.
+        # * 'xmlcharrefreplace' is only supported when writing to a
+        #   file. Characters not supported by the encoding are replaced with
+        #   the appropriate XML character reference &#nnn;.
+        # There are a few more options described in https://docs.python.org/3/library/functions.html#open
+        text_file.read_text(encoding='utf-8', errors='xmlcharrefreplace').strip()
         for text_file in (
             *output_path.glob('*.description'),
             *output_path.glob('*.srt'),
@@ -10,9 +10,7 @@ from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..system import run, atomic_write
 from ..util import (
     enforce_types,
-    download_url,
     is_static_file,
-
 )
 from ..config import (
     TIMEOUT,
@@ -22,28 +20,8 @@ from ..config import (
     READABILITY_VERSION,
 )
 from ..logging_util import TimedProgress
+from .title import get_html

-@enforce_types
-def get_html(link: Link, path: Path) -> str:
-    """
-    Try to find wget, singlefile and then dom files.
-    If none is found, download the url again.
-    """
-    canonical = link.canonical_outputs()
-    abs_path = path.absolute()
-    sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
-    document = None
-    for source in sources:
-        try:
-            with open(abs_path / source, "r", encoding="utf-8") as f:
-                document = f.read()
-                break
-        except (FileNotFoundError, TypeError):
-            continue
-    if document is None:
-        return download_url(link.url)
-    else:
-        return document

 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
@@ -17,6 +17,7 @@ from ..config import (
     SAVE_SINGLEFILE,
     DEPENDENCIES,
     SINGLEFILE_VERSION,
+    SINGLEFILE_ARGS,
     CHROME_BINARY,
 )
 from ..logging_util import TimedProgress
@@ -45,10 +46,31 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO

     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
     browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
-    cmd = [
-        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
+    options = [
+        *SINGLEFILE_ARGS,
         '--browser-executable-path={}'.format(CHROME_BINARY),
         browser_args,
     ]
+
+    # Deduplicate options (single-file doesn't like when you use the same option two times)
+    #
+    # NOTE: Options names that come first clobber conflicting names that come later
+    # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
+    # specificity, therefore the user sets it with a lot intent, therefore it should take precedence
+    # kind of like the ergonomic principle of lexical scope in programming languages.
+    seen_option_names = []
+    def test_seen(argument):
+        option_name = argument.split("=")[0]
+        if option_name in seen_option_names:
+            return False
+        else:
+            seen_option_names.append(option_name)
+            return True
+    deduped_options = list(filter(test_seen, options))
+
+    cmd = [
+        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
+        *deduped_options,
         link.url,
         output,
     ]
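The dedup above is first-wins by option name, which is why user-supplied SINGLEFILE_ARGS are listed before the built-in defaults. The same logic in isolation (option values are made up for illustration):

    options = [
        '--browser-executable-path=/usr/bin/chromium',       # from SINGLEFILE_ARGS, listed first, wins
        '--browser-args=["--no-sandbox"]',
        '--browser-executable-path=/usr/bin/google-chrome',  # built-in default, same name, dropped
    ]

    seen_option_names = []
    def test_seen(argument):
        option_name = argument.split("=")[0]   # compare by the part before '='
        if option_name in seen_option_names:
            return False
        seen_option_names.append(option_name)
        return True

    print(list(filter(test_seen, options)))
    # ['--browser-executable-path=/usr/bin/chromium', '--browser-args=["--no-sandbox"]']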
@@ -58,6 +58,27 @@ class TitleParser(HTMLParser):
         if tag.lower() == "title":
             self.inside_title_tag = False

+@enforce_types
+def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
+    """
+    Try to find wget, singlefile and then dom files.
+    If none is found, download the url again.
+    """
+    canonical = link.canonical_outputs()
+    abs_path = path.absolute()
+    sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
+    document = None
+    for source in sources:
+        try:
+            with open(abs_path / source, "r", encoding="utf-8") as f:
+                document = f.read()
+                break
+        except (FileNotFoundError, TypeError):
+            continue
+    if document is None:
+        return download_url(link.url, timeout=timeout)
+    else:
+        return document

 @enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
@@ -90,7 +111,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        html = download_url(link.url, timeout=timeout)
+        html = get_html(link, out_dir, timeout=timeout)
         try:
             # try using relatively strict html parser first
             parser = TitleParser()
@@ -24,6 +24,7 @@ from ..config import (
     FOOTER_INFO,
     HTML_INDEX_FILENAME,
     SAVE_ARCHIVE_DOT_ORG,
+    PREVIEW_ORIGINALS,
 )

 MAIN_INDEX_TEMPLATE = 'static_index.html'
@@ -105,6 +106,7 @@ def link_details_template(link: Link) -> str:
         'status_color': 'success' if link.is_archived else 'danger',
         'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
         'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
+        'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS,
     })

 @enforce_types
@@ -1,5 +1,7 @@
 __package__ = 'archivebox.index'

+import re
+
 from io import StringIO
 from pathlib import Path
 from typing import List, Tuple, Iterator
@@ -8,7 +10,10 @@ from django.db import transaction

 from .schema import Link
 from ..util import enforce_types, parse_date
-from ..config import OUTPUT_DIR
+from ..config import (
+    OUTPUT_DIR,
+    TAG_SEPARATOR_PATTERN,
+)


 ### Main Links Index
@@ -33,9 +38,11 @@ def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir:
 def write_link_to_sql_index(link: Link):
     from core.models import Snapshot, ArchiveResult
     info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
-    tags = info.pop("tags")
-    if tags is None:
-        tags = []
+
+    tag_list = list(dict.fromkeys(
+        tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
+    ))
+    info.pop('tags')

     try:
         info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
@@ -44,7 +51,7 @@ def write_link_to_sql_index(link: Link):
         info["timestamp"] = str(float(info["timestamp"]) + 1.0)

     snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
-    snapshot.save_tags(tags)
+    snapshot.save_tags(tag_list)

     for extractor, entries in link.history.items():
         for entry in entries:
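The new tag handling in isolation: re.split honors the configurable TAG_SEPARATOR_PATTERN, and dict.fromkeys dedupes while preserving first-seen order, which set() would not. A sketch with example input:

    import re

    TAG_SEPARATOR_PATTERN = r'[,]'   # the default from CONFIG_SCHEMA above
    raw = 'news, tech,news , archive'
    tag_list = list(dict.fromkeys(
        tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, raw)
    ))
    print(tag_list)  # ['news', 'tech', 'archive']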
@@ -104,10 +111,9 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
     snap = write_link_to_sql_index(link)
     snap.title = link.title

-    tag_set = (
-        set(tag.strip() for tag in (link.tags or '').split(','))
-    )
-    tag_list = list(tag_set) or []
+    tag_list = list(dict.fromkeys(
+        tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
+    ))

     snap.save()
     snap.save_tags(tag_list)
@@ -432,7 +432,13 @@ def log_archive_method_finished(result: "ArchiveResult"):
     # Prettify error output hints string and limit to five lines
     hints = getattr(result.output, 'hints', None) or ()
     if hints:
-        hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
+        if isinstance(hints, (list, tuple, type(_ for _ in ()))):
+            hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
+        else:
+            if isinstance(hints, bytes):
+                hints = hints.decode()
+            hints = hints.split('\n')

         hints = (
             '    {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
             for line in hints[:5] if line.strip()
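The odd-looking type(_ for _ in ()) above is just an import-free way to name Python's generator type; the stdlib spelling is types.GeneratorType. A quick sketch (equivalent only for the type test, not the full hint cleanup):

    import types

    assert type(_ for _ in ()) is types.GeneratorType
    hints = (line for line in (b'error 1', b'error 2'))
    print(isinstance(hints, (list, tuple, types.GeneratorType)))  # True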
@@ -566,7 +572,7 @@ def printable_config(config: ConfigDict, prefix: str='') -> str:
 def printable_folder_status(name: str, folder: Dict) -> str:
     if folder['enabled']:
         if folder['is_valid']:
-            color, symbol, note = 'green', '√', 'valid'
+            color, symbol, note, num_files = 'green', '√', 'valid', ''
         else:
             color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
     else:
@@ -581,6 +587,10 @@ def printable_folder_status(name: str, folder: Dict) -> str:
         )
     else:
         num_files = 'missing'

+    if folder.get('is_mount'):
+        # add symbol @ next to filecount if path is a remote filesystem mount
+        num_files = f'{num_files} @' if num_files else '@'
+
     path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
     if path and ' ' in path:
@@ -4,8 +4,9 @@ import os
 import sys
 import shutil
 import platform
+from django.utils import timezone
 from pathlib import Path
-from datetime import date
+from datetime import date, datetime

 from typing import Dict, List, Optional, Iterable, IO, Union
 from crontab import CronTab, CronSlices
@@ -70,7 +71,12 @@ from .config import (
     IS_TTY,
+    DEBUG,
     IN_DOCKER,
+    PUID,
+    PGID,
+    USER,
+    TIMEZONE,
     ENFORCE_ATOMIC_WRITES,
     OUTPUT_PERMISSIONS,
     PYTHON_BINARY,
     ARCHIVEBOX_BINARY,
     ONLY_NEW,
@@ -90,6 +96,7 @@ from .config import (
     check_data_folder,
     write_config_file,
     VERSION,
+    COMMIT_HASH,
     CODE_LOCATIONS,
     EXTERNAL_LOCATIONS,
     DATA_LOCATIONS,
@@ -203,32 +210,44 @@ def help(out_dir: Path=OUTPUT_DIR) -> None:
 def version(quiet: bool=False,
             out_dir: Path=OUTPUT_DIR) -> None:
     """Print the ArchiveBox version and dependency information"""

-    if quiet:
-        print(VERSION)
-    else:
-        # ArchiveBox v0.5.6
-        # Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
-        print('ArchiveBox v{}'.format(VERSION))

+    print(VERSION)

+    if not quiet:
+        # 0.6.3
+        # ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
+        # DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 501:20 SEARCH_BACKEND=ripgrep

         p = platform.uname()
         print(
             'ArchiveBox v{}'.format(VERSION),
+            *((COMMIT_HASH[:7],) if COMMIT_HASH else ()),
             sys.implementation.name.title(),
             p.system,
             platform.platform(),
             p.machine,
         )
         OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
         print(
-            f'IN_DOCKER={IN_DOCKER}',
+            f'DEBUG={DEBUG}',
+            f'IN_DOCKER={IN_DOCKER}',
             f'IS_TTY={IS_TTY}',
-            f'TZ={os.environ.get("TZ", "UTC")}',
-            f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}',
+            f'TZ={TIMEZONE}',
+            #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
+            f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
+            f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
+            f'FS_PERMS={OUTPUT_PERMISSIONS} {PUID}:{PGID}',
+            f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
         )
         print()

         print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
         for name, dependency in DEPENDENCIES.items():
             print(printable_dependency_version(name, dependency))

+            # add a newline between core dependencies and extractor dependencies for easier reading
+            if name == 'ARCHIVEBOX_BINARY':
+                print()

         print()
         print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
@@ -427,7 +446,7 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
     print('        archivebox server  # then visit http://127.0.0.1:8000')
     print()
     print('    To add new links, you can run:')
-    print("        archivebox add ~/some/path/or/url/to/list_of_links.txt")
+    print("        archivebox add < ~/some/path/to/list_of_links.txt")
     print()
     print('    For more usage and examples, run:')
     print('        archivebox help')
@@ -554,7 +573,8 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
 def add(urls: Union[str, List[str]],
         tag: str='',
         depth: int=0,
-        update_all: bool=not ONLY_NEW,
+        update: bool=not ONLY_NEW,
+        update_all: bool=False,
         index_only: bool=False,
         overwrite: bool=False,
         # duplicate: bool=False,  # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
@@ -587,6 +607,7 @@ def add(urls: Union[str, List[str]],
     # save verbatim args to sources
     write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)

     new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)

     # If we're going one level deeper, download each link and look for more links
@@ -594,8 +615,11 @@ def add(urls: Union[str, List[str]],
     if new_links and depth == 1:
         log_crawl_started(new_links)
         for new_link in new_links:
-            downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
-            new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+            try:
+                downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
+                new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+            except Exception as err:
+                stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red')

     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
@@ -618,11 +642,21 @@ def add(urls: Union[str, List[str]],
     if extractors:
         archive_kwargs["methods"] = extractors

-    if update_all:
+    stderr()

+    ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

+    if update:
+        stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
+        archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
+    elif update_all:
+        stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
+        archive_links(all_links, overwrite=overwrite, **archive_kwargs)
+    elif overwrite:
+        stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
+        archive_links(imported_links, overwrite=True, **archive_kwargs)
+    elif new_links:
+        stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
+        archive_links(new_links, overwrite=False, **archive_kwargs)
@@ -1113,6 +1147,7 @@ def schedule(add: bool=False,
              every: Optional[str]=None,
              depth: int=0,
              overwrite: bool=False,
+             update: bool=not ONLY_NEW,
              import_path: Optional[str]=None,
              out_dir: Path=OUTPUT_DIR):
     """Set ArchiveBox to regularly import URLs at specific times using cron"""
@@ -1142,6 +1177,7 @@ def schedule(add: bool=False,
         *([
             'add',
             *(['--overwrite'] if overwrite else []),
+            *(['--update'] if update else []),
             f'--depth={depth}',
             f'"{import_path}"',
         ] if import_path else ['update']),
@@ -149,7 +149,17 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None,
 def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str:
     ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0]
     source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(ts=ts))
-    atomic_write(source_path, raw_text)
+
+    referenced_texts = ''
+
+    for entry in raw_text.split():
+        try:
+            if Path(entry).exists():
+                referenced_texts += Path(entry).read_text()
+        except Exception as err:
+            print(err)
+
+    atomic_write(source_path, raw_text + '\n' + referenced_texts)
     log_source_saved(source_file=source_path)
     return source_path
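The change above means any whitespace-separated token in the pasted text that happens to be an existing local path gets its contents appended to the saved source file. The same behavior in isolation (paths are examples):

    from pathlib import Path

    raw_text = 'https://example.com/page bookmarks.html'
    referenced_texts = ''
    for entry in raw_text.split():
        try:
            if Path(entry).exists():
                referenced_texts += Path(entry).read_text()
        except Exception as err:  # e.g. OSError for tokens too long to be a valid path
            print(err)
    combined = raw_text + '\n' + referenced_texts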
@@ -176,7 +186,7 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
             ANSI['reset'],
         ))
         print('    ', e)
-        raise SystemExit(1)
+        raise e

     else:
         # Source is a path to a local file on the filesystem
@@ -47,11 +47,11 @@ def get_pocket_articles(api: Pocket, since=None, page=0):
 def link_from_article(article: dict, sources: list):
-    url: str = article['resolved_url'] or article['given_url']
+    url: str = article.get('resolved_url') or article['given_url']
     broken_protocol = _BROKEN_PROTOCOL_RE.match(url)
     if broken_protocol:
         url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://')
-    title = article['resolved_title'] or article['given_title'] or url
+    title = article.get('resolved_title') or article.get('given_title') or url

     return Link(
         url=url,
@@ -34,13 +34,19 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
         trailing_removed = entry.split('</entry>', 1)[0]
         leading_removed = trailing_removed.strip()
-        rows = leading_removed.split('\n')
+        splits_fixed = leading_removed.replace('"\n href="', '" href="')
+        rows = splits_fixed.split('\n')

-        def get_row(key):
-            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
+        def get_row(prefix):
+            return [
+                row.strip()
+                for row in rows
+                if row.strip().startswith('<{}'.format(prefix))
+            ][0]

         title = str_between(get_row('title'), '<title><![CDATA[', ']]></title>').strip()
-        url = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
+        url_inside_link = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
+        url_inside_attr = str_between(get_row('link rel="via"'), 'href="', '"/>')
         ts_str = str_between(get_row('published'), '<published>', '</published>')
         time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
         try:
@@ -49,7 +55,7 @@
             tags = None

         yield Link(
-            url=htmldecode(url),
+            url=htmldecode(url_inside_attr or url_inside_link),
             timestamp=str(time.timestamp()),
             title=htmldecode(title) or None,
             tags=tags or '',
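The parser now tries the URL in the href attribute first and keeps the tag body as a fallback, which suggests wallabag exports occur in both shapes. A sketch with a simplified stand-in for archivebox.util.str_between (the real helper also strips whitespace):

    def str_between(string, start, end):  # simplified stand-in, assumption
        return string.split(start, 1)[-1].split(end, 1)[0]

    row = '<link rel="via" href="https://example.com/article"/>'
    url_inside_link = str_between(row, '<link rel="via">', '</link>')
    url_inside_attr = str_between(row, 'href="', '"/>')
    print(url_inside_attr or url_inside_link)  # https://example.com/article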
@@ -197,7 +197,7 @@
     // select the action button from the dropdown
     container.find('select[name=action]')
-        .find('option:selected').removeAttr('selected').end()
+        .find('[selected]').removeAttr('selected').end()
         .find('[value=' + action_type + ']').attr('selected', 'selected').click()

     // click submit & replace the archivebox logo with a spinner
@@ -28,6 +28,14 @@
         <a href="/add" id="submit"> Add more URLs ➕</a>
     </center>
 {% else %}
+    <div id="in-progress" style="display: none;">
+        <center><h3>Adding URLs to index and running archive methods...</h3>
+        <br/>
+        <div class="loader"></div>
+        <br/>
+        Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...
+        </center>
+    </div>
     <form id="add-form" method="POST" class="p-form">{% csrf_token %}
         <h1>Add new URLs to your archive</h1>
         <br/>
@@ -48,10 +56,9 @@
 {% endif %}
 <script>
     document.getElementById('add-form').addEventListener('submit', function(event) {
-        setTimeout(function() {
-            document.getElementById('add-form').innerHTML = '<center><h3>Adding URLs to index and running archive methods...<h3><br/><div class="loader"></div><br/>Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...</center>'
-            document.getElementById('delay-warning').style.display = 'block'
-        }, 200)
+        document.getElementById('in-progress').style.display = 'block'
+        document.getElementById('add-form').style.display = 'none'
+        document.getElementById('delay-warning').style.display = 'block'
         return true
     })
 </script>
@@ -414,6 +414,7 @@
             </div>
         </div>
         {% endif %}
+        {% if PREVIEW_ORIGINALS %}
         <div class="col-lg-2">
             <div class="card">
                 <iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy" referrerpolicy="no-referrer"></iframe>
@@ -427,6 +428,7 @@
                 </div>
             </div>
         </div>
+        {% endif %}
         <div class="col-lg-2">
             <div class="card">
                 <iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>