Merge branch 'dev' into feat/reverse-proxy-auth
@@ -30,11 +30,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
 )
 parser.add_argument(
-    '--update-all', #'-n',
+    '--update', #'-u',
     action='store_true',
     default=not ONLY_NEW,  # when ONLY_NEW=True we skip updating old links
     help="Also retry previously skipped/failed links when adding new links",
 )
+parser.add_argument(
+    '--update-all', #'-n',
+    action='store_true',
+    default=False,
+    help="Also update ALL links in index when finished adding new links",
+)
 parser.add_argument(
     '--index-only', #'-o',
     action='store_true',
@@ -104,6 +110,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         urls=stdin_urls or urls,
         depth=command.depth,
         tag=command.tag,
+        update=command.update,
         update_all=command.update_all,
         index_only=command.index_only,
         overwrite=command.overwrite,

@@ -51,6 +51,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
     action='store_true',
     help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots',
 )
+parser.add_argument(
+    '--update',
+    action='store_true',
+    help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults',
+)
 group.add_argument(
     '--clear', # '-c'
     action='store_true',
@@ -94,6 +99,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
         every=command.every,
         depth=command.depth,
         overwrite=command.overwrite,
+        update=command.update,
         import_path=command.import_path,
         out_dir=pwd or OUTPUT_DIR,
     )
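The CLI hunks above split one flag into two: --update keeps the old not ONLY_NEW default and re-pulls only the just-added set, while --update-all now defaults to off and touches the whole index. A minimal standalone argparse sketch of that split (flag names and defaults copied from the diff, everything else omitted):

    import argparse

    ONLY_NEW = True  # mirrors the ONLY_NEW config default
    parser = argparse.ArgumentParser()
    parser.add_argument('--update', action='store_true', default=not ONLY_NEW,
                        help='Also retry previously skipped/failed links when adding new links')
    parser.add_argument('--update-all', action='store_true', default=False,
                        help='Also update ALL links in index when finished adding new links')
    print(parser.parse_args(['--update']))  # Namespace(update=True, update_all=False)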
@@ -26,11 +26,12 @@ import io
 import re
 import sys
 import json
+import inspect
 import getpass
 import platform
 import shutil
-import sqlite3
 import django
+from sqlite3 import dbapi2 as sqlite3

 from hashlib import md5
 from pathlib import Path
@@ -48,6 +49,9 @@ from .config_stubs import (
     ConfigDefaultDict,
 )

+### Pre-Fetch Minimal System Config
+
+SYSTEM_USER = getpass.getuser() or os.getlogin()

 try:
@@ -65,6 +69,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
     'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')},  # progress bars are buggy on mac, disable for now
     'IN_DOCKER': {'type': bool, 'default': False},
+    'PUID': {'type': int, 'default': os.getuid()},
+    'PGID': {'type': int, 'default': os.getgid()},
     # TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
 },

@@ -79,6 +85,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'},  # to avoid downloading code assets as their own pages
     'URL_WHITELIST': {'type': str, 'default': None},
     'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
+    'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
 },

 'SERVER_CONFIG': {
@@ -93,9 +100,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
     'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
-    'TIME_ZONE': {'type': str, 'default': 'UTC'},
+    'TIMEZONE': {'type': str, 'default': 'UTC'},
     'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
     'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
     'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
+    'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
 },

 'ARCHIVE_METHOD_TOGGLES': {
@@ -122,9 +131,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
     'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},

-    'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
-    'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
-    'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
+    'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
+    'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
+    'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},

     'COOKIES_FILE': {'type': str, 'default': None},
     'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
@@ -139,10 +148,18 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     '--no-call-home',
     '--write-sub',
-    '--all-subs',
-    '--write-auto-sub',
+    # There are too many of these and youtube
+    # throttles you with HTTP error 429
+    #'--write-auto-subs',
     '--convert-subs=srt',
     '--yes-playlist',
     '--continue',
+    # This flag doesn't exist in youtube-dl
+    # only in yt-dlp
+    '--no-abort-on-error',
+    # --ignore-errors must come AFTER
+    # --no-abort-on-error
+    # https://github.com/yt-dlp/yt-dlp/issues/4914
     '--ignore-errors',
     '--geo-bypass',
     '--add-metadata',
@@ -164,6 +181,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
         '--compressed'
     ]},
     'GIT_ARGS': {'type': list, 'default': ['--recursive']},
+    'SINGLEFILE_ARGS': {'type': list, 'default' : None}
 },

 'SEARCH_BACKEND_CONFIG' : {
@@ -197,7 +215,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
     'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
     'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
     'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
-    'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
+    #'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
+    'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
     'NODE_BINARY': {'type': str, 'default': 'node'},
     'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
     'CHROME_BINARY': {'type': str, 'default': None},
@@ -321,6 +340,15 @@ ALLOWED_IN_OUTPUT_DIR = {
     'static_index.json',
 }

+def get_version(config):
+    return json.loads((Path(config['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']
+
+def get_commit_hash(config):
+    try:
+        return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
+    except Exception:
+        return None
+
 ############################## Derived Config ##################################

@@ -345,14 +373,20 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},

     'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
-    'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']},
+    'VERSION': {'default': lambda c: get_version(c)},
+    'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)},

     'PYTHON_BINARY': {'default': lambda c: sys.executable},
     'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
     'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},

-    'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
+    'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
     'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},

+    'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
+    'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
+    #'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'},  # set at runtime below, interesting but unused for now
+    #'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']},  # set at runtime below
+
     'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
     'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
@@ -373,6 +407,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
     'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
     'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
+    'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},

     'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
     'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
@@ -652,7 +687,9 @@ def bin_version(binary: Optional[str]) -> Optional[str]:
         return None

     try:
-        version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
+        version_str = run([abspath, "--version"], stdout=PIPE, env={'LANG': 'C'}).stdout.strip().decode()
+        if not version_str:
+            version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
         # take first 3 columns of first line of version info
         return ' '.join(version_str.split('\n')[0].strip().split()[:3])
     except OSError:
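The env={'LANG': 'C'} change above pins the child process to an unlocalized locale so --version output parses the same way everywhere, with a second un-pinned run as a fallback for tools that print nothing under LANG=C. The same logic in isolation (a sketch, not ArchiveBox code; curl is only an example binary):

    from subprocess import run, PIPE

    def first_version_line(binary='curl'):
        # pin the locale so the output format is stable across systems
        out = run([binary, '--version'], stdout=PIPE, env={'LANG': 'C'}).stdout.strip().decode()
        if not out:
            # some tools misbehave with a stripped-down environment, retry without it
            out = run([binary, '--version'], stdout=PIPE).stdout.strip().decode()
        # take first 3 columns of the first line, as bin_version() does
        return ' '.join(out.split('\n')[0].strip().split()[:3])

    print(first_version_line())  # e.g. 'curl 7.88.1 (x86_64-pc-linux-gnu)'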
@@ -795,6 +832,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
         'path': config['OUTPUT_DIR'].resolve(),
         'enabled': True,
         'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
+        'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()),
     },
     'SOURCES_DIR': {
         'path': config['SOURCES_DIR'].resolve(),
@@ -810,6 +848,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
         'path': config['ARCHIVE_DIR'].resolve(),
         'enabled': True,
         'is_valid': config['ARCHIVE_DIR'].exists(),
+        'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
     },
     'CONFIG_FILE': {
         'path': config['CONFIG_FILE'].resolve(),
@@ -820,18 +859,12 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
         'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(),
         'enabled': True,
         'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
+        'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()),
     },
 }

 def get_dependency_info(config: ConfigDict) -> ConfigValue:
     return {
-        'ARCHIVEBOX_BINARY': {
-            'path': bin_path(config['ARCHIVEBOX_BINARY']),
-            'version': config['VERSION'],
-            'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
-            'enabled': True,
-            'is_valid': True,
-        },
         'PYTHON_BINARY': {
             'path': bin_path(config['PYTHON_BINARY']),
             'version': config['PYTHON_VERSION'],
@@ -839,6 +872,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': True,
             'is_valid': bool(config['PYTHON_VERSION']),
         },
+        'SQLITE_BINARY': {
+            'path': bin_path(config['SQLITE_BINARY']),
+            'version': config['SQLITE_VERSION'],
+            'hash': bin_hash(config['SQLITE_BINARY']),
+            'enabled': True,
+            'is_valid': bool(config['SQLITE_VERSION']),
+        },
         'DJANGO_BINARY': {
             'path': bin_path(config['DJANGO_BINARY']),
             'version': config['DJANGO_VERSION'],
@@ -846,6 +886,14 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
             'enabled': True,
             'is_valid': bool(config['DJANGO_VERSION']),
         },
+        'ARCHIVEBOX_BINARY': {
+            'path': bin_path(config['ARCHIVEBOX_BINARY']),
+            'version': config['VERSION'],
+            'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
+            'enabled': True,
+            'is_valid': True,
+        },
+
         'CURL_BINARY': {
             'path': bin_path(config['CURL_BINARY']),
             'version': config['CURL_VERSION'],
@@ -931,7 +979,7 @@ def get_chrome_info(config: ConfigDict) -> ConfigValue:
         'TIMEOUT': config['TIMEOUT'],
         'RESOLUTION': config['RESOLUTION'],
         'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
-        'CHROME_BINARY': config['CHROME_BINARY'],
+        'CHROME_BINARY': bin_path(config['CHROME_BINARY']),
         'CHROME_HEADLESS': config['CHROME_HEADLESS'],
         'CHROME_SANDBOX': config['CHROME_SANDBOX'],
         'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
@@ -972,13 +1020,22 @@ globals().update(CONFIG)
 # Set timezone to UTC and umask to OUTPUT_PERMISSIONS
-os.environ["TZ"] = 'UTC'
+assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC'  # we may allow this to change later
+os.environ["TZ"] = TIMEZONE
 os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8))  # noqa: F821

 # add ./node_modules/.bin to $PATH so we can use node scripts in extractors
 NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
 sys.path.append(NODE_BIN_PATH)

+# OPTIONAL: also look around the host system for node modules to use
+# avoid enabling this unless absolutely needed,
+# having overlapping potential sources of libs is a big source of bugs/confusing to users
+# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
+# sys.path.append(DEV_NODE_BIN_PATH)
+# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
+# sys.path.append(USER_NODE_BIN_PATH)
+
 # disable stderr "you really shouldnt disable ssl" warnings with library config
 if not CONFIG['CHECK_SSL_VALIDITY']:
     import urllib3
@@ -986,6 +1043,13 @@ if not CONFIG['CHECK_SSL_VALIDITY']:
     requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
     urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

+# get SQLite database version, compile options, and runtime options
+# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
+#cursor = sqlite3.connect(':memory:').cursor()
+#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
+#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
+#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
+#cursor.close()
+
 ########################### Config Validity Checkers ###########################
@@ -1082,6 +1146,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
         stderr('    https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
         stderr()


 def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None:
     output_dir = out_dir or config['OUTPUT_DIR']
     assert isinstance(output_dir, (str, Path))
@@ -1156,11 +1221,10 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
         # without running migrations automatically (user runs them manually by calling init)
         django.setup()

         from django.conf import settings

         # log startup message to the error log
-        with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f:
+        with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
             command = ' '.join(sys.argv)
             ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
             f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
@@ -1170,10 +1234,17 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
             # Enable WAL mode in sqlite3
             from django.db import connection
             with connection.cursor() as cursor:
+
+                # Set Journal mode to WAL to allow for multiple writers
+                current_mode = cursor.execute("PRAGMA journal_mode")
+                if current_mode != 'wal':
+                    cursor.execute("PRAGMA journal_mode=wal;")
+
+                # Set max blocking delay for concurrent writes and write sync mode
+                # https://litestream.io/tips/#busy-timeout
+                cursor.execute("PRAGMA busy_timeout = 5000;")
+                cursor.execute("PRAGMA synchronous = NORMAL;")

             # Create cache table in DB if needed
             try:
                 from django.core.cache import cache
@@ -1181,7 +1252,6 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
         except django.db.utils.OperationalError:
             call_command("createcachetable", verbosity=0)

         # if archivebox gets imported multiple times, we have to close
         # the sqlite3 whenever we init from scratch to avoid multiple threads
         # sharing the same connection by accident
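For reference, the same PRAGMAs applied to a plain sqlite3 connection outside Django (a sketch; the database filename is illustrative). With the stdlib driver the pragma's current value comes back from fetchone():

    import sqlite3

    conn = sqlite3.connect('index.sqlite3')
    cursor = conn.cursor()
    current_mode = cursor.execute('PRAGMA journal_mode').fetchone()[0]
    if current_mode != 'wal':
        cursor.execute('PRAGMA journal_mode=wal;')   # readers no longer block writers
    cursor.execute('PRAGMA busy_timeout = 5000;')    # wait up to 5s on a locked db instead of erroring
    cursor.execute('PRAGMA synchronous = NORMAL;')   # safe in WAL mode, fewer fsyncs than FULL
    conn.close()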
@@ -98,6 +98,7 @@ class ConfigDict(BaseConfig, total=False):
     WGET_ARGS: List[str]
     CURL_ARGS: List[str]
     GIT_ARGS: List[str]
+    TAG_SEPARATOR_PATTERN: str


 ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]
archivebox/core/migrations/0021_auto_20220914_0934.py (new file, 18 lines)
@@ -0,0 +1,18 @@
+# Generated by Django 3.1.14 on 2022-09-14 09:34
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('core', '0020_auto_20210410_1031'),
+    ]
+
+    operations = [
+        migrations.AlterField(
+            model_name='archiveresult',
+            name='extractor',
+            field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
+        ),
+    ]
@@ -19,7 +19,7 @@ from ..config import (
     SQL_INDEX_FILENAME,
     OUTPUT_DIR,
     LOGS_DIR,
-    TIME_ZONE,
+    TIMEZONE,
 )

 IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
@@ -157,7 +157,7 @@ DATABASES = {
             'timeout': 60,
             'check_same_thread': False,
         },
-        'TIME_ZONE': 'UTC',
+        'TIME_ZONE': TIMEZONE,
         # DB setup is sometimes modified at runtime by setup_django() in config.py
     }
 }
@@ -227,7 +227,8 @@ USE_L10N = True
 USE_TZ = True
 DATETIME_FORMAT = 'Y-m-d g:iA'
 SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
-TIME_ZONE = TIME_ZONE  # noqa
+TIME_ZONE = TIMEZONE  # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent


 from django.conf.locale.en import formats as en_formats
@@ -6,7 +6,7 @@ from django.contrib.staticfiles.urls import staticfiles_urlpatterns
 from django.conf import settings
 from django.views.generic.base import RedirectView

-from core.views import HomepageView, SnapshotView, PublicIndexView, AddView
+from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView


 # print('DEBUG', settings.DEBUG)
@@ -24,14 +24,16 @@ urlpatterns = [

     path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
     path('add/', AddView.as_view(), name='add'),

     path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
     path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),

     path('accounts/', include('django.contrib.auth.urls')),
     path('admin/', admin.site.urls),

+    path('health/', HealthCheckView.as_view(), name='healthcheck'),
+
     path('index.html', RedirectView.as_view(url='/')),
     path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
     path('', HomepageView.as_view(), name='Home'),
@@ -38,7 +38,7 @@ class HomepageView(View):
     if PUBLIC_INDEX:
         return redirect('/public')

     return redirect(f'/admin/login/?next={request.path}')

@@ -205,7 +205,7 @@ class SnapshotView(View):
             content_type="text/html",
             status=404,
         )


 class PublicIndexView(ListView):
     template_name = 'public_index.html'
@@ -220,7 +220,7 @@ class PublicIndexView(ListView):
         'FOOTER_INFO': FOOTER_INFO,
     }

     def get_queryset(self, **kwargs):
         qs = super().get_queryset(**kwargs)
         query = self.request.GET.get('q')
         if query and query.strip():
@@ -249,7 +249,7 @@ class AddView(UserPassesTestMixin, FormView):
         url = self.request.GET.get('url', None)
         if url:
             return {'url': url if '://' in url else f'https://{url}'}

         return super().get_initial()

     def test_func(self):
@@ -295,3 +295,18 @@ class AddView(UserPassesTestMixin, FormView):
                 "form": AddLinkForm()
             })
             return render(template_name=self.template_name, request=self.request, context=context)
+
+
+class HealthCheckView(View):
+    """
+    A Django view that renders plain text "OK" for service discovery tools
+    """
+    def get(self, request):
+        """
+        Handle a GET request
+        """
+        return HttpResponse(
+            'OK',
+            content_type='text/plain',
+            status=200
+        )
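What the new endpoint looks like to a load balancer or uptime checker (a sketch assuming a server on the default dev address; requests is already an ArchiveBox dependency):

    import requests

    r = requests.get('http://127.0.0.1:8000/health/')
    assert r.status_code == 200 and r.text == 'OK'

Because the view skips authentication and templates entirely, it stays cheap enough to poll frequently.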
@@ -1,12 +1,14 @@
 __package__ = 'archivebox.extractors'

 import os
+import sys
 from pathlib import Path

 from typing import Optional, List, Iterable, Union
 from datetime import datetime, timezone
 from django.db.models import QuerySet

+from ..core.settings import ERROR_LOG
 from ..index.schema import Link
 from ..index.sql import write_link_to_sql_index
 from ..index import (
@@ -42,7 +44,6 @@ from .headers import should_save_headers, save_headers
 def get_default_archive_methods():
     return [
-        ('title', should_save_title, save_title),
         ('favicon', should_save_favicon, save_favicon),
         ('headers', should_save_headers, save_headers),
         ('singlefile', should_save_singlefile, save_singlefile),
@@ -50,7 +51,8 @@ def get_default_archive_methods():
         ('screenshot', should_save_screenshot, save_screenshot),
         ('dom', should_save_dom, save_dom),
         ('wget', should_save_wget, save_wget),
-        ('readability', should_save_readability, save_readability),  # keep readability below wget and singlefile, as it depends on them
+        ('title', should_save_title, save_title),  # keep title and readability below wget and singlefile, as it depends on them
+        ('readability', should_save_readability, save_readability),
         ('mercury', should_save_mercury, save_mercury),
         ('git', should_save_git, save_git),
         ('media', should_save_media, save_media),
@@ -127,10 +129,27 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
             # print('{black} X {}{reset}'.format(method_name, **ANSI))
             stats['skipped'] += 1
         except Exception as e:
+            # Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
+            # and https://github.com/ArchiveBox/ArchiveBox/issues/1014
+            # are fixed.
+            """
             raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
                 method_name,
                 link.url,
             )) from e
+            """
+            # Instead, use the kludgy workaround from
+            # https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
+            with open(ERROR_LOG, "a", encoding='utf-8') as f:
+                command = ' '.join(sys.argv)
+                ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
+                f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
+                    method_name,
+                    link.url,
+                    command,
+                    ts
+                ) + "\n"))
+                #f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")

     # print(' ', stats)
@@ -182,7 +201,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
         except KeyboardInterrupt:
             log_archiving_paused(num_links, idx, link.timestamp)
             raise SystemExit(0)
-        except BaseException:  # lgtm [py/catch-base-exception]
+        except BaseException:
             print()
             raise
@@ -33,7 +33,7 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
 @enforce_types
 def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
-    """Download playlists or individual video, audio, and subtitles using youtube-dl"""
+    """Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""

     out_dir = out_dir or Path(link.link_dir)
     output: ArchiveOutput = 'media'
@@ -43,6 +43,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
         YOUTUBEDL_BINARY,
         *YOUTUBEDL_ARGS,
         *([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
+        # TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
         link.url,
     ]
     status = 'succeeded'
@@ -60,7 +61,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
             pass
         else:
             hints = (
-                'Got youtube-dl response code: {}.'.format(result.returncode),
+                'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
                 *result.stderr.decode().split('\n'),
             )
             raise ArchiveError('Failed to save media', hints)
@@ -71,8 +72,18 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
         timer.end()

     # add video description and subtitles to full-text index
+    # Let's try a few different
     index_texts = [
-        text_file.read_text(encoding='utf-8').strip()
+        # errors:
+        # * 'strict' to raise a ValueError exception if there is an
+        #   encoding error. The default value of None has the same effect.
+        # * 'ignore' ignores errors. Note that ignoring encoding errors
+        #   can lead to data loss.
+        # * 'xmlcharrefreplace' is only supported when writing to a
+        #   file. Characters not supported by the encoding are replaced with
+        #   the appropriate XML character reference &#nnn;.
+        # There are a few more options described in https://docs.python.org/3/library/functions.html#open
+        text_file.read_text(encoding='utf-8', errors='xmlcharrefreplace').strip()
         for text_file in (
             *output_path.glob('*.description'),
             *output_path.glob('*.srt'),
@@ -10,9 +10,7 @@ from ..index.schema import Link, ArchiveResult, ArchiveError
 from ..system import run, atomic_write
 from ..util import (
     enforce_types,
-    download_url,
     is_static_file,
-
 )
 from ..config import (
     TIMEOUT,
@@ -22,28 +20,8 @@ from ..config import (
     READABILITY_VERSION,
 )
 from ..logging_util import TimedProgress
+from .title import get_html

-@enforce_types
-def get_html(link: Link, path: Path) -> str:
-    """
-    Try to find wget, singlefile and then dom files.
-    If none is found, download the url again.
-    """
-    canonical = link.canonical_outputs()
-    abs_path = path.absolute()
-    sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
-    document = None
-    for source in sources:
-        try:
-            with open(abs_path / source, "r", encoding="utf-8") as f:
-                document = f.read()
-                break
-        except (FileNotFoundError, TypeError):
-            continue
-    if document is None:
-        return download_url(link.url)
-    else:
-        return document

 @enforce_types
 def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
@@ -17,6 +17,7 @@ from ..config import (
     SAVE_SINGLEFILE,
     DEPENDENCIES,
     SINGLEFILE_VERSION,
+    SINGLEFILE_ARGS,
     CHROME_BINARY,
 )
 from ..logging_util import TimedProgress
@@ -45,10 +46,31 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO

     # SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
     browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
-    cmd = [
-        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
+    options = [
+        *SINGLEFILE_ARGS,
         '--browser-executable-path={}'.format(CHROME_BINARY),
         browser_args,
     ]
+
+    # Deduplicate options (single-file doesn't like when you use the same option two times)
+    #
+    # NOTE: Options names that come first clobber conflicting names that come later
+    # My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
+    # specificity, therefore the user sets it with a lot intent, therefore it should take precedence
+    # kind of like the ergonomic principle of lexical scope in programming languages.
+    seen_option_names = []
+    def test_seen(argument):
+        option_name = argument.split("=")[0]
+        if option_name in seen_option_names:
+            return False
+        else:
+            seen_option_names.append(option_name)
+            return True
+    deduped_options = list(filter(test_seen, options))
+
+    cmd = [
+        DEPENDENCIES['SINGLEFILE_BINARY']['path'],
+        *deduped_options,
         link.url,
         output,
     ]
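The dedup above is first-wins by option name, which is why user-supplied SINGLEFILE_ARGS are listed before the built-in defaults. The same logic in isolation (option values are made up for illustration):

    options = [
        '--browser-executable-path=/usr/bin/chromium',       # from SINGLEFILE_ARGS, listed first, wins
        '--browser-args=["--no-sandbox"]',
        '--browser-executable-path=/usr/bin/google-chrome',  # built-in default, same name, dropped
    ]

    seen_option_names = []
    def test_seen(argument):
        option_name = argument.split("=")[0]   # compare by the part before '='
        if option_name in seen_option_names:
            return False
        seen_option_names.append(option_name)
        return True

    print(list(filter(test_seen, options)))
    # ['--browser-executable-path=/usr/bin/chromium', '--browser-args=["--no-sandbox"]']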
@@ -58,6 +58,27 @@ class TitleParser(HTMLParser):
         if tag.lower() == "title":
             self.inside_title_tag = False

+@enforce_types
+def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
+    """
+    Try to find wget, singlefile and then dom files.
+    If none is found, download the url again.
+    """
+    canonical = link.canonical_outputs()
+    abs_path = path.absolute()
+    sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
+    document = None
+    for source in sources:
+        try:
+            with open(abs_path / source, "r", encoding="utf-8") as f:
+                document = f.read()
+                break
+        except (FileNotFoundError, TypeError):
+            continue
+    if document is None:
+        return download_url(link.url, timeout=timeout)
+    else:
+        return document

 @enforce_types
 def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
@@ -90,7 +111,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
     status = 'succeeded'
     timer = TimedProgress(timeout, prefix='      ')
     try:
-        html = download_url(link.url, timeout=timeout)
+        html = get_html(link, out_dir, timeout=timeout)
         try:
             # try using relatively strict html parser first
             parser = TitleParser()
@@ -24,6 +24,7 @@ from ..config import (
     FOOTER_INFO,
     HTML_INDEX_FILENAME,
     SAVE_ARCHIVE_DOT_ORG,
+    PREVIEW_ORIGINALS,
 )

 MAIN_INDEX_TEMPLATE = 'static_index.html'
@@ -105,6 +106,7 @@ def link_details_template(link: Link) -> str:
         'status_color': 'success' if link.is_archived else 'danger',
         'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
         'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
+        'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS,
     })

 @enforce_types
@@ -1,5 +1,7 @@
 __package__ = 'archivebox.index'

+import re
+
 from io import StringIO
 from pathlib import Path
 from typing import List, Tuple, Iterator
@@ -8,7 +10,10 @@ from django.db import transaction

 from .schema import Link
 from ..util import enforce_types, parse_date
-from ..config import OUTPUT_DIR
+from ..config import (
+    OUTPUT_DIR,
+    TAG_SEPARATOR_PATTERN,
+)


 ### Main Links Index
@@ -33,9 +38,11 @@ def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir:
 def write_link_to_sql_index(link: Link):
     from core.models import Snapshot, ArchiveResult
     info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
-    tags = info.pop("tags")
-    if tags is None:
-        tags = []
+
+    tag_list = list(dict.fromkeys(
+        tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
+    ))
+    info.pop('tags')

     try:
         info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
@@ -44,7 +51,7 @@ def write_link_to_sql_index(link: Link):
         info["timestamp"] = str(float(info["timestamp"]) + 1.0)

     snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
-    snapshot.save_tags(tags)
+    snapshot.save_tags(tag_list)

     for extractor, entries in link.history.items():
         for entry in entries:
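The new tag handling in isolation: re.split honors the configurable TAG_SEPARATOR_PATTERN, and dict.fromkeys dedupes while preserving first-seen order, which set() would not. A sketch with example input:

    import re

    TAG_SEPARATOR_PATTERN = r'[,]'   # the default from CONFIG_SCHEMA above
    raw = 'news, tech,news , archive'
    tag_list = list(dict.fromkeys(
        tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, raw)
    ))
    print(tag_list)  # ['news', 'tech', 'archive']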
@@ -104,10 +111,9 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
     snap = write_link_to_sql_index(link)
     snap.title = link.title

-    tag_set = (
-        set(tag.strip() for tag in (link.tags or '').split(','))
-    )
-    tag_list = list(tag_set) or []
+    tag_list = list(dict.fromkeys(
+        tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
+    ))

     snap.save()
     snap.save_tags(tag_list)
@@ -432,7 +432,13 @@ def log_archive_method_finished(result: "ArchiveResult"):
     # Prettify error output hints string and limit to five lines
     hints = getattr(result.output, 'hints', None) or ()
     if hints:
-        hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
+        if isinstance(hints, (list, tuple, type(_ for _ in ()))):
+            hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
+        else:
+            if isinstance(hints, bytes):
+                hints = hints.decode()
+            hints = hints.split('\n')

         hints = (
             '    {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
             for line in hints[:5] if line.strip()
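The odd-looking type(_ for _ in ()) above is just an import-free way to name Python's generator type; the stdlib spelling is types.GeneratorType. A quick sketch (equivalent only for the type test, not the full hint cleanup):

    import types

    assert type(_ for _ in ()) is types.GeneratorType
    hints = (line for line in (b'error 1', b'error 2'))
    print(isinstance(hints, (list, tuple, types.GeneratorType)))  # True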
@@ -566,7 +572,7 @@ def printable_config(config: ConfigDict, prefix: str='') -> str:
 def printable_folder_status(name: str, folder: Dict) -> str:
     if folder['enabled']:
         if folder['is_valid']:
-            color, symbol, note = 'green', '√', 'valid'
+            color, symbol, note, num_files = 'green', '√', 'valid', ''
         else:
             color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
     else:
@@ -581,6 +587,10 @@ def printable_folder_status(name: str, folder: Dict) -> str:
         )
     else:
         num_files = 'missing'

+    if folder.get('is_mount'):
+        # add symbol @ next to filecount if path is a remote filesystem mount
+        num_files = f'{num_files} @' if num_files else '@'
+
     path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
     if path and ' ' in path:
@@ -4,8 +4,9 @@ import os
 import sys
 import shutil
 import platform
+from django.utils import timezone
 from pathlib import Path
-from datetime import date
+from datetime import date, datetime

 from typing import Dict, List, Optional, Iterable, IO, Union
 from crontab import CronTab, CronSlices
@@ -70,7 +71,12 @@ from .config import (
     IS_TTY,
+    DEBUG,
     IN_DOCKER,
+    PUID,
+    PGID,
+    USER,
+    TIMEZONE,
     ENFORCE_ATOMIC_WRITES,
     OUTPUT_PERMISSIONS,
     PYTHON_BINARY,
     ARCHIVEBOX_BINARY,
     ONLY_NEW,
@@ -90,6 +96,7 @@ from .config import (
     check_data_folder,
     write_config_file,
     VERSION,
+    COMMIT_HASH,
     CODE_LOCATIONS,
     EXTERNAL_LOCATIONS,
     DATA_LOCATIONS,
@@ -203,32 +210,44 @@ def help(out_dir: Path=OUTPUT_DIR) -> None:
 def version(quiet: bool=False,
             out_dir: Path=OUTPUT_DIR) -> None:
     """Print the ArchiveBox version and dependency information"""

-    if quiet:
-        print(VERSION)
-    else:
-        # ArchiveBox v0.5.6
-        # Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
-        print('ArchiveBox v{}'.format(VERSION))

+    print(VERSION)

+    if not quiet:
+        # 0.6.3
+        # ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
+        # DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 501:20 SEARCH_BACKEND=ripgrep

         p = platform.uname()
         print(
             'ArchiveBox v{}'.format(VERSION),
+            *((COMMIT_HASH[:7],) if COMMIT_HASH else ()),
             sys.implementation.name.title(),
             p.system,
             platform.platform(),
             p.machine,
         )
         OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
         print(
-            f'IN_DOCKER={IN_DOCKER}',
+            f'DEBUG={DEBUG}',
+            f'IN_DOCKER={IN_DOCKER}',
             f'IS_TTY={IS_TTY}',
-            f'TZ={os.environ.get("TZ", "UTC")}',
-            f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}',
+            f'TZ={TIMEZONE}',
+            #f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
+            f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
+            f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
+            f'FS_PERMS={OUTPUT_PERMISSIONS} {PUID}:{PGID}',
+            f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
         )
         print()

         print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
         for name, dependency in DEPENDENCIES.items():
             print(printable_dependency_version(name, dependency))

+            # add a newline between core dependencies and extractor dependencies for easier reading
+            if name == 'ARCHIVEBOX_BINARY':
+                print()

         print()
         print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
@@ -427,7 +446,7 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
     print('        archivebox server  # then visit http://127.0.0.1:8000')
     print()
     print('    To add new links, you can run:')
-    print("        archivebox add ~/some/path/or/url/to/list_of_links.txt")
+    print("        archivebox add < ~/some/path/to/list_of_links.txt")
     print()
     print('    For more usage and examples, run:')
     print('        archivebox help')
@@ -554,7 +573,8 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
 def add(urls: Union[str, List[str]],
         tag: str='',
         depth: int=0,
-        update_all: bool=not ONLY_NEW,
+        update: bool=not ONLY_NEW,
+        update_all: bool=False,
         index_only: bool=False,
         overwrite: bool=False,
         # duplicate: bool=False,  # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
@@ -587,6 +607,7 @@ def add(urls: Union[str, List[str]],
     # save verbatim args to sources
     write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)

     new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)

     # If we're going one level deeper, download each link and look for more links
@@ -594,8 +615,11 @@ def add(urls: Union[str, List[str]],
     if new_links and depth == 1:
         log_crawl_started(new_links)
         for new_link in new_links:
-            downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
-            new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+            try:
+                downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
+                new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
+            except Exception as err:
+                stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red')

     imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
@@ -618,11 +642,21 @@ def add(urls: Union[str, List[str]],
     if extractors:
         archive_kwargs["methods"] = extractors

-    if update_all:
+    stderr()

+    ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')

+    if update:
+        stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
+        archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
+    elif update_all:
+        stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
+        archive_links(all_links, overwrite=overwrite, **archive_kwargs)
+    elif overwrite:
+        stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
+        archive_links(imported_links, overwrite=True, **archive_kwargs)
+    elif new_links:
+        stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
+        archive_links(new_links, overwrite=False, **archive_kwargs)
@@ -1113,6 +1147,7 @@ def schedule(add: bool=False,
              every: Optional[str]=None,
              depth: int=0,
              overwrite: bool=False,
+             update: bool=not ONLY_NEW,
              import_path: Optional[str]=None,
              out_dir: Path=OUTPUT_DIR):
     """Set ArchiveBox to regularly import URLs at specific times using cron"""
@@ -1142,6 +1177,7 @@ def schedule(add: bool=False,
         *([
             'add',
             *(['--overwrite'] if overwrite else []),
+            *(['--update'] if update else []),
             f'--depth={depth}',
             f'"{import_path}"',
         ] if import_path else ['update']),
@@ -149,7 +149,17 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None,
 def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str:
     ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0]
     source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(ts=ts))
-    atomic_write(source_path, raw_text)
+
+    referenced_texts = ''
+
+    for entry in raw_text.split():
+        try:
+            if Path(entry).exists():
+                referenced_texts += Path(entry).read_text()
+        except Exception as err:
+            print(err)
+
+    atomic_write(source_path, raw_text + '\n' + referenced_texts)
     log_source_saved(source_file=source_path)
     return source_path
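The change above means any whitespace-separated token in the pasted text that happens to be an existing local path gets its contents appended to the saved source file. The same behavior in isolation (paths are examples):

    from pathlib import Path

    raw_text = 'https://example.com/page bookmarks.html'
    referenced_texts = ''
    for entry in raw_text.split():
        try:
            if Path(entry).exists():
                referenced_texts += Path(entry).read_text()
        except Exception as err:  # e.g. OSError for tokens too long to be a valid path
            print(err)
    combined = raw_text + '\n' + referenced_texts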
@@ -176,7 +186,7 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
             ANSI['reset'],
         ))
         print('    ', e)
-        raise SystemExit(1)
+        raise e

     else:
         # Source is a path to a local file on the filesystem
@@ -47,11 +47,11 @@ def get_pocket_articles(api: Pocket, since=None, page=0):
 def link_from_article(article: dict, sources: list):
-    url: str = article['resolved_url'] or article['given_url']
+    url: str = article.get('resolved_url') or article['given_url']
     broken_protocol = _BROKEN_PROTOCOL_RE.match(url)
     if broken_protocol:
         url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://')
-    title = article['resolved_title'] or article['given_title'] or url
+    title = article.get('resolved_title') or article.get('given_title') or url

     return Link(
         url=url,
@@ -34,13 +34,19 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
         trailing_removed = entry.split('</entry>', 1)[0]
         leading_removed = trailing_removed.strip()
-        rows = leading_removed.split('\n')
+        splits_fixed = leading_removed.replace('"\n href="', '" href="')
+        rows = splits_fixed.split('\n')

-        def get_row(key):
-            return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
+        def get_row(prefix):
+            return [
+                row.strip()
+                for row in rows
+                if row.strip().startswith('<{}'.format(prefix))
+            ][0]

         title = str_between(get_row('title'), '<title><![CDATA[', ']]></title>').strip()
-        url = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
+        url_inside_link = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
+        url_inside_attr = str_between(get_row('link rel="via"'), 'href="', '"/>')
         ts_str = str_between(get_row('published'), '<published>', '</published>')
         time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
         try:
@@ -49,7 +55,7 @@
             tags = None

         yield Link(
-            url=htmldecode(url),
+            url=htmldecode(url_inside_attr or url_inside_link),
             timestamp=str(time.timestamp()),
             title=htmldecode(title) or None,
             tags=tags or '',
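The parser now tries the URL in the href attribute first and keeps the tag body as a fallback, which suggests wallabag exports occur in both shapes. A sketch with a simplified stand-in for archivebox.util.str_between (the real helper also strips whitespace):

    def str_between(string, start, end):  # simplified stand-in, assumption
        return string.split(start, 1)[-1].split(end, 1)[0]

    row = '<link rel="via" href="https://example.com/article"/>'
    url_inside_link = str_between(row, '<link rel="via">', '</link>')
    url_inside_attr = str_between(row, 'href="', '"/>')
    print(url_inside_attr or url_inside_link)  # https://example.com/article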
@@ -197,7 +197,7 @@
     // select the action button from the dropdown
     container.find('select[name=action]')
-        .find('option:selected').removeAttr('selected').end()
+        .find('[selected]').removeAttr('selected').end()
         .find('[value=' + action_type + ']').attr('selected', 'selected').click()

     // click submit & replace the archivebox logo with a spinner
@@ -28,6 +28,14 @@
         <a href="/add" id="submit"> Add more URLs ➕</a>
     </center>
 {% else %}
+    <div id="in-progress" style="display: none;">
+        <center><h3>Adding URLs to index and running archive methods...</h3>
+        <br/>
+        <div class="loader"></div>
+        <br/>
+        Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...
+        </center>
+    </div>
     <form id="add-form" method="POST" class="p-form">{% csrf_token %}
         <h1>Add new URLs to your archive</h1>
         <br/>
@@ -48,10 +56,9 @@
 {% endif %}
 <script>
     document.getElementById('add-form').addEventListener('submit', function(event) {
-        setTimeout(function() {
-            document.getElementById('add-form').innerHTML = '<center><h3>Adding URLs to index and running archive methods...<h3><br/><div class="loader"></div><br/>Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...</center>'
-            document.getElementById('delay-warning').style.display = 'block'
-        }, 200)
+        document.getElementById('in-progress').style.display = 'block'
+        document.getElementById('add-form').style.display = 'none'
+        document.getElementById('delay-warning').style.display = 'block'
         return true
     })
 </script>
@@ -414,6 +414,7 @@
             </div>
         </div>
         {% endif %}
+        {% if PREVIEW_ORIGINALS %}
         <div class="col-lg-2">
             <div class="card">
                 <iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy" referrerpolicy="no-referrer"></iframe>
@@ -427,6 +428,7 @@
                 </div>
             </div>
         </div>
+        {% endif %}
         <div class="col-lg-2">
             <div class="card">
                 <iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>