Merge branch 'dev' into feat/reverse-proxy-auth

This commit is contained in:
Nick Sweeting
2023-01-09 18:20:45 -08:00
committed by GitHub
36 changed files with 625 additions and 292 deletions

View File

@@ -30,11 +30,17 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
help="Tag the added URLs with the provided tags e.g. --tag=tag1,tag2,tag3",
)
parser.add_argument(
'--update-all', #'-n',
'--update', #'-u',
action='store_true',
default=not ONLY_NEW, # when ONLY_NEW=True we skip updating old links
help="Also retry previously skipped/failed links when adding new links",
)
parser.add_argument(
'--update-all', #'-n',
action='store_true',
default=False,
help="Also update ALL links in index when finished adding new links",
)
parser.add_argument(
'--index-only', #'-o',
action='store_true',
@@ -104,6 +110,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
urls=stdin_urls or urls,
depth=command.depth,
tag=command.tag,
update=command.update,
update_all=command.update_all,
index_only=command.index_only,
overwrite=command.overwrite,

View File

@@ -51,6 +51,11 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
action='store_true',
help='Re-archive any URLs that have been previously archived, overwriting existing Snapshots',
)
parser.add_argument(
'--update',
action='store_true',
help='Re-pull any URLs that have been previously added, as needed to fill missing ArchiveResults',
)
group.add_argument(
'--clear', # '-c'
action='store_true',
@@ -94,6 +99,7 @@ def main(args: Optional[List[str]]=None, stdin: Optional[IO]=None, pwd: Optional
every=command.every,
depth=command.depth,
overwrite=command.overwrite,
update=command.update,
import_path=command.import_path,
out_dir=pwd or OUTPUT_DIR,
)

View File

@@ -26,11 +26,12 @@ import io
import re
import sys
import json
import inspect
import getpass
import platform
import shutil
import sqlite3
import django
from sqlite3 import dbapi2 as sqlite3
from hashlib import md5
from pathlib import Path
@@ -48,6 +49,9 @@ from .config_stubs import (
ConfigDefaultDict,
)
### Pre-Fetch Minimal System Config
SYSTEM_USER = getpass.getuser() or os.getlogin()
try:
@@ -65,6 +69,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'USE_COLOR': {'type': bool, 'default': lambda c: c['IS_TTY']},
'SHOW_PROGRESS': {'type': bool, 'default': lambda c: (c['IS_TTY'] and platform.system() != 'Darwin')}, # progress bars are buggy on mac, disable for now
'IN_DOCKER': {'type': bool, 'default': False},
'PUID': {'type': int, 'default': os.getuid()},
'PGID': {'type': int, 'default': os.getgid()},
# TODO: 'SHOW_HINTS': {'type: bool, 'default': True},
},
@@ -79,6 +85,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'URL_BLACKLIST': {'type': str, 'default': r'\.(css|js|otf|ttf|woff|woff2|gstatic\.com|googleapis\.com/css)(\?.*)?$'}, # to avoid downloading code assets as their own pages
'URL_WHITELIST': {'type': str, 'default': None},
'ENFORCE_ATOMIC_WRITES': {'type': bool, 'default': True},
'TAG_SEPARATOR_PATTERN': {'type': str, 'default': r'[,]'},
},
'SERVER_CONFIG': {
@@ -93,9 +100,11 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'SNAPSHOTS_PER_PAGE': {'type': int, 'default': 40},
'CUSTOM_TEMPLATES_DIR': {'type': str, 'default': None},
'TIME_ZONE': {'type': str, 'default': 'UTC'},
'TIMEZONE': {'type': str, 'default': 'UTC'},
'REVERSE_PROXY_USER_HEADER': {'type': str, 'default': 'Remote-User'},
'REVERSE_PROXY_WHITELIST': {'type': str, 'default': ''},
'LOGOUT_REDIRECT_URL': {'type': str, 'default': '/'},
'PREVIEW_ORIGINALS': {'type': bool, 'default': True},
},
'ARCHIVE_METHOD_TOGGLES': {
@@ -122,9 +131,9 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'CHECK_SSL_VALIDITY': {'type': bool, 'default': True},
'MEDIA_MAX_SIZE': {'type': str, 'default': '750m'},
'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.61 Safari/537.36 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'CURL_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) curl/{CURL_VERSION}'},
'WGET_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/) wget/{WGET_VERSION}'},
'CHROME_USER_AGENT': {'type': str, 'default': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Chrome/102.0.0.0 Safari/605.1.15 ArchiveBox/{VERSION} (+https://github.com/ArchiveBox/ArchiveBox/)'},
'COOKIES_FILE': {'type': str, 'default': None},
'CHROME_USER_DATA_DIR': {'type': str, 'default': None},
@@ -139,10 +148,18 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--no-call-home',
'--write-sub',
'--all-subs',
'--write-auto-sub',
# There are too many of these and youtube
# throttles you with HTTP error 429
#'--write-auto-subs',
'--convert-subs=srt',
'--yes-playlist',
'--continue',
# This flag doesn't exist in youtube-dl
# only in yt-dlp
'--no-abort-on-error',
# --ignore-errors must come AFTER
# --no-abort-on-error
# https://github.com/yt-dlp/yt-dlp/issues/4914
'--ignore-errors',
'--geo-bypass',
'--add-metadata',
@@ -164,6 +181,7 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'--compressed'
]},
'GIT_ARGS': {'type': list, 'default': ['--recursive']},
'SINGLEFILE_ARGS': {'type': list, 'default' : None}
},
'SEARCH_BACKEND_CONFIG' : {
@@ -197,7 +215,8 @@ CONFIG_SCHEMA: Dict[str, ConfigDefaultDict] = {
'SINGLEFILE_BINARY': {'type': str, 'default': lambda c: bin_path('single-file')},
'READABILITY_BINARY': {'type': str, 'default': lambda c: bin_path('readability-extractor')},
'MERCURY_BINARY': {'type': str, 'default': lambda c: bin_path('mercury-parser')},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
#'YOUTUBEDL_BINARY': {'type': str, 'default': 'youtube-dl'},
'YOUTUBEDL_BINARY': {'type': str, 'default': 'yt-dlp'},
'NODE_BINARY': {'type': str, 'default': 'node'},
'RIPGREP_BINARY': {'type': str, 'default': 'rg'},
'CHROME_BINARY': {'type': str, 'default': None},
@@ -321,6 +340,15 @@ ALLOWED_IN_OUTPUT_DIR = {
'static_index.json',
}
def get_version(config):
return json.loads((Path(config['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']
def get_commit_hash(config):
try:
return list((config['PACKAGE_DIR'] / '../.git/refs/heads/').glob('*'))[0].read_text().strip()
except Exception:
return None
############################## Derived Config ##################################
@@ -345,14 +373,20 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'DIR_OUTPUT_PERMISSIONS': {'default': lambda c: c['OUTPUT_PERMISSIONS'].replace('6', '7').replace('4', '5')},
'ARCHIVEBOX_BINARY': {'default': lambda c: sys.argv[0] or bin_path('archivebox')},
'VERSION': {'default': lambda c: json.loads((Path(c['PACKAGE_DIR']) / 'package.json').read_text(encoding='utf-8').strip())['version']},
'VERSION': {'default': lambda c: get_version(c)},
'COMMIT_HASH': {'default': lambda c: get_commit_hash(c)},
'PYTHON_BINARY': {'default': lambda c: sys.executable},
'PYTHON_ENCODING': {'default': lambda c: sys.stdout.encoding.upper()},
'PYTHON_VERSION': {'default': lambda c: '{}.{}.{}'.format(*sys.version_info[:3])},
'DJANGO_BINARY': {'default': lambda c: django.__file__.replace('__init__.py', 'bin/django-admin.py')},
'DJANGO_BINARY': {'default': lambda c: inspect.getfile(django)},
'DJANGO_VERSION': {'default': lambda c: '{}.{}.{} {} ({})'.format(*django.VERSION)},
'SQLITE_BINARY': {'default': lambda c: inspect.getfile(sqlite3)},
'SQLITE_VERSION': {'default': lambda c: sqlite3.version},
#'SQLITE_JOURNAL_MODE': {'default': lambda c: 'wal'}, # set at runtime below, interesting but unused for now
#'SQLITE_OPTIONS': {'default': lambda c: ['JSON1']}, # set at runtime below
'USE_CURL': {'default': lambda c: c['USE_CURL'] and (c['SAVE_FAVICON'] or c['SAVE_TITLE'] or c['SAVE_ARCHIVE_DOT_ORG'])},
'CURL_VERSION': {'default': lambda c: bin_version(c['CURL_BINARY']) if c['USE_CURL'] else None},
@@ -373,6 +407,7 @@ DYNAMIC_CONFIG_SCHEMA: ConfigDefaultDict = {
'USE_SINGLEFILE': {'default': lambda c: c['USE_SINGLEFILE'] and c['SAVE_SINGLEFILE']},
'SINGLEFILE_VERSION': {'default': lambda c: bin_version(c['SINGLEFILE_BINARY']) if c['USE_SINGLEFILE'] else None},
'SINGLEFILE_ARGS': {'default': lambda c: c['SINGLEFILE_ARGS'] or []},
'USE_READABILITY': {'default': lambda c: c['USE_READABILITY'] and c['SAVE_READABILITY']},
'READABILITY_VERSION': {'default': lambda c: bin_version(c['READABILITY_BINARY']) if c['USE_READABILITY'] else None},
@@ -652,7 +687,9 @@ def bin_version(binary: Optional[str]) -> Optional[str]:
return None
try:
version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
version_str = run([abspath, "--version"], stdout=PIPE, env={'LANG': 'C'}).stdout.strip().decode()
if not version_str:
version_str = run([abspath, "--version"], stdout=PIPE).stdout.strip().decode()
# take first 3 columns of first line of version info
return ' '.join(version_str.split('\n')[0].strip().split()[:3])
except OSError:
@@ -795,6 +832,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'path': config['OUTPUT_DIR'].resolve(),
'enabled': True,
'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
'is_mount': os.path.ismount(config['OUTPUT_DIR'].resolve()),
},
'SOURCES_DIR': {
'path': config['SOURCES_DIR'].resolve(),
@@ -810,6 +848,7 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'path': config['ARCHIVE_DIR'].resolve(),
'enabled': True,
'is_valid': config['ARCHIVE_DIR'].exists(),
'is_mount': os.path.ismount(config['ARCHIVE_DIR'].resolve()),
},
'CONFIG_FILE': {
'path': config['CONFIG_FILE'].resolve(),
@@ -820,18 +859,12 @@ def get_data_locations(config: ConfigDict) -> ConfigValue:
'path': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve(),
'enabled': True,
'is_valid': (config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).exists(),
'is_mount': os.path.ismount((config['OUTPUT_DIR'] / SQL_INDEX_FILENAME).resolve()),
},
}
def get_dependency_info(config: ConfigDict) -> ConfigValue:
return {
'ARCHIVEBOX_BINARY': {
'path': bin_path(config['ARCHIVEBOX_BINARY']),
'version': config['VERSION'],
'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
'enabled': True,
'is_valid': True,
},
'PYTHON_BINARY': {
'path': bin_path(config['PYTHON_BINARY']),
'version': config['PYTHON_VERSION'],
@@ -839,6 +872,13 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
'enabled': True,
'is_valid': bool(config['PYTHON_VERSION']),
},
'SQLITE_BINARY': {
'path': bin_path(config['SQLITE_BINARY']),
'version': config['SQLITE_VERSION'],
'hash': bin_hash(config['SQLITE_BINARY']),
'enabled': True,
'is_valid': bool(config['SQLITE_VERSION']),
},
'DJANGO_BINARY': {
'path': bin_path(config['DJANGO_BINARY']),
'version': config['DJANGO_VERSION'],
@@ -846,6 +886,14 @@ def get_dependency_info(config: ConfigDict) -> ConfigValue:
'enabled': True,
'is_valid': bool(config['DJANGO_VERSION']),
},
'ARCHIVEBOX_BINARY': {
'path': bin_path(config['ARCHIVEBOX_BINARY']),
'version': config['VERSION'],
'hash': bin_hash(config['ARCHIVEBOX_BINARY']),
'enabled': True,
'is_valid': True,
},
'CURL_BINARY': {
'path': bin_path(config['CURL_BINARY']),
'version': config['CURL_VERSION'],
@@ -931,7 +979,7 @@ def get_chrome_info(config: ConfigDict) -> ConfigValue:
'TIMEOUT': config['TIMEOUT'],
'RESOLUTION': config['RESOLUTION'],
'CHECK_SSL_VALIDITY': config['CHECK_SSL_VALIDITY'],
'CHROME_BINARY': config['CHROME_BINARY'],
'CHROME_BINARY': bin_path(config['CHROME_BINARY']),
'CHROME_HEADLESS': config['CHROME_HEADLESS'],
'CHROME_SANDBOX': config['CHROME_SANDBOX'],
'CHROME_USER_AGENT': config['CHROME_USER_AGENT'],
@@ -972,13 +1020,22 @@ globals().update(CONFIG)
# Set timezone to UTC and umask to OUTPUT_PERMISSIONS
os.environ["TZ"] = 'UTC'
assert TIMEZONE == 'UTC', 'The server timezone should always be set to UTC' # we may allow this to change later
os.environ["TZ"] = TIMEZONE
os.umask(0o777 - int(DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
# add ./node_modules/.bin to $PATH so we can use node scripts in extractors
NODE_BIN_PATH = str((Path(CONFIG["OUTPUT_DIR"]).absolute() / 'node_modules' / '.bin'))
sys.path.append(NODE_BIN_PATH)
# OPTIONAL: also look around the host system for node modules to use
# avoid enabling this unless absolutely needed,
# having overlapping potential sources of libs is a big source of bugs/confusing to users
# DEV_NODE_BIN_PATH = str((Path(CONFIG["PACKAGE_DIR"]).absolute() / '..' / 'node_modules' / '.bin'))
# sys.path.append(DEV_NODE_BIN_PATH)
# USER_NODE_BIN_PATH = str(Path('~/.node_modules/.bin').resolve())
# sys.path.append(USER_NODE_BIN_PATH)
# disable stderr "you really shouldnt disable ssl" warnings with library config
if not CONFIG['CHECK_SSL_VALIDITY']:
import urllib3
@@ -986,6 +1043,13 @@ if not CONFIG['CHECK_SSL_VALIDITY']:
requests.packages.urllib3.disable_warnings(requests.packages.urllib3.exceptions.InsecureRequestWarning)
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
# get SQLite database version, compile options, and runtime options
# TODO: make this a less hacky proper assertion checker helper function in somewhere like setup_django
#cursor = sqlite3.connect(':memory:').cursor()
#DYNAMIC_CONFIG_SCHEMA['SQLITE_VERSION'] = lambda c: cursor.execute("SELECT sqlite_version();").fetchone()[0]
#DYNAMIC_CONFIG_SCHEMA['SQLITE_JOURNAL_MODE'] = lambda c: cursor.execute('PRAGMA journal_mode;').fetchone()[0]
#DYNAMIC_CONFIG_SCHEMA['SQLITE_OPTIONS'] = lambda c: [option[0] for option in cursor.execute('PRAGMA compile_options;').fetchall()]
#cursor.close()
########################### Config Validity Checkers ###########################
@@ -1082,6 +1146,7 @@ def check_dependencies(config: ConfigDict=CONFIG, show_help: bool=True) -> None:
stderr(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#save_media')
stderr()
def check_data_folder(out_dir: Union[str, Path, None]=None, config: ConfigDict=CONFIG) -> None:
output_dir = out_dir or config['OUTPUT_DIR']
assert isinstance(output_dir, (str, Path))
@@ -1156,11 +1221,10 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
# without running migrations automatically (user runs them manually by calling init)
django.setup()
from django.conf import settings
# log startup message to the error log
with open(settings.ERROR_LOG, "a+", encoding='utf-8') as f:
with open(settings.ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
@@ -1170,10 +1234,17 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
# Enable WAL mode in sqlite3
from django.db import connection
with connection.cursor() as cursor:
# Set Journal mode to WAL to allow for multiple writers
current_mode = cursor.execute("PRAGMA journal_mode")
if current_mode != 'wal':
cursor.execute("PRAGMA journal_mode=wal;")
# Set max blocking delay for concurrent writes and write sync mode
# https://litestream.io/tips/#busy-timeout
cursor.execute("PRAGMA busy_timeout = 5000;")
cursor.execute("PRAGMA synchronous = NORMAL;")
# Create cache table in DB if needed
try:
from django.core.cache import cache
@@ -1181,7 +1252,6 @@ def setup_django(out_dir: Path=None, check_db=False, config: ConfigDict=CONFIG,
except django.db.utils.OperationalError:
call_command("createcachetable", verbosity=0)
# if archivebox gets imported multiple times, we have to close
# the sqlite3 whenever we init from scratch to avoid multiple threads
# sharing the same connection by accident

View File

@@ -98,6 +98,7 @@ class ConfigDict(BaseConfig, total=False):
WGET_ARGS: List[str]
CURL_ARGS: List[str]
GIT_ARGS: List[str]
TAG_SEPARATOR_PATTERN: str
ConfigDefaultValueGetter = Callable[[ConfigDict], ConfigValue]

View File

@@ -0,0 +1,18 @@
# Generated by Django 3.1.14 on 2022-09-14 09:34
from django.db import migrations, models
class Migration(migrations.Migration):
dependencies = [
('core', '0020_auto_20210410_1031'),
]
operations = [
migrations.AlterField(
model_name='archiveresult',
name='extractor',
field=models.CharField(choices=[('favicon', 'favicon'), ('headers', 'headers'), ('singlefile', 'singlefile'), ('pdf', 'pdf'), ('screenshot', 'screenshot'), ('dom', 'dom'), ('wget', 'wget'), ('title', 'title'), ('readability', 'readability'), ('mercury', 'mercury'), ('git', 'git'), ('media', 'media'), ('archive_org', 'archive_org')], max_length=32),
),
]

View File

@@ -19,7 +19,7 @@ from ..config import (
SQL_INDEX_FILENAME,
OUTPUT_DIR,
LOGS_DIR,
TIME_ZONE,
TIMEZONE,
)
IS_MIGRATING = 'makemigrations' in sys.argv[:3] or 'migrate' in sys.argv[:3]
@@ -157,7 +157,7 @@ DATABASES = {
'timeout': 60,
'check_same_thread': False,
},
'TIME_ZONE': 'UTC',
'TIME_ZONE': TIMEZONE,
# DB setup is sometimes modified at runtime by setup_django() in config.py
}
}
@@ -227,7 +227,8 @@ USE_L10N = True
USE_TZ = True
DATETIME_FORMAT = 'Y-m-d g:iA'
SHORT_DATETIME_FORMAT = 'Y-m-d h:iA'
TIME_ZONE = TIME_ZONE # noqa
TIME_ZONE = TIMEZONE # django convention is TIME_ZONE, archivebox config uses TIMEZONE, they are equivalent
from django.conf.locale.en import formats as en_formats

View File

@@ -6,7 +6,7 @@ from django.contrib.staticfiles.urls import staticfiles_urlpatterns
from django.conf import settings
from django.views.generic.base import RedirectView
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView
from core.views import HomepageView, SnapshotView, PublicIndexView, AddView, HealthCheckView
# print('DEBUG', settings.DEBUG)
@@ -24,14 +24,16 @@ urlpatterns = [
path('admin/core/snapshot/add/', RedirectView.as_view(url='/add/')),
path('add/', AddView.as_view(), name='add'),
path('accounts/login/', RedirectView.as_view(url='/admin/login/')),
path('accounts/logout/', RedirectView.as_view(url='/admin/logout/')),
path('accounts/', include('django.contrib.auth.urls')),
path('admin/', admin.site.urls),
path('health/', HealthCheckView.as_view(), name='healthcheck'),
path('index.html', RedirectView.as_view(url='/')),
path('index.json', static.serve, {'document_root': settings.OUTPUT_DIR, 'path': 'index.json'}),
path('', HomepageView.as_view(), name='Home'),

View File

@@ -38,7 +38,7 @@ class HomepageView(View):
if PUBLIC_INDEX:
return redirect('/public')
return redirect(f'/admin/login/?next={request.path}')
@@ -205,7 +205,7 @@ class SnapshotView(View):
content_type="text/html",
status=404,
)
class PublicIndexView(ListView):
template_name = 'public_index.html'
@@ -220,7 +220,7 @@ class PublicIndexView(ListView):
'FOOTER_INFO': FOOTER_INFO,
}
def get_queryset(self, **kwargs):
def get_queryset(self, **kwargs):
qs = super().get_queryset(**kwargs)
query = self.request.GET.get('q')
if query and query.strip():
@@ -249,7 +249,7 @@ class AddView(UserPassesTestMixin, FormView):
url = self.request.GET.get('url', None)
if url:
return {'url': url if '://' in url else f'https://{url}'}
return super().get_initial()
def test_func(self):
@@ -295,3 +295,18 @@ class AddView(UserPassesTestMixin, FormView):
"form": AddLinkForm()
})
return render(template_name=self.template_name, request=self.request, context=context)
class HealthCheckView(View):
"""
A Django view that renders plain text "OK" for service discovery tools
"""
def get(self, request):
"""
Handle a GET request
"""
return HttpResponse(
'OK',
content_type='text/plain',
status=200
)

View File

@@ -1,12 +1,14 @@
__package__ = 'archivebox.extractors'
import os
import sys
from pathlib import Path
from typing import Optional, List, Iterable, Union
from datetime import datetime, timezone
from django.db.models import QuerySet
from ..core.settings import ERROR_LOG
from ..index.schema import Link
from ..index.sql import write_link_to_sql_index
from ..index import (
@@ -42,7 +44,6 @@ from .headers import should_save_headers, save_headers
def get_default_archive_methods():
return [
('title', should_save_title, save_title),
('favicon', should_save_favicon, save_favicon),
('headers', should_save_headers, save_headers),
('singlefile', should_save_singlefile, save_singlefile),
@@ -50,7 +51,8 @@ def get_default_archive_methods():
('screenshot', should_save_screenshot, save_screenshot),
('dom', should_save_dom, save_dom),
('wget', should_save_wget, save_wget),
('readability', should_save_readability, save_readability), # keep readability below wget and singlefile, as it depends on them
('title', should_save_title, save_title), # keep title and readability below wget and singlefile, as it depends on them
('readability', should_save_readability, save_readability),
('mercury', should_save_mercury, save_mercury),
('git', should_save_git, save_git),
('media', should_save_media, save_media),
@@ -127,10 +129,27 @@ def archive_link(link: Link, overwrite: bool=False, methods: Optional[Iterable[s
# print('{black} X {}{reset}'.format(method_name, **ANSI))
stats['skipped'] += 1
except Exception as e:
# Disabled until https://github.com/ArchiveBox/ArchiveBox/issues/984
# and https://github.com/ArchiveBox/ArchiveBox/issues/1014
# are fixed.
"""
raise Exception('Exception in archive_methods.save_{}(Link(url={}))'.format(
method_name,
link.url,
)) from e
"""
# Instead, use the kludgy workaround from
# https://github.com/ArchiveBox/ArchiveBox/issues/984#issuecomment-1150541627
with open(ERROR_LOG, "a", encoding='utf-8') as f:
command = ' '.join(sys.argv)
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d__%H:%M:%S')
f.write(("\n" + 'Exception in archive_methods.save_{}(Link(url={})) command={}; ts={}'.format(
method_name,
link.url,
command,
ts
) + "\n"))
#f.write(f"\n> {command}; ts={ts} version={config['VERSION']} docker={config['IN_DOCKER']} is_tty={config['IS_TTY']}\n")
# print(' ', stats)
@@ -182,7 +201,7 @@ def archive_links(all_links: Union[Iterable[Link], QuerySet], overwrite: bool=Fa
except KeyboardInterrupt:
log_archiving_paused(num_links, idx, link.timestamp)
raise SystemExit(0)
except BaseException: # lgtm [py/catch-base-exception]
except BaseException:
print()
raise

View File

@@ -33,7 +33,7 @@ def should_save_media(link: Link, out_dir: Optional[Path]=None, overwrite: Optio
@enforce_types
def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIMEOUT) -> ArchiveResult:
"""Download playlists or individual video, audio, and subtitles using youtube-dl"""
"""Download playlists or individual video, audio, and subtitles using youtube-dl or yt-dlp"""
out_dir = out_dir or Path(link.link_dir)
output: ArchiveOutput = 'media'
@@ -43,6 +43,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
YOUTUBEDL_BINARY,
*YOUTUBEDL_ARGS,
*([] if CHECK_SSL_VALIDITY else ['--no-check-certificate']),
# TODO: add --cookies-from-browser={CHROME_USER_DATA_DIR}
link.url,
]
status = 'succeeded'
@@ -60,7 +61,7 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
pass
else:
hints = (
'Got youtube-dl response code: {}.'.format(result.returncode),
'Got youtube-dl (or yt-dlp) response code: {}.'.format(result.returncode),
*result.stderr.decode().split('\n'),
)
raise ArchiveError('Failed to save media', hints)
@@ -71,8 +72,18 @@ def save_media(link: Link, out_dir: Optional[Path]=None, timeout: int=MEDIA_TIME
timer.end()
# add video description and subtitles to full-text index
# Let's try a few different
index_texts = [
text_file.read_text(encoding='utf-8').strip()
# errors:
# * 'strict' to raise a ValueError exception if there is an
# encoding error. The default value of None has the same effect.
# * 'ignore' ignores errors. Note that ignoring encoding errors
# can lead to data loss.
# * 'xmlcharrefreplace' is only supported when writing to a
# file. Characters not supported by the encoding are replaced with
# the appropriate XML character reference &#nnn;.
# There are a few more options described in https://docs.python.org/3/library/functions.html#open
text_file.read_text(encoding='utf-8', errors='xmlcharrefreplace').strip()
for text_file in (
*output_path.glob('*.description'),
*output_path.glob('*.srt'),

View File

@@ -10,9 +10,7 @@ from ..index.schema import Link, ArchiveResult, ArchiveError
from ..system import run, atomic_write
from ..util import (
enforce_types,
download_url,
is_static_file,
)
from ..config import (
TIMEOUT,
@@ -22,28 +20,8 @@ from ..config import (
READABILITY_VERSION,
)
from ..logging_util import TimedProgress
from .title import get_html
@enforce_types
def get_html(link: Link, path: Path) -> str:
"""
Try to find wget, singlefile and then dom files.
If none is found, download the url again.
"""
canonical = link.canonical_outputs()
abs_path = path.absolute()
sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
document = None
for source in sources:
try:
with open(abs_path / source, "r", encoding="utf-8") as f:
document = f.read()
break
except (FileNotFoundError, TypeError):
continue
if document is None:
return download_url(link.url)
else:
return document
@enforce_types
def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:

View File

@@ -17,6 +17,7 @@ from ..config import (
SAVE_SINGLEFILE,
DEPENDENCIES,
SINGLEFILE_VERSION,
SINGLEFILE_ARGS,
CHROME_BINARY,
)
from ..logging_util import TimedProgress
@@ -45,10 +46,31 @@ def save_singlefile(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEO
# SingleFile CLI Docs: https://github.com/gildas-lormeau/SingleFile/tree/master/cli
browser_args = '--browser-args={}'.format(json.dumps(browser_args[1:]))
cmd = [
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
options = [
*SINGLEFILE_ARGS,
'--browser-executable-path={}'.format(CHROME_BINARY),
browser_args,
]
# Deduplicate options (single-file doesn't like when you use the same option two times)
#
# NOTE: Options names that come first clobber conflicting names that come later
# My logic is SINGLEFILE_ARGS is the option that affects the singlefile command with most
# specificity, therefore the user sets it with a lot intent, therefore it should take precedence
# kind of like the ergonomic principle of lexical scope in programming languages.
seen_option_names = []
def test_seen(argument):
option_name = argument.split("=")[0]
if option_name in seen_option_names:
return False
else:
seen_option_names.append(option_name)
return True
deduped_options = list(filter(test_seen, options))
cmd = [
DEPENDENCIES['SINGLEFILE_BINARY']['path'],
*deduped_options,
link.url,
output,
]

View File

@@ -58,6 +58,27 @@ class TitleParser(HTMLParser):
if tag.lower() == "title":
self.inside_title_tag = False
@enforce_types
def get_html(link: Link, path: Path, timeout: int=TIMEOUT) -> str:
"""
Try to find wget, singlefile and then dom files.
If none is found, download the url again.
"""
canonical = link.canonical_outputs()
abs_path = path.absolute()
sources = [canonical["singlefile_path"], canonical["wget_path"], canonical["dom_path"]]
document = None
for source in sources:
try:
with open(abs_path / source, "r", encoding="utf-8") as f:
document = f.read()
break
except (FileNotFoundError, TypeError):
continue
if document is None:
return download_url(link.url, timeout=timeout)
else:
return document
@enforce_types
def should_save_title(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
@@ -90,7 +111,7 @@ def save_title(link: Link, out_dir: Optional[Path]=None, timeout: int=TIMEOUT) -
status = 'succeeded'
timer = TimedProgress(timeout, prefix=' ')
try:
html = download_url(link.url, timeout=timeout)
html = get_html(link, out_dir, timeout=timeout)
try:
# try using relatively strict html parser first
parser = TitleParser()

View File

@@ -24,6 +24,7 @@ from ..config import (
FOOTER_INFO,
HTML_INDEX_FILENAME,
SAVE_ARCHIVE_DOT_ORG,
PREVIEW_ORIGINALS,
)
MAIN_INDEX_TEMPLATE = 'static_index.html'
@@ -105,6 +106,7 @@ def link_details_template(link: Link) -> str:
'status_color': 'success' if link.is_archived else 'danger',
'oldest_archive_date': ts_to_date_str(link.oldest_archive_date),
'SAVE_ARCHIVE_DOT_ORG': SAVE_ARCHIVE_DOT_ORG,
'PREVIEW_ORIGINALS': PREVIEW_ORIGINALS,
})
@enforce_types

View File

@@ -1,5 +1,7 @@
__package__ = 'archivebox.index'
import re
from io import StringIO
from pathlib import Path
from typing import List, Tuple, Iterator
@@ -8,7 +10,10 @@ from django.db import transaction
from .schema import Link
from ..util import enforce_types, parse_date
from ..config import OUTPUT_DIR
from ..config import (
OUTPUT_DIR,
TAG_SEPARATOR_PATTERN,
)
### Main Links Index
@@ -33,9 +38,11 @@ def remove_from_sql_main_index(snapshots: QuerySet, atomic: bool=False, out_dir:
def write_link_to_sql_index(link: Link):
from core.models import Snapshot, ArchiveResult
info = {k: v for k, v in link._asdict().items() if k in Snapshot.keys}
tags = info.pop("tags")
if tags is None:
tags = []
tag_list = list(dict.fromkeys(
tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
))
info.pop('tags')
try:
info["timestamp"] = Snapshot.objects.get(url=link.url).timestamp
@@ -44,7 +51,7 @@ def write_link_to_sql_index(link: Link):
info["timestamp"] = str(float(info["timestamp"]) + 1.0)
snapshot, _ = Snapshot.objects.update_or_create(url=link.url, defaults=info)
snapshot.save_tags(tags)
snapshot.save_tags(tag_list)
for extractor, entries in link.history.items():
for entry in entries:
@@ -104,10 +111,9 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None:
snap = write_link_to_sql_index(link)
snap.title = link.title
tag_set = (
set(tag.strip() for tag in (link.tags or '').split(','))
)
tag_list = list(tag_set) or []
tag_list = list(dict.fromkeys(
tag.strip() for tag in re.split(TAG_SEPARATOR_PATTERN, link.tags or '')
))
snap.save()
snap.save_tags(tag_list)

View File

@@ -432,7 +432,13 @@ def log_archive_method_finished(result: "ArchiveResult"):
# Prettify error output hints string and limit to five lines
hints = getattr(result.output, 'hints', None) or ()
if hints:
hints = hints if isinstance(hints, (list, tuple)) else hints.split('\n')
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
hints = [hint.decode() for hint in hints if isinstance(hint, bytes)]
else:
if isinstance(hints, bytes):
hints = hints.decode()
hints = hints.split('\n')
hints = (
' {}{}{}'.format(ANSI['lightyellow'], line.strip(), ANSI['reset'])
for line in hints[:5] if line.strip()
@@ -566,7 +572,7 @@ def printable_config(config: ConfigDict, prefix: str='') -> str:
def printable_folder_status(name: str, folder: Dict) -> str:
if folder['enabled']:
if folder['is_valid']:
color, symbol, note = 'green', '', 'valid'
color, symbol, note, num_files = 'green', '', 'valid', ''
else:
color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
else:
@@ -581,6 +587,10 @@ def printable_folder_status(name: str, folder: Dict) -> str:
)
else:
num_files = 'missing'
if folder.get('is_mount'):
# add symbol @ next to filecount if path is a remote filesystem mount
num_files = f'{num_files} @' if num_files else '@'
path = str(folder['path']).replace(str(OUTPUT_DIR), '.') if folder['path'] else ''
if path and ' ' in path:

View File

@@ -4,8 +4,9 @@ import os
import sys
import shutil
import platform
from django.utils import timezone
from pathlib import Path
from datetime import date
from datetime import date, datetime
from typing import Dict, List, Optional, Iterable, IO, Union
from crontab import CronTab, CronSlices
@@ -70,7 +71,12 @@ from .config import (
IS_TTY,
DEBUG,
IN_DOCKER,
PUID,
PGID,
USER,
TIMEZONE,
ENFORCE_ATOMIC_WRITES,
OUTPUT_PERMISSIONS,
PYTHON_BINARY,
ARCHIVEBOX_BINARY,
ONLY_NEW,
@@ -90,6 +96,7 @@ from .config import (
check_data_folder,
write_config_file,
VERSION,
COMMIT_HASH,
CODE_LOCATIONS,
EXTERNAL_LOCATIONS,
DATA_LOCATIONS,
@@ -203,32 +210,44 @@ def help(out_dir: Path=OUTPUT_DIR) -> None:
def version(quiet: bool=False,
out_dir: Path=OUTPUT_DIR) -> None:
"""Print the ArchiveBox version and dependency information"""
if quiet:
print(VERSION)
else:
# ArchiveBox v0.5.6
# Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
print('ArchiveBox v{}'.format(VERSION))
print(VERSION)
if not quiet:
# 0.6.3
# ArchiveBox v0.6.3 Cpython Linux Linux-4.19.121-linuxkit-x86_64-with-glibc2.28 x86_64 (in Docker) (in TTY)
# DEBUG=False IN_DOCKER=True IS_TTY=True TZ=UTC FS_ATOMIC=True FS_REMOTE=False FS_PERMS=644 501:20 SEARCH_BACKEND=ripgrep
p = platform.uname()
print(
'ArchiveBox v{}'.format(VERSION),
*((COMMIT_HASH[:7],) if COMMIT_HASH else ()),
sys.implementation.name.title(),
p.system,
platform.platform(),
p.machine,
)
OUTPUT_IS_REMOTE_FS = DATA_LOCATIONS['OUTPUT_DIR']['is_mount'] or DATA_LOCATIONS['ARCHIVE_DIR']['is_mount']
print(
f'IN_DOCKER={IN_DOCKER}',
f'DEBUG={DEBUG}',
f'IN_DOCKER={IN_DOCKER}',
f'IS_TTY={IS_TTY}',
f'TZ={os.environ.get("TZ", "UTC")}',
f'SEARCH_BACKEND_ENGINE={SEARCH_BACKEND_ENGINE}',
f'TZ={TIMEZONE}',
#f'DB=django.db.backends.sqlite3 (({CONFIG["SQLITE_JOURNAL_MODE"]})', # add this if we have more useful info to show eventually
f'FS_ATOMIC={ENFORCE_ATOMIC_WRITES}',
f'FS_REMOTE={OUTPUT_IS_REMOTE_FS}',
f'FS_PERMS={OUTPUT_PERMISSIONS} {PUID}:{PGID}',
f'SEARCH_BACKEND={SEARCH_BACKEND_ENGINE}',
)
print()
print('{white}[i] Dependency versions:{reset}'.format(**ANSI))
for name, dependency in DEPENDENCIES.items():
print(printable_dependency_version(name, dependency))
# add a newline between core dependencies and extractor dependencies for easier reading
if name == 'ARCHIVEBOX_BINARY':
print()
print()
print('{white}[i] Source-code locations:{reset}'.format(**ANSI))
@@ -427,7 +446,7 @@ def init(force: bool=False, quick: bool=False, setup: bool=False, out_dir: Path=
print(' archivebox server # then visit http://127.0.0.1:8000')
print()
print(' To add new links, you can run:')
print(" archivebox add ~/some/path/or/url/to/list_of_links.txt")
print(" archivebox add < ~/some/path/to/list_of_links.txt")
print()
print(' For more usage and examples, run:')
print(' archivebox help')
@@ -554,7 +573,8 @@ def oneshot(url: str, extractors: str="", out_dir: Path=OUTPUT_DIR):
def add(urls: Union[str, List[str]],
tag: str='',
depth: int=0,
update_all: bool=not ONLY_NEW,
update: bool=not ONLY_NEW,
update_all: bool=False,
index_only: bool=False,
overwrite: bool=False,
# duplicate: bool=False, # TODO: reuse the logic from admin.py resnapshot to allow adding multiple snapshots by appending timestamp automatically
@@ -587,6 +607,7 @@ def add(urls: Union[str, List[str]],
# save verbatim args to sources
write_ahead_log = save_text_as_source('\n'.join(urls), filename='{ts}-import.txt', out_dir=out_dir)
new_links += parse_links_from_source(write_ahead_log, root_url=None, parser=parser)
# If we're going one level deeper, download each link and look for more links
@@ -594,8 +615,11 @@ def add(urls: Union[str, List[str]],
if new_links and depth == 1:
log_crawl_started(new_links)
for new_link in new_links:
downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
try:
downloaded_file = save_file_as_source(new_link.url, filename=f'{new_link.timestamp}-crawl-{new_link.domain}.txt', out_dir=out_dir)
new_links_depth += parse_links_from_source(downloaded_file, root_url=new_link.url)
except Exception as err:
stderr('[!] Failed to get contents of URL {new_link.url}', err, color='red')
imported_links = list({link.url: link for link in (new_links + new_links_depth)}.values())
@@ -618,11 +642,21 @@ def add(urls: Union[str, List[str]],
if extractors:
archive_kwargs["methods"] = extractors
if update_all:
stderr()
ts = datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S')
if update:
stderr(f'[*] [{ts}] Archiving + updating {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
archive_links(imported_links, overwrite=overwrite, **archive_kwargs)
elif update_all:
stderr(f'[*] [{ts}] Archiving + updating {len(all_links)}/{len(all_links)}', len(all_links), 'URLs from entire library...', color='green')
archive_links(all_links, overwrite=overwrite, **archive_kwargs)
elif overwrite:
stderr(f'[*] [{ts}] Archiving + overwriting {len(imported_links)}/{len(all_links)}', len(imported_links), 'URLs from added set...', color='green')
archive_links(imported_links, overwrite=True, **archive_kwargs)
elif new_links:
stderr(f'[*] [{ts}] Archiving {len(new_links)}/{len(all_links)} URLs from added set...', color='green')
archive_links(new_links, overwrite=False, **archive_kwargs)
@@ -1113,6 +1147,7 @@ def schedule(add: bool=False,
every: Optional[str]=None,
depth: int=0,
overwrite: bool=False,
update: bool=not ONLY_NEW,
import_path: Optional[str]=None,
out_dir: Path=OUTPUT_DIR):
"""Set ArchiveBox to regularly import URLs at specific times using cron"""
@@ -1142,6 +1177,7 @@ def schedule(add: bool=False,
*([
'add',
*(['--overwrite'] if overwrite else []),
*(['--update'] if update else []),
f'--depth={depth}',
f'"{import_path}"',
] if import_path else ['update']),

View File

@@ -149,7 +149,17 @@ def run_parser_functions(to_parse: IO[str], timer, root_url: Optional[str]=None,
def save_text_as_source(raw_text: str, filename: str='{ts}-stdin.txt', out_dir: Path=OUTPUT_DIR) -> str:
ts = str(datetime.now(timezone.utc).timestamp()).split('.', 1)[0]
source_path = str(out_dir / SOURCES_DIR_NAME / filename.format(ts=ts))
atomic_write(source_path, raw_text)
referenced_texts = ''
for entry in raw_text.split():
try:
if Path(entry).exists():
referenced_texts += Path(entry).read_text()
except Exception as err:
print(err)
atomic_write(source_path, raw_text + '\n' + referenced_texts)
log_source_saved(source_file=source_path)
return source_path
@@ -176,7 +186,7 @@ def save_file_as_source(path: str, timeout: int=TIMEOUT, filename: str='{ts}-{ba
ANSI['reset'],
))
print(' ', e)
raise SystemExit(1)
raise e
else:
# Source is a path to a local file on the filesystem

View File

@@ -47,11 +47,11 @@ def get_pocket_articles(api: Pocket, since=None, page=0):
def link_from_article(article: dict, sources: list):
url: str = article['resolved_url'] or article['given_url']
url: str = article.get('resolved_url') or article['given_url']
broken_protocol = _BROKEN_PROTOCOL_RE.match(url)
if broken_protocol:
url = url.replace(f'{broken_protocol.group(1)}:/', f'{broken_protocol.group(1)}://')
title = article['resolved_title'] or article['given_title'] or url
title = article.get('resolved_title') or article.get('given_title') or url
return Link(
url=url,

View File

@@ -34,13 +34,19 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
trailing_removed = entry.split('</entry>', 1)[0]
leading_removed = trailing_removed.strip()
rows = leading_removed.split('\n')
splits_fixed = leading_removed.replace('"\n href="', '" href="')
rows = splits_fixed.split('\n')
def get_row(key):
return [r.strip() for r in rows if r.strip().startswith('<{}'.format(key))][0]
def get_row(prefix):
return [
row.strip()
for row in rows
if row.strip().startswith('<{}'.format(prefix))
][0]
title = str_between(get_row('title'), '<title><![CDATA[', ']]></title>').strip()
url = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
url_inside_link = str_between(get_row('link rel="via"'), '<link rel="via">', '</link>')
url_inside_attr = str_between(get_row('link rel="via"'), 'href="', '"/>')
ts_str = str_between(get_row('published'), '<published>', '</published>')
time = datetime.strptime(ts_str, "%Y-%m-%dT%H:%M:%S%z")
try:
@@ -49,7 +55,7 @@ def parse_wallabag_atom_export(rss_file: IO[str], **_kwargs) -> Iterable[Link]:
tags = None
yield Link(
url=htmldecode(url),
url=htmldecode(url_inside_attr or url_inside_link),
timestamp=str(time.timestamp()),
title=htmldecode(title) or None,
tags=tags or '',

View File

@@ -197,7 +197,7 @@
// select the action button from the dropdown
container.find('select[name=action]')
.find('op:selected').removeAttr('selected').end()
.find('[selected]').removeAttr('selected').end()
.find('[value=' + action_type + ']').attr('selected', 'selected').click()
// click submit & replace the archivebox logo with a spinner

View File

@@ -28,6 +28,14 @@
<a href="/add" id="submit">&nbsp; Add more URLs </a>
</center>
{% else %}
<div id="in-progress" style="display: none;">
<center><h3>Adding URLs to index and running archive methods...</h3>
<br/>
<div class="loader"></div>
<br/>
Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...
</center>
</div>
<form id="add-form" method="POST" class="p-form">{% csrf_token %}
<h1>Add new URLs to your archive</h1>
<br/>
@@ -48,10 +56,9 @@
{% endif %}
<script>
document.getElementById('add-form').addEventListener('submit', function(event) {
setTimeout(function() {
document.getElementById('add-form').innerHTML = '<center><h3>Adding URLs to index and running archive methods...<h3><br/><div class="loader"></div><br/>Check the server log or the <a href="/admin/core/archiveresult/?o=-1">Log</a> page for progress...</center>'
document.getElementById('delay-warning').style.display = 'block'
}, 200)
document.getElementById('in-progress').style.display = 'block'
document.getElementById('add-form').style.display = 'none'
document.getElementById('delay-warning').style.display = 'block'
return true
})
</script>

View File

@@ -414,6 +414,7 @@
</div>
</div>
{% endif %}
{% if PREVIEW_ORIGINALS %}
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{url}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy" referrerpolicy="no-referrer"></iframe>
@@ -427,6 +428,7 @@
</div>
</div>
</div>
{% endif %}
<div class="col-lg-2">
<div class="card">
<iframe class="card-img-top" src="{{headers_path}}" sandbox="allow-same-origin allow-top-navigation-by-user-activation allow-scripts allow-forms" scrolling="no" loading="lazy"></iframe>