mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-06 07:47:53 +10:00
wip
This commit is contained in:
@@ -1 +1 @@
|
||||
__package__ = 'archivebox.misc'
|
||||
__package__ = "archivebox.misc"
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.misc'
|
||||
__package__ = "archivebox.misc"
|
||||
|
||||
import os
|
||||
import sys
|
||||
@@ -23,69 +23,74 @@ def check_data_folder() -> None:
|
||||
from archivebox import DATA_DIR, ARCHIVE_DIR
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir
|
||||
|
||||
|
||||
archive_dir_exists = os.path.isdir(ARCHIVE_DIR)
|
||||
if not archive_dir_exists:
|
||||
print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr)
|
||||
print(f' {DATA_DIR}', file=sys.stderr)
|
||||
print("[red][X] No archivebox index found in the current directory.[/red]", file=sys.stderr)
|
||||
print(f" {DATA_DIR}", file=sys.stderr)
|
||||
print(file=sys.stderr)
|
||||
print(' [violet]Hint[/violet]: Are you running archivebox in the right folder?', file=sys.stderr)
|
||||
print(' cd path/to/your/archive/folder', file=sys.stderr)
|
||||
print(' archivebox [command]', file=sys.stderr)
|
||||
print(" [violet]Hint[/violet]: Are you running archivebox in the right folder?", file=sys.stderr)
|
||||
print(" cd path/to/your/archive/folder", file=sys.stderr)
|
||||
print(" archivebox [command]", file=sys.stderr)
|
||||
print(file=sys.stderr)
|
||||
print(' [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:', file=sys.stderr)
|
||||
print(' archivebox init', file=sys.stderr)
|
||||
print(" [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:", file=sys.stderr)
|
||||
print(" archivebox init", file=sys.stderr)
|
||||
raise SystemExit(2)
|
||||
|
||||
|
||||
|
||||
# Create data dir subdirs
|
||||
create_and_chown_dir(CONSTANTS.SOURCES_DIR)
|
||||
create_and_chown_dir(CONSTANTS.PERSONAS_DIR / 'Default')
|
||||
create_and_chown_dir(CONSTANTS.PERSONAS_DIR / "Default")
|
||||
create_and_chown_dir(CONSTANTS.LOGS_DIR)
|
||||
# create_and_chown_dir(CONSTANTS.CACHE_DIR)
|
||||
|
||||
|
||||
# Create /tmp and /lib dirs if they don't exist
|
||||
get_or_create_working_tmp_dir(autofix=True, quiet=False)
|
||||
get_or_create_working_lib_dir(autofix=True, quiet=False)
|
||||
|
||||
|
||||
# Check data dir permissions, /tmp, and /lib permissions
|
||||
check_data_dir_permissions()
|
||||
|
||||
|
||||
|
||||
def check_migrations():
    """Abort if the collection has Django migrations that are not applied yet.

    The check is skipped when ``makemigrations``, ``migrate`` or ``init``
    appears anywhere in ``sys.argv``, since those commands are the ones that
    create or apply migrations.

    Raises:
        SystemExit: with exit code 3 when pending migrations exist and none of
            the migration-related commands is being run.
    """
    from archivebox import DATA_DIR
    from archivebox.misc.db import list_migrations

    # list_migrations() yields (is_applied, name) pairs; keep the unapplied names.
    pending_migrations = [name for is_applied, name in list_migrations() if not is_applied]
    is_migrating = any(arg in sys.argv for arg in ("makemigrations", "migrate", "init"))

    if pending_migrations and not is_migrating:
        # NOTE(review): leading whitespace inside these messages may have been
        # collapsed by the diff rendering this was recovered from -- confirm.
        # The whole message goes to stderr so the headline is not split from its hints.
        print(
            "[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]",
            file=sys.stderr,
        )
        print(f" {DATA_DIR}", file=sys.stderr)
        print(file=sys.stderr)
        print(
            f" [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:",
            file=sys.stderr,
        )
        print(" archivebox init", file=sys.stderr)
        raise SystemExit(3)
|
||||
|
||||
|
||||
def check_io_encoding():
    """Abort unless the interpreter's standard streams use UTF-8.

    The encoding is read from the first available stream among
    ``sys.__stdout__``, ``sys.stdout``, ``sys.__stderr__`` and ``sys.stderr``;
    the spelling ``UTF8`` is normalised to ``UTF-8`` before comparing.

    Raises:
        SystemExit: with exit code 2 when the detected encoding is not UTF-8.
    """
    stream = sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr
    PYTHON_ENCODING = stream.encoding.upper().replace("UTF8", "UTF-8")

    if PYTHON_ENCODING != "UTF-8":
        # NOTE(review): leading whitespace inside these messages may have been
        # collapsed by the diff rendering this was recovered from -- confirm.
        print(
            f"[red][X] Your system is running python3 scripts with a bad locale setting: {PYTHON_ENCODING} (it should be UTF-8).[/red]",
            file=sys.stderr,
        )
        print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
        print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr)
        # The blank separator belongs to the same stderr message as the lines around it.
        print("", file=sys.stderr)
        print(" Confirm that it's fixed by opening a new shell and running:", file=sys.stderr)
        print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr)
        raise SystemExit(2)
|
||||
|
||||
|
||||
# # hard errors: check python version
|
||||
# if sys.version_info[:3] < (3, 10, 0):
|
||||
# print('[red][X] Python version is not new enough: {sys.version} (>3.10 is required)[/red]', file=sys.stderr)
|
||||
# print(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.', file=sys.stderr)
|
||||
# raise SystemExit(2)
|
||||
|
||||
|
||||
# # hard errors: check django version
|
||||
# if int(django.VERSION[0]) < 5:
|
||||
# print('[red][X] Django version is not new enough: {django.VERSION[:3]} (>=5.0 is required)[/red]', file=sys.stderr)
|
||||
@@ -96,35 +101,44 @@ def check_io_encoding():
|
||||
def check_not_root():
    """Refuse to run as root, printing remediation hints to stderr.

    Help, version and setup/install invocations (detected by looking for the
    corresponding words/flags in ``sys.argv``) are exempt. When running inside
    Docker, example ``docker run`` / ``docker exec --user=archivebox`` commands
    are printed with the attempted CLI arguments filled in.

    Raises:
        SystemExit: with exit code 2 when ``IS_ROOT`` is true and the command
            is not one of the exempt ones.
    """
    from archivebox.config.permissions import IS_ROOT, IN_DOCKER

    attempted_command = " ".join(sys.argv[1:]) if len(sys.argv) > 1 else ""
    is_getting_help = "-h" in sys.argv or "--help" in sys.argv or "help" in sys.argv
    is_getting_version = "--version" in sys.argv or "version" in sys.argv
    is_installing = "setup" in sys.argv or "install" in sys.argv

    if IS_ROOT and not (is_getting_help or is_getting_version or is_installing):
        # NOTE(review): leading whitespace inside these messages may have been
        # collapsed by the diff rendering this was recovered from -- confirm.
        print("[red][!] ArchiveBox should never be run as root![/red]", file=sys.stderr)
        print(" For more information, see the security overview documentation:", file=sys.stderr)
        print(" https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root", file=sys.stderr)

        if IN_DOCKER:
            print(
                "[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:",
                file=sys.stderr,
            )
            # This line needs the f-prefix like its siblings; without it the
            # literal text "{attempted_command}" was shown to the user.
            print(f" docker compose run archivebox {attempted_command}", file=sys.stderr)
            print(f" docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}", file=sys.stderr)
            print(" or:", file=sys.stderr)
            print(
                f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"',
                file=sys.stderr,
            )
            print(
                f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"',
                file=sys.stderr,
            )
        # NOTE(review): placed at the IS_ROOT level (exit for every root run,
        # not only in Docker); original indentation was lost -- confirm.
        raise SystemExit(2)
|
||||
|
||||
|
||||
def check_not_inside_source_dir():
    """Prevent running ArchiveBox from inside its source directory (would pollute repo with data files).

    The current directory counts as the source directory when it contains both
    ``archivebox/__init__.py`` and ``pyproject.toml``. The check is waived when
    the ``DATA_DIR`` environment variable points somewhere other than the
    current directory, or when ``pytest`` / ``unittest`` is already imported.

    Raises:
        SystemExit: with an explanatory message when run from the source dir.
    """
    cwd = Path(os.getcwd()).resolve()
    is_source_dir = (cwd / "archivebox" / "__init__.py").exists() and (cwd / "pyproject.toml").exists()

    # Strip once and reuse the stripped value, so stray whitespace around
    # DATA_DIR cannot make the resolved path differ from the one tested for emptiness.
    data_dir_env = os.environ.get("DATA_DIR", "").strip()
    data_dir_set_elsewhere = bool(data_dir_env) and Path(data_dir_env).resolve() != cwd

    is_testing = "pytest" in sys.modules or "unittest" in sys.modules

    if is_source_dir and not data_dir_set_elsewhere and not is_testing:
        raise SystemExit("[!] Cannot run from source dir, set DATA_DIR or cd to a data folder first")
|
||||
|
||||
|
||||
def check_data_dir_permissions():
|
||||
@@ -132,28 +146,42 @@ def check_data_dir_permissions():
|
||||
from archivebox.misc.logging import STDERR
|
||||
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, DEFAULT_PUID, DEFAULT_PGID, IS_ROOT, USER
|
||||
from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir
|
||||
|
||||
|
||||
data_dir_stat = Path(DATA_DIR).stat()
|
||||
data_dir_uid, data_dir_gid = data_dir_stat.st_uid, data_dir_stat.st_gid
|
||||
data_owned_by_root = data_dir_uid == 0
|
||||
|
||||
|
||||
# data_owned_by_default_user = data_dir_uid == DEFAULT_PUID or data_dir_gid == DEFAULT_PGID
|
||||
data_owner_doesnt_match = (data_dir_uid != ARCHIVEBOX_USER and data_dir_gid != ARCHIVEBOX_GROUP) if not IS_ROOT else False
|
||||
data_not_writable = not (os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.W_OK))
|
||||
if data_owned_by_root:
|
||||
STDERR.print('\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red], it must be changed before archiving can run![/yellow]')
|
||||
STDERR.print(
|
||||
"\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red], it must be changed before archiving can run![/yellow]",
|
||||
)
|
||||
elif data_owner_doesnt_match or data_not_writable:
|
||||
STDERR.print(f'\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]')
|
||||
|
||||
STDERR.print(
|
||||
f"\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]",
|
||||
)
|
||||
|
||||
if data_owned_by_root or data_owner_doesnt_match or data_not_writable:
|
||||
STDERR.print(f'[violet]Hint:[/violet] Change the current ownership [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to a non-root user & group that will run ArchiveBox, e.g.:')
|
||||
STDERR.print(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}')
|
||||
STDERR.print(
|
||||
f"[violet]Hint:[/violet] Change the current ownership [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to a non-root user & group that will run ArchiveBox, e.g.:",
|
||||
)
|
||||
STDERR.print(f" [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}")
|
||||
STDERR.print()
|
||||
STDERR.print('[blue]More info:[/blue]')
|
||||
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]')
|
||||
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]')
|
||||
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]')
|
||||
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]')
|
||||
STDERR.print("[blue]More info:[/blue]")
|
||||
STDERR.print(
|
||||
" [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]",
|
||||
)
|
||||
STDERR.print(
|
||||
" [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]",
|
||||
)
|
||||
STDERR.print(
|
||||
" [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]",
|
||||
)
|
||||
STDERR.print(
|
||||
" [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]",
|
||||
)
|
||||
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
@@ -172,8 +200,8 @@ def check_data_dir_permissions():
|
||||
|
||||
# Check /lib dir permissions
|
||||
check_lib_dir(lib_dir, throw=False, must_exist=True)
|
||||
|
||||
os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
|
||||
|
||||
os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8))
|
||||
|
||||
|
||||
def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
|
||||
@@ -182,45 +210,57 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
|
||||
from archivebox.misc.logging_util import pretty_path
|
||||
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
|
||||
tmp_dir = tmp_dir or STORAGE_CONFIG.TMP_DIR
|
||||
socket_file = tmp_dir.absolute().resolve() / "supervisord.sock"
|
||||
|
||||
if not must_exist and not os.path.isdir(tmp_dir):
|
||||
# just check that its viable based on its length (because dir may not exist yet, we cant check if its writable)
|
||||
return len(f'file://{socket_file}') <= 96
|
||||
return len(f"file://{socket_file}") <= 96
|
||||
|
||||
tmp_is_valid = False
|
||||
allow_no_unix_sockets = os.environ.get('ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS', '').lower() in ('1', 'true', 'yes')
|
||||
allow_no_unix_sockets = os.environ.get("ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS", "").lower() in ("1", "true", "yes")
|
||||
try:
|
||||
tmp_is_valid = dir_is_writable(tmp_dir)
|
||||
if not allow_no_unix_sockets:
|
||||
tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
|
||||
assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}'
|
||||
assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.'
|
||||
assert tmp_is_valid, f"ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}"
|
||||
assert len(f"file://{socket_file}") <= 96, (
|
||||
f"ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars."
|
||||
)
|
||||
return True
|
||||
except Exception as e:
|
||||
if not quiet:
|
||||
STDERR.print()
|
||||
ERROR_TEXT = '\n'.join((
|
||||
'',
|
||||
f'[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]',
|
||||
f' [yellow]{e}[/yellow]',
|
||||
'',
|
||||
'[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.',
|
||||
' - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).',
|
||||
f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
|
||||
' - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.',
|
||||
' - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]',
|
||||
'',
|
||||
'[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:',
|
||||
f' [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or "/tmp/archivebox"}[/green]',
|
||||
'',
|
||||
))
|
||||
STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured TMP_DIR[/red]', subtitle='Background workers may fail to start until fixed.'))
|
||||
ERROR_TEXT = "\n".join(
|
||||
(
|
||||
"",
|
||||
f"[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]",
|
||||
f" [yellow]{e}[/yellow]",
|
||||
"",
|
||||
"[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.",
|
||||
" - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).",
|
||||
f" - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).",
|
||||
" - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.",
|
||||
" - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]",
|
||||
"",
|
||||
"[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:",
|
||||
f" [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or '/tmp/archivebox'}[/green]",
|
||||
"",
|
||||
),
|
||||
)
|
||||
STDERR.print(
|
||||
Panel(
|
||||
ERROR_TEXT,
|
||||
expand=False,
|
||||
border_style="red",
|
||||
title="[red]:cross_mark: Error with configured TMP_DIR[/red]",
|
||||
subtitle="Background workers may fail to start until fixed.",
|
||||
),
|
||||
)
|
||||
STDERR.print()
|
||||
if throw:
|
||||
raise OSError(f'TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!') from e
|
||||
raise OSError(f"TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!") from e
|
||||
return False
|
||||
|
||||
|
||||
@@ -230,38 +270,48 @@ def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_ex
|
||||
from archivebox.misc.logging_util import pretty_path
|
||||
from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
|
||||
|
||||
lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR
|
||||
|
||||
|
||||
# assert lib_dir == STORAGE_CONFIG.LIB_DIR, "lib_dir is not the same as the one in the flat config"
|
||||
|
||||
|
||||
if not must_exist and not os.path.isdir(lib_dir):
|
||||
return True
|
||||
|
||||
|
||||
lib_is_valid = False
|
||||
try:
|
||||
lib_is_valid = dir_is_writable(lib_dir)
|
||||
assert lib_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}'
|
||||
assert lib_is_valid, f"ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}"
|
||||
return True
|
||||
except Exception as e:
|
||||
if not quiet:
|
||||
STDERR.print()
|
||||
ERROR_TEXT = '\n'.join((
|
||||
'',
|
||||
f'[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]',
|
||||
f' [yellow]{e}[/yellow]',
|
||||
'',
|
||||
'[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.',
|
||||
f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
|
||||
' - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).',
|
||||
' - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]',
|
||||
'',
|
||||
'[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:',
|
||||
f' [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or "/usr/local/share/archivebox"}[/green]',
|
||||
'',
|
||||
))
|
||||
STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured LIB_DIR[/red]', subtitle='[yellow]Dependencies may not auto-install properly until fixed.[/yellow]'))
|
||||
ERROR_TEXT = "\n".join(
|
||||
(
|
||||
"",
|
||||
f"[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]",
|
||||
f" [yellow]{e}[/yellow]",
|
||||
"",
|
||||
"[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.",
|
||||
f" - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).",
|
||||
" - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).",
|
||||
" - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]",
|
||||
"",
|
||||
"[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:",
|
||||
f" [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or '/usr/local/share/archivebox'}[/green]",
|
||||
"",
|
||||
),
|
||||
)
|
||||
STDERR.print(
|
||||
Panel(
|
||||
ERROR_TEXT,
|
||||
expand=False,
|
||||
border_style="red",
|
||||
title="[red]:cross_mark: Error with configured LIB_DIR[/red]",
|
||||
subtitle="[yellow]Dependencies may not auto-install properly until fixed.[/yellow]",
|
||||
),
|
||||
)
|
||||
STDERR.print()
|
||||
if throw:
|
||||
raise OSError(f'LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.') from e
|
||||
raise OSError(f"LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.") from e
|
||||
return False
|
||||
|
||||
@@ -2,18 +2,18 @@
|
||||
Database utility functions for ArchiveBox.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.misc'
|
||||
__package__ = "archivebox.misc"
|
||||
|
||||
from io import StringIO
|
||||
from pathlib import Path
|
||||
from typing import Any, List, Tuple
|
||||
from typing import Any
|
||||
|
||||
from archivebox.config import DATA_DIR
|
||||
from archivebox.misc.util import enforce_types
|
||||
|
||||
|
||||
@enforce_types
|
||||
def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]:
|
||||
def list_migrations(out_dir: Path = DATA_DIR) -> list[tuple[bool, str]]:
|
||||
"""List all Django migrations and their status"""
|
||||
from django.core.management import call_command
|
||||
|
||||
@@ -23,9 +23,9 @@ def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]:
|
||||
|
||||
migrations = []
|
||||
for line in out.readlines():
|
||||
if line.strip() and ']' in line:
|
||||
status_str, name_str = line.strip().split(']', 1)
|
||||
is_applied = 'X' in status_str
|
||||
if line.strip() and "]" in line:
|
||||
status_str, name_str = line.strip().split("]", 1)
|
||||
is_applied = "X" in status_str
|
||||
migration_name = name_str.strip()
|
||||
migrations.append((is_applied, migration_name))
|
||||
|
||||
@@ -33,23 +33,21 @@ def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]:
|
||||
|
||||
|
||||
@enforce_types
def apply_migrations(out_dir: Path = DATA_DIR) -> list[str]:
    """Apply pending Django migrations.

    Runs ``migrate`` non-interactively against the ``default`` database and
    captures what the command writes to its stdout.

    Args:
        out_dir: Not used by the body; kept so the signature stays unchanged.

    Returns:
        The non-blank lines of the command's output, each stripped of
        surrounding whitespace.
    """
    from django.core.management import call_command

    out1 = StringIO()

    call_command("migrate", interactive=False, database="default", stdout=out1)
    out1.seek(0)  # rewind so the captured output can be read back

    return [stripped for line in out1 if (stripped := line.strip())]
|
||||
|
||||
|
||||
@enforce_types
def get_admins(out_dir: Path = DATA_DIR) -> list[Any]:
    """Get list of superuser accounts.

    Args:
        out_dir: Not used by the body; kept so the signature stays unchanged.

    Returns:
        All ``User`` rows with ``is_superuser=True`` except the one whose
        username is ``system``.
    """
    from django.contrib.auth.models import User

    superusers = User.objects.filter(is_superuser=True).exclude(username="system")
    return list(superusers)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from functools import wraps
|
||||
from time import time
|
||||
|
||||
|
||||
def timed_function(func):
|
||||
"""
|
||||
Very simple profiling decorator for debugging.
|
||||
@@ -8,23 +9,25 @@ def timed_function(func):
|
||||
@timed_function
|
||||
def my_func():
|
||||
...
|
||||
|
||||
|
||||
More advanced alternatives:
|
||||
- viztracer ../.venv/bin/archivebox manage check # https://viztracer.readthedocs.io/en/latest/filter.html
|
||||
- python -m cProfile -o archivebox.prof ../.venv/bin/archivebox manage check; snakeviz archivebox.prof
|
||||
- Django Debug Toolbar + django-debug-toolbar-flamegraph
|
||||
+ Django Requests Tracker (requests-tracker)
|
||||
"""
|
||||
|
||||
@wraps(func)
|
||||
def wrap(*args, **kwargs):
|
||||
if args and hasattr(args[0], '__module__'):
|
||||
if args and hasattr(args[0], "__module__"):
|
||||
module = args[0].__module__
|
||||
else:
|
||||
module = func.__module__
|
||||
ts_start = time()
|
||||
result = func(*args, **kwargs)
|
||||
ts_end = time()
|
||||
ms_elapsed = int((ts_end-ts_start) * 1000)
|
||||
print(f'[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)')
|
||||
ms_elapsed = int((ts_end - ts_start) * 1000)
|
||||
print(f"[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)")
|
||||
return result
|
||||
|
||||
return wrap
|
||||
|
||||
@@ -5,20 +5,19 @@ Note: This file only contains legacy cleanup utilities.
|
||||
The DB is the single source of truth - use Snapshot.objects queries for all status checks.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.misc'
|
||||
__package__ = "archivebox.misc"
|
||||
|
||||
import os
|
||||
import json
|
||||
import shutil
|
||||
from pathlib import Path
|
||||
from typing import Tuple, List
|
||||
|
||||
from archivebox.config import DATA_DIR, CONSTANTS
|
||||
from archivebox.misc.util import enforce_types
|
||||
|
||||
|
||||
@enforce_types
|
||||
def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], List[str]]:
|
||||
def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> tuple[list[str], list[str]]:
|
||||
"""
|
||||
Legacy cleanup: Move folders to their correct timestamp-named locations based on index.json.
|
||||
|
||||
@@ -29,19 +28,19 @@ def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], L
|
||||
cant_fix = []
|
||||
for entry in os.scandir(out_dir / CONSTANTS.ARCHIVE_DIR_NAME):
|
||||
if entry.is_dir(follow_symlinks=True):
|
||||
index_path = Path(entry.path) / 'index.json'
|
||||
index_path = Path(entry.path) / "index.json"
|
||||
if index_path.exists():
|
||||
try:
|
||||
with open(index_path, 'r') as f:
|
||||
with open(index_path) as f:
|
||||
data = json.load(f)
|
||||
timestamp = data.get('timestamp')
|
||||
timestamp = data.get("timestamp")
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
if not timestamp:
|
||||
continue
|
||||
|
||||
if not entry.path.endswith(f'/{timestamp}'):
|
||||
if not entry.path.endswith(f"/{timestamp}"):
|
||||
dest = out_dir / CONSTANTS.ARCHIVE_DIR_NAME / timestamp
|
||||
if dest.exists():
|
||||
cant_fix.append(entry.path)
|
||||
|
||||
@@ -2,20 +2,22 @@ import hashlib
|
||||
import mimetypes
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Callable
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def _cached_file_hash(filepath: str, size: int, mtime: float) -> str:
|
||||
"""Internal function to calculate file hash with cache key based on path, size and mtime."""
|
||||
sha256_hash = hashlib.sha256()
|
||||
|
||||
with open(filepath, 'rb') as f:
|
||||
for chunk in iter(lambda: f.read(4096), b''):
|
||||
with open(filepath, "rb") as f:
|
||||
for chunk in iter(lambda: f.read(4096), b""):
|
||||
sha256_hash.update(chunk)
|
||||
|
||||
return sha256_hash.hexdigest()
|
||||
|
||||
|
||||
@lru_cache(maxsize=10)
|
||||
def hash_file(file_path: Path, pwd: Path | None = None) -> str:
|
||||
"""Calculate SHA256 hash of a file with caching based on path, size and mtime."""
|
||||
@@ -30,9 +32,10 @@ def hash_file(file_path: Path, pwd: Path | None = None) -> str:
|
||||
return _cached_file_hash(
|
||||
str(abs_path),
|
||||
stat_info.st_size,
|
||||
stat_info.st_mtime
|
||||
stat_info.st_mtime,
|
||||
)
|
||||
|
||||
|
||||
@lru_cache(maxsize=10)
|
||||
def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict[str, str]:
|
||||
"""Calculate SHA256 hashes for all files and directories recursively."""
|
||||
@@ -48,9 +51,12 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl
|
||||
|
||||
# Get all files recursively
|
||||
all_files = get_dir_entries(
|
||||
dir_path, pwd=pwd, recursive=True,
|
||||
include_files=True, include_dirs=False,
|
||||
filter_func=filter_func
|
||||
dir_path,
|
||||
pwd=pwd,
|
||||
recursive=True,
|
||||
include_files=True,
|
||||
include_dirs=False,
|
||||
filter_func=filter_func,
|
||||
)
|
||||
|
||||
hashes: dict[str, str] = {}
|
||||
@@ -65,39 +71,48 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl
|
||||
|
||||
# Calculate hashes for all directories
|
||||
subdirs = get_dir_entries(
|
||||
dir_path, pwd=pwd, recursive=True,
|
||||
include_files=False, include_dirs=True,
|
||||
include_hidden=False, filter_func=filter_func,
|
||||
max_depth=max_depth
|
||||
dir_path,
|
||||
pwd=pwd,
|
||||
recursive=True,
|
||||
include_files=False,
|
||||
include_dirs=True,
|
||||
include_hidden=False,
|
||||
filter_func=filter_func,
|
||||
max_depth=max_depth,
|
||||
)
|
||||
|
||||
for subdir in subdirs:
|
||||
subdir_path = dir_path / subdir
|
||||
subdir_hashes = get_dir_hashes(
|
||||
subdir_path, filter_func=filter_func,
|
||||
max_depth=0
|
||||
subdir_path,
|
||||
filter_func=filter_func,
|
||||
max_depth=0,
|
||||
)
|
||||
hashes[subdir] = subdir_hashes['.']
|
||||
hashes[subdir] = subdir_hashes["."]
|
||||
|
||||
# Filter results by max_depth
|
||||
if max_depth >= 0:
|
||||
hashes = {
|
||||
path: value for path, value in hashes.items()
|
||||
if len(Path(path).parts) <= max_depth + 1
|
||||
}
|
||||
hashes = {path: value for path, value in hashes.items() if len(Path(path).parts) <= max_depth + 1}
|
||||
|
||||
# Calculate root directory hash
|
||||
hashable_summary.sort()
|
||||
root_sha256 = hashlib.sha256('\n'.join(hashable_summary).encode()).hexdigest()
|
||||
hashes['.'] = root_sha256
|
||||
root_sha256 = hashlib.sha256("\n".join(hashable_summary).encode()).hexdigest()
|
||||
hashes["."] = root_sha256
|
||||
|
||||
return hashes
|
||||
|
||||
|
||||
@lru_cache(maxsize=128)
|
||||
def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = True,
|
||||
include_files: bool = True, include_dirs: bool = True, include_hidden: bool = False,
|
||||
filter_func: Callable | None = None, max_depth: int = -1) -> tuple[str, ...]:
|
||||
def get_dir_entries(
|
||||
dir_path: Path,
|
||||
pwd: Path | None = None,
|
||||
recursive: bool = True,
|
||||
include_files: bool = True,
|
||||
include_dirs: bool = True,
|
||||
include_hidden: bool = False,
|
||||
filter_func: Callable | None = None,
|
||||
max_depth: int = -1,
|
||||
) -> tuple[str, ...]:
|
||||
"""Get filtered list of directory entries."""
|
||||
pwd = Path(pwd) if pwd else None
|
||||
dir_path = Path(dir_path)
|
||||
@@ -107,20 +122,20 @@ def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = T
|
||||
results = []
|
||||
|
||||
def process_path(path: Path, depth: int):
|
||||
if not include_hidden and path.name.startswith('.'):
|
||||
if not include_hidden and path.name.startswith("."):
|
||||
return False
|
||||
if max_depth >= 0 and depth > max_depth:
|
||||
return False
|
||||
if filter_func:
|
||||
info = {
|
||||
"abspath": str(path.absolute()),
|
||||
"relpath": str(path.relative_to(dir_path))
|
||||
"relpath": str(path.relative_to(dir_path)),
|
||||
}
|
||||
if not filter_func(info):
|
||||
return False
|
||||
return True
|
||||
|
||||
for path in dir_path.rglob('*') if recursive else dir_path.glob('*'):
|
||||
for path in dir_path.rglob("*") if recursive else dir_path.glob("*"):
|
||||
current_depth = len(path.relative_to(dir_path).parts)
|
||||
|
||||
if path.is_file() and include_files and process_path(path, current_depth):
|
||||
@@ -133,6 +148,7 @@ def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = T
|
||||
|
||||
return tuple(sorted(results)) # Make immutable for caching
|
||||
|
||||
|
||||
@lru_cache(maxsize=1024)
|
||||
def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str, int]:
|
||||
"""Calculate sizes for all files and directories recursively."""
|
||||
@@ -146,10 +162,10 @@ def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str
|
||||
sizes[path_key] = full_path.stat().st_size
|
||||
else:
|
||||
total = 0
|
||||
for file_path in full_path.rglob('*'):
|
||||
if file_path.is_file() and not file_path.name.startswith('.'):
|
||||
for file_path in full_path.rglob("*"):
|
||||
if file_path.is_file() and not file_path.name.startswith("."):
|
||||
total += file_path.stat().st_size
|
||||
sizes[path_key + '/'] = total
|
||||
sizes[path_key + "/"] = total
|
||||
|
||||
return sizes
|
||||
|
||||
@@ -165,23 +181,23 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
|
||||
hashes = get_dir_hashes(dir_path, pwd=pwd, filter_func=filter_func, max_depth=max_depth)
|
||||
sizes = get_dir_sizes(str(dir_path), pwd=pwd, filter_func=filter_func, max_depth=max_depth)
|
||||
|
||||
num_total_subpaths = sum(1 for name in hashes if name != '.')
|
||||
num_total_subpaths = sum(1 for name in hashes if name != ".")
|
||||
details = {}
|
||||
|
||||
for filename, sha256_hash in sorted(hashes.items()):
|
||||
abs_path = (dir_path / filename).resolve()
|
||||
stat_info = abs_path.stat()
|
||||
num_subpaths = sum(1 for p in hashes if p.startswith(filename + '/'))
|
||||
num_subpaths = sum(1 for p in hashes if p.startswith(filename + "/"))
|
||||
is_dir = abs_path.is_dir()
|
||||
if is_dir:
|
||||
mime_type = 'inode/directory'
|
||||
mime_type = "inode/directory"
|
||||
basename = abs_path.name
|
||||
extension = ''
|
||||
num_bytes = sizes[filename + '/']
|
||||
if filename == '.':
|
||||
extension = ""
|
||||
num_bytes = sizes[filename + "/"]
|
||||
if filename == ".":
|
||||
num_subpaths = num_total_subpaths
|
||||
else:
|
||||
filename += '/'
|
||||
filename += "/"
|
||||
num_subpaths = num_subpaths
|
||||
else: # is_file
|
||||
num_subpaths = None
|
||||
@@ -191,14 +207,14 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
|
||||
num_bytes = sizes[filename]
|
||||
|
||||
details[filename] = {
|
||||
'basename': basename,
|
||||
'mime_type': mime_type,
|
||||
'extension': extension,
|
||||
'num_subpaths': num_subpaths,
|
||||
'num_bytes': num_bytes,
|
||||
'hash_sha256': sha256_hash,
|
||||
'created_at': datetime.fromtimestamp(stat_info.st_ctime).isoformat(),
|
||||
'modified_at': datetime.fromtimestamp(stat_info.st_mtime).isoformat(),
|
||||
"basename": basename,
|
||||
"mime_type": mime_type,
|
||||
"extension": extension,
|
||||
"num_subpaths": num_subpaths,
|
||||
"num_bytes": num_bytes,
|
||||
"hash_sha256": sha256_hash,
|
||||
"created_at": datetime.fromtimestamp(stat_info.st_ctime).isoformat(),
|
||||
"modified_at": datetime.fromtimestamp(stat_info.st_mtime).isoformat(),
|
||||
}
|
||||
|
||||
if filter_func and not filter_func(details[filename]):
|
||||
@@ -207,12 +223,13 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
|
||||
return details
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
import json
|
||||
dir_info = get_dir_info(Path('.'), max_depth=6)
|
||||
with open('.hashes.json', 'w') as f:
|
||||
|
||||
dir_info = get_dir_info(Path("."), max_depth=6)
|
||||
with open(".hashes.json", "w") as f:
|
||||
json.dump(dir_info, f, indent=4)
|
||||
print('Wrote .hashes.json')
|
||||
print("Wrote .hashes.json")
|
||||
|
||||
# Example output:
|
||||
# {
|
||||
|
||||
@@ -20,72 +20,73 @@ Plain URLs (also supported):
|
||||
https://foo.com
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.misc'
|
||||
__package__ = "archivebox.misc"
|
||||
|
||||
import sys
|
||||
import json
|
||||
import select
|
||||
from typing import Iterable, Iterator, Dict, Any, Optional, TextIO
|
||||
from typing import Any, TextIO
|
||||
from collections.abc import Iterable, Iterator
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
# Type constants for JSONL records
|
||||
TYPE_SNAPSHOT = 'Snapshot'
|
||||
TYPE_ARCHIVERESULT = 'ArchiveResult'
|
||||
TYPE_TAG = 'Tag'
|
||||
TYPE_CRAWL = 'Crawl'
|
||||
TYPE_BINARY = 'Binary'
|
||||
TYPE_PROCESS = 'Process'
|
||||
TYPE_MACHINE = 'Machine'
|
||||
TYPE_SNAPSHOT = "Snapshot"
|
||||
TYPE_ARCHIVERESULT = "ArchiveResult"
|
||||
TYPE_TAG = "Tag"
|
||||
TYPE_CRAWL = "Crawl"
|
||||
TYPE_BINARY = "Binary"
|
||||
TYPE_PROCESS = "Process"
|
||||
TYPE_MACHINE = "Machine"
|
||||
|
||||
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_BINARY, TYPE_PROCESS, TYPE_MACHINE}
|
||||
|
||||
|
||||
def parse_line(line: str) -> Optional[Dict[str, Any]]:
|
||||
def parse_line(line: str) -> dict[str, Any] | None:
|
||||
"""
|
||||
Parse a single line of input as either JSONL or plain URL.
|
||||
|
||||
Returns a dict with at minimum {'type': '...', 'url': '...'} or None if invalid.
|
||||
"""
|
||||
line = line.strip()
|
||||
if not line or line.startswith('#'):
|
||||
if not line or line.startswith("#"):
|
||||
return None
|
||||
|
||||
# Try to parse as JSON first
|
||||
if line.startswith('{'):
|
||||
if line.startswith("{"):
|
||||
try:
|
||||
record = json.loads(line)
|
||||
# If it has a type, validate it
|
||||
if 'type' in record and record['type'] not in VALID_TYPES:
|
||||
if "type" in record and record["type"] not in VALID_TYPES:
|
||||
# Unknown type, treat as raw data
|
||||
pass
|
||||
# If it has url but no type, assume Snapshot
|
||||
if 'url' in record and 'type' not in record:
|
||||
record['type'] = TYPE_SNAPSHOT
|
||||
if "url" in record and "type" not in record:
|
||||
record["type"] = TYPE_SNAPSHOT
|
||||
return record
|
||||
except json.JSONDecodeError:
|
||||
pass
|
||||
|
||||
# Treat as plain URL if it looks like one
|
||||
if line.startswith('http://') or line.startswith('https://') or line.startswith('file://'):
|
||||
return {'type': TYPE_SNAPSHOT, 'url': line}
|
||||
if line.startswith("http://") or line.startswith("https://") or line.startswith("file://"):
|
||||
return {"type": TYPE_SNAPSHOT, "url": line}
|
||||
|
||||
# Could be a snapshot ID (UUID with dashes or compact 32-char hex)
|
||||
if len(line) == 36 and line.count('-') == 4:
|
||||
return {'type': TYPE_SNAPSHOT, 'id': line}
|
||||
if len(line) == 36 and line.count("-") == 4:
|
||||
return {"type": TYPE_SNAPSHOT, "id": line}
|
||||
if len(line) == 32:
|
||||
try:
|
||||
int(line, 16)
|
||||
except ValueError:
|
||||
pass
|
||||
else:
|
||||
return {'type': TYPE_SNAPSHOT, 'id': line}
|
||||
return {"type": TYPE_SNAPSHOT, "id": line}
|
||||
|
||||
# Unknown format, skip
|
||||
return None
|
||||
|
||||
|
||||
def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
|
||||
def read_stdin(stream: TextIO | None = None) -> Iterator[dict[str, Any]]:
|
||||
"""
|
||||
Read JSONL or plain URLs from stdin.
|
||||
|
||||
@@ -112,20 +113,20 @@ def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
|
||||
yield record
|
||||
|
||||
|
||||
def read_file(path: Path) -> Iterator[Dict[str, Any]]:
|
||||
def read_file(path: Path) -> Iterator[dict[str, Any]]:
|
||||
"""
|
||||
Read JSONL or plain URLs from a file.
|
||||
|
||||
Yields parsed records as dicts.
|
||||
"""
|
||||
with open(path, 'r') as f:
|
||||
with open(path) as f:
|
||||
for line in f:
|
||||
record = parse_line(line)
|
||||
if record:
|
||||
yield record
|
||||
|
||||
|
||||
def read_args_or_stdin(args: Iterable[str], stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
|
||||
def read_args_or_stdin(args: Iterable[str], stream: TextIO | None = None) -> Iterator[dict[str, Any]]:
|
||||
"""
|
||||
Read from CLI arguments if provided, otherwise from stdin.
|
||||
|
||||
@@ -145,16 +146,16 @@ def read_args_or_stdin(args: Iterable[str], stream: Optional[TextIO] = None) ->
|
||||
yield from read_stdin(stream)
|
||||
|
||||
|
||||
def write_record(record: Dict[str, Any], stream: Optional[TextIO] = None) -> None:
|
||||
def write_record(record: dict[str, Any], stream: TextIO | None = None) -> None:
|
||||
"""
|
||||
Write a single JSONL record to stdout (or provided stream).
|
||||
"""
|
||||
active_stream: TextIO = sys.stdout if stream is None else stream
|
||||
active_stream.write(json.dumps(record) + '\n')
|
||||
active_stream.write(json.dumps(record) + "\n")
|
||||
active_stream.flush()
|
||||
|
||||
|
||||
def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] = None) -> int:
|
||||
def write_records(records: Iterator[dict[str, Any]], stream: TextIO | None = None) -> int:
|
||||
"""
|
||||
Write multiple JSONL records to stdout (or provided stream).
|
||||
|
||||
|
||||
@@ -8,24 +8,26 @@ This is separate from the hooks-based parser system which handles importing
|
||||
new URLs from bookmark files, RSS feeds, etc.
|
||||
"""
|
||||
|
||||
__package__ = 'archivebox.misc'
|
||||
__package__ = "archivebox.misc"
|
||||
|
||||
import os
|
||||
import json
|
||||
from pathlib import Path
|
||||
from datetime import datetime, timezone
|
||||
from typing import Iterator, TypedDict, List
|
||||
from typing import TypedDict
|
||||
from collections.abc import Iterator
|
||||
|
||||
|
||||
class SnapshotDict(TypedDict, total=False):
|
||||
"""
|
||||
Dictionary type representing a snapshot/link, compatible with Snapshot model fields.
|
||||
"""
|
||||
url: str # Required: the URL to archive
|
||||
timestamp: str # Optional: unix timestamp string
|
||||
title: str # Optional: page title
|
||||
tags: str # Optional: comma-separated tags string
|
||||
sources: List[str] # Optional: list of source file paths
|
||||
|
||||
url: str # Required: the URL to archive
|
||||
timestamp: str # Optional: unix timestamp string
|
||||
title: str # Optional: page title
|
||||
tags: str # Optional: comma-separated tags string
|
||||
sources: list[str] # Optional: list of source file paths
|
||||
|
||||
|
||||
def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
|
||||
@@ -41,16 +43,16 @@ def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
|
||||
return
|
||||
|
||||
try:
|
||||
with open(index_path, 'r', encoding='utf-8') as f:
|
||||
with open(index_path, encoding="utf-8") as f:
|
||||
data = json.load(f)
|
||||
|
||||
links = data.get('links', [])
|
||||
links = data.get("links", [])
|
||||
for link in links:
|
||||
yield {
|
||||
'url': link.get('url', ''),
|
||||
'timestamp': link.get('timestamp', str(datetime.now(timezone.utc).timestamp())),
|
||||
'title': link.get('title'),
|
||||
'tags': link.get('tags', ''),
|
||||
"url": link.get("url", ""),
|
||||
"timestamp": link.get("timestamp", str(datetime.now(timezone.utc).timestamp())),
|
||||
"title": link.get("title"),
|
||||
"tags": link.get("tags", ""),
|
||||
}
|
||||
except (json.JSONDecodeError, KeyError, TypeError):
|
||||
return
|
||||
@@ -81,12 +83,12 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
|
||||
|
||||
if jsonl_file.exists():
|
||||
try:
|
||||
with open(jsonl_file, 'r', encoding='utf-8') as f:
|
||||
with open(jsonl_file, encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if line.startswith('{'):
|
||||
if line.startswith("{"):
|
||||
record = json.loads(line)
|
||||
if record.get('type') == 'Snapshot':
|
||||
if record.get("type") == "Snapshot":
|
||||
link = record
|
||||
break
|
||||
except (json.JSONDecodeError, KeyError, TypeError):
|
||||
@@ -94,15 +96,15 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
|
||||
|
||||
if link is None and json_file.exists():
|
||||
try:
|
||||
with open(json_file, 'r', encoding='utf-8') as f:
|
||||
with open(json_file, encoding="utf-8") as f:
|
||||
link = json.load(f)
|
||||
except (json.JSONDecodeError, KeyError, TypeError):
|
||||
pass
|
||||
|
||||
if link:
|
||||
yield {
|
||||
'url': link.get('url', ''),
|
||||
'timestamp': link.get('timestamp', entry.name),
|
||||
'title': link.get('title'),
|
||||
'tags': link.get('tags', ''),
|
||||
"url": link.get("url", ""),
|
||||
"timestamp": link.get("timestamp", entry.name),
|
||||
"title": link.get("title"),
|
||||
"tags": link.get("tags", ""),
|
||||
}
|
||||
|
||||
@@ -1,10 +1,9 @@
|
||||
__package__ = 'archivebox.misc'
|
||||
__package__ = "archivebox.misc"
|
||||
|
||||
# Low-level logging primitives (Rich console, ANSI colors, stdout/stderr helpers)
|
||||
# Higher-level logging functions are in logging_util.py
|
||||
|
||||
import sys
|
||||
from typing import Optional, Union, Tuple, List
|
||||
from collections import defaultdict
|
||||
from random import randint
|
||||
|
||||
@@ -19,11 +18,13 @@ CONSOLE = Console(width=32768, soft_wrap=True, force_terminal=True)
|
||||
STDERR = Console(stderr=True, width=32768, soft_wrap=True, force_terminal=True)
|
||||
IS_TTY = sys.stdout.isatty()
|
||||
|
||||
|
||||
class RainbowHighlighter(Highlighter):
|
||||
def highlight(self, text):
|
||||
for index in range(len(text)):
|
||||
text.stylize(f"color({randint(90, 98)})", index, index + 1)
|
||||
|
||||
|
||||
rainbow = RainbowHighlighter()
|
||||
|
||||
|
||||
@@ -38,49 +39,55 @@ DEFAULT_CLI_COLORS = benedict(
|
||||
"blue": "\033[01;34m",
|
||||
"white": "\033[01;37m",
|
||||
"black": "\033[01;30m",
|
||||
}
|
||||
},
|
||||
)
|
||||
ANSI = benedict({k: "" for k in DEFAULT_CLI_COLORS.keys()})
|
||||
|
||||
COLOR_DICT = defaultdict(
|
||||
lambda: [(0, 0, 0), (0, 0, 0)],
|
||||
{
|
||||
"00": [(0, 0, 0), (0, 0, 0)],
|
||||
"30": [(0, 0, 0), (0, 0, 0)],
|
||||
"31": [(255, 0, 0), (128, 0, 0)],
|
||||
"32": [(0, 200, 0), (0, 128, 0)],
|
||||
"33": [(255, 255, 0), (128, 128, 0)],
|
||||
"34": [(0, 0, 255), (0, 0, 128)],
|
||||
"35": [(255, 0, 255), (128, 0, 128)],
|
||||
"36": [(0, 255, 255), (0, 128, 128)],
|
||||
"37": [(255, 255, 255), (255, 255, 255)],
|
||||
},
|
||||
)
|
||||
ANSI = benedict({k: '' for k in DEFAULT_CLI_COLORS.keys()})
|
||||
|
||||
COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
|
||||
'00': [(0, 0, 0), (0, 0, 0)],
|
||||
'30': [(0, 0, 0), (0, 0, 0)],
|
||||
'31': [(255, 0, 0), (128, 0, 0)],
|
||||
'32': [(0, 200, 0), (0, 128, 0)],
|
||||
'33': [(255, 255, 0), (128, 128, 0)],
|
||||
'34': [(0, 0, 255), (0, 0, 128)],
|
||||
'35': [(255, 0, 255), (128, 0, 128)],
|
||||
'36': [(0, 255, 255), (0, 128, 128)],
|
||||
'37': [(255, 255, 255), (255, 255, 255)],
|
||||
})
|
||||
|
||||
# Logging Helpers (DEPRECATED, use rich.print instead going forward)
|
||||
def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
|
||||
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
|
||||
def stdout(*args, color: str | None = None, prefix: str = "", config: benedict | None = None) -> None:
|
||||
ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI
|
||||
|
||||
if color:
|
||||
strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
|
||||
strs = [ansi[color], " ".join(str(a) for a in args), ansi["reset"], "\n"]
|
||||
else:
|
||||
strs = [' '.join(str(a) for a in args), '\n']
|
||||
strs = [" ".join(str(a) for a in args), "\n"]
|
||||
|
||||
sys.stdout.write(prefix + ''.join(strs))
|
||||
sys.stdout.write(prefix + "".join(strs))
|
||||
|
||||
def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
|
||||
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
|
||||
|
||||
def stderr(*args, color: str | None = None, prefix: str = "", config: benedict | None = None) -> None:
|
||||
ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI
|
||||
|
||||
if color:
|
||||
strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
|
||||
strs = [ansi[color], " ".join(str(a) for a in args), ansi["reset"], "\n"]
|
||||
else:
|
||||
strs = [' '.join(str(a) for a in args), '\n']
|
||||
strs = [" ".join(str(a) for a in args), "\n"]
|
||||
|
||||
sys.stderr.write(prefix + ''.join(strs))
|
||||
sys.stderr.write(prefix + "".join(strs))
|
||||
|
||||
def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[benedict]=None) -> None:
|
||||
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
|
||||
|
||||
def hint(text: tuple[str, ...] | list[str] | str, prefix=" ", config: benedict | None = None) -> None:
|
||||
ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI
|
||||
|
||||
if isinstance(text, str):
|
||||
stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text}")
|
||||
else:
|
||||
stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text[0]}")
|
||||
for line in text[1:]:
|
||||
stderr(f'{prefix} {line}')
|
||||
stderr(f"{prefix} {line}")
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox'
|
||||
__package__ = "archivebox"
|
||||
|
||||
# High-level logging functions for CLI output and progress tracking
|
||||
# Low-level primitives (Rich console, ANSI colors) are in logging.py
|
||||
@@ -14,7 +14,8 @@ from pathlib import Path
|
||||
|
||||
from datetime import datetime, timezone
|
||||
from dataclasses import dataclass
|
||||
from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING, cast
|
||||
from typing import Any, Optional, IO, TYPE_CHECKING, cast
|
||||
from collections.abc import Iterable
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from archivebox.core.models import Snapshot
|
||||
@@ -28,6 +29,7 @@ from archivebox.misc.system import get_dir_size
|
||||
from archivebox.misc.util import enforce_types
|
||||
from archivebox.misc.logging import ANSI
|
||||
|
||||
|
||||
@dataclass
|
||||
class RuntimeStats:
|
||||
"""mutable stats counter for logging archiving timing info to CLI output"""
|
||||
@@ -36,14 +38,15 @@ class RuntimeStats:
|
||||
succeeded: int = 0
|
||||
failed: int = 0
|
||||
|
||||
parse_start_ts: Optional[datetime] = None
|
||||
parse_end_ts: Optional[datetime] = None
|
||||
parse_start_ts: datetime | None = None
|
||||
parse_end_ts: datetime | None = None
|
||||
|
||||
index_start_ts: Optional[datetime] = None
|
||||
index_end_ts: Optional[datetime] = None
|
||||
index_start_ts: datetime | None = None
|
||||
index_end_ts: datetime | None = None
|
||||
|
||||
archiving_start_ts: datetime | None = None
|
||||
archiving_end_ts: datetime | None = None
|
||||
|
||||
archiving_start_ts: Optional[datetime] = None
|
||||
archiving_end_ts: Optional[datetime] = None
|
||||
|
||||
# globals are bad, mmkay
|
||||
_LAST_RUN_STATS = RuntimeStats()
|
||||
@@ -52,49 +55,47 @@ _LAST_RUN_STATS = RuntimeStats()
|
||||
class TimedProgress:
|
||||
"""Show a progress bar and measure elapsed time until .end() is called"""
|
||||
|
||||
def __init__(self, seconds, prefix=''):
|
||||
def __init__(self, seconds, prefix=""):
|
||||
|
||||
self.SHOW_PROGRESS = SHELL_CONFIG.SHOW_PROGRESS
|
||||
self.ANSI = SHELL_CONFIG.ANSI
|
||||
|
||||
|
||||
if self.SHOW_PROGRESS:
|
||||
self.p = Process(target=progress_bar, args=(seconds, prefix, self.ANSI))
|
||||
self.p.start()
|
||||
|
||||
self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None}
|
||||
self.stats = {"start_ts": datetime.now(timezone.utc), "end_ts": None}
|
||||
|
||||
def end(self):
|
||||
"""immediately end progress, clear the progressbar line, and save end_ts"""
|
||||
|
||||
|
||||
end_ts = datetime.now(timezone.utc)
|
||||
self.stats['end_ts'] = end_ts
|
||||
|
||||
self.stats["end_ts"] = end_ts
|
||||
|
||||
if self.SHOW_PROGRESS:
|
||||
# terminate if we havent already terminated
|
||||
try:
|
||||
# kill the progress bar subprocess
|
||||
try:
|
||||
self.p.close() # must be closed *before* its terminnated
|
||||
self.p.close() # must be closed *before* its terminnated
|
||||
except (KeyboardInterrupt, SystemExit):
|
||||
print()
|
||||
raise
|
||||
except BaseException: # lgtm [py/catch-base-exception]
|
||||
except BaseException: # lgtm [py/catch-base-exception]
|
||||
pass
|
||||
self.p.terminate()
|
||||
time.sleep(0.1)
|
||||
# sometimes the timer doesn't terminate properly, then blocks at the join until
|
||||
# the full time has elapsed. sending a kill tries to avoid that.
|
||||
try:
|
||||
self.p.kill()
|
||||
self.p.kill()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
# clear whole terminal line
|
||||
try:
|
||||
sys.stdout.write('\r{}{}\r'.format((' ' * SHELL_CONFIG.TERM_WIDTH), self.ANSI['reset']))
|
||||
except (IOError, BrokenPipeError):
|
||||
sys.stdout.write("\r{}{}\r".format((" " * SHELL_CONFIG.TERM_WIDTH), self.ANSI["reset"]))
|
||||
except (OSError, BrokenPipeError):
|
||||
# ignore when the parent proc has stopped listening to our stdout
|
||||
pass
|
||||
except ValueError:
|
||||
@@ -102,10 +103,10 @@ class TimedProgress:
|
||||
|
||||
|
||||
@enforce_types
|
||||
def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> None:
|
||||
def progress_bar(seconds: int, prefix: str = "", ANSI: dict[str, str] = ANSI) -> None:
|
||||
"""show timer in the form of progress bar, with percentage and seconds remaining"""
|
||||
output_buf = (sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__)
|
||||
chunk = '█' if output_buf and output_buf.encoding.upper() == 'UTF-8' else '#'
|
||||
output_buf = sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__
|
||||
chunk = "█" if output_buf and output_buf.encoding.upper() == "UTF-8" else "#"
|
||||
last_width = SHELL_CONFIG.TERM_WIDTH
|
||||
chunks = last_width - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
|
||||
try:
|
||||
@@ -114,37 +115,41 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non
|
||||
if max_width < last_width:
|
||||
# when the terminal size is shrunk, we have to write a newline
|
||||
# otherwise the progress bar will keep wrapping incorrectly
|
||||
sys.stdout.write('\r\n')
|
||||
sys.stdout.write("\r\n")
|
||||
sys.stdout.flush()
|
||||
chunks = max_width - len(prefix) - 20
|
||||
pct_complete = s / chunks / seconds * 100
|
||||
log_pct = (log(pct_complete or 1, 10) / 2) * 100 # everyone likes faster progress bars ;)
|
||||
bar_width = round(log_pct/(100/chunks))
|
||||
bar_width = round(log_pct / (100 / chunks))
|
||||
last_width = max_width
|
||||
|
||||
# ████████████████████ 0.9% (1/60sec)
|
||||
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
|
||||
prefix,
|
||||
ANSI['green' if pct_complete < 80 else 'lightyellow'],
|
||||
(chunk * bar_width).ljust(chunks),
|
||||
ANSI['reset'],
|
||||
round(pct_complete, 1),
|
||||
round(s/chunks),
|
||||
seconds,
|
||||
))
|
||||
sys.stdout.write(
|
||||
"\r{}{}{}{} {}% ({}/{}sec)".format(
|
||||
prefix,
|
||||
ANSI["green" if pct_complete < 80 else "lightyellow"],
|
||||
(chunk * bar_width).ljust(chunks),
|
||||
ANSI["reset"],
|
||||
round(pct_complete, 1),
|
||||
round(s / chunks),
|
||||
seconds,
|
||||
),
|
||||
)
|
||||
sys.stdout.flush()
|
||||
time.sleep(1 / chunks)
|
||||
|
||||
# ██████████████████████████████████ 100.0% (60/60sec)
|
||||
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
|
||||
prefix,
|
||||
ANSI['red'],
|
||||
chunk * chunks,
|
||||
ANSI['reset'],
|
||||
100.0,
|
||||
seconds,
|
||||
seconds,
|
||||
))
|
||||
sys.stdout.write(
|
||||
"\r{}{}{}{} {}% ({}/{}sec)".format(
|
||||
prefix,
|
||||
ANSI["red"],
|
||||
chunk * chunks,
|
||||
ANSI["reset"],
|
||||
100.0,
|
||||
seconds,
|
||||
seconds,
|
||||
),
|
||||
)
|
||||
sys.stdout.flush()
|
||||
# uncomment to have it disappear when it hits 100% instead of staying full red:
|
||||
# time.sleep(0.5)
|
||||
@@ -154,10 +159,10 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non
|
||||
print()
|
||||
|
||||
|
||||
def log_cli_command(subcommand: str, subcommand_args: Iterable[str]=(), stdin: str | IO | None=None, pwd: str='.'):
|
||||
args = ' '.join(subcommand_args)
|
||||
version_msg = '[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
|
||||
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
|
||||
def log_cli_command(subcommand: str, subcommand_args: Iterable[str] = (), stdin: str | IO | None = None, pwd: str = "."):
|
||||
args = " ".join(subcommand_args)
|
||||
version_msg = "[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]".format(
|
||||
now=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
|
||||
VERSION=VERSION,
|
||||
subcommand=subcommand,
|
||||
args=args,
|
||||
@@ -166,44 +171,54 @@ def log_cli_command(subcommand: str, subcommand_args: Iterable[str]=(), stdin: s
|
||||
# stderr('[bright_black] > {pwd}[/]'.format(pwd=pwd, **ANSI))
|
||||
# stderr()
|
||||
print(Panel(version_msg), file=sys.stderr)
|
||||
|
||||
|
||||
|
||||
### Parsing Stage
|
||||
|
||||
|
||||
def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
|
||||
def log_importing_started(urls: str | list[str], depth: int, index_only: bool):
|
||||
_LAST_RUN_STATS.parse_start_ts = datetime.now(timezone.utc)
|
||||
print('[green][+] [{}] Adding {} links to index (crawl depth={}){}...[/]'.format(
|
||||
_LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
len(urls) if isinstance(urls, list) else len(urls.split('\n')),
|
||||
depth,
|
||||
' (index only)' if index_only else '',
|
||||
))
|
||||
print(
|
||||
"[green][+] [{}] Adding {} links to index (crawl depth={}){}...[/]".format(
|
||||
_LAST_RUN_STATS.parse_start_ts.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
len(urls) if isinstance(urls, list) else len(urls.split("\n")),
|
||||
depth,
|
||||
" (index only)" if index_only else "",
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def log_source_saved(source_file: str):
|
||||
print(' > Saved verbatim input to {}/{}'.format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
|
||||
print(" > Saved verbatim input to {}/{}".format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit("/", 1)[-1]))
|
||||
|
||||
|
||||
def log_parsing_finished(num_parsed: int, parser_name: str):
|
||||
_LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc)
|
||||
print(' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name))
|
||||
print(f" > Parsed {num_parsed} URLs from input ({parser_name})")
|
||||
|
||||
|
||||
def log_deduping_finished(num_new_links: int):
|
||||
print(' > Found {} new URLs not already in index'.format(num_new_links))
|
||||
print(f" > Found {num_new_links} new URLs not already in index")
|
||||
|
||||
|
||||
def log_crawl_started(new_links):
|
||||
print()
|
||||
print(f'[green][*] Starting crawl of {len(new_links)} sites 1 hop out from starting point[/]')
|
||||
print(f"[green][*] Starting crawl of {len(new_links)} sites 1 hop out from starting point[/]")
|
||||
|
||||
|
||||
### Indexing Stage
|
||||
|
||||
|
||||
def log_indexing_process_started(num_links: int):
|
||||
start_ts = datetime.now(timezone.utc)
|
||||
_LAST_RUN_STATS.index_start_ts = start_ts
|
||||
print()
|
||||
print('[bright_black][*] [{}] Writing {} links to main index...[/]'.format(
|
||||
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
num_links,
|
||||
))
|
||||
print(
|
||||
"[bright_black][*] [{}] Writing {} links to main index...[/]".format(
|
||||
start_ts.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
num_links,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def log_indexing_process_finished():
|
||||
@@ -213,46 +228,55 @@ def log_indexing_process_finished():
|
||||
|
||||
def log_indexing_started(out_path: str):
|
||||
if SHELL_CONFIG.IS_TTY:
|
||||
sys.stdout.write(f' > ./{Path(out_path).relative_to(DATA_DIR)}')
|
||||
sys.stdout.write(f" > ./{Path(out_path).relative_to(DATA_DIR)}")
|
||||
|
||||
|
||||
def log_indexing_finished(out_path: str):
|
||||
print(f'\r √ ./{Path(out_path).relative_to(DATA_DIR)}')
|
||||
print(f"\r √ ./{Path(out_path).relative_to(DATA_DIR)}")
|
||||
|
||||
|
||||
### Archiving Stage
|
||||
|
||||
def log_archiving_started(num_links: int, resume: Optional[float]=None):
|
||||
|
||||
def log_archiving_started(num_links: int, resume: float | None = None):
|
||||
|
||||
start_ts = datetime.now(timezone.utc)
|
||||
_LAST_RUN_STATS.archiving_start_ts = start_ts
|
||||
print()
|
||||
if resume:
|
||||
print('[green][▶] [{}] Resuming archive updating for {} pages starting from {}...[/]'.format(
|
||||
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
num_links,
|
||||
resume,
|
||||
))
|
||||
print(
|
||||
"[green][▶] [{}] Resuming archive updating for {} pages starting from {}...[/]".format(
|
||||
start_ts.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
num_links,
|
||||
resume,
|
||||
),
|
||||
)
|
||||
else:
|
||||
print('[green][▶] [{}] Starting archiving of {} snapshots in index...[/]'.format(
|
||||
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
num_links,
|
||||
))
|
||||
print(
|
||||
"[green][▶] [{}] Starting archiving of {} snapshots in index...[/]".format(
|
||||
start_ts.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
num_links,
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def log_archiving_paused(num_links: int, idx: int, timestamp: str):
|
||||
|
||||
end_ts = datetime.now(timezone.utc)
|
||||
_LAST_RUN_STATS.archiving_end_ts = end_ts
|
||||
print()
|
||||
print('\n[yellow3][X] [{now}] Downloading paused on link {timestamp} ({idx}/{total})[/]'.format(
|
||||
now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
idx=idx+1,
|
||||
timestamp=timestamp,
|
||||
total=num_links,
|
||||
))
|
||||
print(
|
||||
"\n[yellow3][X] [{now}] Downloading paused on link {timestamp} ({idx}/{total})[/]".format(
|
||||
now=end_ts.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
idx=idx + 1,
|
||||
timestamp=timestamp,
|
||||
total=num_links,
|
||||
),
|
||||
)
|
||||
print()
|
||||
print(' Continue archiving where you left off by running:')
|
||||
print(' archivebox update --resume={}'.format(timestamp))
|
||||
print(" Continue archiving where you left off by running:")
|
||||
print(f" archivebox update --resume={timestamp}")
|
||||
|
||||
|
||||
def log_archiving_finished(num_links: int):
|
||||
|
||||
@@ -263,24 +287,26 @@ def log_archiving_finished(num_links: int):
|
||||
assert _LAST_RUN_STATS.archiving_start_ts is not None
|
||||
seconds = end_ts.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp()
|
||||
if seconds > 60:
|
||||
duration = '{0:.2f} min'.format(seconds / 60)
|
||||
duration = f"{seconds / 60:.2f} min"
|
||||
else:
|
||||
duration = '{0:.2f} sec'.format(seconds)
|
||||
duration = f"{seconds:.2f} sec"
|
||||
|
||||
print()
|
||||
print('[green][√] [{}] Update of {} pages complete ({})[/]'.format(
|
||||
end_ts.strftime('%Y-%m-%d %H:%M:%S'),
|
||||
num_links,
|
||||
duration,
|
||||
))
|
||||
print(' - {} links skipped'.format(_LAST_RUN_STATS.skipped))
|
||||
print(' - {} links updated'.format(_LAST_RUN_STATS.succeeded + _LAST_RUN_STATS.failed))
|
||||
print(' - {} links had errors'.format(_LAST_RUN_STATS.failed))
|
||||
|
||||
print(
|
||||
"[green][√] [{}] Update of {} pages complete ({})[/]".format(
|
||||
end_ts.strftime("%Y-%m-%d %H:%M:%S"),
|
||||
num_links,
|
||||
duration,
|
||||
),
|
||||
)
|
||||
print(f" - {_LAST_RUN_STATS.skipped} links skipped")
|
||||
print(f" - {_LAST_RUN_STATS.succeeded + _LAST_RUN_STATS.failed} links updated")
|
||||
print(f" - {_LAST_RUN_STATS.failed} links had errors")
|
||||
|
||||
if Snapshot.objects.count() < 50:
|
||||
print()
|
||||
print(' [violet]Hint:[/] To manage your archive in a Web UI, run:')
|
||||
print(' archivebox server 0.0.0.0:8000')
|
||||
print(" [violet]Hint:[/] To manage your archive in a Web UI, run:")
|
||||
print(" archivebox server 0.0.0.0:8000")
|
||||
|
||||
|
||||
def log_snapshot_archiving_started(snapshot: "Snapshot", out_dir: str, is_new: bool):
|
||||
@@ -289,41 +315,51 @@ def log_snapshot_archiving_started(snapshot: "Snapshot", out_dir: str, is_new: b
|
||||
# http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
|
||||
# > output/archive/1478739709
|
||||
|
||||
print('\n[[{symbol_color}]{symbol}[/]] [[{symbol_color}]{now}[/]] "{title}"'.format(
|
||||
symbol_color='green' if is_new else 'bright_black',
|
||||
symbol='+' if is_new else '√',
|
||||
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
|
||||
title=snapshot.title or snapshot.base_url,
|
||||
))
|
||||
print(f' [sky_blue1]{snapshot.url}[/]')
|
||||
print(' {} {}'.format(
|
||||
'>' if is_new else '√',
|
||||
pretty_path(out_dir),
|
||||
))
|
||||
print(
|
||||
'\n[[{symbol_color}]{symbol}[/]] [[{symbol_color}]{now}[/]] "{title}"'.format(
|
||||
symbol_color="green" if is_new else "bright_black",
|
||||
symbol="+" if is_new else "√",
|
||||
now=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
|
||||
title=snapshot.title or snapshot.base_url,
|
||||
),
|
||||
)
|
||||
print(f" [sky_blue1]{snapshot.url}[/]")
|
||||
print(
|
||||
" {} {}".format(
|
||||
">" if is_new else "√",
|
||||
pretty_path(out_dir),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def log_snapshot_archiving_finished(snapshot: "Snapshot", out_dir: str, is_new: bool, stats: dict, start_ts: datetime):
|
||||
total = sum(stats.values())
|
||||
|
||||
if stats['failed'] > 0 :
|
||||
if stats["failed"] > 0:
|
||||
_LAST_RUN_STATS.failed += 1
|
||||
elif stats['skipped'] == total:
|
||||
elif stats["skipped"] == total:
|
||||
_LAST_RUN_STATS.skipped += 1
|
||||
else:
|
||||
_LAST_RUN_STATS.succeeded += 1
|
||||
|
||||
try:
|
||||
size = get_dir_size(out_dir)
|
||||
except FileNotFoundError:
|
||||
size = (0, None, '0')
|
||||
results = snapshot.archiveresult_set.only("output_files", "output_size")
|
||||
total_bytes = sum(result.output_size or result.output_size_from_files() for result in results)
|
||||
total_files = sum(result.output_file_count() for result in results)
|
||||
size = (total_bytes, 0, total_files)
|
||||
except Exception:
|
||||
try:
|
||||
size = get_dir_size(out_dir)
|
||||
except FileNotFoundError:
|
||||
size = (0, None, "0")
|
||||
|
||||
end_ts = datetime.now(timezone.utc)
|
||||
duration = str(end_ts - start_ts).split('.')[0]
|
||||
print(' [bright_black]{} files ({}) in {}s [/]'.format(size[2], printable_filesize(size[0]), duration))
|
||||
|
||||
duration = str(end_ts - start_ts).split(".")[0]
|
||||
print(f" [bright_black]{size[2]} files ({printable_filesize(size[0])}) in {duration}s [/]")
|
||||
|
||||
|
||||
def log_archive_method_started(method: str):
|
||||
print(' > {}'.format(method))
|
||||
print(f" > {method}")
|
||||
|
||||
|
||||
def log_archive_method_finished(result: dict):
|
||||
@@ -332,122 +368,117 @@ def log_archive_method_finished(result: dict):
|
||||
copy-paste the outputted string directly to run the cmd
|
||||
"""
|
||||
# Prettify CMD string and make it safe to copy-paste by quoting arguments
|
||||
quoted_cmd = ' '.join(
|
||||
'"{}"'.format(arg) if (' ' in arg) or (':' in arg) else arg
|
||||
for arg in result['cmd']
|
||||
)
|
||||
quoted_cmd = " ".join(f'"{arg}"' if (" " in arg) or (":" in arg) else arg for arg in result["cmd"])
|
||||
|
||||
if result['status'] == 'failed':
|
||||
output = result.get('output')
|
||||
if output and output.__class__.__name__ == 'TimeoutExpired':
|
||||
duration = (result['end_ts'] - result['start_ts']).seconds
|
||||
if result["status"] == "failed":
|
||||
output = result.get("output")
|
||||
if output and output.__class__.__name__ == "TimeoutExpired":
|
||||
duration = (result["end_ts"] - result["start_ts"]).seconds
|
||||
hint_header = [
|
||||
f'[yellow3]Extractor timed out after {duration}s.[/]',
|
||||
f"[yellow3]Extractor timed out after {duration}s.[/]",
|
||||
]
|
||||
else:
|
||||
error_name = output.__class__.__name__.replace('ArchiveError', '') if output else 'Error'
|
||||
error_name = output.__class__.__name__.replace("ArchiveError", "") if output else "Error"
|
||||
hint_header = [
|
||||
'[yellow3]Extractor failed:[/]',
|
||||
f' {error_name} [red1]{output}[/]',
|
||||
"[yellow3]Extractor failed:[/]",
|
||||
f" {error_name} [red1]{output}[/]",
|
||||
]
|
||||
|
||||
# Prettify error output hints string and limit to five lines
|
||||
hints = getattr(output, 'hints', None) or () if output else ()
|
||||
hints = getattr(output, "hints", None) or () if output else ()
|
||||
if hints:
|
||||
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
|
||||
hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
|
||||
else:
|
||||
if isinstance(hints, bytes):
|
||||
hints = hints.decode()
|
||||
hints = hints.split('\n')
|
||||
hints = hints.split("\n")
|
||||
|
||||
hints = (
|
||||
f' [yellow1]{line.strip()}[/]'
|
||||
for line in list(hints)[:5] if line.strip()
|
||||
)
|
||||
hints = (f" [yellow1]{line.strip()}[/]" for line in list(hints)[:5] if line.strip())
|
||||
|
||||
docker_hints = ()
|
||||
if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
|
||||
docker_hints = (
|
||||
' docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash',
|
||||
)
|
||||
if os.environ.get("IN_DOCKER") in ("1", "true", "True", "TRUE", "yes"):
|
||||
docker_hints = (" docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash",)
|
||||
|
||||
# Collect and prefix output lines with indentation
|
||||
output_lines = [
|
||||
*hint_header,
|
||||
*hints,
|
||||
'[violet]Run to see full output:[/]',
|
||||
"[violet]Run to see full output:[/]",
|
||||
*docker_hints,
|
||||
*([' cd {};'.format(result.get('pwd'))] if result.get('pwd') else []),
|
||||
' {}'.format(quoted_cmd),
|
||||
*([" cd {};".format(result.get("pwd"))] if result.get("pwd") else []),
|
||||
f" {quoted_cmd}",
|
||||
]
|
||||
print('\n'.join(
|
||||
' {}'.format(line)
|
||||
for line in output_lines
|
||||
if line
|
||||
))
|
||||
print(
|
||||
"\n".join(f" {line}" for line in output_lines if line),
|
||||
)
|
||||
print()
|
||||
|
||||
|
||||
def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
|
||||
print(f'[green][*] Finding links in the archive index matching these {filter_type} patterns:[/]')
|
||||
print(' {}'.format(' '.join(filter_patterns or ())))
|
||||
def log_list_started(filter_patterns: list[str] | None, filter_type: str):
|
||||
print(f"[green][*] Finding links in the archive index matching these {filter_type} patterns:[/]")
|
||||
print(" {}".format(" ".join(filter_patterns or ())))
|
||||
|
||||
|
||||
def log_list_finished(snapshots):
|
||||
from archivebox.core.models import Snapshot
|
||||
|
||||
print()
|
||||
print('---------------------------------------------------------------------------------------------------')
|
||||
print("---------------------------------------------------------------------------------------------------")
|
||||
csv_queryset = cast(Any, Snapshot.objects.filter(pk__in=[s.pk for s in snapshots]))
|
||||
print(csv_queryset.to_csv(cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
|
||||
print('---------------------------------------------------------------------------------------------------')
|
||||
print(csv_queryset.to_csv(cols=["timestamp", "is_archived", "num_outputs", "url"], header=True, ljust=16, separator=" | "))
|
||||
print("---------------------------------------------------------------------------------------------------")
|
||||
print()
|
||||
|
||||
|
||||
def log_removal_started(snapshots, yes: bool, delete: bool):
|
||||
count = snapshots.count() if hasattr(snapshots, 'count') else len(snapshots)
|
||||
print(f'[yellow3][i] Found {count} matching URLs to remove.[/]')
|
||||
count = snapshots.count() if hasattr(snapshots, "count") else len(snapshots)
|
||||
print(f"[yellow3][i] Found {count} matching URLs to remove.[/]")
|
||||
if delete:
|
||||
file_counts = [s.num_outputs for s in snapshots if os.access(s.output_dir, os.R_OK)]
|
||||
print(
|
||||
f' {count} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
|
||||
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
|
||||
f" {count} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n"
|
||||
f" ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)",
|
||||
)
|
||||
else:
|
||||
print(
|
||||
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
|
||||
' (Pass --delete if you also want to permanently delete the data folders)'
|
||||
" Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n"
|
||||
" (Pass --delete if you also want to permanently delete the data folders)",
|
||||
)
|
||||
|
||||
if not yes:
|
||||
print()
|
||||
print(f'[yellow3][?] Do you want to proceed with removing these {count} links?[/]')
|
||||
print(f"[yellow3][?] Do you want to proceed with removing these {count} links?[/]")
|
||||
try:
|
||||
assert input(' y/[n]: ').lower() == 'y'
|
||||
assert input(" y/[n]: ").lower() == "y"
|
||||
except (KeyboardInterrupt, EOFError, AssertionError):
|
||||
raise SystemExit(0)
|
||||
|
||||
|
||||
def log_removal_finished(remaining_links: int, removed_links: int):
|
||||
if remaining_links == 0 and removed_links == 0:
|
||||
print()
|
||||
print('[red1][X] No matching links found.[/]')
|
||||
print("[red1][X] No matching links found.[/]")
|
||||
else:
|
||||
total_before = remaining_links + removed_links
|
||||
print()
|
||||
print(f'[red1][√] Removed {removed_links} out of {total_before} links from the archive index.[/]')
|
||||
print(f' Index now contains {remaining_links} links.')
|
||||
print(f"[red1][√] Removed {removed_links} out of {total_before} links from the archive index.[/]")
|
||||
print(f" Index now contains {remaining_links} links.")
|
||||
|
||||
|
||||
### Search Indexing Stage
|
||||
|
||||
|
||||
def log_index_started(url: str):
|
||||
print('[green][*] Indexing url: {} in the search index[/]'.format(url))
|
||||
print(f"[green][*] Indexing url: {url} in the search index[/]")
|
||||
print()
|
||||
|
||||
|
||||
### Helpers
|
||||
|
||||
|
||||
@enforce_types
|
||||
def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: bool=True) -> str:
|
||||
def pretty_path(path: Path | str, pwd: Path | str = DATA_DIR, color: bool = True) -> str:
|
||||
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
|
||||
pwd = str(Path(pwd)) # .resolve()
|
||||
path = str(path)
|
||||
@@ -456,46 +487,46 @@ def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: b
|
||||
return path
|
||||
|
||||
# replace long absolute paths with ./ relative ones to save on terminal output width
|
||||
if path.startswith(pwd) and (pwd != '/') and path != pwd:
|
||||
if path.startswith(pwd) and (pwd != "/") and path != pwd:
|
||||
if color:
|
||||
path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1)
|
||||
path = path.replace(pwd, "[light_slate_blue].[/light_slate_blue]", 1)
|
||||
else:
|
||||
path = path.replace(pwd, '.', 1)
|
||||
|
||||
path = path.replace(pwd, ".", 1)
|
||||
|
||||
# quote paths containing spaces
|
||||
if ' ' in path:
|
||||
if " " in path:
|
||||
path = f'"{path}"'
|
||||
|
||||
|
||||
# replace home directory with ~ for shorter output
|
||||
path = path.replace(str(Path('~').expanduser()), '~')
|
||||
path = path.replace(str(Path("~").expanduser()), "~")
|
||||
|
||||
return path
|
||||
|
||||
|
||||
@enforce_types
|
||||
def printable_filesize(num_bytes: Union[int, float]) -> str:
|
||||
for count in ['Bytes','KB','MB','GB']:
|
||||
def printable_filesize(num_bytes: int | float) -> str:
|
||||
for count in ["Bytes", "KB", "MB", "GB"]:
|
||||
if num_bytes > -1024.0 and num_bytes < 1024.0:
|
||||
return '%3.1f %s' % (num_bytes, count)
|
||||
return f"{num_bytes:3.1f} {count}"
|
||||
num_bytes /= 1024.0
|
||||
return '%3.1f %s' % (num_bytes, 'TB')
|
||||
return "{:3.1f} {}".format(num_bytes, "TB")
|
||||
|
||||
|
||||
@enforce_types
|
||||
def format_duration(seconds: float) -> str:
|
||||
"""Format duration in human-readable form."""
|
||||
if seconds < 1:
|
||||
return f'{seconds*1000:.0f}ms'
|
||||
return f"{seconds * 1000:.0f}ms"
|
||||
elif seconds < 60:
|
||||
return f'{seconds:.1f}s'
|
||||
return f"{seconds:.1f}s"
|
||||
elif seconds < 3600:
|
||||
minutes = int(seconds // 60)
|
||||
secs = int(seconds % 60)
|
||||
return f'{minutes}min {secs}s' if secs else f'{minutes}min'
|
||||
return f"{minutes}min {secs}s" if secs else f"{minutes}min"
|
||||
else:
|
||||
hours = int(seconds // 3600)
|
||||
minutes = int((seconds % 3600) // 60)
|
||||
return f'{hours}hr {minutes}min' if minutes else f'{hours}hr'
|
||||
return f"{hours}hr {minutes}min" if minutes else f"{hours}hr"
|
||||
|
||||
|
||||
@enforce_types
|
||||
@@ -504,15 +535,15 @@ def truncate_url(url: str, max_length: int = 60) -> str:
|
||||
if len(url) <= max_length:
|
||||
return url
|
||||
# Try to keep the domain and beginning of path
|
||||
if '://' in url:
|
||||
protocol, rest = url.split('://', 1)
|
||||
if '/' in rest:
|
||||
domain, path = rest.split('/', 1)
|
||||
if "://" in url:
|
||||
protocol, rest = url.split("://", 1)
|
||||
if "/" in rest:
|
||||
domain, path = rest.split("/", 1)
|
||||
available = max_length - len(protocol) - len(domain) - 6 # for "://", "/", "..."
|
||||
if available > 10:
|
||||
return f'{protocol}://{domain}/{path[:available]}...'
|
||||
return f"{protocol}://{domain}/{path[:available]}..."
|
||||
# Fallback: just truncate
|
||||
return url[:max_length-3] + '...'
|
||||
return url[: max_length - 3] + "..."
|
||||
|
||||
|
||||
@enforce_types
|
||||
@@ -520,12 +551,12 @@ def log_worker_event(
|
||||
worker_type: str,
|
||||
event: str,
|
||||
indent_level: int = 0,
|
||||
pid: Optional[int] = None,
|
||||
worker_id: Optional[str] = None,
|
||||
url: Optional[str] = None,
|
||||
plugin: Optional[str] = None,
|
||||
metadata: Optional[Dict[str, Any]] = None,
|
||||
error: Optional[Exception] = None,
|
||||
pid: int | None = None,
|
||||
worker_id: str | None = None,
|
||||
url: str | None = None,
|
||||
plugin: str | None = None,
|
||||
metadata: dict[str, Any] | None = None,
|
||||
error: Exception | None = None,
|
||||
) -> None:
|
||||
"""
|
||||
Log a worker event with structured metadata and indentation.
|
||||
@@ -541,17 +572,17 @@ def log_worker_event(
|
||||
metadata: Dict of metadata to show in curly braces
|
||||
error: Exception if event is an error
|
||||
"""
|
||||
indent = ' ' * indent_level
|
||||
indent = " " * indent_level
|
||||
|
||||
from rich.markup import escape
|
||||
|
||||
# Build worker identifier (without URL/plugin)
|
||||
worker_parts = [worker_type]
|
||||
# Don't add pid/worker_id for DB operations (they happen in whatever process is running)
|
||||
if pid and worker_type != 'DB':
|
||||
worker_parts.append(f'pid={pid}')
|
||||
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB':
|
||||
worker_parts.append(f'id={worker_id}')
|
||||
if pid and worker_type != "DB":
|
||||
worker_parts.append(f"pid={pid}")
|
||||
if worker_id and worker_type in ("CrawlWorker", "Orchestrator") and worker_type != "DB":
|
||||
worker_parts.append(f"id={worker_id}")
|
||||
|
||||
# Build worker label parts for brackets (shown inside brackets)
|
||||
worker_label_base = worker_parts[0]
|
||||
@@ -560,53 +591,53 @@ def log_worker_event(
|
||||
# Build URL/plugin display (shown AFTER the label, outside brackets)
|
||||
url_extractor_parts = []
|
||||
if url:
|
||||
url_extractor_parts.append(f'url: {escape(url)}')
|
||||
url_extractor_parts.append(f"url: {escape(url)}")
|
||||
if plugin:
|
||||
url_extractor_parts.append(f'extractor: {escape(plugin)}')
|
||||
url_extractor_parts.append(f"extractor: {escape(plugin)}")
|
||||
|
||||
url_extractor_str = ' | '.join(url_extractor_parts) if url_extractor_parts else ''
|
||||
url_extractor_str = " | ".join(url_extractor_parts) if url_extractor_parts else ""
|
||||
|
||||
# Build metadata string
|
||||
metadata_str = ''
|
||||
metadata_str = ""
|
||||
if metadata:
|
||||
# Format metadata nicely
|
||||
meta_parts = []
|
||||
for k, v in metadata.items():
|
||||
if isinstance(v, float):
|
||||
# Format floats nicely (durations, sizes)
|
||||
if 'duration' in k.lower():
|
||||
meta_parts.append(f'{k}: {format_duration(v)}')
|
||||
elif 'size' in k.lower():
|
||||
meta_parts.append(f'{k}: {printable_filesize(int(v))}')
|
||||
if "duration" in k.lower():
|
||||
meta_parts.append(f"{k}: {format_duration(v)}")
|
||||
elif "size" in k.lower():
|
||||
meta_parts.append(f"{k}: {printable_filesize(int(v))}")
|
||||
else:
|
||||
meta_parts.append(f'{k}: {v:.2f}')
|
||||
meta_parts.append(f"{k}: {v:.2f}")
|
||||
elif isinstance(v, int):
|
||||
# Format integers - check if it's a size
|
||||
if 'size' in k.lower() or 'bytes' in k.lower():
|
||||
meta_parts.append(f'{k}: {printable_filesize(v)}')
|
||||
if "size" in k.lower() or "bytes" in k.lower():
|
||||
meta_parts.append(f"{k}: {printable_filesize(v)}")
|
||||
else:
|
||||
meta_parts.append(f'{k}: {v}')
|
||||
meta_parts.append(f"{k}: {v}")
|
||||
elif isinstance(v, (list, tuple)):
|
||||
meta_parts.append(f'{k}: {len(v)}')
|
||||
meta_parts.append(f"{k}: {len(v)}")
|
||||
else:
|
||||
meta_parts.append(f'{k}: {v}')
|
||||
metadata_str = ' | '.join(meta_parts)
|
||||
meta_parts.append(f"{k}: {v}")
|
||||
metadata_str = " | ".join(meta_parts)
|
||||
|
||||
# Determine color based on event
|
||||
color = 'white'
|
||||
if event in ('Starting...', 'Started', 'STARTED', 'Started in background'):
|
||||
color = 'green'
|
||||
elif event.startswith('Created'):
|
||||
color = 'cyan' # DB creation events
|
||||
elif event in ('Completed', 'COMPLETED', 'All work complete'):
|
||||
color = 'blue'
|
||||
elif event in ('Failed', 'ERROR', 'Failed to spawn worker'):
|
||||
color = 'red'
|
||||
elif event in ('Shutting down', 'SHUTDOWN'):
|
||||
color = 'grey53'
|
||||
color = "white"
|
||||
if event in ("Starting...", "Started", "STARTED", "Started in background"):
|
||||
color = "green"
|
||||
elif event.startswith("Created"):
|
||||
color = "cyan" # DB creation events
|
||||
elif event in ("Completed", "COMPLETED", "All work complete"):
|
||||
color = "blue"
|
||||
elif event in ("Failed", "ERROR", "Failed to spawn worker"):
|
||||
color = "red"
|
||||
elif event in ("Shutting down", "SHUTDOWN"):
|
||||
color = "grey53"
|
||||
|
||||
# Build final message
|
||||
error_str = f' {type(error).__name__}: {error}' if error else ''
|
||||
error_str = f" {type(error).__name__}: {error}" if error else ""
|
||||
from archivebox.misc.logging import CONSOLE, STDERR
|
||||
from rich.text import Text
|
||||
|
||||
@@ -618,19 +649,19 @@ def log_worker_event(
|
||||
|
||||
# Add bracketed content if present (using Text.append to avoid markup issues)
|
||||
if worker_bracket_content:
|
||||
text.append('[', style=color)
|
||||
text.append("[", style=color)
|
||||
text.append(worker_bracket_content, style=color)
|
||||
text.append(']', style=color)
|
||||
text.append("]", style=color)
|
||||
|
||||
text.append(f' {event}{error_str}', style=color)
|
||||
text.append(f" {event}{error_str}", style=color)
|
||||
|
||||
# Add URL/plugin info first (more important)
|
||||
if url_extractor_str:
|
||||
text.append(f' | {url_extractor_str}')
|
||||
text.append(f" | {url_extractor_str}")
|
||||
|
||||
# Then add other metadata
|
||||
if metadata_str:
|
||||
text.append(f' | {metadata_str}')
|
||||
text.append(f" | {metadata_str}")
|
||||
|
||||
# Stdout is reserved for JSONL records whenever commands are piped together.
|
||||
# Route worker/DB progress to stderr in non-TTY contexts so pipelines like
|
||||
@@ -640,90 +671,85 @@ def log_worker_event(
|
||||
|
||||
|
||||
@enforce_types
|
||||
def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str:
|
||||
return '\n'.join(
|
||||
f'{folder} {snapshot and snapshot.url} "{snapshot and snapshot.title}"'
|
||||
for folder, snapshot in folders.items()
|
||||
)
|
||||
|
||||
|
||||
|
||||
@enforce_types
|
||||
def printable_config(config: dict, prefix: str='') -> str:
|
||||
return f'\n{prefix}'.join(
|
||||
f'{key}={val}'
|
||||
for key, val in config.items()
|
||||
if not (isinstance(val, dict) or callable(val))
|
||||
)
|
||||
def printable_folders(folders: dict[str, Optional["Snapshot"]], with_headers: bool = False) -> str:
|
||||
return "\n".join(f'{folder} {snapshot and snapshot.url} "{snapshot and snapshot.title}"' for folder, snapshot in folders.items())
|
||||
|
||||
|
||||
@enforce_types
|
||||
def printable_folder_status(name: str, folder: Dict) -> str:
|
||||
if folder['enabled']:
|
||||
if folder['is_valid']:
|
||||
color, symbol, note, num_files = 'green', '√', 'valid', ''
|
||||
def printable_config(config: dict, prefix: str = "") -> str:
|
||||
return f"\n{prefix}".join(f"{key}={val}" for key, val in config.items() if not (isinstance(val, dict) or callable(val)))
|
||||
|
||||
|
||||
@enforce_types
|
||||
def printable_folder_status(name: str, folder: dict) -> str:
|
||||
if folder["enabled"]:
|
||||
if folder["is_valid"]:
|
||||
color, symbol, note, num_files = "green", "√", "valid", ""
|
||||
else:
|
||||
color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
|
||||
color, symbol, note, num_files = "red", "X", "invalid", "?"
|
||||
else:
|
||||
color, symbol, note, num_files = 'grey53', '-', 'unused', '-'
|
||||
color, symbol, note, num_files = "grey53", "-", "unused", "-"
|
||||
|
||||
|
||||
if folder['path']:
|
||||
if os.access(folder['path'], os.R_OK):
|
||||
if folder["path"]:
|
||||
if os.access(folder["path"], os.R_OK):
|
||||
try:
|
||||
num_files = (
|
||||
f'{len(os.listdir(folder["path"]))} files'
|
||||
if os.path.isdir(folder['path']) else
|
||||
printable_filesize(Path(folder['path']).stat().st_size)
|
||||
f"{len(os.listdir(folder['path']))} files"
|
||||
if os.path.isdir(folder["path"])
|
||||
else printable_filesize(Path(folder["path"]).stat().st_size)
|
||||
)
|
||||
except PermissionError:
|
||||
num_files = 'error'
|
||||
num_files = "error"
|
||||
else:
|
||||
num_files = 'missing'
|
||||
|
||||
if folder.get('is_mount'):
|
||||
num_files = "missing"
|
||||
|
||||
if folder.get("is_mount"):
|
||||
# add symbol @ next to filecount if path is a remote filesystem mount
|
||||
num_files = f'{num_files} @' if num_files else '@'
|
||||
num_files = f"{num_files} @" if num_files else "@"
|
||||
|
||||
path = pretty_path(folder['path'])
|
||||
path = pretty_path(folder["path"])
|
||||
|
||||
return ' '.join((
|
||||
f'[{color}]',
|
||||
symbol,
|
||||
'[/]',
|
||||
name.ljust(21).replace('DATA_DIR', '[light_slate_blue]DATA_DIR[/light_slate_blue]'),
|
||||
num_files.ljust(14).replace('missing', '[grey53]missing[/grey53]'),
|
||||
f'[{color}]',
|
||||
note.ljust(8),
|
||||
'[/]',
|
||||
path.ljust(76),
|
||||
))
|
||||
return " ".join(
|
||||
(
|
||||
f"[{color}]",
|
||||
symbol,
|
||||
"[/]",
|
||||
name.ljust(21).replace("DATA_DIR", "[light_slate_blue]DATA_DIR[/light_slate_blue]"),
|
||||
num_files.ljust(14).replace("missing", "[grey53]missing[/grey53]"),
|
||||
f"[{color}]",
|
||||
note.ljust(8),
|
||||
"[/]",
|
||||
path.ljust(76),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def printable_dependency_version(name: str, dependency: Dict) -> str:
|
||||
color, symbol, note, version = 'red', 'X', 'invalid', '?'
|
||||
def printable_dependency_version(name: str, dependency: dict) -> str:
|
||||
color, symbol, note, version = "red", "X", "invalid", "?"
|
||||
|
||||
if dependency['enabled']:
|
||||
if dependency['is_valid']:
|
||||
color, symbol, note = 'green', '√', 'valid'
|
||||
if dependency["enabled"]:
|
||||
if dependency["is_valid"]:
|
||||
color, symbol, note = "green", "√", "valid"
|
||||
|
||||
parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
|
||||
parsed_version_num = re.search(r"[\d\.]+", dependency["version"])
|
||||
if parsed_version_num:
|
||||
version = f'v{parsed_version_num[0]}'
|
||||
version = f"v{parsed_version_num[0]}"
|
||||
else:
|
||||
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
|
||||
color, symbol, note, version = "lightyellow", "-", "disabled", "-"
|
||||
|
||||
path = pretty_path(dependency['path'])
|
||||
path = pretty_path(dependency["path"])
|
||||
|
||||
return ' '.join((
|
||||
ANSI[color],
|
||||
symbol,
|
||||
ANSI['reset'],
|
||||
name.ljust(21),
|
||||
version.ljust(14),
|
||||
ANSI[color],
|
||||
note.ljust(8),
|
||||
ANSI['reset'],
|
||||
path.ljust(76),
|
||||
))
|
||||
return " ".join(
|
||||
(
|
||||
ANSI[color],
|
||||
symbol,
|
||||
ANSI["reset"],
|
||||
name.ljust(21),
|
||||
version.ljust(14),
|
||||
ANSI[color],
|
||||
note.ljust(8),
|
||||
ANSI["reset"],
|
||||
path.ljust(76),
|
||||
),
|
||||
)
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox'
|
||||
__package__ = "archivebox"
|
||||
|
||||
|
||||
import datetime
|
||||
@@ -13,7 +13,7 @@ django_stubs_ext.monkeypatch()
|
||||
|
||||
|
||||
# monkey patch django timezone to add back utc (it was removed in Django 5.0)
|
||||
setattr(timezone, 'utc', datetime.timezone.utc)
|
||||
setattr(timezone, "utc", datetime.UTC)
|
||||
|
||||
# monkey patch django-signals-webhooks to change how it shows up in Admin UI
|
||||
# from signal_webhooks.apps import DjangoSignalWebhooksConfig
|
||||
@@ -28,28 +28,29 @@ setattr(timezone, 'utc', datetime.timezone.utc)
|
||||
|
||||
# Hide site-packages/sonic/client.py:115: SyntaxWarning
|
||||
# https://github.com/xmonader/python-sonic-client/pull/18
|
||||
warnings.filterwarnings("ignore", category=SyntaxWarning, module='sonic')
|
||||
warnings.filterwarnings("ignore", category=SyntaxWarning, module="sonic")
|
||||
|
||||
# Make daphne log requests quieter and esier to read
|
||||
|
||||
# Make daphne log requests quieter and easier to read
|
||||
class ModifiedAccessLogGenerator(access.AccessLogGenerator):
|
||||
"""Clutge workaround until daphne uses the Python logging framework. https://github.com/django/daphne/pull/473/files"""
|
||||
|
||||
|
||||
def write_entry(self, host, date, request, status=None, length=None, ident=None, user=None):
|
||||
|
||||
|
||||
# Ignore noisy requests to staticfiles / favicons / etc.
|
||||
if 'GET /static/' in request:
|
||||
if "GET /static/" in request:
|
||||
return
|
||||
if "GET /health/" in request:
|
||||
return
|
||||
if 'GET /admin/jsi18n/' in request:
|
||||
if "GET /admin/jsi18n/" in request:
|
||||
return
|
||||
if request.endswith("/favicon.ico") or request.endswith("/robots.txt") or request.endswith("/screenshot.png"):
|
||||
return
|
||||
if request.endswith('.css') or request.endswith('.js') or request.endswith('.woff') or request.endswith('.ttf'):
|
||||
if request.endswith(".css") or request.endswith(".js") or request.endswith(".woff") or request.endswith(".ttf"):
|
||||
return
|
||||
if str(status) in ('404', '304'):
|
||||
if str(status) in ("404", "304"):
|
||||
return
|
||||
|
||||
|
||||
# clean up the log format to mostly match the same format as django.conf.settings.LOGGING rich formats
|
||||
self.stream.write(
|
||||
"%s HTTP %s %s %s\n"
|
||||
@@ -58,13 +59,14 @@ class ModifiedAccessLogGenerator(access.AccessLogGenerator):
|
||||
request,
|
||||
status or "-",
|
||||
"localhost" if host.startswith("127.") else host.split(":")[0],
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry # type: ignore
|
||||
|
||||
|
||||
access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry # type: ignore
|
||||
|
||||
|
||||
# fix benedict objects to pretty-print/repr more nicely with rich
|
||||
# https://stackoverflow.com/a/79048811/2156113
|
||||
# https://rich.readthedocs.io/en/stable/pretty.html#rich-repr-protocol
|
||||
benedict.benedict.__rich_repr__ = lambda self: (dict(self),) # type: ignore
|
||||
benedict.benedict.__rich_repr__ = lambda self: (dict(self),) # type: ignore
|
||||
|
||||
@@ -1,30 +1,30 @@
|
||||
__package__ = 'archivebox.misc'
|
||||
__package__ = "archivebox.misc"
|
||||
|
||||
from django.core.paginator import Paginator
|
||||
from django.utils.functional import cached_property
|
||||
|
||||
|
||||
class AccelleratedPaginator(Paginator):
|
||||
class AcceleratedPaginator(Paginator):
|
||||
"""
|
||||
Accellerated Pagniator ignores DISTINCT when counting total number of rows.
|
||||
Accelerated paginator ignores DISTINCT when counting total number of rows.
|
||||
Speeds up SELECT Count(*) on Admin views by >20x.
|
||||
https://hakibenita.com/optimizing-the-django-admin-paginator
|
||||
"""
|
||||
|
||||
@cached_property
|
||||
def count(self):
|
||||
has_filters = getattr(self.object_list, '_has_filters', None)
|
||||
has_filters = getattr(self.object_list, "_has_filters", None)
|
||||
if callable(has_filters) and has_filters():
|
||||
# fallback to normal count method on filtered queryset
|
||||
return super().count
|
||||
|
||||
model = getattr(self.object_list, 'model', None)
|
||||
model = getattr(self.object_list, "model", None)
|
||||
if model is None:
|
||||
return super().count
|
||||
|
||||
# otherwise count total rows in a separate fast query
|
||||
return model.objects.count()
|
||||
|
||||
|
||||
# Alternative approach for PostgreSQL: fallback count takes > 200ms
|
||||
# from django.db import connection, transaction, OperationalError
|
||||
# with transaction.atomic(), connection.cursor() as cursor:
|
||||
|
||||
@@ -3,26 +3,35 @@ import json
|
||||
import re
|
||||
import os
|
||||
import stat
|
||||
import asyncio
|
||||
import posixpath
|
||||
import mimetypes
|
||||
import importlib
|
||||
import queue
|
||||
import threading
|
||||
import time
|
||||
import zipfile
|
||||
from datetime import datetime
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
from urllib.parse import urlencode
|
||||
|
||||
from django.contrib.staticfiles import finders
|
||||
from django.template import TemplateDoesNotExist, loader
|
||||
from django.views import static
|
||||
from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpResponseNotModified
|
||||
from django.utils._os import safe_join
|
||||
from django.utils.http import http_date
|
||||
from django.utils.translation import gettext as _
|
||||
from archivebox.config.common import SERVER_CONFIG
|
||||
from archivebox.misc.logging_util import printable_filesize
|
||||
|
||||
|
||||
_HASHES_CACHE: dict[Path, tuple[float, dict[str, str]]] = {}
|
||||
|
||||
|
||||
def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None:
|
||||
hashes_path = snapshot_dir / 'hashes' / 'hashes.json'
|
||||
hashes_path = snapshot_dir / "hashes" / "hashes.json"
|
||||
if not hashes_path.exists():
|
||||
return None
|
||||
try:
|
||||
@@ -35,11 +44,11 @@ def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None:
|
||||
return cached[1]
|
||||
|
||||
try:
|
||||
data = json.loads(hashes_path.read_text(encoding='utf-8'))
|
||||
data = json.loads(hashes_path.read_text(encoding="utf-8"))
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
file_map = {str(entry.get('path')): entry.get('hash') for entry in data.get('files', []) if entry.get('path')}
|
||||
file_map = {str(entry.get("path")): entry.get("hash") for entry in data.get("files", []) if entry.get("path")}
|
||||
_HASHES_CACHE[hashes_path] = (mtime, file_map)
|
||||
return file_map
|
||||
|
||||
@@ -52,7 +61,192 @@ def _hash_for_path(document_root: Path, rel_path: str) -> str | None:
|
||||
|
||||
|
||||
def _cache_policy() -> str:
    """Return the Cache-Control visibility keyword for served snapshot files.

    Yields ``"public"`` when ``SERVER_CONFIG.PUBLIC_SNAPSHOTS`` is truthy and
    ``"private"`` otherwise.
    """
    if SERVER_CONFIG.PUBLIC_SNAPSHOTS:
        return "public"
    return "private"
|
||||
|
||||
|
||||
def _format_direntry_timestamp(stat_result: os.stat_result) -> str:
|
||||
timestamp = getattr(stat_result, "st_birthtime", None) or stat_result.st_mtime
|
||||
return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M")
|
||||
|
||||
|
||||
def _safe_zip_stem(name: str) -> str:
|
||||
safe_name = re.sub(r"[^A-Za-z0-9._-]+", "-", name).strip("._-")
|
||||
return safe_name or "archivebox"
|
||||
|
||||
|
||||
class _StreamingQueueWriter:
|
||||
"""Expose a write-only file-like object so zipfile can stream into a queue."""
|
||||
|
||||
def __init__(self, output_queue: queue.Queue[bytes | BaseException | object]) -> None:
|
||||
self.output_queue = output_queue
|
||||
self.position = 0
|
||||
|
||||
def write(self, data: bytes) -> int:
|
||||
if data:
|
||||
self.output_queue.put(data)
|
||||
self.position += len(data)
|
||||
return len(data)
|
||||
|
||||
def tell(self) -> int:
|
||||
return self.position
|
||||
|
||||
def flush(self) -> None:
|
||||
return None
|
||||
|
||||
def close(self) -> None:
|
||||
return None
|
||||
|
||||
def writable(self) -> bool:
|
||||
return True
|
||||
|
||||
def seekable(self) -> bool:
|
||||
return False
|
||||
|
||||
|
||||
def _iter_visible_files(root: Path):
|
||||
"""Yield non-hidden files in a stable order so ZIP output is deterministic."""
|
||||
|
||||
for current_root, dirnames, filenames in os.walk(root):
|
||||
dirnames[:] = sorted(dirname for dirname in dirnames if not dirname.startswith("."))
|
||||
for filename in sorted(name for name in filenames if not name.startswith(".")):
|
||||
yield Path(current_root) / filename
|
||||
|
||||
|
||||
def _build_directory_zip_response(
    fullpath: Path,
    path: str,
    *,
    is_archive_replay: bool,
    use_async_stream: bool,
) -> StreamingHttpResponse:
    """Stream the non-hidden files under ``fullpath`` to the client as a ZIP download.

    A producer thread writes the archive through ``_StreamingQueueWriter`` into
    a bounded queue while the response iterator drains it, so the ZIP is never
    assembled as a whole in memory or on disk.

    The producer is started lazily on the first iteration of the response body
    and aborts as soon as the consumer stops iterating (client disconnect,
    response closed, error). Without that, it would block forever on a full
    queue while holding files open.

    Args:
        fullpath: Directory whose visible files are archived.
        path: Request-relative path; only used as a fallback source for the
            archive's root folder / file name.
        is_archive_replay: Forwarded unchanged to ``_apply_archive_replay_headers``.
        use_async_stream: When true, the body is an async generator that pulls
            each chunk from the sync iterator in a worker thread.

    Returns:
        The streaming response after it has been passed through
        ``_apply_archive_replay_headers``.
    """
    root_name = _safe_zip_stem(fullpath.name or Path(path).name or "archivebox")
    sentinel = object()
    consumer_gone = threading.Event()
    initial_chunk_target = 64 * 1024
    initial_chunk_wait = 0.05
    put_poll_interval = 0.25

    class _ConsumerGone(Exception):
        """Raised inside the producer once nobody is reading the stream anymore."""

    class _AbortableQueue(queue.Queue):
        """Queue whose blocking put() gives up once the consumer has gone away."""

        def put(self, item, block=True, timeout=None):
            # ``block``/``timeout`` are accepted for signature compatibility only:
            # the producer always wants "block until there is room or the reader left".
            while not consumer_gone.is_set():
                try:
                    super().put(item, timeout=put_poll_interval)
                    return
                except queue.Full:
                    continue
            raise _ConsumerGone()

    output_queue: queue.Queue[bytes | BaseException | object] = _AbortableQueue(maxsize=8)

    def offer(item: object) -> None:
        # Final messages (error / sentinel) are pointless once the reader is gone.
        try:
            output_queue.put(item)
        except _ConsumerGone:
            pass

    def build_zip() -> None:
        # zipfile wants a write-only file object. Feed those bytes straight into
        # a queue so the response can stream them out as soon as they are ready.
        writer = _StreamingQueueWriter(output_queue)
        try:
            with zipfile.ZipFile(writer, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=6) as zip_file:
                for entry in _iter_visible_files(fullpath):
                    rel_parts = entry.relative_to(fullpath).parts
                    arcname = Path(root_name, *rel_parts).as_posix()
                    zip_file.write(entry, arcname)
        except BaseException as err:
            # Forward failures so the consumer re-raises them in the request thread.
            offer(err)
        finally:
            offer(sentinel)

    def iter_zip_chunks():
        # Start the producer only once somebody actually reads the body, so a
        # response that is never iterated does not leave a blocked thread behind.
        threading.Thread(target=build_zip, name=f"zip-stream-{root_name}", daemon=True).start()

        # Emit a meaningful first chunk quickly so browsers show the download
        # immediately instead of waiting on dozens of tiny ZIP header writes.
        first_chunk = bytearray()
        initial_deadline = time.monotonic() + initial_chunk_wait
        buffering = True

        try:
            while True:
                if buffering:
                    remaining = max(initial_deadline - time.monotonic(), 0)
                    try:
                        chunk = output_queue.get(timeout=remaining)
                    except queue.Empty:
                        # Deadline passed: flush whatever was collected and stop buffering.
                        buffering = False
                        if first_chunk:
                            yield bytes(first_chunk)
                            first_chunk.clear()
                        continue
                else:
                    chunk = output_queue.get()

                if chunk is sentinel:
                    if first_chunk:
                        yield bytes(first_chunk)
                    return
                if isinstance(chunk, BaseException):
                    raise chunk

                if buffering:
                    first_chunk.extend(chunk)
                    if len(first_chunk) >= initial_chunk_target or time.monotonic() >= initial_deadline:
                        buffering = False
                        yield bytes(first_chunk)
                        first_chunk.clear()
                    continue

                yield chunk
        finally:
            # Runs on normal completion, on errors and when the generator is
            # closed or garbage-collected; unblocks the producer in every case.
            consumer_gone.set()

    async def stream_zip_async():
        # Django ASGI buffers sync StreamingHttpResponse iterators by consuming
        # them into a list. Drive the same sync iterator from a worker thread so
        # Daphne can send each chunk as it arrives instead of buffering the ZIP.
        iterator = iter(iter_zip_chunks())
        while True:
            chunk = await asyncio.to_thread(next, iterator, None)
            if chunk is None:
                break
            yield chunk

    response = StreamingHttpResponse(
        stream_zip_async() if use_async_stream else iter_zip_chunks(),
        content_type="application/zip",
    )
    response.headers["Content-Disposition"] = f'attachment; filename="{root_name}.zip"'
    response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
    response.headers["Last-Modified"] = http_date(fullpath.stat().st_mtime)
    response.headers["X-Accel-Buffering"] = "no"
    return _apply_archive_replay_headers(
        response,
        fullpath=fullpath,
        content_type="application/zip",
        is_archive_replay=is_archive_replay,
    )
|
||||
|
||||
|
||||
def _render_directory_index(request, path: str, fullpath: Path) -> HttpResponse:
    """Render an HTML listing of the non-hidden entries in ``fullpath``.

    Uses the ``static/directory_index.html`` (or ``static/directory_index``)
    template when one can be loaded; otherwise delegates to Django's built-in
    ``static.directory_index``.

    Directories are listed first, then files, each group ordered
    case-insensitively by name. The template context contains ``directory``,
    ``file_list`` (entry names, directories suffixed with ``/``), ``entries``
    (name, url, is_dir, size, timestamp per entry) and ``zip_url`` (the current
    URL with ``download=zip`` set in the query string).

    Entries that cannot be stat'ed do not fail the whole listing: a dangling
    symlink is described by its own ``lstat()``, and an entry that vanished
    while the listing was being built is skipped.
    """
    try:
        template = loader.select_template(
            [
                "static/directory_index.html",
                "static/directory_index",
            ],
        )
    except TemplateDoesNotExist:
        return static.directory_index(path, fullpath)

    # Evaluate is_dir() once per entry so sorting, URL suffix and the size
    # column all agree even if the filesystem changes while we are listing.
    visible_entries = sorted(
        ((entry, entry.is_dir()) for entry in fullpath.iterdir() if not entry.name.startswith(".")),
        key=lambda pair: (not pair[1], pair[0].name.lower()),
    )

    entries = []
    file_list = []
    for entry, is_dir in visible_entries:
        try:
            stat_result = entry.stat()
        except OSError:
            try:
                # Dangling symlink: describe the link itself instead of its target.
                stat_result = entry.lstat()
            except OSError:
                # Entry disappeared between iterdir() and now.
                continue

        url = str(entry.relative_to(fullpath))
        if is_dir:
            url += "/"
        file_list.append(url)
        entries.append(
            {
                "name": url,
                "url": url,
                "is_dir": is_dir,
                "size": "—" if is_dir else printable_filesize(stat_result.st_size),
                "timestamp": _format_direntry_timestamp(stat_result),
            },
        )

    # Keep any existing query parameters and force download=zip.
    zip_query = request.GET.copy()
    zip_query["download"] = "zip"
    zip_url = f"{request.path}?{zip_query.urlencode()}"

    context = {
        "directory": f"{path}/",
        "file_list": file_list,
        "entries": entries,
        "zip_url": zip_url,
    }
    return HttpResponse(template.render(context))
|
||||
|
||||
|
||||
# Ensure common web types are mapped consistently across platforms.
|
||||
@@ -71,16 +265,16 @@ mimetypes.add_type("application/xml", ".xml")
|
||||
mimetypes.add_type("image/svg+xml", ".svg")
|
||||
|
||||
try:
|
||||
_markdown = getattr(importlib.import_module('markdown'), 'markdown')
|
||||
_markdown = getattr(importlib.import_module("markdown"), "markdown")
|
||||
except ImportError:
|
||||
_markdown: Callable[..., str] | None = None
|
||||
|
||||
MARKDOWN_INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)')
|
||||
MARKDOWN_INLINE_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
|
||||
MARKDOWN_BOLD_RE = re.compile(r'\*\*([^*]+)\*\*')
|
||||
MARKDOWN_ITALIC_RE = re.compile(r'(?<!\*)\*([^*]+)\*(?!\*)')
|
||||
HTML_TAG_RE = re.compile(r'<[A-Za-z][^>]*>')
|
||||
HTML_BODY_RE = re.compile(r'<body[^>]*>(.*)</body>', flags=re.IGNORECASE | re.DOTALL)
|
||||
MARKDOWN_INLINE_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)")
|
||||
MARKDOWN_INLINE_IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
|
||||
MARKDOWN_BOLD_RE = re.compile(r"\*\*([^*]+)\*\*")
|
||||
MARKDOWN_ITALIC_RE = re.compile(r"(?<!\*)\*([^*]+)\*(?!\*)")
|
||||
HTML_TAG_RE = re.compile(r"<[A-Za-z][^>]*>")
|
||||
HTML_BODY_RE = re.compile(r"<body[^>]*>(.*)</body>", flags=re.IGNORECASE | re.DOTALL)
|
||||
RISKY_REPLAY_MIMETYPES = {
|
||||
"text/html",
|
||||
"application/xhtml+xml",
|
||||
@@ -99,8 +293,8 @@ def _extract_markdown_candidate(text: str) -> str:
|
||||
body_match = HTML_BODY_RE.search(candidate)
|
||||
if body_match:
|
||||
candidate = body_match.group(1)
|
||||
candidate = re.sub(r'^\s*<p[^>]*>', '', candidate, flags=re.IGNORECASE)
|
||||
candidate = re.sub(r'</p>\s*$', '', candidate, flags=re.IGNORECASE)
|
||||
candidate = re.sub(r"^\s*<p[^>]*>", "", candidate, flags=re.IGNORECASE)
|
||||
candidate = re.sub(r"</p>\s*$", "", candidate, flags=re.IGNORECASE)
|
||||
return candidate.strip()
|
||||
|
||||
|
||||
@@ -109,15 +303,115 @@ def _looks_like_markdown(text: str) -> bool:
|
||||
if "<html" in lower and "<head" in lower and "</body>" in lower:
|
||||
return False
|
||||
md_markers = 0
|
||||
md_markers += len(re.findall(r'^\s{0,3}#{1,6}\s+\S', text, flags=re.MULTILINE))
|
||||
md_markers += len(re.findall(r'^\s*[-*+]\s+\S', text, flags=re.MULTILINE))
|
||||
md_markers += len(re.findall(r'^\s*\d+\.\s+\S', text, flags=re.MULTILINE))
|
||||
md_markers += text.count('[TOC]')
|
||||
md_markers += len(re.findall(r"^\s{0,3}#{1,6}\s+\S", text, flags=re.MULTILINE))
|
||||
md_markers += len(re.findall(r"^\s*[-*+]\s+\S", text, flags=re.MULTILINE))
|
||||
md_markers += len(re.findall(r"^\s*\d+\.\s+\S", text, flags=re.MULTILINE))
|
||||
md_markers += text.count("[TOC]")
|
||||
md_markers += len(MARKDOWN_INLINE_LINK_RE.findall(text))
|
||||
md_markers += text.count('\n---') + text.count('\n***')
|
||||
md_markers += text.count("\n---") + text.count("\n***")
|
||||
return md_markers >= 6
|
||||
|
||||
|
||||
def _render_text_preview_document(text: str, title: str) -> str:
|
||||
escaped_title = html.escape(title)
|
||||
escaped_text = html.escape(text)
|
||||
return f"""<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<title>{escaped_title}</title>
|
||||
<style>
|
||||
:root {{
|
||||
color-scheme: dark;
|
||||
}}
|
||||
html, body {{
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
background: #111;
|
||||
color: #f3f3f3;
|
||||
font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;
|
||||
}}
|
||||
.archivebox-text-preview-header {{
|
||||
position: sticky;
|
||||
top: 0;
|
||||
z-index: 1;
|
||||
padding: 10px 14px;
|
||||
font-size: 12px;
|
||||
line-height: 1.4;
|
||||
color: #bbb;
|
||||
background: rgba(17, 17, 17, 0.96);
|
||||
border-bottom: 1px solid rgba(255, 255, 255, 0.08);
|
||||
backdrop-filter: blur(8px);
|
||||
}}
|
||||
.archivebox-text-preview {{
|
||||
margin: 0;
|
||||
padding: 14px;
|
||||
white-space: pre-wrap;
|
||||
word-break: break-word;
|
||||
tab-size: 2;
|
||||
line-height: 1.45;
|
||||
font-size: 13px;
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="archivebox-text-preview-header">{escaped_title}</div>
|
||||
<pre class="archivebox-text-preview">{escaped_text}</pre>
|
||||
</body>
|
||||
</html>"""
|
||||
|
||||
|
||||
def _render_image_preview_document(image_url: str, title: str) -> str:
|
||||
escaped_title = html.escape(title)
|
||||
escaped_url = html.escape(image_url, quote=True)
|
||||
return f"""<!doctype html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1">
|
||||
<title>{escaped_title}</title>
|
||||
<style>
|
||||
:root {{
|
||||
color-scheme: dark;
|
||||
}}
|
||||
html, body {{
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
width: 100%;
|
||||
min-height: 100%;
|
||||
background: #fff;
|
||||
}}
|
||||
body {{
|
||||
overflow: auto;
|
||||
}}
|
||||
.archivebox-image-preview {{
|
||||
width: 100%;
|
||||
min-width: 100%;
|
||||
min-height: 100vh;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
align-items: center;
|
||||
justify-content: flex-start;
|
||||
box-sizing: border-box;
|
||||
}}
|
||||
.archivebox-image-preview img {{
|
||||
display: block;
|
||||
width: auto;
|
||||
max-width: 100%;
|
||||
height: auto;
|
||||
margin: 0 auto;
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="archivebox-image-preview">
|
||||
<img src="{escaped_url}" alt="{escaped_title}">
|
||||
</div>
|
||||
</body>
|
||||
</html>"""
|
||||
|
||||
|
||||
def _render_markdown_fallback(text: str) -> str:
|
||||
if _markdown is not None and not HTML_TAG_RE.search(text):
|
||||
try:
|
||||
@@ -133,11 +427,11 @@ def _render_markdown_fallback(text: str) -> str:
|
||||
headings = []
|
||||
|
||||
def slugify(value: str) -> str:
|
||||
slug = re.sub(r'[^A-Za-z0-9]+', '-', value).strip('-')
|
||||
slug = re.sub(r"[^A-Za-z0-9]+", "-", value).strip("-")
|
||||
return slug or "section"
|
||||
|
||||
for raw_line in lines:
|
||||
heading_match = re.match(r'^\s{0,3}(#{1,6})\s+(.*)$', raw_line)
|
||||
heading_match = re.match(r"^\s{0,3}(#{1,6})\s+(.*)$", raw_line)
|
||||
if heading_match:
|
||||
level = len(heading_match.group(1))
|
||||
content = heading_match.group(2).strip()
|
||||
@@ -152,8 +446,8 @@ def _render_markdown_fallback(text: str) -> str:
|
||||
def render_inline(markup: str) -> str:
|
||||
content = MARKDOWN_INLINE_IMAGE_RE.sub(r'<img alt="\1" src="\2">', markup)
|
||||
content = MARKDOWN_INLINE_LINK_RE.sub(r'<a href="\2">\1</a>', content)
|
||||
content = MARKDOWN_BOLD_RE.sub(r'<strong>\1</strong>', content)
|
||||
content = MARKDOWN_ITALIC_RE.sub(r'<em>\1</em>', content)
|
||||
content = MARKDOWN_BOLD_RE.sub(r"<strong>\1</strong>", content)
|
||||
content = MARKDOWN_ITALIC_RE.sub(r"<em>\1</em>", content)
|
||||
return content
|
||||
|
||||
def close_lists():
|
||||
@@ -194,7 +488,7 @@ def _render_markdown_fallback(text: str) -> str:
|
||||
html_lines.append("<br/>")
|
||||
continue
|
||||
|
||||
heading_match = re.match(r'^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$', line)
|
||||
heading_match = re.match(r"^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$", line)
|
||||
if heading_match:
|
||||
close_lists()
|
||||
if in_blockquote:
|
||||
@@ -205,7 +499,7 @@ def _render_markdown_fallback(text: str) -> str:
|
||||
content = heading_match.group(3).strip()
|
||||
if leading_tags:
|
||||
html_lines.append(leading_tags)
|
||||
html_lines.append(f"<h{level} id=\"{slugify(content)}\">{render_inline(content)}</h{level}>")
|
||||
html_lines.append(f'<h{level} id="{slugify(content)}">{render_inline(content)}</h{level}>')
|
||||
continue
|
||||
|
||||
if stripped in ("---", "***"):
|
||||
@@ -226,7 +520,7 @@ def _render_markdown_fallback(text: str) -> str:
|
||||
html_lines.append("</blockquote>")
|
||||
in_blockquote = False
|
||||
|
||||
ul_match = re.match(r'^\s*[-*+]\s+(.*)$', line)
|
||||
ul_match = re.match(r"^\s*[-*+]\s+(.*)$", line)
|
||||
if ul_match:
|
||||
if in_ol:
|
||||
html_lines.append("</ol>")
|
||||
@@ -237,7 +531,7 @@ def _render_markdown_fallback(text: str) -> str:
|
||||
html_lines.append(f"<li>{render_inline(ul_match.group(1))}</li>")
|
||||
continue
|
||||
|
||||
ol_match = re.match(r'^\s*\d+\.\s+(.*)$', line)
|
||||
ol_match = re.match(r"^\s*\d+\.\s+(.*)$", line)
|
||||
if ol_match:
|
||||
if in_ul:
|
||||
html_lines.append("</ul>")
|
||||
@@ -255,10 +549,10 @@ def _render_markdown_fallback(text: str) -> str:
|
||||
toc_items = []
|
||||
for level, title, slug in headings:
|
||||
toc_items.append(
|
||||
f'<li class="toc-level-{level}"><a href="#{slug}">{title}</a></li>'
|
||||
f'<li class="toc-level-{level}"><a href="#{slug}">{title}</a></li>',
|
||||
)
|
||||
html_lines.append(
|
||||
'<nav class="toc"><ul>' + "".join(toc_items) + '</ul></nav>'
|
||||
'<nav class="toc"><ul>' + "".join(toc_items) + "</ul></nav>",
|
||||
)
|
||||
continue
|
||||
|
||||
@@ -276,8 +570,8 @@ def _render_markdown_fallback(text: str) -> str:
|
||||
def _render_markdown_document(markdown_text: str) -> str:
|
||||
body = _render_markdown_fallback(markdown_text)
|
||||
wrapped = (
|
||||
"<!doctype html><html><head><meta charset=\"utf-8\">"
|
||||
"<meta name=\"viewport\" content=\"width=device-width,initial-scale=1\">"
|
||||
'<!doctype html><html><head><meta charset="utf-8">'
|
||||
'<meta name="viewport" content="width=device-width,initial-scale=1">'
|
||||
"<style>body{max-width:900px;margin:24px auto;padding:0 16px;"
|
||||
"font-family:system-ui,-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif;"
|
||||
"line-height:1.55;} img{max-width:100%;} pre{background:#f6f6f6;padding:12px;overflow:auto;}"
|
||||
@@ -338,7 +632,7 @@ def _apply_archive_replay_headers(response: HttpResponse, *, fullpath: Path, con
|
||||
return response
|
||||
|
||||
|
||||
def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False, is_archive_replay: bool=False):
|
||||
def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False, is_archive_replay: bool = False):
|
||||
"""
|
||||
Overrides Django's built-in django.views.static.serve function to support byte range requests.
|
||||
This allows you to do things like seek into the middle of a huge mp4 or WACZ without downloading the whole file.
|
||||
@@ -348,13 +642,20 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
|
||||
path = posixpath.normpath(path).lstrip("/")
|
||||
fullpath = Path(safe_join(document_root, path))
|
||||
if os.access(fullpath, os.R_OK) and fullpath.is_dir():
|
||||
if request.GET.get("download") == "zip" and show_indexes:
|
||||
return _build_directory_zip_response(
|
||||
fullpath,
|
||||
path,
|
||||
is_archive_replay=is_archive_replay,
|
||||
use_async_stream=hasattr(request, "scope"),
|
||||
)
|
||||
if show_indexes:
|
||||
response = static.directory_index(path, fullpath)
|
||||
response = _render_directory_index(request, path, fullpath)
|
||||
return _apply_archive_replay_headers(response, fullpath=fullpath, content_type="text/html", is_archive_replay=is_archive_replay)
|
||||
raise Http404(_("Directory indexes are not allowed here."))
|
||||
if not os.access(fullpath, os.R_OK):
|
||||
raise Http404(_("“%(path)s” does not exist") % {"path": fullpath})
|
||||
|
||||
|
||||
statobj = fullpath.stat()
|
||||
document_root = Path(document_root) if document_root else None
|
||||
rel_path = path
|
||||
@@ -374,27 +675,91 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
|
||||
not_modified.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
|
||||
not_modified.headers["Last-Modified"] = http_date(statobj.st_mtime)
|
||||
return _apply_archive_replay_headers(not_modified, fullpath=fullpath, content_type="", is_archive_replay=is_archive_replay)
|
||||
|
||||
|
||||
content_type, encoding = mimetypes.guess_type(str(fullpath))
|
||||
content_type = content_type or "application/octet-stream"
|
||||
# Add charset for text-like types (best guess), but don't override the type.
|
||||
is_text_like = (
|
||||
content_type.startswith("text/")
|
||||
or content_type in {
|
||||
"application/json",
|
||||
"application/javascript",
|
||||
"application/xml",
|
||||
"application/x-ndjson",
|
||||
"image/svg+xml",
|
||||
}
|
||||
)
|
||||
is_text_like = content_type.startswith("text/") or content_type in {
|
||||
"application/json",
|
||||
"application/javascript",
|
||||
"application/xml",
|
||||
"application/x-ndjson",
|
||||
"image/svg+xml",
|
||||
}
|
||||
if is_text_like and "charset=" not in content_type:
|
||||
content_type = f"{content_type}; charset=utf-8"
|
||||
preview_as_text_html = (
|
||||
bool(request.GET.get("preview"))
|
||||
and is_text_like
|
||||
and not content_type.startswith("text/html")
|
||||
and not content_type.startswith("image/svg+xml")
|
||||
)
|
||||
preview_as_image_html = (
|
||||
bool(request.GET.get("preview")) and content_type.startswith("image/") and not content_type.startswith("image/svg+xml")
|
||||
)
|
||||
|
||||
# Respect the If-Modified-Since header for non-markdown responses.
|
||||
if not (content_type.startswith("text/plain") or content_type.startswith("text/html")):
|
||||
if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
|
||||
return _apply_archive_replay_headers(HttpResponseNotModified(), fullpath=fullpath, content_type=content_type, is_archive_replay=is_archive_replay)
|
||||
return _apply_archive_replay_headers(
|
||||
HttpResponseNotModified(),
|
||||
fullpath=fullpath,
|
||||
content_type=content_type,
|
||||
is_archive_replay=is_archive_replay,
|
||||
)
|
||||
|
||||
# Wrap text-like outputs in HTML when explicitly requested for iframe previewing.
|
||||
if preview_as_text_html:
|
||||
try:
|
||||
max_preview_size = 10 * 1024 * 1024
|
||||
if statobj.st_size <= max_preview_size:
|
||||
decoded = fullpath.read_text(encoding="utf-8", errors="replace")
|
||||
wrapped = _render_text_preview_document(decoded, fullpath.name)
|
||||
response = HttpResponse(wrapped, content_type="text/html; charset=utf-8")
|
||||
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
|
||||
if etag:
|
||||
response.headers["ETag"] = etag
|
||||
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
|
||||
else:
|
||||
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
|
||||
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
|
||||
if encoding:
|
||||
response.headers["Content-Encoding"] = encoding
|
||||
return _apply_archive_replay_headers(
|
||||
response,
|
||||
fullpath=fullpath,
|
||||
content_type="text/html; charset=utf-8",
|
||||
is_archive_replay=is_archive_replay,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
if preview_as_image_html:
|
||||
try:
|
||||
preview_query = request.GET.copy()
|
||||
preview_query.pop("preview", None)
|
||||
raw_image_url = request.path
|
||||
if preview_query:
|
||||
raw_image_url = f"{raw_image_url}?{urlencode(list(preview_query.lists()), doseq=True)}"
|
||||
wrapped = _render_image_preview_document(raw_image_url, fullpath.name)
|
||||
response = HttpResponse(wrapped, content_type="text/html; charset=utf-8")
|
||||
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
|
||||
if etag:
|
||||
response.headers["ETag"] = etag
|
||||
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
|
||||
else:
|
||||
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
|
||||
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
|
||||
if encoding:
|
||||
response.headers["Content-Encoding"] = encoding
|
||||
return _apply_archive_replay_headers(
|
||||
response,
|
||||
fullpath=fullpath,
|
||||
content_type="text/html; charset=utf-8",
|
||||
is_archive_replay=is_archive_replay,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# Heuristic fix: some archived HTML outputs (e.g. mercury content.html)
|
||||
# are stored with HTML-escaped markup or markdown sources. If so, render sensibly.
|
||||
@@ -421,7 +786,12 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
|
||||
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
|
||||
if encoding:
|
||||
response.headers["Content-Encoding"] = encoding
|
||||
return _apply_archive_replay_headers(response, fullpath=fullpath, content_type="text/html; charset=utf-8", is_archive_replay=is_archive_replay)
|
||||
return _apply_archive_replay_headers(
|
||||
response,
|
||||
fullpath=fullpath,
|
||||
content_type="text/html; charset=utf-8",
|
||||
is_archive_replay=is_archive_replay,
|
||||
)
|
||||
if escaped_count and escaped_count > tag_count * 2:
|
||||
response = HttpResponse(decoded, content_type=content_type)
|
||||
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
|
||||
@@ -433,11 +803,16 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
|
||||
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
|
||||
if encoding:
|
||||
response.headers["Content-Encoding"] = encoding
|
||||
return _apply_archive_replay_headers(response, fullpath=fullpath, content_type=content_type, is_archive_replay=is_archive_replay)
|
||||
return _apply_archive_replay_headers(
|
||||
response,
|
||||
fullpath=fullpath,
|
||||
content_type=content_type,
|
||||
is_archive_replay=is_archive_replay,
|
||||
)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
# setup resposne object
|
||||
# setup response object
|
||||
ranged_file = RangedFileReader(open(fullpath, "rb"))
|
||||
response = StreamingHttpResponse(ranged_file, content_type=content_type)
|
||||
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
|
||||
@@ -451,7 +826,7 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
|
||||
if content_type.startswith("image/"):
|
||||
response.headers["Cache-Control"] = "public, max-age=604800, immutable"
|
||||
|
||||
# handle byte-range requests by serving chunk of file
|
||||
# handle byte-range requests by serving chunk of file
|
||||
if stat.S_ISREG(statobj.st_mode):
|
||||
size = statobj.st_size
|
||||
response["Content-Length"] = size
|
||||
@@ -460,7 +835,7 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
|
||||
# Respect the Range header.
|
||||
if "HTTP_RANGE" in request.META:
|
||||
try:
|
||||
ranges = parse_range_header(request.META['HTTP_RANGE'], size)
|
||||
ranges = parse_range_header(request.META["HTTP_RANGE"], size)
|
||||
except ValueError:
|
||||
ranges = None
|
||||
# only handle syntactically valid headers, that are simple (no
|
||||
@@ -511,7 +886,7 @@ def parse_range_header(header, resource_size):
|
||||
Parses a range header into a list of two-tuples (start, stop) where `start`
|
||||
is the starting byte of the range (inclusive) and `stop` is the ending byte
|
||||
position of the range (exclusive).
|
||||
Returns None if the value of the header is not syntatically valid.
|
||||
Returns None if the value of the header is not syntactically valid.
|
||||
https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d
|
||||
"""
|
||||
if not header or "=" not in header:
|
||||
|
||||
@@ -1,57 +1,63 @@
|
||||
__package__ = 'archivebox.core'
|
||||
__package__ = "archivebox.core"
|
||||
|
||||
from rich.console import Console
|
||||
|
||||
# helpful imports that make the shell easier to work with out-of-the-box:
|
||||
import re # noqa
|
||||
import os # noqa
|
||||
import sys # noqa
|
||||
import json # noqa
|
||||
import psutil # noqa
|
||||
import django # noqa
|
||||
import pydantic # noqa
|
||||
import requests # noqa
|
||||
import subprocess # noqa
|
||||
import archivebox # noqa
|
||||
from benedict import benedict # noqa
|
||||
from django.utils import timezone # noqa
|
||||
from datetime import datetime, timedelta # noqa
|
||||
from django.conf import settings # noqa
|
||||
import re # noqa
|
||||
import os # noqa
|
||||
import sys # noqa
|
||||
import json # noqa
|
||||
import psutil # noqa
|
||||
import django # noqa
|
||||
import pydantic # noqa
|
||||
import requests # noqa
|
||||
import subprocess # noqa
|
||||
import archivebox
|
||||
from benedict import benedict # noqa
|
||||
from django.utils import timezone # noqa
|
||||
from datetime import datetime, timedelta # noqa
|
||||
from django.conf import settings # noqa
|
||||
|
||||
from archivebox import CONSTANTS # noqa
|
||||
from archivebox.cli import * # noqa
|
||||
from archivebox import CONSTANTS # noqa
|
||||
from archivebox.cli import * # noqa
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
CONFIG = get_config()
|
||||
|
||||
if __name__ == '__main__':
|
||||
if __name__ == "__main__":
|
||||
# load the rich extension for ipython for pretty printing
|
||||
# https://rich.readthedocs.io/en/stable/introduction.html#ipython-extension
|
||||
get_ipython().run_line_magic('load_ext', 'rich') # type: ignore # noqa
|
||||
get_ipython().run_line_magic("load_ext", "rich") # type: ignore # noqa
|
||||
|
||||
# prnt = print with cropping using ... ellipsis for helptext that doens't matter that much
|
||||
# prnt = print with cropping using ... ellipsis for helptext that doesn't matter that much
|
||||
console = Console()
|
||||
prnt = lambda *args, **kwargs: console.print(*args, overflow='ellipsis', soft_wrap=True, **kwargs)
|
||||
|
||||
prnt = lambda *args, **kwargs: console.print(*args, overflow="ellipsis", soft_wrap=True, **kwargs)
|
||||
|
||||
# print the welcome message
|
||||
prnt('[green]import re, os, sys, psutil, subprocess, requests, json, pydantic, benedict, django[/]')
|
||||
prnt('[yellow4]# ArchiveBox Imports[/]')
|
||||
prnt('[yellow4]import archivebox[/]')
|
||||
prnt('[yellow4]from archivebox.cli import *[/]')
|
||||
prnt("[green]import re, os, sys, psutil, subprocess, requests, json, pydantic, benedict, django[/]")
|
||||
prnt("[yellow4]# ArchiveBox Imports[/]")
|
||||
prnt("[yellow4]import archivebox[/]")
|
||||
prnt("[yellow4]from archivebox.cli import *[/]")
|
||||
prnt()
|
||||
|
||||
|
||||
if console.width >= 80:
|
||||
from archivebox.misc.logging import rainbow
|
||||
|
||||
prnt(rainbow(archivebox.ASCII_LOGO))
|
||||
|
||||
prnt('[i] :heavy_dollar_sign: Welcome to the ArchiveBox Shell!')
|
||||
prnt(' [deep_sky_blue4]Docs:[/deep_sky_blue4] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage[/link]')
|
||||
prnt(' [link=https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html]https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html[/link]')
|
||||
|
||||
prnt("[i] :heavy_dollar_sign: Welcome to the ArchiveBox Shell!")
|
||||
prnt(
|
||||
" [deep_sky_blue4]Docs:[/deep_sky_blue4] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage[/link]",
|
||||
)
|
||||
prnt(
|
||||
" [link=https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html]https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html[/link]",
|
||||
)
|
||||
prnt()
|
||||
prnt(' :grey_question: [violet]Hint[/] [i]Here are some examples to get started:[/]')
|
||||
prnt(' add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ? after anything to get help[/]')
|
||||
prnt(" :grey_question: [violet]Hint[/] [i]Here are some examples to get started:[/]")
|
||||
prnt(
|
||||
" add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ? after anything to get help[/]",
|
||||
)
|
||||
prnt(' add("https://example.com/some/new/url") [grey53]# call CLI methods from the shell[/]')
|
||||
prnt(' snap = Snapshot.objects.filter(url__contains="https://example.com").last() [grey53]# query for individual snapshots[/]')
|
||||
prnt(' snap.archiveresult_set.all() [grey53]# see extractor plugin results[/]')
|
||||
prnt(" snap.archiveresult_set.all() [grey53]# see extractor plugin results[/]")
|
||||
prnt(' bool(re.compile(CONFIG.URL_DENYLIST).search("https://example.com/abc.exe")) [grey53]# test out a config change[/]')
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
__package__ = 'archivebox.misc'
|
||||
__package__ = "archivebox.misc"
|
||||
|
||||
|
||||
import os
|
||||
@@ -8,7 +8,6 @@ import sys
|
||||
|
||||
from json import dump
|
||||
from pathlib import Path
|
||||
from typing import Optional, Union, Tuple
|
||||
from subprocess import PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired
|
||||
|
||||
from atomicwrites import atomic_write as lib_atomic_write
|
||||
@@ -16,29 +15,30 @@ from atomicwrites import atomic_write as lib_atomic_write
|
||||
from archivebox.config.common import STORAGE_CONFIG
|
||||
from archivebox.misc.util import enforce_types, ExtendedEncoder
|
||||
|
||||
IS_WINDOWS = os.name == 'nt'
|
||||
IS_WINDOWS = os.name == "nt"
|
||||
|
||||
|
||||
def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):
|
||||
"""Patched of subprocess.run to kill forked child subprocesses and fix blocking io making timeout=innefective
|
||||
Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py
|
||||
Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py
|
||||
"""
|
||||
|
||||
cmd = [str(arg) for arg in cmd]
|
||||
|
||||
if input is not None:
|
||||
if kwargs.get('stdin') is not None:
|
||||
raise ValueError('stdin and input arguments may not both be used.')
|
||||
kwargs['stdin'] = PIPE
|
||||
if kwargs.get("stdin") is not None:
|
||||
raise ValueError("stdin and input arguments may not both be used.")
|
||||
kwargs["stdin"] = PIPE
|
||||
|
||||
if capture_output:
|
||||
if ('stdout' in kwargs) or ('stderr' in kwargs):
|
||||
raise ValueError('stdout and stderr arguments may not be used with capture_output.')
|
||||
kwargs['stdout'] = PIPE
|
||||
kwargs['stderr'] = PIPE
|
||||
if ("stdout" in kwargs) or ("stderr" in kwargs):
|
||||
raise ValueError("stdout and stderr arguments may not be used with capture_output.")
|
||||
kwargs["stdout"] = PIPE
|
||||
kwargs["stderr"] = PIPE
|
||||
|
||||
pgid = None
|
||||
try:
|
||||
if isinstance(cmd, (list, tuple)) and cmd[0].endswith('.py'):
|
||||
if isinstance(cmd, (list, tuple)) and cmd[0].endswith(".py"):
|
||||
PYTHON_BINARY = sys.executable
|
||||
cmd = (PYTHON_BINARY, *cmd)
|
||||
|
||||
@@ -69,8 +69,12 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False,
|
||||
|
||||
retcode = process.poll()
|
||||
if check and retcode:
|
||||
raise CalledProcessError(retcode, process.args,
|
||||
output=stdout, stderr=stderr)
|
||||
raise CalledProcessError(
|
||||
retcode,
|
||||
process.args,
|
||||
output=stdout,
|
||||
stderr=stderr,
|
||||
)
|
||||
finally:
|
||||
# force kill any straggler subprocesses that were forked from the main proc
|
||||
try:
|
||||
@@ -83,11 +87,11 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False,
|
||||
|
||||
|
||||
@enforce_types
|
||||
def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
|
||||
def atomic_write(path: Path | str, contents: dict | str | bytes, overwrite: bool = True) -> None:
|
||||
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
|
||||
|
||||
mode = 'wb+' if isinstance(contents, bytes) else 'w'
|
||||
encoding = None if isinstance(contents, bytes) else 'utf-8' # enforce utf-8 on all text writes
|
||||
mode = "wb+" if isinstance(contents, bytes) else "w"
|
||||
encoding = None if isinstance(contents, bytes) else "utf-8" # enforce utf-8 on all text writes
|
||||
|
||||
# print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
|
||||
try:
|
||||
@@ -99,8 +103,12 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
|
||||
except OSError as e:
|
||||
if STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES:
|
||||
print(f"[X] OSError: Failed to write {path} with fcntl.F_FULLFSYNC. ({e})")
|
||||
print(" You can store the archive/ subfolder on a hard drive or network share that doesn't support support syncronous writes,")
|
||||
print(" but the main folder containing the index.sqlite3 and ArchiveBox.conf files must be on a filesystem that supports FSYNC.")
|
||||
print(
|
||||
" You can store the archive/ subfolder on a hard drive or network share that doesn't support support synchronous writes,",
|
||||
)
|
||||
print(
|
||||
" but the main folder containing the index.sqlite3 and ArchiveBox.conf files must be on a filesystem that supports FSYNC.",
|
||||
)
|
||||
raise SystemExit(1)
|
||||
|
||||
# retry the write without forcing FSYNC (aka atomic mode)
|
||||
@@ -113,19 +121,20 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
|
||||
# set file permissions
|
||||
os.chmod(path, int(STORAGE_CONFIG.OUTPUT_PERMISSIONS, base=8))
|
||||
|
||||
|
||||
@enforce_types
|
||||
def chmod_file(path: str, cwd: str='') -> None:
|
||||
def chmod_file(path: str, cwd: str = "") -> None:
|
||||
"""chmod -R <permissions> <cwd>/<path>"""
|
||||
|
||||
root = Path(cwd or os.getcwd()) / path
|
||||
if not os.access(root, os.R_OK):
|
||||
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
|
||||
raise Exception(f"Failed to chmod: {path} does not exist (did the previous step fail?)")
|
||||
|
||||
if not root.is_dir():
|
||||
# path is just a plain file
|
||||
os.chmod(root, int(STORAGE_CONFIG.OUTPUT_PERMISSIONS, base=8))
|
||||
else:
|
||||
for subpath in Path(path).glob('**/*'):
|
||||
for subpath in Path(path).glob("**/*"):
|
||||
if subpath.is_dir():
|
||||
# directories need execute permissions to be able to list contents
|
||||
os.chmod(subpath, int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8))
|
||||
@@ -134,24 +143,24 @@ def chmod_file(path: str, cwd: str='') -> None:
|
||||
|
||||
|
||||
@enforce_types
|
||||
def copy_and_overwrite(from_path: Union[str, Path], to_path: Union[str, Path]):
|
||||
def copy_and_overwrite(from_path: str | Path, to_path: str | Path):
|
||||
"""copy a given file or directory to a given path, overwriting the destination"""
|
||||
|
||||
|
||||
assert os.access(from_path, os.R_OK)
|
||||
|
||||
|
||||
if Path(from_path).is_dir():
|
||||
shutil.rmtree(to_path, ignore_errors=True)
|
||||
shutil.copytree(from_path, to_path)
|
||||
else:
|
||||
with open(from_path, 'rb') as src:
|
||||
with open(from_path, "rb") as src:
|
||||
contents = src.read()
|
||||
atomic_write(to_path, contents)
|
||||
|
||||
|
||||
@enforce_types
|
||||
def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]:
|
||||
"""get the total disk size of a given directory, optionally summing up
|
||||
recursively and limiting to a given filter list
|
||||
def get_dir_size(path: str | Path, recursive: bool = True, pattern: str | None = None) -> tuple[int, int, int]:
|
||||
"""get the total disk size of a given directory, optionally summing up
|
||||
recursively and limiting to a given filter list
|
||||
"""
|
||||
num_bytes, num_dirs, num_files = 0, 0, 0
|
||||
try:
|
||||
@@ -174,20 +183,21 @@ def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional
|
||||
pass
|
||||
return num_bytes, num_dirs, num_files
|
||||
|
||||
class suppress_output(object):
|
||||
|
||||
class suppress_output:
|
||||
"""
|
||||
A context manager for doing a "deep suppression" of stdout and stderr in
|
||||
Python, i.e. will suppress all print, even if the print originates in a
|
||||
A context manager for doing a "deep suppression" of stdout and stderr in
|
||||
Python, i.e. will suppress all print, even if the print originates in a
|
||||
compiled C/Fortran sub-function.
|
||||
|
||||
|
||||
This will not suppress raised exceptions, since exceptions are printed
|
||||
to stderr just before a script exits, and after the context manager has
|
||||
exited (at least, I think that is why it lets exceptions through).
|
||||
exited (at least, I think that is why it lets exceptions through).
|
||||
|
||||
with suppress_stdout_stderr():
|
||||
rogue_function()
|
||||
"""
|
||||
|
||||
|
||||
def __init__(self, stdout=True, stderr=True):
|
||||
# Open a pair of null files
|
||||
# Save the actual stdout (1) and stderr (2) file descriptors.
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
from typing import Any, List, Callable, cast
|
||||
from typing import Any, cast
|
||||
from collections.abc import Callable
|
||||
|
||||
import json
|
||||
import ast
|
||||
@@ -12,15 +13,16 @@ from pathlib import Path, PosixPath
|
||||
from pydantic.json_schema import GenerateJsonSchema
|
||||
from pydantic_core import to_jsonable_python
|
||||
|
||||
JSONValue = str | bool | int | None | List['JSONValue']
|
||||
JSONValue = str | bool | int | None | list["JSONValue"]
|
||||
|
||||
TOML_HEADER = "# Converted from INI to TOML format: https://toml.io/en/\n\n"
|
||||
|
||||
|
||||
def load_ini_value(val: str) -> JSONValue:
|
||||
"""Convert lax INI values into strict TOML-compliant (JSON) values"""
|
||||
if val.lower() in ('true', 'yes', '1'):
|
||||
if val.lower() in ("true", "yes", "1"):
|
||||
return True
|
||||
if val.lower() in ('false', 'no', '0'):
|
||||
if val.lower() in ("false", "no", "0"):
|
||||
return False
|
||||
if val.isdigit():
|
||||
return int(val)
|
||||
@@ -34,7 +36,7 @@ def load_ini_value(val: str) -> JSONValue:
|
||||
return json.loads(val)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
|
||||
return val
|
||||
|
||||
|
||||
@@ -42,7 +44,7 @@ def convert(ini_str: str) -> str:
|
||||
"""Convert a string of INI config into its TOML equivalent (warning: strips comments)"""
|
||||
|
||||
config = configparser.ConfigParser()
|
||||
setattr(config, 'optionxform', str) # capitalize key names
|
||||
setattr(config, "optionxform", str) # capitalize key names
|
||||
config.read_string(ini_str)
|
||||
|
||||
# Initialize an empty dictionary to store the TOML representation
|
||||
@@ -70,22 +72,22 @@ def convert(ini_str: str) -> str:
|
||||
return toml_str.strip()
|
||||
|
||||
|
||||
|
||||
class JSONSchemaWithLambdas(GenerateJsonSchema):
|
||||
"""
|
||||
Encode lambda functions in default values properly.
|
||||
Usage:
|
||||
>>> json.dumps(value, encoder=JSONSchemaWithLambdas())
|
||||
"""
|
||||
|
||||
def encode_default(self, dft: Any) -> Any:
|
||||
config = self._config
|
||||
if isinstance(dft, Callable):
|
||||
return '{{lambda ' + inspect.getsource(dft).split('=lambda ')[-1].strip()[:-1] + '}}'
|
||||
return "{{lambda " + inspect.getsource(dft).split("=lambda ")[-1].strip()[:-1] + "}}"
|
||||
return to_jsonable_python(
|
||||
dft,
|
||||
timedelta_mode=config.ser_json_timedelta,
|
||||
bytes_mode=config.ser_json_bytes,
|
||||
serialize_unknown=True
|
||||
serialize_unknown=True,
|
||||
)
|
||||
|
||||
# for computed_field properties render them like this instead:
|
||||
@@ -94,19 +96,21 @@ class JSONSchemaWithLambdas(GenerateJsonSchema):
|
||||
|
||||
def better_toml_dump_str(val: Any) -> str:
    """Encode ``val`` with toml's private ``_dump_str`` helper, falling back to JSON.

    Any exception raised while looking up or calling the toml helper is
    swallowed on purpose and replaced by ``json.dumps(str(val))``.
    """
    try:
        dump_str = cast(Callable[[Any], str], getattr(toml.encoder, "_dump_str"))
        return dump_str(val)
    except Exception:
        # if we hit any of toml's numerous encoding bugs,
        # fall back to using json representation of string
        return json.dumps(str(val))
|
||||
|
||||
|
||||
class CustomTOMLEncoder(toml.encoder.TomlEncoder):
|
||||
"""
|
||||
Custom TomlEncoder to work around https://github.com/uiri/toml's many encoding bugs.
|
||||
More info: https://github.com/fabiocaccamo/python-benedict/issues/439
|
||||
>>> toml.dumps(value, encoder=CustomTOMLEncoder())
|
||||
"""
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
dump_funcs = cast(dict[Any, Callable[[Any], str]], self.dump_funcs)
|
||||
|
||||
@@ -1,12 +1,14 @@
|
||||
__package__ = 'archivebox.misc'
|
||||
__package__ = "archivebox.misc"
|
||||
|
||||
import re
|
||||
import requests
|
||||
import json as pyjson
|
||||
import http.cookiejar
|
||||
from decimal import Decimal, InvalidOperation
|
||||
from dateparser import parse as dateparser
|
||||
|
||||
from typing import List, Optional, Any, Callable
|
||||
from typing import Any
|
||||
from collections.abc import Callable
|
||||
from pathlib import Path
|
||||
from inspect import signature
|
||||
from functools import wraps
|
||||
@@ -18,8 +20,10 @@ from requests.exceptions import RequestException, ReadTimeout
|
||||
|
||||
from base32_crockford import encode as base32_encode
|
||||
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
|
||||
|
||||
# chardet is optional: when it cannot be imported every payload is reported as utf-8
try:
    import chardet  # type:ignore

    detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"]
except ImportError:
    detect_encoding = lambda rawdata: "utf-8"
|
||||
@@ -35,57 +39,135 @@ from .logging import COLOR_DICT
|
||||
# All of these are (str) -> str
|
||||
# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing
|
||||
scheme = lambda url: urlparse(url).scheme.lower()
# NOTE: .strip("//") strips every leading *and* trailing "/" character, not just a "//" prefix
without_scheme = lambda url: urlparse(url)._replace(scheme="").geturl().strip("//")
without_query = lambda url: urlparse(url)._replace(query="").geturl().strip("//")
without_fragment = lambda url: urlparse(url)._replace(fragment="").geturl().strip("//")
without_path = lambda url: urlparse(url)._replace(path="", fragment="", query="").geturl().strip("//")
path = lambda url: urlparse(url).path
basename = lambda url: urlparse(url).path.rsplit("/", 1)[-1]
domain = lambda url: urlparse(url).netloc
query = lambda url: urlparse(url).query
fragment = lambda url: urlparse(url).fragment
extension = lambda url: basename(url).rsplit(".", 1)[-1].lower() if "." in basename(url) else ""
base_url = lambda url: without_scheme(url)  # uniq base url used to dedupe links

without_www = lambda url: url.replace("://www.", "://", 1)
# endswith() instead of url[-1] so an empty string is returned unchanged instead of raising IndexError
without_trailing_slash = lambda url: url[:-1] if url.endswith("/") else url.replace("/?", "?")
hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode("utf-8")).hexdigest(), 16))[:20]

# the `s and ...` guards pass falsy input (None, "") straight through
urlencode = lambda s: s and quote(s, encoding="utf-8", errors="replace")
urldecode = lambda s: s and unquote(s)
htmlencode = lambda s: s and escape(s, quote=True)
htmldecode = lambda s: s and unescape(s)
|
||||
|
||||
|
||||
def short_ts(ts: Any) -> str | None:
    """Return the Unix timestamp of ``ts`` as a string with the fractional part cut off.

    ``ts`` is passed through ``parse_date``; ``None`` is returned when that yields ``None``.
    """
    parsed = parse_date(ts)
    return None if parsed is None else str(parsed.timestamp()).split(".")[0]
|
||||
|
||||
|
||||
def ts_to_date_str(ts: Any) -> str | None:
    """Format ``ts`` as ``YYYY-MM-DD HH:MM``, or return ``None`` when ``parse_date`` yields ``None``."""
    parsed = parse_date(ts)
    return None if parsed is None else parsed.strftime("%Y-%m-%d %H:%M")
|
||||
|
||||
|
||||
def ts_to_iso(ts: Any) -> str | None:
    """Return ``ts`` in ISO 8601 form, or ``None`` when ``parse_date`` yields ``None``."""
    parsed = parse_date(ts)
    return None if parsed is None else parsed.isoformat()
|
||||
|
||||
# matches "[<n>m", "[<n>;<n>m" or "[<n>;<n>;<n>m" and exposes the numbers as arg_1..arg_3
COLOR_REGEX = re.compile(r"\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m")
|
||||
|
||||
|
||||
# https://mathiasbynens.be/demo/url-regex
# The whole pattern sits inside a zero-width lookahead with one capture group, so
# findall()/finditer() can report URLs that start inside another URL.
URL_REGEX = re.compile(
    r"(?=("
    r"http[s]?://"  # start matching from allowed schemes
    r"(?:[a-zA-Z]|[0-9]"  # followed by allowed alphanum characters
    r"|[-_$@.&+!*\(\),]"  # or allowed symbols (keep hyphen first to match literal hyphen)
    r"|[^\u0000-\u007F])+"  # or allowed unicode bytes
    r'[^\]\[<>"\'\s]+'  # stop parsing at these symbols
    r"))",
    re.IGNORECASE | re.UNICODE,
)
|
||||
|
||||
def parens_are_matched(string: str, open_char='(', close_char=')'):
|
||||
# Literal quote characters: an extracted URL candidate is cut at the first one found.
QUOTE_DELIMITERS = (
    '"',
    "'",
    "`",
    "“",
    "”",
    "‘",
    "’",
)
# HTML-escaped spellings of the ASCII double and single quote (named, decimal, hex).
# They are matched case-insensitively and cut the candidate like a literal quote.
QUOTE_ENTITY_DELIMITERS = (
    "&quot;",
    "&#34;",
    "&#x22;",
    "&apos;",
    "&#39;",
    "&#x27;",
)
# HTML-escaped spellings of "&" that are decoded back to a plain ampersand.
URL_ENTITY_REPLACEMENTS = (
    ("&amp;", "&"),
    ("&#38;", "&"),
    ("&#x26;", "&"),
)

# Multipliers for parse_filesize_to_bytes; every prefix is binary (1 kb == 1024 bytes).
FILESIZE_UNITS: dict[str, int] = {
    "": 1,
    "b": 1,
    "byte": 1,
    "bytes": 1,
    "k": 1024,
    "kb": 1024,
    "kib": 1024,
    "m": 1024**2,
    "mb": 1024**2,
    "mib": 1024**2,
    "g": 1024**3,
    "gb": 1024**3,
    "gib": 1024**3,
    "t": 1024**4,
    "tb": 1024**4,
    "tib": 1024**4,
}

# Entities are pure ASCII, so match them ASCII-case-insensitively on the original
# string. Searching a separately lower-cased copy is unsafe because str.lower()
# can change the string length (e.g. "İ" becomes two code points) and the indexes
# would no longer line up with the original text.
_ENTITY_MATCH_FLAGS = re.IGNORECASE | re.ASCII


def sanitize_extracted_url(url: str) -> str:
    """Trim quote garbage and dangling prose punctuation from an extracted URL candidate.

    Steps, in order:
      1. strip surrounding whitespace (a falsy ``url`` yields ``""``),
      2. cut at the first literal quote or HTML-escaped quote,
      3. decode HTML-escaped ampersands, repeating until none are left so that
         doubly-escaped input such as ``&amp;amp;`` collapses to ``&``,
      4. strip trailing ``. , ; : ! ? \\ ' "`` characters.
    """
    cleaned = (url or "").strip()
    if not cleaned:
        return cleaned

    cut_index = len(cleaned)

    for delimiter in QUOTE_DELIMITERS:
        found_index = cleaned.find(delimiter)
        if found_index != -1:
            cut_index = min(cut_index, found_index)

    for delimiter in QUOTE_ENTITY_DELIMITERS:
        found = re.search(re.escape(delimiter), cleaned, _ENTITY_MATCH_FLAGS)
        if found is not None:
            cut_index = min(cut_index, found.start())

    cleaned = cleaned[:cut_index].strip()

    for entity, replacement in URL_ENTITY_REPLACEMENTS:
        pattern = re.compile(re.escape(entity), _ENTITY_MATCH_FLAGS)
        replaced = 1
        # one occurrence per pass: a replacement may itself complete a new entity
        while replaced:
            cleaned, replaced = pattern.subn(lambda _match: replacement, cleaned, count=1)

    # quotes cannot survive the cut above, so a single rstrip covers every trailing character
    return cleaned.rstrip(".,;:!?\\'\"")
|
||||
|
||||
|
||||
def parens_are_matched(string: str, open_char="(", close_char=")"):
|
||||
"""check that all parentheses in a string are balanced and nested properly"""
|
||||
count = 0
|
||||
for c in string:
|
||||
@@ -97,6 +179,7 @@ def parens_are_matched(string: str, open_char='(', close_char=')'):
|
||||
return False
|
||||
return count == 0
|
||||
|
||||
|
||||
def fix_url_from_markdown(url_str: str) -> str:
|
||||
"""
|
||||
cleanup a regex-parsed url that may contain dangling trailing parens from markdown link syntax
|
||||
@@ -113,46 +196,91 @@ def fix_url_from_markdown(url_str: str) -> str:
|
||||
|
||||
# cut off one trailing character at a time
|
||||
# until parens are balanced e.g. /a(b)c).x(y)z -> /a(b)c
|
||||
while not parens_are_matched(trimmed_url):
|
||||
while trimmed_url and not parens_are_matched(trimmed_url):
|
||||
trimmed_url = trimmed_url[:-1]
|
||||
|
||||
|
||||
# make sure trimmed url is still valid
|
||||
if re.findall(URL_REGEX, trimmed_url):
|
||||
if any(match == trimmed_url for match in re.findall(URL_REGEX, trimmed_url)):
|
||||
return trimmed_url
|
||||
|
||||
|
||||
return url_str
|
||||
|
||||
|
||||
def split_comma_separated_urls(url: str):
    """Split ``url`` wherever a comma is directly followed by ``http://`` or ``https://``.

    Yields ``(offset, piece)`` tuples, where ``offset`` is the index in the
    original string at which ``piece`` starts (0 for the first piece).

    Splitting stops at the first embedded scheme that is NOT preceded by a
    comma; the whole remainder is then yielded as one piece, so a URL carried
    inside another URL's query string is left intact.
    """
    offset = 0
    while True:
        # search from index 1 so the scheme that starts the current piece is ignored
        http_index = url.find("http://", 1)
        https_index = url.find("https://", 1)
        next_indices = [idx for idx in (http_index, https_index) if idx != -1]
        if not next_indices:
            yield offset, url
            return

        next_index = min(next_indices)
        if url[next_index - 1] != ",":
            yield offset, url
            return

        # drop the separating comma from the piece that is emitted
        yield offset, url[: next_index - 1]
        offset += next_index
        url = url[next_index:]
|
||||
|
||||
|
||||
def find_all_urls(urls_str: str):
    """Yield every URL found in ``urls_str``.

    Each ``URL_REGEX`` match is passed through ``fix_url_from_markdown``, split
    on comma-separated siblings, and every piece is cleaned with
    ``sanitize_extracted_url`` before being yielded.
    """
    skipped_starts = set()
    for match in re.finditer(URL_REGEX, urls_str):
        if match.start() in skipped_starts:
            continue

        # Split BEFORE sanitizing: sanitize_extracted_url can shorten the text
        # (entity decoding), and offsets computed on the shortened string would no
        # longer point at the sibling's real position in urls_str, so the sibling
        # would be matched and yielded a second time.
        raw_match = fix_url_from_markdown(match.group(1))
        for offset, raw_url in split_comma_separated_urls(raw_match):
            if offset:
                # this sibling also gets its own regex match later on; remember to skip it
                skipped_starts.add(match.start() + offset)
            yield sanitize_extracted_url(raw_url)
|
||||
|
||||
|
||||
def parse_filesize_to_bytes(value: str | int | float | None) -> int:
|
||||
"""
|
||||
Parse a byte count from an integer or human-readable string like 45mb or 2 GB.
|
||||
"""
|
||||
if value is None:
|
||||
return 0
|
||||
|
||||
if isinstance(value, bool):
|
||||
raise ValueError("Size value must be an integer or size string.")
|
||||
|
||||
if isinstance(value, int):
|
||||
return value
|
||||
|
||||
if isinstance(value, float):
|
||||
if not value.is_integer():
|
||||
raise ValueError("Size value must resolve to a whole number of bytes.")
|
||||
return int(value)
|
||||
|
||||
raw_value = str(value).strip()
|
||||
if not raw_value:
|
||||
return 0
|
||||
|
||||
if raw_value.isdigit():
|
||||
return int(raw_value)
|
||||
|
||||
match = re.fullmatch(r"(?i)(\d+(?:\.\d+)?)\s*([a-z]+)", raw_value)
|
||||
if not match:
|
||||
raise ValueError(f"Invalid size value: {value}")
|
||||
|
||||
amount_str, unit_str = match.groups()
|
||||
multiplier = FILESIZE_UNITS.get(unit_str.lower())
|
||||
if multiplier is None:
|
||||
raise ValueError(f"Unknown size unit: {unit_str}")
|
||||
|
||||
try:
|
||||
amount = Decimal(amount_str)
|
||||
except InvalidOperation as err:
|
||||
raise ValueError(f"Invalid size value: {value}") from err
|
||||
|
||||
return int(amount * multiplier)
|
||||
|
||||
|
||||
def is_static_file(url: str):
    """Return True when the URL's file extension is listed in ``CONSTANTS.STATICFILE_EXTENSIONS``."""
    # TODO: the proper way is with MIME type detection + ext, not only extension
    return extension(url).lower() in CONSTANTS.STATICFILE_EXTENSIONS
|
||||
@@ -178,14 +306,14 @@ def enforce_types(func):
|
||||
if annotation is not None and annotation.__class__ is type:
|
||||
if not isinstance(arg_val, annotation):
|
||||
raise TypeError(
|
||||
'{}(..., {}: {}) got unexpected {} argument {}={}'.format(
|
||||
"{}(..., {}: {}) got unexpected {} argument {}={}".format(
|
||||
func.__name__,
|
||||
arg_key,
|
||||
annotation.__name__,
|
||||
type(arg_val).__name__,
|
||||
arg_key,
|
||||
str(arg_val)[:64],
|
||||
)
|
||||
),
|
||||
)
|
||||
|
||||
# check args
|
||||
@@ -201,12 +329,14 @@ def enforce_types(func):
|
||||
return typechecked_function
|
||||
|
||||
|
||||
def docstring(text: Optional[str]):
|
||||
def docstring(text: str | None):
|
||||
"""attach the given docstring to the decorated function"""
|
||||
|
||||
def decorator(func):
|
||||
if text:
|
||||
func.__doc__ = text
|
||||
return func
|
||||
|
||||
return decorator
|
||||
|
||||
|
||||
@@ -224,7 +354,7 @@ def str_between(string: str, start: str, end: str | None = None) -> str:
|
||||
@enforce_types
|
||||
def parse_date(date: Any) -> datetime | None:
|
||||
"""Parse unix timestamps, iso format, and human-readable strings"""
|
||||
|
||||
|
||||
if date is None:
|
||||
return None
|
||||
|
||||
@@ -233,16 +363,16 @@ def parse_date(date: Any) -> datetime | None:
|
||||
return date.replace(tzinfo=timezone.utc)
|
||||
|
||||
offset = date.utcoffset()
|
||||
assert offset == datetime.now(timezone.utc).utcoffset(), 'Refusing to load a non-UTC date!'
|
||||
assert offset == datetime.now(timezone.utc).utcoffset(), "Refusing to load a non-UTC date!"
|
||||
return date
|
||||
|
||||
|
||||
if isinstance(date, (float, int)):
|
||||
date = str(date)
|
||||
|
||||
if isinstance(date, str):
|
||||
normalized = date.strip()
|
||||
if not normalized:
|
||||
raise ValueError(f'Tried to parse invalid date string! {date}')
|
||||
raise ValueError(f"Tried to parse invalid date string! {date}")
|
||||
|
||||
try:
|
||||
return datetime.fromtimestamp(float(normalized), tz=timezone.utc)
|
||||
@@ -250,7 +380,7 @@ def parse_date(date: Any) -> datetime | None:
|
||||
pass
|
||||
|
||||
try:
|
||||
iso_date = normalized.replace('Z', '+00:00')
|
||||
iso_date = normalized.replace("Z", "+00:00")
|
||||
parsed_date = datetime.fromisoformat(iso_date)
|
||||
if parsed_date.tzinfo is None:
|
||||
return parsed_date.replace(tzinfo=timezone.utc)
|
||||
@@ -258,12 +388,12 @@ def parse_date(date: Any) -> datetime | None:
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
parsed_date = dateparser(normalized, settings={'TIMEZONE': 'UTC'})
|
||||
parsed_date = dateparser(normalized, settings={"TIMEZONE": "UTC"})
|
||||
if parsed_date is None:
|
||||
raise ValueError(f'Tried to parse invalid date string! {date}')
|
||||
raise ValueError(f"Tried to parse invalid date string! {date}")
|
||||
return parsed_date.astimezone(timezone.utc)
|
||||
|
||||
raise ValueError('Tried to parse invalid date! {}'.format(date))
|
||||
raise ValueError(f"Tried to parse invalid date! {date}")
|
||||
|
||||
|
||||
@enforce_types
|
||||
@@ -284,12 +414,12 @@ def download_url(url: str, timeout: int | None = None) -> str:
|
||||
|
||||
response = session.get(
|
||||
url,
|
||||
headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
|
||||
headers={"User-Agent": ARCHIVING_CONFIG.USER_AGENT},
|
||||
verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
|
||||
timeout=timeout,
|
||||
)
|
||||
|
||||
content_type = response.headers.get('Content-Type', '')
|
||||
content_type = response.headers.get("Content-Type", "")
|
||||
encoding = http_content_type_encoding(content_type) or html_body_declared_encoding(response.text)
|
||||
|
||||
if encoding is not None:
|
||||
@@ -299,21 +429,22 @@ def download_url(url: str, timeout: int | None = None) -> str:
|
||||
return response.text
|
||||
except UnicodeDecodeError:
|
||||
# if response is non-text (e.g. image or other binary files), just return the filename instead
|
||||
return url.rsplit('/', 1)[-1]
|
||||
return url.rsplit("/", 1)[-1]
|
||||
|
||||
|
||||
@enforce_types
|
||||
def get_headers(url: str, timeout: int | None=None) -> str:
|
||||
def get_headers(url: str, timeout: int | None = None) -> str:
|
||||
"""Download the contents of a remote url and return the headers"""
|
||||
# TODO: get rid of this and use an abx pluggy hook instead
|
||||
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
timeout = timeout or ARCHIVING_CONFIG.TIMEOUT
|
||||
|
||||
try:
|
||||
response = requests.head(
|
||||
url,
|
||||
headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
|
||||
headers={"User-Agent": ARCHIVING_CONFIG.USER_AGENT},
|
||||
verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
|
||||
timeout=timeout,
|
||||
allow_redirects=True,
|
||||
@@ -325,19 +456,19 @@ def get_headers(url: str, timeout: int | None=None) -> str:
|
||||
except RequestException:
|
||||
response = requests.get(
|
||||
url,
|
||||
headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
|
||||
headers={"User-Agent": ARCHIVING_CONFIG.USER_AGENT},
|
||||
verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
|
||||
timeout=timeout,
|
||||
stream=True
|
||||
stream=True,
|
||||
)
|
||||
|
||||
|
||||
return pyjson.dumps(
|
||||
{
|
||||
'URL': url,
|
||||
'Status-Code': response.status_code,
|
||||
'Elapsed': response.elapsed.total_seconds()*1000,
|
||||
'Encoding': str(response.encoding),
|
||||
'Apparent-Encoding': response.apparent_encoding,
|
||||
"URL": url,
|
||||
"Status-Code": response.status_code,
|
||||
"Elapsed": response.elapsed.total_seconds() * 1000,
|
||||
"Encoding": str(response.encoding),
|
||||
"Apparent-Encoding": response.apparent_encoding,
|
||||
**dict(response.headers),
|
||||
},
|
||||
indent=4,
|
||||
@@ -352,17 +483,17 @@ def ansi_to_html(text: str) -> str:
|
||||
"""
|
||||
|
||||
TEMPLATE = '<span style="color: rgb{}"><br>'
|
||||
text = text.replace('[m', '</span>')
|
||||
text = text.replace("[m", "</span>")
|
||||
|
||||
def single_sub(match):
|
||||
argsdict = match.groupdict()
|
||||
if argsdict['arg_3'] is None:
|
||||
if argsdict['arg_2'] is None:
|
||||
_, color = 0, argsdict['arg_1']
|
||||
if argsdict["arg_3"] is None:
|
||||
if argsdict["arg_2"] is None:
|
||||
_, color = 0, argsdict["arg_1"]
|
||||
else:
|
||||
_, color = argsdict['arg_1'], argsdict['arg_2']
|
||||
_, color = argsdict["arg_1"], argsdict["arg_2"]
|
||||
else:
|
||||
_, color = argsdict['arg_3'], argsdict['arg_2']
|
||||
_, color = argsdict["arg_3"], argsdict["arg_2"]
|
||||
|
||||
return TEMPLATE.format(COLOR_DICT[color][0])
|
||||
|
||||
@@ -370,20 +501,19 @@ def ansi_to_html(text: str) -> str:
|
||||
|
||||
|
||||
@enforce_types
|
||||
def dedupe(options: list[str]) -> list[str]:
    """
    Deduplicates the given CLI args by key=value. Options that come later override earlier.

    The key is the text before the first ``=``. An overridden option keeps the
    position of its first occurrence in the returned list.
    """
    deduped = {}

    for option in options:
        key = option.split("=")[0]
        deduped[key] = option

    return list(deduped.values())
|
||||
|
||||
|
||||
|
||||
class ExtendedEncoder(pyjson.JSONEncoder):
|
||||
"""
|
||||
Extended json serializer that supports serializing several model
|
||||
@@ -393,7 +523,7 @@ class ExtendedEncoder(pyjson.JSONEncoder):
|
||||
def default(self, o):
|
||||
cls_name = o.__class__.__name__
|
||||
|
||||
if hasattr(o, '_asdict'):
|
||||
if hasattr(o, "_asdict"):
|
||||
return o._asdict()
|
||||
|
||||
elif isinstance(o, bytes):
|
||||
@@ -403,12 +533,12 @@ class ExtendedEncoder(pyjson.JSONEncoder):
|
||||
return o.isoformat()
|
||||
|
||||
elif isinstance(o, Exception):
|
||||
return '{}: {}'.format(o.__class__.__name__, o)
|
||||
return f"{o.__class__.__name__}: {o}"
|
||||
|
||||
elif isinstance(o, Path):
|
||||
return str(o)
|
||||
|
||||
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
|
||||
elif cls_name in ("dict_items", "dict_keys", "dict_values"):
|
||||
return list(o)
|
||||
|
||||
elif isinstance(o, Callable):
|
||||
@@ -434,7 +564,7 @@ class ExtendedEncoder(pyjson.JSONEncoder):
|
||||
|
||||
|
||||
@enforce_types
|
||||
def to_json(obj: Any, indent: int | None = 4, sort_keys: bool = True) -> str:
    """Serialize object to JSON string with extended type support

    Values the stock JSON encoder cannot handle are delegated to ``ExtendedEncoder``.
    """
    return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
|
||||
|
||||
@@ -447,97 +577,114 @@ def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True) -> str:
|
||||
# the consequences of bad URL parsing could be disastrous and lead to many
|
||||
# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
|
||||
|
||||
assert fix_url_from_markdown('http://example.com/a(b)c).x(y)z') == 'http://example.com/a(b)c'
|
||||
assert fix_url_from_markdown('https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext') == 'https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def'
|
||||
assert fix_url_from_markdown("http://example.com/a(b)c).x(y)z") == "http://example.com/a(b)c"
|
||||
assert (
|
||||
fix_url_from_markdown("https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext")
|
||||
== "https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def"
|
||||
)
|
||||
|
||||
URL_REGEX_TESTS = [
|
||||
('https://example.com', ['https://example.com']),
|
||||
('https://sweeting.me,https://google.com', ['https://sweeting.me', 'https://google.com']),
|
||||
('http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234', ['http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234']),
|
||||
|
||||
('https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc', ['https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ']),
|
||||
('<a href="https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ"> abc', ['https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ', 'https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ']),
|
||||
|
||||
('///a', []),
|
||||
('http://', []),
|
||||
('http://../', ['http://../']),
|
||||
('http://-error-.invalid/', ['http://-error-.invalid/']),
|
||||
('https://a(b)c+1#2?3&4/', ['https://a(b)c+1#2?3&4/']),
|
||||
('http://उदाहरण.परीक्षा', ['http://उदाहरण.परीक्षा']),
|
||||
('http://例子.测试', ['http://例子.测试']),
|
||||
('http://➡.ws/䨹 htps://abc.1243?234', ['http://➡.ws/䨹']),
|
||||
('http://⌘.ws">https://exa+mple.com//:abc ', ['http://⌘.ws', 'https://exa+mple.com//:abc']),
|
||||
('http://مثال.إختبار/abc?def=ت&ب=abc#abc=234', ['http://مثال.إختبار/abc?def=ت&ب=abc#abc=234']),
|
||||
('http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c\'om', ['http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c']),
|
||||
|
||||
('http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3', ['http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3', 'http://ex.co:19/a?_d=4#-a=2.3']),
|
||||
('http://code.google.com/events/#&product=browser', ['http://code.google.com/events/#&product=browser']),
|
||||
('http://foo.bar?q=Spaces should be encoded', ['http://foo.bar?q=Spaces']),
|
||||
('http://foo.com/blah_(wikipedia)#c(i)t[e]-1', ['http://foo.com/blah_(wikipedia)#c(i)t']),
|
||||
('http://foo.com/(something)?after=parens', ['http://foo.com/(something)?after=parens']),
|
||||
('http://foo.com/unicode_(✪)_in_parens) abc', ['http://foo.com/unicode_(✪)_in_parens']),
|
||||
('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),
|
||||
|
||||
('[xyz](http://a.b/?q=(Test)%20U)RL-encoded%20stuff', ['http://a.b/?q=(Test)%20U']),
|
||||
('[xyz](http://a.b/?q=(Test)%20U)-ab https://abc+123', ['http://a.b/?q=(Test)%20U', 'https://abc+123']),
|
||||
('[xyz](http://a.b/?q=(Test)%20U) https://a(b)c+12)3', ['http://a.b/?q=(Test)%20U', 'https://a(b)c+12']),
|
||||
('[xyz](http://a.b/?q=(Test)a\nabchttps://a(b)c+12)3', ['http://a.b/?q=(Test)a', 'https://a(b)c+12']),
|
||||
('http://foo.bar/?q=Test%20URL-encoded%20stuff', ['http://foo.bar/?q=Test%20URL-encoded%20stuff']),
|
||||
("https://example.com", ["https://example.com"]),
|
||||
("https://sweeting.me,https://google.com", ["https://sweeting.me", "https://google.com"]),
|
||||
(
|
||||
"http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234",
|
||||
["http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234"],
|
||||
),
|
||||
(
|
||||
"https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc",
|
||||
[
|
||||
"https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ",
|
||||
"https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ",
|
||||
],
|
||||
),
|
||||
(
|
||||
'<a href="https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ"> abc',
|
||||
[
|
||||
"https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ",
|
||||
"https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ",
|
||||
],
|
||||
),
|
||||
("///a", []),
|
||||
("http://", []),
|
||||
("http://../", ["http://../"]),
|
||||
("http://-error-.invalid/", ["http://-error-.invalid/"]),
|
||||
("https://a(b)c+1#2?3&4/", ["https://a(b)c+1#2?3&4/"]),
|
||||
("http://उदाहरण.परीक्षा", ["http://उदाहरण.परीक्षा"]),
|
||||
("http://例子.测试", ["http://例子.测试"]),
|
||||
("http://➡.ws/䨹 htps://abc.1243?234", ["http://➡.ws/䨹"]),
|
||||
('http://⌘.ws">https://exa+mple.com//:abc ', ["http://⌘.ws", "https://exa+mple.com//:abc"]),
|
||||
("http://مثال.إختبار/abc?def=ت&ب=abc#abc=234", ["http://مثال.إختبار/abc?def=ت&ب=abc#abc=234"]),
|
||||
("http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c'om", ["http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c"]),
|
||||
(
|
||||
"http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3",
|
||||
["http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3", "http://ex.co:19/a?_d=4#-a=2.3"],
|
||||
),
|
||||
("http://code.google.com/events/#&product=browser", ["http://code.google.com/events/#&product=browser"]),
|
||||
("http://foo.bar?q=Spaces should be encoded", ["http://foo.bar?q=Spaces"]),
|
||||
("http://foo.com/blah_(wikipedia)#c(i)t[e]-1", ["http://foo.com/blah_(wikipedia)#c(i)t"]),
|
||||
("http://foo.com/(something)?after=parens", ["http://foo.com/(something)?after=parens"]),
|
||||
("http://foo.com/unicode_(✪)_in_parens) abc", ["http://foo.com/unicode_(✪)_in_parens"]),
|
||||
("http://foo.bar/?q=Test%20URL-encoded%20stuff", ["http://foo.bar/?q=Test%20URL-encoded%20stuff"]),
|
||||
("[xyz](http://a.b/?q=(Test)%20U)RL-encoded%20stuff", ["http://a.b/?q=(Test)%20U"]),
|
||||
("[xyz](http://a.b/?q=(Test)%20U)-ab https://abc+123", ["http://a.b/?q=(Test)%20U", "https://abc+123"]),
|
||||
("[xyz](http://a.b/?q=(Test)%20U) https://a(b)c+12)3", ["http://a.b/?q=(Test)%20U", "https://a(b)c+12"]),
|
||||
("[xyz](http://a.b/?q=(Test)a\nabchttps://a(b)c+12)3", ["http://a.b/?q=(Test)a", "https://a(b)c+12"]),
|
||||
("http://foo.bar/?q=Test%20URL-encoded%20stuff", ["http://foo.bar/?q=Test%20URL-encoded%20stuff"]),
|
||||
]
|
||||
for urls_str, expected_url_matches in URL_REGEX_TESTS:
|
||||
url_matches = list(find_all_urls(urls_str))
|
||||
assert url_matches == expected_url_matches, 'FAILED URL_REGEX CHECK!'
|
||||
assert url_matches == expected_url_matches, "FAILED URL_REGEX CHECK!"
|
||||
|
||||
|
||||
# More test cases
|
||||
_test_url_strs = {
|
||||
'example.com': 0,
|
||||
'/example.com': 0,
|
||||
'//example.com': 0,
|
||||
':/example.com': 0,
|
||||
'://example.com': 0,
|
||||
'htt://example8.com': 0,
|
||||
'/htt://example.com': 0,
|
||||
'https://example': 1,
|
||||
'https://localhost/2345': 1,
|
||||
'https://localhost:1234/123': 1,
|
||||
'://': 0,
|
||||
'https://': 0,
|
||||
'http://': 0,
|
||||
'ftp://': 0,
|
||||
'ftp://example.com': 0,
|
||||
'https://example.com': 1,
|
||||
'https://example.com/': 1,
|
||||
'https://a.example.com': 1,
|
||||
'https://a.example.com/': 1,
|
||||
'https://a.example.com/what/is/happening.html': 1,
|
||||
'https://a.example.com/what/ís/happening.html': 1,
|
||||
'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
|
||||
'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
|
||||
'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
|
||||
'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
|
||||
'https://example.com?what=1#how-about-this=1&2%20baf': 1,
|
||||
'<test>http://example7.com</test>': 1,
|
||||
'https://<test>': 0,
|
||||
'https://[test]': 0,
|
||||
"example.com": 0,
|
||||
"/example.com": 0,
|
||||
"//example.com": 0,
|
||||
":/example.com": 0,
|
||||
"://example.com": 0,
|
||||
"htt://example8.com": 0,
|
||||
"/htt://example.com": 0,
|
||||
"https://example": 1,
|
||||
"https://localhost/2345": 1,
|
||||
"https://localhost:1234/123": 1,
|
||||
"://": 0,
|
||||
"https://": 0,
|
||||
"http://": 0,
|
||||
"ftp://": 0,
|
||||
"ftp://example.com": 0,
|
||||
"https://example.com": 1,
|
||||
"https://example.com/": 1,
|
||||
"https://a.example.com": 1,
|
||||
"https://a.example.com/": 1,
|
||||
"https://a.example.com/what/is/happening.html": 1,
|
||||
"https://a.example.com/what/ís/happening.html": 1,
|
||||
"https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a": 1,
|
||||
"https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a": 1,
|
||||
"HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b": 1,
|
||||
"https://example.com/?what=1#how-about-this=1&2%20baf": 1,
|
||||
"https://example.com?what=1#how-about-this=1&2%20baf": 1,
|
||||
"<test>http://example7.com</test>": 1,
|
||||
"https://<test>": 0,
|
||||
"https://[test]": 0,
|
||||
'http://"test"': 0,
|
||||
'http://\'test\'': 0,
|
||||
'[https://example8.com/what/is/this.php?what=1]': 1,
|
||||
'[and http://example9.com?what=1&other=3#and-thing=2]': 1,
|
||||
"http://'test'": 0,
|
||||
"[https://example8.com/what/is/this.php?what=1]": 1,
|
||||
"[and http://example9.com?what=1&other=3#and-thing=2]": 1,
|
||||
'<what>https://example10.com#and-thing=2 "</about>': 1,
|
||||
'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
|
||||
'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
|
||||
'<or>http://examplehttp://15.badc</that>': 2,
|
||||
'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
|
||||
'[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
|
||||
"sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi": 1,
|
||||
"<or>http://examplehttp://15.badc</that>": 2,
|
||||
"https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://": 2,
|
||||
"[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)": 3,
|
||||
}
|
||||
for url_str, num_urls in _test_url_strs.items():
|
||||
assert len(list(find_all_urls(url_str))) == num_urls, (
|
||||
f'{url_str} does not contain {num_urls} urls')
|
||||
assert len(list(find_all_urls(url_str))) == num_urls, f"{url_str} does not contain {num_urls} urls"
|
||||
|
||||
|
||||
### Chrome Helpers
|
||||
|
||||
|
||||
def chrome_cleanup():
|
||||
"""
|
||||
Cleans up any state or runtime files that Chrome leaves behind when killed by
|
||||
@@ -560,10 +707,11 @@ def chrome_cleanup():
|
||||
# Also clean up the active persona's explicit CHROME_USER_DATA_DIR if set
|
||||
# (in case it's a custom path not under PERSONAS_DIR)
|
||||
from archivebox.config.configset import get_config
|
||||
|
||||
config = get_config()
|
||||
chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
|
||||
chrome_user_data_dir = config.get("CHROME_USER_DATA_DIR")
|
||||
if chrome_user_data_dir:
|
||||
singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock'
|
||||
singleton_lock = Path(chrome_user_data_dir) / "SingletonLock"
|
||||
if os.path.lexists(singleton_lock):
|
||||
try:
|
||||
singleton_lock.unlink()
|
||||
|
||||
Reference in New Issue
Block a user