This commit is contained in:
Nick Sweeting
2026-03-23 03:58:32 -07:00
parent 268856bcfb
commit b749b26c5d
286 changed files with 21704 additions and 13480 deletions

View File

@@ -1 +1 @@
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
import os
import sys
@@ -23,69 +23,74 @@ def check_data_folder() -> None:
from archivebox import DATA_DIR, ARCHIVE_DIR
from archivebox.config import CONSTANTS
from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir
archive_dir_exists = os.path.isdir(ARCHIVE_DIR)
if not archive_dir_exists:
print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr)
print(f' {DATA_DIR}', file=sys.stderr)
print("[red][X] No archivebox index found in the current directory.[/red]", file=sys.stderr)
print(f" {DATA_DIR}", file=sys.stderr)
print(file=sys.stderr)
print(' [violet]Hint[/violet]: Are you running archivebox in the right folder?', file=sys.stderr)
print(' cd path/to/your/archive/folder', file=sys.stderr)
print(' archivebox [command]', file=sys.stderr)
print(" [violet]Hint[/violet]: Are you running archivebox in the right folder?", file=sys.stderr)
print(" cd path/to/your/archive/folder", file=sys.stderr)
print(" archivebox [command]", file=sys.stderr)
print(file=sys.stderr)
print(' [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:', file=sys.stderr)
print(' archivebox init', file=sys.stderr)
print(" [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:", file=sys.stderr)
print(" archivebox init", file=sys.stderr)
raise SystemExit(2)
# Create data dir subdirs
create_and_chown_dir(CONSTANTS.SOURCES_DIR)
create_and_chown_dir(CONSTANTS.PERSONAS_DIR / 'Default')
create_and_chown_dir(CONSTANTS.PERSONAS_DIR / "Default")
create_and_chown_dir(CONSTANTS.LOGS_DIR)
# create_and_chown_dir(CONSTANTS.CACHE_DIR)
# Create /tmp and /lib dirs if they don't exist
get_or_create_working_tmp_dir(autofix=True, quiet=False)
get_or_create_working_lib_dir(autofix=True, quiet=False)
# Check data dir permissions, /tmp, and /lib permissions
check_data_dir_permissions()
def check_migrations():
from archivebox import DATA_DIR
from archivebox.misc.db import list_migrations
pending_migrations = [name for status, name in list_migrations() if not status]
is_migrating = any(arg in sys.argv for arg in ['makemigrations', 'migrate', 'init'])
is_migrating = any(arg in sys.argv for arg in ["makemigrations", "migrate", "init"])
if pending_migrations and not is_migrating:
print('[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]')
print(f' {DATA_DIR}', file=sys.stderr)
print("[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]")
print(f" {DATA_DIR}", file=sys.stderr)
print(file=sys.stderr)
print(f' [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:', file=sys.stderr)
print(' archivebox init', file=sys.stderr)
print(
f" [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:",
file=sys.stderr,
)
print(" archivebox init", file=sys.stderr)
raise SystemExit(3)
def check_io_encoding():
PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')
if PYTHON_ENCODING != 'UTF-8':
print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr)
PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace("UTF8", "UTF-8")
if PYTHON_ENCODING != "UTF-8":
print(
f"[red][X] Your system is running python3 scripts with a bad locale setting: {PYTHON_ENCODING} (it should be UTF-8).[/red]",
file=sys.stderr,
)
print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr)
print('')
print(' Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr)
print("")
print(" Confirm that it's fixed by opening a new shell and running:", file=sys.stderr)
print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr)
raise SystemExit(2)
# # hard errors: check python version
# if sys.version_info[:3] < (3, 10, 0):
# print('[red][X] Python version is not new enough: {sys.version} (>3.10 is required)[/red]', file=sys.stderr)
# print(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.', file=sys.stderr)
# raise SystemExit(2)
# # hard errors: check django version
# if int(django.VERSION[0]) < 5:
# print('[red][X] Django version is not new enough: {django.VERSION[:3]} (>=5.0 is required)[/red]', file=sys.stderr)
@@ -96,35 +101,44 @@ def check_io_encoding():
def check_not_root():
from archivebox.config.permissions import IS_ROOT, IN_DOCKER
attempted_command = ' '.join(sys.argv[1:]) if len(sys.argv) > 1 else ''
is_getting_help = '-h' in sys.argv or '--help' in sys.argv or 'help' in sys.argv
is_getting_version = '--version' in sys.argv or 'version' in sys.argv
is_installing = 'setup' in sys.argv or 'install' in sys.argv
attempted_command = " ".join(sys.argv[1:]) if len(sys.argv) > 1 else ""
is_getting_help = "-h" in sys.argv or "--help" in sys.argv or "help" in sys.argv
is_getting_version = "--version" in sys.argv or "version" in sys.argv
is_installing = "setup" in sys.argv or "install" in sys.argv
if IS_ROOT and not (is_getting_help or is_getting_version or is_installing):
print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr)
print(' For more information, see the security overview documentation:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr)
print("[red][!] ArchiveBox should never be run as root![/red]", file=sys.stderr)
print(" For more information, see the security overview documentation:", file=sys.stderr)
print(" https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root", file=sys.stderr)
if IN_DOCKER:
print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr)
print(' docker compose run archivebox {attempted_command}', file=sys.stderr)
print(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr)
print(' or:', file=sys.stderr)
print(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
print(f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
print(
"[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:",
file=sys.stderr,
)
print(" docker compose run archivebox {attempted_command}", file=sys.stderr)
print(f" docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}", file=sys.stderr)
print(" or:", file=sys.stderr)
print(
f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"',
file=sys.stderr,
)
print(
f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"',
file=sys.stderr,
)
raise SystemExit(2)
def check_not_inside_source_dir():
"""Prevent running ArchiveBox from inside its source directory (would pollute repo with data files)."""
cwd = Path(os.getcwd()).resolve()
is_source_dir = (cwd / 'archivebox' / '__init__.py').exists() and (cwd / 'pyproject.toml').exists()
data_dir_set_elsewhere = os.environ.get('DATA_DIR', '').strip() and Path(os.environ['DATA_DIR']).resolve() != cwd
is_testing = 'pytest' in sys.modules or 'unittest' in sys.modules
is_source_dir = (cwd / "archivebox" / "__init__.py").exists() and (cwd / "pyproject.toml").exists()
data_dir_set_elsewhere = os.environ.get("DATA_DIR", "").strip() and Path(os.environ["DATA_DIR"]).resolve() != cwd
is_testing = "pytest" in sys.modules or "unittest" in sys.modules
if is_source_dir and not data_dir_set_elsewhere and not is_testing:
raise SystemExit('[!] Cannot run from source dir, set DATA_DIR or cd to a data folder first')
raise SystemExit("[!] Cannot run from source dir, set DATA_DIR or cd to a data folder first")
def check_data_dir_permissions():
@@ -132,28 +146,42 @@ def check_data_dir_permissions():
from archivebox.misc.logging import STDERR
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, DEFAULT_PUID, DEFAULT_PGID, IS_ROOT, USER
from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir
data_dir_stat = Path(DATA_DIR).stat()
data_dir_uid, data_dir_gid = data_dir_stat.st_uid, data_dir_stat.st_gid
data_owned_by_root = data_dir_uid == 0
# data_owned_by_default_user = data_dir_uid == DEFAULT_PUID or data_dir_gid == DEFAULT_PGID
data_owner_doesnt_match = (data_dir_uid != ARCHIVEBOX_USER and data_dir_gid != ARCHIVEBOX_GROUP) if not IS_ROOT else False
data_not_writable = not (os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.W_OK))
if data_owned_by_root:
STDERR.print('\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red], it must be changed before archiving can run![/yellow]')
STDERR.print(
"\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red], it must be changed before archiving can run![/yellow]",
)
elif data_owner_doesnt_match or data_not_writable:
STDERR.print(f'\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]')
STDERR.print(
f"\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]",
)
if data_owned_by_root or data_owner_doesnt_match or data_not_writable:
STDERR.print(f'[violet]Hint:[/violet] Change the current ownership [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to a non-root user & group that will run ArchiveBox, e.g.:')
STDERR.print(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}')
STDERR.print(
f"[violet]Hint:[/violet] Change the current ownership [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to a non-root user & group that will run ArchiveBox, e.g.:",
)
STDERR.print(f" [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}")
STDERR.print()
STDERR.print('[blue]More info:[/blue]')
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]')
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]')
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]')
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]')
STDERR.print("[blue]More info:[/blue]")
STDERR.print(
" [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]",
)
STDERR.print(
" [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]",
)
STDERR.print(
" [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]",
)
STDERR.print(
" [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]",
)
from archivebox.config.common import STORAGE_CONFIG
@@ -172,8 +200,8 @@ def check_data_dir_permissions():
# Check /lib dir permissions
check_lib_dir(lib_dir, throw=False, must_exist=True)
os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8))
def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
@@ -182,45 +210,57 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
from archivebox.misc.logging_util import pretty_path
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
from archivebox.config.common import STORAGE_CONFIG
tmp_dir = tmp_dir or STORAGE_CONFIG.TMP_DIR
socket_file = tmp_dir.absolute().resolve() / "supervisord.sock"
if not must_exist and not os.path.isdir(tmp_dir):
# just check that its viable based on its length (because dir may not exist yet, we cant check if its writable)
return len(f'file://{socket_file}') <= 96
return len(f"file://{socket_file}") <= 96
tmp_is_valid = False
allow_no_unix_sockets = os.environ.get('ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS', '').lower() in ('1', 'true', 'yes')
allow_no_unix_sockets = os.environ.get("ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS", "").lower() in ("1", "true", "yes")
try:
tmp_is_valid = dir_is_writable(tmp_dir)
if not allow_no_unix_sockets:
tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}'
assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.'
assert tmp_is_valid, f"ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}"
assert len(f"file://{socket_file}") <= 96, (
f"ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars."
)
return True
except Exception as e:
if not quiet:
STDERR.print()
ERROR_TEXT = '\n'.join((
'',
f'[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]',
f' [yellow]{e}[/yellow]',
'',
'[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.',
' - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).',
f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
' - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.',
' - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]',
'',
'[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:',
f' [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or "/tmp/archivebox"}[/green]',
'',
))
STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured TMP_DIR[/red]', subtitle='Background workers may fail to start until fixed.'))
ERROR_TEXT = "\n".join(
(
"",
f"[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]",
f" [yellow]{e}[/yellow]",
"",
"[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.",
" - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).",
f" - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).",
" - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.",
" - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]",
"",
"[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:",
f" [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or '/tmp/archivebox'}[/green]",
"",
),
)
STDERR.print(
Panel(
ERROR_TEXT,
expand=False,
border_style="red",
title="[red]:cross_mark: Error with configured TMP_DIR[/red]",
subtitle="Background workers may fail to start until fixed.",
),
)
STDERR.print()
if throw:
raise OSError(f'TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!') from e
raise OSError(f"TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!") from e
return False
@@ -230,38 +270,48 @@ def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_ex
from archivebox.misc.logging_util import pretty_path
from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir
from archivebox.config.common import STORAGE_CONFIG
lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR
# assert lib_dir == STORAGE_CONFIG.LIB_DIR, "lib_dir is not the same as the one in the flat config"
if not must_exist and not os.path.isdir(lib_dir):
return True
lib_is_valid = False
try:
lib_is_valid = dir_is_writable(lib_dir)
assert lib_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}'
assert lib_is_valid, f"ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}"
return True
except Exception as e:
if not quiet:
STDERR.print()
ERROR_TEXT = '\n'.join((
'',
f'[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]',
f' [yellow]{e}[/yellow]',
'',
'[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.',
f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
' - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).',
' - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]',
'',
'[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:',
f' [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or "/usr/local/share/archivebox"}[/green]',
'',
))
STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured LIB_DIR[/red]', subtitle='[yellow]Dependencies may not auto-install properly until fixed.[/yellow]'))
ERROR_TEXT = "\n".join(
(
"",
f"[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]",
f" [yellow]{e}[/yellow]",
"",
"[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.",
f" - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).",
" - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).",
" - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]",
"",
"[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:",
f" [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or '/usr/local/share/archivebox'}[/green]",
"",
),
)
STDERR.print(
Panel(
ERROR_TEXT,
expand=False,
border_style="red",
title="[red]:cross_mark: Error with configured LIB_DIR[/red]",
subtitle="[yellow]Dependencies may not auto-install properly until fixed.[/yellow]",
),
)
STDERR.print()
if throw:
raise OSError(f'LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.') from e
raise OSError(f"LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.") from e
return False

View File

@@ -2,18 +2,18 @@
Database utility functions for ArchiveBox.
"""
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
from io import StringIO
from pathlib import Path
from typing import Any, List, Tuple
from typing import Any
from archivebox.config import DATA_DIR
from archivebox.misc.util import enforce_types
@enforce_types
def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]:
def list_migrations(out_dir: Path = DATA_DIR) -> list[tuple[bool, str]]:
"""List all Django migrations and their status"""
from django.core.management import call_command
@@ -23,9 +23,9 @@ def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]:
migrations = []
for line in out.readlines():
if line.strip() and ']' in line:
status_str, name_str = line.strip().split(']', 1)
is_applied = 'X' in status_str
if line.strip() and "]" in line:
status_str, name_str = line.strip().split("]", 1)
is_applied = "X" in status_str
migration_name = name_str.strip()
migrations.append((is_applied, migration_name))
@@ -33,23 +33,21 @@ def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]:
@enforce_types
def apply_migrations(out_dir: Path = DATA_DIR) -> List[str]:
def apply_migrations(out_dir: Path = DATA_DIR) -> list[str]:
"""Apply pending Django migrations"""
from django.core.management import call_command
out1 = StringIO()
call_command("migrate", interactive=False, database='default', stdout=out1)
call_command("migrate", interactive=False, database="default", stdout=out1)
out1.seek(0)
return [
line.strip() for line in out1.readlines() if line.strip()
]
return [line.strip() for line in out1.readlines() if line.strip()]
@enforce_types
def get_admins(out_dir: Path = DATA_DIR) -> List[Any]:
def get_admins(out_dir: Path = DATA_DIR) -> list[Any]:
"""Get list of superuser accounts"""
from django.contrib.auth.models import User
return list(User.objects.filter(is_superuser=True).exclude(username='system'))
return list(User.objects.filter(is_superuser=True).exclude(username="system"))

View File

@@ -1,6 +1,7 @@
from functools import wraps
from time import time
def timed_function(func):
"""
Very simple profiling decorator for debugging.
@@ -8,23 +9,25 @@ def timed_function(func):
@timed_function
def my_func():
...
More advanced alternatives:
- viztracer ../.venv/bin/archivebox manage check # https://viztracer.readthedocs.io/en/latest/filter.html
- python -m cProfile -o archivebox.prof ../.venv/bin/archivebox manage check; snakeviz archivebox.prof
- Django Debug Toolbar + django-debug-toolbar-flamegraph
+ Django Requests Tracker (requests-tracker)
"""
@wraps(func)
def wrap(*args, **kwargs):
if args and hasattr(args[0], '__module__'):
if args and hasattr(args[0], "__module__"):
module = args[0].__module__
else:
module = func.__module__
ts_start = time()
result = func(*args, **kwargs)
ts_end = time()
ms_elapsed = int((ts_end-ts_start) * 1000)
print(f'[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)')
ms_elapsed = int((ts_end - ts_start) * 1000)
print(f"[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)")
return result
return wrap

View File

@@ -5,20 +5,19 @@ Note: This file only contains legacy cleanup utilities.
The DB is the single source of truth - use Snapshot.objects queries for all status checks.
"""
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
import os
import json
import shutil
from pathlib import Path
from typing import Tuple, List
from archivebox.config import DATA_DIR, CONSTANTS
from archivebox.misc.util import enforce_types
@enforce_types
def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], List[str]]:
def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> tuple[list[str], list[str]]:
"""
Legacy cleanup: Move folders to their correct timestamp-named locations based on index.json.
@@ -29,19 +28,19 @@ def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], L
cant_fix = []
for entry in os.scandir(out_dir / CONSTANTS.ARCHIVE_DIR_NAME):
if entry.is_dir(follow_symlinks=True):
index_path = Path(entry.path) / 'index.json'
index_path = Path(entry.path) / "index.json"
if index_path.exists():
try:
with open(index_path, 'r') as f:
with open(index_path) as f:
data = json.load(f)
timestamp = data.get('timestamp')
timestamp = data.get("timestamp")
except Exception:
continue
if not timestamp:
continue
if not entry.path.endswith(f'/{timestamp}'):
if not entry.path.endswith(f"/{timestamp}"):
dest = out_dir / CONSTANTS.ARCHIVE_DIR_NAME / timestamp
if dest.exists():
cant_fix.append(entry.path)

View File

@@ -2,20 +2,22 @@ import hashlib
import mimetypes
from functools import lru_cache
from pathlib import Path
from typing import Callable
from collections.abc import Callable
from datetime import datetime
@lru_cache(maxsize=1024)
def _cached_file_hash(filepath: str, size: int, mtime: float) -> str:
"""Internal function to calculate file hash with cache key based on path, size and mtime."""
sha256_hash = hashlib.sha256()
with open(filepath, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b''):
with open(filepath, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
sha256_hash.update(chunk)
return sha256_hash.hexdigest()
@lru_cache(maxsize=10)
def hash_file(file_path: Path, pwd: Path | None = None) -> str:
"""Calculate SHA256 hash of a file with caching based on path, size and mtime."""
@@ -30,9 +32,10 @@ def hash_file(file_path: Path, pwd: Path | None = None) -> str:
return _cached_file_hash(
str(abs_path),
stat_info.st_size,
stat_info.st_mtime
stat_info.st_mtime,
)
@lru_cache(maxsize=10)
def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict[str, str]:
"""Calculate SHA256 hashes for all files and directories recursively."""
@@ -48,9 +51,12 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl
# Get all files recursively
all_files = get_dir_entries(
dir_path, pwd=pwd, recursive=True,
include_files=True, include_dirs=False,
filter_func=filter_func
dir_path,
pwd=pwd,
recursive=True,
include_files=True,
include_dirs=False,
filter_func=filter_func,
)
hashes: dict[str, str] = {}
@@ -65,39 +71,48 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl
# Calculate hashes for all directories
subdirs = get_dir_entries(
dir_path, pwd=pwd, recursive=True,
include_files=False, include_dirs=True,
include_hidden=False, filter_func=filter_func,
max_depth=max_depth
dir_path,
pwd=pwd,
recursive=True,
include_files=False,
include_dirs=True,
include_hidden=False,
filter_func=filter_func,
max_depth=max_depth,
)
for subdir in subdirs:
subdir_path = dir_path / subdir
subdir_hashes = get_dir_hashes(
subdir_path, filter_func=filter_func,
max_depth=0
subdir_path,
filter_func=filter_func,
max_depth=0,
)
hashes[subdir] = subdir_hashes['.']
hashes[subdir] = subdir_hashes["."]
# Filter results by max_depth
if max_depth >= 0:
hashes = {
path: value for path, value in hashes.items()
if len(Path(path).parts) <= max_depth + 1
}
hashes = {path: value for path, value in hashes.items() if len(Path(path).parts) <= max_depth + 1}
# Calculate root directory hash
hashable_summary.sort()
root_sha256 = hashlib.sha256('\n'.join(hashable_summary).encode()).hexdigest()
hashes['.'] = root_sha256
root_sha256 = hashlib.sha256("\n".join(hashable_summary).encode()).hexdigest()
hashes["."] = root_sha256
return hashes
@lru_cache(maxsize=128)
def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = True,
include_files: bool = True, include_dirs: bool = True, include_hidden: bool = False,
filter_func: Callable | None = None, max_depth: int = -1) -> tuple[str, ...]:
def get_dir_entries(
dir_path: Path,
pwd: Path | None = None,
recursive: bool = True,
include_files: bool = True,
include_dirs: bool = True,
include_hidden: bool = False,
filter_func: Callable | None = None,
max_depth: int = -1,
) -> tuple[str, ...]:
"""Get filtered list of directory entries."""
pwd = Path(pwd) if pwd else None
dir_path = Path(dir_path)
@@ -107,20 +122,20 @@ def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = T
results = []
def process_path(path: Path, depth: int):
if not include_hidden and path.name.startswith('.'):
if not include_hidden and path.name.startswith("."):
return False
if max_depth >= 0 and depth > max_depth:
return False
if filter_func:
info = {
"abspath": str(path.absolute()),
"relpath": str(path.relative_to(dir_path))
"relpath": str(path.relative_to(dir_path)),
}
if not filter_func(info):
return False
return True
for path in dir_path.rglob('*') if recursive else dir_path.glob('*'):
for path in dir_path.rglob("*") if recursive else dir_path.glob("*"):
current_depth = len(path.relative_to(dir_path).parts)
if path.is_file() and include_files and process_path(path, current_depth):
@@ -133,6 +148,7 @@ def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = T
return tuple(sorted(results)) # Make immutable for caching
@lru_cache(maxsize=1024)
def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str, int]:
"""Calculate sizes for all files and directories recursively."""
@@ -146,10 +162,10 @@ def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str
sizes[path_key] = full_path.stat().st_size
else:
total = 0
for file_path in full_path.rglob('*'):
if file_path.is_file() and not file_path.name.startswith('.'):
for file_path in full_path.rglob("*"):
if file_path.is_file() and not file_path.name.startswith("."):
total += file_path.stat().st_size
sizes[path_key + '/'] = total
sizes[path_key + "/"] = total
return sizes
@@ -165,23 +181,23 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
hashes = get_dir_hashes(dir_path, pwd=pwd, filter_func=filter_func, max_depth=max_depth)
sizes = get_dir_sizes(str(dir_path), pwd=pwd, filter_func=filter_func, max_depth=max_depth)
num_total_subpaths = sum(1 for name in hashes if name != '.')
num_total_subpaths = sum(1 for name in hashes if name != ".")
details = {}
for filename, sha256_hash in sorted(hashes.items()):
abs_path = (dir_path / filename).resolve()
stat_info = abs_path.stat()
num_subpaths = sum(1 for p in hashes if p.startswith(filename + '/'))
num_subpaths = sum(1 for p in hashes if p.startswith(filename + "/"))
is_dir = abs_path.is_dir()
if is_dir:
mime_type = 'inode/directory'
mime_type = "inode/directory"
basename = abs_path.name
extension = ''
num_bytes = sizes[filename + '/']
if filename == '.':
extension = ""
num_bytes = sizes[filename + "/"]
if filename == ".":
num_subpaths = num_total_subpaths
else:
filename += '/'
filename += "/"
num_subpaths = num_subpaths
else: # is_file
num_subpaths = None
@@ -191,14 +207,14 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
num_bytes = sizes[filename]
details[filename] = {
'basename': basename,
'mime_type': mime_type,
'extension': extension,
'num_subpaths': num_subpaths,
'num_bytes': num_bytes,
'hash_sha256': sha256_hash,
'created_at': datetime.fromtimestamp(stat_info.st_ctime).isoformat(),
'modified_at': datetime.fromtimestamp(stat_info.st_mtime).isoformat(),
"basename": basename,
"mime_type": mime_type,
"extension": extension,
"num_subpaths": num_subpaths,
"num_bytes": num_bytes,
"hash_sha256": sha256_hash,
"created_at": datetime.fromtimestamp(stat_info.st_ctime).isoformat(),
"modified_at": datetime.fromtimestamp(stat_info.st_mtime).isoformat(),
}
if filter_func and not filter_func(details[filename]):
@@ -207,12 +223,13 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
return details
if __name__ == '__main__':
if __name__ == "__main__":
import json
dir_info = get_dir_info(Path('.'), max_depth=6)
with open('.hashes.json', 'w') as f:
dir_info = get_dir_info(Path("."), max_depth=6)
with open(".hashes.json", "w") as f:
json.dump(dir_info, f, indent=4)
print('Wrote .hashes.json')
print("Wrote .hashes.json")
# Example output:
# {

View File

@@ -20,72 +20,73 @@ Plain URLs (also supported):
https://foo.com
"""
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
import sys
import json
import select
from typing import Iterable, Iterator, Dict, Any, Optional, TextIO
from typing import Any, TextIO
from collections.abc import Iterable, Iterator
from pathlib import Path
# Type constants for JSONL records
TYPE_SNAPSHOT = 'Snapshot'
TYPE_ARCHIVERESULT = 'ArchiveResult'
TYPE_TAG = 'Tag'
TYPE_CRAWL = 'Crawl'
TYPE_BINARY = 'Binary'
TYPE_PROCESS = 'Process'
TYPE_MACHINE = 'Machine'
TYPE_SNAPSHOT = "Snapshot"
TYPE_ARCHIVERESULT = "ArchiveResult"
TYPE_TAG = "Tag"
TYPE_CRAWL = "Crawl"
TYPE_BINARY = "Binary"
TYPE_PROCESS = "Process"
TYPE_MACHINE = "Machine"
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_BINARY, TYPE_PROCESS, TYPE_MACHINE}
def parse_line(line: str) -> Optional[Dict[str, Any]]:
def parse_line(line: str) -> dict[str, Any] | None:
"""
Parse a single line of input as either JSONL or plain URL.
Returns a dict with at minimum {'type': '...', 'url': '...'} or None if invalid.
"""
line = line.strip()
if not line or line.startswith('#'):
if not line or line.startswith("#"):
return None
# Try to parse as JSON first
if line.startswith('{'):
if line.startswith("{"):
try:
record = json.loads(line)
# If it has a type, validate it
if 'type' in record and record['type'] not in VALID_TYPES:
if "type" in record and record["type"] not in VALID_TYPES:
# Unknown type, treat as raw data
pass
# If it has url but no type, assume Snapshot
if 'url' in record and 'type' not in record:
record['type'] = TYPE_SNAPSHOT
if "url" in record and "type" not in record:
record["type"] = TYPE_SNAPSHOT
return record
except json.JSONDecodeError:
pass
# Treat as plain URL if it looks like one
if line.startswith('http://') or line.startswith('https://') or line.startswith('file://'):
return {'type': TYPE_SNAPSHOT, 'url': line}
if line.startswith("http://") or line.startswith("https://") or line.startswith("file://"):
return {"type": TYPE_SNAPSHOT, "url": line}
# Could be a snapshot ID (UUID with dashes or compact 32-char hex)
if len(line) == 36 and line.count('-') == 4:
return {'type': TYPE_SNAPSHOT, 'id': line}
if len(line) == 36 and line.count("-") == 4:
return {"type": TYPE_SNAPSHOT, "id": line}
if len(line) == 32:
try:
int(line, 16)
except ValueError:
pass
else:
return {'type': TYPE_SNAPSHOT, 'id': line}
return {"type": TYPE_SNAPSHOT, "id": line}
# Unknown format, skip
return None
def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
def read_stdin(stream: TextIO | None = None) -> Iterator[dict[str, Any]]:
"""
Read JSONL or plain URLs from stdin.
@@ -112,20 +113,20 @@ def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
yield record
def read_file(path: Path) -> Iterator[Dict[str, Any]]:
def read_file(path: Path) -> Iterator[dict[str, Any]]:
"""
Read JSONL or plain URLs from a file.
Yields parsed records as dicts.
"""
with open(path, 'r') as f:
with open(path) as f:
for line in f:
record = parse_line(line)
if record:
yield record
def read_args_or_stdin(args: Iterable[str], stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
def read_args_or_stdin(args: Iterable[str], stream: TextIO | None = None) -> Iterator[dict[str, Any]]:
"""
Read from CLI arguments if provided, otherwise from stdin.
@@ -145,16 +146,16 @@ def read_args_or_stdin(args: Iterable[str], stream: Optional[TextIO] = None) ->
yield from read_stdin(stream)
def write_record(record: Dict[str, Any], stream: Optional[TextIO] = None) -> None:
def write_record(record: dict[str, Any], stream: TextIO | None = None) -> None:
"""
Write a single JSONL record to stdout (or provided stream).
"""
active_stream: TextIO = sys.stdout if stream is None else stream
active_stream.write(json.dumps(record) + '\n')
active_stream.write(json.dumps(record) + "\n")
active_stream.flush()
def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] = None) -> int:
def write_records(records: Iterator[dict[str, Any]], stream: TextIO | None = None) -> int:
"""
Write multiple JSONL records to stdout (or provided stream).

View File

@@ -8,24 +8,26 @@ This is separate from the hooks-based parser system which handles importing
new URLs from bookmark files, RSS feeds, etc.
"""
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
import os
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Iterator, TypedDict, List
from typing import TypedDict
from collections.abc import Iterator
class SnapshotDict(TypedDict, total=False):
"""
Dictionary type representing a snapshot/link, compatible with Snapshot model fields.
"""
url: str # Required: the URL to archive
timestamp: str # Optional: unix timestamp string
title: str # Optional: page title
tags: str # Optional: comma-separated tags string
sources: List[str] # Optional: list of source file paths
url: str # Required: the URL to archive
timestamp: str # Optional: unix timestamp string
title: str # Optional: page title
tags: str # Optional: comma-separated tags string
sources: list[str] # Optional: list of source file paths
def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
@@ -41,16 +43,16 @@ def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
return
try:
with open(index_path, 'r', encoding='utf-8') as f:
with open(index_path, encoding="utf-8") as f:
data = json.load(f)
links = data.get('links', [])
links = data.get("links", [])
for link in links:
yield {
'url': link.get('url', ''),
'timestamp': link.get('timestamp', str(datetime.now(timezone.utc).timestamp())),
'title': link.get('title'),
'tags': link.get('tags', ''),
"url": link.get("url", ""),
"timestamp": link.get("timestamp", str(datetime.now(timezone.utc).timestamp())),
"title": link.get("title"),
"tags": link.get("tags", ""),
}
except (json.JSONDecodeError, KeyError, TypeError):
return
@@ -81,12 +83,12 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
if jsonl_file.exists():
try:
with open(jsonl_file, 'r', encoding='utf-8') as f:
with open(jsonl_file, encoding="utf-8") as f:
for line in f:
line = line.strip()
if line.startswith('{'):
if line.startswith("{"):
record = json.loads(line)
if record.get('type') == 'Snapshot':
if record.get("type") == "Snapshot":
link = record
break
except (json.JSONDecodeError, KeyError, TypeError):
@@ -94,15 +96,15 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
if link is None and json_file.exists():
try:
with open(json_file, 'r', encoding='utf-8') as f:
with open(json_file, encoding="utf-8") as f:
link = json.load(f)
except (json.JSONDecodeError, KeyError, TypeError):
pass
if link:
yield {
'url': link.get('url', ''),
'timestamp': link.get('timestamp', entry.name),
'title': link.get('title'),
'tags': link.get('tags', ''),
"url": link.get("url", ""),
"timestamp": link.get("timestamp", entry.name),
"title": link.get("title"),
"tags": link.get("tags", ""),
}

View File

@@ -1,10 +1,9 @@
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
# Low-level logging primitives (Rich console, ANSI colors, stdout/stderr helpers)
# Higher-level logging functions are in logging_util.py
import sys
from typing import Optional, Union, Tuple, List
from collections import defaultdict
from random import randint
@@ -19,11 +18,13 @@ CONSOLE = Console(width=32768, soft_wrap=True, force_terminal=True)
STDERR = Console(stderr=True, width=32768, soft_wrap=True, force_terminal=True)
IS_TTY = sys.stdout.isatty()
class RainbowHighlighter(Highlighter):
def highlight(self, text):
for index in range(len(text)):
text.stylize(f"color({randint(90, 98)})", index, index + 1)
rainbow = RainbowHighlighter()
@@ -38,49 +39,55 @@ DEFAULT_CLI_COLORS = benedict(
"blue": "\033[01;34m",
"white": "\033[01;37m",
"black": "\033[01;30m",
}
},
)
ANSI = benedict({k: "" for k in DEFAULT_CLI_COLORS.keys()})
COLOR_DICT = defaultdict(
lambda: [(0, 0, 0), (0, 0, 0)],
{
"00": [(0, 0, 0), (0, 0, 0)],
"30": [(0, 0, 0), (0, 0, 0)],
"31": [(255, 0, 0), (128, 0, 0)],
"32": [(0, 200, 0), (0, 128, 0)],
"33": [(255, 255, 0), (128, 128, 0)],
"34": [(0, 0, 255), (0, 0, 128)],
"35": [(255, 0, 255), (128, 0, 128)],
"36": [(0, 255, 255), (0, 128, 128)],
"37": [(255, 255, 255), (255, 255, 255)],
},
)
ANSI = benedict({k: '' for k in DEFAULT_CLI_COLORS.keys()})
COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
'00': [(0, 0, 0), (0, 0, 0)],
'30': [(0, 0, 0), (0, 0, 0)],
'31': [(255, 0, 0), (128, 0, 0)],
'32': [(0, 200, 0), (0, 128, 0)],
'33': [(255, 255, 0), (128, 128, 0)],
'34': [(0, 0, 255), (0, 0, 128)],
'35': [(255, 0, 255), (128, 0, 128)],
'36': [(0, 255, 255), (0, 128, 128)],
'37': [(255, 255, 255), (255, 255, 255)],
})
# Logging Helpers (DEPRECATED, use rich.print instead going forward)
def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
def stdout(*args, color: str | None = None, prefix: str = "", config: benedict | None = None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI
if color:
strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
strs = [ansi[color], " ".join(str(a) for a in args), ansi["reset"], "\n"]
else:
strs = [' '.join(str(a) for a in args), '\n']
strs = [" ".join(str(a) for a in args), "\n"]
sys.stdout.write(prefix + ''.join(strs))
sys.stdout.write(prefix + "".join(strs))
def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
def stderr(*args, color: str | None = None, prefix: str = "", config: benedict | None = None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI
if color:
strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
strs = [ansi[color], " ".join(str(a) for a in args), ansi["reset"], "\n"]
else:
strs = [' '.join(str(a) for a in args), '\n']
strs = [" ".join(str(a) for a in args), "\n"]
sys.stderr.write(prefix + ''.join(strs))
sys.stderr.write(prefix + "".join(strs))
def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[benedict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
def hint(text: tuple[str, ...] | list[str] | str, prefix=" ", config: benedict | None = None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI
if isinstance(text, str):
stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text}")
else:
stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text[0]}")
for line in text[1:]:
stderr(f'{prefix} {line}')
stderr(f"{prefix} {line}")

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox'
__package__ = "archivebox"
# High-level logging functions for CLI output and progress tracking
# Low-level primitives (Rich console, ANSI colors) are in logging.py
@@ -14,7 +14,8 @@ from pathlib import Path
from datetime import datetime, timezone
from dataclasses import dataclass
from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING, cast
from typing import Any, Optional, IO, TYPE_CHECKING, cast
from collections.abc import Iterable
if TYPE_CHECKING:
from archivebox.core.models import Snapshot
@@ -28,6 +29,7 @@ from archivebox.misc.system import get_dir_size
from archivebox.misc.util import enforce_types
from archivebox.misc.logging import ANSI
@dataclass
class RuntimeStats:
"""mutable stats counter for logging archiving timing info to CLI output"""
@@ -36,14 +38,15 @@ class RuntimeStats:
succeeded: int = 0
failed: int = 0
parse_start_ts: Optional[datetime] = None
parse_end_ts: Optional[datetime] = None
parse_start_ts: datetime | None = None
parse_end_ts: datetime | None = None
index_start_ts: Optional[datetime] = None
index_end_ts: Optional[datetime] = None
index_start_ts: datetime | None = None
index_end_ts: datetime | None = None
archiving_start_ts: datetime | None = None
archiving_end_ts: datetime | None = None
archiving_start_ts: Optional[datetime] = None
archiving_end_ts: Optional[datetime] = None
# globals are bad, mmkay
_LAST_RUN_STATS = RuntimeStats()
@@ -52,49 +55,47 @@ _LAST_RUN_STATS = RuntimeStats()
class TimedProgress:
"""Show a progress bar and measure elapsed time until .end() is called"""
def __init__(self, seconds, prefix=''):
def __init__(self, seconds, prefix=""):
self.SHOW_PROGRESS = SHELL_CONFIG.SHOW_PROGRESS
self.ANSI = SHELL_CONFIG.ANSI
if self.SHOW_PROGRESS:
self.p = Process(target=progress_bar, args=(seconds, prefix, self.ANSI))
self.p.start()
self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None}
self.stats = {"start_ts": datetime.now(timezone.utc), "end_ts": None}
def end(self):
"""immediately end progress, clear the progressbar line, and save end_ts"""
end_ts = datetime.now(timezone.utc)
self.stats['end_ts'] = end_ts
self.stats["end_ts"] = end_ts
if self.SHOW_PROGRESS:
# terminate if we havent already terminated
try:
# kill the progress bar subprocess
try:
self.p.close() # must be closed *before* its terminnated
self.p.close() # must be closed *before* its terminnated
except (KeyboardInterrupt, SystemExit):
print()
raise
except BaseException: # lgtm [py/catch-base-exception]
except BaseException: # lgtm [py/catch-base-exception]
pass
self.p.terminate()
time.sleep(0.1)
# sometimes the timer doesn't terminate properly, then blocks at the join until
# the full time has elapsed. sending a kill tries to avoid that.
try:
self.p.kill()
self.p.kill()
except Exception:
pass
# clear whole terminal line
try:
sys.stdout.write('\r{}{}\r'.format((' ' * SHELL_CONFIG.TERM_WIDTH), self.ANSI['reset']))
except (IOError, BrokenPipeError):
sys.stdout.write("\r{}{}\r".format((" " * SHELL_CONFIG.TERM_WIDTH), self.ANSI["reset"]))
except (OSError, BrokenPipeError):
# ignore when the parent proc has stopped listening to our stdout
pass
except ValueError:
@@ -102,10 +103,10 @@ class TimedProgress:
@enforce_types
def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> None:
def progress_bar(seconds: int, prefix: str = "", ANSI: dict[str, str] = ANSI) -> None:
"""show timer in the form of progress bar, with percentage and seconds remaining"""
output_buf = (sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__)
chunk = '' if output_buf and output_buf.encoding.upper() == 'UTF-8' else '#'
output_buf = sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__
chunk = "" if output_buf and output_buf.encoding.upper() == "UTF-8" else "#"
last_width = SHELL_CONFIG.TERM_WIDTH
chunks = last_width - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
try:
@@ -114,37 +115,41 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non
if max_width < last_width:
# when the terminal size is shrunk, we have to write a newline
# otherwise the progress bar will keep wrapping incorrectly
sys.stdout.write('\r\n')
sys.stdout.write("\r\n")
sys.stdout.flush()
chunks = max_width - len(prefix) - 20
pct_complete = s / chunks / seconds * 100
log_pct = (log(pct_complete or 1, 10) / 2) * 100 # everyone likes faster progress bars ;)
bar_width = round(log_pct/(100/chunks))
bar_width = round(log_pct / (100 / chunks))
last_width = max_width
# ████████████████████ 0.9% (1/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
prefix,
ANSI['green' if pct_complete < 80 else 'lightyellow'],
(chunk * bar_width).ljust(chunks),
ANSI['reset'],
round(pct_complete, 1),
round(s/chunks),
seconds,
))
sys.stdout.write(
"\r{}{}{}{} {}% ({}/{}sec)".format(
prefix,
ANSI["green" if pct_complete < 80 else "lightyellow"],
(chunk * bar_width).ljust(chunks),
ANSI["reset"],
round(pct_complete, 1),
round(s / chunks),
seconds,
),
)
sys.stdout.flush()
time.sleep(1 / chunks)
# ██████████████████████████████████ 100.0% (60/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
prefix,
ANSI['red'],
chunk * chunks,
ANSI['reset'],
100.0,
seconds,
seconds,
))
sys.stdout.write(
"\r{}{}{}{} {}% ({}/{}sec)".format(
prefix,
ANSI["red"],
chunk * chunks,
ANSI["reset"],
100.0,
seconds,
seconds,
),
)
sys.stdout.flush()
# uncomment to have it disappear when it hits 100% instead of staying full red:
# time.sleep(0.5)
@@ -154,10 +159,10 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non
print()
def log_cli_command(subcommand: str, subcommand_args: Iterable[str]=(), stdin: str | IO | None=None, pwd: str='.'):
args = ' '.join(subcommand_args)
version_msg = '[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
def log_cli_command(subcommand: str, subcommand_args: Iterable[str] = (), stdin: str | IO | None = None, pwd: str = "."):
args = " ".join(subcommand_args)
version_msg = "[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]".format(
now=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
VERSION=VERSION,
subcommand=subcommand,
args=args,
@@ -166,44 +171,54 @@ def log_cli_command(subcommand: str, subcommand_args: Iterable[str]=(), stdin: s
# stderr('[bright_black] > {pwd}[/]'.format(pwd=pwd, **ANSI))
# stderr()
print(Panel(version_msg), file=sys.stderr)
### Parsing Stage
def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
def log_importing_started(urls: str | list[str], depth: int, index_only: bool):
_LAST_RUN_STATS.parse_start_ts = datetime.now(timezone.utc)
print('[green][+] [{}] Adding {} links to index (crawl depth={}){}...[/]'.format(
_LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'),
len(urls) if isinstance(urls, list) else len(urls.split('\n')),
depth,
' (index only)' if index_only else '',
))
print(
"[green][+] [{}] Adding {} links to index (crawl depth={}){}...[/]".format(
_LAST_RUN_STATS.parse_start_ts.strftime("%Y-%m-%d %H:%M:%S"),
len(urls) if isinstance(urls, list) else len(urls.split("\n")),
depth,
" (index only)" if index_only else "",
),
)
def log_source_saved(source_file: str):
print(' > Saved verbatim input to {}/{}'.format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
print(" > Saved verbatim input to {}/{}".format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit("/", 1)[-1]))
def log_parsing_finished(num_parsed: int, parser_name: str):
_LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc)
print(' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name))
print(f" > Parsed {num_parsed} URLs from input ({parser_name})")
def log_deduping_finished(num_new_links: int):
print(' > Found {} new URLs not already in index'.format(num_new_links))
print(f" > Found {num_new_links} new URLs not already in index")
def log_crawl_started(new_links):
print()
print(f'[green][*] Starting crawl of {len(new_links)} sites 1 hop out from starting point[/]')
print(f"[green][*] Starting crawl of {len(new_links)} sites 1 hop out from starting point[/]")
### Indexing Stage
def log_indexing_process_started(num_links: int):
start_ts = datetime.now(timezone.utc)
_LAST_RUN_STATS.index_start_ts = start_ts
print()
print('[bright_black][*] [{}] Writing {} links to main index...[/]'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
))
print(
"[bright_black][*] [{}] Writing {} links to main index...[/]".format(
start_ts.strftime("%Y-%m-%d %H:%M:%S"),
num_links,
),
)
def log_indexing_process_finished():
@@ -213,46 +228,55 @@ def log_indexing_process_finished():
def log_indexing_started(out_path: str):
if SHELL_CONFIG.IS_TTY:
sys.stdout.write(f' > ./{Path(out_path).relative_to(DATA_DIR)}')
sys.stdout.write(f" > ./{Path(out_path).relative_to(DATA_DIR)}")
def log_indexing_finished(out_path: str):
print(f'\r √ ./{Path(out_path).relative_to(DATA_DIR)}')
print(f"\r √ ./{Path(out_path).relative_to(DATA_DIR)}")
### Archiving Stage
def log_archiving_started(num_links: int, resume: Optional[float]=None):
def log_archiving_started(num_links: int, resume: float | None = None):
start_ts = datetime.now(timezone.utc)
_LAST_RUN_STATS.archiving_start_ts = start_ts
print()
if resume:
print('[green][▶] [{}] Resuming archive updating for {} pages starting from {}...[/]'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
resume,
))
print(
"[green][▶] [{}] Resuming archive updating for {} pages starting from {}...[/]".format(
start_ts.strftime("%Y-%m-%d %H:%M:%S"),
num_links,
resume,
),
)
else:
print('[green][▶] [{}] Starting archiving of {} snapshots in index...[/]'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
))
print(
"[green][▶] [{}] Starting archiving of {} snapshots in index...[/]".format(
start_ts.strftime("%Y-%m-%d %H:%M:%S"),
num_links,
),
)
def log_archiving_paused(num_links: int, idx: int, timestamp: str):
end_ts = datetime.now(timezone.utc)
_LAST_RUN_STATS.archiving_end_ts = end_ts
print()
print('\n[yellow3][X] [{now}] Downloading paused on link {timestamp} ({idx}/{total})[/]'.format(
now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
idx=idx+1,
timestamp=timestamp,
total=num_links,
))
print(
"\n[yellow3][X] [{now}] Downloading paused on link {timestamp} ({idx}/{total})[/]".format(
now=end_ts.strftime("%Y-%m-%d %H:%M:%S"),
idx=idx + 1,
timestamp=timestamp,
total=num_links,
),
)
print()
print(' Continue archiving where you left off by running:')
print(' archivebox update --resume={}'.format(timestamp))
print(" Continue archiving where you left off by running:")
print(f" archivebox update --resume={timestamp}")
def log_archiving_finished(num_links: int):
@@ -263,24 +287,26 @@ def log_archiving_finished(num_links: int):
assert _LAST_RUN_STATS.archiving_start_ts is not None
seconds = end_ts.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp()
if seconds > 60:
duration = '{0:.2f} min'.format(seconds / 60)
duration = f"{seconds / 60:.2f} min"
else:
duration = '{0:.2f} sec'.format(seconds)
duration = f"{seconds:.2f} sec"
print()
print('[green][√] [{}] Update of {} pages complete ({})[/]'.format(
end_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
duration,
))
print(' - {} links skipped'.format(_LAST_RUN_STATS.skipped))
print(' - {} links updated'.format(_LAST_RUN_STATS.succeeded + _LAST_RUN_STATS.failed))
print(' - {} links had errors'.format(_LAST_RUN_STATS.failed))
print(
"[green][√] [{}] Update of {} pages complete ({})[/]".format(
end_ts.strftime("%Y-%m-%d %H:%M:%S"),
num_links,
duration,
),
)
print(f" - {_LAST_RUN_STATS.skipped} links skipped")
print(f" - {_LAST_RUN_STATS.succeeded + _LAST_RUN_STATS.failed} links updated")
print(f" - {_LAST_RUN_STATS.failed} links had errors")
if Snapshot.objects.count() < 50:
print()
print(' [violet]Hint:[/] To manage your archive in a Web UI, run:')
print(' archivebox server 0.0.0.0:8000')
print(" [violet]Hint:[/] To manage your archive in a Web UI, run:")
print(" archivebox server 0.0.0.0:8000")
def log_snapshot_archiving_started(snapshot: "Snapshot", out_dir: str, is_new: bool):
@@ -289,41 +315,51 @@ def log_snapshot_archiving_started(snapshot: "Snapshot", out_dir: str, is_new: b
# http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
# > output/archive/1478739709
print('\n[[{symbol_color}]{symbol}[/]] [[{symbol_color}]{now}[/]] "{title}"'.format(
symbol_color='green' if is_new else 'bright_black',
symbol='+' if is_new else '',
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
title=snapshot.title or snapshot.base_url,
))
print(f' [sky_blue1]{snapshot.url}[/]')
print(' {} {}'.format(
'>' if is_new else '',
pretty_path(out_dir),
))
print(
'\n[[{symbol_color}]{symbol}[/]] [[{symbol_color}]{now}[/]] "{title}"'.format(
symbol_color="green" if is_new else "bright_black",
symbol="+" if is_new else "",
now=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
title=snapshot.title or snapshot.base_url,
),
)
print(f" [sky_blue1]{snapshot.url}[/]")
print(
" {} {}".format(
">" if is_new else "",
pretty_path(out_dir),
),
)
def log_snapshot_archiving_finished(snapshot: "Snapshot", out_dir: str, is_new: bool, stats: dict, start_ts: datetime):
total = sum(stats.values())
if stats['failed'] > 0 :
if stats["failed"] > 0:
_LAST_RUN_STATS.failed += 1
elif stats['skipped'] == total:
elif stats["skipped"] == total:
_LAST_RUN_STATS.skipped += 1
else:
_LAST_RUN_STATS.succeeded += 1
try:
size = get_dir_size(out_dir)
except FileNotFoundError:
size = (0, None, '0')
results = snapshot.archiveresult_set.only("output_files", "output_size")
total_bytes = sum(result.output_size or result.output_size_from_files() for result in results)
total_files = sum(result.output_file_count() for result in results)
size = (total_bytes, 0, total_files)
except Exception:
try:
size = get_dir_size(out_dir)
except FileNotFoundError:
size = (0, None, "0")
end_ts = datetime.now(timezone.utc)
duration = str(end_ts - start_ts).split('.')[0]
print(' [bright_black]{} files ({}) in {}s [/]'.format(size[2], printable_filesize(size[0]), duration))
duration = str(end_ts - start_ts).split(".")[0]
print(f" [bright_black]{size[2]} files ({printable_filesize(size[0])}) in {duration}s [/]")
def log_archive_method_started(method: str):
print(' > {}'.format(method))
print(f" > {method}")
def log_archive_method_finished(result: dict):
@@ -332,122 +368,117 @@ def log_archive_method_finished(result: dict):
copy-paste the outputted string directly to run the cmd
"""
# Prettify CMD string and make it safe to copy-paste by quoting arguments
quoted_cmd = ' '.join(
'"{}"'.format(arg) if (' ' in arg) or (':' in arg) else arg
for arg in result['cmd']
)
quoted_cmd = " ".join(f'"{arg}"' if (" " in arg) or (":" in arg) else arg for arg in result["cmd"])
if result['status'] == 'failed':
output = result.get('output')
if output and output.__class__.__name__ == 'TimeoutExpired':
duration = (result['end_ts'] - result['start_ts']).seconds
if result["status"] == "failed":
output = result.get("output")
if output and output.__class__.__name__ == "TimeoutExpired":
duration = (result["end_ts"] - result["start_ts"]).seconds
hint_header = [
f'[yellow3]Extractor timed out after {duration}s.[/]',
f"[yellow3]Extractor timed out after {duration}s.[/]",
]
else:
error_name = output.__class__.__name__.replace('ArchiveError', '') if output else 'Error'
error_name = output.__class__.__name__.replace("ArchiveError", "") if output else "Error"
hint_header = [
'[yellow3]Extractor failed:[/]',
f' {error_name} [red1]{output}[/]',
"[yellow3]Extractor failed:[/]",
f" {error_name} [red1]{output}[/]",
]
# Prettify error output hints string and limit to five lines
hints = getattr(output, 'hints', None) or () if output else ()
hints = getattr(output, "hints", None) or () if output else ()
if hints:
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
else:
if isinstance(hints, bytes):
hints = hints.decode()
hints = hints.split('\n')
hints = hints.split("\n")
hints = (
f' [yellow1]{line.strip()}[/]'
for line in list(hints)[:5] if line.strip()
)
hints = (f" [yellow1]{line.strip()}[/]" for line in list(hints)[:5] if line.strip())
docker_hints = ()
if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
docker_hints = (
' docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash',
)
if os.environ.get("IN_DOCKER") in ("1", "true", "True", "TRUE", "yes"):
docker_hints = (" docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash",)
# Collect and prefix output lines with indentation
output_lines = [
*hint_header,
*hints,
'[violet]Run to see full output:[/]',
"[violet]Run to see full output:[/]",
*docker_hints,
*([' cd {};'.format(result.get('pwd'))] if result.get('pwd') else []),
' {}'.format(quoted_cmd),
*([" cd {};".format(result.get("pwd"))] if result.get("pwd") else []),
f" {quoted_cmd}",
]
print('\n'.join(
' {}'.format(line)
for line in output_lines
if line
))
print(
"\n".join(f" {line}" for line in output_lines if line),
)
print()
def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
print(f'[green][*] Finding links in the archive index matching these {filter_type} patterns:[/]')
print(' {}'.format(' '.join(filter_patterns or ())))
def log_list_started(filter_patterns: list[str] | None, filter_type: str):
print(f"[green][*] Finding links in the archive index matching these {filter_type} patterns:[/]")
print(" {}".format(" ".join(filter_patterns or ())))
def log_list_finished(snapshots):
from archivebox.core.models import Snapshot
print()
print('---------------------------------------------------------------------------------------------------')
print("---------------------------------------------------------------------------------------------------")
csv_queryset = cast(Any, Snapshot.objects.filter(pk__in=[s.pk for s in snapshots]))
print(csv_queryset.to_csv(cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print('---------------------------------------------------------------------------------------------------')
print(csv_queryset.to_csv(cols=["timestamp", "is_archived", "num_outputs", "url"], header=True, ljust=16, separator=" | "))
print("---------------------------------------------------------------------------------------------------")
print()
def log_removal_started(snapshots, yes: bool, delete: bool):
count = snapshots.count() if hasattr(snapshots, 'count') else len(snapshots)
print(f'[yellow3][i] Found {count} matching URLs to remove.[/]')
count = snapshots.count() if hasattr(snapshots, "count") else len(snapshots)
print(f"[yellow3][i] Found {count} matching URLs to remove.[/]")
if delete:
file_counts = [s.num_outputs for s in snapshots if os.access(s.output_dir, os.R_OK)]
print(
f' {count} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
f" {count} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n"
f" ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)",
)
else:
print(
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
' (Pass --delete if you also want to permanently delete the data folders)'
" Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n"
" (Pass --delete if you also want to permanently delete the data folders)",
)
if not yes:
print()
print(f'[yellow3][?] Do you want to proceed with removing these {count} links?[/]')
print(f"[yellow3][?] Do you want to proceed with removing these {count} links?[/]")
try:
assert input(' y/[n]: ').lower() == 'y'
assert input(" y/[n]: ").lower() == "y"
except (KeyboardInterrupt, EOFError, AssertionError):
raise SystemExit(0)
def log_removal_finished(remaining_links: int, removed_links: int):
if remaining_links == 0 and removed_links == 0:
print()
print('[red1][X] No matching links found.[/]')
print("[red1][X] No matching links found.[/]")
else:
total_before = remaining_links + removed_links
print()
print(f'[red1][√] Removed {removed_links} out of {total_before} links from the archive index.[/]')
print(f' Index now contains {remaining_links} links.')
print(f"[red1][√] Removed {removed_links} out of {total_before} links from the archive index.[/]")
print(f" Index now contains {remaining_links} links.")
### Search Indexing Stage
def log_index_started(url: str):
    """Announce that *url* is being submitted to the search index.

    Fix: the block contained both the pre-image ``.format()`` call and the
    post-image f-string from a merged diff, printing the message twice;
    only the single f-string print is kept.
    """
    print(f"[green][*] Indexing url: {url} in the search index[/]")
    print()
### Helpers
@enforce_types
def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: bool=True) -> str:
def pretty_path(path: Path | str, pwd: Path | str = DATA_DIR, color: bool = True) -> str:
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
pwd = str(Path(pwd)) # .resolve()
path = str(path)
@@ -456,46 +487,46 @@ def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: b
return path
# replace long absolute paths with ./ relative ones to save on terminal output width
if path.startswith(pwd) and (pwd != '/') and path != pwd:
if path.startswith(pwd) and (pwd != "/") and path != pwd:
if color:
path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1)
path = path.replace(pwd, "[light_slate_blue].[/light_slate_blue]", 1)
else:
path = path.replace(pwd, '.', 1)
path = path.replace(pwd, ".", 1)
# quote paths containing spaces
if ' ' in path:
if " " in path:
path = f'"{path}"'
# replace home directory with ~ for shorter output
path = path.replace(str(Path('~').expanduser()), '~')
path = path.replace(str(Path("~").expanduser()), "~")
return path
@enforce_types
def printable_filesize(num_bytes: Union[int, float]) -> str:
for count in ['Bytes','KB','MB','GB']:
def printable_filesize(num_bytes: int | float) -> str:
for count in ["Bytes", "KB", "MB", "GB"]:
if num_bytes > -1024.0 and num_bytes < 1024.0:
return '%3.1f %s' % (num_bytes, count)
return f"{num_bytes:3.1f} {count}"
num_bytes /= 1024.0
return '%3.1f %s' % (num_bytes, 'TB')
return "{:3.1f} {}".format(num_bytes, "TB")
@enforce_types
def format_duration(seconds: float) -> str:
"""Format duration in human-readable form."""
if seconds < 1:
return f'{seconds*1000:.0f}ms'
return f"{seconds * 1000:.0f}ms"
elif seconds < 60:
return f'{seconds:.1f}s'
return f"{seconds:.1f}s"
elif seconds < 3600:
minutes = int(seconds // 60)
secs = int(seconds % 60)
return f'{minutes}min {secs}s' if secs else f'{minutes}min'
return f"{minutes}min {secs}s" if secs else f"{minutes}min"
else:
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
return f'{hours}hr {minutes}min' if minutes else f'{hours}hr'
return f"{hours}hr {minutes}min" if minutes else f"{hours}hr"
@enforce_types
@@ -504,15 +535,15 @@ def truncate_url(url: str, max_length: int = 60) -> str:
if len(url) <= max_length:
return url
# Try to keep the domain and beginning of path
if '://' in url:
protocol, rest = url.split('://', 1)
if '/' in rest:
domain, path = rest.split('/', 1)
if "://" in url:
protocol, rest = url.split("://", 1)
if "/" in rest:
domain, path = rest.split("/", 1)
available = max_length - len(protocol) - len(domain) - 6 # for "://", "/", "..."
if available > 10:
return f'{protocol}://{domain}/{path[:available]}...'
return f"{protocol}://{domain}/{path[:available]}..."
# Fallback: just truncate
return url[:max_length-3] + '...'
return url[: max_length - 3] + "..."
@enforce_types
@@ -520,12 +551,12 @@ def log_worker_event(
worker_type: str,
event: str,
indent_level: int = 0,
pid: Optional[int] = None,
worker_id: Optional[str] = None,
url: Optional[str] = None,
plugin: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
error: Optional[Exception] = None,
pid: int | None = None,
worker_id: str | None = None,
url: str | None = None,
plugin: str | None = None,
metadata: dict[str, Any] | None = None,
error: Exception | None = None,
) -> None:
"""
Log a worker event with structured metadata and indentation.
@@ -541,17 +572,17 @@ def log_worker_event(
metadata: Dict of metadata to show in curly braces
error: Exception if event is an error
"""
indent = ' ' * indent_level
indent = " " * indent_level
from rich.markup import escape
# Build worker identifier (without URL/plugin)
worker_parts = [worker_type]
# Don't add pid/worker_id for DB operations (they happen in whatever process is running)
if pid and worker_type != 'DB':
worker_parts.append(f'pid={pid}')
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB':
worker_parts.append(f'id={worker_id}')
if pid and worker_type != "DB":
worker_parts.append(f"pid={pid}")
if worker_id and worker_type in ("CrawlWorker", "Orchestrator") and worker_type != "DB":
worker_parts.append(f"id={worker_id}")
# Build worker label parts for brackets (shown inside brackets)
worker_label_base = worker_parts[0]
@@ -560,53 +591,53 @@ def log_worker_event(
# Build URL/plugin display (shown AFTER the label, outside brackets)
url_extractor_parts = []
if url:
url_extractor_parts.append(f'url: {escape(url)}')
url_extractor_parts.append(f"url: {escape(url)}")
if plugin:
url_extractor_parts.append(f'extractor: {escape(plugin)}')
url_extractor_parts.append(f"extractor: {escape(plugin)}")
url_extractor_str = ' | '.join(url_extractor_parts) if url_extractor_parts else ''
url_extractor_str = " | ".join(url_extractor_parts) if url_extractor_parts else ""
# Build metadata string
metadata_str = ''
metadata_str = ""
if metadata:
# Format metadata nicely
meta_parts = []
for k, v in metadata.items():
if isinstance(v, float):
# Format floats nicely (durations, sizes)
if 'duration' in k.lower():
meta_parts.append(f'{k}: {format_duration(v)}')
elif 'size' in k.lower():
meta_parts.append(f'{k}: {printable_filesize(int(v))}')
if "duration" in k.lower():
meta_parts.append(f"{k}: {format_duration(v)}")
elif "size" in k.lower():
meta_parts.append(f"{k}: {printable_filesize(int(v))}")
else:
meta_parts.append(f'{k}: {v:.2f}')
meta_parts.append(f"{k}: {v:.2f}")
elif isinstance(v, int):
# Format integers - check if it's a size
if 'size' in k.lower() or 'bytes' in k.lower():
meta_parts.append(f'{k}: {printable_filesize(v)}')
if "size" in k.lower() or "bytes" in k.lower():
meta_parts.append(f"{k}: {printable_filesize(v)}")
else:
meta_parts.append(f'{k}: {v}')
meta_parts.append(f"{k}: {v}")
elif isinstance(v, (list, tuple)):
meta_parts.append(f'{k}: {len(v)}')
meta_parts.append(f"{k}: {len(v)}")
else:
meta_parts.append(f'{k}: {v}')
metadata_str = ' | '.join(meta_parts)
meta_parts.append(f"{k}: {v}")
metadata_str = " | ".join(meta_parts)
# Determine color based on event
color = 'white'
if event in ('Starting...', 'Started', 'STARTED', 'Started in background'):
color = 'green'
elif event.startswith('Created'):
color = 'cyan' # DB creation events
elif event in ('Completed', 'COMPLETED', 'All work complete'):
color = 'blue'
elif event in ('Failed', 'ERROR', 'Failed to spawn worker'):
color = 'red'
elif event in ('Shutting down', 'SHUTDOWN'):
color = 'grey53'
color = "white"
if event in ("Starting...", "Started", "STARTED", "Started in background"):
color = "green"
elif event.startswith("Created"):
color = "cyan" # DB creation events
elif event in ("Completed", "COMPLETED", "All work complete"):
color = "blue"
elif event in ("Failed", "ERROR", "Failed to spawn worker"):
color = "red"
elif event in ("Shutting down", "SHUTDOWN"):
color = "grey53"
# Build final message
error_str = f' {type(error).__name__}: {error}' if error else ''
error_str = f" {type(error).__name__}: {error}" if error else ""
from archivebox.misc.logging import CONSOLE, STDERR
from rich.text import Text
@@ -618,19 +649,19 @@ def log_worker_event(
# Add bracketed content if present (using Text.append to avoid markup issues)
if worker_bracket_content:
text.append('[', style=color)
text.append("[", style=color)
text.append(worker_bracket_content, style=color)
text.append(']', style=color)
text.append("]", style=color)
text.append(f' {event}{error_str}', style=color)
text.append(f" {event}{error_str}", style=color)
# Add URL/plugin info first (more important)
if url_extractor_str:
text.append(f' | {url_extractor_str}')
text.append(f" | {url_extractor_str}")
# Then add other metadata
if metadata_str:
text.append(f' | {metadata_str}')
text.append(f" | {metadata_str}")
# Stdout is reserved for JSONL records whenever commands are piped together.
# Route worker/DB progress to stderr in non-TTY contexts so pipelines like
@@ -640,90 +671,85 @@ def log_worker_event(
@enforce_types
def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str:
return '\n'.join(
f'{folder} {snapshot and snapshot.url} "{snapshot and snapshot.title}"'
for folder, snapshot in folders.items()
)
@enforce_types
def printable_config(config: dict, prefix: str='') -> str:
return f'\n{prefix}'.join(
f'{key}={val}'
for key, val in config.items()
if not (isinstance(val, dict) or callable(val))
)
def printable_folders(folders: dict[str, Optional["Snapshot"]], with_headers: bool = False) -> str:
return "\n".join(f'{folder} {snapshot and snapshot.url} "{snapshot and snapshot.title}"' for folder, snapshot in folders.items())
@enforce_types
def printable_folder_status(name: str, folder: Dict) -> str:
if folder['enabled']:
if folder['is_valid']:
color, symbol, note, num_files = 'green', '', 'valid', ''
def printable_config(config: dict, prefix: str = "") -> str:
return f"\n{prefix}".join(f"{key}={val}" for key, val in config.items() if not (isinstance(val, dict) or callable(val)))
@enforce_types
def printable_folder_status(name: str, folder: dict) -> str:
if folder["enabled"]:
if folder["is_valid"]:
color, symbol, note, num_files = "green", "", "valid", ""
else:
color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
color, symbol, note, num_files = "red", "X", "invalid", "?"
else:
color, symbol, note, num_files = 'grey53', '-', 'unused', '-'
color, symbol, note, num_files = "grey53", "-", "unused", "-"
if folder['path']:
if os.access(folder['path'], os.R_OK):
if folder["path"]:
if os.access(folder["path"], os.R_OK):
try:
num_files = (
f'{len(os.listdir(folder["path"]))} files'
if os.path.isdir(folder['path']) else
printable_filesize(Path(folder['path']).stat().st_size)
f"{len(os.listdir(folder['path']))} files"
if os.path.isdir(folder["path"])
else printable_filesize(Path(folder["path"]).stat().st_size)
)
except PermissionError:
num_files = 'error'
num_files = "error"
else:
num_files = 'missing'
if folder.get('is_mount'):
num_files = "missing"
if folder.get("is_mount"):
# add symbol @ next to filecount if path is a remote filesystem mount
num_files = f'{num_files} @' if num_files else '@'
num_files = f"{num_files} @" if num_files else "@"
path = pretty_path(folder['path'])
path = pretty_path(folder["path"])
return ' '.join((
f'[{color}]',
symbol,
'[/]',
name.ljust(21).replace('DATA_DIR', '[light_slate_blue]DATA_DIR[/light_slate_blue]'),
num_files.ljust(14).replace('missing', '[grey53]missing[/grey53]'),
f'[{color}]',
note.ljust(8),
'[/]',
path.ljust(76),
))
return " ".join(
(
f"[{color}]",
symbol,
"[/]",
name.ljust(21).replace("DATA_DIR", "[light_slate_blue]DATA_DIR[/light_slate_blue]"),
num_files.ljust(14).replace("missing", "[grey53]missing[/grey53]"),
f"[{color}]",
note.ljust(8),
"[/]",
path.ljust(76),
),
)
@enforce_types
def printable_dependency_version(name: str, dependency: Dict) -> str:
color, symbol, note, version = 'red', 'X', 'invalid', '?'
def printable_dependency_version(name: str, dependency: dict) -> str:
color, symbol, note, version = "red", "X", "invalid", "?"
if dependency['enabled']:
if dependency['is_valid']:
color, symbol, note = 'green', '', 'valid'
if dependency["enabled"]:
if dependency["is_valid"]:
color, symbol, note = "green", "", "valid"
parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
parsed_version_num = re.search(r"[\d\.]+", dependency["version"])
if parsed_version_num:
version = f'v{parsed_version_num[0]}'
version = f"v{parsed_version_num[0]}"
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
color, symbol, note, version = "lightyellow", "-", "disabled", "-"
path = pretty_path(dependency['path'])
path = pretty_path(dependency["path"])
return ' '.join((
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(21),
version.ljust(14),
ANSI[color],
note.ljust(8),
ANSI['reset'],
path.ljust(76),
))
return " ".join(
(
ANSI[color],
symbol,
ANSI["reset"],
name.ljust(21),
version.ljust(14),
ANSI[color],
note.ljust(8),
ANSI["reset"],
path.ljust(76),
),
)

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox'
__package__ = "archivebox"
import datetime
@@ -13,7 +13,7 @@ django_stubs_ext.monkeypatch()
# monkey patch django timezone to add back utc (it was removed in Django 5.0)
setattr(timezone, 'utc', datetime.timezone.utc)
setattr(timezone, "utc", datetime.UTC)
# monkey patch django-signals-webhooks to change how it shows up in Admin UI
# from signal_webhooks.apps import DjangoSignalWebhooksConfig
@@ -28,28 +28,29 @@ setattr(timezone, 'utc', datetime.timezone.utc)
# Hide site-packages/sonic/client.py:115: SyntaxWarning
# https://github.com/xmonader/python-sonic-client/pull/18
warnings.filterwarnings("ignore", category=SyntaxWarning, module='sonic')
warnings.filterwarnings("ignore", category=SyntaxWarning, module="sonic")
# Make daphne log requests quieter and esier to read
# Make daphne log requests quieter and easier to read
class ModifiedAccessLogGenerator(access.AccessLogGenerator):
"""Clutge workaround until daphne uses the Python logging framework. https://github.com/django/daphne/pull/473/files"""
def write_entry(self, host, date, request, status=None, length=None, ident=None, user=None):
# Ignore noisy requests to staticfiles / favicons / etc.
if 'GET /static/' in request:
if "GET /static/" in request:
return
if "GET /health/" in request:
return
if 'GET /admin/jsi18n/' in request:
if "GET /admin/jsi18n/" in request:
return
if request.endswith("/favicon.ico") or request.endswith("/robots.txt") or request.endswith("/screenshot.png"):
return
if request.endswith('.css') or request.endswith('.js') or request.endswith('.woff') or request.endswith('.ttf'):
if request.endswith(".css") or request.endswith(".js") or request.endswith(".woff") or request.endswith(".ttf"):
return
if str(status) in ('404', '304'):
if str(status) in ("404", "304"):
return
# clean up the log format to mostly match the same format as django.conf.settings.LOGGING rich formats
self.stream.write(
"%s HTTP %s %s %s\n"
@@ -58,13 +59,14 @@ class ModifiedAccessLogGenerator(access.AccessLogGenerator):
request,
status or "-",
"localhost" if host.startswith("127.") else host.split(":")[0],
)
),
)
access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry # type: ignore
access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry # type: ignore
# fix benedict objects to pretty-print/repr more nicely with rich
# https://stackoverflow.com/a/79048811/2156113
# https://rich.readthedocs.io/en/stable/pretty.html#rich-repr-protocol
benedict.benedict.__rich_repr__ = lambda self: (dict(self),) # type: ignore
benedict.benedict.__rich_repr__ = lambda self: (dict(self),) # type: ignore

View File

@@ -1,30 +1,30 @@
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
from django.core.paginator import Paginator
from django.utils.functional import cached_property
class AccelleratedPaginator(Paginator):
class AcceleratedPaginator(Paginator):
"""
Accellerated Pagniator ignores DISTINCT when counting total number of rows.
Accelerated paginator ignores DISTINCT when counting total number of rows.
Speeds up SELECT Count(*) on Admin views by >20x.
https://hakibenita.com/optimizing-the-django-admin-paginator
"""
@cached_property
def count(self):
has_filters = getattr(self.object_list, '_has_filters', None)
has_filters = getattr(self.object_list, "_has_filters", None)
if callable(has_filters) and has_filters():
# fallback to normal count method on filtered queryset
return super().count
model = getattr(self.object_list, 'model', None)
model = getattr(self.object_list, "model", None)
if model is None:
return super().count
# otherwise count total rows in a separate fast query
return model.objects.count()
# Alternative approach for PostgreSQL: fallback count takes > 200ms
# from django.db import connection, transaction, OperationalError
# with transaction.atomic(), connection.cursor() as cursor:

View File

@@ -3,26 +3,35 @@ import json
import re
import os
import stat
import asyncio
import posixpath
import mimetypes
import importlib
import queue
import threading
import time
import zipfile
from datetime import datetime
from collections.abc import Callable
from pathlib import Path
from urllib.parse import urlencode
from django.contrib.staticfiles import finders
from django.template import TemplateDoesNotExist, loader
from django.views import static
from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpResponseNotModified
from django.utils._os import safe_join
from django.utils.http import http_date
from django.utils.translation import gettext as _
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.logging_util import printable_filesize
_HASHES_CACHE: dict[Path, tuple[float, dict[str, str]]] = {}
def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None:
hashes_path = snapshot_dir / 'hashes' / 'hashes.json'
hashes_path = snapshot_dir / "hashes" / "hashes.json"
if not hashes_path.exists():
return None
try:
@@ -35,11 +44,11 @@ def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None:
return cached[1]
try:
data = json.loads(hashes_path.read_text(encoding='utf-8'))
data = json.loads(hashes_path.read_text(encoding="utf-8"))
except Exception:
return None
file_map = {str(entry.get('path')): entry.get('hash') for entry in data.get('files', []) if entry.get('path')}
file_map = {str(entry.get("path")): entry.get("hash") for entry in data.get("files", []) if entry.get("path")}
_HASHES_CACHE[hashes_path] = (mtime, file_map)
return file_map
@@ -52,7 +61,192 @@ def _hash_for_path(document_root: Path, rel_path: str) -> str | None:
def _cache_policy() -> str:
    """Return the Cache-Control visibility keyword for snapshot responses.

    Shared caches (proxies/CDNs) may store responses only when the instance
    is configured to serve snapshots publicly; otherwise mark them private.

    Fix: the block contained both the pre-image and post-image return lines
    from a merged diff (the second was unreachable dead code); only one
    return statement is kept.
    """
    return "public" if SERVER_CONFIG.PUBLIC_SNAPSHOTS else "private"
def _format_direntry_timestamp(stat_result: os.stat_result) -> str:
    """Format a directory entry's timestamp as 'YYYY-MM-DD HH:MM'.

    Prefers the platform's creation time (st_birthtime, available on
    macOS/BSD) and falls back to the modification time elsewhere.
    """
    birthtime = getattr(stat_result, "st_birthtime", None)
    when = datetime.fromtimestamp(birthtime if birthtime else stat_result.st_mtime)
    return when.strftime("%Y-%m-%d %H:%M")
def _safe_zip_stem(name: str) -> str:
    """Sanitize *name* into a filename-safe stem for the generated ZIP.

    Runs of characters outside [A-Za-z0-9._-] collapse to a single dash;
    leading/trailing separators are trimmed. Falls back to 'archivebox'
    when nothing usable remains.
    """
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "-", name)
    cleaned = cleaned.strip("._-")
    if cleaned:
        return cleaned
    return "archivebox"
class _StreamingQueueWriter:
"""Expose a write-only file-like object so zipfile can stream into a queue."""
def __init__(self, output_queue: queue.Queue[bytes | BaseException | object]) -> None:
self.output_queue = output_queue
self.position = 0
def write(self, data: bytes) -> int:
if data:
self.output_queue.put(data)
self.position += len(data)
return len(data)
def tell(self) -> int:
return self.position
def flush(self) -> None:
return None
def close(self) -> None:
return None
def writable(self) -> bool:
return True
def seekable(self) -> bool:
return False
def _iter_visible_files(root: Path):
    """Yield every non-hidden file under *root*, depth-first, in sorted order.

    Sorting dirnames in place steers os.walk's traversal order, and sorting
    filenames per directory makes the resulting ZIP byte-for-byte
    deterministic across runs. Dotfiles and dot-directories are skipped.
    """
    for dirpath, subdirs, filenames in os.walk(root):
        subdirs[:] = sorted(d for d in subdirs if not d.startswith("."))
        for fname in sorted(f for f in filenames if not f.startswith(".")):
            yield Path(dirpath) / fname
def _build_directory_zip_response(
    fullpath: Path,
    path: str,
    *,
    is_archive_replay: bool,
    use_async_stream: bool,
) -> StreamingHttpResponse:
    """Stream a ZIP of all visible files under *fullpath* without buffering it.

    A daemon thread compresses files into a bounded queue while the response
    iterator drains it. The first ~64KB is coalesced (for at most 50ms) into
    one chunk so browsers start the download promptly instead of receiving
    dozens of tiny ZIP header writes.

    Fixes over the previous version:
    - removed a stray duplicate blocking ``output_queue.get()`` after the
      try/except, which silently discarded the chunk fetched inside the try
      (corrupting the ZIP) and could block forever after the sentinel;
    - after the initial coalescing window elapses, the loop now switches to a
      plain blocking get instead of busy-spinning on ``get(timeout=0)``.
    """
    root_name = _safe_zip_stem(fullpath.name or Path(path).name or "archivebox")
    sentinel = object()  # end-of-stream marker pushed by the builder thread
    output_queue: queue.Queue[bytes | BaseException | object] = queue.Queue(maxsize=8)
    initial_chunk_target = 64 * 1024  # coalesce roughly this many bytes into the first chunk
    initial_chunk_wait = 0.05  # but never delay the first chunk longer than this (seconds)

    def build_zip() -> None:
        # zipfile wants a write-only file object. Feed those bytes straight into
        # a queue so the response can stream them out as soon as they are ready.
        writer = _StreamingQueueWriter(output_queue)
        try:
            with zipfile.ZipFile(writer, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=6) as zip_file:
                for entry in _iter_visible_files(fullpath):
                    rel_parts = entry.relative_to(fullpath).parts
                    arcname = Path(root_name, *rel_parts).as_posix()
                    zip_file.write(entry, arcname)
        except BaseException as err:
            # Surface builder failures to the consumer instead of dying silently.
            output_queue.put(err)
        finally:
            output_queue.put(sentinel)

    threading.Thread(target=build_zip, name=f"zip-stream-{root_name}", daemon=True).start()

    def iter_zip_chunks():
        # Emit a meaningful first chunk quickly so browsers show the download
        # immediately instead of waiting on dozens of tiny ZIP header writes.
        first_chunk = bytearray()
        initial_deadline = time.monotonic() + initial_chunk_wait
        coalescing = True  # only the very first chunk is coalesced
        while True:
            if coalescing:
                timeout = max(initial_deadline - time.monotonic(), 0)
                try:
                    chunk = output_queue.get(timeout=timeout)
                except queue.Empty:
                    # Initial window elapsed: flush what we have, stop coalescing.
                    if first_chunk:
                        yield bytes(first_chunk)
                        first_chunk.clear()
                    coalescing = False
                    continue
            else:
                chunk = output_queue.get()
            if chunk is sentinel:
                if first_chunk:
                    yield bytes(first_chunk)
                break
            if isinstance(chunk, BaseException):
                raise chunk
            if coalescing:
                first_chunk.extend(chunk)
                if len(first_chunk) >= initial_chunk_target or time.monotonic() >= initial_deadline:
                    yield bytes(first_chunk)
                    first_chunk.clear()
                    coalescing = False
                continue
            yield chunk

    async def stream_zip_async():
        # Django ASGI buffers sync StreamingHttpResponse iterators by consuming
        # them into a list. Drive the same sync iterator from a worker thread so
        # Daphne can send each chunk as it arrives instead of buffering the ZIP.
        iterator = iter(iter_zip_chunks())
        while True:
            chunk = await asyncio.to_thread(next, iterator, None)
            if chunk is None:
                break
            yield chunk

    response = StreamingHttpResponse(
        stream_zip_async() if use_async_stream else iter_zip_chunks(),
        content_type="application/zip",
    )
    response.headers["Content-Disposition"] = f'attachment; filename="{root_name}.zip"'
    response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
    response.headers["Last-Modified"] = http_date(fullpath.stat().st_mtime)
    response.headers["X-Accel-Buffering"] = "no"  # disable nginx response buffering so chunks flush immediately
    return _apply_archive_replay_headers(
        response,
        fullpath=fullpath,
        content_type="application/zip",
        is_archive_replay=is_archive_replay,
    )
def _render_directory_index(request, path: str, fullpath: Path) -> HttpResponse:
    """Render a templated directory listing for *fullpath*.

    Hidden entries (dotfiles) are skipped, directories sort before files
    (case-insensitively), and a ?download=zip link is included so the whole
    directory can be fetched as a single archive. Falls back to Django's
    built-in plain listing when no custom template is installed.
    """
    try:
        template = loader.select_template(
            [
                "static/directory_index.html",
                "static/directory_index",
            ],
        )
    except TemplateDoesNotExist:
        # No custom template available: serve Django's default directory page.
        return static.directory_index(path, fullpath)

    children = [child for child in fullpath.iterdir() if not child.name.startswith(".")]
    children.sort(key=lambda child: (not child.is_dir(), child.name.lower()))

    entries = []
    file_list = []
    for child in children:
        rel_url = str(child.relative_to(fullpath))
        if child.is_dir():
            rel_url += "/"
        file_list.append(rel_url)
        stat_result = child.stat()
        entries.append(
            {
                "name": rel_url,
                "url": rel_url,
                "is_dir": child.is_dir(),
                "size": "" if child.is_dir() else printable_filesize(stat_result.st_size),
                "timestamp": _format_direntry_timestamp(stat_result),
            },
        )

    zip_query = request.GET.copy()
    zip_query["download"] = "zip"
    zip_url = request.path
    if zip_query:
        zip_url = f"{zip_url}?{zip_query.urlencode()}"

    context = {
        "directory": f"{path}/",
        "file_list": file_list,
        "entries": entries,
        "zip_url": zip_url,
    }
    return HttpResponse(template.render(context))
# Ensure common web types are mapped consistently across platforms.
@@ -71,16 +265,16 @@ mimetypes.add_type("application/xml", ".xml")
mimetypes.add_type("image/svg+xml", ".svg")
try:
_markdown = getattr(importlib.import_module('markdown'), 'markdown')
_markdown = getattr(importlib.import_module("markdown"), "markdown")
except ImportError:
_markdown: Callable[..., str] | None = None
MARKDOWN_INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)')
MARKDOWN_INLINE_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
MARKDOWN_BOLD_RE = re.compile(r'\*\*([^*]+)\*\*')
MARKDOWN_ITALIC_RE = re.compile(r'(?<!\*)\*([^*]+)\*(?!\*)')
HTML_TAG_RE = re.compile(r'<[A-Za-z][^>]*>')
HTML_BODY_RE = re.compile(r'<body[^>]*>(.*)</body>', flags=re.IGNORECASE | re.DOTALL)
MARKDOWN_INLINE_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)")
MARKDOWN_INLINE_IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
MARKDOWN_BOLD_RE = re.compile(r"\*\*([^*]+)\*\*")
MARKDOWN_ITALIC_RE = re.compile(r"(?<!\*)\*([^*]+)\*(?!\*)")
HTML_TAG_RE = re.compile(r"<[A-Za-z][^>]*>")
HTML_BODY_RE = re.compile(r"<body[^>]*>(.*)</body>", flags=re.IGNORECASE | re.DOTALL)
RISKY_REPLAY_MIMETYPES = {
"text/html",
"application/xhtml+xml",
@@ -99,8 +293,8 @@ def _extract_markdown_candidate(text: str) -> str:
body_match = HTML_BODY_RE.search(candidate)
if body_match:
candidate = body_match.group(1)
candidate = re.sub(r'^\s*<p[^>]*>', '', candidate, flags=re.IGNORECASE)
candidate = re.sub(r'</p>\s*$', '', candidate, flags=re.IGNORECASE)
candidate = re.sub(r"^\s*<p[^>]*>", "", candidate, flags=re.IGNORECASE)
candidate = re.sub(r"</p>\s*$", "", candidate, flags=re.IGNORECASE)
return candidate.strip()
@@ -109,15 +303,115 @@ def _looks_like_markdown(text: str) -> bool:
if "<html" in lower and "<head" in lower and "</body>" in lower:
return False
md_markers = 0
md_markers += len(re.findall(r'^\s{0,3}#{1,6}\s+\S', text, flags=re.MULTILINE))
md_markers += len(re.findall(r'^\s*[-*+]\s+\S', text, flags=re.MULTILINE))
md_markers += len(re.findall(r'^\s*\d+\.\s+\S', text, flags=re.MULTILINE))
md_markers += text.count('[TOC]')
md_markers += len(re.findall(r"^\s{0,3}#{1,6}\s+\S", text, flags=re.MULTILINE))
md_markers += len(re.findall(r"^\s*[-*+]\s+\S", text, flags=re.MULTILINE))
md_markers += len(re.findall(r"^\s*\d+\.\s+\S", text, flags=re.MULTILINE))
md_markers += text.count("[TOC]")
md_markers += len(MARKDOWN_INLINE_LINK_RE.findall(text))
md_markers += text.count('\n---') + text.count('\n***')
md_markers += text.count("\n---") + text.count("\n***")
return md_markers >= 6
def _render_text_preview_document(text: str, title: str) -> str:
    """Wrap plain *text* in a self-contained dark-themed HTML preview page.

    Both *title* and *text* are HTML-escaped before interpolation, so
    arbitrary archived file contents are safe to embed. The body renders in a
    <pre> with pre-wrap/break-word so long lines soft-wrap instead of forcing
    horizontal scrolling; the title bar stays pinned via position: sticky.
    """
    escaped_title = html.escape(title)
    escaped_text = html.escape(text)
    return f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{escaped_title}</title>
<style>
    :root {{
        color-scheme: dark;
    }}
    html, body {{
        margin: 0;
        padding: 0;
        background: #111;
        color: #f3f3f3;
        font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;
    }}
    .archivebox-text-preview-header {{
        position: sticky;
        top: 0;
        z-index: 1;
        padding: 10px 14px;
        font-size: 12px;
        line-height: 1.4;
        color: #bbb;
        background: rgba(17, 17, 17, 0.96);
        border-bottom: 1px solid rgba(255, 255, 255, 0.08);
        backdrop-filter: blur(8px);
    }}
    .archivebox-text-preview {{
        margin: 0;
        padding: 14px;
        white-space: pre-wrap;
        word-break: break-word;
        tab-size: 2;
        line-height: 1.45;
        font-size: 13px;
    }}
</style>
</head>
<body>
<div class="archivebox-text-preview-header">{escaped_title}</div>
<pre class="archivebox-text-preview">{escaped_text}</pre>
</body>
</html>"""
def _render_image_preview_document(image_url: str, title: str) -> str:
    """Wrap a single image in a self-contained HTML preview page.

    *title* is HTML-escaped for text contexts and *image_url* is escaped with
    quote=True since it lands inside an attribute value. The image is centered
    and constrained to max-width: 100% so oversized captures scale down to fit
    the viewport while small images keep their natural size.
    """
    escaped_title = html.escape(title)
    escaped_url = html.escape(image_url, quote=True)
    return f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{escaped_title}</title>
<style>
    :root {{
        color-scheme: dark;
    }}
    html, body {{
        margin: 0;
        padding: 0;
        width: 100%;
        min-height: 100%;
        background: #fff;
    }}
    body {{
        overflow: auto;
    }}
    .archivebox-image-preview {{
        width: 100%;
        min-width: 100%;
        min-height: 100vh;
        display: flex;
        flex-direction: column;
        align-items: center;
        justify-content: flex-start;
        box-sizing: border-box;
    }}
    .archivebox-image-preview img {{
        display: block;
        width: auto;
        max-width: 100%;
        height: auto;
        margin: 0 auto;
    }}
</style>
</head>
<body>
<div class="archivebox-image-preview">
    <img src="{escaped_url}" alt="{escaped_title}">
</div>
</body>
</html>"""
def _render_markdown_fallback(text: str) -> str:
if _markdown is not None and not HTML_TAG_RE.search(text):
try:
@@ -133,11 +427,11 @@ def _render_markdown_fallback(text: str) -> str:
headings = []
def slugify(value: str) -> str:
slug = re.sub(r'[^A-Za-z0-9]+', '-', value).strip('-')
slug = re.sub(r"[^A-Za-z0-9]+", "-", value).strip("-")
return slug or "section"
for raw_line in lines:
heading_match = re.match(r'^\s{0,3}(#{1,6})\s+(.*)$', raw_line)
heading_match = re.match(r"^\s{0,3}(#{1,6})\s+(.*)$", raw_line)
if heading_match:
level = len(heading_match.group(1))
content = heading_match.group(2).strip()
@@ -152,8 +446,8 @@ def _render_markdown_fallback(text: str) -> str:
def render_inline(markup: str) -> str:
content = MARKDOWN_INLINE_IMAGE_RE.sub(r'<img alt="\1" src="\2">', markup)
content = MARKDOWN_INLINE_LINK_RE.sub(r'<a href="\2">\1</a>', content)
content = MARKDOWN_BOLD_RE.sub(r'<strong>\1</strong>', content)
content = MARKDOWN_ITALIC_RE.sub(r'<em>\1</em>', content)
content = MARKDOWN_BOLD_RE.sub(r"<strong>\1</strong>", content)
content = MARKDOWN_ITALIC_RE.sub(r"<em>\1</em>", content)
return content
def close_lists():
@@ -194,7 +488,7 @@ def _render_markdown_fallback(text: str) -> str:
html_lines.append("<br/>")
continue
heading_match = re.match(r'^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$', line)
heading_match = re.match(r"^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$", line)
if heading_match:
close_lists()
if in_blockquote:
@@ -205,7 +499,7 @@ def _render_markdown_fallback(text: str) -> str:
content = heading_match.group(3).strip()
if leading_tags:
html_lines.append(leading_tags)
html_lines.append(f"<h{level} id=\"{slugify(content)}\">{render_inline(content)}</h{level}>")
html_lines.append(f'<h{level} id="{slugify(content)}">{render_inline(content)}</h{level}>')
continue
if stripped in ("---", "***"):
@@ -226,7 +520,7 @@ def _render_markdown_fallback(text: str) -> str:
html_lines.append("</blockquote>")
in_blockquote = False
ul_match = re.match(r'^\s*[-*+]\s+(.*)$', line)
ul_match = re.match(r"^\s*[-*+]\s+(.*)$", line)
if ul_match:
if in_ol:
html_lines.append("</ol>")
@@ -237,7 +531,7 @@ def _render_markdown_fallback(text: str) -> str:
html_lines.append(f"<li>{render_inline(ul_match.group(1))}</li>")
continue
ol_match = re.match(r'^\s*\d+\.\s+(.*)$', line)
ol_match = re.match(r"^\s*\d+\.\s+(.*)$", line)
if ol_match:
if in_ul:
html_lines.append("</ul>")
@@ -255,10 +549,10 @@ def _render_markdown_fallback(text: str) -> str:
toc_items = []
for level, title, slug in headings:
toc_items.append(
f'<li class="toc-level-{level}"><a href="#{slug}">{title}</a></li>'
f'<li class="toc-level-{level}"><a href="#{slug}">{title}</a></li>',
)
html_lines.append(
'<nav class="toc"><ul>' + "".join(toc_items) + '</ul></nav>'
'<nav class="toc"><ul>' + "".join(toc_items) + "</ul></nav>",
)
continue
@@ -276,8 +570,8 @@ def _render_markdown_fallback(text: str) -> str:
def _render_markdown_document(markdown_text: str) -> str:
body = _render_markdown_fallback(markdown_text)
wrapped = (
"<!doctype html><html><head><meta charset=\"utf-8\">"
"<meta name=\"viewport\" content=\"width=device-width,initial-scale=1\">"
'<!doctype html><html><head><meta charset="utf-8">'
'<meta name="viewport" content="width=device-width,initial-scale=1">'
"<style>body{max-width:900px;margin:24px auto;padding:0 16px;"
"font-family:system-ui,-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif;"
"line-height:1.55;} img{max-width:100%;} pre{background:#f6f6f6;padding:12px;overflow:auto;}"
@@ -338,7 +632,7 @@ def _apply_archive_replay_headers(response: HttpResponse, *, fullpath: Path, con
return response
def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False, is_archive_replay: bool=False):
def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False, is_archive_replay: bool = False):
"""
Overrides Django's built-in django.views.static.serve function to support byte range requests.
This allows you to do things like seek into the middle of a huge mp4 or WACZ without downloading the whole file.
@@ -348,13 +642,20 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
path = posixpath.normpath(path).lstrip("/")
fullpath = Path(safe_join(document_root, path))
if os.access(fullpath, os.R_OK) and fullpath.is_dir():
if request.GET.get("download") == "zip" and show_indexes:
return _build_directory_zip_response(
fullpath,
path,
is_archive_replay=is_archive_replay,
use_async_stream=hasattr(request, "scope"),
)
if show_indexes:
response = static.directory_index(path, fullpath)
response = _render_directory_index(request, path, fullpath)
return _apply_archive_replay_headers(response, fullpath=fullpath, content_type="text/html", is_archive_replay=is_archive_replay)
raise Http404(_("Directory indexes are not allowed here."))
if not os.access(fullpath, os.R_OK):
raise Http404(_("%(path)s” does not exist") % {"path": fullpath})
statobj = fullpath.stat()
document_root = Path(document_root) if document_root else None
rel_path = path
@@ -374,27 +675,91 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
not_modified.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
not_modified.headers["Last-Modified"] = http_date(statobj.st_mtime)
return _apply_archive_replay_headers(not_modified, fullpath=fullpath, content_type="", is_archive_replay=is_archive_replay)
content_type, encoding = mimetypes.guess_type(str(fullpath))
content_type = content_type or "application/octet-stream"
# Add charset for text-like types (best guess), but don't override the type.
is_text_like = (
content_type.startswith("text/")
or content_type in {
"application/json",
"application/javascript",
"application/xml",
"application/x-ndjson",
"image/svg+xml",
}
)
is_text_like = content_type.startswith("text/") or content_type in {
"application/json",
"application/javascript",
"application/xml",
"application/x-ndjson",
"image/svg+xml",
}
if is_text_like and "charset=" not in content_type:
content_type = f"{content_type}; charset=utf-8"
preview_as_text_html = (
bool(request.GET.get("preview"))
and is_text_like
and not content_type.startswith("text/html")
and not content_type.startswith("image/svg+xml")
)
preview_as_image_html = (
bool(request.GET.get("preview")) and content_type.startswith("image/") and not content_type.startswith("image/svg+xml")
)
# Respect the If-Modified-Since header for non-markdown responses.
if not (content_type.startswith("text/plain") or content_type.startswith("text/html")):
if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
return _apply_archive_replay_headers(HttpResponseNotModified(), fullpath=fullpath, content_type=content_type, is_archive_replay=is_archive_replay)
return _apply_archive_replay_headers(
HttpResponseNotModified(),
fullpath=fullpath,
content_type=content_type,
is_archive_replay=is_archive_replay,
)
# Wrap text-like outputs in HTML when explicitly requested for iframe previewing.
if preview_as_text_html:
try:
max_preview_size = 10 * 1024 * 1024
if statobj.st_size <= max_preview_size:
decoded = fullpath.read_text(encoding="utf-8", errors="replace")
wrapped = _render_text_preview_document(decoded, fullpath.name)
response = HttpResponse(wrapped, content_type="text/html; charset=utf-8")
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
if etag:
response.headers["ETag"] = etag
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
else:
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return _apply_archive_replay_headers(
response,
fullpath=fullpath,
content_type="text/html; charset=utf-8",
is_archive_replay=is_archive_replay,
)
except Exception:
pass
if preview_as_image_html:
try:
preview_query = request.GET.copy()
preview_query.pop("preview", None)
raw_image_url = request.path
if preview_query:
raw_image_url = f"{raw_image_url}?{urlencode(list(preview_query.lists()), doseq=True)}"
wrapped = _render_image_preview_document(raw_image_url, fullpath.name)
response = HttpResponse(wrapped, content_type="text/html; charset=utf-8")
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
if etag:
response.headers["ETag"] = etag
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
else:
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return _apply_archive_replay_headers(
response,
fullpath=fullpath,
content_type="text/html; charset=utf-8",
is_archive_replay=is_archive_replay,
)
except Exception:
pass
# Heuristic fix: some archived HTML outputs (e.g. mercury content.html)
# are stored with HTML-escaped markup or markdown sources. If so, render sensibly.
@@ -421,7 +786,12 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return _apply_archive_replay_headers(response, fullpath=fullpath, content_type="text/html; charset=utf-8", is_archive_replay=is_archive_replay)
return _apply_archive_replay_headers(
response,
fullpath=fullpath,
content_type="text/html; charset=utf-8",
is_archive_replay=is_archive_replay,
)
if escaped_count and escaped_count > tag_count * 2:
response = HttpResponse(decoded, content_type=content_type)
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
@@ -433,11 +803,16 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return _apply_archive_replay_headers(response, fullpath=fullpath, content_type=content_type, is_archive_replay=is_archive_replay)
return _apply_archive_replay_headers(
response,
fullpath=fullpath,
content_type=content_type,
is_archive_replay=is_archive_replay,
)
except Exception:
pass
# setup resposne object
# setup response object
ranged_file = RangedFileReader(open(fullpath, "rb"))
response = StreamingHttpResponse(ranged_file, content_type=content_type)
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
@@ -451,7 +826,7 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
if content_type.startswith("image/"):
response.headers["Cache-Control"] = "public, max-age=604800, immutable"
# handle byte-range requests by serving chunk of file
# handle byte-range requests by serving chunk of file
if stat.S_ISREG(statobj.st_mode):
size = statobj.st_size
response["Content-Length"] = size
@@ -460,7 +835,7 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
# Respect the Range header.
if "HTTP_RANGE" in request.META:
try:
ranges = parse_range_header(request.META['HTTP_RANGE'], size)
ranges = parse_range_header(request.META["HTTP_RANGE"], size)
except ValueError:
ranges = None
# only handle syntactically valid headers, that are simple (no
@@ -511,7 +886,7 @@ def parse_range_header(header, resource_size):
Parses a range header into a list of two-tuples (start, stop) where `start`
is the starting byte of the range (inclusive) and `stop` is the ending byte
position of the range (exclusive).
Returns None if the value of the header is not syntatically valid.
Returns None if the value of the header is not syntactically valid.
https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d
"""
if not header or "=" not in header:

View File

@@ -1,57 +1,63 @@
__package__ = 'archivebox.core'
__package__ = "archivebox.core"
from rich.console import Console
# helpful imports that make the shell easier to work with out-of-the-box:
import re # noqa
import os # noqa
import sys # noqa
import json # noqa
import psutil # noqa
import django # noqa
import pydantic # noqa
import requests # noqa
import subprocess # noqa
import archivebox # noqa
from benedict import benedict # noqa
from django.utils import timezone # noqa
from datetime import datetime, timedelta # noqa
from django.conf import settings # noqa
import re # noqa
import os # noqa
import sys # noqa
import json # noqa
import psutil # noqa
import django # noqa
import pydantic # noqa
import requests # noqa
import subprocess # noqa
import archivebox
from benedict import benedict # noqa
from django.utils import timezone # noqa
from datetime import datetime, timedelta # noqa
from django.conf import settings # noqa
from archivebox import CONSTANTS # noqa
from archivebox.cli import * # noqa
from archivebox import CONSTANTS # noqa
from archivebox.cli import * # noqa
from archivebox.config.configset import get_config
CONFIG = get_config()
if __name__ == '__main__':
if __name__ == "__main__":
# load the rich extension for ipython for pretty printing
# https://rich.readthedocs.io/en/stable/introduction.html#ipython-extension
get_ipython().run_line_magic('load_ext', 'rich') # type: ignore # noqa
get_ipython().run_line_magic("load_ext", "rich") # type: ignore # noqa
# prnt = print with cropping using ... ellipsis for helptext that doens't matter that much
# prnt = print with cropping using ... ellipsis for helptext that doesn't matter that much
console = Console()
prnt = lambda *args, **kwargs: console.print(*args, overflow='ellipsis', soft_wrap=True, **kwargs)
prnt = lambda *args, **kwargs: console.print(*args, overflow="ellipsis", soft_wrap=True, **kwargs)
# print the welcome message
prnt('[green]import re, os, sys, psutil, subprocess, requests, json, pydantic, benedict, django[/]')
prnt('[yellow4]# ArchiveBox Imports[/]')
prnt('[yellow4]import archivebox[/]')
prnt('[yellow4]from archivebox.cli import *[/]')
prnt("[green]import re, os, sys, psutil, subprocess, requests, json, pydantic, benedict, django[/]")
prnt("[yellow4]# ArchiveBox Imports[/]")
prnt("[yellow4]import archivebox[/]")
prnt("[yellow4]from archivebox.cli import *[/]")
prnt()
if console.width >= 80:
from archivebox.misc.logging import rainbow
prnt(rainbow(archivebox.ASCII_LOGO))
prnt('[i] :heavy_dollar_sign: Welcome to the ArchiveBox Shell!')
prnt(' [deep_sky_blue4]Docs:[/deep_sky_blue4] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage[/link]')
prnt(' [link=https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html]https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html[/link]')
prnt("[i] :heavy_dollar_sign: Welcome to the ArchiveBox Shell!")
prnt(
" [deep_sky_blue4]Docs:[/deep_sky_blue4] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage[/link]",
)
prnt(
" [link=https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html]https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html[/link]",
)
prnt()
prnt(' :grey_question: [violet]Hint[/] [i]Here are some examples to get started:[/]')
prnt(' add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ? after anything to get help[/]')
prnt(" :grey_question: [violet]Hint[/] [i]Here are some examples to get started:[/]")
prnt(
" add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ? after anything to get help[/]",
)
prnt(' add("https://example.com/some/new/url") [grey53]# call CLI methods from the shell[/]')
prnt(' snap = Snapshot.objects.filter(url__contains="https://example.com").last() [grey53]# query for individual snapshots[/]')
prnt(' snap.archiveresult_set.all() [grey53]# see extractor plugin results[/]')
prnt(" snap.archiveresult_set.all() [grey53]# see extractor plugin results[/]")
prnt(' bool(re.compile(CONFIG.URL_DENYLIST).search("https://example.com/abc.exe")) [grey53]# test out a config change[/]')

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
import os
@@ -8,7 +8,6 @@ import sys
from json import dump
from pathlib import Path
from typing import Optional, Union, Tuple
from subprocess import PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired
from atomicwrites import atomic_write as lib_atomic_write
@@ -16,29 +15,30 @@ from atomicwrites import atomic_write as lib_atomic_write
from archivebox.config.common import STORAGE_CONFIG
from archivebox.misc.util import enforce_types, ExtendedEncoder
IS_WINDOWS = os.name == 'nt'
IS_WINDOWS = os.name == "nt"
def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):
"""Patched of subprocess.run to kill forked child subprocesses and fix blocking io making timeout=innefective
Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py
Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py
"""
cmd = [str(arg) for arg in cmd]
if input is not None:
if kwargs.get('stdin') is not None:
raise ValueError('stdin and input arguments may not both be used.')
kwargs['stdin'] = PIPE
if kwargs.get("stdin") is not None:
raise ValueError("stdin and input arguments may not both be used.")
kwargs["stdin"] = PIPE
if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
if ("stdout" in kwargs) or ("stderr" in kwargs):
raise ValueError("stdout and stderr arguments may not be used with capture_output.")
kwargs["stdout"] = PIPE
kwargs["stderr"] = PIPE
pgid = None
try:
if isinstance(cmd, (list, tuple)) and cmd[0].endswith('.py'):
if isinstance(cmd, (list, tuple)) and cmd[0].endswith(".py"):
PYTHON_BINARY = sys.executable
cmd = (PYTHON_BINARY, *cmd)
@@ -69,8 +69,12 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False,
retcode = process.poll()
if check and retcode:
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr)
raise CalledProcessError(
retcode,
process.args,
output=stdout,
stderr=stderr,
)
finally:
# force kill any straggler subprocesses that were forked from the main proc
try:
@@ -83,11 +87,11 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False,
@enforce_types
def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
def atomic_write(path: Path | str, contents: dict | str | bytes, overwrite: bool = True) -> None:
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
mode = 'wb+' if isinstance(contents, bytes) else 'w'
encoding = None if isinstance(contents, bytes) else 'utf-8' # enforce utf-8 on all text writes
mode = "wb+" if isinstance(contents, bytes) else "w"
encoding = None if isinstance(contents, bytes) else "utf-8" # enforce utf-8 on all text writes
# print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
try:
@@ -99,8 +103,12 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
except OSError as e:
if STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES:
print(f"[X] OSError: Failed to write {path} with fcntl.F_FULLFSYNC. ({e})")
print(" You can store the archive/ subfolder on a hard drive or network share that doesn't support support syncronous writes,")
print(" but the main folder containing the index.sqlite3 and ArchiveBox.conf files must be on a filesystem that supports FSYNC.")
print(
" You can store the archive/ subfolder on a hard drive or network share that doesn't support support synchronous writes,",
)
print(
" but the main folder containing the index.sqlite3 and ArchiveBox.conf files must be on a filesystem that supports FSYNC.",
)
raise SystemExit(1)
# retry the write without forcing FSYNC (aka atomic mode)
@@ -113,19 +121,20 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
# set file permissions
os.chmod(path, int(STORAGE_CONFIG.OUTPUT_PERMISSIONS, base=8))
@enforce_types
def chmod_file(path: str, cwd: str='') -> None:
def chmod_file(path: str, cwd: str = "") -> None:
"""chmod -R <permissions> <cwd>/<path>"""
root = Path(cwd or os.getcwd()) / path
if not os.access(root, os.R_OK):
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
raise Exception(f"Failed to chmod: {path} does not exist (did the previous step fail?)")
if not root.is_dir():
# path is just a plain file
os.chmod(root, int(STORAGE_CONFIG.OUTPUT_PERMISSIONS, base=8))
else:
for subpath in Path(path).glob('**/*'):
for subpath in Path(path).glob("**/*"):
if subpath.is_dir():
# directories need execute permissions to be able to list contents
os.chmod(subpath, int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8))
@@ -134,24 +143,24 @@ def chmod_file(path: str, cwd: str='') -> None:
@enforce_types
def copy_and_overwrite(from_path: Union[str, Path], to_path: Union[str, Path]):
def copy_and_overwrite(from_path: str | Path, to_path: str | Path):
"""copy a given file or directory to a given path, overwriting the destination"""
assert os.access(from_path, os.R_OK)
if Path(from_path).is_dir():
shutil.rmtree(to_path, ignore_errors=True)
shutil.copytree(from_path, to_path)
else:
with open(from_path, 'rb') as src:
with open(from_path, "rb") as src:
contents = src.read()
atomic_write(to_path, contents)
@enforce_types
def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]:
"""get the total disk size of a given directory, optionally summing up
recursively and limiting to a given filter list
def get_dir_size(path: str | Path, recursive: bool = True, pattern: str | None = None) -> tuple[int, int, int]:
"""get the total disk size of a given directory, optionally summing up
recursively and limiting to a given filter list
"""
num_bytes, num_dirs, num_files = 0, 0, 0
try:
@@ -174,20 +183,21 @@ def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional
pass
return num_bytes, num_dirs, num_files
class suppress_output(object):
class suppress_output:
"""
A context manager for doing a "deep suppression" of stdout and stderr in
Python, i.e. will suppress all print, even if the print originates in a
A context manager for doing a "deep suppression" of stdout and stderr in
Python, i.e. will suppress all print, even if the print originates in a
compiled C/Fortran sub-function.
This will not suppress raised exceptions, since exceptions are printed
to stderr just before a script exits, and after the context manager has
exited (at least, I think that is why it lets exceptions through).
exited (at least, I think that is why it lets exceptions through).
with suppress_stdout_stderr():
rogue_function()
"""
def __init__(self, stdout=True, stderr=True):
# Open a pair of null files
# Save the actual stdout (1) and stderr (2) file descriptors.

View File

@@ -1,4 +1,5 @@
from typing import Any, List, Callable, cast
from typing import Any, cast
from collections.abc import Callable
import json
import ast
@@ -12,15 +13,16 @@ from pathlib import Path, PosixPath
from pydantic.json_schema import GenerateJsonSchema
from pydantic_core import to_jsonable_python
JSONValue = str | bool | int | None | List['JSONValue']
JSONValue = str | bool | int | None | list["JSONValue"]
TOML_HEADER = "# Converted from INI to TOML format: https://toml.io/en/\n\n"
def load_ini_value(val: str) -> JSONValue:
"""Convert lax INI values into strict TOML-compliant (JSON) values"""
if val.lower() in ('true', 'yes', '1'):
if val.lower() in ("true", "yes", "1"):
return True
if val.lower() in ('false', 'no', '0'):
if val.lower() in ("false", "no", "0"):
return False
if val.isdigit():
return int(val)
@@ -34,7 +36,7 @@ def load_ini_value(val: str) -> JSONValue:
return json.loads(val)
except Exception:
pass
return val
@@ -42,7 +44,7 @@ def convert(ini_str: str) -> str:
"""Convert a string of INI config into its TOML equivalent (warning: strips comments)"""
config = configparser.ConfigParser()
setattr(config, 'optionxform', str) # capitalize key names
setattr(config, "optionxform", str) # capitalize key names
config.read_string(ini_str)
# Initialize an empty dictionary to store the TOML representation
@@ -70,22 +72,22 @@ def convert(ini_str: str) -> str:
return toml_str.strip()
class JSONSchemaWithLambdas(GenerateJsonSchema):
"""
Encode lambda functions in default values properly.
Usage:
>>> json.dumps(value, encoder=JSONSchemaWithLambdas())
"""
def encode_default(self, dft: Any) -> Any:
config = self._config
if isinstance(dft, Callable):
return '{{lambda ' + inspect.getsource(dft).split('=lambda ')[-1].strip()[:-1] + '}}'
return "{{lambda " + inspect.getsource(dft).split("=lambda ")[-1].strip()[:-1] + "}}"
return to_jsonable_python(
dft,
timedelta_mode=config.ser_json_timedelta,
bytes_mode=config.ser_json_bytes,
serialize_unknown=True
serialize_unknown=True,
)
# for computed_field properties render them like this instead:
@@ -94,19 +96,21 @@ class JSONSchemaWithLambdas(GenerateJsonSchema):
def better_toml_dump_str(val: Any) -> str:
try:
dump_str = cast(Callable[[Any], str], getattr(toml.encoder, '_dump_str'))
dump_str = cast(Callable[[Any], str], getattr(toml.encoder, "_dump_str"))
return dump_str(val)
except Exception:
# if we hit any of toml's numerous encoding bugs,
# fall back to using json representation of string
return json.dumps(str(val))
class CustomTOMLEncoder(toml.encoder.TomlEncoder):
"""
Custom TomlEncoder to work around https://github.com/uiri/toml's many encoding bugs.
More info: https://github.com/fabiocaccamo/python-benedict/issues/439
>>> toml.dumps(value, encoder=CustomTOMLEncoder())
"""
def __init__(self, **kwargs):
super().__init__(**kwargs)
dump_funcs = cast(dict[Any, Callable[[Any], str]], self.dump_funcs)

View File

@@ -1,12 +1,14 @@
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
import re
import requests
import json as pyjson
import http.cookiejar
from decimal import Decimal, InvalidOperation
from dateparser import parse as dateparser
from typing import List, Optional, Any, Callable
from typing import Any
from collections.abc import Callable
from pathlib import Path
from inspect import signature
from functools import wraps
@@ -18,8 +20,10 @@ from requests.exceptions import RequestException, ReadTimeout
from base32_crockford import encode as base32_encode
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
try:
import chardet # type:ignore
import chardet # type:ignore
detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"]
except ImportError:
detect_encoding = lambda rawdata: "utf-8"
@@ -35,57 +39,135 @@ from .logging import COLOR_DICT
# All of these are (str) -> str
# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing
scheme = lambda url: urlparse(url).scheme.lower()
without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
without_scheme = lambda url: urlparse(url)._replace(scheme="").geturl().strip("//")
without_query = lambda url: urlparse(url)._replace(query="").geturl().strip("//")
without_fragment = lambda url: urlparse(url)._replace(fragment="").geturl().strip("//")
without_path = lambda url: urlparse(url)._replace(path="", fragment="", query="").geturl().strip("//")
path = lambda url: urlparse(url).path
basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]
basename = lambda url: urlparse(url).path.rsplit("/", 1)[-1]
domain = lambda url: urlparse(url).netloc
query = lambda url: urlparse(url).query
fragment = lambda url: urlparse(url).fragment
extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
extension = lambda url: basename(url).rsplit(".", 1)[-1].lower() if "." in basename(url) else ""
base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
without_www = lambda url: url.replace('://www.', '://', 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]
without_www = lambda url: url.replace("://www.", "://", 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == "/" else url.replace("/?", "?")
hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode("utf-8")).hexdigest(), 16))[:20]
urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
urlencode = lambda s: s and quote(s, encoding="utf-8", errors="replace")
urldecode = lambda s: s and unquote(s)
htmlencode = lambda s: s and escape(s, quote=True)
htmldecode = lambda s: s and unescape(s)
def short_ts(ts: Any) -> str | None:
parsed = parse_date(ts)
return None if parsed is None else str(parsed.timestamp()).split('.')[0]
return None if parsed is None else str(parsed.timestamp()).split(".")[0]
def ts_to_date_str(ts: Any) -> str | None:
parsed = parse_date(ts)
return None if parsed is None else parsed.strftime('%Y-%m-%d %H:%M')
return None if parsed is None else parsed.strftime("%Y-%m-%d %H:%M")
def ts_to_iso(ts: Any) -> str | None:
parsed = parse_date(ts)
return None if parsed is None else parsed.isoformat()
COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')
COLOR_REGEX = re.compile(r"\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m")
# https://mathiasbynens.be/demo/url-regex
URL_REGEX = re.compile(
r'(?=('
r'http[s]?://' # start matching from allowed schemes
r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters
r'|[-_$@.&+!*\(\),]' # or allowed symbols (keep hyphen first to match literal hyphen)
r'|[^\u0000-\u007F])+' # or allowed unicode bytes
r'[^\]\[<>"\'\s]+' # stop parsing at these symbols
r'))',
r"(?=("
r"http[s]?://" # start matching from allowed schemes
r"(?:[a-zA-Z]|[0-9]" # followed by allowed alphanum characters
r"|[-_$@.&+!*\(\),]" # or allowed symbols (keep hyphen first to match literal hyphen)
r"|[^\u0000-\u007F])+" # or allowed unicode bytes
r'[^\]\[<>"\'\s]+' # stop parsing at these symbols
r"))",
re.IGNORECASE | re.UNICODE,
)
def parens_are_matched(string: str, open_char='(', close_char=')'):
QUOTE_DELIMITERS = (
'"',
"'",
"`",
"",
"",
"",
"",
)
QUOTE_ENTITY_DELIMITERS = (
"&quot;",
"&#34;",
"&#x22;",
"&apos;",
"&#39;",
"&#x27;",
)
URL_ENTITY_REPLACEMENTS = (
("&amp;", "&"),
("&#38;", "&"),
("&#x26;", "&"),
)
FILESIZE_UNITS: dict[str, int] = {
"": 1,
"b": 1,
"byte": 1,
"bytes": 1,
"k": 1024,
"kb": 1024,
"kib": 1024,
"m": 1024**2,
"mb": 1024**2,
"mib": 1024**2,
"g": 1024**3,
"gb": 1024**3,
"gib": 1024**3,
"t": 1024**4,
"tb": 1024**4,
"tib": 1024**4,
}
def sanitize_extracted_url(url: str) -> str:
"""Trim quote garbage and dangling prose punctuation from an extracted URL candidate."""
cleaned = (url or "").strip()
if not cleaned:
return cleaned
lower_cleaned = cleaned.lower()
cut_index = len(cleaned)
for delimiter in QUOTE_DELIMITERS:
found_index = cleaned.find(delimiter)
if found_index != -1:
cut_index = min(cut_index, found_index)
for delimiter in QUOTE_ENTITY_DELIMITERS:
found_index = lower_cleaned.find(delimiter)
if found_index != -1:
cut_index = min(cut_index, found_index)
cleaned = cleaned[:cut_index].strip()
lower_cleaned = cleaned.lower()
for entity, replacement in URL_ENTITY_REPLACEMENTS:
while entity in lower_cleaned:
entity_index = lower_cleaned.find(entity)
cleaned = cleaned[:entity_index] + replacement + cleaned[entity_index + len(entity) :]
lower_cleaned = cleaned.lower()
cleaned = cleaned.rstrip(".,;:!?\\'\"")
cleaned = cleaned.rstrip('"')
return cleaned
def parens_are_matched(string: str, open_char="(", close_char=")"):
"""check that all parentheses in a string are balanced and nested properly"""
count = 0
for c in string:
@@ -97,6 +179,7 @@ def parens_are_matched(string: str, open_char='(', close_char=')'):
return False
return count == 0
def fix_url_from_markdown(url_str: str) -> str:
"""
cleanup a regex-parsed url that may contain dangling trailing parens from markdown link syntax
@@ -113,46 +196,91 @@ def fix_url_from_markdown(url_str: str) -> str:
# cut off one trailing character at a time
# until parens are balanced e.g. /a(b)c).x(y)z -> /a(b)c
while not parens_are_matched(trimmed_url):
while trimmed_url and not parens_are_matched(trimmed_url):
trimmed_url = trimmed_url[:-1]
# make sure trimmed url is still valid
if re.findall(URL_REGEX, trimmed_url):
if any(match == trimmed_url for match in re.findall(URL_REGEX, trimmed_url)):
return trimmed_url
return url_str
def split_comma_separated_urls(url: str):
offset = 0
while True:
http_index = url.find('http://', 1)
https_index = url.find('https://', 1)
http_index = url.find("http://", 1)
https_index = url.find("https://", 1)
next_indices = [idx for idx in (http_index, https_index) if idx != -1]
if not next_indices:
yield offset, url
return
next_index = min(next_indices)
if url[next_index - 1] != ',':
if url[next_index - 1] != ",":
yield offset, url
return
yield offset, url[:next_index - 1]
yield offset, url[: next_index - 1]
offset += next_index
url = url[next_index:]
def find_all_urls(urls_str: str):
skipped_starts = set()
for match in re.finditer(URL_REGEX, urls_str):
if match.start() in skipped_starts:
continue
for offset, url in split_comma_separated_urls(fix_url_from_markdown(match.group(1))):
cleaned_match = sanitize_extracted_url(fix_url_from_markdown(match.group(1)))
for offset, url in split_comma_separated_urls(cleaned_match):
if offset:
skipped_starts.add(match.start() + offset)
yield url
def parse_filesize_to_bytes(value: str | int | float | None) -> int:
"""
Parse a byte count from an integer or human-readable string like 45mb or 2 GB.
"""
if value is None:
return 0
if isinstance(value, bool):
raise ValueError("Size value must be an integer or size string.")
if isinstance(value, int):
return value
if isinstance(value, float):
if not value.is_integer():
raise ValueError("Size value must resolve to a whole number of bytes.")
return int(value)
raw_value = str(value).strip()
if not raw_value:
return 0
if raw_value.isdigit():
return int(raw_value)
match = re.fullmatch(r"(?i)(\d+(?:\.\d+)?)\s*([a-z]+)", raw_value)
if not match:
raise ValueError(f"Invalid size value: {value}")
amount_str, unit_str = match.groups()
multiplier = FILESIZE_UNITS.get(unit_str.lower())
if multiplier is None:
raise ValueError(f"Unknown size unit: {unit_str}")
try:
amount = Decimal(amount_str)
except InvalidOperation as err:
raise ValueError(f"Invalid size value: {value}") from err
return int(amount * multiplier)
def is_static_file(url: str):
# TODO: the proper way is with MIME type detection + ext, not only extension
return extension(url).lower() in CONSTANTS.STATICFILE_EXTENSIONS
@@ -178,14 +306,14 @@ def enforce_types(func):
if annotation is not None and annotation.__class__ is type:
if not isinstance(arg_val, annotation):
raise TypeError(
'{}(..., {}: {}) got unexpected {} argument {}={}'.format(
"{}(..., {}: {}) got unexpected {} argument {}={}".format(
func.__name__,
arg_key,
annotation.__name__,
type(arg_val).__name__,
arg_key,
str(arg_val)[:64],
)
),
)
# check args
@@ -201,12 +329,14 @@ def enforce_types(func):
return typechecked_function
def docstring(text: Optional[str]):
def docstring(text: str | None):
"""attach the given docstring to the decorated function"""
def decorator(func):
if text:
func.__doc__ = text
return func
return decorator
@@ -224,7 +354,7 @@ def str_between(string: str, start: str, end: str | None = None) -> str:
@enforce_types
def parse_date(date: Any) -> datetime | None:
"""Parse unix timestamps, iso format, and human-readable strings"""
if date is None:
return None
@@ -233,16 +363,16 @@ def parse_date(date: Any) -> datetime | None:
return date.replace(tzinfo=timezone.utc)
offset = date.utcoffset()
assert offset == datetime.now(timezone.utc).utcoffset(), 'Refusing to load a non-UTC date!'
assert offset == datetime.now(timezone.utc).utcoffset(), "Refusing to load a non-UTC date!"
return date
if isinstance(date, (float, int)):
date = str(date)
if isinstance(date, str):
normalized = date.strip()
if not normalized:
raise ValueError(f'Tried to parse invalid date string! {date}')
raise ValueError(f"Tried to parse invalid date string! {date}")
try:
return datetime.fromtimestamp(float(normalized), tz=timezone.utc)
@@ -250,7 +380,7 @@ def parse_date(date: Any) -> datetime | None:
pass
try:
iso_date = normalized.replace('Z', '+00:00')
iso_date = normalized.replace("Z", "+00:00")
parsed_date = datetime.fromisoformat(iso_date)
if parsed_date.tzinfo is None:
return parsed_date.replace(tzinfo=timezone.utc)
@@ -258,12 +388,12 @@ def parse_date(date: Any) -> datetime | None:
except ValueError:
pass
parsed_date = dateparser(normalized, settings={'TIMEZONE': 'UTC'})
parsed_date = dateparser(normalized, settings={"TIMEZONE": "UTC"})
if parsed_date is None:
raise ValueError(f'Tried to parse invalid date string! {date}')
raise ValueError(f"Tried to parse invalid date string! {date}")
return parsed_date.astimezone(timezone.utc)
raise ValueError('Tried to parse invalid date! {}'.format(date))
raise ValueError(f"Tried to parse invalid date! {date}")
@enforce_types
@@ -284,12 +414,12 @@ def download_url(url: str, timeout: int | None = None) -> str:
response = session.get(
url,
headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
headers={"User-Agent": ARCHIVING_CONFIG.USER_AGENT},
verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
timeout=timeout,
)
content_type = response.headers.get('Content-Type', '')
content_type = response.headers.get("Content-Type", "")
encoding = http_content_type_encoding(content_type) or html_body_declared_encoding(response.text)
if encoding is not None:
@@ -299,21 +429,22 @@ def download_url(url: str, timeout: int | None = None) -> str:
return response.text
except UnicodeDecodeError:
# if response is non-test (e.g. image or other binary files), just return the filename instead
return url.rsplit('/', 1)[-1]
return url.rsplit("/", 1)[-1]
@enforce_types
def get_headers(url: str, timeout: int | None=None) -> str:
def get_headers(url: str, timeout: int | None = None) -> str:
"""Download the contents of a remote url and return the headers"""
# TODO: get rid of this and use an abx pluggy hook instead
from archivebox.config.common import ARCHIVING_CONFIG
timeout = timeout or ARCHIVING_CONFIG.TIMEOUT
try:
response = requests.head(
url,
headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
headers={"User-Agent": ARCHIVING_CONFIG.USER_AGENT},
verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
timeout=timeout,
allow_redirects=True,
@@ -325,19 +456,19 @@ def get_headers(url: str, timeout: int | None=None) -> str:
except RequestException:
response = requests.get(
url,
headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
headers={"User-Agent": ARCHIVING_CONFIG.USER_AGENT},
verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
timeout=timeout,
stream=True
stream=True,
)
return pyjson.dumps(
{
'URL': url,
'Status-Code': response.status_code,
'Elapsed': response.elapsed.total_seconds()*1000,
'Encoding': str(response.encoding),
'Apparent-Encoding': response.apparent_encoding,
"URL": url,
"Status-Code": response.status_code,
"Elapsed": response.elapsed.total_seconds() * 1000,
"Encoding": str(response.encoding),
"Apparent-Encoding": response.apparent_encoding,
**dict(response.headers),
},
indent=4,
@@ -352,17 +483,17 @@ def ansi_to_html(text: str) -> str:
"""
TEMPLATE = '<span style="color: rgb{}"><br>'
text = text.replace('[m', '</span>')
text = text.replace("[m", "</span>")
def single_sub(match):
argsdict = match.groupdict()
if argsdict['arg_3'] is None:
if argsdict['arg_2'] is None:
_, color = 0, argsdict['arg_1']
if argsdict["arg_3"] is None:
if argsdict["arg_2"] is None:
_, color = 0, argsdict["arg_1"]
else:
_, color = argsdict['arg_1'], argsdict['arg_2']
_, color = argsdict["arg_1"], argsdict["arg_2"]
else:
_, color = argsdict['arg_3'], argsdict['arg_2']
_, color = argsdict["arg_3"], argsdict["arg_2"]
return TEMPLATE.format(COLOR_DICT[color][0])
@@ -370,20 +501,19 @@ def ansi_to_html(text: str) -> str:
@enforce_types
def dedupe(options: list[str]) -> list[str]:
    """
    Deduplicates the given CLI args by key=value. Options that come later override earlier.
    """
    deduped = {}
    for option in options:
        # everything before the first '=' identifies the option
        key = option.split("=")[0]
        deduped[key] = option
    # dicts preserve insertion order, so first-seen key order is kept
    return list(deduped.values())
class ExtendedEncoder(pyjson.JSONEncoder):
"""
Extended json serializer that supports serializing several model
@@ -393,7 +523,7 @@ class ExtendedEncoder(pyjson.JSONEncoder):
def default(self, o):
cls_name = o.__class__.__name__
if hasattr(o, '_asdict'):
if hasattr(o, "_asdict"):
return o._asdict()
elif isinstance(o, bytes):
@@ -403,12 +533,12 @@ class ExtendedEncoder(pyjson.JSONEncoder):
return o.isoformat()
elif isinstance(o, Exception):
return '{}: {}'.format(o.__class__.__name__, o)
return f"{o.__class__.__name__}: {o}"
elif isinstance(o, Path):
return str(o)
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
elif cls_name in ("dict_items", "dict_keys", "dict_values"):
return list(o)
elif isinstance(o, Callable):
@@ -434,7 +564,7 @@ class ExtendedEncoder(pyjson.JSONEncoder):
@enforce_types
def to_json(obj: Any, indent: int | None = 4, sort_keys: bool = True) -> str:
    """Serialize object to JSON string with extended type support (via ExtendedEncoder)."""
    return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
@@ -447,97 +577,114 @@ def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True) -> str:
# the consequences of bad URL parsing could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
# Sanity checks run at import time: URL extraction must behave exactly as expected,
# since bad parsing would pollute the archive with mangled links.
assert fix_url_from_markdown("http://example.com/a(b)c).x(y)z") == "http://example.com/a(b)c"
assert (
    fix_url_from_markdown("https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext")
    == "https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def"
)

# (input string, exact list of URLs expected from find_all_urls)
URL_REGEX_TESTS = [
    ("https://example.com", ["https://example.com"]),
    ("https://sweeting.me,https://google.com", ["https://sweeting.me", "https://google.com"]),
    (
        "http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234",
        ["http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234"],
    ),
    (
        "https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc",
        [
            "https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ",
            "https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ",
        ],
    ),
    (
        '<a href="https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ"> abc',
        [
            "https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ",
            "https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ",
        ],
    ),
    ("///a", []),
    ("http://", []),
    ("http://../", ["http://../"]),
    ("http://-error-.invalid/", ["http://-error-.invalid/"]),
    ("https://a(b)c+1#2?3&4/", ["https://a(b)c+1#2?3&4/"]),
    ("http://उदाहरण.परीक्षा", ["http://उदाहरण.परीक्षा"]),
    ("http://例子.测试", ["http://例子.测试"]),
    ("http://➡.ws/䨹 htps://abc.1243?234", ["http://➡.ws/䨹"]),
    ('http://⌘.ws">https://exa+mple.com//:abc ', ["http://⌘.ws", "https://exa+mple.com//:abc"]),
    ("http://مثال.إختبار/abc?def=ت&ب=abc#abc=234", ["http://مثال.إختبار/abc?def=ت&ب=abc#abc=234"]),
    ("http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c'om", ["http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c"]),
    (
        "http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3",
        ["http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3", "http://ex.co:19/a?_d=4#-a=2.3"],
    ),
    ("http://code.google.com/events/#&product=browser", ["http://code.google.com/events/#&product=browser"]),
    ("http://foo.bar?q=Spaces should be encoded", ["http://foo.bar?q=Spaces"]),
    ("http://foo.com/blah_(wikipedia)#c(i)t[e]-1", ["http://foo.com/blah_(wikipedia)#c(i)t"]),
    ("http://foo.com/(something)?after=parens", ["http://foo.com/(something)?after=parens"]),
    ("http://foo.com/unicode_(✪)_in_parens) abc", ["http://foo.com/unicode_(✪)_in_parens"]),
    ("http://foo.bar/?q=Test%20URL-encoded%20stuff", ["http://foo.bar/?q=Test%20URL-encoded%20stuff"]),
    ("[xyz](http://a.b/?q=(Test)%20U)RL-encoded%20stuff", ["http://a.b/?q=(Test)%20U"]),
    ("[xyz](http://a.b/?q=(Test)%20U)-ab https://abc+123", ["http://a.b/?q=(Test)%20U", "https://abc+123"]),
    ("[xyz](http://a.b/?q=(Test)%20U) https://a(b)c+12)3", ["http://a.b/?q=(Test)%20U", "https://a(b)c+12"]),
    ("[xyz](http://a.b/?q=(Test)a\nabchttps://a(b)c+12)3", ["http://a.b/?q=(Test)a", "https://a(b)c+12"]),
    ("http://foo.bar/?q=Test%20URL-encoded%20stuff", ["http://foo.bar/?q=Test%20URL-encoded%20stuff"]),
]

for urls_str, expected_url_matches in URL_REGEX_TESTS:
    url_matches = list(find_all_urls(urls_str))
    assert url_matches == expected_url_matches, "FAILED URL_REGEX CHECK!"
# More test cases
_test_url_strs = {
'example.com': 0,
'/example.com': 0,
'//example.com': 0,
':/example.com': 0,
'://example.com': 0,
'htt://example8.com': 0,
'/htt://example.com': 0,
'https://example': 1,
'https://localhost/2345': 1,
'https://localhost:1234/123': 1,
'://': 0,
'https://': 0,
'http://': 0,
'ftp://': 0,
'ftp://example.com': 0,
'https://example.com': 1,
'https://example.com/': 1,
'https://a.example.com': 1,
'https://a.example.com/': 1,
'https://a.example.com/what/is/happening.html': 1,
'https://a.example.com/what/ís/happening.html': 1,
'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
'https://example.com?what=1#how-about-this=1&2%20baf': 1,
'<test>http://example7.com</test>': 1,
'https://<test>': 0,
'https://[test]': 0,
"example.com": 0,
"/example.com": 0,
"//example.com": 0,
":/example.com": 0,
"://example.com": 0,
"htt://example8.com": 0,
"/htt://example.com": 0,
"https://example": 1,
"https://localhost/2345": 1,
"https://localhost:1234/123": 1,
"://": 0,
"https://": 0,
"http://": 0,
"ftp://": 0,
"ftp://example.com": 0,
"https://example.com": 1,
"https://example.com/": 1,
"https://a.example.com": 1,
"https://a.example.com/": 1,
"https://a.example.com/what/is/happening.html": 1,
"https://a.example.com/what/ís/happening.html": 1,
"https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a": 1,
"https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a": 1,
"HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b": 1,
"https://example.com/?what=1#how-about-this=1&2%20baf": 1,
"https://example.com?what=1#how-about-this=1&2%20baf": 1,
"<test>http://example7.com</test>": 1,
"https://<test>": 0,
"https://[test]": 0,
'http://"test"': 0,
'http://\'test\'': 0,
'[https://example8.com/what/is/this.php?what=1]': 1,
'[and http://example9.com?what=1&other=3#and-thing=2]': 1,
"http://'test'": 0,
"[https://example8.com/what/is/this.php?what=1]": 1,
"[and http://example9.com?what=1&other=3#and-thing=2]": 1,
'<what>https://example10.com#and-thing=2 "</about>': 1,
'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
'<or>http://examplehttp://15.badc</that>': 2,
'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
'[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
"sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi": 1,
"<or>http://examplehttp://15.badc</that>": 2,
"https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://": 2,
"[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)": 3,
}
for url_str, num_urls in _test_url_strs.items():
assert len(list(find_all_urls(url_str))) == num_urls, (
f'{url_str} does not contain {num_urls} urls')
assert len(list(find_all_urls(url_str))) == num_urls, f"{url_str} does not contain {num_urls} urls"
### Chrome Helpers
def chrome_cleanup():
"""
Cleans up any state or runtime files that Chrome leaves behind when killed by
@@ -560,10 +707,11 @@ def chrome_cleanup():
# Also clean up the active persona's explicit CHROME_USER_DATA_DIR if set
# (in case it's a custom path not under PERSONAS_DIR)
from archivebox.config.configset import get_config
config = get_config()
chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
chrome_user_data_dir = config.get("CHROME_USER_DATA_DIR")
if chrome_user_data_dir:
singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock'
singleton_lock = Path(chrome_user_data_dir) / "SingletonLock"
if os.path.lexists(singleton_lock):
try:
singleton_lock.unlink()