This commit is contained in:
Nick Sweeting
2026-03-23 03:58:32 -07:00
parent 268856bcfb
commit b749b26c5d
286 changed files with 21704 additions and 13480 deletions

View File

@@ -1 +1 @@
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
import os
import sys
@@ -23,69 +23,74 @@ def check_data_folder() -> None:
from archivebox import DATA_DIR, ARCHIVE_DIR
from archivebox.config import CONSTANTS
from archivebox.config.paths import create_and_chown_dir, get_or_create_working_tmp_dir, get_or_create_working_lib_dir
archive_dir_exists = os.path.isdir(ARCHIVE_DIR)
if not archive_dir_exists:
print('[red][X] No archivebox index found in the current directory.[/red]', file=sys.stderr)
print(f' {DATA_DIR}', file=sys.stderr)
print("[red][X] No archivebox index found in the current directory.[/red]", file=sys.stderr)
print(f" {DATA_DIR}", file=sys.stderr)
print(file=sys.stderr)
print(' [violet]Hint[/violet]: Are you running archivebox in the right folder?', file=sys.stderr)
print(' cd path/to/your/archive/folder', file=sys.stderr)
print(' archivebox [command]', file=sys.stderr)
print(" [violet]Hint[/violet]: Are you running archivebox in the right folder?", file=sys.stderr)
print(" cd path/to/your/archive/folder", file=sys.stderr)
print(" archivebox [command]", file=sys.stderr)
print(file=sys.stderr)
print(' [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:', file=sys.stderr)
print(' archivebox init', file=sys.stderr)
print(" [violet]Hint[/violet]: To create a new archive collection or import existing data in this folder, run:", file=sys.stderr)
print(" archivebox init", file=sys.stderr)
raise SystemExit(2)
# Create data dir subdirs
create_and_chown_dir(CONSTANTS.SOURCES_DIR)
create_and_chown_dir(CONSTANTS.PERSONAS_DIR / 'Default')
create_and_chown_dir(CONSTANTS.PERSONAS_DIR / "Default")
create_and_chown_dir(CONSTANTS.LOGS_DIR)
# create_and_chown_dir(CONSTANTS.CACHE_DIR)
# Create /tmp and /lib dirs if they don't exist
get_or_create_working_tmp_dir(autofix=True, quiet=False)
get_or_create_working_lib_dir(autofix=True, quiet=False)
# Check data dir permissions, /tmp, and /lib permissions
check_data_dir_permissions()
def check_migrations():
from archivebox import DATA_DIR
from archivebox.misc.db import list_migrations
pending_migrations = [name for status, name in list_migrations() if not status]
is_migrating = any(arg in sys.argv for arg in ['makemigrations', 'migrate', 'init'])
is_migrating = any(arg in sys.argv for arg in ["makemigrations", "migrate", "init"])
if pending_migrations and not is_migrating:
print('[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]')
print(f' {DATA_DIR}', file=sys.stderr)
print("[red][X] This collection was created with an older version of ArchiveBox and must be upgraded first.[/red]")
print(f" {DATA_DIR}", file=sys.stderr)
print(file=sys.stderr)
print(f' [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:', file=sys.stderr)
print(' archivebox init', file=sys.stderr)
print(
f" [violet]Hint:[/violet] To upgrade it to the latest version and apply the {len(pending_migrations)} pending migrations, run:",
file=sys.stderr,
)
print(" archivebox init", file=sys.stderr)
raise SystemExit(3)
def check_io_encoding():
PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace('UTF8', 'UTF-8')
if PYTHON_ENCODING != 'UTF-8':
print(f'[red][X] Your system is running python3 scripts with a bad locale setting: {PYTHON_ENCODING} (it should be UTF-8).[/red]', file=sys.stderr)
PYTHON_ENCODING = (sys.__stdout__ or sys.stdout or sys.__stderr__ or sys.stderr).encoding.upper().replace("UTF8", "UTF-8")
if PYTHON_ENCODING != "UTF-8":
print(
f"[red][X] Your system is running python3 scripts with a bad locale setting: {PYTHON_ENCODING} (it should be UTF-8).[/red]",
file=sys.stderr,
)
print(' To fix it, add the line "export PYTHONIOENCODING=UTF-8" to your ~/.bashrc file (without quotes)', file=sys.stderr)
print(' Or if you\'re using ubuntu/debian, run "dpkg-reconfigure locales"', file=sys.stderr)
print('')
print(' Confirm that it\'s fixed by opening a new shell and running:', file=sys.stderr)
print("")
print(" Confirm that it's fixed by opening a new shell and running:", file=sys.stderr)
print(' python3 -c "import sys; print(sys.stdout.encoding)" # should output UTF-8', file=sys.stderr)
raise SystemExit(2)
# # hard errors: check python version
# if sys.version_info[:3] < (3, 10, 0):
# print('[red][X] Python version is not new enough: {sys.version} (>3.10 is required)[/red]', file=sys.stderr)
# print(' See https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#python for help upgrading your Python installation.', file=sys.stderr)
# raise SystemExit(2)
# # hard errors: check django version
# if int(django.VERSION[0]) < 5:
# print('[red][X] Django version is not new enough: {django.VERSION[:3]} (>=5.0 is required)[/red]', file=sys.stderr)
@@ -96,35 +101,44 @@ def check_io_encoding():
def check_not_root():
from archivebox.config.permissions import IS_ROOT, IN_DOCKER
attempted_command = ' '.join(sys.argv[1:]) if len(sys.argv) > 1 else ''
is_getting_help = '-h' in sys.argv or '--help' in sys.argv or 'help' in sys.argv
is_getting_version = '--version' in sys.argv or 'version' in sys.argv
is_installing = 'setup' in sys.argv or 'install' in sys.argv
attempted_command = " ".join(sys.argv[1:]) if len(sys.argv) > 1 else ""
is_getting_help = "-h" in sys.argv or "--help" in sys.argv or "help" in sys.argv
is_getting_version = "--version" in sys.argv or "version" in sys.argv
is_installing = "setup" in sys.argv or "install" in sys.argv
if IS_ROOT and not (is_getting_help or is_getting_version or is_installing):
print('[red][!] ArchiveBox should never be run as root![/red]', file=sys.stderr)
print(' For more information, see the security overview documentation:', file=sys.stderr)
print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root', file=sys.stderr)
print("[red][!] ArchiveBox should never be run as root![/red]", file=sys.stderr)
print(" For more information, see the security overview documentation:", file=sys.stderr)
print(" https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#do-not-run-as-root", file=sys.stderr)
if IN_DOCKER:
print('[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:', file=sys.stderr)
print(' docker compose run archivebox {attempted_command}', file=sys.stderr)
print(f' docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}', file=sys.stderr)
print(' or:', file=sys.stderr)
print(f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
print(f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"', file=sys.stderr)
print(
"[red][!] When using Docker, you must run commands with [green]docker run[/green] instead of [yellow3]docker exec[/yellow3], e.g.:",
file=sys.stderr,
)
print(" docker compose run archivebox {attempted_command}", file=sys.stderr)
print(f" docker run -it -v $PWD/data:/data archivebox/archivebox {attempted_command}", file=sys.stderr)
print(" or:", file=sys.stderr)
print(
f' docker compose exec --user=archivebox archivebox /bin/bash -c "archivebox {attempted_command}"',
file=sys.stderr,
)
print(
f' docker exec -it --user=archivebox <container id> /bin/bash -c "archivebox {attempted_command}"',
file=sys.stderr,
)
raise SystemExit(2)
def check_not_inside_source_dir():
"""Prevent running ArchiveBox from inside its source directory (would pollute repo with data files)."""
cwd = Path(os.getcwd()).resolve()
is_source_dir = (cwd / 'archivebox' / '__init__.py').exists() and (cwd / 'pyproject.toml').exists()
data_dir_set_elsewhere = os.environ.get('DATA_DIR', '').strip() and Path(os.environ['DATA_DIR']).resolve() != cwd
is_testing = 'pytest' in sys.modules or 'unittest' in sys.modules
is_source_dir = (cwd / "archivebox" / "__init__.py").exists() and (cwd / "pyproject.toml").exists()
data_dir_set_elsewhere = os.environ.get("DATA_DIR", "").strip() and Path(os.environ["DATA_DIR"]).resolve() != cwd
is_testing = "pytest" in sys.modules or "unittest" in sys.modules
if is_source_dir and not data_dir_set_elsewhere and not is_testing:
raise SystemExit('[!] Cannot run from source dir, set DATA_DIR or cd to a data folder first')
raise SystemExit("[!] Cannot run from source dir, set DATA_DIR or cd to a data folder first")
def check_data_dir_permissions():
@@ -132,28 +146,42 @@ def check_data_dir_permissions():
from archivebox.misc.logging import STDERR
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP, DEFAULT_PUID, DEFAULT_PGID, IS_ROOT, USER
from archivebox.config.paths import get_or_create_working_tmp_dir, get_or_create_working_lib_dir
data_dir_stat = Path(DATA_DIR).stat()
data_dir_uid, data_dir_gid = data_dir_stat.st_uid, data_dir_stat.st_gid
data_owned_by_root = data_dir_uid == 0
# data_owned_by_default_user = data_dir_uid == DEFAULT_PUID or data_dir_gid == DEFAULT_PGID
data_owner_doesnt_match = (data_dir_uid != ARCHIVEBOX_USER and data_dir_gid != ARCHIVEBOX_GROUP) if not IS_ROOT else False
data_not_writable = not (os.path.isdir(DATA_DIR) and os.access(DATA_DIR, os.W_OK))
if data_owned_by_root:
STDERR.print('\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red], it must be changed before archiving can run![/yellow]')
STDERR.print(
"\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]root[/red], it must be changed before archiving can run![/yellow]",
)
elif data_owner_doesnt_match or data_not_writable:
STDERR.print(f'\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]')
STDERR.print(
f"\n[yellow]:warning: Warning: ArchiveBox [blue]DATA_DIR[/blue] is currently owned by [red]{data_dir_uid}:{data_dir_gid}[/red], but ArchiveBox user is [blue]{ARCHIVEBOX_USER}:{ARCHIVEBOX_GROUP}[/blue] ({USER})! (ArchiveBox may not be able to write to the data dir)[/yellow]",
)
if data_owned_by_root or data_owner_doesnt_match or data_not_writable:
STDERR.print(f'[violet]Hint:[/violet] Change the current ownership [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to a non-root user & group that will run ArchiveBox, e.g.:')
STDERR.print(f' [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}')
STDERR.print(
f"[violet]Hint:[/violet] Change the current ownership [red]{data_dir_uid}[/red]:{data_dir_gid} (PUID:PGID) to a non-root user & group that will run ArchiveBox, e.g.:",
)
STDERR.print(f" [grey53]sudo[/grey53] chown -R [blue]{DEFAULT_PUID}:{DEFAULT_PGID}[/blue] {DATA_DIR.resolve()}")
STDERR.print()
STDERR.print('[blue]More info:[/blue]')
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]')
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]')
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]')
STDERR.print(' [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]')
STDERR.print("[blue]More info:[/blue]")
STDERR.print(
" [link=https://github.com/ArchiveBox/ArchiveBox#storage-requirements]https://github.com/ArchiveBox/ArchiveBox#storage-requirements[/link]",
)
STDERR.print(
" [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions]https://github.com/ArchiveBox/ArchiveBox/wiki/Security-Overview#permissions[/link]",
)
STDERR.print(
" [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid]https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#puid--pgid[/link]",
)
STDERR.print(
" [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts]https://github.com/ArchiveBox/ArchiveBox/wiki/Troubleshooting#filesystem-doesnt-support-fsync-eg-network-mounts[/link]",
)
from archivebox.config.common import STORAGE_CONFIG
@@ -172,8 +200,8 @@ def check_data_dir_permissions():
# Check /lib dir permissions
check_lib_dir(lib_dir, throw=False, must_exist=True)
os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8)) # noqa: F821
os.umask(0o777 - int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8))
def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
@@ -182,45 +210,57 @@ def check_tmp_dir(tmp_dir=None, throw=False, quiet=False, must_exist=True):
from archivebox.misc.logging_util import pretty_path
from archivebox.config.permissions import ARCHIVEBOX_USER, ARCHIVEBOX_GROUP
from archivebox.config.common import STORAGE_CONFIG
tmp_dir = tmp_dir or STORAGE_CONFIG.TMP_DIR
socket_file = tmp_dir.absolute().resolve() / "supervisord.sock"
if not must_exist and not os.path.isdir(tmp_dir):
# just check that its viable based on its length (because dir may not exist yet, we cant check if its writable)
return len(f'file://{socket_file}') <= 96
return len(f"file://{socket_file}") <= 96
tmp_is_valid = False
allow_no_unix_sockets = os.environ.get('ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS', '').lower() in ('1', 'true', 'yes')
allow_no_unix_sockets = os.environ.get("ARCHIVEBOX_ALLOW_NO_UNIX_SOCKETS", "").lower() in ("1", "true", "yes")
try:
tmp_is_valid = dir_is_writable(tmp_dir)
if not allow_no_unix_sockets:
tmp_is_valid = tmp_is_valid and assert_dir_can_contain_unix_sockets(tmp_dir)
assert tmp_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}'
assert len(f'file://{socket_file}') <= 96, f'ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars.'
assert tmp_is_valid, f"ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to TMP_DIR={tmp_dir}"
assert len(f"file://{socket_file}") <= 96, (
f"ArchiveBox TMP_DIR={tmp_dir} is too long, dir containing unix socket files must be <90 chars."
)
return True
except Exception as e:
if not quiet:
STDERR.print()
ERROR_TEXT = '\n'.join((
'',
f'[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]',
f' [yellow]{e}[/yellow]',
'',
'[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.',
' - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).',
f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
' - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.',
' - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]',
'',
'[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:',
f' [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or "/tmp/archivebox"}[/green]',
'',
))
STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured TMP_DIR[/red]', subtitle='Background workers may fail to start until fixed.'))
ERROR_TEXT = "\n".join(
(
"",
f"[red]:cross_mark: ArchiveBox is unable to use TMP_DIR={pretty_path(tmp_dir)}[/red]",
f" [yellow]{e}[/yellow]",
"",
"[blue]Info:[/blue] [grey53]The TMP_DIR is used for the supervisord unix socket file and other temporary files.",
" - It [red]must[/red] be on a local drive (not inside a docker volume, remote network drive, or FUSE mount).",
f" - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).",
" - It [red]must[/red] be a *short* path (less than 90 characters) due to UNIX path length restrictions for sockets.",
" - It [yellow]should[/yellow] be able to hold at least 200MB of data (in-progress downloads can be large).[/grey53]",
"",
"[violet]Hint:[/violet] Fix it by setting TMP_DIR to a path that meets these requirements, e.g.:",
f" [green]archivebox config --set TMP_DIR={get_or_create_working_tmp_dir(autofix=False, quiet=True) or '/tmp/archivebox'}[/green]",
"",
),
)
STDERR.print(
Panel(
ERROR_TEXT,
expand=False,
border_style="red",
title="[red]:cross_mark: Error with configured TMP_DIR[/red]",
subtitle="Background workers may fail to start until fixed.",
),
)
STDERR.print()
if throw:
raise OSError(f'TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!') from e
raise OSError(f"TMP_DIR={tmp_dir} is invalid, ArchiveBox is unable to use it and the server will fail to start!") from e
return False
@@ -230,38 +270,48 @@ def check_lib_dir(lib_dir: Path | None = None, throw=False, quiet=False, must_ex
from archivebox.misc.logging_util import pretty_path
from archivebox.config.paths import dir_is_writable, get_or_create_working_lib_dir
from archivebox.config.common import STORAGE_CONFIG
lib_dir = lib_dir or STORAGE_CONFIG.LIB_DIR
# assert lib_dir == STORAGE_CONFIG.LIB_DIR, "lib_dir is not the same as the one in the flat config"
if not must_exist and not os.path.isdir(lib_dir):
return True
lib_is_valid = False
try:
lib_is_valid = dir_is_writable(lib_dir)
assert lib_is_valid, f'ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}'
assert lib_is_valid, f"ArchiveBox user PUID={ARCHIVEBOX_USER} PGID={ARCHIVEBOX_GROUP} is unable to write to LIB_DIR={lib_dir}"
return True
except Exception as e:
if not quiet:
STDERR.print()
ERROR_TEXT = '\n'.join((
'',
f'[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]',
f' [yellow]{e}[/yellow]',
'',
'[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.',
f' - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).',
' - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).',
' - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]',
'',
'[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:',
f' [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or "/usr/local/share/archivebox"}[/green]',
'',
))
STDERR.print(Panel(ERROR_TEXT, expand=False, border_style='red', title='[red]:cross_mark: Error with configured LIB_DIR[/red]', subtitle='[yellow]Dependencies may not auto-install properly until fixed.[/yellow]'))
ERROR_TEXT = "\n".join(
(
"",
f"[red]:cross_mark: ArchiveBox is unable to use LIB_DIR={pretty_path(lib_dir)}[/red]",
f" [yellow]{e}[/yellow]",
"",
"[blue]Info:[/blue] [grey53]The LIB_DIR is used to store ArchiveBox auto-installed plugin library and binary dependencies.",
f" - It [red]must[/red] be readable and writable by the ArchiveBox user (PUID={ARCHIVEBOX_USER}, PGID={ARCHIVEBOX_GROUP}).",
" - It [yellow]should[/yellow] be on a local (ideally fast) drive like an SSD or HDD (not on a network drive or external HDD).",
" - It [yellow]should[/yellow] be able to hold at least 1GB of data (some dependencies like Chrome can be large).[/grey53]",
"",
"[violet]Hint:[/violet] Fix it by setting LIB_DIR to a path that meets these requirements, e.g.:",
f" [green]archivebox config --set LIB_DIR={get_or_create_working_lib_dir(autofix=False, quiet=True) or '/usr/local/share/archivebox'}[/green]",
"",
),
)
STDERR.print(
Panel(
ERROR_TEXT,
expand=False,
border_style="red",
title="[red]:cross_mark: Error with configured LIB_DIR[/red]",
subtitle="[yellow]Dependencies may not auto-install properly until fixed.[/yellow]",
),
)
STDERR.print()
if throw:
raise OSError(f'LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.') from e
raise OSError(f"LIB_DIR={lib_dir} is invalid, ArchiveBox is unable to use it and dependencies will fail to install.") from e
return False

View File

@@ -2,18 +2,18 @@
Database utility functions for ArchiveBox.
"""
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
from io import StringIO
from pathlib import Path
from typing import Any, List, Tuple
from typing import Any
from archivebox.config import DATA_DIR
from archivebox.misc.util import enforce_types
@enforce_types
def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]:
def list_migrations(out_dir: Path = DATA_DIR) -> list[tuple[bool, str]]:
"""List all Django migrations and their status"""
from django.core.management import call_command
@@ -23,9 +23,9 @@ def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]:
migrations = []
for line in out.readlines():
if line.strip() and ']' in line:
status_str, name_str = line.strip().split(']', 1)
is_applied = 'X' in status_str
if line.strip() and "]" in line:
status_str, name_str = line.strip().split("]", 1)
is_applied = "X" in status_str
migration_name = name_str.strip()
migrations.append((is_applied, migration_name))
@@ -33,23 +33,21 @@ def list_migrations(out_dir: Path = DATA_DIR) -> List[Tuple[bool, str]]:
@enforce_types
def apply_migrations(out_dir: Path = DATA_DIR) -> List[str]:
def apply_migrations(out_dir: Path = DATA_DIR) -> list[str]:
"""Apply pending Django migrations"""
from django.core.management import call_command
out1 = StringIO()
call_command("migrate", interactive=False, database='default', stdout=out1)
call_command("migrate", interactive=False, database="default", stdout=out1)
out1.seek(0)
return [
line.strip() for line in out1.readlines() if line.strip()
]
return [line.strip() for line in out1.readlines() if line.strip()]
@enforce_types
def get_admins(out_dir: Path = DATA_DIR) -> List[Any]:
def get_admins(out_dir: Path = DATA_DIR) -> list[Any]:
"""Get list of superuser accounts"""
from django.contrib.auth.models import User
return list(User.objects.filter(is_superuser=True).exclude(username='system'))
return list(User.objects.filter(is_superuser=True).exclude(username="system"))

View File

@@ -1,6 +1,7 @@
from functools import wraps
from time import time
def timed_function(func):
"""
Very simple profiling decorator for debugging.
@@ -8,23 +9,25 @@ def timed_function(func):
@timed_function
def my_func():
...
More advanced alternatives:
- viztracer ../.venv/bin/archivebox manage check # https://viztracer.readthedocs.io/en/latest/filter.html
- python -m cProfile -o archivebox.prof ../.venv/bin/archivebox manage check; snakeviz archivebox.prof
- Django Debug Toolbar + django-debug-toolbar-flamegraph
+ Django Requests Tracker (requests-tracker)
"""
@wraps(func)
def wrap(*args, **kwargs):
if args and hasattr(args[0], '__module__'):
if args and hasattr(args[0], "__module__"):
module = args[0].__module__
else:
module = func.__module__
ts_start = time()
result = func(*args, **kwargs)
ts_end = time()
ms_elapsed = int((ts_end-ts_start) * 1000)
print(f'[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)')
ms_elapsed = int((ts_end - ts_start) * 1000)
print(f"[DEBUG][{ms_elapsed}ms] {module}.{func.__name__}(...)")
return result
return wrap

View File

@@ -5,20 +5,19 @@ Note: This file only contains legacy cleanup utilities.
The DB is the single source of truth - use Snapshot.objects queries for all status checks.
"""
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
import os
import json
import shutil
from pathlib import Path
from typing import Tuple, List
from archivebox.config import DATA_DIR, CONSTANTS
from archivebox.misc.util import enforce_types
@enforce_types
def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], List[str]]:
def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> tuple[list[str], list[str]]:
"""
Legacy cleanup: Move folders to their correct timestamp-named locations based on index.json.
@@ -29,19 +28,19 @@ def fix_invalid_folder_locations(out_dir: Path = DATA_DIR) -> Tuple[List[str], L
cant_fix = []
for entry in os.scandir(out_dir / CONSTANTS.ARCHIVE_DIR_NAME):
if entry.is_dir(follow_symlinks=True):
index_path = Path(entry.path) / 'index.json'
index_path = Path(entry.path) / "index.json"
if index_path.exists():
try:
with open(index_path, 'r') as f:
with open(index_path) as f:
data = json.load(f)
timestamp = data.get('timestamp')
timestamp = data.get("timestamp")
except Exception:
continue
if not timestamp:
continue
if not entry.path.endswith(f'/{timestamp}'):
if not entry.path.endswith(f"/{timestamp}"):
dest = out_dir / CONSTANTS.ARCHIVE_DIR_NAME / timestamp
if dest.exists():
cant_fix.append(entry.path)

View File

@@ -2,20 +2,22 @@ import hashlib
import mimetypes
from functools import lru_cache
from pathlib import Path
from typing import Callable
from collections.abc import Callable
from datetime import datetime
@lru_cache(maxsize=1024)
def _cached_file_hash(filepath: str, size: int, mtime: float) -> str:
"""Internal function to calculate file hash with cache key based on path, size and mtime."""
sha256_hash = hashlib.sha256()
with open(filepath, 'rb') as f:
for chunk in iter(lambda: f.read(4096), b''):
with open(filepath, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
sha256_hash.update(chunk)
return sha256_hash.hexdigest()
@lru_cache(maxsize=10)
def hash_file(file_path: Path, pwd: Path | None = None) -> str:
"""Calculate SHA256 hash of a file with caching based on path, size and mtime."""
@@ -30,9 +32,10 @@ def hash_file(file_path: Path, pwd: Path | None = None) -> str:
return _cached_file_hash(
str(abs_path),
stat_info.st_size,
stat_info.st_mtime
stat_info.st_mtime,
)
@lru_cache(maxsize=10)
def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callable | None = None, max_depth: int = -1) -> dict[str, str]:
"""Calculate SHA256 hashes for all files and directories recursively."""
@@ -48,9 +51,12 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl
# Get all files recursively
all_files = get_dir_entries(
dir_path, pwd=pwd, recursive=True,
include_files=True, include_dirs=False,
filter_func=filter_func
dir_path,
pwd=pwd,
recursive=True,
include_files=True,
include_dirs=False,
filter_func=filter_func,
)
hashes: dict[str, str] = {}
@@ -65,39 +71,48 @@ def get_dir_hashes(dir_path: Path, pwd: Path | None = None, filter_func: Callabl
# Calculate hashes for all directories
subdirs = get_dir_entries(
dir_path, pwd=pwd, recursive=True,
include_files=False, include_dirs=True,
include_hidden=False, filter_func=filter_func,
max_depth=max_depth
dir_path,
pwd=pwd,
recursive=True,
include_files=False,
include_dirs=True,
include_hidden=False,
filter_func=filter_func,
max_depth=max_depth,
)
for subdir in subdirs:
subdir_path = dir_path / subdir
subdir_hashes = get_dir_hashes(
subdir_path, filter_func=filter_func,
max_depth=0
subdir_path,
filter_func=filter_func,
max_depth=0,
)
hashes[subdir] = subdir_hashes['.']
hashes[subdir] = subdir_hashes["."]
# Filter results by max_depth
if max_depth >= 0:
hashes = {
path: value for path, value in hashes.items()
if len(Path(path).parts) <= max_depth + 1
}
hashes = {path: value for path, value in hashes.items() if len(Path(path).parts) <= max_depth + 1}
# Calculate root directory hash
hashable_summary.sort()
root_sha256 = hashlib.sha256('\n'.join(hashable_summary).encode()).hexdigest()
hashes['.'] = root_sha256
root_sha256 = hashlib.sha256("\n".join(hashable_summary).encode()).hexdigest()
hashes["."] = root_sha256
return hashes
@lru_cache(maxsize=128)
def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = True,
include_files: bool = True, include_dirs: bool = True, include_hidden: bool = False,
filter_func: Callable | None = None, max_depth: int = -1) -> tuple[str, ...]:
def get_dir_entries(
dir_path: Path,
pwd: Path | None = None,
recursive: bool = True,
include_files: bool = True,
include_dirs: bool = True,
include_hidden: bool = False,
filter_func: Callable | None = None,
max_depth: int = -1,
) -> tuple[str, ...]:
"""Get filtered list of directory entries."""
pwd = Path(pwd) if pwd else None
dir_path = Path(dir_path)
@@ -107,20 +122,20 @@ def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = T
results = []
def process_path(path: Path, depth: int):
if not include_hidden and path.name.startswith('.'):
if not include_hidden and path.name.startswith("."):
return False
if max_depth >= 0 and depth > max_depth:
return False
if filter_func:
info = {
"abspath": str(path.absolute()),
"relpath": str(path.relative_to(dir_path))
"relpath": str(path.relative_to(dir_path)),
}
if not filter_func(info):
return False
return True
for path in dir_path.rglob('*') if recursive else dir_path.glob('*'):
for path in dir_path.rglob("*") if recursive else dir_path.glob("*"):
current_depth = len(path.relative_to(dir_path).parts)
if path.is_file() and include_files and process_path(path, current_depth):
@@ -133,6 +148,7 @@ def get_dir_entries(dir_path: Path, pwd: Path | None = None, recursive: bool = T
return tuple(sorted(results)) # Make immutable for caching
@lru_cache(maxsize=1024)
def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str, int]:
"""Calculate sizes for all files and directories recursively."""
@@ -146,10 +162,10 @@ def get_dir_sizes(dir_path: Path, pwd: Path | None = None, **kwargs) -> dict[str
sizes[path_key] = full_path.stat().st_size
else:
total = 0
for file_path in full_path.rglob('*'):
if file_path.is_file() and not file_path.name.startswith('.'):
for file_path in full_path.rglob("*"):
if file_path.is_file() and not file_path.name.startswith("."):
total += file_path.stat().st_size
sizes[path_key + '/'] = total
sizes[path_key + "/"] = total
return sizes
@@ -165,23 +181,23 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
hashes = get_dir_hashes(dir_path, pwd=pwd, filter_func=filter_func, max_depth=max_depth)
sizes = get_dir_sizes(str(dir_path), pwd=pwd, filter_func=filter_func, max_depth=max_depth)
num_total_subpaths = sum(1 for name in hashes if name != '.')
num_total_subpaths = sum(1 for name in hashes if name != ".")
details = {}
for filename, sha256_hash in sorted(hashes.items()):
abs_path = (dir_path / filename).resolve()
stat_info = abs_path.stat()
num_subpaths = sum(1 for p in hashes if p.startswith(filename + '/'))
num_subpaths = sum(1 for p in hashes if p.startswith(filename + "/"))
is_dir = abs_path.is_dir()
if is_dir:
mime_type = 'inode/directory'
mime_type = "inode/directory"
basename = abs_path.name
extension = ''
num_bytes = sizes[filename + '/']
if filename == '.':
extension = ""
num_bytes = sizes[filename + "/"]
if filename == ".":
num_subpaths = num_total_subpaths
else:
filename += '/'
filename += "/"
num_subpaths = num_subpaths
else: # is_file
num_subpaths = None
@@ -191,14 +207,14 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
num_bytes = sizes[filename]
details[filename] = {
'basename': basename,
'mime_type': mime_type,
'extension': extension,
'num_subpaths': num_subpaths,
'num_bytes': num_bytes,
'hash_sha256': sha256_hash,
'created_at': datetime.fromtimestamp(stat_info.st_ctime).isoformat(),
'modified_at': datetime.fromtimestamp(stat_info.st_mtime).isoformat(),
"basename": basename,
"mime_type": mime_type,
"extension": extension,
"num_subpaths": num_subpaths,
"num_bytes": num_bytes,
"hash_sha256": sha256_hash,
"created_at": datetime.fromtimestamp(stat_info.st_ctime).isoformat(),
"modified_at": datetime.fromtimestamp(stat_info.st_mtime).isoformat(),
}
if filter_func and not filter_func(details[filename]):
@@ -207,12 +223,13 @@ def get_dir_info(dir_path: Path, pwd: Path | None = None, filter_func: Callable
return details
if __name__ == '__main__':
if __name__ == "__main__":
import json
dir_info = get_dir_info(Path('.'), max_depth=6)
with open('.hashes.json', 'w') as f:
dir_info = get_dir_info(Path("."), max_depth=6)
with open(".hashes.json", "w") as f:
json.dump(dir_info, f, indent=4)
print('Wrote .hashes.json')
print("Wrote .hashes.json")
# Example output:
# {

View File

@@ -20,72 +20,73 @@ Plain URLs (also supported):
https://foo.com
"""
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
import sys
import json
import select
from typing import Iterable, Iterator, Dict, Any, Optional, TextIO
from typing import Any, TextIO
from collections.abc import Iterable, Iterator
from pathlib import Path
# Type constants for JSONL records
TYPE_SNAPSHOT = 'Snapshot'
TYPE_ARCHIVERESULT = 'ArchiveResult'
TYPE_TAG = 'Tag'
TYPE_CRAWL = 'Crawl'
TYPE_BINARY = 'Binary'
TYPE_PROCESS = 'Process'
TYPE_MACHINE = 'Machine'
TYPE_SNAPSHOT = "Snapshot"
TYPE_ARCHIVERESULT = "ArchiveResult"
TYPE_TAG = "Tag"
TYPE_CRAWL = "Crawl"
TYPE_BINARY = "Binary"
TYPE_PROCESS = "Process"
TYPE_MACHINE = "Machine"
VALID_TYPES = {TYPE_SNAPSHOT, TYPE_ARCHIVERESULT, TYPE_TAG, TYPE_CRAWL, TYPE_BINARY, TYPE_PROCESS, TYPE_MACHINE}
def parse_line(line: str) -> Optional[Dict[str, Any]]:
def parse_line(line: str) -> dict[str, Any] | None:
"""
Parse a single line of input as either JSONL or plain URL.
Returns a dict with at minimum {'type': '...', 'url': '...'} or None if invalid.
"""
line = line.strip()
if not line or line.startswith('#'):
if not line or line.startswith("#"):
return None
# Try to parse as JSON first
if line.startswith('{'):
if line.startswith("{"):
try:
record = json.loads(line)
# If it has a type, validate it
if 'type' in record and record['type'] not in VALID_TYPES:
if "type" in record and record["type"] not in VALID_TYPES:
# Unknown type, treat as raw data
pass
# If it has url but no type, assume Snapshot
if 'url' in record and 'type' not in record:
record['type'] = TYPE_SNAPSHOT
if "url" in record and "type" not in record:
record["type"] = TYPE_SNAPSHOT
return record
except json.JSONDecodeError:
pass
# Treat as plain URL if it looks like one
if line.startswith('http://') or line.startswith('https://') or line.startswith('file://'):
return {'type': TYPE_SNAPSHOT, 'url': line}
if line.startswith("http://") or line.startswith("https://") or line.startswith("file://"):
return {"type": TYPE_SNAPSHOT, "url": line}
# Could be a snapshot ID (UUID with dashes or compact 32-char hex)
if len(line) == 36 and line.count('-') == 4:
return {'type': TYPE_SNAPSHOT, 'id': line}
if len(line) == 36 and line.count("-") == 4:
return {"type": TYPE_SNAPSHOT, "id": line}
if len(line) == 32:
try:
int(line, 16)
except ValueError:
pass
else:
return {'type': TYPE_SNAPSHOT, 'id': line}
return {"type": TYPE_SNAPSHOT, "id": line}
# Unknown format, skip
return None
def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
def read_stdin(stream: TextIO | None = None) -> Iterator[dict[str, Any]]:
"""
Read JSONL or plain URLs from stdin.
@@ -112,20 +113,20 @@ def read_stdin(stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
yield record
def read_file(path: Path) -> Iterator[Dict[str, Any]]:
def read_file(path: Path) -> Iterator[dict[str, Any]]:
"""
Read JSONL or plain URLs from a file.
Yields parsed records as dicts.
"""
with open(path, 'r') as f:
with open(path) as f:
for line in f:
record = parse_line(line)
if record:
yield record
def read_args_or_stdin(args: Iterable[str], stream: Optional[TextIO] = None) -> Iterator[Dict[str, Any]]:
def read_args_or_stdin(args: Iterable[str], stream: TextIO | None = None) -> Iterator[dict[str, Any]]:
"""
Read from CLI arguments if provided, otherwise from stdin.
@@ -145,16 +146,16 @@ def read_args_or_stdin(args: Iterable[str], stream: Optional[TextIO] = None) ->
yield from read_stdin(stream)
def write_record(record: Dict[str, Any], stream: Optional[TextIO] = None) -> None:
def write_record(record: dict[str, Any], stream: TextIO | None = None) -> None:
"""
Write a single JSONL record to stdout (or provided stream).
"""
active_stream: TextIO = sys.stdout if stream is None else stream
active_stream.write(json.dumps(record) + '\n')
active_stream.write(json.dumps(record) + "\n")
active_stream.flush()
def write_records(records: Iterator[Dict[str, Any]], stream: Optional[TextIO] = None) -> int:
def write_records(records: Iterator[dict[str, Any]], stream: TextIO | None = None) -> int:
"""
Write multiple JSONL records to stdout (or provided stream).

View File

@@ -8,24 +8,26 @@ This is separate from the hooks-based parser system which handles importing
new URLs from bookmark files, RSS feeds, etc.
"""
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
import os
import json
from pathlib import Path
from datetime import datetime, timezone
from typing import Iterator, TypedDict, List
from typing import TypedDict
from collections.abc import Iterator
class SnapshotDict(TypedDict, total=False):
"""
Dictionary type representing a snapshot/link, compatible with Snapshot model fields.
"""
url: str # Required: the URL to archive
timestamp: str # Optional: unix timestamp string
title: str # Optional: page title
tags: str # Optional: comma-separated tags string
sources: List[str] # Optional: list of source file paths
url: str # Required: the URL to archive
timestamp: str # Optional: unix timestamp string
title: str # Optional: page title
tags: str # Optional: comma-separated tags string
sources: list[str] # Optional: list of source file paths
def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
@@ -41,16 +43,16 @@ def parse_json_main_index(out_dir: Path) -> Iterator[SnapshotDict]:
return
try:
with open(index_path, 'r', encoding='utf-8') as f:
with open(index_path, encoding="utf-8") as f:
data = json.load(f)
links = data.get('links', [])
links = data.get("links", [])
for link in links:
yield {
'url': link.get('url', ''),
'timestamp': link.get('timestamp', str(datetime.now(timezone.utc).timestamp())),
'title': link.get('title'),
'tags': link.get('tags', ''),
"url": link.get("url", ""),
"timestamp": link.get("timestamp", str(datetime.now(timezone.utc).timestamp())),
"title": link.get("title"),
"tags": link.get("tags", ""),
}
except (json.JSONDecodeError, KeyError, TypeError):
return
@@ -81,12 +83,12 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
if jsonl_file.exists():
try:
with open(jsonl_file, 'r', encoding='utf-8') as f:
with open(jsonl_file, encoding="utf-8") as f:
for line in f:
line = line.strip()
if line.startswith('{'):
if line.startswith("{"):
record = json.loads(line)
if record.get('type') == 'Snapshot':
if record.get("type") == "Snapshot":
link = record
break
except (json.JSONDecodeError, KeyError, TypeError):
@@ -94,15 +96,15 @@ def parse_json_links_details(out_dir: Path) -> Iterator[SnapshotDict]:
if link is None and json_file.exists():
try:
with open(json_file, 'r', encoding='utf-8') as f:
with open(json_file, encoding="utf-8") as f:
link = json.load(f)
except (json.JSONDecodeError, KeyError, TypeError):
pass
if link:
yield {
'url': link.get('url', ''),
'timestamp': link.get('timestamp', entry.name),
'title': link.get('title'),
'tags': link.get('tags', ''),
"url": link.get("url", ""),
"timestamp": link.get("timestamp", entry.name),
"title": link.get("title"),
"tags": link.get("tags", ""),
}

View File

@@ -1,10 +1,9 @@
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
# Low-level logging primitives (Rich console, ANSI colors, stdout/stderr helpers)
# Higher-level logging functions are in logging_util.py
import sys
from typing import Optional, Union, Tuple, List
from collections import defaultdict
from random import randint
@@ -19,11 +18,13 @@ CONSOLE = Console(width=32768, soft_wrap=True, force_terminal=True)
STDERR = Console(stderr=True, width=32768, soft_wrap=True, force_terminal=True)
IS_TTY = sys.stdout.isatty()
class RainbowHighlighter(Highlighter):
def highlight(self, text):
for index in range(len(text)):
text.stylize(f"color({randint(90, 98)})", index, index + 1)
rainbow = RainbowHighlighter()
@@ -38,49 +39,55 @@ DEFAULT_CLI_COLORS = benedict(
"blue": "\033[01;34m",
"white": "\033[01;37m",
"black": "\033[01;30m",
}
},
)
ANSI = benedict({k: "" for k in DEFAULT_CLI_COLORS.keys()})
COLOR_DICT = defaultdict(
lambda: [(0, 0, 0), (0, 0, 0)],
{
"00": [(0, 0, 0), (0, 0, 0)],
"30": [(0, 0, 0), (0, 0, 0)],
"31": [(255, 0, 0), (128, 0, 0)],
"32": [(0, 200, 0), (0, 128, 0)],
"33": [(255, 255, 0), (128, 128, 0)],
"34": [(0, 0, 255), (0, 0, 128)],
"35": [(255, 0, 255), (128, 0, 128)],
"36": [(0, 255, 255), (0, 128, 128)],
"37": [(255, 255, 255), (255, 255, 255)],
},
)
ANSI = benedict({k: '' for k in DEFAULT_CLI_COLORS.keys()})
COLOR_DICT = defaultdict(lambda: [(0, 0, 0), (0, 0, 0)], {
'00': [(0, 0, 0), (0, 0, 0)],
'30': [(0, 0, 0), (0, 0, 0)],
'31': [(255, 0, 0), (128, 0, 0)],
'32': [(0, 200, 0), (0, 128, 0)],
'33': [(255, 255, 0), (128, 128, 0)],
'34': [(0, 0, 255), (0, 0, 128)],
'35': [(255, 0, 255), (128, 0, 128)],
'36': [(0, 255, 255), (0, 128, 128)],
'37': [(255, 255, 255), (255, 255, 255)],
})
# Logging Helpers (DEPRECATED, use rich.print instead going forward)
def stdout(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
def stdout(*args, color: str | None = None, prefix: str = "", config: benedict | None = None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI
if color:
strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
strs = [ansi[color], " ".join(str(a) for a in args), ansi["reset"], "\n"]
else:
strs = [' '.join(str(a) for a in args), '\n']
strs = [" ".join(str(a) for a in args), "\n"]
sys.stdout.write(prefix + ''.join(strs))
sys.stdout.write(prefix + "".join(strs))
def stderr(*args, color: Optional[str]=None, prefix: str='', config: Optional[benedict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
def stderr(*args, color: str | None = None, prefix: str = "", config: benedict | None = None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI
if color:
strs = [ansi[color], ' '.join(str(a) for a in args), ansi['reset'], '\n']
strs = [ansi[color], " ".join(str(a) for a in args), ansi["reset"], "\n"]
else:
strs = [' '.join(str(a) for a in args), '\n']
strs = [" ".join(str(a) for a in args), "\n"]
sys.stderr.write(prefix + ''.join(strs))
sys.stderr.write(prefix + "".join(strs))
def hint(text: Union[Tuple[str, ...], List[str], str], prefix=' ', config: Optional[benedict]=None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get('USE_COLOR') else ANSI
def hint(text: tuple[str, ...] | list[str] | str, prefix=" ", config: benedict | None = None) -> None:
ansi = DEFAULT_CLI_COLORS if (config or {}).get("USE_COLOR") else ANSI
if isinstance(text, str):
stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text}")
else:
stderr(f"{prefix}{ansi['lightred']}Hint:{ansi['reset']} {text[0]}")
for line in text[1:]:
stderr(f'{prefix} {line}')
stderr(f"{prefix} {line}")

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox'
__package__ = "archivebox"
# High-level logging functions for CLI output and progress tracking
# Low-level primitives (Rich console, ANSI colors) are in logging.py
@@ -14,7 +14,8 @@ from pathlib import Path
from datetime import datetime, timezone
from dataclasses import dataclass
from typing import Any, Optional, List, Dict, Union, Iterable, IO, TYPE_CHECKING, cast
from typing import Any, Optional, IO, TYPE_CHECKING, cast
from collections.abc import Iterable
if TYPE_CHECKING:
from archivebox.core.models import Snapshot
@@ -28,6 +29,7 @@ from archivebox.misc.system import get_dir_size
from archivebox.misc.util import enforce_types
from archivebox.misc.logging import ANSI
@dataclass
class RuntimeStats:
"""mutable stats counter for logging archiving timing info to CLI output"""
@@ -36,14 +38,15 @@ class RuntimeStats:
succeeded: int = 0
failed: int = 0
parse_start_ts: Optional[datetime] = None
parse_end_ts: Optional[datetime] = None
parse_start_ts: datetime | None = None
parse_end_ts: datetime | None = None
index_start_ts: Optional[datetime] = None
index_end_ts: Optional[datetime] = None
index_start_ts: datetime | None = None
index_end_ts: datetime | None = None
archiving_start_ts: datetime | None = None
archiving_end_ts: datetime | None = None
archiving_start_ts: Optional[datetime] = None
archiving_end_ts: Optional[datetime] = None
# globals are bad, mmkay
_LAST_RUN_STATS = RuntimeStats()
@@ -52,49 +55,47 @@ _LAST_RUN_STATS = RuntimeStats()
class TimedProgress:
"""Show a progress bar and measure elapsed time until .end() is called"""
def __init__(self, seconds, prefix=''):
def __init__(self, seconds, prefix=""):
self.SHOW_PROGRESS = SHELL_CONFIG.SHOW_PROGRESS
self.ANSI = SHELL_CONFIG.ANSI
if self.SHOW_PROGRESS:
self.p = Process(target=progress_bar, args=(seconds, prefix, self.ANSI))
self.p.start()
self.stats = {'start_ts': datetime.now(timezone.utc), 'end_ts': None}
self.stats = {"start_ts": datetime.now(timezone.utc), "end_ts": None}
def end(self):
"""immediately end progress, clear the progressbar line, and save end_ts"""
end_ts = datetime.now(timezone.utc)
self.stats['end_ts'] = end_ts
self.stats["end_ts"] = end_ts
if self.SHOW_PROGRESS:
# terminate if we havent already terminated
try:
# kill the progress bar subprocess
try:
self.p.close() # must be closed *before* its terminnated
self.p.close() # must be closed *before* its terminnated
except (KeyboardInterrupt, SystemExit):
print()
raise
except BaseException: # lgtm [py/catch-base-exception]
except BaseException: # lgtm [py/catch-base-exception]
pass
self.p.terminate()
time.sleep(0.1)
# sometimes the timer doesn't terminate properly, then blocks at the join until
# the full time has elapsed. sending a kill tries to avoid that.
try:
self.p.kill()
self.p.kill()
except Exception:
pass
# clear whole terminal line
try:
sys.stdout.write('\r{}{}\r'.format((' ' * SHELL_CONFIG.TERM_WIDTH), self.ANSI['reset']))
except (IOError, BrokenPipeError):
sys.stdout.write("\r{}{}\r".format((" " * SHELL_CONFIG.TERM_WIDTH), self.ANSI["reset"]))
except (OSError, BrokenPipeError):
# ignore when the parent proc has stopped listening to our stdout
pass
except ValueError:
@@ -102,10 +103,10 @@ class TimedProgress:
@enforce_types
def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> None:
def progress_bar(seconds: int, prefix: str = "", ANSI: dict[str, str] = ANSI) -> None:
"""show timer in the form of progress bar, with percentage and seconds remaining"""
output_buf = (sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__)
chunk = '' if output_buf and output_buf.encoding.upper() == 'UTF-8' else '#'
output_buf = sys.stdout or sys.__stdout__ or sys.stderr or sys.__stderr__
chunk = "" if output_buf and output_buf.encoding.upper() == "UTF-8" else "#"
last_width = SHELL_CONFIG.TERM_WIDTH
chunks = last_width - len(prefix) - 20 # number of progress chunks to show (aka max bar width)
try:
@@ -114,37 +115,41 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non
if max_width < last_width:
# when the terminal size is shrunk, we have to write a newline
# otherwise the progress bar will keep wrapping incorrectly
sys.stdout.write('\r\n')
sys.stdout.write("\r\n")
sys.stdout.flush()
chunks = max_width - len(prefix) - 20
pct_complete = s / chunks / seconds * 100
log_pct = (log(pct_complete or 1, 10) / 2) * 100 # everyone likes faster progress bars ;)
bar_width = round(log_pct/(100/chunks))
bar_width = round(log_pct / (100 / chunks))
last_width = max_width
# ████████████████████ 0.9% (1/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
prefix,
ANSI['green' if pct_complete < 80 else 'lightyellow'],
(chunk * bar_width).ljust(chunks),
ANSI['reset'],
round(pct_complete, 1),
round(s/chunks),
seconds,
))
sys.stdout.write(
"\r{}{}{}{} {}% ({}/{}sec)".format(
prefix,
ANSI["green" if pct_complete < 80 else "lightyellow"],
(chunk * bar_width).ljust(chunks),
ANSI["reset"],
round(pct_complete, 1),
round(s / chunks),
seconds,
),
)
sys.stdout.flush()
time.sleep(1 / chunks)
# ██████████████████████████████████ 100.0% (60/60sec)
sys.stdout.write('\r{0}{1}{2}{3} {4}% ({5}/{6}sec)'.format(
prefix,
ANSI['red'],
chunk * chunks,
ANSI['reset'],
100.0,
seconds,
seconds,
))
sys.stdout.write(
"\r{}{}{}{} {}% ({}/{}sec)".format(
prefix,
ANSI["red"],
chunk * chunks,
ANSI["reset"],
100.0,
seconds,
seconds,
),
)
sys.stdout.flush()
# uncomment to have it disappear when it hits 100% instead of staying full red:
# time.sleep(0.5)
@@ -154,10 +159,10 @@ def progress_bar(seconds: int, prefix: str='', ANSI: Dict[str, str]=ANSI) -> Non
print()
def log_cli_command(subcommand: str, subcommand_args: Iterable[str]=(), stdin: str | IO | None=None, pwd: str='.'):
args = ' '.join(subcommand_args)
version_msg = '[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]'.format(
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
def log_cli_command(subcommand: str, subcommand_args: Iterable[str] = (), stdin: str | IO | None = None, pwd: str = "."):
args = " ".join(subcommand_args)
version_msg = "[dark_magenta]\\[{now}][/dark_magenta] [dark_red]ArchiveBox[/dark_red] [dark_goldenrod]v{VERSION}[/dark_goldenrod]: [green4]archivebox [green3]{subcommand}[green2] {args}[/green2]".format(
now=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
VERSION=VERSION,
subcommand=subcommand,
args=args,
@@ -166,44 +171,54 @@ def log_cli_command(subcommand: str, subcommand_args: Iterable[str]=(), stdin: s
# stderr('[bright_black] > {pwd}[/]'.format(pwd=pwd, **ANSI))
# stderr()
print(Panel(version_msg), file=sys.stderr)
### Parsing Stage
def log_importing_started(urls: Union[str, List[str]], depth: int, index_only: bool):
def log_importing_started(urls: str | list[str], depth: int, index_only: bool):
_LAST_RUN_STATS.parse_start_ts = datetime.now(timezone.utc)
print('[green][+] [{}] Adding {} links to index (crawl depth={}){}...[/]'.format(
_LAST_RUN_STATS.parse_start_ts.strftime('%Y-%m-%d %H:%M:%S'),
len(urls) if isinstance(urls, list) else len(urls.split('\n')),
depth,
' (index only)' if index_only else '',
))
print(
"[green][+] [{}] Adding {} links to index (crawl depth={}){}...[/]".format(
_LAST_RUN_STATS.parse_start_ts.strftime("%Y-%m-%d %H:%M:%S"),
len(urls) if isinstance(urls, list) else len(urls.split("\n")),
depth,
" (index only)" if index_only else "",
),
)
def log_source_saved(source_file: str):
print(' > Saved verbatim input to {}/{}'.format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit('/', 1)[-1]))
print(" > Saved verbatim input to {}/{}".format(CONSTANTS.SOURCES_DIR_NAME, source_file.rsplit("/", 1)[-1]))
def log_parsing_finished(num_parsed: int, parser_name: str):
_LAST_RUN_STATS.parse_end_ts = datetime.now(timezone.utc)
print(' > Parsed {} URLs from input ({})'.format(num_parsed, parser_name))
print(f" > Parsed {num_parsed} URLs from input ({parser_name})")
def log_deduping_finished(num_new_links: int):
print(' > Found {} new URLs not already in index'.format(num_new_links))
print(f" > Found {num_new_links} new URLs not already in index")
def log_crawl_started(new_links):
print()
print(f'[green][*] Starting crawl of {len(new_links)} sites 1 hop out from starting point[/]')
print(f"[green][*] Starting crawl of {len(new_links)} sites 1 hop out from starting point[/]")
### Indexing Stage
def log_indexing_process_started(num_links: int):
start_ts = datetime.now(timezone.utc)
_LAST_RUN_STATS.index_start_ts = start_ts
print()
print('[bright_black][*] [{}] Writing {} links to main index...[/]'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
))
print(
"[bright_black][*] [{}] Writing {} links to main index...[/]".format(
start_ts.strftime("%Y-%m-%d %H:%M:%S"),
num_links,
),
)
def log_indexing_process_finished():
@@ -213,46 +228,55 @@ def log_indexing_process_finished():
def log_indexing_started(out_path: str):
if SHELL_CONFIG.IS_TTY:
sys.stdout.write(f' > ./{Path(out_path).relative_to(DATA_DIR)}')
sys.stdout.write(f" > ./{Path(out_path).relative_to(DATA_DIR)}")
def log_indexing_finished(out_path: str):
print(f'\r √ ./{Path(out_path).relative_to(DATA_DIR)}')
print(f"\r √ ./{Path(out_path).relative_to(DATA_DIR)}")
### Archiving Stage
def log_archiving_started(num_links: int, resume: Optional[float]=None):
def log_archiving_started(num_links: int, resume: float | None = None):
start_ts = datetime.now(timezone.utc)
_LAST_RUN_STATS.archiving_start_ts = start_ts
print()
if resume:
print('[green][▶] [{}] Resuming archive updating for {} pages starting from {}...[/]'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
resume,
))
print(
"[green][▶] [{}] Resuming archive updating for {} pages starting from {}...[/]".format(
start_ts.strftime("%Y-%m-%d %H:%M:%S"),
num_links,
resume,
),
)
else:
print('[green][▶] [{}] Starting archiving of {} snapshots in index...[/]'.format(
start_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
))
print(
"[green][▶] [{}] Starting archiving of {} snapshots in index...[/]".format(
start_ts.strftime("%Y-%m-%d %H:%M:%S"),
num_links,
),
)
def log_archiving_paused(num_links: int, idx: int, timestamp: str):
end_ts = datetime.now(timezone.utc)
_LAST_RUN_STATS.archiving_end_ts = end_ts
print()
print('\n[yellow3][X] [{now}] Downloading paused on link {timestamp} ({idx}/{total})[/]'.format(
now=end_ts.strftime('%Y-%m-%d %H:%M:%S'),
idx=idx+1,
timestamp=timestamp,
total=num_links,
))
print(
"\n[yellow3][X] [{now}] Downloading paused on link {timestamp} ({idx}/{total})[/]".format(
now=end_ts.strftime("%Y-%m-%d %H:%M:%S"),
idx=idx + 1,
timestamp=timestamp,
total=num_links,
),
)
print()
print(' Continue archiving where you left off by running:')
print(' archivebox update --resume={}'.format(timestamp))
print(" Continue archiving where you left off by running:")
print(f" archivebox update --resume={timestamp}")
def log_archiving_finished(num_links: int):
@@ -263,24 +287,26 @@ def log_archiving_finished(num_links: int):
assert _LAST_RUN_STATS.archiving_start_ts is not None
seconds = end_ts.timestamp() - _LAST_RUN_STATS.archiving_start_ts.timestamp()
if seconds > 60:
duration = '{0:.2f} min'.format(seconds / 60)
duration = f"{seconds / 60:.2f} min"
else:
duration = '{0:.2f} sec'.format(seconds)
duration = f"{seconds:.2f} sec"
print()
print('[green][√] [{}] Update of {} pages complete ({})[/]'.format(
end_ts.strftime('%Y-%m-%d %H:%M:%S'),
num_links,
duration,
))
print(' - {} links skipped'.format(_LAST_RUN_STATS.skipped))
print(' - {} links updated'.format(_LAST_RUN_STATS.succeeded + _LAST_RUN_STATS.failed))
print(' - {} links had errors'.format(_LAST_RUN_STATS.failed))
print(
"[green][√] [{}] Update of {} pages complete ({})[/]".format(
end_ts.strftime("%Y-%m-%d %H:%M:%S"),
num_links,
duration,
),
)
print(f" - {_LAST_RUN_STATS.skipped} links skipped")
print(f" - {_LAST_RUN_STATS.succeeded + _LAST_RUN_STATS.failed} links updated")
print(f" - {_LAST_RUN_STATS.failed} links had errors")
if Snapshot.objects.count() < 50:
print()
print(' [violet]Hint:[/] To manage your archive in a Web UI, run:')
print(' archivebox server 0.0.0.0:8000')
print(" [violet]Hint:[/] To manage your archive in a Web UI, run:")
print(" archivebox server 0.0.0.0:8000")
def log_snapshot_archiving_started(snapshot: "Snapshot", out_dir: str, is_new: bool):
@@ -289,41 +315,51 @@ def log_snapshot_archiving_started(snapshot: "Snapshot", out_dir: str, is_new: b
# http://www.benstopford.com/2015/02/14/log-structured-merge-trees/
# > output/archive/1478739709
print('\n[[{symbol_color}]{symbol}[/]] [[{symbol_color}]{now}[/]] "{title}"'.format(
symbol_color='green' if is_new else 'bright_black',
symbol='+' if is_new else '',
now=datetime.now(timezone.utc).strftime('%Y-%m-%d %H:%M:%S'),
title=snapshot.title or snapshot.base_url,
))
print(f' [sky_blue1]{snapshot.url}[/]')
print(' {} {}'.format(
'>' if is_new else '',
pretty_path(out_dir),
))
print(
'\n[[{symbol_color}]{symbol}[/]] [[{symbol_color}]{now}[/]] "{title}"'.format(
symbol_color="green" if is_new else "bright_black",
symbol="+" if is_new else "",
now=datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S"),
title=snapshot.title or snapshot.base_url,
),
)
print(f" [sky_blue1]{snapshot.url}[/]")
print(
" {} {}".format(
">" if is_new else "",
pretty_path(out_dir),
),
)
def log_snapshot_archiving_finished(snapshot: "Snapshot", out_dir: str, is_new: bool, stats: dict, start_ts: datetime):
total = sum(stats.values())
if stats['failed'] > 0 :
if stats["failed"] > 0:
_LAST_RUN_STATS.failed += 1
elif stats['skipped'] == total:
elif stats["skipped"] == total:
_LAST_RUN_STATS.skipped += 1
else:
_LAST_RUN_STATS.succeeded += 1
try:
size = get_dir_size(out_dir)
except FileNotFoundError:
size = (0, None, '0')
results = snapshot.archiveresult_set.only("output_files", "output_size")
total_bytes = sum(result.output_size or result.output_size_from_files() for result in results)
total_files = sum(result.output_file_count() for result in results)
size = (total_bytes, 0, total_files)
except Exception:
try:
size = get_dir_size(out_dir)
except FileNotFoundError:
size = (0, None, "0")
end_ts = datetime.now(timezone.utc)
duration = str(end_ts - start_ts).split('.')[0]
print(' [bright_black]{} files ({}) in {}s [/]'.format(size[2], printable_filesize(size[0]), duration))
duration = str(end_ts - start_ts).split(".")[0]
print(f" [bright_black]{size[2]} files ({printable_filesize(size[0])}) in {duration}s [/]")
def log_archive_method_started(method: str):
print(' > {}'.format(method))
print(f" > {method}")
def log_archive_method_finished(result: dict):
@@ -332,122 +368,117 @@ def log_archive_method_finished(result: dict):
copy-paste the outputted string directly to run the cmd
"""
# Prettify CMD string and make it safe to copy-paste by quoting arguments
quoted_cmd = ' '.join(
'"{}"'.format(arg) if (' ' in arg) or (':' in arg) else arg
for arg in result['cmd']
)
quoted_cmd = " ".join(f'"{arg}"' if (" " in arg) or (":" in arg) else arg for arg in result["cmd"])
if result['status'] == 'failed':
output = result.get('output')
if output and output.__class__.__name__ == 'TimeoutExpired':
duration = (result['end_ts'] - result['start_ts']).seconds
if result["status"] == "failed":
output = result.get("output")
if output and output.__class__.__name__ == "TimeoutExpired":
duration = (result["end_ts"] - result["start_ts"]).seconds
hint_header = [
f'[yellow3]Extractor timed out after {duration}s.[/]',
f"[yellow3]Extractor timed out after {duration}s.[/]",
]
else:
error_name = output.__class__.__name__.replace('ArchiveError', '') if output else 'Error'
error_name = output.__class__.__name__.replace("ArchiveError", "") if output else "Error"
hint_header = [
'[yellow3]Extractor failed:[/]',
f' {error_name} [red1]{output}[/]',
"[yellow3]Extractor failed:[/]",
f" {error_name} [red1]{output}[/]",
]
# Prettify error output hints string and limit to five lines
hints = getattr(output, 'hints', None) or () if output else ()
hints = getattr(output, "hints", None) or () if output else ()
if hints:
if isinstance(hints, (list, tuple, type(_ for _ in ()))):
hints = [hint.decode() if isinstance(hint, bytes) else str(hint) for hint in hints]
else:
if isinstance(hints, bytes):
hints = hints.decode()
hints = hints.split('\n')
hints = hints.split("\n")
hints = (
f' [yellow1]{line.strip()}[/]'
for line in list(hints)[:5] if line.strip()
)
hints = (f" [yellow1]{line.strip()}[/]" for line in list(hints)[:5] if line.strip())
docker_hints = ()
if os.environ.get('IN_DOCKER') in ('1', 'true', 'True', 'TRUE', 'yes'):
docker_hints = (
' docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash',
)
if os.environ.get("IN_DOCKER") in ("1", "true", "True", "TRUE", "yes"):
docker_hints = (" docker run -it -v $PWD/data:/data archivebox/archivebox /bin/bash",)
# Collect and prefix output lines with indentation
output_lines = [
*hint_header,
*hints,
'[violet]Run to see full output:[/]',
"[violet]Run to see full output:[/]",
*docker_hints,
*([' cd {};'.format(result.get('pwd'))] if result.get('pwd') else []),
' {}'.format(quoted_cmd),
*([" cd {};".format(result.get("pwd"))] if result.get("pwd") else []),
f" {quoted_cmd}",
]
print('\n'.join(
' {}'.format(line)
for line in output_lines
if line
))
print(
"\n".join(f" {line}" for line in output_lines if line),
)
print()
def log_list_started(filter_patterns: Optional[List[str]], filter_type: str):
print(f'[green][*] Finding links in the archive index matching these {filter_type} patterns:[/]')
print(' {}'.format(' '.join(filter_patterns or ())))
def log_list_started(filter_patterns: list[str] | None, filter_type: str):
print(f"[green][*] Finding links in the archive index matching these {filter_type} patterns:[/]")
print(" {}".format(" ".join(filter_patterns or ())))
def log_list_finished(snapshots):
from archivebox.core.models import Snapshot
print()
print('---------------------------------------------------------------------------------------------------')
print("---------------------------------------------------------------------------------------------------")
csv_queryset = cast(Any, Snapshot.objects.filter(pk__in=[s.pk for s in snapshots]))
print(csv_queryset.to_csv(cols=['timestamp', 'is_archived', 'num_outputs', 'url'], header=True, ljust=16, separator=' | '))
print('---------------------------------------------------------------------------------------------------')
print(csv_queryset.to_csv(cols=["timestamp", "is_archived", "num_outputs", "url"], header=True, ljust=16, separator=" | "))
print("---------------------------------------------------------------------------------------------------")
print()
def log_removal_started(snapshots, yes: bool, delete: bool):
count = snapshots.count() if hasattr(snapshots, 'count') else len(snapshots)
print(f'[yellow3][i] Found {count} matching URLs to remove.[/]')
count = snapshots.count() if hasattr(snapshots, "count") else len(snapshots)
print(f"[yellow3][i] Found {count} matching URLs to remove.[/]")
if delete:
file_counts = [s.num_outputs for s in snapshots if os.access(s.output_dir, os.R_OK)]
print(
f' {count} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n'
f' ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)'
f" {count} Links will be de-listed from the main index, and their archived content folders will be deleted from disk.\n"
f" ({len(file_counts)} data folders with {sum(file_counts)} archived files will be deleted!)",
)
else:
print(
' Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n'
' (Pass --delete if you also want to permanently delete the data folders)'
" Matching links will be de-listed from the main index, but their archived content folders will remain in place on disk.\n"
" (Pass --delete if you also want to permanently delete the data folders)",
)
if not yes:
print()
print(f'[yellow3][?] Do you want to proceed with removing these {count} links?[/]')
print(f"[yellow3][?] Do you want to proceed with removing these {count} links?[/]")
try:
assert input(' y/[n]: ').lower() == 'y'
assert input(" y/[n]: ").lower() == "y"
except (KeyboardInterrupt, EOFError, AssertionError):
raise SystemExit(0)
def log_removal_finished(remaining_links: int, removed_links: int):
if remaining_links == 0 and removed_links == 0:
print()
print('[red1][X] No matching links found.[/]')
print("[red1][X] No matching links found.[/]")
else:
total_before = remaining_links + removed_links
print()
print(f'[red1][√] Removed {removed_links} out of {total_before} links from the archive index.[/]')
print(f' Index now contains {remaining_links} links.')
print(f"[red1][√] Removed {removed_links} out of {total_before} links from the archive index.[/]")
print(f" Index now contains {remaining_links} links.")
### Search Indexing Stage
def log_index_started(url: str):
    """Announce that *url* is being submitted to the search index.

    Fix: the block contained both the pre-image ``.format()`` call and the
    post-image f-string from a merged diff, printing the message twice;
    only the single f-string print is kept.
    """
    print(f"[green][*] Indexing url: {url} in the search index[/]")
    print()
### Helpers
@enforce_types
def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: bool=True) -> str:
def pretty_path(path: Path | str, pwd: Path | str = DATA_DIR, color: bool = True) -> str:
"""convert paths like .../ArchiveBox/archivebox/../output/abc into output/abc"""
pwd = str(Path(pwd)) # .resolve()
path = str(path)
@@ -456,46 +487,46 @@ def pretty_path(path: Union[Path, str], pwd: Union[Path, str]=DATA_DIR, color: b
return path
# replace long absolute paths with ./ relative ones to save on terminal output width
if path.startswith(pwd) and (pwd != '/') and path != pwd:
if path.startswith(pwd) and (pwd != "/") and path != pwd:
if color:
path = path.replace(pwd, '[light_slate_blue].[/light_slate_blue]', 1)
path = path.replace(pwd, "[light_slate_blue].[/light_slate_blue]", 1)
else:
path = path.replace(pwd, '.', 1)
path = path.replace(pwd, ".", 1)
# quote paths containing spaces
if ' ' in path:
if " " in path:
path = f'"{path}"'
# replace home directory with ~ for shorter output
path = path.replace(str(Path('~').expanduser()), '~')
path = path.replace(str(Path("~").expanduser()), "~")
return path
@enforce_types
def printable_filesize(num_bytes: Union[int, float]) -> str:
for count in ['Bytes','KB','MB','GB']:
def printable_filesize(num_bytes: int | float) -> str:
for count in ["Bytes", "KB", "MB", "GB"]:
if num_bytes > -1024.0 and num_bytes < 1024.0:
return '%3.1f %s' % (num_bytes, count)
return f"{num_bytes:3.1f} {count}"
num_bytes /= 1024.0
return '%3.1f %s' % (num_bytes, 'TB')
return "{:3.1f} {}".format(num_bytes, "TB")
@enforce_types
def format_duration(seconds: float) -> str:
"""Format duration in human-readable form."""
if seconds < 1:
return f'{seconds*1000:.0f}ms'
return f"{seconds * 1000:.0f}ms"
elif seconds < 60:
return f'{seconds:.1f}s'
return f"{seconds:.1f}s"
elif seconds < 3600:
minutes = int(seconds // 60)
secs = int(seconds % 60)
return f'{minutes}min {secs}s' if secs else f'{minutes}min'
return f"{minutes}min {secs}s" if secs else f"{minutes}min"
else:
hours = int(seconds // 3600)
minutes = int((seconds % 3600) // 60)
return f'{hours}hr {minutes}min' if minutes else f'{hours}hr'
return f"{hours}hr {minutes}min" if minutes else f"{hours}hr"
@enforce_types
@@ -504,15 +535,15 @@ def truncate_url(url: str, max_length: int = 60) -> str:
if len(url) <= max_length:
return url
# Try to keep the domain and beginning of path
if '://' in url:
protocol, rest = url.split('://', 1)
if '/' in rest:
domain, path = rest.split('/', 1)
if "://" in url:
protocol, rest = url.split("://", 1)
if "/" in rest:
domain, path = rest.split("/", 1)
available = max_length - len(protocol) - len(domain) - 6 # for "://", "/", "..."
if available > 10:
return f'{protocol}://{domain}/{path[:available]}...'
return f"{protocol}://{domain}/{path[:available]}..."
# Fallback: just truncate
return url[:max_length-3] + '...'
return url[: max_length - 3] + "..."
@enforce_types
@@ -520,12 +551,12 @@ def log_worker_event(
worker_type: str,
event: str,
indent_level: int = 0,
pid: Optional[int] = None,
worker_id: Optional[str] = None,
url: Optional[str] = None,
plugin: Optional[str] = None,
metadata: Optional[Dict[str, Any]] = None,
error: Optional[Exception] = None,
pid: int | None = None,
worker_id: str | None = None,
url: str | None = None,
plugin: str | None = None,
metadata: dict[str, Any] | None = None,
error: Exception | None = None,
) -> None:
"""
Log a worker event with structured metadata and indentation.
@@ -541,17 +572,17 @@ def log_worker_event(
metadata: Dict of metadata to show in curly braces
error: Exception if event is an error
"""
indent = ' ' * indent_level
indent = " " * indent_level
from rich.markup import escape
# Build worker identifier (without URL/plugin)
worker_parts = [worker_type]
# Don't add pid/worker_id for DB operations (they happen in whatever process is running)
if pid and worker_type != 'DB':
worker_parts.append(f'pid={pid}')
if worker_id and worker_type in ('CrawlWorker', 'Orchestrator') and worker_type != 'DB':
worker_parts.append(f'id={worker_id}')
if pid and worker_type != "DB":
worker_parts.append(f"pid={pid}")
if worker_id and worker_type in ("CrawlWorker", "Orchestrator") and worker_type != "DB":
worker_parts.append(f"id={worker_id}")
# Build worker label parts for brackets (shown inside brackets)
worker_label_base = worker_parts[0]
@@ -560,53 +591,53 @@ def log_worker_event(
# Build URL/plugin display (shown AFTER the label, outside brackets)
url_extractor_parts = []
if url:
url_extractor_parts.append(f'url: {escape(url)}')
url_extractor_parts.append(f"url: {escape(url)}")
if plugin:
url_extractor_parts.append(f'extractor: {escape(plugin)}')
url_extractor_parts.append(f"extractor: {escape(plugin)}")
url_extractor_str = ' | '.join(url_extractor_parts) if url_extractor_parts else ''
url_extractor_str = " | ".join(url_extractor_parts) if url_extractor_parts else ""
# Build metadata string
metadata_str = ''
metadata_str = ""
if metadata:
# Format metadata nicely
meta_parts = []
for k, v in metadata.items():
if isinstance(v, float):
# Format floats nicely (durations, sizes)
if 'duration' in k.lower():
meta_parts.append(f'{k}: {format_duration(v)}')
elif 'size' in k.lower():
meta_parts.append(f'{k}: {printable_filesize(int(v))}')
if "duration" in k.lower():
meta_parts.append(f"{k}: {format_duration(v)}")
elif "size" in k.lower():
meta_parts.append(f"{k}: {printable_filesize(int(v))}")
else:
meta_parts.append(f'{k}: {v:.2f}')
meta_parts.append(f"{k}: {v:.2f}")
elif isinstance(v, int):
# Format integers - check if it's a size
if 'size' in k.lower() or 'bytes' in k.lower():
meta_parts.append(f'{k}: {printable_filesize(v)}')
if "size" in k.lower() or "bytes" in k.lower():
meta_parts.append(f"{k}: {printable_filesize(v)}")
else:
meta_parts.append(f'{k}: {v}')
meta_parts.append(f"{k}: {v}")
elif isinstance(v, (list, tuple)):
meta_parts.append(f'{k}: {len(v)}')
meta_parts.append(f"{k}: {len(v)}")
else:
meta_parts.append(f'{k}: {v}')
metadata_str = ' | '.join(meta_parts)
meta_parts.append(f"{k}: {v}")
metadata_str = " | ".join(meta_parts)
# Determine color based on event
color = 'white'
if event in ('Starting...', 'Started', 'STARTED', 'Started in background'):
color = 'green'
elif event.startswith('Created'):
color = 'cyan' # DB creation events
elif event in ('Completed', 'COMPLETED', 'All work complete'):
color = 'blue'
elif event in ('Failed', 'ERROR', 'Failed to spawn worker'):
color = 'red'
elif event in ('Shutting down', 'SHUTDOWN'):
color = 'grey53'
color = "white"
if event in ("Starting...", "Started", "STARTED", "Started in background"):
color = "green"
elif event.startswith("Created"):
color = "cyan" # DB creation events
elif event in ("Completed", "COMPLETED", "All work complete"):
color = "blue"
elif event in ("Failed", "ERROR", "Failed to spawn worker"):
color = "red"
elif event in ("Shutting down", "SHUTDOWN"):
color = "grey53"
# Build final message
error_str = f' {type(error).__name__}: {error}' if error else ''
error_str = f" {type(error).__name__}: {error}" if error else ""
from archivebox.misc.logging import CONSOLE, STDERR
from rich.text import Text
@@ -618,19 +649,19 @@ def log_worker_event(
# Add bracketed content if present (using Text.append to avoid markup issues)
if worker_bracket_content:
text.append('[', style=color)
text.append("[", style=color)
text.append(worker_bracket_content, style=color)
text.append(']', style=color)
text.append("]", style=color)
text.append(f' {event}{error_str}', style=color)
text.append(f" {event}{error_str}", style=color)
# Add URL/plugin info first (more important)
if url_extractor_str:
text.append(f' | {url_extractor_str}')
text.append(f" | {url_extractor_str}")
# Then add other metadata
if metadata_str:
text.append(f' | {metadata_str}')
text.append(f" | {metadata_str}")
# Stdout is reserved for JSONL records whenever commands are piped together.
# Route worker/DB progress to stderr in non-TTY contexts so pipelines like
@@ -640,90 +671,85 @@ def log_worker_event(
@enforce_types
def printable_folders(folders: Dict[str, Optional["Snapshot"]], with_headers: bool=False) -> str:
return '\n'.join(
f'{folder} {snapshot and snapshot.url} "{snapshot and snapshot.title}"'
for folder, snapshot in folders.items()
)
@enforce_types
def printable_config(config: dict, prefix: str='') -> str:
return f'\n{prefix}'.join(
f'{key}={val}'
for key, val in config.items()
if not (isinstance(val, dict) or callable(val))
)
def printable_folders(folders: dict[str, Optional["Snapshot"]], with_headers: bool = False) -> str:
return "\n".join(f'{folder} {snapshot and snapshot.url} "{snapshot and snapshot.title}"' for folder, snapshot in folders.items())
@enforce_types
def printable_folder_status(name: str, folder: Dict) -> str:
if folder['enabled']:
if folder['is_valid']:
color, symbol, note, num_files = 'green', '', 'valid', ''
def printable_config(config: dict, prefix: str = "") -> str:
return f"\n{prefix}".join(f"{key}={val}" for key, val in config.items() if not (isinstance(val, dict) or callable(val)))
@enforce_types
def printable_folder_status(name: str, folder: dict) -> str:
if folder["enabled"]:
if folder["is_valid"]:
color, symbol, note, num_files = "green", "", "valid", ""
else:
color, symbol, note, num_files = 'red', 'X', 'invalid', '?'
color, symbol, note, num_files = "red", "X", "invalid", "?"
else:
color, symbol, note, num_files = 'grey53', '-', 'unused', '-'
color, symbol, note, num_files = "grey53", "-", "unused", "-"
if folder['path']:
if os.access(folder['path'], os.R_OK):
if folder["path"]:
if os.access(folder["path"], os.R_OK):
try:
num_files = (
f'{len(os.listdir(folder["path"]))} files'
if os.path.isdir(folder['path']) else
printable_filesize(Path(folder['path']).stat().st_size)
f"{len(os.listdir(folder['path']))} files"
if os.path.isdir(folder["path"])
else printable_filesize(Path(folder["path"]).stat().st_size)
)
except PermissionError:
num_files = 'error'
num_files = "error"
else:
num_files = 'missing'
if folder.get('is_mount'):
num_files = "missing"
if folder.get("is_mount"):
# add symbol @ next to filecount if path is a remote filesystem mount
num_files = f'{num_files} @' if num_files else '@'
num_files = f"{num_files} @" if num_files else "@"
path = pretty_path(folder['path'])
path = pretty_path(folder["path"])
return ' '.join((
f'[{color}]',
symbol,
'[/]',
name.ljust(21).replace('DATA_DIR', '[light_slate_blue]DATA_DIR[/light_slate_blue]'),
num_files.ljust(14).replace('missing', '[grey53]missing[/grey53]'),
f'[{color}]',
note.ljust(8),
'[/]',
path.ljust(76),
))
return " ".join(
(
f"[{color}]",
symbol,
"[/]",
name.ljust(21).replace("DATA_DIR", "[light_slate_blue]DATA_DIR[/light_slate_blue]"),
num_files.ljust(14).replace("missing", "[grey53]missing[/grey53]"),
f"[{color}]",
note.ljust(8),
"[/]",
path.ljust(76),
),
)
@enforce_types
def printable_dependency_version(name: str, dependency: Dict) -> str:
color, symbol, note, version = 'red', 'X', 'invalid', '?'
def printable_dependency_version(name: str, dependency: dict) -> str:
color, symbol, note, version = "red", "X", "invalid", "?"
if dependency['enabled']:
if dependency['is_valid']:
color, symbol, note = 'green', '', 'valid'
if dependency["enabled"]:
if dependency["is_valid"]:
color, symbol, note = "green", "", "valid"
parsed_version_num = re.search(r'[\d\.]+', dependency['version'])
parsed_version_num = re.search(r"[\d\.]+", dependency["version"])
if parsed_version_num:
version = f'v{parsed_version_num[0]}'
version = f"v{parsed_version_num[0]}"
else:
color, symbol, note, version = 'lightyellow', '-', 'disabled', '-'
color, symbol, note, version = "lightyellow", "-", "disabled", "-"
path = pretty_path(dependency['path'])
path = pretty_path(dependency["path"])
return ' '.join((
ANSI[color],
symbol,
ANSI['reset'],
name.ljust(21),
version.ljust(14),
ANSI[color],
note.ljust(8),
ANSI['reset'],
path.ljust(76),
))
return " ".join(
(
ANSI[color],
symbol,
ANSI["reset"],
name.ljust(21),
version.ljust(14),
ANSI[color],
note.ljust(8),
ANSI["reset"],
path.ljust(76),
),
)

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox'
__package__ = "archivebox"
import datetime
@@ -13,7 +13,7 @@ django_stubs_ext.monkeypatch()
# monkey patch django timezone to add back utc (it was removed in Django 5.0)
setattr(timezone, 'utc', datetime.timezone.utc)
setattr(timezone, "utc", datetime.UTC)
# monkey patch django-signals-webhooks to change how it shows up in Admin UI
# from signal_webhooks.apps import DjangoSignalWebhooksConfig
@@ -28,28 +28,29 @@ setattr(timezone, 'utc', datetime.timezone.utc)
# Hide site-packages/sonic/client.py:115: SyntaxWarning
# https://github.com/xmonader/python-sonic-client/pull/18
warnings.filterwarnings("ignore", category=SyntaxWarning, module='sonic')
warnings.filterwarnings("ignore", category=SyntaxWarning, module="sonic")
# Make daphne log requests quieter and esier to read
# Make daphne log requests quieter and easier to read
class ModifiedAccessLogGenerator(access.AccessLogGenerator):
"""Clutge workaround until daphne uses the Python logging framework. https://github.com/django/daphne/pull/473/files"""
def write_entry(self, host, date, request, status=None, length=None, ident=None, user=None):
# Ignore noisy requests to staticfiles / favicons / etc.
if 'GET /static/' in request:
if "GET /static/" in request:
return
if "GET /health/" in request:
return
if 'GET /admin/jsi18n/' in request:
if "GET /admin/jsi18n/" in request:
return
if request.endswith("/favicon.ico") or request.endswith("/robots.txt") or request.endswith("/screenshot.png"):
return
if request.endswith('.css') or request.endswith('.js') or request.endswith('.woff') or request.endswith('.ttf'):
if request.endswith(".css") or request.endswith(".js") or request.endswith(".woff") or request.endswith(".ttf"):
return
if str(status) in ('404', '304'):
if str(status) in ("404", "304"):
return
# clean up the log format to mostly match the same format as django.conf.settings.LOGGING rich formats
self.stream.write(
"%s HTTP %s %s %s\n"
@@ -58,13 +59,14 @@ class ModifiedAccessLogGenerator(access.AccessLogGenerator):
request,
status or "-",
"localhost" if host.startswith("127.") else host.split(":")[0],
)
),
)
access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry # type: ignore
access.AccessLogGenerator.write_entry = ModifiedAccessLogGenerator.write_entry # type: ignore
# fix benedict objects to pretty-print/repr more nicely with rich
# https://stackoverflow.com/a/79048811/2156113
# https://rich.readthedocs.io/en/stable/pretty.html#rich-repr-protocol
benedict.benedict.__rich_repr__ = lambda self: (dict(self),) # type: ignore
benedict.benedict.__rich_repr__ = lambda self: (dict(self),) # type: ignore

View File

@@ -1,30 +1,30 @@
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
from django.core.paginator import Paginator
from django.utils.functional import cached_property
class AccelleratedPaginator(Paginator):
class AcceleratedPaginator(Paginator):
"""
Accellerated Pagniator ignores DISTINCT when counting total number of rows.
Accelerated paginator ignores DISTINCT when counting total number of rows.
Speeds up SELECT Count(*) on Admin views by >20x.
https://hakibenita.com/optimizing-the-django-admin-paginator
"""
@cached_property
def count(self):
has_filters = getattr(self.object_list, '_has_filters', None)
has_filters = getattr(self.object_list, "_has_filters", None)
if callable(has_filters) and has_filters():
# fallback to normal count method on filtered queryset
return super().count
model = getattr(self.object_list, 'model', None)
model = getattr(self.object_list, "model", None)
if model is None:
return super().count
# otherwise count total rows in a separate fast query
return model.objects.count()
# Alternative approach for PostgreSQL: fallback count takes > 200ms
# from django.db import connection, transaction, OperationalError
# with transaction.atomic(), connection.cursor() as cursor:

View File

@@ -3,26 +3,35 @@ import json
import re
import os
import stat
import asyncio
import posixpath
import mimetypes
import importlib
import queue
import threading
import time
import zipfile
from datetime import datetime
from collections.abc import Callable
from pathlib import Path
from urllib.parse import urlencode
from django.contrib.staticfiles import finders
from django.template import TemplateDoesNotExist, loader
from django.views import static
from django.http import StreamingHttpResponse, Http404, HttpResponse, HttpResponseNotModified
from django.utils._os import safe_join
from django.utils.http import http_date
from django.utils.translation import gettext as _
from archivebox.config.common import SERVER_CONFIG
from archivebox.misc.logging_util import printable_filesize
_HASHES_CACHE: dict[Path, tuple[float, dict[str, str]]] = {}
def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None:
hashes_path = snapshot_dir / 'hashes' / 'hashes.json'
hashes_path = snapshot_dir / "hashes" / "hashes.json"
if not hashes_path.exists():
return None
try:
@@ -35,11 +44,11 @@ def _load_hash_map(snapshot_dir: Path) -> dict[str, str] | None:
return cached[1]
try:
data = json.loads(hashes_path.read_text(encoding='utf-8'))
data = json.loads(hashes_path.read_text(encoding="utf-8"))
except Exception:
return None
file_map = {str(entry.get('path')): entry.get('hash') for entry in data.get('files', []) if entry.get('path')}
file_map = {str(entry.get("path")): entry.get("hash") for entry in data.get("files", []) if entry.get("path")}
_HASHES_CACHE[hashes_path] = (mtime, file_map)
return file_map
@@ -52,7 +61,192 @@ def _hash_for_path(document_root: Path, rel_path: str) -> str | None:
def _cache_policy() -> str:
    """Return the Cache-Control visibility keyword for snapshot responses.

    Shared caches (proxies/CDNs) may store responses only when the instance
    is configured to serve snapshots publicly; otherwise mark them private.

    Fix: the block contained both the pre-image and post-image return lines
    from a merged diff (the second was unreachable dead code); only one
    return statement is kept.
    """
    return "public" if SERVER_CONFIG.PUBLIC_SNAPSHOTS else "private"
def _format_direntry_timestamp(stat_result: os.stat_result) -> str:
    """Format a directory entry's timestamp as 'YYYY-MM-DD HH:MM'.

    Prefers the platform's creation time (st_birthtime, available on
    macOS/BSD) and falls back to the modification time elsewhere.
    """
    birthtime = getattr(stat_result, "st_birthtime", None)
    when = datetime.fromtimestamp(birthtime if birthtime else stat_result.st_mtime)
    return when.strftime("%Y-%m-%d %H:%M")
def _safe_zip_stem(name: str) -> str:
    """Sanitize *name* into a filename-safe stem for the generated ZIP.

    Runs of characters outside [A-Za-z0-9._-] collapse to a single dash;
    leading/trailing separators are trimmed. Falls back to 'archivebox'
    when nothing usable remains.
    """
    cleaned = re.sub(r"[^A-Za-z0-9._-]+", "-", name)
    cleaned = cleaned.strip("._-")
    if cleaned:
        return cleaned
    return "archivebox"
class _StreamingQueueWriter:
"""Expose a write-only file-like object so zipfile can stream into a queue."""
def __init__(self, output_queue: queue.Queue[bytes | BaseException | object]) -> None:
self.output_queue = output_queue
self.position = 0
def write(self, data: bytes) -> int:
if data:
self.output_queue.put(data)
self.position += len(data)
return len(data)
def tell(self) -> int:
return self.position
def flush(self) -> None:
return None
def close(self) -> None:
return None
def writable(self) -> bool:
return True
def seekable(self) -> bool:
return False
def _iter_visible_files(root: Path):
    """Yield every non-hidden file under *root*, depth-first, in sorted order.

    Sorting dirnames in place steers os.walk's traversal order, and sorting
    filenames per directory makes the resulting ZIP byte-for-byte
    deterministic across runs. Dotfiles and dot-directories are skipped.
    """
    for dirpath, subdirs, filenames in os.walk(root):
        subdirs[:] = sorted(d for d in subdirs if not d.startswith("."))
        for fname in sorted(f for f in filenames if not f.startswith(".")):
            yield Path(dirpath) / fname
def _build_directory_zip_response(
    fullpath: Path,
    path: str,
    *,
    is_archive_replay: bool,
    use_async_stream: bool,
) -> StreamingHttpResponse:
    """Stream a ZIP of all visible files under *fullpath* without buffering it.

    A daemon thread compresses files into a bounded queue while the response
    iterator drains it. The first ~64KB is coalesced (for at most 50ms) into
    one chunk so browsers start the download promptly instead of receiving
    dozens of tiny ZIP header writes.

    Fixes over the previous version:
    - removed a stray duplicate blocking ``output_queue.get()`` after the
      try/except, which silently discarded the chunk fetched inside the try
      (corrupting the ZIP) and could block forever after the sentinel;
    - after the initial coalescing window elapses, the loop now switches to a
      plain blocking get instead of busy-spinning on ``get(timeout=0)``.
    """
    root_name = _safe_zip_stem(fullpath.name or Path(path).name or "archivebox")
    sentinel = object()  # end-of-stream marker pushed by the builder thread
    output_queue: queue.Queue[bytes | BaseException | object] = queue.Queue(maxsize=8)
    initial_chunk_target = 64 * 1024  # coalesce roughly this many bytes into the first chunk
    initial_chunk_wait = 0.05  # but never delay the first chunk longer than this (seconds)

    def build_zip() -> None:
        # zipfile wants a write-only file object. Feed those bytes straight into
        # a queue so the response can stream them out as soon as they are ready.
        writer = _StreamingQueueWriter(output_queue)
        try:
            with zipfile.ZipFile(writer, mode="w", compression=zipfile.ZIP_DEFLATED, compresslevel=6) as zip_file:
                for entry in _iter_visible_files(fullpath):
                    rel_parts = entry.relative_to(fullpath).parts
                    arcname = Path(root_name, *rel_parts).as_posix()
                    zip_file.write(entry, arcname)
        except BaseException as err:
            # Surface builder failures to the consumer instead of dying silently.
            output_queue.put(err)
        finally:
            output_queue.put(sentinel)

    threading.Thread(target=build_zip, name=f"zip-stream-{root_name}", daemon=True).start()

    def iter_zip_chunks():
        # Emit a meaningful first chunk quickly so browsers show the download
        # immediately instead of waiting on dozens of tiny ZIP header writes.
        first_chunk = bytearray()
        initial_deadline = time.monotonic() + initial_chunk_wait
        coalescing = True  # only the very first chunk is coalesced
        while True:
            if coalescing:
                timeout = max(initial_deadline - time.monotonic(), 0)
                try:
                    chunk = output_queue.get(timeout=timeout)
                except queue.Empty:
                    # Initial window elapsed: flush what we have, stop coalescing.
                    if first_chunk:
                        yield bytes(first_chunk)
                        first_chunk.clear()
                    coalescing = False
                    continue
            else:
                chunk = output_queue.get()
            if chunk is sentinel:
                if first_chunk:
                    yield bytes(first_chunk)
                break
            if isinstance(chunk, BaseException):
                raise chunk
            if coalescing:
                first_chunk.extend(chunk)
                if len(first_chunk) >= initial_chunk_target or time.monotonic() >= initial_deadline:
                    yield bytes(first_chunk)
                    first_chunk.clear()
                    coalescing = False
                continue
            yield chunk

    async def stream_zip_async():
        # Django ASGI buffers sync StreamingHttpResponse iterators by consuming
        # them into a list. Drive the same sync iterator from a worker thread so
        # Daphne can send each chunk as it arrives instead of buffering the ZIP.
        iterator = iter(iter_zip_chunks())
        while True:
            chunk = await asyncio.to_thread(next, iterator, None)
            if chunk is None:
                break
            yield chunk

    response = StreamingHttpResponse(
        stream_zip_async() if use_async_stream else iter_zip_chunks(),
        content_type="application/zip",
    )
    response.headers["Content-Disposition"] = f'attachment; filename="{root_name}.zip"'
    response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
    response.headers["Last-Modified"] = http_date(fullpath.stat().st_mtime)
    response.headers["X-Accel-Buffering"] = "no"  # disable nginx response buffering so chunks flush immediately
    return _apply_archive_replay_headers(
        response,
        fullpath=fullpath,
        content_type="application/zip",
        is_archive_replay=is_archive_replay,
    )
def _render_directory_index(request, path: str, fullpath: Path) -> HttpResponse:
    """Render a templated directory listing for *fullpath*.

    Hidden entries (dotfiles) are skipped, directories sort before files
    (case-insensitively), and a ?download=zip link is included so the whole
    directory can be fetched as a single archive. Falls back to Django's
    built-in plain listing when no custom template is installed.
    """
    try:
        template = loader.select_template(
            [
                "static/directory_index.html",
                "static/directory_index",
            ],
        )
    except TemplateDoesNotExist:
        # No custom template available: serve Django's default directory page.
        return static.directory_index(path, fullpath)

    children = [child for child in fullpath.iterdir() if not child.name.startswith(".")]
    children.sort(key=lambda child: (not child.is_dir(), child.name.lower()))

    entries = []
    file_list = []
    for child in children:
        rel_url = str(child.relative_to(fullpath))
        if child.is_dir():
            rel_url += "/"
        file_list.append(rel_url)
        stat_result = child.stat()
        entries.append(
            {
                "name": rel_url,
                "url": rel_url,
                "is_dir": child.is_dir(),
                "size": "" if child.is_dir() else printable_filesize(stat_result.st_size),
                "timestamp": _format_direntry_timestamp(stat_result),
            },
        )

    zip_query = request.GET.copy()
    zip_query["download"] = "zip"
    zip_url = request.path
    if zip_query:
        zip_url = f"{zip_url}?{zip_query.urlencode()}"

    context = {
        "directory": f"{path}/",
        "file_list": file_list,
        "entries": entries,
        "zip_url": zip_url,
    }
    return HttpResponse(template.render(context))
# Ensure common web types are mapped consistently across platforms.
@@ -71,16 +265,16 @@ mimetypes.add_type("application/xml", ".xml")
mimetypes.add_type("image/svg+xml", ".svg")
try:
_markdown = getattr(importlib.import_module('markdown'), 'markdown')
_markdown = getattr(importlib.import_module("markdown"), "markdown")
except ImportError:
_markdown: Callable[..., str] | None = None
MARKDOWN_INLINE_LINK_RE = re.compile(r'\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)')
MARKDOWN_INLINE_IMAGE_RE = re.compile(r'!\[([^\]]*)\]\(([^)]+)\)')
MARKDOWN_BOLD_RE = re.compile(r'\*\*([^*]+)\*\*')
MARKDOWN_ITALIC_RE = re.compile(r'(?<!\*)\*([^*]+)\*(?!\*)')
HTML_TAG_RE = re.compile(r'<[A-Za-z][^>]*>')
HTML_BODY_RE = re.compile(r'<body[^>]*>(.*)</body>', flags=re.IGNORECASE | re.DOTALL)
MARKDOWN_INLINE_LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)\s]+(?:\([^)]*\)[^)\s]*)*)\)")
MARKDOWN_INLINE_IMAGE_RE = re.compile(r"!\[([^\]]*)\]\(([^)]+)\)")
MARKDOWN_BOLD_RE = re.compile(r"\*\*([^*]+)\*\*")
MARKDOWN_ITALIC_RE = re.compile(r"(?<!\*)\*([^*]+)\*(?!\*)")
HTML_TAG_RE = re.compile(r"<[A-Za-z][^>]*>")
HTML_BODY_RE = re.compile(r"<body[^>]*>(.*)</body>", flags=re.IGNORECASE | re.DOTALL)
RISKY_REPLAY_MIMETYPES = {
"text/html",
"application/xhtml+xml",
@@ -99,8 +293,8 @@ def _extract_markdown_candidate(text: str) -> str:
body_match = HTML_BODY_RE.search(candidate)
if body_match:
candidate = body_match.group(1)
candidate = re.sub(r'^\s*<p[^>]*>', '', candidate, flags=re.IGNORECASE)
candidate = re.sub(r'</p>\s*$', '', candidate, flags=re.IGNORECASE)
candidate = re.sub(r"^\s*<p[^>]*>", "", candidate, flags=re.IGNORECASE)
candidate = re.sub(r"</p>\s*$", "", candidate, flags=re.IGNORECASE)
return candidate.strip()
@@ -109,15 +303,115 @@ def _looks_like_markdown(text: str) -> bool:
if "<html" in lower and "<head" in lower and "</body>" in lower:
return False
md_markers = 0
md_markers += len(re.findall(r'^\s{0,3}#{1,6}\s+\S', text, flags=re.MULTILINE))
md_markers += len(re.findall(r'^\s*[-*+]\s+\S', text, flags=re.MULTILINE))
md_markers += len(re.findall(r'^\s*\d+\.\s+\S', text, flags=re.MULTILINE))
md_markers += text.count('[TOC]')
md_markers += len(re.findall(r"^\s{0,3}#{1,6}\s+\S", text, flags=re.MULTILINE))
md_markers += len(re.findall(r"^\s*[-*+]\s+\S", text, flags=re.MULTILINE))
md_markers += len(re.findall(r"^\s*\d+\.\s+\S", text, flags=re.MULTILINE))
md_markers += text.count("[TOC]")
md_markers += len(MARKDOWN_INLINE_LINK_RE.findall(text))
md_markers += text.count('\n---') + text.count('\n***')
md_markers += text.count("\n---") + text.count("\n***")
return md_markers >= 6
def _render_text_preview_document(text: str, title: str) -> str:
    """Wrap plain *text* in a self-contained dark-themed HTML preview page.

    Both *title* and *text* are HTML-escaped before interpolation, so
    arbitrary archived file contents are safe to embed. The body renders in a
    <pre> with pre-wrap/break-word so long lines soft-wrap instead of forcing
    horizontal scrolling; the title bar stays pinned via position: sticky.
    """
    escaped_title = html.escape(title)
    escaped_text = html.escape(text)
    return f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{escaped_title}</title>
<style>
    :root {{
        color-scheme: dark;
    }}
    html, body {{
        margin: 0;
        padding: 0;
        background: #111;
        color: #f3f3f3;
        font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", monospace;
    }}
    .archivebox-text-preview-header {{
        position: sticky;
        top: 0;
        z-index: 1;
        padding: 10px 14px;
        font-size: 12px;
        line-height: 1.4;
        color: #bbb;
        background: rgba(17, 17, 17, 0.96);
        border-bottom: 1px solid rgba(255, 255, 255, 0.08);
        backdrop-filter: blur(8px);
    }}
    .archivebox-text-preview {{
        margin: 0;
        padding: 14px;
        white-space: pre-wrap;
        word-break: break-word;
        tab-size: 2;
        line-height: 1.45;
        font-size: 13px;
    }}
</style>
</head>
<body>
<div class="archivebox-text-preview-header">{escaped_title}</div>
<pre class="archivebox-text-preview">{escaped_text}</pre>
</body>
</html>"""
def _render_image_preview_document(image_url: str, title: str) -> str:
    """Wrap a single image in a self-contained HTML preview page.

    *title* is HTML-escaped for text contexts and *image_url* is escaped with
    quote=True since it lands inside an attribute value. The image is centered
    and constrained to max-width: 100% so oversized captures scale down to fit
    the viewport while small images keep their natural size.
    """
    escaped_title = html.escape(title)
    escaped_url = html.escape(image_url, quote=True)
    return f"""<!doctype html>
<html lang="en">
<head>
<meta charset="utf-8">
<meta name="viewport" content="width=device-width, initial-scale=1">
<title>{escaped_title}</title>
<style>
    :root {{
        color-scheme: dark;
    }}
    html, body {{
        margin: 0;
        padding: 0;
        width: 100%;
        min-height: 100%;
        background: #fff;
    }}
    body {{
        overflow: auto;
    }}
    .archivebox-image-preview {{
        width: 100%;
        min-width: 100%;
        min-height: 100vh;
        display: flex;
        flex-direction: column;
        align-items: center;
        justify-content: flex-start;
        box-sizing: border-box;
    }}
    .archivebox-image-preview img {{
        display: block;
        width: auto;
        max-width: 100%;
        height: auto;
        margin: 0 auto;
    }}
</style>
</head>
<body>
<div class="archivebox-image-preview">
    <img src="{escaped_url}" alt="{escaped_title}">
</div>
</body>
</html>"""
def _render_markdown_fallback(text: str) -> str:
if _markdown is not None and not HTML_TAG_RE.search(text):
try:
@@ -133,11 +427,11 @@ def _render_markdown_fallback(text: str) -> str:
headings = []
def slugify(value: str) -> str:
slug = re.sub(r'[^A-Za-z0-9]+', '-', value).strip('-')
slug = re.sub(r"[^A-Za-z0-9]+", "-", value).strip("-")
return slug or "section"
for raw_line in lines:
heading_match = re.match(r'^\s{0,3}(#{1,6})\s+(.*)$', raw_line)
heading_match = re.match(r"^\s{0,3}(#{1,6})\s+(.*)$", raw_line)
if heading_match:
level = len(heading_match.group(1))
content = heading_match.group(2).strip()
@@ -152,8 +446,8 @@ def _render_markdown_fallback(text: str) -> str:
def render_inline(markup: str) -> str:
content = MARKDOWN_INLINE_IMAGE_RE.sub(r'<img alt="\1" src="\2">', markup)
content = MARKDOWN_INLINE_LINK_RE.sub(r'<a href="\2">\1</a>', content)
content = MARKDOWN_BOLD_RE.sub(r'<strong>\1</strong>', content)
content = MARKDOWN_ITALIC_RE.sub(r'<em>\1</em>', content)
content = MARKDOWN_BOLD_RE.sub(r"<strong>\1</strong>", content)
content = MARKDOWN_ITALIC_RE.sub(r"<em>\1</em>", content)
return content
def close_lists():
@@ -194,7 +488,7 @@ def _render_markdown_fallback(text: str) -> str:
html_lines.append("<br/>")
continue
heading_match = re.match(r'^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$', line)
heading_match = re.match(r"^\s*((?:<[^>]+>\s*)*)(#{1,6})\s+(.*)$", line)
if heading_match:
close_lists()
if in_blockquote:
@@ -205,7 +499,7 @@ def _render_markdown_fallback(text: str) -> str:
content = heading_match.group(3).strip()
if leading_tags:
html_lines.append(leading_tags)
html_lines.append(f"<h{level} id=\"{slugify(content)}\">{render_inline(content)}</h{level}>")
html_lines.append(f'<h{level} id="{slugify(content)}">{render_inline(content)}</h{level}>')
continue
if stripped in ("---", "***"):
@@ -226,7 +520,7 @@ def _render_markdown_fallback(text: str) -> str:
html_lines.append("</blockquote>")
in_blockquote = False
ul_match = re.match(r'^\s*[-*+]\s+(.*)$', line)
ul_match = re.match(r"^\s*[-*+]\s+(.*)$", line)
if ul_match:
if in_ol:
html_lines.append("</ol>")
@@ -237,7 +531,7 @@ def _render_markdown_fallback(text: str) -> str:
html_lines.append(f"<li>{render_inline(ul_match.group(1))}</li>")
continue
ol_match = re.match(r'^\s*\d+\.\s+(.*)$', line)
ol_match = re.match(r"^\s*\d+\.\s+(.*)$", line)
if ol_match:
if in_ul:
html_lines.append("</ul>")
@@ -255,10 +549,10 @@ def _render_markdown_fallback(text: str) -> str:
toc_items = []
for level, title, slug in headings:
toc_items.append(
f'<li class="toc-level-{level}"><a href="#{slug}">{title}</a></li>'
f'<li class="toc-level-{level}"><a href="#{slug}">{title}</a></li>',
)
html_lines.append(
'<nav class="toc"><ul>' + "".join(toc_items) + '</ul></nav>'
'<nav class="toc"><ul>' + "".join(toc_items) + "</ul></nav>",
)
continue
@@ -276,8 +570,8 @@ def _render_markdown_fallback(text: str) -> str:
def _render_markdown_document(markdown_text: str) -> str:
body = _render_markdown_fallback(markdown_text)
wrapped = (
"<!doctype html><html><head><meta charset=\"utf-8\">"
"<meta name=\"viewport\" content=\"width=device-width,initial-scale=1\">"
'<!doctype html><html><head><meta charset="utf-8">'
'<meta name="viewport" content="width=device-width,initial-scale=1">'
"<style>body{max-width:900px;margin:24px auto;padding:0 16px;"
"font-family:system-ui,-apple-system,Segoe UI,Roboto,Helvetica,Arial,sans-serif;"
"line-height:1.55;} img{max-width:100%;} pre{background:#f6f6f6;padding:12px;overflow:auto;}"
@@ -338,7 +632,7 @@ def _apply_archive_replay_headers(response: HttpResponse, *, fullpath: Path, con
return response
def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False, is_archive_replay: bool=False):
def serve_static_with_byterange_support(request, path, document_root=None, show_indexes=False, is_archive_replay: bool = False):
"""
Overrides Django's built-in django.views.static.serve function to support byte range requests.
This allows you to do things like seek into the middle of a huge mp4 or WACZ without downloading the whole file.
@@ -348,13 +642,20 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
path = posixpath.normpath(path).lstrip("/")
fullpath = Path(safe_join(document_root, path))
if os.access(fullpath, os.R_OK) and fullpath.is_dir():
if request.GET.get("download") == "zip" and show_indexes:
return _build_directory_zip_response(
fullpath,
path,
is_archive_replay=is_archive_replay,
use_async_stream=hasattr(request, "scope"),
)
if show_indexes:
response = static.directory_index(path, fullpath)
response = _render_directory_index(request, path, fullpath)
return _apply_archive_replay_headers(response, fullpath=fullpath, content_type="text/html", is_archive_replay=is_archive_replay)
raise Http404(_("Directory indexes are not allowed here."))
if not os.access(fullpath, os.R_OK):
raise Http404(_("%(path)s” does not exist") % {"path": fullpath})
statobj = fullpath.stat()
document_root = Path(document_root) if document_root else None
rel_path = path
@@ -374,27 +675,91 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
not_modified.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
not_modified.headers["Last-Modified"] = http_date(statobj.st_mtime)
return _apply_archive_replay_headers(not_modified, fullpath=fullpath, content_type="", is_archive_replay=is_archive_replay)
content_type, encoding = mimetypes.guess_type(str(fullpath))
content_type = content_type or "application/octet-stream"
# Add charset for text-like types (best guess), but don't override the type.
is_text_like = (
content_type.startswith("text/")
or content_type in {
"application/json",
"application/javascript",
"application/xml",
"application/x-ndjson",
"image/svg+xml",
}
)
is_text_like = content_type.startswith("text/") or content_type in {
"application/json",
"application/javascript",
"application/xml",
"application/x-ndjson",
"image/svg+xml",
}
if is_text_like and "charset=" not in content_type:
content_type = f"{content_type}; charset=utf-8"
preview_as_text_html = (
bool(request.GET.get("preview"))
and is_text_like
and not content_type.startswith("text/html")
and not content_type.startswith("image/svg+xml")
)
preview_as_image_html = (
bool(request.GET.get("preview")) and content_type.startswith("image/") and not content_type.startswith("image/svg+xml")
)
# Respect the If-Modified-Since header for non-markdown responses.
if not (content_type.startswith("text/plain") or content_type.startswith("text/html")):
if not static.was_modified_since(request.META.get("HTTP_IF_MODIFIED_SINCE"), statobj.st_mtime):
return _apply_archive_replay_headers(HttpResponseNotModified(), fullpath=fullpath, content_type=content_type, is_archive_replay=is_archive_replay)
return _apply_archive_replay_headers(
HttpResponseNotModified(),
fullpath=fullpath,
content_type=content_type,
is_archive_replay=is_archive_replay,
)
# Wrap text-like outputs in HTML when explicitly requested for iframe previewing.
if preview_as_text_html:
try:
max_preview_size = 10 * 1024 * 1024
if statobj.st_size <= max_preview_size:
decoded = fullpath.read_text(encoding="utf-8", errors="replace")
wrapped = _render_text_preview_document(decoded, fullpath.name)
response = HttpResponse(wrapped, content_type="text/html; charset=utf-8")
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
if etag:
response.headers["ETag"] = etag
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
else:
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return _apply_archive_replay_headers(
response,
fullpath=fullpath,
content_type="text/html; charset=utf-8",
is_archive_replay=is_archive_replay,
)
except Exception:
pass
if preview_as_image_html:
try:
preview_query = request.GET.copy()
preview_query.pop("preview", None)
raw_image_url = request.path
if preview_query:
raw_image_url = f"{raw_image_url}?{urlencode(list(preview_query.lists()), doseq=True)}"
wrapped = _render_image_preview_document(raw_image_url, fullpath.name)
response = HttpResponse(wrapped, content_type="text/html; charset=utf-8")
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
if etag:
response.headers["ETag"] = etag
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=31536000, immutable"
else:
response.headers["Cache-Control"] = f"{_cache_policy()}, max-age=60, stale-while-revalidate=300"
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return _apply_archive_replay_headers(
response,
fullpath=fullpath,
content_type="text/html; charset=utf-8",
is_archive_replay=is_archive_replay,
)
except Exception:
pass
# Heuristic fix: some archived HTML outputs (e.g. mercury content.html)
# are stored with HTML-escaped markup or markdown sources. If so, render sensibly.
@@ -421,7 +786,12 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return _apply_archive_replay_headers(response, fullpath=fullpath, content_type="text/html; charset=utf-8", is_archive_replay=is_archive_replay)
return _apply_archive_replay_headers(
response,
fullpath=fullpath,
content_type="text/html; charset=utf-8",
is_archive_replay=is_archive_replay,
)
if escaped_count and escaped_count > tag_count * 2:
response = HttpResponse(decoded, content_type=content_type)
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
@@ -433,11 +803,16 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
response.headers["Content-Disposition"] = f'inline; filename="{fullpath.name}"'
if encoding:
response.headers["Content-Encoding"] = encoding
return _apply_archive_replay_headers(response, fullpath=fullpath, content_type=content_type, is_archive_replay=is_archive_replay)
return _apply_archive_replay_headers(
response,
fullpath=fullpath,
content_type=content_type,
is_archive_replay=is_archive_replay,
)
except Exception:
pass
# setup resposne object
# setup response object
ranged_file = RangedFileReader(open(fullpath, "rb"))
response = StreamingHttpResponse(ranged_file, content_type=content_type)
response.headers["Last-Modified"] = http_date(statobj.st_mtime)
@@ -451,7 +826,7 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
if content_type.startswith("image/"):
response.headers["Cache-Control"] = "public, max-age=604800, immutable"
# handle byte-range requests by serving chunk of file
# handle byte-range requests by serving chunk of file
if stat.S_ISREG(statobj.st_mode):
size = statobj.st_size
response["Content-Length"] = size
@@ -460,7 +835,7 @@ def serve_static_with_byterange_support(request, path, document_root=None, show_
# Respect the Range header.
if "HTTP_RANGE" in request.META:
try:
ranges = parse_range_header(request.META['HTTP_RANGE'], size)
ranges = parse_range_header(request.META["HTTP_RANGE"], size)
except ValueError:
ranges = None
# only handle syntactically valid headers, that are simple (no
@@ -511,7 +886,7 @@ def parse_range_header(header, resource_size):
Parses a range header into a list of two-tuples (start, stop) where `start`
is the starting byte of the range (inclusive) and `stop` is the ending byte
position of the range (exclusive).
Returns None if the value of the header is not syntatically valid.
Returns None if the value of the header is not syntactically valid.
https://github.com/satchamo/django/commit/2ce75c5c4bee2a858c0214d136bfcd351fcde11d
"""
if not header or "=" not in header:

View File

@@ -1,57 +1,63 @@
__package__ = 'archivebox.core'
__package__ = "archivebox.core"
from rich.console import Console
# helpful imports that make the shell easier to work with out-of-the-box:
import re # noqa
import os # noqa
import sys # noqa
import json # noqa
import psutil # noqa
import django # noqa
import pydantic # noqa
import requests # noqa
import subprocess # noqa
import archivebox # noqa
from benedict import benedict # noqa
from django.utils import timezone # noqa
from datetime import datetime, timedelta # noqa
from django.conf import settings # noqa
import re # noqa
import os # noqa
import sys # noqa
import json # noqa
import psutil # noqa
import django # noqa
import pydantic # noqa
import requests # noqa
import subprocess # noqa
import archivebox
from benedict import benedict # noqa
from django.utils import timezone # noqa
from datetime import datetime, timedelta # noqa
from django.conf import settings # noqa
from archivebox import CONSTANTS # noqa
from archivebox.cli import * # noqa
from archivebox import CONSTANTS # noqa
from archivebox.cli import * # noqa
from archivebox.config.configset import get_config
CONFIG = get_config()
if __name__ == '__main__':
if __name__ == "__main__":
# load the rich extension for ipython for pretty printing
# https://rich.readthedocs.io/en/stable/introduction.html#ipython-extension
get_ipython().run_line_magic('load_ext', 'rich') # type: ignore # noqa
get_ipython().run_line_magic("load_ext", "rich") # type: ignore # noqa
# prnt = print with cropping using ... ellipsis for helptext that doens't matter that much
# prnt = print with cropping using ... ellipsis for helptext that doesn't matter that much
console = Console()
prnt = lambda *args, **kwargs: console.print(*args, overflow='ellipsis', soft_wrap=True, **kwargs)
prnt = lambda *args, **kwargs: console.print(*args, overflow="ellipsis", soft_wrap=True, **kwargs)
# print the welcome message
prnt('[green]import re, os, sys, psutil, subprocess, requests, json, pydantic, benedict, django[/]')
prnt('[yellow4]# ArchiveBox Imports[/]')
prnt('[yellow4]import archivebox[/]')
prnt('[yellow4]from archivebox.cli import *[/]')
prnt("[green]import re, os, sys, psutil, subprocess, requests, json, pydantic, benedict, django[/]")
prnt("[yellow4]# ArchiveBox Imports[/]")
prnt("[yellow4]import archivebox[/]")
prnt("[yellow4]from archivebox.cli import *[/]")
prnt()
if console.width >= 80:
from archivebox.misc.logging import rainbow
prnt(rainbow(archivebox.ASCII_LOGO))
prnt('[i] :heavy_dollar_sign: Welcome to the ArchiveBox Shell!')
prnt(' [deep_sky_blue4]Docs:[/deep_sky_blue4] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage[/link]')
prnt(' [link=https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html]https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html[/link]')
prnt("[i] :heavy_dollar_sign: Welcome to the ArchiveBox Shell!")
prnt(
" [deep_sky_blue4]Docs:[/deep_sky_blue4] [link=https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage]https://github.com/ArchiveBox/ArchiveBox/wiki/Usage#Shell-Usage[/link]",
)
prnt(
" [link=https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html]https://docs.archivebox.io/en/dev/apidocs/archivebox/archivebox.html[/link]",
)
prnt()
prnt(' :grey_question: [violet]Hint[/] [i]Here are some examples to get started:[/]')
prnt(' add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ? after anything to get help[/]')
prnt(" :grey_question: [violet]Hint[/] [i]Here are some examples to get started:[/]")
prnt(
" add[blink][deep_sky_blue4]?[/deep_sky_blue4][/blink] [grey53]# add ? after anything to get help[/]",
)
prnt(' add("https://example.com/some/new/url") [grey53]# call CLI methods from the shell[/]')
prnt(' snap = Snapshot.objects.filter(url__contains="https://example.com").last() [grey53]# query for individual snapshots[/]')
prnt(' snap.archiveresult_set.all() [grey53]# see extractor plugin results[/]')
prnt(" snap.archiveresult_set.all() [grey53]# see extractor plugin results[/]")
prnt(' bool(re.compile(CONFIG.URL_DENYLIST).search("https://example.com/abc.exe")) [grey53]# test out a config change[/]')

View File

@@ -1,4 +1,4 @@
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
import os
@@ -8,7 +8,6 @@ import sys
from json import dump
from pathlib import Path
from typing import Optional, Union, Tuple
from subprocess import PIPE, Popen, CalledProcessError, CompletedProcess, TimeoutExpired
from atomicwrites import atomic_write as lib_atomic_write
@@ -16,29 +15,30 @@ from atomicwrites import atomic_write as lib_atomic_write
from archivebox.config.common import STORAGE_CONFIG
from archivebox.misc.util import enforce_types, ExtendedEncoder
IS_WINDOWS = os.name == 'nt'
IS_WINDOWS = os.name == "nt"
def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False, text=False, start_new_session=True, **kwargs):
"""Patched of subprocess.run to kill forked child subprocesses and fix blocking io making timeout=innefective
Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py
Mostly copied from https://github.com/python/cpython/blob/master/Lib/subprocess.py
"""
cmd = [str(arg) for arg in cmd]
if input is not None:
if kwargs.get('stdin') is not None:
raise ValueError('stdin and input arguments may not both be used.')
kwargs['stdin'] = PIPE
if kwargs.get("stdin") is not None:
raise ValueError("stdin and input arguments may not both be used.")
kwargs["stdin"] = PIPE
if capture_output:
if ('stdout' in kwargs) or ('stderr' in kwargs):
raise ValueError('stdout and stderr arguments may not be used with capture_output.')
kwargs['stdout'] = PIPE
kwargs['stderr'] = PIPE
if ("stdout" in kwargs) or ("stderr" in kwargs):
raise ValueError("stdout and stderr arguments may not be used with capture_output.")
kwargs["stdout"] = PIPE
kwargs["stderr"] = PIPE
pgid = None
try:
if isinstance(cmd, (list, tuple)) and cmd[0].endswith('.py'):
if isinstance(cmd, (list, tuple)) and cmd[0].endswith(".py"):
PYTHON_BINARY = sys.executable
cmd = (PYTHON_BINARY, *cmd)
@@ -69,8 +69,12 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False,
retcode = process.poll()
if check and retcode:
raise CalledProcessError(retcode, process.args,
output=stdout, stderr=stderr)
raise CalledProcessError(
retcode,
process.args,
output=stdout,
stderr=stderr,
)
finally:
# force kill any straggler subprocesses that were forked from the main proc
try:
@@ -83,11 +87,11 @@ def run(cmd, *args, input=None, capture_output=True, timeout=None, check=False,
@enforce_types
def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], overwrite: bool=True) -> None:
def atomic_write(path: Path | str, contents: dict | str | bytes, overwrite: bool = True) -> None:
"""Safe atomic write to filesystem by writing to temp file + atomic rename"""
mode = 'wb+' if isinstance(contents, bytes) else 'w'
encoding = None if isinstance(contents, bytes) else 'utf-8' # enforce utf-8 on all text writes
mode = "wb+" if isinstance(contents, bytes) else "w"
encoding = None if isinstance(contents, bytes) else "utf-8" # enforce utf-8 on all text writes
# print('\n> Atomic Write:', mode, path, len(contents), f'overwrite={overwrite}')
try:
@@ -99,8 +103,12 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
except OSError as e:
if STORAGE_CONFIG.ENFORCE_ATOMIC_WRITES:
print(f"[X] OSError: Failed to write {path} with fcntl.F_FULLFSYNC. ({e})")
print(" You can store the archive/ subfolder on a hard drive or network share that doesn't support support syncronous writes,")
print(" but the main folder containing the index.sqlite3 and ArchiveBox.conf files must be on a filesystem that supports FSYNC.")
print(
" You can store the archive/ subfolder on a hard drive or network share that doesn't support support synchronous writes,",
)
print(
" but the main folder containing the index.sqlite3 and ArchiveBox.conf files must be on a filesystem that supports FSYNC.",
)
raise SystemExit(1)
# retry the write without forcing FSYNC (aka atomic mode)
@@ -113,19 +121,20 @@ def atomic_write(path: Union[Path, str], contents: Union[dict, str, bytes], over
# set file permissions
os.chmod(path, int(STORAGE_CONFIG.OUTPUT_PERMISSIONS, base=8))
@enforce_types
def chmod_file(path: str, cwd: str='') -> None:
def chmod_file(path: str, cwd: str = "") -> None:
"""chmod -R <permissions> <cwd>/<path>"""
root = Path(cwd or os.getcwd()) / path
if not os.access(root, os.R_OK):
raise Exception('Failed to chmod: {} does not exist (did the previous step fail?)'.format(path))
raise Exception(f"Failed to chmod: {path} does not exist (did the previous step fail?)")
if not root.is_dir():
# path is just a plain file
os.chmod(root, int(STORAGE_CONFIG.OUTPUT_PERMISSIONS, base=8))
else:
for subpath in Path(path).glob('**/*'):
for subpath in Path(path).glob("**/*"):
if subpath.is_dir():
# directories need execute permissions to be able to list contents
os.chmod(subpath, int(STORAGE_CONFIG.DIR_OUTPUT_PERMISSIONS, base=8))
@@ -134,24 +143,24 @@ def chmod_file(path: str, cwd: str='') -> None:
@enforce_types
def copy_and_overwrite(from_path: Union[str, Path], to_path: Union[str, Path]):
def copy_and_overwrite(from_path: str | Path, to_path: str | Path):
"""copy a given file or directory to a given path, overwriting the destination"""
assert os.access(from_path, os.R_OK)
if Path(from_path).is_dir():
shutil.rmtree(to_path, ignore_errors=True)
shutil.copytree(from_path, to_path)
else:
with open(from_path, 'rb') as src:
with open(from_path, "rb") as src:
contents = src.read()
atomic_write(to_path, contents)
@enforce_types
def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional[str]=None) -> Tuple[int, int, int]:
"""get the total disk size of a given directory, optionally summing up
recursively and limiting to a given filter list
def get_dir_size(path: str | Path, recursive: bool = True, pattern: str | None = None) -> tuple[int, int, int]:
"""get the total disk size of a given directory, optionally summing up
recursively and limiting to a given filter list
"""
num_bytes, num_dirs, num_files = 0, 0, 0
try:
@@ -174,20 +183,21 @@ def get_dir_size(path: Union[str, Path], recursive: bool=True, pattern: Optional
pass
return num_bytes, num_dirs, num_files
class suppress_output(object):
class suppress_output:
"""
A context manager for doing a "deep suppression" of stdout and stderr in
Python, i.e. will suppress all print, even if the print originates in a
A context manager for doing a "deep suppression" of stdout and stderr in
Python, i.e. will suppress all print, even if the print originates in a
compiled C/Fortran sub-function.
This will not suppress raised exceptions, since exceptions are printed
to stderr just before a script exits, and after the context manager has
exited (at least, I think that is why it lets exceptions through).
exited (at least, I think that is why it lets exceptions through).
with suppress_stdout_stderr():
rogue_function()
"""
def __init__(self, stdout=True, stderr=True):
# Open a pair of null files
# Save the actual stdout (1) and stderr (2) file descriptors.

View File

@@ -1,4 +1,5 @@
from typing import Any, List, Callable, cast
from typing import Any, cast
from collections.abc import Callable
import json
import ast
@@ -12,15 +13,16 @@ from pathlib import Path, PosixPath
from pydantic.json_schema import GenerateJsonSchema
from pydantic_core import to_jsonable_python
JSONValue = str | bool | int | None | List['JSONValue']
JSONValue = str | bool | int | None | list["JSONValue"]
TOML_HEADER = "# Converted from INI to TOML format: https://toml.io/en/\n\n"
def load_ini_value(val: str) -> JSONValue:
"""Convert lax INI values into strict TOML-compliant (JSON) values"""
if val.lower() in ('true', 'yes', '1'):
if val.lower() in ("true", "yes", "1"):
return True
if val.lower() in ('false', 'no', '0'):
if val.lower() in ("false", "no", "0"):
return False
if val.isdigit():
return int(val)
@@ -34,7 +36,7 @@ def load_ini_value(val: str) -> JSONValue:
return json.loads(val)
except Exception:
pass
return val
@@ -42,7 +44,7 @@ def convert(ini_str: str) -> str:
"""Convert a string of INI config into its TOML equivalent (warning: strips comments)"""
config = configparser.ConfigParser()
setattr(config, 'optionxform', str) # capitalize key names
setattr(config, "optionxform", str) # capitalize key names
config.read_string(ini_str)
# Initialize an empty dictionary to store the TOML representation
@@ -70,22 +72,22 @@ def convert(ini_str: str) -> str:
return toml_str.strip()
class JSONSchemaWithLambdas(GenerateJsonSchema):
"""
Encode lambda functions in default values properly.
Usage:
>>> json.dumps(value, encoder=JSONSchemaWithLambdas())
"""
def encode_default(self, dft: Any) -> Any:
config = self._config
if isinstance(dft, Callable):
return '{{lambda ' + inspect.getsource(dft).split('=lambda ')[-1].strip()[:-1] + '}}'
return "{{lambda " + inspect.getsource(dft).split("=lambda ")[-1].strip()[:-1] + "}}"
return to_jsonable_python(
dft,
timedelta_mode=config.ser_json_timedelta,
bytes_mode=config.ser_json_bytes,
serialize_unknown=True
serialize_unknown=True,
)
# for computed_field properties render them like this instead:
@@ -94,19 +96,21 @@ class JSONSchemaWithLambdas(GenerateJsonSchema):
def better_toml_dump_str(val: Any) -> str:
try:
dump_str = cast(Callable[[Any], str], getattr(toml.encoder, '_dump_str'))
dump_str = cast(Callable[[Any], str], getattr(toml.encoder, "_dump_str"))
return dump_str(val)
except Exception:
# if we hit any of toml's numerous encoding bugs,
# fall back to using json representation of string
return json.dumps(str(val))
class CustomTOMLEncoder(toml.encoder.TomlEncoder):
"""
Custom TomlEncoder to work around https://github.com/uiri/toml's many encoding bugs.
More info: https://github.com/fabiocaccamo/python-benedict/issues/439
>>> toml.dumps(value, encoder=CustomTOMLEncoder())
"""
def __init__(self, **kwargs):
super().__init__(**kwargs)
dump_funcs = cast(dict[Any, Callable[[Any], str]], self.dump_funcs)

View File

@@ -1,12 +1,14 @@
__package__ = 'archivebox.misc'
__package__ = "archivebox.misc"
import re
import requests
import json as pyjson
import http.cookiejar
from decimal import Decimal, InvalidOperation
from dateparser import parse as dateparser
from typing import List, Optional, Any, Callable
from typing import Any
from collections.abc import Callable
from pathlib import Path
from inspect import signature
from functools import wraps
@@ -18,8 +20,10 @@ from requests.exceptions import RequestException, ReadTimeout
from base32_crockford import encode as base32_encode
from w3lib.encoding import html_body_declared_encoding, http_content_type_encoding
try:
import chardet # type:ignore
import chardet # type:ignore
detect_encoding = lambda rawdata: chardet.detect(rawdata)["encoding"]
except ImportError:
detect_encoding = lambda rawdata: "utf-8"
@@ -35,57 +39,135 @@ from .logging import COLOR_DICT
# All of these are (str) -> str
# shortcuts to: https://docs.python.org/3/library/urllib.parse.html#url-parsing
scheme = lambda url: urlparse(url).scheme.lower()
without_scheme = lambda url: urlparse(url)._replace(scheme='').geturl().strip('//')
without_query = lambda url: urlparse(url)._replace(query='').geturl().strip('//')
without_fragment = lambda url: urlparse(url)._replace(fragment='').geturl().strip('//')
without_path = lambda url: urlparse(url)._replace(path='', fragment='', query='').geturl().strip('//')
without_scheme = lambda url: urlparse(url)._replace(scheme="").geturl().strip("//")
without_query = lambda url: urlparse(url)._replace(query="").geturl().strip("//")
without_fragment = lambda url: urlparse(url)._replace(fragment="").geturl().strip("//")
without_path = lambda url: urlparse(url)._replace(path="", fragment="", query="").geturl().strip("//")
path = lambda url: urlparse(url).path
basename = lambda url: urlparse(url).path.rsplit('/', 1)[-1]
basename = lambda url: urlparse(url).path.rsplit("/", 1)[-1]
domain = lambda url: urlparse(url).netloc
query = lambda url: urlparse(url).query
fragment = lambda url: urlparse(url).fragment
extension = lambda url: basename(url).rsplit('.', 1)[-1].lower() if '.' in basename(url) else ''
extension = lambda url: basename(url).rsplit(".", 1)[-1].lower() if "." in basename(url) else ""
base_url = lambda url: without_scheme(url) # uniq base url used to dedupe links
without_www = lambda url: url.replace('://www.', '://', 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == '/' else url.replace('/?', '?')
hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode('utf-8')).hexdigest(), 16))[:20]
without_www = lambda url: url.replace("://www.", "://", 1)
without_trailing_slash = lambda url: url[:-1] if url[-1] == "/" else url.replace("/?", "?")
hashurl = lambda url: base32_encode(int(sha256(base_url(url).encode("utf-8")).hexdigest(), 16))[:20]
urlencode = lambda s: s and quote(s, encoding='utf-8', errors='replace')
urlencode = lambda s: s and quote(s, encoding="utf-8", errors="replace")
urldecode = lambda s: s and unquote(s)
htmlencode = lambda s: s and escape(s, quote=True)
htmldecode = lambda s: s and unescape(s)
def short_ts(ts: Any) -> str | None:
parsed = parse_date(ts)
return None if parsed is None else str(parsed.timestamp()).split('.')[0]
return None if parsed is None else str(parsed.timestamp()).split(".")[0]
def ts_to_date_str(ts: Any) -> str | None:
parsed = parse_date(ts)
return None if parsed is None else parsed.strftime('%Y-%m-%d %H:%M')
return None if parsed is None else parsed.strftime("%Y-%m-%d %H:%M")
def ts_to_iso(ts: Any) -> str | None:
parsed = parse_date(ts)
return None if parsed is None else parsed.isoformat()
COLOR_REGEX = re.compile(r'\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m')
COLOR_REGEX = re.compile(r"\[(?P<arg_1>\d+)(;(?P<arg_2>\d+)(;(?P<arg_3>\d+))?)?m")
# https://mathiasbynens.be/demo/url-regex
URL_REGEX = re.compile(
r'(?=('
r'http[s]?://' # start matching from allowed schemes
r'(?:[a-zA-Z]|[0-9]' # followed by allowed alphanum characters
r'|[-_$@.&+!*\(\),]' # or allowed symbols (keep hyphen first to match literal hyphen)
r'|[^\u0000-\u007F])+' # or allowed unicode bytes
r'[^\]\[<>"\'\s]+' # stop parsing at these symbols
r'))',
r"(?=("
r"http[s]?://" # start matching from allowed schemes
r"(?:[a-zA-Z]|[0-9]" # followed by allowed alphanum characters
r"|[-_$@.&+!*\(\),]" # or allowed symbols (keep hyphen first to match literal hyphen)
r"|[^\u0000-\u007F])+" # or allowed unicode bytes
r'[^\]\[<>"\'\s]+' # stop parsing at these symbols
r"))",
re.IGNORECASE | re.UNICODE,
)
def parens_are_matched(string: str, open_char='(', close_char=')'):
QUOTE_DELIMITERS = (
'"',
"'",
"`",
"",
"",
"",
"",
)
QUOTE_ENTITY_DELIMITERS = (
"&quot;",
"&#34;",
"&#x22;",
"&apos;",
"&#39;",
"&#x27;",
)
URL_ENTITY_REPLACEMENTS = (
("&amp;", "&"),
("&#38;", "&"),
("&#x26;", "&"),
)
FILESIZE_UNITS: dict[str, int] = {
"": 1,
"b": 1,
"byte": 1,
"bytes": 1,
"k": 1024,
"kb": 1024,
"kib": 1024,
"m": 1024**2,
"mb": 1024**2,
"mib": 1024**2,
"g": 1024**3,
"gb": 1024**3,
"gib": 1024**3,
"t": 1024**4,
"tb": 1024**4,
"tib": 1024**4,
}
def sanitize_extracted_url(url: str) -> str:
"""Trim quote garbage and dangling prose punctuation from an extracted URL candidate."""
cleaned = (url or "").strip()
if not cleaned:
return cleaned
lower_cleaned = cleaned.lower()
cut_index = len(cleaned)
for delimiter in QUOTE_DELIMITERS:
found_index = cleaned.find(delimiter)
if found_index != -1:
cut_index = min(cut_index, found_index)
for delimiter in QUOTE_ENTITY_DELIMITERS:
found_index = lower_cleaned.find(delimiter)
if found_index != -1:
cut_index = min(cut_index, found_index)
cleaned = cleaned[:cut_index].strip()
lower_cleaned = cleaned.lower()
for entity, replacement in URL_ENTITY_REPLACEMENTS:
while entity in lower_cleaned:
entity_index = lower_cleaned.find(entity)
cleaned = cleaned[:entity_index] + replacement + cleaned[entity_index + len(entity) :]
lower_cleaned = cleaned.lower()
cleaned = cleaned.rstrip(".,;:!?\\'\"")
cleaned = cleaned.rstrip('"')
return cleaned
def parens_are_matched(string: str, open_char="(", close_char=")"):
"""check that all parentheses in a string are balanced and nested properly"""
count = 0
for c in string:
@@ -97,6 +179,7 @@ def parens_are_matched(string: str, open_char='(', close_char=')'):
return False
return count == 0
def fix_url_from_markdown(url_str: str) -> str:
"""
cleanup a regex-parsed url that may contain dangling trailing parens from markdown link syntax
@@ -113,46 +196,91 @@ def fix_url_from_markdown(url_str: str) -> str:
# cut off one trailing character at a time
# until parens are balanced e.g. /a(b)c).x(y)z -> /a(b)c
while not parens_are_matched(trimmed_url):
while trimmed_url and not parens_are_matched(trimmed_url):
trimmed_url = trimmed_url[:-1]
# make sure trimmed url is still valid
if re.findall(URL_REGEX, trimmed_url):
if any(match == trimmed_url for match in re.findall(URL_REGEX, trimmed_url)):
return trimmed_url
return url_str
def split_comma_separated_urls(url: str):
offset = 0
while True:
http_index = url.find('http://', 1)
https_index = url.find('https://', 1)
http_index = url.find("http://", 1)
https_index = url.find("https://", 1)
next_indices = [idx for idx in (http_index, https_index) if idx != -1]
if not next_indices:
yield offset, url
return
next_index = min(next_indices)
if url[next_index - 1] != ',':
if url[next_index - 1] != ",":
yield offset, url
return
yield offset, url[:next_index - 1]
yield offset, url[: next_index - 1]
offset += next_index
url = url[next_index:]
def find_all_urls(urls_str: str):
skipped_starts = set()
for match in re.finditer(URL_REGEX, urls_str):
if match.start() in skipped_starts:
continue
for offset, url in split_comma_separated_urls(fix_url_from_markdown(match.group(1))):
cleaned_match = sanitize_extracted_url(fix_url_from_markdown(match.group(1)))
for offset, url in split_comma_separated_urls(cleaned_match):
if offset:
skipped_starts.add(match.start() + offset)
yield url
def parse_filesize_to_bytes(value: str | int | float | None) -> int:
"""
Parse a byte count from an integer or human-readable string like 45mb or 2 GB.
"""
if value is None:
return 0
if isinstance(value, bool):
raise ValueError("Size value must be an integer or size string.")
if isinstance(value, int):
return value
if isinstance(value, float):
if not value.is_integer():
raise ValueError("Size value must resolve to a whole number of bytes.")
return int(value)
raw_value = str(value).strip()
if not raw_value:
return 0
if raw_value.isdigit():
return int(raw_value)
match = re.fullmatch(r"(?i)(\d+(?:\.\d+)?)\s*([a-z]+)", raw_value)
if not match:
raise ValueError(f"Invalid size value: {value}")
amount_str, unit_str = match.groups()
multiplier = FILESIZE_UNITS.get(unit_str.lower())
if multiplier is None:
raise ValueError(f"Unknown size unit: {unit_str}")
try:
amount = Decimal(amount_str)
except InvalidOperation as err:
raise ValueError(f"Invalid size value: {value}") from err
return int(amount * multiplier)
def is_static_file(url: str):
# TODO: the proper way is with MIME type detection + ext, not only extension
return extension(url).lower() in CONSTANTS.STATICFILE_EXTENSIONS
@@ -178,14 +306,14 @@ def enforce_types(func):
if annotation is not None and annotation.__class__ is type:
if not isinstance(arg_val, annotation):
raise TypeError(
'{}(..., {}: {}) got unexpected {} argument {}={}'.format(
"{}(..., {}: {}) got unexpected {} argument {}={}".format(
func.__name__,
arg_key,
annotation.__name__,
type(arg_val).__name__,
arg_key,
str(arg_val)[:64],
)
),
)
# check args
@@ -201,12 +329,14 @@ def enforce_types(func):
return typechecked_function
def docstring(text: Optional[str]):
def docstring(text: str | None):
"""attach the given docstring to the decorated function"""
def decorator(func):
if text:
func.__doc__ = text
return func
return decorator
@@ -224,7 +354,7 @@ def str_between(string: str, start: str, end: str | None = None) -> str:
@enforce_types
def parse_date(date: Any) -> datetime | None:
"""Parse unix timestamps, iso format, and human-readable strings"""
if date is None:
return None
@@ -233,16 +363,16 @@ def parse_date(date: Any) -> datetime | None:
return date.replace(tzinfo=timezone.utc)
offset = date.utcoffset()
assert offset == datetime.now(timezone.utc).utcoffset(), 'Refusing to load a non-UTC date!'
assert offset == datetime.now(timezone.utc).utcoffset(), "Refusing to load a non-UTC date!"
return date
if isinstance(date, (float, int)):
date = str(date)
if isinstance(date, str):
normalized = date.strip()
if not normalized:
raise ValueError(f'Tried to parse invalid date string! {date}')
raise ValueError(f"Tried to parse invalid date string! {date}")
try:
return datetime.fromtimestamp(float(normalized), tz=timezone.utc)
@@ -250,7 +380,7 @@ def parse_date(date: Any) -> datetime | None:
pass
try:
iso_date = normalized.replace('Z', '+00:00')
iso_date = normalized.replace("Z", "+00:00")
parsed_date = datetime.fromisoformat(iso_date)
if parsed_date.tzinfo is None:
return parsed_date.replace(tzinfo=timezone.utc)
@@ -258,12 +388,12 @@ def parse_date(date: Any) -> datetime | None:
except ValueError:
pass
parsed_date = dateparser(normalized, settings={'TIMEZONE': 'UTC'})
parsed_date = dateparser(normalized, settings={"TIMEZONE": "UTC"})
if parsed_date is None:
raise ValueError(f'Tried to parse invalid date string! {date}')
raise ValueError(f"Tried to parse invalid date string! {date}")
return parsed_date.astimezone(timezone.utc)
raise ValueError('Tried to parse invalid date! {}'.format(date))
raise ValueError(f"Tried to parse invalid date! {date}")
@enforce_types
@@ -284,12 +414,12 @@ def download_url(url: str, timeout: int | None = None) -> str:
response = session.get(
url,
headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
headers={"User-Agent": ARCHIVING_CONFIG.USER_AGENT},
verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
timeout=timeout,
)
content_type = response.headers.get('Content-Type', '')
content_type = response.headers.get("Content-Type", "")
encoding = http_content_type_encoding(content_type) or html_body_declared_encoding(response.text)
if encoding is not None:
@@ -299,21 +429,22 @@ def download_url(url: str, timeout: int | None = None) -> str:
return response.text
except UnicodeDecodeError:
# if response is non-test (e.g. image or other binary files), just return the filename instead
return url.rsplit('/', 1)[-1]
return url.rsplit("/", 1)[-1]
@enforce_types
def get_headers(url: str, timeout: int | None=None) -> str:
def get_headers(url: str, timeout: int | None = None) -> str:
"""Download the contents of a remote url and return the headers"""
# TODO: get rid of this and use an abx pluggy hook instead
from archivebox.config.common import ARCHIVING_CONFIG
timeout = timeout or ARCHIVING_CONFIG.TIMEOUT
try:
response = requests.head(
url,
headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
headers={"User-Agent": ARCHIVING_CONFIG.USER_AGENT},
verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
timeout=timeout,
allow_redirects=True,
@@ -325,19 +456,19 @@ def get_headers(url: str, timeout: int | None=None) -> str:
except RequestException:
response = requests.get(
url,
headers={'User-Agent': ARCHIVING_CONFIG.USER_AGENT},
headers={"User-Agent": ARCHIVING_CONFIG.USER_AGENT},
verify=ARCHIVING_CONFIG.CHECK_SSL_VALIDITY,
timeout=timeout,
stream=True
stream=True,
)
return pyjson.dumps(
{
'URL': url,
'Status-Code': response.status_code,
'Elapsed': response.elapsed.total_seconds()*1000,
'Encoding': str(response.encoding),
'Apparent-Encoding': response.apparent_encoding,
"URL": url,
"Status-Code": response.status_code,
"Elapsed": response.elapsed.total_seconds() * 1000,
"Encoding": str(response.encoding),
"Apparent-Encoding": response.apparent_encoding,
**dict(response.headers),
},
indent=4,
@@ -352,17 +483,17 @@ def ansi_to_html(text: str) -> str:
"""
TEMPLATE = '<span style="color: rgb{}"><br>'
text = text.replace('[m', '</span>')
text = text.replace("[m", "</span>")
def single_sub(match):
argsdict = match.groupdict()
if argsdict['arg_3'] is None:
if argsdict['arg_2'] is None:
_, color = 0, argsdict['arg_1']
if argsdict["arg_3"] is None:
if argsdict["arg_2"] is None:
_, color = 0, argsdict["arg_1"]
else:
_, color = argsdict['arg_1'], argsdict['arg_2']
_, color = argsdict["arg_1"], argsdict["arg_2"]
else:
_, color = argsdict['arg_3'], argsdict['arg_2']
_, color = argsdict["arg_3"], argsdict["arg_2"]
return TEMPLATE.format(COLOR_DICT[color][0])
@@ -370,20 +501,19 @@ def ansi_to_html(text: str) -> str:
@enforce_types
def dedupe(options: list[str]) -> list[str]:
    """
    Deduplicates the given CLI args by key=value. Options that come later override earlier.
    """
    deduped = {}
    for option in options:
        # everything before the first '=' identifies the option
        key = option.split("=")[0]
        deduped[key] = option
    # dicts preserve insertion order, so first-seen key order is kept
    return list(deduped.values())
class ExtendedEncoder(pyjson.JSONEncoder):
"""
Extended json serializer that supports serializing several model
@@ -393,7 +523,7 @@ class ExtendedEncoder(pyjson.JSONEncoder):
def default(self, o):
cls_name = o.__class__.__name__
if hasattr(o, '_asdict'):
if hasattr(o, "_asdict"):
return o._asdict()
elif isinstance(o, bytes):
@@ -403,12 +533,12 @@ class ExtendedEncoder(pyjson.JSONEncoder):
return o.isoformat()
elif isinstance(o, Exception):
return '{}: {}'.format(o.__class__.__name__, o)
return f"{o.__class__.__name__}: {o}"
elif isinstance(o, Path):
return str(o)
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
elif cls_name in ("dict_items", "dict_keys", "dict_values"):
return list(o)
elif isinstance(o, Callable):
@@ -434,7 +564,7 @@ class ExtendedEncoder(pyjson.JSONEncoder):
@enforce_types
def to_json(obj: Any, indent: int | None = 4, sort_keys: bool = True) -> str:
    """Serialize object to JSON string with extended type support (via ExtendedEncoder)."""
    return pyjson.dumps(obj, indent=indent, sort_keys=sort_keys, cls=ExtendedEncoder)
@@ -447,97 +577,114 @@ def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True) -> str:
# the consequences of bad URL parsing could be disastrous and lead to many
# incorrect/badly parsed links being added to the archive, so this is worth the cost of checking
# Sanity checks run at import time: URL extraction must behave exactly as expected,
# since bad parsing would pollute the archive with mangled links.
assert fix_url_from_markdown("http://example.com/a(b)c).x(y)z") == "http://example.com/a(b)c"
assert (
    fix_url_from_markdown("https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def).link(with)_trailingtext")
    == "https://wikipedia.org/en/some_article_(Disambiguation).html?abc=def"
)

# (input string, exact list of URLs expected from find_all_urls)
URL_REGEX_TESTS = [
    ("https://example.com", ["https://example.com"]),
    ("https://sweeting.me,https://google.com", ["https://sweeting.me", "https://google.com"]),
    (
        "http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234",
        ["http://abc-file234example.com/abc?def=abc&23423=sdfsdf#abc=234&234=a234"],
    ),
    (
        "https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ abc",
        [
            "https://twitter.com/share?url=https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ",
            "https://akaao.success-corp.co.jp&text=ア@サ!ト&hashtags=ア%オ,元+ア.ア-オ_イ*シ$ロ",
        ],
    ),
    (
        '<a href="https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ"> abc',
        [
            "https://twitter.com/share#url=https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ",
            "https://akaao.success-corp.co.jp&text=ア@サ!ト?hashtags=ア%オ,元+ア&abc=.ア-オ_イ*シ$ロ",
        ],
    ),
    ("///a", []),
    ("http://", []),
    ("http://../", ["http://../"]),
    ("http://-error-.invalid/", ["http://-error-.invalid/"]),
    ("https://a(b)c+1#2?3&4/", ["https://a(b)c+1#2?3&4/"]),
    ("http://उदाहरण.परीक्षा", ["http://उदाहरण.परीक्षा"]),
    ("http://例子.测试", ["http://例子.测试"]),
    ("http://➡.ws/䨹 htps://abc.1243?234", ["http://➡.ws/䨹"]),
    ('http://⌘.ws">https://exa+mple.com//:abc ', ["http://⌘.ws", "https://exa+mple.com//:abc"]),
    ("http://مثال.إختبار/abc?def=ت&ب=abc#abc=234", ["http://مثال.إختبار/abc?def=ت&ب=abc#abc=234"]),
    ("http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c'om", ["http://-.~_!$&()*+,;=:%40:80%2f::::::@example.c"]),
    (
        "http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3",
        ["http://us:pa@ex.co:42/http://ex.co:19/a?_d=4#-a=2.3", "http://ex.co:19/a?_d=4#-a=2.3"],
    ),
    ("http://code.google.com/events/#&product=browser", ["http://code.google.com/events/#&product=browser"]),
    ("http://foo.bar?q=Spaces should be encoded", ["http://foo.bar?q=Spaces"]),
    ("http://foo.com/blah_(wikipedia)#c(i)t[e]-1", ["http://foo.com/blah_(wikipedia)#c(i)t"]),
    ("http://foo.com/(something)?after=parens", ["http://foo.com/(something)?after=parens"]),
    ("http://foo.com/unicode_(✪)_in_parens) abc", ["http://foo.com/unicode_(✪)_in_parens"]),
    ("http://foo.bar/?q=Test%20URL-encoded%20stuff", ["http://foo.bar/?q=Test%20URL-encoded%20stuff"]),
    ("[xyz](http://a.b/?q=(Test)%20U)RL-encoded%20stuff", ["http://a.b/?q=(Test)%20U"]),
    ("[xyz](http://a.b/?q=(Test)%20U)-ab https://abc+123", ["http://a.b/?q=(Test)%20U", "https://abc+123"]),
    ("[xyz](http://a.b/?q=(Test)%20U) https://a(b)c+12)3", ["http://a.b/?q=(Test)%20U", "https://a(b)c+12"]),
    ("[xyz](http://a.b/?q=(Test)a\nabchttps://a(b)c+12)3", ["http://a.b/?q=(Test)a", "https://a(b)c+12"]),
    ("http://foo.bar/?q=Test%20URL-encoded%20stuff", ["http://foo.bar/?q=Test%20URL-encoded%20stuff"]),
]

for urls_str, expected_url_matches in URL_REGEX_TESTS:
    url_matches = list(find_all_urls(urls_str))
    assert url_matches == expected_url_matches, "FAILED URL_REGEX CHECK!"
# More test cases
_test_url_strs = {
'example.com': 0,
'/example.com': 0,
'//example.com': 0,
':/example.com': 0,
'://example.com': 0,
'htt://example8.com': 0,
'/htt://example.com': 0,
'https://example': 1,
'https://localhost/2345': 1,
'https://localhost:1234/123': 1,
'://': 0,
'https://': 0,
'http://': 0,
'ftp://': 0,
'ftp://example.com': 0,
'https://example.com': 1,
'https://example.com/': 1,
'https://a.example.com': 1,
'https://a.example.com/': 1,
'https://a.example.com/what/is/happening.html': 1,
'https://a.example.com/what/ís/happening.html': 1,
'https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a': 1,
'https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a': 1,
'HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b': 1,
'https://example.com/?what=1#how-about-this=1&2%20baf': 1,
'https://example.com?what=1#how-about-this=1&2%20baf': 1,
'<test>http://example7.com</test>': 1,
'https://<test>': 0,
'https://[test]': 0,
"example.com": 0,
"/example.com": 0,
"//example.com": 0,
":/example.com": 0,
"://example.com": 0,
"htt://example8.com": 0,
"/htt://example.com": 0,
"https://example": 1,
"https://localhost/2345": 1,
"https://localhost:1234/123": 1,
"://": 0,
"https://": 0,
"http://": 0,
"ftp://": 0,
"ftp://example.com": 0,
"https://example.com": 1,
"https://example.com/": 1,
"https://a.example.com": 1,
"https://a.example.com/": 1,
"https://a.example.com/what/is/happening.html": 1,
"https://a.example.com/what/ís/happening.html": 1,
"https://a.example.com/what/is/happening.html?what=1&2%20b#höw-about-this=1a": 1,
"https://a.example.com/what/is/happéning/?what=1&2%20b#how-aboüt-this=1a": 1,
"HTtpS://a.example.com/what/is/happening/?what=1&2%20b#how-about-this=1af&2f%20b": 1,
"https://example.com/?what=1#how-about-this=1&2%20baf": 1,
"https://example.com?what=1#how-about-this=1&2%20baf": 1,
"<test>http://example7.com</test>": 1,
"https://<test>": 0,
"https://[test]": 0,
'http://"test"': 0,
'http://\'test\'': 0,
'[https://example8.com/what/is/this.php?what=1]': 1,
'[and http://example9.com?what=1&other=3#and-thing=2]': 1,
"http://'test'": 0,
"[https://example8.com/what/is/this.php?what=1]": 1,
"[and http://example9.com?what=1&other=3#and-thing=2]": 1,
'<what>https://example10.com#and-thing=2 "</about>': 1,
'abc<this["https://example11.com/what/is#and-thing=2?whoami=23&where=1"]that>def': 1,
'sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi': 1,
'<or>http://examplehttp://15.badc</that>': 2,
'https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://': 2,
'[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)': 3,
"sdflkf[what](https://example12.com/who/what.php?whoami=1#whatami=2)?am=hi": 1,
"<or>http://examplehttp://15.badc</that>": 2,
"https://a.example.com/one.html?url=http://example.com/inside/of/another?=http://": 2,
"[https://a.example.com/one.html?url=http://example.com/inside/of/another?=](http://a.example.com)": 3,
}
for url_str, num_urls in _test_url_strs.items():
assert len(list(find_all_urls(url_str))) == num_urls, (
f'{url_str} does not contain {num_urls} urls')
assert len(list(find_all_urls(url_str))) == num_urls, f"{url_str} does not contain {num_urls} urls"
### Chrome Helpers
def chrome_cleanup():
"""
Cleans up any state or runtime files that Chrome leaves behind when killed by
@@ -560,10 +707,11 @@ def chrome_cleanup():
# Also clean up the active persona's explicit CHROME_USER_DATA_DIR if set
# (in case it's a custom path not under PERSONAS_DIR)
from archivebox.config.configset import get_config
config = get_config()
chrome_user_data_dir = config.get('CHROME_USER_DATA_DIR')
chrome_user_data_dir = config.get("CHROME_USER_DATA_DIR")
if chrome_user_data_dir:
singleton_lock = Path(chrome_user_data_dir) / 'SingletonLock'
singleton_lock = Path(chrome_user_data_dir) / "SingletonLock"
if os.path.lexists(singleton_lock):
try:
singleton_lock.unlink()