rename vendor dir to pkgs

2026-04-03 06:17:53 +10:00 · 2024-10-28 20:05:20 -07:00
parent 7d75867650
commit dee4eb7992
168 changed files with 47 additions and 54 deletions
--- a/archivebox/pkgs/init.py
+++ b/archivebox/pkgs/init.py
@@ -0,0 +1,39 @@
+import sys
+import importlib
+from pathlib import Path
+
+PKGS_DIR = Path(__file__).parent
+
+VENDORED_PKGS = [
+    'abx',
+    # 'pydantic-pkgr',
+]
+
+# scan ./pkgs and add all dirs present to list of available VENDORED_PKGS
+for subdir in reversed(sorted(PKGS_DIR.iterdir())):
+    if subdir.is_dir() and subdir.name not in VENDORED_PKGS and not subdir.name.startswith('_'):
+        VENDORED_PKGS.append(subdir.name)
+
+
+def load_vendored_pkgs():
+    """Add archivebox/vendor to sys.path and import all vendored libraries present within"""
+    if str(PKGS_DIR) not in sys.path:
+        sys.path.append(str(PKGS_DIR))
+    
+    for pkg_name in VENDORED_PKGS:
+        pkg_dir = PKGS_DIR / pkg_name
+        assert pkg_dir.is_dir(), f'Required vendored pkg {pkg_name} could not be found in {pkg_dir}'
+
+        try:
+            lib = importlib.import_module(pkg_name)
+            # print(f"Successfully imported lib from environment {pkg_name}")
+        except ImportError:
+            sys.path.append(str(pkg_dir))
+            try:
+                lib = importlib.import_module(pkg_name)
+                # print(f"Successfully imported lib from vendored fallback {pkg_name}: {inspect.getfile(lib)}")
+            except ImportError as e:
+                print(f"Failed to import lib from environment or vendored fallback {pkg_name}: {e}", file=sys.stderr)
+                sys.exit(1)
+        
+
--- a/archivebox/pkgs/abx-plugin-archivedotorg/README.md
+++ b/archivebox/pkgs/abx-plugin-archivedotorg/README.md
--- a/archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/init.py
+++ b/archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/init.py
@@ -0,0 +1,21 @@
+__label__ = 'Archive.org'
+__homepage__ = 'https://archive.org'
+
+import abx
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import ARCHIVEDOTORG_CONFIG
+    
+    return {
+        'ARCHIVEDOTORG_CONFIG': ARCHIVEDOTORG_CONFIG
+    }
+
+
+# @abx.hookimpl
+# def get_EXTRACTORS():
+#     from .extractors import ARCHIVEDOTORG_EXTRACTOR
+#
+#     return {
+#         'archivedotorg': ARCHIVEDOTORG_EXTRACTOR,
+#     }
--- a/archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/archive_org.py
+++ b/archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/archive_org.py
@@ -0,0 +1,115 @@
+__package__ = 'archivebox.extractors'
+
+
+from pathlib import Path
+from typing import Optional, List, Dict, Tuple
+from collections import defaultdict
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from archivebox.misc.system import run, chmod_file
+from archivebox.misc.util import enforce_types, is_static_file, dedupe
+from archivebox.plugins_extractor.archivedotorg.config import ARCHIVEDOTORG_CONFIG
+from archivebox.plugins_extractor.curl.config import CURL_CONFIG
+from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
+
+from ..logging_util import TimedProgress
+
+
+def get_output_path():
+    return 'archive.org.txt'
+
+
+@enforce_types
+def should_save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    if is_static_file(link.url):
+        return False
+
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / get_output_path()).exists():
+        # if open(path, 'r', encoding='utf-8').read().strip() != 'None':
+        return False
+
+    return ARCHIVEDOTORG_CONFIG.SAVE_ARCHIVE_DOT_ORG
+
+@enforce_types
+def save_archive_dot_org(link: Link, out_dir: Optional[Path]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
+    """submit site to archive.org for archiving via their service, save returned archive url"""
+
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
+    out_dir = out_dir or Path(link.link_dir)
+    output: ArchiveOutput = get_output_path()
+    archive_org_url = None
+    submit_url = 'https://web.archive.org/save/{}'.format(link.url)
+    # later options take precedence
+    options = [
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
+        '--head',
+        '--max-time', str(timeout),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
+    ]
+    cmd = [
+        str(curl_binary.abspath),
+        *dedupe(options),
+        submit_url,
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
+        content_location, errors = parse_archive_dot_org_response(result.stdout)
+        if content_location:
+            archive_org_url = content_location[0]
+        elif len(errors) == 1 and 'RobotAccessControlException' in errors[0]:
+            archive_org_url = None
+            # raise ArchiveError('Archive.org denied by {}/robots.txt'.format(domain(link.url)))
+        elif errors:
+            raise ArchiveError(', '.join(errors))
+        else:
+            raise ArchiveError('Failed to find "content-location" URL header in Archive.org response.')
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    if output and not isinstance(output, Exception):
+        # instead of writing None when archive.org rejects the url write the
+        # url to resubmit it to archive.org. This is so when the user visits
+        # the URL in person, it will attempt to re-archive it, and it'll show the
+        # nicer error message explaining why the url was rejected if it fails.
+        archive_org_url = archive_org_url or submit_url
+        with open(str(out_dir / output), 'w', encoding='utf-8') as f:
+            f.write(archive_org_url)
+        chmod_file(str(out_dir / output), cwd=str(out_dir))
+        output = archive_org_url
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(out_dir),
+        cmd_version=str(curl_binary.version),
+        output=output,
+        status=status,
+        **timer.stats,
+    )
+
+@enforce_types
+def parse_archive_dot_org_response(response: str) -> Tuple[List[str], List[str]]:
+    # Parse archive.org response headers
+    headers: Dict[str, List[str]] = defaultdict(list)
+
+    # lowercase all the header names and store in dict
+    for header in response.splitlines():
+        if ':' not in header or not header.strip():
+            continue
+        name, val = header.split(':', 1)
+        headers[name.lower().strip()].append(val.strip())
+
+    # Get successful archive url in "content-location" header or any errors
+    content_location = headers.get('content-location', headers['location'])
+    errors = headers['x-archive-wayback-runtime-error']
+    return content_location, errors
+
--- a/archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/config.py
+++ b/archivebox/pkgs/abx-plugin-archivedotorg/abx_plugin_archivedotorg/config.py
@@ -0,0 +1,8 @@
+from abx_spec_config.base_configset import BaseConfigSet
+
+
+class ArchivedotorgConfig(BaseConfigSet):   # config set holding the Archive.org plugin's options
+    SAVE_ARCHIVE_DOT_ORG: bool = True   # toggle for submitting links to archive.org; enabled by default
+
+
+ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()   # module-level instance returned by the get_CONFIG hook
--- a/archivebox/pkgs/abx-plugin-archivedotorg/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-archivedotorg/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "abx-plugin-archivedotorg"
+version = "2024.10.28"
+description = "ABX plugin that submits URLs to Archive.org for archiving"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-plugin-curl>=2024.10.24",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_archivedotorg = "abx_plugin_archivedotorg"
--- a/archivebox/pkgs/abx-plugin-chrome/README.md
+++ b/archivebox/pkgs/abx-plugin-chrome/README.md
--- a/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/init.py
+++ b/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/init.py
@@ -0,0 +1,34 @@
+__label__ = 'Chrome'
+__author__ = 'ArchiveBox'
+
+import abx
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import CHROME_CONFIG
+    
+    return {
+        'CHROME_CONFIG': CHROME_CONFIG
+    }
+
+@abx.hookimpl
+def get_BINARIES():
+    from .binaries import CHROME_BINARY
+    
+    return {
+        'chrome': CHROME_BINARY,
+    }
+
+@abx.hookimpl
+def ready():
+    from .config import CHROME_CONFIG
+    CHROME_CONFIG.validate()
+
+
+# @abx.hookimpl
+# def get_EXTRACTORS():
+#     return {
+#         'pdf': PDF_EXTRACTOR,
+#         'screenshot': SCREENSHOT_EXTRACTOR,
+#         'dom': DOM_EXTRACTOR,
+#     }
--- a/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/binaries.py
+++ b/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/binaries.py
@@ -0,0 +1,149 @@
+import os
+import platform
+from pathlib import Path
+from typing import List, Optional
+
+from pydantic import InstanceOf
+from pydantic_pkgr import (
+    Binary,
+    BinProvider,
+    BinName,
+    BinaryOverrides,
+    bin_abspath,
+)
+
+import abx
+
+from abx_plugin_default_binproviders import apt, brew, env
+from abx_plugin_puppeteer.binproviders import PUPPETEER_BINPROVIDER
+from abx_plugin_playwright.binproviders import PLAYWRIGHT_BINPROVIDER
+
+
+from .config import CHROME_CONFIG
+
+CHROMIUM_BINARY_NAMES_LINUX = [   # candidate Chromium executable names tried by autodetect_system_chrome_install()
+    "chromium",
+    "chromium-browser",
+    "chromium-browser-beta",
+    "chromium-browser-unstable",
+    "chromium-browser-canary",
+    "chromium-browser-dev",
+]
+CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]   # absolute path inside the macOS .app bundle
+CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS
+
+CHROME_BINARY_NAMES_LINUX = [   # candidate Google Chrome executable names (tried before the Chromium ones)
+    "google-chrome",
+    "google-chrome-stable",
+    "google-chrome-beta",
+    "google-chrome-canary",
+    "google-chrome-unstable",
+    "google-chrome-dev",
+    "chrome"
+]
+CHROME_BINARY_NAMES_MACOS = [   # absolute paths inside the macOS .app bundles
+    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
+    "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
+]
+CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
+
+CHROME_APT_DEPENDENCIES = [   # package list handed to the apt provider override in ChromeBinary
+    'apt-transport-https', 'at-spi2-common',
+    'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
+    'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
+    'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
+    'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
+    'chromium-browser',
+]
+
+
+def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
+    for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
+        abspath = bin_abspath(bin_name, PATH=env.PATH)
+        if abspath:
+            return abspath
+    return None
+
+def create_macos_app_symlink(target: Path, shortcut: Path):
+    """
+    on macOS, some binaries are inside of .app, so we need to
+    create a tiny bash script instead of a symlink
+    (so that ../ parent relationships are relative to original .app instead of callsite dir)
+    """
+    # TODO: should we enforce this? is it useful in any other situation?
+    # if platform.system().lower() != 'darwin':
+    #     raise Exception(...)
+    shortcut.unlink(missing_ok=True)
+    shortcut.write_text(f"""#!/usr/bin/env bash\nexec '{target}' "$@"\n""")
+    shortcut.chmod(0o777)   # make sure its executable by everyone
+
+###################### Config ##########################
+
+
+class ChromeBinary(Binary):
+    name: BinName = CHROME_CONFIG.CHROME_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]
+    
+    overrides: BinaryOverrides = {
+        env.name: {
+            'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH),  # /usr/bin/google-chrome-stable
+        },
+        PUPPETEER_BINPROVIDER.name: {
+            'packages': ['chrome@stable'],              # npx @puppeteer/browsers install chrome@stable
+        },
+        PLAYWRIGHT_BINPROVIDER.name: {
+            'packages': ['chromium'],                   # playwright install chromium
+        },
+        apt.name: {
+            'packages': CHROME_APT_DEPENDENCIES,
+        },
+        brew.name: {
+            'packages': ['--cask', 'chromium'] if platform.system().lower() == 'darwin' else [],
+        },
+    }
+
+    @staticmethod
+    def symlink_to_lib(binary, bin_dir=None) -> None:
+        bin_dir = bin_dir or abx.pm.hook.get_BIN_DIR()
+        
+        if not (binary.abspath and os.path.isfile(binary.abspath)):
+            return
+        
+        bin_dir.mkdir(parents=True, exist_ok=True)
+        symlink = bin_dir / binary.name
+        
+        try:
+            if platform.system().lower() == 'darwin':
+                # if on macOS, browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
+                create_macos_app_symlink(binary.abspath, symlink)
+            else:
+                # otherwise on linux we can symlink directly to binary executable
+                symlink.unlink(missing_ok=True)
+                symlink.symlink_to(binary.abspath)
+        except Exception:
+            # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
+            # not actually needed, we can just run without it
+            pass
+
+    @staticmethod            
+    def chrome_cleanup_lockfile():
+        """
+        Cleans up any state or runtime files that chrome leaves behind when killed by
+        a timeout or other error
+        """
+        try:
+            linux_lock_file = Path("~/.config/chromium/SingletonLock").expanduser()
+            linux_lock_file.unlink(missing_ok=True)
+        except Exception:
+            pass
+        
+        if CHROME_CONFIG.CHROME_USER_DATA_DIR:
+            try:
+                (CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock').unlink(missing_ok=True)
+            except Exception:
+                pass
+
+
+
+CHROME_BINARY = ChromeBinary()
+
--- a/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/config.py
+++ b/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/config.py
@@ -0,0 +1,201 @@
+import os
+from pathlib import Path
+from typing import List, Optional
+
+from pydantic import Field
+from pydantic_pkgr import bin_abspath
+
+from abx_spec_config.base_configset import BaseConfigSet
+from abx_plugin_default_binproviders import env
+
+from archivebox.config import CONSTANTS
+from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
+from archivebox.misc.logging import STDERR
+from archivebox.misc.util import dedupe
+from archivebox.logging_util import pretty_path
+
+
+CHROMIUM_BINARY_NAMES_LINUX = [   # candidate Chromium executable names tried by autodetect_system_chrome_install()
+    "chromium",
+    "chromium-browser",
+    "chromium-browser-beta",
+    "chromium-browser-unstable",
+    "chromium-browser-canary",
+    "chromium-browser-dev",
+]
+CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]   # absolute path inside the macOS .app bundle
+CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS
+
+CHROME_BINARY_NAMES_LINUX = [   # candidate Google Chrome executable names (tried before the Chromium ones)
+    "google-chrome",
+    "google-chrome-stable",
+    "google-chrome-beta",
+    "google-chrome-canary",
+    "google-chrome-unstable",
+    "google-chrome-dev",
+    "chrome"
+]
+CHROME_BINARY_NAMES_MACOS = [   # absolute paths inside the macOS .app bundles
+    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
+    "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
+]
+CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
+
+APT_DEPENDENCIES = [   # NOTE(review): not referenced in the visible part of this module - confirm it is still needed
+    'apt-transport-https', 'at-spi2-common', 'chromium-browser',
+    'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
+    'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
+    'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
+    'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
+]
+
+
+def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
+    for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
+        abspath = bin_abspath(bin_name, PATH=env.PATH)
+        if abspath:
+            return abspath
+    return None
+
+def create_macos_app_symlink(target: Path, shortcut: Path):
+    """
+    on macOS, some binaries are inside of .app, so we need to
+    create a tiny bash script instead of a symlink
+    (so that ../ parent relationships are relative to original .app instead of callsite dir)
+    """
+    # TODO: should we enforce this? is it useful in any other situation?
+    # if platform.system().lower() != 'darwin':
+    #     raise Exception(...)
+    shortcut.unlink(missing_ok=True)
+    shortcut.write_text(f"""#!/usr/bin/env bash\nexec '{target}' "$@"\n""")
+    shortcut.chmod(0o777)   # make sure its executable by everyone
+
+###################### Config ##########################
+
+
+class ChromeConfig(BaseConfigSet):
+    USE_CHROME: bool                        = Field(default=True)
+
+    # Chrome Binary
+    CHROME_BINARY: str                      = Field(default='chrome')
+    CHROME_DEFAULT_ARGS: List[str]          = Field(default=[
+        '--virtual-time-budget=15000',
+        '--disable-features=DarkMode',
+        "--run-all-compositor-stages-before-draw",
+        "--hide-scrollbars",
+        "--autoplay-policy=no-user-gesture-required",
+        "--no-first-run",
+        "--use-fake-ui-for-media-stream",
+        "--use-fake-device-for-media-stream",
+        "--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",
+    ])
+    CHROME_EXTRA_ARGS: List[str]           = Field(default=[])
+    
+    # Chrome Options Tuning
+    CHROME_TIMEOUT: int                     = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT - 10)
+    CHROME_HEADLESS: bool                   = Field(default=True)
+    CHROME_SANDBOX: bool                    = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
+    CHROME_RESOLUTION: str                  = Field(default=lambda: ARCHIVING_CONFIG.RESOLUTION)
+    CHROME_CHECK_SSL_VALIDITY: bool         = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    
+    # Cookies & Auth
+    CHROME_USER_AGENT: str                  = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    CHROME_USER_DATA_DIR: Path | None       = Field(default=CONSTANTS.PERSONAS_DIR / 'Default' / 'chrome_profile')
+    CHROME_PROFILE_NAME: str                = Field(default='Default')
+
+    # Extractor Toggles
+    SAVE_SCREENSHOT: bool                   = Field(default=True, alias='FETCH_SCREENSHOT')
+    SAVE_DOM: bool                          = Field(default=True, alias='FETCH_DOM')
+    SAVE_PDF: bool                          = Field(default=True, alias='FETCH_PDF')
+
+    def validate(self):
+        from archivebox.config.paths import create_and_chown_dir
+
+        if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
+            STDERR.print()
+            STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
+            STDERR.print('    Chrome will fail to archive all sites if set to less than ~15 seconds.')
+            STDERR.print('    (Setting it to somewhere between 30 and 300 seconds is recommended)')
+            STDERR.print()
+            STDERR.print('    If you want to make ArchiveBox run faster, disable specific archive methods instead:')
+            STDERR.print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
+            STDERR.print()
+
+        # if user has specified a user data dir, make sure its valid
+        if self.USE_CHROME and self.CHROME_USER_DATA_DIR:
+            try:
+                create_and_chown_dir(self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME)
+            except Exception:
+                pass
+            
+            # check to make sure user_data_dir/<profile_name> exists
+            if not os.path.isdir(self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME):
+                STDERR.print()
+                STDERR.print()
+                STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
+                STDERR.print(f'    {self.CHROME_USER_DATA_DIR}')
+                STDERR.print('    Make sure you set it to a Chrome user data directory containing a Default profile folder.')
+                STDERR.print('    For more info see:')
+                STDERR.print('        https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')
+                
+                # show special hint if they made the common mistake of putting /Default at the end of the path
+                if str(self.CHROME_USER_DATA_DIR).replace(str(CONSTANTS.PERSONAS_DIR / 'Default'), '').endswith('/Default'):
+                    STDERR.print()
+                    STDERR.print('    Try removing /Default from the end e.g.:')
+                    STDERR.print('        CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).rsplit('/Default', 1)[0]))
+                
+                self.update_in_place(CHROME_USER_DATA_DIR=None)
+            
+
+    def chrome_args(self, **options) -> List[str]:
+        """helper to build up a chrome shell command with arguments"""
+    
+        # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/
+    
+        options = self.model_copy(update=options)
+    
+        cmd_args = [*options.CHROME_DEFAULT_ARGS, *options.CHROME_EXTRA_ARGS]
+    
+        if options.CHROME_HEADLESS:
+            cmd_args += ["--headless=new"]   # expects chrome version >= 111
+    
+        if not options.CHROME_SANDBOX:
+            # assume this means we are running inside a docker container
+            # in docker, GPU support is limited, sandboxing is unecessary,
+            # and SHM is limited to 64MB by default (which is too low to be usable).
+            cmd_args += (
+                "--no-sandbox",
+                "--no-zygote",
+                "--disable-dev-shm-usage",
+                "--disable-software-rasterizer",
+                "--disable-sync",
+                # "--password-store=basic",
+            )
+
+    
+        # set window size for screenshot/pdf/etc. rendering
+        cmd_args += ('--window-size={}'.format(options.CHROME_RESOLUTION),)
+    
+        if not options.CHROME_CHECK_SSL_VALIDITY:
+            cmd_args += ('--disable-web-security', '--ignore-certificate-errors')
+    
+        if options.CHROME_USER_AGENT:
+            cmd_args += ('--user-agent={}'.format(options.CHROME_USER_AGENT),)
+    
+        # this no longer works on newer chrome version for some reason, just causes chrome to hang indefinitely:
+        # if options.CHROME_TIMEOUT:
+        #   cmd_args += ('--timeout={}'.format(options.CHROME_TIMEOUT * 1000),)
+    
+        if options.CHROME_USER_DATA_DIR:
+            cmd_args.append('--user-data-dir={}'.format(options.CHROME_USER_DATA_DIR))
+            cmd_args.append('--profile-directory={}'.format(options.CHROME_PROFILE_NAME or 'Default'))
+        
+            if not os.path.isfile(options.CHROME_USER_DATA_DIR / options.CHROME_PROFILE_NAME / 'Preferences'):
+                STDERR.print(f'[green]        + creating new Chrome profile in: {pretty_path(options.CHROME_USER_DATA_DIR / options.CHROME_PROFILE_NAME)}[/green]')
+                cmd_args.remove('--no-first-run')
+                cmd_args.append('--first-run')
+    
+        return dedupe(cmd_args)
+
+CHROME_CONFIG = ChromeConfig()
+
--- a/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/dom.py
+++ b/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/dom.py
@@ -0,0 +1,76 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+from typing import Optional
+
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from archivebox.misc.system import run, chmod_file, atomic_write
+from archivebox.misc.util import (
+    enforce_types,
+    is_static_file,
+)
+from ..logging_util import TimedProgress
+
+from plugins_extractor.chrome.config import CHROME_CONFIG
+from plugins_extractor.chrome.binaries import CHROME_BINARY
+
+
+def get_output_path():
+    return 'output.html'
+
+
+@enforce_types
+def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    
+    if is_static_file(link.url):
+        return False
+
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / get_output_path()).exists():
+        if (out_dir / get_output_path()).stat().st_size > 1:
+            return False
+
+    return CHROME_CONFIG.SAVE_DOM
+
+@enforce_types
+def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
+    """print HTML of site to file using chrome --dump-html"""
+
+    CHROME_BIN = CHROME_BINARY.load()
+    assert CHROME_BIN.abspath and CHROME_BIN.version
+
+    out_dir = out_dir or Path(link.link_dir)
+    output: ArchiveOutput = get_output_path()
+    output_path = out_dir / output
+    cmd = [
+        str(CHROME_BIN.abspath),
+        *CHROME_CONFIG.chrome_args(),
+        '--dump-dom',
+        link.url
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
+        atomic_write(output_path, result.stdout)
+
+        if result.returncode:
+            hints = result.stderr
+            raise ArchiveError('Failed to save DOM', hints)
+
+        chmod_file(output, cwd=str(out_dir))
+    except Exception as err:
+        status = 'failed'
+        output = err
+        CHROME_BINARY.chrome_cleanup_lockfile()
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(out_dir),
+        cmd_version=str(CHROME_BIN.version),
+        output=output,
+        status=status,
+        **timer.stats,
+    )
--- a/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/pdf.py
+++ b/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/pdf.py
@@ -0,0 +1,75 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+from typing import Optional
+
+from archivebox.misc.system import run, chmod_file
+from archivebox.misc.util import (
+    enforce_types,
+    is_static_file,
+)
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from ..logging_util import TimedProgress
+
+from plugins_extractor.chrome.config import CHROME_CONFIG
+from plugins_extractor.chrome.binaries import CHROME_BINARY
+
+
+def get_output_path():
+    return 'output.pdf'
+
+
+@enforce_types
+def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    
+    if is_static_file(link.url):
+        return False
+
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / get_output_path()).exists():
+        return False
+
+    return CHROME_CONFIG.SAVE_PDF
+
+
+@enforce_types
+def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
+    """print PDF of site to file using chrome --headless"""
+
+    CHROME_BIN = CHROME_BINARY.load()
+    assert CHROME_BIN.abspath and CHROME_BIN.version
+
+    out_dir = out_dir or Path(link.link_dir)
+    output: ArchiveOutput = get_output_path()
+    cmd = [
+        str(CHROME_BIN.abspath),
+        *CHROME_CONFIG.chrome_args(),
+        '--print-to-pdf',
+        link.url,
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
+
+        if result.returncode:
+            hints = (result.stderr or result.stdout)
+            raise ArchiveError('Failed to save PDF', hints)
+        
+        chmod_file(get_output_path(), cwd=str(out_dir))
+    except Exception as err:
+        status = 'failed'
+        output = err
+        CHROME_BINARY.chrome_cleanup_lockfile()
+    finally:
+        timer.end()
+
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(out_dir),
+        cmd_version=str(CHROME_BINARY.version),
+        output=output,
+        status=status,
+        **timer.stats,
+    )
--- a/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/screenshot.py
+++ b/archivebox/pkgs/abx-plugin-chrome/abx_plugin_chrome/screenshot.py
@@ -0,0 +1,70 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+from typing import Optional
+
+from archivebox.misc.system import run, chmod_file
+from archivebox.misc.util import enforce_types, is_static_file
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+from ..logging_util import TimedProgress
+
+from plugins_extractor.chrome.config import CHROME_CONFIG
+from plugins_extractor.chrome.binaries import CHROME_BINARY
+
+
+def get_output_path():
+    return 'screenshot.png'
+
+
+@enforce_types
+def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    
+    if is_static_file(link.url):
+        return False
+
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / get_output_path()).exists():
+        return False
+
+    return CHROME_CONFIG.SAVE_SCREENSHOT
+
+@enforce_types
+def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
+    """take screenshot of site using chrome --headless"""
+    
+    CHROME_BIN = CHROME_BINARY.load()
+    assert CHROME_BIN.abspath and CHROME_BIN.version
+
+    out_dir = out_dir or Path(link.link_dir)
+    output: ArchiveOutput = get_output_path()
+    cmd = [
+        str(CHROME_BIN.abspath),
+        *CHROME_CONFIG.chrome_args(),
+        '--screenshot',
+        link.url,
+    ]
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)
+
+        if result.returncode:
+            hints = (result.stderr or result.stdout)
+            raise ArchiveError('Failed to save screenshot', hints)
+
+        chmod_file(output, cwd=str(out_dir))
+    except Exception as err:
+        status = 'failed'
+        output = err
+        CHROME_BINARY.chrome_cleanup_lockfile()
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(out_dir),
+        cmd_version=str(CHROME_BIN.version),
+        output=output,
+        status=status,
+        **timer.stats,
+    )
--- a/archivebox/pkgs/abx-plugin-chrome/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-chrome/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "abx-plugin-chrome"
+version = "2024.10.28"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-spec-pydantic-pkgr>=0.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_chrome = "abx_plugin_chrome"
--- a/archivebox/pkgs/abx-plugin-curl/README.md
+++ b/archivebox/pkgs/abx-plugin-curl/README.md
--- a/archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/init.py
+++ b/archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/init.py
@@ -0,0 +1,18 @@
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    """abx hook: expose this plugin's CURL_CONFIG under the key 'curl'."""
+    # imported inside the hook rather than at module import time
+    from .config import CURL_CONFIG
+
+    # NOTE(review): the favicon/git/ldap plugins key their config as '<NAME>_CONFIG';
+    # this one uses 'curl' - confirm that is intended before relying on either form
+    configs = {'curl': CURL_CONFIG}
+    return configs
+
+@abx.hookimpl
+def get_BINARIES():
+    """abx hook: expose this plugin's CURL_BINARY under the key 'curl'."""
+    # imported inside the hook rather than at module import time
+    from .binaries import CURL_BINARY
+
+    binaries = {'curl': CURL_BINARY}
+    return binaries
--- a/archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/binaries.py
+++ b/archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/binaries.py
@@ -0,0 +1,18 @@
+__package__ = 'abx_plugin_curl'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinProvider, BinName, Binary
+
+from abx_plugin_default_binproviders import apt, brew, env
+
+
+from .config import CURL_CONFIG
+
+
+class CurlBinary(Binary):
+    """Binary definition for curl."""
+    # the executable name is read from CURL_CONFIG.CURL_BINARY when this class body is evaluated
+    name: BinName = CURL_CONFIG.CURL_BINARY
+    # providers this binary may come from: apt, brew, or the existing environment
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+# module-level instance, returned by this plugin's get_BINARIES hook
+CURL_BINARY = CurlBinary()
--- a/archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/config.py
+++ b/archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/config.py
@@ -0,0 +1,33 @@
+__package__ = 'abx_plugin_curl'
+
+from typing import List, Optional
+from pathlib import Path
+
+from pydantic import Field
+
+from abx_spec_config.base_configset import BaseConfigSet
+
+from archivebox.config.common import ARCHIVING_CONFIG
+
+
+class CurlConfig(BaseConfigSet):
+    """Config options for the curl binary and the extractors that shell out to it."""
+    
+    SAVE_TITLE: bool = Field(default=True)
+    SAVE_HEADERS: bool = Field(default=True)
+    USE_CURL: bool = Field(default=True)
+    
+    CURL_BINARY: str = Field(default='curl')
+    CURL_ARGS: List[str] = [
+        '--silent',
+        '--location',
+        '--compressed',
+    ]
+    CURL_EXTRA_ARGS: List[str] = []
+    
+    # These options fall back to the global ARCHIVING_CONFIG values.
+    # default_factory (not default=) is required here: Field(default=<callable>) would
+    # store the lambda object itself as the default instead of the value it returns.
+    CURL_TIMEOUT: int = Field(default_factory=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    CURL_CHECK_SSL_VALIDITY: bool = Field(default_factory=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    CURL_USER_AGENT: str = Field(default_factory=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    CURL_COOKIES_FILE: Optional[Path] = Field(default_factory=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
+    
+
+# module-level instance, imported by the binaries module and the curl-based extractors
+CURL_CONFIG = CurlConfig()
--- a/archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/headers.py
+++ b/archivebox/pkgs/abx-plugin-curl/abx_plugin_curl/headers.py
@@ -0,0 +1,76 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+
+from typing import Optional
+
+from archivebox.misc.system import atomic_write
+from archivebox.misc.util import (
+    enforce_types,
+    get_headers,
+    dedupe,
+)
+from archivebox.plugins_extractor.curl.config import CURL_CONFIG
+from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..logging_util import TimedProgress
+
+def get_output_path():
+    """Return the headers output filename (callers join it onto the snapshot's out_dir)."""
+    filename = 'headers.json'
+    return filename
+
+
+@enforce_types
+def should_save_headers(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
+    """Decide whether the headers extractor should run for this link."""
+    target_dir = Path(out_dir or link.link_dir)
+    assert target_dir
+
+    # an existing headers file is only replaced when overwrite is requested
+    if not overwrite:
+        if (target_dir / get_output_path()).exists():
+            return False
+
+    # otherwise defer to the user's config toggle
+    return CURL_CONFIG.SAVE_HEADERS
+
+
+@enforce_types
+def save_headers(link: Link, out_dir: Optional[str]=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
+    """Fetch the link's response headers and write them to headers.json in out_dir.
+
+    The headers themselves come from get_headers(); the curl command line built
+    below is what gets recorded on the returned ArchiveResult. Failures are not
+    raised: the exception is stored as the result's output with status 'failed'.
+    """
+
+    curl = CURL_BINARY.load()
+    assert curl.abspath and curl.version
+
+    link_dir = Path(out_dir or link.link_dir)
+    abs_link_dir = link_dir.absolute()
+    output: ArchiveOutput = get_output_path()
+
+    status = 'succeeded'
+    timer = TimedProgress(timeout + 1, prefix='      ')
+
+    user_agent = CURL_CONFIG.CURL_USER_AGENT
+    user_agent_args = ['--user-agent', f'{user_agent}'] if user_agent else []
+    insecure_args = [] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']
+
+    # later options take precedence
+    curl_options = [
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
+        '--head',
+        '--max-time', str(timeout),
+        *user_agent_args,
+        *insecure_args,
+    ]
+    cmd = [str(curl.abspath), *dedupe(curl_options), link.url]
+
+    try:
+        headers_json = get_headers(link.url, timeout=timeout)
+        abs_link_dir.mkdir(exist_ok=True)
+        atomic_write(str(abs_link_dir / get_output_path()), headers_json)
+    except Exception as err:
+        # OSError is a subclass of Exception, so this covers the same errors as before
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    result_fields = dict(
+        cmd=cmd,
+        pwd=str(link_dir),
+        cmd_version=str(curl.version),
+        output=output,
+        status=status,
+    )
+    return ArchiveResult(**result_fields, **timer.stats)
--- a/archivebox/pkgs/abx-plugin-curl/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-curl/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "abx-plugin-curl"
+version = "2024.10.24"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-spec-pydantic-pkgr>=0.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_curl = "abx_plugin_curl"
--- a/archivebox/pkgs/abx-plugin-default-binproviders/README.md
+++ b/archivebox/pkgs/abx-plugin-default-binproviders/README.md
--- a/archivebox/pkgs/abx-plugin-default-binproviders/abx_plugin_default_binproviders.py
+++ b/archivebox/pkgs/abx-plugin-default-binproviders/abx_plugin_default_binproviders.py
@@ -0,0 +1,23 @@
+
+import abx
+
+from typing import Dict
+
+from pydantic_pkgr import (
+    AptProvider,
+    BrewProvider,
+    EnvProvider,
+    BinProvider,
+)
+# One shared instance per provider, each bound to two names: a short alias (apt/brew/env)
+# that plugin binaries modules import, and the *_BINPROVIDER constant returned by the hook below.
+apt = APT_BINPROVIDER = AptProvider()
+brew = BREW_BINPROVIDER = BrewProvider()
+env = ENV_BINPROVIDER = EnvProvider()
+
+
+@abx.hookimpl(tryfirst=True)
+def get_BINPROVIDERS() -> Dict[str, BinProvider]:
+    """abx hook: register the shared apt, brew and env provider instances by name."""
+    providers: Dict[str, BinProvider] = {}
+    providers['apt'] = APT_BINPROVIDER
+    providers['brew'] = BREW_BINPROVIDER
+    providers['env'] = ENV_BINPROVIDER
+    return providers
--- a/archivebox/pkgs/abx-plugin-default-binproviders/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-default-binproviders/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "abx-plugin-default-binproviders"
+version = "2024.10.24"
+description = "Default BinProviders for ABX (apt, brew, env)"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "pydantic-pkgr>=0.5.4",
+    "abx-spec-pydantic-pkgr>=0.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_default_binproviders = "abx_plugin_default_binproviders"
--- a/archivebox/pkgs/abx-plugin-favicon/README.md
+++ b/archivebox/pkgs/abx-plugin-favicon/README.md
--- a/archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/init.py
+++ b/archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/init.py
@@ -0,0 +1,29 @@
+# Plugin metadata for the favicon extractor plugin
+__label__ = 'Favicon'
+__version__ = '2024.10.24'
+__author__ = 'ArchiveBox'
+__homepage__ = 'https://github.com/ArchiveBox/archivebox'
+# the curl plugin is published as 'abx-plugin-curl' (see its pyproject.toml),
+# not 'abx-plugin-curl-extractor'
+__dependencies__ = [
+    'abx>=0.1.0',
+    'abx-spec-config>=0.1.0',
+    'abx-plugin-curl>=2024.10.24',
+]
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    """abx hook: expose this plugin's FAVICON_CONFIG under the key 'FAVICON_CONFIG'."""
+    # imported inside the hook rather than at module import time
+    from .config import FAVICON_CONFIG
+
+    configs = {'FAVICON_CONFIG': FAVICON_CONFIG}
+    return configs
+
+
+# @abx.hookimpl
+# def get_EXTRACTORS():
+#     from .extractors import FAVICON_EXTRACTOR
+    
+#     return {
+#         'favicon': FAVICON_EXTRACTOR,
+#     }
--- a/archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/config.py
+++ b/archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/config.py
@@ -0,0 +1,10 @@
+from abx_spec_config.base_configset import BaseConfigSet
+
+
+class FaviconConfig(BaseConfigSet):
+    """Config options for the favicon extractor."""
+    # on/off toggle for the favicon extractor
+    SAVE_FAVICON: bool = True
+    
+    # URL template for the favicon service; '{}' is a str.format placeholder
+    FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'
+
+
+# module-level instance, returned by this plugin's get_CONFIG hook
+FAVICON_CONFIG = FaviconConfig()
--- a/archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/favicon.py
+++ b/archivebox/pkgs/abx-plugin-favicon/abx_plugin_favicon/favicon.py
@@ -0,0 +1,71 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+
+from archivebox.misc.system import chmod_file, run
+from archivebox.misc.util import enforce_types, domain, dedupe
+from archivebox.plugins_extractor.favicon.config import FAVICON_CONFIG
+from archivebox.plugins_extractor.curl.config import CURL_CONFIG
+from archivebox.plugins_extractor.curl.binaries import CURL_BINARY
+from ..index.schema import Link, ArchiveResult, ArchiveOutput
+from ..logging_util import TimedProgress
+
+
+@enforce_types
+def should_save_favicon(link: Link, out_dir: str | Path | None=None, overwrite: bool=False) -> bool:
+    """Decide whether the favicon extractor should run for this link.
+
+    Returns False when the output file already exists and overwrite is not set;
+    otherwise returns the FAVICON_CONFIG.SAVE_FAVICON toggle.
+    """
+    assert link.link_dir
+    out_dir = Path(out_dir or link.link_dir)
+    # use get_output_path() rather than repeating the filename, so the existence
+    # check always looks at the same file the extractor writes
+    if not overwrite and (out_dir / get_output_path()).exists():
+        return False
+
+    return FAVICON_CONFIG.SAVE_FAVICON
+
+@enforce_types
+def get_output_path():
+    """Return the favicon output filename (relative to the snapshot's out_dir)."""
+    filename = 'favicon.ico'
+    return filename
+
+
+@enforce_types
+def save_favicon(link: Link, out_dir: str | Path | None=None, timeout: int=CURL_CONFIG.CURL_TIMEOUT) -> ArchiveResult:
+    """download site favicon from google's favicon api
+
+    Runs curl inside out_dir (default: link.link_dir, which must already exist)
+    against FAVICON_CONFIG.FAVICON_PROVIDER formatted with the link's domain.
+    Failures are not raised: the exception is stored as the result's output and
+    the status stays 'failed'.
+    """
+
+    curl_binary = CURL_BINARY.load()
+    assert curl_binary.abspath and curl_binary.version
+
+    out_dir = Path(out_dir or link.link_dir)
+    assert out_dir.exists()
+
+    # single source of truth for the filename, shared with should_save_favicon's check
+    output: ArchiveOutput = get_output_path()
+    # later options take precedence
+    options = [
+        *CURL_CONFIG.CURL_ARGS,
+        *CURL_CONFIG.CURL_EXTRA_ARGS,
+        '--max-time', str(timeout),
+        '--output', str(output),
+        *(['--user-agent', '{}'.format(CURL_CONFIG.CURL_USER_AGENT)] if CURL_CONFIG.CURL_USER_AGENT else []),
+        *([] if CURL_CONFIG.CURL_CHECK_SSL_VALIDITY else ['--insecure']),
+    ]
+    cmd = [
+        str(curl_binary.abspath),
+        *dedupe(options),
+        FAVICON_CONFIG.FAVICON_PROVIDER.format(domain(link.url)),
+    ]
+    # pessimistic default: only marked 'succeeded' once curl and chmod both finished
+    status = 'failed'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        run(cmd, cwd=str(out_dir), timeout=timeout)
+        chmod_file(output, cwd=str(out_dir))
+        status = 'succeeded'
+    except Exception as err:
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(out_dir),
+        cmd_version=str(curl_binary.version),
+        output=output,
+        status=status,
+        **timer.stats,
+    )
--- a/archivebox/pkgs/abx-plugin-favicon/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-favicon/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "abx-plugin-favicon"
+version = "2024.10.28"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-plugin-curl>=2024.10.28",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_favicon = "abx_plugin_favicon"
--- a/archivebox/pkgs/abx-plugin-git/README.md
+++ b/archivebox/pkgs/abx-plugin-git/README.md
--- a/archivebox/pkgs/abx-plugin-git/abx_plugin_git/init.py
+++ b/archivebox/pkgs/abx-plugin-git/abx_plugin_git/init.py
@@ -0,0 +1,29 @@
+__package__ = 'abx_plugin_git'
+__label__ = 'Git'
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    """abx hook: expose this plugin's GIT_CONFIG under the key 'GIT_CONFIG'."""
+    # imported inside the hook rather than at module import time
+    from .config import GIT_CONFIG
+
+    configs = {'GIT_CONFIG': GIT_CONFIG}
+    return configs
+
+@abx.hookimpl
+def get_BINARIES():
+    """abx hook: expose this plugin's GIT_BINARY under the key 'git'."""
+    # imported inside the hook rather than at module import time
+    from .binaries import GIT_BINARY
+
+    binaries = {'git': GIT_BINARY}
+    return binaries
+
+@abx.hookimpl
+def get_EXTRACTORS():
+    """abx hook: expose this plugin's GIT_EXTRACTOR under the key 'git', if it is defined.
+
+    The extractors module currently has its GitExtractor definition commented out,
+    so a direct `from .extractors import GIT_EXTRACTOR` would raise ImportError as
+    soon as this hook is called. Until the extractor is defined, return an empty
+    mapping instead. Genuine import errors inside the module still propagate.
+    """
+    from . import extractors
+
+    git_extractor = getattr(extractors, 'GIT_EXTRACTOR', None)
+    if git_extractor is None:
+        return {}
+
+    return {
+        'git': git_extractor,
+    }
--- a/archivebox/pkgs/abx-plugin-git/abx_plugin_git/binaries.py
+++ b/archivebox/pkgs/abx-plugin-git/abx_plugin_git/binaries.py
@@ -0,0 +1,18 @@
+__package__ = 'abx_plugin_git'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinProvider, BinName, Binary
+
+from abx_plugin_default_binproviders import apt, brew, env
+
+from .config import GIT_CONFIG
+
+
+
+class GitBinary(Binary):
+    """Binary definition for git."""
+    # the executable name is read from GIT_CONFIG.GIT_BINARY when this class body is evaluated
+    name: BinName = GIT_CONFIG.GIT_BINARY
+    # providers this binary may come from: apt, brew, or the existing environment
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+# module-level instance, returned by this plugin's get_BINARIES hook
+GIT_BINARY = GitBinary()
--- a/archivebox/pkgs/abx-plugin-git/abx_plugin_git/config.py
+++ b/archivebox/pkgs/abx-plugin-git/abx_plugin_git/config.py
@@ -0,0 +1,28 @@
+__package__ = 'abx_plugin_git'
+
+from typing import List
+
+from pydantic import Field
+
+from abx_spec_config.base_configset import BaseConfigSet
+
+from archivebox.config.common import ARCHIVING_CONFIG
+
+
+class GitConfig(BaseConfigSet):
+    """Config options for the git binary and the git clone extractor."""
+
+    SAVE_GIT: bool = True
+    
+    # comma-separated list of domains whose URLs are treated as clonable repos
+    GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')
+    
+    GIT_BINARY: str = Field(default='git')
+    GIT_ARGS: List[str] = [
+        '--recursive',
+    ]
+    GIT_EXTRA_ARGS: List[str] = []
+    
+    # These options fall back to the global ARCHIVING_CONFIG values.
+    # default_factory (not default=) is required here: Field(default=<callable>) would
+    # store the lambda object itself as the default instead of the value it returns.
+    GIT_TIMEOUT: int = Field(default_factory=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    GIT_CHECK_SSL_VALIDITY: bool = Field(default_factory=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    
+
+# module-level instance, returned by this plugin's get_CONFIG hook
+GIT_CONFIG = GitConfig()
--- a/archivebox/pkgs/abx-plugin-git/abx_plugin_git/extractors.py
+++ b/archivebox/pkgs/abx-plugin-git/abx_plugin_git/extractors.py
@@ -0,0 +1,15 @@
+__package__ = 'abx_plugin_git'
+
+# from pathlib import Path
+
+# from .binaries import GIT_BINARY
+
+
+# class GitExtractor(BaseExtractor):
+#     name: ExtractorName = 'git'
+#     binary: str = GIT_BINARY.name
+
+#     def get_output_path(self, snapshot) -> Path | None:
+#         return snapshot.as_link() / 'git'
+
+# GIT_EXTRACTOR = GitExtractor()
--- a/archivebox/pkgs/abx-plugin-git/abx_plugin_git/git.py
+++ b/archivebox/pkgs/abx-plugin-git/abx_plugin_git/git.py
@@ -0,0 +1,100 @@
+__package__ = 'archivebox.extractors'
+
+
+from pathlib import Path
+from typing import Optional
+
+from archivebox.misc.system import run, chmod_file
+from archivebox.misc.util import (
+    enforce_types,
+    is_static_file,
+    domain,
+    extension,
+    without_query,
+    without_fragment,
+)
+from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
+
+from abx_plugin_git.config import GIT_CONFIG
+from abx_plugin_git.binaries import GIT_BINARY
+
+
+def get_output_path():
+    """Return the git output directory name (with trailing slash), relative to out_dir."""
+    dirname = 'git/'
+    return dirname
+
+def get_embed_path(archiveresult=None):
+    """Return the relative path to embed for a git result.
+
+    Without an archiveresult, or when the output dir has no entries, this is the
+    bare output dir ('git/'); otherwise it is 'git/<first entry found by glob>/'.
+    """
+    base = get_output_path()
+    if not archiveresult:
+        return base
+
+    # take the first entry yielded by the glob, if there is one
+    entries = iter((archiveresult.snapshot_dir / base).glob('*'))
+    first_entry = next(entries, None)
+    if first_entry is None:
+        return base
+
+    return base + first_entry.name + '/'
+
+@enforce_types
+def should_save_git(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    """Decide whether the git extractor should run for this link.
+
+    Returns False for static files, for links whose output dir already exists
+    (unless overwrite is set), and for URLs that are neither on one of the
+    configured GIT_DOMAINS nor end in a .git extension. Otherwise returns the
+    GIT_CONFIG.SAVE_GIT toggle.
+    """
+    if is_static_file(link.url):
+        return False
+
+    out_dir = out_dir or Path(link.link_dir)
+    if not overwrite and (out_dir / get_output_path()).exists():
+        return False
+
+    # GIT_DOMAINS is a comma-separated string: split it before testing membership,
+    # otherwise `in` does a substring match ('hub.com' or '' would count as git domains)
+    configured_domains = GIT_CONFIG.GIT_DOMAINS
+    if isinstance(configured_domains, str):
+        configured_domains = configured_domains.split(',')
+    git_domains = {entry.strip() for entry in configured_domains if entry.strip()}
+
+    is_clonable_url = (
+        (domain(link.url) in git_domains)
+        or (extension(link.url) == 'git')
+    )
+    if not is_clonable_url:
+        return False
+
+    return GIT_CONFIG.SAVE_GIT
+
+
+@enforce_types
+def save_git(link: Link, out_dir: Optional[Path]=None, timeout: int=GIT_CONFIG.GIT_TIMEOUT) -> ArchiveResult:
+    """download full site using git
+
+    Runs `git clone` for the link's URL (query string and fragment removed)
+    inside <out_dir>/git/, creating that directory if needed. Failures are not
+    raised: the exception is stored as the result's output with status 'failed'.
+    """
+    
+    git_binary = GIT_BINARY.load()
+    assert git_binary.abspath and git_binary.version
+
+    out_dir = out_dir or Path(link.link_dir)
+    output: ArchiveOutput = get_output_path()
+    output_path = out_dir / output
+    output_path.mkdir(exist_ok=True)
+    cmd = [
+        str(git_binary.abspath),
+        'clone',
+        *GIT_CONFIG.GIT_ARGS,
+        # user-supplied extra args were defined in the config but never passed to git
+        *GIT_CONFIG.GIT_EXTRA_ARGS,
+        *([] if GIT_CONFIG.GIT_CHECK_SSL_VALIDITY else ['-c', 'http.sslVerify=false']),
+        without_query(without_fragment(link.url)),
+    ]
+    # optimistic default, flipped to 'failed' in the except branch below
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        result = run(cmd, cwd=str(output_path), timeout=timeout + 1)
+        if result.returncode == 128:
+            # ignore failed re-download when the folder already exists
+            pass
+        elif result.returncode > 0:
+            hints = 'Got git response code: {}.'.format(result.returncode)
+            raise ArchiveError('Failed to save git clone', hints)
+
+        chmod_file(output, cwd=str(out_dir))
+
+    except Exception as err:
+        status = 'failed'
+        output = err
+    finally:
+        # always stop the timer, whether the clone succeeded or failed
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(out_dir),
+        cmd_version=str(git_binary.version),
+        output=output,
+        status=status,
+        **timer.stats,
+    )
--- a/archivebox/pkgs/abx-plugin-git/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-git/pyproject.toml
@@ -0,0 +1,19 @@
+[project]
+name = "abx-plugin-git"
+version = "2024.10.28"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-spec-pydantic-pkgr>=0.1.0",
+    "abx-plugin-default-binproviders>=2024.10.24",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_git = "abx_plugin_git"
--- a/archivebox/pkgs/abx-plugin-htmltotext/README.md
+++ b/archivebox/pkgs/abx-plugin-htmltotext/README.md
--- a/archivebox/pkgs/abx-plugin-htmltotext/abx_plugin_htmltotext/init.py
+++ b/archivebox/pkgs/abx-plugin-htmltotext/abx_plugin_htmltotext/init.py
@@ -0,0 +1,22 @@
+__package__ = 'abx_plugin_htmltotext'
+__label__ = 'HTML-to-Text'
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    """abx hook: expose this plugin's HTMLTOTEXT_CONFIG under the key 'HTMLTOTEXT_CONFIG'."""
+    # imported inside the hook rather than at module import time
+    from .config import HTMLTOTEXT_CONFIG
+
+    configs = {'HTMLTOTEXT_CONFIG': HTMLTOTEXT_CONFIG}
+    return configs
+
+
+# @abx.hookimpl
+# def get_EXTRACTORS():
+#     from .extractors import FAVICON_EXTRACTOR
+    
+#     return {
+#         'htmltotext': FAVICON_EXTRACTOR,
+#     }
--- a/archivebox/pkgs/abx-plugin-htmltotext/abx_plugin_htmltotext/config.py
+++ b/archivebox/pkgs/abx-plugin-htmltotext/abx_plugin_htmltotext/config.py
@@ -0,0 +1,8 @@
+from abx_spec_config.base_configset import BaseConfigSet
+
+
+class HtmltotextConfig(BaseConfigSet):
+    """Config options for the htmltotext extractor."""
+    # on/off toggle for the htmltotext extractor
+    SAVE_HTMLTOTEXT: bool = True
+
+
+# module-level instance, returned by this plugin's get_CONFIG hook
+HTMLTOTEXT_CONFIG = HtmltotextConfig()
--- a/archivebox/pkgs/abx-plugin-htmltotext/abx_plugin_htmltotext/htmltotext.py
+++ b/archivebox/pkgs/abx-plugin-htmltotext/abx_plugin_htmltotext/htmltotext.py
@@ -0,0 +1,158 @@
+__package__ = 'archivebox.extractors'
+
+from html.parser import HTMLParser
+import io
+from pathlib import Path
+from typing import Optional
+
+from archivebox.config import VERSION
+from archivebox.config.common import ARCHIVING_CONFIG
+from archivebox.misc.system import atomic_write
+from archivebox.misc.util import enforce_types, is_static_file
+
+from archivebox.plugins_extractor.htmltotext.config import HTMLTOTEXT_CONFIG
+
+from ..logging_util import TimedProgress
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from .title import get_html
+
+
+def get_output_path():
+    """Return the extracted-text output filename (relative to the snapshot's out_dir)."""
+    filename = "htmltotext.txt"
+    return filename
+
+
+
+class HTMLTextExtractor(HTMLParser):
+    """HTMLParser that collects a document's visible text into an in-memory buffer.
+
+    Text nodes and a fixed set of human-readable attribute values are written to
+    `self.output`; anything nested inside a script/style/template tag is skipped.
+    Use str(extractor) after feeding to get the collected text.
+    """
+
+    # attributes whose values are worth indexing as text
+    TEXT_ATTRS = [
+        "alt", "cite", "href", "label",
+        "list", "placeholder", "title", "value"
+    ]
+    # tags whose contents are never written out
+    NOTEXT_TAGS = ["script", "style", "template"]
+    # href prefixes that are not treated as text
+    NOTEXT_HREF = ["data:", "javascript:", "#"]
+
+    def __init__(self):
+        super().__init__()
+
+        self.output = io.StringIO()
+        # currently-open tags, outermost first
+        self._tag_stack = []
+
+    def _is_text_attr(self, name, value):
+        """Return True if this attribute's value should be written to the output."""
+        if not isinstance(value, str):
+            return False
+        if name == "href" and value.startswith(tuple(self.NOTEXT_HREF)):
+            return False
+        return name in self.TEXT_ATTRS
+
+    def _parent_tag(self):
+        """Return the innermost open tag, or None when no tag is open."""
+        return self._tag_stack[-1] if self._tag_stack else None
+
+    def _in_notext_tag(self):
+        """Return True if any currently-open tag is one of NOTEXT_TAGS."""
+        return any(tag in self._tag_stack for tag in self.NOTEXT_TAGS)
+
+    def handle_starttag(self, tag, attrs):
+        self._tag_stack.append(tag)
+
+        # Nothing is written while inside (or when opening) a NOTEXT tag
+        if self._in_notext_tag():
+            return
+
+        self.output.writelines(
+            f"({attr_value.strip()}) "
+            for attr_name, attr_value in attrs
+            if self._is_text_attr(attr_name, attr_value)
+        )
+
+    def handle_endtag(self, tag):
+        open_tags = self._tag_stack
+
+        # An end tag with no matching open tag (e.g. malformed markup)
+        # leaves the stack exactly as it was
+        if tag not in open_tags:
+            return
+
+        # Close the nearest matching ancestor along with everything opened after it
+        nearest = len(open_tags) - 1 - open_tags[::-1].index(tag)
+        del open_tags[nearest:]
+
+        # Write a space after every closed tag so that tokens from adjacent
+        # elements aren't concatenated. This may produce excess spaces, which
+        # search tokenizers should ignore.
+        if tag in self.NOTEXT_TAGS or self._in_notext_tag():
+            return
+        self.output.write(" ")
+
+    def handle_data(self, data):
+        # Text inside a NOTEXT tag is dropped entirely
+        if self._in_notext_tag():
+            return
+
+        text = data.strip()
+        if not text:
+            return
+
+        self.output.write(text)
+        # Collapse any run of trailing whitespace into a single space
+        if data != data.rstrip():
+            self.output.write(' ')
+
+    def __str__(self):
+        return self.output.getvalue()
+
+
+@enforce_types
+def should_save_htmltotext(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
+    """Decide whether the htmltotext extractor should run for this link."""
+
+    # links that is_static_file() recognises are skipped
+    if is_static_file(link.url):
+        return False
+
+    target_dir = out_dir if out_dir else Path(link.link_dir)
+
+    # an existing output file is only replaced when overwrite is requested
+    if not overwrite:
+        if (target_dir / get_output_path()).exists():
+            return False
+
+    # otherwise defer to the user's config toggle
+    return HTMLTOTEXT_CONFIG.SAVE_HTMLTOTEXT
+
+
+@enforce_types
+def save_htmltotext(link: Link, out_dir: Optional[Path]=None, timeout: int=ARCHIVING_CONFIG.TIMEOUT) -> ArchiveResult:
+    """Extract search-indexing-friendly text from the link's saved HTML.
+
+    The HTML comes from get_html(); the extracted text is written to the output
+    file and also returned as the result's index_texts. Failures are not raised:
+    the exception is stored as the result's output and the status stays 'failed'.
+    """
+
+    snapshot_dir = Path(out_dir or link.link_dir)
+    output = get_output_path()
+    cmd = ['(internal) archivebox.extractors.htmltotext', './{singlefile,dom}.html']
+
+    timer = TimedProgress(timeout, prefix='      ')
+    extracted_text = None
+    status = 'failed'
+    try:
+        html = get_html(link, snapshot_dir)
+        if not html:
+            raise ArchiveError('htmltotext could not find HTML to parse for article text')
+
+        parser = HTMLTextExtractor()
+        parser.feed(html)
+        parser.close()
+        extracted_text = str(parser)
+
+        atomic_write(str(snapshot_dir / output), extracted_text)
+        status = 'succeeded'
+    except Exception as err:
+        # OSError is a subclass of Exception, so this covers the same errors as before
+        output = err
+    finally:
+        timer.end()
+
+    # only non-empty text is handed on for indexing
+    index_texts = [extracted_text] if extracted_text else []
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(snapshot_dir),
+        cmd_version=VERSION,
+        output=output,
+        status=status,
+        index_texts=index_texts,
+        **timer.stats,
+    )
--- a/archivebox/pkgs/abx-plugin-htmltotext/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-htmltotext/pyproject.toml
@@ -0,0 +1,17 @@
+[project]
+name = "abx-plugin-htmltotext"
+version = "2024.10.28"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_htmltotext = "abx_plugin_htmltotext"
--- a/archivebox/pkgs/abx-plugin-ldap-auth/README.md
+++ b/archivebox/pkgs/abx-plugin-ldap-auth/README.md
--- a/archivebox/pkgs/abx-plugin-ldap-auth/abx_plugin_ldap_auth/init.py
+++ b/archivebox/pkgs/abx-plugin-ldap-auth/abx_plugin_ldap_auth/init.py
@@ -0,0 +1,54 @@
+__package__ = 'abx_plugin_ldap_auth'
+__label__ = 'LDAP'
+__homepage__ = 'https://github.com/django-auth-ldap/django-auth-ldap'
+
+import abx
+
+@abx.hookimpl
+def get_CONFIG():
+    """abx hook: expose this plugin's LDAP_CONFIG under the key 'LDAP_CONFIG'."""
+    # imported inside the hook rather than at module import time
+    from .config import LDAP_CONFIG
+
+    configs = {'LDAP_CONFIG': LDAP_CONFIG}
+    return configs
+
+@abx.hookimpl
+def get_BINARIES():
+    """abx hook: expose this plugin's LDAP_BINARY under the key 'ldap'."""
+    # imported inside the hook rather than at module import time
+    from .binaries import LDAP_BINARY
+
+    binaries = {'ldap': LDAP_BINARY}
+    return binaries
+
+
+def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs):
+    """
+    Invoked after LDAP authenticates a user, but before they have a local User account created.
+    ArchiveBox requires staff/superuser status to view the admin at all, so we must create a user
+    + set staff and superuser when LDAP authenticates a new person.
+
+    Every authenticated user is marked is_staff. Only users without a DB id yet are
+    (optionally) promoted to superuser, and only for those is the
+    "Creating new user" warning printed.
+    """
+    from .config import LDAP_CONFIG
+    
+    if user is None:
+        return                        # not authenticated at all
+    
+    # authenticated via LDAP, but user is not set up in DB yet
+    is_new_user = not user.id
+
+    if is_new_user and LDAP_CONFIG.LDAP_CREATE_SUPERUSER:
+        user.is_superuser = True
+
+    user.is_staff = True
+
+    # this handler is connected to populate_user, so it also runs for users that
+    # already exist: only announce a new user when one is really being created
+    if is_new_user:
+        print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')
+
+
+@abx.hookimpl
+def ready():
+    """
+    abx ready hook: validate the LDAP config and, if LDAP is still enabled afterwards,
+    register the new-user handler (runs at AppConfig.ready() time, once settings + models are loaded)
+    """
+    from .config import LDAP_CONFIG
+
+    LDAP_CONFIG.validate()
+
+    if not LDAP_CONFIG.LDAP_ENABLED:
+        return
+
+    # tell django-auth-ldap to call our function when a user is authenticated via LDAP
+    import django_auth_ldap.backend
+    populate_user_signal = django_auth_ldap.backend.populate_user
+    populate_user_signal.connect(create_superuser_from_ldap_user)
--- a/archivebox/pkgs/abx-plugin-ldap-auth/abx_plugin_ldap_auth/binaries.py
+++ b/archivebox/pkgs/abx-plugin-ldap-auth/abx_plugin_ldap_auth/binaries.py
@@ -0,0 +1,67 @@
+__package__ = 'abx_plugin_ldap_auth'
+
+import inspect
+
+from typing import List
+from pathlib import Path
+from pydantic import InstanceOf
+
+from pydantic_pkgr import BinaryOverrides, SemVer, Binary, BinProvider
+
+from abx_plugin_default_binproviders import apt
+from abx_plugin_pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES
+
+from .config import get_ldap_lib
+
+
+
+def get_LDAP_LIB_path(paths=()):
+    """Return the file path of the imported ldap module, or None.
+
+    None is returned when the ldap library is not importable, or when `paths` is
+    given and the module's install dir (two levels above the module file) is not
+    one of those directories.
+    """
+    ldap_lib, _ldap_search = get_ldap_lib()
+    if not ldap_lib:
+        return None
+
+    lib_file = Path(inspect.getfile(ldap_lib))
+
+    # no restriction requested: any install location is acceptable
+    if not paths:
+        return lib_file
+
+    # check that the lib's install dir is one of the specified site-packages dirs
+    install_dir = str(lib_file.parent.parent.resolve())
+    allowed = any(
+        install_dir == str(Path(candidate).resolve())
+        for candidate in paths
+    )
+    return lib_file if allowed else None
+
+
+def get_LDAP_LIB_version():
+    """Return the ldap module's __version__ as a SemVer, or the falsy lib value if it isn't importable."""
+    ldap_lib, _ldap_search = get_ldap_lib()
+    if not ldap_lib:
+        return ldap_lib
+    return SemVer(ldap_lib.__version__)
+
+
+class LdapBinary(Binary):
+    """Binary-style definition for the Python LDAP libraries (not an executable)."""
+    name: str = 'ldap'
+    description: str = 'LDAP Authentication'
+    binproviders_supported: List[InstanceOf[BinProvider]] = [VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, apt]
+
+    # Per-provider overrides. "abspath" only reports the lib when it is installed in
+    # that provider's own site-packages dir(s); "version" is read from the imported
+    # module; "packages" lists what that provider is given for this binary.
+    overrides: BinaryOverrides = {
+        LIB_PIP_BINPROVIDER.name: {
+            "abspath": lambda: get_LDAP_LIB_path(LIB_SITE_PACKAGES),
+            "version": lambda: get_LDAP_LIB_version(),
+            "packages": ['python-ldap>=3.4.3', 'django-auth-ldap>=4.1.0'],
+        },
+        VENV_PIP_BINPROVIDER.name: {
+            "abspath": lambda: get_LDAP_LIB_path(VENV_SITE_PACKAGES),
+            "version": lambda: get_LDAP_LIB_version(),
+            "packages": ['python-ldap>=3.4.3', 'django-auth-ldap>=4.1.0'],
+        },
+        SYS_PIP_BINPROVIDER.name: {
+            "abspath": lambda: get_LDAP_LIB_path((*USER_SITE_PACKAGES, *SYS_SITE_PACKAGES)),
+            "version": lambda: get_LDAP_LIB_version(),
+            "packages": ['python-ldap>=3.4.3', 'django-auth-ldap>=4.1.0'],
+        },
+        # apt: no site-packages restriction, any location of the imported lib is accepted
+        apt.name: {
+            "abspath": lambda: get_LDAP_LIB_path(),
+            "version": lambda: get_LDAP_LIB_version(),
+            "packages": ['libssl-dev', 'libldap2-dev', 'libsasl2-dev', 'python3-ldap', 'python3-msgpack', 'python3-mutagen'],
+        },
+    }
+
+# module-level instance, returned by this plugin's get_BINARIES hook
+LDAP_BINARY = LdapBinary()
--- a/archivebox/pkgs/abx-plugin-ldap-auth/abx_plugin_ldap_auth/config.py
+++ b/archivebox/pkgs/abx-plugin-ldap-auth/abx_plugin_ldap_auth/config.py
@@ -0,0 +1,122 @@
+__package__ = 'abx_plugin_ldap_auth'
+
+import sys
+
+from typing import Dict, List, Optional
+from pydantic import Field, computed_field
+
+from abx_spec_config.base_configset import BaseConfigSet
+
+LDAP_LIB = None
+LDAP_SEARCH = None
+
+def get_ldap_lib(extra_paths=()):
+    """Import and cache the ldap module and django_auth_ldap's LDAPSearch.
+
+    Any extra_paths not already on sys.path are appended before importing.
+    Returns (LDAP_LIB, LDAP_SEARCH); both stay None if the import fails.
+    """
+    global LDAP_LIB, LDAP_SEARCH
+
+    # only attempt the import while the cached pair is incomplete
+    if not (LDAP_LIB and LDAP_SEARCH):
+        try:
+            for extra_path in extra_paths:
+                if extra_path in sys.path:
+                    continue
+                sys.path.append(extra_path)
+
+            import ldap
+            from django_auth_ldap.config import LDAPSearch
+        except ImportError:
+            # libs not installed: leave the cached values as they are
+            pass
+        else:
+            LDAP_LIB, LDAP_SEARCH = ldap, LDAPSearch
+
+    return LDAP_LIB, LDAP_SEARCH
+
+###################### Config ##########################
+
+
+class LdapConfig(BaseConfigSet):
+    """
+    LDAP Config gets imported by core/settings.py very early during startup.
+    It needs to be in a separate file from apps.py so that it can be imported
+    during settings.py initialization before the apps are loaded.
+    """
+
+    LDAP_ENABLED: bool                  = Field(default=False, alias='LDAP')
+    
+    # required connection options: they default to None (unset), so they are typed Optional
+    LDAP_SERVER_URI: Optional[str]      = Field(default=None)
+    LDAP_BIND_DN: Optional[str]         = Field(default=None)
+    LDAP_BIND_PASSWORD: Optional[str]   = Field(default=None)
+    LDAP_USER_BASE: Optional[str]       = Field(default=None)
+    LDAP_USER_FILTER: Optional[str]     = Field(default=None)
+    LDAP_CREATE_SUPERUSER: bool         = Field(default=False)
+
+    LDAP_USERNAME_ATTR: str             = Field(default='username')
+    LDAP_FIRSTNAME_ATTR: str            = Field(default='first_name')
+    LDAP_LASTNAME_ATTR: str             = Field(default='last_name')
+    LDAP_EMAIL_ATTR: str                = Field(default='email')
+    
+    def _missing_required_options(self) -> List[str]:
+        """Return the names of the required LDAP_* options that are unset or empty."""
+        required_options = (
+            'LDAP_SERVER_URI',
+            'LDAP_BIND_DN',
+            'LDAP_BIND_PASSWORD',
+            'LDAP_USER_BASE',
+            'LDAP_USER_FILTER',
+        )
+        return [key for key in required_options if not getattr(self, key)]
+
+    def validate(self):
+        """Disable LDAP (with an error on stderr) when it is enabled but unusable.
+
+        LDAP is switched off in place when the LDAP libraries cannot be imported,
+        or when any required LDAP_* option is missing. Returns self.
+        """
+        if not self.LDAP_ENABLED:
+            return self
+
+        LDAP_LIB, _LDAPSearch = get_ldap_lib()
+        # Check that LDAP libraries are installed
+        if LDAP_LIB is None:
+            sys.stderr.write('[X] Error: LDAP Authentication is enabled but LDAP libraries are not installed. You may need to run: pip install archivebox[ldap]\n')
+            # dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap
+            # sys.exit(1)
+            self.update_in_place(LDAP_ENABLED=False)
+            return self
+
+        # Check that all required LDAP config options are set
+        # (the error is reported when options are missing, not when they are all present)
+        missing_config_options = self._missing_required_options()
+        if missing_config_options:
+            sys.stderr.write('[X] Error: LDAP_* config options must all be set if LDAP_ENABLED=True\n')
+            sys.stderr.write(f'    Missing: {", ".join(missing_config_options)}\n')
+            self.update_in_place(LDAP_ENABLED=False)
+        return self
+    
+    @computed_field
+    @property
+    def LDAP_CONFIG_IS_SET(self) -> bool:
+        """Check that all required LDAP config options are set"""
+        if not self.LDAP_ENABLED:
+            return False
+
+        LDAP_LIB, _LDAPSearch = get_ldap_lib()
+        return bool(LDAP_LIB) and not self._missing_required_options()
+
+    @computed_field
+    @property
+    def LDAP_USER_ATTR_MAP(self) -> Dict[str, str]:
+        """Map local user fields to the configured LDAP attribute names."""
+        return {
+            'username': self.LDAP_USERNAME_ATTR,
+            'first_name': self.LDAP_FIRSTNAME_ATTR,
+            'last_name': self.LDAP_LASTNAME_ATTR,
+            'email': self.LDAP_EMAIL_ATTR,
+        }
+
+    @computed_field
+    @property
+    def AUTHENTICATION_BACKENDS(self) -> List[str]:
+        """Return the auth backend paths to use when LDAP is enabled, else an empty list."""
+        if self.LDAP_ENABLED:
+            return [
+                'django.contrib.auth.backends.ModelBackend',
+                'django_auth_ldap.backend.LDAPBackend',
+            ]
+        return []
+
+    @computed_field
+    @property
+    def AUTH_LDAP_USER_SEARCH(self) -> Optional[object]:
+        """Return an LDAPSearch for the configured user base/filter, or None if unavailable."""
+        if not self.LDAP_ENABLED or not self.LDAP_USER_FILTER:
+            return None
+
+        LDAP_LIB, LDAPSearch = get_ldap_lib()
+        # LDAP can be enabled in config while the libraries are not installed
+        # (validate() may not have run yet): don't dereference None in that case
+        if LDAP_LIB is None or LDAPSearch is None:
+            return None
+
+        return LDAPSearch(
+            self.LDAP_USER_BASE,
+            LDAP_LIB.SCOPE_SUBTREE,                                                                         # type: ignore
+            '(&(' + self.LDAP_USERNAME_ATTR + '=%(user)s)' + self.LDAP_USER_FILTER + ')',
+        )
+
+
+# module-level instance, imported by this plugin's hooks
+LDAP_CONFIG = LdapConfig()
--- a/archivebox/pkgs/abx-plugin-ldap-auth/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-ldap-auth/pyproject.toml
@@ -0,0 +1,20 @@
+[project]
+name = "abx-plugin-ldap-auth"
+version = "2024.10.28"
+description = "LDAP authentication plugin for ArchiveBox (abx)"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-spec-django>=0.1.0",
+]
+
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+
+[project.entry-points.abx]
+abx_plugin_ldap_auth = "abx_plugin_ldap_auth"
--- a/archivebox/pkgs/abx-plugin-mercury/README.md
+++ b/archivebox/pkgs/abx-plugin-mercury/README.md
--- a/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/init.py
+++ b/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/init.py
@@ -0,0 +1,29 @@
+__package__ = 'abx_plugin_mercury'
+__label__ = 'Postlight Parser'
+__homepage__ = 'https://github.com/postlight/mercury-parser'
+
+import abx
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import MERCURY_CONFIG
+    
+    return {
+        'MERCURY_CONFIG': MERCURY_CONFIG
+    }
+
+@abx.hookimpl
+def get_BINARIES():
+    from .binaries import MERCURY_BINARY
+    
+    return {
+        'mercury': MERCURY_BINARY,
+    }
+
+@abx.hookimpl
+def get_EXTRACTORS():
+    from .extractors import MERCURY_EXTRACTOR
+    
+    return {
+        'mercury': MERCURY_EXTRACTOR,
+    }
--- a/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/binaries.py
+++ b/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/binaries.py
@@ -0,0 +1,32 @@
+__package__ = 'abx_plugin_mercury'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath, Binary
+
+from abx_plugin_default_binproviders import env
+
+from abx_plugin_npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
+
+from .config import MERCURY_CONFIG
+
+
+class MercuryBinary(Binary):
+    """Binary definition for the postlight parser CLI, named by MERCURY_CONFIG.MERCURY_BINARY."""
+    name: BinName = MERCURY_CONFIG.MERCURY_BINARY
+    # providers are listed lib npm first, then system npm, then env
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+    # per-provider overrides, keyed by provider name
+    overrides: BinaryOverrides = {
+        LIB_NPM_BINPROVIDER.name: {
+            'packages': ['@postlight/parser@^2.2.3'],
+        },
+        SYS_NPM_BINPROVIDER.name: {
+            'packages': ['@postlight/parser@^2.2.3'],
+            'install': lambda: None,                          # never try to install things into global prefix
+        },
+        env.name: {
+            # reports a fixed placeholder version whenever 'postlight-parser' is found on env.PATH
+            # NOTE(review): looks up the literal 'postlight-parser', not MERCURY_CONFIG.MERCURY_BINARY - confirm intended
+            'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
+        },
+    }
+
+# module-level instance imported by the plugin's get_BINARIES hook
+MERCURY_BINARY = MercuryBinary()
--- a/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/config.py
+++ b/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/config.py
@@ -0,0 +1,31 @@
+__package__ = 'abx_plugin_mercury'
+
+from typing import List, Optional
+from pathlib import Path
+
+from pydantic import Field
+
+from abx_spec_config.base_configset import BaseConfigSet
+
+from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
+
+
+
+class MercuryConfig(BaseConfigSet):
+    """Config options for the mercury (postlight parser) plugin."""
+
+    # master switch; also accepted under the alias USE_MERCURY
+    SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')
+    
+    # name of the parser executable, plus extra CLI args
+    MERCURY_BINARY: str = Field(default='postlight-parser')
+    MERCURY_EXTRA_ARGS: List[str] = []
+    
+    SAVE_MERCURY_REQUISITES: bool = Field(default=True)
+    # the lambda defaults below read the shared STORAGE_CONFIG / ARCHIVING_CONFIG values
+    # NOTE(review): presumably BaseConfigSet resolves callable defaults lazily - confirm
+    MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)
+    
+    MERCURY_TIMEOUT: int =  Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+    MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
+    MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
+    MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
+    
+
+
+# module-level instance imported by the rest of the plugin
+MERCURY_CONFIG = MercuryConfig()
--- a/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/extractors.py
+++ b/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/extractors.py
@@ -0,0 +1,17 @@
+__package__ = 'abx_plugin_mercury'
+
+# from pathlib import Path
+
+# from .binaries import MERCURY_BINARY
+
+
+
+# class MercuryExtractor(BaseExtractor):
+#     name: ExtractorName = 'mercury'
+#     binary: str = MERCURY_BINARY.name
+
+#     def get_output_path(self, snapshot) -> Path | None:
+#         return snapshot.link_dir / 'mercury' / 'content.html'
+
+
+# MERCURY_EXTRACTOR = MercuryExtractor()
--- a/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/mercury.py
+++ b/archivebox/pkgs/abx-plugin-mercury/abx_plugin_mercury/mercury.py
@@ -0,0 +1,122 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+
+from subprocess import CompletedProcess
+from typing import Optional, List
+import json
+
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from archivebox.misc.system import run, atomic_write
+from archivebox.misc.util import (
+    enforce_types,
+    is_static_file,
+)
+from archivebox.plugins_extractor.mercury.config import MERCURY_CONFIG
+from archivebox.plugins_extractor.mercury.binaries import MERCURY_BINARY
+
+from ..logging_util import TimedProgress
+
+
+def get_output_path():
+    return 'mercury/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
+
+@enforce_types
+def ShellError(cmd: List[str], result: CompletedProcess, lines: int=20) -> ArchiveError:
+    # parse out last line of stderr
+    return ArchiveError(
+        f'Got {cmd[0]} response code: {result.returncode}).',
+        " ".join(
+            line.strip()
+            for line in (result.stdout + result.stderr).decode().rsplit('\n', lines)[-lines:]
+            if line.strip()
+        ),
+    )
+
+
+@enforce_types
+def should_save_mercury(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
+    if is_static_file(link.url):
+        return False
+
+    out_dir = Path(out_dir or link.link_dir)
+
+    if not overwrite and (out_dir / get_output_path()).exists():
+        return False
+
+    return MERCURY_CONFIG.SAVE_MERCURY
+
+
+@enforce_types
+def save_mercury(link: Link, out_dir: Optional[Path]=None, timeout: int=MERCURY_CONFIG.MERCURY_TIMEOUT) -> ArchiveResult:
+    """download reader friendly version using @postlight/mercury-parser"""
+
+    out_dir = Path(out_dir or link.link_dir)
+    output_folder = out_dir.absolute() / get_output_path()
+    output = get_output_path()
+    
+    mercury_binary = MERCURY_BINARY.load()
+    assert mercury_binary.abspath and mercury_binary.version
+
+    status = 'succeeded'
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        output_folder.mkdir(exist_ok=True)
+        # later options take precedence
+        # By default, get plain text version of article
+        cmd = [
+            str(mercury_binary.abspath),
+            *MERCURY_CONFIG.MERCURY_EXTRA_ARGS,
+            '--format=text',
+            link.url,
+        ]
+        result = run(cmd, cwd=out_dir, timeout=timeout)
+        try:
+            article_text = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            raise ShellError(cmd, result)
+        
+        if article_text.get('failed'):
+            raise ArchiveError('Mercury was not able to get article text from the URL')
+
+        atomic_write(str(output_folder / "content.txt"), article_text["content"])
+
+        # Get HTML version of article
+        cmd = [
+            str(mercury_binary.abspath),
+            *MERCURY_CONFIG.MERCURY_EXTRA_ARGS,
+            link.url
+        ]
+        result = run(cmd, cwd=out_dir, timeout=timeout)
+        try:
+            article_json = json.loads(result.stdout)
+        except json.JSONDecodeError:
+            raise ShellError(cmd, result)
+
+        if article_text.get('failed'):
+            raise ArchiveError('Mercury was not able to get article HTML from the URL')
+
+        atomic_write(str(output_folder / "content.html"), article_json.pop("content"))
+        atomic_write(str(output_folder / "article.json"), article_json)
+
+        # Check for common failure cases
+        if (result.returncode > 0):
+            raise ShellError(cmd, result)
+    except (ArchiveError, Exception, OSError) as err:
+        status = 'failed'
+        output = err
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(out_dir),
+        cmd_version=str(mercury_binary.version),
+        output=output,
+        status=status,
+        **timer.stats,
+    )
--- a/archivebox/pkgs/abx-plugin-mercury/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-mercury/pyproject.toml
@@ -0,0 +1,17 @@
+[project]
+name = "abx-plugin-mercury"
+version = "2024.10.28"
+description = "Mercury (Postlight Parser) extractor plugin for ArchiveBox (abx)"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_mercury = "abx_plugin_mercury"
--- a/archivebox/pkgs/abx-plugin-npm/README.md
+++ b/archivebox/pkgs/abx-plugin-npm/README.md
--- a/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/init.py
+++ b/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/init.py
@@ -0,0 +1,32 @@
+__label__ = 'NPM'
+__author__ = 'ArchiveBox'
+__homepage__ = 'https://www.npmjs.com/'
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import NPM_CONFIG
+    return {
+        'NPM_CONFIG': NPM_CONFIG,
+    }
+
+@abx.hookimpl
+def get_BINARIES():
+    from .binaries import NODE_BINARY, NPM_BINARY, NPX_BINARY
+    
+    return {
+        'node': NODE_BINARY,
+        'npm': NPM_BINARY,
+        'npx': NPX_BINARY,
+    }
+
+@abx.hookimpl
+def get_BINPROVIDERS():
+    from .binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
+    
+    return {
+        'sys_npm': SYS_NPM_BINPROVIDER,
+        'lib_npm': LIB_NPM_BINPROVIDER,
+    }
--- a/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/binaries.py
+++ b/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/binaries.py
@@ -0,0 +1,53 @@
+__package__ = 'plugins_pkg.npm'
+
+
+from typing import List
+
+from pydantic import InstanceOf
+from benedict import benedict
+
+from pydantic_pkgr import BinProvider, Binary, BinName, BinaryOverrides
+
+from abx_plugin_default_binproviders import get_BINPROVIDERS
+
+DEFAULT_BINPROVIDERS = benedict(get_BINPROVIDERS())
+env = DEFAULT_BINPROVIDERS.env
+apt = DEFAULT_BINPROVIDERS.apt
+brew = DEFAULT_BINPROVIDERS.brew
+
+
+class NodeBinary(Binary):
+    """Binary definition for `node`, supported via the apt, brew and env providers."""
+    name: BinName = 'node'
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+    
+    overrides: BinaryOverrides = {
+        apt.name: {'packages': ['nodejs']},   # with apt the package to install is 'nodejs'
+    }
+
+
+# module-level instance returned by the plugin's get_BINARIES hook
+NODE_BINARY = NodeBinary()
+
+
+class NpmBinary(Binary):
+    """Binary definition for `npm`, supported via the apt, brew and env providers."""
+    name: BinName = 'npm'
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+    overrides: BinaryOverrides = {
+        apt.name: {'packages': ['npm']},   # with apt, request the separate 'npm' package
+        brew.name: {'install': lambda: None},  # install is a no-op: already installed when nodejs is installed
+    }
+    
+# module-level instance returned by the plugin's get_BINARIES hook
+NPM_BINARY = NpmBinary()
+
+
+class NpxBinary(Binary):
+    """Binary definition for `npx`; neither apt nor brew ever installs it separately."""
+    name: BinName = 'npx'
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+    
+    # both install overrides are no-ops
+    overrides: BinaryOverrides = {
+        apt.name: {'install': lambda: None},   # already installed when nodejs is installed
+        brew.name: {'install': lambda: None},  # already installed when nodejs is installed
+    }
+
+# module-level instance returned by the plugin's get_BINARIES hook
+NPX_BINARY = NpxBinary()
+
--- a/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/binproviders.py
+++ b/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/binproviders.py
@@ -0,0 +1,38 @@
+import os
+from pathlib import Path
+from typing import Optional
+
+from pydantic_pkgr import NpmProvider, PATHStr, BinProviderName
+
+import abx
+
+DEFAULT_LIB_NPM_DIR = Path('/usr/local/share/abx/npm')
+
+OLD_NODE_BIN_PATH = Path(os.getcwd()) / 'node_modules' / '.bin'
+NEW_NODE_BIN_PATH = DEFAULT_LIB_NPM_DIR / 'node_modules' / '.bin'
+
+
+class SystemNpmBinProvider(NpmProvider):
+    """npm provider registered as "sys_npm"; it sets no npm prefix of its own."""
+    name: BinProviderName = "sys_npm"
+    
+    # NOTE(review): None presumably means "use npm's global prefix" - confirm against NpmProvider
+    npm_prefix: Optional[Path] = None
+
+
+class LibNpmBinProvider(NpmProvider):
+    name: BinProviderName = "lib_npm"
+    PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
+    
+    npm_prefix: Optional[Path] = DEFAULT_LIB_NPM_DIR
+    
+    def setup(self) -> None:
+        # update paths from config at runtime
+        LIB_DIR = abx.pm.hook.get_LIB_DIR()
+        self.npm_prefix = LIB_DIR / 'npm'
+        self.PATH = f'{LIB_DIR / "npm" / "node_modules" / ".bin"}:{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'
+
+        super().setup()
+
+
+# module-level provider instances returned by the plugin's get_BINPROVIDERS hook
+SYS_NPM_BINPROVIDER = SystemNpmBinProvider()
+LIB_NPM_BINPROVIDER = LibNpmBinProvider()
+# short alias for the lib provider
+npm = LIB_NPM_BINPROVIDER
--- a/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/config.py
+++ b/archivebox/pkgs/abx-plugin-npm/abx_plugin_npm/config.py
@@ -0,0 +1,17 @@
+from abx_spec_config import BaseConfigSet
+
+
+###################### Config ##########################
+
+
+class NpmDependencyConfigs(BaseConfigSet):
+    """Config set for the npm plugin; it currently declares no options (all candidates are commented out)."""
+    # USE_NPM: bool = True
+    # NPM_BINARY: str = Field(default='npm')
+    # NPM_ARGS: Optional[List[str]] = Field(default=None)
+    # NPM_EXTRA_ARGS: List[str] = []
+    # NPM_DEFAULT_ARGS: List[str] = []
+    pass
+
+
+# module-level instance returned by the plugin's get_CONFIG hook
+NPM_CONFIG = NpmDependencyConfigs()
+
--- a/archivebox/pkgs/abx-plugin-npm/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-npm/pyproject.toml
@@ -0,0 +1,20 @@
+[project]
+name = "abx-plugin-npm"
+version = "2024.10.24"
+description = "NPM binary provider plugin for ABX"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "pydantic-pkgr>=0.5.4",
+    "abx-spec-pydantic-pkgr>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-plugin-default-binproviders>=2024.10.24",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_npm = "abx_plugin_npm"
--- a/archivebox/pkgs/abx-plugin-pip/README.md
+++ b/archivebox/pkgs/abx-plugin-pip/README.md
--- a/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/.plugin_order
+++ b/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/.plugin_order
@@ -0,0 +1 @@
+0
--- a/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/init.py
+++ b/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/init.py
@@ -0,0 +1,36 @@
+__package__ = 'abx_plugin_pip'
+__label__ = 'PIP'
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import PIP_CONFIG
+    
+    return {
+        'PIP_CONFIG': PIP_CONFIG
+    }
+
+@abx.hookimpl(tryfirst=True)
+def get_BINARIES():
+    from .binaries import ARCHIVEBOX_BINARY, PYTHON_BINARY, DJANGO_BINARY, SQLITE_BINARY, PIP_BINARY, PIPX_BINARY
+    
+    return {
+        'archivebox': ARCHIVEBOX_BINARY,
+        'python': PYTHON_BINARY,
+        'django': DJANGO_BINARY,
+        'sqlite': SQLITE_BINARY,
+        'pip': PIP_BINARY,
+        'pipx': PIPX_BINARY,
+    }
+
+@abx.hookimpl
+def get_BINPROVIDERS():
+    from .binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER
+    
+    return {
+        'sys_pip': SYS_PIP_BINPROVIDER,
+        'venv_pip': VENV_PIP_BINPROVIDER,
+        'lib_pip': LIB_PIP_BINPROVIDER,
+    }
--- a/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/binaries.py
+++ b/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/binaries.py
@@ -0,0 +1,162 @@
+__package__ = 'abx_plugin_pip'
+
+import sys
+from pathlib import Path
+from typing import List
+from pydantic import InstanceOf, Field, model_validator
+
+
+import django
+import django.db.backends.sqlite3.base
+from django.db.backends.sqlite3.base import Database as django_sqlite3     # type: ignore[import-type]
+from pydantic_pkgr import BinProvider, Binary, BinName, BinaryOverrides, SemVer
+
+
+from .binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env, apt, brew
+
+###################### Config ##########################
+
+def get_archivebox_version():
+    try:
+        from archivebox import VERSION
+        return VERSION
+    except Exception:
+        return None
+
+
+class ArchiveboxBinary(Binary):
+    """Binary definition for `archivebox` itself; it is only ever loaded, never installed."""
+    name: BinName = 'archivebox'
+
+    binproviders_supported: List[InstanceOf[BinProvider]] = [VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
+    # for each of these providers: no packages to install, version read via get_archivebox_version
+    overrides: BinaryOverrides = {
+        VENV_PIP_BINPROVIDER.name:  {'packages': [], 'version': get_archivebox_version},
+        SYS_PIP_BINPROVIDER.name:   {'packages': [], 'version': get_archivebox_version},
+        apt.name:                   {'packages': [], 'version': get_archivebox_version},
+        brew.name:                  {'packages': [], 'version': get_archivebox_version},
+    }
+    
+    # @validate_call
+    def install(self, **kwargs):
+        """Ignore kwargs and just load the existing binary instead of installing."""
+        return self.load()                  # obviously it's already installed if we are running this ;)
+    
+    # @validate_call
+    def load_or_install(self, **kwargs):
+        """Ignore kwargs and just load the existing binary; no install is attempted."""
+        return self.load()                  # obviously it's already installed if we are running this ;)
+
+# module-level instance returned by the plugin's get_BINARIES hook
+ARCHIVEBOX_BINARY = ArchiveboxBinary()
+
+
+class PythonBinary(Binary):
+    """Binary definition for `python`; it is only ever loaded, never installed."""
+    name: BinName = 'python'
+
+    binproviders_supported: List[InstanceOf[BinProvider]] = [VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
+    overrides: BinaryOverrides = {
+        # for the system pip provider, report the interpreter running this code
+        SYS_PIP_BINPROVIDER.name: {
+            'abspath': sys.executable,
+            'version': '{}.{}.{}'.format(*sys.version_info[:3]),
+        },
+    }
+    
+    # @validate_call
+    def install(self, **kwargs):
+        """Ignore kwargs and just load the existing binary instead of installing."""
+        return self.load()                  # obviously it's already installed if we are running this ;)
+    
+    # @validate_call
+    def load_or_install(self, **kwargs):
+        """Ignore kwargs and just load the existing binary; no install is attempted."""
+        return self.load()                  # obviously it's already installed if we are running this ;)
+
+# module-level instance returned by the plugin's get_BINARIES hook
+PYTHON_BINARY = PythonBinary()
+
+
+LOADED_SQLITE_PATH = Path(django.db.backends.sqlite3.base.__file__)
+LOADED_SQLITE_VERSION = SemVer(django_sqlite3.version)
+LOADED_SQLITE_FROM_VENV = str(LOADED_SQLITE_PATH.absolute().resolve()).startswith(str(VENV_PIP_BINPROVIDER.pip_venv.absolute().resolve()))
+
+class SqliteBinary(Binary):
+    name: BinName = 'sqlite'
+    binproviders_supported: List[InstanceOf[BinProvider]] = Field(default=[VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER])
+    overrides: BinaryOverrides = {
+        VENV_PIP_BINPROVIDER.name: {
+            "abspath": LOADED_SQLITE_PATH if LOADED_SQLITE_FROM_VENV else None,
+            "version": LOADED_SQLITE_VERSION if LOADED_SQLITE_FROM_VENV else None,
+        },
+        SYS_PIP_BINPROVIDER.name: {
+            "abspath": LOADED_SQLITE_PATH if not LOADED_SQLITE_FROM_VENV else None,
+            "version": LOADED_SQLITE_VERSION if not LOADED_SQLITE_FROM_VENV else None,
+        },
+    }
+    
+    @model_validator(mode='after')
+    def validate_json_extension_is_available(self):
+        # Check to make sure JSON extension is available in our Sqlite3 instance
+        try:
+            cursor = django_sqlite3.connect(':memory:').cursor()
+            cursor.execute('SELECT JSON(\'{"a": "b"}\')')
+        except django_sqlite3.OperationalError as exc:
+            print(f'[red][X] Your SQLite3 version is missing the required JSON1 extension: {exc}[/red]')
+            print(
+                '[violet]Hint:[/violet] Upgrade your Python version or install the extension manually:\n' +
+                '      https://code.djangoproject.com/wiki/JSON1Extension\n'
+            )
+        return self
+    
+    # @validate_call
+    def install(self, **kwargs):
+        return self.load()                  # obviously it's already installed if we are running this ;)
+    
+    # @validate_call
+    def load_or_install(self, **kwargs):
+        return self.load()                  # obviously it's already installed if we are running this ;)
+
+SQLITE_BINARY = SqliteBinary()
+
+
+# location and version of the django package that is loaded in this process
+LOADED_DJANGO_PATH = Path(django.__file__)
+LOADED_DJANGO_VERSION = SemVer(django.VERSION[:3])
+# True when the resolved django path starts with the resolved venv path
+# (if pip_venv is None the prefix compared against is the string 'None')
+LOADED_DJANGO_FROM_VENV = str(LOADED_DJANGO_PATH.absolute().resolve()).startswith(str(VENV_PIP_BINPROVIDER.pip_venv and VENV_PIP_BINPROVIDER.pip_venv.absolute().resolve()))
+
+class DjangoBinary(Binary):
+    """Binary definition for `django`, describing the django package already loaded in this process."""
+    name: BinName = 'django'
+
+    binproviders_supported: List[InstanceOf[BinProvider]] = Field(default=[VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER])
+    # the loaded path/version is attributed to exactly one provider, depending on LOADED_DJANGO_FROM_VENV
+    overrides: BinaryOverrides = {
+        VENV_PIP_BINPROVIDER.name: {
+            "abspath": LOADED_DJANGO_PATH if LOADED_DJANGO_FROM_VENV else None,
+            "version": LOADED_DJANGO_VERSION if LOADED_DJANGO_FROM_VENV else None,
+        },
+        SYS_PIP_BINPROVIDER.name: {
+            "abspath": LOADED_DJANGO_PATH if not LOADED_DJANGO_FROM_VENV else None,
+            "version": LOADED_DJANGO_VERSION if not LOADED_DJANGO_FROM_VENV else None,
+        },
+    }
+    
+    # @validate_call
+    def install(self, **kwargs):
+        """Ignore kwargs and just load the existing binary instead of installing."""
+        return self.load()                  # obviously it's already installed if we are running this ;)
+    
+    # @validate_call
+    def load_or_install(self, **kwargs):
+        """Ignore kwargs and just load the existing binary; no install is attempted."""
+        return self.load()                  # obviously it's already installed if we are running this ;)
+
+# module-level instance returned by the plugin's get_BINARIES hook
+DJANGO_BINARY = DjangoBinary()
+
+class PipBinary(Binary):
+    """Binary definition for `pip`; it is only ever loaded, never installed."""
+    name: BinName = "pip"
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
+
+    # @validate_call
+    def install(self, **kwargs):
+        """Ignore kwargs and just load the existing binary instead of installing."""
+        return self.load()                  # obviously it's already installed if we are running this ;)
+    
+    # @validate_call
+    def load_or_install(self, **kwargs):
+        """Ignore kwargs and just load the existing binary; no install is attempted."""
+        return self.load()                  # obviously it's already installed if we are running this ;)
+
+# module-level instance returned by the plugin's get_BINARIES hook
+PIP_BINARY = PipBinary()
+
+
+class PipxBinary(Binary):
+    """Binary definition for `pipx`; unlike the classes above it keeps Binary's own install behaviour."""
+    name: BinName = "pipx"
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
+
+# module-level instance returned by the plugin's get_BINARIES hook
+PIPX_BINARY = PipxBinary()
--- a/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/binproviders.py
+++ b/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/binproviders.py
@@ -0,0 +1,91 @@
+import os
+import sys
+import site
+from pathlib import Path
+from typing import Optional
+
+from benedict import benedict
+
+from pydantic_pkgr import PipProvider, BinName, BinProviderName
+
+import abx
+
+from abx_plugin_default_binproviders import get_BINPROVIDERS
+
+DEFAULT_BINPROVIDERS = benedict(get_BINPROVIDERS())
+env = DEFAULT_BINPROVIDERS.env
+apt = DEFAULT_BINPROVIDERS.apt
+brew = DEFAULT_BINPROVIDERS.brew
+
+
+###################### Config ##########################
+
+class SystemPipBinProvider(PipProvider):
+    """pip provider registered as "sys_pip", with no venv set; it refuses to install anything."""
+    name: BinProviderName = "sys_pip"
+    INSTALLER_BIN: BinName = "pip"
+    
+    pip_venv: Optional[Path] = None        # global pip scope
+    
+    def on_install(self, bin_name: str, **kwargs):
+        """Skip installation and return an explanatory message instead."""
+        # never modify system pip packages
+        return 'refusing to install packages globally with system pip, use a venv instead'
+
+class SystemPipxBinProvider(PipProvider):
+    """pip-style provider registered as "pipx" that uses `pipx` as its installer binary."""
+    name: BinProviderName = "pipx"
+    INSTALLER_BIN: BinName = "pipx"
+    
+    pip_venv: Optional[Path] = None        # global pipx scope
+
+
+# sys.prefix differs from sys.base_prefix when the interpreter is running inside a venv
+IS_INSIDE_VENV = sys.prefix != sys.base_prefix
+
+class VenvPipBinProvider(PipProvider):
+    """pip provider registered as "venv_pip", pointing at an already-existing virtualenv."""
+    name: BinProviderName = "venv_pip"
+    INSTALLER_BIN: BinName = "pip"
+
+    # the active venv if inside one, else $VIRTUAL_ENV, else a placeholder path
+    pip_venv: Optional[Path] = Path(sys.prefix if IS_INSIDE_VENV else os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib'))
+    
+    def setup(self):
+        """never attempt to create a venv here, this is just used to detect if we are inside an existing one"""
+        return None
+    
+
+class LibPipBinProvider(PipProvider):
+    name: BinProviderName = "lib_pip"
+    INSTALLER_BIN: BinName = "pip"
+    
+    pip_venv: Optional[Path] = Path('/usr/local/share/abx/pip/venv')
+    
+    def setup(self) -> None:
+        # update venv path to match most up-to-date LIB_DIR based on runtime config
+        LIB_DIR = abx.pm.hook.get_LIB_DIR()
+        self.pip_venv = LIB_DIR / 'pip' / 'venv'
+        super().setup()
+
+SYS_PIP_BINPROVIDER = SystemPipBinProvider()
+PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
+VENV_PIP_BINPROVIDER = VenvPipBinProvider()
+LIB_PIP_BINPROVIDER = LibPipBinProvider()
+pip = LIB_PIP_BINPROVIDER
+
+# ensure python libraries are importable from these locations (if archivebox wasnt executed from one of these then they wont already be in sys.path)
+assert VENV_PIP_BINPROVIDER.pip_venv is not None
+assert LIB_PIP_BINPROVIDER.pip_venv is not None
+
+major, minor, patch = sys.version_info[:3]
+site_packages_dir = f'lib/python{major}.{minor}/site-packages'
+
+LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
+VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
+USER_SITE_PACKAGES = site.getusersitepackages()
+SYS_SITE_PACKAGES = site.getsitepackages()
+
+ALL_SITE_PACKAGES = (
+    *LIB_SITE_PACKAGES,
+    *VENV_SITE_PACKAGES,
+    *USER_SITE_PACKAGES,
+    *SYS_SITE_PACKAGES,
+)
+for site_packages_dir in ALL_SITE_PACKAGES:
+    if site_packages_dir not in sys.path:
+        sys.path.append(str(site_packages_dir))
--- a/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/config.py
+++ b/archivebox/pkgs/abx-plugin-pip/abx_plugin_pip/config.py
@@ -0,0 +1,16 @@
+__package__ = 'pip'
+
+from typing import List, Optional
+from pydantic import Field
+
+from abx_spec_config.base_configset import BaseConfigSet
+
+
+class PipDependencyConfigs(BaseConfigSet):
+    """Config options for the pip plugin."""
+    USE_PIP: bool = True
+    PIP_BINARY: str = Field(default='pip')
+    # NOTE(review): how ARGS / EXTRA_ARGS / DEFAULT_ARGS are combined is not visible here - confirm against the consumer
+    PIP_ARGS: Optional[List[str]] = Field(default=None)
+    PIP_EXTRA_ARGS: List[str] = []
+    PIP_DEFAULT_ARGS: List[str] = []
+    
+# module-level instance returned by the plugin's get_CONFIG hook
+PIP_CONFIG = PipDependencyConfigs()
--- a/archivebox/pkgs/abx-plugin-pip/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-pip/pyproject.toml
@@ -0,0 +1,22 @@
+[project]
+name = "abx-plugin-pip"
+version = "2024.10.24"
+description = "pip binary provider plugin for ABX"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "pydantic-pkgr>=0.5.4",
+    "abx-spec-config>=0.1.0",
+    "abx-spec-pydantic-pkgr>=0.1.0",
+    "abx-plugin-default-binproviders>=2024.10.24",
+    "django>=5.0.0",
+]
+
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_pip = "abx_plugin_pip"
--- a/archivebox/pkgs/abx-plugin-playwright/README.md
+++ b/archivebox/pkgs/abx-plugin-playwright/README.md
--- a/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/init.py
+++ b/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/init.py
@@ -0,0 +1,28 @@
+__label__ = 'Playwright'
+__homepage__ = 'https://github.com/microsoft/playwright-python'
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import PLAYWRIGHT_CONFIG
+    return {
+        'PLAYWRIGHT_CONFIG': PLAYWRIGHT_CONFIG
+    }
+
+@abx.hookimpl
+def get_BINARIES():
+    from .binaries import PLAYWRIGHT_BINARY
+    
+    return {
+        'playwright': PLAYWRIGHT_BINARY,
+    }
+
+@abx.hookimpl
+def get_BINPROVIDERS():
+    from .binproviders import PLAYWRIGHT_BINPROVIDER
+    
+    return {
+        'playwright': PLAYWRIGHT_BINPROVIDER,
+    }
--- a/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/binaries.py
+++ b/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/binaries.py
@@ -0,0 +1,21 @@
+__package__ = 'abx_plugin_playwright'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinName, BinProvider, Binary
+
+
+from abx_plugin_pip.binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER
+from abx_plugin_default_binproviders import env
+
+from .config import PLAYWRIGHT_CONFIG
+
+
+class PlaywrightBinary(Binary):
+    """Binary definition for the playwright CLI, named by PLAYWRIGHT_CONFIG.PLAYWRIGHT_BINARY."""
+    name: BinName = PLAYWRIGHT_CONFIG.PLAYWRIGHT_BINARY
+
+    # providers are listed lib pip first, then venv pip, system pip, and env
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env]
+    
+
+# module-level instance returned by the plugin's get_BINARIES hook
+PLAYWRIGHT_BINARY = PlaywrightBinary()
--- a/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/binproviders.py
+++ b/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/binproviders.py
@@ -0,0 +1,163 @@
+__package__ = 'abx_plugin_playwright'
+
+import os
+import shutil
+import platform
+from pathlib import Path
+from typing import List, Optional, Dict, ClassVar
+
+from pydantic import computed_field, Field
+from pydantic_pkgr import (
+    BinName,
+    BinProvider,
+    BinProviderName,
+    BinProviderOverrides,
+    InstallArgs,
+    PATHStr,
+    HostBinPath,
+    bin_abspath,
+    OPERATING_SYSTEM,
+    DEFAULT_ENV_PATH,
+)
+
+import abx
+
+
+from .binaries import PLAYWRIGHT_BINARY
+
+
+MACOS_PLAYWRIGHT_CACHE_DIR: Path = Path("~/Library/Caches/ms-playwright")
+LINUX_PLAYWRIGHT_CACHE_DIR: Path = Path("~/.cache/ms-playwright")
+
+
+class PlaywrightBinProvider(BinProvider):
+    name: BinProviderName = "playwright"
+    INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name
+
+    PATH: PATHStr = f"{Path('/usr/share/abx') / 'bin'}:{DEFAULT_ENV_PATH}"
+
+    playwright_browsers_dir: Path = (
+        MACOS_PLAYWRIGHT_CACHE_DIR.expanduser()
+        if OPERATING_SYSTEM == "darwin" else
+        LINUX_PLAYWRIGHT_CACHE_DIR.expanduser()
+    )
+    playwright_install_args: List[str] = ["install"]
+
+    packages_handler: BinProviderOverrides = Field(default={
+        "chrome": ["chromium"],
+    }, exclude=True)
+
+    _browser_abspaths: ClassVar[Dict[str, HostBinPath]] = {}
+
+    @computed_field
+    @property
+    def INSTALLER_BIN_ABSPATH(self) -> HostBinPath | None:
+        try:
+            return PLAYWRIGHT_BINARY.load().abspath
+        except Exception as e:
+            return None
+
+    def setup(self) -> None:
+        # update paths from config at runtime
+        LIB_DIR = abx.pm.hook.get_LIB_DIR()
+        
+        self.PATH = f"{LIB_DIR / 'bin'}:{DEFAULT_ENV_PATH}"
+
+        assert shutil.which('pip'), "Pip bin provider not initialized"
+
+        if self.playwright_browsers_dir:
+            self.playwright_browsers_dir.mkdir(parents=True, exist_ok=True)
+
+    def installed_browser_bins(self, browser_name: str = "*") -> List[Path]:
+        if browser_name == 'chrome':
+            browser_name = 'chromium'
+        
+        # if on macOS, browser binary is inside a .app, otherwise it's just a plain binary
+        if platform.system().lower() == "darwin":
+            # ~/Library/caches/ms-playwright/chromium-1097/chrome-mac/Chromium.app/Contents/MacOS/Chromium
+            return sorted(
+                self.playwright_browsers_dir.glob(
+                    f"{browser_name}-*/*-mac*/*.app/Contents/MacOS/*"
+                )
+            )
+
+        # ~/Library/caches/ms-playwright/chromium-1097/chrome-linux/chromium
+        paths = []
+        for path in sorted(self.playwright_browsers_dir.glob(f"{browser_name}-*/*-linux/*")):
+            if 'xdg-settings' in str(path):
+                continue
+            if 'ffmpeg' in str(path):
+                continue
+            if '/chrom' in str(path) and 'chrom' in path.name.lower():
+                paths.append(path)
+        return paths
+
+    def default_abspath_handler(self, bin_name: BinName, **context) -> Optional[HostBinPath]:
+        assert bin_name == "chrome", "Only chrome is supported using the @puppeteer/browsers install method currently."
+
+        # already loaded, return abspath from cache
+        if bin_name in self._browser_abspaths:
+            return self._browser_abspaths[bin_name]
+
+        # first time loading, find browser in self.playwright_browsers_dir by searching filesystem for installed binaries
+        matching_bins = [abspath for abspath in self.installed_browser_bins() if bin_name in str(abspath)]
+        if matching_bins:
+            newest_bin = matching_bins[-1]  # already sorted alphabetically, last should theoretically be highest version number
+            self._browser_abspaths[bin_name] = newest_bin
+            return self._browser_abspaths[bin_name]
+        
+        # playwright sometimes installs google-chrome-stable via apt into system $PATH, check there as well
+        abspath = bin_abspath('google-chrome-stable', PATH=env.PATH)
+        if abspath:
+            self._browser_abspaths[bin_name] = abspath
+            return self._browser_abspaths[bin_name]
+
+        return None
+
+    def default_install_handler(self, bin_name: str, packages: Optional[InstallArgs] = None, **context) -> str:
+        """playwright install chrome"""
+        self.setup()
+        assert bin_name == "chrome", "Only chrome is supported using the playwright install method currently."
+
+        if not self.INSTALLER_BIN_ABSPATH:
+            raise Exception(
+                f"{self.__class__.__name__} install method is not available on this host ({self.INSTALLER_BIN} not found in $PATH)"
+            )
+        packages = packages or self.get_packages(bin_name)
+
+        # print(f'[*] {self.__class__.__name__}: Installing {bin_name}: {self.INSTALLER_BIN_ABSPATH} install {packages}')
+
+
+        # playwright install-deps (to install system dependencies like fonts, graphics libraries, etc.)
+        if platform.system().lower() != 'darwin':
+            # libglib2.0-0, libnss3, libnspr4, libdbus-1-3, libatk1.0-0, libatk-bridge2.0-0, libcups2, libdrm2, libxcb1, libxkbcommon0, libatspi2.0-0, libx11-6, libxcomposite1, libxdamage1, libxext6, libxfixes3, libxrandr2, libgbm1, libcairo2, libasound2
+            proc = self.exec(bin_name=self.INSTALLER_BIN_ABSPATH, cmd=['install-deps'])
+            if proc.returncode != 0:
+                print(proc.stdout.strip())
+                print(proc.stderr.strip())
+
+        proc = self.exec(bin_name=self.INSTALLER_BIN_ABSPATH, cmd=['install', *packages])
+
+        if proc.returncode != 0:
+            print(proc.stdout.strip())
+            print(proc.stderr.strip())
+            raise Exception(f"{self.__class__.__name__}: install got returncode {proc.returncode} while installing {packages}: {packages} PACKAGES={packages}")
+
+        # chrome@129.0.6668.58 /data/lib/browsers/chrome/mac_arm-129.0.6668.58/chrome-mac-arm64/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing
+        # playwright build v1010 downloaded to /home/squash/.cache/ms-playwright/ffmpeg-1010
+        output_lines = [
+            line for line in proc.stdout.strip().split('\n')
+            if '/chrom' in line
+            and 'chrom' in line.rsplit('/', 1)[-1].lower()   # if final path segment (filename) contains chrome or chromium
+            and 'xdg-settings' not in line
+            and 'ffmpeg' not in line
+        ]
+        if output_lines:
+            relpath = output_lines[0].split(str(self.playwright_browsers_dir))[-1]
+            abspath = self.playwright_browsers_dir / relpath
+            if os.path.isfile(abspath) and os.access(abspath, os.X_OK):
+                self._browser_abspaths[bin_name] = abspath
+        
+        return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip()
+
+PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider()
--- a/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/config.py
+++ b/archivebox/pkgs/abx-plugin-playwright/abx_plugin_playwright/config.py
@@ -0,0 +1,7 @@
+from abx_spec_config import BaseConfigSet
+
+class PlaywrightConfigs(BaseConfigSet):
+    """Config set for the playwright plugin."""
+
+    # presumably the name (or path) of the playwright executable -- TODO confirm how it is resolved
+    PLAYWRIGHT_BINARY: str = 'playwright'
+
+
+# module-level config instance, created when this module is imported
+PLAYWRIGHT_CONFIG = PlaywrightConfigs()
--- a/archivebox/pkgs/abx-plugin-playwright/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-playwright/pyproject.toml
@@ -0,0 +1,20 @@
+[project]
+name = "abx-plugin-playwright"
+version = "2024.10.28"
+description = "abx plugin providing Playwright config and a binprovider for installing browsers"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "pydantic>=2.4.2",
+    "pydantic-pkgr>=0.5.4",
+    "abx-spec-pydantic-pkgr>=0.1.0",
+    "abx-spec-config>=0.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_playwright = "abx_plugin_playwright"
--- a/archivebox/pkgs/abx-plugin-pocket/README.md
+++ b/archivebox/pkgs/abx-plugin-pocket/README.md
--- a/archivebox/pkgs/abx-plugin-pocket/abx_plugin_pocket/init.py
+++ b/archivebox/pkgs/abx-plugin-pocket/abx_plugin_pocket/init.py
@@ -0,0 +1,18 @@
+__package__ = 'abx_plugin_pocket'
+__label__ = 'Pocket'
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import POCKET_CONFIG
+    
+    return {
+        'POCKET_CONFIG': POCKET_CONFIG
+    }
+
+@abx.hookimpl
+def ready():
+    from .config import POCKET_CONFIG
+    POCKET_CONFIG.validate()
--- a/archivebox/pkgs/abx-plugin-pocket/abx_plugin_pocket/config.py
+++ b/archivebox/pkgs/abx-plugin-pocket/abx_plugin_pocket/config.py
@@ -0,0 +1,15 @@
+__package__ = 'abx_plugin_pocket'
+
+from typing import Dict
+
+from pydantic import Field
+
+from abx_spec_config.base_configset import BaseConfigSet
+
+
+class PocketConfig(BaseConfigSet):
+    """Config set holding the Pocket consumer key and per-user access tokens."""
+
+    # None when no consumer key has been configured
+    POCKET_CONSUMER_KEY: str | None                   = Field(default=None)
+    # NOTE(review): the default is a lambda passed via `default=` (not `default_factory=`);
+    # presumably BaseConfigSet resolves callable defaults -- confirm
+    POCKET_ACCESS_TOKENS: Dict[str, str]              = Field(default=lambda: {})   # {<username>: <access_token>, ...}
+
+
+# module-level config instance, created when this module is imported
+POCKET_CONFIG = PocketConfig()
--- a/archivebox/pkgs/abx-plugin-pocket/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-pocket/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "abx-plugin-pocket"
+version = "2024.10.28"
+description = "abx plugin providing Pocket API config (consumer key and access tokens)"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "pocket>=0.3.6",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_pocket = "abx_plugin_pocket"
--- a/archivebox/pkgs/abx-plugin-puppeteer/README.md
+++ b/archivebox/pkgs/abx-plugin-puppeteer/README.md
--- a/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/init.py
+++ b/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/init.py
@@ -0,0 +1,30 @@
+__package__ = 'abx_plugin_puppeteer'
+__label__ = 'Puppeteer'
+__homepage__ = 'https://github.com/puppeteer/puppeteer'
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import PUPPETEER_CONFIG
+    
+    return {
+        'PUPPETEER_CONFIG': PUPPETEER_CONFIG
+    }
+
+@abx.hookimpl
+def get_BINARIES():
+    from .binaries import PUPPETEER_BINARY
+    
+    return {
+        'puppeteer': PUPPETEER_BINARY,
+    }
+
+@abx.hookimpl
+def get_BINPROVIDERS():
+    from .binproviders import PUPPETEER_BINPROVIDER
+    
+    return {
+        'puppeteer': PUPPETEER_BINPROVIDER,
+    }
--- a/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/binaries.py
+++ b/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/binaries.py
@@ -0,0 +1,23 @@
+__package__ = 'abx_plugin_puppeteer'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinProvider, BinName, Binary
+
+
+from abx_plugin_default_binproviders import env
+
+from abx_plugin_npm.binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
+
+
+###################### Config ##########################
+
+
+class PuppeteerBinary(Binary):
+    """Binary definition for the `puppeteer` executable."""
+
+    name: BinName = "puppeteer"
+
+    # providers that may supply this binary; presumably tried in list order -- TODO confirm against pydantic_pkgr
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+
+# module-level instance, created when this module is imported
+PUPPETEER_BINARY = PuppeteerBinary()
--- a/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/binproviders.py
+++ b/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/binproviders.py
@@ -0,0 +1,131 @@
+import os
+import platform
+from pathlib import Path
+from typing import List, Optional, Dict, ClassVar
+
+from pydantic import Field
+from pydantic_pkgr import (
+    BinProvider,
+    BinName,
+    BinProviderName,
+    BinProviderOverrides,
+    InstallArgs,
+    PATHStr,
+    HostBinPath,
+)
+
+import abx
+
+from archivebox.config import CONSTANTS
+from archivebox.config.permissions import ARCHIVEBOX_USER
+
+from abx_plugin_npm.binproviders import SYS_NPM_BINPROVIDER
+
+
+class PuppeteerBinProvider(BinProvider):
+    name: BinProviderName = "puppeteer"
+    INSTALLER_BIN: BinName = "npx"
+
+    PATH: PATHStr = str(CONSTANTS.DEFAULT_LIB_DIR / 'bin')
+    
+    euid: Optional[int] = ARCHIVEBOX_USER
+
+    puppeteer_browsers_dir: Path = CONSTANTS.DEFAULT_LIB_DIR / 'browsers'
+    puppeteer_install_args: List[str] = ['--yes', "@puppeteer/browsers", "install"]
+
+    packages_handler: BinProviderOverrides = Field(default={
+        "chrome": lambda:
+            ['chrome@stable'],
+    }, exclude=True)
+    
+    _browser_abspaths: ClassVar[Dict[str, HostBinPath]] = {}
+    
+    def setup(self) -> None:
+        # update paths from config, don't do this lazily because we dont want to import archivebox.config.common at import-time
+        # we want to avoid depending on archivebox from abx code if at all possible
+        LIB_DIR = abx.pm.hook.get_LIB_DIR()
+        BIN_DIR = abx.pm.hook.get_BIN_DIR()
+        self.puppeteer_browsers_dir = LIB_DIR / 'browsers'
+        self.PATH = str(BIN_DIR)
+        
+        assert SYS_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH, "NPM bin provider not initialized"
+        
+        if self.puppeteer_browsers_dir:
+            self.puppeteer_browsers_dir.mkdir(parents=True, exist_ok=True)
+    
+    def installed_browser_bins(self, browser_name: str='*') -> List[Path]:
+        # if on macOS, browser binary is inside a .app, otherwise it's just a plain binary
+        if platform.system().lower() == 'darwin':
+            # /data/lib/browsers/chrome/mac_arm-129.0.6668.58/chrome-mac-arm64/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing
+            return sorted(self.puppeteer_browsers_dir.glob(f'{browser_name}/mac*/chrome*/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing'))
+
+        # /data/lib/browsers/chrome/linux-131.0.6730.0/chrome-linux64/chrome
+        # /data/lib/aarch64-linux/browsers/chrome/linux-129.0.6668.100/chrome-linux64/chrome
+        return sorted(self.puppeteer_browsers_dir.glob(f"{browser_name}/linux*/chrome*/chrome"))
+
+    def default_abspath_handler(self, bin_name: BinName, **context) -> Optional[HostBinPath]:
+        assert bin_name == 'chrome', 'Only chrome is supported using the @puppeteer/browsers install method currently.'
+        
+        # already loaded, return abspath from cache
+        if bin_name in self._browser_abspaths:
+            return self._browser_abspaths[bin_name]
+        
+        # first time loading, find browser in self.puppeteer_browsers_dir by searching filesystem for installed binaries
+        matching_bins = [abspath for abspath in self.installed_browser_bins() if bin_name in str(abspath)]
+        if matching_bins:
+            newest_bin = matching_bins[-1]  # already sorted alphabetically, last should theoretically be highest version number
+            self._browser_abspaths[bin_name] = newest_bin
+            return newest_bin
+        
+        return None
+
+    def default_install_handler(self, bin_name: str, packages: Optional[InstallArgs] = None, **context) -> str:
+        """npx @puppeteer/browsers install chrome@stable"""
+        self.setup()
+        assert bin_name == 'chrome', 'Only chrome is supported using the @puppeteer/browsers install method currently.'
+
+        if not self.INSTALLER_BIN_ABSPATH:
+            raise Exception(
+                f"{self.__class__.__name__} install method is not available on this host ({self.INSTALLER_BIN} not found in $PATH)"
+            )
+        packages = packages or self.get_packages(bin_name)
+        assert packages, f"No packages specified for installation of {bin_name}"
+
+        # print(f'[*] {self.__class__.__name__}: Installing {bin_name}: {self.INSTALLER_BIN_ABSPATH} install {packages}')
+
+        install_args = [*self.puppeteer_install_args, "--path", str(self.puppeteer_browsers_dir)]
+
+        proc = self.exec(bin_name=self.INSTALLER_BIN_ABSPATH, cmd=[*install_args, *packages])
+
+        if proc.returncode != 0:
+            print(proc.stdout.strip())
+            print(proc.stderr.strip())
+            raise Exception(f"{self.__class__.__name__}: install got returncode {proc.returncode} while installing {packages}: {packages}")
+
+        # chrome@129.0.6668.91 /tmp/test3/lib/x86_64-linux/browsers/chrome/linux-129.0.6668.91/chrome-linux64/chrome
+        # chrome@129.0.6668.58 /data/lib/browsers/chrome/mac_arm-129.0.6668.58/chrome-mac-arm64/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing
+        # /data/lib/aarch64-linux/browsers/chrome/linux-129.0.6668.100/chrome-linux64/chrome
+        relpath = proc.stdout.strip().split(str(self.puppeteer_browsers_dir))[-1].split('\n', 1)[0]
+        abspath = self.puppeteer_browsers_dir / relpath
+        
+        if os.path.isfile(abspath) and os.access(abspath, os.X_OK):
+            self._browser_abspaths[bin_name] = abspath
+            return abspath
+
+        return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip()
+
+PUPPETEER_BINPROVIDER = PuppeteerBinProvider()
+
+
+# ALTERNATIVE INSTALL METHOD using Ansible:
+# install_playbook = self.plugin_dir / 'install_puppeteer.yml'
+# chrome_bin = run_playbook(install_playbook, data_dir=DATA_DIR, quiet=quiet).BINARIES.chrome
+# return self.__class__.model_validate(
+#     {
+#         **self.model_dump(),
+#         "loaded_abspath": chrome_bin.symlink,
+#         "loaded_version": chrome_bin.version,
+#         "loaded_binprovider": env,
+#         "binproviders_supported": self.binproviders_supported,
+#     }
+# )
--- a/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/config.py
+++ b/archivebox/pkgs/abx-plugin-puppeteer/abx_plugin_puppeteer/config.py
@@ -0,0 +1,18 @@
+__package__ = 'abx_plugin_puppeteer'
+
+
+from abx_spec_config.base_configset import BaseConfigSet
+
+
+###################### Config ##########################
+
+
+class PuppeteerConfig(BaseConfigSet):
+    """Config set for the puppeteer plugin; PUPPETEER_BINARY is the only active option."""
+
+    PUPPETEER_BINARY: str = 'puppeteer'
+    # the options below are commented out and therefore not part of the config set
+    # PUPPETEER_ARGS: Optional[List[str]] = Field(default=None)
+    # PUPPETEER_EXTRA_ARGS: List[str] = []
+    # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']
+    pass
+
+
+# module-level config instance, created when this module is imported
+PUPPETEER_CONFIG = PuppeteerConfig()
--- a/archivebox/pkgs/abx-plugin-puppeteer/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-puppeteer/pyproject.toml
@@ -0,0 +1,19 @@
+[project]
+name = "abx-plugin-puppeteer"
+version = "2024.10.28"
+description = "Add your description here"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-spec-pydantic-pkgr>=0.1.0",
+    "pydantic-pkgr>=0.5.4",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_puppeteer = "abx_plugin_puppeteer"
--- a/archivebox/pkgs/abx-plugin-readability/README.md
+++ b/archivebox/pkgs/abx-plugin-readability/README.md
--- a/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/init.py
+++ b/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/init.py
@@ -0,0 +1,30 @@
+__package__ = 'abx_plugin_readability'
+__label__ = 'Readability'
+__homepage__ = 'https://github.com/ArchiveBox/readability-extractor'
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import READABILITY_CONFIG
+    
+    return {
+        'READABILITY_CONFIG': READABILITY_CONFIG
+    }
+
+@abx.hookimpl
+def get_BINARIES():
+    from .binaries import READABILITY_BINARY
+    
+    return {
+        'readability': READABILITY_BINARY,
+    }
+
+@abx.hookimpl
+def get_EXTRACTORS():
+    """abx hook: return the extractors this plugin provides, keyed by name."""
+    # NOTE(review): extractors.py in this package is currently entirely commented out, so
+    # READABILITY_EXTRACTOR is not defined there and this import looks like it will raise
+    # ImportError when the hook is called -- confirm before relying on this hook.
+    from .extractors import READABILITY_EXTRACTOR
+    
+    return {
+        'readability': READABILITY_EXTRACTOR,
+    }
--- a/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/binaries.py
+++ b/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/binaries.py
@@ -0,0 +1,26 @@
+__package__ = 'abx_plugin_readability'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import Binary, BinProvider, BinaryOverrides, BinName
+
+from abx_plugin_default_binproviders import env
+from abx_plugin_npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
+
+from .config import READABILITY_CONFIG
+
+
+READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'
+
+class ReadabilityBinary(Binary):
+    """Binary definition for the readability extractor, installed from READABILITY_PACKAGE_NAME via npm."""
+
+    # binary name comes from config at class-definition time (i.e. when this module is imported)
+    name: BinName = READABILITY_CONFIG.READABILITY_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+    # per-provider overrides: both npm providers use the same package spec,
+    # but the system npm provider's install handler is replaced with a no-op
+    overrides: BinaryOverrides = {
+        LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]},
+        SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None},    # prevent modifying system global npm packages
+    }
+
+
+# module-level instance, created when this module is imported
+READABILITY_BINARY = ReadabilityBinary()
--- a/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/config.py
+++ b/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/config.py
@@ -0,0 +1,17 @@
+from pydantic import Field
+
+from abx_spec_config.base_configset import BaseConfigSet
+
+from archivebox.config.common import ARCHIVING_CONFIG
+
+
+class ReadabilityConfig(BaseConfigSet):
+    """Config set for the readability extractor."""
+
+    # enable/disable flag returned by should_save_readability(); also declared with the alias USE_READABILITY
+    SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')
+
+    # NOTE(review): the default is a lambda passed via `default=`; presumably BaseConfigSet
+    # resolves it to ARCHIVING_CONFIG.TIMEOUT -- confirm
+    READABILITY_TIMEOUT: int                 = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
+
+    READABILITY_BINARY: str = Field(default='readability-extractor')
+    # READABILITY_EXTRA_ARGS: List[str] = []                                # readability-extractor doesn't take any extra args
+
+
+# module-level config instance, created when this module is imported
+READABILITY_CONFIG = ReadabilityConfig()
--- a/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/extractors.py
+++ b/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/extractors.py
@@ -0,0 +1,19 @@
+# __package__ = 'abx_plugin_readability'
+
+# from pathlib import Path
+
+# from pydantic_pkgr import BinName
+
+
+# from .binaries import READABILITY_BINARY
+
+
+# class ReadabilityExtractor(BaseExtractor):
+#     name: str = 'readability'
+#     binary: BinName = READABILITY_BINARY.name
+
+#     def get_output_path(self, snapshot) -> Path:
+#         return Path(snapshot.link_dir) / 'readability' / 'content.html'
+
+
+# READABILITY_EXTRACTOR = ReadabilityExtractor()
--- a/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/readability.py
+++ b/archivebox/pkgs/abx-plugin-readability/abx_plugin_readability/readability.py
@@ -0,0 +1,118 @@
+__package__ = 'archivebox.extractors'
+
+from pathlib import Path
+from tempfile import NamedTemporaryFile
+
+from typing import Optional
+import json
+
+from archivebox.misc.system import run, atomic_write
+from archivebox.misc.util import enforce_types, is_static_file
+from ..index.schema import Link, ArchiveResult, ArchiveError
+from ..logging_util import TimedProgress
+from .title import get_html
+
+from plugins_extractor.readability.config import READABILITY_CONFIG
+from plugins_extractor.readability.binaries import READABILITY_BINARY
+
+
+def get_output_path():
+    """Return the readability output subdirectory (with a trailing slash), relative to the snapshot dir."""
+    return 'readability/'
+
+def get_embed_path(archiveresult=None):
+    return get_output_path() + 'content.html'
+
+
+@enforce_types
+def should_save_readability(link: Link, out_dir: Optional[str]=None, overwrite: Optional[bool]=False) -> bool:
+    
+    if is_static_file(link.url):
+        return False
+
+    output_subdir = (Path(out_dir or link.link_dir) / get_output_path())
+    if not overwrite and output_subdir.exists():
+        return False
+
+    return READABILITY_CONFIG.SAVE_READABILITY
+
+
+@enforce_types
+def save_readability(link: Link, out_dir: Optional[str]=None, timeout: int=0) -> ArchiveResult:
+    """download reader friendly version using @mozilla/readability"""
+    
+    READABILITY_BIN = READABILITY_BINARY.load()
+    assert READABILITY_BIN.abspath and READABILITY_BIN.version
+
+    timeout = timeout or READABILITY_CONFIG.READABILITY_TIMEOUT
+    output_subdir = Path(out_dir or link.link_dir).absolute() / get_output_path()
+    output = get_output_path()
+
+    # Readability Docs: https://github.com/mozilla/readability
+
+    status = 'succeeded'
+    # fake command to show the user so they have something to try debugging if get_html fails
+    cmd = [
+        str(READABILITY_BIN.abspath),
+        '{dom,singlefile}.html',
+        link.url,
+    ]
+    readability_content = None
+    timer = TimedProgress(timeout, prefix='      ')
+    try:
+        document = get_html(link, Path(out_dir or link.link_dir))
+        temp_doc = NamedTemporaryFile(delete=False)
+        temp_doc.write(document.encode("utf-8"))
+        temp_doc.close()
+
+        if not document or len(document) < 10:
+            raise ArchiveError('Readability could not find HTML to parse for article text')
+
+        cmd = [
+            str(READABILITY_BIN.abspath),
+            temp_doc.name,
+            link.url,
+        ]
+        result = run(cmd, cwd=out_dir, timeout=timeout, text=True)
+        try:
+            result_json = json.loads(result.stdout)
+            assert result_json and 'content' in result_json, 'Readability output is not valid JSON'
+        except json.JSONDecodeError:
+            raise ArchiveError('Readability was not able to archive the page (invalid JSON)', result.stdout + result.stderr)
+
+        output_subdir.mkdir(exist_ok=True)
+        readability_content = result_json.pop("textContent") 
+        atomic_write(str(output_subdir / "content.html"), result_json.pop("content"))
+        atomic_write(str(output_subdir / "content.txt"), readability_content)
+        atomic_write(str(output_subdir / "article.json"), result_json)
+
+        output_tail = [
+            line.strip()
+            for line in (result.stdout + result.stderr).rsplit('\n', 5)[-5:]
+            if line.strip()
+        ]
+        hints = (
+            'Got readability response code: {}.'.format(result.returncode),
+            *output_tail,
+        )
+
+        # Check for common failure cases
+        if (result.returncode > 0):
+            raise ArchiveError(f'Readability was not able to archive the page (status={result.returncode})', hints)
+    except (Exception, OSError) as err:
+        status = 'failed'
+        output = err
+
+        # prefer Chrome dom output to singlefile because singlefile often contains huge url(data:image/...base64) strings that make the html too long to parse with readability
+        cmd = [cmd[0], './{dom,singlefile}.html']
+    finally:
+        timer.end()
+
+    return ArchiveResult(
+        cmd=cmd,
+        pwd=str(out_dir),
+        cmd_version=str(READABILITY_BIN.version),
+        output=output,
+        status=status,
+        index_texts=[readability_content] if readability_content else [],
+        **timer.stats,  
+    )
--- a/archivebox/pkgs/abx-plugin-readability/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-readability/pyproject.toml
@@ -0,0 +1,17 @@
+[project]
+name = "abx-plugin-readability"
+version = "2024.10.28"
+description = "abx plugin for extracting article text using readability-extractor"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_readability = "abx_plugin_readability"
--- a/archivebox/pkgs/abx-plugin-readwise/README.md
+++ b/archivebox/pkgs/abx-plugin-readwise/README.md
--- a/archivebox/pkgs/abx-plugin-readwise/abx_plugin_readwise.py
+++ b/archivebox/pkgs/abx-plugin-readwise/abx_plugin_readwise.py
@@ -0,0 +1,35 @@
+__package__ = 'abx_plugin_readwise_extractor'
+__id__ = 'abx_plugin_readwise_extractor'
+__label__ = 'Readwise API'
+__version__ = '2024.10.27'
+__author__ = 'ArchiveBox'
+__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/dev/archivebox/plugins_extractor/readwise'
+__dependencies__ = []
+
+import abx
+
+from typing import Dict
+from pathlib import Path
+
+from pydantic import Field
+
+from abx_spec_config.base_configset import BaseConfigSet
+
+SOURCES_DIR = abx.pm.hook.get_CONFIG().SOURCES_DIR
+
+
+class ReadwiseConfig(BaseConfigSet):
+    """Config set for the Readwise plugin: database path and per-user reader tokens."""
+
+    # defaults to a file inside SOURCES_DIR, which is read from the abx get_CONFIG hook when this module is imported
+    READWISE_DB_PATH: Path                  = Field(default=SOURCES_DIR / "readwise_reader_api.db")
+    # NOTE(review): the default is a lambda passed via `default=` (not `default_factory=`);
+    # presumably BaseConfigSet resolves callable defaults -- confirm
+    READWISE_READER_TOKENS: Dict[str, str]  = Field(default=lambda: {})   # {<username>: <access_token>, ...}
+
+
+@abx.hookimpl
+def get_CONFIG():
+    return {
+        __id__: ReadwiseConfig()
+    }
+
+@abx.hookimpl
+def ready():
+    READWISE_CONFIG = abx.pm.hook.get_CONFIG()[__id__]
+    READWISE_CONFIG.validate()
--- a/archivebox/pkgs/abx-plugin-readwise/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-readwise/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "abx-plugin-readwise"
+version = "2024.10.28"
+description = "Readwise API Extractor"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_readwise = "abx_plugin_readwise"
+
--- a/archivebox/pkgs/abx-plugin-ripgrep-search/README.md
+++ b/archivebox/pkgs/abx-plugin-ripgrep-search/README.md
--- a/archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/init.py
+++ b/archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/init.py
@@ -0,0 +1,31 @@
+__package__ = 'abx_plugin_ripgrep_search'
+__label__ = 'Ripgrep Search'
+__homepage__ = 'https://github.com/BurntSushi/ripgrep'
+
+import abx
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import RIPGREP_CONFIG
+    
+    return {
+        'RIPGREP_CONFIG': RIPGREP_CONFIG
+    }
+
+
+@abx.hookimpl
+def get_BINARIES():
+    from .binaries import RIPGREP_BINARY
+    
+    return {
+        'ripgrep': RIPGREP_BINARY
+    }
+
+
+@abx.hookimpl
+def get_SEARCHBACKENDS():
+    from .searchbackend import RIPGREP_SEARCH_BACKEND
+    
+    return {
+        'ripgrep': RIPGREP_SEARCH_BACKEND,
+    }
--- a/archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/binaries.py
+++ b/archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/binaries.py
@@ -0,0 +1,23 @@
+__package__ = 'abx_plugin_ripgrep_search'
+
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, Binary
+
+from abx_plugin_default_binproviders import apt, brew, env
+
+
+from .config import RIPGREP_CONFIG
+
+
+class RipgrepBinary(Binary):
+    """Binary definition for ripgrep, available via apt, brew, or the existing environment."""
+
+    # binary name comes from config at class-definition time (i.e. when this module is imported)
+    name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
+
+    # the package is named 'ripgrep' for both apt and brew, while the binary name defaults to 'rg' in RipgrepConfig
+    overrides: BinaryOverrides = {
+        apt.name: {'packages': ['ripgrep']},
+        brew.name: {'packages': ['ripgrep']},
+    }
+
+# module-level instance, created when this module is imported
+RIPGREP_BINARY = RipgrepBinary()
--- a/archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/config.py
+++ b/archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/config.py
@@ -0,0 +1,29 @@
+__package__ = 'abx_plugin_ripgrep_search'
+
+from pathlib import Path
+from typing import List
+
+from pydantic import Field
+
+from abx_spec_config.base_configset import BaseConfigSet
+
+from archivebox.config import CONSTANTS
+from archivebox.config.common import SEARCH_BACKEND_CONFIG
+
+
+class RipgrepConfig(BaseConfigSet):
+    """Config set for the ripgrep search backend."""
+
+    RIPGREP_BINARY: str = Field(default='rg')
+    
+    # comma-separated extensions, interpolated into the --type-add=ignore glob below
+    RIPGREP_IGNORE_EXTENSIONS: str = Field(default='css,js,orig,svg')
+    # NOTE(review): the default is a callable taking the config object `c`;
+    # presumably BaseConfigSet resolves it -- confirm
+    RIPGREP_ARGS_DEFAULT: List[str] = Field(default=lambda c: [
+        # https://github.com/BurntSushi/ripgrep/blob/master/GUIDE.md
+        f'--type-add=ignore:*.{{{c.RIPGREP_IGNORE_EXTENSIONS}}}',
+        '--type-not=ignore',
+        '--ignore-case',
+        '--files-with-matches',
+        # must stay last: the search backend appends the search text directly after these args
+        '--regexp',
+    ])
+    RIPGREP_SEARCH_DIR: Path = CONSTANTS.ARCHIVE_DIR
+    RIPGREP_TIMEOUT: int = Field(default=lambda: SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT)
+
+# module-level config instance, created when this module is imported
+RIPGREP_CONFIG = RipgrepConfig()
--- a/archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/searchbackend.py
+++ b/archivebox/pkgs/abx-plugin-ripgrep-search/abx_plugin_ripgrep_search/searchbackend.py
@@ -0,0 +1,55 @@
+__package__ = 'abx_plugin_ripgrep_search'
+
+import re
+import subprocess
+
+from typing import List, Iterable
+
+from abx_spec_searchbackend import BaseSearchBackend
+
+from .binaries import RIPGREP_BINARY
+from .config import RIPGREP_CONFIG
+
+
+
+# regex to match archive/<ts>/... snapshot dir names
+TIMESTAMP_REGEX =  re.compile(r'\/([\d]+\.[\d]+)\/')
+
+class RipgrepSearchBackend(BaseSearchBackend):
+    name: str = 'ripgrep'
+    docs_url: str = 'https://github.com/BurntSushi/ripgrep'
+    
+    @staticmethod
+    def index(snapshot_id: str, texts: List[str]):
+        return
+
+    @staticmethod
+    def flush(snapshot_ids: Iterable[str]):
+        return
+
+    @staticmethod
+    def search(text: str) -> List[str]:
+        from core.models import Snapshot
+        
+        ripgrep_binary = RIPGREP_BINARY.load()
+        if not ripgrep_binary.version:
+            raise Exception("ripgrep binary not found, install ripgrep to use this search backend")
+    
+        cmd = [
+            ripgrep_binary.abspath, 
+            *RIPGREP_CONFIG.RIPGREP_ARGS_DEFAULT,
+            text,
+            str(RIPGREP_CONFIG.RIPGREP_SEARCH_DIR),
+        ]
+        proc = subprocess.run(cmd, timeout=RIPGREP_CONFIG.RIPGREP_TIMEOUT, capture_output=True, text=True)
+        timestamps = set()
+        for path in proc.stdout.splitlines():
+            ts = TIMESTAMP_REGEX.findall(path)
+            if ts:
+                timestamps.add(ts[0])
+        
+        snap_ids = [str(id) for id in Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)]
+    
+        return snap_ids
+
+RIPGREP_SEARCH_BACKEND = RipgrepSearchBackend()
--- a/archivebox/pkgs/abx-plugin-ripgrep-search/pyproject.toml
+++ b/archivebox/pkgs/abx-plugin-ripgrep-search/pyproject.toml
@@ -0,0 +1,18 @@
+[project]
+name = "abx-plugin-ripgrep-search"
+version = "2024.10.28"
+description = "abx plugin providing a ripgrep-based search backend"
+readme = "README.md"
+requires-python = ">=3.10"
+dependencies = [
+    "abx>=0.1.0",
+    "abx-spec-config>=0.1.0",
+    "abx-spec-searchbackend>=0.1.0",
+]
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[project.entry-points.abx]
+abx_plugin_ripgrep_search = "abx_plugin_ripgrep_search"
--- a/archivebox/pkgs/abx-plugin-singlefile/README.md
+++ b/archivebox/pkgs/abx-plugin-singlefile/README.md
--- a/archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/init.py
+++ b/archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/init.py
@@ -0,0 +1,35 @@
+__package__ = 'abx_plugin_singlefile'
+__label__ = 'Singlefile'
+__homepage__ = 'https://github.com/gildas-lormeau/singlefile'
+
+import abx
+
+
+@abx.hookimpl
+def get_CONFIG():
+    from .config import SINGLEFILE_CONFIG
+    
+    return {
+        'SINGLEFILE_CONFIG': SINGLEFILE_CONFIG
+    }
+
+@abx.hookimpl
+def get_BINARIES():
+    from .binaries import SINGLEFILE_BINARY
+    
+    return {
+        'singlefile': SINGLEFILE_BINARY,
+    }
+
+@abx.hookimpl
+def get_EXTRACTORS():
+    from .extractors import SINGLEFILE_EXTRACTOR
+    
+    return {
+        'singlefile': SINGLEFILE_EXTRACTOR,
+    }
+
+# @abx.hookimpl
+# def get_INSTALLED_APPS():
+#     # needed to load ./models.py
+#     return [__package__]
--- a/archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/binaries.py
+++ b/archivebox/pkgs/abx-plugin-singlefile/abx_plugin_singlefile/binaries.py
@@ -0,0 +1,45 @@
+from typing import List
+
+from pydantic import InstanceOf
+from pydantic_pkgr import Binary, BinProvider, BinaryOverrides, BinName, bin_abspath
+
+from abx_plugin_default_binproviders import env
+from abx_plugin_npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
+
+from .config import SINGLEFILE_CONFIG
+
+
+SINGLEFILE_MIN_VERSION = '1.1.54'
+SINGLEFILE_MAX_VERSION = '1.1.60'
+
+
+class SinglefileBinary(Binary):
+    """Binary definition for single-file, supplied by the lib npm, system npm, or env providers.
+
+    Each provider's abspath override looks for the configured binary name first, then falls
+    back to 'single-file' and 'single-file-node.js' on that provider's PATH.
+    """
+
+    # binary name comes from config at class-definition time (i.e. when this module is imported)
+    name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
+    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]
+
+    overrides: BinaryOverrides = {
+        LIB_NPM_BINPROVIDER.name: {
+            "abspath": lambda:
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=LIB_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file", PATH=LIB_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file-node.js", PATH=LIB_NPM_BINPROVIDER.PATH),
+            # npm version range: >= SINGLEFILE_MIN_VERSION and strictly < SINGLEFILE_MAX_VERSION
+            "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
+        },
+        SYS_NPM_BINPROVIDER.name: {
+            "abspath": lambda:
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=SYS_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file", PATH=SYS_NPM_BINPROVIDER.PATH)
+                or bin_abspath("single-file-node.js", PATH=SYS_NPM_BINPROVIDER.PATH),
+            "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
+            # install handler replaced with a no-op; presumably to avoid modifying system-global npm packages -- confirm
+            "install": lambda: None,
+        },
+        env.name: {
+            'abspath': lambda:
+                bin_abspath(SINGLEFILE_CONFIG.SINGLEFILE_BINARY, PATH=env.PATH)
+                or bin_abspath('single-file', PATH=env.PATH)
+                or bin_abspath('single-file-node.js', PATH=env.PATH),
+        },
+    }
+
+
+# module-level instance, created when this module is imported
+SINGLEFILE_BINARY = SinglefileBinary()
--- a/Show More
+++ b/Show More