mirror of
https://github.com/ArchiveBox/ArchiveBox.git
synced 2026-04-05 07:17:52 +10:00
wip
This commit is contained in:
39
packages/abx-plugin-archivedotorg-extractor/__init__.py
Normal file
39
packages/abx-plugin-archivedotorg-extractor/__init__.py
Normal file
@@ -0,0 +1,39 @@
|
||||
__package__ = 'plugins_extractor.archivedotorg'
|
||||
__label__ = 'archivedotorg'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://archive.org'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata for the abx ``get_PLUGIN`` hook.

    The result is a one-entry dict keyed by the plugin name, whose value is
    built from the module-level ``__package__``/``__label__``/... constants.
    """
    metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'archivedotorg': metadata}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set for the abx ``get_CONFIG`` hook."""
    # the config module is imported when the hook runs, not at module import time
    from .config import ARCHIVEDOTORG_CONFIG

    configs = {}
    configs['archivedotorg'] = ARCHIVEDOTORG_CONFIG
    return configs
|
||||
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_EXTRACTORS():
|
||||
# from .extractors import ARCHIVEDOTORG_EXTRACTOR
|
||||
#
|
||||
# return {
|
||||
# 'archivedotorg': ARCHIVEDOTORG_EXTRACTOR,
|
||||
# }
|
||||
11
packages/abx-plugin-archivedotorg-extractor/config.py
Normal file
11
packages/abx-plugin-archivedotorg-extractor/config.py
Normal file
@@ -0,0 +1,11 @@
|
||||
__package__ = 'plugins_extractor.archivedotorg'
|
||||
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
|
||||
class ArchivedotorgConfig(BaseConfigSet):
    """Config options for the archive.org plugin."""
    # toggle for the archive.org save step; enabled by default
    # NOTE(review): presumably read by the (currently commented-out) extractor -- confirm
    SAVE_ARCHIVE_DOT_ORG: bool = True
|
||||
|
||||
|
||||
ARCHIVEDOTORG_CONFIG = ArchivedotorgConfig()
|
||||
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-archivedotorg-extractor"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
0
packages/abx-plugin-chrome-extractor/README.md
Normal file
0
packages/abx-plugin-chrome-extractor/README.md
Normal file
54
packages/abx-plugin-chrome-extractor/__init__.py
Normal file
54
packages/abx-plugin-chrome-extractor/__init__.py
Normal file
@@ -0,0 +1,54 @@
|
||||
__package__ = 'plugins_extractor.chrome'
|
||||
__id__ = 'chrome'
|
||||
__label__ = 'Chrome'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/main/archivebox/plugins_extractor/chrome'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata for the abx ``get_PLUGIN`` hook.

    The result is a one-entry dict keyed by ``__id__`` whose value mirrors the
    module-level ``__id__``/``__package__``/``__label__``/... constants.
    """
    plugin_info = dict(
        id=__id__,
        package=__package__,
        label=__label__,
        version=__version__,
        author=__author__,
        homepage=__homepage__,
        dependencies=__dependencies__,
    )
    return {__id__: plugin_info}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return the Chrome config set for the abx ``get_CONFIG`` hook, keyed by ``__id__``."""
    # the config module is imported when the hook runs, not at module import time
    from .config import CHROME_CONFIG

    configs = {}
    configs[__id__] = CHROME_CONFIG
    return configs
|
||||
|
||||
@abx.hookimpl
def get_BINARIES():
    """Return the binaries this plugin provides for the abx ``get_BINARIES`` hook."""
    # the binaries module is imported when the hook runs, not at module import time
    from .binaries import CHROME_BINARY

    binaries = {}
    binaries['chrome'] = CHROME_BINARY
    return binaries
|
||||
|
||||
@abx.hookimpl
def ready():
    """abx ``ready`` hook: run the Chrome config's ``validate()`` checks."""
    from .config import CHROME_CONFIG as chrome_config

    chrome_config.validate()
|
||||
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_EXTRACTORS():
|
||||
# return {
|
||||
# 'pdf': PDF_EXTRACTOR,
|
||||
# 'screenshot': SCREENSHOT_EXTRACTOR,
|
||||
# 'dom': DOM_EXTRACTOR,
|
||||
# }
|
||||
150
packages/abx-plugin-chrome-extractor/binaries.py
Normal file
150
packages/abx-plugin-chrome-extractor/binaries.py
Normal file
@@ -0,0 +1,150 @@
|
||||
__package__ = 'plugins_extractor.chrome'
|
||||
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import (
|
||||
BinProvider,
|
||||
BinName,
|
||||
BinaryOverrides,
|
||||
bin_abspath,
|
||||
)
|
||||
|
||||
import abx.archivebox.reads
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
|
||||
from abx_puppeteer_binprovider.binproviders import PUPPETEER_BINPROVIDER
|
||||
from abx_playwright_binprovider.binproviders import PLAYWRIGHT_BINPROVIDER
|
||||
|
||||
|
||||
from .config import CHROME_CONFIG
|
||||
|
||||
CHROMIUM_BINARY_NAMES_LINUX = [
|
||||
"chromium",
|
||||
"chromium-browser",
|
||||
"chromium-browser-beta",
|
||||
"chromium-browser-unstable",
|
||||
"chromium-browser-canary",
|
||||
"chromium-browser-dev",
|
||||
]
|
||||
CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]
|
||||
CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS
|
||||
|
||||
CHROME_BINARY_NAMES_LINUX = [
|
||||
"google-chrome",
|
||||
"google-chrome-stable",
|
||||
"google-chrome-beta",
|
||||
"google-chrome-canary",
|
||||
"google-chrome-unstable",
|
||||
"google-chrome-dev",
|
||||
"chrome"
|
||||
]
|
||||
CHROME_BINARY_NAMES_MACOS = [
|
||||
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
||||
"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
|
||||
]
|
||||
CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
|
||||
|
||||
CHROME_APT_DEPENDENCIES = [
|
||||
'apt-transport-https', 'at-spi2-common',
|
||||
'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
|
||||
'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
|
||||
'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
|
||||
'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
|
||||
'chromium-browser',
|
||||
]
|
||||
|
||||
|
||||
def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
    """
    Find the first Chrome/Chromium binary that resolves on this machine.

    Candidates are tried in order: every name in CHROME_BINARY_NAMES, then
    every name in CHROMIUM_BINARY_NAMES.

    PATH: search path handed to bin_abspath(). When omitted (or empty) the
          env binprovider's PATH is used.

    Returns the first truthy abspath that bin_abspath() yields, or None when
    no candidate resolves.
    """
    # the PATH argument used to be accepted but silently ignored in favour of env.PATH
    search_path = PATH or env.PATH

    for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
        abspath = bin_abspath(bin_name, PATH=search_path)
        if abspath:
            return abspath
    return None
|
||||
|
||||
def create_macos_app_symlink(target: Path, shortcut: Path):
    """
    on macOS, some binaries are inside of .app, so we need to
    create a tiny bash script instead of a symlink
    (so that ../ parent relationships are relative to original .app instead of callsite dir)

    target:   path of the real executable that the script will exec
    shortcut: path where the launcher script is written (any existing file is replaced)
    """
    import shlex

    # TODO: should we enforce this? is it useful in any other situation?
    # if platform.system().lower() != 'darwin':
    #     raise Exception(...)
    shortcut.unlink(missing_ok=True)

    # shell-quote the target so paths containing single quotes (or any other
    # shell metacharacters) still produce a valid script
    quoted_target = shlex.quote(str(target))
    shortcut.write_text(f"""#!/usr/bin/env bash\nexec {quoted_target} "$@"\n""")
    shortcut.chmod(0o777)     # make sure its executable by everyone
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
|
||||
class ChromeBinary(BaseBinary):
    """
    Binary definition for Chrome/Chromium.

    Declares the binary name (taken from CHROME_CONFIG), the binproviders it
    supports, and per-provider overrides, plus helpers for linking the binary
    into the lib dir and for removing stale Chrome lock files.
    """
    name: BinName = CHROME_CONFIG.CHROME_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]

    overrides: BinaryOverrides = {
        env.name: {
            'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH),  # /usr/bin/google-chrome-stable
        },
        PUPPETEER_BINPROVIDER.name: {
            'packages': ['chrome@stable'],              # npx @puppeteer/browsers install chrome@stable
        },
        PLAYWRIGHT_BINPROVIDER.name: {
            'packages': ['chromium'],                   # playwright install chromium
        },
        apt.name: {
            'packages': CHROME_APT_DEPENDENCIES,
        },
        brew.name: {
            'packages': ['--cask', 'chromium'] if platform.system().lower() == 'darwin' else [],
        },
    }

    @staticmethod
    def symlink_to_lib(binary, bin_dir=None) -> None:
        """
        Best-effort: expose ``binary`` inside ``bin_dir`` under ``binary.name``.

        bin_dir defaults to STORAGE_CONFIG.LIB_DIR / 'bin'. Nothing happens when
        binary.abspath is unset or is not a file. Any failure while creating the
        link/launcher is swallowed.
        """
        if not bin_dir:
            bin_dir = abx.archivebox.reads.get_CONFIGS().STORAGE_CONFIG.LIB_DIR / 'bin'

        if not binary.abspath or not os.path.isfile(binary.abspath):
            return

        bin_dir.mkdir(parents=True, exist_ok=True)
        link_path = bin_dir / binary.name

        try:
            if platform.system().lower() != 'darwin':
                # on linux we can symlink directly to binary executable
                link_path.unlink(missing_ok=True)
                link_path.symlink_to(binary.abspath)
            else:
                # on macOS the browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
                create_macos_app_symlink(binary.abspath, link_path)
        except Exception:
            # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
            # not actually needed, we can just run without it
            pass

    @staticmethod
    def chrome_cleanup_lockfile():
        """
        Cleans up any state or runtime files that chrome leaves behind when killed by
        a timeout or other error
        """
        default_lock = Path("~/.config/chromium/SingletonLock")
        try:
            default_lock.expanduser().unlink(missing_ok=True)
        except Exception:
            pass

        user_data_dir = CHROME_CONFIG.CHROME_USER_DATA_DIR
        if not user_data_dir:
            return

        try:
            (user_data_dir / 'SingletonLock').unlink(missing_ok=True)
        except Exception:
            pass
|
||||
|
||||
|
||||
|
||||
CHROME_BINARY = ChromeBinary()
|
||||
|
||||
203
packages/abx-plugin-chrome-extractor/config.py
Normal file
203
packages/abx-plugin-chrome-extractor/config.py
Normal file
@@ -0,0 +1,203 @@
|
||||
__package__ = 'plugins_extractor.chrome'
|
||||
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import Field
|
||||
from pydantic_pkgr import bin_abspath
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
from abx.archivebox.base_binary import env
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
|
||||
from archivebox.misc.logging import STDERR
|
||||
from archivebox.misc.util import dedupe
|
||||
from archivebox.logging_util import pretty_path
|
||||
|
||||
|
||||
CHROMIUM_BINARY_NAMES_LINUX = [
|
||||
"chromium",
|
||||
"chromium-browser",
|
||||
"chromium-browser-beta",
|
||||
"chromium-browser-unstable",
|
||||
"chromium-browser-canary",
|
||||
"chromium-browser-dev",
|
||||
]
|
||||
CHROMIUM_BINARY_NAMES_MACOS = ["/Applications/Chromium.app/Contents/MacOS/Chromium"]
|
||||
CHROMIUM_BINARY_NAMES = CHROMIUM_BINARY_NAMES_LINUX + CHROMIUM_BINARY_NAMES_MACOS
|
||||
|
||||
CHROME_BINARY_NAMES_LINUX = [
|
||||
"google-chrome",
|
||||
"google-chrome-stable",
|
||||
"google-chrome-beta",
|
||||
"google-chrome-canary",
|
||||
"google-chrome-unstable",
|
||||
"google-chrome-dev",
|
||||
"chrome"
|
||||
]
|
||||
CHROME_BINARY_NAMES_MACOS = [
|
||||
"/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
|
||||
"/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
|
||||
]
|
||||
CHROME_BINARY_NAMES = CHROME_BINARY_NAMES_LINUX + CHROME_BINARY_NAMES_MACOS
|
||||
|
||||
APT_DEPENDENCIES = [
|
||||
'apt-transport-https', 'at-spi2-common', 'chromium-browser',
|
||||
'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst', 'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji', 'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont', 'fonts-wqy-zenhei',
|
||||
'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0', 'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2', 'libcups2',
|
||||
'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0', 'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8', 'libx11-6', 'libxaw7', 'libxcb1',
|
||||
'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2', 'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2', 'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
|
||||
]
|
||||
|
||||
|
||||
def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
    """
    Find the first Chrome/Chromium binary that resolves on this machine.

    Candidates are tried in order: every name in CHROME_BINARY_NAMES, then
    every name in CHROMIUM_BINARY_NAMES.

    PATH: search path handed to bin_abspath(). When omitted (or empty) the
          env binprovider's PATH is used.

    Returns the first truthy abspath that bin_abspath() yields, or None when
    no candidate resolves.
    """
    # the PATH argument used to be accepted but silently ignored in favour of env.PATH
    search_path = PATH or env.PATH

    for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
        abspath = bin_abspath(bin_name, PATH=search_path)
        if abspath:
            return abspath
    return None
|
||||
|
||||
def create_macos_app_symlink(target: Path, shortcut: Path):
    """
    on macOS, some binaries are inside of .app, so we need to
    create a tiny bash script instead of a symlink
    (so that ../ parent relationships are relative to original .app instead of callsite dir)

    target:   path of the real executable that the script will exec
    shortcut: path where the launcher script is written (any existing file is replaced)
    """
    import shlex

    # TODO: should we enforce this? is it useful in any other situation?
    # if platform.system().lower() != 'darwin':
    #     raise Exception(...)
    shortcut.unlink(missing_ok=True)

    # shell-quote the target so paths containing single quotes (or any other
    # shell metacharacters) still produce a valid script
    quoted_target = shlex.quote(str(target))
    shortcut.write_text(f"""#!/usr/bin/env bash\nexec {quoted_target} "$@"\n""")
    shortcut.chmod(0o777)     # make sure its executable by everyone
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
|
||||
class ChromeConfig(BaseConfigSet):
    """
    Config options for the Chrome plugin: which binary to run, the CLI flags
    passed to it, the browser profile to use, and the toggles for the
    screenshot / DOM / PDF outputs.
    """
    USE_CHROME: bool                        = Field(default=True)

    # Chrome Binary
    CHROME_BINARY: str                      = Field(default='chrome')
    CHROME_DEFAULT_ARGS: List[str]          = Field(default=[
        '--virtual-time-budget=15000',
        '--disable-features=DarkMode',
        "--run-all-compositor-stages-before-draw",
        "--hide-scrollbars",
        "--autoplay-policy=no-user-gesture-required",
        "--no-first-run",
        "--use-fake-ui-for-media-stream",
        "--use-fake-device-for-media-stream",
        "--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",
    ])
    CHROME_EXTRA_ARGS: List[str]            = Field(default=[])

    # Chrome Options Tuning
    # NOTE(review): the callable defaults below read the shared ARCHIVING_CONFIG /
    # SHELL_CONFIG; presumably BaseConfigSet resolves them lazily -- confirm
    CHROME_TIMEOUT: int                     = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT - 10)
    CHROME_HEADLESS: bool                   = Field(default=True)
    CHROME_SANDBOX: bool                    = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
    CHROME_RESOLUTION: str                  = Field(default=lambda: ARCHIVING_CONFIG.RESOLUTION)
    CHROME_CHECK_SSL_VALIDITY: bool         = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)

    # Cookies & Auth
    CHROME_USER_AGENT: str                  = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    CHROME_USER_DATA_DIR: Path | None       = Field(default=CONSTANTS.PERSONAS_DIR / 'Default' / 'chrome_profile')
    CHROME_PROFILE_NAME: str                = Field(default='Default')

    # Extractor Toggles
    SAVE_SCREENSHOT: bool                   = Field(default=True, alias='FETCH_SCREENSHOT')
    SAVE_DOM: bool                          = Field(default=True, alias='FETCH_DOM')
    SAVE_PDF: bool                          = Field(default=True, alias='FETCH_PDF')

    def validate(self):
        """
        Sanity-check the Chrome settings and print warnings to STDERR.

        Warns when CHROME_TIMEOUT is below 15 seconds. When a user data dir is
        set, tries to create <CHROME_USER_DATA_DIR>/<CHROME_PROFILE_NAME>; if
        that folder still does not exist afterwards, prints an error (with a
        hint for the trailing "/Default" mistake) and resets
        CHROME_USER_DATA_DIR to None.
        """
        from archivebox.config.paths import create_and_chown_dir

        if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
            STDERR.print()
            STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
            STDERR.print(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
            STDERR.print(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
            STDERR.print()
            STDERR.print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
            STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
            STDERR.print()

        # if user has specified a user data dir, make sure its valid
        if self.USE_CHROME and self.CHROME_USER_DATA_DIR:
            try:
                create_and_chown_dir(self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME)
            except Exception:
                # best-effort: a failure here is reported by the isdir() check below
                pass

            # check to make sure user_data_dir/<profile_name> exists
            if not os.path.isdir(self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME):
                STDERR.print()
                STDERR.print()
                STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
                STDERR.print(f' {self.CHROME_USER_DATA_DIR}')
                STDERR.print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
                STDERR.print(' For more info see:')
                STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')

                # show special hint if they made the common mistake of putting /Default at the end of the path
                if str(self.CHROME_USER_DATA_DIR).replace(str(CONSTANTS.PERSONAS_DIR / 'Default'), '').endswith('/Default'):
                    STDERR.print()
                    STDERR.print(' Try removing /Default from the end e.g.:')
                    STDERR.print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).rsplit('/Default', 1)[0]))

                # fall back to running without a user data dir
                self.update_in_place(CHROME_USER_DATA_DIR=None)

    def chrome_args(self, **options) -> List[str]:
        """
        helper to build up a chrome shell command with arguments

        **options: per-call overrides applied on top of this config via
                   model_copy(update=...); this config object itself is not modified.

        Returns the argument list after passing it through dedupe().
        """

        # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/

        options = self.model_copy(update=options)

        cmd_args = [*options.CHROME_DEFAULT_ARGS, *options.CHROME_EXTRA_ARGS]

        if options.CHROME_HEADLESS:
            cmd_args += ["--headless=new"]   # expects chrome version >= 111

        if not options.CHROME_SANDBOX:
            # assume this means we are running inside a docker container
            # in docker, GPU support is limited, sandboxing is unnecessary,
            # and SHM is limited to 64MB by default (which is too low to be usable).
            cmd_args += (
                "--no-sandbox",
                "--no-zygote",
                "--disable-dev-shm-usage",
                "--disable-software-rasterizer",
                "--disable-sync",
                # "--password-store=basic",
            )

        # set window size for screenshot/pdf/etc. rendering
        cmd_args += ('--window-size={}'.format(options.CHROME_RESOLUTION),)

        if not options.CHROME_CHECK_SSL_VALIDITY:
            cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

        if options.CHROME_USER_AGENT:
            cmd_args += ('--user-agent={}'.format(options.CHROME_USER_AGENT),)

        # this no longer works on newer chrome version for some reason, just causes chrome to hang indefinitely:
        # if options.CHROME_TIMEOUT:
        #     cmd_args += ('--timeout={}'.format(options.CHROME_TIMEOUT * 1000),)

        if options.CHROME_USER_DATA_DIR:
            cmd_args.append('--user-data-dir={}'.format(options.CHROME_USER_DATA_DIR))
            cmd_args.append('--profile-directory={}'.format(options.CHROME_PROFILE_NAME or 'Default'))

            if not os.path.isfile(options.CHROME_USER_DATA_DIR / options.CHROME_PROFILE_NAME / 'Preferences'):
                STDERR.print(f'[green] + creating new Chrome profile in: {pretty_path(options.CHROME_USER_DATA_DIR / options.CHROME_PROFILE_NAME)}[/green]')
                # swap --no-first-run for --first-run. Filtering (instead of list.remove)
                # avoids a ValueError when the flag was dropped from CHROME_DEFAULT_ARGS,
                # and also clears any repeated copies of it.
                cmd_args = [arg for arg in cmd_args if arg != '--no-first-run']
                cmd_args.append('--first-run')

        return dedupe(cmd_args)
|
||||
|
||||
CHROME_CONFIG = ChromeConfig()
|
||||
|
||||
7
packages/abx-plugin-chrome-extractor/pyproject.toml
Normal file
7
packages/abx-plugin-chrome-extractor/pyproject.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-chrome-extractor"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
0
packages/abx-plugin-curl-extractor/README.md
Normal file
0
packages/abx-plugin-curl-extractor/README.md
Normal file
38
packages/abx-plugin-curl-extractor/__init__.py
Normal file
38
packages/abx-plugin-curl-extractor/__init__.py
Normal file
@@ -0,0 +1,38 @@
|
||||
__package__ = 'plugins_extractor.curl'
|
||||
__label__ = 'curl'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/curl/curl'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata for the abx ``get_PLUGIN`` hook.

    The result is a one-entry dict keyed by the plugin name, whose value is
    built from the module-level ``__package__``/``__label__``/... constants.
    """
    metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'curl': metadata}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return the curl config set for the abx ``get_CONFIG`` hook."""
    # the config module is imported when the hook runs, not at module import time
    from .config import CURL_CONFIG

    configs = {}
    configs['curl'] = CURL_CONFIG
    return configs
|
||||
|
||||
@abx.hookimpl
def get_BINARIES():
    """Return the binaries this plugin provides for the abx ``get_BINARIES`` hook."""
    # the binaries module is imported when the hook runs, not at module import time
    from .binaries import CURL_BINARY

    binaries = {}
    binaries['curl'] = CURL_BINARY
    return binaries
|
||||
18
packages/abx-plugin-curl-extractor/binaries.py
Normal file
18
packages/abx-plugin-curl-extractor/binaries.py
Normal file
@@ -0,0 +1,18 @@
|
||||
__package__ = 'plugins_extractor.curl'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinName
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
|
||||
|
||||
from .config import CURL_CONFIG
|
||||
|
||||
|
||||
class CurlBinary(BaseBinary):
    """Binary definition for curl."""
    # the binary name is read from CURL_CONFIG once, when this class body is evaluated
    name: BinName = CURL_CONFIG.CURL_BINARY
    # binproviders this binary declares support for
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
||||
|
||||
CURL_BINARY = CurlBinary()
|
||||
33
packages/abx-plugin-curl-extractor/config.py
Normal file
33
packages/abx-plugin-curl-extractor/config.py
Normal file
@@ -0,0 +1,33 @@
|
||||
__package__ = 'plugins_extractor.curl'
|
||||
|
||||
from typing import List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
class CurlConfig(BaseConfigSet):
    """Config options for the curl plugin: feature toggles, binary name, CLI args, and request settings."""

    # feature toggles, all enabled by default
    SAVE_TITLE: bool = Field(default=True)
    SAVE_HEADERS: bool = Field(default=True)
    USE_CURL: bool = Field(default=True)

    # name of the curl executable, and the default flags for it
    CURL_BINARY: str = Field(default='curl')
    CURL_ARGS: List[str] = [
        '--silent',
        '--location',
        '--compressed',
    ]
    # additional flags; empty by default
    CURL_EXTRA_ARGS: List[str] = []

    # the defaults below are callables reading the shared ARCHIVING_CONFIG
    # NOTE(review): presumably BaseConfigSet resolves callable defaults lazily -- confirm
    CURL_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    CURL_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    CURL_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    CURL_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)
|
||||
|
||||
|
||||
CURL_CONFIG = CurlConfig()
|
||||
7
packages/abx-plugin-curl-extractor/pyproject.toml
Normal file
7
packages/abx-plugin-curl-extractor/pyproject.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-curl-extractor"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
0
packages/abx-plugin-default-binproviders/README.md
Normal file
0
packages/abx-plugin-default-binproviders/README.md
Normal file
@@ -0,0 +1,24 @@
|
||||
|
||||
import abx
|
||||
|
||||
from typing import Dict
|
||||
|
||||
from pydantic_pkgr import (
|
||||
AptProvider,
|
||||
BrewProvider,
|
||||
EnvProvider,
|
||||
BinProvider,
|
||||
)
|
||||
apt = APT_BINPROVIDER = AptProvider()
|
||||
brew = BREW_BINPROVIDER = BrewProvider()
|
||||
env = ENV_BINPROVIDER = EnvProvider()
|
||||
|
||||
|
||||
@abx.hookimpl(tryfirst=True)
def get_BINPROVIDERS() -> Dict[str, BinProvider]:
    """Return the default apt / brew / env binproviders, keyed by provider name.

    Registered with ``tryfirst=True`` on the abx ``get_BINPROVIDERS`` hook.
    """
    providers: Dict[str, BinProvider] = {}
    providers['apt'] = APT_BINPROVIDER
    providers['brew'] = BREW_BINPROVIDER
    providers['env'] = ENV_BINPROVIDER
    return providers
|
||||
18
packages/abx-plugin-default-binproviders/pyproject.toml
Normal file
18
packages/abx-plugin-default-binproviders/pyproject.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
[project]
|
||||
name = "abx-plugin-default-binproviders"
|
||||
version = "2024.10.24"
|
||||
description = "Default BinProviders for ABX (apt, brew, env)"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"abx>=0.1.0",
|
||||
"pydantic-pkgr>=0.5.4",
|
||||
"abx-spec-pydantic-pkgr>=0.1.0",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project.entry-points.abx]
|
||||
abx_plugin_default_binproviders = "abx_plugin_default_binproviders"
|
||||
0
packages/abx-plugin-favicon-extractor/README.md
Normal file
0
packages/abx-plugin-favicon-extractor/README.md
Normal file
39
packages/abx-plugin-favicon-extractor/__init__.py
Normal file
39
packages/abx-plugin-favicon-extractor/__init__.py
Normal file
@@ -0,0 +1,39 @@
|
||||
__package__ = 'plugins_extractor.favicon'
|
||||
__label__ = 'favicon'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/ArchiveBox/archivebox'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata for the abx ``get_PLUGIN`` hook.

    The result is a one-entry dict keyed by the plugin name, whose value is
    built from the module-level ``__package__``/``__label__``/... constants.
    """
    metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'favicon': metadata}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return the favicon config set for the abx ``get_CONFIG`` hook."""
    # the config module is imported when the hook runs, not at module import time
    from .config import FAVICON_CONFIG

    configs = {}
    configs['favicon'] = FAVICON_CONFIG
    return configs
|
||||
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_EXTRACTORS():
|
||||
# from .extractors import FAVICON_EXTRACTOR
|
||||
|
||||
# return {
|
||||
# 'favicon': FAVICON_EXTRACTOR,
|
||||
# }
|
||||
13
packages/abx-plugin-favicon-extractor/config.py
Normal file
13
packages/abx-plugin-favicon-extractor/config.py
Normal file
@@ -0,0 +1,13 @@
|
||||
__package__ = 'plugins_extractor.favicon'
|
||||
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
|
||||
class FaviconConfig(BaseConfigSet):
    """Config options for the favicon plugin."""
    # toggle for saving favicons; enabled by default
    SAVE_FAVICON: bool = True

    # URL template containing one '{}' placeholder
    # NOTE(review): presumably filled with the page's domain via str.format -- confirm against the extractor
    FAVICON_PROVIDER: str = 'https://www.google.com/s2/favicons?domain={}'
|
||||
|
||||
|
||||
FAVICON_CONFIG = FaviconConfig()
|
||||
7
packages/abx-plugin-favicon-extractor/pyproject.toml
Normal file
7
packages/abx-plugin-favicon-extractor/pyproject.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-favicon-extractor"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
0
packages/abx-plugin-git-extractor/README.md
Normal file
0
packages/abx-plugin-git-extractor/README.md
Normal file
46
packages/abx-plugin-git-extractor/__init__.py
Normal file
46
packages/abx-plugin-git-extractor/__init__.py
Normal file
@@ -0,0 +1,46 @@
|
||||
__package__ = 'plugins_extractor.git'
|
||||
__label__ = 'git'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/git/git'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata for the abx ``get_PLUGIN`` hook.

    The result is a one-entry dict keyed by the plugin name, whose value is
    built from the module-level ``__package__``/``__label__``/... constants.
    """
    metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'git': metadata}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return the git config set for the abx ``get_CONFIG`` hook."""
    # the config module is imported when the hook runs, not at module import time
    from .config import GIT_CONFIG

    configs = {}
    configs['git'] = GIT_CONFIG
    return configs
|
||||
|
||||
@abx.hookimpl
def get_BINARIES():
    """Return the binaries this plugin provides for the abx ``get_BINARIES`` hook."""
    # the binaries module is imported when the hook runs, not at module import time
    from .binaries import GIT_BINARY

    binaries = {}
    binaries['git'] = GIT_BINARY
    return binaries
|
||||
|
||||
@abx.hookimpl
def get_EXTRACTORS():
    """Return the extractors this plugin provides for the abx ``get_EXTRACTORS`` hook."""
    # the extractors module is imported when the hook runs, not at module import time
    from .extractors import GIT_EXTRACTOR

    extractors = {}
    extractors['git'] = GIT_EXTRACTOR
    return extractors
|
||||
18
packages/abx-plugin-git-extractor/binaries.py
Normal file
18
packages/abx-plugin-git-extractor/binaries.py
Normal file
@@ -0,0 +1,18 @@
|
||||
__package__ = 'plugins_extractor.git'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinName
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
|
||||
from .config import GIT_CONFIG
|
||||
|
||||
|
||||
|
||||
class GitBinary(BaseBinary):
    """Binary definition for git."""
    # the binary name is read from GIT_CONFIG once, when this class body is evaluated
    name: BinName = GIT_CONFIG.GIT_BINARY
    # binproviders this binary declares support for
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]
|
||||
|
||||
GIT_BINARY = GitBinary()
|
||||
28
packages/abx-plugin-git-extractor/config.py
Normal file
28
packages/abx-plugin-git-extractor/config.py
Normal file
@@ -0,0 +1,28 @@
|
||||
__package__ = 'plugins_extractor.git'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
class GitConfig(BaseConfigSet):
    """Config options for the git plugin: toggle, domain list, binary name, CLI args, and request settings."""

    # toggle for the git save step; enabled by default
    SAVE_GIT: bool = True

    # comma-separated list of domains, stored as a single string
    # NOTE(review): presumably the hosts whose URLs are treated as git repos -- confirm against the extractor
    GIT_DOMAINS: str = Field(default='github.com,bitbucket.org,gitlab.com,gist.github.com,codeberg.org,gitea.com,git.sr.ht')

    # name of the git executable, and the default flags for it
    GIT_BINARY: str = Field(default='git')
    GIT_ARGS: List[str] = [
        '--recursive',
    ]
    # additional flags; empty by default
    GIT_EXTRA_ARGS: List[str] = []

    # the defaults below are callables reading the shared ARCHIVING_CONFIG
    # NOTE(review): presumably BaseConfigSet resolves callable defaults lazily -- confirm
    GIT_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    GIT_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
|
||||
|
||||
|
||||
GIT_CONFIG = GitConfig()
|
||||
17
packages/abx-plugin-git-extractor/extractors.py
Normal file
17
packages/abx-plugin-git-extractor/extractors.py
Normal file
@@ -0,0 +1,17 @@
|
||||
__package__ = 'plugins_extractor.git'
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||
|
||||
from .binaries import GIT_BINARY
|
||||
|
||||
|
||||
class GitExtractor(BaseExtractor):
    """Extractor definition for git; its binary name comes from GIT_BINARY."""
    name: ExtractorName = 'git'
    binary: str = GIT_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return the ``git`` entry under whatever ``snapshot.as_link()`` yields."""
        link = snapshot.as_link()
        return link / 'git'
|
||||
|
||||
GIT_EXTRACTOR = GitExtractor()
|
||||
7
packages/abx-plugin-git-extractor/pyproject.toml
Normal file
7
packages/abx-plugin-git-extractor/pyproject.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-git-extractor"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
0
packages/abx-plugin-htmltotext-extractor/README.md
Normal file
0
packages/abx-plugin-htmltotext-extractor/README.md
Normal file
41
packages/abx-plugin-htmltotext-extractor/__init__.py
Normal file
41
packages/abx-plugin-htmltotext-extractor/__init__.py
Normal file
@@ -0,0 +1,41 @@
|
||||
__package__ = 'plugins_extractor.htmltotext'
|
||||
__id__ = 'htmltotext'
|
||||
__label__ = 'HTML-to-Text'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/ArchiveBox/archivebox'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata for the abx ``get_PLUGIN`` hook.

    The result is a one-entry dict keyed by ``__id__`` whose value mirrors the
    module-level ``__id__``/``__package__``/``__label__``/... constants.
    """
    plugin_info = dict(
        id=__id__,
        package=__package__,
        label=__label__,
        version=__version__,
        author=__author__,
        homepage=__homepage__,
        dependencies=__dependencies__,
    )
    return {__id__: plugin_info}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return the htmltotext config set for the abx ``get_CONFIG`` hook, keyed by ``__id__``."""
    # the config module is imported when the hook runs, not at module import time
    from .config import HTMLTOTEXT_CONFIG

    configs = {}
    configs[__id__] = HTMLTOTEXT_CONFIG
    return configs
|
||||
|
||||
|
||||
# @abx.hookimpl
# def get_EXTRACTORS():
#     from .extractors import HTMLTOTEXT_EXTRACTOR
#
#     return {
#         'htmltotext': HTMLTOTEXT_EXTRACTOR,
#     }
|
||||
11
packages/abx-plugin-htmltotext-extractor/config.py
Normal file
11
packages/abx-plugin-htmltotext-extractor/config.py
Normal file
@@ -0,0 +1,11 @@
|
||||
__package__ = 'plugins_extractor.htmltotext'
|
||||
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
|
||||
class HtmltotextConfig(BaseConfigSet):
    """Config set for the htmltotext plugin."""

    # defaults to True
    SAVE_HTMLTOTEXT: bool = True


HTMLTOTEXT_CONFIG = HtmltotextConfig()
|
||||
7
packages/abx-plugin-htmltotext-extractor/pyproject.toml
Normal file
7
packages/abx-plugin-htmltotext-extractor/pyproject.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-htmltotext-extractor"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
0
packages/abx-plugin-ldap-auth/README.md
Normal file
0
packages/abx-plugin-ldap-auth/README.md
Normal file
77
packages/abx-plugin-ldap-auth/__init__.py
Normal file
77
packages/abx-plugin-ldap-auth/__init__.py
Normal file
@@ -0,0 +1,77 @@
|
||||
__package__ = 'plugins_auth.ldap'
|
||||
__id__ = 'ldap'
|
||||
__label__ = 'LDAP'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/django-auth-ldap/django-auth-ldap'
|
||||
__dependencies__ = ['pip']
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata, keyed by its plugin id."""
    metadata = {
        'id': __id__,
        'package': __package__,
        'label': __label__,
        'version': __version__,
        'author': __author__,
        'homepage': __homepage__,
        'dependencies': __dependencies__,
    }
    return {__id__: metadata}
|
||||
|
||||
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return the LDAP config set, keyed by this plugin's id."""
    # imported inside the hook so the config module is only loaded when the hook runs
    from .config import LDAP_CONFIG

    configs = {__id__: LDAP_CONFIG}
    return configs
|
||||
|
||||
@abx.hookimpl
def get_BINARIES():
    """Return the binaries provided by this plugin, keyed by binary name."""
    from .binaries import LDAP_BINARY

    binaries = {'ldap': LDAP_BINARY}
    return binaries
|
||||
|
||||
|
||||
def create_superuser_from_ldap_user(sender, user=None, ldap_user=None, **kwargs):
    """
    Signal handler run after LDAP authenticates a user, before a local User row exists.

    ArchiveBox requires staff/superuser status to view the admin at all, so the
    authenticated user is flagged as staff, and additionally as superuser when
    they have no id yet and LDAP_CREATE_SUPERUSER is enabled.

    Does nothing when ``user`` is None.
    """
    from django.conf import settings

    if user is None:
        # not authenticated at all
        return

    # authenticated via LDAP, but user is not set up in DB yet
    is_new_user = not user.id
    if is_new_user and settings.CONFIGS.ldap.LDAP_CREATE_SUPERUSER:
        user.is_superuser = True

    # NOTE(review): the staff flag and the warning below apply to every authenticated
    # user, not only newly created ones -- confirm this nesting is intended.
    user.is_staff = True
    print(f'[!] WARNING: Creating new user {user} based on LDAP user {ldap_user} (is_staff={user.is_staff}, is_superuser={user.is_superuser})')
|
||||
|
||||
|
||||
@abx.hookimpl
def ready():
    """
    Hook invoked at AppConfig.ready() time (settings + models are all loaded).

    Validates the LDAP config and, when LDAP is enabled, registers
    create_superuser_from_ldap_user on django-auth-ldap's populate_user signal.
    """
    from .config import LDAP_CONFIG

    LDAP_CONFIG.validate()

    from django.conf import settings

    if not settings.CONFIGS.ldap.LDAP_ENABLED:
        return

    # tell django-auth-ldap to call our function when a user is authenticated via LDAP
    import django_auth_ldap.backend

    populate_user_signal = django_auth_ldap.backend.populate_user
    populate_user_signal.connect(create_superuser_from_ldap_user)
|
||||
70
packages/abx-plugin-ldap-auth/binaries.py
Normal file
70
packages/abx-plugin-ldap-auth/binaries.py
Normal file
@@ -0,0 +1,70 @@
|
||||
__package__ = 'plugins_auth.ldap'
|
||||
|
||||
|
||||
import inspect
|
||||
|
||||
from typing import List
|
||||
from pathlib import Path
|
||||
from pydantic import InstanceOf
|
||||
|
||||
from pydantic_pkgr import BinaryOverrides, SemVer
|
||||
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, BaseBinProvider, apt
|
||||
|
||||
from plugins_pkg.pip.binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER, VENV_SITE_PACKAGES, LIB_SITE_PACKAGES, USER_SITE_PACKAGES, SYS_SITE_PACKAGES
|
||||
|
||||
from .config import get_ldap_lib
|
||||
|
||||
|
||||
|
||||
def get_LDAP_LIB_path(paths=()):
    """Return the file the ldap module was loaded from, or None.

    When ``paths`` is empty the module file is returned unconditionally.
    Otherwise it is only returned if the file's grandparent directory resolves
    to the same location as one of the given directories.
    Returns None when the ldap library is not available.
    """
    ldap_module = get_ldap_lib()[0]
    if not ldap_module:
        return None

    module_file = Path(inspect.getfile(ldap_module))
    if not paths:
        return module_file

    # check that the module lives in one of the specified site-packages dirs
    containing_dir = str(module_file.parent.parent.resolve())
    if any(containing_dir == str(Path(candidate).resolve()) for candidate in paths):
        return module_file
    return None
|
||||
|
||||
|
||||
def get_LDAP_LIB_version():
    """Return the ldap module's ``__version__`` wrapped in SemVer.

    When the ldap library is unavailable, the falsy value returned by
    get_ldap_lib() is passed through unchanged.
    """
    ldap_module = get_ldap_lib()[0]
    if not ldap_module:
        return ldap_module
    return SemVer(ldap_module.__version__)
|
||||
|
||||
|
||||
# pip requirements shared by every pip-based provider override below
_LDAP_PIP_PACKAGES = ('python-ldap>=3.4.3', 'django-auth-ldap>=4.1.0')


class LdapBinary(BaseBinary):
    """Binary definition for the python LDAP libraries.

    Each provider override reports the ldap module's file as the abspath
    (restricted to that provider's site-packages dirs, where given) and the
    module's version as the binary version.
    """

    name: str = 'ldap'
    description: str = 'LDAP Authentication'
    binproviders_supported: List[InstanceOf[BaseBinProvider]] = [
        VENV_PIP_BINPROVIDER,
        SYS_PIP_BINPROVIDER,
        LIB_PIP_BINPROVIDER,
        apt,
    ]

    overrides: BinaryOverrides = {
        LIB_PIP_BINPROVIDER.name: {
            "abspath": lambda: get_LDAP_LIB_path(LIB_SITE_PACKAGES),
            "version": lambda: get_LDAP_LIB_version(),
            "packages": list(_LDAP_PIP_PACKAGES),
        },
        VENV_PIP_BINPROVIDER.name: {
            "abspath": lambda: get_LDAP_LIB_path(VENV_SITE_PACKAGES),
            "version": lambda: get_LDAP_LIB_version(),
            "packages": list(_LDAP_PIP_PACKAGES),
        },
        SYS_PIP_BINPROVIDER.name: {
            "abspath": lambda: get_LDAP_LIB_path((*USER_SITE_PACKAGES, *SYS_SITE_PACKAGES)),
            "version": lambda: get_LDAP_LIB_version(),
            "packages": list(_LDAP_PIP_PACKAGES),
        },
        apt.name: {
            # no site-packages restriction for apt: any loaded ldap module counts
            "abspath": lambda: get_LDAP_LIB_path(),
            "version": lambda: get_LDAP_LIB_version(),
            # NOTE(review): python3-msgpack / python3-mutagen look unrelated to LDAP -- confirm they belong here
            "packages": ['libssl-dev', 'libldap2-dev', 'libsasl2-dev', 'python3-ldap', 'python3-msgpack', 'python3-mutagen'],
        },
    }


LDAP_BINARY = LdapBinary()
|
||||
122
packages/abx-plugin-ldap-auth/config.py
Normal file
122
packages/abx-plugin-ldap-auth/config.py
Normal file
@@ -0,0 +1,122 @@
|
||||
__package__ = 'plugins_auth.ldap'
|
||||
|
||||
import sys
|
||||
|
||||
from typing import Dict, List, Optional
|
||||
from pydantic import Field, model_validator, computed_field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
LDAP_LIB = None     # cached ``ldap`` module, set once it has been imported successfully
LDAP_SEARCH = None  # cached ``django_auth_ldap.config.LDAPSearch`` class


def get_ldap_lib(extra_paths=()):
    """Import the LDAP libraries on demand and return ``(ldap, LDAPSearch)``.

    Args:
        extra_paths: additional directories (str or path-like) appended to
            ``sys.path`` before the import is attempted.

    Returns:
        A ``(LDAP_LIB, LDAP_SEARCH)`` tuple. Both items are None while the
        libraries cannot be imported; once imported they are cached in the
        module-level globals and returned directly on later calls.
    """
    global LDAP_LIB, LDAP_SEARCH
    if LDAP_LIB and LDAP_SEARCH:
        return LDAP_LIB, LDAP_SEARCH

    for path in extra_paths:
        # sys.path holds strings: a Path object never compares equal to them (so it
        # would be re-appended on every call) and is ignored by the import system
        path = str(path)
        if path not in sys.path:
            sys.path.append(path)

    try:
        import ldap
        from django_auth_ldap.config import LDAPSearch
    except ImportError:
        # libraries not installed: leave the cache empty so callers can detect it
        pass
    else:
        LDAP_LIB, LDAP_SEARCH = ldap, LDAPSearch
    return LDAP_LIB, LDAP_SEARCH
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
|
||||
class LdapConfig(BaseConfigSet):
    """
    LDAP Config gets imported by core/settings.py very early during startup.
    It needs to be in a separate file from apps.py so that it can be imported
    during settings.py initialization before the apps are loaded.
    """

    LDAP_ENABLED: bool = Field(default=False, alias='LDAP')

    # these have no usable default and stay None until configured
    LDAP_SERVER_URI: Optional[str] = Field(default=None)
    LDAP_BIND_DN: Optional[str] = Field(default=None)
    LDAP_BIND_PASSWORD: Optional[str] = Field(default=None)
    LDAP_USER_BASE: Optional[str] = Field(default=None)
    LDAP_USER_FILTER: Optional[str] = Field(default=None)
    LDAP_CREATE_SUPERUSER: bool = Field(default=False)

    LDAP_USERNAME_ATTR: str = Field(default='username')
    LDAP_FIRSTNAME_ATTR: str = Field(default='first_name')
    LDAP_LASTNAME_ATTR: str = Field(default='last_name')
    LDAP_EMAIL_ATTR: str = Field(default='email')

    def validate(self):
        """Disable LDAP (with a message on stderr) when it cannot work.

        LDAP_ENABLED is switched off in place when either the LDAP libraries
        are not importable or one of the required LDAP_* options is unset.

        Returns:
            self, so calls can be chained.
        """
        if not self.LDAP_ENABLED:
            return self

        LDAP_LIB, _LDAPSearch = get_ldap_lib()

        # Check that LDAP libraries are installed
        if LDAP_LIB is None:
            sys.stderr.write('[X] Error: LDAP Authentication is enabled but LDAP libraries are not installed. You may need to run: pip install archivebox[ldap]\n')
            # dont hard exit here. in case the user is just running "archivebox version" or "archivebox help", we still want those to work despite broken ldap
            # sys.exit(1)
            self.update_in_place(LDAP_ENABLED=False)
            return self

        # Check that all required LDAP config options are set
        if not self.LDAP_CONFIG_IS_SET:
            missing_config_options = [
                key for key, value in self.model_dump().items()
                # only report real LDAP_* options, not None-valued computed fields
                if value is None and key.startswith('LDAP_') and key != 'LDAP_ENABLED'
            ]
            sys.stderr.write('[X] Error: LDAP_* config options must all be set if LDAP_ENABLED=True\n')
            sys.stderr.write(f'    Missing: {", ".join(missing_config_options)}\n')
            self.update_in_place(LDAP_ENABLED=False)
        return self

    @computed_field
    @property
    def LDAP_CONFIG_IS_SET(self) -> bool:
        """Check that all required LDAP config options are set"""
        if not self.LDAP_ENABLED:
            return False

        LDAP_LIB, _LDAPSearch = get_ldap_lib()
        return bool(LDAP_LIB) and bool(
            self.LDAP_SERVER_URI
            and self.LDAP_BIND_DN
            and self.LDAP_BIND_PASSWORD
            and self.LDAP_USER_BASE
            and self.LDAP_USER_FILTER
        )

    @computed_field
    @property
    def LDAP_USER_ATTR_MAP(self) -> Dict[str, str]:
        """Map the fixed keys username/first_name/last_name/email to the configured *_ATTR values."""
        return {
            'username': self.LDAP_USERNAME_ATTR,
            'first_name': self.LDAP_FIRSTNAME_ATTR,
            'last_name': self.LDAP_LASTNAME_ATTR,
            'email': self.LDAP_EMAIL_ATTR,
        }

    @computed_field
    @property
    def AUTHENTICATION_BACKENDS(self) -> List[str]:
        """Return the auth backend dotted paths when LDAP is enabled, else an empty list."""
        if self.LDAP_ENABLED:
            return [
                'django.contrib.auth.backends.ModelBackend',
                'django_auth_ldap.backend.LDAPBackend',
            ]
        return []

    @computed_field
    @property
    def AUTH_LDAP_USER_SEARCH(self) -> Optional[object]:
        """Build the LDAPSearch for user lookups.

        Returns None when LDAP is disabled or the LDAP libraries are missing;
        returns the (falsy) LDAP_USER_FILTER value itself when no filter is set.
        """
        if not self.LDAP_ENABLED:
            return None

        LDAP_LIB, LDAPSearch = get_ldap_lib()
        if LDAP_LIB is None or LDAPSearch is None:
            # libraries unavailable: avoid AttributeError on LDAP_LIB.SCOPE_SUBTREE
            return None

        return self.LDAP_USER_FILTER and LDAPSearch(
            self.LDAP_USER_BASE,
            LDAP_LIB.SCOPE_SUBTREE,  # type: ignore
            '(&(' + self.LDAP_USERNAME_ATTR + '=%(user)s)' + self.LDAP_USER_FILTER + ')',
        )


LDAP_CONFIG = LdapConfig()
|
||||
22
packages/abx-plugin-ldap-auth/pyproject.toml
Normal file
22
packages/abx-plugin-ldap-auth/pyproject.toml
Normal file
@@ -0,0 +1,22 @@
|
||||
[project]
|
||||
name = "abx-ldap-auth"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
|
||||
|
||||
[project.entry-points.abx]
|
||||
ldap = "abx_ldap_auth"
|
||||
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[tool.hatch.build.targets.sdist]
|
||||
packages = ["."]
|
||||
|
||||
[tool.hatch.build.targets.wheel]
|
||||
packages = ["."]
|
||||
0
packages/abx-plugin-mercury-extractor/README.md
Normal file
0
packages/abx-plugin-mercury-extractor/README.md
Normal file
46
packages/abx-plugin-mercury-extractor/__init__.py
Normal file
46
packages/abx-plugin-mercury-extractor/__init__.py
Normal file
@@ -0,0 +1,46 @@
|
||||
__package__ = 'plugins_extractor.mercury'
|
||||
__label__ = 'mercury'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/postlight/mercury-parser'
|
||||
__dependencies__ = ['npm']
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata under the ``'mercury'`` key."""
    metadata = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'mercury': metadata}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return the mercury config set under the ``'mercury'`` key."""
    from .config import MERCURY_CONFIG

    configs = {'mercury': MERCURY_CONFIG}
    return configs
|
||||
|
||||
@abx.hookimpl
def get_BINARIES():
    """Return the binaries provided by this plugin, keyed by name."""
    from .binaries import MERCURY_BINARY

    binaries = {'mercury': MERCURY_BINARY}
    return binaries
|
||||
|
||||
@abx.hookimpl
def get_EXTRACTORS():
    """Return the extractors provided by this plugin, keyed by name."""
    from .extractors import MERCURY_EXTRACTOR

    extractors = {'mercury': MERCURY_EXTRACTOR}
    return extractors
|
||||
32
packages/abx-plugin-mercury-extractor/binaries.py
Normal file
32
packages/abx-plugin-mercury-extractor/binaries.py
Normal file
@@ -0,0 +1,32 @@
|
||||
__package__ = 'plugins_extractor.mercury'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinName, BinaryOverrides, bin_abspath
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env
|
||||
|
||||
from archivebox.plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||
|
||||
from .config import MERCURY_CONFIG
|
||||
|
||||
|
||||
class MercuryBinary(BaseBinary):
    """Binary definition for the mercury parser, named after MERCURY_CONFIG.MERCURY_BINARY."""

    name: BinName = MERCURY_CONFIG.MERCURY_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_NPM_BINPROVIDER,
        SYS_NPM_BINPROVIDER,
        env,
    ]

    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {
            'packages': ['@postlight/parser@^2.2.3'],
        },
        SYS_NPM_BINPROVIDER.name: {
            'packages': ['@postlight/parser@^2.2.3'],
            # never try to install things into global prefix
            'install': lambda: None,
        },
        env.name: {
            # placeholder version whenever 'postlight-parser' is found on env's PATH, else None
            # NOTE(review): checks the literal 'postlight-parser', not the configured name -- confirm
            'version': lambda: '999.999.999' if bin_abspath('postlight-parser', PATH=env.PATH) else None,
        },
    }


MERCURY_BINARY = MercuryBinary()
|
||||
31
packages/abx-plugin-mercury-extractor/config.py
Normal file
31
packages/abx-plugin-mercury-extractor/config.py
Normal file
@@ -0,0 +1,31 @@
|
||||
__package__ = 'plugins_extractor.mercury'
|
||||
|
||||
from typing import List, Optional
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG, STORAGE_CONFIG
|
||||
|
||||
|
||||
|
||||
class MercuryConfig(BaseConfigSet):
    """Config set for the mercury extractor plugin.

    NOTE(review): several fields pass a callable via ``default=`` rather than
    ``default_factory=``; presumably BaseConfigSet resolves these -- confirm.
    """

    SAVE_MERCURY: bool = Field(default=True, alias='USE_MERCURY')

    MERCURY_BINARY: str = Field(default='postlight-parser')
    MERCURY_EXTRA_ARGS: List[str] = []

    SAVE_MERCURY_REQUISITES: bool = Field(default=True)
    MERCURY_RESTRICT_FILE_NAMES: str = Field(default=lambda: STORAGE_CONFIG.RESTRICT_FILE_NAMES)

    # the remaining defaults are taken from the shared ARCHIVING_CONFIG
    MERCURY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    MERCURY_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    MERCURY_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    MERCURY_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)


MERCURY_CONFIG = MercuryConfig()
|
||||
19
packages/abx-plugin-mercury-extractor/extractors.py
Normal file
19
packages/abx-plugin-mercury-extractor/extractors.py
Normal file
@@ -0,0 +1,19 @@
|
||||
__package__ = 'plugins_extractor.mercury'
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from abx.archivebox.base_extractor import BaseExtractor, ExtractorName
|
||||
|
||||
from .binaries import MERCURY_BINARY
|
||||
|
||||
|
||||
|
||||
class MercuryExtractor(BaseExtractor):
    """Extractor definition for the ``mercury`` archive method."""

    name: ExtractorName = 'mercury'
    binary: str = MERCURY_BINARY.name

    def get_output_path(self, snapshot) -> Path | None:
        """Return ``mercury/content.html`` underneath ``snapshot.link_dir``."""
        output_dir = snapshot.link_dir / 'mercury'
        return output_dir / 'content.html'


MERCURY_EXTRACTOR = MercuryExtractor()
|
||||
7
packages/abx-plugin-mercury-extractor/pyproject.toml
Normal file
7
packages/abx-plugin-mercury-extractor/pyproject.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-mercury-extractor"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
0
packages/abx-plugin-npm-binprovider/README.md
Normal file
0
packages/abx-plugin-npm-binprovider/README.md
Normal file
@@ -0,0 +1,35 @@
|
||||
__package__ = 'abx_plugin_npm_binprovider'
|
||||
__id__ = 'npm'
|
||||
__label__ = 'NPM'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://www.npmjs.com/'
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return the npm config set, keyed by this plugin's id."""
    from .config import NPM_CONFIG

    configs = {__id__: NPM_CONFIG}
    return configs
|
||||
|
||||
@abx.hookimpl
def get_BINARIES():
    """Return the node, npm and npx binaries, keyed by name."""
    from .binaries import NODE_BINARY, NPM_BINARY, NPX_BINARY

    binaries = {
        'node': NODE_BINARY,
        'npm': NPM_BINARY,
        'npx': NPX_BINARY,
    }
    return binaries
|
||||
|
||||
@abx.hookimpl
def get_BINPROVIDERS():
    """Return the system and lib npm bin providers, keyed by provider name."""
    from .binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER

    providers = {
        'sys_npm': SYS_NPM_BINPROVIDER,
        'lib_npm': LIB_NPM_BINPROVIDER,
    }
    return providers
|
||||
@@ -0,0 +1,53 @@
|
||||
__package__ = 'plugins_pkg.npm'
|
||||
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from benedict import benedict
|
||||
|
||||
from pydantic_pkgr import BinProvider, Binary, BinName, BinaryOverrides
|
||||
|
||||
from abx_plugin_default_binproviders import get_BINPROVIDERS
|
||||
|
||||
DEFAULT_BINPROVIDERS = benedict(get_BINPROVIDERS())
|
||||
env = DEFAULT_BINPROVIDERS.env
|
||||
apt = DEFAULT_BINPROVIDERS.apt
|
||||
brew = DEFAULT_BINPROVIDERS.brew
|
||||
|
||||
|
||||
class NodeBinary(Binary):
    """Binary definition for ``node``."""

    name: BinName = 'node'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    overrides: BinaryOverrides = {
        # the apt package is called nodejs
        apt.name: {'packages': ['nodejs']},
    }


NODE_BINARY = NodeBinary()
|
||||
|
||||
|
||||
class NpmBinary(Binary):
    """Binary definition for ``npm``."""

    name: BinName = 'npm'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    overrides: BinaryOverrides = {
        # already installed when nodejs is installed
        apt.name: {'packages': ['npm']},
        # already installed when nodejs is installed
        brew.name: {'install': lambda: None},
    }


NPM_BINARY = NpmBinary()
|
||||
|
||||
|
||||
class NpxBinary(Binary):
    """Binary definition for ``npx``; installation is a no-op for apt and brew."""

    name: BinName = 'npx'
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    overrides: BinaryOverrides = {
        # already installed when nodejs is installed
        apt.name: {'install': lambda: None},
        # already installed when nodejs is installed
        brew.name: {'install': lambda: None},
    }


NPX_BINARY = NpxBinary()
|
||||
|
||||
@@ -0,0 +1,39 @@
|
||||
import os
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from pydantic_pkgr import NpmProvider, PATHStr, BinProviderName
|
||||
|
||||
import abx
|
||||
|
||||
DEFAULT_LIB_NPM_DIR = Path('/usr/local/share/abx/npm')
|
||||
|
||||
OLD_NODE_BIN_PATH = Path(os.getcwd()) / 'node_modules' / '.bin'
|
||||
NEW_NODE_BIN_PATH = DEFAULT_LIB_NPM_DIR / 'node_modules' / '.bin'
|
||||
|
||||
|
||||
class SystemNpmBinProvider(NpmProvider):
    """npm provider registered as ``sys_npm`` with no npm prefix set."""

    name: BinProviderName = "sys_npm"

    # no prefix configured for this provider
    npm_prefix: Optional[Path] = None
|
||||
|
||||
|
||||
class LibNpmBinProvider(NpmProvider):
    """npm provider registered as ``lib_npm`` that installs under ``LIB_DIR/npm``.

    The class-level defaults point at DEFAULT_LIB_NPM_DIR; setup() replaces
    them with paths derived from the runtime LIB_DIR.
    """

    name: BinProviderName = "lib_npm"
    PATH: PATHStr = f'{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'

    npm_prefix: Optional[Path] = DEFAULT_LIB_NPM_DIR

    def setup(self) -> None:
        """Point npm_prefix and PATH at the runtime LIB_DIR, then run the parent setup."""
        # update paths from config at runtime
        # LIB_DIR is read from the flattened config (as the pip lib provider does);
        # get_CONFIG() hook implementations return per-plugin dicts with no top-level LIB_DIR
        # NOTE(review): assumes the get_FLAT_CONFIG hook is available here -- confirm
        LIB_DIR = abx.pm.hook.get_FLAT_CONFIG().LIB_DIR

        npm_dir = LIB_DIR / 'npm'
        self.npm_prefix = npm_dir
        self.PATH = f'{npm_dir / "node_modules" / ".bin"}:{NEW_NODE_BIN_PATH}:{OLD_NODE_BIN_PATH}'

        super().setup()
|
||||
|
||||
|
||||
SYS_NPM_BINPROVIDER = SystemNpmBinProvider()
|
||||
LIB_NPM_BINPROVIDER = LibNpmBinProvider()
|
||||
npm = LIB_NPM_BINPROVIDER
|
||||
@@ -0,0 +1,17 @@
|
||||
from abx_spec_config import BaseConfigSet
|
||||
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
|
||||
class NpmDependencyConfigs(BaseConfigSet):
    """Config set for the npm plugin; it currently declares no options."""

    # options kept for reference, all currently disabled:
    # USE_NPM: bool = True
    # NPM_BINARY: str = Field(default='npm')
    # NPM_ARGS: Optional[List[str]] = Field(default=None)
    # NPM_EXTRA_ARGS: List[str] = []
    # NPM_DEFAULT_ARGS: List[str] = []
    pass


NPM_CONFIG = NpmDependencyConfigs()
|
||||
|
||||
20
packages/abx-plugin-npm-binprovider/pyproject.toml
Normal file
20
packages/abx-plugin-npm-binprovider/pyproject.toml
Normal file
@@ -0,0 +1,20 @@
|
||||
[project]
|
||||
name = "abx-plugin-npm-binprovider"
|
||||
version = "2024.10.24"
|
||||
description = "NPM binary provider plugin for ABX"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"abx>=0.1.0",
|
||||
"pydantic-pkgr>=0.5.4",
|
||||
"abx-spec-pydantic-pkgr>=0.1.0",
|
||||
"abx-spec-config>=0.1.0",
|
||||
"abx-plugin-default-binproviders>=2024.10.24",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project.entry-points.abx]
|
||||
abx_plugin_npm_binprovider = "abx_plugin_npm_binprovider"
|
||||
0
packages/abx-plugin-pip-binprovider/README.md
Normal file
0
packages/abx-plugin-pip-binprovider/README.md
Normal file
@@ -0,0 +1 @@
|
||||
0
|
||||
@@ -0,0 +1,37 @@
|
||||
__package__ = 'abx_plugin_pip_binprovider'
|
||||
__id__ = 'pip'
|
||||
__label__ = 'PIP'
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return the pip config set, keyed by this plugin's id."""
    from .config import PIP_CONFIG

    configs = {__id__: PIP_CONFIG}
    return configs
|
||||
|
||||
@abx.hookimpl(tryfirst=True)
def get_BINARIES():
    """Return the binaries provided by the pip plugin, keyed by name."""
    from .binaries import ARCHIVEBOX_BINARY, PYTHON_BINARY, DJANGO_BINARY, SQLITE_BINARY, PIP_BINARY, PIPX_BINARY

    binaries = {
        'archivebox': ARCHIVEBOX_BINARY,
        'python': PYTHON_BINARY,
        'django': DJANGO_BINARY,
        'sqlite': SQLITE_BINARY,
        'pip': PIP_BINARY,
        'pipx': PIPX_BINARY,
    }
    return binaries
|
||||
|
||||
@abx.hookimpl
def get_BINPROVIDERS():
    """Return the system, venv and lib pip bin providers, keyed by provider name."""
    from .binproviders import SYS_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, LIB_PIP_BINPROVIDER

    providers = {
        'sys_pip': SYS_PIP_BINPROVIDER,
        'venv_pip': VENV_PIP_BINPROVIDER,
        'lib_pip': LIB_PIP_BINPROVIDER,
    }
    return providers
|
||||
@@ -0,0 +1,162 @@
|
||||
__package__ = 'abx_plugin_pip_binprovider'
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from pydantic import InstanceOf, Field, model_validator
|
||||
|
||||
|
||||
import django
|
||||
import django.db.backends.sqlite3.base
|
||||
from django.db.backends.sqlite3.base import Database as django_sqlite3 # type: ignore[import-type]
|
||||
from pydantic_pkgr import BinProvider, Binary, BinName, BinaryOverrides, SemVer
|
||||
|
||||
|
||||
from .binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, env, apt, brew
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
def get_archivebox_version():
    """Return ``archivebox.VERSION``, or None if importing it fails for any reason."""
    try:
        import archivebox
        from archivebox import VERSION as version
    except Exception:
        # best-effort lookup: any failure is reported as "unknown"
        version = None
    return version
|
||||
|
||||
|
||||
class ArchiveboxBinary(Binary):
    """Binary definition for ``archivebox`` itself; installing only ever loads it."""

    name: BinName = 'archivebox'

    binproviders_supported: List[InstanceOf[BinProvider]] = [VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
    # every provider except env gets the same override: no packages, version from the running archivebox
    overrides: BinaryOverrides = {
        provider.name: {'packages': [], 'version': get_archivebox_version}
        for provider in (VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew)
    }

    def install(self, **kwargs):
        """Load the binary instead of installing; keyword arguments are ignored."""
        # obviously it's already installed if we are running this ;)
        return self.load()

    def load_or_install(self, **kwargs):
        """Load the binary; keyword arguments are ignored."""
        # obviously it's already installed if we are running this ;)
        return self.load()


ARCHIVEBOX_BINARY = ArchiveboxBinary()
|
||||
|
||||
|
||||
class PythonBinary(Binary):
    """Binary definition for ``python``; installing only ever loads it."""

    name: BinName = 'python'

    binproviders_supported: List[InstanceOf[BinProvider]] = [VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER, apt, brew, env]
    overrides: BinaryOverrides = {
        SYS_PIP_BINPROVIDER.name: {
            # report the interpreter that is currently running
            'abspath': sys.executable,
            'version': '.'.join(str(part) for part in sys.version_info[:3]),
        },
    }

    def install(self, **kwargs):
        """Load the binary instead of installing; keyword arguments are ignored."""
        # obviously it's already installed if we are running this ;)
        return self.load()

    def load_or_install(self, **kwargs):
        """Load the binary; keyword arguments are ignored."""
        # obviously it's already installed if we are running this ;)
        return self.load()


PYTHON_BINARY = PythonBinary()
|
||||
|
||||
|
||||
LOADED_SQLITE_PATH = Path(django.db.backends.sqlite3.base.__file__)
LOADED_SQLITE_VERSION = SemVer(django_sqlite3.version)
# compare by path components rather than string prefix (so /venv does not match /venv2),
# and tolerate a venv provider with no venv path set
LOADED_SQLITE_FROM_VENV = (
    VENV_PIP_BINPROVIDER.pip_venv is not None
    and LOADED_SQLITE_PATH.absolute().resolve().is_relative_to(VENV_PIP_BINPROVIDER.pip_venv.absolute().resolve())
)


class SqliteBinary(Binary):
    """Binary definition for the sqlite3 module loaded by Django.

    The loaded module's path and version are attributed to the venv pip
    provider when the module lives inside the venv, otherwise to the system
    pip provider. Installing only ever loads the binary.
    """

    name: BinName = 'sqlite'
    binproviders_supported: List[InstanceOf[BinProvider]] = Field(default=[VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER])
    overrides: BinaryOverrides = {
        VENV_PIP_BINPROVIDER.name: {
            "abspath": LOADED_SQLITE_PATH if LOADED_SQLITE_FROM_VENV else None,
            "version": LOADED_SQLITE_VERSION if LOADED_SQLITE_FROM_VENV else None,
        },
        SYS_PIP_BINPROVIDER.name: {
            "abspath": LOADED_SQLITE_PATH if not LOADED_SQLITE_FROM_VENV else None,
            "version": LOADED_SQLITE_VERSION if not LOADED_SQLITE_FROM_VENV else None,
        },
    }

    @model_validator(mode='after')
    def validate_json_extension_is_available(self):
        """Print a hint when the JSON function is unavailable in sqlite3.

        Runs a JSON() query against an in-memory database; an
        OperationalError is reported on stdout but never raised.

        Returns:
            self, as required for an ``after`` model validator.
        """
        # Check to make sure JSON extension is available in our Sqlite3 instance
        connection = None
        try:
            connection = django_sqlite3.connect(':memory:')
            connection.cursor().execute('SELECT JSON(\'{"a": "b"}\')')
        except django_sqlite3.OperationalError as exc:
            print(f'[red][X] Your SQLite3 version is missing the required JSON1 extension: {exc}[/red]')
            print(
                '[violet]Hint:[/violet] Upgrade your Python version or install the extension manually:\n' +
                '      https://code.djangoproject.com/wiki/JSON1Extension\n'
            )
        finally:
            # always release the probe connection instead of leaving it to the GC
            if connection is not None:
                connection.close()
        return self

    def install(self, **kwargs):
        """Load the binary instead of installing; keyword arguments are ignored."""
        # obviously it's already installed if we are running this ;)
        return self.load()

    def load_or_install(self, **kwargs):
        """Load the binary; keyword arguments are ignored."""
        # obviously it's already installed if we are running this ;)
        return self.load()


SQLITE_BINARY = SqliteBinary()
|
||||
|
||||
|
||||
LOADED_DJANGO_PATH = Path(django.__file__)
LOADED_DJANGO_VERSION = SemVer(django.VERSION[:3])
# compare by path components rather than string prefix (so /venv does not match /venv2);
# a venv provider with no venv path simply means "not from the venv"
LOADED_DJANGO_FROM_VENV = (
    VENV_PIP_BINPROVIDER.pip_venv is not None
    and LOADED_DJANGO_PATH.absolute().resolve().is_relative_to(VENV_PIP_BINPROVIDER.pip_venv.absolute().resolve())
)


class DjangoBinary(Binary):
    """Binary definition for the imported ``django`` package.

    The loaded package's path and version are attributed to the venv pip
    provider when the package lives inside the venv, otherwise to the system
    pip provider. Installing only ever loads the binary.
    """

    name: BinName = 'django'

    binproviders_supported: List[InstanceOf[BinProvider]] = Field(default=[VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER])
    overrides: BinaryOverrides = {
        VENV_PIP_BINPROVIDER.name: {
            "abspath": LOADED_DJANGO_PATH if LOADED_DJANGO_FROM_VENV else None,
            "version": LOADED_DJANGO_VERSION if LOADED_DJANGO_FROM_VENV else None,
        },
        SYS_PIP_BINPROVIDER.name: {
            "abspath": LOADED_DJANGO_PATH if not LOADED_DJANGO_FROM_VENV else None,
            "version": LOADED_DJANGO_VERSION if not LOADED_DJANGO_FROM_VENV else None,
        },
    }

    def install(self, **kwargs):
        """Load the binary instead of installing; keyword arguments are ignored."""
        # obviously it's already installed if we are running this ;)
        return self.load()

    def load_or_install(self, **kwargs):
        """Load the binary; keyword arguments are ignored."""
        # obviously it's already installed if we are running this ;)
        return self.load()


DJANGO_BINARY = DjangoBinary()
|
||||
|
||||
class PipBinary(Binary):
    """Binary definition for ``pip``; installing only ever loads it."""

    name: BinName = "pip"
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_PIP_BINPROVIDER,
        VENV_PIP_BINPROVIDER,
        SYS_PIP_BINPROVIDER,
        apt,
        brew,
        env,
    ]

    def install(self, **kwargs):
        """Load the binary instead of installing; keyword arguments are ignored."""
        # obviously it's already installed if we are running this ;)
        return self.load()

    def load_or_install(self, **kwargs):
        """Load the binary; keyword arguments are ignored."""
        # obviously it's already installed if we are running this ;)
        return self.load()


PIP_BINARY = PipBinary()
|
||||
|
||||
|
||||
class PipxBinary(Binary):
    """Binary definition for ``pipx``."""

    name: BinName = "pipx"
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_PIP_BINPROVIDER,
        VENV_PIP_BINPROVIDER,
        SYS_PIP_BINPROVIDER,
        apt,
        brew,
        env,
    ]


PIPX_BINARY = PipxBinary()
|
||||
@@ -0,0 +1,91 @@
|
||||
import os
|
||||
import sys
|
||||
import site
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
|
||||
from benedict import benedict
|
||||
|
||||
from pydantic_pkgr import PipProvider, BinName, BinProviderName
|
||||
|
||||
import abx
|
||||
|
||||
from abx_plugin_default_binproviders import get_BINPROVIDERS
|
||||
|
||||
DEFAULT_BINPROVIDERS = benedict(get_BINPROVIDERS())
|
||||
env = DEFAULT_BINPROVIDERS.env
|
||||
apt = DEFAULT_BINPROVIDERS.apt
|
||||
brew = DEFAULT_BINPROVIDERS.brew
|
||||
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
class SystemPipBinProvider(PipProvider):
    """pip provider registered as ``sys_pip``; it refuses to install anything."""

    name: BinProviderName = "sys_pip"
    INSTALLER_BIN: BinName = "pip"

    # global pip scope
    pip_venv: Optional[Path] = None

    def on_install(self, bin_name: str, **kwargs):
        """Return a refusal message instead of installing; all arguments are ignored."""
        # never modify system pip packages
        refusal = 'refusing to install packages globally with system pip, use a venv instead'
        return refusal
|
||||
|
||||
class SystemPipxBinProvider(PipProvider):
    """pip-style provider registered as ``pipx`` that uses ``pipx`` as its installer binary."""

    name: BinProviderName = "pipx"
    INSTALLER_BIN: BinName = "pipx"

    # global pipx scope
    pip_venv: Optional[Path] = None
|
||||
|
||||
|
||||
# True when sys.prefix differs from sys.base_prefix
IS_INSIDE_VENV = sys.prefix != sys.base_prefix


class VenvPipBinProvider(PipProvider):
    """pip provider registered as ``venv_pip``, bound to the currently active venv.

    ``pip_venv`` is ``sys.prefix`` when running inside a venv; otherwise it is
    the ``VIRTUAL_ENV`` environment variable, falling back to a placeholder path.
    """

    name: BinProviderName = "venv_pip"
    INSTALLER_BIN: BinName = "pip"

    pip_venv: Optional[Path] = Path(
        sys.prefix
        if IS_INSIDE_VENV
        else os.environ.get("VIRTUAL_ENV", '/tmp/NotInsideAVenv/lib')
    )

    def setup(self):
        """never attempt to create a venv here, this is just used to detect if we are inside an existing one"""
        return None
|
||||
|
||||
|
||||
class LibPipBinProvider(PipProvider):
    """pip provider registered as ``lib_pip`` that uses a venv under ``LIB_DIR/pip/venv``."""

    name: BinProviderName = "lib_pip"
    INSTALLER_BIN: BinName = "pip"

    # class-level default; replaced in setup() from the runtime config
    pip_venv: Optional[Path] = Path('/usr/local/share/abx/pip/venv')

    def setup(self) -> None:
        """Re-point pip_venv at the runtime LIB_DIR, then run the parent setup."""
        # update venv path to match most up-to-date LIB_DIR based on runtime config
        flat_config = abx.pm.hook.get_FLAT_CONFIG()
        self.pip_venv = flat_config.LIB_DIR / 'pip' / 'venv'
        super().setup()
|
||||
|
||||
# Module-level singleton instance of each pip-based provider
SYS_PIP_BINPROVIDER = SystemPipBinProvider()
PIPX_PIP_BINPROVIDER = SystemPipxBinProvider()
VENV_PIP_BINPROVIDER = VenvPipBinProvider()
# `pip` is an alias for the LIB_DIR-scoped provider
pip = LIB_PIP_BINPROVIDER = LibPipBinProvider()
|
||||
|
||||
# Ensure python libraries are importable from these locations
# (if archivebox wasn't executed from one of these then they won't already be in sys.path).
assert VENV_PIP_BINPROVIDER.pip_venv is not None
assert LIB_PIP_BINPROVIDER.pip_venv is not None

major, minor, patch = sys.version_info[:3]
site_packages_dir = f'lib/python{major}.{minor}/site-packages'

LIB_SITE_PACKAGES = (LIB_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
VENV_SITE_PACKAGES = (VENV_PIP_BINPROVIDER.pip_venv / site_packages_dir,)
USER_SITE_PACKAGES = site.getusersitepackages()   # a single path string, not a sequence of paths
SYS_SITE_PACKAGES = site.getsitepackages()        # a list of path strings

ALL_SITE_PACKAGES = (
    *LIB_SITE_PACKAGES,
    *VENV_SITE_PACKAGES,
    USER_SITE_PACKAGES,   # deliberately not unpacked: unpacking a str yields its individual characters
    *SYS_SITE_PACKAGES,
)

# A separate loop variable keeps `site_packages_dir` (the relative dir above) intact.
for _candidate_dir in ALL_SITE_PACKAGES:
    # sys.path holds strings, so compare the string form (a Path never equals a str)
    _candidate_str = str(_candidate_dir)
    if _candidate_str not in sys.path:
        sys.path.append(_candidate_str)
|
||||
@@ -0,0 +1,16 @@
|
||||
__package__ = 'pip'
|
||||
|
||||
from typing import List, Optional
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
|
||||
class PipDependencyConfigs(BaseConfigSet):
    """Config options for the pip dependency."""

    USE_PIP: bool = True
    PIP_BINARY: str = 'pip'
    PIP_ARGS: Optional[List[str]] = None
    PIP_EXTRA_ARGS: List[str] = []
    PIP_DEFAULT_ARGS: List[str] = []


# module-level instance imported by the rest of the plugin
PIP_CONFIG = PipDependencyConfigs()
|
||||
22
packages/abx-plugin-pip-binprovider/pyproject.toml
Normal file
22
packages/abx-plugin-pip-binprovider/pyproject.toml
Normal file
@@ -0,0 +1,22 @@
|
||||
[project]
|
||||
name = "abx-plugin-pip-binprovider"
|
||||
version = "2024.10.24"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"abx>=0.1.0",
|
||||
"pydantic-pkgr>=0.5.4",
|
||||
"abx-spec-config>=0.1.0",
|
||||
"abx-spec-pydantic-pkgr>=0.1.0",
|
||||
"abx-plugin-default-binproviders>=2024.10.24",
|
||||
"django>=5.0.0",
|
||||
]
|
||||
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project.entry-points.abx]
|
||||
abx_plugin_pip_binprovider = "abx_plugin_pip_binprovider"
|
||||
@@ -0,0 +1,32 @@
|
||||
__package__ = 'abx_plugin_playwright_binprovider'
|
||||
__id__ = 'playwright'
|
||||
__label__ = 'Playwright'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/microsoft/playwright-python'
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set, keyed by the plugin id."""
    from .config import PLAYWRIGHT_CONFIG

    config_by_plugin = {__id__: PLAYWRIGHT_CONFIG}
    return config_by_plugin
|
||||
|
||||
@abx.hookimpl
def get_BINARIES():
    """Return the binaries this plugin provides, keyed by name."""
    from .binaries import PLAYWRIGHT_BINARY

    binaries = {'playwright': PLAYWRIGHT_BINARY}
    return binaries
|
||||
|
||||
@abx.hookimpl
def get_BINPROVIDERS():
    """Return the binproviders this plugin provides, keyed by name."""
    from .binproviders import PLAYWRIGHT_BINPROVIDER

    binproviders = {'playwright': PLAYWRIGHT_BINPROVIDER}
    return binproviders
|
||||
@@ -0,0 +1,21 @@
|
||||
__package__ = 'abx_plugin_playwright_binprovider'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinName, BinProvider, Binary
|
||||
|
||||
|
||||
from abx_plugin_pip_binprovider.binproviders import LIB_PIP_BINPROVIDER, VENV_PIP_BINPROVIDER, SYS_PIP_BINPROVIDER
|
||||
from abx_plugin_default_binproviders import env
|
||||
|
||||
from .config import PLAYWRIGHT_CONFIG
|
||||
|
||||
|
||||
class PlaywrightBinary(Binary):
    """The playwright CLI binary; its name comes from PLAYWRIGHT_CONFIG.PLAYWRIGHT_BINARY."""

    name: BinName = PLAYWRIGHT_CONFIG.PLAYWRIGHT_BINARY

    # providers that may supply this binary, in listed order
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_PIP_BINPROVIDER,
        VENV_PIP_BINPROVIDER,
        SYS_PIP_BINPROVIDER,
        env,
    ]


# module-level instance imported by the rest of the plugin
PLAYWRIGHT_BINARY = PlaywrightBinary()
|
||||
@@ -0,0 +1,163 @@
|
||||
__package__ = 'abx_plugin_playwright_binprovider'
|
||||
|
||||
import os
|
||||
import shutil
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, ClassVar
|
||||
|
||||
from pydantic import computed_field, Field
|
||||
from pydantic_pkgr import (
|
||||
BinName,
|
||||
BinProvider,
|
||||
BinProviderName,
|
||||
BinProviderOverrides,
|
||||
InstallArgs,
|
||||
PATHStr,
|
||||
HostBinPath,
|
||||
bin_abspath,
|
||||
OPERATING_SYSTEM,
|
||||
DEFAULT_ENV_PATH,
|
||||
)
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
from .binaries import PLAYWRIGHT_BINARY
|
||||
|
||||
|
||||
# Per-OS playwright cache locations; the leading "~" is left unexpanded here,
# so users of these constants must call .expanduser() themselves.
MACOS_PLAYWRIGHT_CACHE_DIR: Path = Path("~/Library/Caches/ms-playwright")
LINUX_PLAYWRIGHT_CACHE_DIR: Path = Path("~/.cache/ms-playwright")
|
||||
|
||||
|
||||
class PlaywrightBinProvider(BinProvider):
    """Bin provider that installs and locates browser binaries using the ``playwright`` CLI.

    Only the ``chrome`` bin name is handled; it is installed via playwright's
    ``chromium`` package (see ``packages_handler``).
    """

    name: BinProviderName = "playwright"
    INSTALLER_BIN: BinName = PLAYWRIGHT_BINARY.name

    # import-time placeholder; setup() replaces it with a PATH based on the configured LIB_DIR
    PATH: PATHStr = f"{Path('/usr/share/abx') / 'bin'}:{DEFAULT_ENV_PATH}"

    # directory that is globbed for installed browser builds
    playwright_browsers_dir: Path = (
        MACOS_PLAYWRIGHT_CACHE_DIR.expanduser()
        if OPERATING_SYSTEM == "darwin" else
        LINUX_PLAYWRIGHT_CACHE_DIR.expanduser()
    )
    playwright_install_args: List[str] = ["install"]

    packages_handler: BinProviderOverrides = Field(default={
        "chrome": ["chromium"],
    }, exclude=True)

    # class-level cache of {bin_name: abspath} for browsers that were already located
    _browser_abspaths: ClassVar[Dict[str, HostBinPath]] = {}

    @computed_field
    @property
    def INSTALLER_BIN_ABSPATH(self) -> HostBinPath | None:
        """Return the abspath of the loaded playwright binary, or None if loading it fails."""
        try:
            return PLAYWRIGHT_BINARY.load().abspath
        except Exception:
            # best-effort lookup: any failure to load means the installer is unavailable
            return None

    def setup(self) -> None:
        """Refresh PATH from the runtime LIB_DIR and make sure the browsers dir exists."""
        # update paths from config at runtime
        LIB_DIR = abx.pm.hook.get_FLAT_CONFIG().LIB_DIR

        self.PATH = f"{LIB_DIR / 'bin'}:{DEFAULT_ENV_PATH}"

        assert shutil.which('pip'), "Pip bin provider not initialized"

        if self.playwright_browsers_dir:
            self.playwright_browsers_dir.mkdir(parents=True, exist_ok=True)

    def installed_browser_bins(self, browser_name: str = "*") -> List[Path]:
        """Return the sorted browser binaries found under ``playwright_browsers_dir``.

        ``browser_name`` is used as a glob prefix (``chrome`` is treated as ``chromium``).
        """
        if browser_name == 'chrome':
            browser_name = 'chromium'

        # if on macOS, browser binary is inside a .app, otherwise it's just a plain binary
        if platform.system().lower() == "darwin":
            # ~/Library/caches/ms-playwright/chromium-1097/chrome-mac/Chromium.app/Contents/MacOS/Chromium
            return sorted(
                self.playwright_browsers_dir.glob(
                    f"{browser_name}-*/*-mac*/*.app/Contents/MacOS/*"
                )
            )

        # ~/Library/caches/ms-playwright/chromium-1097/chrome-linux/chromium
        paths = []
        for path in sorted(self.playwright_browsers_dir.glob(f"{browser_name}-*/*-linux/*")):
            # skip helper files that live next to the browser binary
            if 'xdg-settings' in str(path):
                continue
            if 'ffmpeg' in str(path):
                continue
            if '/chrom' in str(path) and 'chrom' in path.name.lower():
                paths.append(path)
        return paths

    def default_abspath_handler(self, bin_name: BinName, **context) -> Optional[HostBinPath]:
        """Return the abspath of the installed chrome binary, or None if none is found.

        Lookup order: the class-level cache, then the playwright browsers dir,
        then ``google-chrome-stable`` on the env provider's PATH.
        """
        assert bin_name == "chrome", "Only chrome is supported using the playwright install method currently."

        # already loaded, return abspath from cache
        if bin_name in self._browser_abspaths:
            return self._browser_abspaths[bin_name]

        # first time loading, find browser in self.playwright_browsers_dir by searching filesystem for installed binaries
        matching_bins = [abspath for abspath in self.installed_browser_bins() if bin_name in str(abspath)]
        if matching_bins:
            newest_bin = matching_bins[-1]  # already sorted alphabetically, last should theoretically be highest version number
            self._browser_abspaths[bin_name] = newest_bin
            return self._browser_abspaths[bin_name]

        # playwright sometimes installs google-chrome-stable via apt into system $PATH, check there as well
        # (`env` is not imported at module level in this file, so bring it into scope here)
        from abx_plugin_default_binproviders import env

        abspath = bin_abspath('google-chrome-stable', PATH=env.PATH)
        if abspath:
            self._browser_abspaths[bin_name] = abspath
            return self._browser_abspaths[bin_name]

        return None

    def default_install_handler(self, bin_name: str, packages: Optional[InstallArgs] = None, **context) -> str:
        """Run ``playwright install <packages>`` for chrome and return the combined stderr/stdout.

        On non-macOS hosts ``playwright install-deps`` is run first; its failure is
        printed but does not abort the install.

        Raises:
            Exception: if the playwright binary cannot be found, or the install
                command exits with a non-zero return code.
        """
        self.setup()
        assert bin_name == "chrome", "Only chrome is supported using the playwright install method currently."

        if not self.INSTALLER_BIN_ABSPATH:
            raise Exception(
                f"{self.__class__.__name__} install method is not available on this host ({self.INSTALLER_BIN} not found in $PATH)"
            )
        packages = packages or self.get_packages(bin_name)

        # print(f'[*] {self.__class__.__name__}: Installing {bin_name}: {self.INSTALLER_BIN_ABSPATH} install {packages}')

        # playwright install-deps (to install system dependencies like fonts, graphics libraries, etc.)
        if platform.system().lower() != 'darwin':
            # libglib2.0-0, libnss3, libnspr4, libdbus-1-3, libatk1.0-0, libatk-bridge2.0-0, libcups2, libdrm2, libxcb1, libxkbcommon0, libatspi2.0-0, libx11-6, libxcomposite1, libxdamage1, libxext6, libxfixes3, libxrandr2, libgbm1, libcairo2, libasound2
            proc = self.exec(bin_name=self.INSTALLER_BIN_ABSPATH, cmd=['install-deps'])
            if proc.returncode != 0:
                print(proc.stdout.strip())
                print(proc.stderr.strip())

        proc = self.exec(bin_name=self.INSTALLER_BIN_ABSPATH, cmd=['install', *packages])

        if proc.returncode != 0:
            print(proc.stdout.strip())
            print(proc.stderr.strip())
            raise Exception(f"{self.__class__.__name__}: install got returncode {proc.returncode} while installing {packages}: {packages} PACKAGES={packages}")

        # chrome@129.0.6668.58 /data/lib/browsers/chrome/mac_arm-129.0.6668.58/chrome-mac-arm64/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing
        # playwright build v1010 downloaded to /home/squash/.cache/ms-playwright/ffmpeg-1010
        output_lines = [
            line for line in proc.stdout.strip().split('\n')
            if '/chrom' in line
            and 'chrom' in line.rsplit('/', 1)[-1].lower()  # if final path segment (filename) contains chrome or chromium
            and 'xdg-settings' not in line
            and 'ffmpeg' not in line
        ]
        if output_lines:
            relpath = output_lines[0].split(str(self.playwright_browsers_dir))[-1]
            # strip leading slashes: joining a Path with an absolute segment discards the base dir
            abspath = self.playwright_browsers_dir / relpath.lstrip('/')
            if os.path.isfile(abspath) and os.access(abspath, os.X_OK):
                self._browser_abspaths[bin_name] = abspath

        return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip()


# module-level instance imported by the rest of the plugin
PLAYWRIGHT_BINPROVIDER = PlaywrightBinProvider()
|
||||
@@ -0,0 +1,7 @@
|
||||
from abx_spec_config import BaseConfigSet
|
||||
|
||||
class PlaywrightConfigs(BaseConfigSet):
    """Config options for the playwright plugin."""

    # name of the playwright binary; used as PlaywrightBinary.name
    PLAYWRIGHT_BINARY: str = "playwright"


# module-level instance imported by the rest of the plugin
PLAYWRIGHT_CONFIG = PlaywrightConfigs()
|
||||
20
packages/abx-plugin-playwright-binprovider/pyproject.toml
Normal file
20
packages/abx-plugin-playwright-binprovider/pyproject.toml
Normal file
@@ -0,0 +1,20 @@
|
||||
[project]
|
||||
name = "abx-plugin-playwright-binprovider"
|
||||
version = "2024.10.24"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"abx>=0.1.0",
|
||||
"pydantic>=2.4.2",
|
||||
"pydantic-pkgr>=0.5.4",
|
||||
"abx-spec-pydantic-pkgr>=0.1.0",
|
||||
"abx-spec-config>=0.1.0",
|
||||
]
|
||||
|
||||
[build-system]
|
||||
requires = ["hatchling"]
|
||||
build-backend = "hatchling.build"
|
||||
|
||||
[project.entry-points.abx]
|
||||
abx_plugin_playwright_binprovider = "abx_plugin_playwright_binprovider"
|
||||
0
packages/abx-plugin-pocket-extractor/README.md
Normal file
0
packages/abx-plugin-pocket-extractor/README.md
Normal file
37
packages/abx-plugin-pocket-extractor/__init__.py
Normal file
37
packages/abx-plugin-pocket-extractor/__init__.py
Normal file
@@ -0,0 +1,37 @@
|
||||
__package__ = 'plugins_extractor.pocket'
|
||||
__id__ = 'pocket'
|
||||
__label__ = 'pocket'
|
||||
__version__ = '2024.10.21'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/dev/archivebox/plugins_extractor/pocket'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata, keyed by the plugin id."""
    plugin_info = {
        'id': __id__,
        'package': __package__,
        'label': __label__,
        'version': __version__,
        'author': __author__,
        'homepage': __homepage__,
        'dependencies': __dependencies__,
    }
    return {__id__: plugin_info}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set, keyed by the plugin id."""
    from .config import POCKET_CONFIG

    config_by_plugin = {__id__: POCKET_CONFIG}
    return config_by_plugin
|
||||
|
||||
@abx.hookimpl
def ready():
    """Validate the pocket config when the ``ready`` hook fires."""
    from .config import POCKET_CONFIG as pocket_config

    pocket_config.validate()
|
||||
15
packages/abx-plugin-pocket-extractor/config.py
Normal file
15
packages/abx-plugin-pocket-extractor/config.py
Normal file
@@ -0,0 +1,15 @@
|
||||
__package__ = 'plugins_extractor.pocket'
|
||||
|
||||
from typing import Dict
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
|
||||
class PocketConfig(BaseConfigSet):
    """Config options for the Pocket plugin."""

    POCKET_CONSUMER_KEY: str | None = None

    # {<username>: <access_token>, ...}
    # NOTE(review): the default is a callable rather than a default_factory;
    # presumably BaseConfigSet resolves callable defaults - confirm.
    POCKET_ACCESS_TOKENS: Dict[str, str] = Field(default=lambda: {})


# module-level instance imported by the rest of the plugin
POCKET_CONFIG = PocketConfig()
|
||||
7
packages/abx-plugin-pocket-extractor/pyproject.toml
Normal file
7
packages/abx-plugin-pocket-extractor/pyproject.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-pocket-extractor"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
0
packages/abx-plugin-puppeteer-binprovider/README.md
Normal file
0
packages/abx-plugin-puppeteer-binprovider/README.md
Normal file
46
packages/abx-plugin-puppeteer-binprovider/__init__.py
Normal file
46
packages/abx-plugin-puppeteer-binprovider/__init__.py
Normal file
@@ -0,0 +1,46 @@
|
||||
__package__ = 'plugins_pkg.puppeteer'
|
||||
__label__ = 'puppeteer'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/puppeteer/puppeteer'
|
||||
__dependencies__ = ['npm']
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata under the 'puppeteer' key."""
    plugin_info = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'puppeteer': plugin_info}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set under the 'puppeteer' key."""
    from .config import PUPPETEER_CONFIG

    config_by_plugin = {'puppeteer': PUPPETEER_CONFIG}
    return config_by_plugin
|
||||
|
||||
@abx.hookimpl
def get_BINARIES():
    """Return the binaries this plugin provides, keyed by name."""
    from .binaries import PUPPETEER_BINARY

    binaries = {'puppeteer': PUPPETEER_BINARY}
    return binaries
|
||||
|
||||
@abx.hookimpl
def get_BINPROVIDERS():
    """Return the binproviders this plugin provides, keyed by name."""
    from .binproviders import PUPPETEER_BINPROVIDER

    binproviders = {'puppeteer': PUPPETEER_BINPROVIDER}
    return binproviders
|
||||
23
packages/abx-plugin-puppeteer-binprovider/binaries.py
Normal file
23
packages/abx-plugin-puppeteer-binprovider/binaries.py
Normal file
@@ -0,0 +1,23 @@
|
||||
__package__ = 'plugins_pkg.puppeteer'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinName
|
||||
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env
|
||||
|
||||
from plugins_pkg.npm.binproviders import LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER
|
||||
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
|
||||
class PuppeteerBinary(BaseBinary):
    """The ``puppeteer`` binary, supplied by the npm providers or the env provider."""

    name: BinName = "puppeteer"

    # providers that may supply this binary, in listed order
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_NPM_BINPROVIDER,
        SYS_NPM_BINPROVIDER,
        env,
    ]


# module-level instance imported by the rest of the plugin
PUPPETEER_BINARY = PuppeteerBinary()
|
||||
131
packages/abx-plugin-puppeteer-binprovider/binproviders.py
Normal file
131
packages/abx-plugin-puppeteer-binprovider/binproviders.py
Normal file
@@ -0,0 +1,131 @@
|
||||
__package__ = 'plugins_pkg.puppeteer'
|
||||
|
||||
import os
|
||||
import platform
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Dict, ClassVar
|
||||
|
||||
from pydantic import Field
|
||||
from pydantic_pkgr import (
|
||||
BinName,
|
||||
BinProviderName,
|
||||
BinProviderOverrides,
|
||||
InstallArgs,
|
||||
PATHStr,
|
||||
HostBinPath,
|
||||
)
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.config.permissions import ARCHIVEBOX_USER
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinProvider
|
||||
|
||||
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER
|
||||
|
||||
|
||||
class PuppeteerBinProvider(BaseBinProvider):
    """Bin provider that installs and locates chrome via ``npx @puppeteer/browsers``.

    Only the ``chrome`` bin name is handled.
    """

    name: BinProviderName = "puppeteer"
    INSTALLER_BIN: BinName = "npx"

    # import-time default; setup() replaces it with a path under the configured LIB_DIR
    PATH: PATHStr = str(CONSTANTS.DEFAULT_LIB_DIR / 'bin')

    euid: Optional[int] = ARCHIVEBOX_USER

    # import-time default; setup() replaces it with <LIB_DIR>/browsers
    puppeteer_browsers_dir: Path = CONSTANTS.DEFAULT_LIB_DIR / 'browsers'
    puppeteer_install_args: List[str] = ['--yes', "@puppeteer/browsers", "install"]

    packages_handler: BinProviderOverrides = Field(default={
        "chrome": lambda:
            ['chrome@stable'],
    }, exclude=True)

    # class-level cache of {bin_name: abspath} for browsers that were already located
    _browser_abspaths: ClassVar[Dict[str, HostBinPath]] = {}

    def setup(self) -> None:
        """Refresh the browsers dir and PATH from STORAGE_CONFIG and create the browsers dir."""
        # paths are updated here, at setup time, so that archivebox.config.common is not imported at import-time
        # we want to avoid depending on archivebox from abx code if at all possible
        from archivebox.config.common import STORAGE_CONFIG
        self.puppeteer_browsers_dir = STORAGE_CONFIG.LIB_DIR / 'browsers'
        self.PATH = str(STORAGE_CONFIG.LIB_DIR / 'bin')

        assert SYS_NPM_BINPROVIDER.INSTALLER_BIN_ABSPATH, "NPM bin provider not initialized"

        if self.puppeteer_browsers_dir:
            self.puppeteer_browsers_dir.mkdir(parents=True, exist_ok=True)

    def installed_browser_bins(self, browser_name: str='*') -> List[Path]:
        """Return the sorted chrome binaries found under ``puppeteer_browsers_dir``.

        ``browser_name`` is used as the first glob segment.
        """
        # if on macOS, browser binary is inside a .app, otherwise it's just a plain binary
        if platform.system().lower() == 'darwin':
            # /data/lib/browsers/chrome/mac_arm-129.0.6668.58/chrome-mac-arm64/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing
            return sorted(self.puppeteer_browsers_dir.glob(f'{browser_name}/mac*/chrome*/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing'))

        # /data/lib/browsers/chrome/linux-131.0.6730.0/chrome-linux64/chrome
        # /data/lib/aarch64-linux/browsers/chrome/linux-129.0.6668.100/chrome-linux64/chrome
        return sorted(self.puppeteer_browsers_dir.glob(f"{browser_name}/linux*/chrome*/chrome"))

    def default_abspath_handler(self, bin_name: BinName, **context) -> Optional[HostBinPath]:
        """Return the abspath of the installed chrome binary, or None if none is found.

        The class-level cache is checked first, then the puppeteer browsers dir.
        """
        assert bin_name == 'chrome', 'Only chrome is supported using the @puppeteer/browsers install method currently.'

        # already loaded, return abspath from cache
        if bin_name in self._browser_abspaths:
            return self._browser_abspaths[bin_name]

        # first time loading, find browser in self.puppeteer_browsers_dir by searching filesystem for installed binaries
        matching_bins = [abspath for abspath in self.installed_browser_bins() if bin_name in str(abspath)]
        if matching_bins:
            newest_bin = matching_bins[-1]  # already sorted alphabetically, last should theoretically be highest version number
            self._browser_abspaths[bin_name] = newest_bin
            return newest_bin

        return None

    def default_install_handler(self, bin_name: str, packages: Optional[InstallArgs] = None, **context) -> str:
        """Run ``npx @puppeteer/browsers install chrome@stable`` into ``puppeteer_browsers_dir``.

        Returns the installed binary's path when it can be parsed from the command
        output and is an executable file, otherwise the combined stderr/stdout.

        Raises:
            Exception: if the installer binary cannot be found, or the install
                command exits with a non-zero return code.
        """
        self.setup()
        assert bin_name == 'chrome', 'Only chrome is supported using the @puppeteer/browsers install method currently.'

        if not self.INSTALLER_BIN_ABSPATH:
            raise Exception(
                f"{self.__class__.__name__} install method is not available on this host ({self.INSTALLER_BIN} not found in $PATH)"
            )
        packages = packages or self.get_packages(bin_name)
        assert packages, f"No packages specified for installation of {bin_name}"

        # print(f'[*] {self.__class__.__name__}: Installing {bin_name}: {self.INSTALLER_BIN_ABSPATH} install {packages}')

        install_args = [*self.puppeteer_install_args, "--path", str(self.puppeteer_browsers_dir)]

        proc = self.exec(bin_name=self.INSTALLER_BIN_ABSPATH, cmd=[*install_args, *packages])

        if proc.returncode != 0:
            print(proc.stdout.strip())
            print(proc.stderr.strip())
            raise Exception(f"{self.__class__.__name__}: install got returncode {proc.returncode} while installing {packages}: {packages}")

        # chrome@129.0.6668.91 /tmp/test3/lib/x86_64-linux/browsers/chrome/linux-129.0.6668.91/chrome-linux64/chrome
        # chrome@129.0.6668.58 /data/lib/browsers/chrome/mac_arm-129.0.6668.58/chrome-mac-arm64/Google Chrome for Testing.app/Contents/MacOS/Google Chrome for Testing
        # /data/lib/aarch64-linux/browsers/chrome/linux-129.0.6668.100/chrome-linux64/chrome
        relpath = proc.stdout.strip().split(str(self.puppeteer_browsers_dir))[-1].split('\n', 1)[0]
        # strip leading slashes: joining a Path with an absolute segment discards the base dir
        abspath = self.puppeteer_browsers_dir / relpath.lstrip('/')

        if os.path.isfile(abspath) and os.access(abspath, os.X_OK):
            self._browser_abspaths[bin_name] = abspath
            # NOTE(review): a Path is returned here although the signature is annotated `-> str` - confirm what callers expect
            return abspath

        return (proc.stderr.strip() + "\n" + proc.stdout.strip()).strip()


# module-level instance imported by the rest of the plugin
PUPPETEER_BINPROVIDER = PuppeteerBinProvider()
|
||||
|
||||
|
||||
# ALTERNATIVE INSTALL METHOD using Ansible:
|
||||
# install_playbook = self.plugin_dir / 'install_puppeteer.yml'
|
||||
# chrome_bin = run_playbook(install_playbook, data_dir=DATA_DIR, quiet=quiet).BINARIES.chrome
|
||||
# return self.__class__.model_validate(
|
||||
# {
|
||||
# **self.model_dump(),
|
||||
# "loaded_abspath": chrome_bin.symlink,
|
||||
# "loaded_version": chrome_bin.version,
|
||||
# "loaded_binprovider": env,
|
||||
# "binproviders_supported": self.binproviders_supported,
|
||||
# }
|
||||
# )
|
||||
18
packages/abx-plugin-puppeteer-binprovider/config.py
Normal file
18
packages/abx-plugin-puppeteer-binprovider/config.py
Normal file
@@ -0,0 +1,18 @@
|
||||
__package__ = 'plugins_pkg.puppeteer'
|
||||
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
|
||||
class PuppeteerConfig(BaseConfigSet):
    """Config options for the puppeteer plugin."""

    PUPPETEER_BINARY: str = "puppeteer"

    # Options that are currently disabled:
    # PUPPETEER_ARGS: Optional[List[str]] = Field(default=None)
    # PUPPETEER_EXTRA_ARGS: List[str] = []
    # PUPPETEER_DEFAULT_ARGS: List[str] = ['--timeout={TIMEOUT-10}']


# module-level instance imported by the rest of the plugin
PUPPETEER_CONFIG = PuppeteerConfig()
|
||||
7
packages/abx-plugin-puppeteer-binprovider/pyproject.toml
Normal file
7
packages/abx-plugin-puppeteer-binprovider/pyproject.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-puppeteer-binprovider"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
0
packages/abx-plugin-readability-extractor/README.md
Normal file
0
packages/abx-plugin-readability-extractor/README.md
Normal file
46
packages/abx-plugin-readability-extractor/__init__.py
Normal file
46
packages/abx-plugin-readability-extractor/__init__.py
Normal file
@@ -0,0 +1,46 @@
|
||||
__package__ = 'plugins_extractor.readability'
|
||||
__label__ = 'readability'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/ArchiveBox/readability-extractor'
|
||||
__dependencies__ = ['npm']
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata under the 'readability' key."""
    plugin_info = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'readability': plugin_info}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set under the 'readability' key."""
    from .config import READABILITY_CONFIG

    config_by_plugin = {'readability': READABILITY_CONFIG}
    return config_by_plugin
|
||||
|
||||
@abx.hookimpl
def get_BINARIES():
    """Return the binaries this plugin provides, keyed by name."""
    from .binaries import READABILITY_BINARY

    binaries = {'readability': READABILITY_BINARY}
    return binaries
|
||||
|
||||
@abx.hookimpl
def get_EXTRACTORS():
    """Return the extractors this plugin provides, keyed by name."""
    from .extractors import READABILITY_EXTRACTOR

    extractors = {'readability': READABILITY_EXTRACTOR}
    return extractors
|
||||
27
packages/abx-plugin-readability-extractor/binaries.py
Normal file
27
packages/abx-plugin-readability-extractor/binaries.py
Normal file
@@ -0,0 +1,27 @@
|
||||
__package__ = 'plugins_extractor.readability'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env
|
||||
|
||||
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||
|
||||
from .config import READABILITY_CONFIG
|
||||
|
||||
|
||||
# package spec handed to the npm providers in the overrides below
READABILITY_PACKAGE_NAME = 'github:ArchiveBox/readability-extractor'


class ReadabilityBinary(BaseBinary):
    """The readability extractor binary; its name comes from READABILITY_CONFIG.READABILITY_BINARY."""

    name: BinName = READABILITY_CONFIG.READABILITY_BINARY

    # providers that may supply this binary, in listed order
    binproviders_supported: List[InstanceOf[BinProvider]] = [
        LIB_NPM_BINPROVIDER,
        SYS_NPM_BINPROVIDER,
        env,
    ]

    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME]},
        # install is a no-op for the system provider, to prevent modifying system global npm packages
        SYS_NPM_BINPROVIDER.name: {"packages": [READABILITY_PACKAGE_NAME], "install": lambda: None},
    }


# module-level instance imported by the rest of the plugin
READABILITY_BINARY = ReadabilityBinary()
|
||||
19
packages/abx-plugin-readability-extractor/config.py
Normal file
19
packages/abx-plugin-readability-extractor/config.py
Normal file
@@ -0,0 +1,19 @@
|
||||
__package__ = 'plugins_extractor.readability'
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
class ReadabilityConfig(BaseConfigSet):
    """Config options for the readability extractor."""

    # also settable through the USE_READABILITY alias
    SAVE_READABILITY: bool = Field(default=True, alias='USE_READABILITY')

    # the default is a callable that reads ARCHIVING_CONFIG.TIMEOUT
    # NOTE(review): presumably BaseConfigSet resolves callable defaults - confirm
    READABILITY_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)

    READABILITY_BINARY: str = Field(default='readability-extractor')
    # READABILITY_EXTRA_ARGS: List[str] = []   # readability-extractor doesn't take any extra args


# module-level instance imported by the rest of the plugin
READABILITY_CONFIG = ReadabilityConfig()
|
||||
20
packages/abx-plugin-readability-extractor/extractors.py
Normal file
20
packages/abx-plugin-readability-extractor/extractors.py
Normal file
@@ -0,0 +1,20 @@
|
||||
__package__ = 'plugins_extractor.readability'
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic_pkgr import BinName
|
||||
|
||||
from abx.archivebox.base_extractor import BaseExtractor
|
||||
|
||||
from .binaries import READABILITY_BINARY
|
||||
|
||||
|
||||
class ReadabilityExtractor(BaseExtractor):
    """Extractor definition for readability, backed by the READABILITY_BINARY binary."""

    name: str = 'readability'
    binary: BinName = READABILITY_BINARY.name

    def get_output_path(self, snapshot) -> Path:
        """Return ``<snapshot.link_dir>/readability/content.html``."""
        output_dir = Path(snapshot.link_dir) / 'readability'
        return output_dir / 'content.html'


# module-level instance imported by the rest of the plugin
READABILITY_EXTRACTOR = ReadabilityExtractor()
|
||||
7
packages/abx-plugin-readability-extractor/pyproject.toml
Normal file
7
packages/abx-plugin-readability-extractor/pyproject.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-readability-extractor"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
0
packages/abx-plugin-readwise-extractor/README.md
Normal file
0
packages/abx-plugin-readwise-extractor/README.md
Normal file
37
packages/abx-plugin-readwise-extractor/__init__.py
Normal file
37
packages/abx-plugin-readwise-extractor/__init__.py
Normal file
@@ -0,0 +1,37 @@
|
||||
__package__ = 'plugins_extractor.readwise'
|
||||
__id__ = 'readwise'
|
||||
__label__ = 'readwise'
|
||||
__version__ = '2024.10.21'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/ArchiveBox/ArchiveBox/tree/dev/archivebox/plugins_extractor/readwise'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata, keyed by the plugin id."""
    plugin_info = {
        'id': __id__,
        'package': __package__,
        'label': __label__,
        'version': __version__,
        'author': __author__,
        'homepage': __homepage__,
        'dependencies': __dependencies__,
    }
    return {__id__: plugin_info}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return this plugin's config set, keyed by the plugin id."""
    from .config import READWISE_CONFIG

    config_by_plugin = {__id__: READWISE_CONFIG}
    return config_by_plugin
|
||||
|
||||
@abx.hookimpl
def ready():
    """Validate the readwise config when the ``ready`` hook fires."""
    from .config import READWISE_CONFIG as readwise_config

    readwise_config.validate()
|
||||
17
packages/abx-plugin-readwise-extractor/config.py
Normal file
17
packages/abx-plugin-readwise-extractor/config.py
Normal file
@@ -0,0 +1,17 @@
|
||||
__package__ = 'plugins_extractor.readwise'
|
||||
|
||||
from typing import Dict
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
|
||||
|
||||
class ReadwiseConfig(BaseConfigSet):
    """Config options for the Readwise plugin."""

    # defaults to readwise_reader_api.db inside CONSTANTS.SOURCES_DIR
    READWISE_DB_PATH: Path = Field(default=CONSTANTS.SOURCES_DIR / "readwise_reader_api.db")

    # {<username>: <access_token>, ...}
    # NOTE(review): the default is a callable rather than a default_factory;
    # presumably BaseConfigSet resolves callable defaults - confirm.
    READWISE_READER_TOKENS: Dict[str, str] = Field(default=lambda: {})


# module-level instance imported by the rest of the plugin
READWISE_CONFIG = ReadwiseConfig()
|
||||
7
packages/abx-plugin-readwise-extractor/pyproject.toml
Normal file
7
packages/abx-plugin-readwise-extractor/pyproject.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-readwise-extractor"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
0
packages/abx-plugin-ripgrep-search/README.md
Normal file
0
packages/abx-plugin-ripgrep-search/README.md
Normal file
48
packages/abx-plugin-ripgrep-search/__init__.py
Normal file
48
packages/abx-plugin-ripgrep-search/__init__.py
Normal file
@@ -0,0 +1,48 @@
|
||||
__package__ = 'plugins_search.ripgrep'
|
||||
__label__ = 'ripgrep'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/BurntSushi/ripgrep'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata under the 'ripgrep' key."""
    plugin_info = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'ripgrep': plugin_info}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return the ripgrep config set, keyed by the plugin id 'ripgrep'."""
    # Imported inside the hook so the config module is only loaded when the hook runs.
    from .config import RIPGREP_CONFIG as ripgrep_config

    configs = {'ripgrep': ripgrep_config}
    return configs
|
||||
|
||||
|
||||
@abx.hookimpl
def get_BINARIES():
    """Return the ripgrep binary definition, keyed by the plugin id 'ripgrep'."""
    # Imported inside the hook so the binaries module is only loaded when the hook runs.
    from .binaries import RIPGREP_BINARY as ripgrep_binary

    binaries = {'ripgrep': ripgrep_binary}
    return binaries
|
||||
|
||||
|
||||
@abx.hookimpl
def get_SEARCHBACKENDS():
    """Return the ripgrep search backend, keyed by the plugin id 'ripgrep'."""
    # Imported inside the hook so the searchbackend module is only loaded when the hook runs.
    from .searchbackend import RIPGREP_SEARCH_BACKEND as ripgrep_backend

    backends = {'ripgrep': ripgrep_backend}
    return backends
|
||||
23
packages/abx-plugin-ripgrep-search/binaries.py
Normal file
23
packages/abx-plugin-ripgrep-search/binaries.py
Normal file
@@ -0,0 +1,23 @@
|
||||
__package__ = 'plugins_search.ripgrep'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env, apt, brew
|
||||
|
||||
|
||||
from .config import RIPGREP_CONFIG
|
||||
|
||||
|
||||
class RipgrepBinary(BaseBinary):
    """Binary definition for ripgrep; its name is read from RIPGREP_CONFIG.RIPGREP_BINARY."""

    name: BinName = RIPGREP_CONFIG.RIPGREP_BINARY
    # Supported providers, listed in the order apt, brew, env.
    binproviders_supported: List[InstanceOf[BinProvider]] = [apt, brew, env]

    # Both package-manager providers are given the package name 'ripgrep'.
    overrides: BinaryOverrides = {
        apt.name: {'packages': ['ripgrep']},
        brew.name: {'packages': ['ripgrep']},
    }


# Shared module-level instance of the binary definition.
RIPGREP_BINARY = RipgrepBinary()
|
||||
29
packages/abx-plugin-ripgrep-search/config.py
Normal file
29
packages/abx-plugin-ripgrep-search/config.py
Normal file
@@ -0,0 +1,29 @@
|
||||
__package__ = 'plugins_search.ripgrep'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config import CONSTANTS
|
||||
from archivebox.config.common import SEARCH_BACKEND_CONFIG
|
||||
|
||||
|
||||
class RipgrepConfig(BaseConfigSet):
    """Config options for the ripgrep search backend."""

    RIPGREP_BINARY: str = Field(default='rg')

    # Comma-separated file extensions, interpolated into the --type-add glob below.
    RIPGREP_IGNORE_EXTENSIONS: str = Field(default='css,js,orig,svg')
    # The default is a callable taking `c` and reading c.RIPGREP_IGNORE_EXTENSIONS.
    # NOTE(review): presumably BaseConfigSet calls it with the config instance -- confirm.
    RIPGREP_ARGS_DEFAULT: List[str] = Field(default=lambda c: [
        # https://github.com/BurntSushi/ripgrep/blob/master/GUIDE.md
        # The doubled braces emit literal braces, e.g. --type-add=ignore:*.{css,js,orig,svg}
        f'--type-add=ignore:*.{{{c.RIPGREP_IGNORE_EXTENSIONS}}}',
        '--type-not=ignore',
        '--ignore-case',
        '--files-with-matches',
        # Last on purpose: the search backend appends the query text right after these args.
        '--regexp',
    ])
    RIPGREP_SEARCH_DIR: Path = CONSTANTS.ARCHIVE_DIR
    # Callable default that reads SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT.
    RIPGREP_TIMEOUT: int = Field(default=lambda: SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_TIMEOUT)


# Shared module-level instance of the config set.
RIPGREP_CONFIG = RipgrepConfig()
|
||||
7
packages/abx-plugin-ripgrep-search/pyproject.toml
Normal file
7
packages/abx-plugin-ripgrep-search/pyproject.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-ripgrep-search"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
55
packages/abx-plugin-ripgrep-search/searchbackend.py
Normal file
55
packages/abx-plugin-ripgrep-search/searchbackend.py
Normal file
@@ -0,0 +1,55 @@
|
||||
__package__ = 'plugins_search.ripgrep'
|
||||
|
||||
import re
|
||||
import subprocess
|
||||
|
||||
from typing import List, Iterable
|
||||
|
||||
from abx.archivebox.base_searchbackend import BaseSearchBackend
|
||||
|
||||
from .binaries import RIPGREP_BINARY
|
||||
from .config import RIPGREP_CONFIG
|
||||
|
||||
|
||||
|
||||
# regex to match archive/<ts>/... snapshot dir names
TIMESTAMP_REGEX = re.compile(r'\/([\d]+\.[\d]+)\/')


class RipgrepSearchBackend(BaseSearchBackend):
    """Search backend that runs ripgrep over RIPGREP_CONFIG.RIPGREP_SEARCH_DIR.

    Nothing is indexed ahead of time: ``index`` and ``flush`` do nothing, and
    each ``search`` call spawns one ripgrep process.
    """

    name: str = 'ripgrep'
    docs_url: str = 'https://github.com/BurntSushi/ripgrep'

    @staticmethod
    def index(snapshot_id: str, texts: List[str]):
        """Do nothing; this backend keeps no index."""
        return

    @staticmethod
    def flush(snapshot_ids: Iterable[str]):
        """Do nothing; this backend keeps no index."""
        return

    @staticmethod
    def search(text: str) -> List[str]:
        """Return the primary keys (as strings) of snapshots whose files match *text*.

        *text* is passed to ripgrep as the argument of ``--regexp`` (the last
        item of RIPGREP_ARGS_DEFAULT). Each output line is scanned for a
        ``/<digits>.<digits>/`` path segment, and the collected timestamps are
        looked up in the Snapshot table.

        Raises:
            Exception: if the loaded ripgrep binary reports no version.
        """
        from core.models import Snapshot

        binary = RIPGREP_BINARY.load()
        if not binary.version:
            raise Exception("ripgrep binary not found, install ripgrep to use this search backend")

        argv = [binary.abspath]
        argv.extend(RIPGREP_CONFIG.RIPGREP_ARGS_DEFAULT)
        argv.append(text)
        argv.append(str(RIPGREP_CONFIG.RIPGREP_SEARCH_DIR))

        result = subprocess.run(argv, timeout=RIPGREP_CONFIG.RIPGREP_TIMEOUT, capture_output=True, text=True)

        # Only the first timestamp-looking segment of each output line is used.
        hits = (TIMESTAMP_REGEX.search(line) for line in result.stdout.splitlines())
        timestamps = {hit.group(1) for hit in hits if hit}

        matching_pks = Snapshot.objects.filter(timestamp__in=timestamps).values_list('pk', flat=True)
        return [str(pk) for pk in matching_pks]


RIPGREP_SEARCH_BACKEND = RipgrepSearchBackend()
|
||||
0
packages/abx-plugin-singlefile-extractor/README.md
Normal file
0
packages/abx-plugin-singlefile-extractor/README.md
Normal file
51
packages/abx-plugin-singlefile-extractor/__init__.py
Normal file
51
packages/abx-plugin-singlefile-extractor/__init__.py
Normal file
@@ -0,0 +1,51 @@
|
||||
__package__ = 'plugins_extractor.singlefile'
|
||||
__label__ = 'singlefile'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/gildas-lormeau/singlefile'
|
||||
__dependencies__ = ['npm']
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata, keyed by the plugin id 'singlefile'."""
    plugin_info = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'singlefile': plugin_info}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return the singlefile config set, keyed by the plugin id 'singlefile'."""
    # Imported inside the hook so the config module is only loaded when the hook runs.
    from .config import SINGLEFILE_CONFIG as singlefile_config

    configs = {'singlefile': singlefile_config}
    return configs
|
||||
|
||||
@abx.hookimpl
def get_BINARIES():
    """Return the singlefile binary definition, keyed by the plugin id 'singlefile'."""
    # Imported inside the hook so the binaries module is only loaded when the hook runs.
    from .binaries import SINGLEFILE_BINARY as singlefile_binary

    binaries = {'singlefile': singlefile_binary}
    return binaries
|
||||
|
||||
@abx.hookimpl
def get_EXTRACTORS():
    """Return the singlefile extractor, keyed by the plugin id 'singlefile'."""
    # Imported inside the hook so the extractors module is only loaded when the hook runs.
    from .extractors import SINGLEFILE_EXTRACTOR as singlefile_extractor

    extractors = {'singlefile': singlefile_extractor}
    return extractors
|
||||
|
||||
# @abx.hookimpl
|
||||
# def get_INSTALLED_APPS():
|
||||
# # needed to load ./models.py
|
||||
# return [__package__]
|
||||
48
packages/abx-plugin-singlefile-extractor/binaries.py
Normal file
48
packages/abx-plugin-singlefile-extractor/binaries.py
Normal file
@@ -0,0 +1,48 @@
|
||||
__package__ = 'plugins_extractor.singlefile'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName, bin_abspath
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env
|
||||
|
||||
from plugins_pkg.npm.binproviders import SYS_NPM_BINPROVIDER, LIB_NPM_BINPROVIDER
|
||||
|
||||
from .config import SINGLEFILE_CONFIG
|
||||
|
||||
|
||||
SINGLEFILE_MIN_VERSION = '1.1.54'
SINGLEFILE_MAX_VERSION = '1.1.60'


def _find_singlefile_abspath(search_path):
    """Return the first single-file executable that bin_abspath finds on *search_path*.

    The configured SINGLEFILE_BINARY name is tried first, then 'single-file',
    then 'single-file-node.js'. If none is found, the (falsy) result of the
    last lookup is returned.
    """
    found = None
    for candidate in (SINGLEFILE_CONFIG.SINGLEFILE_BINARY, 'single-file', 'single-file-node.js'):
        found = bin_abspath(candidate, PATH=search_path)
        if found:
            break
    return found


class SinglefileBinary(BaseBinary):
    """Binary definition for single-file; its name is read from SINGLEFILE_CONFIG.SINGLEFILE_BINARY."""

    name: BinName = SINGLEFILE_CONFIG.SINGLEFILE_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [LIB_NPM_BINPROVIDER, SYS_NPM_BINPROVIDER, env]

    # Every "abspath" override stays a zero-argument callable, so the provider's
    # PATH is read when the callable runs rather than when this class is defined.
    overrides: BinaryOverrides = {
        LIB_NPM_BINPROVIDER.name: {
            "abspath": lambda: _find_singlefile_abspath(LIB_NPM_BINPROVIDER.PATH),
            "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
        },
        SYS_NPM_BINPROVIDER.name: {
            "abspath": lambda: _find_singlefile_abspath(SYS_NPM_BINPROVIDER.PATH),
            "packages": [f"single-file-cli@>={SINGLEFILE_MIN_VERSION} <{SINGLEFILE_MAX_VERSION}"],
            # The install override for the system npm provider is a callable returning None.
            "install": lambda: None,
        },
        env.name: {
            'abspath': lambda: _find_singlefile_abspath(env.PATH),
        },
    }


SINGLEFILE_BINARY = SinglefileBinary()
|
||||
25
packages/abx-plugin-singlefile-extractor/config.py
Normal file
25
packages/abx-plugin-singlefile-extractor/config.py
Normal file
@@ -0,0 +1,25 @@
|
||||
__package__ = 'plugins_extractor.singlefile'
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List, Optional
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config.common import ARCHIVING_CONFIG
|
||||
|
||||
|
||||
class SinglefileConfig(BaseConfigSet):
    """Config options for the singlefile extractor plugin."""

    SAVE_SINGLEFILE: bool = True

    # The next four defaults are callables that read the matching ARCHIVING_CONFIG value.
    # NOTE(review): presumably BaseConfigSet resolves callable defaults -- confirm.
    SINGLEFILE_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    SINGLEFILE_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT)
    SINGLEFILE_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)
    SINGLEFILE_COOKIES_FILE: Optional[Path] = Field(default=lambda: ARCHIVING_CONFIG.COOKIES_FILE)

    SINGLEFILE_BINARY: str = Field(default='single-file')
    SINGLEFILE_EXTRA_ARGS: List[str] = []


# Shared module-level instance of the config set.
SINGLEFILE_CONFIG = SinglefileConfig()
|
||||
19
packages/abx-plugin-singlefile-extractor/extractors.py
Normal file
19
packages/abx-plugin-singlefile-extractor/extractors.py
Normal file
@@ -0,0 +1,19 @@
|
||||
__package__ = 'plugins_extractor.singlefile'
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
from pydantic_pkgr import BinName
|
||||
from abx.archivebox.base_extractor import BaseExtractor
|
||||
|
||||
from .binaries import SINGLEFILE_BINARY
|
||||
|
||||
|
||||
class SinglefileExtractor(BaseExtractor):
    """Extractor definition for singlefile; its binary name is taken from SINGLEFILE_BINARY."""

    name: str = 'singlefile'
    binary: BinName = SINGLEFILE_BINARY.name

    def get_output_path(self, snapshot) -> Path:
        """Return the path of 'singlefile.html' inside ``snapshot.link_dir``."""
        snapshot_dir = Path(snapshot.link_dir)
        return snapshot_dir / 'singlefile.html'


SINGLEFILE_EXTRACTOR = SinglefileExtractor()
|
||||
14
packages/abx-plugin-singlefile-extractor/models.py
Normal file
14
packages/abx-plugin-singlefile-extractor/models.py
Normal file
@@ -0,0 +1,14 @@
|
||||
from django.db import models
|
||||
|
||||
from core.models import ArchiveResult
|
||||
|
||||
class SinglefileResultManager(models.Manager):
    """Manager whose queryset is limited to rows with extractor='singlefile'."""

    def get_queryset(self):
        """Return the parent manager's queryset filtered to extractor='singlefile'."""
        all_results = super().get_queryset()
        return all_results.filter(extractor='singlefile')
|
||||
|
||||
|
||||
class SinglefileResult(ArchiveResult):
    """Proxy model over ArchiveResult that uses SinglefileResultManager as ``objects``."""

    # Replaces the default manager, so SinglefileResult.objects goes through
    # SinglefileResultManager.get_queryset().
    objects = SinglefileResultManager()

    class Meta:
        # Declared as a proxy of ArchiveResult.
        proxy = True
|
||||
7
packages/abx-plugin-singlefile-extractor/pyproject.toml
Normal file
7
packages/abx-plugin-singlefile-extractor/pyproject.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-singlefile-extractor"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
0
packages/abx-plugin-sonic-search/README.md
Normal file
0
packages/abx-plugin-sonic-search/README.md
Normal file
53
packages/abx-plugin-sonic-search/__init__.py
Normal file
53
packages/abx-plugin-sonic-search/__init__.py
Normal file
@@ -0,0 +1,53 @@
|
||||
__package__ = 'plugins_search.sonic'
|
||||
__label__ = 'sonic'
|
||||
__version__ = '2024.10.14'
|
||||
__author__ = 'ArchiveBox'
|
||||
__homepage__ = 'https://github.com/valeriansaliou/sonic'
|
||||
__dependencies__ = []
|
||||
|
||||
import abx
|
||||
|
||||
|
||||
@abx.hookimpl
def get_PLUGIN():
    """Return this plugin's metadata, keyed by the plugin id 'sonic'."""
    plugin_info = {
        'PACKAGE': __package__,
        'LABEL': __label__,
        'VERSION': __version__,
        'AUTHOR': __author__,
        'HOMEPAGE': __homepage__,
        'DEPENDENCIES': __dependencies__,
    }
    return {'sonic': plugin_info}
|
||||
|
||||
@abx.hookimpl
def get_CONFIG():
    """Return the sonic config set, keyed by the plugin id 'sonic'."""
    # Imported inside the hook so the config module is only loaded when the hook runs.
    from .config import SONIC_CONFIG as sonic_config

    configs = {'sonic': sonic_config}
    return configs
|
||||
|
||||
|
||||
@abx.hookimpl
def get_BINARIES():
    """Return the sonic binary definition, keyed by the plugin id 'sonic'."""
    # Imported inside the hook so the binaries module is only loaded when the hook runs.
    from .binaries import SONIC_BINARY as sonic_binary

    binaries = {'sonic': sonic_binary}
    return binaries
|
||||
|
||||
|
||||
@abx.hookimpl
def get_SEARCHBACKENDS():
    """Return the sonic search backend, keyed by the plugin id 'sonic'."""
    # Imported inside the hook so the searchbackend module is only loaded when the hook runs.
    from .searchbackend import SONIC_SEARCH_BACKEND as sonic_backend

    backends = {'sonic': sonic_backend}
    return backends
|
||||
|
||||
@abx.hookimpl
def ready():
    """Run the sonic config's validate() check."""
    # Imported inside the hook so the config module is only loaded when the hook runs.
    from .config import SONIC_CONFIG as sonic_config

    sonic_config.validate()
|
||||
27
packages/abx-plugin-sonic-search/binaries.py
Normal file
27
packages/abx-plugin-sonic-search/binaries.py
Normal file
@@ -0,0 +1,27 @@
|
||||
__package__ = 'plugins_search.sonic'
|
||||
|
||||
from typing import List
|
||||
|
||||
from pydantic import InstanceOf
|
||||
from pydantic_pkgr import BinProvider, BinaryOverrides, BinName
|
||||
|
||||
from abx.archivebox.base_binary import BaseBinary, env, brew
|
||||
|
||||
from .config import SONIC_CONFIG
|
||||
|
||||
|
||||
class SonicBinary(BaseBinary):
    """Binary definition for sonic; its name is read from SONIC_CONFIG.SONIC_BINARY."""

    name: BinName = SONIC_CONFIG.SONIC_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [brew, env]   # TODO: add cargo

    # Only the brew provider gets a package-name override for now.
    overrides: BinaryOverrides = {
        brew.name: {'packages': ['sonic']},
        # cargo.name: {'packages': ['sonic-server']},                     # TODO: add cargo
    }

    # TODO: add version checking over protocol? for when sonic backend is on remote server and binary is not installed locally
    # def on_get_version(self):
    #     with sonic.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl:
    #         return SemVer.parse(str(ingestcl.protocol))


# Shared module-level instance of the binary definition.
SONIC_BINARY = SonicBinary()
|
||||
41
packages/abx-plugin-sonic-search/config.py
Normal file
41
packages/abx-plugin-sonic-search/config.py
Normal file
@@ -0,0 +1,41 @@
|
||||
__package__ = 'plugins_search.sonic'
|
||||
|
||||
import sys
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
from abx.archivebox.base_configset import BaseConfigSet
|
||||
|
||||
from archivebox.config.common import SEARCH_BACKEND_CONFIG
|
||||
|
||||
|
||||
# Optional dependency: SONIC_LIB is the imported `sonic` module when it is
# available, and None when importing it raises ImportError.
try:
    import sonic
except ImportError:
    SONIC_LIB = None
else:
    SONIC_LIB = sonic
|
||||
|
||||
###################### Config ##########################
|
||||
|
||||
|
||||
class SonicConfig(BaseConfigSet):
    """Config options for the sonic search backend."""

    SONIC_BINARY: str = Field(default='sonic')

    # Connection settings; the first three also declare a SEARCH_BACKEND_* alias.
    SONIC_HOST: str = Field(default='localhost', alias='SEARCH_BACKEND_HOST_NAME')
    SONIC_PORT: int = Field(default=1491, alias='SEARCH_BACKEND_PORT')
    SONIC_PASSWORD: str = Field(default='SecretPassword', alias='SEARCH_BACKEND_PASSWORD')
    SONIC_COLLECTION: str = Field(default='archivebox')
    SONIC_BUCKET: str = Field(default='archivebox')

    # Limits read by the sonic search backend's index(): size of each pushed chunk,
    # how much of each text is indexed, and how many push errors are tolerated.
    SONIC_MAX_CHUNK_LENGTH: int = Field(default=2000)
    SONIC_MAX_TEXT_LENGTH: int = Field(default=100000000)
    SONIC_MAX_RETRIES: int = Field(default=5)

    def validate(self):
        """Fall back to ripgrep when sonic is selected but its client lib is missing.

        If SEARCH_BACKEND_ENGINE is 'sonic' and SONIC_LIB is None, an error is
        written to stderr and SEARCH_BACKEND_ENGINE is updated in place to
        'ripgrep'. Otherwise nothing happens.
        """
        if SEARCH_BACKEND_CONFIG.SEARCH_BACKEND_ENGINE == 'sonic' and SONIC_LIB is None:
            sys.stderr.write('[X] Error: Sonic search backend is enabled but sonic-client lib is not installed. You may need to run: pip install archivebox[sonic]\n')
            # Don't hard exit here: if the user is just running "archivebox version" or "archivebox help",
            # we still want those to work despite the missing sonic client lib.
            SEARCH_BACKEND_CONFIG.update_in_place(SEARCH_BACKEND_ENGINE='ripgrep')


# Shared module-level instance of the config set.
SONIC_CONFIG = SonicConfig()
|
||||
7
packages/abx-plugin-sonic-search/pyproject.toml
Normal file
7
packages/abx-plugin-sonic-search/pyproject.toml
Normal file
@@ -0,0 +1,7 @@
|
||||
[project]
|
||||
name = "abx-sonic-search"
|
||||
version = "0.1.0"
|
||||
description = "Add your description here"
|
||||
readme = "README.md"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = []
|
||||
51
packages/abx-plugin-sonic-search/searchbackend.py
Normal file
51
packages/abx-plugin-sonic-search/searchbackend.py
Normal file
@@ -0,0 +1,51 @@
|
||||
__package__ = 'plugins_search.sonic'
|
||||
|
||||
from typing import List, Generator, cast
|
||||
|
||||
from abx.archivebox.base_searchbackend import BaseSearchBackend
|
||||
|
||||
|
||||
from .config import SONIC_CONFIG, SONIC_LIB
|
||||
|
||||
|
||||
def _require_sonic_lib():
    """Return the sonic client module, raising ImportError if it is not installed.

    SONIC_LIB is None when ``import sonic`` failed in the config module.
    Checking here gives a clear error instead of an AttributeError on None.
    """
    if SONIC_LIB is None:
        raise ImportError(
            'Sonic search backend is enabled but sonic-client lib is not installed. '
            'You may need to run: pip install archivebox[sonic]'
        )
    return SONIC_LIB


class SonicSearchBackend(BaseSearchBackend):
    """Search backend that talks to a sonic server through the sonic client lib.

    Connection details (host, port, password), collection and bucket are all
    read from SONIC_CONFIG. Every method raises ImportError when the client
    lib is not installed.
    """

    name: str = 'sonic'
    docs_url: str = 'https://github.com/valeriansaliou/sonic'

    @staticmethod
    def index(snapshot_id: str, texts: List[str]):
        """Push each text to sonic under *snapshot_id*, in fixed-size chunks.

        At most SONIC_MAX_TEXT_LENGTH characters of each text are pushed, in
        slices of SONIC_MAX_CHUNK_LENGTH. When a push fails, the error is
        printed, the rest of that text is skipped and the next text is tried.
        Once more than SONIC_MAX_RETRIES texts have failed, the latest error
        is re-raised.
        """
        sonic_lib = _require_sonic_lib()
        chunk_len = SONIC_CONFIG.SONIC_MAX_CHUNK_LENGTH
        error_count = 0
        with sonic_lib.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl:
            for text in texts:
                indexed_len = min(len(text), SONIC_CONFIG.SONIC_MAX_TEXT_LENGTH)
                chunks = (
                    text[start:start + chunk_len]
                    for start in range(0, indexed_len, chunk_len)
                )
                try:
                    for chunk in chunks:
                        ingestcl.push(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, snapshot_id, str(chunk))
                except Exception as err:
                    print(f'[!] Sonic search backend threw an error while indexing: {err.__class__.__name__} {err}')
                    error_count += 1
                    if error_count > SONIC_CONFIG.SONIC_MAX_RETRIES:
                        raise

    @staticmethod
    def flush(snapshot_ids: Generator[str, None, None]):
        """Flush the sonic object for each given snapshot id."""
        sonic_lib = _require_sonic_lib()
        with sonic_lib.IngestClient(SONIC_CONFIG.SONIC_HOST, str(SONIC_CONFIG.SONIC_PORT), SONIC_CONFIG.SONIC_PASSWORD) as ingestcl:
            for snapshot_id in snapshot_ids:
                ingestcl.flush_object(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, str(snapshot_id))

    @staticmethod
    def search(text: str) -> List[str]:
        """Query sonic for *text* and return the resulting ids as strings."""
        sonic_lib = _require_sonic_lib()
        # NOTE(review): the port is passed as an int here but as str() to IngestClient
        # in index()/flush(); left unchanged -- confirm which form the client expects.
        with sonic_lib.SearchClient(SONIC_CONFIG.SONIC_HOST, SONIC_CONFIG.SONIC_PORT, SONIC_CONFIG.SONIC_PASSWORD) as querycl:
            snap_ids = cast(List[str], querycl.query(SONIC_CONFIG.SONIC_COLLECTION, SONIC_CONFIG.SONIC_BUCKET, text))
        return [str(snap_id) for snap_id in snap_ids]


SONIC_SEARCH_BACKEND = SonicSearchBackend()
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user