rename vendor dir to pkgs

This commit is contained in:
Nick Sweeting
2024-10-28 20:05:20 -07:00
parent 7d75867650
commit dee4eb7992
168 changed files with 47 additions and 54 deletions

View File

@@ -0,0 +1,34 @@
# Plugin metadata: human-readable label and author of this package.
# NOTE(review): presumably read by the abx plugin machinery - confirm against the loader.
__label__ = 'Chrome'
__author__ = 'ArchiveBox'
import abx
@abx.hookimpl
def get_CONFIG():
    """Hook implementation: expose this plugin's config set under the key 'CHROME_CONFIG'."""
    # imported inside the hook, so .config is only loaded when the hook is called
    from .config import CHROME_CONFIG as chrome_config

    return dict(CHROME_CONFIG=chrome_config)
@abx.hookimpl
def get_BINARIES():
    """Hook implementation: expose this plugin's Chrome binary under the key 'chrome'."""
    # imported inside the hook, so .binaries is only loaded when the hook is called
    from .binaries import CHROME_BINARY as chrome_binary

    return dict(chrome=chrome_binary)
@abx.hookimpl
def ready():
    """Hook implementation: run the Chrome config's validate() step."""
    from .config import CHROME_CONFIG as chrome_config

    chrome_config.validate()
# @abx.hookimpl
# def get_EXTRACTORS():
# return {
# 'pdf': PDF_EXTRACTOR,
# 'screenshot': SCREENSHOT_EXTRACTOR,
# 'dom': DOM_EXTRACTOR,
# }

View File

@@ -0,0 +1,149 @@
import os
import platform
from pathlib import Path
from typing import List, Optional
from pydantic import InstanceOf
from pydantic_pkgr import (
Binary,
BinProvider,
BinName,
BinaryOverrides,
bin_abspath,
)
import abx
from abx_plugin_default_binproviders import apt, brew, env
from abx_plugin_puppeteer.binproviders import PUPPETEER_BINPROVIDER
from abx_plugin_playwright.binproviders import PLAYWRIGHT_BINPROVIDER
from .config import CHROME_CONFIG
# Executable names to look for when searching for Chromium on Linux.
CHROMIUM_BINARY_NAMES_LINUX = [
    "chromium", "chromium-browser",
    "chromium-browser-beta", "chromium-browser-unstable",
    "chromium-browser-canary", "chromium-browser-dev",
]
# Absolute path of the Chromium executable inside its macOS .app bundle.
CHROMIUM_BINARY_NAMES_MACOS = [
    "/Applications/Chromium.app/Contents/MacOS/Chromium",
]
CHROMIUM_BINARY_NAMES = [*CHROMIUM_BINARY_NAMES_LINUX, *CHROMIUM_BINARY_NAMES_MACOS]

# Executable names to look for when searching for Google Chrome on Linux.
CHROME_BINARY_NAMES_LINUX = [
    "google-chrome", "google-chrome-stable",
    "google-chrome-beta", "google-chrome-canary",
    "google-chrome-unstable", "google-chrome-dev",
    "chrome",
]
# Absolute paths of the Google Chrome executables inside their macOS .app bundles.
CHROME_BINARY_NAMES_MACOS = [
    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
]
CHROME_BINARY_NAMES = [*CHROME_BINARY_NAMES_LINUX, *CHROME_BINARY_NAMES_MACOS]

# apt package names: fonts and shared libraries, with the browser package itself last.
CHROME_APT_DEPENDENCIES = [
    'apt-transport-https', 'at-spi2-common',
    # fonts
    'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst',
    'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji',
    'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont',
    'fonts-wqy-zenhei',
    # shared libraries
    'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0',
    'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2',
    'libcups2',
    'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0',
    'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8',
    'libx11-6', 'libxaw7', 'libxcb1',
    'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2',
    'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2',
    'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
    # the browser itself
    'chromium-browser',
]
def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
    """
    Find an already-installed Chrome or Chromium executable.

    Tries every name in CHROME_BINARY_NAMES, then every name in
    CHROMIUM_BINARY_NAMES, and returns the first one bin_abspath() resolves.

    PATH: search path handed to bin_abspath(); falls back to env.PATH when omitted or empty.
    Returns the first resolved path, or None when no candidate is found.
    """
    # honor the caller-supplied PATH (it used to be accepted but silently ignored)
    search_path = PATH or env.PATH
    for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
        abspath = bin_abspath(bin_name, PATH=search_path)
        if abspath:
            return abspath
    return None
def create_macos_app_symlink(target: Path, shortcut: Path):
    """
    Write a tiny executable bash launcher at `shortcut` that exec's `target`.

    on macOS, some binaries are inside of .app, so we need to
    create a tiny bash script instead of a symlink
    (so that ../ parent relationships are relative to original .app instead of callsite dir)

    target: path of the real executable to launch
    shortcut: path of the launcher script; any existing file there is replaced
    """
    import shlex  # local import: only this helper needs it

    # TODO: should we enforce this? is it useful in any other situation?
    # if platform.system().lower() != 'darwin':
    #     raise Exception(...)
    shortcut.unlink(missing_ok=True)
    # shlex.quote() keeps the script valid even when the path contains quotes
    # or other shell metacharacters (hand-written '...' breaks on a single quote)
    shortcut.write_text(f"""#!/usr/bin/env bash\nexec {shlex.quote(str(target))} "$@"\n""")
    shortcut.chmod(0o777)  # make sure its executable by everyone
###################### Binaries ##########################
class ChromeBinary(Binary):
    """Binary definition for Chrome/Chromium: which providers can supply it and how each one finds or installs it."""

    name: BinName = CHROME_CONFIG.CHROME_BINARY
    binproviders_supported: List[InstanceOf[BinProvider]] = [PUPPETEER_BINPROVIDER, env, PLAYWRIGHT_BINPROVIDER, apt, brew]

    # per-provider settings, keyed by provider name
    overrides: BinaryOverrides = {
        env.name: {
            'abspath': lambda: autodetect_system_chrome_install(PATH=env.PATH), # /usr/bin/google-chrome-stable
        },
        PUPPETEER_BINPROVIDER.name: {
            'packages': ['chrome@stable'], # npx @puppeteer/browsers install chrome@stable
        },
        PLAYWRIGHT_BINPROVIDER.name: {
            'packages': ['chromium'], # playwright install chromium
        },
        apt.name: {
            'packages': CHROME_APT_DEPENDENCIES,
        },
        brew.name: {
            'packages': ['--cask', 'chromium'] if platform.system().lower() == 'darwin' else [],
        },
    }

    @staticmethod
    def symlink_to_lib(binary, bin_dir=None) -> None:
        """
        Best-effort: make `binary` reachable as <bin_dir>/<binary.name>.

        bin_dir defaults to abx.pm.hook.get_BIN_DIR(). Does nothing when
        binary.abspath is unset or is not an existing file. Any failure while
        creating the link is swallowed.
        """
        bin_dir = bin_dir or abx.pm.hook.get_BIN_DIR()

        if not binary.abspath or not os.path.isfile(binary.abspath):
            return

        bin_dir.mkdir(parents=True, exist_ok=True)
        link_path = bin_dir / binary.name

        try:
            on_macos = platform.system().lower() == 'darwin'
            if not on_macos:
                # on linux we can symlink directly to binary executable
                link_path.unlink(missing_ok=True)
                link_path.symlink_to(binary.abspath)
            else:
                # on macOS the browser binary is inside a .app, so we need to create a tiny bash script instead of a symlink
                create_macos_app_symlink(binary.abspath, link_path)
        except Exception:
            # print(f'[red]:warning: Failed to symlink {symlink} -> {binary.abspath}[/red] {err}')
            # not actually needed, we can just run without it
            pass

    @staticmethod
    def chrome_cleanup_lockfile():
        """
        Cleans up any state or runtime files that chrome leaves behind when killed by
        a timeout or other error
        """
        # default chromium profile location on linux
        try:
            Path("~/.config/chromium/SingletonLock").expanduser().unlink(missing_ok=True)
        except Exception:
            pass

        # the configured user data dir, when there is one
        if not CHROME_CONFIG.CHROME_USER_DATA_DIR:
            return
        try:
            (CHROME_CONFIG.CHROME_USER_DATA_DIR / 'SingletonLock').unlink(missing_ok=True)
        except Exception:
            pass
# Module-level ChromeBinary instance; other modules import this rather than building their own.
CHROME_BINARY = ChromeBinary()

View File

@@ -0,0 +1,201 @@
import os
from pathlib import Path
from typing import List, Optional
from pydantic import Field
from pydantic_pkgr import bin_abspath
from abx_spec_config.base_configset import BaseConfigSet
from abx_plugin_default_binproviders import env
from archivebox.config import CONSTANTS
from archivebox.config.common import ARCHIVING_CONFIG, SHELL_CONFIG
from archivebox.misc.logging import STDERR
from archivebox.misc.util import dedupe
from archivebox.logging_util import pretty_path
# Executable names to look for when searching for Chromium on Linux.
CHROMIUM_BINARY_NAMES_LINUX = [
    "chromium", "chromium-browser",
    "chromium-browser-beta", "chromium-browser-unstable",
    "chromium-browser-canary", "chromium-browser-dev",
]
# Absolute path of the Chromium executable inside its macOS .app bundle.
CHROMIUM_BINARY_NAMES_MACOS = [
    "/Applications/Chromium.app/Contents/MacOS/Chromium",
]
CHROMIUM_BINARY_NAMES = [*CHROMIUM_BINARY_NAMES_LINUX, *CHROMIUM_BINARY_NAMES_MACOS]

# Executable names to look for when searching for Google Chrome on Linux.
CHROME_BINARY_NAMES_LINUX = [
    "google-chrome", "google-chrome-stable",
    "google-chrome-beta", "google-chrome-canary",
    "google-chrome-unstable", "google-chrome-dev",
    "chrome",
]
# Absolute paths of the Google Chrome executables inside their macOS .app bundles.
CHROME_BINARY_NAMES_MACOS = [
    "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
    "/Applications/Google Chrome Canary.app/Contents/MacOS/Google Chrome Canary",
]
CHROME_BINARY_NAMES = [*CHROME_BINARY_NAMES_LINUX, *CHROME_BINARY_NAMES_MACOS]

# apt package names: the browser package plus fonts and shared libraries.
APT_DEPENDENCIES = [
    'apt-transport-https', 'at-spi2-common', 'chromium-browser',
    # fonts
    'fontconfig', 'fonts-freefont-ttf', 'fonts-ipafont-gothic', 'fonts-kacst',
    'fonts-khmeros', 'fonts-liberation', 'fonts-noto', 'fonts-noto-color-emoji',
    'fonts-symbola', 'fonts-thai-tlwg', 'fonts-tlwg-loma-otf', 'fonts-unifont',
    'fonts-wqy-zenhei',
    # shared libraries
    'libasound2', 'libatk-bridge2.0-0', 'libatk1.0-0', 'libatspi2.0-0',
    'libavahi-client3', 'libavahi-common-data', 'libavahi-common3', 'libcairo2',
    'libcups2',
    'libdbus-1-3', 'libdrm2', 'libfontenc1', 'libgbm1', 'libglib2.0-0',
    'libice6', 'libnspr4', 'libnss3', 'libsm6', 'libunwind8',
    'libx11-6', 'libxaw7', 'libxcb1',
    'libxcomposite1', 'libxdamage1', 'libxext6', 'libxfixes3', 'libxfont2',
    'libxkbcommon0', 'libxkbfile1', 'libxmu6', 'libxpm4', 'libxrandr2',
    'libxt6', 'x11-utils', 'x11-xkb-utils', 'xfonts-encodings',
]
def autodetect_system_chrome_install(PATH=None) -> Optional[Path]:
    """
    Find an already-installed Chrome or Chromium executable.

    Tries every name in CHROME_BINARY_NAMES, then every name in
    CHROMIUM_BINARY_NAMES, and returns the first one bin_abspath() resolves.

    PATH: search path handed to bin_abspath(); falls back to env.PATH when omitted or empty.
    Returns the first resolved path, or None when no candidate is found.
    """
    # honor the caller-supplied PATH (it used to be accepted but silently ignored)
    search_path = PATH or env.PATH
    for bin_name in CHROME_BINARY_NAMES + CHROMIUM_BINARY_NAMES:
        abspath = bin_abspath(bin_name, PATH=search_path)
        if abspath:
            return abspath
    return None
def create_macos_app_symlink(target: Path, shortcut: Path):
    """
    Write a tiny executable bash launcher at `shortcut` that exec's `target`.

    on macOS, some binaries are inside of .app, so we need to
    create a tiny bash script instead of a symlink
    (so that ../ parent relationships are relative to original .app instead of callsite dir)

    target: path of the real executable to launch
    shortcut: path of the launcher script; any existing file there is replaced
    """
    import shlex  # local import: only this helper needs it

    # TODO: should we enforce this? is it useful in any other situation?
    # if platform.system().lower() != 'darwin':
    #     raise Exception(...)
    shortcut.unlink(missing_ok=True)
    # shlex.quote() keeps the script valid even when the path contains quotes
    # or other shell metacharacters (hand-written '...' breaks on a single quote)
    shortcut.write_text(f"""#!/usr/bin/env bash\nexec {shlex.quote(str(target))} "$@"\n""")
    shortcut.chmod(0o777)  # make sure its executable by everyone
###################### Config ##########################
class ChromeConfig(BaseConfigSet):
    """
    Config options for the Chrome/Chromium browser.

    Besides the declared options this provides validate(), which sanity-checks
    the timeout and the user data dir, and chrome_args(), which builds the list
    of chrome command-line flags from the options.
    """

    USE_CHROME: bool = Field(default=True)

    # Chrome Binary
    CHROME_BINARY: str = Field(default='chrome')
    CHROME_DEFAULT_ARGS: List[str] = Field(default=[
        '--virtual-time-budget=15000',
        '--disable-features=DarkMode',
        "--run-all-compositor-stages-before-draw",
        "--hide-scrollbars",
        "--autoplay-policy=no-user-gesture-required",
        "--no-first-run",
        "--use-fake-ui-for-media-stream",
        "--use-fake-device-for-media-stream",
        "--simulate-outdated-no-au='Tue, 31 Dec 2099 23:59:59 GMT'",
    ])
    CHROME_EXTRA_ARGS: List[str] = Field(default=[])

    # Chrome Options Tuning
    CHROME_TIMEOUT: int = Field(default=lambda: ARCHIVING_CONFIG.TIMEOUT - 10)
    CHROME_HEADLESS: bool = Field(default=True)
    CHROME_SANDBOX: bool = Field(default=lambda: not SHELL_CONFIG.IN_DOCKER)
    CHROME_RESOLUTION: str = Field(default=lambda: ARCHIVING_CONFIG.RESOLUTION)
    CHROME_CHECK_SSL_VALIDITY: bool = Field(default=lambda: ARCHIVING_CONFIG.CHECK_SSL_VALIDITY)

    # Cookies & Auth
    CHROME_USER_AGENT: str = Field(default=lambda: ARCHIVING_CONFIG.USER_AGENT)
    CHROME_USER_DATA_DIR: Path | None = Field(default=CONSTANTS.PERSONAS_DIR / 'Default' / 'chrome_profile')
    CHROME_PROFILE_NAME: str = Field(default='Default')

    # Extractor Toggles
    SAVE_SCREENSHOT: bool = Field(default=True, alias='FETCH_SCREENSHOT')
    SAVE_DOM: bool = Field(default=True, alias='FETCH_DOM')
    SAVE_PDF: bool = Field(default=True, alias='FETCH_PDF')

    def validate(self):
        """
        Warn on STDERR about a too-low CHROME_TIMEOUT or an unusable user data dir.

        When USE_CHROME and CHROME_USER_DATA_DIR are set, tries to create
        <CHROME_USER_DATA_DIR>/<CHROME_PROFILE_NAME>. If that directory still
        does not exist, prints a hint and resets CHROME_USER_DATA_DIR to None.
        """
        from archivebox.config.paths import create_and_chown_dir

        if self.USE_CHROME and self.CHROME_TIMEOUT < 15:
            STDERR.print()
            STDERR.print(f'[red][!] Warning: TIMEOUT is set too low! (currently set to TIMEOUT={self.CHROME_TIMEOUT} seconds)[/red]')
            STDERR.print(' Chrome will fail to archive all sites if set to less than ~15 seconds.')
            STDERR.print(' (Setting it to somewhere between 30 and 300 seconds is recommended)')
            STDERR.print()
            STDERR.print(' If you want to make ArchiveBox run faster, disable specific archive methods instead:')
            STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#archive-method-toggles')
            STDERR.print()

        # if user has specified a user data dir, make sure its valid
        if self.USE_CHROME and self.CHROME_USER_DATA_DIR:
            try:
                create_and_chown_dir(self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME)
            except Exception:
                # best-effort: the isdir() check below reports the problem if creation failed
                pass

            # check to make sure user_data_dir/<profile_name> exists
            if not os.path.isdir(self.CHROME_USER_DATA_DIR / self.CHROME_PROFILE_NAME):
                STDERR.print()
                STDERR.print()
                STDERR.print(f'[red][X] Could not find profile "{self.CHROME_PROFILE_NAME}" in CHROME_USER_DATA_DIR.[/red]')
                STDERR.print(f' {self.CHROME_USER_DATA_DIR}')
                STDERR.print(' Make sure you set it to a Chrome user data directory containing a Default profile folder.')
                STDERR.print(' For more info see:')
                STDERR.print(' https://github.com/ArchiveBox/ArchiveBox/wiki/Configuration#CHROME_USER_DATA_DIR')

                # show special hint if they made the common mistake of putting /Default at the end of the path
                if str(self.CHROME_USER_DATA_DIR).replace(str(CONSTANTS.PERSONAS_DIR / 'Default'), '').endswith('/Default'):
                    STDERR.print()
                    STDERR.print(' Try removing /Default from the end e.g.:')
                    STDERR.print(' CHROME_USER_DATA_DIR="{}"'.format(str(self.CHROME_USER_DATA_DIR).rsplit('/Default', 1)[0]))

                # keep going without a user data dir rather than failing hard
                self.update_in_place(CHROME_USER_DATA_DIR=None)

    def chrome_args(self, **options) -> List[str]:
        """
        helper to build up a chrome shell command with arguments

        options: per-call overrides of this config's fields, applied to a copy
                 (the config object itself is not modified).
        Returns the flag list (default args, extra args, then the flags derived
        from the options) after passing it through dedupe().
        """
        # Chrome CLI flag documentation: https://peter.sh/experiments/chromium-command-line-switches/

        options = self.model_copy(update=options)

        cmd_args = [*options.CHROME_DEFAULT_ARGS, *options.CHROME_EXTRA_ARGS]

        if options.CHROME_HEADLESS:
            cmd_args += ["--headless=new"]  # expects chrome version >= 111

        if not options.CHROME_SANDBOX:
            # assume this means we are running inside a docker container
            # in docker, GPU support is limited, sandboxing is unecessary,
            # and SHM is limited to 64MB by default (which is too low to be usable).
            cmd_args += (
                "--no-sandbox",
                "--no-zygote",
                "--disable-dev-shm-usage",
                "--disable-software-rasterizer",
                "--disable-sync",
                # "--password-store=basic",
            )

        # set window size for screenshot/pdf/etc. rendering
        cmd_args += ('--window-size={}'.format(options.CHROME_RESOLUTION),)

        if not options.CHROME_CHECK_SSL_VALIDITY:
            cmd_args += ('--disable-web-security', '--ignore-certificate-errors')

        if options.CHROME_USER_AGENT:
            cmd_args += ('--user-agent={}'.format(options.CHROME_USER_AGENT),)

        # this no longer works on newer chrome version for some reason, just causes chrome to hang indefinitely:
        # if options.CHROME_TIMEOUT:
        #     cmd_args += ('--timeout={}'.format(options.CHROME_TIMEOUT * 1000),)

        if options.CHROME_USER_DATA_DIR:
            # use the same profile-name fallback for the flag and for the on-disk check
            profile_name = options.CHROME_PROFILE_NAME or 'Default'
            profile_dir = options.CHROME_USER_DATA_DIR / profile_name

            cmd_args.append('--user-data-dir={}'.format(options.CHROME_USER_DATA_DIR))
            cmd_args.append('--profile-directory={}'.format(profile_name))

            if not os.path.isfile(profile_dir / 'Preferences'):
                STDERR.print(f'[green] + creating new Chrome profile in: {pretty_path(profile_dir)}[/green]')
                # filter instead of list.remove(): remove() raises ValueError when the flag is
                # absent (e.g. CHROME_DEFAULT_ARGS was overridden) and only drops the first match
                cmd_args = [arg for arg in cmd_args if arg != '--no-first-run']
                cmd_args.append('--first-run')

        return dedupe(cmd_args)
# Module-level ChromeConfig instance; other modules import this rather than building their own.
CHROME_CONFIG = ChromeConfig()

View File

@@ -0,0 +1,76 @@
# Declare the package this module belongs to; the relative imports below
# (`..index.schema`, `..logging_util`) are resolved against it.
__package__ = 'archivebox.extractors'
from pathlib import Path
from typing import Optional
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from archivebox.misc.system import run, chmod_file, atomic_write
from archivebox.misc.util import (
enforce_types,
is_static_file,
)
from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
def get_output_path():
    """Return the output filename of this extractor (callers in this module join it onto out_dir)."""
    output_filename = 'output.html'
    return output_filename
@enforce_types
def should_save_dom(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    """
    Decide whether the DOM extractor should run for `link`.

    Returns False for static-file URLs, and (unless `overwrite` is set) when an
    output file larger than 1 byte already exists in `out_dir`
    (default: link.link_dir). Otherwise returns CHROME_CONFIG.SAVE_DOM.
    """
    if is_static_file(link.url):
        return False

    existing_output = (out_dir or Path(link.link_dir)) / get_output_path()
    # an existing file of 1 byte or less does not count as already saved
    if not overwrite and existing_output.exists() and existing_output.stat().st_size > 1:
        return False

    return CHROME_CONFIG.SAVE_DOM
@enforce_types
def save_dom(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
    """
    Dump the DOM of link.url to a file by running chrome with --dump-dom.

    Chrome's stdout is written to <out_dir>/<get_output_path()> before the exit
    code is checked. Any exception marks the result as failed, stores the
    exception as the output, and triggers the chrome lockfile cleanup.
    """
    chrome = CHROME_BINARY.load()
    assert chrome.abspath and chrome.version

    out_dir = out_dir or Path(link.link_dir)
    working_dir = str(out_dir)
    output: ArchiveOutput = get_output_path()
    output_path = out_dir / output

    cmd = [str(chrome.abspath)] + list(CHROME_CONFIG.chrome_args()) + ['--dump-dom', link.url]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        proc = run(cmd, cwd=working_dir, timeout=timeout, text=True)
        atomic_write(output_path, proc.stdout)

        if proc.returncode:
            raise ArchiveError('Failed to save DOM', proc.stderr)

        chmod_file(output, cwd=working_dir)
    except Exception as exc:
        status, output = 'failed', exc
        CHROME_BINARY.chrome_cleanup_lockfile()
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=working_dir,
        cmd_version=str(chrome.version),
        output=output,
        status=status,
        **timer.stats,
    )

View File

@@ -0,0 +1,75 @@
# Declare the package this module belongs to; the relative imports below
# (`..index.schema`, `..logging_util`) are resolved against it.
__package__ = 'archivebox.extractors'
from pathlib import Path
from typing import Optional
from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import (
enforce_types,
is_static_file,
)
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
def get_output_path():
    """Return the output filename of this extractor (callers in this module join it onto out_dir)."""
    output_filename = 'output.pdf'
    return output_filename
@enforce_types
def should_save_pdf(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    """
    Decide whether the PDF extractor should run for `link`.

    Returns False for static-file URLs, and (unless `overwrite` is set) when the
    output file already exists in `out_dir` (default: link.link_dir).
    Otherwise returns CHROME_CONFIG.SAVE_PDF.
    """
    if is_static_file(link.url):
        return False

    existing_output = (out_dir or Path(link.link_dir)) / get_output_path()
    already_saved = not overwrite and existing_output.exists()
    if already_saved:
        return False

    return CHROME_CONFIG.SAVE_PDF
@enforce_types
def save_pdf(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
    """
    print PDF of site to file using chrome --headless

    Runs chrome with --print-to-pdf against link.url, using out_dir (default:
    link.link_dir) as the working directory. A non-zero exit code or any other
    exception marks the result as failed, stores the exception as the output,
    and triggers the chrome lockfile cleanup.
    """

    CHROME_BIN = CHROME_BINARY.load()
    assert CHROME_BIN.abspath and CHROME_BIN.version

    out_dir = out_dir or Path(link.link_dir)
    output: ArchiveOutput = get_output_path()
    cmd = [
        str(CHROME_BIN.abspath),
        *CHROME_CONFIG.chrome_args(),
        '--print-to-pdf',
        link.url,
    ]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        result = run(cmd, cwd=str(out_dir), timeout=timeout, text=True)

        if result.returncode:
            hints = (result.stderr or result.stdout)
            raise ArchiveError('Failed to save PDF', hints)

        chmod_file(get_output_path(), cwd=str(out_dir))
    except Exception as err:
        status = 'failed'
        output = err
        CHROME_BINARY.chrome_cleanup_lockfile()
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=str(out_dir),
        # report the version of the loaded binary that actually ran (the one asserted on above),
        # not the unloaded CHROME_BINARY definition
        cmd_version=str(CHROME_BIN.version),
        output=output,
        status=status,
        **timer.stats,
    )

View File

@@ -0,0 +1,70 @@
# Declare the package this module belongs to; the relative imports below
# (`..index.schema`, `..logging_util`) are resolved against it.
__package__ = 'archivebox.extractors'
from pathlib import Path
from typing import Optional
from archivebox.misc.system import run, chmod_file
from archivebox.misc.util import enforce_types, is_static_file
from ..index.schema import Link, ArchiveResult, ArchiveOutput, ArchiveError
from ..logging_util import TimedProgress
from plugins_extractor.chrome.config import CHROME_CONFIG
from plugins_extractor.chrome.binaries import CHROME_BINARY
def get_output_path():
    """Return the output filename of this extractor (callers in this module join it onto out_dir)."""
    output_filename = 'screenshot.png'
    return output_filename
@enforce_types
def should_save_screenshot(link: Link, out_dir: Optional[Path]=None, overwrite: Optional[bool]=False) -> bool:
    """
    Decide whether the screenshot extractor should run for `link`.

    Returns False for static-file URLs, and (unless `overwrite` is set) when the
    output file already exists in `out_dir` (default: link.link_dir).
    Otherwise returns CHROME_CONFIG.SAVE_SCREENSHOT.
    """
    if is_static_file(link.url):
        return False

    existing_output = (out_dir or Path(link.link_dir)) / get_output_path()
    already_saved = not overwrite and existing_output.exists()
    if already_saved:
        return False

    return CHROME_CONFIG.SAVE_SCREENSHOT
@enforce_types
def save_screenshot(link: Link, out_dir: Optional[Path]=None, timeout: int=60) -> ArchiveResult:
    """
    Run chrome with --screenshot against link.url and report the outcome.

    Chrome is run with out_dir (default: link.link_dir) as its working
    directory. A non-zero exit code or any other exception marks the result as
    failed, stores the exception as the output, and triggers the chrome
    lockfile cleanup.
    """
    chrome = CHROME_BINARY.load()
    assert chrome.abspath and chrome.version

    out_dir = out_dir or Path(link.link_dir)
    working_dir = str(out_dir)
    output: ArchiveOutput = get_output_path()

    cmd = [str(chrome.abspath)] + list(CHROME_CONFIG.chrome_args()) + ['--screenshot', link.url]

    status = 'succeeded'
    timer = TimedProgress(timeout, prefix=' ')
    try:
        proc = run(cmd, cwd=working_dir, timeout=timeout, text=True)

        if proc.returncode:
            raise ArchiveError('Failed to save screenshot', proc.stderr or proc.stdout)

        chmod_file(output, cwd=working_dir)
    except Exception as exc:
        status, output = 'failed', exc
        CHROME_BINARY.chrome_cleanup_lockfile()
    finally:
        timer.end()

    return ArchiveResult(
        cmd=cmd,
        pwd=working_dir,
        cmd_version=str(chrome.version),
        output=output,
        status=status,
        **timer.stats,
    )

View File

@@ -0,0 +1,18 @@
[project]
name = "abx-plugin-chrome"
version = "2024.10.28"
description = "Chrome/Chromium binary and config plugin for ArchiveBox (abx)"
readme = "README.md"
requires-python = ">=3.10"
dependencies = [
"abx>=0.1.0",
"abx-spec-config>=0.1.0",
"abx-spec-pydantic-pkgr>=0.1.0",
]
[build-system]
requires = ["hatchling"]
build-backend = "hatchling.build"
[project.entry-points.abx]
abx_plugin_chrome = "abx_plugin_chrome"