wip major changes

This commit is contained in:
Nick Sweeting
2025-12-24 20:09:51 -08:00
parent c1335fed37
commit 1915333b81
450 changed files with 35814 additions and 19015 deletions

View File

@@ -345,19 +345,41 @@ class ExtendedEncoder(pyjson.JSONEncoder):
elif isinstance(obj, Exception):
return '{}: {}'.format(obj.__class__.__name__, obj)
elif isinstance(obj, Path):
return str(obj)
elif cls_name in ('dict_items', 'dict_keys', 'dict_values'):
return tuple(obj)
return list(obj)
elif isinstance(obj, Callable):
return str(obj)
# Try dict/list conversion as fallback
try:
return dict(obj)
except Exception:
pass
try:
return list(obj)
except Exception:
pass
try:
return str(obj)
except Exception:
pass
return pyjson.JSONEncoder.default(self, obj)
@enforce_types
def to_json(obj: Any, indent: Optional[int]=4, sort_keys: bool=True) -> str:
    """
    Render ``obj`` as a JSON string.

    Encoding is delegated to ``pyjson.dumps`` with ``ExtendedEncoder`` as the
    encoder class, so values the stock encoder rejects are handled by that
    class's ``default`` hook.

    indent:    passed straight through to ``pyjson.dumps`` (default 4)
    sort_keys: passed straight through to ``pyjson.dumps`` (default True)
    """
    dump_options = {
        'indent': indent,
        'sort_keys': sort_keys,
        'cls': ExtendedEncoder,
    }
    return pyjson.dumps(obj, **dump_options)
### URL PARSING TESTS / ASSERTIONS
# Check that plain text regex URL parsing works as expected
@@ -452,3 +474,78 @@ _test_url_strs = {
for url_str, num_urls in _test_url_strs.items():
    # Count the matches lazily inside the assert expression so nothing is
    # evaluated when assertions are disabled.
    assert sum(1 for _ in find_all_urls(url_str)) == num_urls, (
        f'{url_str} does not contain {num_urls} urls'
    )
### Chrome Helpers
def chrome_args(**options) -> List[str]:
    """
    Assemble the argv list for launching Chrome.

    Each setting is read from ``options`` by key; when a key is absent the
    fallback is either the matching constant imported from
    ``archivebox.config`` or a literal default:

        CHROME_BINARY         -> config CHROME_BINARY
        CHROME_HEADLESS       -> True
        CHROME_SANDBOX        -> True
        CHECK_SSL_VALIDITY    -> config CHECK_SSL_VALIDITY
        CHROME_USER_AGENT     -> config USER_AGENT
        RESOLUTION            -> config RESOLUTION
        CHROME_TIMEOUT        -> 0 (no --timeout flag emitted)
        CHROME_USER_DATA_DIR  -> None (no --user-data-dir flag emitted)

    Returns the binary path followed by the selected flags.
    Raises ``Exception`` when the resolved binary is falsy.
    """
    import shutil  # NOTE(review): not referenced anywhere in this function
    from archivebox.config import CHECK_SSL_VALIDITY, RESOLUTION, USER_AGENT, CHROME_BINARY

    binary = options.get('CHROME_BINARY', CHROME_BINARY)
    if not binary:
        raise Exception('Could not find any CHROME_BINARY installed on your system')

    headless = options.get('CHROME_HEADLESS', True)
    sandboxed = options.get('CHROME_SANDBOX', True)
    verify_ssl = options.get('CHECK_SSL_VALIDITY', CHECK_SSL_VALIDITY)
    ua_string = options.get('CHROME_USER_AGENT', USER_AGENT)
    window_size = options.get('RESOLUTION', RESOLUTION)
    timeout_value = options.get('CHROME_TIMEOUT', 0)
    profile_dir = options.get('CHROME_USER_DATA_DIR', None)

    argv = [binary]

    if headless:
        argv.append("--headless=new")

    if not sandboxed:
        # running in docker or other sandboxed environment
        argv.extend([
            "--no-sandbox",
            "--no-zygote",
            "--disable-dev-shm-usage",
            "--disable-software-rasterizer",
            "--run-all-compositor-stages-before-draw",
            "--hide-scrollbars",
            "--autoplay-policy=no-user-gesture-required",
            "--no-first-run",
            "--use-fake-ui-for-media-stream",
            "--use-fake-device-for-media-stream",
            "--disable-sync",
        ])

    if not verify_ssl:
        argv.extend(['--disable-web-security', '--ignore-certificate-errors'])

    if ua_string:
        argv.append(f'--user-agent={ua_string}')

    if window_size:
        argv.append(f'--window-size={window_size}')

    if timeout_value:
        # presumably seconds -> milliseconds; assumes a numeric value
        # (a str here would be repeated 1000 times) -- TODO confirm with callers
        argv.append(f'--timeout={timeout_value * 1000}')

    if profile_dir:
        argv.append(f'--user-data-dir={profile_dir}')

    return argv
def chrome_cleanup():
    """
    Remove leftover runtime state that chrome can leave behind when it is
    killed by a timeout or other error.

    Only acts when ``IN_DOCKER`` is truthy: the chromium ``SingletonLock``
    entry under the archivebox user's config dir is deleted if it exists.
    Any ``OSError`` raised by the delete is ignored (best-effort cleanup).
    """
    import os
    from archivebox.config.permissions import IN_DOCKER

    lock_path = "/home/archivebox/.config/chromium/SingletonLock"

    # lexists() is also true for a dangling symlink, which exists() would miss
    if IN_DOCKER and os.path.lexists(lock_path):
        try:
            os.remove(lock_path)
        except OSError:
            # nothing useful to do if the lock cannot be removed
            pass